Issue #1937 - Part 2: Update libaom source.

author: Job Bautista <jobbautista9@protonmail.com> 2022-06-25 18:15:40 +0800
committer: Job Bautista <jobbautista9@protonmail.com> 2022-06-25 18:15:40 +0800
commit: b900a6e486a83a5c1690de314e272a4907a54750 (patch)
tree: 8db913f0d8651b97f95b0716c88c8733b500cf1f /media/libaom
parent: e3fd2e48e03fecc04ae5462ae4a8b5c61b1458fb (diff)
download: uxp-b900a6e486a83a5c1690de314e272a4907a54750.tar.gz
948 files changed, 244301 insertions, 87447 deletions
diff --git a/media/libaom/src/.mailmap b/media/libaom/src/.mailmap
index 30fae4de78..1f218688c5 100644
--- a/media/libaom/src/.mailmap
+++ b/media/libaom/src/.mailmap
@@ -7,6 +7,8 @@ Andrey Norkin <anorkin@netflix.com>
 Angie Chiang <angiebird@google.com>
 Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
 Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
 Bohan Li <bohanli@google.com>
 Changjun Yang <changjun.yang@intel.com>
 Chi Yo Tsai <chiyotsai@google.com>
@@ -56,14 +58,15 @@ Paul Wilkins <paulwilkins@google.com>
 Peng Bin <binpengsmail@gmail.com>
 Peng Bin <binpengsmail@gmail.com> <pengbin@kingsoft.com>
 Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
+Rachel Barker <rachelbarker@google.com> David Barker <david.barker@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Remya Prakasan <remya.prakasan@ittiam.com>
 Roger Zhou <youzhou@microsoft.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
-Ryan Lei <ryan.z.lei@intel.com>
-Ryan Lei <ryan.z.lei@intel.com> <ryan.lei@intel.com>
-Ryan Lei <ryan.z.lei@intel.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
 Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
 Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
@@ -82,6 +85,7 @@ Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
 Wei-Ting Lin <weitinglin@google.com>
 Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
 Wenyao Liu <wenyao.liu@cidana.com>
+Will Bresnahan <bill.wresnahan@gmail.com>
 Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
diff --git a/media/libaom/src/AUTHORS b/media/libaom/src/AUTHORS
index f61026fc03..84ef6fb842 100644
--- a/media/libaom/src/AUTHORS
+++ b/media/libaom/src/AUTHORS
@@ -3,7 +3,7 @@
 
 Aamir Anis <aanis@google.com>
 Aaron Watry <awatry@gmail.com>
-Aasaipriya <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
@@ -12,6 +12,7 @@ Alexander Bokov <alexanderbokov@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
 Aℓex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
+Alex Peterson <petersonab@google.com>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
 A.Mahfoodh <ab.mahfoodh@gmail.com>
@@ -22,9 +23,11 @@ Andrew Russell <anrussell@google.com>
 Andrey Norkin <anorkin@netflix.com>
 Angie Chiang <angiebird@google.com>
 Aniket Dhok <aniket.dhok@ittiam.com>
+Aniket Wanare <Aniket.wanare@ittiam.com>
 Ankur Saxena <ankurs@nvidia.com>
 Arild Fuldseth <arilfuld@cisco.com>
 Aron Rosenberg <arosenberg@logitech.com>
+Arun Singh Negi <arun.negi@ittiam.com>
 Attila Nagy <attilanagy@google.com>
 Bohan Li <bohanli@google.com>
 Brennan Shacklett <bshacklett@mozilla.com>
@@ -34,9 +37,11 @@ Changjun Yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
 Cheng Chen <chengchen@google.com>
 Cherma Rajan A <cherma.rajan@ittiam.com>
+Chethan Kumar R E <chethan.kumar@ittiam.com>
 Chi Yo Tsai <chiyotsai@google.com>
 Chm <chm@rock-chips.com>
 Christian Duvivier <cduvivier@google.com>
+Christopher Degawa <christopher.degawa@intel.com>
 Cyril Concolato <cconcolato@netflix.com>
 Dake He <dkhe@google.com>
 Damon Shen <yjshen@google.com>
@@ -45,13 +50,11 @@ Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
 Daniel Max Valenzuela <daniel.vt@samsung.com>
 Danil Chapovalov <danilchap@google.com>
-David Barker <david.barker@argondesign.com>
 David Major <dmajor@mozilla.com>
 David Michael Barr <b@rr-dav.id.au>
 David Turner <david.turner@argondesign.com>
 Deb Mukherjee <debargha@google.com>
 Deepa K G <deepa.kg@ittiam.com>
-Deng <zhipin.deng@intel.com>
 Di Chen <chendixi@google.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
@@ -90,7 +93,7 @@ Hui Su <huisu@google.com>
 Ilie Halip <ilie.halip@gmail.com>
 Ilya Brailovskiy <brailovs@lab126.com>
 Imdad Sardharwalla <imdad.sardharwalla@argondesign.com>
-iole moccagatta <iole.moccagatta@gmail.com>
+Iole Moccagatta <iole.moccagatta@gmail.com>
 Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
@@ -103,7 +106,8 @@ Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
 Jayasanker J <jayasanker.j@ittiam.com>
-Jean-Marc Valin <jmvalin@mozilla.com>
+Jayashri Murugan <jayashri.murugan@ittiam.com>
+Jean-Marc Valin <jmvalin@jmvalin.ca>
 Jean-Yves Avenard <jyavenard@mozilla.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
@@ -122,33 +126,43 @@ John Stark <jhnstrk@gmail.com>
 Jonathan Matthews <jonathan.matthews@argondesign.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
+Josh Verdejo <joverdejo@google.com>
 Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Katsuhisa Yuasa <berupon@gmail.com>
+Kavi Ramamurthy <kavii@google.com>
 KO Myung-Hun <komh@chollian.net>
 Krishna Malladi <kmalladi@google.com>
 Kyle Siefring <kylesiefring@gmail.com>
 Larisa Markeeva <lmarkeeva@google.com>
+Lauren Partin <lpartin@google.com>
 Lawrence Velázquez <larryv@macports.org>
+leolzhao <leolzhao@tencent.com>
 Lester Lu <kslu@google.com>
+liang zhao <leolzhao@tencent.com>
 Linfeng Zhang <linfengz@google.com>
+Link.Meng <monthev@gmail.com>
 Logan Goldberg <logangw@google.com>
 Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
-Luc Trudeau <ltrudeau@mozilla.com>
+Luca Versari <veluca@google.com>
+Luc Trudeau <luc@trud.ca>
+Madhu Peringassery Krishnan <mpkrishnan@tencent.com>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
 Mark Mentovai <mark@chromium.org>
+Mark Wachsler <wachsler@google.com>
 Martin Ettl <ettl.martin78@googlemail.com>
 Martin Storsjo <martin@martin.st>
+Maryla <maryla@google.com>
 Matthew Heaney <matthewjheaney@chromium.org>
 Matthieu Vaudano <matthieu.vaudano@allegrodvt.com>
 Mattias Hansson <mattias.hansson@arm.com>
 Maxym Dmytrychenko <maxim.d33@gmail.com>
-Michael Bebenita <mbebenita@mozilla.com>
+Michael Bebenita <mbebenita@gmail.com>
 Michael Horowitz <mhoro@webrtc.org>
 Michael Kohler <michaelkohler@live.com>
 Michelle Findlay-Olynyk <mfo@google.com>
@@ -160,8 +174,10 @@ Mingliang Chen <mlchen@google.com>
 Mirko Bonadei <mbonadei@google.com>
 Monty Montgomery <cmontgomery@mozilla.com>
 Morton Jonuschat <yabawock@gmail.com>
+Mudassir Galagnath <mudassir.galaganath@ittiam.com>
 Mufaddal Chakera <mufaddal.chakera@ittiam.com>
 Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
 Neil Birkbeck <birkbeck@google.com>
 Nico Weber <thakis@chromium.org>
 Nithya V S <nithya.vs@ittiam.com>
@@ -178,8 +194,11 @@ Peng Bin <binpengsmail@gmail.com>
 Pengchong Jin <pengchong@google.com>
 Peter Boström <pbos@google.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
+Peter Kasting <pkasting@chromium.org>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
+Qiu Jianlin <jianlin.qiu@intel.com>
+Rachel Barker <rachelbarker@google.com>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
@@ -189,17 +208,19 @@ Remya Prakasan <remya.prakasan@ittiam.com>
 Remy Foray <remy.foray@allegrodvt.com>
 Rob Bradford <rob@linux.intel.com>
 Robert-André Mauchin <zebob.m@gmail.com>
-RogerZhou <youzhou@microsoft.com>
+Robert Chin <robertchin@google.com>
+Roger Zhou <youzhou@microsoft.com>
 Rohit Athavale <rathaval@xilinx.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rostislav Pehlivanov <rpehlivanov@mozilla.com>
 Ruiling Song <ruiling.song@intel.com>
 Rui Ueyama <ruiu@google.com>
 Rupert Swarbrick <rupert.swarbrick@argondesign.com>
-Ryan Lei <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com>
 Ryan Overbeck <rover@google.com>
 Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
 Sai Deng <sdeng@google.com>
+Sami Boukortt <sboukortt@google.com>
 Sami Pietilä <samipietila@google.com>
 Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
@@ -212,6 +233,7 @@ Sean Purser-Haskell <seanhaskell@google.com>
 Sebastien Alaiwan <sebastien.alaiwan@allegrodvt.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
+S Hamsalekha <hamsalekha.s@ittiam.com>
 Shimon Doodkin <helpmepro1@gmail.com>
 Shunyao Li <shunyaoli@google.com>
 SmilingWolf <lupo996@gmail.com>
@@ -220,11 +242,13 @@ Stanislav Vitvitskyy <vitvitskyy@google.com>
 Stefan Holmer <holmer@google.com>
 Steinar Midtskogen <stemidts@cisco.com>
 Suman Sunkara <sunkaras@google.com>
+susannad <susannad@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
 Tao Bai <michaelbai@chromium.org>
 Tarek AMARA <amatarek@justin.tv>
+Tarundeep Singh <tarundeep.singh@ittiam.com>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
 Thomas Daede <tdaede@mozilla.com>
@@ -241,14 +265,22 @@ Urvang Joshi <urvang@google.com>
 Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
 Victoria Zhislina <niva213@gmail.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vikas Prasad <vikas.prasad@ittiam.com>
+Vincent Rabaud <vrabaud@google.com>
 Vishesh <vishesh.garg@ittiam.com>
+Vishnu Teja Manyam <vishnu.teja@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com>
+Vitalii Dziumenko <vdziumenko@luxoft.corp-partner.google.com>
 Wan-Teh Chang <wtc@google.com>
 Wei-Ting Lin <weitinglin@google.com>
 Wenyao Liu <wenyao.liu@cidana.com>
+Will Bresnahan <bill.wresnahan@gmail.com>
+Xiaoqing Zhu <xzhu@netflix.com>
 Xing Jin <ddvfinite@gmail.com>
 Xin Zhao <xinzzhao@tencent.com>
-Yaowu Xu <yaowu.google.com>
+Yannis Guyon <yguyon@google.com>
 Yaowu Xu <yaowu@google.com>
+Yeqing Wu <yeqing_wu@apple.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
 Yue Chen <yuec@google.com>
@@ -256,5 +288,5 @@ Yunqing Wang <yunqingwang@google.com>
 Yury Gitman <yuryg@google.com>
 Yushin Cho <ycho@mozilla.com>
 Zhijie Yang <zhijie.yang@broadcom.com>
-zhipin deng <zhipin.deng@intel.com>
+Zhipin Deng <zhipin.deng@intel.com>
 Zoe Liu <zoeliu@gmail.com>
diff --git a/media/libaom/src/CHANGELOG b/media/libaom/src/CHANGELOG
index 11da097af2..6e9dbc4a3e 100644
--- a/media/libaom/src/CHANGELOG
+++ b/media/libaom/src/CHANGELOG
@@ -1,3 +1,411 @@
+2022-06-17 v3.4.0
+  This release includes compression efficiency and perceptual quality
+  improvements, speedup and memory optimizations, and some new features.
+  There are no ABI or API breaking changes in this release.
+
+  - New Features
+    * New --dist-metric flag with "qm-psnr" value to use quantization
+      matrices in the distortion computation for RD search. The default
+      value is "psnr".
+    * New command line option "--auto-intra-tools-off=1" to make
+      all-intra encoding faster for high bit rate under
+      "--deltaq-mode=3" mode.
+    * New rate control library aom_av1_rc for real-time hardware
+      encoders. Supports CBR for both one spatial layer and SVC.
+    * New image format AOM_IMG_FMT_NV12 can be used as input to the
+      encoder. The presence of AOM_IMG_FMT_NV12 can be detected at
+      compile time by checking if the macro AOM_HAVE_IMG_FMT_NV12 is
+      defined.
+    * New codec controls for the encoder:
+      o AV1E_SET_AUTO_INTRA_TOOLS_OFF. Only in effect if
+        --deltaq-mode=3.
+      o AV1E_SET_RTC_EXTERNAL_RC
+      o AV1E_SET_FP_MT. Only supported if libaom is built with
+        -DCONFIG_FRAME_PARALLEL_ENCODE=1.
+      o AV1E_GET_TARGET_SEQ_LEVEL_IDX
+    * New key-value pairs for the key-value API:
+      o --auto-intra-tools-off=0 (default) or 1. Only in effect if
+        --deltaq-mode=3.
+      o --strict-level-conformance=0 (default) or 1
+      o --fp-mt=0 (default) or 1. Only supported if libaom is built
+        with -DCONFIG_FRAME_PARALLEL_ENCODE=1.
+    * New aomenc options (not supported by the key-value API):
+      o --nv12
+
+  - Compression Efficiency Improvements
+    * Correctly calculate SSE for high bitdepth in skip mode, 0.2% to
+      0.6% coding gain.
+    * RTC at speed 9/10: BD-rate gain of ~4/5%
+    * RTC screen content coding: many improvements for real-time screen
+      at speed 10 (quality, speedup, and rate control), up to high
+      resolutions (1080p).
+    * RTC-SVC: fixes to make intra-only frames work for spatial layers.
+    * RTC-SVC: quality improvements for temporal layers.
+    * AV1 RT: A new passive rate control strategy for screen content, an
+      average of 7.5% coding gain, with some clips of 20+%. The feature
+      is turned off by default due to higher bit rate variation.
+
+  - Perceptual Quality Improvements
+    * RTC: Visual quality improvements for high speeds (9/10)
+    * Improvements in coding quality for all intra mode
+
+  - Speedup and Memory Optimizations
+    * ~10% speedup in good quality mode encoding.
+    * ~7% heap memory reduction in good quality encoding mode for speed
+      5 and 6.
+    * Ongoing improvements to intra-frame encoding performance on Arm
+    * Faster encoding speed for "--deltaq-mode=3" mode.
+    * ~10% speedup for speed 5/6, ~15% speedup for speed 7/8, and
+      ~10% speedup for speed 9/10 in real time encoding mode
+    * ~20% heap memory reduction in still-picture encoding mode for
+      360p-720p resolutions with multiple threads
+    * ~13% speedup for speed 6 and ~12% speedup for speed 9 in
+      still-picture encoding mode.
+    * Optimizations to improve multi-thread efficiency for still-picture
+      encoding mode.
+
+  - Bug Fixes
+    * b/204460717: README.md: replace master with main
+    * b/210677928: libaom disable_order is surprising for
+      max_reference_frames=3
+    * b/222461449: -DCONFIG_TUNE_BUTTERAUGLI=1 broken
+    * b/227207606: write_greyscale writes incorrect chroma in highbd
+      mode
+    * b/229955363: Integer-overflow in linsolve_wiener
+    * https://crbug.com/aomedia/2032
+    * https://crbug.com/aomedia/2397
+    * https://crbug.com/aomedia/2563
+    * https://crbug.com/aomedia/2815
+    * https://crbug.com/aomedia/3009
+    * https://crbug.com/aomedia/3018
+    * https://crbug.com/aomedia/3045
+    * https://crbug.com/aomedia/3101
+    * https://crbug.com/aomedia/3130
+    * https://crbug.com/aomedia/3173
+    * https://crbug.com/aomedia/3184
+    * https://crbug.com/aomedia/3187
+    * https://crbug.com/aomedia/3190
+    * https://crbug.com/aomedia/3195
+    * https://crbug.com/aomedia/3197
+    * https://crbug.com/aomedia/3201
+    * https://crbug.com/aomedia/3202
+    * https://crbug.com/aomedia/3204
+    * https://crbug.com/aomedia/3205
+    * https://crbug.com/aomedia/3207
+    * https://crbug.com/aomedia/3208
+    * https://crbug.com/aomedia/3209
+    * https://crbug.com/aomedia/3213
+    * https://crbug.com/aomedia/3214
+    * https://crbug.com/aomedia/3219
+    * https://crbug.com/aomedia/3222
+    * https://crbug.com/aomedia/3223
+    * https://crbug.com/aomedia/3225
+    * https://crbug.com/aomedia/3226
+    * https://crbug.com/aomedia/3228
+    * https://crbug.com/aomedia/3232
+    * https://crbug.com/aomedia/3236
+    * https://crbug.com/aomedia/3237
+    * https://crbug.com/aomedia/3238
+    * https://crbug.com/aomedia/3240
+    * https://crbug.com/aomedia/3243
+    * https://crbug.com/aomedia/3244
+    * https://crbug.com/aomedia/3246
+    * https://crbug.com/aomedia/3248
+    * https://crbug.com/aomedia/3250
+    * https://crbug.com/aomedia/3251
+    * https://crbug.com/aomedia/3252
+    * https://crbug.com/aomedia/3255
+    * https://crbug.com/aomedia/3257
+    * https://crbug.com/aomedia/3259
+    * https://crbug.com/aomedia/3260
+    * https://crbug.com/aomedia/3267
+    * https://crbug.com/aomedia/3268
+    * https://crbug.com/aomedia/3269
+    * https://crbug.com/aomedia/3276
+    * https://crbug.com/aomedia/3278
+    * https://crbug.com/chromium/1290068
+    * https://crbug.com/chromium/1303237
+    * https://crbug.com/chromium/1304990
+    * https://crbug.com/chromium/1321141
+    * https://crbug.com/chromium/1321388
+    * https://crbug.com/oss-fuzz/44846
+    * https://crbug.com/oss-fuzz/44856
+    * https://crbug.com/oss-fuzz/44862
+    * https://crbug.com/oss-fuzz/44904
+    * https://crbug.com/oss-fuzz/45056
+
+2022-01-28 v3.3.0
+  This release includes compression efficiency and perceptual quality
+  improvements, speedup and memory optimizations, some new features, and
+  several bug fixes.
+
+  - New Features
+    * AV1 RT: Introducing CDEF search level 5
+    * Changed real time speed 4 to behave the same as real time speed 5
+    * Add --deltaq-strength
+    * rtc: Allow scene-change and overshoot detection for svc
+    * rtc: Intra-only frame for svc
+    * AV1 RT: Option 2 for codec control AV1E_SET_ENABLE_CDEF to disable
+      CDEF on non-ref frames
+    * New codec controls AV1E_SET_LOOPFILTER_CONTROL and
+      AOME_GET_LOOPFILTER_LEVEL
+    * Improvements to three pass encoding
+
+  - Compression Efficiency Improvements
+    * Overall compression gains: 0.6%
+
+  - Perceptual Quality Improvements
+    * Improves the perceptual quality of high QP encoding for delta-q mode 4
+    * Auto select noise synthesis level for all intra
+
+  - Speedup and Memory Optimizations
+    * Added many SSE2 optimizations.
+    * Good quality 2-pass encoder speedups:
+      o Speed 2: 9%
+      o Speed 3: 12.5%
+      o Speed 4: 8%
+      o Speed 5: 3%
+      o Speed 6: 4%
+    * Real time mode encoder speedups:
+      o Speed 5: 2.6% BDRate gain, 4% speedup
+      o Speed 6: 3.5% BDRate gain, 4% speedup
+      o Speed 9: 1% BDRate gain, 3% speedup
+      o Speed 10: 3% BDRate gain, neutral speedup
+    * All intra encoding speedups (AVIF):
+      o Single thread - speed 6: 8%
+      o Single thread - speed 9: 15%
+      o Multi thread(8) - speed 6: 14%
+      o Multi thread(8) - speed 9: 34%
+
+  - Bug Fixes
+    * Issue 3163: Segmentation fault when using --enable-keyframe-filtering=2
+    * Issue 2436: Integer overflow in av1_warp_affine_c()
+    * Issue 3226: armv7 build failure due to gcc-11
+    * Issue 3195: Bug report on libaom (AddressSanitizer: heap-buffer-overflow)
+    * Issue 3191: Bug report on libaom (AddressSanitizer: SEGV on unknown
+      address)
+    * Issue 3176: Some SSE2/SADx4AvgTest.* tests fail on Windows
+    * Issue 3175: Some SSE2/SADSkipTest.* tests fail on Windows
+
+2021-10-13 v3.2.0
+  This release includes compression efficiency and perceptual quality
+  improvements, speedup and memory optimizations, as well as some new
+  features.
+
+  - New Features
+    * Introduced speeds 7, 8, and 9 for all intra mode.
+    * Introduced speed 10 for real time mode.
+    * Introduced an API that allows external partition decisions.
+    * SVC: added support for compound prediction.
+    * SVC: added support for fixed SVC modes.
+
+  - Compression Efficiency Improvements
+    * Intra-mode search improvement.
+    * Improved real time (RT) mode BDrate savings by ~5% (RT speed 5)
+      and ~12% (RT speed 6). The improvement was measured on the video
+      conference set.
+    * Improved real time mode for nonrd path (speed 7, 8, 9): BDrate
+      gains of ~3-5%.
+    * Rate control and RD adjustments based on ML research in VP9.
+      Gains of ~0.5-1.0% for HD.
+
+  - Perceptual Quality Improvements
+    * Added a new mode --deltaq-mode=3 to improve perceptual quality
+      based on a differential contrast model for still images.
+    * Added a new mode –deltaq-mode=4 to improve perceptual quality
+      based on user rated cq_level data set for still images.
+    * Weighting of some intra mode and partition size choices to better
+      manage and retain texture.
+
+  - Speedup and Memory Optimizations
+    * Further improved 2-pass good quality encoder speed:
+      o Speed 2 speedup: 18%
+      o Speed 3 speedup: 22%
+      o Speed 4 speedup: 37%
+      o Speed 5 speedup: 30%
+      o Speed 6 speedup: 20%
+    * Optimized the real time encoder (measured on the video conference
+      set):
+      o RT speed 5 speedup: 110%
+      o RT speed 6 speedup: 77%
+
+  - Bug Fixes
+    * Issue 3069: Fix one-pass mode keyframe placement off-by-one error.
+    * Issue 3156: Fix a bug in av1_quantize_lp AVX2 optimization.
+
+2021-09-29 v3.1.3
+  This release includes several bug fixes.
+
+  - Bug fixes:
+    The following four cmake changes should help the people building
+    libaom using MSVC.
+      1. exports: use CMAKE_SHARED_LIBRARY_PREFIX to determine lib name
+         https://aomedia-review.googlesource.com/c/aom/+/142342
+      2. aom_install: Install lib dlls to bindir
+         https://aomedia-review.googlesource.com/c/aom/+/146546
+      3. aom_install: use relpath for install
+         https://aomedia-review.googlesource.com/c/aom/+/146550
+      4. aom_install: don't exclude msvc from install
+         https://aomedia-review.googlesource.com/c/aom/+/146547
+
+    aom/aom_encoder.h: remove configure option reference
+    https://aomedia-review.googlesource.com/c/aom/+/146743
+
+    Issue 3113: Tests for detecting chroma subsampling in
+    av1_copy_and_extend_frame() do not work when y_width or y_height is
+    1
+
+    Issue 3115: image2yuvconfig() should calculate uv_crop_width and
+    uv_crop_height from y_crop_width and y_crop_height
+
+    Issue 3140: rc_overshoot_pct is documented as having a range of
+    0-1000, but is range checked against 0-100
+
+    Issue 3147: Build failure on Apple M1 arm64
+
+2021-07-20 v3.1.2
+  This release includes several bug fixes.
+
+  - Bug fixes:
+    exports.cmake: use APPLE and WIN32 and use def for mingw-w64
+    https://aomedia-review.googlesource.com/c/aom/+/139882
+
+    Issue 2993: Incorrect spatial_id when decoding base layer of
+    multi-layer stream
+
+    Issue 3080: Chroma Resampling by Encoder on Y4M Inputs Files Tagged
+    as C420mpeg2
+
+    Issue 3081: Use of uninitialized value $version_extra in
+    concatenation (.) or string at aom/build/cmake/version.pl line 88.
+
+2021-06-08 v3.1.1
+  This release includes several bug fixes.
+
+  - Bug fixes:
+    Issue 2965: Cherry-picked the following four commits for the
+    tune=butteraugli mode.
+      1. Add libjxl to pkg_config if enabled:
+         https://aomedia-review.googlesource.com/c/aom/+/136044
+      2. Declare set_mb_butteraugli_rdmult_scaling static:
+         https://aomedia-review.googlesource.com/c/aom/+/134506
+      3. Add color range detection in tune=butteraugli mode:
+         https://aomedia-review.googlesource.com/c/aom/+/135521
+      4. Enable tune=butteraugli in all-intra mode:
+         https://aomedia-review.googlesource.com/c/aom/+/136082
+
+    Issue 3021: Fix vmaf model initialization error when not set to
+    tune=vmaf
+
+    Issue 3050: Compilation fails with -DCONFIG_TUNE_VMAF=1
+
+    Issue 3054: Consistent crash on near-static screen content, keyframe
+    related
+
+2021-05-03 v3.1.0
+  This release adds an "all intra" mode to the encoder, which significantly
+  speeds up the encoding of AVIF still images at speed 6.
+
+  - Upgrading:
+    All intra mode for encoding AVIF still images and AV1 all intra videos:
+    AOM_USAGE_ALL_INTRA (2) can be passed as the 'usage' argument to
+    aom_codec_enc_config_default().
+
+    New encoder control IDs added:
+      - AV1E_SET_ENABLE_DIAGONAL_INTRA: Enable diagonal (D45 to D203) intra
+        prediction modes (0: false, 1: true (default)). Also available as
+        "enable-diagonal-intra" for the aom_codec_set_option() function.
+
+    New aom_tune_metric enum value: AOM_TUNE_BUTTERAUGLI. The new aomenc option
+    --tune=butteraugli was added to optimize the encoder’s perceptual quality by
+    optimizing the Butteraugli metric. Install libjxl (JPEG XL) and then pass
+    -DCONFIG_TUNE_BUTTERAUGLI=1 to the cmake command to enable it.
+
+    Addition of support for libvmaf 2.x.
+
+  - Enhancements:
+    Heap memory consumption for encoding AVIF still images is significantly
+    reduced.
+
+  - Bug fixes:
+    Issue 2601: third_party/libaom fails licensecheck
+
+    Issue 2950: Conditional expression for rc->this_key_frame_forced is always
+    true in find_next_key_frame()
+
+    Issue 2988: "make install" installs the aom.h header twice
+
+    Issue 2992: Incorrectly printing the temporal_id twice in dump_obu tool
+
+    Issue 2998:
+
+    Issue 2999:
+
+    Issue 3000:
+
+2021-02-24 v3.0.0
+  This release includes compression efficiency improvement, speed improvement
+  for realtime mode, as well as some new APIs.
+
+  - Upgrading:
+    Support for PSNR calculation based on stream bit-depth.
+
+    New encoder control IDs added:
+      - AV1E_SET_ENABLE_RECT_TX
+      - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP
+      - AV1E_GET_BASELINE_GF_INTERVAL
+      - AV1E_SET_ENABLE_DNL_DENOISING
+
+    New decoder control IDs added:
+      - AOMD_GET_FWD_KF_PRESENT
+      - AOMD_GET_FRAME_FLAGS
+      - AOMD_GET_ALTREF_PRESENT
+      - AOMD_GET_TILE_INFO
+      - AOMD_GET_SCREEN_CONTENT_TOOLS_INFO
+      - AOMD_GET_STILL_PICTURE
+      - AOMD_GET_SB_SIZE
+      - AOMD_GET_SHOW_EXISTING_FRAME_FLAG
+      - AOMD_GET_S_FRAME_INFO
+
+    New aom_tune_content enum value: AOM_CONTENT_FILM
+
+    New aom_tune_metric enum value: AOM_TUNE_VMAF_NEG_MAX_GAIN
+
+    Coefficient and mode update can be turned off via
+    AV1E_SET_{COEFF/MODE}_COST_UPD_FREQ.
+
+    New key & value API added, available with aom_codec_set_option() function.
+
+    Scaling API expanded to include 1/4, 3/4 and 1/8.
+
+  - Enhancements:
+    Better multithreading performance with realtime mode.
+
+    New speed 9 setting for faster realtime encoding.
+
+    Smaller binary size with low bitdepth and realtime only build.
+
+    Temporal denoiser and its optimizations on x86 and Neon.
+
+    Optimizations for scaling.
+
+    Faster encoding with speed settings 2 to 6 for good encoding mode.
+
+    Improved documentation throughout the library, with function level
+    documentation, tree view and support for the dot tool.
+
+  - Bug fixes:
+    Aside from those mentioned in v2.0.1 and v2.0.2, this release includes the
+    following bug fixes:
+
+    Issue 2940: Segfault when encoding with --use-16bit-internal and --limit > 1
+
+    Issue 2941: Decoder mismatch with --rt --bit-depth=10 and --cpu-used=8
+
+    Issue 2895: mingw-w64 i686 gcc fails to build
+
+    Issue 2874: Separate ssse3 functions from sse2 file.
+
 2021-02-09 v2.0.2
   This release includes several bug fixes.
 
diff --git a/media/libaom/src/CMakeLists.txt b/media/libaom/src/CMakeLists.txt
index 84c8995a50..92bae7df43 100644
--- a/media/libaom/src/CMakeLists.txt
+++ b/media/libaom/src/CMakeLists.txt
@@ -8,21 +8,14 @@
 # License 1.0 was not distributed with this source code in the PATENTS file, you
 # can obtain it at www.aomedia.org/license/patent.
 #
-cmake_minimum_required(VERSION 3.5)
-project(AOM C CXX)
-
-if(NOT EMSCRIPTEN)
-  if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-    set(CMAKE_BUILD_TYPE
-        "Release"
-        CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel"
-              FORCE)
-  endif()
+if(CONFIG_TFLITE)
+  cmake_minimum_required(VERSION 3.11)
+else()
+  cmake_minimum_required(VERSION 3.7)
 endif()
 
 set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
 set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-
 if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
   message(
     FATAL_ERROR "Building from within the aom source tree is not supported.\n"
@@ -32,12 +25,48 @@ if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
                 "And re-run CMake from the aom_build directory.")
 endif()
 
-# Updating version info.
+project(AOM C CXX)
+
+# GENERATED source property global visibility.
+if(POLICY CMP0118)
+  cmake_policy(SET CMP0118 NEW)
+endif()
+
+if(NOT EMSCRIPTEN)
+  if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE
+        "Release"
+        CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel"
+              FORCE)
+  endif()
+endif()
+
+# Library version info. Update LT_CURRENT, LT_REVISION and LT_AGE when making a
+# public release by following the guidelines in the libtool document:
 # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
-set(SO_VERSION 2)
-set(SO_FILE_VERSION 2.0.2)
+#
+# c=<current>, r=<revision>, a=<age>
+#
+# libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# We set SO_FILE_VERSION = [c-a].a.r
+set(LT_CURRENT 7)
+set(LT_REVISION 0)
+set(LT_AGE 4)
+math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
+set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
+unset(LT_CURRENT)
+unset(LT_REVISION)
+unset(LT_AGE)
+
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
 
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
+if(CONFIG_THREE_PASS)
+  include("${AOM_ROOT}/common/ivf_dec.cmake")
+endif()
 include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
 include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
 include("${AOM_ROOT}/aom_ports/aom_ports.cmake")
@@ -88,6 +117,7 @@ list(APPEND AOM_LIBYUV_SOURCES
             "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h"
             "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h"
             "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h"
+            "${AOM_ROOT}/third_party/libyuv/source/convert_argb.cc"
             "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc"
             "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc"
             "${AOM_ROOT}/third_party/libyuv/source/row_any.cc"
@@ -104,7 +134,8 @@ list(APPEND AOM_LIBYUV_SOURCES
             "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc"
             "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc"
             "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc"
-            "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc")
+            "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_uv.cc")
 
 list(APPEND AOM_SOURCES
             "${AOM_CONFIG_DIR}/config/aom_config.c"
@@ -113,6 +144,7 @@ list(APPEND AOM_SOURCES
             "${AOM_ROOT}/aom/aom_codec.h"
             "${AOM_ROOT}/aom/aom_decoder.h"
             "${AOM_ROOT}/aom/aom_encoder.h"
+            "${AOM_ROOT}/aom/aom_external_partition.h"
             "${AOM_ROOT}/aom/aom_frame_buffer.h"
             "${AOM_ROOT}/aom/aom_image.h"
             "${AOM_ROOT}/aom/aom_integer.h"
@@ -127,6 +159,10 @@ list(APPEND AOM_SOURCES
             "${AOM_ROOT}/aom/src/aom_integer.c")
 
 list(APPEND AOM_COMMON_APP_UTIL_SOURCES
+            "${AOM_ROOT}/av1/arg_defs.c"
+            "${AOM_ROOT}/av1/arg_defs.h"
+            "${AOM_ROOT}/common/args_helper.c"
+            "${AOM_ROOT}/common/args_helper.h"
             "${AOM_ROOT}/common/args.c"
             "${AOM_ROOT}/common/args.h"
             "${AOM_ROOT}/common/av1_config.c"
@@ -139,10 +175,11 @@ list(APPEND AOM_COMMON_APP_UTIL_SOURCES
             "${AOM_ROOT}/common/rawenc.c"
             "${AOM_ROOT}/common/rawenc.h"
             "${AOM_ROOT}/common/y4menc.c"
-            "${AOM_ROOT}/common/y4menc.h")
+            "${AOM_ROOT}/common/y4menc.h"
+            "${AOM_ROOT}/common/ivfdec.c"
+            "${AOM_ROOT}/common/ivfdec.h")
 
-list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c"
-            "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c"
+list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/obudec.c"
             "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c"
             "${AOM_ROOT}/common/video_reader.h")
 
@@ -173,6 +210,10 @@ list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc"
 include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps
                     ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats)
 
+if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK)
+  include_directories(${ANDROID_NDK}/sources/android/cpufeatures)
+endif()
+
 # Targets
 add_library(aom_version ${AOM_VERSION_SOURCES})
 add_dummy_source_file_to_target(aom_version c)
@@ -263,13 +304,48 @@ if(NOT MSVC AND NOT APPLE)
   endif()
 endif()
 
+if(CONFIG_AV1_RC_RTC AND CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
+  list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h"
+              "${AOM_ROOT}/av1/ratectrl_rtc.cc")
+  add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
+  target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
+  if(NOT MSVC AND NOT APPLE)
+    target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m)
+  endif()
+endif()
+
+if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+  list(APPEND AOM_AV1_RC_QMODE_SOURCES
+              "${AOM_ROOT}/av1/ratectrl_qmode_interface.h"
+              "${AOM_ROOT}/av1/ratectrl_qmode_interface.cc"
+              "${AOM_ROOT}/av1/reference_manager.h"
+              "${AOM_ROOT}/av1/reference_manager.cc"
+              "${AOM_ROOT}/av1/ratectrl_qmode.h"
+              "${AOM_ROOT}/av1/ratectrl_qmode.cc")
+  add_library(av1_rc_qmode ${AOM_AV1_RC_QMODE_SOURCES})
+  target_link_libraries(av1_rc_qmode ${AOM_LIB_LINK_TYPE} aom)
+  if(NOT MSVC AND NOT APPLE)
+    target_link_libraries(av1_rc_qmode ${AOM_LIB_LINK_TYPE} m)
+  endif()
+  set_target_properties(av1_rc_qmode PROPERTIES LINKER_LANGUAGE CXX)
+endif()
+
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom)
+if(CONFIG_AV1_RC_RTC AND CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc)
+endif()
+if(CONFIG_AV1_ENCODER AND NOT CONFIG_REALTIME_ONLY AND NOT BUILD_SHARED_LIBS)
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} av1_rc_qmode)
+endif()
 if(BUILD_SHARED_LIBS)
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static)
 endif()
 
 # Setup dependencies.
+if(CONFIG_THREE_PASS)
+  setup_ivf_dec_targets()
+endif()
 setup_aom_dsp_targets()
 setup_aom_mem_targets()
 setup_aom_ports_targets()
@@ -297,19 +373,23 @@ file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc"
 #
 if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
   add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+  set_property(TARGET ${example} PROPERTY FOLDER examples)
   if(CONFIG_AV1_DECODER)
     add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+    set_property(TARGET ${example} PROPERTY FOLDER examples)
     # obudec depends on internal headers that require *rtcd.h
     add_dependencies(aom_decoder_app_util aom_rtcd)
   endif()
   if(CONFIG_AV1_ENCODER)
     add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
+    set_property(TARGET ${example} PROPERTY FOLDER examples)
   endif()
 endif()
 
 if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
   add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
                              $<TARGET_OBJECTS:aom_common_app_util>)
+  set_property(TARGET ${example} PROPERTY FOLDER examples)
   list(APPEND AOM_APP_TARGETS resize_util)
 endif()
 
@@ -376,6 +456,14 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
   list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS})
 endif()
 
+if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI)
+  add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
+  if(NOT MSVC)
+    target_compile_options(yuv PRIVATE -Wno-unused-parameter)
+  endif()
+  include_directories("${AOM_ROOT}/third_party/libyuv/include")
+endif()
+
 if(CONFIG_AV1_ENCODER)
   if(ENABLE_EXAMPLES)
     add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c"
@@ -397,6 +485,10 @@ if(CONFIG_AV1_ENCODER)
     add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
                                $<TARGET_OBJECTS:aom_common_app_util>
                                $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(photon_noise_table
+                   "${AOM_ROOT}/examples/photon_noise_table.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
                                     $<TARGET_OBJECTS:aom_common_app_util>
                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
@@ -407,8 +499,8 @@ if(CONFIG_AV1_ENCODER)
 
     # Maintain a list of encoder example targets.
     list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
-                set_maps simple_encoder scalable_encoder twopass_encoder
-                svc_encoder_rtc)
+                photon_noise_table set_maps simple_encoder scalable_encoder
+                twopass_encoder svc_encoder_rtc)
   endif()
 
   if(ENABLE_TOOLS)
@@ -432,17 +524,131 @@ if(CONFIG_AV1_ENCODER)
   list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
               ${AOM_ENCODER_TOOL_TARGETS})
 
+  if(CONFIG_TUNE_BUTTERAUGLI)
+    find_package(PkgConfig)
+    # Use find_library() with STATIC_LINK_JXL for static build since
+    # pkg_check_modules() with LIBJXL_STATIC is not working.
+    if(STATIC_LINK_JXL OR NOT PKG_CONFIG_FOUND)
+      find_library(LIBJXL_LIBRARIES libjxl.a)
+      find_library(LIBHWY_LIBRARIES libhwy.a)
+      find_library(LIBSKCMS_LIBRARIES libskcms.a)
+      find_library(LIBBROTLICOMMON_LIBRARIES libbrotlicommon-static.a)
+      find_library(LIBBROTLIENC_LIBRARIES libbrotlienc-static.a)
+      find_library(LIBBROTLIDEC_LIBRARIES libbrotlidec-static.a)
+      find_path(LIBJXL_INCLUDE_DIRS butteraugli.h PATH_SUFFIXES jxl)
+      if(LIBJXL_LIBRARIES
+         AND LIBHWY_LIBRARIES
+         AND LIBSKCMS_LIBRARIES
+         AND LIBBROTLICOMMON_LIBRARIES
+         AND LIBBROTLIENC_LIBRARIES
+         AND LIBBROTLIDEC_LIBRARIES
+         AND LIBJXL_INCLUDE_DIRS)
+        message(STATUS "Found JXL library: ${LIBJXL_LIBRARIES} "
+                       "${LIBHWY_LIBRARIES} ${LIBSKCMS_LIBRARIES} "
+                       "${LIBBROTLICOMMON_LIBRARIES} ${LIBBROTLIENC_LIBRARIES}"
+                       "${LIBBROTLIDEC_LIBRARIES}")
+        message(STATUS "Found JXL include: ${LIBJXL_INCLUDE_DIRS}")
+      else()
+        message(FATAL_ERROR "JXL library not found.")
+      endif()
+      target_link_libraries(aom
+                            PRIVATE ${LIBJXL_LIBRARIES} ${LIBHWY_LIBRARIES}
+                                    ${LIBSKCMS_LIBRARIES}
+                                    ${LIBBROTLIENC_LIBRARIES}
+                                    ${LIBBROTLIDEC_LIBRARIES}
+                                    ${LIBBROTLICOMMON_LIBRARIES})
+      target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS})
+    else()
+      pkg_check_modules(LIBJXL REQUIRED libjxl)
+      target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS} ${LIBJXL_LIBRARIES})
+      target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS})
+      if(LIBJXL_CFLAGS)
+        append_compiler_flag("${LIBJXL_CFLAGS}")
+      endif()
+      pkg_check_modules(LIBHWY REQUIRED libhwy)
+      target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS} ${LIBHWY_LIBRARIES})
+      target_include_directories(aom_dsp_encoder
+                                 PRIVATE ${LIBLIBHWY_INCLUDE_DIRS})
+      if(LIBHWY_CFLAGS)
+        append_compiler_flag("${LIBHWY_CFLAGS}")
+      endif()
+    endif()
+
+    set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
+    if(BUILD_SHARED_LIBS)
+      set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
+    endif()
+
+    list(APPEND AOM_LIB_TARGETS yuv)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:yuv>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:yuv>)
+    endif()
+  endif()
+
+  if(CONFIG_TFLITE)
+    include(FetchContent)
+
+    set(TFLITE_TAG "v2.6.1")
+
+    message(STATUS "Fetching TFLite ${TFLITE_TAG}...")
+
+    # static linking makes life with TFLite much easier
+    set(TFLITE_C_BUILD_SHARED_LIBS OFF)
+
+    # We don't care about comparing against these delegates (yet), and disabling
+    # it reduces compile time meaningfully
+    set(TFLITE_ENABLE_RUY OFF)
+    set(TFLITE_ENABLE_XNNPACK OFF)
+
+    fetchcontent_declare(tflite
+                         GIT_REPOSITORY https://github.com/tensorflow/tensorflow
+                         GIT_TAG ${TFLITE_TAG}
+                         GIT_SHALLOW TRUE)
+
+    fetchcontent_getproperties(tflite)
+    if(NOT tflite_POPULATED)
+      fetchcontent_populate(tflite)
+      # Some of the subprojects (e.g. Eigen) are very noisy and emit status
+      # messages all the time. Temporary ignore status messages while adding
+      # this to silence it. Ugly but effective.
+      set(OLD_CMAKE_MESSAGE_LOG_LEVEL ${CMAKE_MESSAGE_LOG_LEVEL})
+      set(CMAKE_MESSAGE_LOG_LEVEL WARNING)
+      add_subdirectory(${tflite_SOURCE_DIR}/tensorflow/lite/c
+                       ${tflite_BINARY_DIR})
+      set(CMAKE_MESSAGE_LOG_LEVEL ${OLD_CMAKE_MESSAGE_LOG_LEVEL})
+    endif()
+
+    # Disable some noisy warnings in tflite
+    target_compile_options(tensorflow-lite PRIVATE -w)
+
+    # tensorflowlite_c is implicitly declared by this FetchContent
+    include_directories(${tflite_SOURCE_DIR})
+    target_link_libraries(aom PRIVATE tensorflow-lite)
+  endif()
+
   if(CONFIG_TUNE_VMAF)
-    find_library(VMAF libvmaf.a vmaf)
-    if(NOT VMAF)
-      message(FATAL_ERROR "VMAF library not found.")
+    find_package(PkgConfig)
+    if(PKG_CONFIG_FOUND)
+      pkg_check_modules(VMAF REQUIRED libvmaf)
+      if(BUILD_SHARED_LIBS)
+        target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES})
+      else()
+        target_link_libraries(aom
+                              PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES} -static)
+      endif()
+      target_include_directories(aom PRIVATE ${VMAF_INCLUDE_DIRS})
+      target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+      if(VMAF_CFLAGS)
+        append_compiler_flag("${VMAF_CFLAGS}")
+      endif()
+    else()
+      message(FATAL_ERROR "CONFIG_TUNE_VMAF error: pkg-config not found.")
     endif()
-    message("-- Found VMAF library: " ${VMAF})
     set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
     if(BUILD_SHARED_LIBS)
       set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
     endif()
-    target_link_libraries(aom PRIVATE ${VMAF})
   endif()
 endif()
 
@@ -524,12 +730,6 @@ endforeach()
 
 if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
   if(CONFIG_LIBYUV)
-    add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
-    if(NOT MSVC)
-      target_compile_options(yuv PRIVATE -Wno-unused-parameter)
-    endif()
-    include_directories("${AOM_ROOT}/third_party/libyuv/include")
-
     # Add to existing targets.
     foreach(aom_app ${AOM_APP_TARGETS})
       target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
@@ -622,6 +822,17 @@ if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
 endif()
 
 if(BUILD_SHARED_LIBS)
+  if(NOT WIN32 AND NOT APPLE)
+    # The -z defs linker option reports unresolved symbol references from object
+    # files when building a shared library.
+    if("${CMAKE_VERSION}" VERSION_LESS "3.13")
+      # target_link_options() is not available before CMake 3.13.
+      target_link_libraries(aom PRIVATE -Wl,-z,defs)
+    else()
+      target_link_options(aom PRIVATE LINKER:-z,defs)
+    endif()
+  endif()
+
   include("${AOM_ROOT}/build/cmake/exports.cmake")
   setup_exports_target()
 endif()
@@ -630,13 +841,44 @@ endif()
 set_user_flags()
 
 # Aomedia documentation rule.
+set(DOXYGEN_VERSION_VALUE 0)
 if(ENABLE_DOCS)
   include(FindDoxygen)
   if(DOXYGEN_FOUND)
+    # Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
+    set(MINIMUM_DOXYGEN_VERSION 1008010)
+
+    if(DOXYGEN_VERSION)
+      # Strip SHA1 from version string if present.
+      string(REGEX
+             REPLACE "^([0-9]+\\.[0-9]+\\.[0-9]+).*" "\\1" DOXYGEN_VERSION
+                     ${DOXYGEN_VERSION})
+      # Replace dots with semicolons to create a list.
+      string(REGEX REPLACE "\\." ";" DOXYGEN_VERSION_LIST ${DOXYGEN_VERSION})
+      # Parse version components from the list.
+      list(GET DOXYGEN_VERSION_LIST 0 DOXYGEN_MAJOR)
+      list(GET DOXYGEN_VERSION_LIST 1 DOXYGEN_MINOR)
+      list(GET DOXYGEN_VERSION_LIST 2 DOXYGEN_PATCH)
+    endif()
+
+    # Construct a version value for comparison.
+    math(EXPR DOXYGEN_MAJOR "${DOXYGEN_MAJOR}*1000000")
+    math(EXPR DOXYGEN_MINOR "${DOXYGEN_MINOR}*1000")
+    math(EXPR DOXYGEN_VERSION_VALUE
+         "${DOXYGEN_MAJOR} + ${DOXYGEN_MINOR} + ${DOXYGEN_PATCH}")
+
+    if(${DOXYGEN_VERSION_VALUE} LESS ${MINIMUM_DOXYGEN_VERSION})
+      set(DOXYGEN_FOUND NO)
+    endif()
+  endif()
+
+  if(DOXYGEN_FOUND)
     include("${AOM_ROOT}/docs.cmake")
     setup_documentation_targets()
   else()
-    message("--- Cannot find doxygen, ENABLE_DOCS turned off.")
+    message(
+      "--- Cannot find doxygen(version 1.8.10 or newer), ENABLE_DOCS turned off."
+      )
     set(ENABLE_DOCS OFF)
   endif()
 endif()
@@ -652,12 +894,14 @@ endif()
 if(ENABLE_EXAMPLES)
   foreach(example ${AOM_EXAMPLE_TARGETS})
     list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
+    set_property(TARGET ${example} PROPERTY FOLDER examples)
   endforeach()
 endif()
 
 if(ENABLE_TOOLS)
   foreach(tool ${AOM_TOOL_TARGETS})
     list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
+    set_property(TARGET ${tool} PROPERTY FOLDER tools)
   endforeach()
 endif()
 
@@ -694,6 +938,10 @@ foreach(var ${all_cmake_vars})
   endif()
 endforeach()
 
+if(NOT CONFIG_AV1_DECODER)
+  list(FILTER aom_source_vars EXCLUDE REGEX "_DECODER_")
+endif()
+
 # Libaom_srcs.txt generation.
 set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt")
 file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n")
@@ -703,6 +951,9 @@ foreach(aom_source_var ${aom_source_vars})
   foreach(file ${${aom_source_var}})
     if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
       string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+      if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder")
+        continue()
+      endif()
       file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
     endif()
   endforeach()
@@ -733,6 +984,9 @@ foreach(aom_source_var ${aom_source_vars})
     if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
       string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file
                      "${file}")
+      if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder")
+        continue()
+      endif()
       file(APPEND "${libaom_srcs_gni_file}" "  \"${file}\",\n")
     endif()
   endforeach()
diff --git a/media/libaom/src/README.md b/media/libaom/src/README.md
index cf057ae6c4..0146003db9 100644
--- a/media/libaom/src/README.md
+++ b/media/libaom/src/README.md
@@ -1,3 +1,5 @@
+README.md                {#LREADME}
+=========
 # AV1 Codec Library
 
 ## Contents
@@ -40,23 +42,24 @@
 5. [Support](#support)
 6. [Bug reports](#bug-reports)
 
-## Building the library and applications
+## Building the library and applications {#building-the-library-and-applications}
 
-### Prerequisites
+### Prerequisites {#prerequisites}
 
- 1. [CMake](https://cmake.org) version 3.5 or higher.
+ 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version
+    required.
  2. [Git](https://git-scm.com/).
  3. [Perl](https://www.perl.org/).
  4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a
     recent version of [nasm](http://www.nasm.us/). If you download yasm with
     the intention to work with Visual Studio, please download win32.exe or
     win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe.
- 5. Building the documentation requires [doxygen](http://doxygen.org).
- 6. Building the unit tests requires [Python](https://www.python.org/).
- 7. Emscripten builds require the portable
+ 5. Building the documentation requires
+   [doxygen version 1.8.10 or newer](http://doxygen.org).
+ 6. Emscripten builds require the portable
    [EMSDK](https://kripken.github.io/emscripten-site/index.html).
 
-### Get the code
+### Get the code {#get-the-code}
 
 The AV1 library source code is stored in the Alliance for Open Media Git
 repository:
@@ -67,7 +70,7 @@ repository:
     $ cd aom
 ~~~
 
-### Basic build
+### Basic build {#basic-build}
 
 CMake replaces the configure step typical of many projects. Running CMake will
 produce configuration and build files for the currently selected CMake
@@ -85,7 +88,7 @@ successfully. The compiler chosen varies by host platform, but a general rule
 applies: On systems where cc and c++ are present in $PATH at the time CMake is
 run the generated build will use cc and c++ by default.
 
-### Configuration options
+### Configuration options {#configuration-options}
 
 The AV1 codec library has a great many configuration options. These come in two
 varieties:
@@ -106,7 +109,7 @@ configuration options can be found at the top of the CMakeLists.txt file found
 in the root of the AV1 repository, and AV1 codec configuration options can
 currently be found in the file `build/cmake/aom_config_defaults.cmake`.
 
-### Dylib builds
+### Dylib builds {#dylib-builds}
 
 A dylib (shared object) build of the AV1 codec library can be enabled via the
 CMake built in variable `BUILD_SHARED_LIBS`:
@@ -118,7 +121,7 @@ CMake built in variable `BUILD_SHARED_LIBS`:
 
 This is currently only supported on non-Windows targets.
 
-### Debugging
+### Debugging {#debugging}
 
 Depending on the generator used there are multiple ways of going about
 debugging AV1 components. For single configuration generators like the Unix
@@ -147,7 +150,7 @@ generic at generation time:
     $ cmake path/to/aom -DAOM_TARGET_CPU=generic
 ~~~
 
-### Cross compiling
+### Cross compiling {#cross-compiling}
 
 For the purposes of building the AV1 codec and applications and relative to the
 scope of this guide, all builds for architectures differing from the native host
@@ -197,7 +200,7 @@ In addition to the above it's important to note that the toolchain files
 suffixed with gcc behave differently than the others. These toolchain files
 attempt to obey the $CROSS environment variable.
 
-### Sanitizers
+### Sanitizers {#sanitizers}
 
 Sanitizer integration is built-in to the CMake build system. To enable a
 sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to
@@ -211,7 +214,7 @@ enable address sanitizer:
 Sanitizers available vary by platform, target, and compiler. Consult your
 compiler documentation to determine which, if any, are available.
 
-### Microsoft Visual Studio builds
+### Microsoft Visual Studio builds {#microsoft-visual-studio-builds}
 
 Building the AV1 codec library in Microsoft Visual Studio is supported. Visual
 Studio 2017 (15.0) or later is required. The following example demonstrates
@@ -241,7 +244,7 @@ generating projects and a solution for the Microsoft IDE:
 NOTE: The build system targets Windows 7 or later by compiling files with
 `-D_WIN32_WINNT=0x0601`.
 
-### Xcode builds
+### Xcode builds {#xcode-builds}
 
 Building the AV1 codec library in Xcode is supported. The following example
 demonstrates generating an Xcode project:
@@ -250,7 +253,7 @@ demonstrates generating an Xcode project:
     $ cmake path/to/aom -G Xcode
 ~~~
 
-### Emscripten builds
+### Emscripten builds {#emscripten-builds}
 
 Building the AV1 codec library with Emscripten is supported. Typically this is
 used to hook into the AOMAnalyzer GUI application. These instructions focus on
@@ -261,7 +264,7 @@ It is assumed here that you have already downloaded and installed the EMSDK,
 installed and activated at least one toolchain, and setup your environment
 appropriately using the emsdk\_env script.
 
-1. Download [AOMAnalyzer](https://people.xiph.org/~mbebenita/analyzer/).
+1. Build [AOM Analyzer](https://github.com/xiph/aomanalyzer).
 
 2. Configure the build:
 
@@ -293,7 +296,7 @@ appropriately using the emsdk\_env script.
     $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file
 ~~~
 
-### Extra build flags
+### Extra build flags {#extra-build-flags}
 
 Three variables allow for passing of additional flags to the build system.
 
@@ -312,10 +315,10 @@ These flags can be used, for example, to enable asserts in a release build:
         -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
 ~~~
 
-### Build with VMAF support
+### Build with VMAF support {#build-with-vmaf}
 
 After installing
-[libvmaf.a](https://github.com/Netflix/vmaf/blob/master/resource/doc/libvmaf.md),
+[libvmaf.a](https://github.com/Netflix/vmaf/tree/master/libvmaf),
 you can use it with the encoder:
 
 ~~~
@@ -323,22 +326,22 @@ you can use it with the encoder:
 ~~~
 
 Please note that the default VMAF model
-("/usr/local/share/model/vmaf_v0.6.1.pkl")
+("/usr/local/share/model/vmaf_v0.6.1.json")
 will be used unless you set the following flag when running the encoder:
 
 ~~~
     # --vmaf-model-path=path/to/model
 ~~~
 
-## Testing the AV1 codec
+## Testing the AV1 codec {#testing-the-av1-codec}
 
-### Testing basics
+### Testing basics {#testing-basics}
 
 There are several methods of testing the AV1 codec. All of these methods require
 the presence of the AV1 source code and a working build of the AV1 library and
 applications.
 
-#### 1. Unit tests:
+#### 1. Unit tests: {#1_unit-tests}
 
 The unit tests can be run at build time:
 
@@ -352,7 +355,7 @@ The unit tests can be run at build time:
     $ make runtests
 ~~~
 
-#### 2. Example tests:
+#### 2. Example tests: {#2_example-tests}
 
 The example tests require a bash shell and can be run in the following manner:
 
@@ -367,7 +370,7 @@ The example tests require a bash shell and can be run in the following manner:
     $ path/to/aom/test/examples.sh --bin-path examples
 ~~~
 
-#### 3. Encoder tests:
+#### 3. Encoder tests: {#3_encoder-tests}
 
 When making a change to the encoder run encoder tests to confirm that your
 change has a positive or negligible impact on encode quality. When running these
@@ -418,7 +421,7 @@ report that can be viewed in a web browser:
 You can view the report by opening mytweak.html in a web browser.
 
 
-### IDE hosted tests
+### IDE hosted tests {#ide-hosted-tests}
 
 By default the generated projects files created by CMake will not include the
 runtests and testdata rules when generating for IDEs like Microsoft Visual
@@ -434,11 +437,13 @@ options in MSVS and Xcode. To enable the test rules in IDEs the
     $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode
 ~~~
 
-### Downloading the test data
+### Downloading the test data {#downloading-the-test-data}
 
 The fastest and easiest way to obtain the test data is to use CMake to generate
 a build using the Unix Makefiles generator, and then to build only the testdata
-rule:
+rule. By default the test files will be downloaded to the current directory. The
+`LIBAOM_TEST_DATA_PATH` environment variable can be used to set a
+custom one.
 
 ~~~
     $ cmake path/to/aom -G "Unix Makefiles"
@@ -448,7 +453,7 @@ rule:
 
 The above make command will only download and verify the test data.
 
-### Adding a new test data file
+### Adding a new test data file {#adding-a-new-test-data-file}
 
 First, add the new test data file to the `aom-test-data` bucket of the
 `aomedia-testing` project on Google Cloud Platform. You may need to ask someone
@@ -470,19 +475,19 @@ the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1
 checksum of a file can be calculated by running the `sha1sum` command on the
 file.)
 
-### Additional test data
+### Additional test data {#additional-test-data}
 
 The test data mentioned above is strictly intended for unit testing.
 
 Additional input data for testing the encoder can be obtained from:
 https://media.xiph.org/video/derf/
 
-### Sharded testing
+### Sharded testing {#sharded-testing}
 
 The AV1 codec library unit tests are built upon gtest which supports sharding of
 test jobs. Sharded test runs can be achieved in a couple of ways.
 
-#### 1. Running test\_libaom directly:
+#### 1. Running test\_libaom directly: {#1_running-test_libaom-directly}
 
 ~~~
    # Set the environment variable GTEST_TOTAL_SHARDS to control the number of
@@ -496,7 +501,7 @@ test jobs. Sharded test runs can be achieved in a couple of ways.
 To create a test shard for each CPU core available on the current system set
 `GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one.
 
-#### 2. Running the tests via the CMake build:
+#### 2. Running the tests via the CMake build: {#2_running-the-tests-via-the-cmake-build}
 
 ~~~
     # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See
@@ -515,14 +520,14 @@ CMake. A system with 24 cores can run 24 test shards using a value of 24 with
 the `-j` parameter. When CMake is unable to detect the number of cores 10 shards
 is the default maximum value.
 
-## Coding style
+## Coding style {#coding-style}
 
 We are using the Google C Coding Style defined by the
 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
 
 The coding style used by this project is enforced with clang-format using the
 configuration contained in the
-[.clang-format](https://chromium.googlesource.com/webm/aom/+/master/.clang-format)
+[.clang-format](https://chromium.googlesource.com/webm/aom/+/main/.clang-format)
 file in the root of the repository.
 
 You can download clang-format using your system's package manager, or directly
@@ -556,27 +561,27 @@ Some Git installations have clang-format integration. Here are some examples:
     $ git clang-format -f -p
 ~~~
 
-## Submitting patches
+## Submitting patches {#submitting-patches}
 
 We manage the submission of patches using the
 [Gerrit](https://www.gerritcodereview.com/) code review tool. This tool
 implements a workflow on top of the Git version control system to ensure that
 all changes get peer reviewed and tested prior to their distribution.
 
-### Login cookie
+### Login cookie {#login-cookie}
 
 Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with
 your account (Gmail credentials, for example). Next, follow the
 `Generate Password` Password link at the top of the page. You’ll be given
 instructions for creating a cookie to use with our Git repos.
 
-### Contributor agreement
+### Contributor agreement {#contributor-agreement}
 
 You will be required to execute a
 [contributor agreement](http://aomedia.org/license) to ensure that the AOMedia
 Project has the right to distribute your changes.
 
-### Testing your code
+### Testing your code {#testing-your-code}
 
 The testing basics are covered in the [testing section](#testing-the-av1-codec)
 above.
@@ -584,7 +589,7 @@ above.
 In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run
 through Jenkins instances upon upload to gerrit.
 
-### Commit message hook
+### Commit message hook {#commit-message-hook}
 
 Gerrit requires that each submission include a unique Change-Id. You can assign
 one manually using git commit --amend, but it’s easier to automate it with the
@@ -604,15 +609,15 @@ See the Gerrit
 [documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html)
 for more information.
 
-### Upload your change
+### Upload your change {#upload-your-change}
 
 The command line to upload your patch looks like this:
 
 ~~~
-    $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master
+    $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/main
 ~~~
 
-### Incorporating reviewer comments
+### Incorporating reviewer comments {#incorporating-reviewer-comments}
 
 If you previously uploaded a change to Gerrit and the Approver has asked for
 changes, follow these steps:
@@ -631,7 +636,7 @@ In general, you should not rebase your changes when doing updates in response to
 review. Doing so can make it harder to follow the evolution of your change in
 the diff view.
 
-### Submitting your change
+### Submitting your change {#submitting-your-change}
 
 Once your change has been Approved and Verified, you can “submit” it through the
 Gerrit UI. This will usually automatically rebase your change onto the branch
@@ -648,18 +653,18 @@ must rebase your changes manually:
 If there are any conflicts, resolve them as you normally would with Git. When
 you’re done, reupload your change.
 
-### Viewing the status of uploaded changes
+### Viewing the status of uploaded changes {#viewing-the-status-of-uploaded-changes}
 
 To check the status of a change that you uploaded, open
 [Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My >
 Changes.
 
-## Support
+## Support {#support}
 
 This library is an open source project supported by its community. Please
 please email aomediacodec@jointdevelopment.kavi.com for help.
 
-## Bug reports
+## Bug reports {#bug-reports}
 
 Bug reports can be filed in the Alliance for Open Media
 [issue tracker](https://bugs.chromium.org/p/aomedia/issues/list).
diff --git a/media/libaom/src/aom/aom.h b/media/libaom/src/aom/aom.h
index c591dc9a43..0650a11f6b 100644
--- a/media/libaom/src/aom/aom.h
+++ b/media/libaom/src/aom/aom.h
@@ -41,27 +41,45 @@ extern "C" {
 /*!\brief Control functions
  *
  * The set of macros define the control functions of AOM interface
+ * The range for common control IDs is 230-255(max).
  */
 enum aom_com_control_id {
-  /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of
-   * these values for its control ids, see the NOTEs in aom/aomcx.h. These
-   * should be migrated to something like the AOM_DECODER_CTRL_ID_START range
-   * next time we're ready to break the ABI.
+  /*!\brief Codec control function to get a pointer to a reference frame
+   *
+   * av1_ref_frame_t* parameter
    */
-  AV1_GET_REFERENCE = 128,  /**< get a pointer to a reference frame,
-                               av1_ref_frame_t* parameter */
-  AV1_SET_REFERENCE = 129,  /**< write a frame into a reference buffer,
-                               av1_ref_frame_t* parameter */
-  AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoderm
-                               av1_ref_frame_t* parameter */
-  AOM_COMMON_CTRL_ID_MAX,
-
-  AV1_GET_NEW_FRAME_IMAGE =
-      192, /**< get a pointer to the new frame, aom_image_t* parameter */
-  AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer,
-                                     aom_image_t* parameter */
+  AV1_GET_REFERENCE = 230,
 
+  /*!\brief Codec control function to write a frame into a reference buffer
+   *
+   * av1_ref_frame_t* parameter
+   */
+  AV1_SET_REFERENCE = 231,
+
+  /*!\brief Codec control function to get a copy of reference frame from the
+   * decoder
+   *
+   * av1_ref_frame_t* parameter
+   */
+  AV1_COPY_REFERENCE = 232,
+
+  /*!\brief Codec control function to get a pointer to the new frame
+   *
+   * aom_image_t* parameter
+   */
+  AV1_GET_NEW_FRAME_IMAGE = 233,
+
+  /*!\brief Codec control function to copy the new frame to an external buffer
+   *
+   * aom_image_t* parameter
+   */
+  AV1_COPY_NEW_FRAME_IMAGE = 234,
+
+  /*!\brief Start point of control IDs for aom_dec_control_id.
+   * Any new common control IDs should be added above.
+   */
   AOM_DECODER_CTRL_ID_START = 256
+  // No common control IDs should be added after AOM_DECODER_CTRL_ID_START.
 };
 
 /*!\brief AV1 specific reference frame data struct
diff --git a/media/libaom/src/aom/aom_codec.h b/media/libaom/src/aom/aom_codec.h
index 75f6a1af22..49d48cf153 100644
--- a/media/libaom/src/aom/aom_codec.h
+++ b/media/libaom/src/aom/aom_codec.h
@@ -9,6 +9,57 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+///////////////////////////////////////////////////////////////////////////////
+// Internal implementation details
+///////////////////////////////////////////////////////////////////////////////
+//
+// There are two levels of interfaces used to access the AOM codec: the
+// the aom_codec_iface and the aom_codec_ctx.
+//
+// 1. aom_codec_iface_t
+//    (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
+//    aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
+//    av1/av1_dx_iface.c)
+//
+// Used to initialize the codec context, which contains the configuration for
+// for modifying the encoder/decoder during run-time. See the other
+// documentation in this header file for more details. For the most part,
+// users will call helper functions, such as aom_codec_iface_name,
+// aom_codec_get_caps, etc., to interact with it.
+//
+// The main purpose of the aom_codec_iface_t is to provide a way to generate
+// a default codec config, find out what capabilities the implementation has,
+// and create an aom_codec_ctx_t (which is actually used to interact with the
+// codec).
+//
+// Note that the implementations for the AV1 algorithm are located in
+// av1/av1_cx_iface.c and av1/av1_dx_iface.c
+//
+//
+// 2. aom_codec_ctx_t
+//  (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
+//   aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
+//
+// The actual interface between user code and the codec. It stores the name
+// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
+// initialization flags, a config for either encoder or the decoder, and a
+// pointer to internal data.
+//
+// The codec is configured / queried through calls to aom_codec_control,
+// which takes a control ID (listed in aomcx.h and aomdx.h) and a parameter.
+// In the case of "getter" control IDs, the parameter is modified to have
+// the requested value; in the case of "setter" control IDs, the codec's
+// configuration is changed based on the parameter. Note that a aom_codec_err_t
+// is returned, which indicates if the operation was successful or not.
+//
+// Note that for the encoder, the aom_codec_alg_priv_t points to the
+// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
+// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
+// here and also used in the core algorithm.
+//
+// At the end, aom_codec_destroy should be called for each initialized
+// aom_codec_ctx_t.
+
 /*!\defgroup codec Common Algorithm Interface
  * This abstraction allows applications to easily support multiple video
  * formats with minimal code duplication. This section describes the interface
@@ -23,13 +74,16 @@
  * video codec algorithm.
  *
  * An application instantiates a specific codec instance by using
- * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_dec_init() or aom_codec_enc_init() and a pointer to the
+ * algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern aom_codec_iface_t my_codec;
  *       {
  *           aom_codec_ctx_t algo;
- *           res = aom_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
@@ -95,7 +149,7 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define AOM_CODEC_ABI_VERSION (5 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
 
 /*!\brief Algorithm return codes */
 typedef enum {
@@ -185,13 +239,17 @@ typedef int64_t aom_codec_pts_t;
  * Contains function pointers and other data private to the codec
  * implementation. This structure is opaque to the application. Common
  * functions used with this structure:
- *   - aom_codec_iface_name: get the name of the codec
- *   - aom_codec_get_caps: returns the capabilities of the codec (see
- *     aom_encoder.h for more details)
- *   - aom_codec_enc_config_default: generate the default config to use
- *     when initializing the encoder
+ *   - aom_codec_iface_name(aom_codec_iface_t *iface): get the
+ *     name of the codec
+ *   - aom_codec_get_caps(aom_codec_iface_t *iface): returns
+ *     the capabilities of the codec
+ *   - aom_codec_enc_config_default: generate the default config for
+ *     initializing the encoder (see documention in aom_encoder.h)
  *   - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context
- *     structure (see documentation on aom_codec_ctx for more information).
+ *     structure (see documentation on aom_codec_ctx).
+ *
+ * To get access to the AV1 encoder and decoder, use aom_codec_av1_cx() and
+ *  aom_codec_av1_dx().
  */
 typedef const struct aom_codec_iface aom_codec_iface_t;
 
@@ -202,6 +260,27 @@ typedef const struct aom_codec_iface aom_codec_iface_t;
  */
 typedef struct aom_codec_priv aom_codec_priv_t;
 
+/*!\brief Compressed Frame Flags
+ *
+ * This type represents a bitfield containing information about a compressed
+ * frame that may be useful to an application. The most significant 16 bits
+ * can be used by an algorithm to provide additional detail, for example to
+ * support frame types that are codec specific (MPEG-1 D-frames for example)
+ */
+typedef uint32_t aom_codec_frame_flags_t;
+#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+/*!\brief frame can be dropped without affecting the stream (no future frame
+ * depends on this one) */
+#define AOM_FRAME_IS_DROPPABLE 0x2
+/*!\brief this is an INTRA_ONLY frame */
+#define AOM_FRAME_IS_INTRAONLY 0x10
+/*!\brief this is an S-frame */
+#define AOM_FRAME_IS_SWITCH 0x20
+/*!\brief this is an error-resilient frame */
+#define AOM_FRAME_IS_ERROR_RESILIENT 0x40
+/*!\brief this is a key-frame dependent recovery-point frame */
+#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80
+
 /*!\brief Iterator
  *
  * Opaque storage used for iterating over lists.
@@ -266,31 +345,27 @@ typedef enum aom_superblock_size {
 /*!\brief Return the version information (as an integer)
  *
  * Returns a packed encoding of the library version number. This will only
- * include
- * the major.minor.patch component of the version number. Note that this encoded
- * value should be accessed through the macros provided, as the encoding may
- * change
- * in the future.
+ * include the major.minor.patch component of the version number. Note that this
+ * encoded value should be accessed through the macros provided, as the encoding
+ * may change in the future.
  *
  */
 int aom_codec_version(void);
 
-/*!\brief Return the version major number */
+/*!\brief Return the major version number */
 #define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)
 
-/*!\brief Return the version minor number */
+/*!\brief Return the minor version number */
 #define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff)
 
-/*!\brief Return the version patch number */
+/*!\brief Return the patch version number */
 #define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff)
 
 /*!\brief Return the version information (as a string)
  *
  * Returns a printable string containing the full library version number. This
- * may
- * contain additional text following the three digit version number, as to
- * indicate
- * release candidates, prerelease versions, etc.
+ * may contain additional text following the three digit version number, as to
+ * indicate release candidates, prerelease versions, etc.
  *
  */
 const char *aom_codec_version_str(void);
@@ -298,8 +373,7 @@ const char *aom_codec_version_str(void);
 /*!\brief Return the version information (as a string)
  *
  * Returns a printable "extra string". This is the component of the string
- * returned
- * by aom_codec_version_str() following the three digit version number.
+ * returned by aom_codec_version_str() following the three digit version number.
  *
  */
 const char *aom_codec_version_extra_str(void);
@@ -405,17 +479,38 @@ aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
  * ctx->err will be set to the same value as the return value.
  *
  * \param[in]     ctx              Pointer to this instance's context
- * \param[in]     ctrl_id          Algorithm specific control identifier
+ * \param[in]     ctrl_id          Algorithm specific control identifier.
+ *                                 Must be nonzero.
  *
  * \retval #AOM_CODEC_OK
  *     The control request was processed.
  * \retval #AOM_CODEC_ERROR
  *     The control request was not processed.
  * \retval #AOM_CODEC_INVALID_PARAM
- *     The data was not valid.
+ *     The control ID was zero, or the data was not valid.
  */
 aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...);
 
+/*!\brief Key & Value API
+ *
+ * aom_codec_set_option() takes a context, a key (option name) and a value. If
+ * the context is non-null and an error occurs, ctx->err will be set to the same
+ * value as the return value.
+ *
+ * \param[in]     ctx              Pointer to this instance's context
+ * \param[in]     name             The name of the option (key)
+ * \param[in]     value            The value of the option
+ *
+ * \retval #AOM_CODEC_OK
+ *     The value of the option was set.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     The data was not valid.
+ * \retval #AOM_CODEC_ERROR
+ *     The option was not successfully set.
+ */
+aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name,
+                                     const char *value);
+
 /*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible)
  *
  * This macro allows for type safe conversions across the variadic parameter
diff --git a/media/libaom/src/aom/aom_encoder.h b/media/libaom/src/aom/aom_encoder.h
index a494c17a49..09617c01b0 100644
--- a/media/libaom/src/aom/aom_encoder.h
+++ b/media/libaom/src/aom/aom_encoder.h
@@ -31,17 +31,28 @@ extern "C" {
 #endif
 
 #include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
 
 /*!\brief Current ABI version number
  *
+ * \hideinitializer
  * \internal
  * If this file is altered in any way that changes the ABI, this value
  * must be bumped.  Examples include, but are not limited to, changing
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
+ *
+ * Note: In the definition of AOM_ENCODER_ABI_VERSION, 3 is the value of
+ * AOM_EXT_PART_ABI_VERSION in libaom v3.2.0. The old value of
+ * AOM_EXT_PART_ABI_VERSION is used so as to not break the ABI version check in
+ * aom_codec_enc_init_ver() when an application compiled against libaom v3.2.0
+ * passes the old value of AOM_ENCODER_ABI_VERSION to aom_codec_enc_init_ver().
+ * The external partition API is still experimental. When it is declared stable,
+ * we will replace 3 with AOM_EXT_PART_ABI_VERSION in the definition of
+ * AOM_ENCODER_ABI_VERSION.
  */
 #define AOM_ENCODER_ABI_VERSION \
-  (8 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+  (10 + AOM_CODEC_ABI_VERSION + /*AOM_EXT_PART_ABI_VERSION=*/3)
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -78,27 +89,6 @@ typedef struct aom_fixed_buf {
   size_t sz;       /**< Length of the buffer, in chars */
 } aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
 
-/*!\brief Compressed Frame Flags
- *
- * This type represents a bitfield containing information about a compressed
- * frame that may be useful to an application. The most significant 16 bits
- * can be used by an algorithm to provide additional detail, for example to
- * support frame types that are codec specific (MPEG-1 D-frames for example)
- */
-typedef uint32_t aom_codec_frame_flags_t;
-#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
-/*!\brief frame can be dropped without affecting the stream (no future frame
- * depends on this one) */
-#define AOM_FRAME_IS_DROPPABLE 0x2
-/*!\brief this is an INTRA_ONLY frame */
-#define AOM_FRAME_IS_INTRAONLY 0x10
-/*!\brief this is an S-frame */
-#define AOM_FRAME_IS_SWITCH 0x20
-/*!\brief this is an error-resilient frame */
-#define AOM_FRAME_IS_ERROR_RESILIENT 0x40
-/*!\brief this is a key-frame dependent recovery-point frame */
-#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80
-
 /*!\brief Error Resilient flags
  *
  * These flags define which error resilient features to enable in the
@@ -152,17 +142,19 @@ typedef struct aom_codec_cx_pkt {
       unsigned int samples[4]; /**< Number of samples, total/y/u/v */
       uint64_t sse[4];         /**< sum squared error, total/y/u/v */
       double psnr[4];          /**< PSNR, total/y/u/v */
-    } psnr;                    /**< data for PSNR packet */
-    aom_fixed_buf_t raw;       /**< data for arbitrary packets */
-
-    /* This packet size is fixed to allow codecs to extend this
-     * interface without having to manage storage for raw packets,
-     * i.e., if it's smaller than 128 bytes, you can store in the
-     * packet list directly.
-     */
-    char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */
-  } data;                                               /**< packet data */
-} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
+      /*!\brief Number of samples, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      unsigned int samples_hbd[4];
+      /*!\brief sum squared error, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      uint64_t sse_hbd[4];
+      /*!\brief PSNR, total/y/u/v when
+       * input bit-depth < stream bit-depth.*/
+      double psnr_hbd[4];
+    } psnr;              /**< data for PSNR packet */
+    aom_fixed_buf_t raw; /**< data for arbitrary packets */
+  } data;                /**< packet data */
+} aom_codec_cx_pkt_t;    /**< alias for struct aom_codec_cx_pkt */
 
 /*!\brief Rational Number
  *
@@ -173,11 +165,19 @@ typedef struct aom_rational {
   int den;        /**< fraction denominator */
 } aom_rational_t; /**< alias for struct aom_rational */
 
-/*!\brief Multi-pass Encoding Pass */
+/*!\brief Multi-pass Encoding Pass
+ *
+ * AOM_RC_LAST_PASS is kept for backward compatibility.
+ * If passes is not given and pass==2, the codec will assume passes=2.
+ * For new code, it is recommended to use AOM_RC_SECOND_PASS and set
+ * the "passes" member to 2 via the key & val API for two-pass encoding.
+ */
 enum aom_enc_pass {
-  AOM_RC_ONE_PASS,   /**< Single pass mode */
-  AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */
-  AOM_RC_LAST_PASS   /**< Final pass of multi-pass mode */
+  AOM_RC_ONE_PASS = 0,    /**< Single pass mode */
+  AOM_RC_FIRST_PASS = 1,  /**< First pass of multi-pass mode */
+  AOM_RC_SECOND_PASS = 2, /**< Second pass of multi-pass mode */
+  AOM_RC_THIRD_PASS = 3,  /**< Third pass of multi-pass mode */
+  AOM_RC_LAST_PASS = 2,   /**< Final pass of two-pass mode */
 };
 
 /*!\brief Rate control mode */
@@ -202,6 +202,22 @@ enum aom_kf_mode {
   AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };
 
+/*!\brief Frame super-resolution mode. */
+typedef enum {
+  /**< Frame super-resolution is disabled for all frames. */
+  AOM_SUPERRES_NONE,
+  /**< All frames are coded at the specified scale and super-resolved. */
+  AOM_SUPERRES_FIXED,
+  /**< All frames are coded at a random scale and super-resolved. */
+  AOM_SUPERRES_RANDOM,
+  /**< Super-resolution scale for each frame is determined based on the q index
+     of that frame. */
+  AOM_SUPERRES_QTHRESH,
+  /**< Full-resolution or super-resolution and the scale (in case of
+     super-resolution) are automatically selected for each frame. */
+  AOM_SUPERRES_AUTO,
+} aom_superres_mode;
+
 /*!\brief Encoder Config Options
  *
  * This type allows to enumerate and control flags defined for encoder control
@@ -358,7 +374,8 @@ typedef struct cfg_options {
  * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
  */
 typedef long aom_enc_frame_flags_t;
-#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
+/*!\brief Force this frame to be a keyframe */
+#define AOM_EFLAG_FORCE_KF (1 << 0)
 
 /*!\brief Encoder configuration structure
  *
@@ -546,10 +563,8 @@ typedef struct aom_codec_enc_cfg {
    * Similar to spatial resampling, frame super-resolution integrates
    * upscaling after the encode/decode process. Taking control of upscaling and
    * using restoration filters should allow it to outperform normal resizing.
-   *
-   * Valid values are 0 to 4 as defined in enum SUPERRES_MODE.
    */
-  unsigned int rc_superres_mode;
+  aom_superres_mode rc_superres_mode;
 
   /*!\brief Frame super-resolution denominator.
    *
@@ -559,7 +574,7 @@ typedef struct aom_codec_enc_cfg {
    *
    * Valid denominators are 8 to 16.
    *
-   * Used only by SUPERRES_FIXED.
+   * Used only by AOM_SUPERRES_FIXED.
    */
   unsigned int rc_superres_denominator;
 
@@ -578,7 +593,7 @@ typedef struct aom_codec_enc_cfg {
    * The q level threshold after which superres is used.
    * Valid values are 1 to 63.
    *
-   * Used only by SUPERRES_QTHRESH
+   * Used only by AOM_SUPERRES_QTHRESH
    */
   unsigned int rc_superres_qthresh;
 
@@ -587,7 +602,7 @@ typedef struct aom_codec_enc_cfg {
    * The q level threshold after which superres is used for key frames.
    * Valid values are 1 to 63.
    *
-   * Used only by SUPERRES_QTHRESH
+   * Used only by AOM_SUPERRES_QTHRESH
    */
   unsigned int rc_superres_kf_qthresh;
 
@@ -617,7 +632,7 @@ typedef struct aom_codec_enc_cfg {
 
   /*!\brief Target data rate
    *
-   * Target bandwidth to use for this stream, in kilobits per second.
+   * Target bitrate to use for this stream, in kilobits per second.
    */
   unsigned int rc_target_bitrate;
 
@@ -651,25 +666,19 @@ typedef struct aom_codec_enc_cfg {
 
   /*!\brief Rate control adaptation undershoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
-   * controls the maximum allowed adaptation speed of the codec.
-   * This factor controls the maximum amount of bits that can
-   * be subtracted from the target bitrate in order to compensate
-   * for prior overshoot.
+   * This value, controls the tolerance of the VBR algorithm to undershoot
+   * and is used as a trigger threshold for more aggressive adaptation of Q.
    *
-   * Valid values in the range 0-1000.
+   * Valid values in the range 0-100.
    */
   unsigned int rc_undershoot_pct;
 
   /*!\brief Rate control adaptation overshoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
-   * controls the maximum allowed adaptation speed of the codec.
-   * This factor controls the maximum amount of bits that can
-   * be added to the target bitrate in order to compensate for
-   * prior undershoot.
+   * This value, controls the tolerance of the VBR algorithm to overshoot
+   * and is used as a trigger threshold for more aggressive adaptation of Q.
    *
-   * Valid values in the range 0-1000.
+   * Valid values in the range 0-100.
    */
   unsigned int rc_overshoot_pct;
 
@@ -879,27 +888,11 @@ typedef struct aom_codec_enc_cfg {
    */
   unsigned int use_fixed_qp_offsets;
 
-/*!\brief Number of fixed QP offsets
- *
- * This defines the number of elements in the fixed_qp_offsets array.
- */
-#define FIXED_QP_OFFSET_COUNT 5
-
-  /*!\brief Array of fixed QP offsets
+  /*!\brief Deprecated and ignored. DO NOT USE.
    *
-   * This array specifies fixed QP offsets (range: 0 to 63) for frames at
-   * different levels of the pyramid. It is a comma-separated list of 5 values:
-   * - QP offset for keyframe
-   * - QP offset for ALTREF frame
-   * - QP offset for 1st level internal ARF
-   * - QP offset for 2nd level internal ARF
-   * - QP offset for 3rd level internal ARF
-   * Notes:
-   * - QP offset for leaf level frames is not explicitly specified. These frames
-   *   use the worst quality allowed (--cq-level).
-   * - This option is only relevant for --end-usage=q.
+   * TODO(aomedia:3269): Remove fixed_qp_offsets in libaom v4.0.0.
    */
-  int fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
+  int fixed_qp_offsets[5];
 
   /*!\brief Options defined per config file
    *
@@ -914,7 +907,7 @@ typedef struct aom_codec_enc_cfg {
  * function directly, to ensure that the ABI version number parameter
  * is properly initialized.
  *
- * If the library was configured with --disable-multithread, this call
+ * If the library was configured with -DCONFIG_MULTITHREAD=0, this call
  * is not thread safe and should be guarded with a lock if being used
  * in a multithreaded context.
  *
@@ -952,8 +945,8 @@ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
  * \param[in]    iface     Pointer to the algorithm interface to use.
  * \param[out]   cfg       Configuration buffer to populate.
  * \param[in]    usage     Algorithm specific usage value. For AV1, must be
- *                         set to AOM_USAGE_GOOD_QUALITY (0) or
- *                         AOM_USAGE_REALTIME (1).
+ *                         set to AOM_USAGE_GOOD_QUALITY (0),
+ *                         AOM_USAGE_REALTIME (1), or AOM_USAGE_ALL_INTRA (2).
  *
  * \retval #AOM_CODEC_OK
  *     The configuration was populated.
@@ -1012,6 +1005,8 @@ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 #define AOM_USAGE_GOOD_QUALITY (0)
 /*!\brief usage parameter analogous to AV1 REALTIME mode. */
 #define AOM_USAGE_REALTIME (1)
+/*!\brief usage parameter analogous to AV1 all intra mode. */
+#define AOM_USAGE_ALL_INTRA (2)
 
 /*!\brief Encode a frame
  *
@@ -1019,15 +1014,20 @@ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
  * time stamp (PTS) \ref MUST be strictly increasing.
  *
  * When the last frame has been passed to the encoder, this function should
- * continue to be called, with the img parameter set to NULL. This will
- * signal the end-of-stream condition to the encoder and allow it to encode
- * any held buffers. Encoding is complete when aom_codec_encode() is called
- * and aom_codec_get_cx_data() returns no data.
+ * continue to be called in a loop, with the img parameter set to NULL. This
+ * will signal the end-of-stream condition to the encoder and allow it to
+ * encode any held buffers. Encoding is complete when aom_codec_encode() is
+ * called with img set to NULL and aom_codec_get_cx_data() returns no data.
  *
  * \param[in]    ctx       Pointer to this instance's context
  * \param[in]    img       Image data to encode, NULL to flush.
- * \param[in]    pts       Presentation time stamp, in timebase units.
- * \param[in]    duration  Duration to show frame, in timebase units.
+ *                         Encoding sample values outside the range
+ *                         [0..(1<<img->bit_depth)-1] is undefined behavior.
+ * \param[in]    pts       Presentation time stamp, in timebase units. If img
+ *                         is NULL, pts is ignored.
+ * \param[in]    duration  Duration to show frame, in timebase units. If img
+ *                         is not NULL, duration must be nonzero. If img is
+ *                         NULL, duration is ignored.
  * \param[in]    flags     Flags to use for encoding this frame.
  *
  * \retval #AOM_CODEC_OK
diff --git a/media/libaom/src/aom/aom_external_partition.h b/media/libaom/src/aom/aom_external_partition.h
new file mode 100644
index 0000000000..55c59a5746
--- /dev/null
+++ b/media/libaom/src/aom/aom_external_partition.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include <stdint.h>
+
+/*!\file
+ * \brief Provides function pointer definitions for the external partition.
+ *
+ * \note The external partition API should be considered experimental. Until the
+ * external partition API is declared stable, breaking changes may be made to
+ * this API in a future libaom release.
+ */
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures.
+ */
+#define AOM_EXT_PART_ABI_VERSION 8
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Abstract external partition model handler
+ */
+typedef void *aom_ext_part_model_t;
+
+/*!\brief Number of features to determine whether to skip partition none and
+ * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT".
+ */
+#define AOM_EXT_PART_SIZE_DIRECT_SPLIT 17
+
+/*!\brief Number of features to use simple motion search to prune out
+ * rectangular partition in some direction. The same as
+ * "FEATURE_SIZE_SMS_PRUNE_PART".
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_PART 25
+
+/*!\brief Number of features to prune split and rectangular partition
+ * after PARTITION_NONE.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_NONE 4
+
+/*!\brief Number of features to terminates partition after partition none using
+ * simple_motion_search features and the rate, distortion, and rdcost of
+ * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE".
+ */
+#define AOM_EXT_PART_SIZE_TERM_NONE 28
+
+/*!\brief Number of features to terminates partition after partition split.
+ */
+#define AOM_EXT_PART_SIZE_TERM_SPLIT 31
+
+/*!\brief Number of features to prune rectangular partition using stats
+ * collected after partition split.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_RECT 9
+
+/*!\brief Number of features to prune AB partition using stats
+ * collected after rectangular partition..
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_AB 10
+
+/*!\brief Number of features to prune 4-way partition using stats
+ * collected after AB partition.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_4_WAY 18
+
+/*!\brief Decision mode of the external partition model.
+ * AOM_EXT_PART_WHOLE_TREE: the external partition model should provide the
+ * whole partition tree for the superblock.
+ *
+ * AOM_EXT_PART_RECURSIVE: the external partition model provides the partition
+ * decision of the current block only. The decision process starts from
+ * the superblock size, down to the smallest block size (4x4) recursively.
+ */
+typedef enum aom_ext_part_decision_mode {
+  AOM_EXT_PART_WHOLE_TREE = 0,
+  AOM_EXT_PART_RECURSIVE = 1,
+} aom_ext_part_decision_mode_t;
+
+/*!\brief Config information sent to the external partition model.
+ *
+ * For example, the maximum superblock size determined by the sequence header.
+ */
+typedef struct aom_ext_part_config {
+  int superblock_size;  ///< super block size (either 64x64 or 128x128)
+} aom_ext_part_config_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ * Specifically, features collected before NONE partition.
+ * Features "f" are used to determine:
+ * partition_none_allowed, partition_horz_allowed, partition_vert_allowed,
+ * do_rectangular_split, do_square_split
+ * Features "f_part2" are used to determine:
+ * prune_horz, prune_vert.
+ */
+typedef struct aom_partition_features_before_none {
+  /*! features to determine whether skip partition none and do split directly */
+  float f[AOM_EXT_PART_SIZE_DIRECT_SPLIT];
+  /*! features to determine whether to prune rectangular partition */
+  float f_part2[AOM_EXT_PART_SIZE_PRUNE_PART];
+} aom_partition_features_before_none_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ * Specifically, features collected after NONE partition.
+ */
+typedef struct aom_partition_features_none {
+  /*! features to prune split and rectangular partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_NONE];
+  /*! features to determine termination of partition */
+  float f_terminate[AOM_EXT_PART_SIZE_TERM_NONE];
+} aom_partition_features_none_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ * Specifically, features collected after SPLIT partition.
+ */
+typedef struct aom_partition_features_split {
+  /*! features to determine termination of  partition */
+  float f_terminate[AOM_EXT_PART_SIZE_TERM_SPLIT];
+  /*! features to determine pruning rect partition */
+  float f_prune_rect[AOM_EXT_PART_SIZE_PRUNE_RECT];
+} aom_partition_features_split_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ * Specifically, features collected after RECTANGULAR partition.
+ */
+typedef struct aom_partition_features_rect {
+  /*! features to determine pruning AB partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_AB];
+} aom_partition_features_rect_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A,
+ * VERT_B.
+ */
+typedef struct aom_partition_features_ab {
+  /*! features to determine pruning 4-way partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_4_WAY];
+} aom_partition_features_ab_t;
+
+/*!\brief Feature id to tell the external model the current stage in partition
+ * pruning and what features to use to make decisions accordingly.
+ */
+typedef enum {
+  AOM_EXT_PART_FEATURE_BEFORE_NONE,
+  AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_NONE,
+  AOM_EXT_PART_FEATURE_AFTER_NONE_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_SPLIT,
+  AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_RECT,
+  AOM_EXT_PART_FEATURE_AFTER_AB
+} AOM_EXT_PART_FEATURE_ID;
+
+/*!\brief Features collected from the tpl process.
+ *
+ * The tpl process collects information that help measure the inter-frame
+ * dependency.
+ * The tpl process is computed in the unit of tpl_bsize_1d (16x16).
+ * Therefore, the max number of units inside a superblock is
+ * 128x128 / (16x16) = 64. Change it if the tpl process changes.
+ */
+typedef struct aom_sb_tpl_features {
+  int available;        ///< If tpl stats are available
+  int tpl_unit_length;  ///< The block length of tpl process
+  int num_units;        ///< The number of units inside the current superblock
+  int64_t intra_cost[64];   ///< The intra cost of each unit
+  int64_t inter_cost[64];   ///< The inter cost of each unit
+  int64_t mc_dep_cost[64];  ///< The motion compensated dependency cost
+} aom_sb_tpl_features_t;
+
+/*!\brief Features collected from the simple motion process.
+ *
+ * The simple motion process collects information by applying motion compensated
+ * prediction on each block.
+ * The block size is 16x16, which could be changed. If it is changed, update
+ * comments and the array size here.
+ */
+typedef struct aom_sb_simple_motion_features {
+  int unit_length;    ///< The block length of the simple motion process
+  int num_units;      ///< The number of units inside the current superblock
+  int block_sse[64];  ///< Sum of squared error of each unit
+  int block_var[64];  ///< Variance of each unit
+} aom_sb_simple_motion_features_t;
+
+/*!\brief Features of each super block.
+ *
+ * Features collected for each super block before partition search.
+ */
+typedef struct aom_sb_features {
+  /*! Features from motion search */
+  aom_sb_simple_motion_features_t motion_features;
+  /*! Features from tpl process */
+  aom_sb_tpl_features_t tpl_features;
+} aom_sb_features_t;
+
+/*!\brief Features pass to the external model to make partition decisions.
+ *
+ * The encoder sends these features to the external model through
+ * "func()" defined in .....
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_features {
+  // Features for the current supervised multi-stage ML model.
+  /*! Feature ID to indicate active features */
+  AOM_EXT_PART_FEATURE_ID id;
+  /*! Features collected before NONE partition */
+  aom_partition_features_before_none_t before_part_none;
+  /*! Features collected after NONE partition */
+  aom_partition_features_none_t after_part_none;
+  /*! Features collected after SPLIT partition */
+  aom_partition_features_split_t after_part_split;
+  /*! Features collected after RECTANGULAR partition */
+  aom_partition_features_rect_t after_part_rect;
+  /*! Features collected after AB partition */
+  aom_partition_features_ab_t after_part_ab;
+
+  // Features for a new ML model.
+  aom_sb_features_t sb_features;  ///< Features collected for the super block
+  int mi_row;                     ///< Mi_row position of the block
+  int mi_col;                     ///< Mi_col position of the block
+  int frame_width;                ///< Frame width
+  int frame_height;               ///< Frame height
+  int block_size;                 ///< As "BLOCK_SIZE" in av1/common/enums.h
+  /*!
+   * Valid partition types. A bitmask is used.  "1" represents the
+   * corresponding type is vaild. The bitmask follows the enum order for
+   * PARTITION_TYPE in "enums.h" to represent one partition type at a bit.
+   * For example, 0x01 stands for only PARTITION_NONE is valid,
+   * 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid.
+   */
+  int valid_partition_types;
+  int update_type;    ///< Frame update type, defined in ratectrl.h
+  int qindex;         ///< Quantization index, range: [0, 255]
+  int rdmult;         ///< Rate-distortion multiplier
+  int pyramid_level;  ///< The level of this frame in the hierarchical structure
+  int has_above_block;     ///< Has above neighbor block
+  int above_block_width;   ///< Width of the above block, -1 if not exist
+  int above_block_height;  ///< Height of the above block, -1 if not exist
+  int has_left_block;      ///< Has left neighbor block
+  int left_block_width;    ///< Width of the left block, -1 if not exist
+  int left_block_height;   ///< Height of the left block, -1 if not exist
+  /*!
+   * The following parameters are collected from applying simple motion search.
+   * Sum of squared error (SSE) and variance of motion compensated residual
+   * are good indicators of block partitioning.
+   * If a block is a square, we also apply motion search for its 4 sub blocks.
+   * If not a square, their values are -1.
+   * If a block is able to split horizontally, we apply motion search and get
+   * stats for horizontal blocks. If not, their values are -1.
+   * If a block is able to split vertically, we apply motion search and get
+   * stats for vertical blocks. If not, their values are -1.
+   */
+  unsigned int block_sse;          ///< SSE of motion compensated residual
+  unsigned int block_var;          ///< Variance of motion compensated residual
+  unsigned int sub_block_sse[4];   ///< SSE of sub blocks.
+  unsigned int sub_block_var[4];   ///< Variance of sub blocks.
+  unsigned int horz_block_sse[2];  ///< SSE of horz sub blocks
+  unsigned int horz_block_var[2];  ///< Variance of horz sub blocks
+  unsigned int vert_block_sse[2];  ///< SSE of vert sub blocks
+  unsigned int vert_block_var[2];  ///< Variance of vert sub blocks
+  /*!
+   * The following parameters are calculated from tpl model.
+   * If tpl model is not available, their values are -1.
+   */
+  int64_t tpl_intra_cost;   ///< Intra cost, ref to "TplDepStats" in tpl_model.h
+  int64_t tpl_inter_cost;   ///< Inter cost in tpl model
+  int64_t tpl_mc_dep_cost;  ///< Motion compensated dependency cost in tpl model
+} aom_partition_features_t;
+
+/*!\brief Partition decisions received from the external model.
+ *
+ * The encoder receives partition decisions and encodes the superblock
+ * with the given partition type.
+ * The encoder receives it from "func()" define in ....
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_decision {
+  // Decisions for directly set partition types
+  int is_final_decision;         ///< The flag whether it's the final decision
+  int num_nodes;                 ///< The number of leaf nodes
+  int partition_decision[2048];  ///< Partition decisions
+  int current_decision;          ///< Partition decision for the current block
+
+  // Decisions for partition type pruning
+  int terminate_partition_search;  ///< Terminate further partition search
+  int partition_none_allowed;      ///< Allow partition none type
+  int partition_rect_allowed[2];   ///< Allow rectangular partitions
+  int do_rectangular_split;        ///< Try rectangular split partition
+  int do_square_split;             ///< Try square split partition
+  int prune_rect_part[2];          ///< Prune rectangular partition
+  int horza_partition_allowed;     ///< Allow HORZ_A partitioin
+  int horzb_partition_allowed;     ///< Allow HORZ_B partitioin
+  int verta_partition_allowed;     ///< Allow VERT_A partitioin
+  int vertb_partition_allowed;     ///< Allow VERT_B partitioin
+  int partition_horz4_allowed;     ///< Allow HORZ4 partition
+  int partition_vert4_allowed;     ///< Allow VERT4 partition
+} aom_partition_decision_t;
+
+/*!\brief Encoding stats for the given partition decision.
+ *
+ * The encoding stats collected by encoding the superblock with the
+ * given partition types.
+ * The encoder sends the stats to the external model for training
+ * or inference though "func()" defined in ....
+ */
+typedef struct aom_partition_stats {
+  int rate;        ///< Rate cost of the block
+  int64_t dist;    ///< Distortion of the block
+  int64_t rdcost;  ///< Rate-distortion cost of the block
+} aom_partition_stats_t;
+
+/*!\brief Enum for return status.
+ */
+typedef enum aom_ext_part_status {
+  AOM_EXT_PART_OK = 0,     ///< Status of success
+  AOM_EXT_PART_ERROR = 1,  ///< Status of failure
+  AOM_EXT_PART_TEST = 2,   ///< Status used for tests
+} aom_ext_part_status_t;
+
+/*!\brief Callback of creating an external partition model.
+ *
+ * The callback is invoked by the encoder to create an external partition
+ * model.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] part_config Config information pointer for model creation
+ * \param[out] ext_part_model Pointer to the model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model);
+
+/*!\brief Callback of sending features to the external partition model.
+ *
+ * The callback is invoked by the encoder to send features to the external
+ * partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] part_features Pointer to the features
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features);
+
+/*!\brief Callback of receiving partition decisions from the external
+ * partition model.
+ *
+ * The callback is invoked by the encoder to receive partition decisions from
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] ext_part_decision Pointer to the partition decisions
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision);
+
+/*!\brief Callback of sending stats to the external partition model.
+ *
+ * The callback is invoked by the encoder to send encoding stats to
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] ext_part_stats Pointer to the encoding stats
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats);
+
+/*!\brief Callback of deleting the external partition model.
+ *
+ * The callback is invoked by the encoder to delete the external partition
+ * model.
+ *
+ * \param[in] ext_part_model The external model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)(
+    aom_ext_part_model_t ext_part_model);
+
+/*!\brief Callback function set for external partition model.
+ *
+ * Uses can enable external partition model by registering a set of
+ * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL
+ */
+typedef struct aom_ext_part_funcs {
+  /*!
+   * Create an external partition model.
+   */
+  aom_ext_part_create_model_fn_t create_model;
+
+  /*!
+   * Send features to the external partition model to make partition decisions.
+   */
+  aom_ext_part_send_features_fn_t send_features;
+
+  /*!
+   * Get partition decisions from the external partition model.
+   */
+  aom_ext_part_get_decision_fn_t get_partition_decision;
+
+  /*!
+   * Send stats of the current partition to the external model.
+   */
+  aom_ext_part_send_partition_stats_fn_t send_partition_stats;
+
+  /*!
+   * Delete the external partition model.
+   */
+  aom_ext_part_delete_model_fn_t delete_model;
+
+  /*!
+   * The decision mode of the model.
+   */
+  aom_ext_part_decision_mode_t decision_mode;
+
+  /*!
+   * Private data for the external partition model.
+   */
+  void *priv;
+} aom_ext_part_funcs_t;
+
+/*!@} - end defgroup aom_encoder*/
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_AOM_EXTERNAL_PARTITION_H_
diff --git a/media/libaom/src/aom/aom_frame_buffer.h b/media/libaom/src/aom/aom_frame_buffer.h
index a715645a7a..0e80373ddd 100644
--- a/media/libaom/src/aom/aom_frame_buffer.h
+++ b/media/libaom/src/aom/aom_frame_buffer.h
@@ -58,7 +58,7 @@ typedef struct aom_codec_frame_buffer {
  * must return 0. Any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
- * \param[in] new_size     Size in bytes needed by the buffer
+ * \param[in] min_size     Size in bytes needed by the buffer
  * \param[in,out] fb       Pointer to aom_codec_frame_buffer_t
  */
 typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
diff --git a/media/libaom/src/aom/aom_image.h b/media/libaom/src/aom/aom_image.h
index bb6973f9c3..d5f0c087e6 100644
--- a/media/libaom/src/aom/aom_image.h
+++ b/media/libaom/src/aom/aom_image.h
@@ -48,6 +48,11 @@ typedef enum aom_img_fmt {
   AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
   AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
   AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+/*!\brief Allows detection of the presence of AOM_IMG_FMT_NV12 at compile time.
+ */
+#define AOM_HAVE_IMG_FMT_NV12 1
+  AOM_IMG_FMT_NV12 =
+      AOM_IMG_FMT_PLANAR | 7, /**< 4:2:0 with U and V interleaved */
   AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
   AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH,
   AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
@@ -124,8 +129,12 @@ typedef enum aom_matrix_coefficients {
 
 /*!\brief List of supported color range */
 typedef enum aom_color_range {
-  AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
-  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
+  AOM_CR_STUDIO_RANGE = 0, /**<- Y  [16..235],  UV  [16..240]  (bit depth 8) */
+                           /**<- Y  [64..940],  UV  [64..960]  (bit depth 10) */
+                           /**<- Y [256..3760], UV [256..3840] (bit depth 12) */
+  AOM_CR_FULL_RANGE = 1    /**<- YUV/RGB [0..255]  (bit depth 8) */
+                           /**<- YUV/RGB [0..1023] (bit depth 10) */
+                           /**<- YUV/RGB [0..4095] (bit depth 12) */
 } aom_color_range_t;       /**< alias for enum aom_color_range */
 
 /*!\brief List of chroma sample positions */
@@ -195,10 +204,12 @@ typedef struct aom_image {
   unsigned int y_chroma_shift; /**< subsampling order, Y */
 
 /* Image data pointers. */
-#define AOM_PLANE_PACKED 0  /**< To be used for all packed formats */
-#define AOM_PLANE_Y 0       /**< Y (Luminance) plane */
-#define AOM_PLANE_U 1       /**< U (Chroma) plane */
-#define AOM_PLANE_V 2       /**< V (Chroma) plane */
+#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
+#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
+#define AOM_PLANE_U 1      /**< U (Chroma) plane */
+#define AOM_PLANE_V 2      /**< V (Chroma) plane */
+  /* planes[AOM_PLANE_V] = NULL and stride[AOM_PLANE_V] = 0 when fmt ==
+   * AOM_IMG_FMT_NV12 */
   unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */
   int stride[3];            /**< stride between rows for each plane */
   size_t sz;                /**< data size */
@@ -300,7 +311,8 @@ aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in]    img       Image descriptor
  * \param[in]    x         leftmost column
@@ -309,7 +321,7 @@ aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
  * \param[in]    h         height
  * \param[in]    border    A border that is padded on four sides of the image.
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h, unsigned int border);
@@ -360,6 +372,9 @@ int aom_img_plane_height(const aom_image_t *img, int plane);
  * \param[in]    data         Metadata contents
  * \param[in]    sz           Metadata contents size
  * \param[in]    insert_flag  Metadata insert flag
+ *
+ * \return Returns 0 on success. If img or data is NULL, sz is 0, or memory
+ * allocation fails, it returns -1.
  */
 int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
                          size_t sz, aom_metadata_insert_flags_t insert_flag);
@@ -410,6 +425,9 @@ void aom_img_remove_metadata(aom_image_t *img);
  * \param[in]    data         Metadata data pointer
  * \param[in]    sz           Metadata size
  * \param[in]    insert_flag  Metadata insert flag
+ *
+ * \return Returns the newly allocated aom_metadata struct. If data is NULL,
+ * sz is 0, or memory allocation fails, it returns NULL.
  */
 aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data,
                                        size_t sz,
diff --git a/media/libaom/src/aom/aom_integer.h b/media/libaom/src/aom/aom_integer.h
index 113671e820..d9bba09f25 100644
--- a/media/libaom/src/aom/aom_integer.h
+++ b/media/libaom/src/aom/aom_integer.h
@@ -22,22 +22,7 @@
 #define AOM_INLINE inline
 #endif
 
-#if defined(AOM_EMULATE_INTTYPES)
-typedef signed char int8_t;
-typedef signed short int16_t;
-typedef signed int int32_t;
-
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
-#ifndef _UINTPTR_T_DEFINED
-typedef size_t uintptr_t;
-#endif
-
-#else
-
-/* Most platforms have the C99 standard integer types. */
+/* Assume platforms have the C99 standard integer types. */
 
 #if defined(__cplusplus)
 #if !defined(__STDC_FORMAT_MACROS)
@@ -49,27 +34,7 @@ typedef size_t uintptr_t;
 #endif  // __cplusplus
 
 #include <stdint.h>
-
-#endif
-
-/* VS2010 defines stdint.h, but not inttypes.h */
-#if defined(_MSC_VER) && _MSC_VER < 1800
-#define PRId64 "I64d"
-#else
 #include <inttypes.h>
-#endif
-
-#if !defined(INT8_MAX)
-#define INT8_MAX 127
-#endif
-
-#if !defined(INT32_MAX)
-#define INT32_MAX 2147483647
-#endif
-
-#if !defined(INT32_MIN)
-#define INT32_MIN (-2147483647 - 1)
-#endif
 
 #if defined(__cplusplus)
 extern "C" {
diff --git a/media/libaom/src/aom/aomcx.h b/media/libaom/src/aom/aomcx.h
index 051d33e7bf..0dd200da3c 100644
--- a/media/libaom/src/aom/aomcx.h
+++ b/media/libaom/src/aom/aomcx.h
@@ -18,10 +18,23 @@
  */
 #include "aom/aom.h"
 #include "aom/aom_encoder.h"
+#include "aom/aom_external_partition.h"
 
 /*!\file
  * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
  *        aom Codec Interface.
+ *
+ * Several interfaces are excluded with CONFIG_REALTIME_ONLY build:
+ * Global motion
+ * Warped motion
+ * OBMC
+ * TPL model
+ * Loop restoration
+ *
+ * The following features are also disabled with CONFIG_REALTIME_ONLY:
+ * CNN
+ * 4X rectangular blocks
+ * 4X rectangular transform in intra prediction
  */
 
 #ifdef __cplusplus
@@ -31,11 +44,19 @@ extern "C" {
 /*!\name Algorithm interface for AV1
  *
  * This interface provides the capability to encode raw AV1 streams.
- * @{
+ *@{
+ */
+
+/*!\brief A single instance of the AV1 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_cx().
  */
 extern aom_codec_iface_t aom_codec_av1_cx_algo;
+
+/*!\brief The interface to the AV1 encoder.
+ */
 extern aom_codec_iface_t *aom_codec_av1_cx(void);
-/*!@} - end algorithm interface member group*/
+/*!@} - end algorithm interface member group */
 
 /*
  * Algorithm Flags
@@ -147,6 +168,7 @@ extern aom_codec_iface_t *aom_codec_av1_cx(void);
  *
  * This set of macros define the control functions available for AVx
  * encoder interface.
+ * The range of encode control ID is 7-229(max).
  *
  * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
  */
@@ -185,9 +207,14 @@ enum aome_enc_control_id {
    * encoding process, values greater than 0 will increase encoder speed at
    * the expense of quality.
    *
-   * Valid range: 0..8. 0 runs the slowest, and 8 runs the fastest;
+   * Valid range: 0..10. 0 runs the slowest, and 10 runs the fastest;
    * quality improves as speed decreases (since more compression
    * possibilities are explored).
+   *
+   * NOTE: 10 is only allowed in AOM_USAGE_REALTIME. In AOM_USAGE_GOOD_QUALITY
+   * and AOM_USAGE_ALL_INTRA, 9 is the highest allowed value. However,
+   * AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, AOM_USAGE_REALTIME
+   * treats 0..4 the same as 5.
    */
   AOME_SET_CPUUSED = 13,
 
@@ -201,7 +228,14 @@ enum aome_enc_control_id {
 
   /* NOTE: enum 15 unused */
 
-  /*!\brief Codec control function to set sharpness, unsigned int parameter.
+  /*!\brief Codec control function to set the sharpness parameter,
+   * unsigned int parameter.
+   *
+   * This parameter controls the level at which rate-distortion optimization of
+   * transform coefficients favours sharpness in the block.
+   *
+   * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip
+   * block optimization and will change rdmult in favour of block sharpness.
    */
   AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,  // 16
 
@@ -241,6 +275,8 @@ enum aome_enc_control_id {
 
   /*!\brief Codec control function to set visual tuning, aom_tune_metric (int)
    * parameter
+   *
+   * The default is AOM_TUNE_PSNR.
    */
   AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2,  // 24
 
@@ -365,6 +401,8 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ENABLE_TPL_MODEL = 35,
 
@@ -372,7 +410,8 @@ enum aome_enc_control_id {
    * unsigned int parameter
    *
    * - 0 = disable
-   * - 1 = enable (default)
+   * - 1 = enable without overlay (default)
+   * - 2 = enable with overlay
    */
   AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36,
 
@@ -382,7 +421,7 @@ enum aome_enc_control_id {
    * AV1 has a bitstream feature to reduce decoding dependency between frames
    * by turning off backward update of probability context used in encoding
    * and decoding. This allows staged parallel processing of more than one
-   * video frames in the decoder. This control function provides a mean to
+   * video frames in the decoder. This control function provides a means to
    * turn this feature on or off for bitstreams produced by encoder.
    *
    * - 0 = disable (default)
@@ -418,10 +457,12 @@ enum aome_enc_control_id {
    * AV1 has a segment based feature that allows encoder to adaptively change
    * quantization parameter for each segment within a frame to improve the
    * subjective quality. This control makes encoder operate in one of the
-   * several AQ_modes supported.
+   * several AQ modes supported.
    *
    * - 0 = disable (default)
-   * - 1 = enable
+   * - 1 = variance
+   * - 2 = complexity
+   * - 3 = cyclic refresh
    */
   AV1E_SET_AQ_MODE = 40,
 
@@ -429,7 +470,7 @@ enum aome_enc_control_id {
    * int parameter
    *
    * One AV1 encoder speed feature is to enable quality boost by lowering
-   * frame level Q periodically. This control function provides a mean to
+   * frame level Q periodically. This control function provides a means to
    * turn on/off this feature.
    *
    * - 0 = disable (default)
@@ -450,6 +491,7 @@ enum aome_enc_control_id {
    *
    *  - AOM_CONTENT_DEFAULT = Regular video content (default)
    *  - AOM_CONTENT_SCREEN  = Screen capture content
+   *  - AOM_CONTENT_FILM = Film content
    */
   AV1E_SET_TUNE_CONTENT = 43,
 
@@ -570,18 +612,18 @@ enum aome_enc_control_id {
   AV1E_SET_RENDER_SIZE = 53,
 
   /*!\brief Control to set target sequence level index for a certain operating
-   * point(OP), int parameter
-   * Possible values are in the form of "ABxy"(pad leading zeros if less than
-   * 4 digits).
+   * point (OP), int parameter
+   * Possible values are in the form of "ABxy".
    *  - AB: OP index.
-   *  - xy: Target level index for the OP. Can be values 0~23(corresponding to
-   *    level 2.0 ~ 7.3) or 24(keep level stats only for level monitoring) or
-   *    31(maximum level parameter, no level-based constraints).
+   *  - xy: Target level index for the OP. Can be values 0~23 (corresponding to
+   *    level 2.0 ~ 7.3, note levels 2.2, 2.3, 3.2, 3.3, 4.2, 4.3, 7.0, 7.1, 7.2
+   *    & 7.3 are undefined) or 24 (keep level stats only for level monitoring)
+   *    or 31 (maximum level parameter, no level-based constraints).
    *
    * E.g.:
-   * - "0" means target level index 0 for the 0th OP;
-   * - "111" means target level index 11 for the 1st OP;
-   * - "1021" means target level index 21 for the 10th OP.
+   * - "0" means target level index 0 (2.0) for the 0th OP;
+   * - "109" means target level index 9 (4.1) for the 1st OP;
+   * - "1019" means target level index 19 (6.3) for the 10th OP.
    *
    * If the target level is not specified for an OP, the maximum level parameter
    * of 31 is used as default.
@@ -617,7 +659,8 @@ enum aome_enc_control_id {
    * in-loop filter aiming to remove coding artifacts
    *
    * - 0 = disable
-   * - 1 = enable (default)
+   * - 1 = enable for all frames (default)
+   * - 2 = disable for non-reference frames
    */
   AV1E_SET_ENABLE_CDEF = 58,
 
@@ -626,6 +669,8 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ENABLE_RESTORATION = 59,
 
@@ -641,6 +686,8 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ENABLE_OBMC = 61,
 
@@ -847,7 +894,17 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ENABLE_FLIP_IDTX = 81,
 
-  /* Note: enum value 82 unused */
+  /*!\brief Codec control function to turn on / off rectangular transforms, int
+   * parameter
+   *
+   * This will enable or disable usage of rectangular transforms. NOTE:
+   * Rectangular transforms only enabled when corresponding rectangular
+   * partitions are.
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_RECT_TX = 82,
 
   /*!\brief Codec control function to turn on / off dist-wtd compound mode
    * at sequence level, int parameter
@@ -892,7 +949,7 @@ enum aome_enc_control_id {
   AV1E_SET_ENABLE_DUAL_FILTER = 86,
 
   /*!\brief Codec control function to turn on / off delta quantization in chroma
-   * planes usage for a sequence, int parameter
+   * planes for a sequence, int parameter
    *
    * - 0 = disable (default)
    * - 1 = enable
@@ -960,6 +1017,8 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ENABLE_GLOBAL_MOTION = 95,
 
@@ -968,6 +1027,8 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ENABLE_WARPED_MOTION = 96,
 
@@ -979,15 +1040,14 @@ enum aome_enc_control_id {
    *
    * - 0 = disable
    * - 1 = enable (default)
+   *
+   * \note Excluded from CONFIG_REALTIME_ONLY build.
    */
   AV1E_SET_ALLOW_WARPED_MOTION = 97,
 
   /*!\brief Codec control function to turn on / off filter intra usage at
    * sequence level, int parameter
    *
-   * \attention If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is
-   * forced to 0.
-   *
    * - 0 = disable
    * - 1 = enable (default)
    */
@@ -1025,8 +1085,6 @@ enum aome_enc_control_id {
   /*!\brief Codec control function to turn on / off frame superresolution, int
    * parameter
    *
-   * \attention If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
-   *
    * - 0 = disable
    * - 1 = enable (default)
    */
@@ -1061,7 +1119,9 @@ enum aome_enc_control_id {
    *
    * - 0 = deltaq signaling off
    * - 1 = use modulation to maximize objective quality (default)
-   * - 2 = use modulation to maximize perceptual quality
+   * - 2 = use modulation for local test
+   * - 3 = use modulation for key frame perceptual quality optimization
+   * - 4 = use modulation for user rating based perceptual quality optimization
    */
   AV1E_SET_DELTAQ_MODE = 107,
 
@@ -1143,7 +1203,7 @@ enum aome_enc_control_id {
   /*!\brief Control to select maximum height for the GF group pyramid structure,
    * unsigned int parameter
    *
-   * Valid range: 0..4
+   * Valid range: 0..5
    */
   AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123,
 
@@ -1158,9 +1218,6 @@ enum aome_enc_control_id {
      parameter */
   AV1E_SET_REDUCED_REFERENCE_SET = 125,
 
-  /* NOTE: enums 126-139 unused */
-  /* NOTE: Need a gap in enum values to avoud conflict with 128, 129, 130 */
-
   /*!\brief Control to set frequency of the cost updates for coefficients,
    * unsigned int parameter
    *
@@ -1169,7 +1226,7 @@ enum aome_enc_control_id {
    * - 2 = update at tile level
    * - 3 = turn off
    */
-  AV1E_SET_COEFF_COST_UPD_FREQ = 140,
+  AV1E_SET_COEFF_COST_UPD_FREQ = 126,
 
   /*!\brief Control to set frequency of the cost updates for mode, unsigned int
    * parameter
@@ -1179,7 +1236,7 @@ enum aome_enc_control_id {
    * - 2 = update at tile level
    * - 3 = turn off
    */
-  AV1E_SET_MODE_COST_UPD_FREQ = 141,
+  AV1E_SET_MODE_COST_UPD_FREQ = 127,
 
   /*!\brief Control to set frequency of the cost updates for motion vectors,
    * unsigned int parameter
@@ -1189,7 +1246,7 @@ enum aome_enc_control_id {
    * - 2 = update at tile level
    * - 3 = turn off
    */
-  AV1E_SET_MV_COST_UPD_FREQ = 142,
+  AV1E_SET_MV_COST_UPD_FREQ = 128,
 
   /*!\brief Control to set bit mask that specifies which tier each of the 32
    * possible operating points conforms to, unsigned int parameter
@@ -1197,37 +1254,37 @@ enum aome_enc_control_id {
    * - 0 = main tier (default)
    * - 1 = high tier
    */
-  AV1E_SET_TIER_MASK = 143,
+  AV1E_SET_TIER_MASK = 129,
 
   /*!\brief Control to set minimum compression ratio, unsigned int parameter
    * Take integer values. If non-zero, encoder will try to keep the compression
    * ratio of each frame to be higher than the given value divided by 100.
    * E.g. 850 means minimum compression ratio of 8.5.
    */
-  AV1E_SET_MIN_CR = 144,
+  AV1E_SET_MIN_CR = 130,
 
   /* NOTE: enums 145-149 unused */
 
   /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t*
    * parameter
    */
-  AV1E_SET_SVC_LAYER_ID = 150,
+  AV1E_SET_SVC_LAYER_ID = 131,
 
   /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t*
    * parameter
    */
-  AV1E_SET_SVC_PARAMS = 151,
+  AV1E_SET_SVC_PARAMS = 132,
 
   /*!\brief Codec control function to set reference frame config:
    * the ref_idx and the refresh flags for each buffer slot.
    * aom_svc_ref_frame_config_t* parameter
    */
-  AV1E_SET_SVC_REF_FRAME_CONFIG = 152,
+  AV1E_SET_SVC_REF_FRAME_CONFIG = 133,
 
   /*!\brief Codec control function to set the path to the VMAF model used when
    * tuning the encoder for VMAF, const char* parameter
    */
-  AV1E_SET_VMAF_MODEL_PATH = 153,
+  AV1E_SET_VMAF_MODEL_PATH = 134,
 
   /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder,
    * unsigned int parameter
@@ -1237,7 +1294,7 @@ enum aome_enc_control_id {
    *
    * \note This is only used in lightfield example test.
    */
-  AV1E_ENABLE_EXT_TILE_DEBUG = 154,
+  AV1E_ENABLE_EXT_TILE_DEBUG = 135,
 
   /*!\brief Codec control function to enable the superblock multipass unit test
    * in AV1 to ensure that the encoder does not leak state between different
@@ -1248,14 +1305,150 @@ enum aome_enc_control_id {
    *
    * \note This is only used in sb_multipass unit test.
    */
-  AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155,
+  AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136,
 
   /*!\brief Control to select minimum height for the GF group pyramid structure,
    * unsigned int parameter
    *
-   * Valid values: 0..4
+   * Valid values: 0..5
+   */
+  AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137,
+
+  /*!\brief Control to set average complexity of the corpus in the case of
+   * single pass vbr based on LAP, unsigned int parameter
+   */
+  AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138,
+
+  /*!\brief Control to get baseline gf interval
+   */
+  AV1E_GET_BASELINE_GF_INTERVAL = 139,
+
+  /*\brief Control to set encoding the denoised frame from denoise-noise-level
+   *
+   * - 0 = disabled/encode the original frame
+   * - 1 = enabled/encode the denoised frame (default)
    */
-  AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156,
+  AV1E_SET_ENABLE_DNL_DENOISING = 140,
+
+  /*!\brief Codec control function to turn on / off D45 to D203 intra mode
+   * usage, int parameter
+   *
+   * This will enable or disable usage of D45 to D203 intra modes, which are a
+   * subset of directional modes. This control has no effect if directional
+   * modes are disabled (AV1E_SET_ENABLE_DIRECTIONAL_INTRA set to 0).
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_DIAGONAL_INTRA = 141,
+
+  /*!\brief Control to set frequency of the cost updates for intrabc motion
+   * vectors, unsigned int parameter
+   *
+   * - 0 = update at SB level (default)
+   * - 1 = update at SB row level in tile
+   * - 2 = update at tile level
+   * - 3 = turn off
+   */
+  AV1E_SET_DV_COST_UPD_FREQ = 142,
+
+  /*!\brief Codec control to set the path for partition stats read and write.
+   * const char * parameter.
+   */
+  AV1E_SET_PARTITION_INFO_PATH = 143,
+
+  /*!\brief Codec control to use an external partition model
+   * A set of callback functions is passed through this control
+   * to let the encoder encode with given partitions.
+   */
+  AV1E_SET_EXTERNAL_PARTITION = 144,
+
+  /*!\brief Codec control function to turn on / off directional intra mode
+   * usage, int parameter
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_DIRECTIONAL_INTRA = 145,
+
+  /*!\brief Control to turn on / off transform size search.
+   *
+   * - 0 = disable, transforms always have the largest possible size
+   * - 1 = enable, search for the best transform size for each block (default)
+   */
+  AV1E_SET_ENABLE_TX_SIZE_SEARCH = 146,
+
+  /*!\brief Codec control function to set reference frame compound prediction.
+   * aom_svc_ref_frame_comp_pred_t* parameter
+   */
+  AV1E_SET_SVC_REF_FRAME_COMP_PRED = 147,
+
+  /*!\brief Set --deltaq-mode strength.
+   *
+   * Valid range: [0, 1000]
+   */
+  AV1E_SET_DELTAQ_STRENGTH = 148,
+
+  /*!\brief Codec control to control loop filter
+   *
+   * - 0 = Loop filter is disabled for all frames
+   * - 1 = Loop filter is enabled for all frames
+   * - 2 = Loop filter is disabled for non-reference frames
+   * - 3 = Loop filter is disabled for the frames with low motion
+   */
+  AV1E_SET_LOOPFILTER_CONTROL = 149,
+
+  /*!\brief Codec control function to get the loopfilter chosen by the encoder,
+   * int* parameter
+   */
+  AOME_GET_LOOPFILTER_LEVEL = 150,
+
+  /*!\brief Codec control to automatically turn off several intra coding tools,
+   * unsigned int parameter
+   * - 0 = do not use the feature
+   * - 1 = enable the automatic decision to turn off several intra tools
+   */
+  AV1E_SET_AUTO_INTRA_TOOLS_OFF = 151,
+
+  /*!\brief Codec control function to set flag for rate control used by external
+   * encoders.
+   * - 1 = Enable rate control for external encoders. This will disable content
+   * dependency in rate control and cyclic refresh.
+   * - 0 = Default. Disable rate control for external encoders.
+   */
+  AV1E_SET_RTC_EXTERNAL_RC = 152,
+
+  /*!\brief Codec control function to enable frame parallel multi-threading
+   * of the encoder, unsigned int parameter
+   *
+   * - 0 = disable (default)
+   * - 1 = enable
+   */
+  AV1E_SET_FP_MT = 153,
+
+  /*!\brief Codec control to enable actual frame parallel encode or
+   * simulation of frame parallel encode in FPMT unit test, unsigned int
+   * parameter
+   *
+   * - 0 = simulate frame parallel encode
+   * - 1 = actual frame parallel encode (default)
+   *
+   * \note This is only used in FPMT unit test.
+   */
+  AV1E_SET_FP_MT_UNIT_TEST = 154,
+
+  /*!\brief Codec control function to get the target sequence level index for
+   * each operating point. int* parameter. There can be at most 32 operating
+   * points. The results will be written into a provided integer array of
+   * sufficient size. If a target level is not set, the result will be 31.
+   * Please refer to https://aomediacodec.github.io/av1-spec/#levels for more
+   * details on level definitions and indices.
+   */
+  AV1E_GET_TARGET_SEQ_LEVEL_IDX = 155,
+
+  // Any new encoder control IDs should be added above.
+  // Maximum allowed encoder control ID is 229.
+  // No encoder control ID should be added below.
 };
 
 /*!\brief aom 1-D scaling mode
@@ -1266,7 +1459,10 @@ typedef enum aom_scaling_mode_1d {
   AOME_NORMAL = 0,
   AOME_FOURFIVE = 1,
   AOME_THREEFIVE = 2,
-  AOME_ONETWO = 3
+  AOME_THREEFOUR = 3,
+  AOME_ONEFOUR = 4,
+  AOME_ONEEIGHT = 5,
+  AOME_ONETWO = 6
 } AOM_SCALING_MODE;
 
 /*!\brief Max number of segments
@@ -1323,6 +1519,7 @@ typedef struct aom_scaling_mode {
 typedef enum {
   AOM_CONTENT_DEFAULT,
   AOM_CONTENT_SCREEN,
+  AOM_CONTENT_FILM,
   AOM_CONTENT_INVALID
 } aom_tune_content;
 
@@ -1344,9 +1541,28 @@ typedef enum {
   /* NOTE: enums 2 and 3 unused */
   AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
   AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
-  AOM_TUNE_VMAF_MAX_GAIN = 6
+  AOM_TUNE_VMAF_MAX_GAIN = 6,
+  AOM_TUNE_VMAF_NEG_MAX_GAIN = 7,
+  AOM_TUNE_BUTTERAUGLI = 8,
 } aom_tune_metric;
 
+/*!\brief Distortion metric to use for RD optimization.
+ *
+ * Changes the encoder to use a different distortion metric for RD search. Note
+ * that this value operates on a "lower level" compared to aom_tune_metric - it
+ * affects the distortion metric inside a block, while aom_tune_metric only
+ * affects RD across blocks.
+ *
+ */
+typedef enum {
+  // Use PSNR for in-block rate-distortion optimization.
+  AOM_DIST_METRIC_PSNR,
+  // Use quantization matrix-weighted PSNR for in-block rate-distortion
+  // optimization. If --enable-qm=1 is not specified, this falls back to
+  // behaving in the same way as AOM_DIST_METRIC_PSNR.
+  AOM_DIST_METRIC_QM_PSNR,
+} aom_dist_metric;
+
 #define AOM_MAX_LAYERS 32   /**< Max number of layers */
 #define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
 #define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
@@ -1381,6 +1597,13 @@ typedef struct aom_svc_ref_frame_config {
   int refresh[8]; /**< Refresh flag for each of the 8 slots. */
 } aom_svc_ref_frame_config_t;
 
+/*!brief Parameters for setting ref frame compound prediction */
+typedef struct aom_svc_ref_frame_comp_pred {
+  // Use compound prediction for the ref_frame pairs GOLDEN_LAST (0),
+  // LAST2_LAST (1), and ALTREF_LAST (2).
+  int use_comp_pred[3]; /**<Compound reference flag. */
+} aom_svc_ref_frame_comp_pred_t;
+
 /*!\cond */
 /*!\brief Encoder control function parameter type
  *
@@ -1414,15 +1637,18 @@ AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
 
-AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
-#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
-
 AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
 #define AOM_CTRL_AOME_SET_SHARPNESS
 
 AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
 #define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
 
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
+
 AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
 
@@ -1435,6 +1661,25 @@ AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
 AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
 #define AOM_CTRL_AOME_SET_CQ_LEVEL
 
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int)
+#define AOM_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_INTER_BITRATE_PCT
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
+#define AOM_CTRL_AV1E_SET_LOSSLESS
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int)
 #define AOM_CTRL_AV1E_SET_ROW_MT
 
@@ -1450,26 +1695,68 @@ AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_KEYFRAME_FILTERING, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_KEYFRAME_FILTERING
 
-AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
-#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
 
-AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
-#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int)
+#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
 
-AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
-#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int)
+#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
 
-AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
-#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
+AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_AQ_MODE
 
-AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int)
-#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
 
-AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
+AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
+#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
 
-AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
-#define AOM_CTRL_AV1E_SET_LOSSLESS
+AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
+#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int)
+#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int)
+#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int)
+#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AV1E_GET_ACTIVEMAP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
+#define AOM_CTRL_AV1E_SET_COLOR_RANGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+#define AOM_CTRL_AV1E_SET_RENDER_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
+#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
 
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_CDEF
@@ -1489,6 +1776,7 @@ AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_QM
 
+// TODO(aomedia:3231): Remove these two lines.
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8
 
@@ -1513,9 +1801,6 @@ AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
 #define AOM_CTRL_AV1E_SET_MTU
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */
-#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
-
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS
 
@@ -1543,6 +1828,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_TX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_TX
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP
 
@@ -1615,77 +1903,20 @@ AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int)
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA
 
-AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
-#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
-
-AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int)
-#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
-
-AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int)
-#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
-
-AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
-#define AOM_CTRL_AV1E_SET_AQ_MODE
-
 AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_DELTAQ_MODE
 
 AOM_CTRL_USE_TYPE(AV1E_SET_DELTALF_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_DELTALF_MODE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
-#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
-
-AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
-#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
-
-AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
-#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
-
-AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int)
-#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES
-
-AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int)
-#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS
-
-AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int)
-#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS
-
-AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
-#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
-
-AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
-#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
-
-AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
-#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
-
-AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
-#define AOM_CTRL_AV1E_GET_ACTIVEMAP
-
-AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
-#define AOM_CTRL_AV1E_SET_COLOR_RANGE
-
-#define AOM_CTRL_AV1E_SET_RENDER_SIZE
-AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
-
-AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
-#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
-
-AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
-#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
-
 AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
 
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
 
-AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int)
-#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG
-
-AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *)
-#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH
+AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */
+#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
 
 AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
 #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
@@ -1693,9 +1924,6 @@ AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
 AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
 #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
 
-AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
-#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
-
 AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int)
 #define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
 
@@ -1723,9 +1951,6 @@ AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int)
 AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int)
 #define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
 
-AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int)
-#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT
-
 AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
 #define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT
 
@@ -1744,9 +1969,6 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int)
 #define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
-#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
-
 AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int)
 #define AOM_CTRL_AV1E_SET_TIER_MASK
 
@@ -1754,17 +1976,89 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MIN_CR, unsigned int)
 #define AOM_CTRL_AV1E_SET_MIN_CR
 
 AOM_CTRL_USE_TYPE(AV1E_SET_SVC_LAYER_ID, aom_svc_layer_id_t *)
+#define AOM_CTRL_AV1E_SET_SVC_LAYER_ID
+// TODO(aomedia:3231): Deprecated. Remove it.
 #define AOME_CTRL_AV1E_SET_SVC_LAYER_ID
 
 AOM_CTRL_USE_TYPE(AV1E_SET_SVC_PARAMS, aom_svc_params_t *)
+#define AOM_CTRL_AV1E_SET_SVC_PARAMS
+// TODO(aomedia:3231): Deprecated. Remove it.
 #define AOME_CTRL_AV1E_SET_SVC_PARAMS
 
 AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_CONFIG, aom_svc_ref_frame_config_t *)
+#define AOM_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG
+// TODO(aomedia:3231): Deprecated. Remove it.
 #define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG
 
+AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG
+
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST
 
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, unsigned int)
+#define AOM_CTRL_AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP
+
+AOM_CTRL_USE_TYPE(AV1E_GET_BASELINE_GF_INTERVAL, int *)
+#define AOM_CTRL_AV1E_GET_BASELINE_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DNL_DENOISING, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIAGONAL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIAGONAL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_DV_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_PARTITION_INFO_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_PARTITION_INFO_PATH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_EXTERNAL_PARTITION, aom_ext_part_funcs_t *)
+#define AOM_CTRL_AV1E_SET_EXTERNAL_PARTITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIRECTIONAL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX_SIZE_SEARCH, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX_SIZE_SEARCH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                  aom_svc_ref_frame_comp_pred_t *)
+#define AOM_CTRL_AV1E_SET_SVC_REF_FRAME_COMP_PRED
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_COMP_PRED
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_STRENGTH, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTAQ_STRENGTH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOOPFILTER_CONTROL, int)
+#define AOM_CTRL_AV1E_SET_LOOPFILTER_CONTROL
+
+AOM_CTRL_USE_TYPE(AOME_GET_LOOPFILTER_LEVEL, int *)
+#define AOM_CTRL_AOME_GET_LOOPFILTER_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_AUTO_INTRA_TOOLS_OFF, unsigned int)
+#define AOM_CTRL_AV1E_SET_AUTO_INTRA_TOOLS_OFF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_RTC_EXTERNAL_RC, int)
+#define AOM_CTRL_AV1E_SET_RTC_EXTERNAL_RC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FP_MT, unsigned int)
+#define AOM_CTRL_AV1E_SET_FP_MT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FP_MT_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FP_MT_UNIT_TEST
+
+AOM_CTRL_USE_TYPE(AV1E_GET_TARGET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_TARGET_SEQ_LEVEL_IDX
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/media/libaom/src/aom/aomdx.h b/media/libaom/src/aom/aomdx.h
index 8cd5de3959..02ea19597c 100644
--- a/media/libaom/src/aom/aomdx.h
+++ b/media/libaom/src/aom/aomdx.h
@@ -33,9 +33,17 @@ extern "C" {
  * This interface provides the capability to decode AV1 streams.
  * @{
  */
+
+/*!\brief A single instance of the AV1 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_dx().
+ */
 extern aom_codec_iface_t aom_codec_av1_dx_algo;
+/*!\brief The interface to the AV1 decoder.
+ */
 extern aom_codec_iface_t *aom_codec_av1_dx(void);
-/*!@} - end algorithm interface member group*/
+
+/*!@} - end algorithm interface member group */
 
 /** Data structure that stores bit accounting for debug
  */
@@ -89,6 +97,81 @@ typedef struct aom_tile_data {
   size_t extra_size;
 } aom_tile_data;
 
+/*!\brief Max number of tile columns
+ *
+ * This is the limit of number of tile columns allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_COLS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_COLS 64
+/*!\brief Max number of tile rows
+ *
+ * This is the limit of number of tile rows allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_ROWS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_ROWS 64
+
+/*!\brief Structure to hold information about tiles in a frame.
+ *
+ * Defines a structure to hold a frame's tile information, namely
+ * number of tile columns, number of tile_rows, and the width and
+ * height of each tile.
+ */
+typedef struct aom_tile_info {
+  /*! Indicates the number of tile columns. */
+  int tile_columns;
+  /*! Indicates the number of tile rows. */
+  int tile_rows;
+  /*! Indicates the tile widths in units of SB. */
+  int tile_widths[AOM_MAX_TILE_COLS];
+  /*! Indicates the tile heights in units of SB. */
+  int tile_heights[AOM_MAX_TILE_ROWS];
+  /*! Indicates the number of tile groups present in a frame. */
+  int num_tile_groups;
+} aom_tile_info;
+
+/*!\brief Structure to hold information about still image coding.
+ *
+ * Defines a structure to hold a information regarding still picture
+ * and its header type.
+ */
+typedef struct aom_still_picture_info {
+  /*! Video is a single frame still picture */
+  int is_still_picture;
+  /*! Use full header for still picture */
+  int is_reduced_still_picture_hdr;
+} aom_still_picture_info;
+
+/*!\brief Structure to hold information about S_FRAME.
+ *
+ * Defines a structure to hold a information regarding S_FRAME
+ * and its position.
+ */
+typedef struct aom_s_frame_info {
+  /*! Indicates if current frame is S_FRAME */
+  int is_s_frame;
+  /*! Indicates if current S_FRAME is present at ALTREF frame*/
+  int is_s_frame_at_altref;
+} aom_s_frame_info;
+
+/*!\brief Structure to hold information about screen content tools.
+ *
+ * Defines a structure to hold information about screen content
+ * tools, namely: allow_screen_content_tools, allow_intrabc, and
+ * force_integer_mv.
+ */
+typedef struct aom_screen_content_tools_info {
+  /*! Are screen content tools allowed */
+  int allow_screen_content_tools;
+  /*! Is intrabc allowed */
+  int allow_intrabc;
+  /*! Is integer mv forced */
+  int force_integer_mv;
+} aom_screen_content_tools_info;
+
 /*!\brief Structure to hold the external reference frame pointer.
  *
  * Define a structure to hold the external reference frame pointer.
@@ -105,6 +188,7 @@ typedef struct av1_ext_ref_frame {
  *
  * This set of macros define the control functions available for the AOM
  * decoder interface.
+ * The range for decoder control ID is >= 256.
  *
  * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
  */
@@ -125,14 +209,16 @@ enum aom_dec_control_id {
   AOMD_GET_LAST_REF_USED,
 
   /*!\brief Codec control function to get the dimensions that the current
-   * frame is decoded at, int* parameter. This may be different to the
-   * intended display size for the frame as specified in the wrapper or frame
-   * header (see AV1D_GET_DISPLAY_SIZE).
+   * frame is decoded at, int* parameter
+   *
+   * This may be different to the intended display size for the frame as
+   * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE).
    */
   AV1D_GET_FRAME_SIZE,
 
   /*!\brief Codec control function to get the current frame's intended display
-   * dimensions (as specified in the wrapper or frame header), int* parameter.
+   * dimensions (as specified in the wrapper or frame header), int* parameter
+   *
    * This may be different to the decoded dimensions of this frame (see
    * AV1D_GET_FRAME_SIZE).
    */
@@ -148,12 +234,13 @@ enum aom_dec_control_id {
    */
   AV1D_GET_IMG_FORMAT,
 
-  /*!\brief Codec control function to get the size of the tile, unsigned int
-    parameter */
+  /*!\brief Codec control function to get the size of the tile, unsigned int*
+   * parameter
+   */
   AV1D_GET_TILE_SIZE,
 
-  /*!\brief Codec control function to get the tile count in a tile list, int*
-   * parameter
+  /*!\brief Codec control function to get the tile count in a tile list,
+   * unsigned int* parameter
    */
   AV1D_GET_TILE_COUNT,
 
@@ -194,8 +281,8 @@ enum aom_dec_control_id {
    * The caller should ensure that AOM_CODEC_OK is returned before attempting
    * to dereference the Accounting pointer.
    *
-   * \attention When compiled without --enable-accounting, this returns
-   * AOM_CODEC_INCAPABLE.
+   * \attention When configured with -DCONFIG_ACCOUNTING=0, the default, this
+   * returns AOM_CODEC_INCAPABLE.
    */
   AV1_GET_ACCOUNTING,
 
@@ -217,7 +304,8 @@ enum aom_dec_control_id {
   AV1_SET_DECODE_TILE_ROW,
   AV1_SET_DECODE_TILE_COL,
 
-  /*!\brief Codec control function to set the tile coding mode, int parameter
+  /*!\brief Codec control function to set the tile coding mode, unsigned int
+   * parameter
    *
    * - 0 = tiles are coded in normal tile mode
    * - 1 = tiles are coded in large-scale tile mode
@@ -225,7 +313,7 @@ enum aom_dec_control_id {
   AV1_SET_TILE_MODE,
 
   /*!\brief Codec control function to get the frame header information of an
-   * encoded frame, unsigned int* parameter
+   * encoded frame, aom_tile_data* parameter
    */
   AV1D_GET_FRAME_HEADER_INFO,
 
@@ -271,7 +359,7 @@ enum aom_dec_control_id {
   AV1D_SET_OPERATING_POINT,
 
   /*!\brief Codec control function to indicate whether to output one frame per
-   * temporal unit (the default), or one frame per spatial layer. int parameter
+   * temporal unit (the default), or one frame per spatial layer, int parameter
    *
    * In a scalable stream, each temporal unit corresponds to a single "frame"
    * of video, and within a temporal unit there may be multiple spatial layers
@@ -285,7 +373,7 @@ enum aom_dec_control_id {
   /*!\brief Codec control function to set an aom_inspect_cb callback that is
    * invoked each time a frame is decoded, aom_inspect_init* parameter
    *
-   * \attention When compiled without --enable-inspection, this
+   * \attention When configured with -DCONFIG_INSPECTION=0, the default, this
    * returns AOM_CODEC_INCAPABLE.
    */
   AV1_SET_INSPECTION_CALLBACK,
@@ -298,7 +386,83 @@ enum aom_dec_control_id {
    */
   AV1D_SET_SKIP_FILM_GRAIN,
 
-  AOM_DECODER_CTRL_ID_MAX,
+  /*!\brief Codec control function to check the presence of forward key frames,
+   * int* parameter
+   */
+  AOMD_GET_FWD_KF_PRESENT,
+
+  /*!\brief Codec control function to get the frame flags of the previous frame
+   * decoded, int* parameter
+   *
+   * This will return a flag of type aom_codec_frame_flags_t.
+   */
+  AOMD_GET_FRAME_FLAGS,
+
+  /*!\brief Codec control function to check the presence of altref frames, int*
+   * parameter
+   */
+  AOMD_GET_ALTREF_PRESENT,
+
+  /*!\brief Codec control function to get tile information of the previous frame
+   * decoded, aom_tile_info* parameter
+   *
+   * This will return a struct of type aom_tile_info.
+   */
+  AOMD_GET_TILE_INFO,
+
+  /*!\brief Codec control function to get screen content tools information,
+   * aom_screen_content_tools_info* parameter
+   *
+   * It returns a struct of type aom_screen_content_tools_info, which contains
+   * the header flags allow_screen_content_tools, allow_intrabc, and
+   * force_integer_mv.
+   */
+  AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+
+  /*!\brief Codec control function to get the still picture coding information,
+   * aom_still_picture_info* parameter
+   */
+  AOMD_GET_STILL_PICTURE,
+
+  /*!\brief Codec control function to get superblock size,
+   * aom_superblock_size_t* parameter
+   *
+   * It returns an enum, indicating the superblock size read from the sequence
+   * header(0 for BLOCK_64X64 and 1 for BLOCK_128X128)
+   */
+  AOMD_GET_SB_SIZE,
+
+  /*!\brief Codec control function to check if the previous frame
+   * decoded has show existing frame flag set, int* parameter
+   */
+  AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+
+  /*!\brief Codec control function to get the S_FRAME coding information,
+   * aom_s_frame_info* parameter
+   */
+  AOMD_GET_S_FRAME_INFO,
+
+  /*!\brief Codec control function to get the show frame flag, int* parameter
+   */
+  AOMD_GET_SHOW_FRAME_FLAG,
+
+  /*!\brief Codec control function to get the base q index of a frame, int*
+   * parameter
+   */
+  AOMD_GET_BASE_Q_IDX,
+
+  /*!\brief Codec control function to get the order hint of a frame, unsigned
+   * int* parameter
+   */
+  AOMD_GET_ORDER_HINT,
+
+  /*!\brief Codec control function to get the info of a 4x4 block.
+   * Parameters: int mi_row, int mi_col, and MB_MODE_INFO*.
+   *
+   * \note This only returns a shallow copy, so all pointer members should not
+   * be used.
+   */
+  AV1D_GET_MI_INFO,
 };
 
 /*!\cond */
@@ -322,8 +486,8 @@ AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_USED
 
-AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
-#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_FRAME_SIZE
 
 AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
@@ -340,15 +504,18 @@ AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
 AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *)
 #define AOM_CTRL_AV1D_GET_TILE_COUNT
 
-AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
-#define AOM_CTRL_AV1D_GET_FRAME_SIZE
-
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
 #define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
 
+AOM_CTRL_USE_TYPE(AV1_SET_SKIP_LOOP_FILTER, int)
+#define AOM_CTRL_AV1_SET_SKIP_LOOP_FILTER
+
 AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
 #define AOM_CTRL_AV1_GET_ACCOUNTING
 
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
+
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
 
@@ -373,9 +540,6 @@ AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
 AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
 #define AOM_CTRL_AV1D_SET_ROW_MT
 
-AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
-#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
-
 AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
 #define AOM_CTRL_AV1D_SET_IS_ANNEXB
 
@@ -387,9 +551,52 @@ AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int)
 
 AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
 #define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
+
+AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
+#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FWD_KF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_FWD_KF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_FLAGS, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_FLAGS
+
+AOM_CTRL_USE_TYPE(AOMD_GET_ALTREF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_ALTREF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_TILE_INFO, aom_tile_info *)
+#define AOM_CTRL_AOMD_GET_TILE_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+                  aom_screen_content_tools_info *)
+#define AOM_CTRL_AOMD_GET_SCREEN_CONTENT_TOOLS_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_STILL_PICTURE, aom_still_picture_info *)
+#define AOM_CTRL_AOMD_GET_STILL_PICTURE
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SB_SIZE, aom_superblock_size_t *)
+#define AOMD_CTRL_AOMD_GET_SB_SIZE
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_EXISTING_FRAME_FLAG, int *)
+#define AOMD_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG
+
+AOM_CTRL_USE_TYPE(AOMD_GET_S_FRAME_INFO, aom_s_frame_info *)
+#define AOMD_CTRL_AOMD_GET_S_FRAME_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_FRAME_FLAG, int *)
+#define AOM_CTRL_AOMD_GET_SHOW_FRAME_FLAG
+
+AOM_CTRL_USE_TYPE(AOMD_GET_BASE_Q_IDX, int *)
+#define AOM_CTRL_AOMD_GET_BASE_Q_IDX
+
+AOM_CTRL_USE_TYPE(AOMD_GET_ORDER_HINT, unsigned int *)
+#define AOM_CTRL_AOMD_GET_ORDER_HINT
+
+// The AOM_CTRL_USE_TYPE macro can't be used with AV1D_GET_MI_INFO because
+// AV1D_GET_MI_INFO takes more than one parameter.
+#define AOM_CTRL_AV1D_GET_MI_INFO
 /*!\endcond */
 /*! @} - end defgroup aom_decoder */
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/aom/exports_com b/media/libaom/src/aom/exports_com
index 6f796f5db0..266e2943a3 100644
--- a/media/libaom/src/aom/exports_com
+++ b/media/libaom/src/aom/exports_com
@@ -6,6 +6,7 @@ text aom_codec_error
 text aom_codec_error_detail
 text aom_codec_get_caps
 text aom_codec_iface_name
+text aom_codec_set_option
 text aom_codec_version
 text aom_codec_version_extra_str
 text aom_codec_version_str
diff --git a/media/libaom/src/aom/internal/aom_codec_internal.h b/media/libaom/src/aom/internal/aom_codec_internal.h
index efe09acc91..fc2975d5e1 100644
--- a/media/libaom/src/aom/internal/aom_codec_internal.h
+++ b/media/libaom/src/aom/internal/aom_codec_internal.h
@@ -28,13 +28,15 @@
  *     </pre>
  *
  * An application instantiates a specific decoder instance by using
- * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_dec_init() and a pointer to the algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern aom_codec_iface_t my_codec;
  *       {
  *           aom_codec_ctx_t algo;
- *           res = aom_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
@@ -45,6 +47,7 @@
 #define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
 #include "../aom_decoder.h"
 #include "../aom_encoder.h"
+#include "common/args_helper.h"
 #include <stdarg.h>
 
 #ifdef __cplusplus
@@ -66,7 +69,7 @@ typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
 /*!\brief init function pointer prototype
  *
  * Performs algorithm-specific initialization of the decoder context. This
- * function is called by the generic aom_codec_init() wrapper function, so
+ * function is called by aom_codec_dec_init() and aom_codec_enc_init(), so
  * plugins implementing this interface may trust the input parameters to be
  * properly initialized.
  *
@@ -151,22 +154,45 @@ typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
 typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
                                                   va_list ap);
 
+/*!\brief codec option setter function pointer prototype
+ * This function is used to set a codec option using a key (option name) & value
+ * pair.
+ *
+ * \param[in]     ctx              Pointer to this instance's context
+ * \param[in]     name             A string of the option's name (key)
+ * \param[in]     value            A string of the value to be set to
+ *
+ * \retval #AOM_CODEC_OK
+ *     The option is successfully set to the value
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     The data was not valid.
+ */
+typedef aom_codec_err_t (*aom_codec_set_option_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                     const char *name,
+                                                     const char *value);
+
 /*!\brief control function pointer mapping
  *
  * This structure stores the mapping between control identifiers and
  * implementing functions. Each algorithm provides a list of these
- * mappings. This list is searched by the aom_codec_control() wrapper
+ * mappings. This list is searched by the aom_codec_control()
  * function to determine which function to invoke. The special
- * value {0, NULL} is used to indicate end-of-list, and must be
- * present. The special value {0, <non-null>} can be used as a catch-all
- * mapping. This implies that ctrl_id values chosen by the algorithm
- * \ref MUST be non-zero.
+ * value defined by CTRL_MAP_END is used to indicate end-of-list, and must be
+ * present. It can be tested with the at_ctrl_map_end function. Note that
+ * ctrl_id values \ref MUST be non-zero.
  */
 typedef const struct aom_codec_ctrl_fn_map {
   int ctrl_id;
   aom_codec_control_fn_t fn;
 } aom_codec_ctrl_fn_map_t;
 
+#define CTRL_MAP_END \
+  { 0, NULL }
+
+static AOM_INLINE int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) {
+  return e->ctrl_id == 0 && e->fn == NULL;
+}
+
 /*!\brief decode data function pointer prototype
  *
  * Processes a buffer of coded data. This function is called by the generic
@@ -252,7 +278,7 @@ typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
 typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
     aom_codec_alg_priv_t *ctx);
 
-/*!\brief Decoder algorithm interface interface
+/*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
  */
@@ -284,6 +310,7 @@ struct aom_codec_iface {
     aom_codec_get_preview_frame_fn_t
         get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
   } enc;
+  aom_codec_set_option_fn_t set_option;
 };
 
 /*!\brief Instance private storage
@@ -307,19 +334,6 @@ struct aom_codec_priv {
 
 #define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
 
-/* CODEC_INTERFACE convenience macro
- *
- * By convention, each codec interface is a struct with extern linkage, where
- * the symbol is suffixed with _algo. A getter function is also defined to
- * return a pointer to the struct, since in some cases it's easier to work
- * with text symbols than data symbols (see issue #169). This function has
- * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
- * macro is provided to define this getter function automatically.
- */
-#define CODEC_INTERFACE(id)                          \
-  aom_codec_iface_t *id(void) { return &id##_algo; } \
-  aom_codec_iface_t id##_algo
-
 /* Internal Utility Functions
  *
  * The following functions are intended to be used inside algorithms as
@@ -356,7 +370,7 @@ const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
 struct aom_internal_error_info {
   aom_codec_err_t error_code;
   int has_detail;
-  char detail[80];
+  char detail[ARG_ERR_MSG_MAX_LEN];
   int setjmp;  // Boolean: whether 'jmp' is valid.
   jmp_buf jmp;
 };
@@ -369,9 +383,21 @@ struct aom_internal_error_info {
 #endif
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef LIBAOM_FORMAT_PRINTF
+#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 void aom_internal_error(struct aom_internal_error_info *info,
-                        aom_codec_err_t error, const char *fmt,
-                        ...) CLANG_ANALYZER_NORETURN;
+                        aom_codec_err_t error, const char *fmt, ...)
+    LIBAOM_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN;
 
 void aom_merge_corrupted_flag(int *corrupted, int value);
 #ifdef __cplusplus
diff --git a/media/libaom/src/aom/internal/aom_image_internal.h b/media/libaom/src/aom/internal/aom_image_internal.h
index 7f2fd1891d..1b04c9ec3f 100644
--- a/media/libaom/src/aom/internal/aom_image_internal.h
+++ b/media/libaom/src/aom/internal/aom_image_internal.h
@@ -32,8 +32,8 @@ struct aom_metadata_array {
 /*!\brief Alloc memory for aom_metadata_array struct.
  *
  * Allocate memory for aom_metadata_array struct.
- * If sz is 0 the aom_metadata_array structs internal buffer list will be NULL,
- * but the aom_metadata_array struct itself will still be allocated.
+ * If sz is 0 the aom_metadata_array struct's internal buffer list will be
+ * NULL, but the aom_metadata_array struct itself will still be allocated.
  * Returns a pointer to the allocated struct or NULL on failure.
  *
  * \param[in]    sz       Size of internal metadata list buffer
diff --git a/media/libaom/src/aom/src/aom_codec.c b/media/libaom/src/aom/src/aom_codec.c
index 196ab8354d..bc2039a6a8 100644
--- a/media/libaom/src/aom/src/aom_codec.c
+++ b/media/libaom/src/aom/src/aom_codec.c
@@ -22,8 +22,6 @@
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_codec_internal.h"
 
-#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
-
 int aom_codec_version(void) { return VERSION_PACKED; }
 
 const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
@@ -67,22 +65,19 @@ const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
 }
 
 aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
-  aom_codec_err_t res;
-
-  if (!ctx)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv)
-    res = AOM_CODEC_ERROR;
-  else {
-    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
-
-    ctx->iface = NULL;
-    ctx->name = NULL;
-    ctx->priv = NULL;
-    res = AOM_CODEC_OK;
+  if (!ctx) {
+    return AOM_CODEC_INVALID_PARAM;
   }
-
-  return SAVE_STATUS(ctx, res);
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = AOM_CODEC_ERROR;
+    return AOM_CODEC_ERROR;
+  }
+  ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+  ctx->iface = NULL;
+  ctx->name = NULL;
+  ctx->priv = NULL;
+  ctx->err = AOM_CODEC_OK;
+  return AOM_CODEC_OK;
 }
 
 aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
@@ -90,30 +85,48 @@ aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
 }
 
 aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
-  aom_codec_err_t res;
-
-  if (!ctx || !ctrl_id)
-    res = AOM_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
-    res = AOM_CODEC_ERROR;
-  else {
-    aom_codec_ctrl_fn_map_t *entry;
-
-    res = AOM_CODEC_ERROR;
-
-    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
-      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
-        va_list ap;
-
-        va_start(ap, ctrl_id);
-        res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
-        va_end(ap);
-        break;
-      }
+  if (!ctx) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  // Control ID must be non-zero.
+  if (!ctrl_id) {
+    ctx->err = AOM_CODEC_INVALID_PARAM;
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
+    ctx->err = AOM_CODEC_ERROR;
+    return AOM_CODEC_ERROR;
+  }
+
+  // "ctrl_maps" is an array of (control ID, function pointer) elements,
+  // with CTRL_MAP_END as a sentinel.
+  for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps;
+       !at_ctrl_map_end(entry); ++entry) {
+    if (entry->ctrl_id == ctrl_id) {
+      va_list ap;
+      va_start(ap, ctrl_id);
+      ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
+      va_end(ap);
+      return ctx->err;
     }
   }
+  ctx->err = AOM_CODEC_ERROR;
+  ctx->priv->err_detail = "Invalid control ID";
+  return AOM_CODEC_ERROR;
+}
 
-  return SAVE_STATUS(ctx, res);
+aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name,
+                                     const char *value) {
+  if (!ctx) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (!ctx->iface || !ctx->priv || !ctx->iface->set_option) {
+    ctx->err = AOM_CODEC_ERROR;
+    return AOM_CODEC_ERROR;
+  }
+  ctx->err =
+      ctx->iface->set_option((aom_codec_alg_priv_t *)ctx->priv, name, value);
+  return ctx->err;
 }
 
 void aom_internal_error(struct aom_internal_error_info *info,
diff --git a/media/libaom/src/aom/src/aom_encoder.c b/media/libaom/src/aom/src/aom_encoder.c
index bb51c9388f..6ec2f349df 100644
--- a/media/libaom/src/aom/src/aom_encoder.c
+++ b/media/libaom/src/aom/src/aom_encoder.c
@@ -39,8 +39,25 @@ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
                                        const aom_codec_enc_cfg_t *cfg,
                                        aom_codec_flags_t flags, int ver) {
   aom_codec_err_t res;
-
-  if (ver != AOM_ENCODER_ABI_VERSION)
+  // The value of AOM_ENCODER_ABI_VERSION in libaom v3.0.0 and v3.1.0 - v3.1.3.
+  //
+  // We are compatible with these older libaom releases. AOM_ENCODER_ABI_VERSION
+  // was incremented after these releases for two reasons:
+  // 1. AOM_ENCODER_ABI_VERSION takes contribution from
+  //    AOM_EXT_PART_ABI_VERSION. The external partition API is still
+  //    experimental, so it should not be considered as part of the stable ABI.
+  //    fd9ed8366 External partition: Define APIs
+  //    https://aomedia-review.googlesource.com/c/aom/+/135663
+  // 2. As a way to detect the presence of speeds 7-9 in all-intra mode. I (wtc)
+  //    suggested this change because I misunderstood how
+  //    AOM_ENCODER_ABI_VERSION was used.
+  //    bbdfa68d1 AllIntra: Redefine all-intra mode speed features for speed 7+
+  //    https://aomedia-review.googlesource.com/c/aom/+/140624
+  const int aom_encoder_abi_version_25 = 25;
+
+  // TODO(bug aomedia:3228): Remove the check for aom_encoder_abi_version_25 in
+  // libaom v4.0.0.
+  if (ver != AOM_ENCODER_ABI_VERSION && ver != aom_encoder_abi_version_25)
     res = AOM_CODEC_ABI_MISMATCH;
   else if (!ctx || !iface || !cfg)
     res = AOM_CODEC_INVALID_PARAM;
@@ -50,7 +67,11 @@ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
     res = AOM_CODEC_INCAPABLE;
   else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
     res = AOM_CODEC_INCAPABLE;
-  else {
+  else if (cfg->g_bit_depth > 8 && (flags & AOM_CODEC_USE_HIGHBITDEPTH) == 0) {
+    res = AOM_CODEC_INVALID_PARAM;
+    ctx->err_detail =
+        "High bit-depth used without the AOM_CODEC_USE_HIGHBITDEPTH flag.";
+  } else {
     ctx->iface = iface;
     ctx->name = iface->name;
     ctx->priv = NULL;
diff --git a/media/libaom/src/aom/src/aom_image.c b/media/libaom/src/aom/src/aom_image.c
index cd0b5ed835..8e94d5dd4f 100644
--- a/media/libaom/src/aom/src/aom_image.c
+++ b/media/libaom/src/aom/src/aom_image.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -38,6 +39,8 @@ static aom_image_t *img_alloc_helper(
   unsigned int h, w, s, xcs, ycs, bps, bit_depth;
   unsigned int stride_in_bytes;
 
+  if (img != NULL) memset(img, 0, sizeof(aom_image_t));
+
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
 
@@ -60,6 +63,7 @@ static aom_image_t *img_alloc_helper(
   switch (fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12: bps = 12; break;
     case AOM_IMG_FMT_I422: bps = 16; break;
@@ -77,6 +81,7 @@ static aom_image_t *img_alloc_helper(
   switch (fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12:
     case AOM_IMG_FMT_I422:
@@ -89,6 +94,7 @@ static aom_image_t *img_alloc_helper(
   switch (fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12:
     case AOM_IMG_FMT_YV1216:
@@ -111,8 +117,6 @@ static aom_image_t *img_alloc_helper(
     if (!img) goto fail;
 
     img->self_allocd = 1;
-  } else {
-    memset(img, 0, sizeof(aom_image_t));
   }
 
   img->img_data = img_data;
@@ -154,6 +158,13 @@ static aom_image_t *img_alloc_helper(
   img->stride[AOM_PLANE_Y] = stride_in_bytes;
   img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
 
+  if (fmt == AOM_IMG_FMT_NV12) {
+    // Each row is a row of U and a row of V interleaved, so the stride is twice
+    // as long.
+    img->stride[AOM_PLANE_U] *= 2;
+    img->stride[AOM_PLANE_V] = 0;
+  }
+
   /* Default viewport to entire image. (This aom_img_set_rect call always
    * succeeds.) */
   aom_img_set_rect(img, 0, 0, d_w, d_h, border);
@@ -200,9 +211,8 @@ aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
 
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h, unsigned int border) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
@@ -216,7 +226,7 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
     } else {
       const int bytes_per_sample =
           (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;
 
       img->planes[AOM_PLANE_Y] =
           data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
@@ -225,7 +235,11 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
       unsigned int uv_border_h = border >> img->y_chroma_shift;
       unsigned int uv_x = x >> img->x_chroma_shift;
       unsigned int uv_y = y >> img->y_chroma_shift;
-      if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
+      if (img->fmt == AOM_IMG_FMT_NV12) {
+        img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample * 2 +
+                                   uv_y * img->stride[AOM_PLANE_U];
+        img->planes[AOM_PLANE_V] = NULL;
+      } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
         img->planes[AOM_PLANE_U] =
             data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
         data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
@@ -350,26 +364,18 @@ int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
   }
   aom_metadata_t *metadata =
       aom_img_metadata_alloc(type, data, sz, insert_flag);
-  if (!metadata) goto fail;
-  if (!img->metadata->metadata_array) {
-    img->metadata->metadata_array =
-        (aom_metadata_t **)calloc(1, sizeof(metadata));
-    if (!img->metadata->metadata_array || img->metadata->sz != 0) {
-      aom_img_metadata_free(metadata);
-      goto fail;
-    }
-  } else {
-    img->metadata->metadata_array =
-        (aom_metadata_t **)realloc(img->metadata->metadata_array,
-                                   (img->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata) return -1;
+  aom_metadata_t **metadata_array =
+      (aom_metadata_t **)realloc(img->metadata->metadata_array,
+                                 (img->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata_array) {
+    aom_img_metadata_free(metadata);
+    return -1;
   }
+  img->metadata->metadata_array = metadata_array;
   img->metadata->metadata_array[img->metadata->sz] = metadata;
   img->metadata->sz++;
   return 0;
-fail:
-  aom_img_metadata_array_free(img->metadata);
-  img->metadata = NULL;
-  return -1;
 }
 
 void aom_img_remove_metadata(aom_image_t *img) {
diff --git a/media/libaom/src/aom_dsp/aom_convolve.c b/media/libaom/src/aom_dsp/aom_convolve.c
index 7879b88f64..254f6401c7 100644
--- a/media/libaom/src/aom_dsp/aom_convolve.c
+++ b/media/libaom/src/aom_dsp/aom_convolve.c
@@ -111,19 +111,52 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                 w, h);
 }
 
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int filter_x_stride, const int16_t *filter_y,
-                         int filter_y_stride, int w, int h) {
-  int r;
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  uint8_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
+}
+
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h);
+}
 
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w);
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, int w, int h) {
+  for (int r = h; r > 0; --r) {
+    memmove(dst, src, w);
     src += src_stride;
     dst += dst_stride;
   }
@@ -216,22 +249,11 @@ void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                        y_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
-                                uint8_t *dst8, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int filter_x_stride,
-                                const int16_t *filter_y, int filter_y_stride,
-                                int w, int h, int bd) {
-  int r;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w * sizeof(uint16_t));
+void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                int h) {
+  for (int y = 0; y < h; ++y) {
+    memmove(dst, src, w * sizeof(src[0]));
     src += src_stride;
     dst += dst_stride;
   }
diff --git a/media/libaom/src/aom_dsp/aom_dsp.cmake b/media/libaom/src/aom_dsp/aom_dsp.cmake
index f1b61f010b..0f65315929 100644
--- a/media/libaom/src/aom_dsp/aom_dsp.cmake
+++ b/media/libaom/src/aom_dsp/aom_dsp.cmake
@@ -31,9 +31,12 @@ list(APPEND AOM_DSP_COMMON_SOURCES
             "${AOM_ROOT}/aom_dsp/entcode.h"
             "${AOM_ROOT}/aom_dsp/fft.c"
             "${AOM_ROOT}/aom_dsp/fft_common.h"
+            "${AOM_ROOT}/aom_dsp/grain_params.h"
             "${AOM_ROOT}/aom_dsp/intrapred.c"
             "${AOM_ROOT}/aom_dsp/intrapred_common.h"
             "${AOM_ROOT}/aom_dsp/loopfilter.c"
+            "${AOM_ROOT}/aom_dsp/odintrin.c"
+            "${AOM_ROOT}/aom_dsp/odintrin.h"
             "${AOM_ROOT}/aom_dsp/prob.h"
             "${AOM_ROOT}/aom_dsp/recenter.h"
             "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
@@ -44,11 +47,9 @@ list(APPEND AOM_DSP_COMMON_SOURCES
             "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
             "${AOM_ROOT}/aom_dsp/subtract.c"
             "${AOM_ROOT}/aom_dsp/txfm_common.h"
-            "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h"
-            "${AOM_ROOT}/aom_dsp/avg.c")
+            "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
 
 list(APPEND AOM_DSP_COMMON_ASM_SSE2
-            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
@@ -58,14 +59,13 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2
             "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
             "${AOM_ROOT}/aom_dsp/x86/convolve.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
             "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
             "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
@@ -74,67 +74,53 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
             "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
             "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
             "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSE2
-                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
-                   "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
-endif()
-
 list(APPEND AOM_DSP_COMMON_ASM_SSSE3
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_ssse3.h"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSSE3
-                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
-endif()
-
 list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
             "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
-            "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_utils.h")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
             "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_AVX2
-                   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-                   "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
-endif()
-
-list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
 
 list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
             "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
@@ -151,19 +137,34 @@ list(APPEND AOM_DSP_COMMON_INTRIN_MSA
             "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
             "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
 
+if(CONFIG_AV1_HIGHBITDEPTH)
+  list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+
+  list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+
+  list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
+
+  list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+              "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
+endif()
+
 if(CONFIG_AV1_DECODER)
   list(APPEND AOM_DSP_DECODER_SOURCES
               "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
               "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
               "${AOM_ROOT}/aom_dsp/bitreader.c"
               "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c"
-              "${AOM_ROOT}/aom_dsp/entdec.h"
-              "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
-              "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
+              "${AOM_ROOT}/aom_dsp/entdec.h")
 endif()
 
 if(CONFIG_AV1_ENCODER)
   list(APPEND AOM_DSP_ENCODER_SOURCES
+              "${AOM_ROOT}/aom_dsp/avg.c"
               "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
               "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
               "${AOM_ROOT}/aom_dsp/bitwriter.c"
@@ -183,18 +184,15 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/quantize.c"
               "${AOM_ROOT}/aom_dsp/quantize.h"
               "${AOM_ROOT}/aom_dsp/sad.c"
-              "${AOM_ROOT}/aom_dsp/sse.c"
               "${AOM_ROOT}/aom_dsp/sad_av1.c"
+              "${AOM_ROOT}/aom_dsp/sse.c"
+              "${AOM_ROOT}/aom_dsp/ssim.c"
+              "${AOM_ROOT}/aom_dsp/ssim.h"
               "${AOM_ROOT}/aom_dsp/sum_squares.c"
               "${AOM_ROOT}/aom_dsp/variance.c"
               "${AOM_ROOT}/aom_dsp/variance.h")
 
-  list(APPEND AOM_DSP_ENCODER_ASM_SSE2
-              "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+  list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
               "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
               "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
               "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
@@ -203,32 +201,23 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
               "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
-  if(NOT CONFIG_AV1_HIGHBITDEPTH)
-    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
-                     "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
-                     "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
-                     "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
-                     "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
-  endif()
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
               "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
               "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
@@ -236,10 +225,9 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
@@ -247,8 +235,8 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
 
-  list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
-              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
+              "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
               "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
@@ -261,40 +249,72 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
               "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
 
-  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
-              "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
               "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
 
-  if(NOT CONFIG_AV1_HIGHBITDEPTH)
-    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
-                     "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
-  endif()
-
   list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/sse_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
 
+  if(CONFIG_AV1_HIGHBITDEPTH)
+    list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+                "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+                "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+                "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+                "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+                "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
+
+    list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+                "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
+                "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+  endif()
+
   if(CONFIG_INTERNAL_STATS)
     list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
-                "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
-                "${AOM_ROOT}/aom_dsp/ssim.h")
+                "${AOM_ROOT}/aom_dsp/psnrhvs.c")
   endif()
 
   if(CONFIG_TUNE_VMAF)
     list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c"
                 "${AOM_ROOT}/aom_dsp/vmaf.h")
   endif()
+
+  if(CONFIG_TUNE_BUTTERAUGLI)
+    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/butteraugli.c"
+                "${AOM_ROOT}/aom_dsp/butteraugli.h")
+  endif()
+
+  if(CONFIG_REALTIME_ONLY)
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c")
+
+    list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+                     "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+  endif()
 endif()
 
 # Creates aom_dsp build targets. Must not be called until after libaom target
@@ -330,6 +350,9 @@ function(setup_aom_dsp_targets)
     if(BUILD_SHARED_LIBS)
       target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
     endif()
+    if(CONFIG_TUNE_VMAF)
+      target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+    endif()
   endif()
 
   if(HAVE_SSE2)
@@ -372,9 +395,10 @@ function(setup_aom_dsp_targets)
     endif()
   endif()
 
-  if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+  if(HAVE_AVX)
     if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64")
+      add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_AVX")
     endif()
   endif()
 
diff --git a/media/libaom/src/aom_dsp/aom_dsp_common.h b/media/libaom/src/aom_dsp/aom_dsp_common.h
index 150d35dd15..efb634ac97 100644
--- a/media/libaom/src/aom_dsp/aom_dsp_common.h
+++ b/media/libaom/src/aom_dsp/aom_dsp_common.h
@@ -21,6 +21,8 @@
 extern "C" {
 #endif
 
+#define PI 3.141592653589793238462643383279502884
+
 #ifndef MAX_SB_SIZE
 #define MAX_SB_SIZE 128
 #endif  // ndef MAX_SB_SIZE
diff --git a/media/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl b/media/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
index b7d5a41ba1..96d8d6e420 100644..100755
--- a/media/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/media/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -41,20 +41,23 @@ if ($opts{arch} eq "x86_64") {
 
 @block_widths = (4, 8, 16, 32, 64, 128);
 
-@block_sizes = ();
+@encoder_block_sizes = ();
 foreach $w (@block_widths) {
   foreach $h (@block_widths) {
-    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
+    push @encoder_block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
   }
 }
-push @block_sizes, [4, 16];
-push @block_sizes, [16, 4];
-push @block_sizes, [8, 32];
-push @block_sizes, [32, 8];
-push @block_sizes, [16, 64];
-push @block_sizes, [64, 16];
-
-@tx_dims = (2, 4, 8, 16, 32, 64);
+
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  push @encoder_block_sizes, [4, 16];
+  push @encoder_block_sizes, [16, 4];
+  push @encoder_block_sizes, [8, 32];
+  push @encoder_block_sizes, [32, 8];
+  push @encoder_block_sizes, [16, 64];
+  push @encoder_block_sizes, [64, 16];
+}
+
+@tx_dims = (4, 8, 16, 32, 64);
 @tx_sizes = ();
 foreach $w (@tx_dims) {
   push @tx_sizes, [$w, $w];
@@ -84,183 +87,192 @@ foreach (@tx_sizes) {
 
 specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
 specialize qw/aom_dc_top_predictor_8x4 sse2/;
 specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
 specialize qw/aom_dc_top_predictor_16x8 sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
 specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
 specialize qw/aom_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
 specialize qw/aom_dc_left_predictor_16x8 sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
 specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
 specialize qw/aom_dc_128_predictor_8x4 sse2/;
 specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
 specialize qw/aom_dc_128_predictor_16x8 sse2/;
 specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
 specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
 specialize qw/aom_v_predictor_8x4 sse2/;
 specialize qw/aom_v_predictor_8x8 neon msa sse2/;
 specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
 specialize qw/aom_v_predictor_16x8 sse2/;
 specialize qw/aom_v_predictor_16x16 neon msa sse2/;
 specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
 specialize qw/aom_v_predictor_32x16 sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
 specialize qw/aom_v_predictor_32x64 sse2 avx2/;
 specialize qw/aom_v_predictor_64x64 sse2 avx2/;
 specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
+
 specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x4 sse2/;
 specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
 specialize qw/aom_h_predictor_16x8 sse2/;
 specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
 specialize qw/aom_h_predictor_32x16 sse2/;
 specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
 specialize qw/aom_h_predictor_32x64 sse2/;
 specialize qw/aom_h_predictor_64x64 sse2/;
 specialize qw/aom_h_predictor_64x32 sse2/;
+
+specialize qw/aom_paeth_predictor_4x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_4x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x8 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2 neon/;
+
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 neon ssse3/;
+
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
+
+specialize qw/aom_v_predictor_4x16 sse2/;
+specialize qw/aom_v_predictor_8x32 sse2/;
+specialize qw/aom_v_predictor_16x4 sse2/;
+specialize qw/aom_v_predictor_16x64 sse2/;
+specialize qw/aom_v_predictor_32x8 sse2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
+
+specialize qw/aom_h_predictor_4x16 sse2/;
+specialize qw/aom_h_predictor_8x32 sse2/;
+specialize qw/aom_h_predictor_16x4 sse2/;
+specialize qw/aom_h_predictor_16x64 sse2/;
+specialize qw/aom_h_predictor_32x8 sse2/;
 specialize qw/aom_h_predictor_64x16 sse2/;
-specialize qw/aom_paeth_predictor_4x4 ssse3/;
-specialize qw/aom_paeth_predictor_4x8 ssse3/;
-specialize qw/aom_paeth_predictor_4x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x4 ssse3/;
-specialize qw/aom_paeth_predictor_8x8 ssse3/;
-specialize qw/aom_paeth_predictor_8x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x32 ssse3/;
-specialize qw/aom_paeth_predictor_16x4 ssse3/;
-specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x8 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3/;
-specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+
+specialize qw/aom_paeth_predictor_4x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x32 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2 neon/;
+
+specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 neon ssse3/;
 
 # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
 # by multiply and shift.
@@ -284,16 +296,25 @@ specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
 specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
 specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-  specialize qw/aom_highbd_v_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_v_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_v_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_8x4 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_8x8 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_8x16 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_16x8 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_16x16 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_16x32 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_32x16 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_v_predictor_32x64 neon/;
+  specialize qw/aom_highbd_v_predictor_64x32 neon/;
+  specialize qw/aom_highbd_v_predictor_64x64 neon/;
+  specialize qw/aom_highbd_v_predictor_4x16 neon/;
+  specialize qw/aom_highbd_v_predictor_8x32 neon/;
+  specialize qw/aom_highbd_v_predictor_16x4 neon/;
+  specialize qw/aom_highbd_v_predictor_16x64 neon/;
+  specialize qw/aom_highbd_v_predictor_32x8 neon/;
+  specialize qw/aom_highbd_v_predictor_64x16 neon/;
 
   # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
   # by multiply and shift.
@@ -349,20 +370,104 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
+
+  specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
+  specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
+  specialize qw/aom_highbd_paeth_predictor_8x4 neon/;
+  specialize qw/aom_highbd_paeth_predictor_8x8 neon/;
+  specialize qw/aom_highbd_paeth_predictor_8x16 neon/;
+  specialize qw/aom_highbd_paeth_predictor_16x8 neon/;
+  specialize qw/aom_highbd_paeth_predictor_16x16 neon/;
+  specialize qw/aom_highbd_paeth_predictor_16x32 neon/;
+  specialize qw/aom_highbd_paeth_predictor_32x16 neon/;
+  specialize qw/aom_highbd_paeth_predictor_32x32 neon/;
+  specialize qw/aom_highbd_paeth_predictor_32x64 neon/;
+  specialize qw/aom_highbd_paeth_predictor_64x32 neon/;
+  specialize qw/aom_highbd_paeth_predictor_64x64 neon/;
+  specialize qw/aom_highbd_paeth_predictor_4x16 neon/;
+  specialize qw/aom_highbd_paeth_predictor_8x32 neon/;
+  specialize qw/aom_highbd_paeth_predictor_16x4 neon/;
+  specialize qw/aom_highbd_paeth_predictor_16x64 neon/;
+  specialize qw/aom_highbd_paeth_predictor_32x8 neon/;
+  specialize qw/aom_highbd_paeth_predictor_64x16 neon/;
+
+  specialize qw/aom_highbd_smooth_predictor_4x4 neon/;
+  specialize qw/aom_highbd_smooth_predictor_4x8 neon/;
+  specialize qw/aom_highbd_smooth_predictor_8x4 neon/;
+  specialize qw/aom_highbd_smooth_predictor_8x8 neon/;
+  specialize qw/aom_highbd_smooth_predictor_8x16 neon/;
+  specialize qw/aom_highbd_smooth_predictor_16x8 neon/;
+  specialize qw/aom_highbd_smooth_predictor_16x16 neon/;
+  specialize qw/aom_highbd_smooth_predictor_16x32 neon/;
+  specialize qw/aom_highbd_smooth_predictor_32x16 neon/;
+  specialize qw/aom_highbd_smooth_predictor_32x32 neon/;
+  specialize qw/aom_highbd_smooth_predictor_32x64 neon/;
+  specialize qw/aom_highbd_smooth_predictor_64x32 neon/;
+  specialize qw/aom_highbd_smooth_predictor_64x64 neon/;
+  specialize qw/aom_highbd_smooth_predictor_4x16 neon/;
+  specialize qw/aom_highbd_smooth_predictor_8x32 neon/;
+  specialize qw/aom_highbd_smooth_predictor_16x4 neon/;
+  specialize qw/aom_highbd_smooth_predictor_16x64 neon/;
+  specialize qw/aom_highbd_smooth_predictor_32x8 neon/;
+  specialize qw/aom_highbd_smooth_predictor_64x16 neon/;
+
+  specialize qw/aom_highbd_smooth_v_predictor_4x4 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_4x8 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_8x4 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_8x8 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_8x16 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_16x8 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_16x16 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_16x32 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_32x16 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_32x32 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_32x64 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_64x32 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_64x64 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_4x16 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_8x32 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_16x4 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_16x64 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_32x8 neon/;
+  specialize qw/aom_highbd_smooth_v_predictor_64x16 neon/;
+
+  specialize qw/aom_highbd_smooth_h_predictor_4x4 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_4x8 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_8x4 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_8x8 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_8x16 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_16x8 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_16x16 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_16x32 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_32x16 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_32x32 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_32x64 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_64x32 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_64x64 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_4x16 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_8x32 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_16x4 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_16x64 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_32x8 neon/;
+  specialize qw/aom_highbd_smooth_h_predictor_64x16 neon/;
 }
 #
 # Sub Pixel Filters
 #
-add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h";
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-specialize qw/aom_convolve_copy       sse2      /;
+specialize qw/aom_convolve_copy       neon dspr2 msa sse2 avx2/;
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
 
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/aom_scaled_2d ssse3 neon/;
+
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd";
+  add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
   specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
   add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
@@ -379,7 +484,10 @@ add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *b
 specialize qw/aom_lpf_vertical_14 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_14_dual sse2/;
+specialize qw/aom_lpf_vertical_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_14_quad sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_vertical_6 sse2 neon/;
@@ -388,89 +496,110 @@ add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
 specialize qw/aom_lpf_vertical_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_8_dual sse2/;
+specialize qw/aom_lpf_vertical_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_8_quad sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_vertical_4 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_4_dual sse2/;
+specialize qw/aom_lpf_vertical_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_4_quad sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_horizontal_14 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_14_dual sse2/;
+specialize qw/aom_lpf_horizontal_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_14_quad sse2 avx2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_horizontal_6 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_6_dual sse2/;
+specialize qw/aom_lpf_horizontal_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_6_quad sse2 avx2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_horizontal_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_8_dual sse2/;
+specialize qw/aom_lpf_horizontal_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_8_quad sse2 avx2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/aom_lpf_horizontal_4 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_4_dual sse2/;
+specialize qw/aom_lpf_horizontal_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_4_quad sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_6_dual sse2/;
+specialize qw/aom_lpf_vertical_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_6_quad sse2 neon/;
 
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+  specialize qw/aom_highbd_lpf_vertical_14 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_vertical_14_dual neon sse2 avx2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8 sse2/;
+  specialize qw/aom_highbd_lpf_vertical_8 neon sse2/;
+
+  add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/aom_highbd_lpf_vertical_8_dual neon sse2 avx2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_6 sse2/;
+  specialize qw/aom_highbd_lpf_vertical_6 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
-
-  add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_vertical_6_dual neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+  specialize qw/aom_highbd_lpf_vertical_4 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_vertical_4_dual neon sse2 avx2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
+  specialize qw/aom_highbd_lpf_horizontal_14 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd";
-  specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_horizontal_14_dual neon sse2 avx2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
+  specialize qw/aom_highbd_lpf_horizontal_6 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
+  specialize qw/aom_highbd_lpf_horizontal_6_dual neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+  specialize qw/aom_highbd_lpf_horizontal_8 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_horizontal_8_dual neon sse2 avx2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+  specialize qw/aom_highbd_lpf_horizontal_4 neon sse2/;
 
   add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+  specialize qw/aom_highbd_lpf_horizontal_4_dual neon sse2 avx2/;
 }
 
 #
@@ -529,19 +658,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+  specialize qw/aom_quantize_b sse2 neon avx/, "$ssse3_x86_64";
 
   add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_adaptive sse2 avx2/;
 
   add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+  specialize qw/aom_quantize_b_32x32 neon avx/, "$ssse3_x86_64";
 
   add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_32x32_adaptive sse2/;
 
   add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_64x64 ssse3/;
+  specialize qw/aom_quantize_b_64x64 neon ssse3/;
 
   add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_quantize_b_64x64_adaptive sse2/;
@@ -549,19 +678,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_highbd_quantize_b sse2 avx2/;
+  specialize qw/aom_highbd_quantize_b sse2 avx2 neon/;
 
   add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2/;
 
   add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+  specialize qw/aom_highbd_quantize_b_32x32 sse2 neon/;
 
   add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2/;
 
   add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_highbd_quantize_b_64x64 sse2/;
+  specialize qw/aom_highbd_quantize_b_64x64 sse2 neon/;
 
   add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2/;
@@ -604,8 +733,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_get_blk_sse_sum sse2 avx2/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-    specialize qw/aom_highbd_subtract_block sse2/;
+    add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+    specialize qw/aom_highbd_subtract_block sse2 neon/;
 
     add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
     specialize qw/aom_highbd_sse  sse4_1 avx2 neon/;
@@ -616,7 +745,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     # Sum of Squares
     #
     add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
-    specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
+    specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/;
 
     add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
     specialize qw/aom_sum_squares_i16 sse2/;
@@ -631,14 +760,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # Single block SAD / Single block Avg SAD
   #
-  foreach (@block_sizes) {
+  foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+    add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
     add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
     add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
 
-  specialize qw/aom_sad128x128    avx2          sse2/;
+  add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
+  specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
+  specialize qw/aom_sad128x128    avx2 neon     sse2/;
   specialize qw/aom_sad128x64     avx2          sse2/;
   specialize qw/aom_sad64x128     avx2          sse2/;
   specialize qw/aom_sad64x64      avx2 neon msa sse2/;
@@ -655,6 +787,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8                  msa sse2/;
   specialize qw/aom_sad4x4             neon msa sse2/;
 
+  specialize qw/aom_sad4x16                     sse2/;
+  specialize qw/aom_sad16x4                     sse2/;
+  specialize qw/aom_sad8x32                     sse2/;
+  specialize qw/aom_sad32x8                     sse2/;
+  specialize qw/aom_sad16x64                    sse2/;
+  specialize qw/aom_sad64x16                    sse2/;
+
+  specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x128     avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x64      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_64x32      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x64      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x32      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_32x16      avx2          sse2  neon/;
+  specialize qw/aom_sad_skip_16x32                    sse2  neon/;
+  specialize qw/aom_sad_skip_16x16                    sse2  neon/;
+  specialize qw/aom_sad_skip_16x8                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x16                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x8                      sse2  neon/;
+  specialize qw/aom_sad_skip_4x8                      sse2  neon/;
+
+  specialize qw/aom_sad_skip_4x16                     sse2  neon/;
+  specialize qw/aom_sad_skip_8x32                     sse2  neon/;
+  specialize qw/aom_sad_skip_32x8                     sse2  neon/;
+  specialize qw/aom_sad_skip_16x64                    sse2  neon/;
+  specialize qw/aom_sad_skip_64x16                    sse2  neon/;
+
   specialize qw/aom_sad128x128_avg avx2     sse2/;
   specialize qw/aom_sad128x64_avg  avx2     sse2/;
   specialize qw/aom_sad64x128_avg  avx2     sse2/;
@@ -672,19 +832,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8_avg          msa sse2/;
   specialize qw/aom_sad4x4_avg          msa sse2/;
 
-  specialize qw/aom_sad4x16      sse2/;
-  specialize qw/aom_sad16x4      sse2/;
-  specialize qw/aom_sad8x32      sse2/;
-  specialize qw/aom_sad32x8      sse2/;
-  specialize qw/aom_sad16x64     sse2/;
-  specialize qw/aom_sad64x16     sse2/;
-
-  specialize qw/aom_sad4x16_avg  sse2/;
-  specialize qw/aom_sad16x4_avg  sse2/;
-  specialize qw/aom_sad8x32_avg  sse2/;
-  specialize qw/aom_sad32x8_avg  sse2/;
-  specialize qw/aom_sad16x64_avg sse2/;
-  specialize qw/aom_sad64x16_avg sse2/;
+  specialize qw/aom_sad4x16_avg             sse2/;
+  specialize qw/aom_sad16x4_avg             sse2/;
+  specialize qw/aom_sad8x32_avg             sse2/;
+  specialize qw/aom_sad32x8_avg             sse2/;
+  specialize qw/aom_sad16x64_avg            sse2/;
+  specialize qw/aom_sad64x16_avg            sse2/;
 
   specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
   specialize qw/aom_dist_wtd_sad128x64_avg  ssse3/;
@@ -725,9 +878,10 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad128xh sse2/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach (@block_sizes) {
+    foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+      add_proto qw/unsigned int/, "aom_highbd_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
       if ($w != 128 && $h != 128 && $w != 4) {
         specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
@@ -746,9 +900,39 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize qw/aom_highbd_sad16x32   avx2 sse2/;
     specialize qw/aom_highbd_sad16x16   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4     sse2/;
-    specialize qw/aom_highbd_sad4x8     sse2/;
-    specialize qw/aom_highbd_sad4x4     sse2/;
+    specialize qw/aom_highbd_sad8x16         sse2/;
+    specialize qw/aom_highbd_sad8x8          sse2/;
+    specialize qw/aom_highbd_sad8x4          sse2/;
+    specialize qw/aom_highbd_sad4x8          sse2/;
+    specialize qw/aom_highbd_sad4x4          sse2/;
+
+    specialize qw/aom_highbd_sad4x16         sse2/;
+    specialize qw/aom_highbd_sad16x4    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x32         sse2/;
+    specialize qw/aom_highbd_sad32x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad16x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x16   avx2 sse2/;
+
+    specialize qw/aom_highbd_sad_skip_128x128 avx2/;
+    specialize qw/aom_highbd_sad_skip_128x64  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x128  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_8x16         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x8          sse2/;
+    specialize qw/aom_highbd_sad_skip_4x8          sse2/;
+
+    specialize qw/aom_highbd_sad_skip_4x16         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x32         sse2/;
+    specialize qw/aom_highbd_sad_skip_32x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x16   avx2 sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
@@ -765,13 +949,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize qw/aom_highbd_sad4x8_avg     sse2/;
     specialize qw/aom_highbd_sad4x4_avg     sse2/;
 
-    specialize qw/aom_highbd_sad4x16        sse2/;
-    specialize qw/aom_highbd_sad16x4        avx2 sse2/;
-    specialize qw/aom_highbd_sad8x32        sse2/;
-    specialize qw/aom_highbd_sad32x8        avx2 sse2/;
-    specialize qw/aom_highbd_sad16x64       avx2 sse2/;
-    specialize qw/aom_highbd_sad64x16       avx2 sse2/;
-
     specialize qw/aom_highbd_sad4x16_avg    sse2/;
     specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
     specialize qw/aom_highbd_sad8x32_avg    sse2/;
@@ -782,14 +959,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # Masked SAD
   #
-  foreach (@block_sizes) {
+  foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
     specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
   }
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach (@block_sizes) {
+    foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
       specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
@@ -799,20 +976,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # OBMC SAD
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
-    }
-  }
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach (@block_sizes) {
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+      add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+        specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+      }
+    }
+
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      foreach (@encoder_block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+        if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+        }
       }
     }
   }
@@ -820,11 +999,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  foreach (@block_sizes) {
+  foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, const uint8_t *second_pred, uint32_t *sad_array";
-    add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]";
+    add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+    add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]";
+    add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+    add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
   }
 
   specialize qw/aom_sad128x128x4d avx2          sse2/;
@@ -856,34 +1036,61 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad32x8x4d  sse2/;
   specialize qw/aom_sad64x16x4d sse2/;
 
-  specialize qw/aom_sad128x128x4d_avg sse2/;
-  specialize qw/aom_sad128x64x4d_avg  sse2/;
-  specialize qw/aom_sad64x128x4d_avg  sse2/;
-  specialize qw/aom_sad64x64x4d_avg   sse2/;
-  specialize qw/aom_sad64x32x4d_avg   sse2/;
-  specialize qw/aom_sad64x16x4d_avg   sse2/;
-  specialize qw/aom_sad32x64x4d_avg   sse2/;
-  specialize qw/aom_sad32x32x4d_avg   sse2/;
-  specialize qw/aom_sad32x16x4d_avg   sse2/;
-  specialize qw/aom_sad32x8x4d_avg    sse2/;
-  specialize qw/aom_sad16x64x4d_avg   sse2/;
-  specialize qw/aom_sad16x32x4d_avg   sse2/;
-  specialize qw/aom_sad16x16x4d_avg   sse2/;
-  specialize qw/aom_sad16x8x4d_avg    sse2/;
-
-  specialize qw/aom_sad8x16x4d_avg    sse2/;
-  specialize qw/aom_sad8x8x4d_avg     sse2/;
-  specialize qw/aom_sad8x4x4d_avg     sse2/;
-  specialize qw/aom_sad4x16x4d_avg    sse2/;
-  specialize qw/aom_sad4x8x4d_avg     sse2/;
-  specialize qw/aom_sad4x4x4d_avg     sse2/;
-
-  specialize qw/aom_sad4x32x4d_avg    sse2/;
-  specialize qw/aom_sad4x16x4d_avg    sse2/;
-  specialize qw/aom_sad16x4x4d_avg    sse2/;
-  specialize qw/aom_sad8x32x4d_avg    sse2/;
-  specialize qw/aom_sad32x8x4d_avg    sse2/;
-  specialize qw/aom_sad64x16x4d_avg   sse2/;
+  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
+
+  specialize qw/aom_sad_skip_16x64x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x32x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x16x4d        sse2 neon/;
+  specialize qw/aom_sad_skip_16x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_4x32x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_32x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_64x16x4d        sse2 neon/;
+
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    specialize qw/aom_sad128x128x4d_avg sse2/;
+    specialize qw/aom_sad128x64x4d_avg  sse2/;
+    specialize qw/aom_sad64x128x4d_avg  sse2/;
+    specialize qw/aom_sad64x64x4d_avg   sse2/;
+    specialize qw/aom_sad64x32x4d_avg   sse2/;
+    specialize qw/aom_sad64x16x4d_avg   sse2/;
+    specialize qw/aom_sad32x64x4d_avg   sse2/;
+    specialize qw/aom_sad32x32x4d_avg   sse2/;
+    specialize qw/aom_sad32x16x4d_avg   sse2/;
+    specialize qw/aom_sad32x8x4d_avg    sse2/;
+    specialize qw/aom_sad16x64x4d_avg   sse2/;
+    specialize qw/aom_sad16x32x4d_avg   sse2/;
+    specialize qw/aom_sad16x16x4d_avg   sse2/;
+    specialize qw/aom_sad16x8x4d_avg    sse2/;
+
+    specialize qw/aom_sad8x16x4d_avg    sse2/;
+    specialize qw/aom_sad8x8x4d_avg     sse2/;
+    specialize qw/aom_sad8x4x4d_avg     sse2/;
+    specialize qw/aom_sad4x16x4d_avg    sse2/;
+    specialize qw/aom_sad4x8x4d_avg     sse2/;
+    specialize qw/aom_sad4x4x4d_avg     sse2/;
+
+    specialize qw/aom_sad4x32x4d_avg    sse2/;
+    specialize qw/aom_sad4x16x4d_avg    sse2/;
+    specialize qw/aom_sad16x4x4d_avg    sse2/;
+    specialize qw/aom_sad8x32x4d_avg    sse2/;
+    specialize qw/aom_sad32x8x4d_avg    sse2/;
+    specialize qw/aom_sad64x16x4d_avg   sse2/;
+  }
 
   specialize qw/aom_masked_sad128x128x4d  ssse3/;
   specialize qw/aom_masked_sad128x64x4d   ssse3/;
@@ -917,9 +1124,10 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   # Multi-block SAD, comparing a reference to N independent blocks
   #
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach (@block_sizes) {
+    foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
       if ($w != 128 && $h != 128) {
         specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
       }
@@ -947,6 +1155,27 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize qw/aom_highbd_sad32x8x4d    avx2 sse2/;
     specialize qw/aom_highbd_sad16x64x4d   avx2 sse2/;
     specialize qw/aom_highbd_sad64x16x4d   avx2 sse2/;
+
+    specialize qw/aom_highbd_sad_skip_128x128x4d avx2/;
+    specialize qw/aom_highbd_sad_skip_128x64x4d  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x128x4d  avx2/;
+    specialize qw/aom_highbd_sad_skip_64x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_32x16x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x32x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x16x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x8x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_8x16x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x8x4d          sse2/;
+    specialize qw/aom_highbd_sad_skip_4x8x4d          sse2/;
+
+    specialize qw/aom_highbd_sad_skip_4x16x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_8x32x4d         sse2/;
+    specialize qw/aom_highbd_sad_skip_32x8x4d    avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_16x64x4d   avx2 sse2/;
+    specialize qw/aom_highbd_sad_skip_64x16x4d   avx2 sse2/;
   }
   #
   # Avg
@@ -957,28 +1186,36 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
   specialize qw/aom_avg_4x4 sse2 neon/;
 
+  add_proto qw/void aom_avg_8x8_quad/, "const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg";
+  specialize qw/aom_avg_8x8_quad avx2 sse2 neon/;
+
   add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/aom_minmax_8x8 sse2/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
     add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
+    specialize qw/aom_highbd_avg_4x4 neon/;
     add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   }
 
-  add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/aom_int_pro_row sse2/;
+  add_proto qw/void aom_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
+  specialize qw/aom_int_pro_row sse2 neon/;
 
   add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/aom_int_pro_col sse2/;
+  specialize qw/aom_int_pro_col sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  specialize qw/aom_vector_var neon/;
   # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
-  #specialize qw/aom_vector_var sse2/;
+  #specialize qw/aom_vector_var neon sse2/;
 
   #
   # hamadard transform and satd for implmenting temporal dependency model
   #
+  add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+  specialize qw/aom_hadamard_4x4 sse2/;
+
   add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/aom_hadamard_8x8 sse2 neon/;
 
@@ -992,8 +1229,13 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
 
   add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
+  specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
+
+  add_proto qw/void aom_hadamard_8x8_dual/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+  specialize qw/aom_hadamard_8x8_dual sse2 avx2 neon/;
 
+  add_proto qw/void aom_pixel_scale/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff, int log_scale, int h8, int w8";
+  specialize qw/aom_pixel_scale sse2/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
@@ -1006,25 +1248,25 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize qw/aom_highbd_hadamard_32x32 avx2/;
   }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd avx2/;
+  specialize qw/aom_satd neon sse2 avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
-  specialize qw/aom_satd_lp avx2 neon/;
+  specialize qw/aom_satd_lp sse2 avx2 neon/;
 
 
   #
   # Structured Similarity (SSIM)
   #
-  if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+  add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
 
+  if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
     add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+  }
 
-    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    }
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
   }
 }  # CONFIG_AV1_ENCODER
 
@@ -1034,12 +1276,13 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   # Specialty Variance
   #
   add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
   add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
   specialize qw/aom_get16x16var                neon msa/;
   specialize qw/aom_get8x8var             sse2 neon msa/;
 
+  add_proto qw/void aom_get_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/aom_get_sse_sum_8x8_quad        avx2 sse2 neon/;
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
@@ -1069,51 +1312,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   #
   #
-  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
-  specialize qw/aom_upsampled_pred sse2/;
-
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                   int ref_stride, int subpel_search";
-  specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
-  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
-
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-                                                       int subpel_search";
-  specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
-                                                   int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_upsampled_pred sse2/;
-
-    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                            const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                            int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-
-    add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                                const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                                int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                                int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
-  }
-
-  #
-  #
-  #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
   add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
@@ -1129,7 +1327,10 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-  foreach (@block_sizes) {
+  add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+  specialize qw/aom_mse_wxh_16bit  sse2 avx2/;
+
+  foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
@@ -1137,38 +1338,38 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
   }
   specialize qw/aom_variance128x128   sse2 avx2 neon    /;
-  specialize qw/aom_variance128x64    sse2 avx2         /;
-  specialize qw/aom_variance64x128    sse2 avx2         /;
+  specialize qw/aom_variance128x64    sse2 avx2 neon    /;
+  specialize qw/aom_variance64x128    sse2 avx2 neon    /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x16     sse2 avx2      msa/;
-  specialize qw/aom_variance16x32     sse2 avx2      msa/;
+  specialize qw/aom_variance32x16     sse2 avx2 neon msa/;
+  specialize qw/aom_variance16x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
   specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
   specialize qw/aom_variance8x16      sse2      neon msa/;
   specialize qw/aom_variance8x8       sse2      neon msa/;
-  specialize qw/aom_variance8x4       sse2           msa/;
-  specialize qw/aom_variance4x8       sse2           msa/;
-  specialize qw/aom_variance4x4       sse2           msa/;
+  specialize qw/aom_variance8x4       sse2      neon msa/;
+  specialize qw/aom_variance4x8       sse2      neon msa/;
+  specialize qw/aom_variance4x4       sse2      neon msa/;
 
-  specialize qw/aom_sub_pixel_variance128x128   avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x128   avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x16     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8      avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16                msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x16           neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x4                 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x4            neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance4x8            neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance4x4            neon msa sse2 ssse3/;
 
   specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
@@ -1187,25 +1388,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
 
-  specialize qw/aom_variance4x16 sse2/;
-  specialize qw/aom_variance16x4 sse2 avx2/;
-  specialize qw/aom_variance8x32 sse2/;
-  specialize qw/aom_variance32x8 sse2 avx2/;
-  specialize qw/aom_variance16x64 sse2 avx2/;
-  specialize qw/aom_variance64x16 sse2 avx2/;
-
-  specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    specialize qw/aom_variance4x16 sse2/;
+    specialize qw/aom_variance16x4 sse2 avx2/;
+    specialize qw/aom_variance8x32 sse2/;
+    specialize qw/aom_variance32x8 sse2 avx2/;
+    specialize qw/aom_variance16x64 sse2 avx2/;
+    specialize qw/aom_variance64x16 sse2 avx2/;
+
+    specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
+    specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
+    specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+  }
 
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
@@ -1221,13 +1431,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
 
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
-
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
   specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
@@ -1240,19 +1443,36 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
       add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-      foreach (@block_sizes) {
+      foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
         add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
         if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+          if ($bd == 10) {
+            specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse2 neon/;
+          } else {
+            specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+          }
         }
-        # TODO(david.barker): When ext-partition-types is enabled, we currently
-        # don't have vectorized 4x16 highbd variance functions
-        if ($w == 4 && $h == 4) {
-            specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+
+        if ($w == 4 || $h == 4) {
+          # TODO(rachelbarker): When ext-partition-types is enabled, we currently
+          # don't have vectorized 4x16 highbd variance functions
+          if ($w == 4 && $h == 4) {
+            if ($bd == 10) {
+              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/sse4_1 neon/;
+            } else {
+              specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+            }
+          } else {
+            if ($bd == 10) {
+              specialize "aom_highbd_${bd}_variance${w}x${h}", qw/neon/;
+            }
           }
+        }
+
+
         if ($w != 128 && $h != 128 && $w != 4) {
           specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
           specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
@@ -1269,7 +1489,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # Masked Variance / Masked Subpixel Variance
   #
-  foreach (@block_sizes) {
+  foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
     specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
@@ -1277,7 +1497,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     foreach $bd ("_8_", "_10_", "_12_") {
-      foreach (@block_sizes) {
+      foreach (@encoder_block_sizes) {
         ($w, $h) = @$_;
         add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
         specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
@@ -1288,21 +1508,23 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
   # OBMC Variance / OBMC Subpixel Variance
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
-    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
-  }
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    foreach (@encoder_block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+      add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+      specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
+      specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
+    }
 
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    foreach $bd ("_", "_10_", "_12_") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      foreach $bd ("_", "_10_", "_12_") {
+        foreach (@encoder_block_sizes) {
+          ($w, $h) = @$_;
+          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+        }
       }
     }
   }
@@ -1400,43 +1622,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
     add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x64 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x32 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x64 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x32 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x16 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x32 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x16 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x8 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x16 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x8 sse2 avx2/;
+    specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/;
 
     add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1522,6 +1744,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
     add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
     specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
+
+    add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+    specialize qw/aom_mse_wxh_16bit_highbd   sse2 avx2/;
   }
     #
     # Subpixel Variance
@@ -1573,43 +1798,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
       add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
+      specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
 
       add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
       specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
diff --git a/media/libaom/src/aom_dsp/arm/aom_convolve_copy_neon.c b/media/libaom/src/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 0000000000..583d832114
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  const uint8_t *src1;
+  uint8_t *dst1;
+  int y;
+
+  if (!(w & 0x0F)) {
+    for (y = 0; y < h; ++y) {
+      src1 = src;
+      dst1 = dst;
+      for (int x = 0; x < (w >> 4); ++x) {
+        vst1q_u8(dst1, vld1q_u8(src1));
+        src1 += 16;
+        dst1 += 16;
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x07)) {
+    for (y = 0; y < h; ++y) {
+      vst1_u8(dst, vld1_u8(src));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x03)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x01)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
diff --git a/media/libaom/src/aom_dsp/arm/avg_neon.c b/media/libaom/src/aom_dsp/arm/avg_neon.c
index af3769edf8..42133b80cf 100644
--- a/media/libaom/src/aom_dsp/arm/avg_neon.c
+++ b/media/libaom/src/aom_dsp/arm/avg_neon.c
@@ -12,9 +12,10 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
 
 unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
   const uint8x16_t b = load_unaligned_u8q(a, a_stride);
@@ -48,6 +49,16 @@ unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
   return vget_lane_u32(vrshr_n_u32(d, 6), 0);
 }
 
+void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  for (int k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
+    avg[k] = aom_avg_8x8_neon(s_tmp, p);
+  }
+}
+
 int aom_satd_lp_neon(const int16_t *coeff, int length) {
   const int16x4_t zero = vdup_n_s16(0);
   int32x4_t accum = vdupq_n_s32(0);
@@ -72,3 +83,142 @@ int aom_satd_lp_neon(const int16_t *coeff, int length) {
     return satd;
   }
 }
+
+void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  const uint8_t *idx = ref;
+  uint16x8_t vec0 = vdupq_n_u16(0);
+  uint16x8_t vec1 = vec0;
+  uint8x16_t tmp;
+
+  for (i = 0; i < height; ++i) {
+    tmp = vld1q_u8(idx);
+    idx += ref_stride;
+    vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+    vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+  }
+
+  if (128 == height) {
+    vec0 = vshrq_n_u16(vec0, 6);
+    vec1 = vshrq_n_u16(vec1, 6);
+  } else if (64 == height) {
+    vec0 = vshrq_n_u16(vec0, 5);
+    vec1 = vshrq_n_u16(vec1, 5);
+  } else if (32 == height) {
+    vec0 = vshrq_n_u16(vec0, 4);
+    vec1 = vshrq_n_u16(vec1, 4);
+  } else if (16 == height) {
+    vec0 = vshrq_n_u16(vec0, 3);
+    vec1 = vshrq_n_u16(vec1, 3);
+  }
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
+}
+
+int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
+  const uint8_t *idx;
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  for (idx = ref; idx < (ref + width); idx += 16) {
+    uint8x16_t vec = vld1q_u8(idx);
+    sum = vaddq_u16(sum, vpaddlq_u8(vec));
+  }
+
+#if defined(__aarch64__)
+  return (int16_t)vaddvq_u16(sum);
+#else
+  const uint32x4_t a = vpaddlq_u16(sum);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return (int16_t)vget_lane_u32(c, 0);
+#endif
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t accum = zero;
+  do {
+    const int32x4_t src0 = vld1q_s32(&coeff[0]);
+    const int32x4_t src8 = vld1q_s32(&coeff[4]);
+    const int32x4_t src16 = vld1q_s32(&coeff[8]);
+    const int32x4_t src24 = vld1q_s32(&coeff[12]);
+    accum = vabaq_s32(accum, src0, zero);
+    accum = vabaq_s32(accum, src8, zero);
+    accum = vabaq_s32(accum, src16, zero);
+    accum = vabaq_s32(accum, src24, zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+#ifdef __aarch64__
+  return vaddvq_s32(accum);
+#else
+  return horizontal_add_s32x4(accum);
+#endif  // __aarch64__
+}
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl) {
+  int32x4_t v_mean = vdupq_n_s32(0);
+  int32x4_t v_sse = v_mean;
+  int16x8_t v_ref, v_src;
+  int16x4_t v_low;
+
+  int i, width = 4 << bwl;
+  for (i = 0; i < width; i += 8) {
+    v_ref = vld1q_s16(&ref[i]);
+    v_src = vld1q_s16(&src[i]);
+    const int16x8_t diff = vsubq_s16(v_ref, v_src);
+    // diff: dynamic range [-510, 510], 10 bits.
+    v_mean = vpadalq_s16(v_mean, diff);
+    v_low = vget_low_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_low, v_low);
+#if defined(__aarch64__)
+    v_sse = vmlal_high_s16(v_sse, diff, diff);
+#else
+    const int16x4_t v_high = vget_high_s16(diff);
+    v_sse = vmlal_s16(v_sse, v_high, v_high);
+#endif
+  }
+#if defined(__aarch64__)
+  int mean = vaddvq_s32(v_mean);
+  int sse = (int)vaddvq_s32(v_sse);
+#else
+  int mean = horizontal_add_s32x4(v_mean);
+  int sse = horizontal_add_s32x4(v_sse);
+#endif
+  // (mean * mean): dynamic range 31 bits.
+  int var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+unsigned int aom_highbd_avg_4x4_neon(const uint8_t *s, int p) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(s);
+  const uint16x4_t r0 = vld1_u16(src);
+  src += p;
+  uint16x4_t r1, r2, r3;
+  r1 = vld1_u16(src);
+  src += p;
+  r2 = vld1_u16(src);
+  src += p;
+  r3 = vld1_u16(src);
+  const uint16x4_t s1 = vadd_u16(r0, r1);
+  const uint16x4_t s2 = vadd_u16(r2, r3);
+  const uint16x4_t s3 = vadd_u16(s1, s2);
+#if defined(__aarch64__)
+  return (vaddv_u16(s3) + 8) >> 4;
+#else
+  const uint16x4_t h1 = vpadd_u16(s3, s3);
+  const uint16x4_t h2 = vpadd_u16(h1, h1);
+  const uint16x4_t res = vrshr_n_u16(h2, 4);
+  return vget_lane_u16(res, 0);
+#endif
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c b/media/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
index e7f08a5fdb..8709e38b80 100644
--- a/media/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/media/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
@@ -15,8 +15,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
 static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
diff --git a/media/libaom/src/aom_dsp/arm/fwd_txfm_neon.c b/media/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
index ce93523478..7fccdabc4c 100644
--- a/media/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
+++ b/media/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
@@ -14,8 +14,8 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/txfm_common.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static void aom_fdct4x4_helper(const int16_t *input, int stride,
                                int16x4_t *input_0, int16x4_t *input_1,
diff --git a/media/libaom/src/aom_dsp/arm/hadamard_neon.c b/media/libaom/src/aom_dsp/arm/hadamard_neon.c
index 929792ab33..336f53f155 100644
--- a/media/libaom/src/aom_dsp/arm/hadamard_neon.c
+++ b/media/libaom/src/aom_dsp/arm/hadamard_neon.c
@@ -12,8 +12,8 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                  int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
@@ -104,6 +104,13 @@ void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
   vst1q_s16(coeff + 56, a7);
 }
 
+void aom_hadamard_8x8_dual_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  for (int i = 0; i < 2; i++) {
+    aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64));
+  }
+}
+
 void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                                 int16_t *coeff) {
   /* Rearrange 16x16 to 8x32 and remove stride.
diff --git a/media/libaom/src/aom_dsp/arm/highbd_intrapred_neon.c b/media/libaom/src/aom_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..7544777afe
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// DC
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+                                       const uint16_t *above,
+                                       const uint16_t *left) {
+  assert(bw >= 4);
+  assert(IS_POWER_OF_TWO(bw));
+  int expected_dc, sum = 0;
+  const int count = bw * 2;
+  uint32x4_t sum_q = vdupq_n_u32(0);
+  uint32x2_t sum_d;
+  uint16_t *dst_1;
+  if (bw >= 8) {
+    for (int i = 0; i < bw; i += 8) {
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
+      above += 8;
+      left += 8;
+    }
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      dst_1 = dst;
+      for (int i = 0; i < bw; i += 8) {
+        vst1q_u16(dst_1, dc);
+        dst_1 += 8;
+      }
+      dst += stride;
+    }
+  } else {  // 4x4
+    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      vst1_u16(dst, dc);
+      dst += stride;
+    }
+  }
+}
+
+#define INTRA_PRED_HIGHBD_SIZED_NEON(type, width)               \
+  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_##type##_predictor(dst, stride, width, above, left); \
+  }
+
+#define INTRA_PRED_SQUARE(type)          \
+  INTRA_PRED_HIGHBD_SIZED_NEON(type, 4)  \
+  INTRA_PRED_HIGHBD_SIZED_NEON(type, 8)  \
+  INTRA_PRED_HIGHBD_SIZED_NEON(type, 16) \
+  INTRA_PRED_HIGHBD_SIZED_NEON(type, 32) \
+  INTRA_PRED_HIGHBD_SIZED_NEON(type, 64)
+
+INTRA_PRED_SQUARE(dc)
+
+#undef INTRA_PRED_SQUARE
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+#define HIGHBD_V_NXM(W, H)                                    \
+  void aom_highbd_v_predictor_##W##x##H##_neon(               \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) {                         \
+    (void)left;                                               \
+    (void)bd;                                                 \
+    vertical##W##xh_neon(dst, stride, above, H);              \
+  }
+
+static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
+  uint16x8x2_t x;
+  // Clang/gcc uses ldp here.
+  x.val[0] = vld1q_u16(ptr);
+  x.val[1] = vld1q_u16(ptr + 8);
+  return x;
+}
+
+static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
+  vst1q_u16(ptr, x.val[0]);
+  vst1q_u16(ptr + 8, x.val[1]);
+}
+
+static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *const above, int height) {
+  const uint16x4_t row = vld1_u16(above);
+  int y = height;
+  do {
+    vst1_u16(dst, row);
+    vst1_u16(dst + stride, row);
+    dst += stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *const above, int height) {
+  const uint16x8_t row = vld1q_u16(above);
+  int y = height;
+  do {
+    vst1q_u16(dst, row);
+    vst1q_u16(dst + stride, row);
+    dst += stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  const uint16x8x2_t row = load_uint16x8x2(above);
+  int y = height;
+  do {
+    store_uint16x8x2(dst, row);
+    store_uint16x8x2(dst + stride, row);
+    dst += stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
+  uint16x8x4_t x;
+  // Clang/gcc uses ldp here.
+  x.val[0] = vld1q_u16(ptr);
+  x.val[1] = vld1q_u16(ptr + 8);
+  x.val[2] = vld1q_u16(ptr + 16);
+  x.val[3] = vld1q_u16(ptr + 24);
+  return x;
+}
+
+static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
+  vst1q_u16(ptr, x.val[0]);
+  vst1q_u16(ptr + 8, x.val[1]);
+  vst1q_u16(ptr + 16, x.val[2]);
+  vst1q_u16(ptr + 24, x.val[3]);
+}
+
+static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  const uint16x8x4_t row = load_uint16x8x4(above);
+  int y = height;
+  do {
+    store_uint16x8x4(dst, row);
+    store_uint16x8x4(dst + stride, row);
+    dst += stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  uint16_t *dst32 = dst + 32;
+  const uint16x8x4_t row = load_uint16x8x4(above);
+  const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
+  int y = height;
+  do {
+    store_uint16x8x4(dst, row);
+    store_uint16x8x4(dst32, row32);
+    store_uint16x8x4(dst + stride, row);
+    store_uint16x8x4(dst32 + stride, row32);
+    dst += stride << 1;
+    dst32 += stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+HIGHBD_V_NXM(4, 4)
+HIGHBD_V_NXM(4, 8)
+HIGHBD_V_NXM(4, 16)
+
+HIGHBD_V_NXM(8, 4)
+HIGHBD_V_NXM(8, 8)
+HIGHBD_V_NXM(8, 16)
+HIGHBD_V_NXM(8, 32)
+
+HIGHBD_V_NXM(16, 4)
+HIGHBD_V_NXM(16, 8)
+HIGHBD_V_NXM(16, 16)
+HIGHBD_V_NXM(16, 32)
+HIGHBD_V_NXM(16, 64)
+
+HIGHBD_V_NXM(32, 8)
+HIGHBD_V_NXM(32, 16)
+HIGHBD_V_NXM(32, 32)
+HIGHBD_V_NXM(32, 64)
+
+HIGHBD_V_NXM(64, 16)
+HIGHBD_V_NXM(64, 32)
+HIGHBD_V_NXM(64, 64)
+
+// -----------------------------------------------------------------------------
+// PAETH
+
+static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
+                                              const uint16_t *const top_row,
+                                              const uint16_t *const left_column,
+                                              int width, int height) {
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+  uint16x8_t top;
+  if (width == 4) {
+    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
+  } else {  // width == 8
+    top = vld1q_u16(top_row);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint16x8_t left = vdupq_n_u16(left_column[y]);
+
+    const uint16x8_t left_dist = vabdq_u16(top, top_left);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
+    uint16x8_t result = vbslq_u16(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbslq_u16(left_or_top_mask, result, top_left);
+
+    if (width == 4) {
+      vst1_u16(dest, vget_low_u16(result));
+    } else {  // width == 8
+      vst1q_u16(dest, result);
+    }
+    dest += stride;
+  }
+}
+
+#define HIGHBD_PAETH_NXM(W, H)                                  \
+  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+HIGHBD_PAETH_NXM(4, 4)
+HIGHBD_PAETH_NXM(4, 8)
+HIGHBD_PAETH_NXM(4, 16)
+HIGHBD_PAETH_NXM(8, 4)
+HIGHBD_PAETH_NXM(8, 8)
+HIGHBD_PAETH_NXM(8, 16)
+HIGHBD_PAETH_NXM(8, 32)
+
+// Select the closest values and collect them.
+static INLINE uint16x8_t select_paeth(const uint16x8_t top,
+                                      const uint16x8_t left,
+                                      const uint16x8_t top_left,
+                                      const uint16x8_t left_le_top,
+                                      const uint16x8_t left_le_top_left,
+                                      const uint16x8_t top_le_top_left) {
+  // if (left_dist <= top_dist && left_dist <= top_left_dist)
+  const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+  //   dest[x] = left_column[y];
+  // Fill all the unused spaces with 'top'. They will be overwritten when
+  // the positions for top_left are known.
+  const uint16x8_t result = vbslq_u16(left_mask, left, top);
+  // else if (top_dist <= top_left_dist)
+  //   dest[x] = top_row[x];
+  // Add these values to the mask. They were already set.
+  const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+  // else
+  //   dest[x] = top_left;
+  return vbslq_u16(left_or_top_mask, result, top_left);
+}
+
+#define PAETH_PREDICTOR(num)                                                  \
+  do {                                                                        \
+    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
+    const uint16x8_t top_left_dist =                                          \
+        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
+    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
+    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
+    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
+    const uint16x8_t result =                                                 \
+        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
+                     top_le_top_left);                                        \
+    vst1q_u16(dest + (num * 8), result);                                      \
+  } while (0)
+
+#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
+
+static INLINE void highbd_paeth16_plus_x_h_neon(
+    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
+    const uint16_t *const left_column, int width, int height) {
+  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+  uint16x8_t top[8];
+  top[0] = LOAD_TOP_ROW(0);
+  top[1] = LOAD_TOP_ROW(1);
+  if (width > 16) {
+    top[2] = LOAD_TOP_ROW(2);
+    top[3] = LOAD_TOP_ROW(3);
+    if (width == 64) {
+      top[4] = LOAD_TOP_ROW(4);
+      top[5] = LOAD_TOP_ROW(5);
+      top[6] = LOAD_TOP_ROW(6);
+      top[7] = LOAD_TOP_ROW(7);
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint16x8_t left = vdupq_n_u16(left_column[y]);
+    const uint16x8_t top_dist = vabdq_u16(left, top_left);
+    PAETH_PREDICTOR(0);
+    PAETH_PREDICTOR(1);
+    if (width > 16) {
+      PAETH_PREDICTOR(2);
+      PAETH_PREDICTOR(3);
+      if (width == 64) {
+        PAETH_PREDICTOR(4);
+        PAETH_PREDICTOR(5);
+        PAETH_PREDICTOR(6);
+        PAETH_PREDICTOR(7);
+      }
+    }
+    dest += stride;
+  }
+}
+
+#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
+  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
+      const uint16_t *left, int bd) {                             \
+    (void)bd;                                                     \
+    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+HIGHBD_PAETH_NXM_WIDE(16, 4)
+HIGHBD_PAETH_NXM_WIDE(16, 8)
+HIGHBD_PAETH_NXM_WIDE(16, 16)
+HIGHBD_PAETH_NXM_WIDE(16, 32)
+HIGHBD_PAETH_NXM_WIDE(16, 64)
+HIGHBD_PAETH_NXM_WIDE(32, 8)
+HIGHBD_PAETH_NXM_WIDE(32, 16)
+HIGHBD_PAETH_NXM_WIDE(32, 32)
+HIGHBD_PAETH_NXM_WIDE(32, 64)
+HIGHBD_PAETH_NXM_WIDE(64, 16)
+HIGHBD_PAETH_NXM_WIDE(64, 32)
+HIGHBD_PAETH_NXM_WIDE(64, 64)
+
+// -----------------------------------------------------------------------------
+// SMOOTH
+
+// 256 - v = vneg_s8(v)
+static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
+  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
+static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *const top_row,
+                                          const uint16_t *const left_column,
+                                          const int height) {
+  const uint16_t top_right = top_row[3];
+  const uint16_t bottom_left = left_column[height - 1];
+  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+  const uint16x4_t top_v = vld1_u16(top_row);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
+  const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // Each variable in the running summation is named for the last item to be
+    // accumulated.
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
+    const uint32x4_t weighted_bl =
+        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+    const uint16x4_t pred =
+        vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+    vst1_u16(dst, pred);
+    dst += stride;
+  }
+}
+
+// Common code between 8xH and [16|32|64]xH.
+static INLINE void highbd_calculate_pred8(
+    uint16_t *dst, const uint32x4_t weighted_corners_low,
+    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
+    const uint16x4x2_t weights_x, const uint16_t left_y,
+    const uint16_t weight_y) {
+  // Each variable in the running summation is named for the last item to be
+  // accumulated.
+  const uint32x4_t weighted_top_low =
+      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+  const uint32x4_t weighted_edges_low =
+      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+  const uint16x4_t pred_low =
+      vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+  vst1_u16(dst, pred_low);
+
+  const uint32x4_t weighted_top_high =
+      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+  const uint32x4_t weighted_edges_high =
+      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+  const uint16x4_t pred_high =
+      vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+  vst1_u16(dst + 4, pred_high);
+}
+
+static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *const top_row,
+                                   const uint16_t *const left_column,
+                                   const int height) {
+  const uint16_t top_right = top_row[7];
+  const uint16_t bottom_left = left_column[height - 1];
+  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+  const uint16x4x2_t top_vals = { { vld1_u16(top_row),
+                                    vld1_u16(top_row + 4) } };
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+                                     vld1_u16(smooth_weights_u16 + 8) } };
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_corners_low =
+        vaddq_u32(weighted_bl, weighted_tr_low);
+    const uint32x4_t weighted_corners_high =
+        vaddq_u32(weighted_bl, weighted_tr_high);
+    highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
+                           top_vals, weights_x, left_column[y], weights_y[y]);
+    dst += stride;
+  }
+}
+
+#define HIGHBD_SMOOTH_NXM(W, H)                                 \
+  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
+  }
+
+HIGHBD_SMOOTH_NXM(4, 4)
+HIGHBD_SMOOTH_NXM(4, 8)
+HIGHBD_SMOOTH_NXM(8, 4)
+HIGHBD_SMOOTH_NXM(8, 8)
+HIGHBD_SMOOTH_NXM(4, 16)
+HIGHBD_SMOOTH_NXM(8, 16)
+HIGHBD_SMOOTH_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_PREDICTOR(W)                                             \
+  static void highbd_smooth_##W##xh_neon(                                      \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,          \
+      const uint16_t *const left_column, const int height) {                   \
+    const uint16_t top_right = top_row[(W)-1];                                 \
+    const uint16_t bottom_left = left_column[height - 1];                      \
+    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;         \
+                                                                               \
+    /* Precompute weighted values that don't vary with |y|. */                 \
+    uint32x4_t weighted_tr_low[(W) >> 3];                                      \
+    uint32x4_t weighted_tr_high[(W) >> 3];                                     \
+    for (int i = 0; i<(W)>> 3; ++i) {                                          \
+      const int x = i << 3;                                                    \
+      const uint16x4_t weights_x_low =                                         \
+          vld1_u16(smooth_weights_u16 + (W)-4 + x);                            \
+      weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);   \
+      const uint16x4_t weights_x_high =                                        \
+          vld1_u16(smooth_weights_u16 + (W) + x);                              \
+      weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
+    }                                                                          \
+                                                                               \
+    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                  \
+    for (int y = 0; y < height; ++y) {                                         \
+      const uint32x4_t weighted_bl =                                           \
+          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                      \
+      uint16_t *dst_x = dst;                                                   \
+      for (int i = 0; i<(W)>> 3; ++i) {                                        \
+        const int x = i << 3;                                                  \
+        const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),               \
+                                          vld1_u16(top_row + x + 4) } };       \
+        const uint32x4_t weighted_corners_low =                                \
+            vaddq_u32(weighted_bl, weighted_tr_low[i]);                        \
+        const uint32x4_t weighted_corners_high =                               \
+            vaddq_u32(weighted_bl, weighted_tr_high[i]);                       \
+        /* Accumulate weighted edge values and store. */                       \
+        const uint16x4x2_t weights_x = {                                       \
+          { vld1_u16(smooth_weights_u16 + (W)-4 + x),                          \
+            vld1_u16(smooth_weights_u16 + (W) + x) }                           \
+        };                                                                     \
+        highbd_calculate_pred8(dst_x, weighted_corners_low,                    \
+                               weighted_corners_high, top_vals, weights_x,     \
+                               left_column[y], weights_y[y]);                  \
+        dst_x += 8;                                                            \
+      }                                                                        \
+      dst += stride;                                                           \
+    }                                                                          \
+  }
+
+HIGHBD_SMOOTH_PREDICTOR(16)
+HIGHBD_SMOOTH_PREDICTOR(32)
+HIGHBD_SMOOTH_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_PREDICTOR
+
+#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
+  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) {                           \
+    (void)bd;                                                   \
+    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
+  }
+
+HIGHBD_SMOOTH_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_NXM_WIDE
+
+static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const top_row,
+                                     const uint16_t *const left_column,
+                                     const int height) {
+  const uint16_t bottom_left = left_column[height - 1];
+  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+  const uint16x4_t top_v = vld1_u16(top_row);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+    vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
+
+    dst += stride;
+  }
+}
+
+static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
+                                     const uint16_t *const top_row,
+                                     const uint16_t *const left_column,
+                                     const int height) {
+  const uint16_t bottom_left = left_column[height - 1];
+  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+  const uint16x4_t top_low = vld1_u16(top_row);
+  const uint16x4_t top_high = vld1_u16(top_row + 4);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    const uint32x4_t weighted_top_low =
+        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+    vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+    const uint32x4_t weighted_top_high =
+        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+    vst1_u16(dst + 4,
+             vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
+    dst += stride;
+  }
+}
+
+#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
+  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
+      const uint16_t *left, int bd) {                            \
+    (void)bd;                                                    \
+    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+HIGHBD_SMOOTH_V_NXM(4, 4)
+HIGHBD_SMOOTH_V_NXM(4, 8)
+HIGHBD_SMOOTH_V_NXM(4, 16)
+HIGHBD_SMOOTH_V_NXM(8, 4)
+HIGHBD_SMOOTH_V_NXM(8, 8)
+HIGHBD_SMOOTH_V_NXM(8, 16)
+HIGHBD_SMOOTH_V_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_V_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                         \
+  static void highbd_smooth_v_##W##xh_neon(                                  \
+      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,  \
+      const uint16_t *const left_column, const int height) {                 \
+    const uint16_t bottom_left = left_column[height - 1];                    \
+    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;       \
+                                                                             \
+    uint16x4x2_t top_vals[(W) >> 3];                                         \
+    for (int i = 0; i<(W)>> 3; ++i) {                                        \
+      const int x = i << 3;                                                  \
+      top_vals[i].val[0] = vld1_u16(top_row + x);                            \
+      top_vals[i].val[1] = vld1_u16(top_row + x + 4);                        \
+    }                                                                        \
+                                                                             \
+    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                \
+    for (int y = 0; y < height; ++y) {                                       \
+      const uint32x4_t weighted_bl =                                         \
+          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                    \
+                                                                             \
+      uint16_t *dst_x = dst;                                                 \
+      for (int i = 0; i<(W)>> 3; ++i) {                                      \
+        const uint32x4_t weighted_top_low =                                  \
+            vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);      \
+        vst1_u16(dst_x,                                                      \
+                 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
+                                                                             \
+        const uint32x4_t weighted_top_high =                                 \
+            vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);      \
+        vst1_u16(dst_x + 4,                                                  \
+                 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+        dst_x += 8;                                                          \
+      }                                                                      \
+      dst += stride;                                                         \
+    }                                                                        \
+  }
+
+HIGHBD_SMOOTH_V_PREDICTOR(16)
+HIGHBD_SMOOTH_V_PREDICTOR(32)
+HIGHBD_SMOOTH_V_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_V_PREDICTOR
+
+#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
+  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
+      const uint16_t *left, int bd) {                            \
+    (void)bd;                                                    \
+    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_V_NXM_WIDE
+
+static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *const top_row,
+                                            const uint16_t *const left_column,
+                                            const int height) {
+  const uint16_t top_right = top_row[3];
+
+  const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
+  const uint16x4_t scaled_weights_x = negate_s8(weights_x);
+
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+  for (int y = 0; y < height; ++y) {
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
+    vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *const top_row,
+                                            const uint16_t *const left_column,
+                                            const int height) {
+  const uint16_t top_right = top_row[7];
+
+  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+                                     vld1_u16(smooth_weights_u16 + 8) } };
+
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    const uint16_t left_y = left_column[y];
+    const uint32x4_t weighted_left_low =
+        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+    vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+    const uint32x4_t weighted_left_high =
+        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+    vst1_u16(dst + 4,
+             vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
+    dst += stride;
+  }
+}
+
+#define HIGHBD_SMOOTH_H_NXM(W, H)                                \
+  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
+      const uint16_t *left, int bd) {                            \
+    (void)bd;                                                    \
+    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+HIGHBD_SMOOTH_H_NXM(4, 4)
+HIGHBD_SMOOTH_H_NXM(4, 8)
+HIGHBD_SMOOTH_H_NXM(4, 16)
+HIGHBD_SMOOTH_H_NXM(8, 4)
+HIGHBD_SMOOTH_H_NXM(8, 8)
+HIGHBD_SMOOTH_H_NXM(8, 16)
+HIGHBD_SMOOTH_H_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_H_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
+  void highbd_smooth_h_##W##xh_neon(                                          \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
+      const uint16_t *const left_column, const int height) {                  \
+    const uint16_t top_right = top_row[(W)-1];                                \
+                                                                              \
+    uint16x4_t weights_x_low[(W) >> 3];                                       \
+    uint16x4_t weights_x_high[(W) >> 3];                                      \
+    uint32x4_t weighted_tr_low[(W) >> 3];                                     \
+    uint32x4_t weighted_tr_high[(W) >> 3];                                    \
+    for (int i = 0; i<(W)>> 3; ++i) {                                         \
+      const int x = i << 3;                                                   \
+      weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \
+      weighted_tr_low[i] =                                                    \
+          vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
+      weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
+      weighted_tr_high[i] =                                                   \
+          vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
+    }                                                                         \
+                                                                              \
+    for (int y = 0; y < height; ++y) {                                        \
+      uint16_t *dst_x = dst;                                                  \
+      const uint16_t left_y = left_column[y];                                 \
+      for (int i = 0; i<(W)>> 3; ++i) {                                       \
+        const uint32x4_t weighted_left_low =                                  \
+            vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
+        vst1_u16(dst_x,                                                       \
+                 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
+                                                                              \
+        const uint32x4_t weighted_left_high =                                 \
+            vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
+        vst1_u16(dst_x + 4,                                                   \
+                 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+        dst_x += 8;                                                           \
+      }                                                                       \
+      dst += stride;                                                          \
+    }                                                                         \
+  }
+
+HIGHBD_SMOOTH_H_PREDICTOR(16)
+HIGHBD_SMOOTH_H_PREDICTOR(32)
+HIGHBD_SMOOTH_H_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_H_PREDICTOR
+
+#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
+  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
+      const uint16_t *left, int bd) {                            \
+    (void)bd;                                                    \
+    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_H_NXM_WIDE
diff --git a/media/libaom/src/aom_dsp/arm/highbd_loopfilter_neon.c b/media/libaom/src/aom_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..0b720ce9c7
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
+                                  const int16x4_t high) {
+  return vmin_s16(vmax_s16(val, low), high);
+}
+
+static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val,
+                                                       int bitdepth) {
+  const int16x8_t low = vdupq_n_s16(0);
+  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1,
+                             const uint16_t thresh) {
+  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+static INLINE uint16x4_t outer_threshold(const uint16x4_t p1,
+                                         const uint16x4_t p0,
+                                         const uint16x4_t q0,
+                                         const uint16x4_t q1,
+                                         const uint16_t outer_thresh) {
+  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+  return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   outer_threshold()
+static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   outer_threshold()
+static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16x8_t abd_p1p2_q1q2,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
+//   outer_threshold()
+static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16x8_t abd_p1p2_q1q2,
+                                       const uint16x8_t abd_p2p3_q2q3,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN_masks functions.
+
+static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                                 const uint16_t hev_thresh,
+                                 const uint16x4_t outer_mask,
+                                 const uint16_t inner_thresh,
+                                 uint16x4_t *const hev_mask,
+                                 uint16x4_t *const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where needs_filter4() is not true and so filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = needs_filter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // filter2() will only be applied if both needs_filter4() and hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1,
+                                  const uint16x8_t abd_p0p2_q0q2,
+                                  const int bitdepth) {
+  const int flat_thresh = 1 << (bitdepth - 8);
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+static INLINE void filter6_masks(
+    const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0,
+    const uint16_t hev_thresh, const uint16x4_t outer_mask,
+    const uint16_t inner_thresh, const int bitdepth,
+    uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask,
+    uint16x4_t *const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = is_flat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), bitdepth);
+  *needs_filter6_mask = needs_filter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                      inner_thresh, outer_mask);
+}
+
+// is_flat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0,
+                                  const uint16x8_t abd_pn1p0_qn1q0,
+                                  const uint16x8_t abd_pn2p0_qn2q0,
+                                  const int bitdepth) {
+  const int flat_thresh = 1 << (bitdepth - 8);
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+static INLINE void filter8_masks(
+    const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+    const uint16x8_t p0q0, const uint16_t hev_thresh,
+    const uint16x4_t outer_mask, const uint16_t inner_thresh,
+    const int bitdepth, uint16x4_t *const needs_filter8_mask,
+    uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t v_is_flat4 = is_flat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2),
+                                         vabdq_u16(p0q0, p3q3), bitdepth);
+  *needs_filter8_mask =
+      needs_filter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                    inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use filter8. Because filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(v_is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN functions.
+
+// Calculate filter4() or filter2() based on |hev_mask|.
+static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                           const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                           int bitdepth, uint16x8_t *const p1q1_result,
+                           uint16x8_t *const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what's going on here because there are some unnecessary
+  // tricks to accommodate 8x8 as smallest 8bpp vector
+
+  // We can not shift with rounding because the clamp comes *before* the
+  // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 =
+  // Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // Need to shift the second term or we end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth);
+  *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth);
+}
+
+void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch);
+  uint16_t *const dst_p0 = (uint16_t *)(s - pitch);
+  uint16_t *const dst_q0 = (uint16_t *)(s);
+  uint16_t *const dst_q1 = (uint16_t *)(s + pitch);
+
+  const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0),
+                              vld1_u16(dst_q0), vld1_u16(dst_q1) };
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+                &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+  // Already integrated the hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  // Offset by 2 uint16_t values to load from first p1 position.
+  uint16_t *dst = s - 2;
+  uint16_t *dst_p1 = dst;
+  uint16_t *dst_p0 = dst + pitch;
+  uint16_t *dst_q0 = dst + pitch * 2;
+  uint16_t *dst_q1 = dst + pitch * 3;
+
+  uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+                        vld1_u16(dst_q1) };
+  transpose_u16_4x4(src);
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+                &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+  // Already integrated the hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  uint16x4_t output[4] = {
+    vget_low_u16(p1q1_output),
+    vget_low_u16(p0q0_output),
+    vget_high_u16(p0q0_output),
+    vget_high_u16(p1q1_output),
+  };
+  transpose_u16_4x4(output);
+
+  vst1_u16(dst_p1, output[0]);
+  vst1_u16(dst_p0, output[1]);
+  vst1_u16(dst_q0, output[2]);
+  vst1_u16(dst_q1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_4_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                 bd);
+}
+
+static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                           const uint16x8_t p0q0, uint16x8_t *const p1q1_output,
+                           uint16x8_t *const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // The formula is regrouped to allow 3 doubling operations to be combined.
+  //
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                    ^^^^^^^^^^^
+  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                                ^^^^^^
+  sum = vaddq_u16(sum, p0q0);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //               ^^^^^
+  sum = vshlq_n_u16(sum, 1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //        ^^^^^^                          ^^^^^^
+  // Should dual issue with the left shift.
+  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+  sum = vaddq_u16(sum, outer_sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //                ^^^^^^^^
+  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //        ^^^^^^^^
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  uint16_t *const dst_p2 = s - 3 * pitch;
+  uint16_t *const dst_p1 = s - 2 * pitch;
+  uint16_t *const dst_p0 = s - pitch;
+  uint16_t *const dst_q0 = s;
+  uint16_t *const dst_q1 = s + pitch;
+  uint16_t *const dst_q2 = s + 2 * pitch;
+
+  const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
+                              vld1_u16(dst_p0), vld1_u16(dst_q0),
+                              vld1_u16(dst_q1), vld1_u16(dst_q2) };
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+                &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+  // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // filter6() does not apply, but filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  // Left side of the filter window.
+  uint16_t *const dst = s - 3;
+  uint16_t *const dst_0 = dst;
+  uint16_t *const dst_1 = dst + pitch;
+  uint16_t *const dst_2 = dst + 2 * pitch;
+  uint16_t *const dst_3 = dst + 3 * pitch;
+
+  // Overread by 2 values. These overreads become the high halves of src_raw[2]
+  // and src_raw[3] after transpose.
+  uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
+                            vld1q_u16(dst_2), vld1q_u16(dst_3) };
+  transpose_u16_4x8q(src_raw);
+  // p2, p1, p0, q0, q1, q2
+  const uint16x4_t src[6] = {
+    vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+    vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+    vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+  };
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+                &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+  // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // filter6() does not apply, but filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x4_t output[4] = {
+    vget_low_u16(p1q1_output),
+    vget_low_u16(p0q0_output),
+    vget_high_u16(p0q0_output),
+    vget_high_u16(p1q1_output),
+  };
+  transpose_u16_4x4(output);
+
+  // dst_n starts at p2, so adjust to p1.
+  vst1_u16(dst_0 + 1, output[0]);
+  vst1_u16(dst_1 + 1, output[1]);
+  vst1_u16(dst_2 + 1, output[2]);
+  vst1_u16(dst_3 + 1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_6_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                 bd);
+}
+
+static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                           const uint16x8_t p1q1, const uint16x8_t p0q0,
+                           uint16x8_t *const p2q2_output,
+                           uint16x8_t *const p1q1_output,
+                           uint16x8_t *const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                 ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddq_u16(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                               ^^^^^^
+  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p2q2_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q0 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  uint16_t *const dst_p3 = s - 4 * pitch;
+  uint16_t *const dst_p2 = s - 3 * pitch;
+  uint16_t *const dst_p1 = s - 2 * pitch;
+  uint16_t *const dst_p0 = s - pitch;
+  uint16_t *const dst_q0 = s;
+  uint16_t *const dst_q1 = s + pitch;
+  uint16_t *const dst_q2 = s + 2 * pitch;
+  uint16_t *const dst_q3 = s + 3 * pitch;
+
+  const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2),
+                              vld1_u16(dst_p1), vld1_u16(dst_p0),
+                              vld1_u16(dst_q0), vld1_u16(dst_q1),
+                              vld1_u16(dst_q2), vld1_u16(dst_q3) };
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[2], src[3], src[4], src[5], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+  filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+                bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+  // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // filter8() does not apply, but filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+void aom_highbd_lpf_horizontal_8_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) {
+  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  uint16_t *const dst = s - 4;
+  uint16_t *const dst_0 = dst;
+  uint16_t *const dst_1 = dst + pitch;
+  uint16_t *const dst_2 = dst + 2 * pitch;
+  uint16_t *const dst_3 = dst + 3 * pitch;
+
+  // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get desired pairs after transpose, one half should be reversed.
+  uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                        vld1q_u16(dst_3) };
+
+  // src[0] = p0q0
+  // src[1] = p1q1
+  // src[2] = p2q2
+  // src[3] = p3q3
+  loop_filter_transpose_u16_4x8q(src);
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask = outer_threshold(
+      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+      vget_high_u16(src[1]), outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = src[0];
+  const uint16x8_t p1q1 = src[1];
+  const uint16x8_t p2q2 = src[2];
+  const uint16x8_t p3q3 = src[3];
+  filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+                bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+  // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+  // output is not used.
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // filter8() does not apply, but filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
+  // After transpose, |output| will contain rows of the form:
+  // p0 p1 p2 p3 q0 q1 q2 q3
+  transpose_u16_4x8q(output);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, reverse_low_half(output[0]));
+  vst1q_u16(dst_1, reverse_low_half(output[1]));
+  vst1q_u16(dst_2, reverse_low_half(output[2]));
+  vst1q_u16(dst_3, reverse_low_half(output[3]));
+}
+
+void aom_highbd_lpf_vertical_8_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                 bd);
+}
+
+static INLINE void filter14(
+    const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4,
+    const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+    const uint16x8_t p0q0, uint16x8_t *const p5q5_output,
+    uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output,
+    uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output,
+    uint16x8_t *const p0q0_output) {
+  // Sum p5 and q5 output from opposite directions.
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^^^^^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^^^^^^^^^^^^
+  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+  sum = vaddq_u16(sum, p6q6_x7);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint16x8_t q0p0 = transpose64_u16q(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p5q5_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p4 and q4 output:
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+  const uint16x8_t q1p1 = transpose64_u16q(p1q1);
+  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+  const uint16x8_t q2p2 = transpose64_u16q(p2q2);
+  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+  const uint16x8_t q3p3 = transpose64_u16q(p3q3);
+  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+  const uint16x8_t q4p4 = transpose64_u16q(p4q4);
+  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+  const uint16x8_t q5p5 = transpose64_u16q(p5q5);
+  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
+  uint16_t *const dst_p6 = s - 7 * pitch;
+  uint16_t *const dst_p5 = s - 6 * pitch;
+  uint16_t *const dst_p4 = s - 5 * pitch;
+  uint16_t *const dst_p3 = s - 4 * pitch;
+  uint16_t *const dst_p2 = s - 3 * pitch;
+  uint16_t *const dst_p1 = s - 2 * pitch;
+  uint16_t *const dst_p0 = s - pitch;
+  uint16_t *const dst_q0 = s;
+  uint16_t *const dst_q1 = s + pitch;
+  uint16_t *const dst_q2 = s + 2 * pitch;
+  uint16_t *const dst_q3 = s + 3 * pitch;
+  uint16_t *const dst_q4 = s + 4 * pitch;
+  uint16_t *const dst_q5 = s + 5 * pitch;
+  uint16_t *const dst_q6 = s + 6 * pitch;
+
+  const uint16x4_t src[14] = {
+    vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+    vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+    vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+    vld1_u16(dst_q5), vld1_u16(dst_q6)
+  };
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask =
+      outer_threshold(src[5], src[6], src[7], src[8], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+  const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+  const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+  const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+  filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+                bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+  const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+  const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+  // Mask to choose between the outputs of filter8 and filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                              vabdq_u16(p0q0, p6q6), bd));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+  // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // filter8() and filter14() do not apply, but filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // filter14() does not apply, but filter8() and filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+
+  vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+  vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+  vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+  vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+  vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+  vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+void aom_highbd_lpf_horizontal_14_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab,
+                                          const uint16x8_t cd) {
+  uint16x8x2_t acdb;
+#if defined(__aarch64__)
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+  // a[b] <- [c]d
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+                     vreinterpretq_u64_u16(ab), 1));
+  // [a]b <- c[d]
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+                     vreinterpretq_u64_u16(ab), 0));
+#endif  // defined(__aarch64__)
+  return acdb;
+}
+
+void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int bd) {
+  uint16_t *const dst = s - 8;
+  uint16_t *const dst_0 = dst;
+  uint16_t *const dst_1 = dst + pitch;
+  uint16_t *const dst_2 = dst + 2 * pitch;
+  uint16_t *const dst_3 = dst + 3 * pitch;
+
+  // Low halves:  p7 p6 p5 p4
+  // High halves: p3 p2 p1 p0
+  uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                          vld1q_u16(dst_3) };
+  // p7 will be the low half of src_p[0]. Not used until the end.
+  transpose_u16_4x8q(src_p);
+
+  // Low halves:  q0 q1 q2 q3
+  // High halves: q4 q5 q6 q7
+  uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+                          vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
+  // q7 will be the high half of src_q[3]. Not used until the end.
+  transpose_u16_4x8q(src_q);
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+  const uint16x4_t outer_mask = outer_threshold(
+      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+      vget_low_u16(src_q[1]), outer_thresh);
+  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+                bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 =
+      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+  const uint16x8_t p5q5 =
+      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+  const uint16x8_t p6q6 =
+      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+  const uint16x8_t p7q7 =
+      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+  // Mask to choose between the outputs of filter8 and filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                              vabdq_u16(p0q0, p6q6), bd));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+  // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+  // output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // filter8() and filter14() do not apply, but filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // filter14() does not apply, but filter8() and filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                             p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
+  transpose_u16_4x8q(output_p);
+  uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                             p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
+  transpose_u16_4x8q(output_q);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+void aom_highbd_lpf_vertical_14_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1,
+                                  thresh1, bd);
+}
diff --git a/media/libaom/src/aom_dsp/arm/highbd_quantize_neon.c b/media/libaom/src/aom_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..bf218e9126
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return (uint32_t)vget_lane_u64(c, 0);
+#endif
+}
+
+static INLINE uint16x4_t
+quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+           int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32,
+           int32x4_t v_quant_shift_s32, int log_scale) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // if (abs_coeff < zbins[rc != 0]),
+  const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32);
+  const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+  // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+  const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+  //  const int32_t tmpw32 = tmp * wt;
+  const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS)));
+  //  const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16);
+  const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32);
+  // const int32_t tmp3 =
+  //    ((((tmp2 + tmpw32)<< log_scale) * (int64_t)(quant_shift << 15)) >> 32);
+  const int32x4_t v_tmp3 = vqdmulhq_s32(
+      vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32);
+  // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0;
+  const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask),
+                                           vshrq_n_s32(v_tmp3, AOM_QM_BITS));
+  // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale;
+  // vshlq_s32 will shift right if shift value is negative.
+  const int32x4_t v_abs_dqcoeff =
+      vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Used to find eob.
+  const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+  return vmovn_u32(nz_qcoeff_mask);
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#ifdef __aarch64__
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+static void highbd_quantize_b_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const int log_scale) {
+  (void)scan;
+  const int16x4_t v_quant = vld1_s16(quant_ptr);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+  const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+  const int16x4_t v_round_log_scale =
+      vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t v_round =
+      vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+  const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+  const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+  const int16x4_t v_zbin_log_scale =
+      vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t v_zbin =
+      vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+  int32x4_t v_round_s32 = vmovl_s16(v_round);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+  int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+  int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+  int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+  intptr_t non_zero_count = n_coeffs;
+
+  assert(n_coeffs > 8);
+  // Pre-scan pass
+  const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+  intptr_t i = n_coeffs;
+  do {
+    const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+    const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+    const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+    const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+    const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x);
+    const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x);
+    // If the coefficient is in the base ZBIN range, then discard.
+    if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+      non_zero_count -= 8;
+    } else {
+      break;
+    }
+    i -= 8;
+  } while (i > 0);
+
+  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+  memset(qcoeff_ptr + non_zero_count, 0,
+         remaining_zcoeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr + non_zero_count, 0,
+         remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+  // DC and first 3 AC
+  v_mask_lo =
+      quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+                 v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+  v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+  v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+  // 4 more AC
+  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                         v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+                         v_quant_shift_s32, log_scale);
+
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  intptr_t count = non_zero_count - 8;
+  for (; count > 0; count -= 8) {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+                           v_dequant_s32, v_round_s32, v_zbin_s32,
+                           v_quant_shift_s32, log_scale);
+    v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                           v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+                           v_quant_shift_s32, log_scale);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+  }
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
+
+void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, 0);
+}
+
+void aom_highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, 1);
+}
+
+void aom_highbd_quantize_b_64x64_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, 2);
+}
diff --git a/media/libaom/src/aom_dsp/arm/highbd_variance_neon.c b/media/libaom/src/aom_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..3c3877aa6e
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   uint32_t *sse, int *sum);
+
+void aom_highbd_calc16x16var_neon(const uint16_t *src, int src_stride,
+                                  const uint16_t *ref, int ref_stride,
+                                  uint32_t *sse, int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; j += 8) {
+      const uint16x8_t v_a = vld1q_u16(&src[j]);
+      const uint16x8_t v_b = vld1q_u16(&ref[j]);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(vsubq_u16(v_a, v_b));
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo =
+          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+      v_sse_hi =
+          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void aom_highbd_calc8x8var_neon(const uint16_t *src, int src_stride,
+                                const uint16_t *ref, int ref_stride,
+                                uint32_t *sse, int *sum) {
+  int i;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < 8; ++i) {
+    const uint16x8_t v_a = vld1q_u16(&src[0]);
+    const uint16x8_t v_b = vld1q_u16(&ref[0]);
+    const int16x8_t sv_diff = vreinterpretq_s16_u16(vsubq_u16(v_a, v_b));
+    v_sum = vaddq_s16(v_sum, sv_diff);
+    v_sse_lo =
+        vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+    v_sse_hi =
+        vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void aom_highbd_calc4x4var_neon(const uint16_t *src, int src_stride,
+                                const uint16_t *ref, int ref_stride,
+                                uint32_t *sse, int *sum) {
+  int i;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < 4; i += 2) {
+    const uint16x4_t v_a_r0 = vld1_u16(&src[0]);
+    const uint16x4_t v_b_r0 = vld1_u16(&ref[0]);
+    const uint16x4_t v_a_r1 = vld1_u16(&src[src_stride]);
+    const uint16x4_t v_b_r1 = vld1_u16(&ref[ref_stride]);
+    const uint16x8_t v_a = vcombine_u16(v_a_r0, v_a_r1);
+    const uint16x8_t v_b = vcombine_u16(v_b_r0, v_b_r1);
+    const int16x8_t sv_diff = vreinterpretq_s16_u16(vsubq_u16(v_a, v_b));
+    v_sum = vaddq_s16(v_sum, sv_diff);
+    v_sse_lo =
+        vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+    v_sse_hi =
+        vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+static void highbd_10_variance_neon(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+#define VAR_FN(w, h, block_size, shift)                                    \
+  uint32_t aom_highbd_10_variance##w##x##h##_neon(                         \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
+      int ref_stride, uint32_t *sse) {                                     \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
+    highbd_10_variance_neon(                                               \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
+        aom_highbd_calc##block_size##x##block_size##var_neon, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+VAR_FN(16, 4, 4, 6)
+VAR_FN(4, 16, 4, 6)
+
+VAR_FN(8, 4, 4, 5)
+VAR_FN(4, 8, 4, 5)
+VAR_FN(4, 4, 4, 4)
+
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(64, 16, 16, 10)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef VAR_FN
diff --git a/media/libaom/src/aom_dsp/arm/intrapred_neon.c b/media/libaom/src/aom_dsp/arm/intrapred_neon.c
index c85b1e9100..8e6dc12003 100644
--- a/media/libaom/src/aom_dsp/arm/intrapred_neon.c
+++ b/media/libaom/src/aom_dsp/arm/intrapred_neon.c
@@ -10,11 +10,14 @@
  */
 
 #include <arm_neon.h>
+#include <assert.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/intrapred_common.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
@@ -529,62 +532,2890 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                       const uint16_t *above,
-                                       const uint16_t *left) {
-  assert(bw >= 4);
-  assert(IS_POWER_OF_TWO(bw));
-  int expected_dc, sum = 0;
-  const int count = bw * 2;
-  uint32x4_t sum_q = vdupq_n_u32(0);
-  uint32x2_t sum_d;
-  uint16_t *dst_1;
-  if (bw >= 8) {
-    for (int i = 0; i < bw; i += 8) {
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
-      above += 8;
-      left += 8;
+/* ---------------------P R E D I C T I O N   Z 1--------------------------- */
+
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+
+// Low bit depth functions
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
+    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
+    int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint16x8_t a0, a1;
+  uint16x8_t diff, a32;
+  uint16x8_t a16;
+  uint8x8_t a_mbase_x;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdup_n_u8(above[max_base_x]);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+  uint16x8_t c3f = vdupq_n_u16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < W; r++) {
+    uint16x8_t res;
+    uint16x8_t shift;
+    uint8x8x2_t v_tmp_a0_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
     }
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      dst_1 = dst;
-      for (int i = 0; i < bw; i += 8) {
-        vst1q_u16(dst_1, dc);
-        dst_1 += 8;
+
+    if (base_max_diff > H) base_max_diff = H;
+
+    if (upsample_above) {
+      v_tmp_a0_128 = vld2_u8(above + base);
+      shift = vshrq_n_u16(
+          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
+    } else {
+      v_tmp_a0_128.val[0] = vld1_u8(above + base);
+      v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
+      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    }
+    a0 = vmovl_u8(v_tmp_a0_128.val[0]);
+    a1 = vmovl_u8(v_tmp_a0_128.val[1]);
+    diff = vsubq_u16(a1, a0);        // a[x+1] - a[x]
+    a32 = vmlaq_u16(a16, a0, v_32);  // a[x] * 32 + 16
+    res = vmlaq_u16(a32, diff, shift);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
+    dst[r] =
+        vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));
+
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t dstvec[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
+                                        dx);
+  for (int i = 0; i < N; i++) {
+    vst1_lane_u32((uint32_t *)(dst + stride * i),
+                  vreinterpret_u32_u8(dstvec[i]), 0);
+  }
+}
+
+static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t dstvec[32];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
+                                        dx);
+  for (int i = 0; i < N; i++) {
+    vst1_u8(dst + stride * i, dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
+    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
+    int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t diff, a32;
+  uint16x8_t a16, c3f;
+  uint8x16_t a_mbase_x;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+
+  int x = dx;
+  for (int r = 0; r < W; r++) {
+    uint16x8x2_t res;
+    uint16x8_t shift;
+    uint8x16_t a0_128, a1_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
       }
-      dst += stride;
+      return;
     }
-  } else {  // 4x4
-    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      vst1_u16(dst, dc);
-      dst += stride;
+
+    if (base_max_diff > H) base_max_diff = H;
+
+    if (upsample_above) {
+      uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
+      a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
+      a1_128 = vextq_u8(a0_128, v_zero, 8);
+      shift = vshrq_n_u16(
+          vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
+    } else {
+      a0_128 = vld1q_u8(above + base);
+      a1_128 = vld1q_u8(above + base + 1);
+      shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    }
+    a0 = vzipq_u8(a0_128, v_zero);
+    a1 = vzipq_u8(a1_128, v_zero);
+    diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                            vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+    diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                            vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                           v_32);  // a[x] * 32 + 16
+    a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                           v_32);  // a[x] * 32 + 16
+    res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+    res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+    uint8x16_t v_temp =
+        vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+
+    uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
+    dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));
+
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  uint8x16_t dstvec[64];
+
+  dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    vst1q_u8(dst + stride * i, dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
+    int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
+    int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16_t a_mbase_x;
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t diff, a32;
+  uint16x8_t a16, c3f;
+
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  a16 = vdupq_n_u16(16);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    uint16x8x2_t res;
+    uint8x16_t res16[2];
+    uint8x16_t a0_128, a1_128;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base);
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i].val[0] = a_mbase_x;  // save 32 values
+        dstvec[i].val[1] = a_mbase_x;
+      }
+      return;
     }
+    if (base_max_diff > 32) base_max_diff = 32;
+
+    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+
+    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+      int mdiff = base_max_diff - j;
+      if (mdiff <= 0) {
+        res16[jj] = a_mbase_x;
+      } else {
+        a0_128 = vld1q_u8(above + base + j);
+        a1_128 = vld1q_u8(above + base + j + 1);
+        a0 = vzipq_u8(a0_128, v_zero);
+        a1 = vzipq_u8(a1_128, v_zero);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+
+        res16[jj] =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      }
+    }
+
+    uint8x16x2_t mask;
+
+    mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
+    mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
+    dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
+                                vbicq_u8(a_mbase_x, mask.val[0]));
+    dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
+                                vbicq_u8(a_mbase_x, mask.val[1]));
+    x += dx;
   }
 }
 
-#define intra_pred_highbd_sized_neon(type, width)               \
-  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
-      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
-      const uint16_t *left, int bd) {                           \
-    (void)bd;                                                   \
-    highbd_##type##_predictor(dst, stride, width, above, left); \
+static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  uint8x16x2_t dstvec[64];
+
+  dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    vst1q_u8(dst + stride * i, dstvec[i].val[0]);
+    vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
   }
+}
+
+static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((64 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+  uint8x16x2_t a0, a1;
+  uint16x8x2_t a32, diff;
+  uint16x8_t a16, c3f;
+  uint8x16_t a_mbase_x, max_base_x128, mask128;
+
+  a16 = vdupq_n_u16(16);
+  a_mbase_x = vdupq_n_u8(above[max_base_x]);
+  max_base_x128 = vdupq_n_u8(max_base_x);
+  c3f = vdupq_n_u16(0x3f);
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  uint8x16_t step = vdupq_n_u8(16);
+
+  int x = dx;
+  for (int r = 0; r < N; r++, dst += stride) {
+    uint16x8x2_t res;
+
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        vst1q_u8(dst, a_mbase_x);
+        vst1q_u8(dst + 16, a_mbase_x);
+        vst1q_u8(dst + 32, a_mbase_x);
+        vst1q_u8(dst + 48, a_mbase_x);
+        dst += stride;
+      }
+      return;
+    }
+
+    uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
+    uint8x16_t a0_128, a1_128, res128;
+    uint8x16_t base_inc128 =
+        vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
+                                               vcreate_u8(0x0F0E0D0C0B0A0908)));
+
+    for (int j = 0; j < 64; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        vst1q_u8(dst + j, a_mbase_x);
+      } else {
+        a0_128 = vld1q_u8(above + base + j);
+        a1_128 = vld1q_u8(above + base + 1 + j);
+        a0 = vzipq_u8(a0_128, v_zero);
+        a1 = vzipq_u8(a1_128, v_zero);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
+                      vreinterpretq_u16_u8(a0.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
+                      vreinterpretq_u16_u8(a0.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
+        uint8x16_t v_temp =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+
+        mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
+        res128 =
+            vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
+        vst1q_u8(dst + j, res128);
+
+        base_inc128 = vaddq_u8(base_inc128, step);
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+
+  switch (bw) {
+    case 4:
+      dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 8:
+      dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 16:
+      dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 32:
+      dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 64:
+      dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
+      break;
+    default: break;
+  }
+  return;
+}
+
+/* ---------------------P R E D I C T I O N   Z 2--------------------------- */
+
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
+  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+    0, 0, 0 },
+  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff }
+};
+
+static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
+                                             int shift_value) {
+  switch (shift_value) {
+    case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
+    case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
+    case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
+    default: break;
+  }
+}
+
+static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  uint16x8_t a0_x, a1_x, a32, diff;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint16x8_t v_zero = vdupq_n_u16(0);
+  uint16x8_t a16 = vdupq_n_u16(16);
+
+  uint8x8_t v_zero_u8 = vdup_n_u8(0);
+  uint16x4_t v_c3f = vdup_n_u16(0x3f);
+  uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
+  int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
+  int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
+  int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
+  int16x4_t dy64 = vdup_n_s16(dy);
+  int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
+  int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
+  int16x4_t v_one = vdup_lane_s16(v_1234, 0);
+
+  for (int r = 0; r < N; r++) {
+    uint16x8_t res, shift;
+    uint16x4_t ydx;
+    uint8x8_t resx, resy;
+    uint16x4x2_t v_shift;
+    v_shift.val[1] = vdup_n_u16(0);
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = v_zero;
+      a1_x = v_zero;
+      v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
+      v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
+    } else {
+      ydx = vdup_n_u16(y * dx);
+
+      if (upsample_above) {
+        uint8x8x2_t v_tmp;
+        v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
+        v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
+        uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
+        uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
+        a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
+        a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
+        v_shift.val[0] = vshr_n_u16(
+            vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
+      } else {
+        uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
+        vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
+        uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
+        v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
+        a0_x = vmovl_u8(v_a0_x64);
+        a1_x = vmovl_u8(v_a1_x64);
+      }
+    }
+
+    // y calc
+    uint8x8_t a0_y, a1_y;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+      int16x4_t v_r6 = vdup_n_s16(r << 6);
+      int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
+      int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
+      uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
+
+      base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
+      vst1_s16(base_y_c, base_y_c64);
+      a0_y = v_zero_u8;
+      a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
+      a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
+      a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
+      a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
+
+      base_y_c64 = vadd_s16(base_y_c64, v_one);
+      vst1_s16(base_y_c, base_y_c64);
+      a1_y = v_zero_u8;
+      a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
+      a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
+      a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
+      a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
+
+      if (upsample_left) {
+        v_shift.val[1] = vshr_n_u16(
+            vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
+                     v_c3f),
+            1);
+      } else {
+        v_shift.val[1] =
+            vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
+      }
+
+      a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
+      a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
+    }
+    shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
+    diff = vsubq_u16(a1_x, a0_x);      // a[x+1] - a[x]
+    a32 = vmlaq_u16(a16, a0_x, v_32);  // a[x] * 32 + 16
+    res = vmlaq_u16(a32, diff, shift);
+    resx = vshrn_n_u16(res, 5);
+    resy = vext_u8(resx, v_zero_u8, 4);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+    uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
+
+    dst += stride;
+  }
+}
+
+static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
+                                            int shift_value) {
+  switch (shift_value) {
+    case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
+    case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
+    case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
+    case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
+    case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
+    case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
+    case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
+    case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
+    case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
+    case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
+    case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
+    case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
+    case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
+    case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
+    case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
+    default: break;
+  }
+}
+
+static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int upsample_above, int upsample_left,
+                                      int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  uint8x16x2_t a0_x, a1_x;
+  uint16x8x2_t diff, a32;
+  uint16x8_t c1234, a16, c3f;
+  uint8x16_t a0_x128, a1_x128;
+  int16x8_t min_base_y128, dy128;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
+  int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
+  int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+
+  a16 = vdupq_n_u16(16);
+  c3f = vdupq_n_u16(0x3f);
+  min_base_y128 = vdupq_n_s16(min_base_y);
+  dy128 = vdupq_n_s16(dy);
+  c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+                       vcreate_u16(0x0008000700060005));
+
+  for (int r = 0; r < N; r++) {
+    uint8x8_t resx, resy, resxy;
+    uint16x8_t r6, ydx;
+    uint16x8x2_t res, shift;
+    shift.val[1] = vdupq_n_u16(0);
+
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
 
-#define intra_pred_square(type)           \
-  intra_pred_highbd_sized_neon(type, 4);  \
-  intra_pred_highbd_sized_neon(type, 8);  \
-  intra_pred_highbd_sized_neon(type, 16); \
-  intra_pred_highbd_sized_neon(type, 32); \
-  intra_pred_highbd_sized_neon(type, 64);
+    if (base_shift > 7) {
+      a0_x.val[0] = v_zero;
+      a0_x.val[1] = v_zero;
+      a1_x.val[0] = v_zero;
+      a1_x.val[1] = v_zero;
+      shift.val[0] = vreinterpretq_u16_u8(v_zero);
+      shift.val[1] = vreinterpretq_u16_u8(v_zero);
+    } else {
+      ydx = vdupq_n_u16(y * dx);
+      r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
+
+      if (upsample_above) {
+        uint8x8x2_t v_tmp;
+        v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
+        v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
+        uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
+        uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
+        shift.val[0] = vshrq_n_u16(
+            vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
+        a0_x.val[0] =
+            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
+        a1_x.val[0] =
+            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
+      } else {
+        a0_x128 = vld1q_u8(above + base_x + base_shift);
+        a1_x128 = vextq_u8(a0_x128, v_zero, 1);
+        vector_shuffle(&a0_x128, &v_zero, base_shift);
+        vector_shuffle(&a1_x128, &v_zero, base_shift);
+        shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
+        a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
+        a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
+      }
+    }
+
+    // y calc
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      int16x8_t y_c128, base_y_c128;
+      uint16x8_t mask128;
+      int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+      y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
+      base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
+      mask128 = vcgtq_s16(min_base_y128, base_y_c128);
+
+      base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
+      vst1q_s16(base_y_c, base_y_c128);
+      a0_x.val[1] = v_zero;
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
+      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
+
+      base_y_c128 =
+          vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
+      vst1q_s16(base_y_c, base_y_c128);
+      a1_x.val[1] = v_zero;
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
+      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
+
+      if (upsample_left) {
+        shift.val[1] = vshrq_n_u16(
+            vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
+                      c3f),
+            1);
+      } else {
+        shift.val[1] =
+            vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
+      }
+    }
+    diff.val[0] =
+        vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
+                  vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
+    diff.val[1] =
+        vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
+                  vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
+    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
+                           v_32);  // a[x] * 32 + 16
+    a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
+                           v_32);  // a[x] * 32 + 16
+    res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
+    res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
+    resx = vshrn_n_u16(res.val[0], 5);
+    resy = vshrn_n_u16(res.val[1], 5);
+
+    uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+
+    resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+    vst1_u8(dst, resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
+                                      ptrdiff_t stride, const uint8_t *above,
+                                      const uint8_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  uint16x8_t a16, c1, c3f;
+  int16x8_t min_base_y256, dy256;
+  uint16x8x2_t a32, c0123, c1234, diff, shifty;
+  uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
+  uint8x16_t a0_x128, a1_x128;
+  uint16x8_t v_32 = vdupq_n_u16(32);
+  uint8x16_t v_zero = vdupq_n_u8(0);
+  int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+
+  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
+  a16 = vdupq_n_u16(16);
+  c1 = vshrq_n_u16(a16, 4);
+  min_base_y256 = vdupq_n_s16(min_base_y);
+  c3f = vdupq_n_u16(0x3f);
+  dy256 = vdupq_n_s16(dy);
+  c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
+                              vcreate_u16(0x0007000600050004));
+  c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
+                              vcreate_u16(0x000F000E000D000C));
+  c1234.val[0] = vaddq_u16(c0123.val[0], c1);
+  c1234.val[1] = vaddq_u16(c0123.val[1], c1);
+
+  for (int r = 0; r < H; r++) {
+    uint16x8x2_t res, r6, shift;
+    uint16x8_t ydx, j256;
+    uint8x16_t resx, resy, resxy;
+    int y = r + 1;
+    ydx = vdupq_n_u16((uint16_t)(y * dx));
+
+    int base_x = (-y * dx) >> frac_bits_x;
+    for (int j = 0; j < W; j += 16) {
+      j256 = vdupq_n_u16(j);
+
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+
+      if (base_shift < 16) {
+        a0_x128 = vld1q_u8(above + base_x + base_shift + j);
+        a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
+        vector_shuffle(&a0_x128, &v_zero, base_shift);
+        vector_shuffle(&a1_x128, &v_zero, base_shift);
+        a0_x = vzipq_u8(a0_x128, v_zero);
+        a1_x = vzipq_u8(a1_x128, v_zero);
+        r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
+        r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
+        shift.val[0] =
+            vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
+        shift.val[1] =
+            vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
+                      vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
+                      vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
+        resx =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      } else {
+        resx = v_zero;
+      }
+
+      // y calc
+      if (base_x < min_base_x) {
+        uint16x8x2_t mask256;
+        int16x8x2_t c256, y_c256, base_y_c256, mul16;
+        int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+        c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
+                                vreinterpretq_s16_u16(c1234.val[0]));
+        c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
+                                vreinterpretq_s16_u16(c1234.val[1]));
+        mul16.val[0] = vreinterpretq_s16_u16(
+            vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[0], dy256)),
+                      vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1)));
+        mul16.val[1] = vreinterpretq_s16_u16(
+            vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[1], dy256)),
+                      vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1)));
+        y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
+        y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
+
+        base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
+        base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
+        mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
+        mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
+
+        base_y_c256.val[0] = vorrq_s16(
+            vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
+            vbicq_s16(base_y_c256.val[0],
+                      vreinterpretq_s16_u16(mask256.val[0])));
+        base_y_c256.val[1] = vorrq_s16(
+            vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
+            vbicq_s16(base_y_c256.val[1],
+                      vreinterpretq_s16_u16(mask256.val[1])));
+
+        int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
+        int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
+        int16_t offset_diff = max_y - min_y;
+
+        if (offset_diff < 16) {
+          assert(offset_diff >= 0);
+          int16x8_t min_y256 =
+              vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
+
+          int16x8x2_t base_y_offset;
+          base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
+          base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
+
+          int8x16_t base_y_offset128 =
+              vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
+                          vqmovn_s16(base_y_offset.val[1]));
+
+          uint8x16_t a0_y128, a1_y128;
+          uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
+          a0_y128 = vld1q_u8(left + min_y);
+          a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
+          a1_y128 = vld1q_u8(left + min_y + 1);
+          a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
+#if defined(__aarch64__)
+          a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
+          a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
+#else
+          uint8x8x2_t v_tmp;
+          uint8x8x2_t v_res;
+          uint8x8_t v_index_low =
+              vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
+          uint8x8_t v_index_high =
+              vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
+          v_tmp.val[0] = vget_low_u8(a0_y128);
+          v_tmp.val[1] = vget_high_u8(a0_y128);
+          v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+          v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+          a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+          v_tmp.val[0] = vget_low_u8(a1_y128);
+          v_tmp.val[1] = vget_high_u8(a1_y128);
+          v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+          v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+          a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+#endif
+          a0_y = vzipq_u8(a0_y128, v_zero);
+          a1_y = vzipq_u8(a1_y128, v_zero);
+        } else {
+          base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
+                                         vreinterpretq_s16_u16(mask256.val[0]));
+          base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
+                                         vreinterpretq_s16_u16(mask256.val[1]));
+          vst1q_s16(base_y_c, base_y_c256.val[0]);
+          vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
+          a0_y.val[0] = v_zero;
+          a0_y.val[1] = v_zero;
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
+          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
+          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
+
+          base_y_c256.val[0] =
+              vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
+          base_y_c256.val[1] =
+              vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
+          vst1q_s16(base_y_c, base_y_c256.val[0]);
+          vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
+          a1_y.val[0] = v_zero;
+          a1_y.val[1] = v_zero;
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
+          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
+          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
+        }
+        shifty.val[0] = vshrq_n_u16(
+            vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
+        shifty.val[1] = vshrq_n_u16(
+            vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
+        diff.val[0] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
+                      vreinterpretq_u16_u8(a0_y.val[0]));  // a[x+1] - a[x]
+        diff.val[1] =
+            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
+                      vreinterpretq_u16_u8(a0_y.val[1]));  // a[x+1] - a[x]
+        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
+                               v_32);  // a[x] * 32 + 16
+        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
+                               v_32);  // a[x] * 32 + 16
+        res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
+        res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
+
+        resy =
+            vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
+      } else {
+        resy = v_zero;
+      }
+      uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
+      resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
+      vst1q_u8(dst + j, resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int upsample_left, int dx,
+                               int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  switch (bw) {
+    case 4:
+      dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    case 8:
+      dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+      break;
+    default:
+      dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
+                                upsample_above, upsample_left, dx, dy);
+      break;
+  }
+  return;
+}
+
+/* ---------------------P R E D I C T I O N   Z 3--------------------------- */
+
+static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
+                                                uint16x8x2_t *d) {
+  uint8x16x2_t w0, w1;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+
+  d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                   vreinterpretq_u16_u8(w1.val[0]));
+  d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                   vreinterpretq_u16_u8(w1.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
+                                                       uint16x4x2_t *d) {
+  uint8x8x2_t w0, w1;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+
+  *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+}
+
+static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
+                                                   uint16x4x2_t *d) {
+  uint8x8x2_t w0, w1;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+
+  d[0] =
+      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  d[1] =
+      vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
+                                                   uint32x2x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3;
+  uint16x4x2_t w4, w5;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+
+  d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                  vreinterpret_u32_u16(w5.val[0]));
+  d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                  vreinterpret_u32_u16(w5.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3;
+  uint16x4x2_t w4, w5, w6, w7;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+
+  d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                  vreinterpret_u32_u16(w5.val[0]));
+  d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                  vreinterpret_u32_u16(w5.val[1]));
+
+  w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+  w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+
+  d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
+                  vreinterpret_u32_u16(w7.val[0]));
+  d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
+                  vreinterpret_u32_u16(w7.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
+                                                     uint64x2_t *d) {
+  uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
+  uint16x4x2_t w4, w5, w12, w13;
+  uint32x2x2_t w6, w7, w14, w15;
+
+  w0 = vzip_u8(x[0], x[1]);
+  w1 = vzip_u8(x[2], x[3]);
+  w2 = vzip_u8(x[4], x[5]);
+  w3 = vzip_u8(x[6], x[7]);
+
+  w8 = vzip_u8(x[8], x[9]);
+  w9 = vzip_u8(x[10], x[11]);
+  w10 = vzip_u8(x[12], x[13]);
+  w11 = vzip_u8(x[14], x[15]);
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+  w12 =
+      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
+  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
+                 vreinterpret_u16_u8(w11.val[0]));
+
+  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                vreinterpret_u32_u16(w5.val[0]));
+  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                vreinterpret_u32_u16(w5.val[1]));
+  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+                 vreinterpret_u32_u16(w13.val[0]));
+  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+                 vreinterpret_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+  d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
+                      vreinterpret_u64_u32(w14.val[0]));
+  d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
+                      vreinterpret_u64_u32(w14.val[1]));
+  d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
+                      vreinterpret_u64_u32(w15.val[0]));
+  d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
+                      vreinterpret_u64_u32(w15.val[1]));
+
+  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+  w12 =
+      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
+  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
+                 vreinterpret_u16_u8(w11.val[1]));
+
+  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+                vreinterpret_u32_u16(w5.val[0]));
+  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+                vreinterpret_u32_u16(w5.val[1]));
+  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+                 vreinterpret_u32_u16(w13.val[0]));
+  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+                 vreinterpret_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+  d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
+                      vreinterpret_u64_u32(w14.val[0]));
+  d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
+                      vreinterpret_u64_u32(w14.val[1]));
+  d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
+                      vreinterpret_u64_u32(w15.val[0]));
+  d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
+                      vreinterpret_u64_u32(w15.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
+                                                     uint64x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3;
+  uint16x8x2_t w4, w5, w6, w7;
+  uint32x4x2_t w8, w9, w10, w11;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+  w2 = vzipq_u8(x[4], x[5]);
+  w3 = vzipq_u8(x[6], x[7]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+
+  w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
+                 vreinterpretq_u32_u16(w7.val[0]));
+  w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                  vreinterpretq_u32_u16(w5.val[1]));
+  w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
+                  vreinterpretq_u32_u16(w7.val[1]));
+
+#if defined(__aarch64__)
+  d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
+                    vreinterpretq_u64_u32(w9.val[0]));
+  d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
+                    vreinterpretq_u64_u32(w9.val[0]));
+  d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
+                    vreinterpretq_u64_u32(w9.val[1]));
+  d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
+                    vreinterpretq_u64_u32(w9.val[1]));
+  d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
+                    vreinterpretq_u64_u32(w11.val[0]));
+  d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
+                    vreinterpretq_u64_u32(w11.val[0]));
+  d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
+                    vreinterpretq_u64_u32(w11.val[1]));
+  d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
+                    vreinterpretq_u64_u32(w11.val[1]));
+#else
+  d[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
+  d[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
+  d[2] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
+  d[3] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
+  d[4] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
+  d[5] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
+  d[6] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
+  d[7] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
+#endif
+}
+
+static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
+  uint16x8x2_t w8, w9, w10, w11;
+  uint32x4x2_t w12, w13, w14, w15;
+
+  w0 = vzipq_u8(x[0], x[1]);
+  w1 = vzipq_u8(x[2], x[3]);
+  w2 = vzipq_u8(x[4], x[5]);
+  w3 = vzipq_u8(x[6], x[7]);
+
+  w4 = vzipq_u8(x[8], x[9]);
+  w5 = vzipq_u8(x[10], x[11]);
+  w6 = vzipq_u8(x[12], x[13]);
+  w7 = vzipq_u8(x[14], x[15]);
+
+  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
+                  vreinterpretq_u16_u8(w5.val[0]));
+  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
+                  vreinterpretq_u16_u8(w7.val[0]));
+
+  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+                  vreinterpretq_u32_u16(w9.val[0]));
+  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+                  vreinterpretq_u32_u16(w11.val[0]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+                  vreinterpretq_u32_u16(w9.val[1]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+                  vreinterpretq_u32_u16(w11.val[1]));
+
+#if defined(__aarch64__)
+  d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                    vreinterpretq_u64_u32(w13.val[1]));
+  d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                    vreinterpretq_u64_u32(w13.val[1]));
+  d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                    vreinterpretq_u64_u32(w15.val[0]));
+  d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                    vreinterpretq_u64_u32(w15.val[0]));
+  d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                    vreinterpretq_u64_u32(w15.val[1]));
+  d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                    vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
+  d[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
+  d[2] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
+  d[3] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
+  d[4] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
+  d[5] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
+  d[6] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
+  d[7] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  // upper half
+  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
+                  vreinterpretq_u16_u8(w5.val[1]));
+  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
+                  vreinterpretq_u16_u8(w7.val[1]));
+
+  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+                  vreinterpretq_u32_u16(w9.val[0]));
+  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+                  vreinterpretq_u32_u16(w11.val[0]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+                  vreinterpretq_u32_u16(w9.val[1]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+                  vreinterpretq_u32_u16(w11.val[1]));
+
+#if defined(__aarch64__)
+  d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
+                    vreinterpretq_u64_u32(w13.val[0]));
+  d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                     vreinterpretq_u64_u32(w13.val[1]));
+  d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
+                     vreinterpretq_u64_u32(w13.val[1]));
+  d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                     vreinterpretq_u64_u32(w15.val[0]));
+  d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
+                     vreinterpretq_u64_u32(w15.val[0]));
+  d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                     vreinterpretq_u64_u32(w15.val[1]));
+  d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
+                     vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[8] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
+  d[9] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
+  d[10] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
+  d[11] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
+  d[12] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
+  d[13] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
+  d[14] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
+  d[15] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
+#endif
+}
+
+static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
+                                                 uint64x2x2_t *d) {
+  uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
+  uint16x8x2_t w4, w5, w12, w13;
+  uint32x4x2_t w6, w7, w14, w15;
+
+  w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
+  w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
+  w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
+  w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
+
+  w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
+  w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
+  w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
+  w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
+                  vreinterpretq_u16_u8(w9.val[0]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
+                  vreinterpretq_u16_u8(w11.val[0]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+
+#if defined(__aarch64__)
+  d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+  d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[0].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[0].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[1].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[1].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[2].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[2].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[3].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[3].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
+                  vreinterpretq_u16_u8(w9.val[1]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
+                  vreinterpretq_u16_u8(w11.val[1]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+
+#if defined(__aarch64__)
+  d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                           vreinterpretq_u64_u32(w15.val[0]));
+  d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+  d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                           vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[4].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[4].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[5].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[5].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[6].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[6].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[7].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[7].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  // upper half
+  w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
+  w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
+  w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
+  w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
+
+  w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
+  w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
+  w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
+  w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+                 vreinterpretq_u16_u8(w1.val[0]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+                 vreinterpretq_u16_u8(w3.val[0]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
+                  vreinterpretq_u16_u8(w9.val[0]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
+                  vreinterpretq_u16_u8(w11.val[0]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store first 4-line result
+
+#if defined(__aarch64__)
+  d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                           vreinterpretq_u64_u32(w14.val[0]));
+  d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                           vreinterpretq_u64_u32(w14.val[1]));
+  d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+  d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[8].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[8].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[9].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[9].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[10].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[10].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[11].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[11].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+
+  w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+                 vreinterpretq_u16_u8(w1.val[1]));
+  w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+                 vreinterpretq_u16_u8(w3.val[1]));
+  w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
+                  vreinterpretq_u16_u8(w9.val[1]));
+  w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
+                  vreinterpretq_u16_u8(w11.val[1]));
+
+  w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+                 vreinterpretq_u32_u16(w5.val[0]));
+  w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+                 vreinterpretq_u32_u16(w5.val[1]));
+  w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
+                  vreinterpretq_u32_u16(w13.val[0]));
+  w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
+                  vreinterpretq_u32_u16(w13.val[1]));
+
+  // Store second 4-line result
+
+#if defined(__aarch64__)
+  d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                            vreinterpretq_u64_u32(w14.val[0]));
+  d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
+                            vreinterpretq_u64_u32(w14.val[0]));
+  d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                            vreinterpretq_u64_u32(w14.val[1]));
+  d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
+                            vreinterpretq_u64_u32(w14.val[1]));
+  d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
+                            vreinterpretq_u64_u32(w15.val[0]));
+  d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+  d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
+                            vreinterpretq_u64_u32(w15.val[1]));
+#else
+  d[12].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
+  d[12].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
+  d[13].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
+  d[13].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
+  d[14].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
+  d[14].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
+  d[15].val[0] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
+  d[15].val[1] = vreinterpretq_u64_u32(
+      vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
+#endif
+}
+
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+                               uint8_t *dst, ptrdiff_t pitchDst) {
+  uint8x16_t r[16];
+  uint64x2_t d[16];
+  for (int i = 0; i < 16; i++) {
+    r[i] = vld1q_u8(src + i * pitchSrc);
+  }
+  transpose16x16_neon(r, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+                      ptrdiff_t pitchDst, int width, int height) {
+  for (int j = 0; j < height; j += 16) {
+    for (int i = 0; i < width; i += 16) {
+      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                         dst + j * pitchDst + i, pitchDst);
+    }
+  }
+}
+
+static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[4];
+  uint16x4x2_t dest;
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_low_neon(dstvec, &dest);
+  vst1_lane_u32((uint32_t *)(dst + stride * 0),
+                vreinterpret_u32_u16(dest.val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 1),
+                vreinterpret_u32_u16(dest.val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 2),
+                vreinterpret_u32_u16(dest.val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 3),
+                vreinterpret_u32_u16(dest.val[1]), 1);
+}
+
+static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[8];
+  uint32x2x2_t d[4];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_neon(dstvec, d);
+  vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
+  vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
+  vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
+  vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
+  vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
+  vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
+  vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
+  vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
+}
+
+static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[4];
+  uint16x4x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_neon(dstvec, d);
+  vst1_lane_u32((uint32_t *)(dst + stride * 0),
+                vreinterpret_u32_u16(d[0].val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 1),
+                vreinterpret_u32_u16(d[0].val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 2),
+                vreinterpret_u32_u16(d[0].val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 3),
+                vreinterpret_u32_u16(d[0].val[1]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 4),
+                vreinterpret_u32_u16(d[1].val[0]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 5),
+                vreinterpret_u32_u16(d[1].val[0]), 1);
+  vst1_lane_u32((uint32_t *)(dst + stride * 6),
+                vreinterpret_u32_u16(d[1].val[1]), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride * 7),
+                vreinterpret_u32_u16(d[1].val[1]), 1);
+}
+
+static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t dstvec[8];
+  uint32x2x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_low_neon(dstvec, d);
+  vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
+  vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
+  vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
+  vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
+}
+
+static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t dstvec[8];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
+  transpose8x16_16x8_neon(dstvec, d);
+  for (int i = 0; i < 8; i++) {
+    vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
+    vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
+  }
+}
+
+static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[16];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  for (int i = 0; i < 8; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t dstvec[4];
+  uint16x8x2_t d[2];
+
+  dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
+  transpose4x16_neon(dstvec, d);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 0),
+                 vreinterpretq_u32_u16(d[0].val[0]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 1),
+                 vreinterpretq_u32_u16(d[0].val[0]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 2),
+                 vreinterpretq_u32_u16(d[0].val[0]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 3),
+                 vreinterpretq_u32_u16(d[0].val[0]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 4),
+                 vreinterpretq_u32_u16(d[0].val[1]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 5),
+                 vreinterpretq_u32_u16(d[0].val[1]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 6),
+                 vreinterpretq_u32_u16(d[0].val[1]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 7),
+                 vreinterpretq_u32_u16(d[0].val[1]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 8),
+                 vreinterpretq_u32_u16(d[1].val[0]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 9),
+                 vreinterpretq_u32_u16(d[1].val[0]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 10),
+                 vreinterpretq_u32_u16(d[1].val[0]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 11),
+                 vreinterpretq_u32_u16(d[1].val[0]), 3);
+
+  vst1q_lane_u32((uint32_t *)(dst + stride * 12),
+                 vreinterpretq_u32_u16(d[1].val[1]), 0);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 13),
+                 vreinterpretq_u32_u16(d[1].val[1]), 1);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 14),
+                 vreinterpretq_u32_u16(d[1].val[1]), 2);
+  vst1q_lane_u32((uint32_t *)(dst + stride * 15),
+                 vreinterpretq_u32_u16(d[1].val[1]), 3);
+}
+
+static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[16];
+  uint64x2_t d[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  for (int i = 0; i < 4; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16x2_t dstvec[16];
+  uint64x2x2_t d[16];
+  uint8x16_t v_zero = vdupq_n_u8(0);
+
+  dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
+  for (int i = 8; i < 16; i++) {
+    dstvec[i].val[0] = v_zero;
+    dstvec[i].val[1] = v_zero;
+  }
+  transpose16x32_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1_u8(dst + 2 * i * stride,
+            vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
+    vst1_u8(dst + (2 * i + 1) * stride,
+            vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
+  }
+}
+
+static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t dstvec[32];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_neon(dstvec, d);
+  transpose16x8_8x16_neon(dstvec + 16, d + 8);
+  for (int i = 0; i < 8; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+    vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
+  }
+}
+
+static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[16];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
+  transpose16x16_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
+  }
+}
+
+static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16x2_t dstvec[32];
+  uint64x2x2_t d[32];
+
+  dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
+  transpose16x32_neon(dstvec, d);
+  transpose16x32_neon(dstvec + 16, d + 16);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
+    vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
+    vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
+    vst1q_u8(dst + (2 * i + 1) * stride + 16,
+             vreinterpretq_u8_u64(d[i + 16].val[1]));
+  }
+}
+
+static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+
+  dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16x2_t dstvec[16];
+  uint64x2x2_t d[16];
+
+  dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
+  transpose16x32_neon(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
+    vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
+  }
+}
+
+static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[32];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 32; i += 16) {
+    transpose16x16_neon((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
+    }
+  }
+}
+
+static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 32];
+
+  dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[32 * 64];
+
+  dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
+  transpose(dstT, 32, dst, stride, 64, 32);
+}
+
+static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t dstT[64 * 16];
+
+  dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t dstvec[64];
+  uint64x2_t d[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 64; i += 16) {
+    transpose16x16_neon((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
+    }
+  }
+}
+
+void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (bw == bh) {
+    switch (bw) {
+      case 4:
+        dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 8:
+        dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 16:
+        dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 32:
+        dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
+        break;
+      case 64:
+        dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
+        break;
+    }
+  } else {
+    if (bw < bh) {
+      if (bw + bw == bh) {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    } else {
+      if (bh + bh == bw) {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      } else {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
+            break;
+        }
+      }
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// 256 - v = vneg_s8(v)
+static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
+  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
+}
+
+static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *const top_row,
+                            const uint8_t *const left_column,
+                            const int height) {
+  const uint8_t top_right = top_row[3];
+  const uint8_t bottom_left = left_column[height - 1];
+  const uint8_t *const weights_y = smooth_weights + height - 4;
+
+  uint8x8_t UNINITIALIZED_IS_SAFE(top_v);
+  load_u8_4x1(top_row, &top_v, 0);
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  uint8x8_t UNINITIALIZED_IS_SAFE(weights_x_v);
+  load_u8_4x1(smooth_weights, &weights_x_v, 0);
+  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  assert(height > 0);
+  int y = 0;
+  do {
+    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_bl, weights_y_v, top_v);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x_v, left_v);
+    // Maximum value of each parameter: 0xFF00
+    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+    const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
+    dst += stride;
+  } while (++y != height);
+}
+
+static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
+                                       const uint16x8_t weighted_left_tr) {
+  // Maximum value of each parameter: 0xFF00
+  const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+  return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+}
+
+static INLINE uint8x8_t calculate_weights_and_pred(
+    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+    const uint8x8_t bottom_left, const uint8x8_t weights_x,
+    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+  const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+  const uint16x8_t weighted_top_bl =
+      vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+  const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+  return calculate_pred(weighted_top_bl, weighted_left_tr);
+}
+
+static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *const top_row,
+                            const uint8_t *const left_column,
+                            const int height) {
+  const uint8_t top_right = top_row[7];
+  const uint8_t bottom_left = left_column[height - 1];
+  const uint8_t *const weights_y = smooth_weights + height - 4;
+
+  const uint8x8_t top_v = vld1_u8(top_row);
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
+  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  assert(height > 0);
+  int y = 0;
+  do {
+    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+    const uint8x8_t result =
+        calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
+                                   weights_x_v, scaled_weights_y, weights_y_v);
+
+    vst1_u8(dst, result);
+    dst += stride;
+  } while (++y != height);
+}
+
+#define SMOOTH_NXM(W, H)                                                       \
+  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+                                             const uint8_t *above,             \
+                                             const uint8_t *left) {            \
+    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
+  }
+
+SMOOTH_NXM(4, 4)
+SMOOTH_NXM(4, 8)
+SMOOTH_NXM(8, 4)
+SMOOTH_NXM(8, 8)
+SMOOTH_NXM(4, 16)
+SMOOTH_NXM(8, 16)
+SMOOTH_NXM(8, 32)
+
+#undef SMOOTH_NXM
+
+static INLINE uint8x16_t calculate_weights_and_predq(
+    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x8_t weights_y, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+  const uint16x8_t weighted_top_bl_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_low =
+      calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
+
+  const uint16x8_t weighted_top_bl_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_high =
+      calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
+
+  return vcombine_u8(result_low, result_high);
+}
+
+// 256 - v = vneg_s8(v)
+static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
+  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+// For width 16 and above.
+#define SMOOTH_PREDICTOR(W)                                                 \
+  static void smooth_##W##xh_neon(                                          \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
+      const uint8_t *const left_column, const int height) {                 \
+    const uint8_t top_right = top_row[(W)-1];                               \
+    const uint8_t bottom_left = left_column[height - 1];                    \
+    const uint8_t *const weights_y = smooth_weights + height - 4;           \
+                                                                            \
+    uint8x16_t top_v[4];                                                    \
+    top_v[0] = vld1q_u8(top_row);                                           \
+    if ((W) > 16) {                                                         \
+      top_v[1] = vld1q_u8(top_row + 16);                                    \
+      if ((W) == 64) {                                                      \
+        top_v[2] = vld1q_u8(top_row + 32);                                  \
+        top_v[3] = vld1q_u8(top_row + 48);                                  \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);                 \
+                                                                            \
+    uint8x16_t weights_x_v[4];                                              \
+    weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4);                      \
+    if ((W) > 16) {                                                         \
+      weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);             \
+      if ((W) == 64) {                                                      \
+        weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);           \
+        weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);           \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    uint8x16_t scaled_weights_x[4];                                         \
+    scaled_weights_x[0] = negate_s8q(weights_x_v[0]);                       \
+    if ((W) > 16) {                                                         \
+      scaled_weights_x[1] = negate_s8q(weights_x_v[1]);                     \
+      if ((W) == 64) {                                                      \
+        scaled_weights_x[2] = negate_s8q(weights_x_v[2]);                   \
+        scaled_weights_x[3] = negate_s8q(weights_x_v[3]);                   \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    for (int y = 0; y < height; ++y) {                                      \
+      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);                \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);            \
+      const uint16x8_t weighted_bl =                                        \
+          vmull_u8(scaled_weights_y, bottom_left_v);                        \
+                                                                            \
+      vst1q_u8(dst, calculate_weights_and_predq(                            \
+                        top_v[0], left_v, top_right_v, weights_y_v,         \
+                        weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
+                                                                            \
+      if ((W) > 16) {                                                       \
+        vst1q_u8(dst + 16,                                                  \
+                 calculate_weights_and_predq(                               \
+                     top_v[1], left_v, top_right_v, weights_y_v,            \
+                     weights_x_v[1], scaled_weights_x[1], weighted_bl));    \
+        if ((W) == 64) {                                                    \
+          vst1q_u8(dst + 32,                                                \
+                   calculate_weights_and_predq(                             \
+                       top_v[2], left_v, top_right_v, weights_y_v,          \
+                       weights_x_v[2], scaled_weights_x[2], weighted_bl));  \
+          vst1q_u8(dst + 48,                                                \
+                   calculate_weights_and_predq(                             \
+                       top_v[3], left_v, top_right_v, weights_y_v,          \
+                       weights_x_v[3], scaled_weights_x[3], weighted_bl));  \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      dst += stride;                                                        \
+    }                                                                       \
+  }
+
+SMOOTH_PREDICTOR(16)
+SMOOTH_PREDICTOR(32)
+SMOOTH_PREDICTOR(64)
+
+#undef SMOOTH_PREDICTOR
+
+#define SMOOTH_NXM_WIDE(W, H)                                                  \
+  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+                                             const uint8_t *above,             \
+                                             const uint8_t *left) {            \
+    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
+  }
+
+SMOOTH_NXM_WIDE(16, 4)
+SMOOTH_NXM_WIDE(16, 8)
+SMOOTH_NXM_WIDE(16, 16)
+SMOOTH_NXM_WIDE(16, 32)
+SMOOTH_NXM_WIDE(16, 64)
+SMOOTH_NXM_WIDE(32, 8)
+SMOOTH_NXM_WIDE(32, 16)
+SMOOTH_NXM_WIDE(32, 32)
+SMOOTH_NXM_WIDE(32, 64)
+SMOOTH_NXM_WIDE(64, 16)
+SMOOTH_NXM_WIDE(64, 32)
+SMOOTH_NXM_WIDE(64, 64)
+
+#undef SMOOTH_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// For widths 4 and 8.
+#define SMOOTH_V_PREDICTOR(W)                                         \
+  static void smooth_v_##W##xh_neon(                                  \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
+      const uint8_t *const left_column, const int height) {           \
+    const uint8_t bottom_left = left_column[height - 1];              \
+    const uint8_t *const weights_y = smooth_weights + height - 4;     \
+                                                                      \
+    uint8x8_t UNINITIALIZED_IS_SAFE(top_v);                           \
+    if ((W) == 4) {                                                   \
+      load_u8_4x1(top_row, &top_v, 0);                                \
+    } else { /* width == 8 */                                         \
+      top_v = vld1_u8(top_row);                                       \
+    }                                                                 \
+                                                                      \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
+                                                                      \
+    assert(height > 0);                                               \
+    int y = 0;                                                        \
+    do {                                                              \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
+                                                                      \
+      const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
+      const uint16x8_t weighted_top_bl =                              \
+          vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
+      const uint8x8_t pred =                                          \
+          vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
+                                                                      \
+      if ((W) == 4) {                                                 \
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+      } else { /* width == 8 */                                       \
+        vst1_u8(dst, pred);                                           \
+      }                                                               \
+      dst += stride;                                                  \
+    } while (++y != height);                                          \
+  }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM(W, H)                                    \
+  void aom_smooth_v_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+    const uint8x16_t top, const uint8x8_t weights_y,
+    const uint16x8_t weighted_bl) {
+  const uint16x8_t pred_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t pred_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  const uint8x8_t pred_scaled_low =
+      vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_V_PREDICTOR(W)                                            \
+  static void smooth_v_##W##xh_neon(                                     \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
+      const uint8_t *const left_column, const int height) {              \
+    const uint8_t bottom_left = left_column[height - 1];                 \
+    const uint8_t *const weights_y = smooth_weights + height - 4;        \
+                                                                         \
+    uint8x16_t top_v[4];                                                 \
+    top_v[0] = vld1q_u8(top_row);                                        \
+    if ((W) > 16) {                                                      \
+      top_v[1] = vld1q_u8(top_row + 16);                                 \
+      if ((W) == 64) {                                                   \
+        top_v[2] = vld1q_u8(top_row + 32);                               \
+        top_v[3] = vld1q_u8(top_row + 48);                               \
+      }                                                                  \
+    }                                                                    \
+                                                                         \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
+                                                                         \
+    assert(height > 0);                                                  \
+    int y = 0;                                                           \
+    do {                                                                 \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
+      const uint16x8_t weighted_bl =                                     \
+          vmull_u8(scaled_weights_y, bottom_left_v);                     \
+                                                                         \
+      const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
+          top_v[0], weights_y_v, weighted_bl);                           \
+      vst1q_u8(dst, pred_0);                                             \
+                                                                         \
+      if ((W) > 16) {                                                    \
+        const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
+            top_v[1], weights_y_v, weighted_bl);                         \
+        vst1q_u8(dst + 16, pred_1);                                      \
+                                                                         \
+        if ((W) == 64) {                                                 \
+          const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
+              top_v[2], weights_y_v, weighted_bl);                       \
+          vst1q_u8(dst + 32, pred_2);                                    \
+                                                                         \
+          const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
+              top_v[3], weights_y_v, weighted_bl);                       \
+          vst1q_u8(dst + 48, pred_3);                                    \
+        }                                                                \
+      }                                                                  \
+                                                                         \
+      dst += stride;                                                     \
+    } while (++y != height);                                             \
+  }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM_WIDE(W, H)                               \
+  void aom_smooth_v_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// For widths 4 and 8.
+#define SMOOTH_H_PREDICTOR(W)                                               \
+  static void smooth_h_##W##xh_neon(                                        \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
+      const uint8_t *const left_column, const int height) {                 \
+    const uint8_t top_right = top_row[(W)-1];                               \
+                                                                            \
+    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
+    /* Over-reads for 4xN but still within the array. */                    \
+    const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4);            \
+    const uint8x8_t scaled_weights_x = negate_s8(weights_x);                \
+    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
+                                                                            \
+    assert(height > 0);                                                     \
+    int y = 0;                                                              \
+    do {                                                                    \
+      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
+      const uint16x8_t weighted_left_tr =                                   \
+          vmlal_u8(weighted_tr, weights_x, left_v);                         \
+      const uint8x8_t pred =                                                \
+          vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE);         \
+                                                                            \
+      if ((W) == 4) {                                                       \
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0);       \
+      } else { /* width == 8 */                                             \
+        vst1_u8(dst, pred);                                                 \
+      }                                                                     \
+      dst += stride;                                                        \
+    } while (++y != height);                                                \
+  }
+
+SMOOTH_H_PREDICTOR(4)
+SMOOTH_H_PREDICTOR(8)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM(W, H)                                    \
+  void aom_smooth_h_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_H_NXM(4, 4)
+SMOOTH_H_NXM(4, 8)
+SMOOTH_H_NXM(4, 16)
+SMOOTH_H_NXM(8, 4)
+SMOOTH_H_NXM(8, 8)
+SMOOTH_H_NXM(8, 16)
+SMOOTH_H_NXM(8, 32)
+
+#undef SMOOTH_H_NXM
+
+static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
+    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x) {
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t pred_scaled_low =
+      vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
+
+  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
+
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_H_PREDICTOR(W)                                              \
+  static void smooth_h_##W##xh_neon(                                       \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,        \
+      const uint8_t *const left_column, const int height) {                \
+    const uint8_t top_right = top_row[(W)-1];                              \
+                                                                           \
+    const uint8x8_t top_right_v = vdup_n_u8(top_right);                    \
+                                                                           \
+    uint8x16_t weights_x[4];                                               \
+    weights_x[0] = vld1q_u8(smooth_weights + (W)-4);                       \
+    if ((W) > 16) {                                                        \
+      weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);              \
+      if ((W) == 64) {                                                     \
+        weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);            \
+        weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);            \
+      }                                                                    \
+    }                                                                      \
+                                                                           \
+    uint8x16_t scaled_weights_x[4];                                        \
+    scaled_weights_x[0] = negate_s8q(weights_x[0]);                        \
+    if ((W) > 16) {                                                        \
+      scaled_weights_x[1] = negate_s8q(weights_x[1]);                      \
+      if ((W) == 64) {                                                     \
+        scaled_weights_x[2] = negate_s8q(weights_x[2]);                    \
+        scaled_weights_x[3] = negate_s8q(weights_x[3]);                    \
+      }                                                                    \
+    }                                                                      \
+                                                                           \
+    assert(height > 0);                                                    \
+    int y = 0;                                                             \
+    do {                                                                   \
+      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                  \
+                                                                           \
+      const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred(     \
+          left_v, top_right_v, weights_x[0], scaled_weights_x[0]);         \
+      vst1q_u8(dst, pred_0);                                               \
+                                                                           \
+      if ((W) > 16) {                                                      \
+        const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred(   \
+            left_v, top_right_v, weights_x[1], scaled_weights_x[1]);       \
+        vst1q_u8(dst + 16, pred_1);                                        \
+                                                                           \
+        if ((W) == 64) {                                                   \
+          const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
+              left_v, top_right_v, weights_x[2], scaled_weights_x[2]);     \
+          vst1q_u8(dst + 32, pred_2);                                      \
+                                                                           \
+          const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
+              left_v, top_right_v, weights_x[3], scaled_weights_x[3]);     \
+          vst1q_u8(dst + 48, pred_3);                                      \
+        }                                                                  \
+      }                                                                    \
+      dst += stride;                                                       \
+    } while (++y != height);                                               \
+  }
+
+SMOOTH_H_PREDICTOR(16)
+SMOOTH_H_PREDICTOR(32)
+SMOOTH_H_PREDICTOR(64)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM_WIDE(W, H)                               \
+  void aom_smooth_h_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_H_NXM_WIDE(16, 4)
+SMOOTH_H_NXM_WIDE(16, 8)
+SMOOTH_H_NXM_WIDE(16, 16)
+SMOOTH_H_NXM_WIDE(16, 32)
+SMOOTH_H_NXM_WIDE(16, 64)
+SMOOTH_H_NXM_WIDE(32, 8)
+SMOOTH_H_NXM_WIDE(32, 16)
+SMOOTH_H_NXM_WIDE(32, 32)
+SMOOTH_H_NXM_WIDE(32, 64)
+SMOOTH_H_NXM_WIDE(64, 16)
+SMOOTH_H_NXM_WIDE(64, 32)
+SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
+static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+                                       const uint8_t *const top_row,
+                                       const uint8_t *const left_column,
+                                       int width, int height) {
+  const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+  uint8x8_t top;
+  if (width == 4) {
+    load_u8_4x1(top_row, &top, 0);
+  } else {  // width == 8
+    top = vld1_u8(top_row);
+  }
+
+  assert(height > 0);
+  int y = 0;
+  do {
+    const uint8x8_t left = vdup_n_u8(left_column[y]);
+
+    const uint8x8_t left_dist = vabd_u8(top, top_left);
+    const uint8x8_t top_dist = vabd_u8(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+    const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+    const uint8x8_t left_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+    const uint8x8_t top_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+    //   dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
+    uint8x8_t result = vbsl_u8(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbsl_u8(left_or_top_mask, result, top_left);
+
+    if (width == 4) {
+      store_unaligned_u8_4x1(dest, result, 0);
+    } else {  // width == 8
+      vst1_u8(dest, result);
+    }
+    dest += stride;
+  } while (++y != height);
+}
+
+#define PAETH_NXM(W, H)                                                     \
+  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                            const uint8_t *above,           \
+                                            const uint8_t *left) {          \
+    paeth_4or8_x_h_neon(dst, stride, above, left, W, H);                    \
+  }
+
+PAETH_NXM(4, 4)
+PAETH_NXM(4, 8)
+PAETH_NXM(8, 4)
+PAETH_NXM(8, 8)
+PAETH_NXM(8, 16)
+
+PAETH_NXM(4, 16)
+PAETH_NXM(8, 32)
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x8_t.
+static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
+                                       const uint16x8_t top_left_dist_low,
+                                       const uint16x8_t top_left_dist_high) {
+  const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+                                               vqmovn_u16(top_left_dist_high));
+  return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Select the closest values and collect them.
+static INLINE uint8x16_t select_paeth(const uint8x16_t top,
+                                      const uint8x16_t left,
+                                      const uint8x16_t top_left,
+                                      const uint8x16_t left_le_top,
+                                      const uint8x16_t left_le_top_left,
+                                      const uint8x16_t top_le_top_left) {
+  // if (left_dist <= top_dist && left_dist <= top_left_dist)
+  const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+  //   dest[x] = left_column[y];
+  // Fill all the unused spaces with 'top'. They will be overwritten when
+  // the positions for top_left are known.
+  uint8x16_t result = vbslq_u8(left_mask, left, top);
+  // else if (top_dist <= top_left_dist)
+  //   dest[x] = top_row[x];
+  // Add these values to the mask. They were already set.
+  const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+  // else
+  //   dest[x] = top_left;
+  return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num)                                              \
+  const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
+      vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+  const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
+      vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num)                                     \
+  const uint8x16_t left_le_top_left_##num =                       \
+      x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
+                    top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num)                              \
+  const uint8x16_t top_le_top_left_##num = x_le_top_left( \
+      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+                                         const uint8_t *const top_row,
+                                         const uint8_t *const left_column,
+                                         int width, int height) {
+  const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+  uint8x16_t top[4];
+  top[0] = vld1q_u8(top_row);
+  if (width > 16) {
+    top[1] = vld1q_u8(top_row + 16);
+    if (width == 64) {
+      top[2] = vld1q_u8(top_row + 32);
+      top[3] = vld1q_u8(top_row + 48);
+    }
+  }
+
+  assert(height > 0);
+  int y = 0;
+  do {
+    const uint8x16_t left = vdupq_n_u8(left_column[y]);
+
+    const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+    const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+    TOP_LEFT_DIST(0);
+    const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+    LEFT_LE_TOP_LEFT(0);
+    TOP_LE_TOP_LEFT(0);
+
+    const uint8x16_t result_0 =
+        select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+                     top_le_top_left_0);
+    vst1q_u8(dest, result_0);
+
+    if (width > 16) {
+      const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+      TOP_LEFT_DIST(1);
+      const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+      LEFT_LE_TOP_LEFT(1);
+      TOP_LE_TOP_LEFT(1);
+
+      const uint8x16_t result_1 =
+          select_paeth(top[1], left, top_left, left_1_le_top,
+                       left_le_top_left_1, top_le_top_left_1);
+      vst1q_u8(dest + 16, result_1);
+
+      if (width == 64) {
+        const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+        TOP_LEFT_DIST(2);
+        const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+        LEFT_LE_TOP_LEFT(2);
+        TOP_LE_TOP_LEFT(2);
+
+        const uint8x16_t result_2 =
+            select_paeth(top[2], left, top_left, left_2_le_top,
+                         left_le_top_left_2, top_le_top_left_2);
+        vst1q_u8(dest + 32, result_2);
+
+        const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+        TOP_LEFT_DIST(3);
+        const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+        LEFT_LE_TOP_LEFT(3);
+        TOP_LE_TOP_LEFT(3);
+
+        const uint8x16_t result_3 =
+            select_paeth(top[3], left, top_left, left_3_le_top,
+                         left_le_top_left_3, top_le_top_left_3);
+        vst1q_u8(dest + 48, result_3);
+      }
+    }
+
+    dest += stride;
+  } while (++y != height);
+}
+
+#define PAETH_NXM_WIDE(W, H)                                                \
+  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                            const uint8_t *above,           \
+                                            const uint8_t *left) {          \
+    paeth16_plus_x_h_neon(dst, stride, above, left, W, H);                  \
+  }
 
-intra_pred_square(dc);
-#undef intra_pred_square
+PAETH_NXM_WIDE(16, 8)
+PAETH_NXM_WIDE(16, 16)
+PAETH_NXM_WIDE(16, 32)
+PAETH_NXM_WIDE(32, 16)
+PAETH_NXM_WIDE(32, 32)
+PAETH_NXM_WIDE(32, 64)
+PAETH_NXM_WIDE(64, 32)
+PAETH_NXM_WIDE(64, 64)
+
+PAETH_NXM_WIDE(16, 4)
+PAETH_NXM_WIDE(16, 64)
+PAETH_NXM_WIDE(32, 8)
+PAETH_NXM_WIDE(64, 16)
diff --git a/media/libaom/src/aom_dsp/arm/loopfilter_neon.c b/media/libaom/src/aom_dsp/arm/loopfilter_neon.c
index aafac8966d..f3f86a2b0e 100644
--- a/media/libaom/src/aom_dsp/arm/loopfilter_neon.c
+++ b/media/libaom/src/aom_dsp/arm/loopfilter_neon.c
@@ -15,8 +15,8 @@
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
                                  uint8x8_t p0q0, const uint8_t blimit,
@@ -695,6 +695,23 @@ void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
 }
 
+void aom_lpf_vertical_14_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                thresh);
+  aom_lpf_vertical_14_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+                                thresh, blimit, limit, thresh);
+}
+
 void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
   uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
@@ -738,6 +755,22 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
 
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_8_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+                               thresh, blimit, limit, thresh);
+}
+
 void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
   uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
@@ -781,6 +814,22 @@ void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
 }
 
+void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_6_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+                               thresh, blimit, limit, thresh);
+}
+
 void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
   uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
@@ -820,9 +869,28 @@ void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
 }
 
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_4_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+                               thresh, blimit, limit, thresh);
+}
+
 void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
+  uint8x8_t UNINITIALIZED_IS_SAFE(p0q0), UNINITIALIZED_IS_SAFE(p1q1),
+      UNINITIALIZED_IS_SAFE(p2q2), UNINITIALIZED_IS_SAFE(p3q3),
+      UNINITIALIZED_IS_SAFE(p4q4), UNINITIALIZED_IS_SAFE(p5q5),
+      UNINITIALIZED_IS_SAFE(p6q6);
 
   load_u8_4x1(src - 7 * stride, &p6q6, 0);
   load_u8_4x1(src - 6 * stride, &p5q5, 0);
@@ -856,6 +924,26 @@ void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_4x1(src + 5 * stride, p5q5, 1);
 }
 
+void aom_lpf_horizontal_14_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed
+// up.
+void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit,
+                                  limit, thresh);
+  aom_lpf_horizontal_14_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+                                  blimit, limit, thresh);
+}
+
 void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p0q0, p1q1, p2q2, p3q3;
@@ -885,6 +973,25 @@ void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
   vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
 }
 
+void aom_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed
+// up.
+void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_8_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
+
 void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p0q0, p1q1, p2q2;
@@ -909,6 +1016,25 @@ void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
   vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
 }
 
+void aom_lpf_horizontal_6_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed
+// up.
+void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_6_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
+
 void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
@@ -925,3 +1051,22 @@ void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
   store_u8_4x1(src + 0 * stride, p0q0, 1);
   store_u8_4x1(src + 1 * stride, p1q1, 1);
 }
+
+void aom_lpf_horizontal_4_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed
+// up.
+void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_4_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
diff --git a/media/libaom/src/av1/common/arm/mem_neon.h b/media/libaom/src/aom_dsp/arm/mem_neon.h
index 171055fe14..c8236dad3a 100644
--- a/media/libaom/src/av1/common/arm/mem_neon.h
+++ b/media/libaom/src/aom_dsp/arm/mem_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
-#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
 
 #include <arm_neon.h>
 #include <string.h>
@@ -536,4 +536,4 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
   vst1q_s32(buf + 4, v1);
 }
 
-#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/media/libaom/src/aom_dsp/arm/sad4d_neon.c b/media/libaom/src/aom_dsp/arm/sad4d_neon.c
index 606950ab25..22f2e643e7 100644
--- a/media/libaom/src/aom_dsp/arm/sad4d_neon.c
+++ b/media/libaom/src/aom_dsp/arm/sad4d_neon.c
@@ -82,7 +82,7 @@ static void sad_neon_32(const uint8x16_t vec_src_00,
 
 void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t res[4]) {
   int i;
   uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
   uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
@@ -128,7 +128,7 @@ void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
 
 void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t res[4]) {
   int i;
   uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
   uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
@@ -172,7 +172,7 @@ void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
 
 void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t res[4]) {
   int i;
   uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
   uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
@@ -224,3 +224,369 @@ void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
   res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
   res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
+
+static INLINE unsigned int horizontal_add_16x4(const uint16x4_t vec_16x4) {
+  const uint32x2_t a = vpaddl_u16(vec_16x4);
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+}
+
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+  const uint32x4_t a = vpaddlq_u16(vec_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
+                          const uint8x8_t ref) {
+  uint8x8_t q2 = vabd_u8(q0, ref);
+  *vec_src = vpadal_u8(*vec_src, q2);
+}
+
+static void sad_row8_neon(uint16x4_t *vec_src, const uint8x8_t *q0,
+                          const uint8_t *ref_ptr) {
+  uint8x8_t q1 = vld1_u8(ref_ptr);
+  uint8x8_t q2 = vabd_u8(*q0, q1);
+  *vec_src = vpadal_u8(*vec_src, q2);
+}
+
+static void sad_row16_neon(uint16x8_t *vec_src, const uint8x16_t *q0,
+                           const uint8_t *ref_ptr) {
+  uint8x16_t q1 = vld1q_u8(ref_ptr);
+  uint8x16_t q2 = vabdq_u8(*q0, q1);
+  *vec_src = vpadalq_u8(*vec_src, q2);
+}
+
+void aom_sadMxNx4d_neon(int width, int height, const uint8_t *src,
+                        int src_stride, const uint8_t *const ref[4],
+                        int ref_stride, uint32_t res[4]) {
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  res[0] = 0;
+  res[1] = 0;
+  res[2] = 0;
+  res[3] = 0;
+
+  switch (width) {
+    case 4: {
+      uint32_t src4, ref40, ref41, ref42, ref43;
+      uint32x2_t q8 = vdup_n_u32(0);
+      uint32x2_t q4 = vdup_n_u32(0);
+      uint32x2_t q5 = vdup_n_u32(0);
+      uint32x2_t q6 = vdup_n_u32(0);
+      uint32x2_t q7 = vdup_n_u32(0);
+
+      for (int i = 0; i < height / 2; i++) {
+        uint16x4_t q0 = vdup_n_u16(0);
+        uint16x4_t q1 = vdup_n_u16(0);
+        uint16x4_t q2 = vdup_n_u16(0);
+        uint16x4_t q3 = vdup_n_u16(0);
+
+        memcpy(&src4, src, 4);
+        memcpy(&ref40, ref0, 4);
+        memcpy(&ref41, ref1, 4);
+        memcpy(&ref42, ref2, 4);
+        memcpy(&ref43, ref3, 4);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        q8 = vset_lane_u32(src4, q8, 0);
+        q4 = vset_lane_u32(ref40, q4, 0);
+        q5 = vset_lane_u32(ref41, q5, 0);
+        q6 = vset_lane_u32(ref42, q6, 0);
+        q7 = vset_lane_u32(ref43, q7, 0);
+
+        memcpy(&src4, src, 4);
+        memcpy(&ref40, ref0, 4);
+        memcpy(&ref41, ref1, 4);
+        memcpy(&ref42, ref2, 4);
+        memcpy(&ref43, ref3, 4);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        q8 = vset_lane_u32(src4, q8, 1);
+        q4 = vset_lane_u32(ref40, q4, 1);
+        q5 = vset_lane_u32(ref41, q5, 1);
+        q6 = vset_lane_u32(ref42, q6, 1);
+        q7 = vset_lane_u32(ref43, q7, 1);
+
+        sad_row4_neon(&q0, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q4));
+        sad_row4_neon(&q1, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q5));
+        sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
+        sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
+
+        res[0] += horizontal_add_16x4(q0);
+        res[1] += horizontal_add_16x4(q1);
+        res[2] += horizontal_add_16x4(q2);
+        res[3] += horizontal_add_16x4(q3);
+      }
+      break;
+    }
+    case 8: {
+      for (int i = 0; i < height; i++) {
+        uint16x4_t q0 = vdup_n_u16(0);
+        uint16x4_t q1 = vdup_n_u16(0);
+        uint16x4_t q2 = vdup_n_u16(0);
+        uint16x4_t q3 = vdup_n_u16(0);
+
+        uint8x8_t q5 = vld1_u8(src);
+
+        sad_row8_neon(&q0, &q5, ref0);
+        sad_row8_neon(&q1, &q5, ref1);
+        sad_row8_neon(&q2, &q5, ref2);
+        sad_row8_neon(&q3, &q5, ref3);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x4(q0);
+        res[1] += horizontal_add_16x4(q1);
+        res[2] += horizontal_add_16x4(q2);
+        res[3] += horizontal_add_16x4(q3);
+      }
+      break;
+    }
+    case 16: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 32: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 64: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        q4 = vld1q_u8(src + 32);
+
+        sad_row16_neon(&q0, &q4, ref0 + 32);
+        sad_row16_neon(&q1, &q4, ref1 + 32);
+        sad_row16_neon(&q2, &q4, ref2 + 32);
+        sad_row16_neon(&q3, &q4, ref3 + 32);
+
+        q4 = vld1q_u8(src + 48);
+
+        sad_row16_neon(&q0, &q4, ref0 + 48);
+        sad_row16_neon(&q1, &q4, ref1 + 48);
+        sad_row16_neon(&q2, &q4, ref2 + 48);
+        sad_row16_neon(&q3, &q4, ref3 + 48);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+      break;
+    }
+    case 128: {
+      for (int i = 0; i < height; i++) {
+        uint16x8_t q0 = vdupq_n_u16(0);
+        uint16x8_t q1 = vdupq_n_u16(0);
+        uint16x8_t q2 = vdupq_n_u16(0);
+        uint16x8_t q3 = vdupq_n_u16(0);
+
+        uint8x16_t q4 = vld1q_u8(src);
+
+        sad_row16_neon(&q0, &q4, ref0);
+        sad_row16_neon(&q1, &q4, ref1);
+        sad_row16_neon(&q2, &q4, ref2);
+        sad_row16_neon(&q3, &q4, ref3);
+
+        q4 = vld1q_u8(src + 16);
+
+        sad_row16_neon(&q0, &q4, ref0 + 16);
+        sad_row16_neon(&q1, &q4, ref1 + 16);
+        sad_row16_neon(&q2, &q4, ref2 + 16);
+        sad_row16_neon(&q3, &q4, ref3 + 16);
+
+        q4 = vld1q_u8(src + 32);
+
+        sad_row16_neon(&q0, &q4, ref0 + 32);
+        sad_row16_neon(&q1, &q4, ref1 + 32);
+        sad_row16_neon(&q2, &q4, ref2 + 32);
+        sad_row16_neon(&q3, &q4, ref3 + 32);
+
+        q4 = vld1q_u8(src + 48);
+
+        sad_row16_neon(&q0, &q4, ref0 + 48);
+        sad_row16_neon(&q1, &q4, ref1 + 48);
+        sad_row16_neon(&q2, &q4, ref2 + 48);
+        sad_row16_neon(&q3, &q4, ref3 + 48);
+
+        q4 = vld1q_u8(src + 64);
+
+        sad_row16_neon(&q0, &q4, ref0 + 64);
+        sad_row16_neon(&q1, &q4, ref1 + 64);
+        sad_row16_neon(&q2, &q4, ref2 + 64);
+        sad_row16_neon(&q3, &q4, ref3 + 64);
+
+        q4 = vld1q_u8(src + 80);
+
+        sad_row16_neon(&q0, &q4, ref0 + 80);
+        sad_row16_neon(&q1, &q4, ref1 + 80);
+        sad_row16_neon(&q2, &q4, ref2 + 80);
+        sad_row16_neon(&q3, &q4, ref3 + 80);
+
+        q4 = vld1q_u8(src + 96);
+
+        sad_row16_neon(&q0, &q4, ref0 + 96);
+        sad_row16_neon(&q1, &q4, ref1 + 96);
+        sad_row16_neon(&q2, &q4, ref2 + 96);
+        sad_row16_neon(&q3, &q4, ref3 + 96);
+
+        q4 = vld1q_u8(src + 112);
+
+        sad_row16_neon(&q0, &q4, ref0 + 112);
+        sad_row16_neon(&q1, &q4, ref1 + 112);
+        sad_row16_neon(&q2, &q4, ref2 + 112);
+        sad_row16_neon(&q3, &q4, ref3 + 112);
+
+        src += src_stride;
+        ref0 += ref_stride;
+        ref1 += ref_stride;
+        ref2 += ref_stride;
+        ref3 += ref_stride;
+
+        res[0] += horizontal_add_16x8(q0);
+        res[1] += horizontal_add_16x8(q1);
+        res[2] += horizontal_add_16x8(q2);
+        res[3] += horizontal_add_16x8(q3);
+      }
+    }
+  }
+}
+
+#define SAD_SKIP_MXN_NEON(m, n)                                             \
+  void aom_sad_skip_##m##x##n##x4d_neon(const uint8_t *src, int src_stride, \
+                                        const uint8_t *const ref[4],        \
+                                        int ref_stride, uint32_t res[4]) {  \
+    aom_sadMxNx4d_neon(m, ((n) >> 1), src, 2 * src_stride, ref,             \
+                       2 * ref_stride, res);                                \
+    res[0] <<= 1;                                                           \
+    res[1] <<= 1;                                                           \
+    res[2] <<= 1;                                                           \
+    res[3] <<= 1;                                                           \
+  }
+
+SAD_SKIP_MXN_NEON(4, 8)
+SAD_SKIP_MXN_NEON(4, 16)
+SAD_SKIP_MXN_NEON(4, 32)
+
+SAD_SKIP_MXN_NEON(8, 8)
+SAD_SKIP_MXN_NEON(8, 16)
+SAD_SKIP_MXN_NEON(8, 32)
+
+SAD_SKIP_MXN_NEON(16, 8)
+SAD_SKIP_MXN_NEON(16, 16)
+SAD_SKIP_MXN_NEON(16, 32)
+SAD_SKIP_MXN_NEON(16, 64)
+
+SAD_SKIP_MXN_NEON(32, 8)
+SAD_SKIP_MXN_NEON(32, 16)
+SAD_SKIP_MXN_NEON(32, 32)
+SAD_SKIP_MXN_NEON(32, 64)
+
+SAD_SKIP_MXN_NEON(64, 16)
+SAD_SKIP_MXN_NEON(64, 32)
+SAD_SKIP_MXN_NEON(64, 64)
+SAD_SKIP_MXN_NEON(64, 128)
+
+SAD_SKIP_MXN_NEON(128, 64)
+SAD_SKIP_MXN_NEON(128, 128)
+
+#undef SAD_SKIP_MXN_NEON
diff --git a/media/libaom/src/aom_dsp/arm/sad_neon.c b/media/libaom/src/aom_dsp/arm/sad_neon.c
index a39de91d60..4f0a1990ca 100644
--- a/media/libaom/src/aom_dsp/arm/sad_neon.c
+++ b/media/libaom/src/aom_dsp/arm/sad_neon.c
@@ -10,13 +10,12 @@
  */
 
 #include <arm_neon.h>
-
 #include "config/aom_config.h"
-
+#include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 
-unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x4_t q1;
@@ -46,8 +45,8 @@ unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
   return vget_lane_u32(d5, 0);
 }
 
-unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-                             unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x2_t d1;
@@ -74,8 +73,8 @@ unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
   return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
 }
 
-unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x16_t q0, q4;
   uint16x8_t q12, q13;
   uint32x4_t q1;
@@ -164,6 +163,77 @@ unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
 
+unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
+                                 const uint8_t *ref, int ref_stride) {
+  uint16x8_t vec_accum_lo, vec_accum_hi;
+  uint32x4_t vec_accum_32lo = vdupq_n_u32(0);
+  uint32x4_t vec_accum_32hi = vdupq_n_u32(0);
+  uint16x8_t tmp;
+  for (int i = 0; i < 128; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+    const uint8x16_t vec_src_64 = vld1q_u8(src + 64);
+    const uint8x16_t vec_src_80 = vld1q_u8(src + 80);
+    const uint8x16_t vec_src_96 = vld1q_u8(src + 96);
+    const uint8x16_t vec_src_112 = vld1q_u8(src + 112);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+    const uint8x16_t vec_ref_64 = vld1q_u8(ref + 64);
+    const uint8x16_t vec_ref_80 = vld1q_u8(ref + 80);
+    const uint8x16_t vec_ref_96 = vld1q_u8(ref + 96);
+    const uint8x16_t vec_ref_112 = vld1q_u8(ref + 112);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo = vdupq_n_u16(0);
+    vec_accum_hi = vdupq_n_u16(0);
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+                            vget_low_u8(vec_ref_32));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+                            vget_high_u8(vec_ref_32));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+                            vget_low_u8(vec_ref_48));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+                            vget_high_u8(vec_ref_48));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_64),
+                            vget_low_u8(vec_ref_64));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_64),
+                            vget_high_u8(vec_ref_64));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_80),
+                            vget_low_u8(vec_ref_80));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_80),
+                            vget_high_u8(vec_ref_80));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_96),
+                            vget_low_u8(vec_ref_96));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_96),
+                            vget_high_u8(vec_ref_96));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_112),
+                            vget_low_u8(vec_ref_112));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_112),
+                            vget_high_u8(vec_ref_112));
+
+    tmp = vaddq_u16(vec_accum_lo, vec_accum_hi);
+    vec_accum_32lo = vaddw_u16(vec_accum_32lo, vget_low_u16(tmp));
+    vec_accum_32hi = vaddw_u16(vec_accum_32hi, vget_high_u16(tmp));
+  }
+  const uint32x4_t a = vaddq_u32(vec_accum_32lo, vec_accum_32hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
 unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
@@ -222,3 +292,273 @@ unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
   }
   return horizontal_add_16x8(vec_accum);
 }
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 64);
+    q1 = vld1q_u8(ref_ptr + 64);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 80);
+    q1 = vld1q_u8(ref_ptr + 80);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 96);
+    q1 = vld1q_u8(ref_ptr + 96);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 112);
+    q1 = vld1q_u8(ref_ptr + 112);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    sum += horizontal_add_16x8(q3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+    q0 = vld1_u8(src_ptr + 8);
+    q1 = vld1_u8(ref_ptr + 8);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  for (int y = 0; y < h; y++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q3 = vabal_u8(q3, q0, q1);
+  }
+  return horizontal_add_16x8(q3);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  uint32x2_t q0 = vdup_n_u32(0);
+  uint32x2_t q1 = vdup_n_u32(0);
+  uint32_t src4, ref4;
+  for (int y = 0; y < h / 2; y++) {
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 0);
+    q1 = vset_lane_u32(ref4, q1, 0);
+
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 1);
+    q1 = vset_lane_u32(ref4, q1, 1);
+
+    q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
+  }
+  return horizontal_add_16x8(q3);
+}
+
+#define FSADS128_H(h)                                                    \
+  unsigned int aom_sad_skip_128x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
+      int ref_stride) {                                                  \
+    const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
+                                       2 * ref_stride, h / 2);           \
+    return 2 * sum;                                                      \
+  }
+
+FSADS128_H(128)
+FSADS128_H(64)
+
+#undef FSADS128_H
+
+#define FSADS64_H(h)                                                          \
+  unsigned int aom_sad_skip_64x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS64_H(128)
+FSADS64_H(64)
+FSADS64_H(32)
+FSADS64_H(16)
+
+#undef FSADS64_H
+
+#define FSADS32_H(h)                                                          \
+  unsigned int aom_sad_skip_32x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS32_H(64)
+FSADS32_H(32)
+FSADS32_H(16)
+FSADS32_H(8)
+
+#undef FSADS32_H
+
+#define FSADS16_H(h)                                                          \
+  unsigned int aom_sad_skip_16x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+FSADS16_H(64)
+FSADS16_H(32)
+FSADS16_H(16)
+FSADS16_H(8)
+
+#undef FSADS16_H
+
+#define FSADS8_H(h)                                                          \
+  unsigned int aom_sad_skip_8x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS8_H(32)
+FSADS8_H(16)
+FSADS8_H(8)
+
+#undef FSADS8_H
+
+#define FSADS4_H(h)                                                          \
+  unsigned int aom_sad_skip_4x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS4_H(16)
+FSADS4_H(8)
+
+#undef FSADS4_H
diff --git a/media/libaom/src/aom_dsp/arm/sse_neon.c b/media/libaom/src/aom_dsp/arm/sse_neon.c
index 06b81cc3d3..35b784a52d 100644
--- a/media/libaom/src/aom_dsp/arm/sse_neon.c
+++ b/media/libaom/src/aom_dsp/arm/sse_neon.c
@@ -9,217 +9,176 @@
  */
 
 #include <arm_neon.h>
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
 
+#include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-
-static INLINE uint32_t sse_W16x1_neon(uint8x16_t q2, uint8x16_t q3) {
-  const uint16_t sse1 = 0;
-  const uint16x8_t q1 = vld1q_dup_u16(&sse1);
-
-  uint32_t sse;
-
-  uint8x16_t q4 = vabdq_u8(q2, q3);  // diff = abs(a[x] - b[x])
-  uint8x8_t d0 = vget_low_u8(q4);
-  uint8x8_t d1 = vget_high_u8(q4);
-
-  uint16x8_t q6 = vmlal_u8(q1, d0, d0);
-  uint16x8_t q7 = vmlal_u8(q1, d1, d1);
-
-  uint32x4_t q8 = vaddl_u16(vget_low_u16(q6), vget_high_u16(q6));
-  uint32x4_t q9 = vaddl_u16(vget_low_u16(q7), vget_high_u16(q7));
-
-  uint32x2_t d4 = vadd_u32(vget_low_u32(q8), vget_high_u32(q8));
-  uint32x2_t d5 = vadd_u32(vget_low_u32(q9), vget_high_u32(q9));
-
-  uint32x2_t d6 = vadd_u32(d4, d5);
-
-  sse = vget_lane_u32(d6, 0);
-  sse += vget_lane_u32(d6, 1);
-
-  return sse;
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE void sse_w16_neon(uint32x4_t *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const uint8x16_t v_a0 = vld1q_u8(a);
+  const uint8x16_t v_b0 = vld1q_u8(b);
+  const uint8x16_t diff = vabdq_u8(v_a0, v_b0);
+  const uint8x8_t diff_lo = vget_low_u8(diff);
+  const uint8x8_t diff_hi = vget_high_u8(diff);
+  *sum = vpadalq_u16(*sum, vmull_u8(diff_lo, diff_lo));
+  *sum = vpadalq_u16(*sum, vmull_u8(diff_hi, diff_hi));
+}
+static INLINE void aom_sse4x2_neon(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   uint32x4_t *sum) {
+  uint8x8_t v_a0, v_b0;
+  v_a0 = v_b0 = vcreate_u8(0);
+  // above line is only to shadow [-Werror=uninitialized]
+  v_a0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)a, vreinterpret_u32_u8(v_a0), 0));
+  v_a0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)(a + a_stride), vreinterpret_u32_u8(v_a0), 1));
+  v_b0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)b, vreinterpret_u32_u8(v_b0), 0));
+  v_b0 = vreinterpret_u8_u32(
+      vld1_lane_u32((uint32_t *)(b + b_stride), vreinterpret_u32_u8(v_b0), 1));
+  const uint8x8_t v_a_w = vabd_u8(v_a0, v_b0);
+  *sum = vpadalq_u16(*sum, vmull_u8(v_a_w, v_a_w));
+}
+static INLINE void aom_sse8_neon(const uint8_t *a, const uint8_t *b,
+                                 uint32x4_t *sum) {
+  const uint8x8_t v_a_w = vld1_u8(a);
+  const uint8x8_t v_b_w = vld1_u8(b);
+  const uint8x8_t v_d_w = vabd_u8(v_a_w, v_b_w);
+  *sum = vpadalq_u16(*sum, vmull_u8(v_d_w, v_d_w));
 }
-
 int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int width, int height) {
-  const uint8x16_t q0 = {
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-  };
-  int addinc, x, y;
-  uint8x8_t d0, d1, d2, d3;
-  uint8_t dx;
-  uint8x16_t q2, q3, q4, q5;
-  uint32_t sse = 0;
-  uint8x8x2_t tmp, tmp2;
-
+  int y = 0;
+  int64_t sse = 0;
+  uint32x4_t sum = vdupq_n_u32(0);
   switch (width) {
     case 4:
-      for (y = 0; y < height; y += 4) {
-        d0 = vld1_u8(a);  // load 4 data
-        a += a_stride;
-        d1 = vld1_u8(a);
-        a += a_stride;
-        d2 = vld1_u8(a);
-        a += a_stride;
-        d3 = vld1_u8(a);
-        a += a_stride;
-        tmp = vzip_u8(d0, d1);
-        tmp2 = vzip_u8(d2, d3);
-        q2 = vcombine_u8(tmp.val[0], tmp2.val[0]);  // make a 16 data vector
-
-        d0 = vld1_u8(b);
-        b += b_stride;
-        d1 = vld1_u8(b);
-        b += b_stride;
-        d2 = vld1_u8(b);
-        b += b_stride;
-        d3 = vld1_u8(b);
-        b += b_stride;
-        tmp = vzip_u8(d0, d1);
-        tmp2 = vzip_u8(d2, d3);
-        q3 = vcombine_u8(tmp.val[0], tmp2.val[0]);
-
-        sse += sse_W16x1_neon(q2, q3);
-      }
+      do {
+        aom_sse4x2_neon(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 8:
-      for (y = 0; y < height; y += 2) {
-        d0 = vld1_u8(a);  // load 8 data
-        d1 = vld1_u8(a + a_stride);
-        q2 = vcombine_u8(d0, d1);  // make a 16 data vector
-
-        d0 = vld1_u8(b);
-        d1 = vld1_u8(b + b_stride);
-        q3 = vcombine_u8(d0, d1);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        a += 2 * a_stride;
-        b += 2 * b_stride;
-      }
+      do {
+        aom_sse8_neon(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 16:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 32:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16, b + 16);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 64:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 32);
-        q3 = vld1q_u8(b + 32);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 48);
-        q3 = vld1q_u8(b + 48);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     case 128:
-      for (y = 0; y < height; y++) {
-        q2 = vld1q_u8(a);
-        q3 = vld1q_u8(b);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 16);
-        q3 = vld1q_u8(b + 16);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 32);
-        q3 = vld1q_u8(b + 32);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 48);
-        q3 = vld1q_u8(b + 48);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 64);
-        q3 = vld1q_u8(b + 64);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 80);
-        q3 = vld1q_u8(b + 80);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 96);
-        q3 = vld1q_u8(b + 96);
-
-        sse += sse_W16x1_neon(q2, q3);
-
-        q2 = vld1q_u8(a + 112);
-        q3 = vld1q_u8(b + 112);
-
-        sse += sse_W16x1_neon(q2, q3);
-
+      do {
+        sse_w16_neon(&sum, a, b);
+        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
+        sse_w16_neon(&sum, a + 16 * 4, b + 16 * 4);
+        sse_w16_neon(&sum, a + 16 * 5, b + 16 * 5);
+        sse_w16_neon(&sum, a + 16 * 6, b + 16 * 6);
+        sse_w16_neon(&sum, a + 16 * 7, b + 16 * 7);
         a += a_stride;
         b += b_stride;
-      }
+        y += 1;
+      } while (y < height);
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
       break;
     default:
-      for (y = 0; y < height; y++) {
-        x = width;
-        while (x > 0) {
-          addinc = width - x;
-          q2 = vld1q_u8(a + addinc);
-          q3 = vld1q_u8(b + addinc);
-          if (x < 16) {
-            dx = x;
-            q4 = vld1q_dup_u8(&dx);
-            q5 = vcltq_u8(q0, q4);
-            q2 = vandq_u8(q2, q5);
-            q3 = vandq_u8(q3, q5);
-          }
-          sse += sse_W16x1_neon(q2, q3);
-          x -= 16;
-        }
-        a += a_stride;
-        b += b_stride;
+      if (width & 0x07) {
+        do {
+          int i = 0;
+          do {
+            aom_sse8_neon(a + i, b + i, &sum);
+            aom_sse8_neon(a + i + a_stride, b + i + b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          aom_sse4x2_neon(a + i, a_stride, b + i, b_stride, &sum);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            aom_sse8_neon(a + i, b + i, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride;
+          b += b_stride;
+          y += 1;
+        } while (y < height);
       }
+#if defined(__aarch64__)
+      sse = vaddvq_u32(sum);
+#else
+      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
+#endif  // __aarch64__
+      break;
   }
-  return (int64_t)sse;
+  return sse;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/aom_dsp/arm/subpel_variance_neon.c b/media/libaom/src/aom_dsp/arm/subpel_variance_neon.c
index cf618eee77..4ecf891cbe 100644
--- a/media/libaom/src/aom_dsp/arm/subpel_variance_neon.c
+++ b/media/libaom/src/aom_dsp/arm/subpel_variance_neon.c
@@ -20,6 +20,42 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/variance.h"
 
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x2_t a_u32 = vdup_n_u32(0);
+  if (stride == 4) return vld1_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1_lane_u32(&a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  a_u32 = vld1_lane_u32(&a, a_u32, 1);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; i += 2) {
+    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
+    const uint8x8_t src_1 =
+        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(output_ptr, out);
+    src_ptr += 2 * src_pixels_per_line;
+    output_ptr += 8;
+  }
+}
+
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
@@ -27,8 +63,8 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
   unsigned int i;
   for (i = 0; i < output_height; ++i) {
     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -36,13 +72,14 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
     const uint16x8_t a = vmull_u8(src_0, f0);
     const uint16x8_t b = vmlal_u8(a, src_1, f1);
     const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
+    vst1_u8(output_ptr, out);
     // Next row...
     src_ptr += src_pixels_per_line;
     output_ptr += output_width;
   }
 }
 
+// Process a block which is a mutiple of 16 wide and any height.
 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                        uint8_t *output_ptr,
                                        unsigned int src_pixels_per_line,
@@ -50,8 +87,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; j += 16) {
@@ -63,9 +100,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
       const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
       const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
       const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
     }
-    // Next row...
     src_ptr += src_pixels_per_line;
     output_ptr += output_width;
   }
@@ -129,3 +165,276 @@ unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
                              bilinear_filters_2t[yoffset]);
   return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
+
+unsigned int aom_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[4 * (4 + 2)];
+  uint8_t temp1[4 * 4];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (4 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x4(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance4x8_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[4 * (8 + 2)];
+  uint8_t temp1[4 * 8];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (8 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x8(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x4_neon(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  uint8_t temp0[8 * (4 + 1)];
+  uint8_t temp1[8 * 4];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (4 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 4, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x4(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x16_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[8 * (16 + 1)];
+  uint8_t temp1[8 * 16];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (16 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 16, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x16(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x8_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[16 * (8 + 1)];
+  uint8_t temp1[16 * 8];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 8, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x8(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x32_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[16 * (32 + 1)];
+  uint8_t temp1[16 * 32];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 32, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x32(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x16_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[32 * (16 + 1)];
+  uint8_t temp1[32 * 16];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 16, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x16(temp1, 32, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x64_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[32 * (64 + 1)];
+  uint8_t temp1[32 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 64, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x64(temp1, 32, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x32_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[64 * (32 + 1)];
+  uint8_t temp1[64 * 32];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 32, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x32(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x128_neon(const uint8_t *a, int a_stride,
+                                               int xoffset, int yoffset,
+                                               const uint8_t *b, int b_stride,
+                                               uint32_t *sse) {
+  uint8_t temp0[64 * (128 + 1)];
+  uint8_t temp1[64 * 128];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 128, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x128(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance128x64_neon(const uint8_t *a, int a_stride,
+                                               int xoffset, int yoffset,
+                                               const uint8_t *b, int b_stride,
+                                               uint32_t *sse) {
+  uint8_t temp0[128 * (64 + 1)];
+  uint8_t temp1[128 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 128,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 64, 128,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance128x64(temp1, 128, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance128x128_neon(const uint8_t *a, int a_stride,
+                                                int xoffset, int yoffset,
+                                                const uint8_t *b, int b_stride,
+                                                uint32_t *sse) {
+  uint8_t temp0[128 * (128 + 1)];
+  uint8_t temp1[128 * 128];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 128,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 128, 128,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance128x128(temp1, 128, b, b_stride, sse);
+}
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+unsigned int aom_sub_pixel_variance4x16_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[4 * (16 + 2)];
+  uint8_t temp1[4 * 16];
+
+  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (16 + 2),
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 16,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance4x16(temp1, 4, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance8x32_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[8 * (32 + 1)];
+  uint8_t temp1[8 * 32];
+
+  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (32 + 1), 8,
+                            bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 32, 8,
+                            bilinear_filters_2t[yoffset]);
+
+  return aom_variance8x32(temp1, 8, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x4_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[16 * (4 + 1)];
+  uint8_t temp1[16 * 4];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (4 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 4, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x4(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x16_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[64 * (16 + 1)];
+  uint8_t temp1[64 * 16];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 64,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 16, 64,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance64x16(temp1, 64, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x64_neon(const uint8_t *a, int a_stride,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *b, int b_stride,
+                                              uint32_t *sse) {
+  uint8_t temp0[16 * (64 + 1)];
+  uint8_t temp1[16 * 64];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 16,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 64, 16,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance16x64(temp1, 16, b, b_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x8_neon(const uint8_t *a, int a_stride,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  uint8_t temp0[32 * (8 + 1)];
+  uint8_t temp1[32 * 8];
+
+  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 32,
+                             bilinear_filters_2t[xoffset]);
+  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 8, 32,
+                             bilinear_filters_2t[yoffset]);
+
+  return aom_variance32x8(temp1, 32, b, b_stride, sse);
+}
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/media/libaom/src/aom_dsp/arm/subtract_neon.c b/media/libaom/src/aom_dsp/arm/subtract_neon.c
index 28f5ace8e1..a195c40d19 100644
--- a/media/libaom/src/aom_dsp/arm/subtract_neon.c
+++ b/media/libaom/src/aom_dsp/arm/subtract_neon.c
@@ -14,16 +14,17 @@
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
 
 void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
                              ptrdiff_t diff_stride, const uint8_t *src,
                              ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
-  int r, c;
-
   if (cols > 16) {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; c += 32) {
+    int r = rows;
+    do {
+      int c = 0;
+      do {
         const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
         const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
         const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
@@ -40,13 +41,15 @@ void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
         vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
         vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
         vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
-      }
+        c += 32;
+      } while (c < cols);
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r != 0);
   } else if (cols > 8) {
-    for (r = 0; r < rows; ++r) {
+    int r = rows;
+    do {
       const uint8x16_t v_src = vld1q_u8(&src[0]);
       const uint8x16_t v_pred = vld1q_u8(&pred[0]);
       const uint16x8_t v_diff_lo =
@@ -58,9 +61,10 @@ void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r != 0);
   } else if (cols > 4) {
-    for (r = 0; r < rows; ++r) {
+    int r = rows;
+    do {
       const uint8x8_t v_src = vld1_u8(&src[0]);
       const uint8x8_t v_pred = vld1_u8(&pred[0]);
       const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
@@ -68,14 +72,95 @@ void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r != 0);
   } else {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
+    int r = rows;
+    do {
+      int c = 0;
+      do {
+        diff[c] = src[c] - pred[c];
+      } while (++c < cols);
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r != 0);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff,
+                                    ptrdiff_t diff_stride, const uint8_t *src8,
+                                    ptrdiff_t src_stride, const uint8_t *pred8,
+                                    ptrdiff_t pred_stride) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
 
+  if (cols > 16) {
+    int r = rows;
+    do {
+      int c = 0;
+      do {
+        const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]);
+        const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]);
+        const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+        const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]);
+        const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]);
+        const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00));
+        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08));
+        c += 16;
+      } while (c < cols);
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r != 0);
+  } else if (cols > 8) {
+    int r = rows;
+    do {
+      const uint16x8_t v_src_00 = vld1q_u16(&src[0]);
+      const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]);
+      const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+      const uint16x8_t v_src_08 = vld1q_u16(&src[8]);
+      const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]);
+      const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r != 0);
+  } else if (cols > 4) {
+    int r = rows;
+    do {
+      const uint16x8_t v_src_r0 = vld1q_u16(&src[0]);
+      const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]);
+      const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]);
+      const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]);
+      const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0);
+      const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0));
+      vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1));
+      diff += diff_stride << 1;
+      pred += pred_stride << 1;
+      src += src_stride << 1;
+      r -= 2;
+    } while (r != 0);
+  } else {
+    int r = rows;
+    do {
+      const uint16x4_t v_src_r0 = vld1_u16(&src[0]);
+      const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]);
+      const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]);
+      const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]);
+      const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0);
+      const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1);
+      vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0));
+      vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1));
+      diff += diff_stride << 1;
+      pred += pred_stride << 1;
+      src += src_stride << 1;
+      r -= 2;
+    } while (r != 0);
   }
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/aom_dsp/arm/sum_squares_neon.c b/media/libaom/src/aom_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..0b7337a941
--- /dev/null
+++ b/media/libaom/src/aom_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src,
+                                                  int stride) {
+  const int16x4_t v_val_01_lo = vld1_s16(src + 0 * stride);
+  const int16x4_t v_val_01_hi = vld1_s16(src + 1 * stride);
+  const int16x4_t v_val_23_lo = vld1_s16(src + 2 * stride);
+  const int16x4_t v_val_23_hi = vld1_s16(src + 3 * stride);
+  int32x4_t v_sq_01_d = vmull_s16(v_val_01_lo, v_val_01_lo);
+  v_sq_01_d = vmlal_s16(v_sq_01_d, v_val_01_hi, v_val_01_hi);
+  int32x4_t v_sq_23_d = vmull_s16(v_val_23_lo, v_val_23_lo);
+  v_sq_23_d = vmlal_s16(v_sq_23_d, v_val_23_hi, v_val_23_hi);
+#if defined(__aarch64__)
+  return vreinterpretq_u32_s32(vpaddq_s32(v_sq_01_d, v_sq_23_d));
+#else
+  return vreinterpretq_u32_s32(vcombine_s32(
+      vqmovn_s64(vpaddlq_s32(v_sq_01_d)), vqmovn_s64(vpaddlq_s32(v_sq_23_d))));
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) {
+  const uint32x4_t v_sum_0123_d = sum_squares_i16_4x4_neon(src, stride);
+#if defined(__aarch64__)
+  return (uint64_t)vaddvq_u32(v_sum_0123_d);
+#else
+  uint64x2_t v_sum_d = vpaddlq_u32(v_sum_0123_d);
+  v_sum_d = vaddq_u64(v_sum_d, vextq_u64(v_sum_d, v_sum_d, 1));
+  return vgetq_lane_u64(v_sum_d, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride,
+                                         int height) {
+  int r = 0;
+  uint32x4_t v_acc_q = vdupq_n_u32(0);
+  do {
+    const uint32x4_t v_acc_d = sum_squares_i16_4x4_neon(src, stride);
+    v_acc_q = vaddq_u32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+
+  uint64x2_t v_acc_64 = vpaddlq_u32(v_acc_q);
+#if defined(__aarch64__)
+  return vaddvq_u64(v_acc_64);
+#else
+  v_acc_64 = vaddq_u64(v_acc_64, vextq_u64(v_acc_64, v_acc_64, 1));
+  return vgetq_lane_u64(v_acc_64, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride,
+                                         int width, int height) {
+  int r = 0;
+  const int32x4_t zero = vdupq_n_s32(0);
+  uint64x2_t v_acc_q = vreinterpretq_u64_s32(zero);
+  do {
+    int32x4_t v_sum = zero;
+    int c = 0;
+    do {
+      const int16_t *b = src + c;
+      const int16x8_t v_val_0 = vld1q_s16(b + 0 * stride);
+      const int16x8_t v_val_1 = vld1q_s16(b + 1 * stride);
+      const int16x8_t v_val_2 = vld1q_s16(b + 2 * stride);
+      const int16x8_t v_val_3 = vld1q_s16(b + 3 * stride);
+      const int16x4_t v_val_0_lo = vget_low_s16(v_val_0);
+      const int16x4_t v_val_1_lo = vget_low_s16(v_val_1);
+      const int16x4_t v_val_2_lo = vget_low_s16(v_val_2);
+      const int16x4_t v_val_3_lo = vget_low_s16(v_val_3);
+      int32x4_t v_sum_01 = vmull_s16(v_val_0_lo, v_val_0_lo);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_1_lo, v_val_1_lo);
+      int32x4_t v_sum_23 = vmull_s16(v_val_2_lo, v_val_2_lo);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_3_lo, v_val_3_lo);
+#if defined(__aarch64__)
+      v_sum_01 = vmlal_high_s16(v_sum_01, v_val_0, v_val_0);
+      v_sum_01 = vmlal_high_s16(v_sum_01, v_val_1, v_val_1);
+      v_sum_23 = vmlal_high_s16(v_sum_23, v_val_2, v_val_2);
+      v_sum_23 = vmlal_high_s16(v_sum_23, v_val_3, v_val_3);
+      v_sum = vaddq_s32(v_sum, vpaddq_s32(v_sum_01, v_sum_23));
+#else
+      const int16x4_t v_val_0_hi = vget_high_s16(v_val_0);
+      const int16x4_t v_val_1_hi = vget_high_s16(v_val_1);
+      const int16x4_t v_val_2_hi = vget_high_s16(v_val_2);
+      const int16x4_t v_val_3_hi = vget_high_s16(v_val_3);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_0_hi, v_val_0_hi);
+      v_sum_01 = vmlal_s16(v_sum_01, v_val_1_hi, v_val_1_hi);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_2_hi, v_val_2_hi);
+      v_sum_23 = vmlal_s16(v_sum_23, v_val_3_hi, v_val_3_hi);
+      v_sum = vaddq_s32(v_sum, vcombine_s32(vqmovn_s64(vpaddlq_s32(v_sum_01)),
+                                            vqmovn_s64(vpaddlq_s32(v_sum_23))));
+#endif
+      c += 8;
+    } while (c < width);
+
+    v_acc_q = vpadalq_u32(v_acc_q, vreinterpretq_u32_s32(v_sum));
+
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
+#if defined(__aarch64__)
+  return vaddvq_u64(v_acc_q);
+#else
+  v_acc_q = vaddq_u64(v_acc_q, vextq_u64(v_acc_q, v_acc_q, 1));
+  return vgetq_lane_u64(v_acc_q, 0);
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+                                     int height) {
+  // 4 elements per row only requires half an SIMD register, so this
+  // must be a special case, but also note that over 75% of all calls
+  // are with size == 4, so it is also the common case.
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_neon(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case
+    return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
+  } else {
+    return aom_sum_squares_2d_i16_c(src, stride, width, height);
+  }
+}
diff --git a/media/libaom/src/av1/common/arm/transpose_neon.h b/media/libaom/src/aom_dsp/arm/transpose_neon.h
index 91d89b43f7..26fc1fd740 100644
--- a/media/libaom/src/av1/common/arm/transpose_neon.h
+++ b/media/libaom/src/aom_dsp/arm/transpose_neon.h
@@ -8,11 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
 
 #include <arm_neon.h>
 
+// Swap high and low halves.
+static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
+  return vextq_u16(a, a, 4);
+}
+
 static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                     uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                     uint8x8_t *a6, uint8x8_t *a7) {
@@ -185,6 +190,153 @@ static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
   *a3 = d1.val[1];
 }
 
+// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
+static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
+  // b:
+  // 00 10 02 12
+  // 01 11 03 13
+  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+  // c:
+  // 20 30 22 32
+  // 21 31 23 33
+  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+  // d:
+  // 00 10 20 30
+  // 02 12 22 32
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+  // e:
+  // 01 11 21 31
+  // 03 13 23 33
+  const uint32x2x2_t e =
+      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+  a[0] = vreinterpret_u16_u32(d.val[0]);
+  a[1] = vreinterpret_u16_u32(e.val[0]);
+  a[2] = vreinterpret_u16_u32(d.val[1]);
+  a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// 4x8 Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 00 10 20 30 04 14 24 34
+// a[1]: 01 11 21 31 05 15 25 35
+// a[2]: 02 12 22 32 06 16 26 36
+// a[3]: 03 13 23 33 07 17 27 37
+static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  a[0] = vreinterpretq_u16_u32(c0.val[0]);
+  a[1] = vreinterpretq_u16_u32(c1.val[0]);
+  a[2] = vreinterpretq_u16_u32(c0.val[1]);
+  a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(const uint32x4_t a0,
+                                                const uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34  p0q0
+// a[1]: 02 12 22 32 05 15 25 35  p1q1
+// a[2]: 01 11 21 31 06 16 26 36  p2q2
+// a[3]: 00 10 20 30 07 17 27 37  p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard transpose_u16_4x8q will produce the same reversals, but with the
+// order of the low halves also restored relative to the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // Reverse odd vectors to bring the appropriate items to the front of zips.
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // r0       : 03 13 01 11 07 17 05 15
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // r1       : 23 33 21 31 27 37 25 35
+  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+  // Zip to complete the halves.
+  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
+  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
+  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
+  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
+  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
+  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
+  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
+  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
+  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
+  // The third row of c comes first here to swap p2 with q0.
+  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);
+
+  // 8x4 Output:
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  a[0] = d1.val[0];  // p0q0
+  a[1] = d0.val[1];  // p1q1
+  a[2] = d1.val[1];  // p2q2
+  a[3] = d0.val[0];  // p3q3
+}
+
 static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                      uint16x4_t *a2, uint16x4_t *a3,
                                      uint16x4_t *a4, uint16x4_t *a5,
@@ -599,4 +751,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
   *a3 = c1.val[1];
 }
 
-#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/media/libaom/src/aom_dsp/arm/variance_neon.c b/media/libaom/src/aom_dsp/arm/variance_neon.c
index d4107ce0d2..e840f1307e 100644
--- a/media/libaom/src/aom_dsp/arm/variance_neon.c
+++ b/media/libaom/src/aom_dsp/arm/variance_neon.c
@@ -56,6 +56,18 @@ void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
 }
 
+// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of
+// AVX2.
+void aom_get_sse_sum_8x8_quad_neon(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   unsigned int *sse, int *sum) {
+  // Loop over 4 8x8 blocks. Process one 8x32 block.
+  for (int k = 0; k < 4; k++) {
+    variance_neon_w8(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8,
+                     &sse[k], &sum[k]);
+  }
+}
+
 unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   unsigned int *sse) {
@@ -399,3 +411,257 @@ unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
 
   return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
 }
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
+// The variance helper functions use int16_t for sum. 8 values are accumulated
+// and then added (at which point they expand up to int32_t). To avoid overflow,
+// there can be no more than 32767 / 255 ~= 128 values accumulated in each
+// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
+// rows = 128. Asserts have been added to each function to warn against reaching
+// this limit.
+
+// Process a block of width 4 four rows at a time.
+static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // Since width is only 4, sum_s16 only loads a half row per loop.
+  assert(h <= 256);
+
+  int i;
+  for (i = 0; i < h; i += 4) {
+    const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
+    const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
+    const int16x8_t diff_lo_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+    const int16x8_t diff_hi_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+    sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                        vget_low_s16(diff_lo_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                        vget_high_s16(diff_lo_s16));
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                        vget_low_s16(diff_hi_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                        vget_high_s16(diff_hi_s16));
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+// Process a block of any size where the width is divisible by 16.
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int w, int h, uint32_t *sse,
+                              int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // The loop loads 16 values at a time but doubles them up when accumulating
+  // into sum_s16.
+  assert(w / 8 * h <= 128);
+
+  int i, j;
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 16) {
+      const uint8x16_t a_u8 = vld1q_u8(a + j);
+      const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+      const int16x8_t diff_lo_s16 =
+          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+      const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
+          vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+      sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+      sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                          vget_low_s16(diff_lo_s16));
+      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                          vget_high_s16(diff_lo_s16));
+
+      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                          vget_low_s16(diff_hi_s16));
+      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                          vget_high_s16(diff_hi_s16));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+// Process a block of width 8 two rows at a time.
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+  int32x4_t sse_s32 = zero;
+
+  // Each column has it's own accumulator entry in sum_s16.
+  assert(h <= 128);
+
+  int i = 0;
+  do {
+    const uint8x8_t a_0_u8 = vld1_u8(a);
+    const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
+    const uint8x8_t b_0_u8 = vld1_u8(b);
+    const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
+    const int16x8_t diff_0_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(a_0_u8, b_0_u8));
+    const int16x8_t diff_1_s16 =
+        vreinterpretq_s16_u16(vsubl_u8(a_1_u8, b_1_u8));
+    sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
+    sse_s32 =
+        vmlal_s16(sse_s32, vget_low_s16(diff_0_s16), vget_low_s16(diff_0_s16));
+    sse_s32 =
+        vmlal_s16(sse_s32, vget_low_s16(diff_1_s16), vget_low_s16(diff_1_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_0_s16),
+                        vget_high_s16(diff_0_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_1_s16),
+                        vget_high_s16(diff_1_s16));
+    a += a_stride + a_stride;
+    b += b_stride + b_stride;
+    i += 2;
+  } while (i < h);
+
+#if defined(__aarch64__)
+  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
+  *sse = (uint32_t)vaddvq_s32(sse_s32);
+#else
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+#endif
+}
+
+#define VARIANCE_NXM(n, m, shift)                                           \
+  unsigned int aom_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) {            \
+    int sum;                                                                \
+    if (n == 4)                                                             \
+      variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else if (n == 8)                                                        \
+      variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else                                                                    \
+      variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum);         \
+    if (n * m < 16 * 16)                                                    \
+      return *sse - ((sum * sum) >> shift);                                 \
+    else                                                                    \
+      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
+  }
+
+static void variance_neon_wide_block(const uint8_t *a, int a_stride,
+                                     const uint8_t *b, int b_stride, int w,
+                                     int h, uint32_t *sse, int *sum) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t v_diff = zero;
+  int64x2_t v_sse = vreinterpretq_s64_s32(zero);
+
+  int s, i, j;
+  for (s = 0; s < 16; s++) {
+    int32x4_t sse_s32 = zero;
+    int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
+    for (i = (s * h) >> 4; i < (((s + 1) * h) >> 4); ++i) {
+      for (j = 0; j < w; j += 16) {
+        const uint8x16_t a_u8 = vld1q_u8(a + j);
+        const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+        const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(
+            vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
+        const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
+            vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
+
+        sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+        sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
+                            vget_low_s16(diff_lo_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
+                            vget_high_s16(diff_lo_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
+                            vget_low_s16(diff_hi_s16));
+        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
+                            vget_high_s16(diff_hi_s16));
+      }
+
+      a += a_stride;
+      b += b_stride;
+    }
+
+    v_diff = vpadalq_s16(v_diff, sum_s16);
+    v_sse = vpadalq_s32(v_sse, sse_s32);
+  }
+#if defined(__aarch64__)
+  int diff = vaddvq_s32(v_diff);
+  uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
+#else
+  int diff = horizontal_add_s32x4(v_diff);
+  uint32_t sq = vget_lane_u32(
+      vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
+      0);
+#endif
+
+  *sum = diff;
+  *sse = sq;
+}
+
+#define VARIANCE_NXM_WIDE(W, H)                                             \
+  unsigned int aom_variance##W##x##H##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            uint32_t *sse) {                \
+    int sum;                                                                \
+    variance_neon_wide_block(a, a_stride, b, b_stride, W, H, sse, &sum);    \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
+  }
+
+VARIANCE_NXM(4, 4, 4)
+VARIANCE_NXM(4, 8, 5)
+VARIANCE_NXM(8, 4, 5)
+VARIANCE_NXM(16, 32, 9)
+VARIANCE_NXM(32, 16, 9)
+VARIANCE_NXM_WIDE(128, 64)
+VARIANCE_NXM_WIDE(64, 128)
diff --git a/media/libaom/src/aom_dsp/avg.c b/media/libaom/src/aom_dsp/avg.c
index 7386296fd0..eb0f0ab6f9 100644
--- a/media/libaom/src/aom_dsp/avg.c
+++ b/media/libaom/src/aom_dsp/avg.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
 #include <stdlib.h>
 
 #include "config/aom_dsp_rtcd.h"
@@ -48,6 +49,16 @@ unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
   return (sum + 32) >> 6;
 }
 
+void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                        int *avg) {
+  for (int k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
+    avg[k] = aom_avg_8x8_c(s_tmp, p);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
@@ -88,6 +99,52 @@ void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+void aom_pixel_scale_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                       int16_t *coeff, int log_scale, int h8, int w8) {
+  for (int idy = 0; idy < h8 * 8; ++idy)
+    for (int idx = 0; idx < w8 * 8; ++idx)
+      coeff[idy * (h8 * 8) + idx] = src_diff[idy * src_stride + idx]
+                                    << log_scale;
+}
+
+static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1;
+  int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1;
+  int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1;
+  int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1;
+
+  coeff[0] = b0 + b2;
+  coeff[1] = b1 + b3;
+  coeff[2] = b0 - b2;
+  coeff[3] = b1 - b3;
+}
+
+void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                        tran_low_t *coeff) {
+  int idx;
+  int16_t buffer[16];
+  int16_t buffer2[16];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 4; ++idx) {
+    hadamard_col4(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 4;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 4; ++idx) {
+    hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx);  // tmp_buf: 12 bit
+    // dynamic range [-2040, 2040]
+    // buffer2: 15 bit
+    // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+
+  for (idx = 0; idx < 16; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -171,6 +228,14 @@ void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
   for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
 }
 
+void aom_hadamard_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                             int16_t *coeff) {
+  for (int i = 0; i < 2; i++) {
+    aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride,
+                          (int16_t *)coeff + (i * 64));
+  }
+}
+
 // In place 16x16 2D Hadamard transform
 void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
@@ -446,6 +511,7 @@ void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
+  assert(height >= 2);
   for (idx = 0; idx < 16; ++idx) {
     int i;
     hbuf[idx] = 0;
diff --git a/media/libaom/src/aom_dsp/binary_codes_reader.c b/media/libaom/src/aom_dsp/binary_codes_reader.c
index 7cd903d821..ee0ce62278 100644
--- a/media/libaom/src/aom_dsp/binary_codes_reader.c
+++ b/media/libaom/src/aom_dsp/binary_codes_reader.c
@@ -11,7 +11,6 @@
 
 #include "aom_dsp/binary_codes_reader.h"
 #include "aom_dsp/recenter.h"
-#include "av1/common/common.h"
 
 uint16_t aom_read_primitive_quniform_(aom_reader *r,
                                       uint16_t n ACCT_STR_PARAM) {
diff --git a/media/libaom/src/aom_dsp/binary_codes_writer.c b/media/libaom/src/aom_dsp/binary_codes_writer.c
index adf1c1304c..55ce8429d7 100644
--- a/media/libaom/src/aom_dsp/binary_codes_writer.c
+++ b/media/libaom/src/aom_dsp/binary_codes_writer.c
@@ -13,7 +13,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_dsp/recenter.h"
 #include "aom_ports/bitops.h"
-#include "av1/common/common.h"
 
 // Codes a symbol v in [-2^mag_bits, 2^mag_bits].
 // mag_bits is number of bits for magnitude. The alphabet is of size
diff --git a/media/libaom/src/aom_dsp/bitreader.h b/media/libaom/src/aom_dsp/bitreader.h
index a8b3f55efc..29321f916e 100644
--- a/media/libaom/src/aom_dsp/bitreader.h
+++ b/media/libaom/src/aom_dsp/bitreader.h
@@ -20,8 +20,12 @@
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/entdec.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/prob.h"
-#include "av1/common/odintrin.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #if CONFIG_ACCOUNTING
 #include "av1/decoder/accounting.h"
diff --git a/media/libaom/src/aom_dsp/bitwriter.c b/media/libaom/src/aom_dsp/bitwriter.c
index 41fcc51754..23d28a1abb 100644
--- a/media/libaom/src/aom_dsp/bitwriter.c
+++ b/media/libaom/src/aom_dsp/bitwriter.c
@@ -29,3 +29,8 @@ int aom_stop_encode(aom_writer *w) {
   od_ec_enc_clear(&w->ec);
   return nb_bits;
 }
+
+int aom_tell_size(aom_writer *w) {
+  const int nb_bits = od_ec_enc_tell(&w->ec);
+  return nb_bits;
+}
diff --git a/media/libaom/src/aom_dsp/bitwriter.h b/media/libaom/src/aom_dsp/bitwriter.h
index 4e77a17944..fb33909968 100644
--- a/media/libaom/src/aom_dsp/bitwriter.h
+++ b/media/libaom/src/aom_dsp/bitwriter.h
@@ -24,6 +24,10 @@
 #include "av1/encoder/cost.h"
 #endif
 
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -60,18 +64,12 @@ void aom_start_encode(aom_writer *w, uint8_t *buffer);
 
 int aom_stop_encode(aom_writer *w);
 
+int aom_tell_size(aom_writer *w);
+
 static INLINE void aom_write(aom_writer *w, int bit, int probability) {
   int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
 #if CONFIG_BITSTREAM_DEBUG
   aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
   bitstream_queue_push(bit, cdf, 2);
 #endif
 
@@ -91,14 +89,6 @@ static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
 static INLINE void aom_write_cdf(aom_writer *w, int symb,
                                  const aom_cdf_prob *cdf, int nsymbs) {
 #if CONFIG_BITSTREAM_DEBUG
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = aom_bitstream_queue_get_frame_writee();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
   bitstream_queue_push(symb, cdf, nsymbs);
 #endif
 
diff --git a/media/libaom/src/aom_dsp/blend_a64_mask.c b/media/libaom/src/aom_dsp/blend_a64_mask.c
index 32f2dc6d81..35017fd737 100644
--- a/media/libaom/src/aom_dsp/blend_a64_mask.c
+++ b/media/libaom/src/aom_dsp/blend_a64_mask.c
@@ -22,7 +22,7 @@
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
-// NOTE(david.barker): The input and output of aom_blend_a64_d16_mask_c() are
+// NOTE(rachelbarker): The input and output of aom_blend_a64_d16_mask_c() are
 // in a higher intermediate precision, and will later be rounded down to pixel
 // precision.
 // Thus, in order to avoid double-rounding, we want to use normal right shifts
diff --git a/media/libaom/src/aom_dsp/butteraugli.c b/media/libaom/src/aom_dsp/butteraugli.c
new file mode 100644
index 0000000000..8d2a29f7a3
--- /dev/null
+++ b/media/libaom/src/aom_dsp/butteraugli.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <jxl/butteraugli.h>
+
+#include "aom_dsp/butteraugli.h"
+#include "aom_mem/aom_mem.h"
+#include "third_party/libyuv/include/libyuv/convert_argb.h"
+
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         aom_matrix_coefficients_t matrix_coefficients,
+                         aom_color_range_t color_range, float *dist_map) {
+  (void)bit_depth;
+  assert(bit_depth == 8);
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  const struct YuvConstants *yuv_constants;
+  if (matrix_coefficients == AOM_CICP_MC_BT_709) {
+    if (color_range == AOM_CR_FULL_RANGE) return 0;
+    yuv_constants = &kYuvH709Constants;
+  } else {
+    yuv_constants = color_range == AOM_CR_FULL_RANGE ? &kYuvJPEGConstants
+                                                     : &kYuvI601Constants;
+  }
+
+  const int stride_argb = width * 4;
+  const size_t buffer_size = (size_t)height * stride_argb;
+  uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size);
+  uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size);
+  if (!src_argb || !distorted_argb) {
+    aom_free(src_argb);
+    aom_free(distorted_argb);
+    return 0;
+  }
+
+  if (ss_x == 1 && ss_y == 1) {
+    I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 1 && ss_y == 0) {
+    I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 0 && ss_y == 0) {
+    I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else {
+    aom_free(src_argb);
+    aom_free(distorted_argb);
+    return 0;
+  }
+
+  JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
+  JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL);
+  JxlButteraugliApiSetHFAsymmetry(api, 0.8f);
+
+  JxlButteraugliResult *result = JxlButteraugliCompute(
+      api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format,
+      distorted_argb, buffer_size);
+
+  const float *distmap = NULL;
+  uint32_t row_stride;
+  JxlButteraugliResultGetDistmap(result, &distmap, &row_stride);
+  if (distmap == NULL) {
+    JxlButteraugliApiDestroy(api);
+    JxlButteraugliResultDestroy(result);
+    aom_free(src_argb);
+    aom_free(distorted_argb);
+    return 0;
+  }
+
+  for (int j = 0; j < height; ++j) {
+    for (int i = 0; i < width; ++i) {
+      dist_map[j * width + i] = distmap[j * row_stride + i];
+    }
+  }
+
+  JxlButteraugliApiDestroy(api);
+  JxlButteraugliResultDestroy(result);
+  aom_free(src_argb);
+  aom_free(distorted_argb);
+  return 1;
+}
diff --git a/media/libaom/src/aom_dsp/butteraugli.h b/media/libaom/src/aom_dsp/butteraugli.h
new file mode 100644
index 0000000000..5304092ccb
--- /dev/null
+++ b/media/libaom/src/aom_dsp/butteraugli.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BUTTERAUGLI_H_
+#define AOM_AOM_DSP_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+
+// Returns a boolean that indicates success/failure.
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         aom_matrix_coefficients_t matrix_coefficients,
+                         aom_color_range_t color_range, float *dist_map);
+
+#endif  // AOM_AOM_DSP_BUTTERAUGLI_H_
diff --git a/media/libaom/src/aom_dsp/entcode.h b/media/libaom/src/aom_dsp/entcode.h
index 7518879217..526ca598d3 100644
--- a/media/libaom/src/aom_dsp/entcode.h
+++ b/media/libaom/src/aom_dsp/entcode.h
@@ -14,7 +14,7 @@
 
 #include <limits.h>
 #include <stddef.h>
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/prob.h"
 
 #define EC_PROB_SHIFT 6
diff --git a/media/libaom/src/aom_dsp/fastssim.c b/media/libaom/src/aom_dsp/fastssim.c
index 3804519b31..0ef0590e89 100644
--- a/media/libaom/src/aom_dsp/fastssim.c
+++ b/media/libaom/src/aom_dsp/fastssim.c
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
 
 typedef struct fs_level fs_level;
 typedef struct fs_ctx fs_ctx;
@@ -31,6 +30,7 @@ typedef struct fs_ctx fs_ctx;
 #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
 #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
 #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#define MAX_SSIM_DB 100.0
 
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
@@ -49,7 +49,7 @@ struct fs_ctx {
   unsigned *col_buf;
 };
 
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
   unsigned char *data;
   size_t data_size;
   int lw;
@@ -73,6 +73,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   data = (unsigned char *)malloc(data_size);
+  if (!data) return -1;
   _ctx->level = (fs_level *)data;
   _ctx->nlevels = _nlevels;
   data += _nlevels * sizeof(*_ctx->level);
@@ -97,6 +98,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   _ctx->col_buf = (unsigned *)data;
+  return 0;
 }
 
 static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
@@ -446,7 +448,7 @@ static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
   double ret;
   int l;
   ret = 1;
-  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
   fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
                        buf_is_hbd);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
@@ -467,7 +469,6 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          uint32_t in_bd) {
   double ssimv;
   uint32_t bd_shift = 0;
-  aom_clear_system_state();
   assert(bd >= in_bd);
   assert(source->flags == dest->flags);
   int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
diff --git a/media/libaom/src/aom_dsp/fft.c b/media/libaom/src/aom_dsp/fft.c
index 0ba71cfb34..cad4a6563f 100644
--- a/media/libaom/src/aom_dsp/fft.c
+++ b/media/libaom/src/aom_dsp/fft.c
@@ -76,15 +76,15 @@ static INLINE float add_float(float a, float b) { return a + b; }
 static INLINE float sub_float(float a, float b) { return a - b; }
 static INLINE float mul_float(float a, float b) { return a * b; }
 
-GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_2(void, float, float, float, *, store_float)
 GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
-          sub_float);
+          sub_float)
 GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
-          sub_float, mul_float);
+          sub_float, mul_float)
 GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
+           sub_float, mul_float)
 GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
+           sub_float, mul_float)
 
 void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
@@ -183,15 +183,15 @@ void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
   transpose(temp, output, n);
 }
 
-GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_2(void, float, float, float, *, store_float)
 GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
-           sub_float);
+           sub_float)
 GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
+           sub_float, mul_float)
 GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
-            sub_float, mul_float);
+            sub_float, mul_float)
 GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
-            sub_float, mul_float);
+            sub_float, mul_float)
 
 void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
diff --git a/media/libaom/src/aom_dsp/grain_synthesis.h b/media/libaom/src/aom_dsp/grain_params.h
index 9155b39035..5a28afc2a1 100644
--- a/media/libaom/src/aom_dsp/grain_synthesis.h
+++ b/media/libaom/src/aom_dsp/grain_params.h
@@ -10,20 +10,20 @@
  */
 
 /*!\file
- * \brief Describes film grain parameters and film grain synthesis
+ * \brief Describes film grain parameters
  *
  */
-#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
-#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#ifndef AOM_AOM_DSP_GRAIN_PARAMS_H_
+#define AOM_AOM_DSP_GRAIN_PARAMS_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include <stdint.h>
 #include <string.h>
 
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom/aom_image.h"
+#include "config/aom_config.h"
 
 /*!\brief Structure containing film grain synthesis parameters for a frame
  *
@@ -31,7 +31,7 @@ extern "C" {
  */
 typedef struct {
   // This structure is compared element-by-element in the function
-  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // aom_check_grain_params_equiv: this function must be updated if any changes
   // are made to this structure.
   int apply_grain;
 
@@ -85,7 +85,7 @@ typedef struct {
 
   uint16_t random_seed;
   // This structure is compared element-by-element in the function
-  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // aom_check_grain_params_equiv: this function must be updated if any changes
   // are made to this structure.
 } aom_film_grain_t;
 
@@ -98,7 +98,7 @@ typedef struct {
  * \param[in]    pb               The second set of parameters to compare
  * \return       Returns 1 if the params are equivalent, 0 otherwise
  */
-static INLINE int av1_check_grain_params_equiv(
+static INLINE int aom_check_grain_params_equiv(
     const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
   if (pa->apply_grain != pb->apply_grain) return 0;
   // Don't compare update_parameters
@@ -151,42 +151,8 @@ static INLINE int av1_check_grain_params_equiv(
   return 1;
 }
 
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in]    grain_params     Grain parameters
- * \param[in]    luma             luma plane
- * \param[in]    cb               cb plane
- * \param[in]    cr               cr plane
- * \param[in]    height           luma plane height
- * \param[in]    width            luma plane width
- * \param[in]    luma_stride      luma plane stride
- * \param[in]    chroma_stride    chroma plane stride
- */
-int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
-                           uint8_t *cb, uint8_t *cr, int height, int width,
-                           int luma_stride, int chroma_stride,
-                           int use_high_bit_depth, int chroma_subsamp_y,
-                           int chroma_subsamp_x, int mc_identity);
-
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in]    grain_params     Grain parameters
- * \param[in]    src              Source image
- * \param[out]   dst              Resulting image with grain
- */
-int av1_add_film_grain(const aom_film_grain_t *grain_params,
-                       const aom_image_t *src, aom_image_t *dst);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#endif  // AOM_AOM_DSP_GRAIN_PARAMS_H_
diff --git a/media/libaom/src/aom_dsp/grain_table.c b/media/libaom/src/aom_dsp/grain_table.c
index e03f04d5da..3505f9f2c8 100644
--- a/media/libaom/src/aom_dsp/grain_table.c
+++ b/media/libaom/src/aom_dsp/grain_table.c
@@ -105,7 +105,11 @@ static void grain_table_entry_read(FILE *file,
       }
     }
 
-    fscanf(file, "\n\tcY");
+    if (fscanf(file, "\n\tcY")) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Y coeffs header (cY)");
+      return;
+    }
     const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
     for (int i = 0; i < n; ++i) {
       if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
@@ -114,7 +118,11 @@ static void grain_table_entry_read(FILE *file,
         return;
       }
     }
-    fscanf(file, "\n\tcCb");
+    if (fscanf(file, "\n\tcCb")) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Cb coeffs header (cCb)");
+      return;
+    }
     for (int i = 0; i <= n; ++i) {
       if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
         aom_internal_error(error_info, AOM_CODEC_ERROR,
@@ -122,7 +130,11 @@ static void grain_table_entry_read(FILE *file,
         return;
       }
     }
-    fscanf(file, "\n\tcCr");
+    if (fscanf(file, "\n\tcCr")) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable read to Cr coeffs header (cCr)");
+      return;
+    }
     for (int i = 0; i <= n; ++i) {
       if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
         aom_internal_error(error_info, AOM_CODEC_ERROR,
@@ -130,7 +142,7 @@ static void grain_table_entry_read(FILE *file,
         return;
       }
     }
-    fscanf(file, "\n");
+    (void)fscanf(file, "\n");
   }
 }
 
@@ -179,11 +191,14 @@ static void grain_table_entry_write(FILE *file,
   }
 }
 
+// TODO(https://crbug.com/aomedia/3228): Update this function to return an
+// integer status.
 void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
                                  int64_t end_time,
                                  const aom_film_grain_t *grain) {
   if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
     aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+    if (!new_tail) return;
     memset(new_tail, 0, sizeof(*new_tail));
     if (t->tail) t->tail->next = new_tail;
     if (!t->head) t->head = new_tail;
@@ -202,7 +217,7 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
                                 int64_t end_time, int erase,
                                 aom_film_grain_t *grain) {
   aom_film_grain_table_entry_t *entry = t->head;
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   uint16_t random_seed = grain ? grain->random_seed : 0;
   if (grain) memset(grain, 0, sizeof(*grain));
 
@@ -233,6 +248,7 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
       } else {
         aom_film_grain_table_entry_t *new_entry =
             aom_malloc(sizeof(*new_entry));
+        if (!new_entry) return 0;
         new_entry->next = entry->next;
         new_entry->start_time = end_time;
         new_entry->end_time = entry->end_time;
@@ -241,10 +257,13 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
         entry->end_time = time_stamp;
         if (t->tail == entry) t->tail = new_entry;
       }
-      // If segments aren't aligned, delete from the beggining of subsequent
+      // If segments aren't aligned, delete from the beginning of subsequent
       // segments
       if (end_time > entry_end_time) {
-        aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
+        // Ignoring the return value here is safe since we're erasing from the
+        // beginning of subsequent entries.
+        aom_film_grain_table_lookup(t, entry_end_time, end_time, /*erase=*/1,
+                                    NULL);
       }
       return 1;
     }
@@ -275,12 +294,17 @@ aom_codec_err_t aom_film_grain_table_read(
     return error_info->error_code;
   }
 
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   while (!feof(file)) {
     aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+    if (!entry) {
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                         "Unable to allocate grain table entry");
+      break;
+    }
     memset(entry, 0, sizeof(*entry));
     grain_table_entry_read(file, error_info, entry);
-    entry->next = 0;
+    entry->next = NULL;
 
     if (prev_entry) prev_entry->next = entry;
     if (!t->head) t->head = entry;
diff --git a/media/libaom/src/aom_dsp/grain_table.h b/media/libaom/src/aom_dsp/grain_table.h
index a8ac50730e..3f75101ad5 100644
--- a/media/libaom/src/aom_dsp/grain_table.h
+++ b/media/libaom/src/aom_dsp/grain_table.h
@@ -34,7 +34,7 @@
 extern "C" {
 #endif
 
-#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_params.h"
 #include "aom/internal/aom_codec_internal.h"
 
 typedef struct aom_film_grain_table_entry_t {
diff --git a/media/libaom/src/aom_dsp/intrapred.c b/media/libaom/src/aom_dsp/intrapred.c
index 72ccfd8358..00396c8e70 100644
--- a/media/libaom/src/aom_dsp/intrapred.c
+++ b/media/libaom/src/aom_dsp/intrapred.c
@@ -86,11 +86,11 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                     const uint8_t *left) {
   const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  // scale = 2 * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
+  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
                            log2_scale + sizeof(*dst));
   int r;
@@ -116,10 +116,10 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *above,
                                       const uint8_t *left) {
   const uint8_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bh;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights = smooth_weights + bh - 4;
+  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                            log2_scale + sizeof(*dst));
 
@@ -145,10 +145,10 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *above,
                                       const uint8_t *left) {
   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bw;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights = smooth_weights + bw - 4;
+  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                            log2_scale + sizeof(*dst));
 
@@ -405,11 +405,11 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
   (void)bd;
   const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  // scale = 2 * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
+  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
                            log2_scale + sizeof(*dst));
   int r;
@@ -437,10 +437,10 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left, int bd) {
   (void)bd;
   const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bh;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights = smooth_weights + bh - 4;
+  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                            log2_scale + sizeof(*dst));
 
@@ -468,10 +468,10 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left, int bd) {
   (void)bd;
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bw;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
+  const uint8_t *const sm_weights = smooth_weights + bw - 4;
+  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
                            log2_scale + sizeof(*dst));
 
@@ -752,6 +752,7 @@ void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
   intra_pred_highbd_sized(type, 32, 8) \
   intra_pred_highbd_sized(type, 16, 64) \
   intra_pred_highbd_sized(type, 64, 16)
+
 #define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
diff --git a/media/libaom/src/aom_dsp/intrapred_common.h b/media/libaom/src/aom_dsp/intrapred_common.h
index 3ec62a86ef..6172224be1 100644
--- a/media/libaom/src/aom_dsp/intrapred_common.h
+++ b/media/libaom/src/aom_dsp/intrapred_common.h
@@ -15,18 +15,14 @@
 #include "config/aom_config.h"
 
 // Weights are quadratic from '1' to '1 / block_size', scaled by
-// 2^sm_weight_log2_scale.
-static const int sm_weight_log2_scale = 8;
+// 2^SMOOTH_WEIGHT_LOG2_SCALE.
+#define SMOOTH_WEIGHT_LOG2_SCALE 8
 
-// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
-#define MAX_BLOCK_DIM 64
-
-/* clang-format off */
-static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
-  // Unused, because we always offset by bs, which is at least 2.
-  0, 0,
-  // bs = 2
-  255, 128,
+// Note these arrays are aligned to ensure NEON loads using a cast to uint32_t*
+// have sufficient alignment. Using 8 preserves the potential for an alignment
+// hint in load_weight_w8(). For that case, this could be increased to 16 to
+// allow an aligned load in x86.
+DECLARE_ALIGNED(8, static const uint8_t, smooth_weights[]) = {
   // bs = 4
   255, 149, 85, 64,
   // bs = 8
@@ -40,8 +36,24 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
   255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
   150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
   65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
-  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+};
+
+DECLARE_ALIGNED(8, static const uint16_t, smooth_weights_u16[]) = {
+  // block dimension = 4
+  255, 149, 85, 64,
+  // block dimension = 8
+  255, 197, 146, 105, 73, 50, 37, 32,
+  // block dimension = 16
+  255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+  // block dimension = 32
+  255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+  66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+  // block dimension = 64
+  255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+  150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+  65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
 };
-/* clang-format on */
 
 #endif  // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/media/libaom/src/aom_dsp/loopfilter.c b/media/libaom/src/aom_dsp/loopfilter.c
index 903ebcd7c8..075f13689c 100644
--- a/media/libaom/src/aom_dsp/loopfilter.c
+++ b/media/libaom/src/aom_dsp/loopfilter.c
@@ -158,6 +158,15 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
   aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_4_c(s + 4, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_4_c(s + 8, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_4_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -182,6 +191,14 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_4_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
 static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
@@ -247,6 +264,15 @@ void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
   aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_c(s + 4, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_c(s + 8, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -275,6 +301,15 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
   aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_8_c(s + 4, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_8_c(s + 8, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_8_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
 void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -299,6 +334,14 @@ void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -324,6 +367,14 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
+void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_8_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
 static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
                             int8_t flat2, uint8_t *op6, uint8_t *op5,
                             uint8_t *op4, uint8_t *op3, uint8_t *op2,
@@ -410,6 +461,15 @@ void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
   mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
 }
 
+void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0) {
+  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 4, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 8, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 12, p, blimit0, limit0, thresh0, 1);
+}
+
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
@@ -444,6 +504,14 @@ void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
 }
 
+void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0) {
+  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 8 * pitch, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 12 * pitch, pitch, blimit0, limit0, thresh0, 4);
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
diff --git a/media/libaom/src/av1/encoder/mathutils.h b/media/libaom/src/aom_dsp/mathutils.h
index 64f9361767..3ffca8a17e 100644
--- a/media/libaom/src/av1/encoder/mathutils.h
+++ b/media/libaom/src/aom_dsp/mathutils.h
@@ -9,14 +9,15 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
-#define AOM_AV1_ENCODER_MATHUTILS_H_
+#ifndef AOM_AOM_DSP_MATHUTILS_H_
+#define AOM_AOM_DSP_MATHUTILS_H_
 
-#include <memory.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
 
 static const double TINY_NEAR_ZERO = 1.0E-16;
 
@@ -69,6 +70,7 @@ static INLINE int least_squares(int n, double *A, int rows, int stride,
   double *AtA, *Atb;
   if (!scratch) {
     scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1));
+    if (!scratch_) return 0;
     scratch = scratch_;
   }
   AtA = scratch;
@@ -85,7 +87,7 @@ static INLINE int least_squares(int n, double *A, int rows, int stride,
     for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
   }
   int ret = linsolve(n, AtA, n, Atb, x);
-  if (scratch_) aom_free(scratch_);
+  aom_free(scratch_);
   return ret;
 }
 
@@ -114,7 +116,7 @@ static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
 // svdcmp
 // Adopted from Numerical Recipes in C
 
-static INLINE double sign(double a, double b) {
+static INLINE double apply_sign(double a, double b) {
   return ((b) >= 0 ? fabs(a) : -fabs(a));
 }
 
@@ -137,6 +139,7 @@ static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
   int flag, i, its, j, jj, k, l, nm;
   double anorm, c, f, g, h, s, scale, x, y, z;
   double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  if (!rv1) return 0;
   g = scale = anorm = 0.0;
   for (i = 0; i < n; i++) {
     l = i + 1;
@@ -150,7 +153,7 @@ static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
           s += u[k][i] * u[k][i];
         }
         f = u[i][i];
-        g = -sign(sqrt(s), f);
+        g = -apply_sign(sqrt(s), f);
         h = f * g - s;
         u[i][i] = f - g;
         for (j = l; j < n; j++) {
@@ -171,7 +174,7 @@ static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
           s += u[i][k] * u[i][k];
         }
         f = u[i][l];
-        g = -sign(sqrt(s), f);
+        g = -apply_sign(sqrt(s), f);
         h = f * g - s;
         u[i][l] = f - g;
         for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
@@ -269,7 +272,7 @@ static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
       h = rv1[k];
       f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
       g = pythag(f, 1.0);
-      f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+      f = ((x - z) * (x + z) + h * ((y / (f + apply_sign(g, f))) - h)) / x;
       c = s = 1.0;
       for (j = l; j <= nm; j++) {
         i = j + 1;
@@ -332,8 +335,8 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
       nrV[i] = &V[i * N];
     }
   } else {
-    if (nrU) aom_free(nrU);
-    if (nrV) aom_free(nrV);
+    aom_free(nrU);
+    aom_free(nrV);
     return 1;
   }
 
@@ -356,4 +359,4 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
   return 0;
 }
 
-#endif  // AOM_AV1_ENCODER_MATHUTILS_H_
+#endif  // AOM_AOM_DSP_MATHUTILS_H_
diff --git a/media/libaom/src/aom_dsp/mips/convolve8_dspr2.c b/media/libaom/src/aom_dsp/mips/aom_convolve_copy_dspr2.c
index af54b42647..12a213eaa3 100644
--- a/media/libaom/src/aom_dsp/mips/convolve8_dspr2.c
+++ b/media/libaom/src/aom_dsp/mips/aom_convolve_copy_dspr2.c
@@ -21,17 +21,9 @@
 
 #if HAVE_DSPR2
 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h) {
+                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   int x, y;
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
   /* prefetch data to cache memory */
   prefetch_load(src);
   prefetch_load(src + 32);
diff --git a/media/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c b/media/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
index f7f116f4da..12e7d9539a 100644
--- a/media/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
+++ b/media/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -198,15 +198,8 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
 }
 
 void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
-                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
+                           uint8_t *dst, ptrdiff_t dst_stride, int32_t w,
+                           int32_t h) {
   switch (w) {
     case 4: {
       uint32_t cnt, tmp;
@@ -238,7 +231,7 @@ void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
     default: {
       uint32_t cnt;
       for (cnt = h; cnt--;) {
-        memcpy(dst, src, w);
+        memmove(dst, src, w);
         src += src_stride;
         dst += dst_stride;
       }
diff --git a/media/libaom/src/aom_dsp/mips/sad_msa.c b/media/libaom/src/aom_dsp/mips/sad_msa.c
index 58cdd80d99..01d4a5239c 100644
--- a/media/libaom/src/aom_dsp/mips/sad_msa.c
+++ b/media/libaom/src/aom_dsp/mips/sad_msa.c
@@ -162,9 +162,9 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
 }
 
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *const aref_ptr[],
+                               const uint8_t *const aref_ptr[4],
                                int32_t ref_stride, int32_t height,
-                               uint32_t *sad_array) {
+                               uint32_t sad_array[4]) {
   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   int32_t ht_cnt;
   uint32_t src0, src1, src2, src3;
@@ -223,9 +223,9 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
 }
 
 static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *const aref_ptr[],
+                               const uint8_t *const aref_ptr[4],
                                int32_t ref_stride, int32_t height,
-                               uint32_t *sad_array) {
+                               uint32_t sad_array[4]) {
   int32_t ht_cnt;
   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   v16u8 src0, src1, src2, src3;
@@ -274,9 +274,9 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
 }
 
 static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
+                                const uint8_t *const aref_ptr[4],
                                 int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
+                                uint32_t sad_array[4]) {
   int32_t ht_cnt;
   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   v16u8 src, ref0, ref1, ref2, ref3, diff;
@@ -339,9 +339,9 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
 }
 
 static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
+                                const uint8_t *const aref_ptr[4],
                                 int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
+                                uint32_t sad_array[4]) {
   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   int32_t ht_cnt;
   v16u8 src0, src1, ref0, ref1;
@@ -383,9 +383,9 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
 }
 
 static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
+                                const uint8_t *const aref_ptr[4],
                                 int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
+                                uint32_t sad_array[4]) {
   const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
   int32_t ht_cnt;
   v16u8 src0, src1, src2, src3;
@@ -659,36 +659,36 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
 
 #define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define AOM_SAD_8xHEIGHTx4D_MSA(height)                                   \
   void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define AOM_SAD_16xHEIGHTx4D_MSA(height)                                   \
   void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define AOM_SAD_32xHEIGHTx4D_MSA(height)                                   \
   void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define AOM_SAD_64xHEIGHTx4D_MSA(height)                                   \
   void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
diff --git a/media/libaom/src/aom_dsp/noise_model.c b/media/libaom/src/aom_dsp/noise_model.c
index c7a0003a80..70fdb7bcad 100644
--- a/media/libaom/src/aom_dsp/noise_model.c
+++ b/media/libaom/src/aom_dsp/noise_model.c
@@ -15,11 +15,10 @@
 #include <string.h>
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
 #include "aom_dsp/noise_model.h"
 #include "aom_dsp/noise_util.h"
 #include "aom_mem/aom_mem.h"
-#include "av1/common/common.h"
-#include "av1/encoder/mathutils.h"
 
 #define kLowPolyNumParams 3
 
@@ -42,8 +41,8 @@ static const int kMaxLag = 4;
     return block_mean / (max_w * max_h);                                    \
   }
 
-GET_BLOCK_MEAN(uint8_t, lowbd);
-GET_BLOCK_MEAN(uint16_t, highbd);
+GET_BLOCK_MEAN(uint8_t, lowbd)
+GET_BLOCK_MEAN(uint16_t, highbd)
 
 static INLINE double get_block_mean(const uint8_t *data, int w, int h,
                                     int stride, int x_o, int y_o,
@@ -76,8 +75,8 @@ static INLINE double get_block_mean(const uint8_t *data, int w, int h,
     return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
   }
 
-GET_NOISE_VAR(uint8_t, lowbd);
-GET_NOISE_VAR(uint16_t, highbd);
+GET_NOISE_VAR(uint8_t, lowbd)
+GET_NOISE_VAR(uint16_t, highbd)
 
 static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
                                    int w, int h, int stride, int x_o, int y_o,
@@ -214,6 +213,7 @@ static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
 
 int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
   if (!lut) return 0;
+  if (num_points <= 0) return 0;
   lut->num_points = 0;
   lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
   if (!lut->points) return 0;
@@ -388,6 +388,10 @@ int aom_noise_strength_solver_fit_piecewise(
   }
 
   double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
+  if (!residual) {
+    aom_noise_strength_lut_free(lut);
+    return 0;
+  }
   memset(residual, 0, sizeof(*residual) * solver->num_bins);
 
   update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
@@ -694,6 +698,10 @@ int aom_noise_model_init(aom_noise_model_t *model,
             kMaxLag);
     return 0;
   }
+  if (!(params.bit_depth == 8 || params.bit_depth == 10 ||
+        params.bit_depth == 12)) {
+    return 0;
+  }
 
   memcpy(&model->params, &params, sizeof(params));
   for (c = 0; c < 3; ++c) {
@@ -710,6 +718,10 @@ int aom_noise_model_init(aom_noise_model_t *model,
   }
   model->n = n;
   model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+  if (!model->coords) {
+    aom_noise_model_free(model);
+    return 0;
+  }
 
   for (y = -lag; y <= 0; ++y) {
     const int max_x = y == 0 ? -1 : lag;
@@ -787,8 +799,8 @@ void aom_noise_model_free(aom_noise_model_t *model) {
     return val;                                                            \
   }
 
-EXTRACT_AR_ROW(uint8_t, lowbd);
-EXTRACT_AR_ROW(uint16_t, highbd);
+EXTRACT_AR_ROW(uint8_t, lowbd)
+EXTRACT_AR_ROW(uint16_t, highbd)
 
 static int add_block_observations(
     aom_noise_model_t *noise_model, int c, const uint8_t *const data,
@@ -1152,12 +1164,24 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
 
   // Convert the scaling functions to 8 bit values
   aom_noise_strength_lut_t scaling_points[3];
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[0].strength_solver, 14,
+          scaling_points + 0)) {
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[1].strength_solver, 10,
+          scaling_points + 1)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[2].strength_solver, 10,
+          scaling_points + 2)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    aom_noise_strength_lut_free(scaling_points + 1);
+    return 0;
+  }
 
   // Both the domain and the range of the scaling functions in the film_grain
   // are normalized to 8-bit (e.g., they are implicitly scaled during grain
@@ -1287,6 +1311,7 @@ static void pointwise_multiply(const float *a, float *b, int n) {
 static float *get_half_cos_window(int block_size) {
   float *window_function =
       (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+  if (!window_function) return NULL;
   for (int y = 0; y < block_size; ++y) {
     const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
     for (int x = 0; x < block_size; ++x) {
@@ -1329,8 +1354,8 @@ static float *get_half_cos_window(int block_size) {
     }                                                                       \
   }
 
-DITHER_AND_QUANTIZE(uint8_t, lowbd);
-DITHER_AND_QUANTIZE(uint16_t, highbd);
+DITHER_AND_QUANTIZE(uint8_t, lowbd)
+DITHER_AND_QUANTIZE(uint16_t, highbd)
 
 int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
                           int w, int h, int stride[3], int chroma_sub[2],
@@ -1353,7 +1378,7 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
   if (chroma_sub[0] != chroma_sub[1]) {
     fprintf(stderr,
             "aom_wiener_denoise_2d doesn't handle different chroma "
-            "subsampling");
+            "subsampling\n");
     return 0;
   }
   init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
@@ -1560,6 +1585,10 @@ static int denoise_and_model_realloc_if_necessary(
   ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
   ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
   ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+  if (!ctx->flat_blocks) {
+    fprintf(stderr, "Unable to allocate flat_blocks buffer\n");
+    return 0;
+  }
 
   aom_flat_block_finder_free(&ctx->flat_block_finder);
   if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
@@ -1591,7 +1620,7 @@ static int denoise_and_model_realloc_if_necessary(
 
 int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
                               YV12_BUFFER_CONFIG *sd,
-                              aom_film_grain_t *film_grain) {
+                              aom_film_grain_t *film_grain, int apply_denoise) {
   const int block_size = ctx->block_size;
   const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
   uint8_t *raw_data[3] = {
@@ -1643,12 +1672,14 @@ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
     if (!film_grain->random_seed) {
       film_grain->random_seed = 7391;
     }
-    memcpy(raw_data[0], ctx->denoised[0],
-           (strides[0] * sd->y_height) << use_highbd);
-    memcpy(raw_data[1], ctx->denoised[1],
-           (strides[1] * sd->uv_height) << use_highbd);
-    memcpy(raw_data[2], ctx->denoised[2],
-           (strides[2] * sd->uv_height) << use_highbd);
+    if (apply_denoise) {
+      memcpy(raw_data[0], ctx->denoised[0],
+             (strides[0] * sd->y_height) << use_highbd);
+      memcpy(raw_data[1], ctx->denoised[1],
+             (strides[1] * sd->uv_height) << use_highbd);
+      memcpy(raw_data[2], ctx->denoised[2],
+             (strides[2] * sd->uv_height) << use_highbd);
+    }
   }
   return 1;
 }
diff --git a/media/libaom/src/aom_dsp/noise_model.h b/media/libaom/src/aom_dsp/noise_model.h
index 5e7de9bf2c..f385251d7c 100644
--- a/media/libaom/src/aom_dsp/noise_model.h
+++ b/media/libaom/src/aom_dsp/noise_model.h
@@ -17,7 +17,8 @@ extern "C" {
 #endif  // __cplusplus
 
 #include <stdint.h>
-#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_params.h"
+#include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
 
 /*!\brief Wrapper of data required to represent linear system of eqns and soln.
@@ -292,14 +293,18 @@ struct aom_denoise_and_model_t;
  * parameter will be true when the input buffer was successfully denoised and
  * grain was modelled. Returns false on error.
  *
- * \param[in]      ctx   Struct allocated with aom_denoise_and_model_alloc
- *                       that holds some buffers for denoising and the current
- *                       noise estimate.
- * \param[in/out]   buf  The raw input buffer to be denoised.
- * \param[out]    grain  Output film grain parameters
+ * \param[in]      ctx          Struct allocated with
+ *                              aom_denoise_and_model_alloc that holds some
+ *                              buffers for denoising and the current noise
+ *                              estimate.
+ * \param[in/out]   buf         The raw input buffer to be denoised.
+ * \param[out]    grain         Output film grain parameters
+ * \param[out]    apply_denoise Whether or not to apply the denoising to the
+ *                              frame that will be encoded
  */
 int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
-                              YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
+                              YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain,
+                              int apply_denoise);
 
 /*!\brief Allocates a context that can be used for denoising and noise modeling.
  *
diff --git a/media/libaom/src/aom_dsp/noise_util.c b/media/libaom/src/aom_dsp/noise_util.c
index 7e7e380c68..3ded8cb099 100644
--- a/media/libaom/src/aom_dsp/noise_util.c
+++ b/media/libaom/src/aom_dsp/noise_util.c
@@ -160,15 +160,17 @@ int aom_noise_data_validate(const double *data, int w, int h) {
 
   // Check that noise variance is not increasing in x or y
   // and that the data is zero mean.
-  mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
-  var_x = (double *)aom_malloc(sizeof(*var_x) * w);
-  mean_y = (double *)aom_malloc(sizeof(*mean_x) * h);
-  var_y = (double *)aom_malloc(sizeof(*var_y) * h);
-
-  memset(mean_x, 0, sizeof(*mean_x) * w);
-  memset(var_x, 0, sizeof(*var_x) * w);
-  memset(mean_y, 0, sizeof(*mean_y) * h);
-  memset(var_y, 0, sizeof(*var_y) * h);
+  mean_x = (double *)aom_calloc(w, sizeof(*mean_x));
+  var_x = (double *)aom_calloc(w, sizeof(*var_x));
+  mean_y = (double *)aom_calloc(h, sizeof(*mean_x));
+  var_y = (double *)aom_calloc(h, sizeof(*var_y));
+  if (!(mean_x && var_x && mean_y && var_y)) {
+    aom_free(mean_x);
+    aom_free(mean_y);
+    aom_free(var_x);
+    aom_free(var_y);
+    return 0;
+  }
 
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
diff --git a/media/libaom/src/av1/common/odintrin.c b/media/libaom/src/aom_dsp/odintrin.c
index 7584b2e52f..eb6d8d8771 100644
--- a/media/libaom/src/av1/common/odintrin.c
+++ b/media/libaom/src/aom_dsp/odintrin.c
@@ -11,7 +11,7 @@
 
 /* clang-format off */
 
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 /*Constants for use with OD_DIVU_SMALL().
   See \cite{Rob05} for details on computing these constants.
diff --git a/media/libaom/src/av1/common/odintrin.h b/media/libaom/src/aom_dsp/odintrin.h
index e1db0f44d8..20a7f583bc 100644
--- a/media/libaom/src/av1/common/odintrin.h
+++ b/media/libaom/src/aom_dsp/odintrin.h
@@ -11,8 +11,8 @@
 
 /* clang-format off */
 
-#ifndef AOM_AV1_COMMON_ODINTRIN_H_
-#define AOM_AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AOM_DSP_ODINTRIN_H_
+#define AOM_AOM_DSP_ODINTRIN_H_
 
 #include <stdlib.h>
 #include <string.h>
@@ -20,7 +20,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/bitops.h"
-#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,4 +92,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 }  // extern "C"
 #endif
 
-#endif  // AOM_AV1_COMMON_ODINTRIN_H_
+#endif  // AOM_AOM_DSP_ODINTRIN_H_
diff --git a/media/libaom/src/aom_dsp/psnr.c b/media/libaom/src/aom_dsp/psnr.c
index c66dd52d05..d846a102eb 100644
--- a/media/libaom/src/aom_dsp/psnr.c
+++ b/media/libaom/src/aom_dsp/psnr.c
@@ -363,6 +363,10 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           uint32_t bit_depth, uint32_t in_bit_depth) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
                            a->uv_crop_height };
@@ -371,7 +375,7 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
   int i;
   uint64_t total_sse = 0;
   uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
+  double peak = (double)((1 << in_bit_depth) - 1);
   const unsigned int input_shift = bit_depth - in_bit_depth;
 
   for (i = 0; i < 3; ++i) {
@@ -403,11 +407,40 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
   psnr->samples[0] = total_samples;
   psnr->psnr[0] =
       aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+
+  // Compute PSNR based on stream bit depth
+  if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) {
+    peak = (double)((1 << bit_depth) - 1);
+    total_sse = 0;
+    total_samples = 0;
+    for (i = 0; i < 3; ++i) {
+      const int w = widths[i];
+      const int h = heights[i];
+      const uint32_t samples = w * h;
+      uint64_t sse;
+      sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+                           b_strides[i], w, h);
+      psnr->sse_hbd[1 + i] = sse;
+      psnr->samples_hbd[1 + i] = samples;
+      psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+      total_sse += sse;
+      total_samples += samples;
+    }
+
+    psnr->sse_hbd[0] = total_sse;
+    psnr->samples_hbd[0] = total_samples;
+    psnr->psnr_hbd[0] =
+        aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+  }
 }
 #endif
 
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
   static const double peak = 255.0;
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
diff --git a/media/libaom/src/aom_dsp/psnr.h b/media/libaom/src/aom_dsp/psnr.h
index 7f40b8b575..96a17f4dc1 100644
--- a/media/libaom/src/aom_dsp/psnr.h
+++ b/media/libaom/src/aom_dsp/psnr.h
@@ -21,9 +21,12 @@ extern "C" {
 #endif
 
 typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
+  double psnr[4];           // total/y/u/v
+  uint64_t sse[4];          // total/y/u/v
+  uint32_t samples[4];      // total/y/u/v
+  double psnr_hbd[4];       // total/y/u/v when input-bit-depth < bit-depth
+  uint64_t sse_hbd[4];      // total/y/u/v when input-bit-depth < bit-depth
+  uint32_t samples_hbd[4];  // total/y/u/v when input-bit-depth < bit-depth
 } PSNR_STATS;
 
 /*!\brief Converts SSE to PSNR
diff --git a/media/libaom/src/aom_dsp/psnrhvs.c b/media/libaom/src/aom_dsp/psnrhvs.c
index 69a1d99bf2..966ba007ed 100644
--- a/media/libaom/src/aom_dsp/psnrhvs.c
+++ b/media/libaom/src/aom_dsp/psnrhvs.c
@@ -22,7 +22,6 @@
 
 #include "aom_dsp/psnr.h"
 #include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
 
 static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                            int xstride) {
@@ -34,6 +33,7 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -43,6 +43,7 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
     for (j = 0; j < 8; j++)
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
@@ -210,6 +211,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         }
       }
       s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
+#if CONFIG_AV1_HIGHBITDEPTH
       if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
@@ -217,6 +219,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       }
+#else
+      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
@@ -246,7 +252,6 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
   const double par = 1.0;
   const int step = 7;
   uint32_t bd_shift = 0;
-  aom_clear_system_state();
   assert(bd == 8 || bd == 10 || bd == 12);
   assert(bd >= in_bd);
   assert(src->flags == dst->flags);
diff --git a/media/libaom/src/aom_dsp/quantize.c b/media/libaom/src/aom_dsp/quantize.c
index edd4d96480..36ca58f6b2 100644
--- a/media/libaom/src/aom_dsp/quantize.c
+++ b/media/libaom/src/aom_dsp/quantize.c
@@ -11,7 +11,6 @@
 
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
-#include "av1/encoder/av1_quantize.h"
 
 void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
diff --git a/media/libaom/src/aom_dsp/quantize.h b/media/libaom/src/aom_dsp/quantize.h
index 395631814b..efe253ddb9 100644
--- a/media/libaom/src/aom_dsp/quantize.h
+++ b/media/libaom/src/aom_dsp/quantize.h
@@ -20,6 +20,9 @@
 extern "C" {
 #endif
 
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
 void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
diff --git a/media/libaom/src/aom_dsp/sad.c b/media/libaom/src/aom_dsp/sad.c
index 8ddc683d6f..94260ce11a 100644
--- a/media/libaom/src/aom_dsp/sad.c
+++ b/media/libaom/src/aom_dsp/sad.c
@@ -35,14 +35,14 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
   return sad;
 }
 
-#define sadMxh(m)                                                          \
+#define SAD_MXH(m)                                                         \
   unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride,            \
                                 const uint8_t *b, int b_stride, int width, \
                                 int height) {                              \
     return sad(a, a_stride, b, b_stride, width, height);                   \
   }
 
-#define sadMxN(m, n)                                                          \
+#define SADMXN(m, n)                                                          \
   unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,       \
                                     const uint8_t *ref, int ref_stride) {     \
     return sad(src, src_stride, ref, ref_stride, m, n);                       \
@@ -61,112 +61,149 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
     aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref,           \
                                  ref_stride, jcp_param);                      \
     return sad(src, src_stride, comp_pred, m, m, n);                          \
+  }                                                                           \
+  unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref,                 \
+                                          int ref_stride) {                   \
+    return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2));   \
   }
 
+#if CONFIG_REALTIME_ONLY
 // Calculate sad against 4 reference locations and store each in sad_array
-#define sadMxNx4D(m, n)                                                      \
-  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,           \
-                               const uint8_t *const ref_array[],             \
-                               int ref_stride, uint32_t *sad_array) {        \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] =                                                         \
-          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride);   \
-    }                                                                        \
-  }                                                                          \
-  void aom_sad##m##x##n##x4d_avg_c(                                          \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
-      int ref_stride, const uint8_t *second_pred, uint32_t *sad_array) {     \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i], \
-                                              ref_stride, second_pred);      \
-    }                                                                        \
+#define SAD_MXNX4D(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,            \
+                               const uint8_t *const ref_array[4],             \
+                               int ref_stride, uint32_t sad_array[4]) {       \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] =                                                          \
+          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride);    \
+    }                                                                         \
+  }                                                                           \
+  void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i],               \
+                             2 * ref_stride, (m), (n / 2));                   \
+    }                                                                         \
   }
+#else  // !CONFIG_REALTIME_ONLY
+// Calculate sad against 4 reference locations and store each in sad_array
+#define SAD_MXNX4D(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,            \
+                               const uint8_t *const ref_array[4],             \
+                               int ref_stride, uint32_t sad_array[4]) {       \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] =                                                          \
+          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride);    \
+    }                                                                         \
+  }                                                                           \
+  void aom_sad##m##x##n##x4d_avg_c(                                           \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
+      int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]) {    \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i],  \
+                                              ref_stride, second_pred);       \
+    }                                                                         \
+  }                                                                           \
+  void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i],               \
+                             2 * ref_stride, (m), (n / 2));                   \
+    }                                                                         \
+  }
+#endif  // CONFIG_REALTIME_ONLY
 
 // 128x128
-sadMxN(128, 128);
-sadMxNx4D(128, 128);
+SADMXN(128, 128)
+SAD_MXNX4D(128, 128)
 
 // 128x64
-sadMxN(128, 64);
-sadMxNx4D(128, 64);
+SADMXN(128, 64)
+SAD_MXNX4D(128, 64)
 
 // 64x128
-sadMxN(64, 128);
-sadMxNx4D(64, 128);
+SADMXN(64, 128)
+SAD_MXNX4D(64, 128)
 
 // 64x64
-sadMxN(64, 64);
-sadMxNx4D(64, 64);
+SADMXN(64, 64)
+SAD_MXNX4D(64, 64)
 
 // 64x32
-sadMxN(64, 32);
-sadMxNx4D(64, 32);
+SADMXN(64, 32)
+SAD_MXNX4D(64, 32)
 
 // 32x64
-sadMxN(32, 64);
-sadMxNx4D(32, 64);
+SADMXN(32, 64)
+SAD_MXNX4D(32, 64)
 
 // 32x32
-sadMxN(32, 32);
-sadMxNx4D(32, 32);
+SADMXN(32, 32)
+SAD_MXNX4D(32, 32)
 
 // 32x16
-sadMxN(32, 16);
-sadMxNx4D(32, 16);
+SADMXN(32, 16)
+SAD_MXNX4D(32, 16)
 
 // 16x32
-sadMxN(16, 32);
-sadMxNx4D(16, 32);
+SADMXN(16, 32)
+SAD_MXNX4D(16, 32)
 
 // 16x16
-sadMxN(16, 16);
-sadMxNx4D(16, 16);
+SADMXN(16, 16)
+SAD_MXNX4D(16, 16)
 
 // 16x8
-sadMxN(16, 8);
-sadMxNx4D(16, 8);
+SADMXN(16, 8)
+SAD_MXNX4D(16, 8)
 
 // 8x16
-sadMxN(8, 16);
-sadMxNx4D(8, 16);
+SADMXN(8, 16)
+SAD_MXNX4D(8, 16)
 
 // 8x8
-sadMxN(8, 8);
-sadMxNx4D(8, 8);
+SADMXN(8, 8)
+SAD_MXNX4D(8, 8)
 
 // 8x4
-sadMxN(8, 4);
-sadMxNx4D(8, 4);
+SADMXN(8, 4)
+SAD_MXNX4D(8, 4)
 
 // 4x8
-sadMxN(4, 8);
-sadMxNx4D(4, 8);
+SADMXN(4, 8)
+SAD_MXNX4D(4, 8)
 
 // 4x4
-sadMxN(4, 4);
-sadMxNx4D(4, 4);
-
-sadMxh(128);
-sadMxh(64);
-sadMxh(32);
-sadMxh(16);
-sadMxh(8);
-sadMxh(4);
-
-sadMxN(4, 16);
-sadMxNx4D(4, 16);
-sadMxN(16, 4);
-sadMxNx4D(16, 4);
-sadMxN(8, 32);
-sadMxNx4D(8, 32);
-sadMxN(32, 8);
-sadMxNx4D(32, 8);
-sadMxN(16, 64);
-sadMxNx4D(16, 64);
-sadMxN(64, 16);
-sadMxNx4D(64, 16);
+SADMXN(4, 4)
+SAD_MXNX4D(4, 4)
+
+SAD_MXH(128)
+SAD_MXH(64)
+SAD_MXH(32)
+SAD_MXH(16)
+SAD_MXH(8)
+SAD_MXH(4)
+
+SADMXN(4, 16)
+SAD_MXNX4D(4, 16)
+SADMXN(16, 4)
+SAD_MXNX4D(16, 4)
+SADMXN(8, 32)
+SAD_MXNX4D(8, 32)
+SADMXN(32, 8)
+SAD_MXNX4D(32, 8)
+SADMXN(16, 64)
+SAD_MXNX4D(16, 64)
+SADMXN(64, 16)
+SAD_MXNX4D(64, 16)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
@@ -205,7 +242,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
   return sad;
 }
 
-#define highbd_sadMxN(m, n)                                                    \
+#define HIGHBD_SADMXN(m, n)                                                    \
   unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                            const uint8_t *ref,                 \
                                            int ref_stride) {                   \
@@ -227,9 +264,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
     aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref,      \
                                       ref_stride, jcp_param);                  \
     return highbd_sadb(src, src_stride, comp_pred8, m, m, n);                  \
+  }                                                                            \
+  unsigned int aom_highbd_sad_skip_##m##x##n##_c(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
   }
 
-#define highbd_sadMxNx4D(m, n)                                               \
+#define HIGHBD_SAD_MXNX4D(m, n)                                              \
   void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
                                       const uint8_t *const ref_array[],      \
                                       int ref_stride, uint32_t *sad_array) { \
@@ -238,82 +281,91 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
       sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
                                                  ref_array[i], ref_stride);  \
     }                                                                        \
+  }                                                                          \
+  void aom_highbd_sad_skip_##m##x##n##x4d_c(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
+      int ref_stride, uint32_t *sad_array) {                                 \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i) {                                                \
+      sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i],       \
+                                    2 * ref_stride, (m), (n / 2));           \
+    }                                                                        \
   }
 
 // 128x128
-highbd_sadMxN(128, 128);
-highbd_sadMxNx4D(128, 128);
+HIGHBD_SADMXN(128, 128)
+HIGHBD_SAD_MXNX4D(128, 128)
 
 // 128x64
-highbd_sadMxN(128, 64);
-highbd_sadMxNx4D(128, 64);
+HIGHBD_SADMXN(128, 64)
+HIGHBD_SAD_MXNX4D(128, 64)
 
 // 64x128
-highbd_sadMxN(64, 128);
-highbd_sadMxNx4D(64, 128);
+HIGHBD_SADMXN(64, 128)
+HIGHBD_SAD_MXNX4D(64, 128)
 
 // 64x64
-highbd_sadMxN(64, 64);
-highbd_sadMxNx4D(64, 64);
+HIGHBD_SADMXN(64, 64)
+HIGHBD_SAD_MXNX4D(64, 64)
 
 // 64x32
-highbd_sadMxN(64, 32);
-highbd_sadMxNx4D(64, 32);
+HIGHBD_SADMXN(64, 32)
+HIGHBD_SAD_MXNX4D(64, 32)
 
 // 32x64
-highbd_sadMxN(32, 64);
-highbd_sadMxNx4D(32, 64);
+HIGHBD_SADMXN(32, 64)
+HIGHBD_SAD_MXNX4D(32, 64)
 
 // 32x32
-highbd_sadMxN(32, 32);
-highbd_sadMxNx4D(32, 32);
+HIGHBD_SADMXN(32, 32)
+HIGHBD_SAD_MXNX4D(32, 32)
 
 // 32x16
-highbd_sadMxN(32, 16);
-highbd_sadMxNx4D(32, 16);
+HIGHBD_SADMXN(32, 16)
+HIGHBD_SAD_MXNX4D(32, 16)
 
 // 16x32
-highbd_sadMxN(16, 32);
-highbd_sadMxNx4D(16, 32);
+HIGHBD_SADMXN(16, 32)
+HIGHBD_SAD_MXNX4D(16, 32)
 
 // 16x16
-highbd_sadMxN(16, 16);
-highbd_sadMxNx4D(16, 16);
+HIGHBD_SADMXN(16, 16)
+HIGHBD_SAD_MXNX4D(16, 16)
 
 // 16x8
-highbd_sadMxN(16, 8);
-highbd_sadMxNx4D(16, 8);
+HIGHBD_SADMXN(16, 8)
+HIGHBD_SAD_MXNX4D(16, 8)
 
 // 8x16
-highbd_sadMxN(8, 16);
-highbd_sadMxNx4D(8, 16);
+HIGHBD_SADMXN(8, 16)
+HIGHBD_SAD_MXNX4D(8, 16)
 
 // 8x8
-highbd_sadMxN(8, 8);
-highbd_sadMxNx4D(8, 8);
+HIGHBD_SADMXN(8, 8)
+HIGHBD_SAD_MXNX4D(8, 8)
 
 // 8x4
-highbd_sadMxN(8, 4);
-highbd_sadMxNx4D(8, 4);
+HIGHBD_SADMXN(8, 4)
+HIGHBD_SAD_MXNX4D(8, 4)
 
 // 4x8
-highbd_sadMxN(4, 8);
-highbd_sadMxNx4D(4, 8);
+HIGHBD_SADMXN(4, 8)
+HIGHBD_SAD_MXNX4D(4, 8)
 
 // 4x4
-highbd_sadMxN(4, 4);
-highbd_sadMxNx4D(4, 4);
-
-highbd_sadMxN(4, 16);
-highbd_sadMxNx4D(4, 16);
-highbd_sadMxN(16, 4);
-highbd_sadMxNx4D(16, 4);
-highbd_sadMxN(8, 32);
-highbd_sadMxNx4D(8, 32);
-highbd_sadMxN(32, 8);
-highbd_sadMxNx4D(32, 8);
-highbd_sadMxN(16, 64);
-highbd_sadMxNx4D(16, 64);
-highbd_sadMxN(64, 16);
-highbd_sadMxNx4D(64, 16);
+HIGHBD_SADMXN(4, 4)
+HIGHBD_SAD_MXNX4D(4, 4)
+
+HIGHBD_SADMXN(4, 16)
+HIGHBD_SAD_MXNX4D(4, 16)
+HIGHBD_SADMXN(16, 4)
+HIGHBD_SAD_MXNX4D(16, 4)
+HIGHBD_SADMXN(8, 32)
+HIGHBD_SAD_MXNX4D(8, 32)
+HIGHBD_SADMXN(32, 8)
+HIGHBD_SAD_MXNX4D(32, 8)
+HIGHBD_SADMXN(16, 64)
+HIGHBD_SAD_MXNX4D(16, 64)
+HIGHBD_SADMXN(64, 16)
+HIGHBD_SAD_MXNX4D(64, 16)
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/aom_dsp/sad_av1.c b/media/libaom/src/aom_dsp/sad_av1.c
index 467518163b..f3d5847bd5 100644
--- a/media/libaom/src/aom_dsp/sad_av1.c
+++ b/media/libaom/src/aom_dsp/sad_av1.c
@@ -51,9 +51,9 @@ static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
                         msk_stride, m, n);                                     \
   }                                                                            \
   void aom_masked_sad##m##x##n##x4d_c(                                         \
-      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],               \
       int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
-      int msk_stride, int invert_mask, unsigned sads[]) {                      \
+      int msk_stride, int invert_mask, unsigned sads[4]) {                     \
     if (!invert_mask)                                                          \
       for (int i = 0; i < 4; i++) {                                            \
         sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
@@ -156,6 +156,7 @@ HIGHBD_MASKSADMXN(16, 64)
 HIGHBD_MASKSADMXN(64, 16)
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 // pre: predictor being evaluated
 // wsrc: target weighted prediction (has been *4096 to keep precision)
 // mask: 2d weights (scaled by 4096)
@@ -262,3 +263,4 @@ HIGHBD_OBMCSADMXN(16, 64)
 HIGHBD_OBMCSADMXN(64, 16)
 /* clang-format on */
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/media/libaom/src/aom_dsp/simd/v128_intrinsics_c.h b/media/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
index 466a41e107..f0480209c2 100644
--- a/media/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
+++ b/media/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
@@ -64,9 +64,9 @@ SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
 SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
   c_v128 t;
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
   int c;
-  for (c = 0; c < 16; c++) q[c] = pp[c];
+  // Note memcpy is avoided due to some versions of gcc issuing -Warray-bounds.
+  for (c = 0; c < 16; c++) t.u8[c] = pp[c];
   return t;
 }
 
@@ -80,9 +80,8 @@ SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
 
 SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
   int c;
-  for (c = 0; c < 16; c++) pp[c] = q[c];
+  for (c = 0; c < 16; c++) pp[c] = a.u8[c];
 }
 
 SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
diff --git a/media/libaom/src/aom_dsp/simd/v256_intrinsics_c.h b/media/libaom/src/aom_dsp/simd/v256_intrinsics_c.h
index 8127ee3566..dcfe33de1f 100644
--- a/media/libaom/src/aom_dsp/simd/v256_intrinsics_c.h
+++ b/media/libaom/src/aom_dsp/simd/v256_intrinsics_c.h
@@ -71,9 +71,9 @@ SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
 SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
   c_v256 t;
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
   int c;
-  for (c = 0; c < 32; c++) q[c] = pp[c];
+  // Note memcpy is avoided due to some versions of gcc issuing -Warray-bounds.
+  for (c = 0; c < 32; c++) t.u8[c] = pp[c];
   return t;
 }
 
@@ -87,9 +87,8 @@ SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
 
 SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
   uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
   int c;
-  for (c = 0; c < 32; c++) pp[c] = q[c];
+  for (c = 0; c < 32; c++) pp[c] = a.u8[c];
 }
 
 SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
diff --git a/media/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h b/media/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
index 5983cb80ce..eb5eaf0632 100644
--- a/media/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/media/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
@@ -664,15 +664,14 @@ SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
                   v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
 
 // _mm256_srli_si256 works on 128 bit lanes and can't be used
-#define v256_shr_n_byte(a, n)                                                \
-  ((n) < 16                                                                  \
-       ? _mm256_alignr_epi8(                                                 \
-             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
-       : ((n) == 16                                                          \
-              ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3)      \
-              : _mm256_inserti128_si256(                                     \
-                    _mm256_setzero_si256(),                                  \
-                    v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)))
+#define v256_shr_n_byte(a, n)                                                 \
+  ((n) < 16                                                                   \
+       ? _mm256_alignr_epi8(                                                  \
+             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
+       : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
+                    : _mm256_inserti128_si256(                                \
+                          _mm256_setzero_si256(),                             \
+                          v128_shr_n_byte(v256_high_v128(a), (n)-16), 0)))
 
 // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
 #define v256_align(a, b, c) \
diff --git a/media/libaom/src/aom_dsp/ssim.c b/media/libaom/src/aom_dsp/ssim.c
index 95b88887bc..35d493b038 100644
--- a/media/libaom/src/aom_dsp/ssim.c
+++ b/media/libaom/src/aom_dsp/ssim.c
@@ -16,8 +16,8 @@
 
 #include "aom_dsp/ssim.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
+#if CONFIG_INTERNAL_STATS
 void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                             uint32_t *sum_s, uint32_t *sum_r,
                             uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -33,6 +33,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
     }
   }
 }
+#endif  // CONFIG_INTERNAL_STATS
 
 void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
@@ -49,24 +50,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
   }
 }
 
-#if CONFIG_AV1_HIGHBITDEPTH
-void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
-                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
-                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                                 uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-#endif
-
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
 static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2
@@ -77,8 +60,8 @@ static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2
 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
+  double ssim_n, ssim_d;
+  int64_t c1 = 0, c2 = 0;
   if (bd == 8) {
     // scale the constants by number of pixels
     c1 = (cc1 * count * count) >> 12;
@@ -90,18 +73,19 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
     c1 = (cc1_12 * count * count) >> 12;
     c2 = (cc2_12 * count * count) >> 12;
   } else {
-    c1 = c2 = 0;
     assert(0);
+    // Return similarity as zero for unsupported bit-depth values.
+    return 0;
   }
 
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+  ssim_n = (2.0 * sum_s * sum_r + c1) *
+           (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
 
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+  ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+           ((double)count * sum_sq_s - (double)sum_s * sum_s +
+            (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
 
-  return ssim_n * 1.0 / ssim_d;
+  return ssim_n / ssim_d;
 }
 
 static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
@@ -111,21 +95,11 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
-static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
-                              int rp, uint32_t bd, uint32_t shift) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                            &sum_sxr);
-  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
-                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
-}
-
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
-                        int stride_img1, int stride_img2, int width,
-                        int height) {
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -143,30 +117,10 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
   return ssim_total;
 }
 
-static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
-                               int stride_img1, int stride_img2, int width,
-                               int height, uint32_t bd, uint32_t shift) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
-                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
-                                 shift);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight) {
+#if CONFIG_INTERNAL_STATS
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim) {
   double abc[3];
   for (int i = 0; i < 3; ++i) {
     const int is_uv = i > 0;
@@ -176,7 +130,7 @@ double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
   }
 
   *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  *fast_ssim = abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }
 
 // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
@@ -272,7 +226,6 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   int c = 0;
   double norm;
   double old_ssim_total = 0;
-  aom_clear_system_state();
   // We can sample points as frequently as we like start with 1 per 4x4.
   for (i = 0; i < height;
        i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
@@ -420,12 +373,62 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   m->dssim = dssim_total;
   return inconsistency_total;
 }
+#endif  // CONFIG_INTERNAL_STATS
 
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, uint32_t bd, uint32_t shift) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample point start with each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+                                 shift);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+#if CONFIG_INTERNAL_STATS
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                          const YV12_BUFFER_CONFIG *dest, double *weight,
+                          uint32_t bd, uint32_t in_bd, double *fast_ssim) {
   assert(bd >= in_bd);
-  const uint32_t shift = bd - in_bd;
+  uint32_t shift = bd - in_bd;
 
   double abc[3];
   for (int i = 0; i < 3; ++i) {
@@ -436,6 +439,43 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                               source->crop_heights[is_uv], in_bd, shift);
   }
 
-  *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  weight[0] = 1;
+  fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+
+  if (bd > in_bd) {
+    // Compute SSIM based on stream bit depth
+    shift = 0;
+    for (int i = 0; i < 3; ++i) {
+      const int is_uv = i > 0;
+      abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+                                source->strides[is_uv], dest->strides[is_uv],
+                                source->crop_widths[is_uv],
+                                source->crop_heights[is_uv], bd, shift);
+    }
+
+    weight[1] = 1;
+    fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+  }
+}
+#endif  // CONFIG_INTERNAL_STATS
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if CONFIG_INTERNAL_STATS
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_hbd) {
+    aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+                         frame_ssim2);
+    return;
+  }
+#else
+  (void)bit_depth;
+  (void)in_bit_depth;
+  (void)is_hbd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2);
 }
+#endif  // CONFIG_INTERNAL_STATS
diff --git a/media/libaom/src/aom_dsp/ssim.h b/media/libaom/src/aom_dsp/ssim.h
index 55038f4c22..fb92556a8c 100644
--- a/media/libaom/src/aom_dsp/ssim.h
+++ b/media/libaom/src/aom_dsp/ssim.h
@@ -12,14 +12,13 @@
 #ifndef AOM_AOM_DSP_SSIM_H_
 #define AOM_AOM_DSP_SSIM_H_
 
-#define MAX_SSIM_DB 100.0;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #include "config/aom_config.h"
 
+#if CONFIG_INTERNAL_STATS
 #include "aom_scale/yv12config.h"
 
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
@@ -68,17 +67,35 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
                             int img2_pitch, int width, int height, Ssimv *sv2,
                             Metrics *m, int do_inconsistency);
 
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight);
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim);
 
 double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd);
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                          const YV12_BUFFER_CONFIG *dest, double *weight,
+                          uint32_t bd, uint32_t in_bd, double *fast_ssim);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2);
+#endif  // CONFIG_INTERNAL_STATS
+
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/aom_dsp/subtract.c b/media/libaom/src/aom_dsp/subtract.c
index 4f4e355974..4f47e553d4 100644
--- a/media/libaom/src/aom_dsp/subtract.c
+++ b/media/libaom/src/aom_dsp/subtract.c
@@ -36,11 +36,10 @@ void aom_subtract_block_c(int rows, int cols, int16_t *diff,
 void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
-                                 ptrdiff_t pred_stride, int bd) {
+                                 ptrdiff_t pred_stride) {
   int r, c;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  (void)bd;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++) {
diff --git a/media/libaom/src/aom_dsp/sum_squares.c b/media/libaom/src/aom_dsp/sum_squares.c
index d739a60833..f58defaa11 100644
--- a/media/libaom/src/aom_dsp/sum_squares.c
+++ b/media/libaom/src/aom_dsp/sum_squares.c
@@ -71,3 +71,20 @@ uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
 
   return (ss - s * s / (width * height));
 }
+
+uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width,
+                              int height, int *sum) {
+  int r, c;
+  int16_t *srcp = (int16_t *)src;
+  int64_t ss = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const int16_t v = srcp[c];
+      ss += v * v;
+      *sum += v;
+    }
+    srcp += src_stride;
+  }
+  return ss;
+}
diff --git a/media/libaom/src/aom_dsp/txfm_common.h b/media/libaom/src/aom_dsp/txfm_common.h
index f13d69092d..67d9e90ca9 100644
--- a/media/libaom/src/aom_dsp/txfm_common.h
+++ b/media/libaom/src/aom_dsp/txfm_common.h
@@ -13,7 +13,6 @@
 #define AOM_AOM_DSP_TXFM_COMMON_H_
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
 
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
@@ -22,6 +21,71 @@
 #define UNIT_QUANT_SHIFT 2
 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
 
+// block transform size
+enum {
+  TX_4X4,             // 4x4 transform
+  TX_8X8,             // 8x8 transform
+  TX_16X16,           // 16x16 transform
+  TX_32X32,           // 32x32 transform
+  TX_64X64,           // 64x64 transform
+  TX_4X8,             // 4x8 transform
+  TX_8X4,             // 8x4 transform
+  TX_8X16,            // 8x16 transform
+  TX_16X8,            // 16x8 transform
+  TX_16X32,           // 16x32 transform
+  TX_32X16,           // 32x16 transform
+  TX_32X64,           // 32x64 transform
+  TX_64X32,           // 64x32 transform
+  TX_4X16,            // 4x16 transform
+  TX_16X4,            // 16x4 transform
+  TX_8X32,            // 8x32 transform
+  TX_32X8,            // 32x8 transform
+  TX_16X64,           // 16x64 transform
+  TX_64X16,           // 64x16 transform
+  TX_SIZES_ALL,       // Includes rectangular transforms
+  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
+  TX_SIZES_LARGEST = TX_64X64,
+  TX_INVALID = 255  // Invalid transform size
+} UENUM1BYTE(TX_SIZE);
+
+enum {
+  DCT_DCT,            // DCT in both horizontal and vertical
+  ADST_DCT,           // ADST in vertical, DCT in horizontal
+  DCT_ADST,           // DCT in vertical, ADST in horizontal
+  ADST_ADST,          // ADST in both directions
+  FLIPADST_DCT,       // FLIPADST in vertical, DCT in horizontal
+  DCT_FLIPADST,       // DCT in vertical, FLIPADST in horizontal
+  FLIPADST_FLIPADST,  // FLIPADST in both directions
+  ADST_FLIPADST,      // ADST in vertical, FLIPADST in horizontal
+  FLIPADST_ADST,      // FLIPADST in vertical, ADST in horizontal
+  IDTX,               // Identity in both directions
+  V_DCT,              // DCT in vertical, identity in horizontal
+  H_DCT,              // Identity in vertical, DCT in horizontal
+  V_ADST,             // ADST in vertical, identity in horizontal
+  H_ADST,             // Identity in vertical, ADST in horizontal
+  V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
+  H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
+  TX_TYPES,
+  DCT_ADST_TX_MASK = 0x000F,  // Either DCT or ADST in each direction
+  TX_TYPE_INVALID = 255,      // Invalid transform type
+} UENUM1BYTE(TX_TYPE);
+
+enum {
+  // DCT only
+  EXT_TX_SET_DCTONLY,
+  // DCT + Identity only
+  EXT_TX_SET_DCT_IDTX,
+  // Discrete Trig transforms w/o flip (4) + Identity (1)
+  EXT_TX_SET_DTT4_IDTX,
+  // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+  EXT_TX_SET_DTT4_IDTX_1DDCT,
+  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+  EXT_TX_SET_DTT9_IDTX_1DDCT,
+  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+  EXT_TX_SET_ALL16,
+  EXT_TX_SET_TYPES
+} UENUM1BYTE(TxSetType);
+
 typedef struct txfm_param {
   // for both forward and inverse transforms
   TX_TYPE tx_type;
diff --git a/media/libaom/src/aom_dsp/variance.c b/media/libaom/src/aom_dsp/variance.c
index 695f12a524..d7641607f4 100644
--- a/media/libaom/src/aom_dsp/variance.c
+++ b/media/libaom/src/aom_dsp/variance.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -23,10 +22,8 @@
 #include "aom_dsp/blend.h"
 #include "aom_dsp/variance.h"
 
-#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
 #include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
@@ -212,6 +209,16 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
     variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
   }
 
+void aom_get_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
+                                const uint8_t *b, int b_stride, uint32_t *sse,
+                                int *sum) {
+  // Loop over 4 8x8 blocks. Process one 8x32 block.
+  for (int k = 0; k < 4; k++) {
+    variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse[k],
+             &sum[k]);
+  }
+}
+
 /* Identical to the variance call except it does not calculate the
  * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
  * variable.
@@ -250,12 +257,16 @@ VARIANCES(4, 4)
 VARIANCES(4, 2)
 VARIANCES(2, 4)
 VARIANCES(2, 2)
+
+// Realtime mode doesn't use rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 VARIANCES(4, 16)
 VARIANCES(16, 4)
 VARIANCES(8, 32)
 VARIANCES(32, 8)
 VARIANCES(16, 64)
 VARIANCES(64, 16)
+#endif
 
 GET_VAR(16, 16)
 GET_VAR(8, 8)
@@ -280,100 +291,6 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
   }
 }
 
-// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                          int mi_row, int mi_col, const MV *const mv,
-                          uint8_t *comp_pred, int width, int height,
-                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                          int ref_stride, int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                   int mi_row, int mi_col, const MV *const mv,
-                                   uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, int subpel_x_q3,
-                                   int subpel_y_q3, const uint8_t *ref,
-                                   int ref_stride, int subpel_search) {
-  int i, j;
-
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, const uint8_t *ref,
                                   int ref_stride,
@@ -394,30 +311,6 @@ void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
   }
 }
 
-void aom_dist_wtd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 #if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
@@ -789,12 +682,16 @@ HIGHBD_VARIANCES(4, 4)
 HIGHBD_VARIANCES(4, 2)
 HIGHBD_VARIANCES(2, 4)
 HIGHBD_VARIANCES(2, 2)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_VARIANCES(4, 16)
 HIGHBD_VARIANCES(16, 4)
 HIGHBD_VARIANCES(8, 32)
 HIGHBD_VARIANCES(32, 8)
 HIGHBD_VARIANCES(16, 64)
 HIGHBD_VARIANCES(64, 16)
+#endif
 
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
@@ -822,107 +719,6 @@ void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
-                                 const struct AV1Common *const cm, int mi_row,
-                                 int mi_col, const MV *const mv,
-                                 uint8_t *comp_pred8, int width, int height,
-                                 int subpel_x_q3, int subpel_y_q3,
-                                 const uint8_t *ref8, int ref_stride, int bd,
-                                 int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
-                                 16, NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                                kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
-                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                 intermediate_height, bd);
-    aom_highbd_convolve8_vert_c(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
-        bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  int i, j;
-
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 void aom_highbd_dist_wtd_comp_avg_pred_c(
     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
     const uint8_t *ref8, int ref_stride,
@@ -945,32 +741,6 @@ void aom_highbd_dist_wtd_comp_avg_pred_c(
     ref += ref_stride;
   }
 }
-
-void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                              height, subpel_x_q3, subpel_y_q3, ref8,
-                              ref_stride, bd, subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint16_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
@@ -993,25 +763,6 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
   }
 }
 
-void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask,
-                                    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
-                       mask_stride, invert_mask);
-}
-
 #define MASK_SUBPIX_VAR(W, H)                                                  \
   unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
@@ -1048,12 +799,16 @@ MASK_SUBPIX_VAR(64, 64)
 MASK_SUBPIX_VAR(64, 128)
 MASK_SUBPIX_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 128)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
 MASK_SUBPIX_VAR(4, 16)
 MASK_SUBPIX_VAR(16, 4)
 MASK_SUBPIX_VAR(8, 32)
 MASK_SUBPIX_VAR(32, 8)
 MASK_SUBPIX_VAR(16, 64)
 MASK_SUBPIX_VAR(64, 16)
+#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
@@ -1078,19 +833,6 @@ void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
-                            mask, mask_stride, invert_mask);
-}
-
 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
   unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
@@ -1174,14 +916,17 @@ HIGHBD_MASK_SUBPIX_VAR(64, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#if !CONFIG_REALTIME_ONLY
 HIGHBD_MASK_SUBPIX_VAR(4, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#endif
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
                                  int w, int h, unsigned int *sse, int *sum) {
@@ -1481,3 +1226,28 @@ HIGHBD_OBMC_SUBPIX_VAR(16, 64)
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
+
+uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
+                             int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      int e = dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
diff --git a/media/libaom/src/aom_dsp/variance.h b/media/libaom/src/aom_dsp/variance.h
index 4550c17b34..dae419763a 100644
--- a/media/libaom/src/aom_dsp/variance.h
+++ b/media/libaom/src/aom_dsp/variance.h
@@ -69,13 +69,6 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search);
-
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
@@ -90,11 +83,15 @@ typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
 
 typedef struct aom_variance_vtable {
   aom_sad_fn_t sdf;
+  // Same as normal sad, but downsample the rows by a factor of 2.
+  aom_sad_fn_t sdsf;
   aom_sad_avg_fn_t sdaf;
   aom_variance_fn_t vf;
   aom_subpixvariance_fn_t svf;
   aom_subp_avg_variance_fn_t svaf;
   aom_sad_multi_d_fn_t sdx4df;
+  // Same as sadx4, but downsample the rows by a factor of 2.
+  aom_sad_multi_d_fn_t sdsx4df;
   aom_masked_sad_fn_t msdf;
   aom_masked_subpixvariance_fn_t msvf;
   aom_obmc_sad_fn_t osdf;
diff --git a/media/libaom/src/aom_dsp/vmaf.c b/media/libaom/src/aom_dsp/vmaf.c
index 3a012e7680..a40e00cb23 100644
--- a/media/libaom/src/aom_dsp/vmaf.c
+++ b/media/libaom/src/aom_dsp/vmaf.c
@@ -9,151 +9,184 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom_dsp/vmaf.h"
+
 #include <assert.h>
-#include <libvmaf/libvmaf.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef _WIN32
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
 
 #include "aom_dsp/blend.h"
-#include "aom_dsp/vmaf.h"
-#include "aom_ports/system_state.h"
-
-typedef struct FrameData {
-  const YV12_BUFFER_CONFIG *source;
-  const YV12_BUFFER_CONFIG *distorted;
-  int frame_set;
-  int bit_depth;
-} FrameData;
 
 static void vmaf_fatal_error(const char *message) {
   fprintf(stderr, "Fatal error: %s\n", message);
   exit(EXIT_FAILURE);
 }
 
-// A callback function used to pass data to VMAF.
-// Returns 0 after reading a frame.
-// Returns 2 when there is no more frame to read.
-static int read_frame(float *ref_data, float *main_data, float *temp_data,
-                      int stride, void *user_data) {
-  FrameData *frames = (FrameData *)user_data;
-
-  if (!frames->frame_set) {
-    const int width = frames->source->y_width;
-    const int height = frames->source->y_height;
-    assert(width == frames->distorted->y_width);
-    assert(height == frames->distorted->y_height);
-
-    if (frames->bit_depth > 8) {
-      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
-      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
-      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = scale_factor * (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
-      }
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = scale_factor * (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
-      }
-    } else {
-      uint8_t *ref_ptr = frames->source->y_buffer;
-      uint8_t *main_ptr = frames->distorted->y_buffer;
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
-      }
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
-      }
-    }
-    frames->frame_set = 1;
-    return 0;
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) {
+  if (*vmaf_model != NULL) return;
+  VmafModelConfig model_cfg;
+  model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP;
+  model_cfg.name = "vmaf";
+
+  if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) {
+    vmaf_fatal_error("Failed to load VMAF model.");
   }
+}
 
-  (void)temp_data;
-  return 2;
+void aom_close_vmaf_model(VmafModel *vmaf_model) {
+  vmaf_model_destroy(vmaf_model);
 }
 
-void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *distorted, const int bit_depth,
-                   double *const vmaf) {
-  aom_clear_system_state();
-  const int width = source->y_width;
-  const int height = source->y_height;
-  FrameData frames = { source, distorted, 0, bit_depth };
-  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
-  double vmaf_score;
-  const int ret =
-      compute_vmaf(&vmaf_score, fmt, width, height, read_frame,
-                   /*user_data=*/&frames, (char *)model_path,
-                   /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1,
-                   /*disable_avx=*/0, /*enable_transform=*/0,
-                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
-                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
-                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
-  if (ret) vmaf_fatal_error("Failed to compute VMAF scores.");
-
-  aom_clear_system_state();
-  *vmaf = vmaf_score;
+static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src,
+                         VmafPicture *dst) {
+  const int width = src->y_width;
+  const int height = src->y_height;
+
+  if (bit_depth > 8) {
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src->y_buffer);
+    uint16_t *dst_ptr = dst->data[0];
+
+    for (int row = 0; row < height; ++row) {
+      memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+      src_ptr += src->y_stride;
+      dst_ptr += dst->stride[0] / 2;
+    }
+  } else {
+    uint8_t *src_ptr = src->y_buffer;
+    uint8_t *dst_ptr = (uint8_t *)dst->data[0];
+
+    for (int row = 0; row < height; ++row) {
+      memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+      src_ptr += src->y_stride;
+      dst_ptr += dst->stride[0];
+    }
+  }
 }
 
-void aom_calc_vmaf_multi_frame(
-    void *user_data, const char *model_path,
-    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
-                      int stride_byte, void *user_data),
-    int frame_width, int frame_height, int bit_depth, double *vmaf) {
-  aom_clear_system_state();
-
-  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
-  double vmaf_score;
-  const int ret = compute_vmaf(
-      &vmaf_score, fmt, frame_width, frame_height, read_frame,
-      /*user_data=*/user_data, (char *)model_path,
-      /*log_path=*/"vmaf_scores.xml", /*log_fmt=*/NULL, /*disable_clip=*/0,
-      /*disable_avx=*/0, /*enable_transform=*/0,
-      /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
-      /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
-      /*n_subsample=*/1, /*enable_conf_interval=*/0);
-  FILE *vmaf_log = fopen("vmaf_scores.xml", "r");
-  if (vmaf_log == NULL || ret) {
-    vmaf_fatal_error("Failed to compute VMAF scores.");
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                           bool cal_vmaf_neg) {
+  // TODO(sdeng): make them CLI arguments.
+  VmafConfiguration cfg;
+  cfg.log_level = VMAF_LOG_LEVEL_NONE;
+  cfg.n_threads = 0;
+  cfg.n_subsample = 0;
+  cfg.cpumask = 0;
+
+  if (vmaf_init(vmaf_context, cfg)) {
+    vmaf_fatal_error("Failed to init VMAF context.");
   }
 
-  int frame_index = 0;
-  char buf[512];
-  while (fgets(buf, 511, vmaf_log) != NULL) {
-    if (memcmp(buf, "\t\t<frame ", 9) == 0) {
-      char *p = strstr(buf, "vmaf=");
-      if (p != NULL && p[5] == '"') {
-        char *p2 = strstr(&p[6], "\"");
-        *p2 = '\0';
-        const double score = atof(&p[6]);
-        if (score < 0.0 || score > 100.0) {
-          vmaf_fatal_error("Failed to compute VMAF scores.");
-        }
-        vmaf[frame_index++] = score;
-      }
+  if (cal_vmaf_neg) {
+    VmafFeatureDictionary *vif_feature = NULL;
+    if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set vif_enhn_gain_limit.");
+    }
+    if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) {
+      vmaf_fatal_error("Failed to use feature float_vif.");
+    }
+
+    VmafFeatureDictionary *adm_feature = NULL;
+    if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set adm_enhn_gain_limit.");
+    }
+    if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) {
+      vmaf_fatal_error("Failed to use feature float_adm.");
     }
   }
-  fclose(vmaf_log);
 
-  aom_clear_system_state();
+  VmafFeatureDictionary *motion_force_zero = NULL;
+  if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero",
+                                  "1")) {
+    vmaf_fatal_error("Failed to set motion_force_zero.");
+  }
+  if (vmaf_model_feature_overload(vmaf_model, "float_motion",
+                                  motion_force_zero)) {
+    vmaf_fatal_error("Failed to use feature float_motion.");
+  }
+
+  if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
+    vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
+  }
+}
+
+void aom_close_vmaf_context(VmafContext *vmaf_context) {
+  if (vmaf_close(vmaf_context)) {
+    vmaf_fatal_error("Failed to close VMAF context.");
+  }
+}
+
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   bool cal_vmaf_neg, double *vmaf) {
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg);
+  const int frame_index = 0;
+  VmafPicture ref, dist;
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height) ||
+      vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+  if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+                         /*picture index=*/frame_index)) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+
+  vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index);
+  aom_close_vmaf_context(vmaf_context);
+}
+
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+                         const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         int frame_index) {
+  VmafPicture ref, dist;
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height) ||
+      vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+  if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+                         /*picture index=*/frame_index)) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+}
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                              int frame_index) {
+  double vmaf;
+  if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) {
+    vmaf_fatal_error("Failed to calc VMAF scores.");
+  }
+  return vmaf;
+}
+
+void aom_flush_vmaf_context(VmafContext *vmaf_context) {
+  if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
 }
diff --git a/media/libaom/src/aom_dsp/vmaf.h b/media/libaom/src/aom_dsp/vmaf.h
index fb8bf4613a..b539cf8b76 100644
--- a/media/libaom/src/aom_dsp/vmaf.h
+++ b/media/libaom/src/aom_dsp/vmaf.h
@@ -12,16 +12,30 @@
 #ifndef AOM_AOM_DSP_VMAF_H_
 #define AOM_AOM_DSP_VMAF_H_
 
+#include <libvmaf/libvmaf.h>
+#include <stdbool.h>
+
 #include "aom_scale/yv12config.h"
 
-void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                           bool cal_vmaf_neg);
+void aom_close_vmaf_context(VmafContext *vmaf_context);
+
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path);
+void aom_close_vmaf_model(VmafModel *vmaf_model);
+
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
                    const YV12_BUFFER_CONFIG *distorted, int bit_depth,
-                   double *vmaf);
+                   bool cal_vmaf_neg, double *vmaf);
+
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+                         const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         int frame_index);
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                              int frame_index);
 
-void aom_calc_vmaf_multi_frame(
-    void *user_data, const char *model_path,
-    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
-                      int stride_byte, void *user_data),
-    int frame_width, int frame_height, int bit_depth, double *vmaf);
+void aom_flush_vmaf_context(VmafContext *vmaf_context);
 
 #endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
index e33dff20c2..b3dede75d5 100644
--- a/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -12,7 +12,7 @@
 #include <immintrin.h>
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c
index 584cd671f1..503b9b4682 100644
--- a/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -13,7 +13,7 @@
 #include <emmintrin.h>
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
 void aom_quantize_b_adaptive_sse2(
diff --git a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c
index ce8285e43d..b08ec2546b 100644
--- a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c
+++ b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c
@@ -46,8 +46,8 @@ filter8_1dfunction aom_filter_block1d4_h2_sse2;
 //                              const int16_t *filter_x, int x_step_q4,
 //                              const int16_t *filter_y, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
@@ -89,7 +89,7 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
 //                                     const int16_t *filter_y,
 //                                     int y_step_q4,
 //                                     int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
 #endif
 #endif  // HAVE_SSE2
diff --git a/media/libaom/src/aom_dsp/x86/aom_convolve_copy_avx2.c b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_avx2.c
new file mode 100644
index 0000000000..a1043828fe
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -0,0 +1,256 @@
+/*
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+  __m256i s[8];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    assert(w == 128);
+    do {
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm
deleted file mode 100644
index 7283c32b8c..0000000000
--- a/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                              dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                           dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
-%endif
-  mov r4d, dword wm
-%ifidn %2, highbd
-  shl r4d, 1
-  shl srcq, 1
-  shl src_strideq, 1
-  shl dstq, 1
-  shl dst_strideq, 1
-%else
-  cmp r4d, 4
-  je .w4
-%endif
-  cmp r4d, 8
-  je .w8
-  cmp r4d, 16
-  je .w16
-  cmp r4d, 32
-  je .w32
-
-  cmp r4d, 64
-  je .w64
-%ifidn %2, highbd
-  cmp r4d, 128
-  je .w128
-
-.w256:
-  mov                    r4d, dword hm
-.loop256:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  movu                    m0, [srcq+128]
-  movu                    m1, [srcq+128+16]
-  movu                    m2, [srcq+128+32]
-  movu                    m3, [srcq+128+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq+128]
-  pavg                    m1, [dstq+128+16]
-  pavg                    m2, [dstq+128+32]
-  pavg                    m3, [dstq+128+48]
-%endif
-  mova         [dstq+128   ], m0
-  mova         [dstq+128+16], m1
-  mova         [dstq+128+32], m2
-  mova         [dstq+128+48], m3
-  movu                    m0, [srcq+128+64]
-  movu                    m1, [srcq+128+80]
-  movu                    m2, [srcq+128+96]
-  movu                    m3, [srcq+128+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+128+64]
-  pavg                    m1, [dstq+128+80]
-  pavg                    m2, [dstq+128+96]
-  pavg                    m3, [dstq+128+112]
-%endif
-  mova         [dstq+128+64], m0
-  mova         [dstq+128+80], m1
-  mova         [dstq+128+96], m2
-  mova        [dstq+128+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop256
-  RET
-%endif
-
-.w128:
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-
-.w64:
-  mov                    r4d, dword hm
-.loop64:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop64
-  RET
-
-.w32:
-  mov                    r4d, dword hm
-.loop32:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+src_strideq]
-  movu                    m3, [srcq+src_strideq+16]
-  lea                   srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq            +16]
-  pavg                    m2, [dstq+dst_strideq]
-  pavg                    m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq+dst_strideq   ], m2
-  mova [dstq+dst_strideq+16], m3
-  lea                   dstq, [dstq+dst_strideq*2]
-  sub                    r4d, 2
-  jnz .loop32
-  RET
-
-.w16:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop16:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
-%endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop16
-  RET
-
-.w8:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop8:
-  movh                    m0, [srcq]
-  movh                    m1, [srcq+src_strideq]
-  movh                    m2, [srcq+src_strideq*2]
-  movh                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movh                    m4, [dstq]
-  movh                    m5, [dstq+dst_strideq]
-  movh                    m6, [dstq+dst_strideq*2]
-  movh                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movh  [dstq              ], m0
-  movh  [dstq+dst_strideq  ], m1
-  movh  [dstq+dst_strideq*2], m2
-  movh  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop8
-  RET
-
-%ifnidn %2, highbd
-.w4:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop4:
-  movd                    m0, [srcq]
-  movd                    m1, [srcq+src_strideq]
-  movd                    m2, [srcq+src_strideq*2]
-  movd                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movd                    m4, [dstq]
-  movd                    m5, [dstq+dst_strideq]
-  movd                    m6, [dstq+dst_strideq*2]
-  movd                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movd  [dstq              ], m0
-  movd  [dstq+dst_strideq  ], m1
-  movd  [dstq+dst_strideq*2], m2
-  movd  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop4
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-convolve_fn copy, highbd
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.c
index f758775eef..f7b468a229 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -1,21 +1,146 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *  Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <emmintrin.h>
-#include <assert.h>
+
+#include <immintrin.h>
 
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_dsp/aom_filter.h"
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m128i s[8];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
 
-static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
   __m128i s[8];
   s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
   s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
@@ -35,7 +160,7 @@ static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
   _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
 }
 
-static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
   __m128i s[16];
   s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
   s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
@@ -71,17 +196,9 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
   _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
 }
 
-void av1_highbd_convolve_2d_copy_sr_sse2(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
+void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
   if (w >= 16) {
     assert(!((intptr_t)dst % 16));
     assert(!(dst_stride % 16));
@@ -169,20 +286,20 @@ void av1_highbd_convolve_2d_copy_sr_sse2(
     } while (h);
   } else if (w == 64) {
     do {
-      copy_64(src, dst);
+      highbd_copy_64(src, dst);
       src += src_stride;
       dst += dst_stride;
-      copy_64(src, dst);
+      highbd_copy_64(src, dst);
       src += src_stride;
       dst += dst_stride;
       h -= 2;
     } while (h);
   } else {
     do {
-      copy_128(src, dst);
+      highbd_copy_128(src, dst);
       src += src_stride;
       dst += dst_stride;
-      copy_128(src, dst);
+      highbd_copy_128(src, dst);
       src += src_stride;
       dst += dst_stride;
       h -= 2;
diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index b6f040791a..d392225906 100644
--- a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -211,7 +211,7 @@ SECTION .text
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_v8_sse2)
 sym(aom_highbd_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -281,7 +281,7 @@ sym(aom_highbd_filter_block1d4_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_v8_sse2)
 sym(aom_highbd_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -340,7 +340,7 @@ sym(aom_highbd_filter_block1d8_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_v8_sse2)
 sym(aom_highbd_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -403,7 +403,7 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_h8_sse2)
 sym(aom_highbd_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -478,7 +478,7 @@ sym(aom_highbd_filter_block1d4_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_h8_sse2)
 sym(aom_highbd_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -544,7 +544,7 @@ sym(aom_highbd_filter_block1d8_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_h8_sse2)
 sym(aom_highbd_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index a7152be57c..db4cad9bcb 100644
--- a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -177,7 +177,7 @@
 
 SECTION .text
 
-global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_v2_sse2)
 sym(aom_highbd_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -201,7 +201,7 @@ sym(aom_highbd_filter_block1d4_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_v2_sse2)
 sym(aom_highbd_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -235,7 +235,7 @@ sym(aom_highbd_filter_block1d8_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_v2_sse2)
 sym(aom_highbd_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -271,7 +271,7 @@ sym(aom_highbd_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d4_h2_sse2)
 sym(aom_highbd_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -296,7 +296,7 @@ sym(aom_highbd_filter_block1d4_h2_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d8_h2_sse2)
 sym(aom_highbd_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -330,7 +330,7 @@ sym(aom_highbd_filter_block1d8_h2_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
+globalsym(aom_highbd_filter_block1d16_h2_sse2)
 sym(aom_highbd_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/aom_quantize_avx.c b/media/libaom/src/aom_dsp/x86/aom_quantize_avx.c
new file mode 100644
index 0000000000..b2d6d4b76d
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/aom_quantize_avx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               tran_low_t *dqcoeff) {
+  const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+  const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+
+  *eob_ptr = 0;
+
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+    if (n_coeffs == 16) return;
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < n_coeffs; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+      continue;
+    }
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+  const int log_scale = 1;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+
+  // Setup global values.
+  // The 32x32 halves zbin and round.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, one);
+  zbin = _mm_srli_epi16(zbin, 1);
+  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+  // it is a strict "greater" comparison.
+  zbin = _mm_sub_epi16(zbin, one);
+
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  round = _mm_add_epi16(round, one);
+  round = _mm_srli_epi16(round, 1);
+
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+                                          &log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + 8, &log_scale);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < n_coeffs; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+      continue;
+    }
+
+    calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+    calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+                                          dqcoeff_ptr + index, &log_scale);
+    calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+                                          dqcoeff_ptr + index + 8, &log_scale);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 94b5da1714..d8d353c066 100644
--- a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -1435,7 +1435,7 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
 
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index f64b821ea4..8a18279070 100644
--- a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -15,6 +15,10 @@
 
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
@@ -742,6 +746,340 @@ static void aom_filter_block1d16_v4_ssse3(
   }
 }
 
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+    const __m128i *const s, const int16_t *const filter) {
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
+  return convolve8_8_ssse3(s, f);
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const x_filter) {
+  __m128i s[8], ss[4], temp;
+
+  load_8bit_8x8(src, src_stride, s);
+  // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+  // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+  // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+  // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+  transpose_16bit_4x8(s, ss);
+  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[8];
+
+  load_8bit_8x8(src, src_stride, s);
+  transpose_8bit_8x8(s, s);
+  store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = h + (8 - (h & 0x7));
+
+  do {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 8) {
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+    }
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter) {
+  __m128i s[4];
+  __m128i temp;
+
+  load_8bit_8x4(src, src_stride, s);
+  transpose_16bit_4x4(s, s);
+
+  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[4];
+
+  load_8bit_4x4(src, src_stride, s);
+  s[0] = transpose_8bit_4x4(s);
+  s[1] = _mm_srli_si128(s[0], 4);
+  s[2] = _mm_srli_si128(s[0], 8);
+  s[3] = _mm_srli_si128(s[0], 12);
+  store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; y += 4) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 4) {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+    }
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+                                  const int16_t *const filter) {
+  __m128i ss[4];
+  __m128i temp;
+
+  // 00 10 01 11 02 12 03 13
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  // 20 30 21 31 22 32 23 33
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  // 40 50 41 51 42 52 43 53
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  // 60 70 61 71 62 72 63 73
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+  // shrink to 8 bit each 16 bits
+  return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8];
+  __m128i temp;
+
+  load_8bit_4x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8], temp;
+
+  load_8bit_8x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter, const int w) {
+  int i;
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
+
+  for (i = 0; i < w; i += 16) {
+    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+    loadu_8bit_16x8(src, src_stride, s);
+
+    // merge the result together
+    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+    temp_lo = convolve8_8_ssse3(s_lo, f);
+    temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first convolve
+    // result and the second lane contain the second convolve result
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src += 16;
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i *)&dst[i], temp_hi);
+  }
+}
+
+static void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+                            w);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  }
+}
+
 filter8_1dfunction aom_filter_block1d16_v8_ssse3;
 filter8_1dfunction aom_filter_block1d16_h8_ssse3;
 filter8_1dfunction aom_filter_block1d8_v8_ssse3;
@@ -766,5 +1104,5 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
index c88fc9ffb2..640c5b2416 100644
--- a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -190,7 +190,7 @@ SECTION .text
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d4_v8_sse2)
 sym(aom_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -257,7 +257,7 @@ sym(aom_filter_block1d4_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d8_v8_sse2)
 sym(aom_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -316,7 +316,7 @@ sym(aom_filter_block1d8_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+globalsym(aom_filter_block1d16_v8_sse2)
 sym(aom_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -379,7 +379,7 @@ sym(aom_filter_block1d16_v8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d4_h8_sse2)
 sym(aom_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -453,7 +453,7 @@ sym(aom_filter_block1d4_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d8_h8_sse2)
 sym(aom_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -528,7 +528,7 @@ sym(aom_filter_block1d8_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+globalsym(aom_filter_block1d16_h8_sse2)
 sym(aom_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
index d0b4b2839e..90dd55a4be 100644
--- a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -136,7 +136,7 @@
 
 SECTION .text
 
-global sym(aom_filter_block1d4_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d4_v2_sse2)
 sym(aom_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -160,7 +160,7 @@ sym(aom_filter_block1d4_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d8_v2_sse2)
 sym(aom_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -186,7 +186,7 @@ sym(aom_filter_block1d8_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_v2_sse2) PRIVATE
+globalsym(aom_filter_block1d16_v2_sse2)
 sym(aom_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -214,7 +214,7 @@ sym(aom_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d4_h2_sse2)
 sym(aom_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -239,7 +239,7 @@ sym(aom_filter_block1d4_h2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d8_h2_sse2)
 sym(aom_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -266,7 +266,7 @@ sym(aom_filter_block1d8_h2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_h2_sse2) PRIVATE
+globalsym(aom_filter_block1d16_h2_sse2)
 sym(aom_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index 59edc49a9b..253bc26d38 100644
--- a/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -110,7 +110,7 @@
 
 SECTION .text
 
-global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d4_v2_ssse3)
 sym(aom_filter_block1d4_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -134,7 +134,7 @@ sym(aom_filter_block1d4_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d8_v2_ssse3)
 sym(aom_filter_block1d8_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -160,7 +160,7 @@ sym(aom_filter_block1d8_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
+globalsym(aom_filter_block1d16_v2_ssse3)
 sym(aom_filter_block1d16_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -187,7 +187,7 @@ sym(aom_filter_block1d16_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d4_h2_ssse3)
 sym(aom_filter_block1d4_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -212,7 +212,7 @@ sym(aom_filter_block1d4_h2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d8_h2_ssse3)
 sym(aom_filter_block1d8_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -239,7 +239,7 @@ sym(aom_filter_block1d8_h2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+globalsym(aom_filter_block1d16_h2_ssse3)
 sym(aom_filter_block1d16_h2_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
index 3bbffbd805..4039759263 100644
--- a/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
@@ -92,7 +92,7 @@ static void hadamard_col8x2_avx2(__m256i *in, int iter) {
   }
 }
 
-static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+void aom_hadamard_8x8_dual_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                 int16_t *coeff) {
   __m256i src[8];
   src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
@@ -141,7 +141,7 @@ static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
   int idx;
   for (idx = 0; idx < 2; ++idx) {
     const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
-    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+    aom_hadamard_8x8_dual_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
   }
 
   for (idx = 0; idx < 64; idx += 16) {
@@ -186,7 +186,7 @@ void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   int16_t *t_coeff = coeff;
   for (int idx = 0; idx < 2; ++idx) {
     const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
-    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+    aom_hadamard_8x8_dual_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
   }
 
   for (int idx = 0; idx < 64; idx += 16) {
@@ -502,3 +502,51 @@ int aom_satd_lp_avx2(const int16_t *coeff, int length) {
     return _mm_cvtsi128_si32(accum_128);
   }
 }
+
+static INLINE __m256i calc_avg_8x8_dual_avx2(const uint8_t *s, int p) {
+  const __m256i s0 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s)));
+  const __m256i s1 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + p)));
+  const __m256i s2 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 2 * p)));
+  const __m256i s3 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 3 * p)));
+  const __m256i sum0 =
+      _mm256_add_epi16(_mm256_add_epi16(s0, s1), _mm256_add_epi16(s2, s3));
+  const __m256i s4 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 4 * p)));
+  const __m256i s5 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 5 * p)));
+  const __m256i s6 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 6 * p)));
+  const __m256i s7 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 7 * p)));
+  const __m256i sum1 =
+      _mm256_add_epi16(_mm256_add_epi16(s4, s5), _mm256_add_epi16(s6, s7));
+
+  // The result of two 8x8 sub-blocks in 16x16 block.
+  return _mm256_add_epi16(sum0, sum1);
+}
+
+void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  // Process 1st and 2nd 8x8 sub-blocks in a 16x16 block.
+  const uint8_t *s_tmp = s + y16_idx * p + x16_idx;
+  __m256i result_0 = calc_avg_8x8_dual_avx2(s_tmp, p);
+
+  // Process 3rd and 4th 8x8 sub-blocks in a 16x16 block.
+  s_tmp = s + ((y16_idx + 8) * p) + x16_idx;
+  __m256i result_1 = calc_avg_8x8_dual_avx2(s_tmp, p);
+
+  const __m256i constant_32 = _mm256_set1_epi16(32);
+  result_0 = _mm256_hadd_epi16(result_0, result_1);
+  result_1 = _mm256_adds_epu16(result_0, _mm256_srli_si256(result_0, 4));
+  result_0 = _mm256_adds_epu16(result_1, _mm256_srli_si256(result_1, 2));
+  result_0 = _mm256_adds_epu16(result_0, constant_32);
+  result_0 = _mm256_srli_epi16(result_0, 6);
+  avg[0] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 0);
+  avg[1] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 0);
+  avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
+  avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
+}
diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c
index 260ca2ad17..3fce9118c4 100644
--- a/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c
@@ -121,6 +121,16 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
   return (avg + 32) >> 6;
 }
 
+void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  for (int k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
+    avg[k] = aom_avg_8x8_sse2(s_tmp, p);
+  }
+}
+
 unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
@@ -139,6 +149,48 @@ unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
   return (avg + 8) >> 4;
 }
 
+static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
+  const __m128i a0 = in[0];
+  const __m128i a1 = in[1];
+  const __m128i a2 = in[2];
+  const __m128i a3 = in[3];
+  const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1);
+  const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1);
+  const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1);
+  const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1);
+  in[0] = _mm_add_epi16(b0, b2);
+  in[1] = _mm_add_epi16(b1, b3);
+  in[2] = _mm_sub_epi16(b0, b2);
+  in[3] = _mm_sub_epi16(b1, b3);
+
+  if (iter == 0) {
+    const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+    const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+    const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+    const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+    in[0] = dcba_lo;
+    in[1] = _mm_srli_si128(dcba_lo, 8);
+    in[2] = dcba_hi;
+    in[3] = _mm_srli_si128(dcba_hi, 8);
+  }
+}
+
+void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  __m128i src[4];
+  src[0] = _mm_loadl_epi64((const __m128i *)src_diff);
+  src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+  src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+  src[3] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+
+  hadamard_col4_sse2(src, 0);
+  hadamard_col4_sse2(src, 1);
+
+  store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff);
+  coeff += 8;
+  store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff);
+}
+
 static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
@@ -272,8 +324,58 @@ void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
   hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
-void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                              int16_t *coeff) {
+void aom_pixel_scale_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                          int16_t *coeff, int log_scale, int h8, int w8) {
+  __m128i src[8];
+  const int16_t *org_src_diff = src_diff;
+  int16_t *org_coeff = coeff;
+  int coeff_stride = w8 << 3;
+  for (int idy = 0; idy < h8; ++idy) {
+    for (int idx = 0; idx < w8; ++idx) {
+      src_diff = org_src_diff + (idx << 3);
+      coeff = org_coeff + (idx << 3);
+
+      src[0] = _mm_load_si128((const __m128i *)src_diff);
+      src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+      src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+      src[0] = _mm_slli_epi16(src[0], log_scale);
+      src[1] = _mm_slli_epi16(src[1], log_scale);
+      src[2] = _mm_slli_epi16(src[2], log_scale);
+      src[3] = _mm_slli_epi16(src[3], log_scale);
+      src[4] = _mm_slli_epi16(src[4], log_scale);
+      src[5] = _mm_slli_epi16(src[5], log_scale);
+      src[6] = _mm_slli_epi16(src[6], log_scale);
+      src[7] = _mm_slli_epi16(src[7], log_scale);
+
+      _mm_store_si128((__m128i *)coeff, src[0]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[1]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[2]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[3]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[4]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[5]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[6]);
+      coeff += coeff_stride;
+      _mm_store_si128((__m128i *)coeff, src[7]);
+    }
+    org_src_diff += (src_stride << 3);
+    org_coeff += (coeff_stride << 3);
+  }
+}
+
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+                                        ptrdiff_t src_stride, int16_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -304,6 +406,57 @@ void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
   _mm_store_si128((__m128i *)coeff, src[7]);
 }
 
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_8x8_dual_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  for (int i = 0; i < 2; i++) {
+    hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64));
+  }
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  for (int idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  int16_t *t_coeff = coeff;
+  for (int idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+
+    _mm_store_si128((__m128i *)t_coeff, coeff0);
+    _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+    _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+    t_coeff += 8;
+  }
+}
+
 static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, tran_low_t *coeff,
                                        int is_final) {
@@ -416,17 +569,50 @@ void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
 int aom_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
   __m128i accum = zero;
 
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = load_tran_low(coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
+  for (i = 0; i < length; i += 16) {
+    const __m128i src_line0 = load_tran_low(coeff);
+    const __m128i src_line1 = load_tran_low(coeff + 8);
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
+  }
+
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
+
+  return _mm_cvtsi128_si32(accum);
+}
+
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i accum = zero;
+
+  for (int i = 0; i < length; i += 16) {
+    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
   }
 
   {  // cascading summation of accum
@@ -439,7 +625,7 @@ int aom_satd_sse2(const tran_low_t *coeff, int length) {
   return _mm_cvtsi128_si32(accum);
 }
 
-void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                           const int ref_stride, const int height) {
   int idx = 1;
   __m128i zero = _mm_setzero_si128();
diff --git a/media/libaom/src/aom_dsp/x86/convolve_avx2.h b/media/libaom/src/aom_dsp/x86/convolve_avx2.h
index d516de5f2f..785ba39d9c 100644
--- a/media/libaom/src/aom_dsp/x86/convolve_avx2.h
+++ b/media/libaom/src/aom_dsp/x86/convolve_avx2.h
@@ -59,6 +59,166 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
+#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP                                     \
+  for (i = 0; i < (im_h - 2); i += 2) {                                        \
+    __m256i data = _mm256_castsi128_si256(                                     \
+        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
+    data = _mm256_inserti128_si256(                                            \
+        data,                                                                  \
+        _mm_loadu_si128(                                                       \
+            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
+        1);                                                                    \
+    __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);             \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }                                                                            \
+  __m256i data_1 = _mm256_castsi128_si256(                                     \
+      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
+  __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);             \
+  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                      \
+  __m256i s[6];                                                               \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
+                                                                              \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
+  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
+  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
+                                                                              \
+  for (i = 0; i < h; i += 2) {                                                \
+    const int16_t *data = &im_block[i * im_stride];                           \
+    const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+    const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+    s[2] = _mm256_unpacklo_epi16(s4, s5);                                     \
+    s[5] = _mm256_unpackhi_epi16(s4, s5);                                     \
+                                                                              \
+    __m256i res_a = convolve_4tap(s, coeffs_v + 1);                           \
+    __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);                       \
+                                                                              \
+    res_a =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
+    res_b =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
+    const __m256i res_a_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
+    const __m256i res_b_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
+                                                                              \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
+    if (w - j > 4) {                                                          \
+      _mm_storel_epi64(p_0, res_0);                                           \
+      _mm_storel_epi64(p_1, res_1);                                           \
+    } else if (w == 4) {                                                      \
+      xx_storel_32(p_0, res_0);                                               \
+      xx_storel_32(p_1, res_1);                                               \
+    } else {                                                                  \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                  \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                  \
+    }                                                                         \
+                                                                              \
+    s[0] = s[1];                                                              \
+    s[1] = s[2];                                                              \
+    s[3] = s[4];                                                              \
+    s[4] = s[5];                                                              \
+  }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP                                     \
+  for (i = 0; i < (im_h - 2); i += 2) {                                        \
+    __m256i data = _mm256_castsi128_si256(                                     \
+        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));           \
+    data = _mm256_inserti128_si256(                                            \
+        data,                                                                  \
+        _mm_loadu_si128(                                                       \
+            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),           \
+        1);                                                                    \
+                                                                               \
+    __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt);                 \
+    res =                                                                      \
+        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+  }                                                                            \
+                                                                               \
+  __m256i data_1 = _mm256_castsi128_si256(                                     \
+      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));             \
+                                                                               \
+  __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);                 \
+                                                                               \
+  res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+                                                                               \
+  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_6TAP                                      \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
+                                                                              \
+  __m256i s[8];                                                               \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
+                                                                              \
+  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
+  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
+                                                                              \
+  for (i = 0; i < h; i += 2) {                                                \
+    const int16_t *data = &im_block[i * im_stride];                           \
+                                                                              \
+    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+                                                                              \
+    s[2] = _mm256_unpacklo_epi16(s6, s7);                                     \
+    s[5] = _mm256_unpackhi_epi16(s6, s7);                                     \
+                                                                              \
+    __m256i res_a = convolve_6tap(s, coeffs_v);                               \
+    __m256i res_b = convolve_6tap(s + 3, coeffs_v);                           \
+                                                                              \
+    res_a =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);  \
+    res_b =                                                                   \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);  \
+                                                                              \
+    const __m256i res_a_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_a, round_const_v), round_shift_v);               \
+    const __m256i res_b_round = _mm256_sra_epi32(                             \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);               \
+                                                                              \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);   \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);         \
+                                                                              \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                     \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                \
+                                                                              \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                 \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];    \
+    if (w - j > 4) {                                                          \
+      _mm_storel_epi64(p_0, res_0);                                           \
+      _mm_storel_epi64(p_1, res_1);                                           \
+    } else if (w == 4) {                                                      \
+      xx_storel_32(p_0, res_0);                                               \
+      xx_storel_32(p_1, res_1);                                               \
+    } else {                                                                  \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                  \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                  \
+    }                                                                         \
+                                                                              \
+    s[0] = s[1];                                                              \
+    s[1] = s[2];                                                              \
+                                                                              \
+    s[3] = s[4];                                                              \
+    s[4] = s[5];                                                              \
+  }
+
 #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP                                     \
   for (i = 0; i < (im_h - 2); i += 2) {                                        \
     __m256i data = _mm256_castsi128_si256(                                     \
@@ -151,122 +311,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
     s[6] = s[7];                                                              \
   }
 
-#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                               \
-  for (i = 0; i < im_h; i += 2) {                                              \
-    __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));  \
-    if (i + 1 < im_h)                                                          \
-      data = _mm256_inserti128_si256(                                          \
-          data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);          \
-    src_h += (src_stride << 1);                                                \
-    __m256i res = convolve_lowbd_x(data, coeffs_x, filt);                      \
+#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP                                    \
+  const __m256i v_zero = _mm256_setzero_si256();                               \
+  __m256i s[12];                                                               \
+  if (w <= 4) {                                                                \
+    for (i = 0; i < im_h; i += 2) {                                            \
+      const __m256i data = _mm256_permute2x128_si256(                          \
+          _mm256_castsi128_si256(                                              \
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
+          _mm256_castsi128_si256(_mm_loadu_si128(                              \
+              (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),        \
+          0x20);                                                               \
+      const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);                \
+      const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);                \
+      const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);                \
+      const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);                \
                                                                                \
-    res =                                                                      \
-        _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+      const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);                \
+      const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);                \
                                                                                \
-    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);              \
+      s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);                                \
+      s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);                               \
+      s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);                                \
+      s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);                               \
+      s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);                                \
+      s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);                               \
+                                                                               \
+      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
+                                                                               \
+      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
+          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
+      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
+      const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);           \
+      const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);           \
+      if (w > 2) {                                                             \
+        _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0);          \
+        _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride],      \
+                         res_1);                                               \
+      } else {                                                                 \
+        uint32_t horiz_2;                                                      \
+        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);                          \
+        im_block[i * im_stride] = (uint16_t)horiz_2;                           \
+        im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);               \
+        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);                          \
+        im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;               \
+        im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16);   \
+      }                                                                        \
+    }                                                                          \
+  } else {                                                                     \
+    for (i = 0; i < im_h; i++) {                                               \
+      const __m256i data = _mm256_permute2x128_si256(                          \
+          _mm256_castsi128_si256(                                              \
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
+          _mm256_castsi128_si256(                                              \
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
+          0x20);                                                               \
+      const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);                \
+      const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);                \
+                                                                               \
+      const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);                \
+      const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);                \
+                                                                               \
+      const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);                \
+      const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);                \
+                                                                               \
+      s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);                                \
+      s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);                               \
+      s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);                                \
+      s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);                               \
+      s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);                                \
+      s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);                               \
+                                                                               \
+      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
+                                                                               \
+      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
+          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
+                                                                               \
+      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
+      _mm_store_si128((__m128i *)&im_block[i * im_stride],                     \
+                      _mm256_extracti128_si256(                                \
+                          _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));     \
+    }                                                                          \
   }
 
-#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
-  __m256i s[8];                                                                \
-  __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));      \
-  __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));      \
-  __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));      \
-  __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));      \
-  __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));      \
-  __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));      \
-                                                                               \
-  s[0] = _mm256_unpacklo_epi16(s0, s1);                                        \
-  s[1] = _mm256_unpacklo_epi16(s2, s3);                                        \
-  s[2] = _mm256_unpacklo_epi16(s4, s5);                                        \
-                                                                               \
-  s[4] = _mm256_unpackhi_epi16(s0, s1);                                        \
-  s[5] = _mm256_unpackhi_epi16(s2, s3);                                        \
-  s[6] = _mm256_unpackhi_epi16(s4, s5);                                        \
+#define CONVOLVE_SR_VERTICAL_FILTER_12TAP                                      \
+  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
+  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
+  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
+  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
+  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
+  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
+  __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride));   \
+  __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride));   \
+  __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride));   \
+  __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride));   \
+                                                                               \
+  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                  \
+  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                  \
+  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                  \
+  s[3] = _mm256_unpacklo_epi16(src_6, src_7);                                  \
+  s[4] = _mm256_unpacklo_epi16(src_8, src_9);                                  \
+                                                                               \
+  s[6] = _mm256_unpackhi_epi16(src_0, src_1);                                  \
+  s[7] = _mm256_unpackhi_epi16(src_2, src_3);                                  \
+  s[8] = _mm256_unpackhi_epi16(src_4, src_5);                                  \
+  s[9] = _mm256_unpackhi_epi16(src_6, src_7);                                  \
+  s[10] = _mm256_unpackhi_epi16(src_8, src_9);                                 \
                                                                                \
   for (i = 0; i < h; i += 2) {                                                 \
     const int16_t *data = &im_block[i * im_stride];                            \
                                                                                \
-    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));  \
-    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));  \
+    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
+    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
+                                                                               \
+    s[5] = _mm256_unpacklo_epi16(s6, s7);                                      \
+    s[11] = _mm256_unpackhi_epi16(s6, s7);                                     \
                                                                                \
-    s[3] = _mm256_unpacklo_epi16(s6, s7);                                      \
-    s[7] = _mm256_unpackhi_epi16(s6, s7);                                      \
+    __m256i res_a = convolve_12taps(s, coeffs_v);                              \
+    __m256i res_b = convolve_12taps(s + 6, coeffs_v);                          \
+                                                                               \
+    res_a =                                                                    \
+        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);   \
+    res_b =                                                                    \
+        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);   \
                                                                                \
-    const __m256i res_a = convolve(s, coeffs_y);                               \
     const __m256i res_a_round = _mm256_sra_epi32(                              \
         _mm256_add_epi32(res_a, round_const_v), round_shift_v);                \
+    const __m256i res_b_round = _mm256_sra_epi32(                              \
+        _mm256_add_epi32(res_b, round_const_v), round_shift_v);                \
                                                                                \
-    if (w - j > 4) {                                                           \
-      const __m256i res_b = convolve(s + 4, coeffs_y);                         \
-      const __m256i res_b_round = _mm256_sra_epi32(                            \
-          _mm256_add_epi32(res_b, round_const_v), round_shift_v);              \
-      const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);    \
-      const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);    \
-                                                                               \
-      if (do_average) {                                                        \
-        const __m256i data_ref_0 = load_line2_avx2(                            \
-            &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);  \
-        const __m256i comp_avg_res =                                           \
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);  \
-                                                                               \
-        const __m256i round_result = convolve_rounding(                        \
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);    \
-                                                                               \
-        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
-        const __m128i res_0 = _mm256_castsi256_si128(res_8);                   \
-        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);              \
-                                                                               \
-        _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);      \
-        _mm_storel_epi64(                                                      \
-            (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);   \
-      } else {                                                                 \
-        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);            \
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);         \
+    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);    \
+    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);          \
                                                                                \
-        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);       \
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),    \
-                        res_1);                                                \
-      }                                                                        \
+    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                      \
+    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                 \
+                                                                               \
+    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                  \
+    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];     \
+    if (w - j > 4) {                                                           \
+      _mm_storel_epi64(p_0, res_0);                                            \
+      _mm_storel_epi64(p_1, res_1);                                            \
+    } else if (w == 4) {                                                       \
+      xx_storel_32(p_0, res_0);                                                \
+      xx_storel_32(p_1, res_1);                                                \
     } else {                                                                   \
-      const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);    \
-      const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);    \
+      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                   \
+      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                   \
+    }                                                                          \
+                                                                               \
+    s[0] = s[1];                                                               \
+    s[1] = s[2];                                                               \
+    s[2] = s[3];                                                               \
+    s[3] = s[4];                                                               \
+    s[4] = s[5];                                                               \
+                                                                               \
+    s[6] = s[7];                                                               \
+    s[7] = s[8];                                                               \
+    s[8] = s[9];                                                               \
+    s[9] = s[10];                                                              \
+    s[10] = s[11];                                                             \
+  }
+
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                        \
+  do {                                                                  \
+    for (i = 0; i < im_h; i += 2) {                                     \
+      __m256i data =                                                    \
+          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
+      if (i + 1 < im_h)                                                 \
+        data = _mm256_inserti128_si256(                                 \
+            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
+      src_h += (src_stride << 1);                                       \
+      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
+                                                                        \
+      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
+                             round_shift_h);                            \
+                                                                        \
+      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
+    }                                                                   \
+  } while (0)
+
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
+  do {                                                                         \
+    __m256i s[8];                                                              \
+    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
+    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
+    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
+    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
+    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
+    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
+                                                                               \
+    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
+    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
+    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
+                                                                               \
+    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
+    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
+    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
+                                                                               \
+    for (i = 0; i < h; i += 2) {                                               \
+      const int16_t *data = &im_block[i * im_stride];                          \
                                                                                \
-      if (do_average) {                                                        \
-        const __m256i data_ref_0 = load_line2_avx2(                            \
-            &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);  \
+      const __m256i s6 =                                                       \
+          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
+      const __m256i s7 =                                                       \
+          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
                                                                                \
-        const __m256i comp_avg_res =                                           \
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);  \
+      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
+      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
                                                                                \
-        const __m256i round_result = convolve_rounding(                        \
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);    \
+      const __m256i res_a = convolve(s, coeffs_y);                             \
+      const __m256i res_a_round = _mm256_sra_epi32(                            \
+          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
                                                                                \
-        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
-        const __m128i res_0 = _mm256_castsi256_si128(res_8);                   \
-        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);              \
+      if (w - j > 4) {                                                         \
+        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
+        const __m256i res_b_round = _mm256_sra_epi32(                          \
+            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
+        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
+        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
                                                                                \
-        *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);  \
-        *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =              \
-            _mm_cvtsi128_si32(res_1);                                          \
+        if (do_average) {                                                      \
+          const __m256i data_ref_0 =                                           \
+              load_line2_avx2(&dst[i * dst_stride + j],                        \
+                              &dst[i * dst_stride + j + dst_stride]);          \
+          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
+                                                &wt, use_dist_wtd_comp_avg);   \
                                                                                \
+          const __m256i round_result = convolve_rounding(                      \
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
+                                                                               \
+          const __m256i res_8 =                                                \
+              _mm256_packus_epi16(round_result, round_result);                 \
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
+                                                                               \
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
+          _mm_storel_epi64(                                                    \
+              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
+        } else {                                                               \
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
+                                                                               \
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
+                          res_1);                                              \
+        }                                                                      \
       } else {                                                                 \
-        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);            \
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);         \
+        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
+        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
+                                                                               \
+        if (do_average) {                                                      \
+          const __m256i data_ref_0 =                                           \
+              load_line2_avx2(&dst[i * dst_stride + j],                        \
+                              &dst[i * dst_stride + j + dst_stride]);          \
+                                                                               \
+          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
+                                                &wt, use_dist_wtd_comp_avg);   \
+                                                                               \
+          const __m256i round_result = convolve_rounding(                      \
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
+                                                                               \
+          const __m256i res_8 =                                                \
+              _mm256_packus_epi16(round_result, round_result);                 \
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
+                                                                               \
+          *(uint32_t *)(&dst0[i * dst_stride0 + j]) =                          \
+              _mm_cvtsi128_si32(res_0);                                        \
+          *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =            \
+              _mm_cvtsi128_si32(res_1);                                        \
+                                                                               \
+        } else {                                                               \
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
                                                                                \
-        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);       \
-        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),    \
-                        res_1);                                                \
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
+                          res_1);                                              \
+        }                                                                      \
       }                                                                        \
-    }                                                                          \
                                                                                \
-    s[0] = s[1];                                                               \
-    s[1] = s[2];                                                               \
-    s[2] = s[3];                                                               \
+      s[0] = s[1];                                                             \
+      s[1] = s[2];                                                             \
+      s[2] = s[3];                                                             \
                                                                                \
-    s[4] = s[5];                                                               \
-    s[5] = s[6];                                                               \
-    s[6] = s[7];                                                               \
-  }
+      s[4] = s[5];                                                             \
+      s[5] = s[6];                                                             \
+      s[6] = s[7];                                                             \
+    }                                                                          \
+  } while (0)
+
 static INLINE void prepare_coeffs_lowbd(
     const InterpFilterParams *const filter_params, const int subpel_q4,
     __m256i *const coeffs /* [4] */) {
@@ -295,6 +629,49 @@ static INLINE void prepare_coeffs_lowbd(
   coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
 }
 
+static INLINE void prepare_coeffs_6t_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // right shift all filter co-efficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter co-efficients are even, this change will not affect the
+  // end result
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16((int16_t)0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
+static INLINE void prepare_coeffs_6t(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m256i *const coeffs /* [4] */) {
@@ -314,6 +691,30 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
   coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
 }
 
+static INLINE void prepare_coeffs_12taps(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+  __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+  // coeffs 8 9 10 11 0 0 0 0
+  coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
+  coeff = _mm256_broadcastq_epi64(coeff_8);
+  coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00);  // coeffs 8 9 8 9 8 9 8 9
+  coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55);  // coeffs 10 11 10 11.. 10 11
+}
+
 static INLINE __m256i convolve_lowbd(const __m256i *const s,
                                      const __m256i *const coeffs) {
   const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
@@ -328,6 +729,19 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s,
   return res;
 }
 
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+                                          const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res =
+      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
+
+  return res;
+}
+
 static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
                                           const __m256i *const coeffs) {
   const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
@@ -339,6 +753,33 @@ static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
   return res;
 }
 
+static INLINE __m256i convolve_6tap(const __m256i *const s,
+                                    const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
+
+  return res;
+}
+
+static INLINE __m256i convolve_12taps(const __m256i *const s,
+                                      const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+  const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]);
+  const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]);
+
+  const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                                        _mm256_add_epi32(res_2, res_3));
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1);
+
+  return res;
+}
+
 static INLINE __m256i convolve(const __m256i *const s,
                                const __m256i *const coeffs) {
   const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
@@ -374,6 +815,18 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data,
   return convolve_lowbd(s, coeffs);
 }
 
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+                                            const __m256i *const coeffs,
+                                            const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+  return convolve_lowbd_6tap(s, coeffs);
+}
+
 static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
                                             const __m256i *const coeffs,
                                             const __m256i *const filt) {
diff --git a/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h b/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h
index 707bd2d782..9e8662af46 100644
--- a/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h
+++ b/media/libaom/src/aom_dsp/x86/convolve_common_intrin.h
@@ -28,4 +28,75 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
   _mm_store_si128((__m128i *)dst, d);
 }
 
+static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params,
+                                        int subpel_q4,
+                                        __m128i *coeffs /* [6] */) {
+  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+  coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0);    // coeffs 0 1 0 1 0 1 0 1
+  coeffs[1] = _mm_shuffle_epi32(coeffs_y, 85);   // coeffs 2 3 2 3 2 3 2 3
+  coeffs[2] = _mm_shuffle_epi32(coeffs_y, 170);  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[3] = _mm_shuffle_epi32(coeffs_y, 255);  // coeffs 6 7 6 7 6 7 6 7
+
+  coeffs_y = _mm_loadl_epi64((__m128i *)(y_filter + 8));
+
+  coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0);  // coeffs 8 9 8 9 8 9 8 9
+  coeffs[5] =
+      _mm_shuffle_epi32(coeffs_y, 85);  // coeffs 10 11 10 11 10 11 10 11
+}
+
+static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) {
+  const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+  const __m128i d4 = _mm_madd_epi16(s[4], coeffs[4]);
+  const __m128i d5 = _mm_madd_epi16(s[5], coeffs[5]);
+  const __m128i d_0123 =
+      _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+  const __m128i d = _mm_add_epi32(_mm_add_epi32(d4, d5), d_0123);
+  return d;
+}
+
+static INLINE __m128i convolve_lo_x_12tap(const __m128i *s,
+                                          const __m128i *coeffs,
+                                          const __m128i zero) {
+  __m128i ss[6];
+  ss[0] = _mm_unpacklo_epi8(s[0], zero);  //  0  1  1  2  2  3  3  4
+  ss[1] = _mm_unpacklo_epi8(s[1], zero);  //  2  3  3  4  4  5  5  6
+  ss[2] = _mm_unpacklo_epi8(s[2], zero);  //  4  5  5  6  6  7  7  8
+  ss[3] = _mm_unpacklo_epi8(s[3], zero);  //  6  7  7  8  8  9  9 10
+  ss[4] = _mm_unpackhi_epi8(s[2], zero);  //  8  9  9 10 10 11 11 12
+  ss[5] = _mm_unpackhi_epi8(s[3], zero);  // 10 11 11 12 12 13 13 14
+  return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  __m128i ss[6];
+  const __m128i zero = _mm_setzero_si128();
+  ss[0] = _mm_unpacklo_epi8(s[0], zero);
+  ss[1] = _mm_unpacklo_epi8(s[2], zero);
+  ss[2] = _mm_unpacklo_epi8(s[4], zero);
+  ss[3] = _mm_unpacklo_epi8(s[6], zero);
+  ss[4] = _mm_unpacklo_epi8(s[8], zero);
+  ss[5] = _mm_unpacklo_epi8(s[10], zero);
+  return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  __m128i ss[6];
+  const __m128i zero = _mm_setzero_si128();
+  ss[0] = _mm_unpackhi_epi8(s[0], zero);
+  ss[1] = _mm_unpackhi_epi8(s[2], zero);
+  ss[2] = _mm_unpackhi_epi8(s[4], zero);
+  ss[3] = _mm_unpackhi_epi8(s[6], zero);
+  ss[4] = _mm_unpackhi_epi8(s[8], zero);
+  ss[5] = _mm_unpackhi_epi8(s[10], zero);
+  return convolve_12tap(ss, coeffs);
+}
 #endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse2.h b/media/libaom/src/aom_dsp/x86/convolve_sse2.h
index 385c7c7e18..36b7d62b98 100644
--- a/media/libaom/src/aom_dsp/x86/convolve_sse2.h
+++ b/media/libaom/src/aom_dsp/x86/convolve_sse2.h
@@ -12,9 +12,10 @@
 #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 #define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 
+#include "config/aom_scale_rtcd.h"
+
 // Note:
 //  This header file should be put below any x86 intrinsics head file
-
 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m128i *const coeffs /* [4] */) {
diff --git a/media/libaom/src/aom_dsp/x86/convolve_ssse3.h b/media/libaom/src/aom_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..b1abead146
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <tmmintrin.h>  // SSSE3
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+                                        const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/media/libaom/src/aom_dsp/x86/fft_avx2.c b/media/libaom/src/aom_dsp/x86/fft_avx2.c
index 4cccc5f00f..3f5a9bbeff 100644
--- a/media/libaom/src/aom_dsp/x86/fft_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/fft_avx2.c
@@ -22,13 +22,13 @@ extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
 // Generate the 1d forward transforms for float using _mm256
 GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-          _mm256_mul_ps);
+          _mm256_mul_ps)
 GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
+           _mm256_mul_ps)
 GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
+           _mm256_mul_ps)
 
 void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
@@ -48,13 +48,13 @@ void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
 // Generate the 1d inverse transforms for float using _mm256
 GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
+           _mm256_mul_ps)
 GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
             _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
+            _mm256_mul_ps)
 GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
             _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
+            _mm256_mul_ps)
 
 void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
diff --git a/media/libaom/src/aom_dsp/x86/fft_sse2.c b/media/libaom/src/aom_dsp/x86/fft_sse2.c
index 6f20a3cc01..c6023afabd 100644
--- a/media/libaom/src/aom_dsp/x86/fft_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/fft_sse2.c
@@ -106,13 +106,13 @@ void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
 
 // Generate definitions for 1d transforms using float and __mm128
 GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
 GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 
 void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
   aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
@@ -136,13 +136,13 @@ void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
 
 // Generate definitions for 1d inverse transforms using float and mm128
 GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
 GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
 
 void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
   aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
index c500b0a26c..05c87bcff9 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -14,10 +14,9 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
-#include "av1/encoder/av1_quantize.h"
-
 static INLINE void highbd_load_b_values_avx2(
     const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
     __m256i *round, const int16_t *quant_ptr, __m256i *quant,
diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
index 8f31f3596f..ae31116e9d 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -13,8 +13,8 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
-#include "av1/encoder/av1_quantize.h"
 
 static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
   a = _mm_xor_si128(a, sign);
diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c
index b43a7d7b5b..fdf9524ad6 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c
@@ -28,105 +28,28 @@ static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };
 
-void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
-                                   int width, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
-      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-      p1 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storeu_si128((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storeu_si128((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storel_epi64((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  }
-}
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_x,
+                                    const int subpel_x_qn,
+                                    ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_y,
+                                    const int subpel_y_qn, int bd);
 
 void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params, int bd) {
+                                   const int subpel_y_qn, int bd) {
+  if (filter_params_y->taps == 12) {
+    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn, bd);
+    return;
+  }
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   __m256i s[8], coeffs_y[4];
 
@@ -263,14 +186,17 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
 void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
+  if (filter_params_x->taps == 12) {
+    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params,
+                                   bd);
+    return;
+  }
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_qn;
-  (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
   // into an unsigned 16-bit intermediate array.
@@ -1317,7 +1243,7 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
 #define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
 #define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
 
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
 
 #undef HIGHBD_FUNC
diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c
index a79350f5a6..5293e27644 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -15,27 +15,15 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 
 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_qn,
-                                    const int subpel_y_qn,
-                                    ConvolveParams *conv_params, int bd) {
+                                    const int subpel_y_qn, int bd) {
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  __m128i s[16], coeffs_y[4];
-
   const int bits = FILTER_BITS;
 
   const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
@@ -43,13 +31,14 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
   const __m128i clip_pixel =
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
+  if (filter_params_y->taps == 12) {
+    __m128i s[24], coeffs_y[6];
 
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
 
-  for (j = 0; j < w; j += 8) {
-    const uint16_t *data = &src_ptr[j];
-    /* Vertical filter */
-    {
+    for (j = 0; j < w; j += 8) {
+      const uint16_t *data = &src_ptr[j];
+      /* Vertical filter */
       __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
       __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
       __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
@@ -57,49 +46,61 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
       __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
       __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
       __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+      __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+      __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+      __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
+      __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
 
       s[0] = _mm_unpacklo_epi16(s0, s1);
       s[1] = _mm_unpacklo_epi16(s2, s3);
       s[2] = _mm_unpacklo_epi16(s4, s5);
-
-      s[4] = _mm_unpackhi_epi16(s0, s1);
-      s[5] = _mm_unpackhi_epi16(s2, s3);
-      s[6] = _mm_unpackhi_epi16(s4, s5);
-
-      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
-      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
-      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
-
-      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
-      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
-      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+      s[3] = _mm_unpacklo_epi16(s6, s7);
+      s[4] = _mm_unpacklo_epi16(s8, s9);
+
+      s[6] = _mm_unpackhi_epi16(s0, s1);
+      s[7] = _mm_unpackhi_epi16(s2, s3);
+      s[8] = _mm_unpackhi_epi16(s4, s5);
+      s[9] = _mm_unpackhi_epi16(s6, s7);
+      s[10] = _mm_unpackhi_epi16(s8, s9);
+
+      s[12] = _mm_unpacklo_epi16(s1, s2);
+      s[13] = _mm_unpacklo_epi16(s3, s4);
+      s[14] = _mm_unpacklo_epi16(s5, s6);
+      s[15] = _mm_unpacklo_epi16(s7, s8);
+      s[16] = _mm_unpacklo_epi16(s9, s10);
+
+      s[18] = _mm_unpackhi_epi16(s1, s2);
+      s[19] = _mm_unpackhi_epi16(s3, s4);
+      s[20] = _mm_unpackhi_epi16(s5, s6);
+      s[21] = _mm_unpackhi_epi16(s7, s8);
+      s[22] = _mm_unpackhi_epi16(s9, s10);
 
       for (i = 0; i < h; i += 2) {
         data = &src_ptr[i * src_stride + j];
 
-        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
-        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+        __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
+        __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));
 
-        s[3] = _mm_unpacklo_epi16(s6, s7);
-        s[7] = _mm_unpackhi_epi16(s6, s7);
+        s[5] = _mm_unpacklo_epi16(s10, s11);
+        s[11] = _mm_unpackhi_epi16(s10, s11);
 
-        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
-        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+        s[17] = _mm_unpacklo_epi16(s11, s12);
+        s[23] = _mm_unpackhi_epi16(s11, s12);
 
-        const __m128i res_a0 = convolve(s, coeffs_y);
+        const __m128i res_a0 = convolve_12tap(s, coeffs_y);
         __m128i res_a_round0 = _mm_sra_epi32(
             _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
 
-        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
         __m128i res_a_round1 = _mm_sra_epi32(
             _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
 
         if (w - j > 4) {
-          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
           __m128i res_b_round0 = _mm_sra_epi32(
               _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
 
-          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
           __m128i res_b_round1 = _mm_sra_epi32(
               _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
 
@@ -145,20 +146,149 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
         s[0] = s[1];
         s[1] = s[2];
         s[2] = s[3];
-
+        s[3] = s[4];
         s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-
-        s[0 + 8] = s[1 + 8];
-        s[1 + 8] = s[2 + 8];
-        s[2 + 8] = s[3 + 8];
 
-        s[4 + 8] = s[5 + 8];
-        s[5 + 8] = s[6 + 8];
-        s[6 + 8] = s[7 + 8];
+        s[6] = s[7];
+        s[7] = s[8];
+        s[8] = s[9];
+        s[9] = s[10];
+        s[10] = s[11];
+
+        s[12] = s[13];
+        s[13] = s[14];
+        s[14] = s[15];
+        s[15] = s[16];
+        s[16] = s[17];
+
+        s[18] = s[19];
+        s[19] = s[20];
+        s[20] = s[21];
+        s[21] = s[22];
+        s[22] = s[23];
+
+        s10 = s12;
+      }
+    }
+  } else {
+    __m128i s[16], coeffs_y[4];
+
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+    for (j = 0; j < w; j += 8) {
+      const uint16_t *data = &src_ptr[j];
+      /* Vertical filter */
+      {
+        __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+        __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+        __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+        __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+        __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+        __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+        __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+        s[0] = _mm_unpacklo_epi16(s0, s1);
+        s[1] = _mm_unpacklo_epi16(s2, s3);
+        s[2] = _mm_unpacklo_epi16(s4, s5);
+
+        s[4] = _mm_unpackhi_epi16(s0, s1);
+        s[5] = _mm_unpackhi_epi16(s2, s3);
+        s[6] = _mm_unpackhi_epi16(s4, s5);
+
+        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+        for (i = 0; i < h; i += 2) {
+          data = &src_ptr[i * src_stride + j];
+
+          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+          s[3] = _mm_unpacklo_epi16(s6, s7);
+          s[7] = _mm_unpackhi_epi16(s6, s7);
+
+          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+          const __m128i res_a0 = convolve(s, coeffs_y);
+          __m128i res_a_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+          const __m128i res_a1 = convolve(s + 8, coeffs_y);
+          __m128i res_a_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+          if (w - j > 4) {
+            const __m128i res_b0 = convolve(s + 4, coeffs_y);
+            __m128i res_b_round0 = _mm_sra_epi32(
+                _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+            __m128i res_b_round1 = _mm_sra_epi32(
+                _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+            res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+            res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_16bit1);
+          } else if (w == 4) {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_a_round1);
+          } else {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            *((uint32_t *)(&dst[i * dst_stride + j])) =
+                _mm_cvtsi128_si32(res_a_round0);
+
+            *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+                _mm_cvtsi128_si32(res_a_round1);
+          }
+
+          s[0] = s[1];
+          s[1] = s[2];
+          s[2] = s[3];
+
+          s[4] = s[5];
+          s[5] = s[6];
+          s[6] = s[7];
+
+          s[0 + 8] = s[1 + 8];
+          s[1 + 8] = s[2 + 8];
+          s[2 + 8] = s[3 + 8];
+
+          s[4 + 8] = s[5 + 8];
+          s[5 + 8] = s[6 + 8];
+          s[6 + 8] = s[7 + 8];
 
-        s6 = s8;
+          s6 = s8;
+        }
       }
     }
   }
@@ -167,22 +297,16 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
 void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
-                                    const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
   int i, j;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_qn;
-  (void)filter_params_y;
 
   // Check that, even with 12-bit input, the intermediate values will fit
   // into an unsigned 16-bit intermediate array.
   assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
 
-  __m128i s[4], coeffs_x[4];
-
   const __m128i round_const_x =
       _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
   const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
@@ -195,55 +319,119 @@ void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
 
-  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    {
-      for (i = 0; i < h; i += 1) {
-        const __m128i row00 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i row01 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
-
-        // even pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 0);
-        s[1] = _mm_alignr_epi8(row01, row00, 4);
-        s[2] = _mm_alignr_epi8(row01, row00, 8);
-        s[3] = _mm_alignr_epi8(row01, row00, 12);
-
-        __m128i res_even = convolve(s, coeffs_x);
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
-                                 round_shift_x);
-
-        // odd pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 2);
-        s[1] = _mm_alignr_epi8(row01, row00, 6);
-        s[2] = _mm_alignr_epi8(row01, row00, 10);
-        s[3] = _mm_alignr_epi8(row01, row00, 14);
-
-        __m128i res_odd = convolve(s, coeffs_x);
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
-
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
-                                 round_shift_bits);
-        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
-                                round_shift_bits);
-
-        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
-        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
-        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
-
-        res = _mm_min_epi16(res, clip_pixel);
-        res = _mm_max_epi16(res, zero);
-
-        if (w - j > 4) {
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        } else if (w == 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
-        } else {
-          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+  if (filter_params_x->taps == 12) {
+    __m128i s[6], coeffs_x[6];
+
+    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+
+    for (j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      {
+        for (i = 0; i < h; i += 1) {
+          const __m128i row00 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+          const __m128i row01 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+          const __m128i row02 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+          // even pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 0);
+          s[1] = _mm_alignr_epi8(row01, row00, 4);
+          s[2] = _mm_alignr_epi8(row01, row00, 8);
+          s[3] = _mm_alignr_epi8(row01, row00, 12);
+          s[4] = _mm_alignr_epi8(row02, row01, 0);
+          s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+          __m128i res_even = convolve_12tap(s, coeffs_x);
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                   round_shift_x);
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                   round_shift_bits);
+
+          // odd pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 2);
+          s[1] = _mm_alignr_epi8(row01, row00, 6);
+          s[2] = _mm_alignr_epi8(row01, row00, 10);
+          s[3] = _mm_alignr_epi8(row01, row00, 14);
+          s[4] = _mm_alignr_epi8(row02, row01, 2);
+          s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+          __m128i res_odd = convolve_12tap(s, coeffs_x);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+                                  round_shift_x);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                  round_shift_bits);
+
+          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+          res = _mm_min_epi16(res, clip_pixel);
+          res = _mm_max_epi16(res, zero);
+
+          if (w - j > 4) {
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+          } else if (w == 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+          } else {
+            *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+          }
+        }
+      }
+    }
+  } else {
+    __m128i s[4], coeffs_x[4];
+    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+    for (j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      {
+        for (i = 0; i < h; i += 1) {
+          const __m128i row00 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+          const __m128i row01 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+          // even pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 0);
+          s[1] = _mm_alignr_epi8(row01, row00, 4);
+          s[2] = _mm_alignr_epi8(row01, row00, 8);
+          s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+          __m128i res_even = convolve(s, coeffs_x);
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                   round_shift_x);
+
+          // odd pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 2);
+          s[1] = _mm_alignr_epi8(row01, row00, 6);
+          s[2] = _mm_alignr_epi8(row01, row00, 10);
+          s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+          __m128i res_odd = convolve(s, coeffs_x);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+                                  round_shift_x);
+
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                   round_shift_bits);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                  round_shift_bits);
+
+          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+          res = _mm_min_epi16(res, clip_pixel);
+          res = _mm_max_epi16(res, zero);
+
+          if (w - j > 4) {
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+          } else if (w == 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+          } else {
+            *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+          }
         }
       }
     }
diff --git a/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm
index e0d22522d2..9442cd0bfd 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -215,15 +215,28 @@ SECTION .text
 ; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
-%if UNIX64
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0  ; normal sad
+%if ARCH_X86_64
 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
 %else
 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
-%endif
+%endif  ; ARCH_X86_64
+%else  ; %3 == 2, downsample
+%if ARCH_X86_64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif  ; ARCH_X86_64
+%endif  ; sad/avg/skip
 
 ; set m1
   push                srcq
@@ -232,6 +245,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   pshufd                m1, m1, 0x0
   pop                 srcq
 
+%if %3 == 2  ; skip rows
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif  ; skip rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
   mov                ref2q, [ref1q+gprsize*1]
@@ -247,9 +264,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   shl                ref1q, 1
 
   HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2  ;  Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
   ; N.B. HIGH_PROCESS outputs dwords (32 bits)
   ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -268,6 +291,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   paddd                 m4, m0
   paddd                 m6, m1
   punpcklqdq            m4, m6
+%if %3 == 2  ; skip rows
+  pslld                 m4, 1
+%endif
   movifnidn             r4, r4mp
   movu                [r4], m4
   RET
@@ -294,3 +320,25 @@ HIGH_SADNXN4D  8, 32
 HIGH_SADNXN4D 32,  8
 HIGH_SADNXN4D 16, 64
 HIGH_SADNXN4D 64, 16
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16,  8, 2
+HIGH_SADNXN4D  8, 16, 2
+HIGH_SADNXN4D  8,  8, 2
+HIGH_SADNXN4D  4,  8, 2
+HIGH_SADNXN4D  4, 16, 2
+HIGH_SADNXN4D  8, 32, 2
+HIGH_SADNXN4D 32,  8, 2
+HIGH_SADNXN4D 16, 64, 2
+HIGH_SADNXN4D 64, 16, 2
+
+; Current code cannot handle the case when the height is downsampled to 2
+; HIGH_SADNXN4D 16,  4, 2
+; HIGH_SADNXN4D  8,  4, 2
+; HIGH_SADNXN4D  4,  4, 2
diff --git a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_sad_avx2.c
index 2cff2e6a9f..ad4db2f8c9 100644
--- a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_sad_avx2.c
@@ -252,31 +252,57 @@ static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
   return get_sad_from_mm256_epi32(&sad);
 }
 
-#define highbd_sadMxN_avx2(m, n)                                            \
+#define HIGHBD_SADMXN_AVX2(m, n)                                            \
   unsigned int aom_highbd_sad##m##x##n##_avx2(                              \
       const uint8_t *src, int src_stride, const uint8_t *ref,               \
       int ref_stride) {                                                     \
     return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
   }
 
-highbd_sadMxN_avx2(16, 4);
-highbd_sadMxN_avx2(16, 8);
-highbd_sadMxN_avx2(16, 16);
-highbd_sadMxN_avx2(16, 32);
-highbd_sadMxN_avx2(16, 64);
+#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n)                                       \
+  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
+                                          2 * ref_stride);                   \
+  }
+
+HIGHBD_SADMXN_AVX2(16, 4)
+HIGHBD_SADMXN_AVX2(16, 8)
+HIGHBD_SADMXN_AVX2(16, 16)
+HIGHBD_SADMXN_AVX2(16, 32)
+HIGHBD_SADMXN_AVX2(16, 64)
+
+HIGHBD_SADMXN_AVX2(32, 8)
+HIGHBD_SADMXN_AVX2(32, 16)
+HIGHBD_SADMXN_AVX2(32, 32)
+HIGHBD_SADMXN_AVX2(32, 64)
+
+HIGHBD_SADMXN_AVX2(64, 16)
+HIGHBD_SADMXN_AVX2(64, 32)
+HIGHBD_SADMXN_AVX2(64, 64)
+HIGHBD_SADMXN_AVX2(64, 128)
+
+HIGHBD_SADMXN_AVX2(128, 64)
+HIGHBD_SADMXN_AVX2(128, 128)
 
-highbd_sadMxN_avx2(32, 8);
-highbd_sadMxN_avx2(32, 16);
-highbd_sadMxN_avx2(32, 32);
-highbd_sadMxN_avx2(32, 64);
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 64)
 
-highbd_sadMxN_avx2(64, 16);
-highbd_sadMxN_avx2(64, 32);
-highbd_sadMxN_avx2(64, 64);
-highbd_sadMxN_avx2(64, 128);
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 64)
 
-highbd_sadMxN_avx2(128, 64);
-highbd_sadMxN_avx2(128, 128);
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 128)
 
 unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
@@ -671,29 +697,58 @@ static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
   get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-#define highbd_sadMxNx4d_avx2(m, n)                                          \
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                         \
   void aom_highbd_sad##m##x##n##x4d_avx2(                                    \
       const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
       int ref_stride, uint32_t *sad_array) {                                 \
     aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
                                   sad_array);                                \
   }
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                   \
+  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                             \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+      int ref_stride, uint32_t *sad_array) {                                \
+    aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array,  \
+                                  2 * ref_stride, sad_array);               \
+    sad_array[0] <<= 1;                                                     \
+    sad_array[1] <<= 1;                                                     \
+    sad_array[2] <<= 1;                                                     \
+    sad_array[3] <<= 1;                                                     \
+  }
 
-highbd_sadMxNx4d_avx2(16, 4);
-highbd_sadMxNx4d_avx2(16, 8);
-highbd_sadMxNx4d_avx2(16, 16);
-highbd_sadMxNx4d_avx2(16, 32);
-highbd_sadMxNx4d_avx2(16, 64);
-
-highbd_sadMxNx4d_avx2(32, 8);
-highbd_sadMxNx4d_avx2(32, 16);
-highbd_sadMxNx4d_avx2(32, 32);
-highbd_sadMxNx4d_avx2(32, 64);
-
-highbd_sadMxNx4d_avx2(64, 16);
-highbd_sadMxNx4d_avx2(64, 32);
-highbd_sadMxNx4d_avx2(64, 64);
-highbd_sadMxNx4d_avx2(64, 128);
-
-highbd_sadMxNx4d_avx2(128, 64);
-highbd_sadMxNx4d_avx2(128, 128);
+HIGHBD_SAD_MXNX4D_AVX2(16, 4)
+HIGHBD_SAD_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
diff --git a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm
index 09e64d510e..48b93bf955 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm
@@ -15,20 +15,26 @@
 
 SECTION .text
 
-%macro HIGH_SAD_FN 4
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
 %if %4 == 0
 %if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
 %if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -38,7 +44,18 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%else  ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2  ; double the stride if we are skipping rows
+  lea          src_strided, [src_strided*2]
+  lea          ref_strided, [ref_strided*2]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -57,7 +74,11 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD64XN 1-2 0
   HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -149,6 +170,9 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -156,16 +180,23 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
 INIT_XMM sse2
 HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
 
 ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD32XN 1-2 0
   HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -217,6 +248,9 @@ HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -225,17 +259,25 @@ INIT_XMM sse2
 HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
 HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
 HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN  8 ; highbd_sad_32x8_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
-HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
+HIGH_SAD32XN  8, 1 ; highbd_sad_32x8_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+HIGH_SAD32XN  8, 2 ; highbd_sad_skip_32x8_sse2
 
 ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD16XN 1-2 0
   HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -287,27 +329,40 @@ HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
 HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
 HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
 HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN  4 ; highbd_sad_16x4_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
-HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
-HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
-HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
+HIGH_SAD16XN  4, 1 ; highbd_sad_16x4_avg_sse2
+HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD16XN  4, 2 ; highbd_sad_skip_16x4_sse2
 
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
+  HIGH_SAD_FN 8, %1, 7, %2, 8
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -323,22 +378,30 @@ HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
   pavgw                 m4, [second_predq+mmsize*3]
   lea         second_predq, [second_predq+mmsize*4]
 %endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
+  mova                  m7, m1
+  movu                  m5, [srcq]
+  psubusw               m1, m5
+  psubusw               m5, m7
   por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
+
+  mova                  m7, m2
+  movu                  m5, [srcq+src_strideq*2]
+  psubusw               m2, m5
+  psubusw               m5, m7
   por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
+
+  mova                  m7, m3
+  movu                  m5, [srcq+src_strideq*4]
+  psubusw               m3, m5
+  psubusw               m5, m7
   por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
+
+  mova                  m7, m4
+  movu                  m5, [srcq+src_stride3q*2]
+  psubusw               m4, m5
+  psubusw               m5, m7
   por                   m4, m5
+
   paddw                 m1, m2
   paddw                 m3, m4
   movhlps               m2, m1
@@ -359,25 +422,37 @@ HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
 HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
 HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
-HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
+HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD8XN  4, 2 ; highbd_sad8x4_avg_sse2
 
 ; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD4XN 1-2 0
   HIGH_SAD_FN 4, %1, 7, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -429,6 +504,9 @@ HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -440,3 +518,7 @@ HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
 HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
 HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
 HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
+HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
+HIGH_SAD4XN  8, 2 ; highbd_sad_skip_4x8_sse2
+; Current code fails there are only 2 rows
+; HIGH_SAD4XN  4, 2 ; highbd_sad_skip_4x4_sse2
diff --git a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c
index b72d1cf8ba..3c3253bdf9 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c
@@ -256,11 +256,10 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
 void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
                                     ptrdiff_t diff_stride, const uint8_t *src8,
                                     ptrdiff_t src_stride, const uint8_t *pred8,
-                                    ptrdiff_t pred_stride, int bd) {
+                                    ptrdiff_t pred_stride) {
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   SubtractWxHFuncType func;
-  (void)bd;
 
   func = getSubtractFunc(rows, cols);
   func(diff, diff_stride, src, src_stride, pred, pred_stride);
diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
index 9b1b4c9de7..49912ac191 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -13,11 +13,611 @@
 #include <immintrin.h>  // AVX2
 
 #include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
 
 typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);
 
+static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+    const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
+    int dst_stride, uint32_t *sse) {
+  const __m256i filter1 =
+      _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) |
+                        bilinear_filters_2t[xoffset][0]);
+  const __m256i filter2 =
+      _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[yoffset][1] << 16) |
+                        bilinear_filters_2t[yoffset][0]);
+  const __m256i one = _mm256_set1_epi16(1);
+  const uint32_t bitshift = (uint32_t)0x40;
+  (void)pixel_step;
+  unsigned int i, j, prev = 0, curr = 2;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
+  uint16_t *src_ptr_ref = src_ptr;
+  uint16_t *dst_ptr_ref = dst_ptr;
+  int64_t sum_long = 0;
+  uint64_t sse_long = 0;
+  unsigned int rshift = 0, inc = 1;
+  __m256i rbias = _mm256_set1_epi32(bitshift);
+  __m256i opointer[8];
+  unsigned int range;
+  if (xoffset == 0) {
+    if (yoffset == 0) {  // xoffset==0 && yoffset==0
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+        }
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+        for (i = 0; i < 16 / inc; ++i) {
+          __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+
+          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+          src_ptr += src_pixels_per_line;
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+    }
+  } else if (xoffset == 4) {
+    if (yoffset == 0) {  // xoffset==4 && yoffset==0
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+
+    } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
+      range = output_width / 16;
+      if (output_height == 8) inc = 2;
+      if (output_height == 4) inc = 4;
+      for (j = 0; j < range * output_height * inc / 16; j++) {
+        if (j % (output_height * inc / 16) == 0) {
+          src_ptr = src_ptr_ref;
+          src_ptr_ref += 16;
+          dst_ptr = dst_ptr_ref;
+          dst_ptr_ref += 16;
+
+          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+          curr = 0;
+        }
+
+        __m256i sum1 = _mm256_setzero_si256();
+        __m256i sse1 = _mm256_setzero_si256();
+
+        for (i = 0; i < 16 / inc; ++i) {
+          prev = curr;
+          curr = (curr == 0) ? 1 : 0;
+          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+          src_ptr += src_pixels_per_line;
+          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+          __m256i V_S_M1 =
+              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+          __m256i V_S_M2 =
+              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+          __m256i V_S_S1 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+          __m256i V_S_S2 =
+              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+          dst_ptr += dst_stride;
+
+          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+        }
+
+        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+        sum_long += _mm_extract_epi32(v_d, 0);
+        sse_long += _mm_extract_epi32(v_d, 1);
+      }
+
+      rshift = get_msb(output_height) + get_msb(output_width);
+    }
+  } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < 16 / inc; ++i) {
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+        __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+
+  } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4
+
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+
+        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+        __m256i V_H_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+        __m256i V_H_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < 16 / inc; ++i) {
+        prev = curr;
+        curr = (curr == 0) ? 1 : 0;
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+
+        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+
+  } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
+    range = output_width / 16;
+    if (output_height == 8) inc = 2;
+    if (output_height == 4) inc = 4;
+    unsigned int nloop = 16 / inc;
+    for (j = 0; j < range * output_height * inc / 16; j++) {
+      if (j % (output_height * inc / 16) == 0) {
+        src_ptr = src_ptr_ref;
+        src_ptr_ref += 16;
+        dst_ptr = dst_ptr_ref;
+        dst_ptr_ref += 16;
+
+        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+
+        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+        __m256i V_H_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+        __m256i V_H_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+        curr = 0;
+      }
+
+      __m256i sum1 = _mm256_setzero_si256();
+      __m256i sse1 = _mm256_setzero_si256();
+
+      for (i = 0; i < nloop; ++i) {
+        prev = curr;
+        curr = !curr;
+        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+        src_ptr += src_pixels_per_line;
+        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+        __m256i V_V_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+        __m256i V_V_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+        __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+        __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+        __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+        __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+        __m256i V_S_S1 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+        __m256i V_S_S2 =
+            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+        __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+        dst_ptr += dst_stride;
+
+        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+      }
+
+      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+      sum_long += _mm_extract_epi32(v_d, 0);
+      sse_long += _mm_extract_epi32(v_d, 1);
+    }
+
+    rshift = get_msb(output_height) + get_msb(output_width);
+  }
+
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+
+  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
+
+  return (var > 0) ? var : 0;
+}
+
 void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride,
                                 uint32_t *sse, int *sum) {
@@ -118,23 +718,185 @@ static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(16, 4, 16, 6)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
 
 #undef VAR_FN
+
+#define SSE2_HEIGHT(H)                                                 \
+  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
+      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);
+
+SSE2_HEIGHT(8)
+SSE2_HEIGHT(16)
+
+#undef SSE2_Height
+
+#define HIGHBD_SUBPIX_VAR(W, H)                                              \
+  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
+    if (W == 8 && H == 16)                                                   \
+      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
+          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
+    else if (W == 8 && H == 8)                                               \
+      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
+          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
+    else                                                                     \
+      return aom_highbd_var_filter_block2d_bil_avx2(                         \
+          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+  }
+
+HIGHBD_SUBPIX_VAR(128, 128)
+HIGHBD_SUBPIX_VAR(128, 64)
+HIGHBD_SUBPIX_VAR(64, 128)
+HIGHBD_SUBPIX_VAR(64, 64)
+HIGHBD_SUBPIX_VAR(64, 32)
+HIGHBD_SUBPIX_VAR(32, 64)
+HIGHBD_SUBPIX_VAR(32, 32)
+HIGHBD_SUBPIX_VAR(32, 16)
+HIGHBD_SUBPIX_VAR(16, 32)
+HIGHBD_SUBPIX_VAR(16, 16)
+HIGHBD_SUBPIX_VAR(16, 8)
+HIGHBD_SUBPIX_VAR(8, 16)
+HIGHBD_SUBPIX_VAR(8, 8)
+
+#undef HIGHBD_SUBPIX_VAR
+
+uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
+  __m256i src0_8x16, src1_8x16, src_16x16;
+  __m256i dst0_8x16, dst1_8x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  for (int i = 0; i < h; i += 4) {
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
+    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
+    dst0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+    dst1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+    src1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m256i src0_8x16, src1_8x16, src_16x16;
+  __m256i dst0_8x16, dst1_8x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
+    dst1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
+    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+    src1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm
index 0d954e1788..ec6c7e9fa7 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -25,7 +25,7 @@ SECTION .text
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(aom_highbd_calc16x16var_sse2) PRIVATE
+globalsym(aom_highbd_calc16x16var_sse2)
 sym(aom_highbd_calc16x16var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -178,7 +178,7 @@ sym(aom_highbd_calc16x16var_sse2):
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(aom_highbd_calc8x8var_sse2) PRIVATE
+globalsym(aom_highbd_calc8x8var_sse2)
 sym(aom_highbd_calc8x8var_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c
index b7d15f93ec..6bd6a5a3fc 100644
--- a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c
@@ -14,16 +14,12 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
-
 #include "aom_ports/mem.h"
 
-#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
 #include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -134,8 +130,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
     *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
   }
 
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
 
 #undef HIGH_GET_VAR
 
@@ -180,23 +176,23 @@ HIGH_GET_VAR(8);
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(64, 16, 16, 10)
 
 #undef VAR_FN
 
@@ -275,10 +271,10 @@ unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
       const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
       unsigned int *sse, void *unused0, void *unused);
 #define DECLS(opt) \
-  DECL(8, opt);    \
+  DECL(8, opt)     \
   DECL(16, opt)
 
-DECLS(sse2);
+DECLS(sse2)
 
 #undef DECLS
 #undef DECL
@@ -423,28 +419,28 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                          \
-  FN(128, 128, 16, 7, 7, opt, (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (int64_t));   \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));    \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));     \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));      \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));      \
-  FN(16, 4, 16, 4, 2, opt, (int64_t));    \
-  FN(8, 32, 8, 3, 5, opt, (int64_t));     \
-  FN(32, 8, 16, 5, 3, opt, (int64_t));    \
-  FN(16, 64, 16, 4, 6, opt, (int64_t));   \
+#define FNS(opt)                         \
+  FN(128, 128, 16, 7, 7, opt, (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (int64_t))   \
+  FN(16, 8, 16, 4, 3, opt, (int64_t))    \
+  FN(8, 16, 8, 3, 4, opt, (int64_t))     \
+  FN(8, 8, 8, 3, 3, opt, (int64_t))      \
+  FN(8, 4, 8, 3, 2, opt, (int64_t))      \
+  FN(16, 4, 16, 4, 2, opt, (int64_t))    \
+  FN(8, 32, 8, 3, 5, opt, (int64_t))     \
+  FN(32, 8, 16, 5, 3, opt, (int64_t))    \
+  FN(16, 64, 16, 4, 6, opt, (int64_t))   \
   FN(64, 16, 16, 6, 4, opt, (int64_t))
 
-FNS(sse2);
+FNS(sse2)
 
 #undef FNS
 #undef FN
@@ -460,7 +456,7 @@ FNS(sse2);
   DECL(16, opt)    \
   DECL(8, opt)
 
-DECLS(sse2);
+DECLS(sse2)
 #undef DECL
 #undef DECLS
 
@@ -590,156 +586,29 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t));  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t));   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t));  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)); \
-  FN(64, 16, 16, 6, 4, opt, (int64_t));
-
-FNS(sse2);
+#define FNS(opt)                       \
+  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
+  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt, (int64_t))    \
+  FN(16, 4, 16, 4, 2, opt, (int64_t))  \
+  FN(8, 32, 8, 3, 5, opt, (int64_t))   \
+  FN(32, 8, 16, 5, 3, opt, (int64_t))  \
+  FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+  FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
 
 #undef FNS
 #undef FN
 
-void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
-                                    const struct AV1Common *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred8, int width, int height,
-                                    int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride, int bd,
-                                    int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      /*Read 8 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 8) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
-          comp_pred += 8;
-          ref += 8;
-        }
-        ref += ref_stride - width;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      /*Read 4 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
-        comp_pred += 8;
-        ref += 2 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
-                               NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                              kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
-    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
-                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
-                                     : temp;
-    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(
-        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
-        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
-    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
-                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
-                              height, bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
-  assert(!(width * height & 7));
-  int n = width * height >> 3;
-  for (int i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
-
 static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
                                                     const __m128i *w0,
                                                     const __m128i *w1,
@@ -806,37 +675,99 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2(
   }
 }
 
-void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  int n;
-  int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  assert(!(width * height & 7));
-  n = width * height >> 3;
+uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i reg0_4x16, reg1_4x16;
+  __m128i src_8x16;
+  __m128i dst_8x16;
+  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m128i sub_result_8x16;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+  for (int i = 0; i < h; i += 2) {
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
 
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
 
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred16);
-    __m128i p1 = xx_loadu_128(pred);
+uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i src_8x16;
+  __m128i dst_8x16;
+  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m128i sub_result_8x16;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+
+  for (int i = 0; i < h; i++) {
+    dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
+    src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
 
-    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
 
-    comp_pred16 += 8;
-    pred += 8;
+uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
   }
 }
diff --git a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c
index 546ee74bb3..b4b5ce2880 100644
--- a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c
@@ -13,6 +13,7 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 
 static INLINE __m256i dc_sum_64(const uint8_t *ref) {
@@ -1989,7 +1990,7 @@ static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2115,7 +2116,7 @@ static void highbd_dr_prediction_z2_Nx4_avx2(
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2237,7 +2238,7 @@ static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2387,7 +2388,7 @@ static void highbd_dr_prediction_z2_Nx8_avx2(
     int base_x = (-y * dx) >> frac_bits_x;
     int base_shift = 0;
     if (base_x < (min_base_x - 1)) {
-      base_shift = (min_base_x - base_x) >> upsample_above;
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
     }
     int base_min_diff =
         (min_base_x - base_x + upsample_above) >> upsample_above;
@@ -2529,7 +2530,7 @@ static void highbd_dr_prediction_32bit_z2_HxW_avx2(
   a16 = _mm256_set1_epi32(16);
   c1 = _mm256_srli_epi32(a16, 4);
   c8 = _mm256_srli_epi32(a16, 1);
-  min_base_y256 = _mm256_set1_epi16(min_base_y);
+  min_base_y256 = _mm256_set1_epi32(min_base_y);
   c3f = _mm256_set1_epi32(0x3f);
   dy256 = _mm256_set1_epi32(dy);
   c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
@@ -3517,46 +3518,6 @@ static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
 };
 
-static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
-  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
-  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
-  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
-  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
-  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-};
-
-static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
-  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
-  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
-  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
-  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
-  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
-  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
-  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
-  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
-};
-/* clang-format off */
-static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
-  { -1,  0,  0,  0,  0,  0,  0,  0},
-  { -1, -1,  0,  0,  0,  0,  0,  0},
-  { -1, -1, -1,  0,  0,  0,  0,  0},
-  { -1, -1, -1, -1,  0,  0,  0,  0},
-  { -1, -1, -1, -1, -1,  0,  0,  0},
-  { -1, -1, -1, -1, -1, -1,  0,  0},
-  { -1, -1, -1, -1, -1, -1, -1,  0},
-  { -1, -1, -1, -1, -1, -1, -1, -1},
-};
 /* clang-format on */
 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
     int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
@@ -4278,44 +4239,6 @@ void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 }
 
 // z3 functions
-static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
-  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
-  w0 = _mm_unpacklo_epi8(x[0], x[1]);
-  w1 = _mm_unpacklo_epi8(x[2], x[3]);
-  w2 = _mm_unpackhi_epi8(x[0], x[1]);
-  w3 = _mm_unpackhi_epi8(x[2], x[3]);
-
-  ww0 = _mm_unpacklo_epi16(w0, w1);
-  ww1 = _mm_unpacklo_epi16(w2, w3);
-  ww2 = _mm_unpackhi_epi16(w0, w1);
-  ww3 = _mm_unpackhi_epi16(w2, w3);
-
-  w0 = _mm_unpacklo_epi32(ww0, ww1);
-  w2 = _mm_unpacklo_epi32(ww2, ww3);
-  w1 = _mm_unpackhi_epi32(ww0, ww1);
-  w3 = _mm_unpackhi_epi32(ww2, ww3);
-
-  d[0] = _mm_unpacklo_epi64(w0, w2);
-  d[1] = _mm_unpackhi_epi64(w0, w2);
-  d[2] = _mm_unpacklo_epi64(w1, w3);
-  d[3] = _mm_unpackhi_epi64(w1, w3);
-
-  d[4] = _mm_srli_si128(d[0], 8);
-  d[5] = _mm_srli_si128(d[1], 8);
-  d[6] = _mm_srli_si128(d[2], 8);
-  d[7] = _mm_srli_si128(d[3], 8);
-
-  d[8] = _mm_srli_si128(d[0], 4);
-  d[9] = _mm_srli_si128(d[1], 4);
-  d[10] = _mm_srli_si128(d[2], 4);
-  d[11] = _mm_srli_si128(d[3], 4);
-
-  d[12] = _mm_srli_si128(d[0], 12);
-  d[13] = _mm_srli_si128(d[1], 12);
-  d[14] = _mm_srli_si128(d[2], 12);
-  d[15] = _mm_srli_si128(d[3], 12);
-}
-
 static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
   __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   __m256i w10, w11, w12, w13, w14, w15;
@@ -4406,117 +4329,6 @@ static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
   d[15] = _mm256_unpackhi_epi64(w7, w15);
 }
 
-static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(x[0], x[1]);
-  w1 = _mm_unpacklo_epi8(x[2], x[3]);
-  w2 = _mm_unpacklo_epi8(x[4], x[5]);
-  w3 = _mm_unpacklo_epi8(x[6], x[7]);
-
-  w8 = _mm_unpacklo_epi8(x[8], x[9]);
-  w9 = _mm_unpacklo_epi8(x[10], x[11]);
-  w10 = _mm_unpacklo_epi8(x[12], x[13]);
-  w11 = _mm_unpacklo_epi8(x[14], x[15]);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  d[0] = _mm_unpacklo_epi64(w6, w14);
-  d[1] = _mm_unpackhi_epi64(w6, w14);
-  d[2] = _mm_unpacklo_epi64(w7, w15);
-  d[3] = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  d[4] = _mm_unpacklo_epi64(w6, w14);
-  d[5] = _mm_unpackhi_epi64(w6, w14);
-  d[6] = _mm_unpacklo_epi64(w7, w15);
-  d[7] = _mm_unpackhi_epi64(w7, w15);
-
-  // upper half
-  w0 = _mm_unpackhi_epi8(x[0], x[1]);
-  w1 = _mm_unpackhi_epi8(x[2], x[3]);
-  w2 = _mm_unpackhi_epi8(x[4], x[5]);
-  w3 = _mm_unpackhi_epi8(x[6], x[7]);
-
-  w8 = _mm_unpackhi_epi8(x[8], x[9]);
-  w9 = _mm_unpackhi_epi8(x[10], x[11]);
-  w10 = _mm_unpackhi_epi8(x[12], x[13]);
-  w11 = _mm_unpackhi_epi8(x[14], x[15]);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  d[8] = _mm_unpacklo_epi64(w6, w14);
-  d[9] = _mm_unpackhi_epi64(w6, w14);
-  d[10] = _mm_unpacklo_epi64(w7, w15);
-  d[11] = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  d[12] = _mm_unpacklo_epi64(w6, w14);
-  d[13] = _mm_unpackhi_epi64(w6, w14);
-  d[14] = _mm_unpacklo_epi64(w7, w15);
-  d[15] = _mm_unpackhi_epi64(w7, w15);
-}
-
-static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
-                               uint8_t *dst, ptrdiff_t pitchDst) {
-  __m128i r[16];
-  __m128i d[16];
-  for (int j = 0; j < 16; j++) {
-    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
-  }
-  transpose16x16_sse2(r, d);
-  for (int j = 0; j < 16; j++) {
-    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
-  }
-}
-
-static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
-                      ptrdiff_t pitchDst, int width, int height) {
-  for (int j = 0; j < height; j += 16)
-    for (int i = 0; i < width; i += 16)
-      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
-                         dst + j * pitchDst + i, pitchDst);
-}
-
 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse4.c b/media/libaom/src/aom_dsp/x86/intrapred_sse4.c
new file mode 100644
index 0000000000..b73258038b
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/intrapred_sse4.c
@@ -0,0 +1,1312 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Low bit depth functions
+static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
+  { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+      0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+      0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+      0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+      0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff } },
+  {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+        0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+        0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+        0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+        0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0xff, 0, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0xff, 0xff, 0, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0xff, 0xff, 0xff, 0 },
+      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0xff, 0xff, 0xff, 0xff },
+  },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
+    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+    int dx) {
+  const int frac_bits = 6 - upsample_above;
+  const int max_base_x = ((W + H) - 1) << upsample_above;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i a0, a1, a32, a16;
+  __m128i diff, c3f;
+  __m128i a_mbase_x;
+
+  a16 = _mm_set1_epi16(16);
+  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  c3f = _mm_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < W; r++) {
+    __m128i b, res, res1, shift;
+    __m128i a0_above, a1_above;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base) >> upsample_above;
+    if (base_max_diff <= 0) {
+      for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save 4 values
+      }
+      return;
+    }
+    if (base_max_diff > H) base_max_diff = H;
+    a0_above = _mm_loadu_si128((__m128i *)(above + base));
+    a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+    if (upsample_above) {
+      a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
+      a1_above = _mm_srli_si128(a0_above, 8);
+
+      shift = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
+          1);
+    } else {
+      shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+    }
+    // lower half
+    a0 = _mm_cvtepu8_epi16(a0_above);
+    a1 = _mm_cvtepu8_epi16(a1_above);
+
+    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm_mullo_epi16(diff, shift);
+    res = _mm_add_epi16(a32, b);
+    res = _mm_srli_epi16(res, 5);
+
+    // uppar half
+    a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+    a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+    b = _mm_mullo_epi16(diff, shift);
+    res1 = _mm_add_epi16(a32, b);
+    res1 = _mm_srli_epi16(res1, 5);
+
+    res = _mm_packus_epi16(res, res1);
+
+    dst[r] =
+        _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        int upsample_above, int dx) {
+  __m128i dstvec[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+  }
+}
+
+static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        int upsample_above, int dx) {
+  __m128i dstvec[32];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+  }
+}
+
+static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *above,
+                                         int upsample_above, int dx) {
+  __m128i dstvec[64];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
+                                       dx);
+  for (int i = 0; i < N; i++) {
+    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
+    int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
+    int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i a0, a1, a32, a16;
+  __m128i a_mbase_x, diff, c3f;
+
+  a16 = _mm_set1_epi16(16);
+  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  c3f = _mm_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m128i b, res, res1, res16[2];
+    __m128i a0_above, a1_above;
+
+    int base = x >> frac_bits;
+    int base_max_diff = (max_base_x - base);
+    if (base_max_diff <= 0) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 32 values
+        dstvec_h[i] = a_mbase_x;
+      }
+      return;
+    }
+    if (base_max_diff > 32) base_max_diff = 32;
+    __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+
+    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+      int mdiff = base_max_diff - j;
+      if (mdiff <= 0) {
+        res16[jj] = a_mbase_x;
+      } else {
+        a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
+        a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+
+        // lower half
+        a0 = _mm_cvtepu8_epi16(a0_above);
+        a1 = _mm_cvtepu8_epi16(a1_above);
+
+        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm_mullo_epi16(diff, shift);
+
+        res = _mm_add_epi16(a32, b);
+        res = _mm_srli_epi16(res, 5);
+
+        // uppar half
+        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shift);
+        res1 = _mm_add_epi16(a32, b);
+        res1 = _mm_srli_epi16(res1, 5);
+
+        res16[jj] = _mm_packus_epi16(res, res1);  // 16 8bit values
+      }
+    }
+
+    dstvec[r] =
+        _mm_blendv_epi8(a_mbase_x, res16[0],
+                        *(__m128i *)Mask[0][base_max_diff]);  // 16 8bit values
+
+    dstvec_h[r] =
+        _mm_blendv_epi8(a_mbase_x, res16[1],
+                        *(__m128i *)Mask[1][base_max_diff]);  // 16 8bit values
+    x += dx;
+  }
+}
+
+static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *above,
+                                         int upsample_above, int dx) {
+  __m128i dstvec[64], dstvec_h[64];
+  dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
+                                        upsample_above, dx);
+  for (int i = 0; i < N; i++) {
+    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+    _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
+  }
+}
+
+static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *above,
+                                         int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((64 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i a0, a1, a32, a16;
+  __m128i a_mbase_x, diff, c3f;
+  __m128i max_base, base_inc, mask;
+
+  a16 = _mm_set1_epi16(16);
+  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  max_base = _mm_set1_epi8(max_base_x);
+  c3f = _mm_set1_epi16(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++, dst += stride) {
+    __m128i b, res, res1;
+    int base = x >> frac_bits;
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        _mm_storeu_si128((__m128i *)dst, a_mbase_x);  // save 32 values
+        _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
+        _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
+        _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
+        dst += stride;
+      }
+      return;
+    }
+
+    __m128i shift =
+        _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 8 element
+
+    __m128i a0_above, a1_above, res_val;
+    for (int j = 0; j < 64; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
+      } else {
+        a0_above =
+            _mm_loadu_si128((__m128i *)(above + base + j));  // load 16 element
+        a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+
+        // lower half
+        a0 = _mm_cvtepu8_epi16(a0_above);
+        a1 = _mm_cvtepu8_epi16(a1_above);
+
+        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+        b = _mm_mullo_epi16(diff, shift);
+
+        res = _mm_add_epi16(a32, b);
+        res = _mm_srli_epi16(res, 5);
+
+        // uppar half
+        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shift);
+        res1 = _mm_add_epi16(a32, b);
+        res1 = _mm_srli_epi16(res1, 5);
+
+        res = _mm_packus_epi16(res, res1);  // 16 8bit values
+
+        base_inc =
+            _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
+                          (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
+                          (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
+                          (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
+                          (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
+                          (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
+                          (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
+                          (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
+
+        mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
+                              _mm_setzero_si128());
+        res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
+        _mm_storeu_si128((__m128i *)(dst + j), res_val);
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+  switch (bw) {
+    case 4:
+      dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 8:
+      dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 16:
+      dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 32:
+      dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+      break;
+    case 64:
+      dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+      break;
+    default: assert(0 && "Invalid block size");
+  }
+  return;
+}
+
+static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, int upsample_above,
+                                        int upsample_left, int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i a0_x, a1_x, a32, diff;
+
+  const __m128i c3f = _mm_set1_epi16(0x3f);
+  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+  const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+  const __m128i dy_reg = _mm_set1_epi16(dy);
+  const __m128i a16 = _mm_set1_epi16(16);
+
+  for (int r = 0; r < N; r++) {
+    __m128i b, res, shift, r6, ydx;
+    __m128i resx, resy, resxy;
+    __m128i a0_above, a1_above;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm_setzero_si128();
+      a1_x = _mm_setzero_si128();
+      shift = _mm_setzero_si128();
+    } else {
+      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      ydx = _mm_set1_epi16(y * dx);
+      r6 = _mm_slli_epi16(c1234, 6);
+
+      if (upsample_above) {
+        a0_above =
+            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+        a1_above = _mm_srli_si128(a0_above, 8);
+
+        shift = _mm_srli_epi16(
+            _mm_and_si128(
+                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+            1);
+      } else {
+        a0_above =
+            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+        a1_above = _mm_srli_si128(a0_above, 1);
+
+        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+      }
+      a0_x = _mm_cvtepu8_epi16(a0_above);
+      a1_x = _mm_cvtepu8_epi16(a1_above);
+    }
+    // y calc
+    __m128i a0_y, a1_y, shifty;
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+      __m128i y_c, base_y_c_reg, mask, c1234_;
+      c1234_ = _mm_srli_si128(c1234, 2);
+      r6 = _mm_set1_epi16(r << 6);
+      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
+      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+      }
+      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+      shift = _mm_unpacklo_epi64(shift, shifty);
+    }
+
+    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm_mullo_epi16(diff, shift);
+    res = _mm_add_epi16(a32, b);
+    res = _mm_srli_epi16(res, 5);
+
+    resx = _mm_packus_epi16(res, res);
+    resy = _mm_srli_si128(resx, 4);
+
+    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+    *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, int upsample_above,
+                                        int upsample_left, int dx, int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m128i diff, a32;
+  __m128i a0_x, a1_x, a0_y, a1_y;
+  __m128i a0_above, a1_above;
+
+  const __m128i a16 = _mm_set1_epi16(16);
+  const __m128i c3f = _mm_set1_epi16(0x3f);
+  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+  const __m128i dy_reg = _mm_set1_epi16(dy);
+  const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+  for (int r = 0; r < N; r++) {
+    __m128i b, res, res1, shift, shifty;
+    __m128i resx, resy, resxy, r6, ydx;
+
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x - 1) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 8) {
+      base_min_diff = 8;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 7) {
+      a0_x = _mm_setzero_si128();
+      a1_x = _mm_setzero_si128();
+      a0_y = _mm_setzero_si128();
+      a1_y = _mm_setzero_si128();
+      shift = _mm_setzero_si128();
+    } else {
+      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      ydx = _mm_set1_epi16(y * dx);
+      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
+      if (upsample_above) {
+        a0_above =
+            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+        a1_above = _mm_srli_si128(a0_above, 8);
+
+        shift = _mm_srli_epi16(
+            _mm_and_si128(
+                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+            1);
+      } else {
+        a1_above = _mm_srli_si128(a0_above, 1);
+        a0_above =
+            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+        a1_above =
+            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+      }
+      a0_x = _mm_cvtepu8_epi16(a0_above);
+      a1_x = _mm_cvtepu8_epi16(a1_above);
+      a0_y = _mm_setzero_si128();
+      a1_y = _mm_setzero_si128();
+      shifty = shift;
+    }
+
+    // y calc
+    if (base_x < min_base_x) {
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+      __m128i y_c, base_y_c_reg, mask;
+      r6 = _mm_set1_epi16(r << 6);
+      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
+      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]],
+                            left[base_y_c[4]], left[base_y_c[5]],
+                            left[base_y_c[6]], left[base_y_c[7]]);
+      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                            left[base_y_c[2]], left[base_y_c[3]],
+                            left[base_y_c[4]], left[base_y_c[5]],
+                            left[base_y_c[6]], left[base_y_c[7]]);
+
+      if (upsample_left) {
+        shifty = _mm_srli_epi16(
+            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+      } else {
+        shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+      }
+    }
+
+    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm_mullo_epi16(diff, shift);
+    res = _mm_add_epi16(a32, b);
+    res = _mm_srli_epi16(res, 5);
+
+    diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
+    a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
+    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+    b = _mm_mullo_epi16(diff, shifty);
+    res1 = _mm_add_epi16(a32, b);
+    res1 = _mm_srli_epi16(res1, 5);
+
+    resx = _mm_packus_epi16(res, res);
+    resy = _mm_packus_epi16(res1, res1);
+
+    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+    _mm_storel_epi64((__m128i *)(dst), resxy);
+    dst += stride;
+  }
+}
+
+static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
+                                        ptrdiff_t stride, const uint8_t *above,
+                                        const uint8_t *left, int upsample_above,
+                                        int upsample_left, int dx, int dy) {
+  // here upsample_above and upsample_left are 0 by design of
+  // av1_use_intra_edge_upsample
+  const int min_base_x = -1;
+  const int min_base_y = -1;
+  (void)upsample_above;
+  (void)upsample_left;
+  const int frac_bits_x = 6;
+  const int frac_bits_y = 6;
+
+  __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
+  __m128i diff, shifty, shifty_h;
+  __m128i a0_above, a1_above;
+
+  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+  const __m128i a16 = _mm_set1_epi16(16);
+  const __m128i c1 = _mm_srli_epi16(a16, 4);
+  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+  const __m128i c3f = _mm_set1_epi16(0x3f);
+  const __m128i dy256 = _mm_set1_epi16(dy);
+  const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+  const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i c1234 = _mm_add_epi16(c0123, c1);
+  const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);
+
+  for (int r = 0; r < H; r++) {
+    __m128i b, res, res1, shift, reg_j, r6, ydx;
+    __m128i resx, resy;
+    __m128i resxy;
+    int y = r + 1;
+    ydx = _mm_set1_epi16((uint16_t)(y * dx));
+
+    int base_x = (-y * dx) >> frac_bits_x;
+    for (int j = 0; j < W; j += 16) {
+      reg_j = _mm_set1_epi16(j);
+      int base_shift = 0;
+      if ((base_x + j) < (min_base_x - 1)) {
+        base_shift = (min_base_x - (base_x + j) - 1);
+      }
+      int base_min_diff = (min_base_x - base_x - j);
+      if (base_min_diff > 16) {
+        base_min_diff = 16;
+      } else {
+        if (base_min_diff < 0) base_min_diff = 0;
+      }
+
+      if (base_shift < 16) {
+        a0_above =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+        a1_above =
+            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+        a0_above =
+            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+        a1_above =
+            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+        a0_x = _mm_cvtepu8_epi16(a0_above);
+        a1_x = _mm_cvtepu8_epi16(a1_above);
+
+        r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
+        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shift);
+        res = _mm_add_epi16(a32, b);
+        res = _mm_srli_epi16(res, 5);  // 16 16-bit values
+
+        a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+        a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+        r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
+        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shift);
+        res1 = _mm_add_epi16(a32, b);
+        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values
+
+        resx = _mm_packus_epi16(res, res1);
+      } else {
+        resx = _mm_setzero_si128();
+      }
+
+      // y calc
+      if (base_x < min_base_x) {
+        __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
+        __m128i mask, mask_h, mul16, mul16_h;
+        r6 = _mm_set1_epi16(r << 6);
+        c_reg = _mm_add_epi16(reg_j, c1234);
+        c_reg_h = _mm_add_epi16(reg_j, c1234_h);
+        mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
+                              _mm_srli_epi16(min_y_base, 1));
+        mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
+                                _mm_srli_epi16(min_y_base, 1));
+        y_reg = _mm_sub_epi16(r6, mul16);
+        y_reg_h = _mm_sub_epi16(r6, mul16_h);
+
+        base_y = _mm_srai_epi16(y_reg, frac_bits_y);
+        base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
+        mask = _mm_cmpgt_epi16(min_y_base, base_y);
+        mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
+
+        base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
+        base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
+        int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
+        int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
+        int16_t offset_diff = max_y - min_y;
+
+        if (offset_diff < 16) {
+          __m128i min_y_reg = _mm_set1_epi16(min_y);
+
+          __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
+          __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
+          __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);
+
+          __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
+          __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
+          __m128i LoadMask =
+              _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
+
+          a0_mask = _mm_and_si128(a0_mask, LoadMask);
+          a1_mask = _mm_and_si128(a1_mask, LoadMask);
+
+          a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
+          a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
+          a0_y = _mm_cvtepu8_epi16(a0_mask);
+          a1_y = _mm_cvtepu8_epi16(a1_mask);
+          a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
+          a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
+        } else {
+          base_y = _mm_andnot_si128(mask, base_y);
+          base_y_h = _mm_andnot_si128(mask_h, base_y_h);
+          _mm_store_si128((__m128i *)base_y_c, base_y);
+          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+          a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                                left[base_y_c[2]], left[base_y_c[3]],
+                                left[base_y_c[4]], left[base_y_c[5]],
+                                left[base_y_c[6]], left[base_y_c[7]]);
+          a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+                                  left[base_y_c[10]], left[base_y_c[11]],
+                                  left[base_y_c[12]], left[base_y_c[13]],
+                                  left[base_y_c[14]], left[base_y_c[15]]);
+          base_y = _mm_add_epi16(base_y, c1);
+          base_y_h = _mm_add_epi16(base_y_h, c1);
+          _mm_store_si128((__m128i *)base_y_c, base_y);
+          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+          a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+                                left[base_y_c[2]], left[base_y_c[3]],
+                                left[base_y_c[4]], left[base_y_c[5]],
+                                left[base_y_c[6]], left[base_y_c[7]]);
+          a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+                                  left[base_y_c[10]], left[base_y_c[11]],
+                                  left[base_y_c[12]], left[base_y_c[13]],
+                                  left[base_y_c[14]], left[base_y_c[15]]);
+        }
+        shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
+        shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
+
+        diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shifty);
+        res = _mm_add_epi16(a32, b);
+        res = _mm_srli_epi16(res, 5);  // 16 16-bit values
+
+        diff = _mm_sub_epi16(a1_y_h, a0_y_h);  // a[x+1] - a[x]
+        a32 = _mm_slli_epi16(a0_y_h, 5);       // a[x] * 32
+        a32 = _mm_add_epi16(a32, a16);         // a[x] * 32 + 16
+
+        b = _mm_mullo_epi16(diff, shifty_h);
+        res1 = _mm_add_epi16(a32, b);
+        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values
+        resy = _mm_packus_epi16(res, res1);
+      } else {
+        resy = _mm_setzero_si128();
+      }
+      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+      _mm_storeu_si128((__m128i *)(dst + j), resxy);
+    }  // for j
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_above, int upsample_left, int dx,
+                                 int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+  switch (bw) {
+    case 4:
+      dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
+                                  upsample_left, dx, dy);
+      break;
+    case 8:
+      dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
+                                  upsample_left, dx, dy);
+      break;
+    default:
+      dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
+                                  upsample_above, upsample_left, dx, dy);
+  }
+  return;
+}
+
+// z3 functions
+static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[4], d[4];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+                            &d[0], &d[1], &d[2], &d[3]);
+
+  *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+  *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+  *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+  *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+  return;
+}
+
+static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[8], d[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+                    &d[3]);
+
+  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[4], d[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
+  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+  for (int i = 0; i < 8; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+  }
+}
+
+static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i dstvec[8], d[4];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
+  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+                        &d[1], &d[2], &d[3]);
+  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[8], d[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
+  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+  for (int i = 0; i < 8; i++) {
+    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+                     _mm_srli_si128(d[i], 8));
+  }
+}
+
+static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[16], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+  for (int i = 0; i < 8; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[4], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
+  transpose4x16_sse2(dstvec, d);
+  for (int i = 0; i < 16; i++) {
+    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+  }
+}
+
+static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[16], d[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
+  for (int i = 4; i < 8; i++) {
+    d[i] = _mm_setzero_si128();
+  }
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+  for (int i = 0; i < 4; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+  dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
+                                        upsample_left, dy);
+  for (int i = 8; i < 16; i++) {
+    dstvec[i] = _mm_setzero_si128();
+    dstvec_h[i] = _mm_setzero_si128();
+  }
+  transpose16x16_sse2(dstvec, d);
+  transpose16x16_sse2(dstvec_h, d_h);
+
+  for (int i = 0; i < 16; i++) {
+    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+  }
+  for (int i = 0; i < 16; i++) {
+    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
+  }
+}
+
+static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i dstvec[32], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);
+
+  transpose16x8_8x16_sse2(
+      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+      &d[3], &d[4], &d[5], &d[6], &d[7]);
+  transpose16x8_8x16_sse2(
+      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+      &d[6 + 8], &d[7 + 8]);
+
+  for (int i = 0; i < 8; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+  }
+}
+
+static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i dstvec[16], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
+  transpose16x16_sse2(dstvec, d);
+
+  for (int i = 0; i < 16; i++) {
+    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+  }
+}
+
+static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];
+
+  dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
+                                        upsample_left, dy);
+  transpose16x16_sse2(dstvec, d);
+  transpose16x16_sse2(dstvec_h, d_h);
+  transpose16x16_sse2(dstvec + 16, d + 16);
+  transpose16x16_sse2(dstvec_h + 16, d_h + 16);
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+    _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
+  }
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
+  }
+}
+
+static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  uint8_t dstT[64 * 64];
+  dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+  dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
+                                        upsample_left, dy);
+  transpose16x16_sse2(dstvec, d);
+  transpose16x16_sse2(dstvec_h, d_h);
+  // store
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+  }
+}
+
+static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i dstvec[32], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 32; i += 16) {
+    transpose16x16_sse2((dstvec + i), d);
+    for (int j = 0; j < 16; j++) {
+      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+    }
+  }
+}
+
+static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  uint8_t dstT[64 * 32];
+  dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  uint8_t dstT[32 * 64];
+  dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
+  transpose(dstT, 32, dst, stride, 64, 32);
+  return;
+}
+
+static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  uint8_t dstT[64 * 16];
+  dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
+  transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i dstvec[64], d[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
+  for (int i = 0; i < 64; i += 16) {
+    transpose16x16_sse2(dstvec + i, d);
+    for (int j = 0; j < 16; j++) {
+      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+    }
+  }
+}
+
+void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (bw == bh) {
+    switch (bw) {
+      case 4:
+        dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
+        break;
+      case 8:
+        dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
+        break;
+      case 16:
+        dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
+        break;
+      case 32:
+        dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
+        break;
+      case 64:
+        dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
+        break;
+      default: assert(0 && "Invalid block size");
+    }
+  } else {
+    if (bw < bh) {
+      if (bw + bw == bh) {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          default: assert(0 && "Invalid block size");
+        }
+      } else {
+        switch (bw) {
+          case 4:
+            dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          default: assert(0 && "Invalid block size");
+        }
+      }
+    } else {
+      if (bh + bh == bw) {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 32:
+            dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          default: assert(0 && "Invalid block size");
+        }
+      } else {
+        switch (bh) {
+          case 4:
+            dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 8:
+            dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          case 16:
+            dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
+            break;
+          default: assert(0 && "Invalid block size");
+        }
+      }
+    }
+  }
+}
diff --git a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c
index 5a34ea0c8e..f0bd040087 100644
--- a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c
@@ -607,21 +607,22 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
 // weight_h[2]: same as [0], second half for height = 16 only
 // weight_h[3]: same as [1], second half for height = 16 only
 // weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
+static INLINE void load_weight_w4(int height, __m128i *weight_h,
+                                  __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
   weight_h[0] = _mm_unpacklo_epi8(t, zero);
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
 
   if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
   } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
@@ -632,7 +633,7 @@ static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                    const __m128i *ww, int h, uint8_t *dst,
                                    ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
@@ -652,7 +653,7 @@ static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
 
     sum = _mm_add_epi32(s, sum);
     sum = _mm_add_epi32(sum, round);
-    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
+    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
 
     sum = _mm_shuffle_epi8(sum, gat);
     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
@@ -669,7 +670,7 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w4(above, left, 4, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 4, wh, ww);
+  load_weight_w4(4, wh, ww);
 
   smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
 }
@@ -680,7 +681,7 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w4(above, left, 8, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+  load_weight_w4(8, wh, ww);
 
   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
 }
@@ -692,7 +693,7 @@ void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w4(above, left, 16, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 16, wh, ww);
+  load_weight_w4(16, wh, ww);
 
   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
   dst += stride << 3;
@@ -743,13 +744,13 @@ static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
 // weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
+static INLINE void load_weight_w8(int height, __m128i *weight_h,
+                                  __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-  const int we_offset = height < 8 ? 4 : 8;
-  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
+  const int we_offset = height < 8 ? 0 : 4;
+  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
   if (height == 4) {
@@ -764,20 +765,20 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
   }
 
   if (height == 16) {
-    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
     weight_h[0] = _mm_unpacklo_epi8(we, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(we, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   } else if (height == 32) {
     const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
     weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
     weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
     weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
@@ -788,7 +789,7 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
 static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                    const __m128i *ww, int h, uint8_t *dst,
                                    ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
@@ -812,11 +813,11 @@ static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
 
     s0 = _mm_add_epi32(s0, sum0);
     s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
+    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
 
     s1 = _mm_add_epi32(s1, sum1);
     s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
+    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
 
     sum0 = _mm_packus_epi16(s0, s1);
     sum0 = _mm_shuffle_epi8(sum0, gat);
@@ -834,7 +835,7 @@ void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w8(above, left, 4, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 4, wh, ww);
+  load_weight_w8(4, wh, ww);
 
   smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
 }
@@ -845,7 +846,7 @@ void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w8(above, left, 8, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 8, wh, ww);
+  load_weight_w8(8, wh, ww);
 
   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
 }
@@ -857,7 +858,7 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w8(above, left, 16, pixels);
 
   __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 16, wh, ww);
+  load_weight_w8(16, wh, ww);
 
   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
   dst += stride << 3;
@@ -871,7 +872,7 @@ void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_w8(above, left, 32, pixels);
 
   __m128i wh[8], ww[2];
-  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+  load_weight_w8(32, wh, ww);
 
   smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
   dst += stride << 3;
@@ -886,17 +887,18 @@ static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left, uint32_t bw,
                                         uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
+  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i top_right =
       _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i round =
+      _mm_set1_epi32((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   for (uint32_t y = 0; y < bh; ++y) {
     const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
@@ -931,8 +933,8 @@ static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
       pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
       pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
 
-      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
-      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+      pred_lo = _mm_srai_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+      pred_hi = _mm_srai_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
 
       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
       pred = _mm_shuffle_epi8(pred, gat);
@@ -1029,22 +1031,22 @@ static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
 
 // weights[0]: weights_h vector
 // weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
+static INLINE void load_weight_v_w4(int height, __m128i *weights) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   if (height == 4) {
     const __m128i weight =
-        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+        _mm_cvtsi32_si128(((const uint32_t *)smooth_weights)[0]);
     weights[0] = _mm_unpacklo_epi8(weight, zero);
     weights[1] = _mm_sub_epi16(d, weights[0]);
   } else if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
     weights[0] = _mm_unpacklo_epi8(weight, zero);
     weights[1] = _mm_sub_epi16(d, weights[0]);
   } else {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
     weights[0] = _mm_unpacklo_epi8(weight, zero);
     weights[1] = _mm_sub_epi16(d, weights[0]);
     weights[2] = _mm_unpackhi_epi8(weight, zero);
@@ -1055,7 +1057,8 @@ static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
 static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                      const __m128i *weight, int h, uint8_t *dst,
                                      ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i pred_round =
+      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
   __m128i d = _mm_set1_epi16(0x100);
@@ -1066,7 +1069,7 @@ static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
     sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_srai_epi32(sum, SMOOTH_WEIGHT_LOG2_SCALE);
     sum = _mm_shuffle_epi8(sum, gat);
     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
     dst += stride;
@@ -1081,7 +1084,7 @@ void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w4(above, left, 4, &pixels);
 
   __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 4, weights);
+  load_weight_v_w4(4, weights);
 
   smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
 }
@@ -1093,7 +1096,7 @@ void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w4(above, left, 8, &pixels);
 
   __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 8, weights);
+  load_weight_v_w4(8, weights);
 
   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
 }
@@ -1105,7 +1108,7 @@ void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w4(above, left, 16, &pixels);
 
   __m128i weights[4];
-  load_weight_v_w4(sm_weight_arrays, 16, weights);
+  load_weight_v_w4(16, weights);
 
   smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
   dst += stride << 3;
@@ -1132,32 +1135,32 @@ static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
 // weight_h[5]: same as [1], offset 16
 // weight_h[6]: same as [0], offset 24
 // weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_h) {
+static INLINE void load_weight_v_w8(int height, __m128i *weight_h) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
 
   if (height < 16) {
-    const int offset = height < 8 ? 4 : 8;
+    const int offset = height < 8 ? 0 : 4;
     const __m128i weight =
-        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+        _mm_loadu_si128((const __m128i *)&smooth_weights[offset]);
     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
   } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   } else {
     const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
     weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
     const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
     weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
     weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
@@ -1167,7 +1170,8 @@ static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
 
 static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                      int h, uint8_t *dst, ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i pred_round =
+      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   __m128i d = _mm_set1_epi16(0x100);
@@ -1180,10 +1184,10 @@ static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
 
     s0 = _mm_add_epi32(s0, pred_round);
-    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
+    s0 = _mm_srai_epi32(s0, SMOOTH_WEIGHT_LOG2_SCALE);
 
     s1 = _mm_add_epi32(s1, pred_round);
-    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
+    s1 = _mm_srai_epi32(s1, SMOOTH_WEIGHT_LOG2_SCALE);
 
     __m128i sum01 = _mm_packus_epi16(s0, s1);
     sum01 = _mm_shuffle_epi8(sum01, gat);
@@ -1201,7 +1205,7 @@ void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w8(above, left, 4, pixels);
 
   __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 4, wh);
+  load_weight_v_w8(4, wh);
 
   smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
 }
@@ -1213,7 +1217,7 @@ void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w8(above, left, 8, pixels);
 
   __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 8, wh);
+  load_weight_v_w8(8, wh);
 
   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
 }
@@ -1225,7 +1229,7 @@ void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w8(above, left, 16, pixels);
 
   __m128i wh[4];
-  load_weight_v_w8(sm_weight_arrays, 16, wh);
+  load_weight_v_w8(16, wh);
 
   smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
   dst += stride << 3;
@@ -1239,7 +1243,7 @@ void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_v_w8(above, left, 32, pixels);
 
   __m128i wh[8];
-  load_weight_v_w8(sm_weight_arrays, 32, wh);
+  load_weight_v_w8(32, wh);
 
   smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
   dst += stride << 3;
@@ -1254,16 +1258,16 @@ static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left, uint32_t bw,
                                           uint32_t bh) {
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i dup16 = _mm_set1_epi32(0x01000100);
   const __m128i bottom_left =
       _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   const __m128i round =
-      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+      _mm_set1_epi32((uint16_t)(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
 
   for (uint32_t y = 0; y < bh; ++y) {
     const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
@@ -1284,8 +1288,8 @@ static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
 
       pred_lo = _mm_add_epi32(pred_lo, round);
       pred_hi = _mm_add_epi32(pred_hi, round);
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+      pred_lo = _mm_srai_epi32(pred_lo, SMOOTH_WEIGHT_LOG2_SCALE);
+      pred_hi = _mm_srai_epi32(pred_hi, SMOOTH_WEIGHT_LOG2_SCALE);
 
       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
       pred = _mm_shuffle_epi8(pred, gat);
@@ -1384,14 +1388,13 @@ static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
 }
 
 // weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
+static INLINE void load_weight_h_w4(int height, __m128i *weights) {
   (void)height;
-  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+  const __m128i t = _mm_loadu_si128((const __m128i *)&smooth_weights[0]);
   const __m128i zero = _mm_setzero_si128();
 
   const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
   weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
 }
@@ -1399,7 +1402,8 @@ static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
 static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                      const __m128i *weight, int h, uint8_t *dst,
                                      ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i pred_round =
+      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i gat = _mm_set1_epi32(0xc080400);
   __m128i rep = _mm_set1_epi16((short)0x8000);
@@ -1410,7 +1414,7 @@ static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
     __m128i sum = _mm_madd_epi16(b, weight[0]);
 
     sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_srai_epi32(sum, SMOOTH_WEIGHT_LOG2_SCALE);
 
     sum = _mm_shuffle_epi8(sum, gat);
     *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
@@ -1427,7 +1431,7 @@ void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w4(above, left, 4, pixels);
 
   __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 4, &weights);
+  load_weight_h_w4(4, &weights);
 
   smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
 }
@@ -1439,7 +1443,7 @@ void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w4(above, left, 8, pixels);
 
   __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+  load_weight_h_w4(8, &weights);
 
   smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
 }
@@ -1451,7 +1455,7 @@ void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w4(above, left, 16, pixels);
 
   __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+  load_weight_h_w4(8, &weights);
 
   smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
   dst += stride << 3;
@@ -1483,12 +1487,11 @@ static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
 
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_w) {
+static INLINE void load_weight_h_w8(int height, __m128i *weight_w) {
   (void)height;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[4]);
   const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
   const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
   weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
@@ -1498,7 +1501,8 @@ static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
 static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                      int h, uint8_t *dst, ptrdiff_t stride,
                                      int second_half) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i pred_round =
+      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
   __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
@@ -1511,10 +1515,10 @@ static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
     __m128i sum1 = _mm_madd_epi16(b, ww[1]);
 
     sum0 = _mm_add_epi32(sum0, pred_round);
-    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
+    sum0 = _mm_srai_epi32(sum0, SMOOTH_WEIGHT_LOG2_SCALE);
 
     sum1 = _mm_add_epi32(sum1, pred_round);
-    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
+    sum1 = _mm_srai_epi32(sum1, SMOOTH_WEIGHT_LOG2_SCALE);
 
     sum0 = _mm_packus_epi16(sum0, sum1);
     sum0 = _mm_shuffle_epi8(sum0, gat);
@@ -1532,7 +1536,7 @@ void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w8(above, left, 4, pixels);
 
   __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 4, ww);
+  load_weight_h_w8(4, ww);
 
   smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
 }
@@ -1544,7 +1548,7 @@ void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w8(above, left, 8, pixels);
 
   __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 8, ww);
+  load_weight_h_w8(8, ww);
 
   smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
 }
@@ -1556,7 +1560,7 @@ void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w8(above, left, 16, pixels);
 
   __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 16, ww);
+  load_weight_h_w8(16, ww);
 
   smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
   dst += stride << 3;
@@ -1570,7 +1574,7 @@ void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   load_pixel_h_w8(above, left, 32, pixels);
 
   __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 32, ww);
+  load_weight_h_w8(32, ww);
 
   smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
   dst += stride << 3;
@@ -1585,13 +1589,14 @@ static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left, uint32_t bw,
                                           uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
   const __m128i zero = _mm_setzero_si128();
   const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+      _mm_set1_epi16((uint16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
   const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i pred_round =
+      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
 
   for (uint32_t y = 0; y < bh; ++y) {
     const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
@@ -1611,8 +1616,8 @@ static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
       pred_lo = _mm_add_epi32(pred_lo, pred_round);
       pred_hi = _mm_add_epi32(pred_hi, pred_round);
 
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+      pred_lo = _mm_srai_epi32(pred_lo, SMOOTH_WEIGHT_LOG2_SCALE);
+      pred_hi = _mm_srai_epi32(pred_hi, SMOOTH_WEIGHT_LOG2_SCALE);
 
       __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
       pred = _mm_shuffle_epi8(pred, gat);
diff --git a/media/libaom/src/aom_dsp/x86/intrapred_utils.h b/media/libaom/src/aom_dsp/x86/intrapred_utils.h
new file mode 100644
index 0000000000..502574673e
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/intrapred_utils.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+
+#include <emmintrin.h>  // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+  { -1, 0, 0, 0, 0, 0, 0, 0 },       { -1, -1, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, 0, 0, 0, 0, 0 },     { -1, -1, -1, -1, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, 0, 0, 0 },   { -1, -1, -1, -1, -1, -1, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+  w0 = _mm_unpacklo_epi8(x[0], x[1]);
+  w1 = _mm_unpacklo_epi8(x[2], x[3]);
+  w2 = _mm_unpackhi_epi8(x[0], x[1]);
+  w3 = _mm_unpackhi_epi8(x[2], x[3]);
+
+  ww0 = _mm_unpacklo_epi16(w0, w1);
+  ww1 = _mm_unpacklo_epi16(w2, w3);
+  ww2 = _mm_unpackhi_epi16(w0, w1);
+  ww3 = _mm_unpackhi_epi16(w2, w3);
+
+  w0 = _mm_unpacklo_epi32(ww0, ww1);
+  w2 = _mm_unpacklo_epi32(ww2, ww3);
+  w1 = _mm_unpackhi_epi32(ww0, ww1);
+  w3 = _mm_unpackhi_epi32(ww2, ww3);
+
+  d[0] = _mm_unpacklo_epi64(w0, w2);
+  d[1] = _mm_unpackhi_epi64(w0, w2);
+  d[2] = _mm_unpacklo_epi64(w1, w3);
+  d[3] = _mm_unpackhi_epi64(w1, w3);
+
+  d[4] = _mm_srli_si128(d[0], 8);
+  d[5] = _mm_srli_si128(d[1], 8);
+  d[6] = _mm_srli_si128(d[2], 8);
+  d[7] = _mm_srli_si128(d[3], 8);
+
+  d[8] = _mm_srli_si128(d[0], 4);
+  d[9] = _mm_srli_si128(d[1], 4);
+  d[10] = _mm_srli_si128(d[2], 4);
+  d[11] = _mm_srli_si128(d[3], 4);
+
+  d[12] = _mm_srli_si128(d[0], 12);
+  d[13] = _mm_srli_si128(d[1], 12);
+  d[14] = _mm_srli_si128(d[2], 12);
+  d[15] = _mm_srli_si128(d[3], 12);
+}
+
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(x[0], x[1]);
+  w1 = _mm_unpacklo_epi8(x[2], x[3]);
+  w2 = _mm_unpacklo_epi8(x[4], x[5]);
+  w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+  w8 = _mm_unpacklo_epi8(x[8], x[9]);
+  w9 = _mm_unpacklo_epi8(x[10], x[11]);
+  w10 = _mm_unpacklo_epi8(x[12], x[13]);
+  w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[0] = _mm_unpacklo_epi64(w6, w14);
+  d[1] = _mm_unpackhi_epi64(w6, w14);
+  d[2] = _mm_unpacklo_epi64(w7, w15);
+  d[3] = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[4] = _mm_unpacklo_epi64(w6, w14);
+  d[5] = _mm_unpackhi_epi64(w6, w14);
+  d[6] = _mm_unpacklo_epi64(w7, w15);
+  d[7] = _mm_unpackhi_epi64(w7, w15);
+
+  // upper half
+  w0 = _mm_unpackhi_epi8(x[0], x[1]);
+  w1 = _mm_unpackhi_epi8(x[2], x[3]);
+  w2 = _mm_unpackhi_epi8(x[4], x[5]);
+  w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+  w8 = _mm_unpackhi_epi8(x[8], x[9]);
+  w9 = _mm_unpackhi_epi8(x[10], x[11]);
+  w10 = _mm_unpackhi_epi8(x[12], x[13]);
+  w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  d[8] = _mm_unpacklo_epi64(w6, w14);
+  d[9] = _mm_unpackhi_epi64(w6, w14);
+  d[10] = _mm_unpacklo_epi64(w7, w15);
+  d[11] = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  d[12] = _mm_unpacklo_epi64(w6, w14);
+  d[13] = _mm_unpackhi_epi64(w6, w14);
+  d[14] = _mm_unpacklo_epi64(w7, w15);
+  d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+                               uint8_t *dst, ptrdiff_t pitchDst) {
+  __m128i r[16];
+  __m128i d[16];
+  for (int j = 0; j < 16; j++) {
+    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
+  }
+  transpose16x16_sse2(r, d);
+  for (int j = 0; j < 16; j++) {
+    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
+  }
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+                      ptrdiff_t pitchDst, int width, int height) {
+  for (int j = 0; j < height; j += 16)
+    for (int i = 0; i < width; i += 16)
+      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                         dst + j * pitchDst + i, pitchDst);
+}
+
+#endif  // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
diff --git a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c
index 2e3e2be105..4e6fe8faa3 100644
--- a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
 
diff --git a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c
index c8b02f5560..6ec5dd8c10 100644
--- a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
 
@@ -116,38 +115,6 @@ void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
   }
 }
 
-void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
-    __m128i p1 = xx_loadu_128(pred);
-
-    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
 #define DIST_WTD_SUBPIX_AVG_VAR(W, H)                                      \
   uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3(           \
       const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
@@ -184,9 +151,12 @@ DIST_WTD_SUBPIX_AVG_VAR(8, 8)
 DIST_WTD_SUBPIX_AVG_VAR(8, 4)
 DIST_WTD_SUBPIX_AVG_VAR(4, 8)
 DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
 DIST_WTD_SUBPIX_AVG_VAR(4, 16)
 DIST_WTD_SUBPIX_AVG_VAR(16, 4)
 DIST_WTD_SUBPIX_AVG_VAR(8, 32)
 DIST_WTD_SUBPIX_AVG_VAR(32, 8)
 DIST_WTD_SUBPIX_AVG_VAR(16, 64)
 DIST_WTD_SUBPIX_AVG_VAR(64, 16)
+#endif
diff --git a/media/libaom/src/aom_dsp/x86/loopfilter_avx2.c b/media/libaom/src/aom_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..b59381928e
--- /dev/null
+++ b/media/libaom/src/aom_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "config/aom_dsp_rtcd.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128, 1, 128, 2,  128, 3,  128, 4,  128, 5,  128, 6,  128, 7,  128,
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
+void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0) {
+  __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i mask, flat;
+
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+  const __m128i limit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+  p256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+  p256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+  p256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+  q256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+  q256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+  q256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+
+  p2 = _mm256_castsi256_si128(p256_2);
+  p1 = _mm256_castsi256_si128(p256_1);
+  p0 = _mm256_castsi256_si128(p256_0);
+  q0 = _mm256_castsi256_si128(q256_0);
+  q1 = _mm256_castsi256_si128(q256_1);
+  q2 = _mm256_castsi256_si128(q256_2);
+
+  {
+    __m128i work;
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i abs_p1p0 =
+        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 =
+        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+    __m128i abs_p0q0 =
+        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 =
+        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  // loop filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i hev;
+
+    hev = _mm_subs_epu8(flat, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    __m128i ps1 = _mm_xor_si128(p1, t80);
+    __m128i ps0 = _mm_xor_si128(p0, t80);
+    __m128i qs0 = _mm_xor_si128(q0, t80);
+    __m128i qs1 = _mm_xor_si128(q1, t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+    __m128i flat_p1, flat_p0, flat_q0, flat_q1;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+    __m128i work;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m256i four = _mm256_set1_epi16(4);
+      __m256i pixetFilter, add, res;
+
+      const __m256i filter =
+          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+
+      pixetFilter = _mm256_slli_epi16(
+          _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
+      pixetFilter =
+          _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
+      pixetFilter = _mm256_add_epi16(four, pixetFilter);
+      res = _mm256_srli_epi16(pixetFilter, 3);
+      flat_p1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+      p1 = _mm_andnot_si128(flat, ps1);
+      flat_p1 = _mm_and_si128(flat, flat_p1);
+      p1 = _mm_or_si128(flat_p1, p1);
+
+      add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
+                             _mm256_sub_epi16(q256_0, p256_2));
+      pixetFilter = _mm256_add_epi16(pixetFilter, add);
+      res = _mm256_srli_epi16(pixetFilter, 3);
+      flat_p0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+      p0 = _mm_andnot_si128(flat, ps0);
+      flat_p0 = _mm_and_si128(flat, flat_p0);
+      p0 = _mm_or_si128(flat_p0, p0);
+
+      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
+                             _mm256_sub_epi16(q256_1, p256_1));
+      pixetFilter = _mm256_add_epi16(pixetFilter, add);
+      res = _mm256_srli_epi16(pixetFilter, 3);
+      flat_q0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+      q0 = _mm_andnot_si128(flat, qs0);
+      flat_q0 = _mm_and_si128(flat, flat_q0);
+      q0 = _mm_or_si128(flat_q0, q0);
+
+      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
+                             _mm256_sub_epi16(q256_2, p256_0));
+      pixetFilter = _mm256_add_epi16(pixetFilter, add);
+      res = _mm256_srli_epi16(pixetFilter, 3);
+      flat_q1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+      q1 = _mm_andnot_si128(flat, qs1);
+      flat_q1 = _mm_and_si128(flat, flat_q1);
+      q1 = _mm_or_si128(flat_q1, q1);
+
+      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+    }
+  }
+}
+
+void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0) {
+  __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, flat;
+
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+  const __m128i limit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+  p256_3 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+  p256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+  p256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+  p256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+  q256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+  q256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+  q256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+  q256_3 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+  p3 = _mm256_castsi256_si128(p256_3);
+  p2 = _mm256_castsi256_si128(p256_2);
+  p1 = _mm256_castsi256_si128(p256_1);
+  p0 = _mm256_castsi256_si128(p256_0);
+  q0 = _mm256_castsi256_si128(q256_0);
+  q1 = _mm256_castsi256_si128(q256_1);
+  q2 = _mm256_castsi256_si128(q256_2);
+  q3 = _mm256_castsi256_si128(q256_3);
+
+  {
+    __m128i work;
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i abs_p1p0 =
+        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 =
+        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+    __m128i abs_p0q0 =
+        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 =
+        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  // loop filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i hev;
+
+    hev = _mm_subs_epu8(flat, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    __m128i ps1 = _mm_xor_si128(p1, t80);
+    __m128i ps0 = _mm_xor_si128(p0, t80);
+    __m128i qs0 = _mm_xor_si128(q0, t80);
+    __m128i qs1 = _mm_xor_si128(q1, t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+    __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+    __m128i work;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m256i four = _mm256_set1_epi16(4);
+      __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
+          sum_q, res_p, res_q;
+
+      const __m256i filter =
+          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+      p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+      q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+      pixetFilter_p2p1p0 =
+          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+      pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+      flat_p0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p0 = _mm_andnot_si128(flat, ps0);
+      flat_p0 = _mm_and_si128(flat, flat_p0);
+      p0 = _mm_or_si128(flat_p0, p0);
+
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+      flat_q0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q0 = _mm_andnot_si128(flat, qs0);
+      flat_q0 = _mm_and_si128(flat, flat_q0);
+      q0 = _mm_or_si128(flat_q0, q0);
+
+      sum_p = _mm256_sub_epi16(p256_3, q256_2);
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+      flat_p1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p1 = _mm_andnot_si128(flat, ps1);
+      flat_p1 = _mm_and_si128(flat, flat_p1);
+      p1 = _mm_or_si128(flat_p1, p1);
+
+      sum_q = _mm256_sub_epi16(q256_3, p256_2);
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+      flat_q1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q1 = _mm_andnot_si128(flat, qs1);
+      flat_q1 = _mm_and_si128(flat, flat_q1);
+      q1 = _mm_or_si128(flat_q1, q1);
+
+      sum_p = _mm256_sub_epi16(p256_3, q256_1);
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+      flat_p2 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p2 = _mm_andnot_si128(flat, p2);
+      flat_p2 = _mm_and_si128(flat, flat_p2);
+      p2 = _mm_or_si128(flat_p2, p2);
+
+      sum_q = _mm256_sub_epi16(q256_3, p256_1);
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+      flat_q2 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q2 = _mm_andnot_si128(flat, q2);
+      flat_q2 = _mm_and_si128(flat, flat_q2);
+      q2 = _mm_or_si128(flat_q2, q2);
+
+      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+    }
+  }
+}
+
+void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0) {
+  __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, flat, flat2;
+
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+  const __m128i limit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+  p256_3 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+  p256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+  p256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+  p256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+  q256_0 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+  q256_1 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+  q256_2 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+  q256_3 =
+      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+  p3 = _mm256_castsi256_si128(p256_3);
+  p2 = _mm256_castsi256_si128(p256_2);
+  p1 = _mm256_castsi256_si128(p256_1);
+  p0 = _mm256_castsi256_si128(p256_0);
+  q0 = _mm256_castsi256_si128(q256_0);
+  q1 = _mm256_castsi256_si128(q256_1);
+  q2 = _mm256_castsi256_si128(q256_2);
+  q3 = _mm256_castsi256_si128(q256_3);
+
+  {
+    __m128i work;
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i abs_p1p0 =
+        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 =
+        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+    __m128i abs_p0q0 =
+        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 =
+        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  // loop filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i hev;
+
+    hev = _mm_subs_epu8(flat, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    __m128i ps1 = _mm_xor_si128(p1, t80);
+    __m128i ps0 = _mm_xor_si128(p0, t80);
+    __m128i qs0 = _mm_xor_si128(q0, t80);
+    __m128i qs1 = _mm_xor_si128(q1, t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+    __m128i flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, flat2_p0,
+        flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, flat_p2,
+        flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+    __m128i work;
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(
+        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m256i eight = _mm256_set1_epi16(8);
+      const __m256i four = _mm256_set1_epi16(4);
+      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, p2p1p0, q2q1q0,
+          pixetFilter_q2q1q0, sum_p, sum_q, res_p, res_q;
+      __m256i p256_4, q256_4, p256_5, q256_5, p256_6, q256_6;
+      __m128i p4, q4, p5, q5, p6, q6;
+
+      p256_4 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+      q256_4 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+      p4 = _mm256_castsi256_si128(p256_4);
+      q4 = _mm256_castsi256_si128(q256_4);
+      work = _mm_max_epu8(
+          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+
+      p256_5 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+      q256_5 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+      p5 = _mm256_castsi256_si128(p256_5);
+      q5 = _mm256_castsi256_si128(q256_5);
+      flat2 = _mm_max_epu8(
+          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+      flat2 = _mm_max_epu8(work, flat2);
+
+      p256_6 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+      q256_6 = _mm256_castpd_si256(
+          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+      p6 = _mm256_castsi256_si128(p256_6);
+      q6 = _mm256_castsi256_si128(q256_6);
+      work = _mm_max_epu8(
+          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+      flat2 = _mm_max_epu8(work, flat2);
+
+      flat2 = _mm_subs_epu8(flat2, one);
+      flat2 = _mm_cmpeq_epi8(flat2, zero);
+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+
+      const __m256i filter =
+          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+      p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+      q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+      pixetFilter_p2p1p0 =
+          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+      pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+      flat_p0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p0 = _mm_andnot_si128(flat, ps0);
+      flat_p0 = _mm_and_si128(flat, flat_p0);
+      p0 = _mm_or_si128(flat_p0, p0);
+
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+      flat_q0 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q0 = _mm_andnot_si128(flat, qs0);
+      flat_q0 = _mm_and_si128(flat, flat_q0);
+      q0 = _mm_or_si128(flat_q0, q0);
+
+      sum_p = _mm256_sub_epi16(p256_3, q256_2);
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+      flat_p1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p1 = _mm_andnot_si128(flat, ps1);
+      flat_p1 = _mm_and_si128(flat, flat_p1);
+      p1 = _mm_or_si128(flat_p1, p1);
+
+      sum_q = _mm256_sub_epi16(q256_3, p256_2);
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+      flat_q1 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q1 = _mm_andnot_si128(flat, qs1);
+      flat_q1 = _mm_and_si128(flat, flat_q1);
+      q1 = _mm_or_si128(flat_q1, q1);
+
+      sum_p = _mm256_sub_epi16(p256_3, q256_1);
+      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+      res_p =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+      flat_p2 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+      p2 = _mm_andnot_si128(flat, p2);
+      flat_p2 = _mm_and_si128(flat, flat_p2);
+      p2 = _mm_or_si128(flat_p2, p2);
+
+      sum_q = _mm256_sub_epi16(q256_3, p256_1);
+      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+      res_q =
+          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+      flat_q2 = _mm256_castsi256_si128(
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+      q2 = _mm_andnot_si128(flat, q2);
+      flat_q2 = _mm_and_si128(flat, flat_q2);
+      q2 = _mm_or_si128(flat_q2, q2);
+
+      if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+        p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+        p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+        p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+        q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+        q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+        q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+
+        pixelFilter_p =
+            _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
+        pixelFilter_q =
+            _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));
+
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);
+
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
+        pixelFilter_p = _mm256_add_epi16(
+            eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+        pixelFilter_q = pixelFilter_p;
+
+        pixelFilter_p =
+            _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p0 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p0 = _mm_andnot_si128(flat2, p0);
+        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+        p0 = _mm_or_si128(flat2_p0, p0);
+        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+
+        pixelFilter_q =
+            _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q0 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q0 = _mm_andnot_si128(flat2, q0);
+        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+        q0 = _mm_or_si128(flat2_q0, q0);
+        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
+                                 _mm256_sub_epi16(p256_2, q256_0));
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p1 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p1 = _mm_andnot_si128(flat2, p1);
+        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+        p1 = _mm_or_si128(flat2_p1, p1);
+        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+
+        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
+                                 _mm256_sub_epi16(q256_2, p256_0));
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q1 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q1 = _mm_andnot_si128(flat2, q1);
+        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+        q1 = _mm_or_si128(flat2_q1, q1);
+        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
+                                 _mm256_sub_epi16(p256_3, p256_0));
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p2 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p2 = _mm_andnot_si128(flat2, p2);
+        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+        p2 = _mm_or_si128(flat2_p2, p2);
+        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+
+        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
+                                 _mm256_sub_epi16(q256_3, q256_0));
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q2 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q2 = _mm_andnot_si128(flat2, q2);
+        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+        q2 = _mm_or_si128(flat2_q2, q2);
+        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
+                                 _mm256_sub_epi16(p256_4, p256_1));
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p3 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p3 = _mm_andnot_si128(flat2, p3);
+        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+        p3 = _mm_or_si128(flat2_p3, p3);
+        _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
+                                 _mm256_sub_epi16(q256_4, q256_1));
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q3 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q3 = _mm_andnot_si128(flat2, q3);
+        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+        q3 = _mm_or_si128(flat2_q3, q3);
+        _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
+                                 _mm256_sub_epi16(p256_5, p256_2));
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p4 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p4 = _mm_andnot_si128(flat2, p4);
+        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+        p4 = _mm_or_si128(flat2_p4, p4);
+        _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
+                                 _mm256_sub_epi16(q256_5, q256_2));
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q4 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q4 = _mm_andnot_si128(flat2, q4);
+        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+        q4 = _mm_or_si128(flat2_q4, q4);
+        _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
+                                 _mm256_sub_epi16(p256_6, p256_3));
+        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+        flat2_p5 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+        p5 = _mm_andnot_si128(flat2, p5);
+        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+        p5 = _mm_or_si128(flat2_p5, p5);
+        _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
+                                 _mm256_sub_epi16(q256_6, q256_3));
+        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+        flat2_q5 = _mm256_castsi256_si128(
+            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+        q5 = _mm_andnot_si128(flat2, q5);
+        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+        q5 = _mm_or_si128(flat2_q5, q5);
+        _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      } else {
+        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+      }
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+    }
+  }
+}
diff --git a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c
index d534683fce..87c5bb32a4 100644
--- a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c
@@ -2098,3 +2098,897 @@ void aom_lpf_vertical_14_dual_sse2(
   _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
   _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
 }
+
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+                                       const __m128i *const a1,
+                                       const __m128i *const a2,
+                                       const __m128i *const s1,
+                                       const __m128i *const s2) {
+  __m128i x = _mm_add_epi16(*a1, *total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+  return x;
+}
+
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+                                   const __m128i *const other_filt,
+                                   const __m128i *const f8_lo,
+                                   const __m128i *const f8_hi) {
+  const __m128i f8 =
+      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+  const __m128i result = _mm_and_si128(*flat, f8);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+                                    const __m128i *const other_filt,
+                                    const __m128i *const f_lo,
+                                    const __m128i *const f_hi) {
+  const __m128i f =
+      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+  const __m128i result = _mm_and_si128(*flat, f);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+  __m128i mask, hev, flat, flat2;
+  __m128i p6, p5;
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i q6, q5;
+
+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    __m128i work;
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+
+      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+      // wide flat calculations
+      if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+        const __m128i eight = _mm_set1_epi16(8);
+        const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+        const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+        const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+        const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+        const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+        const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+
+        const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+        const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+        const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+        const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+        const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+        const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+
+        __m128i f_lo;
+        __m128i f_hi;
+
+        f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo);
+        f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
+        f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
+        f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+                             _mm_add_epi16(p2_lo, p1_lo));
+        f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+        f_lo = _mm_add_epi16(f_lo, eight);
+
+        f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
+        f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
+        f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
+        f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+                             _mm_add_epi16(p2_hi, p1_hi));
+        f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+        f_hi = _mm_add_epi16(f_hi, eight);
+
+        p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+        f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
+        p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+        f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
+        p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+        f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
+        op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+        f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
+        op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+        f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
+        op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
+        oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
+        oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
+        oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
+        q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
+        q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+        f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
+        f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
+        q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+        _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      } else {
+        _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+        _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+        _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+        _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+        _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+        _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+      }
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+    }
+  }
+}
+
+void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    __m128i work;
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+    }
+  }
+}
+
+void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+  __m128i mask, hev, flat;
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  __m128i op1, op0, oq0, oq1;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    __m128i work;
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter6
+    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
+                            _mm_add_epi16(p2_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
+                            _mm_add_epi16(p1_lo, p0_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
+                            _mm_add_epi16(p2_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
+                            _mm_add_epi16(p1_hi, p0_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+    }
+  }
+}
+
+void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+  __m128i mask, hev;
+  __m128i p1, p0, q0, q1;
+
+  __m128i op1, op0, oq0, oq1;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    mask = _mm_subs_epu8(mask, limit_v);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+    _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+    _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+  }
+}
+
+void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
+                                   const uint8_t *_blimit0,
+                                   const uint8_t *_limit0,
+                                   const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+  // Transpose 16x16
+  transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+  // Loop filtering
+  aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0);
+
+  // Transpose back
+  transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
+}
+
+void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 16x8
+  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+  // Loop filtering
+  aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + pitch * 8;
+
+  // Transpose back
+  transpose_8xn(src, 16, dst, pitch, 2);
+}
+
+void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 16x8
+  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+  // Loop filtering
+  aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + pitch * 8;
+
+  // Transpose back
+  transpose_8xn(src, 16, dst, pitch, 2);
+}
+
+void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 16x8
+  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+  // Loop filtering
+  aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
+                                 _thresh0);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + pitch * 8;
+
+  // Transpose back
+  transpose_8xn(src, 16, dst, pitch, 2);
+}
diff --git a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h
index 6ed2cbfdf4..84daa94b0f 100644
--- a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h
+++ b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h
@@ -16,6 +16,9 @@
 
 #include "config/aom_config.h"
 
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
 static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *x4, __m128i *x5,
@@ -492,4 +495,139 @@ static INLINE void transpose8x16_16x8_sse2(
   *d14d15 = _mm_unpackhi_epi64(w7, w15);
 }
 
+static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
+                                  int in_p, unsigned char *out, int out_p) {
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+  x0 = _mm_loadl_epi64((__m128i *)in0);
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+  x0 = _mm_unpacklo_epi8(x0, x1);
+
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
+  x1 = _mm_unpacklo_epi8(x2, x3);
+
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
+  x2 = _mm_unpacklo_epi8(x4, x5);
+
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
+  x3 = _mm_unpacklo_epi8(x6, x7);
+  x4 = _mm_unpacklo_epi16(x0, x1);
+
+  x8 = _mm_loadl_epi64((__m128i *)in1);
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+  x8 = _mm_unpacklo_epi8(x8, x9);
+  x5 = _mm_unpacklo_epi16(x2, x3);
+
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
+  x9 = _mm_unpacklo_epi8(x10, x11);
+
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
+  x10 = _mm_unpacklo_epi8(x12, x13);
+  x12 = _mm_unpacklo_epi16(x8, x9);
+
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
+  x11 = _mm_unpacklo_epi8(x14, x15);
+  x13 = _mm_unpacklo_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  // Store first 4-line result
+  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+  x4 = _mm_unpackhi_epi16(x0, x1);
+  x5 = _mm_unpackhi_epi16(x2, x3);
+  x12 = _mm_unpackhi_epi16(x8, x9);
+  x13 = _mm_unpackhi_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  // Store second 4-line result
+  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose_8xn(unsigned char *src[], int in_p,
+                                 unsigned char *dst[], int out_p,
+                                 int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+
+    x0 =
+        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
+    x1 =
+        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+
+    x2 =
+        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
+    x3 =
+        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+
+    x4 =
+        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
+    x5 =
+        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+
+    x6 =
+        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
+    x7 =
+        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    mm_storelu(out + 0 * out_p, x6);  // 00 10 20 30 40 50 60 70
+    mm_storehu(out + 1 * out_p, x6);  // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+    mm_storelu(out + 2 * out_p, x7);  // 02 12 22 32 42 52 62 72
+    mm_storehu(out + 3 * out_p, x7);  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    mm_storelu(out + 4 * out_p, x6);  // 04 14 24 34 44 54 64 74
+    mm_storehu(out + 5 * out_p, x6);  // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    mm_storelu(out + 6 * out_p, x7);  // 06 16 26 36 46 56 66 76
+    mm_storehu(out + 7 * out_p, x7);  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
 #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
index 8ef7ee0d7b..1235f27797 100644
--- a/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -37,11 +37,11 @@
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
 static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *a_ptr[], int a_stride,
+                                       const uint8_t *a_ptr[4], int a_stride,
                                        const uint8_t *b_ptr, int b_stride,
                                        const uint8_t *m_ptr, int m_stride,
                                        int width, int height, int inv_mask,
-                                       unsigned sad_array[]) {
+                                       unsigned sad_array[4]) {
   int x, y;
   __m128i a;
   __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
@@ -104,10 +104,10 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
 void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_array[], int a_stride,
+                                const uint8_t *ref_array[4], int a_stride,
                                 const uint8_t *b_ptr, int b_stride,
                                 const uint8_t *m_ptr, int m_stride, int height,
-                                int inv_mask, unsigned sad_array[]) {
+                                int inv_mask, unsigned sad_array[4]) {
   const uint8_t *ref0 = ref_array[0];
   const uint8_t *ref1 = ref_array[1];
   const uint8_t *ref2 = ref_array[2];
@@ -165,10 +165,10 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
 void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_array[], int a_stride,
+                                const uint8_t *ref_array[4], int a_stride,
                                 const uint8_t *b_ptr, int b_stride,
                                 const uint8_t *m_ptr, int m_stride, int height,
-                                int inv_mask, unsigned sad_array[]) {
+                                int inv_mask, unsigned sad_array[4]) {
   const uint8_t *ref0 = ref_array[0];
   const uint8_t *ref1 = ref_array[1];
   const uint8_t *ref2 = ref_array[2];
@@ -217,27 +217,27 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
 
 #define MASKSADMXN_SSSE3(m, n)                                                 \
   void aom_masked_sad##m##x##n##x4d_ssse3(                                     \
-      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],               \
       int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
-      int msk_stride, int inv_mask, unsigned sad_array[]) {                    \
+      int msk_stride, int inv_mask, unsigned sad_array[4]) {                   \
     masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \
                         msk_stride, m, n, inv_mask, sad_array);                \
   }
 
 #define MASKSAD8XN_SSSE3(n)                                                   \
   void aom_masked_sad8x##n##x4d_ssse3(                                        \
-      const uint8_t *src, int src_stride, const uint8_t *ref[],               \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
       int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int inv_mask, unsigned sad_array[]) {                   \
+      int msk_stride, int inv_mask, unsigned sad_array[4]) {                  \
     aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
                                8, msk, msk_stride, n, inv_mask, sad_array);   \
   }
 
 #define MASKSAD4XN_SSSE3(n)                                                   \
   void aom_masked_sad4x##n##x4d_ssse3(                                        \
-      const uint8_t *src, int src_stride, const uint8_t *ref[],               \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
       int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int inv_mask, unsigned sad_array[]) {                   \
+      int msk_stride, int inv_mask, unsigned sad_array[4]) {                  \
     aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
                                4, msk, msk_stride, n, inv_mask, sad_array);   \
   }
diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c
index 60f0ab3390..2c022555b5 100644
--- a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -365,25 +365,25 @@ static INLINE unsigned int aom_highbd_masked_sad_avx2(
                                       invert_mask, m, n);                 \
   }
 
-HIGHBD_MASKSADMXN_AVX2(4, 4);
-HIGHBD_MASKSADMXN_AVX2(4, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 16);
-HIGHBD_MASKSADMXN_AVX2(32, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 32);
-HIGHBD_MASKSADMXN_AVX2(64, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 128);
-HIGHBD_MASKSADMXN_AVX2(128, 64);
-HIGHBD_MASKSADMXN_AVX2(128, 128);
-HIGHBD_MASKSADMXN_AVX2(4, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 16);
+HIGHBD_MASKSADMXN_AVX2(4, 4)
+HIGHBD_MASKSADMXN_AVX2(4, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 16)
+HIGHBD_MASKSADMXN_AVX2(32, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 32)
+HIGHBD_MASKSADMXN_AVX2(64, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 128)
+HIGHBD_MASKSADMXN_AVX2(128, 64)
+HIGHBD_MASKSADMXN_AVX2(128, 128)
+HIGHBD_MASKSADMXN_AVX2(4, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 16)
diff --git a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
index fa93f0df4f..bfd86ee410 100644
--- a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -1052,12 +1052,14 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
       mask += (mask_stride << 1);
       i += 2;
     } while (i < height);
-  } else {  // width == 32
-    assert(width == 32);
+  } else {
     do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
-      comp_pred += (width);
+      for (int x = 0; x < width; x += 32) {
+        comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred);
+        comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16,
+                                comp_pred + 16);
+        comp_pred += 32;
+      }
       src0 += (stride0);
       src1 += (stride1);
       mask += (mask_stride);
diff --git a/media/libaom/src/aom_dsp/x86/mem_sse2.h b/media/libaom/src/aom_dsp/x86/mem_sse2.h
index 6c821673e8..dacb613641 100644
--- a/media/libaom/src/aom_dsp/x86/mem_sse2.h
+++ b/media/libaom/src/aom_dsp/x86/mem_sse2.h
@@ -13,11 +13,34 @@
 #define AOM_AOM_DSP_X86_MEM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
+#include <string.h>
 
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
+static INLINE uint16_t loadu_uint16(const void *src) {
+  uint16_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint32_t loadu_uint32(const void *src) {
+  uint32_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE uint64_t loadu_uint64(const void *src) {
+  uint64_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
 static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
   return _mm_castps_si128(
       _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
@@ -25,10 +48,10 @@ static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
 
 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                   const int byte_stride) {
-  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+  return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
+                        loadu_uint32((int8_t *)src + 1 * byte_stride),
+                        loadu_uint32((int8_t *)src + 2 * byte_stride),
+                        loadu_uint32((int8_t *)src + 3 * byte_stride));
 }
 
 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
@@ -39,4 +62,106 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
   return dst;
 }
 
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+                                            uint8_t *const d,
+                                            const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+                                       const ptrdiff_t stride) {
+  __m128i ss[4];
+
+  ss[0] = s;
+  ss[1] = _mm_srli_si128(s, 4);
+  ss[2] = _mm_srli_si128(s, 8);
+  ss[3] = _mm_srli_si128(s, 12);
+  store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+                                  const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
 #endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index d6e15c4be5..0000000000
--- a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,464 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  vzeroupper
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16, as it is frequent and we can save on
-  ; not setting up a loop.
-  cmp                       ncoeffmp, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Special case of ncoeff == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
-  movifnidn                   coeffq, coeffmp
-  movifnidn                    zbinq, zbinmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-
-  ; Get DC and first 15 AC coeffs - in this special case, that is all.
-  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
-  mova                            m9, [coeffq]
-  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
-  mova                           m10, [coeffq+32]
-  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-
-  mov                             r0, eobmp                ; Output pointer
-  mov                             r1, qcoeffmp             ; Output pointer
-  mov                             r2, dqcoeffmp            ; Output pointer
-
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  pcmpeqw                         m4, m4                   ; All word lanes -1
-  paddw                           m0, m4                   ; m0 = zbin - 1
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, we just write zeros
-  ; to the outputs and we are done.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .single_nonzero
-
-  mova                       [r1   ], ymm5
-  mova                       [r1+32], ymm5
-  mova                       [r2   ], ymm5
-  mova                       [r2+32], ymm5
-  mov                           [r0], word 0
-
-  vzeroupper
-  RET
-
-.single_nonzero:
-
-  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r3, roundmp
-  movifnidn                       r4, quantmp
-  mov                             r6, dequantmp
-  mov                             r5, shiftmp
-  mova                            m1, [r3]              ; m1 = round
-  mova                            m2, [r4]              ; m2 = quant
-  mova                            m3, [r6]              ; m3 = dequant
-  mova                            m4, [r5]              ; m4 = shift
-
-  mov                             r3, iscanmp
-
-  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                  [qcoeffq   ], m11
-  mova                  [qcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                  [qcoeffq+32], m11
-  mova                  [qcoeffq+48], m6
-
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                 [dqcoeffq   ], m11
-  mova                 [dqcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                 [dqcoeffq+32], m11
-  mova                 [dqcoeffq+48], m6
-
-  mova                            m6, [iscanq]            ; m6 = scan[i]
-  mova                           m11, [iscanq+16]         ; m11 = scan[i]
-
-  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
-  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
-  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
-  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
-  pandn                           m8,  m8,  m6            ; m8 = max(eob)
-  pandn                          m13, m13, m11            ; m13 = max(eob)
-  pmaxsw                          m8,  m8, m13
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                         [eobq], ax
-
-  vzeroupper
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of ncoeff != 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  ; Actual quantization loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [dequantq]           ; m3 = dequant
-  pcmpeqw                         m4, m4                   ; All lanes -1
-%ifidn %1, b_32x32
-  psubw                           m0, m4
-  psubw                           m1, m4
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  paddw                           m0, m4                   ; m0 = m0 + 1
-
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]            ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-  pxor                            m5, m5              ; m5 = dedicated zero
-
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-
-
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .first_nonzero
-
-  mova        [qcoeffq+ncoeffq*4   ], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4   ], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-  add                        ncoeffq, mmsize
-
-  punpckhqdq                      m1, m1
-  punpckhqdq                      m2, m2
-  punpckhqdq                      m3, m3
-  punpckhqdq                      m4, m4
-  pxor                            m8, m8
-
-  jmp .ac_only_loop
-
-.first_nonzero:
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  %ifidn %1, b_32x32
-  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
-  %endif
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                           m8, 1
-  psrlw                           m5, 15
-  por                             m8, m5
-  %endif
-  punpckhqdq                      m4, m4
-  %ifidn %1, b_32x32
-  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
-  %endif
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m13, 1
-  psrlw                           m5, 15
-  por                            m13, m5
-  pxor                            m5, m5                   ; reset m5 to zero register
-  %endif
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                    ; m8 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-
-.ac_only_loop:
-
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this itertion.
-  ; And just write zeros as the result would be.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .rest_nonzero
-
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-.rest_nonzero:
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  %ifidn %1, b_32x32
-  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
-  %endif
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m14, 1
-  psrlw                           m5, 15
-  por                            m14, m5
-  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
-  %endif
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  %ifidn %1, b_32x32
-  psllw                          m13, 1
-  psrlw                           m5, 15
-  por                            m13, m5
-  pxor                            m5, m5                   ; reset m5 to zero register
-  %endif
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                    ; m14 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c
index 0771252584..1abeb4c3db 100644
--- a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c
@@ -85,22 +85,47 @@ void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
   }
 }
 
-#define sadMxN_avx2(m, n)                                                      \
+#define SADMXN_AVX2(m, n)                                                      \
   void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
                                   const uint8_t *const ref[4], int ref_stride, \
                                   uint32_t res[4]) {                           \
     aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
   }
 
-sadMxN_avx2(32, 8);
-sadMxN_avx2(32, 16);
-sadMxN_avx2(32, 32);
-sadMxN_avx2(32, 64);
+SADMXN_AVX2(32, 8)
+SADMXN_AVX2(32, 16)
+SADMXN_AVX2(32, 32)
+SADMXN_AVX2(32, 64)
 
-sadMxN_avx2(64, 16);
-sadMxN_avx2(64, 32);
-sadMxN_avx2(64, 64);
-sadMxN_avx2(64, 128);
+SADMXN_AVX2(64, 16)
+SADMXN_AVX2(64, 32)
+SADMXN_AVX2(64, 64)
+SADMXN_AVX2(64, 128)
 
-sadMxN_avx2(128, 64);
-sadMxN_avx2(128, 128);
+SADMXN_AVX2(128, 64)
+SADMXN_AVX2(128, 128)
+
+#define SAD_SKIP_MXN_AVX2(m, n)                                             \
+  void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+                                        const uint8_t *const ref[4],        \
+                                        int ref_stride, uint32_t res[4]) {  \
+    aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref,             \
+                       2 * ref_stride, res);                                \
+    res[0] <<= 1;                                                           \
+    res[1] <<= 1;                                                           \
+    res[2] <<= 1;                                                           \
+    res[3] <<= 1;                                                           \
+  }
+
+SAD_SKIP_MXN_AVX2(32, 8)
+SAD_SKIP_MXN_AVX2(32, 16)
+SAD_SKIP_MXN_AVX2(32, 32)
+SAD_SKIP_MXN_AVX2(32, 64)
+
+SAD_SKIP_MXN_AVX2(64, 16)
+SAD_SKIP_MXN_AVX2(64, 32)
+SAD_SKIP_MXN_AVX2(64, 64)
+SAD_SKIP_MXN_AVX2(64, 128)
+
+SAD_SKIP_MXN_AVX2(128, 64)
+SAD_SKIP_MXN_AVX2(128, 128)
diff --git a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm
index a9043742d4..9ab44c1340 100644
--- a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm
@@ -312,10 +312,22 @@ SECTION .text
 ; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2-3 0
-%if %3 == 0
-%if UNIX64
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, else avg
+;   4: If 0, then normal sad, else skip rows
+%macro SADNXN4D 2-4 0,0
+%if %4 == 1  ; skip rows
+%if ARCH_X86_64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+%elif %3 == 0  ; normal sad
+%if ARCH_X86_64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
 %else
@@ -323,8 +335,7 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
 %endif
 %else ; avg
-
-%if UNIX64
+%if ARCH_X86_64
 cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \
                                   second_pred, res, ref2, ref3, ref4
 %else
@@ -335,7 +346,11 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
 %endif
 %endif
 
-  %define mflag ((1 - UNIX64) & %3)
+  %define mflag ((1 - ARCH_X86_64) & %3)
+%if %4 == 1
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 
@@ -345,9 +360,15 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
   mov                ref1q, [ref1q+gprsize*0]
 
   PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-%rep (%2-4)/2
+%if %4 == 1  ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 %endrep
+%undef num_rep
   PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
 
 %if %3 == 0
@@ -368,12 +389,19 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
   punpcklqdq            m4, m6
   punpckhqdq            m5, m7
   paddd                 m4, m5
+%if %4 == 1
+  pslld                 m4, 1
+%endif
   movifnidn             resultq, resultmp
   movu                [resultq], m4
   RET
 %else
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
+%if %4 == 1
+  pslld                 m6, 1
+  pslld                 m7, 1
+%endif
   movifnidn             resultq, resultmp
   movq              [resultq+0], m6
   movq              [resultq+8], m7
@@ -383,46 +411,76 @@ cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
 
 INIT_XMM sse2
 SADNXN4D 128, 128
-SADNXN4D 128, 64
-SADNXN4D 64,  128
-SADNXN4D 64, 64
-SADNXN4D 64, 32
-SADNXN4D 32, 64
-SADNXN4D 32, 32
-SADNXN4D 32, 16
-SADNXN4D 16, 32
-SADNXN4D 16, 16
-SADNXN4D 16,  8
-SADNXN4D  8, 16
-SADNXN4D  8,  8
-SADNXN4D  8,  4
-SADNXN4D  4,  8
-SADNXN4D  4,  4
-SADNXN4D  4, 16
-SADNXN4D 16,  4
-SADNXN4D  8, 32
-SADNXN4D 32,  8
-SADNXN4D 16, 64
-SADNXN4D 64, 16
+SADNXN4D 128,  64
+SADNXN4D  64, 128
+SADNXN4D  64,  64
+SADNXN4D  64,  32
+SADNXN4D  32,  64
+SADNXN4D  32,  32
+SADNXN4D  32,  16
+SADNXN4D  16,  32
+SADNXN4D  16,  16
+SADNXN4D  16,   8
+SADNXN4D   8,  16
+SADNXN4D   8,   8
+SADNXN4D   8,   4
+SADNXN4D   4,   8
+SADNXN4D   4,   4
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D   4,  16
+SADNXN4D  16,   4
+SADNXN4D   8,  32
+SADNXN4D  32,   8
+SADNXN4D  16,  64
+SADNXN4D  64,  16
+%endif
+%if CONFIG_REALTIME_ONLY==0
 SADNXN4D 128, 128, 1
-SADNXN4D 128, 64, 1
-SADNXN4D 64,  128, 1
-SADNXN4D 64, 64, 1
-SADNXN4D 64, 32, 1
-SADNXN4D 32, 64, 1
-SADNXN4D 32, 32, 1
-SADNXN4D 32, 16, 1
-SADNXN4D 16, 32, 1
-SADNXN4D 16, 16, 1
-SADNXN4D 16,  8, 1
-SADNXN4D  8, 16, 1
-SADNXN4D  8,  8, 1
-SADNXN4D  8,  4, 1
-SADNXN4D  4,  8, 1
-SADNXN4D  4,  4, 1
-SADNXN4D  4, 16, 1
-SADNXN4D 16,  4, 1
-SADNXN4D  8, 32, 1
-SADNXN4D 32,  8, 1
-SADNXN4D 16, 64, 1
-SADNXN4D 64, 16, 1
+SADNXN4D 128,  64, 1
+SADNXN4D  64, 128, 1
+SADNXN4D  64,  64, 1
+SADNXN4D  64,  32, 1
+SADNXN4D  32,  64, 1
+SADNXN4D  32,  32, 1
+SADNXN4D  32,  16, 1
+SADNXN4D  16,  32, 1
+SADNXN4D  16,  16, 1
+SADNXN4D  16,   8, 1
+SADNXN4D   8,  16, 1
+SADNXN4D   8,   8, 1
+SADNXN4D   8,   4, 1
+SADNXN4D   4,   8, 1
+SADNXN4D   4,   4, 1
+SADNXN4D   4,  16, 1
+SADNXN4D  16,   4, 1
+SADNXN4D   8,  32, 1
+SADNXN4D  32,   8, 1
+SADNXN4D  16,  64, 1
+SADNXN4D  64,  16, 1
+%endif
+SADNXN4D 128, 128, 0, 1
+SADNXN4D 128,  64, 0, 1
+SADNXN4D  64, 128, 0, 1
+SADNXN4D  64,  64, 0, 1
+SADNXN4D  64,  32, 0, 1
+SADNXN4D  32,  64, 0, 1
+SADNXN4D  32,  32, 0, 1
+SADNXN4D  32,  16, 0, 1
+SADNXN4D  16,  32, 0, 1
+SADNXN4D  16,  16, 0, 1
+SADNXN4D  16,   8, 0, 1
+SADNXN4D   8,  16, 0, 1
+SADNXN4D   8,   8, 0, 1
+SADNXN4D   4,   8, 0, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D   4,  16, 0, 1
+SADNXN4D   8,  32, 0, 1
+SADNXN4D  32,   8, 0, 1
+SADNXN4D  16,  64, 0, 1
+SADNXN4D  64,  16, 0, 1
+%endif
+
+; Different assembly is needed when the height gets subsampled to 2
+; SADNXN4D 16,  4, 0, 1
+; SADNXN4D  8,  4, 0, 1
+; SADNXN4D  4,  4, 0, 1
diff --git a/media/libaom/src/aom_dsp/x86/sad_avx2.c b/media/libaom/src/aom_dsp/x86/sad_avx2.c
index a50dba64a1..ef3fdc1d28 100644
--- a/media/libaom/src/aom_dsp/x86/sad_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/sad_avx2.c
@@ -14,76 +14,106 @@
 
 #include "aom_ports/mem.h"
 
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  for (i = 0; i < h; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  int ref2_stride = ref_stride << 1;
+  int src2_stride = src_stride << 1;
+  int max = h >> 1;
+  for (i = 0; i < max; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref2_stride;
+    src_ptr += src2_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
 #define FSAD64_H(h)                                                           \
   unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
+    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS64_H(h)                                                          \
+  unsigned int aom_sad_skip_64x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD32_H(h)                                                           \
   unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
+    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS32_H(h)                                                          \
+  unsigned int aom_sad_skip_32x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
+  FSAD64_H(64)  \
+  FSAD64_H(32)  \
+  FSADS64_H(64) \
+  FSADS64_H(32)
 
 #define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
+  FSAD32_H(64)  \
+  FSAD32_H(32)  \
+  FSAD32_H(16)  \
+  FSADS32_H(64) \
+  FSADS32_H(32) \
+  FSADS32_H(16)
 
 /* clang-format off */
 FSAD64
@@ -169,14 +199,14 @@ FSAD32
     return res;                                                               \
   }
 
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
+#define FSADAVG64 \
+  FSADAVG64_H(64) \
+  FSADAVG64_H(32)
 
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
+#define FSADAVG32 \
+  FSADAVG32_H(64) \
+  FSADAVG32_H(32) \
+  FSADAVG32_H(16)
 
 /* clang-format off */
 FSADAVG64
diff --git a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c
index f77a585b4c..2afae4bc64 100644
--- a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c
@@ -84,6 +84,30 @@ unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
   return sum;
 }
 
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t half_width = 64;
+  uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  return 2 * sum;
+}
+
+unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t sum =
+      sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+  return 2 * sum;
+}
+
+unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t sum =
+      aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+  return 2 * sum;
+}
+
 static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      const int h, const uint8_t *second_pred,
diff --git a/media/libaom/src/aom_dsp/x86/sad_sse2.asm b/media/libaom/src/aom_dsp/x86/sad_sse2.asm
index 3251b76559..de9845a069 100644
--- a/media/libaom/src/aom_dsp/x86/sad_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/sad_sse2.asm
@@ -15,15 +15,29 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
 %if %3 == 5
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
@@ -38,7 +52,11 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea           src_strided, [src_strided*2]
+lea           ref_strided, [ref_strided*2]
+%endif ; %4 skip
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -51,7 +69,11 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD128XN 1-2 0
   SAD_FN 128, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -104,6 +126,9 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -111,15 +136,21 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 INIT_XMM sse2
 SAD128XN 128     ; sad128x128_sse2
 SAD128XN 128, 1  ; sad128x128_avg_sse2
+SAD128XN 128, 2  ; sad128x128_skip_sse2
 SAD128XN 64      ; sad128x64_sse2
 SAD128XN 64, 1   ; sad128x64_avg_sse2
+SAD128XN 64, 2   ; sad128x64_skip_sse2
 
 
 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
   SAD_FN 64, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -148,25 +179,36 @@ SAD128XN 64, 1   ; sad128x64_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
 SAD64XN 128     ; sad64x128_sse2
+SAD64XN  64     ; sad64x64_sse2
+SAD64XN  32     ; sad64x32_sse2
+SAD64XN  16     ; sad64x16_sse2
 SAD64XN 128, 1  ; sad64x128_avg_sse2
-SAD64XN 64 ; sad64x64_sse2
-SAD64XN 32 ; sad64x32_sse2
-SAD64XN 64, 1 ; sad64x64_avg_sse2
-SAD64XN 32, 1 ; sad64x32_avg_sse2
-SAD64XN 16 ; sad64x16_sse2
-SAD64XN 16, 1 ; sad64x16_avg_sse2
+SAD64XN  64, 1  ; sad64x64_avg_sse2
+SAD64XN  32, 1  ; sad64x32_avg_sse2
+SAD64XN  16, 1  ; sad64x16_avg_sse2
+SAD64XN 128, 2  ; sad64x128_skip_sse2
+SAD64XN  64, 2  ; sad64x64_skip_sse2
+SAD64XN  32, 2  ; sad64x32_skip_sse2
+SAD64XN  16, 2  ; sad64x16_skip_sse2
 
 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD32XN 1-2 0
   SAD_FN 32, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -195,25 +237,36 @@ SAD64XN 16, 1 ; sad64x16_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD32XN 64 ; sad32x64_sse2
-SAD32XN 32 ; sad32x32_sse2
-SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64    ; sad32x64_sse2
+SAD32XN 32    ; sad32x32_sse2
+SAD32XN 16    ; sad32x16_sse2
+SAD32XN  8    ; sad_32x8_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN 8 ; sad_32x8_sse2
-SAD32XN 8, 1 ; sad_32x8_avg_sse2
+SAD32XN  8, 1 ; sad_32x8_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+SAD32XN  8, 2 ; sad_32x8_skip_sse2
 
 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1-2 0
   SAD_FN 16, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -243,27 +296,38 @@ SAD32XN 8, 1 ; sad_32x8_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD16XN 32 ; sad16x32_sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN  8 ; sad16x8_sse2
+SAD16XN 64    ; sad_16x64_sse2
+SAD16XN 32    ; sad16x32_sse2
+SAD16XN 16    ; sad16x16_sse2
+SAD16XN  8    ; sad16x8_sse2
+SAD16XN  4    ; sad_16x4_sse2
+SAD16XN 64, 1 ; sad_16x64_avg_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
-SAD16XN 4 ; sad_16x4_sse2
-SAD16XN 4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64 ; sad_16x64_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
+SAD16XN  4, 1 ; sad_16x4_avg_sse2
+SAD16XN 64, 2 ; sad_16x64_skip_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN  8, 2 ; sad16x8_skip_sse2
 
 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD8XN 1-2 0
   SAD_FN 8, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -291,25 +355,35 @@ SAD16XN 64, 1 ; sad_16x64_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN  8 ; sad8x8_sse2
-SAD8XN  4 ; sad8x4_sse2
+SAD8XN 32    ; sad_8x32_sse2
+SAD8XN 16    ; sad8x16_sse2
+SAD8XN  8    ; sad8x8_sse2
+SAD8XN  4    ; sad8x4_sse2
+SAD8XN 32, 1 ; sad_8x32_avg_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
-SAD8XN 32 ; sad_8x32_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
+SAD8XN 32, 2 ; sad_8x32_skip_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN  8, 2 ; sad8x8_skip_sse2
 
 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -340,14 +414,19 @@ SAD8XN 32, 1 ; sad_8x32_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
 
 INIT_XMM sse2
+SAD4XN 16 ; sad_4x16_sse2
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
+SAD4XN 16, 1 ; sad_4x16_avg_sse2
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
+SAD4XN 16, 2 ; sad_4x16_skip_sse2
+SAD4XN  8, 2 ; sad4x8_skip_sse
diff --git a/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm
index 6d9b5a12f1..49bc655336 100644
--- a/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm
+++ b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm
@@ -67,7 +67,7 @@ SECTION .text
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+globalsym(aom_ssim_parms_16x16_sse2)
 sym(aom_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -157,7 +157,7 @@ sym(aom_ssim_parms_16x16_sse2):
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+globalsym(aom_ssim_parms_8x8_sse2)
 sym(aom_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/aom_dsp/x86/subtract_sse2.asm b/media/libaom/src/aom_dsp/x86/subtract_sse2.asm
index 1a75a234f9..af380221f2 100644
--- a/media/libaom/src/aom_dsp/x86/subtract_sse2.asm
+++ b/media/libaom/src/aom_dsp/x86/subtract_sse2.asm
@@ -143,4 +143,5 @@ INIT_MMX
   lea                predq, [predq+pred_str*2]
   sub                rowsd, 2
   jg .loop_4
+  emms
   RET
diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c
index 97d78b6842..0d63db288e 100644
--- a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c
@@ -78,6 +78,84 @@ uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
   }
 }
 
+static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                            int width, int height, int *sum) {
+  uint64_t result;
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i one_reg = _mm256_set1_epi16(1);
+
+  __m256i v_sse_total = zero_reg;
+  __m256i v_sum_total = zero_reg;
+
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_sse_row = zero_reg;
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w);
+      const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w);
+      __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23);
+      v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg);
+      v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123);
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d);
+      v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d);
+    }
+    const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg);
+    const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg);
+    v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi);
+    v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row);
+    src += 4 * stride;
+  }
+
+  const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total);
+  const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1);
+  __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low);
+  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8));
+  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4));
+  *sum += _mm_cvtsi128_si32(sum_128bit);
+
+  __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total);
+  __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1);
+  __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi);
+
+  sse_128bit =
+      _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit));
+
+  xx_storel_64(&result, sse_128bit);
+
+  return result;
+}
+
+uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
+  } else {
+    return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+  }
+}
+
 // Accumulate sum of 16-bit elements in the vector
 static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
   __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c
index 85b301a88e..0bdeee9f27 100644
--- a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c
@@ -53,6 +53,27 @@ uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+  const __m128i one_reg = _mm_set1_epi16(1);
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+  __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
+  v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+  *sum = _mm_cvtsi128_si32(v_sum_0123_d);
+
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+  __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+  return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
 uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
                                          int height) {
   int r = 0;
@@ -70,6 +91,20 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
   return xx_cvtsi128_si64(v_acc_64);
 }
 
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum) {
+  int r = 0;
+  uint64_t sse = 0;
+  do {
+    int curr_sum = 0;
+    sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum);
+    *sum += curr_sum;
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  return sse;
+}
+
 #ifdef __GNUC__
 // This prevents GCC/Clang from inlining this function into
 // aom_sum_squares_2d_i16_sse2, which in turn saves some stack
@@ -120,6 +155,69 @@ aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
   return xx_cvtsi128_si64(v_acc_q);
 }
 
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_sse_2d_i16_nxn_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                            int height, int *sum) {
+  int r = 0;
+  uint64_t result;
+  const __m128i zero_reg = _mm_setzero_si128();
+  const __m128i one_reg = _mm_set1_epi16(1);
+
+  __m128i v_sse_total = zero_reg;
+  __m128i v_sum_total = zero_reg;
+
+  do {
+    int c = 0;
+    __m128i v_sse_row = zero_reg;
+    do {
+      const int16_t *b = src + c;
+
+      __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+      __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+      __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+      __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+      v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d);
+
+      const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w);
+      const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w);
+      __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23);
+      v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+      v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d);
+
+      c += 8;
+    } while (c < width);
+
+    const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg);
+    const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg);
+    v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi);
+    v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row);
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
+
+  v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 8));
+  v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4));
+  *sum += _mm_cvtsi128_si32(v_sum_total);
+
+  v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8));
+  xx_storel_64(&result, v_sse_total);
+  return result;
+}
+
 uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
                                      int height) {
   // 4 elements per row only requires half an XMM register, so this
@@ -137,6 +235,20 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
   }
 }
 
+uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  } else {
+    return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+  }
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // 1D version
 //////////////////////////////////////////////////////////////////////////////
diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h
index 491e31cc5e..5ed3f2c7bf 100644
--- a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h
+++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.h
@@ -19,4 +19,10 @@ uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
                                          int height);
 uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
 
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum);
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum);
+uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                                     int height, int *sum);
+
 #endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/media/libaom/src/aom_dsp/x86/synonyms.h b/media/libaom/src/aom_dsp/x86/synonyms.h
index 2e99bee3e9..d53801581b 100644
--- a/media/libaom/src/aom_dsp/x86/synonyms.h
+++ b/media/libaom/src/aom_dsp/x86/synonyms.h
@@ -100,6 +100,12 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
   return _mm_srli_epi32(v_tmp_d, bits);
 }
 
+static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
 // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
 static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
   const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
diff --git a/media/libaom/src/aom_dsp/x86/transpose_sse2.h b/media/libaom/src/aom_dsp/x86/transpose_sse2.h
index 7ac692c78b..9dab750f44 100644
--- a/media/libaom/src/aom_dsp/x86/transpose_sse2.h
+++ b/media/libaom/src/aom_dsp/x86/transpose_sse2.h
@@ -107,10 +107,14 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in,
   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
 
   // Unpack 32 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
+  // out[0]: 00 10 20 30  01 11 21 31
+  // out[1]: 01 11 21 31  __ __ __ __
+  // out[2]: 02 12 22 32  03 13 23 33
+  // out[3]: 03 13 23 33  __ __ __ __
+  //
+  // Note: The high 64 bits of the output registers are shown for informational
+  // purposes only. Callers should only use the low 64 bits of the output
+  // registers. "__" indicates zeros.
   out[0] = _mm_unpacklo_epi32(a0, a1);
   out[1] = _mm_srli_si128(out[0], 8);
   out[2] = _mm_unpackhi_epi32(a0, a1);
diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h
index ea57c9f35e..4105250bc0 100644
--- a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h
+++ b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h
@@ -246,21 +246,19 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
   }
 }
 
-static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) {
   __m256i tmp, round;
   round = _mm256_set1_epi32(1 << (bit - 1));
   tmp = _mm256_add_epi32(vec, round);
   return _mm256_srai_epi32(tmp, bit);
 }
 
-static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
-                                                 __m256i *output,
-                                                 const int size,
-                                                 const int bit) {
+static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output,
+                                             const int size, const int bit) {
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
-      output[i] = av1_round_shift_32_avx2(input[i], bit);
+      output[i] = round_shift_32_avx2(input[i], bit);
     }
   } else {
     int i;
@@ -270,25 +268,24 @@ static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
   }
 }
 
-static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
-                                                      __m256i *output,
-                                                      const int size,
-                                                      const int bit,
-                                                      const int val) {
+static INLINE void round_shift_rect_array_32_avx2(__m256i *input,
+                                                  __m256i *output,
+                                                  const int size, const int bit,
+                                                  const int val) {
   const __m256i sqrt2 = _mm256_set1_epi32(val);
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
-      const __m256i r0 = av1_round_shift_32_avx2(input[i], bit);
+      const __m256i r0 = round_shift_32_avx2(input[i], bit);
       const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
-      output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
     }
   } else {
     int i;
     for (i = 0; i < size; i++) {
       const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
       const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
-      output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
     }
   }
 }
diff --git a/media/libaom/src/aom_dsp/x86/variance_avx2.c b/media/libaom/src/aom_dsp/x86/variance_avx2.c
index c4919ba9b4..7398a73b0e 100644
--- a/media/libaom/src/aom_dsp/x86/variance_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/variance_avx2.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
 
 static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
   return _mm_add_epi16(_mm256_castsi256_si128(val),
@@ -185,19 +186,22 @@ static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
     return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
   }
 
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512)
 
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048)
 
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512)
+#endif
 
 #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
   unsigned int aom_variance##bw##x##bh##_avx2(                                \
@@ -218,10 +222,10 @@ AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
     return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
   }
 
-AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
-AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
-AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32)    // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32)   // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16)   // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16)  // 128x16 * (128/16)
 
 unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
@@ -272,19 +276,21 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
   }
 
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
-AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
-AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
-AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
-AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7)
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7)
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5)
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6)
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5)
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4)
+AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5)
+AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4)
+AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3)
+#if !CONFIG_REALTIME_ONLY
+AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6)
+AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2)
+#endif
 
 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
   unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
@@ -318,14 +324,14 @@ AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
     return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
   }
 
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4)
 
 static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
   const __m256i d =
@@ -394,25 +400,20 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
       comp_pred += (16 << 2);
       i += 4;
     } while (i < height);
-  } else {  // for width == 32
+  } else {
     do {
-      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
-      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
-      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
-
-      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
-      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
-      const __m256i aB =
-          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
-
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (32 << 1);
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
+      for (int x = 0; x < width; x += 32) {
+        const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x));
+        const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x));
+        const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x));
+
+        comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+        comp_pred += 32;
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      i++;
     } while (i < height);
   }
 }
@@ -498,29 +499,231 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
       comp_pred += width;
       i += 1;
     } while (i < height);
-  } else if (width == 32) {
+  } else {
     do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
-      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
-      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+      for (int x = 0; x < width; x += 32) {
+        const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
+        const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
+        const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
+        const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
 
-      const __m256i m01_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-      const __m256i m23_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+        const __m256i m01_16 =
+            _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
+        const __m256i m23_16 = _mm256_cvtepu8_epi16(
+            _mm_loadu_si128((const __m128i *)(mask + x + 16)));
 
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
-      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+        const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+        const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
 
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+        _mm256_storeu_si256((__m256i *)comp_pred, comp);
+        _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
 
+        comp_pred += 32;
+      }
       src0 += stride0;
       src1 += stride1;
       mask += mask_stride;
-      comp_pred += width;
       i += 1;
     } while (i < height);
   }
 }
+
+uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
+  __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
+  __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  for (int i = 0; i < h; i += 4) {
+    dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
+    dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+    dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
+    dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
+                                  _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
+    dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
+
+    src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+    src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
+    src1_8x16 =
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);  // 32bit store
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);  // 32bit store
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_8x8, dst1_8x8, dst3_16x8;
+  __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
+  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+  __m256i sub_result;
+  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+    dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8);
+    dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+    src0_8x16 =
+        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+    src1_8x16 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+    square_result = _mm256_add_epi64(
+        square_result,
+        _mm256_add_epi64(
+            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+            res3_4x64));
+  }
+
+  const __m128i sum_2x64 =
+      _mm_add_epi64(_mm256_castsi256_si128(square_result),
+                    _mm256_extracti128_si256(square_result, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
+
+static INLINE void sum_final_256bit_avx2(__m256i sum_8x16[2], int *const sum) {
+  const __m256i sum_result_0 = _mm256_hadd_epi16(sum_8x16[0], sum_8x16[1]);
+  const __m256i sum_result_1 =
+      _mm256_add_epi16(_mm256_srli_si256(sum_result_0, 4), sum_result_0);
+  const __m256i sum_result_2 =
+      _mm256_add_epi16(_mm256_srli_si256(sum_result_1, 2), sum_result_1);
+  const __m128i sum_128_high = _mm256_extractf128_si256(sum_result_2, 1);
+  const __m128i sum_result_3 =
+      _mm_unpacklo_epi16(_mm256_castsi256_si128(sum_result_2), sum_128_high);
+  const __m128i sum_result_4 =
+      _mm_unpackhi_epi16(_mm256_castsi256_si128(sum_result_2), sum_128_high);
+  const __m128i sum_result_5 = _mm_unpacklo_epi32(sum_result_3, sum_result_4);
+
+  _mm_storeu_si128((__m128i *)sum, _mm_cvtepi16_epi32(sum_result_5));
+}
+
+static INLINE void calc_sum_sse_for_8x32_block_avx2(const uint8_t *src,
+                                                    const uint8_t *ref,
+                                                    __m256i sse_8x16[2],
+                                                    __m256i sum_8x16[2]) {
+  const __m256i s0_256 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)src));
+  const __m256i r0_256 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)ref));
+  const __m256i s1_256 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(src + 16)));
+  const __m256i r1_256 =
+      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(ref + 16)));
+  const __m256i diff0 = _mm256_sub_epi16(s0_256, r0_256);
+  const __m256i diff1 = _mm256_sub_epi16(s1_256, r1_256);
+
+  sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0));
+  sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1));
+  sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff0);
+  sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1);
+}
+
+static INLINE void get_sse_sum_8x8_quad_avx2(const uint8_t *src,
+                                             const int src_stride,
+                                             const uint8_t *ref,
+                                             const int ref_stride, const int h,
+                                             unsigned int *const sse,
+                                             int *const sum) {
+  assert(h <= 128);  // May overflow for larger height.
+  __m256i sse_8x16[2], sum_8x16[2];
+  sum_8x16[0] = _mm256_setzero_si256();
+  sse_8x16[0] = _mm256_setzero_si256();
+  sum_8x16[1] = sum_8x16[0];
+  sse_8x16[1] = sse_8x16[0];
+
+  for (int i = 0; i < h; i += 2) {
+    // Process 8x32 block of first row.
+    calc_sum_sse_for_8x32_block_avx2(src, ref, sse_8x16, sum_8x16);
+
+    // Process 8x32 block of second row.
+    calc_sum_sse_for_8x32_block_avx2(src + src_stride, ref + ref_stride,
+                                     sse_8x16, sum_8x16);
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+
+  // Add sse registers appropriately to get each 8x8 block sse separately.
+  const __m256i sse_result_1 = _mm256_hadd_epi32(sse_8x16[0], sse_8x16[1]);
+  const __m256i sse_result_2 =
+      _mm256_hadd_epi32(sse_result_1, _mm256_setzero_si256());
+  const __m256i sse_result_3 = _mm256_permute4x64_epi64(sse_result_2, 0xd8);
+
+  _mm_storeu_si128(
+      (__m128i *)sse,
+      _mm_shuffle_epi32(_mm256_castsi256_si128(sse_result_3), 0xd8));
+
+  // Add sum registers appropriately to get each 8x8 block sum separately.
+  sum_final_256bit_avx2(sum_8x16, sum);
+}
+
+void aom_get_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse, int *sum) {
+  get_sse_sum_8x8_quad_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse,
+                            sum);
+}
diff --git a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c
index f779270ae3..163e4cc566 100644
--- a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c
+++ b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c
@@ -616,7 +616,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-    } else if (y_offset == 8) {
+    } else if (y_offset == 4) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
         LOAD_SRC_DST
@@ -652,8 +652,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         dst += dst_stride;
       }
     }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
     if (y_offset == 0) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
@@ -668,8 +668,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i src_next_reg, src_avg;
       // load source and another source starting from the next
       // following byte
@@ -691,7 +691,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         CALC_SUM_SSE_INSIDE_LOOP
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = bilin interpolation
+      // x_offset = 4  and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg, src_avg;
       y_offset <<= 5;
@@ -741,8 +741,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i filter, pw8, src_next_reg, src_pack;
       x_offset <<= 5;
       filter = _mm256_load_si256(
diff --git a/media/libaom/src/aom_dsp/x86/variance_sse2.c b/media/libaom/src/aom_dsp/x86/variance_sse2.c
index 4e2b5a1aa0..c36eeeedde 100644
--- a/media/libaom/src/aom_dsp/x86/variance_sse2.c
+++ b/media/libaom/src/aom_dsp/x86/variance_sse2.c
@@ -14,18 +14,12 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
-
 #include "aom_ports/mem.h"
 
-#include "av1/common/av1_common_int.h"
-#include "av1/common/filter.h"
-#include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
-
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
   int i;
@@ -42,8 +36,8 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
   return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
 }
 
@@ -246,6 +240,29 @@ void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
   variance_final_128_pel_sse2(vsse, vsum, sse, sum);
 }
 
+void aom_get_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse, int *sum) {
+  // Loop over 4 8x8 blocks. Process one 8x32 block.
+  for (int k = 0; k < 4; k++) {
+    const uint8_t *src = src_ptr;
+    const uint8_t *ref = ref_ptr;
+    __m128i vsum = _mm_setzero_si128();
+    __m128i vsse = _mm_setzero_si128();
+    for (int i = 0; i < 8; i++) {
+      const __m128i s = load8_8to16_sse2(src + (k * 8));
+      const __m128i r = load8_8to16_sse2(ref + (k * 8));
+      const __m128i diff = _mm_sub_epi16(s, r);
+      vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff, diff));
+      vsum = _mm_add_epi16(vsum, diff);
+
+      src += src_stride;
+      ref += ref_stride;
+    }
+    variance_final_128_pel_sse2(vsse, vsum, &sse[k], &sum[k]);
+  }
+}
+
 #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
   unsigned int aom_variance##bw##x##bh##_sse2(                                \
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
@@ -260,24 +277,27 @@ void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
     return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
   }
 
-AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128)
 
-AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128)
 
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128)
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512)
 
-AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024)
+#endif
 
 #define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
   unsigned int aom_variance##bw##x##bh##_sse2(                                \
@@ -300,15 +320,18 @@ AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
     return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
   }
 
-AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32)  // 32x32 * ( 64/32 )
+
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16)   // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16)   // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16)  // 64x16 * ( 128/16 )
 
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
-AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
-AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
-AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8)   // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8)  // 128x8 * ( 128/8 )
 
-AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
-AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024)
+#endif
 
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
@@ -383,32 +406,52 @@ DECLS(ssse3);
     return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
   }
 
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
+#else
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
 
 #undef FNS
 #undef FN
@@ -462,192 +505,56 @@ DECLS(ssse3);
     return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
+#else
+#define FNS(opt)                                    \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
 
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
-                             int mi_row, int mi_col, const MV *const mv,
-                             uint8_t *comp_pred, int width, int height,
-                             int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride,
-                             int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
-  // 2-tap yet.
-  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    if (width >= 16) {
-      int i;
-      assert(!(width & 15));
-      /*Read 16 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 16) {
-          xx_storeu_128(comp_pred, xx_loadu_128(ref));
-          comp_pred += 16;
-          ref += 16;
-        }
-        ref += ref_stride - width;
-      }
-    } else if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      assert(!(height & 1));
-      /*Read 8 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
-        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
-        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
-        comp_pred += 16;
-        ref += 2 * ref_stride;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      assert(!(height & 3));
-      /*Read 4 pixels four rows at a time.*/
-      for (i = 0; i < height; i++) {
-        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
-        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
-        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
-        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
-        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
-                                               _mm_unpacklo_epi32(row2, row3));
-        xx_storeu_128(comp_pred, reg);
-        comp_pred += 16;
-        ref += 4 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
-    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
-                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
-                                    : temp;
-    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
-                        kernel_x, 16, NULL, -1, width, intermediate_height);
-    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
-                       kernel_y, 16, width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = xx_loadu_128(comp_pred);
-    __m128i p0 = xx_loadu_128(pred);
-    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
-}
-
 static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
                                                       const __m128i s1,
                                                       const __m128i a) {
@@ -727,31 +634,131 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
       comp_pred += width;
       i += 1;
     } while (i < height);
-  } else if (width == 32) {
+  } else {
     do {
-      for (int j = 0; j < 2; j++) {
-        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
-        const __m128i s2 =
-            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
-        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
-        const __m128i s3 =
-            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
-
-        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
-        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
-        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
-        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
-        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+      for (int x = 0; x < width; x += 32) {
+        for (int j = 0; j < 2; j++) {
+          const __m128i s0 =
+              _mm_loadu_si128((const __m128i *)(src0 + x + j * 16));
+          const __m128i s2 =
+              _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16));
+          const __m128i s1 =
+              _mm_loadu_si128((const __m128i *)(src1 + x + j * 16));
+          const __m128i s3 =
+              _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16));
+
+          const __m128i m_8 =
+              _mm_loadu_si128((const __m128i *)(mask + x + j * 16));
+          const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+          const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+          const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+          const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+          _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+          _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+        }
+        comp_pred += 32;
       }
       src0 += stride0;
       src1 += stride1;
       mask += mask_stride;
-      comp_pred += width;
       i += 1;
     } while (i < height);
   }
 }
+
+uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_8x8, dst1_8x8, dst_16x8;
+  __m128i src0_16x4, src1_16x4, src_16x8;
+  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+  __m128i sub_result_16x8;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+  for (int i = 0; i < h; i += 2) {
+    dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
+    dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
+
+    src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
+
+    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
+    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+
+    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
+    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+
+    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
+    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
+    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
+    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
+            res3_64x4));
+  }
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst_8x8, dst_16x8;
+  __m128i src_16x8;
+  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+  __m128i sub_result_16x8;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+
+  for (int i = 0; i < h; i++) {
+    dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+    dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros);
+
+    src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
+    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+
+    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
+    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+
+    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
+    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
+    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
+    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
+
+    square_result = _mm_add_epi64(
+        square_result,
+        _mm_add_epi64(
+            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
+            res3_64x4));
+  }
+  const __m128i sum_1x64 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  switch (w) {
+    case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+    case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+    default: assert(0 && "unsupported width"); return -1;
+  }
+}
diff --git a/media/libaom/src/aom_mem/aom_mem.c b/media/libaom/src/aom_mem/aom_mem.c
index e977b01d7d..f13ee2fa27 100644
--- a/media/libaom/src/aom_mem/aom_mem.c
+++ b/media/libaom/src/aom_mem/aom_mem.c
@@ -10,25 +10,31 @@
  */
 
 #include "aom_mem.h"
-#include <stdio.h>
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "include/aom_mem_intrnl.h"
 #include "aom/aom_integer.h"
 
-#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+static size_t GetAllocationPaddingSize(size_t align) {
+  assert(align > 0);
+  assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE);
+  return align - 1 + ADDRESS_STORAGE_SIZE;
+}
+
 // Returns 0 in case of overflow of nmemb * size.
-static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
-  const uint64_t total_size = nmemb * size;
+static int check_size_argument_overflow(size_t nmemb, size_t size,
+                                        size_t align) {
   if (nmemb == 0) return 1;
-  if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
-  if (total_size != (size_t)total_size) return 0;
-  return 1;
-}
+  const size_t alloc_padding = GetAllocationPaddingSize(align);
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+  assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding);
+  assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX);
+  if (size > (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb) return 0;
+#else
+  if (size > (SIZE_MAX - alloc_padding) / nmemb) return 0;
 #endif
-
-static size_t GetAlignedMallocSize(size_t size, size_t align) {
-  return size + align - 1 + ADDRESS_STORAGE_SIZE;
+  return 1;
 }
 
 static size_t *GetMallocAddressLocation(void *const mem) {
@@ -48,10 +54,8 @@ static void *GetActualMallocAddress(void *const mem) {
 
 void *aom_memalign(size_t align, size_t size) {
   void *x = NULL;
-  const size_t aligned_size = GetAlignedMallocSize(size, align);
-#if defined(AOM_MAX_ALLOCABLE_MEMORY)
-  if (!check_size_argument_overflow(1, aligned_size)) return NULL;
-#endif
+  if (!check_size_argument_overflow(1, size, align)) return NULL;
+  const size_t aligned_size = size + GetAllocationPaddingSize(align);
   void *const addr = malloc(aligned_size);
   if (addr) {
     x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
@@ -63,6 +67,7 @@ void *aom_memalign(size_t align, size_t size) {
 void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
 
 void *aom_calloc(size_t num, size_t size) {
+  if (!check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT)) return NULL;
   const size_t total_size = num * size;
   void *const x = aom_malloc(total_size);
   if (x) memset(x, 0, total_size);
diff --git a/media/libaom/src/aom_ports/aom_once.h b/media/libaom/src/aom_ports/aom_once.h
index d1a031bf17..37057e6169 100644
--- a/media/libaom/src/aom_ports/aom_once.h
+++ b/media/libaom/src/aom_ports/aom_once.h
@@ -62,7 +62,7 @@ static void aom_once(void (*func)(void)) {
 #define INCL_DOS
 #include <os2.h>
 static void aom_once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   /* If the initialization is complete, return early. */
   if (done) return;
@@ -92,7 +92,7 @@ static void aom_once(void (*func)(void)) {
 /* Default version that performs no synchronization. */
 
 static void aom_once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   if (!done) {
     func();
diff --git a/media/libaom/src/aom_ports/aom_ports.cmake b/media/libaom/src/aom_ports/aom_ports.cmake
index d579896549..5d9f69a79b 100644
--- a/media/libaom/src/aom_ports/aom_ports.cmake
+++ b/media/libaom/src/aom_ports/aom_ports.cmake
@@ -13,19 +13,14 @@ if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
 endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
 set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
 
-list(APPEND AOM_PORTS_INCLUDES
-            "${AOM_ROOT}/aom_ports/aom_once.h"
-            "${AOM_ROOT}/aom_ports/aom_timer.h"
-            "${AOM_ROOT}/aom_ports/bitops.h"
+list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
+            "${AOM_ROOT}/aom_ports/aom_timer.h" "${AOM_ROOT}/aom_ports/bitops.h"
             "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
-            "${AOM_ROOT}/aom_ports/mem.h"
-            "${AOM_ROOT}/aom_ports/mem_ops.h"
+            "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
             "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
-            "${AOM_ROOT}/aom_ports/msvc.h"
-            "${AOM_ROOT}/aom_ports/sanitizer.h"
-            "${AOM_ROOT}/aom_ports/system_state.h")
+            "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
 
-list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
 
 list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
 
@@ -48,8 +43,9 @@ list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
 #
 # * The libaom target must exist before this function is called.
 function(setup_aom_ports_targets)
-  if("${AOM_TARGET_CPU}" MATCHES "^x86")
+  if(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
     add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
+    set(aom_ports_asm_lib 1)
     set(aom_ports_has_symbols 1)
   elseif("${AOM_TARGET_CPU}" MATCHES "arm")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
@@ -66,27 +62,27 @@ function(setup_aom_ports_targets)
     endif()
   endif()
 
+  # Note AOM_PORTS_INCLUDES_X86 are not added to the aom_ports, aom or
+  # aom_static targets to avoid compilation issues in projects that enable ASM
+  # language support in project(). These sources were never included in
+  # libaom_srcs.*; if it becomes necessary for a particular generator another
+  # method should be used.
   if(aom_ports_has_symbols)
-    target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
-
-    if("${AOM_TARGET_CPU}" STREQUAL "x86"
-       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-      target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
+    if(aom_ports_asm_lib)
+      # When aom_ports is an asm library its name changes based on build
+      # configuration. This handles adding sources to the correct target(s).
+      target_sources(aom_ports_static PRIVATE ${AOM_PORTS_INCLUDES})
+      if(BUILD_SHARED_LIBS)
+        target_sources(aom_ports_shared PRIVATE ${AOM_PORTS_INCLUDES})
+      endif()
+    else()
+      target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
     endif()
-
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
   else()
     target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
     if(BUILD_SHARED_LIBS)
       target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES})
     endif()
-
-    if("${AOM_TARGET_CPU}" STREQUAL "x86"
-       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-      target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
-      if(BUILD_SHARED_LIBS)
-        target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES_X86})
-      endif()
-    endif()
   endif()
 endfunction()
diff --git a/media/libaom/src/aom_ports/arm_cpudetect.c b/media/libaom/src/aom_ports/arm_cpudetect.c
index 5a75bb3484..23d3aa503a 100644
--- a/media/libaom/src/aom_ports/arm_cpudetect.c
+++ b/media/libaom/src/aom_ports/arm_cpudetect.c
@@ -38,7 +38,7 @@ static int arm_cpu_env_mask(void) {
   return env && *env ? (int)strtol(env, NULL, 0) : ~0;
 }
 
-#if !CONFIG_RUNTIME_CPU_DETECT
+#if !CONFIG_RUNTIME_CPU_DETECT || defined(__APPLE__)
 
 int aom_arm_cpu_caps(void) {
   /* This function should actually be a no-op. There is no way to adjust any of
@@ -56,7 +56,7 @@ int aom_arm_cpu_caps(void) {
   return flags & mask;
 }
 
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT || __APPLE__ */
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
 #define WIN32_LEAN_AND_MEAN
 #define WIN32_EXTRA_LEAN
@@ -145,6 +145,6 @@ int aom_arm_cpu_caps(void) {
 }
 #else  /* end __linux__ */
 #error \
-    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
-"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+    "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
 #endif
diff --git a/media/libaom/src/aom_ports/emms.asm b/media/libaom/src/aom_ports/float.asm
index 90776bacb3..abff60a7a4 100644
--- a/media/libaom/src/aom_ports/emms.asm
+++ b/media/libaom/src/aom_ports/float.asm
@@ -9,20 +9,12 @@
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 
-;
-
 
 %include "aom_ports/x86_abi_support.asm"
 
 section .text
-global sym(aom_reset_mmx_state) PRIVATE
-sym(aom_reset_mmx_state):
-    emms
-    ret
-
-
 %if LIBAOM_YASM_WIN64
-global sym(aom_winx64_fldcw) PRIVATE
+globalsym(aom_winx64_fldcw)
 sym(aom_winx64_fldcw):
     sub   rsp, 8
     mov   [rsp], rcx ; win x64 specific
@@ -31,7 +23,7 @@ sym(aom_winx64_fldcw):
     ret
 
 
-global sym(aom_winx64_fstcw) PRIVATE
+globalsym(aom_winx64_fstcw)
 sym(aom_winx64_fstcw):
     sub   rsp, 8
     fstcw [rsp]
diff --git a/media/libaom/src/aom_ports/mem.h b/media/libaom/src/aom_ports/mem.h
index 9e3d424037..e9bb8adbc5 100644
--- a/media/libaom/src/aom_ports/mem.h
+++ b/media/libaom/src/aom_ports/mem.h
@@ -38,7 +38,10 @@
 #define __builtin_prefetch(x)
 #endif
 
-/* Shift down with rounding for use when n >= 0, value >= 0 */
+/* Shift down with rounding for use when n >= 0. Usually value >= 0, but the
+ * macro can be used with a negative value if the direction of rounding is
+ * acceptable.
+ */
 #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
 
 /* Shift down with rounding for signed integers, for use when n >= 0 */
@@ -46,14 +49,22 @@
   (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                  : ROUND_POWER_OF_TWO((value), (n)))
 
-/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */
+/* Shift down with rounding for use when n >= 0 (64-bit value). Usually
+ * value >= 0, but the macro can be used with a negative value if the direction
+ * of rounding is acceptable.
+ */
 #define ROUND_POWER_OF_TWO_64(value, n) \
   (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
-/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */
+/* Shift down with rounding for signed integers, for use when n >= 0 (64-bit
+ * value)
+ */
 #define ROUND_POWER_OF_TWO_SIGNED_64(value, n)           \
   (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
                  : ROUND_POWER_OF_TWO_64((value), (n)))
 
+/* Shift down with ceil() for use when n >= 0 and value >= 0.*/
+#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
+
 /* shift right or left depending on sign of n */
 #define RIGHT_SIGNED_SHIFT(value, n) \
   ((n) < 0 ? ((value) << (-(n))) : ((value) >> (n)))
diff --git a/media/libaom/src/aom_ports/x86.h b/media/libaom/src/aom_ports/x86.h
index 8c18448714..79cbd02bf2 100644
--- a/media/libaom/src/aom_ports/x86.h
+++ b/media/libaom/src/aom_ports/x86.h
@@ -48,7 +48,7 @@ typedef enum {
 #define cpuid(func, func2, ax, bx, cx, dx)                      \
   __asm__ __volatile__("cpuid           \n\t"                   \
                        : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
-                       : "a"(func), "c"(func2));
+                       : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   __asm__ __volatile__(                        \
@@ -56,7 +56,7 @@ typedef enum {
       "cpuid              \n\t"                \
       "xchg %%edi, %%ebx  \n\t"                \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #elif defined(__SUNPRO_C) || \
     defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
@@ -68,7 +68,7 @@ typedef enum {
       "movl %ebx, %edi \n\t"                   \
       "xchg %rsi, %rbx \n\t"                   \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   asm volatile(                                \
@@ -77,7 +77,7 @@ typedef enum {
       "movl %ebx, %edi  \n\t"                  \
       "popl %ebx        \n\t"                  \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #else /* end __SUNPRO__ */
 #if ARCH_X86_64
@@ -164,15 +164,14 @@ static INLINE uint64_t xgetbv(void) {
 #define HAS_AVX2 0x80
 #define HAS_SSE4_2 0x100
 #ifndef BIT
-#define BIT(n) (1 << n)
+#define BIT(n) (1u << (n))
 #endif
 
 static INLINE int x86_simd_caps(void) {
   unsigned int flags = 0;
-  unsigned int mask = ~0;
+  unsigned int mask = ~0u;
   unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
   char *env;
-  (void)reg_ebx;
 
   /* See if the CPU capabilities are being overridden by the environment */
   env = getenv("AOM_SIMD_CAPS");
@@ -207,6 +206,7 @@ static INLINE int x86_simd_caps(void) {
 
   // bits 27 (OSXSAVE) & 28 (256-bit AVX)
   if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
     if ((xgetbv() & 0x6) == 0x6) {
       flags |= HAS_AVX;
 
@@ -219,12 +219,14 @@ static INLINE int x86_simd_caps(void) {
     }
   }
 
+  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
+
   return flags & mask;
 }
 
 // Fine-Grain Measurement Functions
 //
-// If you are a timing a small region of code, access the timestamp counter
+// If you are timing a small region of code, access the timestamp counter
 // (TSC) via:
 //
 // unsigned int start = x86_tsc_start();
@@ -236,7 +238,7 @@ static INLINE int x86_simd_caps(void) {
 // x86_readtsc directly, but prevent the CPU's out-of-order execution from
 // affecting the measurement (by having earlier/later instructions be evaluated
 // in the time interval). See the white paper, "How to Benchmark Code
-// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
 // Gabriele Paoloni for more information.
 //
 // If you are timing a large function (CPU time > a couple of seconds), use
@@ -302,14 +304,26 @@ static INLINE unsigned int x86_readtscp(void) {
 
 static INLINE unsigned int x86_tsc_start(void) {
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return x86_readtsc();
 }
 
 static INLINE unsigned int x86_tsc_end(void) {
   uint32_t v = x86_readtscp();
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return v;
 }
 
@@ -362,12 +376,21 @@ static unsigned short x87_get_control_word(void) {
 
 static INLINE unsigned int x87_set_double_precision(void) {
   unsigned int mode = x87_get_control_word();
+  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
+  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
+  // 8.1.5.2 Precision Control Field
+  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
+  // determine the number of bits used in floating point calculations. To match
+  // later SSE instructions restrict x87 operations to Double Precision (0x200).
+  // Precision                     PC Field
+  // Single Precision (24-Bits)    00B
+  // Reserved                      01B
+  // Double Precision (53-Bits)    10B
+  // Extended Precision (64-Bits)  11B
   x87_set_control_word((mode & ~0x300) | 0x200);
   return mode;
 }
 
-extern void aom_reset_mmx_state(void);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/aom_ports/x86_abi_support.asm b/media/libaom/src/aom_ports/x86_abi_support.asm
index 64489908f0..f1a65f53e5 100644
--- a/media/libaom/src/aom_ports/x86_abi_support.asm
+++ b/media/libaom/src/aom_ports/x86_abi_support.asm
@@ -92,34 +92,51 @@
 %define LIBAOM_YASM_WIN64 0
 %endif
 
+; Declare groups of platforms
+%ifidn   __OUTPUT_FORMAT__,elf32
+  %define LIBAOM_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elfx32
+  %define LIBAOM_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elf64
+  %define LIBAOM_ELF 1
+%else
+  %define LIBAOM_ELF 0
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+  %define LIBAOM_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+  %define LIBAOM_MACHO 1
+%else
+  %define LIBAOM_MACHO 0
+%endif
+
 ; sym()
 ; Return the proper symbol name for the target ABI.
 ;
 ; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
 ; with C linkage be prefixed with an underscore.
 ;
-%ifidn   __OUTPUT_FORMAT__,elf32
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elf64
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elfx32
-%define sym(x) x
-%elif LIBAOM_YASM_WIN64
-%define sym(x) x
+%if LIBAOM_ELF || LIBAOM_YASM_WIN64
+  %define sym(x) x
 %else
-%define sym(x) _ %+ x
+  ; Mach-O / COFF
+  %define sym(x) _ %+ x
 %endif
 
-;  PRIVATE
-;  Macro for the attribute to hide a global symbol for the target ABI.
-;  This is only active if CHROMIUM is defined.
+; globalsym()
+; Return a global declaration with the proper decoration for the target ABI.
+;
+; When CHROMIUM is defined, include attributes to hide the symbol from the
+; global namespace.
 ;
-;  Chromium doesn't like exported global symbols due to symbol clashing with
-;  plugins among other things.
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
 ;
-;  Requires Chromium's patched copy of yasm:
-;    http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
-;    http://www.tortall.net/projects/yasm/ticket/236
+; Requires Chromium's patched copy of yasm:
+;   http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+;   http://www.tortall.net/projects/yasm/ticket/236
+; or nasm > 2.14.
 ;
 %ifdef CHROMIUM
   %ifdef __NASM_VER__
@@ -129,19 +146,16 @@
     %endif
   %endif
 
-  %ifidn   __OUTPUT_FORMAT__,elf32
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elf64
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elfx32
-    %define PRIVATE :hidden
-  %elif LIBAOM_YASM_WIN64
-    %define PRIVATE
+  %if LIBAOM_ELF
+    %define globalsym(x) global sym(x) %+ :function hidden
+  %elif LIBAOM_MACHO
+    %define globalsym(x) global sym(x) %+ :private_extern
   %else
-    %define PRIVATE :private_extern
+    ; COFF / PE32+
+    %define globalsym(x) global sym(x)
   %endif
 %else
-  %define PRIVATE
+  %define globalsym(x) global sym(x)
 %endif
 
 ; arg()
diff --git a/media/libaom/src/aom_scale/generic/yv12config.c b/media/libaom/src/aom_scale/generic/yv12config.c
index 1f80d7ba72..de56263fa4 100644
--- a/media/libaom/src/aom_scale/generic/yv12config.c
+++ b/media/libaom/src/aom_scale/generic/yv12config.c
@@ -50,7 +50,8 @@ static int realloc_frame_buffer_aligned(
     void *cb_priv, const int y_stride, const uint64_t yplane_size,
     const uint64_t uvplane_size, const int aligned_width,
     const int aligned_height, const int uv_width, const int uv_height,
-    const int uv_stride, const int uv_border_w, const int uv_border_h) {
+    const int uv_stride, const int uv_border_w, const int uv_border_h,
+    int alloc_y_buffer_8bit, int alloc_y_plane_only) {
   if (ybf) {
     const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
     const uint64_t frame_size =
@@ -143,17 +144,22 @@ static int realloc_frame_buffer_aligned(
 
     ybf->y_buffer = (uint8_t *)aom_align_addr(
         buf + (border * y_stride) + border, aom_byte_align);
-    ybf->u_buffer = (uint8_t *)aom_align_addr(
-        buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
-        aom_byte_align);
-    ybf->v_buffer =
-        (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +
-                                      (uv_border_h * uv_stride) + uv_border_w,
-                                  aom_byte_align);
+    if (!alloc_y_plane_only) {
+      ybf->u_buffer = (uint8_t *)aom_align_addr(
+          buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+          aom_byte_align);
+      ybf->v_buffer =
+          (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +
+                                        (uv_border_h * uv_stride) + uv_border_w,
+                                    aom_byte_align);
+    } else {
+      ybf->u_buffer = NULL;
+      ybf->v_buffer = NULL;
+    }
 
     ybf->use_external_reference_buffers = 0;
 
-    if (use_highbitdepth) {
+    if (use_highbitdepth && alloc_y_buffer_8bit) {
       if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
       ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
       if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR;
@@ -171,26 +177,30 @@ static int realloc_frame_buffer_aligned(
   return AOM_CODEC_MEM_ERROR;
 }
 
-static int calc_stride_and_planesize(const int ss_x, const int ss_y,
-                                     const int aligned_width,
-                                     const int aligned_height, const int border,
-                                     const int byte_alignment, int *y_stride,
-                                     int *uv_stride, uint64_t *yplane_size,
-                                     uint64_t *uvplane_size,
-                                     const int uv_height) {
+static int calc_stride_and_planesize(
+    const int ss_x, const int ss_y, const int aligned_width,
+    const int aligned_height, const int border, const int byte_alignment,
+    int alloc_y_plane_only, int *y_stride, int *uv_stride,
+    uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) {
   /* Only support allocating buffers that have a border that's a multiple
    * of 32. The border restriction is required to get 16-byte alignment of
    * the start of the chroma rows without introducing an arbitrary gap
    * between planes, which would break the semantics of things like
    * aom_img_set_rect(). */
   if (border & 0x1f) return AOM_CODEC_MEM_ERROR;
-  *y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+  *y_stride = aom_calc_y_stride(aligned_width, border);
   *yplane_size =
       (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
 
-  *uv_stride = *y_stride >> ss_x;
-  *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
-                  byte_alignment;
+  if (!alloc_y_plane_only) {
+    *uv_stride = *y_stride >> ss_x;
+    *uvplane_size =
+        (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
+        byte_alignment;
+  } else {
+    *uv_stride = 0;
+    *uvplane_size = 0;
+  }
   return 0;
 }
 
@@ -198,7 +208,8 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                              int ss_x, int ss_y, int use_highbitdepth,
                              int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
-                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv,
+                             int alloc_y_buffer_8bit, int alloc_y_plane_only) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
     return AOM_CODEC_MEM_ERROR;
@@ -218,25 +229,26 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
 
     int error = calc_stride_and_planesize(
         ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
-        &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height);
+        alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
+        uv_height);
     if (error) return error;
     return realloc_frame_buffer_aligned(
         ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
         byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
         aligned_width, aligned_height, uv_width, uv_height, uv_stride,
-        uv_border_w, uv_border_h);
+        uv_border_w, uv_border_h, alloc_y_buffer_8bit, alloc_y_plane_only);
   }
   return AOM_CODEC_MEM_ERROR;
 }
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                            int ss_x, int ss_y, int use_highbitdepth, int border,
-                           int byte_alignment) {
+                           int byte_alignment, int alloc_y_plane_only) {
   if (ybf) {
     aom_free_frame_buffer(ybf);
     return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
                                     use_highbitdepth, border, byte_alignment,
-                                    NULL, NULL, NULL);
+                                    NULL, NULL, NULL, 0, alloc_y_plane_only);
   }
   return AOM_CODEC_MEM_ERROR;
 }
@@ -251,6 +263,7 @@ void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
 int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                       const aom_metadata_array_t *arr) {
   if (!ybf || !arr || !arr->metadata_array) return -1;
+  if (ybf->metadata == arr) return 0;
   aom_remove_metadata_from_frame_buffer(ybf);
   ybf->metadata = aom_img_metadata_array_alloc(arr->sz);
   if (!ybf->metadata) return -1;
diff --git a/media/libaom/src/aom_scale/generic/yv12extend.c b/media/libaom/src/aom_scale/generic/yv12extend.c
index 834a59dbff..5d797c8a5b 100644
--- a/media/libaom/src/aom_scale/generic/yv12extend.c
+++ b/media/libaom/src/aom_scale/generic/yv12extend.c
@@ -22,14 +22,16 @@
 static void extend_plane(uint8_t *const src, int src_stride, int width,
                          int height, int extend_top, int extend_left,
                          int extend_bottom, int extend_right) {
+  assert(src != NULL);
   int i;
   const int linesize = extend_left + extend_right + width;
+  assert(linesize <= src_stride);
 
   /* copy the left and right most columns out */
   uint8_t *src_ptr1 = src;
   uint8_t *src_ptr2 = src + width - 1;
   uint8_t *dst_ptr1 = src - extend_left;
-  uint8_t *dst_ptr2 = src + width;
+  uint8_t *dst_ptr2 = src_ptr2 + 1;
 
   for (i = 0; i < height; ++i) {
     memset(dst_ptr1, src_ptr1[0], extend_left);
@@ -44,18 +46,19 @@ static void extend_plane(uint8_t *const src, int src_stride, int width,
    * borders
    */
   src_ptr1 = src - extend_left;
-  src_ptr2 = src + src_stride * (height - 1) - extend_left;
-  dst_ptr1 = src + src_stride * -extend_top - extend_left;
-  dst_ptr2 = src + src_stride * height - extend_left;
+  dst_ptr1 = src_ptr1 + src_stride * -extend_top;
 
   for (i = 0; i < extend_top; ++i) {
     memcpy(dst_ptr1, src_ptr1, linesize);
     dst_ptr1 += src_stride;
   }
 
+  src_ptr2 = src_ptr1 + src_stride * (height - 1);
+  dst_ptr2 = src_ptr2;
+
   for (i = 0; i < extend_bottom; ++i) {
-    memcpy(dst_ptr2, src_ptr2, linesize);
     dst_ptr2 += src_stride;
+    memcpy(dst_ptr2, src_ptr2, linesize);
   }
 }
 
@@ -65,13 +68,14 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
                               int extend_bottom, int extend_right) {
   int i;
   const int linesize = extend_left + extend_right + width;
+  assert(linesize <= src_stride);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 
   /* copy the left and right most columns out */
   uint16_t *src_ptr1 = src;
   uint16_t *src_ptr2 = src + width - 1;
   uint16_t *dst_ptr1 = src - extend_left;
-  uint16_t *dst_ptr2 = src + width;
+  uint16_t *dst_ptr2 = src_ptr2 + 1;
 
   for (i = 0; i < height; ++i) {
     aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
@@ -86,18 +90,19 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
    * borders
    */
   src_ptr1 = src - extend_left;
-  src_ptr2 = src + src_stride * (height - 1) - extend_left;
-  dst_ptr1 = src + src_stride * -extend_top - extend_left;
-  dst_ptr2 = src + src_stride * height - extend_left;
+  dst_ptr1 = src_ptr1 + src_stride * -extend_top;
 
   for (i = 0; i < extend_top; ++i) {
     memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
     dst_ptr1 += src_stride;
   }
 
+  src_ptr2 = src_ptr1 + src_stride * (height - 1);
+  dst_ptr2 = src_ptr2;
+
   for (i = 0; i < extend_bottom; ++i) {
-    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
     dst_ptr2 += src_stride;
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
@@ -138,8 +143,8 @@ void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
 
 static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
                          const int num_planes) {
-  const int ss_x = ybf->uv_width < ybf->y_width;
-  const int ss_y = ybf->uv_height < ybf->y_height;
+  const int ss_x = ybf->subsampling_x;
+  const int ss_y = ybf->subsampling_y;
 
   assert(ybf->y_height - ybf->y_crop_height < 16);
   assert(ybf->y_width - ybf->y_crop_width < 16);
@@ -220,13 +225,8 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
 // Note: The frames are assumed to be identical in size.
 void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
                            YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
-#if 0
-  /* These assertions are valid in the codec, but the libaom-tester uses
-   * this code slightly differently.
-   */
   assert(src_bc->y_width == dst_bc->y_width);
   assert(src_bc->y_height == dst_bc->y_height);
-#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
   assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
@@ -460,7 +460,7 @@ int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border,
     const int error = aom_alloc_frame_buffer(
         &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x,
         ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border,
-        byte_alignment);
+        byte_alignment, 0);
     if (error) return error;
     // Copy image buffer
     aom_yv12_copy_frame(ybf, &new_buf, num_planes);
diff --git a/media/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c b/media/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c
index 869e594d79..8556e71a20 100644
--- a/media/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c
+++ b/media/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c
@@ -90,6 +90,7 @@ static void extend_plane(uint8_t *const src, int src_stride, int width,
   top_dst = src + src_stride * (-extend_top) - extend_left;
   bot_dst = src + src_stride * (height)-extend_left;
   linesize = extend_left + extend_right + width;
+  assert(linesize <= src_stride);
 
   for (i = 0; i < extend_top; i++) {
     memcpy(top_dst, top_src, linesize);
@@ -105,8 +106,8 @@ static void extend_plane(uint8_t *const src, int src_stride, int width,
 static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   const int c_w = ybf->uv_crop_width;
   const int c_h = ybf->uv_crop_height;
-  const int ss_x = ybf->uv_width < ybf->y_width;
-  const int ss_y = ybf->uv_height < ybf->y_height;
+  const int ss_x = ybf->subsampling_x;
+  const int ss_y = ybf->subsampling_y;
   const int c_et = ext_size >> ss_y;
   const int c_el = ext_size >> ss_x;
   const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
diff --git a/media/libaom/src/aom_scale/yv12config.h b/media/libaom/src/aom_scale/yv12config.h
index 3642bb7f37..581e923322 100644
--- a/media/libaom/src/aom_scale/yv12config.h
+++ b/media/libaom/src/aom_scale/yv12config.h
@@ -23,13 +23,21 @@ extern "C" {
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_image_internal.h"
 
+/*!\cond */
+
 #define AOMINNERBORDERINPIXELS 160
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_ALLINTRA_BORDER 64
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
+/*!\endcond */
+/*!
+ * \brief YV12 frame buffer data structure
+ */
 typedef struct yv12_buffer_config {
+  /*!\cond */
   union {
     struct {
       int y_width;
@@ -106,13 +114,16 @@ typedef struct yv12_buffer_config {
   int corrupted;
   int flags;
   aom_metadata_array_t *metadata;
+  /*!\endcond */
 } YV12_BUFFER_CONFIG;
 
+/*!\cond */
+
 #define YV12_FLAG_HIGHBITDEPTH 8
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                            int ss_x, int ss_y, int use_highbitdepth, int border,
-                           int byte_alignment);
+                           int byte_alignment, int alloc_y_plane_only);
 
 // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
 // be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
@@ -125,10 +136,12 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                              int ss_x, int ss_y, int use_highbitdepth,
                              int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
-                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv,
+                             int alloc_y_buffer_8bit, int alloc_y_plane_only);
 
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
+/*!\endcond */
 /*!\brief Removes metadata from YUV_BUFFER_CONFIG struct.
  *
  * Frees metadata in frame buffer.
@@ -140,10 +153,12 @@ void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 /*!\brief Copy metadata to YUV_BUFFER_CONFIG struct.
  *
- * Copies metadata in frame buffer.
+ * Copies metadata to frame buffer.
  * Frame buffer will clear any previous metadata and will reallocate the
  * metadata array to the new metadata size. Then, it will copy the new metadata
  * array into it.
+ * If arr metadata pointer points to the same address as current metadata in the
+ * frame buffer, function will do nothing and return 0.
  * Returns 0 on success or -1 on failure.
  *
  * \param[in]    ybf       Frame buffer struct pointer
@@ -152,6 +167,18 @@ void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                       const aom_metadata_array_t *arr);
 
+/*!\brief Calculate the stride required for the image.
+ *
+ * Calculates the stride value for an image from aligned width and border.
+ * Returns the y stride value.
+ *
+ * \param[in]    aligned_width       Aligned width of the image
+ * \param[in]    border              Border in pixels
+ */
+static AOM_INLINE int aom_calc_y_stride(int aligned_width, int border) {
+  return ((aligned_width + 2 * border) + 31) & ~31;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/media/libaom/src/aom_util/debug_util.c b/media/libaom/src/aom_util/debug_util.c
index 5762e693bf..3e9c314b97 100644
--- a/media/libaom/src/aom_util/debug_util.c
+++ b/media/libaom/src/aom_util/debug_util.c
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include "aom_util/debug_util.h"
@@ -22,7 +23,7 @@ void aom_bitstream_queue_set_frame_write(int frame_idx) {
   frame_idx_w = frame_idx;
 }
 
-int aom_bitstream_queue_get_frame_writee(void) { return frame_idx_w; }
+int aom_bitstream_queue_get_frame_write(void) { return frame_idx_w; }
 
 void aom_bitstream_queue_set_frame_read(int frame_idx) {
   frame_idx_r = frame_idx;
@@ -68,6 +69,19 @@ void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
 }
 
 void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  // If you observe a CDF error:
+  // - Set 'debug_cdf_mismatch' to true
+  // - Set target_frame_idx_r and target_queue_r to where CDF error was reported
+  // - Set a breakpoint in debugger at the 'fprintf' below.
+  const bool debug_cdf_mismatch = false;
+  if (debug_cdf_mismatch) {
+    int target_frame_idx_r = 1;
+    int target_queue_r = 18005;
+    if (frame_idx_w == target_frame_idx_r && queue_w == target_queue_r) {
+      fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+              frame_idx_w, queue_w);
+    }
+  }
   if (!skip_w) {
     result_queue[queue_w] = result;
     nsymbs_queue[queue_w] = nsymbs;
diff --git a/media/libaom/src/aomedia_logo_200.png b/media/libaom/src/aomedia_logo_200.png
new file mode 100644
index 0000000000..4a3b9fcc0c
--- /dev/null
+++ b/media/libaom/src/aomedia_logo_200.png
diff --git a/media/libaom/src/apps/aomdec.c b/media/libaom/src/apps/aomdec.c
index 2591d41a6c..2c74dd36ca 100644
--- a/media/libaom/src/apps/aomdec.c
+++ b/media/libaom/src/apps/aomdec.c
@@ -82,6 +82,8 @@ static const arg_def_t outputfile =
     ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
 static const arg_def_t threadsarg =
     ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t rowmtarg =
+    ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading, default: 0");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t scalearg =
@@ -106,11 +108,13 @@ static const arg_def_t skipfilmgrain =
     ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
 
 static const arg_def_t *all_args[] = {
-  &help,       &codecarg,   &use_yv12,      &use_i420,      &flipuvarg,
-  &rawvideo,   &noblitarg,  &progressarg,   &limitarg,      &skiparg,
-  &summaryarg, &outputfile, &threadsarg,    &verbosearg,    &scalearg,
-  &fb_arg,     &md5arg,     &framestatsarg, &continuearg,   &outbitdeptharg,
-  &isannexb,   &oppointarg, &outallarg,     &skipfilmgrain, NULL
+  &help,           &codecarg, &use_yv12,      &use_i420,
+  &flipuvarg,      &rawvideo, &noblitarg,     &progressarg,
+  &limitarg,       &skiparg,  &summaryarg,    &outputfile,
+  &threadsarg,     &rowmtarg, &verbosearg,    &scalearg,
+  &fb_arg,         &md5arg,   &framestatsarg, &continuearg,
+  &outbitdeptharg, &isannexb, &oppointarg,    &outallarg,
+  &skipfilmgrain,  NULL
 };
 
 #if CONFIG_LIBYUV
@@ -166,9 +170,9 @@ static void show_help(FILE *fout, int shorthelp) {
   fprintf(fout, "\nIncluded decoders:\n\n");
 
   for (int i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    fprintf(fout, "    %-6s - %s\n", decoder->name,
-            aom_codec_iface_name(decoder->codec_interface()));
+    aom_codec_iface_t *decoder = get_aom_decoder_by_index(i);
+    fprintf(fout, "    %-6s - %s\n", get_short_name_by_aom_decoder(decoder),
+            aom_codec_iface_name(decoder));
   }
 }
 
@@ -183,20 +187,22 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
   size_t frame_size = 0;
 
   if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read RAW frame size\n");
+    if (!feof(infile)) aom_tools_warn("Failed to read RAW frame size\n");
   } else {
     const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
     const size_t kFrameTooSmallThreshold = 256 * 1024;
     frame_size = mem_get_le32(raw_hdr);
 
     if (frame_size > kCorruptFrameThreshold) {
-      warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+      aom_tools_warn("Read invalid frame size (%u)\n",
+                     (unsigned int)frame_size);
       frame_size = 0;
     }
 
     if (frame_size < kFrameTooSmallThreshold) {
-      warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
-           (unsigned int)frame_size);
+      aom_tools_warn(
+          "Warning: Read invalid frame size (%u) - not a raw file?\n",
+          (unsigned int)frame_size);
     }
 
     if (frame_size > *buffer_size) {
@@ -205,7 +211,7 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
         *buffer = new_buf;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer\n");
+        aom_tools_warn("Failed to allocate compressed data buffer\n");
         frame_size = 0;
       }
     }
@@ -213,7 +219,7 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
 
   if (!feof(infile)) {
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame\n");
+      aom_tools_warn("Failed to read full frame\n");
       return 1;
     }
     *bytes_read = frame_size;
@@ -254,11 +260,10 @@ static int file_is_raw(struct AvxInputContext *input) {
 
     if (mem_get_le32(buf) < 256 * 1024 * 1024) {
       for (i = 0; i < get_aom_decoder_count(); ++i) {
-        const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-        if (!aom_codec_peek_stream_info(decoder->codec_interface(), buf + 4,
-                                        32 - 4, &si)) {
+        aom_codec_iface_t *decoder = get_aom_decoder_by_index(i);
+        if (!aom_codec_peek_stream_info(decoder, buf + 4, 32 - 4, &si)) {
           is_raw = 1;
-          input->fourcc = decoder->fourcc;
+          input->fourcc = get_fourcc_by_aom_decoder(decoder);
           input->width = si.w;
           input->height = si.h;
           input->framerate.numerator = 30;
@@ -436,8 +441,6 @@ static int main_loop(int argc, const char **argv_) {
   int stop_after = 0, summary = 0, quiet = 1;
   int arg_skip = 0;
   int keep_going = 0;
-  const AvxInterface *interface = NULL;
-  const AvxInterface *fourcc_interface = NULL;
   uint64_t dx_time = 0;
   struct arg arg;
   char **argv, **argi, **argj;
@@ -456,6 +459,7 @@ static int main_loop(int argc, const char **argv_) {
   int operating_point = 0;
   int output_all_layers = 0;
   int skip_film_grain = 0;
+  int enable_row_mt = 0;
   aom_image_t *scaled_img = NULL;
   aom_image_t *img_shifted = NULL;
   int frame_avail, got_data, flush_decoder = 0;
@@ -489,7 +493,12 @@ static int main_loop(int argc, const char **argv_) {
   /* Parse command line */
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
 
+  aom_codec_iface_t *interface = NULL;
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
@@ -498,7 +507,7 @@ static int main_loop(int argc, const char **argv_) {
       show_help(stdout, 0);
       exit(EXIT_SUCCESS);
     } else if (arg_match(&arg, &codecarg, argi)) {
-      interface = get_aom_decoder_by_name(arg.val);
+      interface = get_aom_decoder_by_short_name(arg.val);
       if (!interface)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
     } else if (arg_match(&arg, &looparg, argi)) {
@@ -551,6 +560,8 @@ static int main_loop(int argc, const char **argv_) {
             cfg.threads);
       }
 #endif
+    } else if (arg_match(&arg, &rowmtarg, argi)) {
+      enable_row_mt = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &verbosearg, argi)) {
       quiet = 0;
     } else if (arg_match(&arg, &scalearg, argi)) {
@@ -600,6 +611,7 @@ static int main_loop(int argc, const char **argv_) {
     fprintf(stderr,
             "Not dumping raw video to your terminal. Use '-o -' to "
             "override.\n");
+    free(argv);
     return EXIT_FAILURE;
   }
 #endif
@@ -658,21 +670,22 @@ static int main_loop(int argc, const char **argv_) {
 #endif
   }
 
-  fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+  aom_codec_iface_t *fourcc_interface =
+      get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
 
   if (is_ivf && !fourcc_interface)
     fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
 
   if (interface && fourcc_interface && interface != fourcc_interface)
-    warn("Header indicates codec: %s\n", fourcc_interface->name);
+    aom_tools_warn("Header indicates codec: %s\n",
+                   aom_codec_iface_name(fourcc_interface));
   else
     interface = fourcc_interface;
 
   if (!interface) interface = get_aom_decoder_by_index(0);
 
   dec_flags = 0;
-  if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
-                         dec_flags)) {
+  if (aom_codec_dec_init(&decoder, interface, &cfg, dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
             aom_codec_error(&decoder));
     goto fail2;
@@ -706,6 +719,12 @@ static int main_loop(int argc, const char **argv_) {
     goto fail;
   }
 
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_ROW_MT, enable_row_mt)) {
+    fprintf(stderr, "Failed to set row multithreading mode: %s\n",
+            aom_codec_error(&decoder));
+    goto fail;
+  }
+
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
   while (arg_skip) {
     if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
@@ -716,6 +735,10 @@ static int main_loop(int argc, const char **argv_) {
     ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
     ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
         num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (!ext_fb_list.ext_fb) {
+      fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n");
+      goto fail;
+    }
     if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer,
                                              release_av1_frame_buffer,
                                              &ext_fb_list)) {
@@ -747,10 +770,10 @@ static int main_loop(int argc, const char **argv_) {
 
         if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) {
           const char *detail = aom_codec_error_detail(&decoder);
-          warn("Failed to decode frame %d: %s", frame_in,
-               aom_codec_error(&decoder));
+          aom_tools_warn("Failed to decode frame %d: %s", frame_in,
+                         aom_codec_error(&decoder));
 
-          if (detail) warn("Additional information: %s", detail);
+          if (detail) aom_tools_warn("Additional information: %s", detail);
           if (!keep_going) goto fail;
         }
 
@@ -758,8 +781,8 @@ static int main_loop(int argc, const char **argv_) {
           int qp;
           if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER,
                                             &qp)) {
-            warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
-                 aom_codec_error(&decoder));
+            aom_tools_warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
+                           aom_codec_error(&decoder));
             if (!keep_going) goto fail;
           }
           fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp);
@@ -779,7 +802,8 @@ static int main_loop(int argc, const char **argv_) {
     if (flush_decoder) {
       // Flush the decoder.
       if (aom_codec_decode(&decoder, NULL, 0, NULL)) {
-        warn("Failed to flush decoder: %s", aom_codec_error(&decoder));
+        aom_tools_warn("Failed to flush decoder: %s",
+                       aom_codec_error(&decoder));
       }
     }
 
@@ -793,7 +817,8 @@ static int main_loop(int argc, const char **argv_) {
 
       if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED,
                                         &corrupted)) {
-        warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
+        aom_tools_warn("Failed AOM_GET_FRAME_CORRUPTED: %s",
+                       aom_codec_error(&decoder));
         if (!keep_going) goto fail;
       }
       frames_corrupted += corrupted;
@@ -828,6 +853,11 @@ static int main_loop(int argc, const char **argv_) {
             }
             scaled_img =
                 aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+            if (!scaled_img) {
+              fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n",
+                      render_width, render_height);
+              goto fail;
+            }
             scaled_img->bit_depth = img->bit_depth;
             scaled_img->monochrome = img->monochrome;
             scaled_img->csp = img->csp;
@@ -856,8 +886,12 @@ static int main_loop(int argc, const char **argv_) {
           output_bit_depth = fixed_output_bit_depth;
         }
         // Shift up or down if necessary
-        if (output_bit_depth != 0)
-          aom_shift_img(output_bit_depth, &img, &img_shifted);
+        if (output_bit_depth != 0) {
+          if (!aom_shift_img(output_bit_depth, &img, &img_shifted)) {
+            fprintf(stderr, "Error allocating image\n");
+            goto fail;
+          }
+        }
 
         aom_input_ctx.width = img->d_w;
         aom_input_ctx.height = img->d_h;
@@ -872,7 +906,8 @@ static int main_loop(int argc, const char **argv_) {
               len = y4m_write_file_header(
                   y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
                   aom_input_ctx.height, &aom_input_ctx.framerate,
-                  img->monochrome, img->csp, img->fmt, img->bit_depth);
+                  img->monochrome, img->csp, img->fmt, img->bit_depth,
+                  img->range);
               if (img->csp == AOM_CSP_COLOCATED) {
                 fprintf(stderr,
                         "Warning: Y4M lacks a colorspace for colocated "
@@ -1009,6 +1044,10 @@ int main(int argc, const char **argv_) {
   int error = 0;
 
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
diff --git a/media/libaom/src/apps/aomenc.c b/media/libaom/src/apps/aomenc.c
index bb57726b4d..11e548f2c6 100644
--- a/media/libaom/src/apps/aomenc.c
+++ b/media/libaom/src/apps/aomenc.c
@@ -64,8 +64,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
 
 static const char *exec_name;
 
-static void warn_or_exit_on_errorv(aom_codec_ctx_t *ctx, int fatal,
-                                   const char *s, va_list ap) {
+static AOM_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
+    aom_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) {
   if (ctx->err) {
     const char *detail = aom_codec_error_detail(ctx);
 
@@ -78,7 +78,9 @@ static void warn_or_exit_on_errorv(aom_codec_ctx_t *ctx, int fatal,
   }
 }
 
-static void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) {
+static AOM_TOOLS_FORMAT_PRINTF(2,
+                               3) void ctx_exit_on_error(aom_codec_ctx_t *ctx,
+                                                         const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -86,8 +88,8 @@ static void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) {
   va_end(ap);
 }
 
-static void warn_or_exit_on_error(aom_codec_ctx_t *ctx, int fatal,
-                                  const char *s, ...) {
+static AOM_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error(
+    aom_codec_ctx_t *ctx, int fatal, const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -123,808 +125,14 @@ static int fourcc_is_ivf(const char detect[4]) {
   return 0;
 }
 
-static const arg_def_t help =
-    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
-static const arg_def_t debugmode =
-    ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
-static const arg_def_t outputfile =
-    ARG_DEF("o", "output", 1, "Output filename");
-static const arg_def_t use_yv12 =
-    ARG_DEF(NULL, "yv12", 0, "Input file is YV12 ");
-static const arg_def_t use_i420 =
-    ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)");
-static const arg_def_t use_i422 =
-    ARG_DEF(NULL, "i422", 0, "Input file is I422");
-static const arg_def_t use_i444 =
-    ARG_DEF(NULL, "i444", 0, "Input file is I444");
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
-static const arg_def_t passes =
-    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
-static const arg_def_t pass_arg =
-    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
-static const arg_def_t fpf_name =
-    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-static const arg_def_t limit =
-    ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
-static const arg_def_t skip =
-    ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
-static const arg_def_t good_dl =
-    ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
-static const arg_def_t rt_dl =
-    ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline");
-static const arg_def_t quietarg =
-    ARG_DEF("q", "quiet", 0, "Do not print encode progress");
-static const arg_def_t verbosearg =
-    ARG_DEF("v", "verbose", 0, "Show encoder parameters");
-static const arg_def_t psnrarg =
-    ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line");
-static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use");
-
-static const struct arg_enum_list test_decode_enum[] = {
-  { "off", TEST_DECODE_OFF },
-  { "fatal", TEST_DECODE_FATAL },
-  { "warn", TEST_DECODE_WARN },
-  { NULL, 0 }
-};
-static const arg_def_t recontest = ARG_DEF_ENUM(
-    NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
-static const arg_def_t framerate =
-    ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)");
-static const arg_def_t use_webm =
-    ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
-static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF");
-static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU");
-static const arg_def_t q_hist_n =
-    ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
-static const arg_def_t rate_hist_n =
-    ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)");
-static const arg_def_t disable_warnings =
-    ARG_DEF(NULL, "disable-warnings", 0,
-            "Disable warnings about potentially incorrect encode settings.");
-static const arg_def_t disable_warning_prompt =
-    ARG_DEF("y", "disable-warning-prompt", 0,
-            "Display warnings, but do not prompt user to continue.");
-static const struct arg_enum_list bitdepth_enum[] = {
-  { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
-};
-
-static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
-    "b", "bit-depth", 1,
-    "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)",
-    bitdepth_enum);
-static const arg_def_t inbitdeptharg =
-    ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input");
-
-static const arg_def_t input_chroma_subsampling_x = ARG_DEF(
-    NULL, "input-chroma-subsampling-x", 1, "chroma subsampling x value.");
-static const arg_def_t input_chroma_subsampling_y = ARG_DEF(
-    NULL, "input-chroma-subsampling-y", 1, "chroma subsampling y value.");
-
-static const arg_def_t *main_args[] = { &help,
-                                        &use_cfg,
-                                        &debugmode,
-                                        &outputfile,
-                                        &codecarg,
-                                        &passes,
-                                        &pass_arg,
-                                        &fpf_name,
-                                        &limit,
-                                        &skip,
-                                        &good_dl,
-                                        &rt_dl,
-                                        &quietarg,
-                                        &verbosearg,
-                                        &psnrarg,
-                                        &use_webm,
-                                        &use_ivf,
-                                        &use_obu,
-                                        &q_hist_n,
-                                        &rate_hist_n,
-                                        &disable_warnings,
-                                        &disable_warning_prompt,
-                                        &recontest,
-                                        NULL };
-
-static const arg_def_t usage =
-    ARG_DEF("u", "usage", 1, "Usage profile number to use");
-static const arg_def_t threads =
-    ARG_DEF("t", "threads", 1, "Max number of threads to use");
-static const arg_def_t profile =
-    ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use");
-static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
-static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
-static const arg_def_t forced_max_frame_width = ARG_DEF(
-    NULL, "forced_max_frame_width", 1, "Maximum frame width value to force");
-static const arg_def_t forced_max_frame_height = ARG_DEF(
-    NULL, "forced_max_frame_height", 1, "Maximum frame height value to force");
-#if CONFIG_WEBM_IO
-static const struct arg_enum_list stereo_mode_enum[] = {
-  { "mono", STEREO_FORMAT_MONO },
-  { "left-right", STEREO_FORMAT_LEFT_RIGHT },
-  { "bottom-top", STEREO_FORMAT_BOTTOM_TOP },
-  { "top-bottom", STEREO_FORMAT_TOP_BOTTOM },
-  { "right-left", STEREO_FORMAT_RIGHT_LEFT },
-  { NULL, 0 }
-};
-static const arg_def_t stereo_mode = ARG_DEF_ENUM(
-    NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum);
-#endif
-static const arg_def_t timebase = ARG_DEF(
-    NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
-static const arg_def_t global_error_resilient =
-    ARG_DEF(NULL, "global-error-resilient", 1,
-            "Enable global error resiliency features");
-static const arg_def_t lag_in_frames =
-    ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-static const arg_def_t large_scale_tile = ARG_DEF(
-    NULL, "large-scale-tile", 1,
-    "Large scale tile coding (0: off (default), 1: on (ivf output only))");
-static const arg_def_t monochrome =
-    ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
-static const arg_def_t full_still_picture_hdr = ARG_DEF(
-    NULL, "full-still-picture-hdr", 0, "Use full header for still picture");
-
-static const arg_def_t *global_args[] = { &use_yv12,
-                                          &use_i420,
-                                          &use_i422,
-                                          &use_i444,
-                                          &usage,
-                                          &threads,
-                                          &profile,
-                                          &width,
-                                          &height,
-                                          &forced_max_frame_width,
-                                          &forced_max_frame_height,
-#if CONFIG_WEBM_IO
-                                          &stereo_mode,
-#endif
-                                          &timebase,
-                                          &framerate,
-                                          &global_error_resilient,
-                                          &bitdeptharg,
-                                          &lag_in_frames,
-                                          &large_scale_tile,
-                                          &monochrome,
-                                          &full_still_picture_hdr,
-                                          NULL };
-
-static const arg_def_t dropframe_thresh =
-    ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
-static const arg_def_t resize_mode =
-    ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode");
-static const arg_def_t resize_denominator =
-    ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator");
-static const arg_def_t resize_kf_denominator = ARG_DEF(
-    NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator");
-static const arg_def_t superres_mode =
-    ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode");
-static const arg_def_t superres_denominator = ARG_DEF(
-    NULL, "superres-denominator", 1, "Frame super-resolution denominator");
-static const arg_def_t superres_kf_denominator =
-    ARG_DEF(NULL, "superres-kf-denominator", 1,
-            "Frame super-resolution keyframe denominator");
-static const arg_def_t superres_qthresh = ARG_DEF(
-    NULL, "superres-qthresh", 1, "Frame super-resolution qindex threshold");
-static const arg_def_t superres_kf_qthresh =
-    ARG_DEF(NULL, "superres-kf-qthresh", 1,
-            "Frame super-resolution keyframe qindex threshold");
-static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR },
-                                                       { "cbr", AOM_CBR },
-                                                       { "cq", AOM_CQ },
-                                                       { "q", AOM_Q },
-                                                       { NULL, 0 } };
-static const arg_def_t end_usage =
-    ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum);
-static const arg_def_t target_bitrate =
-    ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)");
-static const arg_def_t min_quantizer =
-    ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer");
-static const arg_def_t max_quantizer =
-    ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer");
-static const arg_def_t undershoot_pct =
-    ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)");
-static const arg_def_t overshoot_pct =
-    ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)");
-static const arg_def_t buf_sz =
-    ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)");
-static const arg_def_t buf_initial_sz =
-    ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
-static const arg_def_t buf_optimal_sz =
-    ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
-static const arg_def_t *rc_args[] = { &dropframe_thresh,
-                                      &resize_mode,
-                                      &resize_denominator,
-                                      &resize_kf_denominator,
-                                      &superres_mode,
-                                      &superres_denominator,
-                                      &superres_kf_denominator,
-                                      &superres_qthresh,
-                                      &superres_kf_qthresh,
-                                      &end_usage,
-                                      &target_bitrate,
-                                      &min_quantizer,
-                                      &max_quantizer,
-                                      &undershoot_pct,
-                                      &overshoot_pct,
-                                      &buf_sz,
-                                      &buf_initial_sz,
-                                      &buf_optimal_sz,
-                                      NULL };
-
-static const arg_def_t bias_pct =
-    ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
-static const arg_def_t minsection_pct =
-    ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
-static const arg_def_t maxsection_pct =
-    ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
-static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
-                                              &maxsection_pct, NULL };
-static const arg_def_t fwd_kf_enabled =
-    ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes");
-static const arg_def_t kf_min_dist =
-    ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
-static const arg_def_t kf_max_dist =
-    ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
-static const arg_def_t kf_disabled =
-    ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement");
-static const arg_def_t *kf_args[] = { &fwd_kf_enabled, &kf_min_dist,
-                                      &kf_max_dist, &kf_disabled, NULL };
-static const arg_def_t sframe_dist =
-    ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)");
-static const arg_def_t sframe_mode =
-    ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)");
-static const arg_def_t save_as_annexb =
-    ARG_DEF(NULL, "annexb", 1, "Save as Annex-B");
-static const arg_def_t noise_sens =
-    ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
-static const arg_def_t sharpness =
-    ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
-static const arg_def_t static_thresh =
-    ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold");
-static const arg_def_t auto_altref =
-    ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
-static const arg_def_t arnr_maxframes =
-    ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
-static const arg_def_t arnr_strength =
-    ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
-static const struct arg_enum_list tuning_enum[] = {
-  { "psnr", AOM_TUNE_PSNR },
-  { "ssim", AOM_TUNE_SSIM },
-  { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING },
-  { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING },
-  { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
-  { NULL, 0 }
-};
-static const arg_def_t tune_metric =
-    ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", tuning_enum);
-static const arg_def_t cq_level =
-    ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level");
-static const arg_def_t max_intra_rate_pct =
-    ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
-
-#if CONFIG_AV1_ENCODER
-static const arg_def_t cpu_used_av1 =
-    ARG_DEF(NULL, "cpu-used", 1,
-            "Speed setting (0..6 in good mode, 6..8 in realtime mode)");
-static const arg_def_t rowmtarg =
-    ARG_DEF(NULL, "row-mt", 1,
-            "Enable row based multi-threading (0: off, 1: on (default))");
-static const arg_def_t tile_cols =
-    ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
-static const arg_def_t tile_rows =
-    ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
-static const arg_def_t enable_tpl_model =
-    ARG_DEF(NULL, "enable-tpl-model", 1,
-            "RDO based on frame temporal dependency "
-            "(0: off, 1: backward source based). "
-            "This is required for deltaq mode.");
-static const arg_def_t enable_keyframe_filtering =
-    ARG_DEF(NULL, "enable-keyframe-filtering", 1,
-            "Apply temporal filtering on key frame "
-            "(0: false, 1: true (default)");
-static const arg_def_t tile_width =
-    ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
-static const arg_def_t tile_height =
-    ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)");
-static const arg_def_t lossless =
-    ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
-static const arg_def_t enable_cdef =
-    ARG_DEF(NULL, "enable-cdef", 1,
-            "Enable the constrained directional enhancement filter (0: false, "
-            "1: true (default))");
-static const arg_def_t enable_restoration = ARG_DEF(
-    NULL, "enable-restoration", 1,
-    "Enable the loop restoration filter (0: false (default in Realtime mode), "
-    "1: true (default in Non-realtime mode))");
-static const arg_def_t enable_rect_partitions =
-    ARG_DEF(NULL, "enable-rect-partitions", 1,
-            "Enable rectangular partitions "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_ab_partitions =
-    ARG_DEF(NULL, "enable-ab-partitions", 1,
-            "Enable ab partitions (0: false, 1: true (default))");
-static const arg_def_t enable_1to4_partitions =
-    ARG_DEF(NULL, "enable-1to4-partitions", 1,
-            "Enable 1:4 and 4:1 partitions "
-            "(0: false, 1: true (default))");
-static const arg_def_t min_partition_size =
-    ARG_DEF(NULL, "min-partition-size", 4,
-            "Set min partition size "
-            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128). "
-            "On frame with 4k+ resolutions or higher speed settings, the min "
-            "partition size will have a minimum of 8.");
-static const arg_def_t max_partition_size =
-    ARG_DEF(NULL, "max-partition-size", 128,
-            "Set max partition size "
-            "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
-static const arg_def_t enable_dual_filter =
-    ARG_DEF(NULL, "enable-dual-filter", 1,
-            "Enable dual filter "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_chroma_deltaq =
-    ARG_DEF(NULL, "enable-chroma-deltaq", 1,
-            "Enable chroma delta quant "
-            "(0: false (default), 1: true)");
-static const arg_def_t enable_intra_edge_filter =
-    ARG_DEF(NULL, "enable-intra-edge-filter", 1,
-            "Enable intra edge filtering "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_order_hint =
-    ARG_DEF(NULL, "enable-order-hint", 1,
-            "Enable order hint "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_tx64 =
-    ARG_DEF(NULL, "enable-tx64", 1,
-            "Enable 64-pt transform (0: false, 1: true (default))");
-static const arg_def_t enable_flip_idtx =
-    ARG_DEF(NULL, "enable-flip-idtx", 1,
-            "Enable extended transform type (0: false, 1: true (default)) "
-            "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
-            "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
-            "H_ADST, V_FLIPADST, H_FLIPADST");
-static const arg_def_t enable_dist_wtd_comp =
-    ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
-            "Enable distance-weighted compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_masked_comp =
-    ARG_DEF(NULL, "enable-masked-comp", 1,
-            "Enable masked (wedge/diff-wtd) compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_onesided_comp =
-    ARG_DEF(NULL, "enable-onesided-comp", 1,
-            "Enable one sided compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_interintra_comp =
-    ARG_DEF(NULL, "enable-interintra-comp", 1,
-            "Enable interintra compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_smooth_interintra =
-    ARG_DEF(NULL, "enable-smooth-interintra", 1,
-            "Enable smooth interintra mode "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_diff_wtd_comp =
-    ARG_DEF(NULL, "enable-diff-wtd-comp", 1,
-            "Enable difference-weighted compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_interinter_wedge =
-    ARG_DEF(NULL, "enable-interinter-wedge", 1,
-            "Enable interinter wedge compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_interintra_wedge =
-    ARG_DEF(NULL, "enable-interintra-wedge", 1,
-            "Enable interintra wedge compound "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_global_motion =
-    ARG_DEF(NULL, "enable-global-motion", 1,
-            "Enable global motion "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_warped_motion =
-    ARG_DEF(NULL, "enable-warped-motion", 1,
-            "Enable local warped motion "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_filter_intra =
-    ARG_DEF(NULL, "enable-filter-intra", 1,
-            "Enable filter intra prediction mode "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_smooth_intra =
-    ARG_DEF(NULL, "enable-smooth-intra", 1,
-            "Enable smooth intra prediction modes "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_paeth_intra =
-    ARG_DEF(NULL, "enable-paeth-intra", 1,
-            "Enable Paeth intra prediction mode (0: false, 1: true (default))");
-static const arg_def_t enable_cfl_intra =
-    ARG_DEF(NULL, "enable-cfl-intra", 1,
-            "Enable chroma from luma intra prediction mode "
-            "(0: false, 1: true (default))");
-static const arg_def_t force_video_mode =
-    ARG_DEF(NULL, "force-video-mode", 1,
-            "Force video mode (0: false, 1: true (default))");
-static const arg_def_t enable_obmc = ARG_DEF(
-    NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))");
-static const arg_def_t enable_overlay =
-    ARG_DEF(NULL, "enable-overlay", 1,
-            "Enable coding overlay frames (0: false, 1: true (default))");
-static const arg_def_t enable_palette =
-    ARG_DEF(NULL, "enable-palette", 1,
-            "Enable palette prediction mode (0: false, 1: true (default))");
-static const arg_def_t enable_intrabc =
-    ARG_DEF(NULL, "enable-intrabc", 1,
-            "Enable intra block copy prediction mode "
-            "(0: false, 1: true (default))");
-static const arg_def_t enable_angle_delta =
-    ARG_DEF(NULL, "enable-angle-delta", 1,
-            "Enable intra angle delta (0: false, 1: true (default))");
-static const arg_def_t disable_trellis_quant =
-    ARG_DEF(NULL, "disable-trellis-quant", 1,
-            "Disable trellis optimization of quantized coefficients (0: false "
-            "1: true  2: true for rd search 3: true for estimate yrd serch "
-            "(default))");
-static const arg_def_t enable_qm =
-    ARG_DEF(NULL, "enable-qm", 1,
-            "Enable quantisation matrices (0: false (default), 1: true)");
-static const arg_def_t qm_min = ARG_DEF(
-    NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8");
-static const arg_def_t qm_max = ARG_DEF(
-    NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
-static const arg_def_t reduced_tx_type_set = ARG_DEF(
-    NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types");
-static const arg_def_t use_intra_dct_only =
-    ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes");
-static const arg_def_t use_inter_dct_only =
-    ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes");
-static const arg_def_t use_intra_default_tx_only =
-    ARG_DEF(NULL, "use-intra-default-tx-only", 1,
-            "Use Default-transform only for INTRA modes");
-static const arg_def_t quant_b_adapt =
-    ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b");
-static const arg_def_t coeff_cost_upd_freq =
-    ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
-            "Update freq for coeff costs"
-            "0: SB, 1: SB Row per Tile, 2: Tile");
-static const arg_def_t mode_cost_upd_freq =
-    ARG_DEF(NULL, "mode-cost-upd-freq", 1,
-            "Update freq for mode costs"
-            "0: SB, 1: SB Row per Tile, 2: Tile");
-static const arg_def_t mv_cost_upd_freq =
-    ARG_DEF(NULL, "mv-cost-upd-freq", 1,
-            "Update freq for mv costs"
-            "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off");
-static const arg_def_t num_tg = ARG_DEF(
-    NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1");
-static const arg_def_t mtu_size =
-    ARG_DEF(NULL, "mtu-size", 1,
-            "MTU size for a tile group, default is 0 (no MTU targeting), "
-            "overrides maximum number of tile groups");
-static const struct arg_enum_list timing_info_enum[] = {
-  { "unspecified", AOM_TIMING_UNSPECIFIED },
-  { "constant", AOM_TIMING_EQUAL },
-  { "model", AOM_TIMING_DEC_MODEL },
-  { NULL, 0 }
-};
-static const arg_def_t timing_info =
-    ARG_DEF_ENUM(NULL, "timing-info", 1,
-                 "Signal timing info in the bitstream (model unly works for no "
-                 "hidden frames, no super-res yet):",
-                 timing_info_enum);
-#if CONFIG_TUNE_VMAF
-static const arg_def_t vmaf_model_path =
-    ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file");
-#endif
-static const arg_def_t film_grain_test =
-    ARG_DEF(NULL, "film-grain-test", 1,
-            "Film grain test vectors (0: none (default), 1: test-1  2: test-2, "
-            "... 16: test-16)");
-static const arg_def_t film_grain_table =
-    ARG_DEF(NULL, "film-grain-table", 1,
-            "Path to file containing film grain parameters");
-#if CONFIG_DENOISE
-static const arg_def_t denoise_noise_level =
-    ARG_DEF(NULL, "denoise-noise-level", 1,
-            "Amount of noise (from 0 = don't denoise, to 50)");
-static const arg_def_t denoise_block_size =
-    ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)");
-#endif
-static const arg_def_t enable_ref_frame_mvs =
-    ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
-            "Enable temporal mv prediction (default is 1)");
-static const arg_def_t frame_parallel_decoding =
-    ARG_DEF(NULL, "frame-parallel", 1,
-            "Enable frame parallel decodability features "
-            "(0: false (default), 1: true)");
-static const arg_def_t error_resilient_mode =
-    ARG_DEF(NULL, "error-resilient", 1,
-            "Enable error resilient features "
-            "(0: false (default), 1: true)");
-static const arg_def_t aq_mode = ARG_DEF(
-    NULL, "aq-mode", 1,
-    "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
-    "3: cyclic refresh)");
-static const arg_def_t deltaq_mode =
-    ARG_DEF(NULL, "deltaq-mode", 1,
-            "Delta qindex mode (0: off, 1: deltaq objective (default), "
-            "2: deltaq perceptual). "
-            "Currently this requires enable-tpl-model as a prerequisite.");
-static const arg_def_t deltalf_mode = ARG_DEF(
-    NULL, "delta-lf-mode", 1, "Enable delta-lf-mode (0: off (default), 1: on)");
-static const arg_def_t frame_periodic_boost =
-    ARG_DEF(NULL, "frame-boost", 1,
-            "Enable frame periodic boost (0: off (default), 1: on)");
-static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
-    NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
-static const arg_def_t max_inter_rate_pct =
-    ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
-static const arg_def_t min_gf_interval = ARG_DEF(
-    NULL, "min-gf-interval", 1,
-    "min gf/arf frame interval (default 0, indicating in-built behavior)");
-static const arg_def_t max_gf_interval = ARG_DEF(
-    NULL, "max-gf-interval", 1,
-    "max gf/arf frame interval (default 0, indicating in-built behavior)");
-static const arg_def_t gf_min_pyr_height =
-    ARG_DEF(NULL, "gf-min-pyr-height", 1,
-            "Min height for GF group pyramid structure (0 (default) to 5)");
-static const arg_def_t gf_max_pyr_height =
-    ARG_DEF(NULL, "gf-max-pyr-height", 1,
-            "maximum height for GF group pyramid structure (0 to 5 (default))");
-static const arg_def_t max_reference_frames = ARG_DEF(
-    NULL, "max-reference-frames", 1,
-    "maximum number of reference frames allowed per frame (3 to 7 (default))");
-static const arg_def_t reduced_reference_set =
-    ARG_DEF(NULL, "reduced-reference-set", 1,
-            "Use reduced set of single and compound references (0: off "
-            "(default), 1: on)");
-static const arg_def_t target_seq_level_idx =
-    ARG_DEF(NULL, "target-seq-level-idx", 1,
-            "Target sequence level index. "
-            "Possible values are in the form of \"ABxy\"(pad leading zeros if "
-            "less than 4 digits). "
-            "AB: Operating point(OP) index; "
-            "xy: Target level index for the OP. "
-            "E.g. \"0\" means target level index 0 for the 0th OP; "
-            "\"1021\" means target level index 21 for the 10th OP.");
-static const arg_def_t set_min_cr =
-    ARG_DEF(NULL, "min-cr", 1,
-            "Set minimum compression ratio. Take integer values. Default is 0. "
-            "If non-zero, encoder will try to keep the compression ratio of "
-            "each frame to be higher than the given value divided by 100.");
-
-static const struct arg_enum_list color_primaries_enum[] = {
-  { "bt709", AOM_CICP_CP_BT_709 },
-  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
-  { "bt601", AOM_CICP_CP_BT_601 },
-  { "bt470m", AOM_CICP_CP_BT_470_M },
-  { "bt470bg", AOM_CICP_CP_BT_470_B_G },
-  { "smpte240", AOM_CICP_CP_SMPTE_240 },
-  { "film", AOM_CICP_CP_GENERIC_FILM },
-  { "bt2020", AOM_CICP_CP_BT_2020 },
-  { "xyz", AOM_CICP_CP_XYZ },
-  { "smpte431", AOM_CICP_CP_SMPTE_431 },
-  { "smpte432", AOM_CICP_CP_SMPTE_432 },
-  { "ebu3213", AOM_CICP_CP_EBU_3213 },
-  { NULL, 0 }
-};
-
-static const arg_def_t input_color_primaries = ARG_DEF_ENUM(
-    NULL, "color-primaries", 1,
-    "Color primaries (CICP) of input content:", color_primaries_enum);
-
-static const struct arg_enum_list transfer_characteristics_enum[] = {
-  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
-  { "bt709", AOM_CICP_TC_BT_709 },
-  { "bt470m", AOM_CICP_TC_BT_470_M },
-  { "bt470bg", AOM_CICP_TC_BT_470_B_G },
-  { "bt601", AOM_CICP_TC_BT_601 },
-  { "smpte240", AOM_CICP_TC_SMPTE_240 },
-  { "lin", AOM_CICP_TC_LINEAR },
-  { "log100", AOM_CICP_TC_LOG_100 },
-  { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 },
-  { "iec61966", AOM_CICP_TC_IEC_61966 },
-  { "bt1361", AOM_CICP_TC_BT_1361 },
-  { "srgb", AOM_CICP_TC_SRGB },
-  { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT },
-  { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT },
-  { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
-  { "hlg", AOM_CICP_TC_HLG },
-  { "smpte428", AOM_CICP_TC_SMPTE_428 },
-  { NULL, 0 }
-};
-
-static const arg_def_t input_transfer_characteristics =
-    ARG_DEF_ENUM(NULL, "transfer-characteristics", 1,
-                 "Transfer characteristics (CICP) of input content:",
-                 transfer_characteristics_enum);
-
-static const struct arg_enum_list matrix_coefficients_enum[] = {
-  { "identity", AOM_CICP_MC_IDENTITY },
-  { "bt709", AOM_CICP_MC_BT_709 },
-  { "unspecified", AOM_CICP_MC_UNSPECIFIED },
-  { "fcc73", AOM_CICP_MC_FCC },
-  { "bt470bg", AOM_CICP_MC_BT_470_B_G },
-  { "bt601", AOM_CICP_MC_BT_601 },
-  { "smpte240", AOM_CICP_CP_SMPTE_240 },
-  { "ycgco", AOM_CICP_MC_SMPTE_YCGCO },
-  { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL },
-  { "bt2020cl", AOM_CICP_MC_BT_2020_CL },
-  { "smpte2085", AOM_CICP_MC_SMPTE_2085 },
-  { "chromncl", AOM_CICP_MC_CHROMAT_NCL },
-  { "chromcl", AOM_CICP_MC_CHROMAT_CL },
-  { "ictcp", AOM_CICP_MC_ICTCP },
-  { NULL, 0 }
-};
-
-static const arg_def_t input_matrix_coefficients = ARG_DEF_ENUM(
-    NULL, "matrix-coefficients", 1,
-    "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum);
-
-static const struct arg_enum_list chroma_sample_position_enum[] = {
-  { "unknown", AOM_CSP_UNKNOWN },
-  { "vertical", AOM_CSP_VERTICAL },
-  { "colocated", AOM_CSP_COLOCATED },
-  { NULL, 0 }
-};
-
-static const arg_def_t input_chroma_sample_position =
-    ARG_DEF_ENUM(NULL, "chroma-sample-position", 1,
-                 "The chroma sample position when chroma 4:2:0 is signaled:",
-                 chroma_sample_position_enum);
-
-static const struct arg_enum_list tune_content_enum[] = {
-  { "default", AOM_CONTENT_DEFAULT },
-  { "screen", AOM_CONTENT_SCREEN },
-  { NULL, 0 }
-};
-
-static const arg_def_t tune_content = ARG_DEF_ENUM(
-    NULL, "tune-content", 1, "Tune content type", tune_content_enum);
-
-static const arg_def_t cdf_update_mode =
-    ARG_DEF(NULL, "cdf-update-mode", 1,
-            "CDF update mode for entropy coding "
-            "(0: no CDF update; 1: update CDF on all frames(default); "
-            "2: selectively update CDF on some frames");
-
-static const struct arg_enum_list superblock_size_enum[] = {
-  { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
-  { "64", AOM_SUPERBLOCK_SIZE_64X64 },
-  { "128", AOM_SUPERBLOCK_SIZE_128X128 },
-  { NULL, 0 }
-};
-static const arg_def_t superblock_size = ARG_DEF_ENUM(
-    NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
-
-static const arg_def_t set_tier_mask =
-    ARG_DEF(NULL, "set-tier-mask", 1,
-            "Set bit mask to specify which tier each of the 32 possible "
-            "operating points conforms to. "
-            "Bit value 0(defualt): Main Tier; 1: High Tier.");
-
-static const arg_def_t use_fixed_qp_offsets =
-    ARG_DEF(NULL, "use-fixed-qp-offsets", 1,
-            "Enable fixed QP offsets for frames at different levels of the "
-            "pyramid. Selected automatically from --cq-level if "
-            "--fixed-qp-offsets is not provided. If this option is not "
-            "specified (default), offsets are adaptively chosen by the "
-            "encoder.");
-
-static const arg_def_t fixed_qp_offsets =
-    ARG_DEF(NULL, "fixed-qp-offsets", 1,
-            "Set fixed QP offsets for frames at different levels of the "
-            "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, "
-            "and 3 levels of internal alt-refs. If this option is not "
-            "specified (default), offsets are adaptively chosen by the "
-            "encoder.");
-
-static const arg_def_t *av1_args[] = { &cpu_used_av1,
-                                       &auto_altref,
-                                       &sharpness,
-                                       &static_thresh,
-                                       &rowmtarg,
-                                       &tile_cols,
-                                       &tile_rows,
-                                       &enable_tpl_model,
-                                       &enable_keyframe_filtering,
-                                       &arnr_maxframes,
-                                       &arnr_strength,
-                                       &tune_metric,
-                                       &cq_level,
-                                       &max_intra_rate_pct,
-                                       &max_inter_rate_pct,
-                                       &gf_cbr_boost_pct,
-                                       &lossless,
-                                       &enable_cdef,
-                                       &enable_restoration,
-                                       &enable_rect_partitions,
-                                       &enable_ab_partitions,
-                                       &enable_1to4_partitions,
-                                       &min_partition_size,
-                                       &max_partition_size,
-                                       &enable_dual_filter,
-                                       &enable_chroma_deltaq,
-                                       &enable_intra_edge_filter,
-                                       &enable_order_hint,
-                                       &enable_tx64,
-                                       &enable_flip_idtx,
-                                       &enable_dist_wtd_comp,
-                                       &enable_masked_comp,
-                                       &enable_onesided_comp,
-                                       &enable_interintra_comp,
-                                       &enable_smooth_interintra,
-                                       &enable_diff_wtd_comp,
-                                       &enable_interinter_wedge,
-                                       &enable_interintra_wedge,
-                                       &enable_global_motion,
-                                       &enable_warped_motion,
-                                       &enable_filter_intra,
-                                       &enable_smooth_intra,
-                                       &enable_paeth_intra,
-                                       &enable_cfl_intra,
-                                       &force_video_mode,
-                                       &enable_obmc,
-                                       &enable_overlay,
-                                       &enable_palette,
-                                       &enable_intrabc,
-                                       &enable_angle_delta,
-                                       &disable_trellis_quant,
-                                       &enable_qm,
-                                       &qm_min,
-                                       &qm_max,
-                                       &reduced_tx_type_set,
-                                       &use_intra_dct_only,
-                                       &use_inter_dct_only,
-                                       &use_intra_default_tx_only,
-                                       &quant_b_adapt,
-                                       &coeff_cost_upd_freq,
-                                       &mode_cost_upd_freq,
-                                       &mv_cost_upd_freq,
-                                       &frame_parallel_decoding,
-                                       &error_resilient_mode,
-                                       &aq_mode,
-                                       &deltaq_mode,
-                                       &deltalf_mode,
-                                       &frame_periodic_boost,
-                                       &noise_sens,
-                                       &tune_content,
-                                       &cdf_update_mode,
-                                       &input_color_primaries,
-                                       &input_transfer_characteristics,
-                                       &input_matrix_coefficients,
-                                       &input_chroma_sample_position,
-                                       &min_gf_interval,
-                                       &max_gf_interval,
-                                       &gf_min_pyr_height,
-                                       &gf_max_pyr_height,
-                                       &superblock_size,
-                                       &num_tg,
-                                       &mtu_size,
-                                       &timing_info,
-                                       &film_grain_test,
-                                       &film_grain_table,
-#if CONFIG_DENOISE
-                                       &denoise_noise_level,
-                                       &denoise_block_size,
-#endif  // CONFIG_DENOISE
-                                       &max_reference_frames,
-                                       &reduced_reference_set,
-                                       &enable_ref_frame_mvs,
-                                       &target_seq_level_idx,
-                                       &set_tier_mask,
-                                       &set_min_cr,
-                                       &bitdeptharg,
-                                       &inbitdeptharg,
-                                       &input_chroma_subsampling_x,
-                                       &input_chroma_subsampling_y,
-                                       &sframe_dist,
-                                       &sframe_mode,
-                                       &save_as_annexb,
-#if CONFIG_TUNE_VMAF
-                                       &vmaf_model_path,
-#endif
-                                       NULL };
 static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AOME_SET_ENABLEAUTOALTREF,
                                         AOME_SET_SHARPNESS,
                                         AOME_SET_STATIC_THRESHOLD,
                                         AV1E_SET_ROW_MT,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                        AV1E_SET_FP_MT,
+#endif
                                         AV1E_SET_TILE_COLUMNS,
                                         AV1E_SET_TILE_ROWS,
                                         AV1E_SET_ENABLE_TPL_MODEL,
@@ -950,6 +158,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_ENABLE_ORDER_HINT,
                                         AV1E_SET_ENABLE_TX64,
                                         AV1E_SET_ENABLE_FLIP_IDTX,
+                                        AV1E_SET_ENABLE_RECT_TX,
                                         AV1E_SET_ENABLE_DIST_WTD_COMP,
                                         AV1E_SET_ENABLE_MASKED_COMP,
                                         AV1E_SET_ENABLE_ONESIDED_COMP,
@@ -964,6 +173,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_ENABLE_SMOOTH_INTRA,
                                         AV1E_SET_ENABLE_PAETH_INTRA,
                                         AV1E_SET_ENABLE_CFL_INTRA,
+                                        AV1E_SET_ENABLE_DIAGONAL_INTRA,
                                         AV1E_SET_FORCE_VIDEO_MODE,
                                         AV1E_SET_ENABLE_OBMC,
                                         AV1E_SET_ENABLE_OVERLAY,
@@ -986,6 +196,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_ERROR_RESILIENT_MODE,
                                         AV1E_SET_AQ_MODE,
                                         AV1E_SET_DELTAQ_MODE,
+                                        AV1E_SET_DELTAQ_STRENGTH,
                                         AV1E_SET_DELTALF_MODE,
                                         AV1E_SET_FRAME_PERIODIC_BOOST,
                                         AV1E_SET_NOISE_SENSITIVITY,
@@ -1008,6 +219,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
 #if CONFIG_DENOISE
                                         AV1E_SET_DENOISE_NOISE_LEVEL,
                                         AV1E_SET_DENOISE_BLOCK_SIZE,
+                                        AV1E_SET_ENABLE_DNL_DENOISING,
 #endif  // CONFIG_DENOISE
                                         AV1E_SET_MAX_REFERENCE_FRAMES,
                                         AV1E_SET_REDUCED_REFERENCE_SET,
@@ -1015,16 +227,241 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_TARGET_SEQ_LEVEL_IDX,
                                         AV1E_SET_TIER_MASK,
                                         AV1E_SET_MIN_CR,
+                                        AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP,
+                                        AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                        AV1E_SET_CHROMA_SUBSAMPLING_Y,
 #if CONFIG_TUNE_VMAF
                                         AV1E_SET_VMAF_MODEL_PATH,
 #endif
+                                        AV1E_SET_DV_COST_UPD_FREQ,
+                                        AV1E_SET_PARTITION_INFO_PATH,
+                                        AV1E_SET_ENABLE_DIRECTIONAL_INTRA,
+                                        AV1E_SET_ENABLE_TX_SIZE_SEARCH,
+                                        AV1E_SET_LOOPFILTER_CONTROL,
+                                        AV1E_SET_AUTO_INTRA_TOOLS_OFF,
                                         0 };
-#endif  // CONFIG_AV1_ENCODER
+
+const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help,
+                                 &g_av1_codec_arg_defs.use_cfg,
+                                 &g_av1_codec_arg_defs.debugmode,
+                                 &g_av1_codec_arg_defs.outputfile,
+                                 &g_av1_codec_arg_defs.codecarg,
+                                 &g_av1_codec_arg_defs.passes,
+                                 &g_av1_codec_arg_defs.pass_arg,
+                                 &g_av1_codec_arg_defs.fpf_name,
+                                 &g_av1_codec_arg_defs.limit,
+                                 &g_av1_codec_arg_defs.skip,
+                                 &g_av1_codec_arg_defs.good_dl,
+                                 &g_av1_codec_arg_defs.rt_dl,
+                                 &g_av1_codec_arg_defs.ai_dl,
+                                 &g_av1_codec_arg_defs.quietarg,
+                                 &g_av1_codec_arg_defs.verbosearg,
+                                 &g_av1_codec_arg_defs.psnrarg,
+                                 &g_av1_codec_arg_defs.use_webm,
+                                 &g_av1_codec_arg_defs.use_ivf,
+                                 &g_av1_codec_arg_defs.use_obu,
+                                 &g_av1_codec_arg_defs.q_hist_n,
+                                 &g_av1_codec_arg_defs.rate_hist_n,
+                                 &g_av1_codec_arg_defs.disable_warnings,
+                                 &g_av1_codec_arg_defs.disable_warning_prompt,
+                                 &g_av1_codec_arg_defs.recontest,
+                                 NULL };
+
+const arg_def_t *global_args[] = {
+  &g_av1_codec_arg_defs.use_nv12,
+  &g_av1_codec_arg_defs.use_yv12,
+  &g_av1_codec_arg_defs.use_i420,
+  &g_av1_codec_arg_defs.use_i422,
+  &g_av1_codec_arg_defs.use_i444,
+  &g_av1_codec_arg_defs.usage,
+  &g_av1_codec_arg_defs.threads,
+  &g_av1_codec_arg_defs.profile,
+  &g_av1_codec_arg_defs.width,
+  &g_av1_codec_arg_defs.height,
+  &g_av1_codec_arg_defs.forced_max_frame_width,
+  &g_av1_codec_arg_defs.forced_max_frame_height,
+#if CONFIG_WEBM_IO
+  &g_av1_codec_arg_defs.stereo_mode,
+#endif
+  &g_av1_codec_arg_defs.timebase,
+  &g_av1_codec_arg_defs.framerate,
+  &g_av1_codec_arg_defs.global_error_resilient,
+  &g_av1_codec_arg_defs.bitdeptharg,
+  &g_av1_codec_arg_defs.inbitdeptharg,
+  &g_av1_codec_arg_defs.lag_in_frames,
+  &g_av1_codec_arg_defs.large_scale_tile,
+  &g_av1_codec_arg_defs.monochrome,
+  &g_av1_codec_arg_defs.full_still_picture_hdr,
+  &g_av1_codec_arg_defs.use_16bit_internal,
+  &g_av1_codec_arg_defs.save_as_annexb,
+  NULL
+};
+
+const arg_def_t *rc_args[] = { &g_av1_codec_arg_defs.dropframe_thresh,
+                               &g_av1_codec_arg_defs.resize_mode,
+                               &g_av1_codec_arg_defs.resize_denominator,
+                               &g_av1_codec_arg_defs.resize_kf_denominator,
+                               &g_av1_codec_arg_defs.superres_mode,
+                               &g_av1_codec_arg_defs.superres_denominator,
+                               &g_av1_codec_arg_defs.superres_kf_denominator,
+                               &g_av1_codec_arg_defs.superres_qthresh,
+                               &g_av1_codec_arg_defs.superres_kf_qthresh,
+                               &g_av1_codec_arg_defs.end_usage,
+                               &g_av1_codec_arg_defs.target_bitrate,
+                               &g_av1_codec_arg_defs.min_quantizer,
+                               &g_av1_codec_arg_defs.max_quantizer,
+                               &g_av1_codec_arg_defs.undershoot_pct,
+                               &g_av1_codec_arg_defs.overshoot_pct,
+                               &g_av1_codec_arg_defs.buf_sz,
+                               &g_av1_codec_arg_defs.buf_initial_sz,
+                               &g_av1_codec_arg_defs.buf_optimal_sz,
+                               &g_av1_codec_arg_defs.bias_pct,
+                               &g_av1_codec_arg_defs.minsection_pct,
+                               &g_av1_codec_arg_defs.maxsection_pct,
+                               NULL };
+
+const arg_def_t *kf_args[] = { &g_av1_codec_arg_defs.fwd_kf_enabled,
+                               &g_av1_codec_arg_defs.kf_min_dist,
+                               &g_av1_codec_arg_defs.kf_max_dist,
+                               &g_av1_codec_arg_defs.kf_disabled,
+                               &g_av1_codec_arg_defs.sframe_dist,
+                               &g_av1_codec_arg_defs.sframe_mode,
+                               NULL };
+
+// TODO(bohanli): Currently all options are supported by the key & value API.
+// Consider removing the control ID usages?
+const arg_def_t *av1_ctrl_args[] = {
+  &g_av1_codec_arg_defs.cpu_used_av1,
+  &g_av1_codec_arg_defs.auto_altref,
+  &g_av1_codec_arg_defs.sharpness,
+  &g_av1_codec_arg_defs.static_thresh,
+  &g_av1_codec_arg_defs.rowmtarg,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  &g_av1_codec_arg_defs.fpmtarg,
+#endif
+  &g_av1_codec_arg_defs.tile_cols,
+  &g_av1_codec_arg_defs.tile_rows,
+  &g_av1_codec_arg_defs.enable_tpl_model,
+  &g_av1_codec_arg_defs.enable_keyframe_filtering,
+  &g_av1_codec_arg_defs.arnr_maxframes,
+  &g_av1_codec_arg_defs.arnr_strength,
+  &g_av1_codec_arg_defs.tune_metric,
+  &g_av1_codec_arg_defs.cq_level,
+  &g_av1_codec_arg_defs.max_intra_rate_pct,
+  &g_av1_codec_arg_defs.max_inter_rate_pct,
+  &g_av1_codec_arg_defs.gf_cbr_boost_pct,
+  &g_av1_codec_arg_defs.lossless,
+  &g_av1_codec_arg_defs.enable_cdef,
+  &g_av1_codec_arg_defs.enable_restoration,
+  &g_av1_codec_arg_defs.enable_rect_partitions,
+  &g_av1_codec_arg_defs.enable_ab_partitions,
+  &g_av1_codec_arg_defs.enable_1to4_partitions,
+  &g_av1_codec_arg_defs.min_partition_size,
+  &g_av1_codec_arg_defs.max_partition_size,
+  &g_av1_codec_arg_defs.enable_dual_filter,
+  &g_av1_codec_arg_defs.enable_chroma_deltaq,
+  &g_av1_codec_arg_defs.enable_intra_edge_filter,
+  &g_av1_codec_arg_defs.enable_order_hint,
+  &g_av1_codec_arg_defs.enable_tx64,
+  &g_av1_codec_arg_defs.enable_flip_idtx,
+  &g_av1_codec_arg_defs.enable_rect_tx,
+  &g_av1_codec_arg_defs.enable_dist_wtd_comp,
+  &g_av1_codec_arg_defs.enable_masked_comp,
+  &g_av1_codec_arg_defs.enable_onesided_comp,
+  &g_av1_codec_arg_defs.enable_interintra_comp,
+  &g_av1_codec_arg_defs.enable_smooth_interintra,
+  &g_av1_codec_arg_defs.enable_diff_wtd_comp,
+  &g_av1_codec_arg_defs.enable_interinter_wedge,
+  &g_av1_codec_arg_defs.enable_interintra_wedge,
+  &g_av1_codec_arg_defs.enable_global_motion,
+  &g_av1_codec_arg_defs.enable_warped_motion,
+  &g_av1_codec_arg_defs.enable_filter_intra,
+  &g_av1_codec_arg_defs.enable_smooth_intra,
+  &g_av1_codec_arg_defs.enable_paeth_intra,
+  &g_av1_codec_arg_defs.enable_cfl_intra,
+  &g_av1_codec_arg_defs.enable_diagonal_intra,
+  &g_av1_codec_arg_defs.force_video_mode,
+  &g_av1_codec_arg_defs.enable_obmc,
+  &g_av1_codec_arg_defs.enable_overlay,
+  &g_av1_codec_arg_defs.enable_palette,
+  &g_av1_codec_arg_defs.enable_intrabc,
+  &g_av1_codec_arg_defs.enable_angle_delta,
+  &g_av1_codec_arg_defs.disable_trellis_quant,
+  &g_av1_codec_arg_defs.enable_qm,
+  &g_av1_codec_arg_defs.qm_min,
+  &g_av1_codec_arg_defs.qm_max,
+  &g_av1_codec_arg_defs.reduced_tx_type_set,
+  &g_av1_codec_arg_defs.use_intra_dct_only,
+  &g_av1_codec_arg_defs.use_inter_dct_only,
+  &g_av1_codec_arg_defs.use_intra_default_tx_only,
+  &g_av1_codec_arg_defs.quant_b_adapt,
+  &g_av1_codec_arg_defs.coeff_cost_upd_freq,
+  &g_av1_codec_arg_defs.mode_cost_upd_freq,
+  &g_av1_codec_arg_defs.mv_cost_upd_freq,
+  &g_av1_codec_arg_defs.frame_parallel_decoding,
+  &g_av1_codec_arg_defs.error_resilient_mode,
+  &g_av1_codec_arg_defs.aq_mode,
+  &g_av1_codec_arg_defs.deltaq_mode,
+  &g_av1_codec_arg_defs.deltaq_strength,
+  &g_av1_codec_arg_defs.deltalf_mode,
+  &g_av1_codec_arg_defs.frame_periodic_boost,
+  &g_av1_codec_arg_defs.noise_sens,
+  &g_av1_codec_arg_defs.tune_content,
+  &g_av1_codec_arg_defs.cdf_update_mode,
+  &g_av1_codec_arg_defs.input_color_primaries,
+  &g_av1_codec_arg_defs.input_transfer_characteristics,
+  &g_av1_codec_arg_defs.input_matrix_coefficients,
+  &g_av1_codec_arg_defs.input_chroma_sample_position,
+  &g_av1_codec_arg_defs.min_gf_interval,
+  &g_av1_codec_arg_defs.max_gf_interval,
+  &g_av1_codec_arg_defs.gf_min_pyr_height,
+  &g_av1_codec_arg_defs.gf_max_pyr_height,
+  &g_av1_codec_arg_defs.superblock_size,
+  &g_av1_codec_arg_defs.num_tg,
+  &g_av1_codec_arg_defs.mtu_size,
+  &g_av1_codec_arg_defs.timing_info,
+  &g_av1_codec_arg_defs.film_grain_test,
+  &g_av1_codec_arg_defs.film_grain_table,
+#if CONFIG_DENOISE
+  &g_av1_codec_arg_defs.denoise_noise_level,
+  &g_av1_codec_arg_defs.denoise_block_size,
+  &g_av1_codec_arg_defs.enable_dnl_denoising,
+#endif  // CONFIG_DENOISE
+  &g_av1_codec_arg_defs.max_reference_frames,
+  &g_av1_codec_arg_defs.reduced_reference_set,
+  &g_av1_codec_arg_defs.enable_ref_frame_mvs,
+  &g_av1_codec_arg_defs.target_seq_level_idx,
+  &g_av1_codec_arg_defs.set_tier_mask,
+  &g_av1_codec_arg_defs.set_min_cr,
+  &g_av1_codec_arg_defs.vbr_corpus_complexity_lap,
+  &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+  &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+#if CONFIG_TUNE_VMAF
+  &g_av1_codec_arg_defs.vmaf_model_path,
+#endif
+  &g_av1_codec_arg_defs.dv_cost_upd_freq,
+  &g_av1_codec_arg_defs.partition_info_path,
+  &g_av1_codec_arg_defs.enable_directional_intra,
+  &g_av1_codec_arg_defs.enable_tx_size_search,
+  &g_av1_codec_arg_defs.loopfilter_control,
+  &g_av1_codec_arg_defs.auto_intra_tools_off,
+  NULL,
+};
+
+const arg_def_t *av1_key_val_args[] = {
+  &g_av1_codec_arg_defs.passes,
+  &g_av1_codec_arg_defs.two_pass_output,
+  &g_av1_codec_arg_defs.second_pass_log,
+  &g_av1_codec_arg_defs.fwd_kf_dist,
+  &g_av1_codec_arg_defs.strict_level_conformance,
+  &g_av1_codec_arg_defs.dist_metric,
+  NULL,
+};
 
 static const arg_def_t *no_args[] = { NULL };
 
 static void show_help(FILE *fout, int shorthelp) {
-  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename\n",
           exec_name);
 
   if (shorthelp) {
@@ -1038,13 +475,12 @@ static void show_help(FILE *fout, int shorthelp) {
   arg_show_usage(fout, global_args);
   fprintf(fout, "\nRate Control Options:\n");
   arg_show_usage(fout, rc_args);
-  fprintf(fout, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(fout, rc_twopass_args);
   fprintf(fout, "\nKeyframe Placement Options:\n");
   arg_show_usage(fout, kf_args);
 #if CONFIG_AV1_ENCODER
   fprintf(fout, "\nAV1 Specific Options:\n");
-  arg_show_usage(fout, av1_args);
+  arg_show_usage(fout, av1_ctrl_args);
+  arg_show_usage(fout, av1_key_val_args);
 #endif
   fprintf(fout,
           "\nStream timebase (--timebase):\n"
@@ -1054,10 +490,10 @@ static void show_help(FILE *fout, int shorthelp) {
 
   const int num_encoder = get_aom_encoder_count();
   for (int i = 0; i < num_encoder; ++i) {
-    const AvxInterface *const encoder = get_aom_encoder_by_index(i);
+    aom_codec_iface_t *encoder = get_aom_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(fout, "    %-6s - %s %s\n", encoder->name,
-            aom_codec_iface_name(encoder->codec_interface()), defstr);
+    fprintf(fout, "    %-6s - %s %s\n", get_short_name_by_aom_encoder(encoder),
+            aom_codec_iface_name(encoder), defstr);
   }
   fprintf(fout, "\n        ");
   fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
@@ -1070,6 +506,7 @@ void usage_exit(void) {
 
 #if CONFIG_AV1_ENCODER
 #define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map)
+#define ARG_KEY_VAL_CNT_MAX NELEMENTS(av1_key_val_args)
 #endif
 
 #if !CONFIG_WEBM_IO
@@ -1087,6 +524,8 @@ struct stream_config {
   stereo_format_t stereo_fmt;
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
+  const char *arg_key_vals[ARG_KEY_VAL_CNT_MAX][2];
+  int arg_key_val_cnt;
   int write_webm;
   const char *film_grain_filename;
   int write_ivf;
@@ -1095,6 +534,12 @@ struct stream_config {
 #if CONFIG_TUNE_VMAF
   const char *vmaf_model_path;
 #endif
+  const char *partition_info_path;
+  aom_color_range_t color_range;
+  const char *two_pass_input;
+  const char *two_pass_output;
+  int two_pass_width;
+  int two_pass_height;
 };
 
 struct stream_state {
@@ -1104,10 +549,10 @@ struct stream_state {
   FILE *file;
   struct rate_hist *rate_hist;
   struct WebmOutputContext webm_ctx;
-  uint64_t psnr_sse_total;
-  uint64_t psnr_samples_total;
-  double psnr_totals[4];
-  int psnr_count;
+  uint64_t psnr_sse_total[2];
+  uint64_t psnr_samples_total[2];
+  double psnr_totals[2][4];
+  int psnr_count[2];
   int counts[64];
   aom_codec_ctx_t encoder;
   unsigned int frames_out;
@@ -1119,6 +564,12 @@ struct stream_state {
   int mismatch_seen;
   unsigned int chroma_subsampling_x;
   unsigned int chroma_subsampling_y;
+  const char *orig_out_fn;
+  unsigned int orig_width;
+  unsigned int orig_height;
+  int orig_write_webm;
+  int orig_write_ivf;
+  char tmp_out_fn[1000];
 };
 
 static void validate_positive_rational(const char *msg,
@@ -1157,6 +608,7 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) {
   global->passes = 0;
   global->color_type = I420;
   global->csp = AOM_CSP_UNKNOWN;
+  global->show_psnr = 0;
 
   int cfg_included = 0;
   init_config(&global->encoder_config);
@@ -1164,82 +616,92 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) {
   for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &use_cfg, argi)) {
-      if (cfg_included) continue;
-      parse_cfg(arg.val, &global->encoder_config);
-      cfg_included = 1;
-      continue;
-    }
-    if (arg_match(&arg, &help, argi)) {
+    if (arg_match(&arg, &g_av1_codec_arg_defs.use_cfg, argi)) {
+      if (!cfg_included) {
+        parse_cfg(arg.val, &global->encoder_config);
+        cfg_included = 1;
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.help, argi)) {
       show_help(stdout, 0);
       exit(EXIT_SUCCESS);
-    } else if (arg_match(&arg, &codecarg, argi)) {
-      global->codec = get_aom_encoder_by_name(arg.val);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.codecarg, argi)) {
+      global->codec = get_aom_encoder_by_short_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
-    } else if (arg_match(&arg, &passes, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.passes, argi)) {
       global->passes = arg_parse_uint(&arg);
 
-      if (global->passes < 1 || global->passes > 2)
+      if (global->passes < 1 || global->passes > 3)
         die("Error: Invalid number of passes (%d)\n", global->passes);
-    } else if (arg_match(&arg, &pass_arg, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.pass_arg, argi)) {
       global->pass = arg_parse_uint(&arg);
 
-      if (global->pass < 1 || global->pass > 2)
+      if (global->pass < 1 || global->pass > 3)
         die("Error: Invalid pass selected (%d)\n", global->pass);
-    } else if (arg_match(&arg, &input_chroma_sample_position, argi)) {
+    } else if (arg_match(&arg,
+                         &g_av1_codec_arg_defs.input_chroma_sample_position,
+                         argi)) {
       global->csp = arg_parse_enum(&arg);
       /* Flag is used by later code as well, preserve it. */
       argj++;
-    } else if (arg_match(&arg, &usage, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.usage, argi)) {
       global->usage = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &good_dl, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.good_dl, argi)) {
       global->usage = AOM_USAGE_GOOD_QUALITY;  // Good quality usage
-    else if (arg_match(&arg, &rt_dl, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.rt_dl, argi)) {
       global->usage = AOM_USAGE_REALTIME;  // Real-time usage
-    else if (arg_match(&arg, &use_yv12, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.ai_dl, argi)) {
+      global->usage = AOM_USAGE_ALL_INTRA;  // All intra usage
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_nv12, argi)) {
+      global->color_type = NV12;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_yv12, argi)) {
       global->color_type = YV12;
-    else if (arg_match(&arg, &use_i420, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i420, argi)) {
       global->color_type = I420;
-    else if (arg_match(&arg, &use_i422, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i422, argi)) {
       global->color_type = I422;
-    else if (arg_match(&arg, &use_i444, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i444, argi)) {
       global->color_type = I444;
-    else if (arg_match(&arg, &quietarg, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.quietarg, argi)) {
       global->quiet = 1;
-    else if (arg_match(&arg, &verbosearg, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.verbosearg, argi)) {
       global->verbose = 1;
-    else if (arg_match(&arg, &limit, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.limit, argi)) {
       global->limit = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &skip, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.skip, argi)) {
       global->skip_frames = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &psnrarg, argi))
-      global->show_psnr = 1;
-    else if (arg_match(&arg, &recontest, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.psnrarg, argi)) {
+      if (arg.val)
+        global->show_psnr = arg_parse_int(&arg);
+      else
+        global->show_psnr = 1;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.recontest, argi)) {
       global->test_decode = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &framerate, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.framerate, argi)) {
       global->framerate = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &global->framerate);
       global->have_framerate = 1;
-    } else if (arg_match(&arg, &debugmode, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.debugmode, argi)) {
       global->debug = 1;
-    else if (arg_match(&arg, &q_hist_n, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.q_hist_n, argi)) {
       global->show_q_hist_buckets = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &rate_hist_n, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_hist_n, argi)) {
       global->show_rate_hist_buckets = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &disable_warnings, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warnings, argi)) {
       global->disable_warnings = 1;
-    else if (arg_match(&arg, &disable_warning_prompt, argi))
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warning_prompt,
+                         argi)) {
       global->disable_warning_prompt = 1;
-    else
+    } else {
       argj++;
+    }
   }
 
   if (global->pass) {
     /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
     if (global->pass > global->passes) {
-      warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
-           global->pass);
+      aom_tools_warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
+                     global->pass);
       global->passes = global->pass;
     }
   }
@@ -1248,18 +710,26 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) {
 #if CONFIG_AV1_ENCODER
     // Make default AV1 passes = 2 until there is a better quality 1-pass
     // encoder
-    if (global->codec != NULL && global->codec->name != NULL)
-      global->passes = (strcmp(global->codec->name, "av1") == 0 &&
-                        global->usage != AOM_USAGE_REALTIME)
-                           ? 2
-                           : 1;
+    if (global->codec != NULL)
+      global->passes =
+          (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0 &&
+           global->usage != AOM_USAGE_REALTIME)
+              ? 2
+              : 1;
 #else
     global->passes = 1;
 #endif
   }
 
   if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
-    warn("Enforcing one-pass encoding in realtime mode\n");
+    aom_tools_warn("Enforcing one-pass encoding in realtime mode\n");
+    if (global->pass > 1)
+      die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass);
+    global->passes = 1;
+  }
+
+  if (global->usage == AOM_USAGE_ALL_INTRA && global->passes > 1) {
+    aom_tools_warn("Enforcing one-pass encoding in all intra mode\n");
     global->passes = 1;
   }
 }
@@ -1302,6 +772,7 @@ static void open_input_file(struct AvxInputContext *input,
       input->framerate.denominator = input->y4m.fps_d;
       input->fmt = input->y4m.aom_fmt;
       input->bit_depth = input->y4m.bit_depth;
+      input->color_range = input->y4m.color_range;
     } else
       fatal("Unsupported Y4M stream.");
   } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
@@ -1333,8 +804,8 @@ static struct stream_state *new_stream(struct AvxEncoderConfig *global,
     aom_codec_err_t res;
 
     /* Populate encoder configuration */
-    res = aom_codec_enc_config_default(global->codec->codec_interface(),
-                                       &stream->config.cfg, global->usage);
+    res = aom_codec_enc_config_default(global->codec, &stream->config.cfg,
+                                       global->usage);
     if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res));
 
     /* Change the default timebase to a high enough value so that the
@@ -1367,6 +838,10 @@ static struct stream_state *new_stream(struct AvxEncoderConfig *global,
 
   /* Output files must be specified for each stream */
   stream->config.out_fn = NULL;
+  stream->config.two_pass_input = NULL;
+  stream->config.two_pass_output = NULL;
+  stream->config.two_pass_width = 0;
+  stream->config.two_pass_height = 0;
 
   stream->next = NULL;
   return stream;
@@ -1384,7 +859,7 @@ static void set_config_arg_ctrls(struct stream_config *config, int key,
   // so we simply append it.
   if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) {
     j = config->arg_ctrl_cnt;
-    assert(j < (int)ARG_CTRL_CNT_MAX);
+    assert(j < ARG_CTRL_CNT_MAX);
     config->arg_ctrls[j][0] = key;
     config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
     ++config->arg_ctrl_cnt;
@@ -1398,22 +873,63 @@ static void set_config_arg_ctrls(struct stream_config *config, int key,
     if (config->arg_ctrls[j][0] == key) break;
 
   /* Update/insert */
-  assert(j < (int)ARG_CTRL_CNT_MAX);
+  assert(j < ARG_CTRL_CNT_MAX);
   config->arg_ctrls[j][0] = key;
   config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
 
   if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) {
-    warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+    aom_tools_warn(
+        "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
     config->arg_ctrls[j][1] = 1;
   }
+
   if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
 }
 
+static void set_config_arg_key_vals(struct stream_config *config,
+                                    const char *name, const struct arg *arg) {
+  int j;
+  const char *val = arg->val;
+  // For target level, the settings should accumulate rather than overwrite,
+  // so we simply append it.
+  if (strcmp(name, "target-seq-level-idx") == 0) {
+    j = config->arg_key_val_cnt;
+    assert(j < ARG_KEY_VAL_CNT_MAX);
+    config->arg_key_vals[j][0] = name;
+    config->arg_key_vals[j][1] = val;
+    ++config->arg_key_val_cnt;
+    return;
+  }
+
+  /* Point either to the next free element or the first instance of this
+   * option.
+   */
+  for (j = 0; j < config->arg_key_val_cnt; j++)
+    if (strcmp(name, config->arg_key_vals[j][0]) == 0) break;
+
+  /* Update/insert */
+  assert(j < ARG_KEY_VAL_CNT_MAX);
+  config->arg_key_vals[j][0] = name;
+  config->arg_key_vals[j][1] = val;
+
+  if (strcmp(name, g_av1_codec_arg_defs.auto_altref.long_name) == 0) {
+    int auto_altref = arg_parse_int(arg);
+    if (auto_altref > 1) {
+      aom_tools_warn(
+          "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+      config->arg_key_vals[j][1] = "1";
+    }
+  }
+
+  if (j == config->arg_key_val_cnt) config->arg_key_val_cnt++;
+}
+
 static int parse_stream_params(struct AvxEncoderConfig *global,
                                struct stream_state *stream, char **argv) {
   char **argi, **argj;
   struct arg arg;
   static const arg_def_t **ctrl_args = no_args;
+  static const arg_def_t **key_val_args = no_args;
   static const int *ctrl_args_map = NULL;
   struct stream_config *config = &stream->config;
   int eos_mark_found = 0;
@@ -1422,11 +938,19 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
   // Handle codec specific options
   if (0) {
 #if CONFIG_AV1_ENCODER
-  } else if (strcmp(global->codec->name, "av1") == 0) {
+  } else if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
     // TODO(jingning): Reuse AV1 specific encoder configuration parameters.
     // Consider to expand this set for AV1 encoder control.
-    ctrl_args = av1_args;
+#if __STDC_VERSION__ >= 201112L
+    _Static_assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map),
+                   "The av1_ctrl_args and av1_arg_ctrl_map arrays must be of "
+                   "the same size.");
+#else
+    assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map));
+#endif
+    ctrl_args = av1_ctrl_args;
     ctrl_args_map = av1_arg_ctrl_map;
+    key_val_args = av1_key_val_args;
 #endif
   }
 
@@ -1444,7 +968,7 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       continue;
     }
 
-    if (arg_match(&arg, &outputfile, argi)) {
+    if (arg_match(&arg, &g_av1_codec_arg_defs.outputfile, argi)) {
       config->out_fn = arg.val;
       if (!webm_forced) {
         const size_t out_fn_len = strlen(config->out_fn);
@@ -1458,170 +982,249 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
           config->write_ivf = 0;
         }
       }
-    } else if (arg_match(&arg, &fpf_name, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fpf_name, argi)) {
       config->stats_fn = arg.val;
-    } else if (arg_match(&arg, &use_webm, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_webm, argi)) {
 #if CONFIG_WEBM_IO
       config->write_webm = 1;
       webm_forced = 1;
 #else
       die("Error: --webm specified but webm is disabled.");
 #endif
-    } else if (arg_match(&arg, &use_ivf, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_ivf, argi)) {
       config->write_webm = 0;
       config->write_ivf = 1;
-    } else if (arg_match(&arg, &use_obu, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_obu, argi)) {
       config->write_webm = 0;
       config->write_ivf = 0;
-    } else if (arg_match(&arg, &threads, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.threads, argi)) {
       config->cfg.g_threads = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &profile, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.profile, argi)) {
       config->cfg.g_profile = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &width, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.width, argi)) {
       config->cfg.g_w = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &height, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &forced_max_frame_width, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_width,
+                         argi)) {
       config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &forced_max_frame_height, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_height,
+                         argi)) {
       config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &bitdeptharg, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.bitdeptharg, argi)) {
       config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
-    } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.inbitdeptharg, argi)) {
       config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &input_chroma_subsampling_x, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+                         argi)) {
       stream->chroma_subsampling_x = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &input_chroma_subsampling_y, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+                         argi)) {
       stream->chroma_subsampling_y = arg_parse_uint(&arg);
 #if CONFIG_WEBM_IO
-    } else if (arg_match(&arg, &stereo_mode, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
 #endif
-    } else if (arg_match(&arg, &timebase, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.timebase, argi)) {
       config->cfg.g_timebase = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
-    } else if (arg_match(&arg, &global_error_resilient, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.global_error_resilient,
+                         argi)) {
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &lag_in_frames, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
-      if (global->usage == AOM_USAGE_REALTIME &&
-          config->cfg.rc_end_usage == AOM_CBR &&
-          config->cfg.g_lag_in_frames != 0) {
-        warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name);
-        config->cfg.g_lag_in_frames = 0;
-      }
-    } else if (arg_match(&arg, &large_scale_tile, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.large_scale_tile, argi)) {
       config->cfg.large_scale_tile = arg_parse_uint(&arg);
-      if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
-    } else if (arg_match(&arg, &monochrome, argi)) {
+      if (config->cfg.large_scale_tile) {
+        global->codec = get_aom_encoder_by_short_name("av1");
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.monochrome, argi)) {
       config->cfg.monochrome = 1;
-    } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.full_still_picture_hdr,
+                         argi)) {
       config->cfg.full_still_picture_hdr = 1;
-    } else if (arg_match(&arg, &dropframe_thresh, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_16bit_internal,
+                         argi)) {
+      config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH;
+      if (!config->use_16bit_internal) {
+        aom_tools_warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n",
+                       arg.name);
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &resize_mode, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_mode, argi)) {
       config->cfg.rc_resize_mode = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &resize_denominator, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_denominator,
+                         argi)) {
       config->cfg.rc_resize_denominator = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &resize_kf_denominator, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_kf_denominator,
+                         argi)) {
       config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &superres_mode, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_mode, argi)) {
       config->cfg.rc_superres_mode = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &superres_denominator, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_denominator,
+                         argi)) {
       config->cfg.rc_superres_denominator = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &superres_kf_denominator, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_denominator,
+                         argi)) {
       config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &superres_qthresh, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_qthresh, argi)) {
       config->cfg.rc_superres_qthresh = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &superres_kf_qthresh, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_qthresh,
+                         argi)) {
       config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &end_usage, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.end_usage, argi)) {
       config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
-    } else if (arg_match(&arg, &target_bitrate, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.target_bitrate, argi)) {
       config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &min_quantizer, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.min_quantizer, argi)) {
       config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &max_quantizer, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.max_quantizer, argi)) {
       config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &undershoot_pct, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.undershoot_pct, argi)) {
       config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &overshoot_pct, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.overshoot_pct, argi)) {
       config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &buf_sz, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_sz, argi)) {
       config->cfg.rc_buf_sz = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &buf_initial_sz, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_initial_sz, argi)) {
       config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &buf_optimal_sz, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_optimal_sz, argi)) {
       config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &bias_pct, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.bias_pct, argi)) {
       config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
-    } else if (arg_match(&arg, &minsection_pct, argi)) {
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.minsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
 
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
-    } else if (arg_match(&arg, &maxsection_pct, argi)) {
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.maxsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
 
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
-    } else if (arg_match(&arg, &fwd_kf_enabled, argi)) {
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fwd_kf_enabled, argi)) {
       config->cfg.fwd_kf_enabled = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &kf_min_dist, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_min_dist, argi)) {
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &kf_max_dist, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &kf_disabled, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_disabled, argi)) {
       config->cfg.kf_mode = AOM_KF_DISABLED;
-    } else if (arg_match(&arg, &sframe_dist, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_dist, argi)) {
       config->cfg.sframe_dist = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &sframe_mode, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_mode, argi)) {
       config->cfg.sframe_mode = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &save_as_annexb, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.save_as_annexb, argi)) {
       config->cfg.save_as_annexb = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &tile_width, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_width, argi)) {
       config->cfg.tile_width_count =
           arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS);
-    } else if (arg_match(&arg, &tile_height, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_height, argi)) {
       config->cfg.tile_height_count =
           arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS);
 #if CONFIG_TUNE_VMAF
-    } else if (arg_match(&arg, &vmaf_model_path, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) {
       config->vmaf_model_path = arg.val;
 #endif
-    } else if (arg_match(&arg, &use_fixed_qp_offsets, argi)) {
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                         argi)) {
+      config->partition_info_path = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets,
+                         argi)) {
       config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &fixed_qp_offsets, argi)) {
-      const int fixed_qp_offset_count = arg_parse_list(
-          &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT);
-      if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) {
-        die("Option --fixed_qp_offsets requires %d comma-separated values, but "
-            "only %d values were provided.\n",
-            FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count);
-      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fixed_qp_offsets, argi)) {
       config->cfg.use_fixed_qp_offsets = 1;
     } else if (global->usage == AOM_USAGE_REALTIME &&
-               arg_match(&arg, &enable_restoration, argi)) {
+               arg_match(&arg, &g_av1_codec_arg_defs.enable_restoration,
+                         argi)) {
       if (arg_parse_uint(&arg) == 1) {
-        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+        aom_tools_warn("non-zero %s option ignored in realtime mode.\n",
+                       arg.name);
       }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_input, argi)) {
+      config->two_pass_input = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_output, argi)) {
+      config->two_pass_output = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_width, argi)) {
+      config->two_pass_width = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_height, argi)) {
+      config->two_pass_height = arg_parse_int(&arg);
     } else {
       int i, match = 0;
-      for (i = 0; ctrl_args[i]; i++) {
-        if (arg_match(&arg, ctrl_args[i], argi)) {
-          match = 1;
-          if (ctrl_args_map) {
+      // check if the control ID API supports this arg
+      if (ctrl_args_map) {
+        for (i = 0; ctrl_args[i]; i++) {
+          if (arg_match(&arg, ctrl_args[i], argi)) {
+            match = 1;
             set_config_arg_ctrls(config, ctrl_args_map[i], &arg);
+            break;
+          }
+        }
+      }
+      if (!match) {
+        // check if the key & value API supports this arg
+        for (i = 0; key_val_args[i]; i++) {
+          if (arg_match(&arg, key_val_args[i], argi)) {
+            match = 1;
+            set_config_arg_key_vals(config, key_val_args[i]->long_name, &arg);
+            break;
           }
         }
       }
       if (!match) argj++;
     }
   }
-  config->use_16bit_internal =
-      config->cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING;
+  config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8;
+
+  if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) {
+    aom_tools_warn("non-zero lag-in-frames option ignored in realtime mode.\n");
+    config->cfg.g_lag_in_frames = 0;
+  }
+
+  if (global->usage == AOM_USAGE_ALL_INTRA) {
+    if (config->cfg.g_lag_in_frames != 0) {
+      aom_tools_warn(
+          "non-zero lag-in-frames option ignored in all intra mode.\n");
+      config->cfg.g_lag_in_frames = 0;
+    }
+    if (config->cfg.kf_max_dist != 0) {
+      aom_tools_warn(
+          "non-zero max key frame distance option ignored in all intra "
+          "mode.\n");
+      config->cfg.kf_max_dist = 0;
+    }
+  }
+
+  // set the passes field using key & val API
+  if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) {
+    die("Not enough buffer for the key & value API.");
+  }
+  config->arg_key_vals[config->arg_key_val_cnt][0] = "passes";
+  switch (global->passes) {
+    case 0: config->arg_key_vals[config->arg_key_val_cnt][1] = "0"; break;
+    case 1: config->arg_key_vals[config->arg_key_val_cnt][1] = "1"; break;
+    case 2: config->arg_key_vals[config->arg_key_val_cnt][1] = "2"; break;
+    case 3: config->arg_key_vals[config->arg_key_val_cnt][1] = "3"; break;
+    default: die("Invalid value of --passes.");
+  }
+  config->arg_key_val_cnt++;
+
+  // set the two_pass_output field
+  if (!config->two_pass_output && global->passes == 3) {
+    // If not specified, set the name of two_pass_output file here.
+    snprintf(stream->tmp_out_fn, sizeof(stream->tmp_out_fn),
+             "%.980s_pass2_%d.ivf", stream->config.out_fn, stream->index);
+    stream->config.two_pass_output = stream->tmp_out_fn;
+  }
+  if (config->two_pass_output) {
+    config->arg_key_vals[config->arg_key_val_cnt][0] = "two-pass-output";
+    config->arg_key_vals[config->arg_key_val_cnt][1] = config->two_pass_output;
+    config->arg_key_val_cnt++;
+  }
+
   return eos_mark_found;
 }
 
@@ -1699,6 +1302,7 @@ static const char *image_format_to_string(aom_img_fmt_t f) {
     case AOM_IMG_FMT_I422: return "I422";
     case AOM_IMG_FMT_I444: return "I444";
     case AOM_IMG_FMT_YV12: return "YV12";
+    case AOM_IMG_FMT_NV12: return "NV12";
     case AOM_IMG_FMT_YV1216: return "YV1216";
     case AOM_IMG_FMT_I42016: return "I42016";
     case AOM_IMG_FMT_I42216: return "I42216";
@@ -1714,8 +1318,7 @@ static void show_stream_config(struct stream_state *stream,
   fprintf(stderr, "    %-28s = %d\n", #field, stream->config.cfg.field)
 
   if (stream->index == 0) {
-    fprintf(stderr, "Codec: %s\n",
-            aom_codec_iface_name(global->codec->codec_interface()));
+    fprintf(stderr, "Codec: %s\n", aom_codec_iface_name(global->codec));
     fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
             input->filename, file_type_to_string(input->file_type),
             image_format_to_string(input->fmt));
@@ -1769,45 +1372,48 @@ static void show_stream_config(struct stream_state *stream,
 #define SHOW_PARAMS(field)                    \
   fprintf(stderr, "    %-28s = %d\n", #field, \
           stream->config.cfg.encoder_cfg.field)
-  SHOW_PARAMS(super_block_size);
-  SHOW_PARAMS(max_partition_size);
-  SHOW_PARAMS(min_partition_size);
-  SHOW_PARAMS(disable_ab_partition_type);
-  SHOW_PARAMS(disable_rect_partition_type);
-  SHOW_PARAMS(disable_1to4_partition_type);
-  SHOW_PARAMS(disable_flip_idtx);
-  SHOW_PARAMS(disable_cdef);
-  SHOW_PARAMS(disable_lr);
-  SHOW_PARAMS(disable_obmc);
-  SHOW_PARAMS(disable_warp_motion);
-  SHOW_PARAMS(disable_global_motion);
-  SHOW_PARAMS(disable_dist_wtd_comp);
-  SHOW_PARAMS(disable_diff_wtd_comp);
-  SHOW_PARAMS(disable_inter_intra_comp);
-  SHOW_PARAMS(disable_masked_comp);
-  SHOW_PARAMS(disable_one_sided_comp);
-  SHOW_PARAMS(disable_palette);
-  SHOW_PARAMS(disable_intrabc);
-  SHOW_PARAMS(disable_cfl);
-  SHOW_PARAMS(disable_smooth_intra);
-  SHOW_PARAMS(disable_filter_intra);
-  SHOW_PARAMS(disable_dual_filter);
-  SHOW_PARAMS(disable_intra_angle_delta);
-  SHOW_PARAMS(disable_intra_edge_filter);
-  SHOW_PARAMS(disable_tx_64x64);
-  SHOW_PARAMS(disable_smooth_inter_intra);
-  SHOW_PARAMS(disable_inter_inter_wedge);
-  SHOW_PARAMS(disable_inter_intra_wedge);
-  SHOW_PARAMS(disable_paeth_intra);
-  SHOW_PARAMS(disable_trellis_quant);
-  SHOW_PARAMS(disable_ref_frame_mv);
-  SHOW_PARAMS(reduced_reference_set);
-  SHOW_PARAMS(reduced_tx_type_set);
+  if (global->encoder_config.init_by_cfg_file) {
+    SHOW_PARAMS(super_block_size);
+    SHOW_PARAMS(max_partition_size);
+    SHOW_PARAMS(min_partition_size);
+    SHOW_PARAMS(disable_ab_partition_type);
+    SHOW_PARAMS(disable_rect_partition_type);
+    SHOW_PARAMS(disable_1to4_partition_type);
+    SHOW_PARAMS(disable_flip_idtx);
+    SHOW_PARAMS(disable_cdef);
+    SHOW_PARAMS(disable_lr);
+    SHOW_PARAMS(disable_obmc);
+    SHOW_PARAMS(disable_warp_motion);
+    SHOW_PARAMS(disable_global_motion);
+    SHOW_PARAMS(disable_dist_wtd_comp);
+    SHOW_PARAMS(disable_diff_wtd_comp);
+    SHOW_PARAMS(disable_inter_intra_comp);
+    SHOW_PARAMS(disable_masked_comp);
+    SHOW_PARAMS(disable_one_sided_comp);
+    SHOW_PARAMS(disable_palette);
+    SHOW_PARAMS(disable_intrabc);
+    SHOW_PARAMS(disable_cfl);
+    SHOW_PARAMS(disable_smooth_intra);
+    SHOW_PARAMS(disable_filter_intra);
+    SHOW_PARAMS(disable_dual_filter);
+    SHOW_PARAMS(disable_intra_angle_delta);
+    SHOW_PARAMS(disable_intra_edge_filter);
+    SHOW_PARAMS(disable_tx_64x64);
+    SHOW_PARAMS(disable_smooth_inter_intra);
+    SHOW_PARAMS(disable_inter_inter_wedge);
+    SHOW_PARAMS(disable_inter_intra_wedge);
+    SHOW_PARAMS(disable_paeth_intra);
+    SHOW_PARAMS(disable_trellis_quant);
+    SHOW_PARAMS(disable_ref_frame_mv);
+    SHOW_PARAMS(reduced_reference_set);
+    SHOW_PARAMS(reduced_tx_type_set);
+  }
 }
 
 static void open_output_file(struct stream_state *stream,
                              struct AvxEncoderConfig *global,
-                             const struct AvxRational *pixel_aspect_ratio) {
+                             const struct AvxRational *pixel_aspect_ratio,
+                             const char *encoder_settings) {
   const char *fn = stream->config.out_fn;
   const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
 
@@ -1824,17 +1430,20 @@ static void open_output_file(struct stream_state *stream,
   if (stream->config.write_webm) {
     stream->webm_ctx.stream = stream->file;
     if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg,
-                               stream->config.stereo_fmt, global->codec->fourcc,
-                               pixel_aspect_ratio) != 0) {
+                               stream->config.stereo_fmt,
+                               get_fourcc_by_aom_encoder(global->codec),
+                               pixel_aspect_ratio, encoder_settings) != 0) {
       fatal("WebM writer initialization failed.");
     }
   }
 #else
   (void)pixel_aspect_ratio;
+  (void)encoder_settings;
 #endif
 
   if (!stream->config.write_webm && stream->config.write_ivf) {
-    ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+    ivf_write_file_header(stream->file, cfg,
+                          get_fourcc_by_aom_encoder(global->codec), 0);
   }
 }
 
@@ -1871,9 +1480,17 @@ static void setup_pass(struct stream_state *stream,
       fatal("Failed to open statistics store");
   }
 
-  stream->config.cfg.g_pass = global->passes == 2
-                                  ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS
-                                  : AOM_RC_ONE_PASS;
+  if (global->passes == 1) {
+    stream->config.cfg.g_pass = AOM_RC_ONE_PASS;
+  } else {
+    switch (pass) {
+      case 0: stream->config.cfg.g_pass = AOM_RC_FIRST_PASS; break;
+      case 1: stream->config.cfg.g_pass = AOM_RC_SECOND_PASS; break;
+      case 2: stream->config.cfg.g_pass = AOM_RC_THIRD_PASS; break;
+      default: fatal("Failed to set pass");
+    }
+  }
+
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
   }
@@ -1888,12 +1505,12 @@ static void initialize_encoder(struct stream_state *stream,
   int i;
   int flags = 0;
 
-  flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
+  flags |= (global->show_psnr >= 1) ? AOM_CODEC_USE_PSNR : 0;
   flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
 
   /* Construct Encoder Context */
-  aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
-                     &stream->config.cfg, flags);
+  aom_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg,
+                     flags);
   ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
 
   for (i = 0; i < stream->config.arg_ctrl_cnt; i++) {
@@ -1905,25 +1522,42 @@ static void initialize_encoder(struct stream_state *stream,
     ctx_exit_on_error(&stream->encoder, "Failed to control codec");
   }
 
+  for (i = 0; i < stream->config.arg_key_val_cnt; i++) {
+    const char *name = stream->config.arg_key_vals[i][0];
+    const char *val = stream->config.arg_key_vals[i][1];
+    if (aom_codec_set_option(&stream->encoder, name, val))
+      fprintf(stderr, "Error: Tried to set option %s = %s\n", name, val);
+
+    ctx_exit_on_error(&stream->encoder, "Failed to set codec option");
+  }
+
 #if CONFIG_TUNE_VMAF
   if (stream->config.vmaf_model_path) {
     AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH,
                                   stream->config.vmaf_model_path);
   }
 #endif
+  if (stream->config.partition_info_path) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                  AV1E_SET_PARTITION_INFO_PATH,
+                                  stream->config.partition_info_path);
+  }
 
   if (stream->config.film_grain_filename) {
     AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
                                   stream->config.film_grain_filename);
   }
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE,
+                                stream->config.color_range);
 
 #if CONFIG_AV1_DECODER
   if (global->test_decode != TEST_DECODE_OFF) {
-    const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
-    aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(
+        get_short_name_by_aom_encoder(global->codec));
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal };
+    aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0);
 
-    if (strcmp(global->codec->name, "av1") == 0) {
+    if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
       AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE,
                                     stream->config.cfg.large_scale_tile);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
@@ -1944,6 +1578,33 @@ static void initialize_encoder(struct stream_state *stream,
 #endif
 }
 
+// Convert the input image 'img' to a monochrome image. The Y plane of the
+// output image is a shallow copy of the Y plane of the input image, therefore
+// the input image must remain valid for the lifetime of the output image. The U
+// and V planes of the output image are set to null pointers. The output image
+// format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400.
+static void convert_image_to_monochrome(const struct aom_image *img,
+                                        struct aom_image *monochrome_img) {
+  *monochrome_img = *img;
+  monochrome_img->fmt = AOM_IMG_FMT_I420;
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+  monochrome_img->monochrome = 1;
+  monochrome_img->csp = AOM_CSP_UNKNOWN;
+  monochrome_img->x_chroma_shift = 1;
+  monochrome_img->y_chroma_shift = 1;
+  monochrome_img->planes[AOM_PLANE_U] = NULL;
+  monochrome_img->planes[AOM_PLANE_V] = NULL;
+  monochrome_img->stride[AOM_PLANE_U] = 0;
+  monochrome_img->stride[AOM_PLANE_V] = 0;
+  monochrome_img->sz = 0;
+  monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  monochrome_img->img_data = NULL;
+  monochrome_img->img_data_owner = 0;
+  monochrome_img->self_allocd = 0;
+}
+
 static void encode_frame(struct stream_state *stream,
                          struct AvxEncoderConfig *global, struct aom_image *img,
                          unsigned int frames_in) {
@@ -2023,6 +1684,12 @@ static void encode_frame(struct stream_state *stream,
 #endif
   }
 
+  struct aom_image monochrome_img;
+  if (img && cfg->monochrome) {
+    convert_image_to_monochrome(img, &monochrome_img);
+    img = &monochrome_img;
+  }
+
   aom_usec_timer_start(&timer);
   aom_codec_encode(&stream->encoder, img, frame_start,
                    (uint32_t)(next_frame_start - frame_start), 0);
@@ -2113,17 +1780,31 @@ static void get_cx_data(struct stream_state *stream,
         break;
       case AOM_CODEC_PSNR_PKT:
 
-        if (global->show_psnr) {
+        if (global->show_psnr >= 1) {
           int i;
 
-          stream->psnr_sse_total += pkt->data.psnr.sse[0];
-          stream->psnr_samples_total += pkt->data.psnr.samples[0];
+          stream->psnr_sse_total[0] += pkt->data.psnr.sse[0];
+          stream->psnr_samples_total[0] += pkt->data.psnr.samples[0];
           for (i = 0; i < 4; i++) {
             if (!global->quiet)
               fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
-            stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
+            stream->psnr_totals[0][i] += pkt->data.psnr.psnr[i];
           }
-          stream->psnr_count++;
+          stream->psnr_count[0]++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (stream->config.cfg.g_input_bit_depth <
+              (unsigned int)stream->config.cfg.g_bit_depth) {
+            stream->psnr_sse_total[1] += pkt->data.psnr.sse_hbd[0];
+            stream->psnr_samples_total[1] += pkt->data.psnr.samples_hbd[0];
+            for (i = 0; i < 4; i++) {
+              if (!global->quiet)
+                fprintf(stderr, "%.3f ", pkt->data.psnr.psnr_hbd[i]);
+              stream->psnr_totals[1][i] += pkt->data.psnr.psnr_hbd[i];
+            }
+            stream->psnr_count[1]++;
+          }
+#endif
         }
 
         break;
@@ -2136,15 +1817,38 @@ static void show_psnr(struct stream_state *stream, double peak, int64_t bps) {
   int i;
   double ovpsnr;
 
-  if (!stream->psnr_count) return;
+  if (!stream->psnr_count[0]) return;
+
+  fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[0], peak,
+                       (double)stream->psnr_sse_total[0]);
+  fprintf(stderr, " %.3f", ovpsnr);
+
+  for (i = 0; i < 4; i++) {
+    fprintf(stderr, " %.3f", stream->psnr_totals[0][i] / stream->psnr_count[0]);
+  }
+  if (bps > 0) {
+    fprintf(stderr, " %7" PRId64 " bps", bps);
+  }
+  fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
+  fprintf(stderr, "\n");
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void show_psnr_hbd(struct stream_state *stream, double peak,
+                          int64_t bps) {
+  int i;
+  double ovpsnr;
+  // Compute PSNR based on stream bit depth
+  if (!stream->psnr_count[1]) return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak,
-                       (double)stream->psnr_sse_total);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[1], peak,
+                       (double)stream->psnr_sse_total[1]);
   fprintf(stderr, " %.3f", ovpsnr);
 
   for (i = 0; i < 4; i++) {
-    fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count);
+    fprintf(stderr, " %.3f", stream->psnr_totals[1][i] / stream->psnr_count[1]);
   }
   if (bps > 0) {
     fprintf(stderr, " %7" PRId64 " bps", bps);
@@ -2152,6 +1856,7 @@ static void show_psnr(struct stream_state *stream, double peak, int64_t bps) {
   fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
   fprintf(stderr, "\n");
 }
+#endif
 
 static float usec_to_fps(uint64_t usec, unsigned int frames) {
   return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
@@ -2231,12 +1936,36 @@ static void print_time(const char *label, int64_t etl) {
   }
 }
 
+static void clear_stream_count_state(struct stream_state *stream) {
+  // PSNR counters
+  for (int k = 0; k < 2; k++) {
+    stream->psnr_sse_total[k] = 0;
+    stream->psnr_samples_total[k] = 0;
+    for (int i = 0; i < 4; i++) {
+      stream->psnr_totals[k][i] = 0;
+    }
+    stream->psnr_count[k] = 0;
+  }
+  // q hist
+  memset(stream->counts, 0, sizeof(stream->counts));
+}
+
+// aomenc will downscale the second pass if:
+// 1. the specific pass is not given by commandline (aomenc will perform all
+//    passes)
+// 2. there are more than 2 passes in total
+// 3. current pass is the second pass (the parameter pass starts with 0 so
+//    pass == 1)
+static int pass_need_downscale(int global_pass, int global_passes, int pass) {
+  return !global_pass && global_passes > 2 && pass == 1;
+}
+
 int main(int argc, const char **argv_) {
   int pass;
   aom_image_t raw;
   aom_image_t raw_shift;
   int allocated_raw_shift = 0;
-  int use_16bit_internal = 0;
+  int do_16bit_internal = 0;
   int input_shift = 0;
   int frame_avail, got_data;
 
@@ -2250,6 +1979,7 @@ int main(int argc, const char **argv_) {
   int profile_updated = 0;
 
   memset(&input, 0, sizeof(input));
+  memset(&raw, 0, sizeof(raw));
   exec_name = argv_[0];
 
   /* Setup default input stream settings */
@@ -2263,6 +1993,10 @@ int main(int argc, const char **argv_) {
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   parse_global_config(&global, &argv);
 
   if (argc < 2) usage_exit();
@@ -2272,6 +2006,7 @@ int main(int argc, const char **argv_) {
     case I422: input.fmt = AOM_IMG_FMT_I422; break;
     case I444: input.fmt = AOM_IMG_FMT_I444; break;
     case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
+    case NV12: input.fmt = AOM_IMG_FMT_NV12; break;
   }
 
   {
@@ -2304,6 +2039,14 @@ int main(int argc, const char **argv_) {
 
   /* Handle non-option arguments */
   input.filename = argv[0];
+  const char *orig_input_filename = input.filename;
+  FOREACH_STREAM(stream, streams) {
+    stream->orig_out_fn = stream->config.out_fn;
+    stream->orig_width = stream->config.cfg.g_w;
+    stream->orig_height = stream->config.cfg.g_h;
+    stream->orig_write_ivf = stream->config.write_ivf;
+    stream->orig_write_webm = stream->config.write_webm;
+  }
 
   if (!input.filename) {
     fprintf(stderr, "No input file specified!\n");
@@ -2311,13 +2054,52 @@ int main(int argc, const char **argv_) {
   }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
-  if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
+  if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC)
+    input.only_i420 = 0;
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    if (pass > 1) {
+      FOREACH_STREAM(stream, streams) { clear_stream_count_state(stream); }
+    }
+
     int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
     int64_t average_rate = -1;
     int64_t lagged_count = 0;
+    const int need_downscale =
+        pass_need_downscale(global.pass, global.passes, pass);
+
+    // Set the output to the specified two-pass output file, and
+    // restore the width and height to the original values.
+    FOREACH_STREAM(stream, streams) {
+      if (need_downscale) {
+        stream->config.out_fn = stream->config.two_pass_output;
+        // Libaom currently only supports the ivf format for the third pass.
+        stream->config.write_ivf = 1;
+        stream->config.write_webm = 0;
+      } else {
+        stream->config.out_fn = stream->orig_out_fn;
+        stream->config.write_ivf = stream->orig_write_ivf;
+        stream->config.write_webm = stream->orig_write_webm;
+      }
+      stream->config.cfg.g_w = stream->orig_width;
+      stream->config.cfg.g_h = stream->orig_height;
+    }
+
+    // For second pass in three-pass encoding, set the input to
+    // the given two-pass-input file if available. If the scaled input is not
+    // given, we will attempt to re-scale the original input.
+    input.filename = orig_input_filename;
+    const char *two_pass_input = NULL;
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_input) {
+          two_pass_input = stream->config.two_pass_input;
+          input.filename = two_pass_input;
+          break;
+        }
+      }
+    }
 
     open_input_file(&input, global.csp);
 
@@ -2325,20 +2107,65 @@ int main(int argc, const char **argv_) {
      * the data from the first stream's configuration.
      */
     if (!input.width || !input.height) {
-      FOREACH_STREAM(stream, streams) {
-        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
-          input.width = stream->config.cfg.g_w;
-          input.height = stream->config.cfg.g_h;
-          break;
+      if (two_pass_input) {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.two_pass_width && stream->config.two_pass_height) {
+            input.width = stream->config.two_pass_width;
+            input.height = stream->config.two_pass_height;
+            break;
+          }
         }
-      };
+      } else {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+            input.width = stream->config.cfg.g_w;
+            input.height = stream->config.cfg.g_h;
+            break;
+          }
+        }
+      }
     }
 
     /* Update stream configurations from the input file's parameters */
-    if (!input.width || !input.height)
-      fatal(
-          "Specify stream dimensions with --width (-w) "
-          " and --height (-h)");
+    if (!input.width || !input.height) {
+      if (two_pass_input) {
+        fatal(
+            "Specify downscaled stream dimensions with --two-pass-width "
+            " and --two-pass-height");
+      } else {
+        fatal(
+            "Specify stream dimensions with --width (-w) "
+            " and --height (-h)");
+      }
+    }
+
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_width && stream->config.two_pass_height) {
+          stream->config.cfg.g_w = stream->config.two_pass_width;
+          stream->config.cfg.g_h = stream->config.two_pass_height;
+        } else if (two_pass_input) {
+          stream->config.cfg.g_w = input.width;
+          stream->config.cfg.g_h = input.height;
+        } else if (stream->orig_width && stream->orig_height) {
+#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = stream->orig_width;
+          stream->config.cfg.g_h = stream->orig_height;
+#else   // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = (stream->orig_width + 1) / 2;
+          stream->config.cfg.g_h = (stream->orig_height + 1) / 2;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+        } else {
+#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = input.width;
+          stream->config.cfg.g_h = input.height;
+#else   // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = (input.width + 1) / 2;
+          stream->config.cfg.g_h = (input.height + 1) / 2;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+        }
+      }
+    }
 
     /* If input file does not specify bit-depth but input-bit-depth parameter
      * exists, assume that to be the input bit-depth. However, if the
@@ -2361,7 +2188,8 @@ int main(int argc, const char **argv_) {
     }
 
     FOREACH_STREAM(stream, streams) {
-      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016 &&
+          input.fmt != AOM_IMG_FMT_NV12) {
         /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
            was selected. */
         switch (stream->config.cfg.g_profile) {
@@ -2372,8 +2200,10 @@ int main(int argc, const char **argv_) {
                 stream->config.cfg.g_profile = 1;
                 profile_updated = 1;
               }
-            } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
-                       input.fmt == AOM_IMG_FMT_I42216) {
+            } else if (input.bit_depth == 12 ||
+                       ((input.fmt == AOM_IMG_FMT_I422 ||
+                         input.fmt == AOM_IMG_FMT_I42216) &&
+                        !stream->config.cfg.monochrome)) {
               stream->config.cfg.g_profile = 2;
               profile_updated = 1;
             }
@@ -2435,6 +2265,11 @@ int main(int argc, const char **argv_) {
                   stream->config.cfg.g_input_bit_depth);
         }
       }
+#if !CONFIG_AV1_HIGHBITDEPTH
+      if (stream->config.cfg.g_bit_depth > 8) {
+        fatal("Unsupported bit-depth with CONFIG_AV1_HIGHBITDEPTH=0\n");
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       if (stream->config.cfg.g_bit_depth > 10) {
         switch (stream->config.cfg.g_profile) {
           case 0:
@@ -2454,24 +2289,39 @@ int main(int argc, const char **argv_) {
                 "match input format.\n",
                 stream->config.cfg.g_profile);
       }
+      if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth ==
+                                      stream->config.cfg.g_bit_depth)) {
+        fprintf(stderr,
+                "Warning: --psnr==2 and --psnr==1 will provide same "
+                "results when input bit-depth == stream bit-depth, "
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
+      if (global.show_psnr < 0 || global.show_psnr > 2) {
+        fprintf(stderr,
+                "Warning: --psnr can take only 0,1,2 as values,"
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
       /* Set limit */
       stream->config.cfg.g_limit = global.limit;
     }
 
     FOREACH_STREAM(stream, streams) {
       set_stream_dimensions(stream, input.width, input.height);
+      stream->config.color_range = input.color_range;
     }
     FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
 
     /* Ensure that --passes and --pass are consistent. If --pass is set and
-     * --passes=2, ensure --fpf was set.
+     * --passes >= 2, ensure --fpf was set.
      */
-    if (global.pass && global.passes == 2) {
+    if (global.pass > 0 && global.pass <= 3 && global.passes >= 2) {
       FOREACH_STREAM(stream, streams) {
         if (!stream->config.stats_fn)
           die("Stream %d: Must specify --fpf when --pass=%d"
-              " and --passes=2\n",
-              stream->index, global.pass);
+              " and --passes=%d\n",
+              stream->index, global.pass, global.passes);
       }
     }
 
@@ -2480,7 +2330,7 @@ int main(int argc, const char **argv_) {
       if (stream->config.write_webm) {
         stream->config.write_webm = 0;
         stream->config.write_ivf = 0;
-        warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+        aom_tools_warn("aomenc compiled w/o WebM support. Writing OBU stream.");
       }
     }
 #endif
@@ -2504,14 +2354,10 @@ int main(int argc, const char **argv_) {
     }
 
     if (pass == (global.pass ? global.pass - 1 : 0)) {
-      if (input.file_type == FILE_TYPE_Y4M)
-        /*The Y4M reader does its own allocation.
-          Just initialize this here to avoid problems if we never read any
-          frames.*/
-        memset(&raw, 0, sizeof(raw));
-      else
+      // The Y4M reader does its own allocation.
+      if (input.file_type != FILE_TYPE_Y4M) {
         aom_img_alloc(&raw, input.fmt, input.width, input.height, 32);
-
+      }
       FOREACH_STREAM(stream, streams) {
         stream->rate_hist =
             init_rate_histogram(&stream->config.cfg, &global.framerate);
@@ -2521,21 +2367,41 @@ int main(int argc, const char **argv_) {
     FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
     FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
     FOREACH_STREAM(stream, streams) {
-      open_output_file(stream, &global, &input.pixel_aspect_ratio);
+      char *encoder_settings = NULL;
+#if CONFIG_WEBM_IO
+      // Test frameworks may compare outputs from different versions, but only
+      // wish to check for bitstream changes. The encoder-settings tag, however,
+      // can vary if the version is updated, even if no encoder algorithm
+      // changes were made. To work around this issue, do not output
+      // the encoder-settings tag when --debug is enabled (which is the flag
+      // that test frameworks should use, when they want deterministic output
+      // from the container format).
+      if (stream->config.write_webm && !stream->webm_ctx.debug) {
+        encoder_settings = extract_encoder_settings(
+            aom_codec_version_str(), argv_, argc, input.filename);
+        if (encoder_settings == NULL) {
+          fprintf(
+              stderr,
+              "Warning: unable to extract encoder settings. Continuing...\n");
+        }
+      }
+#endif
+      open_output_file(stream, &global, &input.pixel_aspect_ratio,
+                       encoder_settings);
+      free(encoder_settings);
     }
 
-    if (strcmp(global.codec->name, "av1") == 0 ||
-        strcmp(global.codec->name, "av1") == 0) {
+    if (strcmp(get_short_name_by_aom_encoder(global.codec), "av1") == 0) {
       // Check to see if at least one stream uses 16 bit internal.
       // Currently assume that the bit_depths for all streams using
       // highbitdepth are the same.
       FOREACH_STREAM(stream, streams) {
         if (stream->config.use_16bit_internal) {
-          use_16bit_internal = 1;
+          do_16bit_internal = 1;
         }
         input_shift = (int)stream->config.cfg.g_bit_depth -
                       stream->config.cfg.g_input_bit_depth;
-      };
+      }
     }
 
     frame_avail = 1;
@@ -2566,6 +2432,11 @@ int main(int argc, const char **argv_) {
                   cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60,
                   fps >= 1.0 ? "fps" : "fpm");
           print_time("ETA", estimated_time_left);
+          // mingw-w64 gcc does not match msvc for stderr buffering behavior
+          // and uses line buffering, thus the progress output is not
+          // real-time. The fflush() is here to make sure the progress output
+          // is sent out while the clip is being processed.
+          fflush(stderr);
         }
 
       } else {
@@ -2574,8 +2445,8 @@ int main(int argc, const char **argv_) {
 
       if (frames_in > global.skip_frames) {
         aom_image_t *frame_to_encode;
-        if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
-          assert(use_16bit_internal);
+        if (input_shift || (do_16bit_internal && input.bit_depth == 8)) {
+          assert(do_16bit_internal);
           // Input bit depth and stream bit depth do not match, so up
           // shift frame to stream bit depth
           if (!allocated_raw_shift) {
@@ -2589,7 +2460,7 @@ int main(int argc, const char **argv_) {
           frame_to_encode = &raw;
         }
         aom_usec_timer_start(&timer);
-        if (use_16bit_internal) {
+        if (do_16bit_internal) {
           assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
           FOREACH_STREAM(stream, streams) {
             if (stream->config.use_16bit_internal)
@@ -2597,7 +2468,7 @@ int main(int argc, const char **argv_) {
                            frame_avail ? frame_to_encode : NULL, frames_in);
             else
               assert(0);
-          };
+          }
         } else {
           assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0);
           FOREACH_STREAM(stream, streams) {
@@ -2671,25 +2542,58 @@ int main(int argc, const char **argv_) {
                                           : stream->cx_time,
                 stream->cx_time > 9999999 ? "ms" : "us",
                 usec_to_fps(stream->cx_time, seen_frames));
+        // This instance of cr does not need fflush as it is followed by a
+        // newline in the same string.
       }
     }
 
-    if (global.show_psnr) {
-      if (global.codec->fourcc == AV1_FOURCC) {
+    if (global.show_psnr >= 1) {
+      if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) {
         FOREACH_STREAM(stream, streams) {
           int64_t bps = 0;
-          if (stream->psnr_count && seen_frames && global.framerate.den) {
-            bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num /
-                  global.framerate.den / seen_frames;
+          if (global.show_psnr == 1) {
+            if (stream->psnr_count[0] && seen_frames && global.framerate.den) {
+              bps = (int64_t)stream->nbytes * 8 *
+                    (int64_t)global.framerate.num / global.framerate.den /
+                    seen_frames;
+            }
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
+                      bps);
+          }
+          if (global.show_psnr == 2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+            if (stream->config.cfg.g_input_bit_depth <
+                (unsigned int)stream->config.cfg.g_bit_depth)
+              show_psnr_hbd(stream, (1 << stream->config.cfg.g_bit_depth) - 1,
+                            bps);
+#endif
           }
-          show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
-                    bps);
         }
       } else {
         FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); }
       }
     }
 
+    if (pass == global.passes - 1) {
+      FOREACH_STREAM(stream, streams) {
+        int levels[32] = { 0 };
+        int target_levels[32] = { 0 };
+        aom_codec_control(&stream->encoder, AV1E_GET_SEQ_LEVEL_IDX, levels);
+        aom_codec_control(&stream->encoder, AV1E_GET_TARGET_SEQ_LEVEL_IDX,
+                          target_levels);
+
+        for (int i = 0; i < 32; i++) {
+          if (levels[i] > target_levels[i]) {
+            aom_tools_warn(
+                "Failed to encode to target level %d.%d for operating point "
+                "%d. The output level is %d.%d",
+                2 + (target_levels[i] >> 2), target_levels[i] & 3, i,
+                2 + (levels[i] >> 2), levels[i] & 3);
+          }
+        }
+      }
+    }
+
     FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); }
 
     if (global.test_decode != TEST_DECODE_OFF) {
@@ -2702,7 +2606,7 @@ int main(int argc, const char **argv_) {
       FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; }
     }
     FOREACH_STREAM(stream, streams) {
-      close_output_file(stream, global.codec->fourcc);
+      close_output_file(stream, get_fourcc_by_aom_encoder(global.codec));
     }
 
     FOREACH_STREAM(stream, streams) {
diff --git a/media/libaom/src/apps/aomenc.h b/media/libaom/src/apps/aomenc.h
index a38258b872..935d5fcd16 100644
--- a/media/libaom/src/apps/aomenc.h
+++ b/media/libaom/src/apps/aomenc.h
@@ -13,29 +13,23 @@
 
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
+#include "av1/arg_defs.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-enum TestDecodeFatality {
-  TEST_DECODE_OFF,
-  TEST_DECODE_FATAL,
-  TEST_DECODE_WARN,
-};
-
 typedef enum {
   I420,  // 4:2:0 8+ bit-depth
   I422,  // 4:2:2 8+ bit-depth
   I444,  // 4:4:4 8+ bit-depth
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
+  NV12,  // 4:2:0 with uv interleaved, only 8-bit depth
 } ColorInputType;
 
-struct AvxInterface;
-
 /* Configuration elements common to all streams. */
 struct AvxEncoderConfig {
-  const struct AvxInterface *codec;
+  aom_codec_iface_t *codec;
   int passes;
   int pass;
   unsigned int usage;
diff --git a/media/libaom/src/av1/arg_defs.c b/media/libaom/src/av1/arg_defs.c
new file mode 100644
index 0000000000..c517818754
--- /dev/null
+++ b/media/libaom/src/av1/arg_defs.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/arg_defs.h"
+
+static const struct arg_enum_list test_decode_enum[] = {
+  { "off", TEST_DECODE_OFF },
+  { "fatal", TEST_DECODE_FATAL },
+  { "warn", TEST_DECODE_WARN },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list bitdepth_enum[] = {
+  { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
+};
+
+#if CONFIG_WEBM_IO
+static const struct arg_enum_list stereo_mode_enum[] = {
+  { "mono", STEREO_FORMAT_MONO },
+  { "left-right", STEREO_FORMAT_LEFT_RIGHT },
+  { "bottom-top", STEREO_FORMAT_BOTTOM_TOP },
+  { "top-bottom", STEREO_FORMAT_TOP_BOTTOM },
+  { "right-left", STEREO_FORMAT_RIGHT_LEFT },
+  { NULL, 0 }
+};
+#endif
+
+static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR },
+                                                       { "cbr", AOM_CBR },
+                                                       { "cq", AOM_CQ },
+                                                       { "q", AOM_Q },
+                                                       { NULL, 0 } };
+
+static const struct arg_enum_list tuning_enum[] = {
+  { "psnr", AOM_TUNE_PSNR },
+  { "ssim", AOM_TUNE_SSIM },
+  { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING },
+  { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING },
+  { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
+  { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN },
+  { "butteraugli", AOM_TUNE_BUTTERAUGLI },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list dist_metric_enum[] = {
+  { "psnr", AOM_DIST_METRIC_PSNR },
+  { "qm-psnr", AOM_DIST_METRIC_QM_PSNR },
+  { NULL, 0 }
+};
+
+#if CONFIG_AV1_ENCODER
+static const struct arg_enum_list timing_info_enum[] = {
+  { "unspecified", AOM_TIMING_UNSPECIFIED },
+  { "constant", AOM_TIMING_EQUAL },
+  { "model", AOM_TIMING_DEC_MODEL },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list superblock_size_enum[] = {
+  { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
+  { "64", AOM_SUPERBLOCK_SIZE_64X64 },
+  { "128", AOM_SUPERBLOCK_SIZE_128X128 },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list matrix_coefficients_enum[] = {
+  { "identity", AOM_CICP_MC_IDENTITY },
+  { "bt709", AOM_CICP_MC_BT_709 },
+  { "unspecified", AOM_CICP_MC_UNSPECIFIED },
+  { "fcc73", AOM_CICP_MC_FCC },
+  { "bt470bg", AOM_CICP_MC_BT_470_B_G },
+  { "bt601", AOM_CICP_MC_BT_601 },
+  { "smpte240", AOM_CICP_CP_SMPTE_240 },
+  { "ycgco", AOM_CICP_MC_SMPTE_YCGCO },
+  { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL },
+  { "bt2020cl", AOM_CICP_MC_BT_2020_CL },
+  { "smpte2085", AOM_CICP_MC_SMPTE_2085 },
+  { "chromncl", AOM_CICP_MC_CHROMAT_NCL },
+  { "chromcl", AOM_CICP_MC_CHROMAT_CL },
+  { "ictcp", AOM_CICP_MC_ICTCP },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list chroma_sample_position_enum[] = {
+  { "unknown", AOM_CSP_UNKNOWN },
+  { "vertical", AOM_CSP_VERTICAL },
+  { "colocated", AOM_CSP_COLOCATED },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list tune_content_enum[] = {
+  { "default", AOM_CONTENT_DEFAULT },
+  { "screen", AOM_CONTENT_SCREEN },
+  { "film", AOM_CONTENT_FILM },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list transfer_characteristics_enum[] = {
+  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+  { "bt709", AOM_CICP_TC_BT_709 },
+  { "bt470m", AOM_CICP_TC_BT_470_M },
+  { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+  { "bt601", AOM_CICP_TC_BT_601 },
+  { "smpte240", AOM_CICP_TC_SMPTE_240 },
+  { "lin", AOM_CICP_TC_LINEAR },
+  { "log100", AOM_CICP_TC_LOG_100 },
+  { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 },
+  { "iec61966", AOM_CICP_TC_IEC_61966 },
+  { "bt1361", AOM_CICP_TC_BT_1361 },
+  { "srgb", AOM_CICP_TC_SRGB },
+  { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT },
+  { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT },
+  { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+  { "hlg", AOM_CICP_TC_HLG },
+  { "smpte428", AOM_CICP_TC_SMPTE_428 },
+  { NULL, 0 }
+};
+
+static const struct arg_enum_list color_primaries_enum[] = {
+  { "bt709", AOM_CICP_CP_BT_709 },
+  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+  { "bt601", AOM_CICP_CP_BT_601 },
+  { "bt470m", AOM_CICP_CP_BT_470_M },
+  { "bt470bg", AOM_CICP_CP_BT_470_B_G },
+  { "smpte240", AOM_CICP_CP_SMPTE_240 },
+  { "film", AOM_CICP_CP_GENERIC_FILM },
+  { "bt2020", AOM_CICP_CP_BT_2020 },
+  { "xyz", AOM_CICP_CP_XYZ },
+  { "smpte431", AOM_CICP_CP_SMPTE_431 },
+  { "smpte432", AOM_CICP_CP_SMPTE_432 },
+  { "ebu3213", AOM_CICP_CP_EBU_3213 },
+  { NULL, 0 }
+};
+#endif  // CONFIG_AV1_ENCODER
+
+const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
+  .help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"),
+  .debugmode =
+      ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"),
+  .outputfile = ARG_DEF("o", "output", 1, "Output filename"),
+  .use_nv12 = ARG_DEF(NULL, "nv12", 0, "Input file is NV12"),
+  .use_yv12 = ARG_DEF(NULL, "yv12", 0, "Input file is YV12"),
+  .use_i420 = ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"),
+  .use_i422 = ARG_DEF(NULL, "i422", 0, "Input file is I422"),
+  .use_i444 = ARG_DEF(NULL, "i444", 0, "Input file is I444"),
+  .codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"),
+  .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2/3)"),
+  .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2/3)"),
+  .fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"),
+  .limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"),
+  .skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"),
+  .good_dl = ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"),
+  .rt_dl = ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"),
+  .ai_dl = ARG_DEF(NULL, "allintra", 0, "Use all intra mode"),
+  .quietarg = ARG_DEF("q", "quiet", 0, "Do not print encode progress"),
+  .verbosearg = ARG_DEF("v", "verbose", 0, "Show encoder parameters"),
+  .psnrarg = ARG_DEF(
+      NULL, "psnr", -1,
+      "Show PSNR in status line "
+      "(0: Disable PSNR status line display, 1: PSNR calculated using input "
+      "bit-depth (default), 2: PSNR calculated using stream bit-depth); "
+      "takes default option when arguments are not specified"),
+  .use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"),
+  .recontest = ARG_DEF_ENUM(NULL, "test-decode", 1,
+                            "Test encode/decode mismatch", test_decode_enum),
+  .framerate = ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"),
+  .use_webm =
+      ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"),
+  .use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"),
+  .use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"),
+  .q_hist_n =
+      ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"),
+  .rate_hist_n =
+      ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"),
+  .disable_warnings =
+      ARG_DEF(NULL, "disable-warnings", 0,
+              "Disable warnings about potentially incorrect encode settings"),
+  .disable_warning_prompt =
+      ARG_DEF("y", "disable-warning-prompt", 0,
+              "Display warnings, but do not prompt user to continue"),
+  .bitdeptharg =
+      ARG_DEF_ENUM("b", "bit-depth", 1, "Bit depth for codec", bitdepth_enum),
+  .inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"),
+
+  .input_chroma_subsampling_x = ARG_DEF(NULL, "input-chroma-subsampling-x", 1,
+                                        "Chroma subsampling x value"),
+  .input_chroma_subsampling_y = ARG_DEF(NULL, "input-chroma-subsampling-y", 1,
+                                        "Chroma subsampling y value"),
+
+  .usage = ARG_DEF("u", "usage", 1,
+                   "Usage profile number to use (0: good, 1: rt, 2: allintra)"),
+  .threads = ARG_DEF("t", "threads", 1, "Max number of threads to use"),
+  .profile = ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"),
+  .width = ARG_DEF("w", "width", 1, "Frame width"),
+  .height = ARG_DEF("h", "height", 1, "Frame height"),
+  .forced_max_frame_width = ARG_DEF(NULL, "forced_max_frame_width", 1,
+                                    "Maximum frame width value to force"),
+  .forced_max_frame_height = ARG_DEF(NULL, "forced_max_frame_height", 1,
+                                     "Maximum frame height value to force"),
+#if CONFIG_WEBM_IO
+  .stereo_mode = ARG_DEF_ENUM(NULL, "stereo-mode", 1, "Stereo 3D video format",
+                              stereo_mode_enum),
+#endif
+  .timebase = ARG_DEF(NULL, "timebase", 1,
+                      "Output timestamp precision (fractional seconds)"),
+  .global_error_resilient = ARG_DEF(NULL, "global-error-resilient", 1,
+                                    "Enable global error resiliency features"),
+  .lag_in_frames =
+      ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"),
+  .large_scale_tile = ARG_DEF(
+      NULL, "large-scale-tile", 1,
+      "Large scale tile coding (0: off (default), 1: on (ivf output only))"),
+  .monochrome =
+      ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"),
+  .full_still_picture_hdr = ARG_DEF(NULL, "full-still-picture-hdr", 0,
+                                    "Use full header for still picture"),
+  .use_16bit_internal =
+      ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline"),
+  .dropframe_thresh =
+      ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"),
+  .resize_mode = ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode"),
+  .resize_denominator =
+      ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"),
+  .resize_kf_denominator = ARG_DEF(NULL, "resize-kf-denominator", 1,
+                                   "Frame resize keyframe denominator"),
+  .superres_mode =
+      ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode"),
+  .superres_denominator = ARG_DEF(NULL, "superres-denominator", 1,
+                                  "Frame super-resolution denominator"),
+  .superres_kf_denominator =
+      ARG_DEF(NULL, "superres-kf-denominator", 1,
+              "Frame super-resolution keyframe denominator"),
+  .superres_qthresh = ARG_DEF(NULL, "superres-qthresh", 1,
+                              "Frame super-resolution qindex threshold"),
+  .superres_kf_qthresh =
+      ARG_DEF(NULL, "superres-kf-qthresh", 1,
+              "Frame super-resolution keyframe qindex threshold"),
+  .end_usage =
+      ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum),
+  .target_bitrate = ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"),
+  .min_quantizer = ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"),
+  .max_quantizer = ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"),
+  .undershoot_pct = ARG_DEF(NULL, "undershoot-pct", 1,
+                            "Datarate undershoot (min) target (%)"),
+  .overshoot_pct =
+      ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"),
+  .buf_sz = ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"),
+  .buf_initial_sz =
+      ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"),
+  .buf_optimal_sz =
+      ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"),
+  .bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"),
+  .minsection_pct =
+      ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"),
+  .maxsection_pct =
+      ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"),
+  .fwd_kf_enabled =
+      ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes"),
+  .kf_min_dist =
+      ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"),
+  .kf_max_dist =
+      ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"),
+  .kf_disabled = ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"),
+  .sframe_dist = ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)"),
+  .sframe_mode =
+      ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"),
+  .save_as_annexb = ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"),
+  .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
+                        "Noise sensitivity (frames to blur)"),
+  .sharpness = ARG_DEF(NULL, "sharpness", 1,
+                       "Bias towards block sharpness in rate-distortion "
+                       "optimization of transform coefficients "
+                       "(0..7), default is 0"),
+  .static_thresh =
+      ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"),
+  .auto_altref =
+      ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"),
+  .arnr_maxframes =
+      ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"),
+  .arnr_strength =
+      ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"),
+  .tune_metric = ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with",
+                              tuning_enum),
+  .dist_metric = ARG_DEF_ENUM(
+      NULL, "dist-metric", 1,
+      "Distortion metric to use for in-block optimization", dist_metric_enum),
+  .cq_level =
+      ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"),
+  .max_intra_rate_pct =
+      ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"),
+#if CONFIG_AV1_ENCODER
+  .cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1,
+                          "Speed setting (0..6 in good mode, 5..10 in realtime "
+                          "mode, 0..9 in all intra mode)"),
+  .rowmtarg =
+      ARG_DEF(NULL, "row-mt", 1,
+              "Enable row based multi-threading (0: off, 1: on (default))"),
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  .fpmtarg = ARG_DEF(
+      NULL, "fp-mt", 1,
+      "Enable frame parallel multi-threading (0: off (default), 1: on)"),
+#endif
+  .tile_cols =
+      ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"),
+  .tile_rows =
+      ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"),
+  .enable_tpl_model = ARG_DEF(NULL, "enable-tpl-model", 1,
+                              "RDO based on frame temporal dependency "
+                              "(0: off, 1: backward source based); "
+                              "required for deltaq mode"),
+  .enable_keyframe_filtering = ARG_DEF(
+      NULL, "enable-keyframe-filtering", 1,
+      "Apply temporal filtering on key frame "
+      "(0: no filter, 1: filter without overlay (default), "
+      "2: filter with overlay - experimental, may break random access in "
+      "players)"),
+  .tile_width = ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"),
+  .tile_height =
+      ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)"),
+  .lossless = ARG_DEF(NULL, "lossless", 1,
+                      "Lossless mode (0: false (default), 1: true)"),
+  .enable_cdef = ARG_DEF(
+      NULL, "enable-cdef", 1,
+      "Enable the constrained directional enhancement filter (0: false, "
+      "1: true (default), 2: disable for non-reference frames)"),
+  .enable_restoration = ARG_DEF(NULL, "enable-restoration", 1,
+                                "Enable the loop restoration filter (0: false "
+                                "(default in realtime mode), "
+                                "1: true (default in non-realtime mode))"),
+  .enable_rect_partitions = ARG_DEF(NULL, "enable-rect-partitions", 1,
+                                    "Enable rectangular partitions "
+                                    "(0: false, 1: true (default))"),
+  .enable_ab_partitions =
+      ARG_DEF(NULL, "enable-ab-partitions", 1,
+              "Enable ab partitions (0: false, 1: true (default))"),
+  .enable_1to4_partitions = ARG_DEF(NULL, "enable-1to4-partitions", 1,
+                                    "Enable 1:4 and 4:1 partitions "
+                                    "(0: false, 1: true (default))"),
+  .min_partition_size =
+      ARG_DEF(NULL, "min-partition-size", 1,
+              "Set min partition size "
+              "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128); "
+              "with 4k+ resolutions or higher speed settings, min "
+              "partition size will have a minimum of 8"),
+  .max_partition_size =
+      ARG_DEF(NULL, "max-partition-size", 1,
+              "Set max partition size "
+              "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"),
+  .enable_dual_filter = ARG_DEF(NULL, "enable-dual-filter", 1,
+                                "Enable dual filter "
+                                "(0: false, 1: true (default))"),
+  .enable_chroma_deltaq = ARG_DEF(NULL, "enable-chroma-deltaq", 1,
+                                  "Enable chroma delta quant "
+                                  "(0: false (default), 1: true)"),
+  .enable_intra_edge_filter = ARG_DEF(NULL, "enable-intra-edge-filter", 1,
+                                      "Enable intra edge filtering "
+                                      "(0: false, 1: true (default))"),
+  .enable_order_hint = ARG_DEF(NULL, "enable-order-hint", 1,
+                               "Enable order hint "
+                               "(0: false, 1: true (default))"),
+  .enable_tx64 =
+      ARG_DEF(NULL, "enable-tx64", 1,
+              "Enable 64-pt transform (0: false, 1: true (default))"),
+  .enable_flip_idtx =
+      ARG_DEF(NULL, "enable-flip-idtx", 1,
+              "Enable extended transform type (0: false, 1: true (default)) "
+              "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
+              "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
+              "H_ADST, V_FLIPADST, H_FLIPADST"),
+  .enable_rect_tx =
+      ARG_DEF(NULL, "enable-rect-tx", 1,
+              "Enable rectangular transform (0: false, 1: true (default))"),
+  .enable_dist_wtd_comp = ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
+                                  "Enable distance-weighted compound "
+                                  "(0: false, 1: true (default))"),
+  .enable_masked_comp = ARG_DEF(NULL, "enable-masked-comp", 1,
+                                "Enable masked (wedge/diff-wtd) compound "
+                                "(0: false, 1: true (default))"),
+  .enable_onesided_comp = ARG_DEF(NULL, "enable-onesided-comp", 1,
+                                  "Enable one sided compound "
+                                  "(0: false, 1: true (default))"),
+  .enable_interintra_comp = ARG_DEF(NULL, "enable-interintra-comp", 1,
+                                    "Enable interintra compound "
+                                    "(0: false, 1: true (default))"),
+  .enable_smooth_interintra = ARG_DEF(NULL, "enable-smooth-interintra", 1,
+                                      "Enable smooth interintra mode "
+                                      "(0: false, 1: true (default))"),
+  .enable_diff_wtd_comp = ARG_DEF(NULL, "enable-diff-wtd-comp", 1,
+                                  "Enable difference-weighted compound "
+                                  "(0: false, 1: true (default))"),
+  .enable_interinter_wedge = ARG_DEF(NULL, "enable-interinter-wedge", 1,
+                                     "Enable interinter wedge compound "
+                                     "(0: false, 1: true (default))"),
+  .enable_interintra_wedge = ARG_DEF(NULL, "enable-interintra-wedge", 1,
+                                     "Enable interintra wedge compound "
+                                     "(0: false, 1: true (default))"),
+  .enable_global_motion = ARG_DEF(NULL, "enable-global-motion", 1,
+                                  "Enable global motion "
+                                  "(0: false, 1: true (default))"),
+  .enable_warped_motion = ARG_DEF(NULL, "enable-warped-motion", 1,
+                                  "Enable local warped motion "
+                                  "(0: false, 1: true (default))"),
+  .enable_filter_intra = ARG_DEF(NULL, "enable-filter-intra", 1,
+                                 "Enable filter intra prediction mode "
+                                 "(0: false, 1: true (default))"),
+  .enable_smooth_intra = ARG_DEF(NULL, "enable-smooth-intra", 1,
+                                 "Enable smooth intra prediction modes "
+                                 "(0: false, 1: true (default))"),
+  .enable_paeth_intra = ARG_DEF(
+      NULL, "enable-paeth-intra", 1,
+      "Enable Paeth intra prediction mode (0: false, 1: true (default))"),
+  .enable_cfl_intra = ARG_DEF(NULL, "enable-cfl-intra", 1,
+                              "Enable chroma from luma intra prediction mode "
+                              "(0: false, 1: true (default))"),
+  .enable_directional_intra =
+      ARG_DEF(NULL, "enable-directional-intra", 1,
+              "Enable directional intra prediction modes "
+              "(0: false, 1: true (default))"),
+  .enable_diagonal_intra =
+      ARG_DEF(NULL, "enable-diagonal-intra", 1,
+              "Enable diagonal (D45 to D203) intra prediction modes, which are "
+              "a subset of directional modes; has no effect if "
+              "enable-directional-intra is 0 (0: false, 1: true (default))"),
+  .force_video_mode = ARG_DEF(NULL, "force-video-mode", 1,
+                              "Force video mode (0: false, 1: true (default))"),
+  .enable_obmc = ARG_DEF(NULL, "enable-obmc", 1,
+                         "Enable OBMC (0: false, 1: true (default))"),
+  .enable_overlay =
+      ARG_DEF(NULL, "enable-overlay", 1,
+              "Enable coding overlay frames (0: false, 1: true (default))"),
+  .enable_palette =
+      ARG_DEF(NULL, "enable-palette", 1,
+              "Enable palette prediction mode (0: false, 1: true (default))"),
+  .enable_intrabc = ARG_DEF(NULL, "enable-intrabc", 1,
+                            "Enable intra block copy prediction mode "
+                            "(0: false, 1: true (default))"),
+  .enable_angle_delta =
+      ARG_DEF(NULL, "enable-angle-delta", 1,
+              "Enable intra angle delta (0: false, 1: true (default))"),
+  .disable_trellis_quant = ARG_DEF(
+      NULL, "disable-trellis-quant", 1,
+      "Disable trellis optimization of quantized coefficients (0: false "
+      "1: true  2: true for rd search 3: true for estimate yrd search "
+      "(default))"),
+  .enable_qm =
+      ARG_DEF(NULL, "enable-qm", 1,
+              "Enable quantisation matrices (0: false (default), 1: true)"),
+  .qm_min = ARG_DEF(NULL, "qm-min", 1,
+                    "Min quant matrix flatness (0..15), default is 8"),
+  .qm_max = ARG_DEF(NULL, "qm-max", 1,
+                    "Max quant matrix flatness (0..15), default is 15"),
+  .reduced_tx_type_set = ARG_DEF(NULL, "reduced-tx-type-set", 1,
+                                 "Use reduced set of transform types"),
+  .use_intra_dct_only =
+      ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"),
+  .use_inter_dct_only =
+      ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"),
+  .use_intra_default_tx_only =
+      ARG_DEF(NULL, "use-intra-default-tx-only", 1,
+              "Use Default-transform only for INTRA modes"),
+  .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"),
+  .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
+                                 "Update freq for coeff costs. "
+                                 "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+  .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1,
+                                "Update freq for mode costs. "
+                                "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+  .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1,
+                              "Update freq for mv costs. "
+                              "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+  .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1,
+                              "Update freq for dv costs. "
+                              "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+  .num_tg = ARG_DEF(NULL, "num-tile-groups", 1,
+                    "Maximum number of tile groups, default is 1"),
+  .mtu_size =
+      ARG_DEF(NULL, "mtu-size", 1,
+              "MTU size for a tile group, default is 0 (no MTU targeting), "
+              "overrides maximum number of tile groups"),
+  .timing_info = ARG_DEF_ENUM(
+      NULL, "timing-info", 1,
+      "Signal timing info in the bitstream (model only works for no "
+      "hidden frames, no super-res yet):",
+      timing_info_enum),
+#if CONFIG_TUNE_VMAF
+  .vmaf_model_path =
+      ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"),
+#endif
+  .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1,
+                                 "Partition information read and write path"),
+  .film_grain_test = ARG_DEF(
+      NULL, "film-grain-test", 1,
+      "Film grain test vectors (0: none (default), 1: test-1  2: test-2, "
+      "... 16: test-16)"),
+  .film_grain_table = ARG_DEF(NULL, "film-grain-table", 1,
+                              "Path to file containing film grain parameters"),
+#if CONFIG_DENOISE
+  .denoise_noise_level =
+      ARG_DEF(NULL, "denoise-noise-level", 1,
+              "Amount of noise (from 0 = don't denoise, to 50)"),
+  .denoise_block_size = ARG_DEF(NULL, "denoise-block-size", 1,
+                                "Denoise block size (default = 32)"),
+  .enable_dnl_denoising = ARG_DEF(NULL, "enable-dnl-denoising", 1,
+                                  "Apply denoising to the frame "
+                                  "being encoded when denoise-noise-level is "
+                                  "enabled (0: false, 1: true (default))"),
+#endif
+  .enable_ref_frame_mvs =
+      ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
+              "Enable temporal mv prediction (default is 1)"),
+  .frame_parallel_decoding =
+      ARG_DEF(NULL, "frame-parallel", 1,
+              "Enable frame parallel decodability features "
+              "(0: false (default), 1: true)"),
+  .error_resilient_mode = ARG_DEF(NULL, "error-resilient", 1,
+                                  "Enable error resilient features "
+                                  "(0: false (default), 1: true)"),
+  .aq_mode = ARG_DEF(NULL, "aq-mode", 1,
+                     "Adaptive quantization mode (0: off (default), 1: "
+                     "variance 2: complexity, "
+                     "3: cyclic refresh)"),
+  .deltaq_mode =
+      ARG_DEF(NULL, "deltaq-mode", 1,
+              "Delta qindex mode (0: off, 1: deltaq objective (default), "
+              "2: deltaq placeholder, 3: key frame visual quality, 4: user "
+              "rating based visual quality optimization); "
+              "requires --enable-tpl-model=1"),
+  .deltaq_strength = ARG_DEF(NULL, "deltaq-strength", 1,
+                             "Deltaq strength for"
+                             " --deltaq-mode=4 (%)"),
+  .deltalf_mode = ARG_DEF(NULL, "delta-lf-mode", 1,
+                          "Enable delta-lf-mode (0: off (default), 1: on)"),
+  .frame_periodic_boost =
+      ARG_DEF(NULL, "frame-boost", 1,
+              "Enable frame periodic boost (0: off (default), 1: on)"),
+  .gf_cbr_boost_pct = ARG_DEF(NULL, "gf-cbr-boost", 1,
+                              "Boost for Golden Frame in CBR mode (pct)"),
+  .max_inter_rate_pct =
+      ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"),
+  .min_gf_interval = ARG_DEF(
+      NULL, "min-gf-interval", 1,
+      "Min gf/arf frame interval (default 0, indicating in-built behavior)"),
+  .max_gf_interval = ARG_DEF(
+      NULL, "max-gf-interval", 1,
+      "Max gf/arf frame interval (default 0, indicating in-built behavior)"),
+  .gf_min_pyr_height =
+      ARG_DEF(NULL, "gf-min-pyr-height", 1,
+              "Min height for GF group pyramid structure (0 (default) to 5)"),
+  .gf_max_pyr_height = ARG_DEF(
+      NULL, "gf-max-pyr-height", 1,
+      "Maximum height for GF group pyramid structure (0 to 5 (default))"),
+  .max_reference_frames = ARG_DEF(NULL, "max-reference-frames", 1,
+                                  "Maximum number of reference frames allowed "
+                                  "per frame (3 to 7 (default))"),
+  .reduced_reference_set =
+      ARG_DEF(NULL, "reduced-reference-set", 1,
+              "Use reduced set of single and compound references (0: off "
+              "(default), 1: on)"),
+  .target_seq_level_idx =
+      ARG_DEF(NULL, "target-seq-level-idx", 1,
+              "Target sequence level index. "
+              "Possible values are in the form of \"ABxy\". "
+              "AB: Operating point (OP) index, "
+              "xy: Target level index for the OP. "
+              "E.g. \"0\" means target level index 0 (2.0) for the 0th OP, "
+              "\"1019\" means target level index 19 (6.3) for the 10th OP."),
+  .set_min_cr = ARG_DEF(
+      NULL, "min-cr", 1,
+      "Set minimum compression ratio. Take integer values. Default is 0. "
+      "If non-zero, encoder will try to keep the compression ratio of "
+      "each frame to be higher than the given value divided by 100."),
+
+  .input_color_primaries = ARG_DEF_ENUM(
+      NULL, "color-primaries", 1,
+      "Color primaries (CICP) of input content:", color_primaries_enum),
+
+  .input_transfer_characteristics =
+      ARG_DEF_ENUM(NULL, "transfer-characteristics", 1,
+                   "Transfer characteristics (CICP) of input content:",
+                   transfer_characteristics_enum),
+
+  .input_matrix_coefficients = ARG_DEF_ENUM(
+      NULL, "matrix-coefficients", 1,
+      "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum),
+
+  .input_chroma_sample_position =
+      ARG_DEF_ENUM(NULL, "chroma-sample-position", 1,
+                   "The chroma sample position when chroma 4:2:0 is signaled:",
+                   chroma_sample_position_enum),
+
+  .tune_content = ARG_DEF_ENUM(NULL, "tune-content", 1, "Tune content type",
+                               tune_content_enum),
+
+  .cdf_update_mode =
+      ARG_DEF(NULL, "cdf-update-mode", 1,
+              "CDF update mode for entropy coding "
+              "(0: no CDF update, 1: update CDF on all frames (default), "
+              "2: selectively update CDF on some frames)"),
+
+  .superblock_size = ARG_DEF_ENUM(NULL, "sb-size", 1, "Superblock size to use",
+                                  superblock_size_enum),
+
+  .set_tier_mask =
+      ARG_DEF(NULL, "set-tier-mask", 1,
+              "Set bit mask to specify which tier each of the 32 possible "
+              "operating points conforms to. "
+              "Bit value 0 (default): Main Tier, 1: High Tier."),
+
+  .use_fixed_qp_offsets =
+      ARG_DEF(NULL, "use-fixed-qp-offsets", 1,
+              "Enable fixed QP offsets for frames at different levels of the "
+              "pyramid. Selected automatically from --cq-level if "
+              "--fixed-qp-offsets is not provided. If this option is not "
+              "specified (default), offsets are adaptively chosen by the "
+              "encoder."),
+
+  .fixed_qp_offsets = ARG_DEF(
+      NULL, "fixed-qp-offsets", 1,
+      "Set fixed QP offsets for frames at different levels of the "
+      "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, "
+      "and 3 levels of internal alt-refs. If this option is not "
+      "specified (default), offsets are adaptively chosen by the "
+      "encoder."),
+
+  .vbr_corpus_complexity_lap = ARG_DEF(
+      NULL, "vbr-corpus-complexity-lap", 1,
+      "Set average corpus complexity per mb for single pass VBR using lap. "
+      "(0..10000), default is 0"),
+
+  .fwd_kf_dist = ARG_DEF(NULL, "fwd-kf-dist", -1,
+                         "Set distance between forward keyframes. A value of "
+                         "-1 (default) means no repetitive forward keyframes."),
+
+  .enable_tx_size_search = ARG_DEF(
+      NULL, "enable-tx-size-search", 1,
+      "Enable transform size search to find the best size for each block. "
+      "If false, transforms always have the largest possible size "
+      "(0: false, 1: true (default))"),
+
+  .loopfilter_control = ARG_DEF(
+      NULL, "loopfilter-control", 1,
+      "Control loop filtering "
+      "(0: Loopfilter disabled for all frames, 1: Enable loopfilter for all "
+      "frames (default), 2: Disable loopfilter for non-reference frames, 3: "
+      "Disable loopfilter for frames with low motion)"),
+
+  .auto_intra_tools_off = ARG_DEF(
+      NULL, "auto-intra-tools-off", 1,
+      "Automatically turn off several intra coding tools for allintra mode; "
+      "only in effect if --deltaq-mode=3"),
+
+  .two_pass_input =
+      ARG_DEF(NULL, "two-pass-input", 1,
+              "The input file for the second pass for three-pass encoding"),
+  .two_pass_output = ARG_DEF(
+      NULL, "two-pass-output", 1,
+      "The output file for the first two passes for three-pass encoding"),
+  .two_pass_width =
+      ARG_DEF(NULL, "two-pass-width", 1, "The width of two-pass-input"),
+  .two_pass_height =
+      ARG_DEF(NULL, "two-pass-height", 1, "The height of two-pass-input"),
+  .second_pass_log =
+      ARG_DEF("spf", "second-pass-log", 1, "Log file from second pass"),
+  .strict_level_conformance =
+      ARG_DEF(NULL, "strict-level-conformance", 1,
+              "When set to 1, exit the encoder when it fails to encode "
+              "to a given target level"),
+#endif  // CONFIG_AV1_ENCODER
+};
diff --git a/media/libaom/src/av1/arg_defs.h b/media/libaom/src/av1/arg_defs.h
new file mode 100644
index 0000000000..cfd269e8dd
--- /dev/null
+++ b/media/libaom/src/av1/arg_defs.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ARG_DEFS_H_
+#define AOM_AV1_ARG_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+#include "common/args_helper.h"
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+#include "aom/aomcx.h"
+
+enum TestDecodeFatality {
+  TEST_DECODE_OFF,
+  TEST_DECODE_FATAL,
+  TEST_DECODE_WARN,
+};
+
+typedef struct av1_codec_arg_definitions {
+  arg_def_t help;
+  arg_def_t debugmode;
+  arg_def_t outputfile;
+  arg_def_t use_nv12;
+  arg_def_t use_yv12;
+  arg_def_t use_i420;
+  arg_def_t use_i422;
+  arg_def_t use_i444;
+  arg_def_t codecarg;
+  arg_def_t passes;
+  arg_def_t pass_arg;
+  arg_def_t fpf_name;
+  arg_def_t limit;
+  arg_def_t skip;
+  arg_def_t good_dl;
+  arg_def_t rt_dl;
+  arg_def_t ai_dl;
+  arg_def_t quietarg;
+  arg_def_t verbosearg;
+  arg_def_t psnrarg;
+  arg_def_t use_cfg;
+  arg_def_t recontest;
+  arg_def_t framerate;
+  arg_def_t use_webm;
+  arg_def_t use_ivf;
+  arg_def_t use_obu;
+  arg_def_t q_hist_n;
+  arg_def_t rate_hist_n;
+  arg_def_t disable_warnings;
+  arg_def_t disable_warning_prompt;
+  arg_def_t bitdeptharg;
+  arg_def_t inbitdeptharg;
+  arg_def_t input_chroma_subsampling_x;
+  arg_def_t input_chroma_subsampling_y;
+  arg_def_t usage;
+  arg_def_t threads;
+  arg_def_t profile;
+  arg_def_t width;
+  arg_def_t height;
+  arg_def_t forced_max_frame_width;
+  arg_def_t forced_max_frame_height;
+#if CONFIG_WEBM_IO
+  arg_def_t stereo_mode;
+#endif
+  arg_def_t timebase;
+  arg_def_t global_error_resilient;
+  arg_def_t lag_in_frames;
+  arg_def_t large_scale_tile;
+  arg_def_t monochrome;
+  arg_def_t full_still_picture_hdr;
+  arg_def_t use_16bit_internal;
+  arg_def_t dropframe_thresh;
+  arg_def_t resize_mode;
+  arg_def_t resize_denominator;
+  arg_def_t resize_kf_denominator;
+  arg_def_t superres_mode;
+  arg_def_t superres_denominator;
+  arg_def_t superres_kf_denominator;
+  arg_def_t superres_qthresh;
+  arg_def_t superres_kf_qthresh;
+  arg_def_t end_usage;
+  arg_def_t target_bitrate;
+  arg_def_t min_quantizer;
+  arg_def_t max_quantizer;
+  arg_def_t undershoot_pct;
+  arg_def_t overshoot_pct;
+  arg_def_t buf_sz;
+  arg_def_t buf_initial_sz;
+  arg_def_t buf_optimal_sz;
+  arg_def_t bias_pct;
+  arg_def_t minsection_pct;
+  arg_def_t maxsection_pct;
+  arg_def_t fwd_kf_enabled;
+  arg_def_t kf_min_dist;
+  arg_def_t kf_max_dist;
+  arg_def_t kf_disabled;
+  arg_def_t sframe_dist;
+  arg_def_t sframe_mode;
+  arg_def_t save_as_annexb;
+  arg_def_t noise_sens;
+  arg_def_t sharpness;
+  arg_def_t static_thresh;
+  arg_def_t auto_altref;
+  arg_def_t arnr_maxframes;
+  arg_def_t arnr_strength;
+  arg_def_t tune_metric;
+  arg_def_t dist_metric;
+  arg_def_t cq_level;
+  arg_def_t max_intra_rate_pct;
+#if CONFIG_AV1_ENCODER
+  arg_def_t cpu_used_av1;
+  arg_def_t rowmtarg;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  arg_def_t fpmtarg;
+#endif
+  arg_def_t tile_cols;
+  arg_def_t tile_rows;
+  arg_def_t enable_tpl_model;
+  arg_def_t enable_keyframe_filtering;
+  arg_def_t tile_width;
+  arg_def_t tile_height;
+  arg_def_t lossless;
+  arg_def_t enable_cdef;
+  arg_def_t enable_restoration;
+  arg_def_t enable_rect_partitions;
+  arg_def_t enable_ab_partitions;
+  arg_def_t enable_1to4_partitions;
+  arg_def_t min_partition_size;
+  arg_def_t max_partition_size;
+  arg_def_t enable_dual_filter;
+  arg_def_t enable_chroma_deltaq;
+  arg_def_t enable_intra_edge_filter;
+  arg_def_t enable_order_hint;
+  arg_def_t enable_tx64;
+  arg_def_t enable_flip_idtx;
+  arg_def_t enable_rect_tx;
+  arg_def_t enable_dist_wtd_comp;
+  arg_def_t enable_masked_comp;
+  arg_def_t enable_onesided_comp;
+  arg_def_t enable_interintra_comp;
+  arg_def_t enable_smooth_interintra;
+  arg_def_t enable_diff_wtd_comp;
+  arg_def_t enable_interinter_wedge;
+  arg_def_t enable_interintra_wedge;
+  arg_def_t enable_global_motion;
+  arg_def_t enable_warped_motion;
+  arg_def_t enable_filter_intra;
+  arg_def_t enable_smooth_intra;
+  arg_def_t enable_paeth_intra;
+  arg_def_t enable_cfl_intra;
+  arg_def_t enable_directional_intra;
+  arg_def_t enable_diagonal_intra;
+  arg_def_t force_video_mode;
+  arg_def_t enable_obmc;
+  arg_def_t enable_overlay;
+  arg_def_t enable_palette;
+  arg_def_t enable_intrabc;
+  arg_def_t enable_angle_delta;
+  arg_def_t disable_trellis_quant;
+  arg_def_t enable_qm;
+  arg_def_t qm_min;
+  arg_def_t qm_max;
+  arg_def_t reduced_tx_type_set;
+  arg_def_t use_intra_dct_only;
+  arg_def_t use_inter_dct_only;
+  arg_def_t use_intra_default_tx_only;
+  arg_def_t quant_b_adapt;
+  arg_def_t coeff_cost_upd_freq;
+  arg_def_t mode_cost_upd_freq;
+  arg_def_t mv_cost_upd_freq;
+  arg_def_t dv_cost_upd_freq;
+  arg_def_t num_tg;
+  arg_def_t mtu_size;
+  arg_def_t timing_info;
+#if CONFIG_TUNE_VMAF
+  arg_def_t vmaf_model_path;
+#endif
+  arg_def_t partition_info_path;
+  arg_def_t film_grain_test;
+  arg_def_t film_grain_table;
+#if CONFIG_DENOISE
+  arg_def_t denoise_noise_level;
+  arg_def_t denoise_block_size;
+  arg_def_t enable_dnl_denoising;
+#endif
+  arg_def_t enable_ref_frame_mvs;
+  arg_def_t frame_parallel_decoding;
+  arg_def_t error_resilient_mode;
+  arg_def_t aq_mode;
+  arg_def_t deltaq_mode;
+  arg_def_t deltaq_strength;
+  arg_def_t deltalf_mode;
+  arg_def_t frame_periodic_boost;
+  arg_def_t gf_cbr_boost_pct;
+  arg_def_t max_inter_rate_pct;
+  arg_def_t min_gf_interval;
+  arg_def_t max_gf_interval;
+  arg_def_t gf_min_pyr_height;
+  arg_def_t gf_max_pyr_height;
+  arg_def_t max_reference_frames;
+  arg_def_t reduced_reference_set;
+  arg_def_t target_seq_level_idx;
+  arg_def_t set_min_cr;
+  arg_def_t input_color_primaries;
+  arg_def_t input_transfer_characteristics;
+  arg_def_t input_matrix_coefficients;
+  arg_def_t input_chroma_sample_position;
+  arg_def_t tune_content;
+  arg_def_t cdf_update_mode;
+  arg_def_t superblock_size;
+  arg_def_t set_tier_mask;
+  arg_def_t use_fixed_qp_offsets;
+  arg_def_t fixed_qp_offsets;
+  arg_def_t vbr_corpus_complexity_lap;
+  arg_def_t fwd_kf_dist;
+  arg_def_t enable_tx_size_search;
+  arg_def_t loopfilter_control;
+  arg_def_t two_pass_input;
+  arg_def_t two_pass_output;
+  arg_def_t two_pass_width;
+  arg_def_t two_pass_height;
+  arg_def_t second_pass_log;
+  arg_def_t auto_intra_tools_off;
+  arg_def_t strict_level_conformance;
+#endif  // CONFIG_AV1_ENCODER
+} av1_codec_arg_definitions_t;
+
+extern const av1_codec_arg_definitions_t g_av1_codec_arg_defs;
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // AOM_AV1_ARG_DEFS_H_
diff --git a/media/libaom/src/av1/av1.cmake b/media/libaom/src/av1/av1.cmake
index 2ab3496303..b2e90ec334 100644
--- a/media/libaom/src/av1/av1.cmake
+++ b/media/libaom/src/av1/av1.cmake
@@ -14,6 +14,10 @@ endif() # AOM_AV1_AV1_CMAKE_
 set(AOM_AV1_AV1_CMAKE_ 1)
 
 list(APPEND AOM_AV1_COMMON_SOURCES
+            "${AOM_ROOT}/common/args_helper.h"
+            "${AOM_ROOT}/common/args_helper.c"
+            "${AOM_ROOT}/av1/arg_defs.h"
+            "${AOM_ROOT}/av1/arg_defs.c"
             "${AOM_ROOT}/av1/av1_iface_common.h"
             "${AOM_ROOT}/av1/common/alloccommon.c"
             "${AOM_ROOT}/av1/common/alloccommon.h"
@@ -56,8 +60,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES
             "${AOM_ROOT}/av1/common/mvref_common.h"
             "${AOM_ROOT}/av1/common/obu_util.c"
             "${AOM_ROOT}/av1/common/obu_util.h"
-            "${AOM_ROOT}/av1/common/odintrin.c"
-            "${AOM_ROOT}/av1/common/odintrin.h"
             "${AOM_ROOT}/av1/common/pred_common.c"
             "${AOM_ROOT}/av1/common/pred_common.h"
             "${AOM_ROOT}/av1/common/quant_common.c"
@@ -88,10 +90,6 @@ list(APPEND AOM_AV1_COMMON_SOURCES
             "${AOM_ROOT}/av1/common/warped_motion.c"
             "${AOM_ROOT}/av1/common/warped_motion.h")
 
-if(CONFIG_LPF_MASK)
-  list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c")
-endif()
-
 list(APPEND AOM_AV1_DECODER_SOURCES
             "${AOM_ROOT}/av1/av1_dx_iface.c"
             "${AOM_ROOT}/av1/decoder/decodeframe.c"
@@ -105,6 +103,8 @@ list(APPEND AOM_AV1_DECODER_SOURCES
             "${AOM_ROOT}/av1/decoder/detokenize.c"
             "${AOM_ROOT}/av1/decoder/detokenize.h"
             "${AOM_ROOT}/av1/decoder/dthread.h"
+            "${AOM_ROOT}/av1/decoder/grain_synthesis.c"
+            "${AOM_ROOT}/av1/decoder/grain_synthesis.h"
             "${AOM_ROOT}/av1/decoder/obu.h"
             "${AOM_ROOT}/av1/decoder/obu.c")
 
@@ -116,13 +116,13 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
             "${AOM_ROOT}/av1/encoder/aq_variance.c"
             "${AOM_ROOT}/av1/encoder/aq_variance.h"
+            "${AOM_ROOT}/av1/encoder/allintra_vis.c"
+            "${AOM_ROOT}/av1/encoder/allintra_vis.h"
             "${AOM_ROOT}/av1/encoder/enc_enums.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
-            "${AOM_ROOT}/av1/encoder/av1_multi_thread.c"
-            "${AOM_ROOT}/av1/encoder/av1_multi_thread.h"
             "${AOM_ROOT}/av1/encoder/av1_quantize.c"
             "${AOM_ROOT}/av1/encoder/av1_quantize.h"
             "${AOM_ROOT}/av1/encoder/bitstream.c"
@@ -142,6 +142,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/cost.h"
             "${AOM_ROOT}/av1/encoder/encodeframe.c"
             "${AOM_ROOT}/av1/encoder/encodeframe.h"
+            "${AOM_ROOT}/av1/encoder/encodeframe_utils.c"
+            "${AOM_ROOT}/av1/encoder/encodeframe_utils.h"
             "${AOM_ROOT}/av1/encoder/encodemb.c"
             "${AOM_ROOT}/av1/encoder/encodemb.h"
             "${AOM_ROOT}/av1/encoder/encodemv.c"
@@ -150,16 +152,23 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/encode_strategy.h"
             "${AOM_ROOT}/av1/encoder/encoder.c"
             "${AOM_ROOT}/av1/encoder/encoder.h"
+            "${AOM_ROOT}/av1/encoder/encoder_alloc.h"
+            "${AOM_ROOT}/av1/encoder/encoder_utils.c"
+            "${AOM_ROOT}/av1/encoder/encoder_utils.h"
             "${AOM_ROOT}/av1/encoder/encodetxb.c"
             "${AOM_ROOT}/av1/encoder/encodetxb.h"
             "${AOM_ROOT}/av1/encoder/ethread.c"
             "${AOM_ROOT}/av1/encoder/ethread.h"
             "${AOM_ROOT}/av1/encoder/extend.c"
             "${AOM_ROOT}/av1/encoder/extend.h"
+            "${AOM_ROOT}/av1/encoder/external_partition.c"
+            "${AOM_ROOT}/av1/encoder/external_partition.h"
             "${AOM_ROOT}/av1/encoder/firstpass.c"
             "${AOM_ROOT}/av1/encoder/firstpass.h"
             "${AOM_ROOT}/av1/encoder/global_motion.c"
             "${AOM_ROOT}/av1/encoder/global_motion.h"
+            "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+            "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
             "${AOM_ROOT}/av1/encoder/gop_structure.c"
             "${AOM_ROOT}/av1/encoder/gop_structure.h"
             "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
@@ -186,11 +195,14 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/mv_prec.h"
             "${AOM_ROOT}/av1/encoder/palette.c"
             "${AOM_ROOT}/av1/encoder/palette.h"
+            "${AOM_ROOT}/av1/encoder/partition_search.h"
+            "${AOM_ROOT}/av1/encoder/partition_search.c"
             "${AOM_ROOT}/av1/encoder/partition_strategy.h"
             "${AOM_ROOT}/av1/encoder/partition_strategy.c"
             "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
             "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
             "${AOM_ROOT}/av1/encoder/pickcdef.c"
+            "${AOM_ROOT}/av1/encoder/pickcdef.h"
             "${AOM_ROOT}/av1/encoder/picklpf.c"
             "${AOM_ROOT}/av1/encoder/picklpf.h"
             "${AOM_ROOT}/av1/encoder/pickrst.c"
@@ -199,10 +211,12 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/ransac.h"
             "${AOM_ROOT}/av1/encoder/ratectrl.c"
             "${AOM_ROOT}/av1/encoder/ratectrl.h"
+            "${AOM_ROOT}/av1/encoder/rc_utils.h"
             "${AOM_ROOT}/av1/encoder/rd.c"
             "${AOM_ROOT}/av1/encoder/rd.h"
             "${AOM_ROOT}/av1/encoder/rdopt.c"
             "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c"
+            "${AOM_ROOT}/av1/encoder/nonrd_opt.h"
             "${AOM_ROOT}/av1/encoder/rdopt.h"
             "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h"
             "${AOM_ROOT}/av1/encoder/rdopt_utils.h"
@@ -210,23 +224,34 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
             "${AOM_ROOT}/av1/encoder/segmentation.c"
             "${AOM_ROOT}/av1/encoder/segmentation.h"
+            "${AOM_ROOT}/av1/encoder/sorting_network.h"
             "${AOM_ROOT}/av1/encoder/speed_features.c"
             "${AOM_ROOT}/av1/encoder/speed_features.h"
+            "${AOM_ROOT}/av1/encoder/superres_scale.c"
+            "${AOM_ROOT}/av1/encoder/superres_scale.h"
             "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
             "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
             "${AOM_ROOT}/av1/encoder/temporal_filter.c"
             "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+            "${AOM_ROOT}/av1/encoder/thirdpass.c"
+            "${AOM_ROOT}/av1/encoder/thirdpass.h"
             "${AOM_ROOT}/av1/encoder/tokenize.c"
             "${AOM_ROOT}/av1/encoder/tokenize.h"
             "${AOM_ROOT}/av1/encoder/tpl_model.c"
             "${AOM_ROOT}/av1/encoder/tpl_model.h"
             "${AOM_ROOT}/av1/encoder/tx_search.c"
             "${AOM_ROOT}/av1/encoder/tx_search.h"
+            "${AOM_ROOT}/av1/encoder/txb_rdopt.c"
+            "${AOM_ROOT}/av1/encoder/txb_rdopt.h"
+            "${AOM_ROOT}/av1/encoder/txb_rdopt_utils.h"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+            "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.h"
+            "${AOM_ROOT}/av1/encoder/av1_noise_estimate.c"
+            "${AOM_ROOT}/av1/encoder/av1_noise_estimate.h"
             "${AOM_ROOT}/third_party/fastfeat/fast.c"
             "${AOM_ROOT}/third_party/fastfeat/fast.h"
             "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
@@ -236,145 +261,108 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
             "${AOM_ROOT}/av1/encoder/dwt.c"
             "${AOM_ROOT}/av1/encoder/dwt.h")
 
-if(CONFIG_TUNE_VMAF)
-  list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c"
-              "${AOM_ROOT}/av1/encoder/tune_vmaf.h")
-endif()
-
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
-            "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+            "${AOM_ROOT}/av1/common/x86/cdef_block_sse2.c"
             "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
-            "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
-
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2
-                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c")
-endif()
+            "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
-            "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
+            "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
-            "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
-
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3
-                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c")
-endif()
+            "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
-            "${AOM_ROOT}/av1/common/cdef_block_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
             "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
+            "${AOM_ROOT}/av1/common/x86/cdef_block_sse4.c"
             "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
             "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c"
             "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
             "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
             "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1
-                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
-                   "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c")
-endif()
-
 list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
-            "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
             "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
+            "${AOM_ROOT}/av1/common/x86/cdef_block_avx2.c"
             "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
             "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
             "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
-            "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
             "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
             "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2
-                   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c")
-endif()
-
 list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
             "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(
-    REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2
-                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c")
-endif()
-
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
 
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3
+            "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c")
+
 list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
-
-if(NOT CONFIG_AV1_HIGHBITDEPTH)
-  list(
-    REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
-                "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c")
-endif()
+            "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
             "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c")
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/picksrt_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
             "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
@@ -382,28 +370,90 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
             "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
             "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
+            "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
+            "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+            "${AOM_ROOT}/av1/common/arm/cdef_block_neon.c"
             "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
             "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
+            "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
             "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
-            "${AOM_ROOT}/av1/common/arm/mem_neon.h"
-            "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
-            "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
-            "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
-            "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
+            "${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
+            "${AOM_ROOT}/av1/common/arm/resize_neon.c"
             "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
-            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
-            "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
             "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
-            "${AOM_ROOT}/av1/common/cdef_block_neon.c")
+            "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
             "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
 
+if(CONFIG_TUNE_VMAF)
+  list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c"
+              "${AOM_ROOT}/av1/encoder/tune_vmaf.h")
+endif()
+
+if(CONFIG_TUNE_BUTTERAUGLI)
+  list(APPEND AOM_AV1_ENCODER_SOURCES
+              "${AOM_ROOT}/av1/encoder/tune_butteraugli.c"
+              "${AOM_ROOT}/av1/encoder/tune_butteraugli.h")
+endif()
+
+if(CONFIG_OPTICAL_FLOW_API)
+  list(APPEND AOM_AV1_ENCODER_SOURCES
+              "${AOM_ROOT}/av1/encoder/sparse_linear_solver.c"
+              "${AOM_ROOT}/av1/encoder/sparse_linear_solver.h"
+              "${AOM_ROOT}/av1/encoder/optical_flow.c"
+              "${AOM_ROOT}/av1/encoder/optical_flow.h")
+endif()
+
+if(CONFIG_AV1_TEMPORAL_DENOISING)
+  list(APPEND AOM_AV1_ENCODER_SOURCES
+              "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.c"
+              "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.h")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+              "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+              "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c")
+endif()
+
+if(CONFIG_AV1_HIGHBITDEPTH)
+  list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+              "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
+              "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+
+  list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+              "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c")
+
+  list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
+              "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+              "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+              "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+              "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
+              "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c")
+
+  list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+              "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
+              "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
+endif()
+
 if(CONFIG_ACCOUNTING)
   list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c"
               "${AOM_ROOT}/av1/decoder/accounting.h")
@@ -419,23 +469,34 @@ if(CONFIG_INTERNAL_STATS)
 endif()
 
 if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1
+                   "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+
+  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
+                   "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
+                   "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
+
   list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES
                    "${AOM_ROOT}/av1/encoder/cnn.c"
                    "${AOM_ROOT}/av1/encoder/cnn.h"
                    "${AOM_ROOT}/av1/encoder/firstpass.c"
                    "${AOM_ROOT}/av1/encoder/firstpass.h"
+                   "${AOM_ROOT}/av1/encoder/global_motion.c"
+                   "${AOM_ROOT}/av1/encoder/global_motion.h"
+                   "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+                   "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
                    "${AOM_ROOT}/av1/encoder/gop_structure.c"
                    "${AOM_ROOT}/av1/encoder/gop_structure.h"
                    "${AOM_ROOT}/av1/encoder/misc_model_weights.h"
                    "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h"
                    "${AOM_ROOT}/av1/encoder/partition_model_weights.h"
                    "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+                   "${AOM_ROOT}/av1/encoder/picklpf.h"
+                   "${AOM_ROOT}/av1/encoder/pickrst.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
-                   "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h"
                    "${AOM_ROOT}/av1/encoder/tpl_model.c"
-                   "${AOM_ROOT}/av1/encoder/tpl_model.h"
-                   "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c")
+                   "${AOM_ROOT}/av1/encoder/tpl_model.h")
 endif()
 
 # Setup AV1 common/decoder/encoder targets. The libaom target must exist before
@@ -507,6 +568,12 @@ function(setup_av1_targets)
                                       "AOM_AV1_DECODER_INTRIN_SSSE3")
       endif()
     endif()
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_SSSE3)
+        add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSSE3")
+      endif()
+    endif()
   endif()
 
   if(HAVE_SSE4_1)
@@ -555,10 +622,12 @@ function(setup_av1_targets)
                                     "AOM_AV1_COMMON_INTRIN_NEON")
     endif()
 
-    if(AOM_AV1_ENCODER_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                    "aom_av1_encoder"
-                                    "AOM_AV1_ENCODER_INTRIN_NEON")
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_NEON)
+        add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                      "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_NEON")
+      endif()
     endif()
   endif()
 
diff --git a/media/libaom/src/av1/av1_cx_iface.c b/media/libaom/src/av1/av1_cx_iface.c
index 676eaa0adb..12de7ea8ab 100644
--- a/media/libaom/src/av1/av1_cx_iface.c
+++ b/media/libaom/src/av1/av1_cx_iface.c
@@ -11,12 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "aom_mem/aom_mem.h"
 #include "config/aom_config.h"
 #include "config/aom_version.h"
 
-#include "aom_ports/aom_once.h"
 #include "aom_ports/mem_ops.h"
-#include "aom_ports/system_state.h"
 
 #include "aom/aom_encoder.h"
 #include "aom/internal/aom_codec_internal.h"
@@ -24,9 +23,12 @@
 #include "av1/av1_iface_common.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/external_partition.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/arg_defs.h"
 
-#define MAG_SIZE (4)
+#include "common/args_helper.h"
 
 struct av1_extracfg {
   int cpu_used;
@@ -36,6 +38,9 @@ struct av1_extracfg {
   unsigned int sharpness;
   unsigned int static_thresh;
   unsigned int row_mt;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  unsigned int fp_mt;
+#endif
   unsigned int tile_columns;  // log2 number of tile columns
   unsigned int tile_rows;     // log2 number of tile rows
   unsigned int enable_tpl_model;
@@ -48,6 +53,8 @@ struct av1_extracfg {
   unsigned int gf_max_pyr_height;
   aom_tune_metric tuning;
   const char *vmaf_model_path;
+  const char *partition_info_path;
+  aom_dist_metric dist_metric;
   unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
   unsigned int rc_max_inter_bitrate_pct;
@@ -73,6 +80,7 @@ struct av1_extracfg {
   unsigned int enable_chroma_deltaq;
   AQ_MODE aq_mode;
   DELTAQ_MODE deltaq_mode;
+  int deltaq_strength;
   int deltalf_mode;
   unsigned int frame_periodic_boost;
   aom_bit_depth_t bit_depth;
@@ -92,6 +100,9 @@ struct av1_extracfg {
   int film_grain_test_vector;
   const char *film_grain_table_filename;
   unsigned int motion_vector_unit_test;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  unsigned int fpmt_unit_test;
+#endif
   unsigned int cdf_update_mode;
   int enable_rect_partitions;    // enable rectangular partitions for sequence
   int enable_ab_partitions;      // enable AB partitions for sequence
@@ -102,8 +113,9 @@ struct av1_extracfg {
   int enable_order_hint;         // enable order hint for sequence
   int enable_tx64;               // enable 64-pt transform usage for sequence
   int enable_flip_idtx;          // enable flip and identity transform types
-  int enable_dist_wtd_comp;      // enable dist wtd compound for sequence
-  int max_reference_frames;      // maximum number of references per frame
+  int enable_rect_tx;        // enable rectangular transform usage for sequence
+  int enable_dist_wtd_comp;  // enable dist wtd compound for sequence
+  int max_reference_frames;  // maximum number of references per frame
   int enable_reduced_reference_set;  // enable reduced set of references
   int enable_ref_frame_mvs;          // sequence level
   int allow_ref_frame_mvs;           // frame level
@@ -121,6 +133,8 @@ struct av1_extracfg {
   int enable_smooth_intra;           // enable smooth intra modes for sequence
   int enable_paeth_intra;            // enable Paeth intra mode for sequence
   int enable_cfl_intra;              // enable CFL uv intra mode for sequence
+  int enable_directional_intra;      // enable directional modes for sequence
+  int enable_diagonal_intra;  // enable D45 to D203 intra modes for sequence
   int enable_superres;
   int enable_overlay;  // enable overlay for filtered arf frames
   int enable_palette;
@@ -129,6 +143,7 @@ struct av1_extracfg {
 #if CONFIG_DENOISE
   float noise_level;
   int noise_block_size;
+  int enable_dnl_denoising;
 #endif
 
   unsigned int chroma_subsampling_x;
@@ -137,7 +152,9 @@ struct av1_extracfg {
   int use_intra_dct_only;
   int use_inter_dct_only;
   int use_intra_default_tx_only;
+  int enable_tx_size_search;
   int quant_b_adapt;
+  unsigned int vbr_corpus_complexity_lap;
   AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
   // Bit mask to specify which tier each of the 32 possible operating points
   // conforms to.
@@ -147,11 +164,196 @@ struct av1_extracfg {
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
   COST_UPDATE_TYPE mv_cost_upd_freq;
+  COST_UPDATE_TYPE dv_cost_upd_freq;
   unsigned int ext_tile_debug;
   unsigned int sb_multipass_unit_test;
+  // Total number of passes. If this number is -1, then we assume passes = 1 or
+  // 2 (passes = 1 if pass == AOM_RC_ONE_PASS and passes = 2 otherwise).
+  int passes;
+  int fwd_kf_dist;
+
+  LOOPFILTER_CONTROL loopfilter_control;
+  // the name of the second pass output file when passes > 2
+  const char *two_pass_output;
+  const char *second_pass_log;
+  // Automatically determine whether to disable several intra tools
+  // when "--deltaq-mode=3" is true.
+  // Default as 0.
+  // When set to 1, the encoder will analyze the reconstruction quality
+  // as compared to the source image in the preprocessing pass.
+  // If the recontruction quality is considered high enough, we disable
+  // the following intra coding tools, for better encoding speed:
+  // "--enable_smooth_intra",
+  // "--enable_paeth_intra",
+  // "--enable_cfl_intra",
+  // "--enable_diagonal_intra".
+  int auto_intra_tools_off;
+  int strict_level_conformance;
 };
 
-static struct av1_extracfg default_extra_cfg = {
+#if CONFIG_REALTIME_ONLY
+// Settings changed for realtime only build:
+// cpu_used: 7
+// enable_tpl_model: 0
+// enable_restoration: 0
+// enable_obmc: 0
+// deltaq_mode: NO_DELTA_Q
+// enable_global_motion usage: 0
+// enable_warped_motion at sequence level: 0
+// allow_warped_motion at frame level: 0
+// coeff_cost_upd_freq: COST_UPD_OFF
+// mode_cost_upd_freq: COST_UPD_OFF
+// mv_cost_upd_freq: COST_UPD_OFF
+// dv_cost_upd_freq: COST_UPD_OFF
+static const struct av1_extracfg default_extra_cfg = {
+  7,  // cpu_used
+  1,  // enable_auto_alt_ref
+  0,  // enable_auto_bwd_ref
+  0,  // noise_sensitivity
+  0,  // sharpness
+  0,  // static_thresh
+  1,  // row_mt
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  0,  // fp_mt
+#endif
+  0,              // tile_columns
+  0,              // tile_rows
+  0,              // enable_tpl_model
+  1,              // enable_keyframe_filtering
+  7,              // arnr_max_frames
+  5,              // arnr_strength
+  0,              // min_gf_interval; 0 -> default decision
+  0,              // max_gf_interval; 0 -> default decision
+  0,              // gf_min_pyr_height
+  5,              // gf_max_pyr_height
+  AOM_TUNE_PSNR,  // tuning
+  "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
+  ".",                                        // partition info path
+  AOM_DIST_METRIC_PSNR,                       // dist_metric
+  10,                                         // cq_level
+  0,                                          // rc_max_intra_bitrate_pct
+  0,                                          // rc_max_inter_bitrate_pct
+  0,                                          // gf_cbr_boost_pct
+  0,                                          // lossless
+  1,                                          // enable_cdef
+  0,                                          // enable_restoration
+  0,                                          // force_video_mode
+  0,                                          // enable_obmc
+  3,                                          // disable_trellis_quant
+  0,                                          // enable_qm
+  DEFAULT_QM_Y,                               // qm_y
+  DEFAULT_QM_U,                               // qm_u
+  DEFAULT_QM_V,                               // qm_v
+  DEFAULT_QM_FIRST,                           // qm_min
+  DEFAULT_QM_LAST,                            // qm_max
+  1,                                          // max number of tile groups
+  0,                                          // mtu_size
+  AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
+  0,                            // frame_parallel_decoding_mode
+  1,                            // enable dual filter
+  0,                            // enable delta quant in chroma planes
+  NO_AQ,                        // aq_mode
+  NO_DELTA_Q,                   // deltaq_mode
+  100,                          // deltaq_strength
+  0,                            // delta lf mode
+  0,                            // frame_periodic_boost
+  AOM_BITS_8,                   // Bit depth
+  AOM_CONTENT_DEFAULT,          // content
+  AOM_CICP_CP_UNSPECIFIED,      // CICP color primaries
+  AOM_CICP_TC_UNSPECIFIED,      // CICP transfer characteristics
+  AOM_CICP_MC_UNSPECIFIED,      // CICP matrix coefficients
+  AOM_CSP_UNKNOWN,              // chroma sample position
+  0,                            // color range
+  0,                            // render width
+  0,                            // render height
+  AOM_SUPERBLOCK_SIZE_DYNAMIC,  // superblock_size
+  1,                            // this depends on large_scale_tile.
+  0,                            // error_resilient_mode off by default.
+  0,                            // s_frame_mode off by default.
+  0,                            // film_grain_test_vector
+  NULL,                         // film_grain_table_filename
+  0,                            // motion_vector_unit_test
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  0,  // fpmt_unit_test
+#endif
+  1,    // CDF update mode
+  1,    // enable rectangular partitions
+  1,    // enable ab shape partitions
+  1,    // enable 1:4 and 4:1 partitions
+  4,    // min_partition_size
+  128,  // max_partition_size
+  1,    // enable intra edge filter
+  1,    // frame order hint
+  1,    // enable 64-pt transform usage
+  1,    // enable flip and identity transform
+  1,    // enable rectangular transform usage
+  1,    // dist-wtd compound
+  7,    // max_reference_frames
+  0,    // enable_reduced_reference_set
+  1,    // enable_ref_frame_mvs sequence level
+  1,    // allow ref_frame_mvs frame level
+  1,    // enable masked compound at sequence level
+  1,    // enable one sided compound at sequence level
+  1,    // enable interintra compound at sequence level
+  1,    // enable smooth interintra mode
+  1,    // enable difference-weighted compound
+  1,    // enable interinter wedge compound
+  1,    // enable interintra wedge compound
+  0,    // enable_global_motion usage
+  0,    // enable_warped_motion at sequence level
+  0,    // allow_warped_motion at frame level
+  1,    // enable filter intra at sequence level
+  1,    // enable smooth intra modes usage for sequence
+  1,    // enable Paeth intra mode usage for sequence
+  1,    // enable CFL uv intra mode usage for sequence
+  1,    // enable directional intra mode usage for sequence
+  1,    // enable D45 to D203 intra mode usage for sequence
+  1,    // superres
+  1,    // enable overlay
+  1,    // enable palette
+  1,    // enable intrabc
+  1,    // enable angle delta
+#if CONFIG_DENOISE
+  0,   // noise_level
+  32,  // noise_block_size
+  1,   // enable_dnl_denoising
+#endif
+  0,  // chroma_subsampling_x
+  0,  // chroma_subsampling_y
+  0,  // reduced_tx_type_set
+  0,  // use_intra_dct_only
+  0,  // use_inter_dct_only
+  0,  // use_intra_default_tx_only
+  1,  // enable_tx_size_search
+  0,  // quant_b_adapt
+  0,  // vbr_corpus_complexity_lap
+  {
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+  },               // target_seq_level_idx
+  0,               // tier_mask
+  0,               // min_cr
+  COST_UPD_OFF,    // coeff_cost_upd_freq
+  COST_UPD_OFF,    // mode_cost_upd_freq
+  COST_UPD_OFF,    // mv_cost_upd_freq
+  COST_UPD_OFF,    // dv_cost_upd_freq
+  0,               // ext_tile_debug
+  0,               // sb_multipass_unit_test
+  -1,              // passes
+  -1,              // fwd_kf_dist
+  LOOPFILTER_ALL,  // loopfilter_control
+  NULL,            // two_pass_output
+  NULL,            // second_pass_log
+  0,               // auto_intra_tools_off
+  0,               // strict_level_conformance
+};
+#else
+static const struct av1_extracfg default_extra_cfg = {
   0,              // cpu_used
   1,              // enable_auto_alt_ref
   0,              // enable_auto_bwd_ref
@@ -159,6 +361,9 @@ static struct av1_extracfg default_extra_cfg = {
   0,              // sharpness
   0,              // static_thresh
   1,              // row_mt
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  0,              // fp_mt
+#endif
   0,              // tile_columns
   0,              // tile_rows
   1,              // enable_tpl_model
@@ -170,36 +375,39 @@ static struct av1_extracfg default_extra_cfg = {
   0,              // gf_min_pyr_height
   5,              // gf_max_pyr_height
   AOM_TUNE_PSNR,  // tuning
-  "/usr/local/share/model/vmaf_v0.6.1.pkl",  // VMAF model path
-  10,                                        // cq_level
-  0,                                         // rc_max_intra_bitrate_pct
-  0,                                         // rc_max_inter_bitrate_pct
-  0,                                         // gf_cbr_boost_pct
-  0,                                         // lossless
-  1,                                         // enable_cdef
-  1,                                         // enable_restoration
-  0,                                         // force_video_mode
-  1,                                         // enable_obmc
-  3,                                         // disable_trellis_quant
-  0,                                         // enable_qm
-  DEFAULT_QM_Y,                              // qm_y
-  DEFAULT_QM_U,                              // qm_u
-  DEFAULT_QM_V,                              // qm_v
-  DEFAULT_QM_FIRST,                          // qm_min
-  DEFAULT_QM_LAST,                           // qm_max
-  1,                                         // max number of tile groups
-  0,                                         // mtu_size
+  "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
+  ".",                                        // partition info path
+  AOM_DIST_METRIC_PSNR,                       // dist_metric
+  10,                                         // cq_level
+  0,                                          // rc_max_intra_bitrate_pct
+  0,                                          // rc_max_inter_bitrate_pct
+  0,                                          // gf_cbr_boost_pct
+  0,                                          // lossless
+  1,                                          // enable_cdef
+  1,                                          // enable_restoration
+  0,                                          // force_video_mode
+  1,                                          // enable_obmc
+  3,                                          // disable_trellis_quant
+  0,                                          // enable_qm
+  DEFAULT_QM_Y,                               // qm_y
+  DEFAULT_QM_U,                               // qm_u
+  DEFAULT_QM_V,                               // qm_v
+  DEFAULT_QM_FIRST,                           // qm_min
+  DEFAULT_QM_LAST,                            // qm_max
+  1,                                          // max number of tile groups
+  0,                                          // mtu_size
   AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
   0,                            // frame_parallel_decoding_mode
   1,                            // enable dual filter
   0,                            // enable delta quant in chroma planes
   NO_AQ,                        // aq_mode
   DELTA_Q_OBJECTIVE,            // deltaq_mode
+  100,                          // deltaq_strength
   0,                            // delta lf mode
-  0,                            // frame_periodic_delta_q
+  0,                            // frame_periodic_boost
   AOM_BITS_8,                   // Bit depth
   AOM_CONTENT_DEFAULT,          // content
-  AOM_CICP_CP_UNSPECIFIED,      // CICP color space
+  AOM_CICP_CP_UNSPECIFIED,      // CICP color primaries
   AOM_CICP_TC_UNSPECIFIED,      // CICP transfer characteristics
   AOM_CICP_MC_UNSPECIFIED,      // CICP matrix coefficients
   AOM_CSP_UNKNOWN,              // chroma sample position
@@ -211,8 +419,11 @@ static struct av1_extracfg default_extra_cfg = {
   0,                            // error_resilient_mode off by default.
   0,                            // s_frame_mode off by default.
   0,                            // film_grain_test_vector
-  0,                            // film_grain_table_filename
+  NULL,                         // film_grain_table_filename
   0,                            // motion_vector_unit_test
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  0,                            // fpmt_unit_test
+#endif
   1,                            // CDF update mode
   1,                            // enable rectangular partitions
   1,                            // enable ab shape partitions
@@ -223,6 +434,7 @@ static struct av1_extracfg default_extra_cfg = {
   1,                            // frame order hint
   1,                            // enable 64-pt transform usage
   1,                            // enable flip and identity transform
+  1,                            // enable rectangular transform usage
   1,                            // dist-wtd compound
   7,                            // max_reference_frames
   0,                            // enable_reduced_reference_set
@@ -242,22 +454,27 @@ static struct av1_extracfg default_extra_cfg = {
   1,                            // enable smooth intra modes usage for sequence
   1,                            // enable Paeth intra mode usage for sequence
   1,                            // enable CFL uv intra mode usage for sequence
-  1,                            // superres
-  1,                            // enable overlay
-  1,                            // enable palette
-  !CONFIG_SHARP_SETTINGS,       // enable intrabc
-  1,                            // enable angle delta
+  1,   // enable directional intra mode usage for sequence
+  1,   // enable D45 to D203 intra mode usage for sequence
+  1,   // superres
+  1,   // enable overlay
+  1,   // enable palette
+  1,   // enable intrabc
+  1,   // enable angle delta
 #if CONFIG_DENOISE
   0,   // noise_level
   32,  // noise_block_size
+  1,   // enable_dnl_denoising
 #endif
-  0,  // chroma_subsampling_x
-  0,  // chroma_subsampling_y
-  0,  // reduced_tx_type_set
-  0,  // use_intra_dct_only
-  0,  // use_inter_dct_only
-  0,  // use_intra_default_tx_only
-  0,  // quant_b_adapt
+  0,   // chroma_subsampling_x
+  0,   // chroma_subsampling_y
+  0,   // reduced_tx_type_set
+  0,   // use_intra_dct_only
+  0,   // use_inter_dct_only
+  0,   // use_intra_default_tx_only
+  1,   // enable_tx_size_search
+  0,   // quant_b_adapt
+  0,   // vbr_corpus_complexity_lap
   {
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
@@ -266,15 +483,24 @@ static struct av1_extracfg default_extra_cfg = {
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
-  },            // target_seq_level_idx
-  0,            // tier_mask
-  0,            // min_cr
-  COST_UPD_SB,  // coeff_cost_upd_freq
-  COST_UPD_SB,  // mode_cost_upd_freq
-  COST_UPD_SB,  // mv_cost_upd_freq
-  0,            // ext_tile_debug
-  0,            // sb_multipass_unit_test
+  },               // target_seq_level_idx
+  0,               // tier_mask
+  0,               // min_cr
+  COST_UPD_SB,     // coeff_cost_upd_freq
+  COST_UPD_SB,     // mode_cost_upd_freq
+  COST_UPD_SB,     // mv_cost_upd_freq
+  COST_UPD_SB,     // dv_cost_upd_freq
+  0,               // ext_tile_debug
+  0,               // sb_multipass_unit_test
+  -1,              // passes
+  -1,              // fwd_kf_dist
+  LOOPFILTER_ALL,  // loopfilter_control
+  NULL,            // two_pass_output
+  NULL,            // second_pass_log
+  0,               // auto_intra_tools_off
+  0,               // strict_level_conformance
 };
+#endif
 
 struct aom_codec_alg_priv {
   aom_codec_priv_t base;
@@ -284,13 +510,10 @@ struct aom_codec_alg_priv {
   aom_codec_pts_t pts_offset;
   unsigned char pts_offset_initialized;
   AV1EncoderConfig oxcf;
-  AV1_COMP *cpi;
+  AV1_PRIMARY *ppi;
   unsigned char *cx_data;
   size_t cx_data_sz;
-  unsigned char *pending_cx_data;
   size_t pending_cx_data_sz;
-  int pending_frame_count;
-  size_t pending_frame_sizes[8];
   aom_image_t preview_img;
   aom_enc_frame_flags_t next_frame_flags;
   aom_codec_pkt_list_decl(256) pkt_list;
@@ -300,7 +523,6 @@ struct aom_codec_alg_priv {
 
   // lookahead instance variables
   BufferPool *buffer_pool_lap;
-  AV1_COMP *cpi_lap;
   FIRSTPASS_STATS *frame_stats_buffer;
   // Number of stats buffers required for look ahead
   int num_lap_buffers;
@@ -308,7 +530,7 @@ struct aom_codec_alg_priv {
 };
 
 static INLINE int gcd(int64_t a, int b) {
-  int remainder;  // remainder
+  int remainder;
   while (b > 0) {
     remainder = (int)(a % b);
     a = b;
@@ -334,6 +556,41 @@ static aom_codec_err_t update_error_state(
   return res;
 }
 
+// This function deep copies a string src to *dst. For default string we will
+// use a string literal, and otherwise we will allocate memory for the string.
+static aom_codec_err_t allocate_and_set_string(const char *src,
+                                               const char *default_src,
+                                               const char **dst,
+                                               char *err_detail) {
+  if (!src) {
+    snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+             "Null pointer given to a string parameter.");
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (*dst && strcmp(src, *dst) == 0) return AOM_CODEC_OK;
+  // If the input is exactly the same as default, we will use the string
+  // literal, so do not free here.
+  if (*dst != default_src) {
+    aom_free((void *)*dst);
+  }
+
+  if (default_src && strcmp(src, default_src) == 0) {
+    // default_src should be a string literal
+    *dst = default_src;
+  } else {
+    size_t len = strlen(src) + 1;
+    char *tmp = aom_malloc(len * sizeof(*tmp));
+    if (!tmp) {
+      snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+               "Failed to allocate memory for copying parameters.");
+      return AOM_CODEC_MEM_ERROR;
+    }
+    memcpy(tmp, src, len);
+    *dst = tmp;
+  }
+  return 0;
+}
+
 #undef ERROR
 #define ERROR(str)                  \
   do {                              \
@@ -373,7 +630,11 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1);
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
-  RANGE_CHECK_HI(cfg, g_usage, 1);
+#if CONFIG_REALTIME_ONLY
+  RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME);
+#else
+  RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA);
+#endif
   RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -381,12 +642,16 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
   RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
-  RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+  RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS);
   if (cfg->g_pass == AOM_RC_ONE_PASS) {
     RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS);
   } else {
     RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   }
+  if (cfg->g_usage == AOM_USAGE_ALL_INTRA) {
+    RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+    RANGE_CHECK_HI(cfg, kf_max_dist, 0);
+  }
   RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
   RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
   if (extra_cfg->max_gf_interval > 0) {
@@ -406,7 +671,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
-  RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1);
+  RANGE_CHECK_HI(cfg, rc_superres_mode, AOM_SUPERRES_AUTO);
   RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR,
@@ -415,20 +680,16 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
   RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
 
-  // AV1 does not support a lower bound on the keyframe interval in
-  // automatic keyframe placement mode.
-  if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
-      cfg->kf_min_dist > 0)
-    ERROR(
-        "kf_min_dist not supported in auto mode, use 0 "
-        "or kf_max_dist instead.");
-
   RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  RANGE_CHECK_HI(extra_cfg, fpmt_unit_test, 1);
+#endif
   RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1);
   RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
-  RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+  RANGE_CHECK(extra_cfg, cpu_used, 0,
+              (cfg->g_usage == AOM_USAGE_REALTIME) ? 10 : 9);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
               AOM_SUPERBLOCK_SIZE_DYNAMIC);
@@ -436,6 +697,9 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
 
   RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  RANGE_CHECK_HI(extra_cfg, fp_mt, 1);
+#endif
 
   RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
   RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
@@ -455,7 +719,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
 
-  if (cfg->g_pass == AOM_RC_LAST_PASS) {
+  if (cfg->g_pass >= AOM_RC_SECOND_PASS) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
     const FIRSTPASS_STATS *stats;
@@ -476,6 +740,15 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
       ERROR("rc_twopass_stats_in missing EOS stats packet");
   }
 
+  if (extra_cfg->passes != -1 && cfg->g_pass == AOM_RC_ONE_PASS &&
+      extra_cfg->passes != 1) {
+    ERROR("One pass encoding but passes != 1.");
+  }
+
+  if (extra_cfg->passes != -1 && (int)cfg->g_pass > extra_cfg->passes) {
+    ERROR("Current pass is larger than total number of passes.");
+  }
+
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
       cfg->g_bit_depth > AOM_BITS_10) {
     ERROR("Codec bit-depth 12 not supported in profile < 2");
@@ -487,18 +760,10 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 
   if (cfg->rc_end_usage == AOM_Q) {
     RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1);
-    for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
-      RANGE_CHECK_HI(cfg, fixed_qp_offsets[i], 63);
-    }
   } else {
     if (cfg->use_fixed_qp_offsets > 0) {
       ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q");
     }
-    for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
-      if (cfg->fixed_qp_offsets[i] >= 0) {
-        ERROR("--fixed_qp_offsets can only be used with --end-usage=q");
-      }
-    }
   }
 
   RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
@@ -510,21 +775,37 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
               AOM_CICP_MC_ICTCP);
   RANGE_CHECK(extra_cfg, color_range, 0, 1);
 
+  /* Average corpus complexity is supported only in the case of single pass
+   * VBR*/
+  if (cfg->g_pass == AOM_RC_ONE_PASS && cfg->rc_end_usage == AOM_VBR)
+    RANGE_CHECK_HI(extra_cfg, vbr_corpus_complexity_lap,
+                   MAX_VBR_CORPUS_COMPLEXITY);
+  else if (extra_cfg->vbr_corpus_complexity_lap != 0)
+    ERROR(
+        "VBR corpus complexity is supported only in the case of single pass "
+        "VBR mode.");
+
+#if !CONFIG_TUNE_BUTTERAUGLI
+  if (extra_cfg->tuning == AOM_TUNE_BUTTERAUGLI) {
+    ERROR(
+        "This error may be related to the wrong configuration options: try to "
+        "set -DCONFIG_TUNE_BUTTERAUGLI=1 at the time CMake is run.");
+  }
+#endif
+
 #if !CONFIG_TUNE_VMAF
-  if (extra_cfg->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-      extra_cfg->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      extra_cfg->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+  if (extra_cfg->tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+      extra_cfg->tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
     ERROR(
         "This error may be related to the wrong configuration options: try to "
         "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run.");
   }
 #endif
 
-#if CONFIG_TUNE_VMAF
-  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_MAX_GAIN);
-#else
-  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
-#endif
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_BUTTERAUGLI);
+
+  RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR,
+              AOM_DIST_METRIC_QM_PSNR);
 
   RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
               AOM_TIMING_DEC_MODEL);
@@ -538,20 +819,16 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
       ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1.");
   }
 
-  if (cfg->rc_resize_mode != RESIZE_NONE &&
-      extra_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
-    ERROR("--aq_mode=3 is only supported for --resize-mode=0.");
-  }
-
   RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
   RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
   RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
 
   RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
-  RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
-  RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+  RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3);
+  RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3);
   RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3);
+  RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3);
 
   RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
   RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
@@ -564,6 +841,12 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
     }
   }
 
+  RANGE_CHECK(extra_cfg, deltaq_strength, 0, 1000);
+  RANGE_CHECK_HI(extra_cfg, loopfilter_control, 3);
+  RANGE_CHECK_HI(extra_cfg, enable_cdef, 2);
+  RANGE_CHECK_BOOL(extra_cfg, auto_intra_tools_off);
+  RANGE_CHECK_BOOL(extra_cfg, strict_level_conformance);
+
   return AOM_CODEC_OK;
 }
 
@@ -571,6 +854,7 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
                                     const aom_image_t *img) {
   switch (img->fmt) {
     case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_YV1216:
     case AOM_IMG_FMT_I42016: break;
@@ -589,7 +873,7 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
       break;
     default:
       ERROR(
-          "Invalid image format. Only YV12, I420, I422, I444 images are "
+          "Invalid image format. Only YV12, NV12, I420, I422, I444 images are "
           "supported.");
       break;
   }
@@ -597,9 +881,19 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
   if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h)
     ERROR("Image size must match encoder init configuration size");
 
-  if (img->fmt != AOM_IMG_FMT_I420 && !ctx->extra_cfg.enable_tx64) {
-    ERROR("TX64 can only be disabled on I420 images.");
+#if CONFIG_TUNE_BUTTERAUGLI
+  if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+    if (img->bit_depth > 8) {
+      ERROR("Only 8 bit depth images supported in tune=butteraugli mode.");
+    }
+    if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 &&
+        img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) {
+      ERROR(
+          "Only BT.709 and BT.601 matrix coefficients supported in "
+          "tune=butteraugli mode. Identity matrix is treated as BT.601.");
+    }
   }
+#endif
 
   return AOM_CODEC_OK;
 }
@@ -607,6 +901,7 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
 static int get_image_bps(const aom_image_t *img) {
   switch (img->fmt) {
     case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
     case AOM_IMG_FMT_I420: return 12;
     case AOM_IMG_FMT_I422: return 16;
     case AOM_IMG_FMT_I444: return 24;
@@ -620,17 +915,17 @@ static int get_image_bps(const aom_image_t *img) {
 }
 
 // Set appropriate options to disable frame super-resolution.
-static void disable_superres(AV1EncoderConfig *const oxcf) {
-  oxcf->superres_mode = SUPERRES_NONE;
-  oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-  oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-  oxcf->superres_qthresh = 255;
-  oxcf->superres_kf_qthresh = 255;
+static void disable_superres(SuperResCfg *const superres_cfg) {
+  superres_cfg->superres_mode = AOM_SUPERRES_NONE;
+  superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+  superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  superres_cfg->superres_qthresh = 255;
+  superres_cfg->superres_kf_qthresh = 255;
 }
 
 static void update_default_encoder_config(const cfg_options_t *cfg,
                                           struct av1_extracfg *extra_cfg) {
-  extra_cfg->enable_cdef = (cfg->disable_cdef == 0);
+  extra_cfg->enable_cdef = (cfg->disable_cdef == 0) ? 1 : 0;
   extra_cfg->enable_restoration = (cfg->disable_lr == 0);
   extra_cfg->superblock_size = (cfg->super_block_size == 64)
                                    ? AOM_SUPERBLOCK_SIZE_64X64
@@ -671,23 +966,6 @@ static void update_default_encoder_config(const cfg_options_t *cfg,
   extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set;
 }
 
-static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) {
-  const double base_q_val = av1_convert_qindex_to_q(cq_level, bit_depth);
-  const int new_q_index_offset = av1_quantizer_to_qindex(q_offset);
-  const int new_q_index = AOMMAX(cq_level - new_q_index_offset, 0);
-  const double new_q_val = av1_convert_qindex_to_q(new_q_index, bit_depth);
-  return (base_q_val - new_q_val);
-}
-
-static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) {
-  // 80% for keyframe was derived empirically.
-  // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF.
-  // Rest derived similar to rc_pick_q_and_bounds_two_pass()
-  static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 };
-  const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth);
-  return q_val * percents[level] / 100;
-}
-
 static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
                                           const aom_codec_enc_cfg_t *cfg,
                                           struct av1_extracfg *extra_cfg) {
@@ -695,32 +973,92 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
     update_default_encoder_config(&cfg->encoder_cfg, extra_cfg);
   }
 
+  TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+
+  TileConfig *const tile_cfg = &oxcf->tile_cfg;
+
+  ResizeCfg *const resize_cfg = &oxcf->resize_cfg;
+
+  GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+  PartitionCfg *const part_cfg = &oxcf->part_cfg;
+
+  IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;
+
+  TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;
+
+  CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;
+
+  SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+
+  KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+
+  DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+
+  RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+
+  ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+  InputCfg *const input_cfg = &oxcf->input_cfg;
+
+  AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
+
+  ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
   const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
-  oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
-  oxcf->mode = (cfg->g_usage == AOM_USAGE_REALTIME) ? REALTIME : GOOD;
-  oxcf->width = cfg->g_w;
-  oxcf->height = cfg->g_h;
-  oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
-  oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height;
-  oxcf->bit_depth = cfg->g_bit_depth;
-  oxcf->input_bit_depth = cfg->g_input_bit_depth;
+
+  switch (cfg->g_usage) {
+    case AOM_USAGE_REALTIME: oxcf->mode = REALTIME; break;
+    case AOM_USAGE_ALL_INTRA: oxcf->mode = ALLINTRA; break;
+    default: oxcf->mode = GOOD; break;
+  }
+
+  // Set frame-dimension related configuration.
+  frm_dim_cfg->width = cfg->g_w;
+  frm_dim_cfg->height = cfg->g_h;
+  frm_dim_cfg->forced_max_frame_width = cfg->g_forced_max_frame_width;
+  frm_dim_cfg->forced_max_frame_height = cfg->g_forced_max_frame_height;
+  frm_dim_cfg->render_width = extra_cfg->render_width;
+  frm_dim_cfg->render_height = extra_cfg->render_height;
+
+  // Set input video related configuration.
+  input_cfg->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
-  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (cfg->g_pass >= AOM_RC_SECOND_PASS) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+    input_cfg->limit = n_packets - 1;
+  } else {
+    input_cfg->limit = cfg->g_limit;
+  }
+  input_cfg->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+  input_cfg->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+  if (input_cfg->init_framerate > 180) {
+    input_cfg->init_framerate = 30;
+    dec_model_cfg->timing_info_present = 0;
+  }
+
+  // Set Decoder model configuration.
   if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
       extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
-    oxcf->timing_info_present = 1;
-    oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
-    oxcf->timing_info.time_scale = cfg->g_timebase.den;
-    oxcf->timing_info.num_ticks_per_picture = 1;
+    dec_model_cfg->timing_info_present = 1;
+    dec_model_cfg->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+    dec_model_cfg->timing_info.time_scale = cfg->g_timebase.den;
+    dec_model_cfg->timing_info.num_ticks_per_picture = 1;
   } else {
-    oxcf->timing_info_present = 0;
+    dec_model_cfg->timing_info_present = 0;
   }
   if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
-    oxcf->timing_info.equal_picture_interval = 1;
-    oxcf->decoder_model_info_present_flag = 0;
-    oxcf->display_model_info_present_flag = 1;
+    dec_model_cfg->timing_info.equal_picture_interval = 1;
+    dec_model_cfg->decoder_model_info_present_flag = 0;
+    dec_model_cfg->display_model_info_present_flag = 1;
   } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
     //    if( extra_cfg->arnr_strength > 0 )
     //    {
@@ -732,303 +1070,366 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf,
     //      printf("Only --superres-mode=0 can currently be used with
     //      --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
     //    }
-    oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
-    oxcf->timing_info.equal_picture_interval = 0;
-    oxcf->decoder_model_info_present_flag = 1;
-    oxcf->buffer_removal_time_present = 1;
-    oxcf->display_model_info_present_flag = 1;
-  }
-  if (oxcf->init_framerate > 180) {
-    oxcf->init_framerate = 30;
-    oxcf->timing_info_present = 0;
+    dec_model_cfg->num_units_in_decoding_tick = cfg->g_timebase.num;
+    dec_model_cfg->timing_info.equal_picture_interval = 0;
+    dec_model_cfg->decoder_model_info_present_flag = 1;
+    dec_model_cfg->display_model_info_present_flag = 1;
   }
-  oxcf->encoder_cfg = &cfg->encoder_cfg;
 
-  switch (cfg->g_pass) {
-    case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
-    case AOM_RC_FIRST_PASS: oxcf->pass = 1; break;
-    case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
+  oxcf->pass = cfg->g_pass;
+  // For backward compatibility, assume that if extra_cfg->passes==-1, then
+  // passes = 1 or 2.
+  if (extra_cfg->passes == -1) {
+    if (cfg->g_pass == AOM_RC_ONE_PASS) {
+      oxcf->passes = 1;
+    } else {
+      oxcf->passes = 2;
+    }
+  } else {
+    oxcf->passes = extra_cfg->passes;
   }
 
-  oxcf->lag_in_frames = clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
-  oxcf->rc_mode = cfg->rc_end_usage;
-
-  // Convert target bandwidth from Kbit/s to Bit/s
-  oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
-  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
-  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
-
-  oxcf->best_allowed_q =
+  // Set Rate Control configuration.
+  rc_cfg->max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  rc_cfg->max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  rc_cfg->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+  rc_cfg->mode = cfg->rc_end_usage;
+  rc_cfg->min_cr = extra_cfg->min_cr;
+  rc_cfg->best_allowed_q =
       extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
-  oxcf->worst_allowed_q =
+  rc_cfg->worst_allowed_q =
       extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
-  oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
-  oxcf->fixed_q = -1;
-
-  oxcf->enable_cdef = extra_cfg->enable_cdef;
-  oxcf->enable_restoration =
+  rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+  rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
+  rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
+  rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+  rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  // Convert target bandwidth from Kbit/s to Bit/s
+  rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+  rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+  rc_cfg->vbr_corpus_complexity_lap = extra_cfg->vbr_corpus_complexity_lap;
+  rc_cfg->vbrbias = cfg->rc_2pass_vbr_bias_pct;
+  rc_cfg->vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+  rc_cfg->vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+
+  // Set Toolset related configuration.
+  tool_cfg->bit_depth = cfg->g_bit_depth;
+  tool_cfg->cdef_control = (CDEF_CONTROL)extra_cfg->enable_cdef;
+  tool_cfg->enable_restoration =
       (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration;
-  oxcf->force_video_mode = extra_cfg->force_video_mode;
-  oxcf->enable_obmc = extra_cfg->enable_obmc;
-  oxcf->enable_overlay = extra_cfg->enable_overlay;
-  oxcf->enable_palette = extra_cfg->enable_palette;
-  oxcf->enable_intrabc = extra_cfg->enable_intrabc;
-  oxcf->enable_angle_delta = extra_cfg->enable_angle_delta;
-  oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
-  oxcf->allow_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs;
-  oxcf->using_qm = extra_cfg->enable_qm;
-  oxcf->qm_y = extra_cfg->qm_y;
-  oxcf->qm_u = extra_cfg->qm_u;
-  oxcf->qm_v = extra_cfg->qm_v;
-  oxcf->qm_minlevel = extra_cfg->qm_min;
-  oxcf->qm_maxlevel = extra_cfg->qm_max;
-  oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
-  oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only;
-  oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only;
-  oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
-  oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
-  oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
-  oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
-  oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
-  oxcf->num_tile_groups = extra_cfg->num_tg;
-  // In large-scale tile encoding mode, num_tile_groups is always 1.
-  if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
-  oxcf->mtu = extra_cfg->mtu_size;
-
+  tool_cfg->force_video_mode = extra_cfg->force_video_mode;
+  tool_cfg->enable_palette = extra_cfg->enable_palette;
   // FIXME(debargha): Should this be:
-  // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
-  //                             extra_cfg->enable_order_hint ?
+  // tool_cfg->enable_ref_frame_mvs  = extra_cfg->allow_ref_frame_mvs &
+  //                                         extra_cfg->enable_order_hint ?
   // Disallow using temporal MVs while large_scale_tile = 1.
-  oxcf->allow_ref_frame_mvs =
+  tool_cfg->enable_ref_frame_mvs =
       extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
-  oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
-  oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
-
-  oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
-  oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
-  oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator;
-  if (oxcf->resize_mode == RESIZE_FIXED &&
-      oxcf->resize_scale_denominator == SCALE_NUMERATOR &&
-      oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR)
-    oxcf->resize_mode = RESIZE_NONE;
-
-  if (extra_cfg->lossless || cfg->large_scale_tile) {
-    disable_superres(oxcf);
-  } else {
-    oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
-    oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
-    oxcf->superres_kf_scale_denominator =
-        (uint8_t)cfg->rc_superres_kf_denominator;
-    oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
-    oxcf->superres_kf_qthresh =
-        av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
-    if (oxcf->superres_mode == SUPERRES_FIXED &&
-        oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
-        oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) {
-      disable_superres(oxcf);
-    }
-    if (oxcf->superres_mode == SUPERRES_QTHRESH &&
-        oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) {
-      disable_superres(oxcf);
-    }
-  }
-
-  oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
-
-  oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
-
-  oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
-  oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
-  oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
-
-  oxcf->auto_key =
+  tool_cfg->superblock_size = extra_cfg->superblock_size;
+  tool_cfg->enable_monochrome = cfg->monochrome;
+  tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr;
+  tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter;
+  tool_cfg->enable_order_hint = extra_cfg->enable_order_hint;
+  tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp;
+  tool_cfg->ref_frame_mvs_present =
+      extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+  tool_cfg->enable_global_motion = extra_cfg->enable_global_motion;
+  tool_cfg->error_resilient_mode =
+      cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+  tool_cfg->frame_parallel_decoding_mode =
+      extra_cfg->frame_parallel_decoding_mode;
+
+  // Set Quantization related configuration.
+  q_cfg->using_qm = extra_cfg->enable_qm;
+  q_cfg->qm_minlevel = extra_cfg->qm_min;
+  q_cfg->qm_maxlevel = extra_cfg->qm_max;
+  q_cfg->quant_b_adapt = extra_cfg->quant_b_adapt;
+  q_cfg->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
+  q_cfg->aq_mode = extra_cfg->aq_mode;
+  q_cfg->deltaq_mode = extra_cfg->deltaq_mode;
+  q_cfg->deltaq_strength = extra_cfg->deltaq_strength;
+  q_cfg->use_fixed_qp_offsets =
+      cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q);
+  q_cfg->enable_hdr_deltaq =
+      (q_cfg->deltaq_mode == DELTA_Q_HDR) &&
+      (cfg->g_bit_depth == AOM_BITS_10) &&
+      (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020);
+
+  tool_cfg->enable_deltalf_mode =
+      (q_cfg->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
+
+  // Set cost update frequency configuration.
+  oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
+  oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
+  // Avoid MV cost update for allintra encoding mode.
+  oxcf->cost_upd_freq.mv = (cfg->kf_max_dist != 0)
+                               ? (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq
+                               : COST_UPD_OFF;
+  oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq;
+
+  // Set frame resize mode configuration.
+  resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
+  resize_cfg->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
+  resize_cfg->resize_kf_scale_denominator =
+      (uint8_t)cfg->rc_resize_kf_denominator;
+  if (resize_cfg->resize_mode == RESIZE_FIXED &&
+      resize_cfg->resize_scale_denominator == SCALE_NUMERATOR &&
+      resize_cfg->resize_kf_scale_denominator == SCALE_NUMERATOR)
+    resize_cfg->resize_mode = RESIZE_NONE;
+
+  // Set encoder algorithm related configuration.
+  algo_cfg->enable_overlay = extra_cfg->enable_overlay;
+  algo_cfg->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+  algo_cfg->sharpness = extra_cfg->sharpness;
+  algo_cfg->arnr_max_frames = extra_cfg->arnr_max_frames;
+  algo_cfg->arnr_strength = extra_cfg->arnr_strength;
+  algo_cfg->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
+  // TODO(any): Fix and Enable TPL for resize-mode > 0
+  algo_cfg->enable_tpl_model =
+      resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model;
+  algo_cfg->loopfilter_control = extra_cfg->loopfilter_control;
+
+  // Set two-pass stats configuration.
+  oxcf->twopass_stats_in = cfg->rc_twopass_stats_in;
+
+  if (extra_cfg->two_pass_output)
+    oxcf->two_pass_output = extra_cfg->two_pass_output;
+
+  oxcf->second_pass_log = extra_cfg->second_pass_log;
+
+  // Set Key frame configuration.
+  kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled;
+  kf_cfg->auto_key =
       cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+  kf_cfg->key_freq_min = cfg->kf_min_dist;
+  kf_cfg->key_freq_max = cfg->kf_max_dist;
+  kf_cfg->sframe_dist = cfg->sframe_dist;
+  kf_cfg->sframe_mode = cfg->sframe_mode;
+  kf_cfg->enable_sframe = extra_cfg->s_frame_mode;
+  kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+  kf_cfg->fwd_kf_dist = extra_cfg->fwd_kf_dist;
+  // Disable key frame filtering in all intra mode.
+  if (cfg->kf_max_dist == 0) {
+    kf_cfg->enable_keyframe_filtering = 0;
+  }
+  kf_cfg->enable_intrabc = extra_cfg->enable_intrabc;
 
-  oxcf->key_freq = cfg->kf_max_dist;
-  oxcf->sframe_dist = cfg->sframe_dist;
-  oxcf->sframe_mode = cfg->sframe_mode;
-  oxcf->sframe_enabled = cfg->sframe_dist != 0;
   oxcf->speed = extra_cfg->cpu_used;
-  oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
-  oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
-  oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
-  oxcf->sharpness = extra_cfg->sharpness;
-
-  oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
-
-  oxcf->color_primaries = extra_cfg->color_primaries;
-  oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
-  oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
-  oxcf->chroma_sample_position = extra_cfg->chroma_sample_position;
-
-  oxcf->color_range = extra_cfg->color_range;
-  oxcf->render_width = extra_cfg->render_width;
-  oxcf->render_height = extra_cfg->render_height;
-  oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
-  oxcf->arnr_strength = extra_cfg->arnr_strength;
-  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
-  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
-  oxcf->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
-  oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
-
-  oxcf->tuning = extra_cfg->tuning;
-  oxcf->vmaf_model_path = extra_cfg->vmaf_model_path;
-  oxcf->content = extra_cfg->content;
-  oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
-  oxcf->superblock_size = extra_cfg->superblock_size;
+
+  // Set Color related configuration.
+  color_cfg->color_primaries = extra_cfg->color_primaries;
+  color_cfg->transfer_characteristics = extra_cfg->transfer_characteristics;
+  color_cfg->matrix_coefficients = extra_cfg->matrix_coefficients;
+  color_cfg->color_range = extra_cfg->color_range;
+  color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position;
+
+  // Set Group of frames configuration.
+  // Force lag_in_frames to 0 for REALTIME mode
+  gf_cfg->lag_in_frames = (oxcf->mode == REALTIME)
+                              ? 0
+                              : clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
+  gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+  gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+  gf_cfg->min_gf_interval = extra_cfg->min_gf_interval;
+  gf_cfg->max_gf_interval = extra_cfg->max_gf_interval;
+  gf_cfg->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
+  gf_cfg->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
+
+  // Set tune related configuration.
+  tune_cfg->tuning = extra_cfg->tuning;
+  tune_cfg->vmaf_model_path = extra_cfg->vmaf_model_path;
+  tune_cfg->content = extra_cfg->content;
   if (cfg->large_scale_tile) {
-    oxcf->film_grain_test_vector = 0;
-    oxcf->film_grain_table_filename = NULL;
+    tune_cfg->film_grain_test_vector = 0;
+    tune_cfg->film_grain_table_filename = NULL;
   } else {
-    oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
-    oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+    tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+    tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename;
   }
+  tune_cfg->dist_metric = extra_cfg->dist_metric;
 #if CONFIG_DENOISE
   oxcf->noise_level = extra_cfg->noise_level;
   oxcf->noise_block_size = extra_cfg->noise_block_size;
+  oxcf->enable_dnl_denoising = extra_cfg->enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  // Temporal denoiser is for nonrd pickmode so disable it for speed < 7.
+  // Also disable it for speed 7 for now since it needs to be modified for
+  // the check_partition_merge_mode feature.
+  if (cfg->g_bit_depth == AOM_BITS_8 && oxcf->speed > 7) {
+    oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+  } else {
+    oxcf->noise_sensitivity = 0;
+  }
 #endif
-  oxcf->large_scale_tile = cfg->large_scale_tile;
-  oxcf->single_tile_decoding =
-      (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
-  if (oxcf->large_scale_tile) {
+  // Set Tile related configuration.
+  tile_cfg->num_tile_groups = extra_cfg->num_tg;
+  // In large-scale tile encoding mode, num_tile_groups is always 1.
+  if (cfg->large_scale_tile) tile_cfg->num_tile_groups = 1;
+  tile_cfg->mtu = extra_cfg->mtu_size;
+  tile_cfg->enable_large_scale_tile = cfg->large_scale_tile;
+  tile_cfg->enable_single_tile_decoding =
+      (tile_cfg->enable_large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
+  tile_cfg->tile_columns = extra_cfg->tile_columns;
+  tile_cfg->tile_rows = extra_cfg->tile_rows;
+  tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
+  tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
+  for (int i = 0; i < tile_cfg->tile_width_count; i++) {
+    tile_cfg->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
+  }
+  for (int i = 0; i < tile_cfg->tile_height_count; i++) {
+    tile_cfg->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
+  }
+  tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug;
+
+  if (tile_cfg->enable_large_scale_tile) {
     // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
-    // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If
-    // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+    // AOM_SUPERBLOCK_SIZE_128X128 while tile_cfg->enable_large_scale_tile = 1.
+    // If superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
     // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
     if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
         extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
-      oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+      tool_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
   }
 
+  // Set reference frame related configuration.
+  oxcf->ref_frm_cfg.max_reference_frames = extra_cfg->max_reference_frames;
+  oxcf->ref_frm_cfg.enable_reduced_reference_set =
+      extra_cfg->enable_reduced_reference_set;
+  oxcf->ref_frm_cfg.enable_onesided_comp = extra_cfg->enable_onesided_comp;
+
   oxcf->row_mt = extra_cfg->row_mt;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  oxcf->fp_mt = extra_cfg->fp_mt;
+#endif
 
-  oxcf->tile_columns = extra_cfg->tile_columns;
-  oxcf->tile_rows = extra_cfg->tile_rows;
-
-  oxcf->monochrome = cfg->monochrome;
-  oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
-  oxcf->enable_dual_filter = extra_cfg->enable_dual_filter;
-  oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions;
-  oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions;
-  oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
-  oxcf->min_partition_size = extra_cfg->min_partition_size;
-  oxcf->max_partition_size = extra_cfg->max_partition_size;
-  oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter;
-  oxcf->enable_tx64 = extra_cfg->enable_tx64;
-  oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx;
-  oxcf->enable_order_hint = extra_cfg->enable_order_hint;
-  oxcf->enable_dist_wtd_comp =
+  // Set motion mode related configuration.
+  oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc;
+  oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion;
+#if !CONFIG_REALTIME_ONLY
+  oxcf->motion_mode_cfg.allow_warped_motion =
+      (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+#else
+  oxcf->motion_mode_cfg.allow_warped_motion =
+      (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7)
+          ? false
+          : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+#endif
+
+  // Set partition related configuration.
+  part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions;
+  part_cfg->enable_ab_partitions = extra_cfg->enable_ab_partitions;
+  part_cfg->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
+  part_cfg->min_partition_size = extra_cfg->min_partition_size;
+  part_cfg->max_partition_size = extra_cfg->max_partition_size;
+
+  // Set intra mode configuration.
+  intra_mode_cfg->enable_angle_delta = extra_cfg->enable_angle_delta;
+  intra_mode_cfg->enable_intra_edge_filter =
+      extra_cfg->enable_intra_edge_filter;
+  intra_mode_cfg->enable_filter_intra = extra_cfg->enable_filter_intra;
+  intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra;
+  intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra;
+  intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra;
+  intra_mode_cfg->enable_directional_intra =
+      extra_cfg->enable_directional_intra;
+  intra_mode_cfg->enable_diagonal_intra = extra_cfg->enable_diagonal_intra;
+  intra_mode_cfg->auto_intra_tools_off = extra_cfg->auto_intra_tools_off;
+
+  // Set transform size/type configuration.
+  txfm_cfg->enable_tx64 = extra_cfg->enable_tx64;
+  txfm_cfg->enable_flip_idtx = extra_cfg->enable_flip_idtx;
+  txfm_cfg->enable_rect_tx = extra_cfg->enable_rect_tx;
+  txfm_cfg->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+  txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+  txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+  txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+  txfm_cfg->enable_tx_size_search = extra_cfg->enable_tx_size_search;
+
+  // Set compound type configuration.
+  comp_type_cfg->enable_dist_wtd_comp =
       extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
-  oxcf->max_reference_frames = extra_cfg->max_reference_frames;
-  oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set;
-  oxcf->enable_masked_comp = extra_cfg->enable_masked_comp;
-  oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp;
-  oxcf->enable_diff_wtd_comp =
+  comp_type_cfg->enable_masked_comp = extra_cfg->enable_masked_comp;
+  comp_type_cfg->enable_diff_wtd_comp =
       extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
-  oxcf->enable_interinter_wedge =
+  comp_type_cfg->enable_interinter_wedge =
       extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
-  oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp;
-  oxcf->enable_smooth_interintra =
+  comp_type_cfg->enable_smooth_interintra =
       extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
-  oxcf->enable_interintra_wedge =
+  comp_type_cfg->enable_interintra_wedge =
       extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
-  oxcf->enable_ref_frame_mvs =
-      extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
-
-  oxcf->enable_global_motion = extra_cfg->enable_global_motion;
-  oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
-  oxcf->allow_warped_motion =
-      (cfg->g_usage == AOM_USAGE_REALTIME)
-          ? 0
-          : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
-  oxcf->enable_filter_intra = extra_cfg->enable_filter_intra;
-  oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra;
-  oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra;
-  oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra;
 
-  oxcf->enable_superres =
-      (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
-  if (!oxcf->enable_superres) {
-    disable_superres(oxcf);
+  // Set Super-resolution mode configuration.
+  if (extra_cfg->lossless || cfg->large_scale_tile) {
+    disable_superres(superres_cfg);
+  } else {
+    superres_cfg->superres_mode = cfg->rc_superres_mode;
+    superres_cfg->superres_scale_denominator =
+        (uint8_t)cfg->rc_superres_denominator;
+    superres_cfg->superres_kf_scale_denominator =
+        (uint8_t)cfg->rc_superres_kf_denominator;
+    superres_cfg->superres_qthresh =
+        av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+    superres_cfg->superres_kf_qthresh =
+        av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+    if (superres_cfg->superres_mode == AOM_SUPERRES_FIXED &&
+        superres_cfg->superres_scale_denominator == SCALE_NUMERATOR &&
+        superres_cfg->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+      disable_superres(superres_cfg);
+    }
+    if (superres_cfg->superres_mode == AOM_SUPERRES_QTHRESH &&
+        superres_cfg->superres_qthresh == 255 &&
+        superres_cfg->superres_kf_qthresh == 255) {
+      disable_superres(superres_cfg);
+    }
   }
 
-  oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
-  oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
-  for (int i = 0; i < oxcf->tile_width_count; i++) {
-    oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
-  }
-  for (int i = 0; i < oxcf->tile_height_count; i++) {
-    oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
-  }
-  oxcf->error_resilient_mode =
-      cfg->g_error_resilient | extra_cfg->error_resilient_mode;
-  oxcf->s_frame_mode = extra_cfg->s_frame_mode;
-  oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
-  if (cfg->g_pass == AOM_RC_LAST_PASS) {
-    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
-    oxcf->limit = n_packets - 1;
-  } else {
-    oxcf->limit = cfg->g_limit;
+  superres_cfg->enable_superres =
+      (superres_cfg->superres_mode != AOM_SUPERRES_NONE) &&
+      extra_cfg->enable_superres;
+  if (!superres_cfg->enable_superres) {
+    disable_superres(superres_cfg);
   }
 
-  if (oxcf->limit == 1) {
+  if (input_cfg->limit == 1) {
     // still picture mode, display model and timing is meaningless
-    oxcf->display_model_info_present_flag = 0;
-    oxcf->timing_info_present = 0;
+    dec_model_cfg->display_model_info_present_flag = 0;
+    dec_model_cfg->timing_info_present = 0;
   }
 
-  oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
-  oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
-
-  oxcf->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
-  oxcf->aq_mode = extra_cfg->aq_mode;
-  oxcf->deltaq_mode = extra_cfg->deltaq_mode;
-
-  oxcf->deltalf_mode =
-      (oxcf->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
-
   oxcf->save_as_annexb = cfg->save_as_annexb;
 
-  oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
-  oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
-  oxcf->sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test;
-  oxcf->ext_tile_debug = extra_cfg->ext_tile_debug;
-
-  oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
-  oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
-  oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode)
-                               ? AOM_BORDER_IN_PIXELS
-                               : AOM_ENC_NO_SCALE_BORDER;
+  // Set unit test related configuration.
+  oxcf->unit_test_cfg.motion_vector_unit_test =
+      extra_cfg->motion_vector_unit_test;
+  oxcf->unit_test_cfg.sb_multipass_unit_test =
+      extra_cfg->sb_multipass_unit_test;
+
+  // For allintra encoding mode, inter-frame motion search is not applicable and
+  // the intraBC motion vectors are restricted within the tile boundaries. Hence
+  // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case.
+  oxcf->border_in_pixels =
+      (resize_cfg->resize_mode || superres_cfg->superres_mode)
+          ? AOM_BORDER_IN_PIXELS
+          : (oxcf->kf_cfg.key_freq_max == 0) ? AOM_ENC_ALLINTRA_BORDER
+                                             : AOM_ENC_NO_SCALE_BORDER;
   memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
          sizeof(oxcf->target_seq_level_idx));
   oxcf->tier_mask = extra_cfg->tier_mask;
 
-  oxcf->use_fixed_qp_offsets =
-      cfg->use_fixed_qp_offsets && (oxcf->rc_mode == AOM_Q);
-  for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
-    if (oxcf->use_fixed_qp_offsets) {
-      if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
-        oxcf->fixed_qp_offsets[i] = convert_qp_offset(
-            oxcf->cq_level, cfg->fixed_qp_offsets[i], oxcf->bit_depth);
-      } else {  // auto-selected qp offset
-        oxcf->fixed_qp_offsets[i] =
-            get_modeled_qp_offset(oxcf->cq_level, i, oxcf->bit_depth);
-      }
-    } else {
-      oxcf->fixed_qp_offsets[i] = -1.0;
-    }
-  }
+  oxcf->partition_info_path = extra_cfg->partition_info_path;
+
+  oxcf->strict_level_conformance = extra_cfg->strict_level_conformance;
 
-  oxcf->min_cr = extra_cfg->min_cr;
   return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
                                           const aom_codec_enc_cfg_t *cfg) {
+  InitialDimensions *const initial_dimensions =
+      &ctx->ppi->cpi->initial_dimensions;
   aom_codec_err_t res;
   int force_key = 0;
 
@@ -1036,8 +1437,10 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
     if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
       ERROR("Cannot change width or height after initialization");
     if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
-        (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
-        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+        (initial_dimensions->width &&
+         (int)cfg->g_w > initial_dimensions->width) ||
+        (initial_dimensions->height &&
+         (int)cfg->g_h > initial_dimensions->height))
       force_key = 1;
   }
 
@@ -1058,8 +1461,21 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
     ctx->cfg = *cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
     // On profile change, request a key frame
-    force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
-    av1_change_config(ctx->cpi, &ctx->oxcf);
+    force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile;
+    bool is_sb_size_changed = false;
+    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < ctx->ppi->num_fp_contexts; i++) {
+      av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf,
+                        is_sb_size_changed);
+    }
+#else
+    av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (ctx->ppi->cpi_lap != NULL) {
+      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed);
+    }
   }
 
   if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
@@ -1068,14 +1484,14 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
 }
 
 static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
-  return av1_get_global_headers(ctx->cpi);
+  return av1_get_global_headers(ctx->ppi);
 }
 
 static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  *arg = av1_get_quantizer(ctx->cpi);
+  *arg = av1_get_quantizer(ctx->ppi->cpi);
   return AOM_CODEC_OK;
 }
 
@@ -1083,7 +1499,23 @@ static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->cpi));
+  *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->ppi->cpi));
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_loopfilter_level(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg = ctx->ppi->cpi->common.lf.filter_level[0];
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  *arg = ctx->ppi->p_rc.baseline_gf_interval;
   return AOM_CODEC_OK;
 }
 
@@ -1093,7 +1525,23 @@ static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
   if (res == AOM_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-    av1_change_config(ctx->cpi, &ctx->oxcf);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
+#endif
+    bool is_sb_size_changed = false;
+    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < ctx->ppi->num_fp_contexts; i++) {
+      av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf,
+                        is_sb_size_changed);
+    }
+#else
+    av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (ctx->ppi->cpi_lap != NULL) {
+      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed);
+    }
   }
   return res;
 }
@@ -1164,7 +1612,13 @@ static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args);
+  const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args);
+#if CONFIG_REALTIME_ONLY
+  if (tpl_model_arg) {
+    ERROR("TPL model can't be turned on in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_tpl_model = tpl_model_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1244,7 +1698,13 @@ static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
                                                    va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+  const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+#if CONFIG_REALTIME_ONLY
+  if (restoration_arg) {
+    ERROR("Restoration can't be turned on in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_restoration = restoration_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1258,7 +1718,13 @@ static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args);
+  const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args);
+#if CONFIG_REALTIME_ONLY
+  if (obmc_arg) {
+    ERROR("OBMC can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_obmc = obmc_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1403,6 +1869,13 @@ static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_rect_tx(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_rect_tx = CAST(AV1E_SET_ENABLE_RECT_TX, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1495,14 +1968,26 @@ static aom_codec_err_t ctrl_set_enable_interintra_wedge(
 static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+  const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (global_motion_arg) {
+    ERROR("Global motion can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_global_motion = global_motion_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
 static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+  const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (warped_motion_arg) {
+    ERROR("Warped motion can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_warped_motion = warped_motion_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1527,6 +2012,21 @@ static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_directional_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_directional_intra =
+      CAST(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_diagonal_intra(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_diagonal_intra = CAST(AV1E_SET_ENABLE_DIAGONAL_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx,
                                                    va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1641,6 +2141,13 @@ static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_tx_size_search(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_tx_size_search = CAST(AV1E_SET_ENABLE_TX_SIZE_SEARCH, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
                                               va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1648,6 +2155,13 @@ static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_vbr_corpus_complexity_lap(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.vbr_corpus_complexity_lap =
+      CAST(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
 static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1669,10 +2183,32 @@ static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.dv_cost_upd_freq = CAST(AV1E_SET_DV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.vmaf_model_path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  const char *str = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  const aom_codec_err_t ret = allocate_and_set_string(
+      str, default_extra_cfg.vmaf_model_path, &extra_cfg.vmaf_model_path,
+      ctx->ppi->error.detail);
+  if (ret != AOM_CODEC_OK) return ret;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  const char *str = CAST(AV1E_SET_PARTITION_INFO_PATH, args);
+  const aom_codec_err_t ret = allocate_and_set_string(
+      str, default_extra_cfg.partition_info_path,
+      &extra_cfg.partition_info_path, ctx->ppi->error.detail);
+  if (ret != AOM_CODEC_OK) return ret;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1687,7 +2223,16 @@ static aom_codec_err_t ctrl_set_film_grain_test_vector(
 static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  const char *str = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  if (str == NULL) {
+    // this parameter allows NULL as its value
+    extra_cfg.film_grain_table_filename = str;
+  } else {
+    const aom_codec_err_t ret = allocate_and_set_string(
+        str, default_extra_cfg.film_grain_table_filename,
+        &extra_cfg.film_grain_table_filename, ctx->ppi->error.detail);
+    if (ret != AOM_CODEC_OK) return ret;
+  }
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1718,10 +2263,36 @@ static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
 #endif
 }
 
+static aom_codec_err_t ctrl_set_enable_dnl_denoising(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_dnl_denoising = CAST(AV1E_SET_ENABLE_DNL_DENOISING, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+  const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args);
+#if CONFIG_REALTIME_ONLY
+  if (deltaq_arg > NO_DELTA_Q) {
+    ERROR("Delta Q mode can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.deltaq_mode = deltaq_arg;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_deltaq_strength(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltaq_strength = CAST(AV1E_SET_DELTAQ_STRENGTH, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1775,6 +2346,22 @@ static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_enable_fpmt_unit_test(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+#if !(CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST)
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.fpmt_unit_test = CAST(AV1E_SET_FP_MT_UNIT_TEST, args);
+  ctx->ppi->fpmt_unit_test_cfg = (extra_cfg.fpmt_unit_test == 1)
+                                     ? PARALLEL_ENCODE
+                                     : PARALLEL_SIMULATION_ENCODE;
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
 static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1817,6 +2404,32 @@ static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args);
+  aom_ext_part_config_t config;
+  // TODO(chengchen): verify the sb_size has been set at this point.
+  config.superblock_size = cpi->common.seq_params->sb_size;
+  const aom_codec_err_t status =
+      av1_ext_part_create(funcs, config, &cpi->ext_part_controller);
+  return status;
+}
+
+static aom_codec_err_t ctrl_set_loopfilter_control(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.loopfilter_control = CAST(AV1E_SET_LOOPFILTER_CONTROL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rtc_external_rc(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->ppi->cpi->rc.rtc_external_ratectrl =
+      CAST(AV1E_SET_RTC_EXTERNAL_RC, args);
+  return AOM_CODEC_OK;
+}
+
 #if !CONFIG_REALTIME_ONLY
 static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
                                            STATS_BUFFER_CTX *stats_buf_context,
@@ -1844,31 +2457,68 @@ static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
 #endif
 
 static aom_codec_err_t create_context_and_bufferpool(
-    AV1_COMP **p_cpi, BufferPool **p_buffer_pool, AV1EncoderConfig *oxcf,
-    struct aom_codec_pkt_list *pkt_list_head, FIRSTPASS_STATS *frame_stats_buf,
-    COMPRESSOR_STAGE stage, int num_lap_buffers, int lap_lag_in_frames,
-    STATS_BUFFER_CTX *stats_buf_context) {
+    AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool,
+    AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
-  *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
-  if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+  if (*p_buffer_pool == NULL) {
+    *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+    if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
 
 #if CONFIG_MULTITHREAD
-  if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
-    return AOM_CODEC_MEM_ERROR;
-  }
+    if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
+      return AOM_CODEC_MEM_ERROR;
+    }
 #endif
-  *p_cpi = av1_create_compressor(oxcf, *p_buffer_pool, frame_stats_buf, stage,
-                                 num_lap_buffers, lap_lag_in_frames,
-                                 stats_buf_context);
-  if (*p_cpi == NULL)
-    res = AOM_CODEC_MEM_ERROR;
-  else
-    (*p_cpi)->output_pkt_list = pkt_list_head;
+  }
+  *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, stage,
+                                 lap_lag_in_frames);
+  if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR;
 
   return res;
 }
 
+static aom_codec_err_t ctrl_set_fp_mt(aom_codec_alg_priv_t *ctx, va_list args) {
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.fp_mt = CAST(AV1E_SET_FP_MT, args);
+  const aom_codec_err_t result = update_extra_cfg(ctx, &extra_cfg);
+  int num_fp_contexts = 1;
+  if (ctx->ppi->num_fp_contexts == 1) {
+    num_fp_contexts =
+        av1_compute_num_fp_contexts(ctx->ppi, &ctx->ppi->parallel_cpi[0]->oxcf);
+    if (num_fp_contexts > 1) {
+      int i;
+      for (i = 1; i < num_fp_contexts; i++) {
+        int res = create_context_and_bufferpool(
+            ctx->ppi, &ctx->ppi->parallel_cpi[i], &ctx->buffer_pool, &ctx->oxcf,
+            ENCODE_STAGE, -1);
+        if (res != AOM_CODEC_OK) {
+          return res;
+        }
+#if !CONFIG_REALTIME_ONLY
+        ctx->ppi->parallel_cpi[i]->twopass_frame.stats_in =
+            ctx->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+      }
+    }
+  }
+  ctx->ppi->num_fp_contexts = num_fp_contexts;
+  return result;
+#endif
+}
+
+static aom_codec_err_t ctrl_set_auto_intra_tools_off(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.auto_intra_tools_off = CAST(AV1E_SET_AUTO_INTRA_TOOLS_OFF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
@@ -1879,14 +2529,21 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
     ctx->priv = (aom_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
 
-    if (ctx->config.enc) {
-      // Update the reference to the config structure to an internal copy.
-      priv->cfg = *ctx->config.enc;
-      ctx->config.enc = &priv->cfg;
-    }
+    // Update the reference to the config structure to an internal copy.
+    assert(ctx->config.enc);
+    priv->cfg = *ctx->config.enc;
+    ctx->config.enc = &priv->cfg;
 
     priv->extra_cfg = default_extra_cfg;
-    aom_once(av1_initialize_enc);
+    // Special handling:
+    // By default, if omitted, --enable-cdef = 1.
+    // Here we set its default value to 0 when --allintra is turned on.
+    // However, if users set --enable-cdef = 1 from command line,
+    // The encoder still respects it.
+    if (priv->cfg.g_usage == ALLINTRA) {
+      priv->extra_cfg.enable_cdef = 0;
+    }
+    av1_initialize_enc(priv->cfg.g_usage, priv->cfg.rc_end_usage);
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
@@ -1900,14 +2557,13 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
       reduce_ratio(&priv->timestamp_ratio);
 
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
-      if (priv->oxcf.rc_mode == AOM_Q && priv->oxcf.pass == 0 &&
-          priv->oxcf.mode == GOOD) {
-        // Enable look ahead
-        *num_lap_buffers = priv->cfg.g_lag_in_frames;
+      if (priv->oxcf.rc_cfg.mode != AOM_CBR &&
+          priv->oxcf.pass == AOM_RC_ONE_PASS && priv->oxcf.mode == GOOD) {
+        // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR
         *num_lap_buffers =
-            clamp(*num_lap_buffers, 1,
-                  AOMMIN(MAX_LAP_BUFFERS,
-                         priv->oxcf.key_freq + SCENE_CUT_KEY_TEST_INTERVAL));
+            AOMMIN((int)priv->cfg.g_lag_in_frames,
+                   AOMMIN(MAX_LAP_BUFFERS, priv->oxcf.kf_cfg.key_freq_max +
+                                               SCENE_CUT_KEY_TEST_INTERVAL));
         if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
             LAP_LAG_IN_FRAMES) {
           lap_lag_in_frames = LAP_LAG_IN_FRAMES;
@@ -1916,24 +2572,51 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
 
+      priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head,
+                                                *num_lap_buffers, &priv->oxcf);
+      if (!priv->ppi) return AOM_CODEC_MEM_ERROR;
+
 #if !CONFIG_REALTIME_ONLY
       res = create_stats_buffer(&priv->frame_stats_buffer,
                                 &priv->stats_buf_context, *num_lap_buffers);
       if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+
+      assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+      int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS);
+      for (int i = 0; i < size; i++)
+        priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i];
+
+      priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context;
 #endif
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      assert(priv->ppi->num_fp_contexts >= 1);
       res = create_context_and_bufferpool(
-          &priv->cpi, &priv->buffer_pool, &priv->oxcf, &priv->pkt_list.head,
-          priv->frame_stats_buffer, ENCODE_STAGE, *num_lap_buffers, -1,
-          &priv->stats_buf_context);
+          priv->ppi, &priv->ppi->parallel_cpi[0], &priv->buffer_pool,
+          &priv->oxcf, ENCODE_STAGE, -1);
+      if (res != AOM_CODEC_OK) {
+        return res;
+      }
+#if !CONFIG_REALTIME_ONLY
+      priv->ppi->parallel_cpi[0]->twopass_frame.stats_in =
+          priv->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+      priv->ppi->cpi = priv->ppi->parallel_cpi[0];
+#else
+      res = create_context_and_bufferpool(priv->ppi, &priv->ppi->cpi,
+                                          &priv->buffer_pool, &priv->oxcf,
+                                          ENCODE_STAGE, -1);
+#if !CONFIG_REALTIME_ONLY
+      priv->ppi->cpi->twopass_frame.stats_in =
+          priv->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
       // Create another compressor if look ahead is enabled
       if (res == AOM_CODEC_OK && *num_lap_buffers) {
         res = create_context_and_bufferpool(
-            &priv->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, NULL,
-            priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers,
-            clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS),
-            &priv->stats_buf_context);
+            priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf,
+            LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS));
       }
     }
   }
@@ -1942,12 +2625,16 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
 }
 
 static void destroy_context_and_bufferpool(AV1_COMP *cpi,
-                                           BufferPool *buffer_pool) {
+                                           BufferPool **p_buffer_pool) {
   av1_remove_compressor(cpi);
+  if (*p_buffer_pool) {
+    av1_free_ref_frame_buffers(*p_buffer_pool);
 #if CONFIG_MULTITHREAD
-  if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex);
+    pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex);
 #endif
-  aom_free(buffer_pool);
+    aom_free(*p_buffer_pool);
+    *p_buffer_pool = NULL;
+  }
 }
 
 static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
@@ -1957,15 +2644,62 @@ static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
   aom_free(frame_stats_buffer);
 }
 
+static void check_and_free_string(const char *default_str, const char **ptr) {
+  if (*ptr == default_str) {
+    // Default should be a literal. Do not free.
+    return;
+  }
+  aom_free((void *)*ptr);
+  *ptr = NULL;
+}
+
+static void destroy_extra_config(struct av1_extracfg *extra_cfg) {
+#if CONFIG_TUNE_VMAF
+  check_and_free_string(default_extra_cfg.vmaf_model_path,
+                        &extra_cfg->vmaf_model_path);
+#endif
+  check_and_free_string(default_extra_cfg.two_pass_output,
+                        &extra_cfg->two_pass_output);
+  check_and_free_string(default_extra_cfg.two_pass_output,
+                        &extra_cfg->second_pass_log);
+  check_and_free_string(default_extra_cfg.partition_info_path,
+                        &extra_cfg->partition_info_path);
+  check_and_free_string(default_extra_cfg.film_grain_table_filename,
+                        &extra_cfg->film_grain_table_filename);
+}
+
 static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
-  destroy_context_and_bufferpool(ctx->cpi, ctx->buffer_pool);
-  if (ctx->cpi_lap) {
-    // As both cpi and cpi_lap have the same lookahead_ctx, it is already freed
-    // when destroy is called on cpi. Thus, setting lookahead_ctx to null here,
-    // so that it doesn't attempt to free it again.
-    ctx->cpi_lap->lookahead = NULL;
-    destroy_context_and_bufferpool(ctx->cpi_lap, ctx->buffer_pool_lap);
+  destroy_extra_config(&ctx->extra_cfg);
+
+  if (ctx->ppi) {
+    AV1_PRIMARY *ppi = ctx->ppi;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    for (int i = 0; i < MAX_PARALLEL_FRAMES - 1; i++) {
+      if (ppi->parallel_frames_data[i].cx_data) {
+        free(ppi->parallel_frames_data[i].cx_data);
+      }
+    }
+#endif
+#if CONFIG_ENTROPY_STATS
+    print_entropy_stats(ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+    print_internal_stats(ppi);
+#endif
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < MAX_PARALLEL_FRAMES; i++) {
+      destroy_context_and_bufferpool(ppi->parallel_cpi[i], &ctx->buffer_pool);
+    }
+    ppi->cpi = NULL;
+#else
+    destroy_context_and_bufferpool(ppi->cpi, &ctx->buffer_pool);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (ppi->cpi_lap) {
+      destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap);
+    }
+    av1_remove_primary_compressor(ppi);
   }
   destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
   aom_free(ctx);
@@ -1975,7 +2709,6 @@ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
 static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
                                                    unsigned int lib_flags) {
   aom_codec_frame_flags_t flags = lib_flags << 16;
-
   if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
   if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY;
   if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH;
@@ -1995,45 +2728,68 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                       aom_enc_frame_flags_t enc_flags) {
   const size_t kMinCompressedSize = 8192;
   volatile aom_codec_err_t res = AOM_CODEC_OK;
-  AV1_COMP *const cpi = ctx->cpi;
-  const aom_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+  AV1_PRIMARY *const ppi = ctx->ppi;
   volatile aom_codec_pts_t ptsvol = pts;
-  // LAP context
-  AV1_COMP *cpi_lap = ctx->cpi_lap;
+  AV1_COMP_DATA cpi_data = { 0 };
 
-  if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+  cpi_data.timestamp_ratio = &ctx->timestamp_ratio;
+  cpi_data.flush = !img;
+  // LAP context
+  AV1_COMP *cpi_lap = ppi->cpi_lap;
+  if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM;
 
-  if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0)
+  if (ppi->lap_enabled && cpi_lap == NULL &&
+      ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS)
     return AOM_CODEC_INVALID_PARAM;
 
   if (img != NULL) {
     res = validate_img(ctx, img);
-    // TODO(jzern) the checks related to cpi's validity should be treated as a
-    // failure condition, encoder setup is done fully in init() currently.
     if (res == AOM_CODEC_OK) {
-      size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
-                       ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
+      const size_t uncompressed_frame_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+                                           ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) *
+                                           get_image_bps(img) / 8;
+
+      // Due to the presence of no-show frames, the ctx->cx_data buffer holds
+      // compressed data corresponding to multiple frames. As no-show frames are
+      // not possible for all intra frame encoding with no forward key frames,
+      // the buffer is allocated with a smaller size in this case.
+      //
+      // For pseudo random input, the compressed frame size is seen to exceed
+      // the uncompressed frame size, but is less than 2 times the uncompressed
+      // frame size. Hence the size of the buffer is chosen as 2 times the
+      // uncompressed frame size.
+      int multiplier = 8;
+      if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 &&
+          !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled)
+        multiplier = 2;
+      size_t data_sz = uncompressed_frame_sz * multiplier;
       if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
         ctx->cx_data_sz = data_sz;
         free(ctx->cx_data);
         ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
         if (ctx->cx_data == NULL) {
+          ctx->cx_data_sz = 0;
           return AOM_CODEC_MEM_ERROR;
         }
       }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      for (int i = 0; i < ppi->num_fp_contexts - 1; i++) {
+        if (ppi->parallel_frames_data[i].cx_data == NULL) {
+          ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz;
+          ppi->parallel_frames_data[i].frame_display_order_hint = -1;
+          ppi->parallel_frames_data[i].frame_size = 0;
+          ppi->parallel_frames_data[i].cx_data =
+              (unsigned char *)malloc(ppi->parallel_frames_data[i].cx_data_sz);
+          if (ppi->parallel_frames_data[i].cx_data == NULL) {
+            ppi->parallel_frames_data[i].cx_data_sz = 0;
+            return AOM_CODEC_MEM_ERROR;
+          }
+        }
+      }
+#endif
     }
   }
-  if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
-    ctx->oxcf.mode = GOOD;
-    av1_change_config(ctx->cpi, &ctx->oxcf);
-  }
-
-  if (!ctx->pts_offset_initialized) {
-    ctx->pts_offset = ptsvol;
-    ctx->pts_offset_initialized = 1;
-  }
-  ptsvol -= ctx->pts_offset;
 
   aom_codec_pkt_list_init(&ctx->pkt_list);
 
@@ -2042,36 +2798,37 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cpi->common.error.jmp)) {
-    cpi->common.error.setjmp = 0;
-    res = update_error_state(ctx, &cpi->common.error);
-    aom_clear_system_state();
+  if (setjmp(ppi->error.jmp)) {
+    ppi->error.setjmp = 0;
+    res = update_error_state(ctx, &ppi->error);
     return res;
   }
-  cpi->common.error.setjmp = 1;
-  if (cpi_lap != NULL) {
-    if (setjmp(cpi_lap->common.error.jmp)) {
-      cpi_lap->common.error.setjmp = 0;
-      res = update_error_state(ctx, &cpi_lap->common.error);
-      aom_clear_system_state();
-      return res;
-    }
-    cpi_lap->common.error.setjmp = 1;
-  }
+  ppi->error.setjmp = 1;
+
+  if (ppi->use_svc && ppi->cpi->svc.use_flexible_mode == 0 && flags == 0)
+    av1_set_svc_fixed_mode(ppi->cpi);
 
   // Note(yunqing): While applying encoding flags, always start from enabling
   // all, and then modifying according to the flags. Previous frame's flags are
   // overwritten.
-  av1_apply_encoding_flags(cpi, flags);
+  av1_apply_encoding_flags(ppi->cpi, flags);
   if (cpi_lap != NULL) {
     av1_apply_encoding_flags(cpi_lap, flags);
   }
 
+#if CONFIG_TUNE_VMAF
+  if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+      ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    aom_init_vmaf_model(&ppi->cpi->vmaf_info.vmaf_model,
+                        ppi->cpi->oxcf.tune_cfg.vmaf_model_path);
+  }
+#endif
+
   // Handle fixed keyframe intervals
-  if (is_stat_generation_stage(cpi)) {
+  if (is_stat_generation_stage(ppi->cpi) || is_one_pass_rt_params(ppi->cpi)) {
     if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
         ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
-      if (cpi->common.spatial_layer_id == 0 &&
+      if (ppi->cpi->common.spatial_layer_id == 0 &&
           ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
         flags |= AOM_EFLAG_FORCE_KF;
         ctx->fixed_kf_cntr = 1;
@@ -2080,38 +2837,63 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
   }
 
   if (res == AOM_CODEC_OK) {
-    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
-    int64_t dst_end_time_stamp =
-        timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    AV1_COMP *cpi = ppi->cpi;
+#else
+    AV1_COMP *const cpi = ppi->cpi;
+#endif
 
     // Set up internal flags
-    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1;
 
     if (img != NULL) {
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = ptsvol;
+        ctx->pts_offset_initialized = 1;
+      }
+      ptsvol -= ctx->pts_offset;
+      int64_t src_time_stamp =
+          timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol);
+      int64_t src_end_time_stamp =
+          timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration);
+
       YV12_BUFFER_CONFIG sd;
-      int use_highbitdepth, subsampling_x, subsampling_y;
       res = image2yuvconfig(img, &sd);
-      use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-      subsampling_x = sd.subsampling_x;
-      subsampling_y = sd.subsampling_y;
-
-      if (!cpi->lookahead) {
-        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.lag_in_frames
-                                            : cpi->oxcf.lag_in_frames;
-
-        cpi->lookahead = av1_lookahead_init(
-            cpi->oxcf.width, cpi->oxcf.height, subsampling_x, subsampling_y,
-            use_highbitdepth, lag_in_frames, cpi->oxcf.border_in_pixels,
-            cpi->common.features.byte_alignment, ctx->num_lap_buffers);
+      // When generating a monochrome stream, make |sd| a monochrome image.
+      if (ctx->cfg.monochrome) {
+        sd.u_buffer = sd.v_buffer = NULL;
+        sd.uv_stride = 0;
+        sd.monochrome = 1;
+      }
+      int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+      int subsampling_x = sd.subsampling_x;
+      int subsampling_y = sd.subsampling_y;
+
+      if (!ppi->lookahead) {
+        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.gf_cfg.lag_in_frames
+                                            : cpi->oxcf.gf_cfg.lag_in_frames;
+
+        ppi->lookahead = av1_lookahead_init(
+            cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+            subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames,
+            cpi->oxcf.border_in_pixels, cpi->common.features.byte_alignment,
+            ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0),
+            cpi->oxcf.tool_cfg.enable_global_motion);
       }
-      if (!cpi->lookahead)
-        aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
+      if (!ppi->lookahead)
+        aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate lag buffers");
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      int i;
+      for (i = 0; i < ppi->num_fp_contexts; i++) {
+        av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth,
+                                subsampling_x, subsampling_y);
+      }
+#else
       av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
                               subsampling_y);
+#endif
       if (cpi_lap != NULL) {
-        cpi_lap->lookahead = cpi->lookahead;
         av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
                                 subsampling_y);
       }
@@ -2119,184 +2901,272 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
       if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
-                                dst_time_stamp, dst_end_time_stamp)) {
-        res = update_error_state(ctx, &cpi->common.error);
+                                src_time_stamp, src_end_time_stamp)) {
+        res = update_error_state(ctx, &ppi->error);
       }
       ctx->next_frame_flags = 0;
     }
 
-    unsigned char *cx_data = ctx->cx_data;
-    size_t cx_data_sz = ctx->cx_data_sz;
-
-    assert(!(cx_data == NULL && cx_data_sz != 0));
+    cpi_data.cx_data = ctx->cx_data;
+    cpi_data.cx_data_sz = ctx->cx_data_sz;
 
     /* Any pending invisible frames? */
-    if (ctx->pending_cx_data) {
-      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
-      ctx->pending_cx_data = cx_data;
-      cx_data += ctx->pending_cx_data_sz;
-      cx_data_sz -= ctx->pending_cx_data_sz;
+    if (ctx->pending_cx_data_sz) {
+      cpi_data.cx_data += ctx->pending_cx_data_sz;
+      cpi_data.cx_data_sz -= ctx->pending_cx_data_sz;
 
       /* TODO: this is a minimal check, the underlying codec doesn't respect
        * the buffer size anyway.
        */
-      if (cx_data_sz < ctx->cx_data_sz / 2) {
-        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+      if (cpi_data.cx_data_sz < ctx->cx_data_sz / 2) {
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
                            "Compressed data buffer too small");
       }
     }
 
-    size_t frame_size = 0;
-    unsigned int lib_flags = 0;
     int is_frame_visible = 0;
-    int index_size = 0;
-    int has_fwd_keyframe = 0;
+    int has_no_show_keyframe = 0;
+    int num_workers = 0;
+
+    if (cpi->oxcf.pass == AOM_RC_FIRST_PASS) {
+#if !CONFIG_REALTIME_ONLY
+      num_workers = ppi->p_mt_info.num_mod_workers[MOD_FP] =
+          av1_fp_compute_num_enc_workers(cpi);
+#endif
+    } else {
+      av1_compute_num_workers_for_mt(cpi);
+      num_workers = av1_get_max_num_workers(cpi);
+    }
+    if ((num_workers > 1) && (ppi->p_mt_info.num_workers == 0)) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      // Obtain the maximum no. of frames that can be supported in a parallel
+      // encode set.
+      if (is_stat_consumption_stage(cpi)) {
+        ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
+      }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      av1_create_workers(ppi, num_workers);
+      av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS);
+#if CONFIG_MULTITHREAD
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      for (int i = 0; i < ppi->num_fp_contexts; i++) {
+        av1_init_mt_sync(ppi->parallel_cpi[i],
+                         ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS);
+      }
+#else
+      av1_init_mt_sync(cpi, cpi->oxcf.pass == 1);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      if (cpi_lap != NULL) {
+        av1_init_mt_sync(cpi_lap, 1);
+      }
+#endif  // CONFIG_MULTITHREAD
+    }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    for (int i = 0; i < ppi->num_fp_contexts; i++) {
+      av1_init_frame_mt(ppi, ppi->parallel_cpi[i]);
+    }
+#else
+    av1_init_frame_mt(ppi, cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (cpi_lap != NULL) {
+      av1_init_frame_mt(ppi, cpi_lap);
+    }
 
     // Call for LAP stage
     if (cpi_lap != NULL) {
-      int status;
-      aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
-      int64_t dst_time_stamp_la = dst_time_stamp;
-      int64_t dst_end_time_stamp_la = dst_end_time_stamp;
-      status = av1_get_compressed_data(
-          cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
-          &dst_end_time_stamp_la, !img, &timestamp_ratio_la);
+      AV1_COMP_DATA cpi_lap_data = { 0 };
+      cpi_lap_data.flush = !img;
+      cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio;
+      const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data);
       if (status != -1) {
         if (status != AOM_CODEC_OK) {
-          aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
-        cpi_lap->seq_params_locked = 1;
       }
-      lib_flags = 0;
-      frame_size = 0;
+      av1_post_encode_updates(cpi_lap, &cpi_lap_data);
+    }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Recalculate the maximum number of frames that can be encoded in
+    // parallel at the beginning of sub gop.
+    if (is_stat_consumption_stage(cpi) && ppi->gf_group.size > 0 &&
+        cpi->gf_frame_index == ppi->gf_group.size) {
+      ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
     }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    // Get the next visible frame. Invisible frames get packed with the next
+    // visible frame.
+    while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      int simulate_parallel_frame = 0;
+      int status = -1;
+      cpi->do_frame_data_update = true;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      cpi->ref_idx_to_skip = INVALID_IDX;
+      cpi->ref_refresh_index = INVALID_IDX;
+      cpi->refresh_idx_available = false;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+#if CONFIG_FPMT_TEST
+      simulate_parallel_frame =
+          cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+      if (simulate_parallel_frame) {
+        if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) {
+          if (cpi->gf_frame_index < ppi->gf_group.size) {
+            calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index,
+                                        &cpi->do_frame_data_update);
+          }
+        }
+        status = av1_get_compressed_data(cpi, &cpi_data);
+      }
 
-    // invisible frames get packed with the next visible frame
-    while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
-           !is_frame_visible) {
-      const int status = av1_get_compressed_data(
-          cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp,
-          &dst_end_time_stamp, !img, timestamp_ratio);
+#endif
+      if (!simulate_parallel_frame) {
+        if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+          status = av1_get_compressed_data(cpi, &cpi_data);
+        } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+                   1) {
+          status = av1_compress_parallel_frames(ppi, &cpi_data);
+        } else {
+          cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data);
+          status = AOM_CODEC_OK;
+        }
+      }
+#else
+      const int status = av1_get_compressed_data(cpi, &cpi_data);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
       if (status == -1) break;
       if (status != AOM_CODEC_OK) {
-        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
       }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) {
+        av1_init_sc_decisions(ppi);
+      }
+#endif
 
-      cpi->seq_params_locked = 1;
-      if (frame_size) {
-        if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
-
-        const int write_temporal_delimiter =
-            !cpi->common.spatial_layer_id && !ctx->pending_frame_count;
-
-        if (write_temporal_delimiter) {
-          uint32_t obu_header_size = 1;
-          const uint32_t obu_payload_size = 0;
-          const size_t length_field_size =
-              aom_uleb_size_in_bytes(obu_payload_size);
-
-          if (ctx->pending_cx_data) {
-            const size_t move_offset = length_field_size + 1;
-            memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
-                    frame_size);
-          }
-          const uint32_t obu_header_offset = 0;
-          obu_header_size = av1_write_obu_header(
-              &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0,
-              (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
-
-          // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
-          if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
-                                      ctx->pending_cx_data) != AOM_CODEC_OK) {
-            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
-          }
+      ppi->seq_params_locked = 1;
+      av1_post_encode_updates(cpi, &cpi_data);
 
-          frame_size += obu_header_size + obu_payload_size + length_field_size;
+#if CONFIG_ENTROPY_STATS
+      if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame)
+        av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts);
+#endif
+#if CONFIG_INTERNAL_STATS
+      if (ppi->cpi->oxcf.pass != 1) {
+        ppi->total_time_compress_data += cpi->time_compress_data;
+        ppi->total_recode_hits += cpi->frame_recode_hits;
+        ppi->total_bytes += cpi->bytes;
+        for (int i = 0; i < MAX_MODES; i++) {
+          ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
+        }
+      }
+#endif  // CONFIG_INTERNAL_STATS
+
+      if (!cpi_data.frame_size) continue;
+      assert(cpi_data.cx_data != NULL && cpi_data.cx_data_sz != 0);
+      const int write_temporal_delimiter =
+          !cpi->common.spatial_layer_id && !ctx->pending_cx_data_sz;
+
+      if (write_temporal_delimiter) {
+        uint32_t obu_header_size = 1;
+        const uint32_t obu_payload_size = 0;
+        const size_t length_field_size =
+            aom_uleb_size_in_bytes(obu_payload_size);
+
+        const size_t move_offset = obu_header_size + length_field_size;
+        memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size);
+        obu_header_size =
+            av1_write_obu_header(&ppi->level_params, &cpi->frame_header_count,
+                                 OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data);
+
+        // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+        if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                                    ctx->cx_data) != AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
 
-        if (ctx->oxcf.save_as_annexb) {
-          size_t curr_frame_size = frame_size;
-          if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
-              AOM_CODEC_OK) {
-            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
-          }
-          frame_size = curr_frame_size;
+        cpi_data.frame_size +=
+            obu_header_size + obu_payload_size + length_field_size;
+      }
 
-          // B_PRIME (add frame size)
-          const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
-          if (ctx->pending_cx_data) {
-            const size_t move_offset = length_field_size;
-            memmove(cx_data + move_offset, cx_data, frame_size);
-          }
-          if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
-              AOM_CODEC_OK) {
-            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
-          }
-          frame_size += length_field_size;
+      if (ctx->oxcf.save_as_annexb) {
+        size_t curr_frame_size = cpi_data.frame_size;
+        if (av1_convert_sect5obus_to_annexb(cpi_data.cx_data,
+                                            &curr_frame_size) != AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
+        cpi_data.frame_size = curr_frame_size;
+
+        // B_PRIME (add frame size)
+        const size_t length_field_size =
+            aom_uleb_size_in_bytes(cpi_data.frame_size);
+        memmove(cpi_data.cx_data + length_field_size, cpi_data.cx_data,
+                cpi_data.frame_size);
+        if (av1_write_uleb_obu_size(0, (uint32_t)cpi_data.frame_size,
+                                    cpi_data.cx_data) != AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+        }
+        cpi_data.frame_size += length_field_size;
+      }
 
-        ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;
-        ctx->pending_cx_data_sz += frame_size;
-
-        cx_data += frame_size;
-        cx_data_sz -= frame_size;
+      ctx->pending_cx_data_sz += cpi_data.frame_size;
 
-        index_size = MAG_SIZE * (ctx->pending_frame_count - 1) + 2;
+      cpi_data.cx_data += cpi_data.frame_size;
+      cpi_data.cx_data_sz -= cpi_data.frame_size;
 
-        is_frame_visible = cpi->common.show_frame;
+      is_frame_visible = cpi->common.show_frame;
 
-        has_fwd_keyframe |= (!is_frame_visible &&
-                             cpi->common.current_frame.frame_type == KEY_FRAME);
-      }
+      has_no_show_keyframe |=
+          (!is_frame_visible &&
+           cpi->common.current_frame.frame_type == KEY_FRAME);
     }
     if (is_frame_visible) {
       // Add the frame packet to the list of returned packets.
       aom_codec_cx_pkt_t pkt;
 
+      // decrement frames_left counter
+      ppi->frames_left = AOMMAX(0, ppi->frames_left - 1);
       if (ctx->oxcf.save_as_annexb) {
         //  B_PRIME (add TU size)
         size_t tu_size = ctx->pending_cx_data_sz;
         const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
-        if (ctx->pending_cx_data) {
-          const size_t move_offset = length_field_size;
-          memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
-                  tu_size);
-        }
-        if (av1_write_uleb_obu_size(0, (uint32_t)tu_size,
-                                    ctx->pending_cx_data) != AOM_CODEC_OK) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size);
+        if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) !=
+            AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
         ctx->pending_cx_data_sz += length_field_size;
       }
 
       pkt.kind = AOM_CODEC_CX_FRAME_PKT;
 
-      pkt.data.frame.buf = ctx->pending_cx_data;
+      pkt.data.frame.buf = ctx->cx_data;
       pkt.data.frame.sz = ctx->pending_cx_data_sz;
       pkt.data.frame.partition_id = -1;
-      pkt.data.frame.vis_frame_size = frame_size;
+      pkt.data.frame.vis_frame_size = cpi_data.frame_size;
 
-      pkt.data.frame.pts =
-          ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
-          ctx->pts_offset;
-      pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
-      if (has_fwd_keyframe) {
+      pkt.data.frame.pts = ticks_to_timebase_units(cpi_data.timestamp_ratio,
+                                                   cpi_data.ts_frame_start) +
+                           ctx->pts_offset;
+      pkt.data.frame.flags = get_frame_pkt_flags(cpi, cpi_data.lib_flags);
+      if (has_no_show_keyframe) {
         // If one of the invisible frames in the packet is a keyframe, set
         // the delayed random access point flag.
         pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
       }
       pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
-          timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+          cpi_data.timestamp_ratio,
+          cpi_data.ts_frame_end - cpi_data.ts_frame_start);
 
       aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
 
-      ctx->pending_cx_data = NULL;
       ctx->pending_cx_data_sz = 0;
-      ctx->pending_frame_count = 0;
     }
   }
 
-  cpi->common.error.setjmp = 0;
+  ppi->error.setjmp = 0;
   return res;
 }
 
@@ -2313,7 +3183,7 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-    av1_set_reference_enc(ctx->cpi, frame->idx, &sd);
+    av1_set_reference_enc(ctx->ppi->cpi, frame->idx, &sd);
     return AOM_CODEC_OK;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -2328,7 +3198,7 @@ static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-    av1_copy_reference_enc(ctx->cpi, frame->idx, &sd);
+    av1_copy_reference_enc(ctx->ppi->cpi, frame->idx, &sd);
     return AOM_CODEC_OK;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -2340,7 +3210,7 @@ static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
   av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
 
   if (frame != NULL) {
-    YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+    YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->ppi->cpi->common, frame->idx);
     if (fb == NULL) return AOM_CODEC_ERROR;
 
     yuvconfig2image(&frame->img, fb, NULL);
@@ -2357,7 +3227,7 @@ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
   if (new_img != NULL) {
     YV12_BUFFER_CONFIG new_frame;
 
-    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+    if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) {
       yuvconfig2image(new_img, &new_frame, NULL);
       return AOM_CODEC_OK;
     } else {
@@ -2375,10 +3245,10 @@ static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
   if (new_img != NULL) {
     YV12_BUFFER_CONFIG new_frame;
 
-    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+    if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) {
       YV12_BUFFER_CONFIG sd;
       image2yuvconfig(new_img, &sd);
-      return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+      return av1_copy_new_frame_enc(&ctx->ppi->cpi->common, &new_frame, &sd);
     } else {
       return AOM_CODEC_ERROR;
     }
@@ -2390,7 +3260,7 @@ static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
 static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
   YV12_BUFFER_CONFIG sd;
 
-  if (av1_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
+  if (av1_get_preview_raw_frame(ctx->ppi->cpi, &sd) == 0) {
     yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
   } else {
@@ -2402,7 +3272,8 @@ static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   const int reference_flag = va_arg(args, int);
 
-  av1_use_as_reference(&ctx->cpi->ext_flags.ref_frame_flags, reference_flag);
+  av1_use_as_reference(&ctx->ppi->cpi->ext_flags.ref_frame_flags,
+                       reference_flag);
   return AOM_CODEC_OK;
 }
 
@@ -2420,7 +3291,7 @@ static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
   aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
 
   if (map) {
-    if (!av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+    if (!av1_set_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows,
                             (int)map->cols))
       return AOM_CODEC_OK;
     else
@@ -2435,7 +3306,7 @@ static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
   aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
 
   if (map) {
-    if (!av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+    if (!av1_get_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows,
                             (int)map->cols))
       return AOM_CODEC_OK;
     else
@@ -2451,8 +3322,11 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
 
   if (mode) {
     const int res = av1_set_internal_size(
-        &ctx->cpi->oxcf, &ctx->cpi->resize_pending_params,
+        &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
         (AOM_SCALING)mode->h_scaling_mode, (AOM_SCALING)mode->v_scaling_mode);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
+#endif
     return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -2464,7 +3338,7 @@ static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
   const int spatial_layer_id = va_arg(args, int);
   if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS)
     return AOM_CODEC_INVALID_PARAM;
-  ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+  ctx->ppi->cpi->common.spatial_layer_id = spatial_layer_id;
   return AOM_CODEC_OK;
 }
 
@@ -2473,36 +3347,37 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
   const int number_spatial_layers = va_arg(args, int);
   if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
     return AOM_CODEC_INVALID_PARAM;
-  ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+  ctx->ppi->number_spatial_layers = number_spatial_layers;
   return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx,
                                          va_list args) {
   aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *);
-  ctx->cpi->common.spatial_layer_id = data->spatial_layer_id;
-  ctx->cpi->common.temporal_layer_id = data->temporal_layer_id;
-  ctx->cpi->svc.spatial_layer_id = data->spatial_layer_id;
-  ctx->cpi->svc.temporal_layer_id = data->temporal_layer_id;
+  ctx->ppi->cpi->common.spatial_layer_id = data->spatial_layer_id;
+  ctx->ppi->cpi->common.temporal_layer_id = data->temporal_layer_id;
+  ctx->ppi->cpi->svc.spatial_layer_id = data->spatial_layer_id;
+  ctx->ppi->cpi->svc.temporal_layer_id = data->temporal_layer_id;
   return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
-  AV1_COMP *const cpi = ctx->cpi;
+  AV1_PRIMARY *const ppi = ctx->ppi;
+  AV1_COMP *const cpi = ppi->cpi;
+  AV1_COMMON *const cm = &cpi->common;
   aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
-  cpi->common.number_spatial_layers = params->number_spatial_layers;
-  cpi->common.number_temporal_layers = params->number_temporal_layers;
+  int64_t target_bandwidth = 0;
+  ppi->number_spatial_layers = params->number_spatial_layers;
+  ppi->number_temporal_layers = params->number_temporal_layers;
   cpi->svc.number_spatial_layers = params->number_spatial_layers;
   cpi->svc.number_temporal_layers = params->number_temporal_layers;
-  if (cpi->common.number_spatial_layers > 1 ||
-      cpi->common.number_temporal_layers > 1) {
+  if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) {
     unsigned int sl, tl;
-    cpi->use_svc = 1;
-    for (sl = 0; sl < cpi->common.number_spatial_layers; ++sl) {
-      for (tl = 0; tl < cpi->common.number_temporal_layers; ++tl) {
-        const int layer =
-            LAYER_IDS_TO_IDX(sl, tl, cpi->common.number_temporal_layers);
+    ctx->ppi->use_svc = 1;
+    for (sl = 0; sl < ppi->number_spatial_layers; ++sl) {
+      for (tl = 0; tl < ppi->number_temporal_layers; ++tl) {
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers);
         LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
         lc->max_q = params->max_quantizers[layer];
         lc->min_q = params->min_quantizers[layer];
@@ -2510,28 +3385,52 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
         lc->scaling_factor_den = params->scaling_factor_den[sl];
         lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
         lc->framerate_factor = params->framerate_factor[tl];
+        if (tl == ppi->number_temporal_layers - 1)
+          target_bandwidth += lc->layer_target_bitrate;
       }
     }
-    if (cpi->common.current_frame.frame_number == 0)
+    if (cm->current_frame.frame_number == 0) {
+      if (!cpi->ppi->seq_params_locked) {
+        SequenceHeader *const seq_params = &ppi->seq_params;
+        seq_params->operating_points_cnt_minus_1 =
+            ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
+        av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
+      }
       av1_init_layer_context(cpi);
-    else
-      av1_update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+    }
+    av1_update_layer_context_change_config(cpi, target_bandwidth);
   }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
-  AV1_COMP *const cpi = ctx->cpi;
+  AV1_COMP *const cpi = ctx->ppi->cpi;
   aom_svc_ref_frame_config_t *const data =
       va_arg(args, aom_svc_ref_frame_config_t *);
-  cpi->svc.external_ref_frame_config = 1;
+  cpi->svc.set_ref_frame_config = 1;
   for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     cpi->svc.reference[i] = data->reference[i];
     cpi->svc.ref_idx[i] = data->ref_idx[i];
   }
   for (unsigned int i = 0; i < REF_FRAMES; ++i)
     cpi->svc.refresh[i] = data->refresh[i];
+  cpi->svc.use_flexible_mode = 1;
+  cpi->svc.ksvc_fixed_mode = 0;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_svc_ref_frame_comp_pred_t *const data =
+      va_arg(args, aom_svc_ref_frame_comp_pred_t *);
+  cpi->svc.ref_frame_comp[0] = data->use_comp_pred[0];
+  cpi->svc.ref_frame_comp[1] = data->use_comp_pred[1];
+  cpi->svc.ref_frame_comp[2] = data->use_comp_pred[2];
   return AOM_CODEC_OK;
 }
 
@@ -2616,15 +3515,473 @@ static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx,
+                                          const char *name, const char *value) {
+  if (ctx == NULL || name == NULL || value == NULL)
+    return AOM_CODEC_INVALID_PARAM;
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  // Used to mock the argv with just one string "--{name}={value}"
+  char *argv[2] = { NULL, "" };
+  size_t len = strlen(name) + strlen(value) + 4;
+  char *const err_string = ctx->ppi->error.detail;
+
+#if __STDC_VERSION__ >= 201112L
+  // We use the keyword _Static_assert because clang-cl does not allow the
+  // convenience macro static_assert to be used in function scope. See
+  // https://bugs.llvm.org/show_bug.cgi?id=48904.
+  _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN,
+                 "The size of the err_msg buffer for arg_match_helper must be "
+                 "at least ARG_ERR_MSG_MAX_LEN");
+#else
+  assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN);
+#endif
+
+  argv[0] = aom_malloc(len * sizeof(argv[1][0]));
+  if (!argv[0]) return AOM_CODEC_MEM_ERROR;
+  snprintf(argv[0], len, "--%s=%s", name, value);
+  struct arg arg;
+  aom_codec_err_t err = AOM_CODEC_OK;
+
+  int match = 1;
+  if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_keyframe_filtering,
+                       argv, err_string)) {
+    extra_cfg.enable_keyframe_filtering =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_gf_interval, argv,
+                              err_string)) {
+    extra_cfg.min_gf_interval = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_gf_interval, argv,
+                              err_string)) {
+    extra_cfg.max_gf_interval = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_min_pyr_height,
+                              argv, err_string)) {
+    extra_cfg.gf_min_pyr_height = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_max_pyr_height,
+                              argv, err_string)) {
+    extra_cfg.gf_max_pyr_height = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cpu_used_av1, argv,
+                              err_string)) {
+    extra_cfg.cpu_used = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_altref, argv,
+                              err_string)) {
+    extra_cfg.enable_auto_alt_ref = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.noise_sens, argv,
+                              err_string)) {
+    extra_cfg.noise_sensitivity = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sharpness, argv,
+                              err_string)) {
+    extra_cfg.sharpness = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.static_thresh, argv,
+                              err_string)) {
+    extra_cfg.static_thresh = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.rowmtarg, argv,
+                              err_string)) {
+    extra_cfg.row_mt = arg_parse_uint_helper(&arg, err_string);
+  }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fpmtarg, argv,
+                            err_string)) {
+    extra_cfg.fp_mt = arg_parse_uint_helper(&arg, err_string);
+  }
+#endif
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_cols, argv,
+                            err_string)) {
+    extra_cfg.tile_columns = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_rows, argv,
+                              err_string)) {
+    extra_cfg.tile_rows = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tpl_model,
+                              argv, err_string)) {
+    extra_cfg.enable_tpl_model = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_maxframes, argv,
+                              err_string)) {
+    extra_cfg.arnr_max_frames = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_strength, argv,
+                              err_string)) {
+    extra_cfg.arnr_strength = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_metric, argv,
+                              err_string)) {
+    extra_cfg.tuning = arg_parse_enum_helper(&arg, err_string);
+  }
+#if CONFIG_TUNE_VMAF
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argv,
+                            err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.vmaf_model_path,
+                                  &extra_cfg.vmaf_model_path, err_string);
+  }
+#endif
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                            argv, err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.partition_info_path,
+                                  &extra_cfg.partition_info_path, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dist_metric, argv,
+                              err_string)) {
+    extra_cfg.dist_metric = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv,
+                              err_string)) {
+    extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct,
+                              argv, err_string)) {
+    extra_cfg.rc_max_intra_bitrate_pct =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_inter_rate_pct,
+                              argv, err_string)) {
+    extra_cfg.rc_max_inter_bitrate_pct =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_cbr_boost_pct,
+                              argv, err_string)) {
+    extra_cfg.gf_cbr_boost_pct = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.lossless, argv,
+                              err_string)) {
+    extra_cfg.lossless = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cdef, argv,
+                              err_string)) {
+    extra_cfg.enable_cdef = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_restoration,
+                              argv, err_string)) {
+    extra_cfg.enable_restoration = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.force_video_mode,
+                              argv, err_string)) {
+    extra_cfg.force_video_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_obmc, argv,
+                              err_string)) {
+    extra_cfg.enable_obmc = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.disable_trellis_quant,
+                              argv, err_string)) {
+    extra_cfg.disable_trellis_quant = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_qm, argv,
+                              err_string)) {
+    extra_cfg.enable_qm = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_max, argv,
+                              err_string)) {
+    extra_cfg.qm_max = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_min, argv,
+                              err_string)) {
+    extra_cfg.qm_min = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.num_tg, argv,
+                              err_string)) {
+    extra_cfg.num_tg = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mtu_size, argv,
+                              err_string)) {
+    extra_cfg.mtu_size = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.timing_info, argv,
+                              err_string)) {
+    extra_cfg.timing_info_type = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.frame_parallel_decoding,
+                              argv, err_string)) {
+    extra_cfg.frame_parallel_decoding_mode =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dual_filter,
+                              argv, err_string)) {
+    extra_cfg.enable_dual_filter = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_chroma_deltaq,
+                              argv, err_string)) {
+    extra_cfg.enable_chroma_deltaq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.aq_mode, argv,
+                              err_string)) {
+    extra_cfg.aq_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_mode, argv,
+                              err_string)) {
+    extra_cfg.deltaq_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_strength, argv,
+                              err_string)) {
+    extra_cfg.deltaq_strength = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltalf_mode, argv,
+                              err_string)) {
+    extra_cfg.deltalf_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.frame_periodic_boost,
+                              argv, err_string)) {
+    extra_cfg.frame_periodic_boost = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_content, argv,
+                              err_string)) {
+    extra_cfg.content = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_color_primaries,
+                              argv, err_string)) {
+    extra_cfg.color_primaries = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(
+                 &arg, &g_av1_codec_arg_defs.input_transfer_characteristics,
+                 argv, err_string)) {
+    extra_cfg.transfer_characteristics =
+        arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_matrix_coefficients,
+                              argv, err_string)) {
+    extra_cfg.matrix_coefficients = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(
+                 &arg, &g_av1_codec_arg_defs.input_chroma_sample_position, argv,
+                 err_string)) {
+    extra_cfg.chroma_sample_position = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.superblock_size, argv,
+                              err_string)) {
+    extra_cfg.superblock_size = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.error_resilient_mode,
+                              argv, err_string)) {
+    extra_cfg.error_resilient_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sframe_mode, argv,
+                              err_string)) {
+    extra_cfg.s_frame_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_test, argv,
+                              err_string)) {
+    extra_cfg.film_grain_test_vector = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_table,
+                              argv, err_string)) {
+    if (value == NULL) {
+      // this parameter allows NULL as its value
+      extra_cfg.film_grain_table_filename = value;
+    } else {
+      err = allocate_and_set_string(
+          value, default_extra_cfg.film_grain_table_filename,
+          &extra_cfg.film_grain_table_filename, err_string);
+    }
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cdf_update_mode, argv,
+                              err_string)) {
+    extra_cfg.cdf_update_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_rect_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_rect_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ab_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_ab_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_1to4_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_1to4_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_partition_size,
+                              argv, err_string)) {
+    extra_cfg.min_partition_size = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_partition_size,
+                              argv, err_string)) {
+    extra_cfg.max_partition_size = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_intra_edge_filter,
+                              argv, err_string)) {
+    extra_cfg.enable_intra_edge_filter =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_order_hint,
+                              argv, err_string)) {
+    extra_cfg.enable_order_hint = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tx64, argv,
+                              err_string)) {
+    extra_cfg.enable_tx64 = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_flip_idtx,
+                              argv, err_string)) {
+    extra_cfg.enable_flip_idtx = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rect_tx, argv,
+                              err_string)) {
+    extra_cfg.enable_rect_tx = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dist_wtd_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_dist_wtd_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_reference_frames,
+                              argv, err_string)) {
+    extra_cfg.max_reference_frames = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_reference_set,
+                              argv, err_string)) {
+    extra_cfg.enable_reduced_reference_set =
+        arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ref_frame_mvs,
+                              argv, err_string)) {
+    extra_cfg.enable_ref_frame_mvs = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_masked_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_masked_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_onesided_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_onesided_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interintra_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_interintra_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_smooth_interintra,
+                              argv, err_string)) {
+    extra_cfg.enable_smooth_interintra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diff_wtd_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_diff_wtd_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interinter_wedge,
+                              argv, err_string)) {
+    extra_cfg.enable_interinter_wedge = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interintra_wedge,
+                              argv, err_string)) {
+    extra_cfg.enable_interintra_wedge = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_global_motion,
+                              argv, err_string)) {
+    extra_cfg.enable_global_motion = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_warped_motion,
+                              argv, err_string)) {
+    extra_cfg.enable_warped_motion = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_filter_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_filter_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_smooth_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_smooth_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_paeth_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_paeth_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cfl_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_cfl_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_directional_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_directional_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diagonal_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_diagonal_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_overlay, argv,
+                              err_string)) {
+    extra_cfg.enable_overlay = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_palette, argv,
+                              err_string)) {
+    extra_cfg.enable_palette = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_intrabc, argv,
+                              err_string)) {
+    extra_cfg.enable_intrabc = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_angle_delta,
+                              argv, err_string)) {
+    extra_cfg.enable_angle_delta = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_tx_type_set,
+                              argv, err_string)) {
+    extra_cfg.reduced_tx_type_set = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_intra_dct_only,
+                              argv, err_string)) {
+    extra_cfg.use_intra_dct_only = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_inter_dct_only,
+                              argv, err_string)) {
+    extra_cfg.use_inter_dct_only = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.use_intra_default_tx_only,
+                              argv, err_string)) {
+    extra_cfg.use_intra_default_tx_only =
+        arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.quant_b_adapt, argv,
+                              err_string)) {
+    extra_cfg.quant_b_adapt = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.vbr_corpus_complexity_lap,
+                              argv, err_string)) {
+    extra_cfg.vbr_corpus_complexity_lap =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_tier_mask, argv,
+                              err_string)) {
+    extra_cfg.tier_mask = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_min_cr, argv,
+                              err_string)) {
+    extra_cfg.min_cr = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.coeff_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.coeff_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mode_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.mode_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  }
+#if CONFIG_DENOISE
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level,
+                            argv, err_string)) {
+    extra_cfg.noise_level =
+        (float)arg_parse_int_helper(&arg, err_string) / 10.0f;
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_block_size,
+                              argv, err_string)) {
+    extra_cfg.noise_block_size = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dnl_denoising,
+                              argv, err_string)) {
+    extra_cfg.enable_dnl_denoising = arg_parse_uint_helper(&arg, err_string);
+  }
+#endif
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.target_seq_level_idx,
+                            argv, err_string)) {
+    const int val = arg_parse_int_helper(&arg, err_string);
+    const int level = val % 100;
+    const int operating_point_idx = val / 100;
+    if (operating_point_idx >= 0 &&
+        operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+      extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+    }
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+                              argv, err_string)) {
+    extra_cfg.chroma_subsampling_x = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+                              argv, err_string)) {
+    extra_cfg.chroma_subsampling_y = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.passes, argv,
+                              err_string)) {
+    extra_cfg.passes = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fwd_kf_dist, argv,
+                              err_string)) {
+    extra_cfg.fwd_kf_dist = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.two_pass_output, argv,
+                              err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.two_pass_output,
+                                  &extra_cfg.two_pass_output, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.second_pass_log, argv,
+                              err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.second_pass_log,
+                                  &extra_cfg.second_pass_log, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.loopfilter_control,
+                              argv, err_string)) {
+    extra_cfg.loopfilter_control = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_intra_tools_off,
+                              argv, err_string)) {
+    extra_cfg.auto_intra_tools_off = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.strict_level_conformance,
+                              argv, err_string)) {
+    extra_cfg.strict_level_conformance = arg_parse_int_helper(&arg, err_string);
+  } else {
+    match = 0;
+    snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s",
+             name);
+  }
+  aom_free(argv[0]);
+
+  if (err != AOM_CODEC_OK) {
+    ctx->base.err_detail = err_string;
+    return err;
+  }
+
+  if (strlen(err_string) != 0) {
+    ctx->base.err_detail = err_string;
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  ctx->base.err_detail = NULL;
+
+  if (!match) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
                                               va_list args) {
   int *const arg = va_arg(args, int *);
-  const AV1_COMP *const cpi = ctx->cpi;
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params,
+  return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params,
                                arg);
 }
 
+static aom_codec_err_t ctrl_get_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  return av1_get_target_seq_level_idx(&ctx->ppi->seq_params,
+                                      &ctx->ppi->level_params, arg);
+}
+
 static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
   { AOME_USE_REFERENCE, ctrl_use_reference },
@@ -2641,6 +3998,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AOME_SET_SHARPNESS, ctrl_set_sharpness },
   { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
   { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+  { AV1E_SET_FP_MT, ctrl_set_fp_mt },
   { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
   { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model },
@@ -2682,6 +4040,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
   { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
   { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+  { AV1E_SET_ENABLE_RECT_TX, ctrl_set_enable_rect_tx },
   { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
   { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
   { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
@@ -2701,6 +4060,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
   { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
   { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
+  { AV1E_SET_ENABLE_DIRECTIONAL_INTRA, ctrl_set_enable_directional_intra },
+  { AV1E_SET_ENABLE_DIAGONAL_INTRA, ctrl_set_enable_diagonal_intra },
   { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
   { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
   { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
@@ -2716,6 +4077,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
   { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
   { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+  { AV1E_SET_DELTAQ_STRENGTH, ctrl_set_deltaq_strength },
   { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
   { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
   { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
@@ -2734,11 +4096,14 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
   { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
+  { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path },
   { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
   { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
   { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
   { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+  { AV1E_SET_ENABLE_DNL_DENOISING, ctrl_set_enable_dnl_denoising },
   { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { AV1E_SET_FP_MT_UNIT_TEST, ctrl_enable_fpmt_unit_test },
   { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug },
   { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
   { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
@@ -2746,11 +4111,20 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
   { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
   { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred },
+  { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap },
   { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+  { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq },
+  { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition },
+  { AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search },
+  { AV1E_SET_LOOPFILTER_CONTROL, ctrl_set_loopfilter_control },
+  { AV1E_SET_AUTO_INTRA_TOOLS_OFF, ctrl_set_auto_intra_tools_off },
+  { AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc },
 
   // Getters
   { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { AOME_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
   { AV1_GET_REFERENCE, ctrl_get_reference },
   { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
   { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
@@ -2758,18 +4132,22 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
   { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
   { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
-  { -1, NULL },
+  { AV1E_GET_BASELINE_GF_INTERVAL, ctrl_get_baseline_gf_interval },
+  { AV1E_GET_TARGET_SEQ_LEVEL_IDX, ctrl_get_target_seq_level_idx },
+
+  CTRL_MAP_END,
 };
 
 static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
+#if !CONFIG_REALTIME_ONLY
   {
       // NOLINT
       AOM_USAGE_GOOD_QUALITY,  // g_usage - non-realtime usage
       0,                       // g_threads
       0,                       // g_profile
 
-      320,         // g_width
-      240,         // g_height
+      320,         // g_w
+      240,         // g_h
       0,           // g_limit
       0,           // g_forced_max_frame_width
       0,           // g_forced_max_frame_height
@@ -2782,23 +4160,23 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
 
       AOM_RC_ONE_PASS,  // g_pass
 
-      19,  // g_lag_in_frames
+      35,  // g_lag_in_frames
 
       0,                // rc_dropframe_thresh
       RESIZE_NONE,      // rc_resize_mode
       SCALE_NUMERATOR,  // rc_resize_denominator
       SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-      SUPERRES_NONE,    // rc_superres_mode
-      SCALE_NUMERATOR,  // rc_superres_denominator
-      SCALE_NUMERATOR,  // rc_superres_kf_denominator
-      63,               // rc_superres_qthresh
-      32,               // rc_superres_kf_qthresh
+      AOM_SUPERRES_NONE,  // rc_superres_mode
+      SCALE_NUMERATOR,    // rc_superres_denominator
+      SCALE_NUMERATOR,    // rc_superres_kf_denominator
+      63,                 // rc_superres_qthresh
+      32,                 // rc_superres_kf_qthresh
 
       AOM_VBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
       { NULL, 0 },  // rc_firstpass_mb_stats_in
-      256,          // rc_target_bandwidth
+      256,          // rc_target_bitrate
       0,            // rc_min_quantizer
       63,           // rc_max_quantizer
       25,           // rc_undershoot_pct
@@ -2814,7 +4192,7 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
 
       // keyframing settings (kf)
       0,                       // fwd_kf_enabled
-      AOM_KF_AUTO,             // g_kfmode
+      AOM_KF_AUTO,             // kf_mode
       0,                       // kf_min_dist
       9999,                    // kf_max_dist
       0,                       // sframe_dist
@@ -2832,14 +4210,15 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
       { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
   },
+#endif  // !CONFIG_REALTIME_ONLY
   {
       // NOLINT
       AOM_USAGE_REALTIME,  // g_usage - real-time usage
       0,                   // g_threads
       0,                   // g_profile
 
-      320,         // g_width
-      240,         // g_height
+      320,         // g_w
+      240,         // g_h
       0,           // g_limit
       0,           // g_forced_max_frame_width
       0,           // g_forced_max_frame_height
@@ -2852,23 +4231,23 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
 
       AOM_RC_ONE_PASS,  // g_pass
 
-      1,  // g_lag_in_frames
+      0,  // g_lag_in_frames
 
       0,                // rc_dropframe_thresh
       RESIZE_NONE,      // rc_resize_mode
       SCALE_NUMERATOR,  // rc_resize_denominator
       SCALE_NUMERATOR,  // rc_resize_kf_denominator
 
-      0,                // rc_superres_mode
-      SCALE_NUMERATOR,  // rc_superres_denominator
-      SCALE_NUMERATOR,  // rc_superres_kf_denominator
-      63,               // rc_superres_qthresh
-      32,               // rc_superres_kf_qthresh
+      AOM_SUPERRES_NONE,  // rc_superres_mode
+      SCALE_NUMERATOR,    // rc_superres_denominator
+      SCALE_NUMERATOR,    // rc_superres_kf_denominator
+      63,                 // rc_superres_qthresh
+      32,                 // rc_superres_kf_qthresh
 
       AOM_CBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
       { NULL, 0 },  // rc_firstpass_mb_stats_in
-      256,          // rc_target_bandwidth
+      256,          // rc_target_bitrate
       0,            // rc_min_quantizer
       63,           // rc_max_quantizer
       25,           // rc_undershoot_pct
@@ -2884,7 +4263,7 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
 
       // keyframing settings (kf)
       0,                       // fwd_kf_enabled
-      AOM_KF_AUTO,             // g_kfmode
+      AOM_KF_AUTO,             // kf_mode
       0,                       // kf_min_dist
       9999,                    // kf_max_dist
       0,                       // sframe_dist
@@ -2902,12 +4281,85 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
       { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
   },
+#if !CONFIG_REALTIME_ONLY
+  {
+      // NOLINT
+      AOM_USAGE_ALL_INTRA,  // g_usage - all intra usage
+      0,                    // g_threads
+      0,                    // g_profile
+
+      320,         // g_w
+      240,         // g_h
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
+
+      { 1, 30 },  // g_timebase
+
+      0,  // g_error_resilient
+
+      AOM_RC_ONE_PASS,  // g_pass
+
+      0,  // g_lag_in_frames
+
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+      AOM_SUPERRES_NONE,  // rc_superres_mode
+      SCALE_NUMERATOR,    // rc_superres_denominator
+      SCALE_NUMERATOR,    // rc_superres_kf_denominator
+      63,                 // rc_superres_qthresh
+      32,                 // rc_superres_kf_qthresh
+
+      AOM_Q,        // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bitrate
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
+
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
+
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      0,                       // fwd_kf_enabled
+      AOM_KF_DISABLED,         // kf_mode
+      0,                       // kf_min_dist
+      0,                       // kf_max_dist
+      0,                       // sframe_dist
+      1,                       // sframe_mode
+      0,                       // large_scale_tile
+      0,                       // monochrome
+      0,                       // full_still_picture_hdr
+      0,                       // save_as_annexb
+      0,                       // tile_width_count
+      0,                       // tile_height_count
+      { 0 },                   // tile_widths
+      { 0 },                   // tile_heights
+      0,                       // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+#endif  // !CONFIG_REALTIME_ONLY
 };
 
+// This data structure and function are exported in aom/aomcx.h
 #ifndef VERSION_STRING
 #define VERSION_STRING
 #endif
-CODEC_INTERFACE(aom_codec_av1_cx) = {
+aom_codec_iface_t aom_codec_av1_cx_algo = {
   "AOMedia Project AV1 Encoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
   AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
@@ -2925,12 +4377,15 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
   },
   {
       // NOLINT
-      2,                           // 2 cfg
-      encoder_usage_cfg,           // aom_codec_enc_cfg_t
-      encoder_encode,              // aom_codec_encode_fn_t
-      encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
-      encoder_set_config,          // aom_codec_enc_config_set_fn_t
-      encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
-      encoder_get_preview          // aom_codec_get_preview_frame_fn_t
-  }
+      NELEMENTS(encoder_usage_cfg),  // cfg_count
+      encoder_usage_cfg,             // aom_codec_enc_cfg_t
+      encoder_encode,                // aom_codec_encode_fn_t
+      encoder_get_cxdata,            // aom_codec_get_cx_data_fn_t
+      encoder_set_config,            // aom_codec_enc_config_set_fn_t
+      encoder_get_global_headers,    // aom_codec_get_global_headers_fn_t
+      encoder_get_preview            // aom_codec_get_preview_frame_fn_t
+  },
+  encoder_set_option  // aom_codec_set_option_fn_t
 };
+
+aom_codec_iface_t *aom_codec_av1_cx(void) { return &aom_codec_av1_cx_algo; }
diff --git a/media/libaom/src/av1/av1_dx_iface.c b/media/libaom/src/av1/av1_dx_iface.c
index d821a52f6e..cb5c6e374f 100644
--- a/media/libaom/src/av1/av1_dx_iface.c
+++ b/media/libaom/src/av1/av1_dx_iface.c
@@ -31,6 +31,7 @@
 
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/decodeframe.h"
+#include "av1/decoder/grain_synthesis.h"
 #include "av1/decoder/obu.h"
 
 #include "av1/av1_iface_common.h"
@@ -115,12 +116,15 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
   if (ctx->frame_worker != NULL) {
     AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *const pbi = frame_worker_data->pbi;
     aom_get_worker_interface()->end(worker);
-    aom_free(frame_worker_data->pbi->common.tpl_mvs);
-    frame_worker_data->pbi->common.tpl_mvs = NULL;
+    aom_free(pbi->common.tpl_mvs);
+    pbi->common.tpl_mvs = NULL;
     av1_remove_common(&frame_worker_data->pbi->common);
-    av1_free_restoration_buffers(&frame_worker_data->pbi->common);
-    av1_decoder_remove(frame_worker_data->pbi);
+    av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
+    av1_free_cdef_sync(&pbi->cdef_sync);
+    av1_free_restoration_buffers(&pbi->common);
+    av1_decoder_remove(pbi);
     aom_free(frame_worker_data);
 #if CONFIG_MULTITHREAD
     pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
@@ -153,7 +157,7 @@ static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
   if (equal_picture_interval) {
     const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
     if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
-      // num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.
+      // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
       return AOM_CODEC_UNSUP_BITSTREAM;
     }
   }
@@ -390,7 +394,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
     pool->release_fb_cb = av1_release_frame_buffer;
 
     if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to initialize internal frame buffers");
 
     pool->cb_priv = &pool->int_frame_buffers;
@@ -467,7 +471,8 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
   frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
   frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
   frame_worker_data->pbi->row_mt = ctx->row_mt;
-
+  frame_worker_data->pbi->is_fwd_kf_present = 0;
+  frame_worker_data->pbi->is_arf_frame_present = 0;
   worker->hook = frame_worker_hook;
 
   init_buffer_callbacks(ctx);
@@ -524,22 +529,85 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
   *data = frame_worker_data->data_end;
 
   if (worker->had_error)
-    return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+    return update_error_state(ctx, &frame_worker_data->pbi->error);
 
   check_resync(ctx, frame_worker_data->pbi);
 
   return AOM_CODEC_OK;
 }
 
-#if CONFIG_INSPECTION
+static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) {
+  // Release any pending output frames from the previous decoder_decode or
+  // decoder_inspect call. We need to do this even if the decoder is being
+  // flushed or the input arguments are invalid.
+  if (ctx->frame_worker) {
+    BufferPool *const pool = ctx->buffer_pool;
+    lock_buffer_pool(pool);
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    struct AV1Decoder *pbi = frame_worker_data->pbi;
+    for (size_t j = 0; j < pbi->num_output_frames; j++) {
+      decrease_ref_count(pbi->output_frames[j], pool);
+    }
+    pbi->num_output_frames = 0;
+    unlock_buffer_pool(pool);
+    for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) {
+      pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]);
+      ctx->grain_image_frame_buffers[j].data = NULL;
+      ctx->grain_image_frame_buffers[j].size = 0;
+      ctx->grain_image_frame_buffers[j].priv = NULL;
+    }
+    ctx->num_grain_image_frame_buffers = 0;
+  }
+}
+
 // This function enables the inspector to inspect non visible frames.
 static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
                                        const uint8_t *data, size_t data_sz,
                                        void *user_priv) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
-  const uint8_t *const data_end = data + data_sz;
-  Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv;
+  release_pending_output_frames(ctx);
+
+  /* Sanity checks */
+  /* NULL data ptr allowed if data_sz is 0 too */
+  if (data == NULL && data_sz == 0) {
+    ctx->flushed = 1;
+    return AOM_CODEC_OK;
+  }
+  if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
+
+  // Reset flushed when receiving a valid frame.
+  ctx->flushed = 0;
+
+  const uint8_t *data_start = data;
+  const uint8_t *data_end = data + data_sz;
+
+  uint64_t frame_size;
+  if (ctx->is_annexb) {
+    // read the size of this temporal unit
+    size_t length_of_size;
+    uint64_t temporal_unit_size;
+    if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+                        &length_of_size) != 0) {
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+    data_start += length_of_size;
+    if (temporal_unit_size > (size_t)(data_end - data_start))
+      return AOM_CODEC_CORRUPT_FRAME;
+    data_end = data_start + temporal_unit_size;
+
+    // read the size of this frame unit
+    if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+                        &frame_size, &length_of_size) != 0) {
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+    data_start += length_of_size;
+    if (frame_size > (size_t)(data_end - data_start))
+      return AOM_CODEC_CORRUPT_FRAME;
+  } else {
+    frame_size = (uint64_t)(data_end - data_start);
+  }
 
   if (ctx->frame_worker == NULL) {
     res = init_decoder(ctx);
@@ -549,29 +617,34 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
       (FrameWorkerData *)ctx->frame_worker->data1;
   AV1Decoder *const pbi = frame_worker_data->pbi;
   AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_INSPECTION
   frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
   frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
-  res = av1_receive_compressed_data(frame_worker_data->pbi, data_sz, &data);
+#endif
+  res = av1_receive_compressed_data(frame_worker_data->pbi, (size_t)frame_size,
+                                    &data_start);
   check_resync(ctx, frame_worker_data->pbi);
 
   if (ctx->frame_worker->had_error)
-    return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+    return update_error_state(ctx, &frame_worker_data->pbi->error);
 
   // Allow extra zero bytes after the frame end
-  while (data < data_end) {
-    const uint8_t marker = data[0];
+  while (data_start < data_end) {
+    const uint8_t marker = data_start[0];
     if (marker) break;
-    ++data;
+    ++data_start;
   }
 
+  Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv;
   data2->idx = -1;
-  for (int i = 0; i < REF_FRAMES; ++i)
-    if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;
-  data2->buf = data;
+  if (cm->cur_frame) {
+    for (int i = 0; i < REF_FRAMES; ++i)
+      if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;
+  }
+  data2->buf = data_start;
   data2->show_existing = cm->show_existing_frame;
   return res;
 }
-#endif
 
 static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
                                       const uint8_t *data, size_t data_sz,
@@ -583,28 +656,8 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
     return decoder_inspect(ctx, data, data_sz, user_priv);
   }
 #endif
-  // Release any pending output frames from the previous decoder_decode call.
-  // We need to do this even if the decoder is being flushed or the input
-  // arguments are invalid.
-  if (ctx->frame_worker) {
-    BufferPool *const pool = ctx->buffer_pool;
-    lock_buffer_pool(pool);
-    AVxWorker *const worker = ctx->frame_worker;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    struct AV1Decoder *pbi = frame_worker_data->pbi;
-    for (size_t j = 0; j < pbi->num_output_frames; j++) {
-      decrease_ref_count(pbi->output_frames[j], pool);
-    }
-    pbi->num_output_frames = 0;
-    unlock_buffer_pool(pool);
-    for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) {
-      pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]);
-      ctx->grain_image_frame_buffers[j].data = NULL;
-      ctx->grain_image_frame_buffers[j].size = 0;
-      ctx->grain_image_frame_buffers[j].priv = NULL;
-    }
-    ctx->num_grain_image_frame_buffers = 0;
-  }
+
+  release_pending_output_frames(ctx);
 
   /* Sanity checks */
   /* NULL data ptr allowed if data_sz is 0 too */
@@ -814,13 +867,13 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
 
         ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
         img = &ctx->img;
-        img->temporal_id = cm->temporal_layer_id;
-        img->spatial_id = cm->spatial_layer_id;
+        img->temporal_id = output_frame_buf->temporal_id;
+        img->spatial_id = output_frame_buf->spatial_id;
         if (pbi->skip_film_grain) grain_params->apply_grain = 0;
         aom_image_t *res =
             add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
         if (!res) {
-          aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Grain systhesis failed\n");
         }
         *index += 1;  // Advance the iterator to point to the next image
@@ -964,11 +1017,193 @@ static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx,
                                                va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
   *arg = ((FrameWorkerData *)ctx->frame_worker->data1)
              ->pbi->common.quant_params.base_qindex;
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_get_fwd_kf_value(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  *arg = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_fwd_kf_present;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_altref_present(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  *arg =
+      ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_arf_frame_present;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_frame_flags(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  AV1Decoder *pbi = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi;
+  *arg = 0;
+  switch (pbi->common.current_frame.frame_type) {
+    case KEY_FRAME:
+      *arg |= AOM_FRAME_IS_KEY;
+      *arg |= AOM_FRAME_IS_INTRAONLY;
+      if (!pbi->common.show_frame) {
+        *arg |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+      }
+      break;
+    case INTRA_ONLY_FRAME: *arg |= AOM_FRAME_IS_INTRAONLY; break;
+    case S_FRAME: *arg |= AOM_FRAME_IS_SWITCH; break;
+  }
+  if (pbi->common.features.error_resilient_mode) {
+    *arg |= AOM_FRAME_IS_ERROR_RESILIENT;
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_tile_info(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_info *const tile_info = va_arg(args, aom_tile_info *);
+
+  if (tile_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      const CommonTileParams *tiles = &pbi->common.tiles;
+
+      int tile_rows = tiles->rows;
+      int tile_cols = tiles->cols;
+
+      if (tiles->uniform_spacing) {
+        tile_info->tile_rows = 1 << tiles->log2_rows;
+        tile_info->tile_columns = 1 << tiles->log2_cols;
+      } else {
+        tile_info->tile_rows = tile_rows;
+        tile_info->tile_columns = tile_cols;
+      }
+
+      for (int tile_col = 1; tile_col <= tile_cols; tile_col++) {
+        tile_info->tile_widths[tile_col - 1] =
+            tiles->col_start_sb[tile_col] - tiles->col_start_sb[tile_col - 1];
+      }
+
+      for (int tile_row = 1; tile_row <= tile_rows; tile_row++) {
+        tile_info->tile_heights[tile_row - 1] =
+            tiles->row_start_sb[tile_row] - tiles->row_start_sb[tile_row - 1];
+      }
+      tile_info->num_tile_groups = pbi->num_tile_groups;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_screen_content_tools_info(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_screen_content_tools_info *const sc_info =
+      va_arg(args, aom_screen_content_tools_info *);
+  if (sc_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      sc_info->allow_screen_content_tools =
+          pbi->common.features.allow_screen_content_tools;
+      sc_info->allow_intrabc = pbi->common.features.allow_intrabc;
+      sc_info->force_integer_mv =
+          (int)pbi->common.features.cur_frame_force_integer_mv;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_still_picture(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  aom_still_picture_info *const still_picture_info =
+      va_arg(args, aom_still_picture_info *);
+  if (still_picture_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture;
+      still_picture_info->is_reduced_still_picture_hdr =
+          (int)(pbi->seq_params.reduced_still_picture_hdr);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  aom_superblock_size_t *const sb_size = va_arg(args, aom_superblock_size_t *);
+  if (sb_size) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      if (pbi->seq_params.sb_size == BLOCK_128X128) {
+        *sb_size = AOM_SUPERBLOCK_SIZE_128X128;
+      } else {
+        *sb_size = AOM_SUPERBLOCK_SIZE_64X64;
+      }
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_show_existing_frame_flag(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  *arg = ((FrameWorkerData *)ctx->frame_worker->data1)
+             ->pbi->common.show_existing_frame;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_s_frame_info(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  aom_s_frame_info *const s_frame_info = va_arg(args, aom_s_frame_info *);
+  if (s_frame_info) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      s_frame_info->is_s_frame = pbi->sframe_info.is_s_frame;
+      s_frame_info->is_s_frame_at_altref =
+          pbi->sframe_info.is_s_frame_at_altref;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
 static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
@@ -1026,6 +1261,7 @@ static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx,
       frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size;
       frame_header_info->coded_tile_data = pbi->obu_size_hdr.data;
       frame_header_info->extra_size = pbi->frame_header_size;
+      return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
     }
@@ -1104,7 +1340,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-      *bit_depth = cm->seq_params.bit_depth;
+      *bit_depth = cm->seq_params->bit_depth;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1140,9 +1376,9 @@ static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
 
-      *img_fmt = get_img_format(cm->seq_params.subsampling_x,
-                                cm->seq_params.subsampling_y,
-                                cm->seq_params.use_highbitdepth);
+      *img_fmt = get_img_format(cm->seq_params->subsampling_x,
+                                cm->seq_params->subsampling_y,
+                                cm->seq_params->use_highbitdepth);
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1191,6 +1427,65 @@ static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_INVALID_PARAM;
 }
 
+static aom_codec_err_t ctrl_get_base_q_idx(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.quant_params.base_qindex;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_show_frame_flag(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.show_frame;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_order_hint(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  unsigned int *const arg = va_arg(args, unsigned int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.current_frame.order_hint;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_mi_info(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  int mi_row = va_arg(args, int);
+  int mi_col = va_arg(args, int);
+  MB_MODE_INFO *mi = va_arg(args, MB_MODE_INFO *);
+  if (mi == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  if (frame_worker_data == NULL) return AOM_CODEC_ERROR;
+
+  AV1_COMMON *cm = &frame_worker_data->pbi->common;
+  const int mi_rows = cm->mi_params.mi_rows;
+  const int mi_cols = cm->mi_params.mi_cols;
+  const int mi_stride = cm->mi_params.mi_stride;
+  const int offset = mi_row * mi_stride + mi_col;
+
+  if (mi_row < 0 || mi_row >= mi_rows || mi_col < 0 || mi_col >= mi_cols) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  memcpy(mi, cm->mi_params.mi_grid_base[offset], sizeof(*mi));
+
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
   ctx->invert_tile_order = va_arg(args, int);
@@ -1252,17 +1547,25 @@ static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
   (void)args;
   return AOM_CODEC_INCAPABLE;
 #else
-  if (ctx->frame_worker) {
-    AVxWorker *const worker = ctx->frame_worker;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    AV1Decoder *pbi = frame_worker_data->pbi;
-    Accounting **acct = va_arg(args, Accounting **);
-    *acct = &pbi->accounting;
-    return AOM_CODEC_OK;
+  Accounting **acct = va_arg(args, Accounting **);
+
+  if (acct) {
+    if (ctx->frame_worker) {
+      AVxWorker *const worker = ctx->frame_worker;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      AV1Decoder *pbi = frame_worker_data->pbi;
+      *acct = &pbi->accounting;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
   }
-  return AOM_CODEC_ERROR;
+
+  return AOM_CODEC_INVALID_PARAM;
 #endif
 }
+
 static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   ctx->decode_tile_row = va_arg(args, int);
@@ -1361,14 +1664,27 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1_GET_REFERENCE, ctrl_get_reference },
   { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
   { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
-
-  { -1, NULL },
+  { AOMD_GET_FWD_KF_PRESENT, ctrl_get_fwd_kf_value },
+  { AOMD_GET_ALTREF_PRESENT, ctrl_get_altref_present },
+  { AOMD_GET_FRAME_FLAGS, ctrl_get_frame_flags },
+  { AOMD_GET_TILE_INFO, ctrl_get_tile_info },
+  { AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, ctrl_get_screen_content_tools_info },
+  { AOMD_GET_STILL_PICTURE, ctrl_get_still_picture },
+  { AOMD_GET_SB_SIZE, ctrl_get_sb_size },
+  { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag },
+  { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info },
+  { AOMD_GET_SHOW_FRAME_FLAG, ctrl_get_show_frame_flag },
+  { AOMD_GET_BASE_Q_IDX, ctrl_get_base_q_idx },
+  { AOMD_GET_ORDER_HINT, ctrl_get_order_hint },
+  { AV1D_GET_MI_INFO, ctrl_get_mi_info },
+  CTRL_MAP_END,
 };
 
+// This data structure and function are exported in aom/aomdx.h
 #ifndef VERSION_STRING
 #define VERSION_STRING
 #endif
-CODEC_INTERFACE(aom_codec_av1_dx) = {
+aom_codec_iface_t aom_codec_av1_dx_algo = {
   "AOMedia Project AV1 Decoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
   AOM_CODEC_CAP_DECODER |
@@ -1393,5 +1709,40 @@ CODEC_INTERFACE(aom_codec_av1_dx) = {
       NULL,  // aom_codec_enc_config_set_fn_t
       NULL,  // aom_codec_get_global_headers_fn_t
       NULL   // aom_codec_get_preview_frame_fn_t
-  }
+  },
+  NULL  // aom_codec_set_option_fn_t
 };
+
+// Decoder interface for inspecting frame data. It uses decoder_inspect instead
+// of decoder_decode so it only decodes one frame at a time, whether the frame
+// is shown or not.
+aom_codec_iface_t aom_codec_av1_inspect_algo = {
+  "AOMedia Project AV1 Decoder Inspector" VERSION_STRING,
+  AOM_CODEC_INTERNAL_ABI_VERSION,
+  AOM_CODEC_CAP_DECODER |
+      AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER,  // aom_codec_caps_t
+  decoder_init,                             // aom_codec_init_fn_t
+  decoder_destroy,                          // aom_codec_destroy_fn_t
+  decoder_ctrl_maps,                        // aom_codec_ctrl_fn_map_t
+  {
+      // NOLINT
+      decoder_peek_si,    // aom_codec_peek_si_fn_t
+      decoder_get_si,     // aom_codec_get_si_fn_t
+      decoder_inspect,    // aom_codec_decode_fn_t
+      decoder_get_frame,  // aom_codec_get_frame_fn_t
+      decoder_set_fb_fn,  // aom_codec_set_fb_fn_t
+  },
+  {
+      // NOLINT
+      0,
+      NULL,  // aom_codec_enc_cfg_t
+      NULL,  // aom_codec_encode_fn_t
+      NULL,  // aom_codec_get_cx_data_fn_t
+      NULL,  // aom_codec_enc_config_set_fn_t
+      NULL,  // aom_codec_get_global_headers_fn_t
+      NULL   // aom_codec_get_preview_frame_fn_t
+  },
+  NULL  // aom_codec_set_option_fn_t
+};
+
+aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; }
diff --git a/media/libaom/src/av1/av1_iface_common.h b/media/libaom/src/av1/av1_iface_common.h
index 9b5ffcba45..57dd1b8eda 100644
--- a/media/libaom/src/av1/av1_iface_common.h
+++ b/media/libaom/src/av1/av1_iface_common.h
@@ -16,8 +16,11 @@
 #include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
 
-static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
-                            void *user_priv) {
+extern aom_codec_iface_t aom_codec_av1_inspect_algo;
+
+static AOM_INLINE void yuvconfig2image(aom_image_t *img,
+                                       const YV12_BUFFER_CONFIG *yv12,
+                                       void *user_priv) {
   /* aom_img_wrap() doesn't allow specifying independent strides for
    * the Y, U, and V planes, nor other alignment adjustments that
    * might be representable by a YV12_BUFFER_CONFIG, so we just
@@ -80,8 +83,8 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
   img->metadata = NULL;
 }
 
-static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
-                                       YV12_BUFFER_CONFIG *yv12) {
+static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+                                                  YV12_BUFFER_CONFIG *yv12) {
   yv12->y_buffer = img->planes[AOM_PLANE_Y];
   yv12->u_buffer = img->planes[AOM_PLANE_U];
   yv12->v_buffer = img->planes[AOM_PLANE_V];
@@ -93,12 +96,13 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
   yv12->y_width = img->w;
   yv12->y_height = img->h;
 
-  yv12->uv_width =
-      img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+  yv12->uv_width = (yv12->y_width + img->x_chroma_shift) >> img->x_chroma_shift;
   yv12->uv_height =
-      img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
-  yv12->uv_crop_width = yv12->uv_width;
-  yv12->uv_crop_height = yv12->uv_height;
+      (yv12->y_height + img->y_chroma_shift) >> img->y_chroma_shift;
+  yv12->uv_crop_width =
+      (yv12->y_crop_width + img->x_chroma_shift) >> img->x_chroma_shift;
+  yv12->uv_crop_height =
+      (yv12->y_crop_height + img->y_chroma_shift) >> img->y_chroma_shift;
 
   yv12->y_stride = img->stride[AOM_PLANE_Y];
   yv12->uv_stride = img->stride[AOM_PLANE_U];
diff --git a/media/libaom/src/av1/common/alloccommon.c b/media/libaom/src/av1/common/alloccommon.c
index badee3df92..5cf6c0fa7f 100644
--- a/media/libaom/src/av1/common/alloccommon.c
+++ b/media/libaom/src/av1/common/alloccommon.c
@@ -17,8 +17,10 @@
 #include "av1/common/alloccommon.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
+#include "av1/common/cdef_block.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
+#include "av1/common/thread_common.h"
 
 int av1_get_MBs(int width, int height) {
   const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
@@ -51,6 +53,238 @@ void av1_free_ref_frame_buffers(BufferPool *pool) {
   }
 }
 
+static INLINE void free_cdef_linebuf_conditional(
+    AV1_COMMON *const cm, const size_t *new_linebuf_size) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) {
+      aom_free(cdef_info->linebuf[plane]);
+      cdef_info->linebuf[plane] = NULL;
+    }
+  }
+}
+
+static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm,
+                                              uint16_t **colbuf,
+                                              uint16_t **srcbuf,
+                                              const size_t *new_colbuf_size,
+                                              const size_t new_srcbuf_size) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) {
+    aom_free(*srcbuf);
+    *srcbuf = NULL;
+  }
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) {
+      aom_free(colbuf[plane]);
+      colbuf[plane] = NULL;
+    }
+  }
+}
+
+static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) {
+  aom_free(*srcbuf);
+  *srcbuf = NULL;
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    aom_free(colbuf[plane]);
+    colbuf[plane] = NULL;
+  }
+}
+
+static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt,
+                                      const int num_mi_rows) {
+  if (*cdef_row_mt == NULL) return;
+#if CONFIG_MULTITHREAD
+  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
+    pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_);
+    pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_);
+    aom_free((*cdef_row_mt)[row_idx].row_mutex_);
+    aom_free((*cdef_row_mt)[row_idx].row_cond_);
+  }
+#else
+  (void)num_mi_rows;
+#endif  // CONFIG_MULTITHREAD
+  aom_free(*cdef_row_mt);
+  *cdef_row_mt = NULL;
+}
+
+void av1_free_cdef_buffers(AV1_COMMON *const cm,
+                           AV1CdefWorkerData **cdef_worker,
+                           AV1CdefSync *cdef_sync) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  const int num_mi_rows = cdef_info->allocated_mi_rows;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    aom_free(cdef_info->linebuf[plane]);
+    cdef_info->linebuf[plane] = NULL;
+  }
+  // De-allocation of column buffer & source buffer (worker_0).
+  free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf);
+
+  free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows);
+
+  if (cdef_info->allocated_num_workers < 2) return;
+  if (*cdef_worker != NULL) {
+    for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) {
+      // De-allocation of column buffer & source buffer for remaining workers.
+      free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
+    }
+    aom_free(*cdef_worker);
+    *cdef_worker = NULL;
+  }
+}
+
+static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf,
+                                      const int num_planes) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (linebuf[plane] == NULL)
+      CHECK_MEM_ERROR(cm, linebuf[plane],
+                      aom_malloc(cdef_info->allocated_linebuf_size[plane]));
+  }
+}
+
+static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf,
+                                   uint16_t **srcbuf, const int num_planes) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  if (*srcbuf == NULL)
+    CHECK_MEM_ERROR(cm, *srcbuf,
+                    aom_memalign(16, cdef_info->allocated_srcbuf_size));
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (colbuf[plane] == NULL)
+      CHECK_MEM_ERROR(cm, colbuf[plane],
+                      aom_malloc(cdef_info->allocated_colbuf_size[plane]));
+  }
+}
+
+static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm,
+                                       AV1CdefRowSync **cdef_row_mt,
+                                       const int num_mi_rows) {
+  if (*cdef_row_mt != NULL) return;
+
+  CHECK_MEM_ERROR(cm, *cdef_row_mt,
+                  aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows));
+#if CONFIG_MULTITHREAD
+  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
+    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_,
+                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_)));
+    pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL);
+
+    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_,
+                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_)));
+    pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL);
+
+    (*cdef_row_mt)[row_idx].is_row_done = 0;
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_alloc_cdef_buffers(AV1_COMMON *const cm,
+                            AV1CdefWorkerData **cdef_worker,
+                            AV1CdefSync *cdef_sync, int num_workers,
+                            int init_worker) {
+  const int num_planes = av1_num_planes(cm);
+  size_t new_linebuf_size[MAX_MB_PLANE] = { 0 };
+  size_t new_colbuf_size[MAX_MB_PLANE] = { 0 };
+  size_t new_srcbuf_size = 0;
+  CdefInfo *const cdef_info = &cm->cdef_info;
+  // Check for configuration change
+  const int num_mi_rows =
+      (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int is_num_workers_changed =
+      cdef_info->allocated_num_workers != num_workers;
+  const int is_cdef_enabled =
+      cm->seq_params->enable_cdef && !cm->tiles.large_scale;
+
+  // num-bufs=3 represents ping-pong buffers for top linebuf,
+  // followed by bottom linebuf.
+  // ping-pong is to avoid top linebuf over-write by consecutive row.
+  int num_bufs = 3;
+  if (num_workers > 1)
+    num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  if (is_cdef_enabled) {
+    // Calculate src buffer size
+    new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE;
+    for (int plane = 0; plane < num_planes; plane++) {
+      const int shift =
+          plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x;
+      // Calculate top and bottom line buffer size
+      const int luma_stride =
+          ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+      new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs *
+                                (CDEF_VBORDER << 1) * (luma_stride >> shift);
+      // Calculate column buffer size
+      const int block_height =
+          (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
+      new_colbuf_size[plane] =
+          sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER;
+    }
+  }
+
+  // Free src, line and column buffers for worker 0 in case of reallocation
+  free_cdef_linebuf_conditional(cm, new_linebuf_size);
+  free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf,
+                             new_colbuf_size, new_srcbuf_size);
+
+  // The flag init_worker indicates if cdef_worker has to be allocated for the
+  // frame. This is passed as 1 always from decoder. At encoder side, it is 0
+  // when called for parallel frames during FPMT (where cdef_worker is shared
+  // across parallel frames) and 1 otherwise.
+  if (*cdef_worker != NULL && init_worker) {
+    if (is_num_workers_changed) {
+      // Free src and column buffers for remaining workers in case of change in
+      // num_workers
+      for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--)
+        free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
+
+      aom_free(*cdef_worker);
+      *cdef_worker = NULL;
+    } else if (num_workers > 1) {
+      // Free src and column buffers for remaining workers in case of
+      // reallocation
+      for (int idx = num_workers - 1; idx >= 1; idx--)
+        free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf,
+                                   &(*cdef_worker)[idx].srcbuf, new_colbuf_size,
+                                   new_srcbuf_size);
+    }
+  }
+
+  if (cdef_info->allocated_mi_rows != num_mi_rows)
+    free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows);
+
+  // Store allocated sizes for reallocation
+  cdef_info->allocated_srcbuf_size = new_srcbuf_size;
+  av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size);
+  av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size);
+  // Store configuration to check change in configuration
+  cdef_info->allocated_mi_rows = num_mi_rows;
+  cdef_info->allocated_num_workers = num_workers;
+
+  if (!is_cdef_enabled) return;
+
+  // Memory allocation of column buffer & source buffer (worker_0).
+  alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes);
+  alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes);
+
+  if (num_workers < 2) return;
+
+  if (init_worker) {
+    if (*cdef_worker == NULL)
+      CHECK_MEM_ERROR(cm, *cdef_worker,
+                      aom_calloc(num_workers, sizeof(**cdef_worker)));
+
+    // Memory allocation of column buffer & source buffer for remaining workers.
+    for (int idx = num_workers - 1; idx >= 1; idx--)
+      alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf,
+                      &(*cdef_worker)[idx].srcbuf, num_planes);
+  }
+
+  alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt,
+                      cdef_info->allocated_mi_rows);
+}
+
 // Assumes cm->rst_info[p].restoration_unit_size is already initialized
 void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
   const int num_planes = av1_num_planes(cm);
@@ -85,11 +319,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
   const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
 
   for (int p = 0; p < num_planes; ++p) {
     const int is_uv = p > 0;
-    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    const int ss_x = is_uv && cm->seq_params->subsampling_x;
     const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
     const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
     const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
@@ -166,10 +400,6 @@ void av1_free_context_buffers(AV1_COMMON *cm) {
   cm->mi_params.free_mi(&cm->mi_params);
 
   av1_free_above_context_buffers(&cm->above_contexts);
-
-#if CONFIG_LPF_MASK
-  av1_free_loop_filter_mask(cm);
-#endif
 }
 
 int av1_alloc_above_context_buffers(CommonContexts *above_contexts,
@@ -248,15 +478,16 @@ static int alloc_mi(CommonModeInfoParams *mi_params) {
   return 0;
 }
 
-int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height,
+                              BLOCK_SIZE min_partition_size) {
   CommonModeInfoParams *const mi_params = &cm->mi_params;
-  mi_params->set_mb_mi(mi_params, width, height);
+  mi_params->set_mb_mi(mi_params, width, height, min_partition_size);
   if (alloc_mi(mi_params)) goto fail;
   return 0;
 
 fail:
   // clear the mi_* values to force a realloc on resync
-  mi_params->set_mb_mi(mi_params, 0, 0);
+  mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4);
   av1_free_context_buffers(cm);
   return 1;
 }
@@ -273,37 +504,3 @@ void av1_remove_common(AV1_COMMON *cm) {
 void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
   mi_params->setup_mi(mi_params);
 }
-
-#if CONFIG_LPF_MASK
-int av1_alloc_loop_filter_mask(AV1_COMMON *cm) {
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-
-  // Each lfm holds bit masks for all the 4x4 blocks in a max
-  // 64x64 (128x128 for ext_partitions) region.  The stride
-  // and rows are rounded up / truncated to a multiple of 16
-  // (32 for ext_partition).
-  cm->lf.lfm_stride =
-      (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
-  cm->lf.lfm_num =
-      ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
-      cm->lf.lfm_stride;
-  cm->lf.lfm =
-      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
-  if (!cm->lf.lfm) return 1;
-
-  unsigned int i;
-  for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
-
-  return 0;
-}
-
-void av1_free_loop_filter_mask(AV1_COMMON *cm) {
-  if (cm->lf.lfm == NULL) return;
-
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-  cm->lf.lfm_num = 0;
-  cm->lf.lfm_stride = 0;
-}
-#endif
diff --git a/media/libaom/src/av1/common/alloccommon.h b/media/libaom/src/av1/common/alloccommon.h
index fe8e0c5301..fc4a8ba187 100644
--- a/media/libaom/src/av1/common/alloccommon.h
+++ b/media/libaom/src/av1/common/alloccommon.h
@@ -16,6 +16,8 @@
 
 #include "config/aom_config.h"
 
+#include "av1/common/enums.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -24,6 +26,8 @@ struct AV1Common;
 struct BufferPool;
 struct CommonContexts;
 struct CommonModeInfoParams;
+struct AV1CdefWorker;
+struct AV1CdefSyncData;
 
 void av1_remove_common(struct AV1Common *cm);
 
@@ -31,11 +35,19 @@ int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts,
                                     int num_tile_rows, int num_mi_cols,
                                     int num_planes);
 void av1_free_above_context_buffers(struct CommonContexts *above_contexts);
-int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height,
+                              BLOCK_SIZE min_partition_size);
 void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params);
 void av1_free_context_buffers(struct AV1Common *cm);
 
 void av1_free_ref_frame_buffers(struct BufferPool *pool);
+void av1_alloc_cdef_buffers(struct AV1Common *const cm,
+                            struct AV1CdefWorker **cdef_worker,
+                            struct AV1CdefSyncData *cdef_sync, int num_workers,
+                            int init_worker);
+void av1_free_cdef_buffers(struct AV1Common *const cm,
+                           struct AV1CdefWorker **cdef_worker,
+                           struct AV1CdefSyncData *cdef_sync);
 void av1_alloc_restoration_buffers(struct AV1Common *cm);
 void av1_free_restoration_buffers(struct AV1Common *cm);
 
@@ -44,11 +56,6 @@ void av1_free_state_buffers(struct AV1Common *cm);
 
 int av1_get_MBs(int width, int height);
 
-#if CONFIG_LPF_MASK
-int av1_alloc_loop_filter_mask(struct AV1Common *cm);
-void av1_free_loop_filter_mask(struct AV1Common *cm);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
index 2f3567aea3..bee496a491 100644
--- a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
+++ b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
@@ -15,13 +15,13 @@
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/av1_inv_txfm1d.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
 #include "av1/common/arm/av1_inv_txfm_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 // 1D itx types
 typedef enum ATTRIBUTE_PACKED {
@@ -259,8 +259,7 @@ static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
 }
 
 static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
-                               int8_t cos_bit, int bit) {
-  (void)bit;
+                               int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
 
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
@@ -333,8 +332,7 @@ static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
 }
 
 static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                       (int16_t)cospi[16], (int16_t)cospi[48]);
@@ -385,9 +383,7 @@ static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
   out[7] = vqnegq_s16(x[1]);
 }
 
-static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                              int bit) {
-  (void)bit;
+static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[8], step2[8];
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
@@ -426,8 +422,7 @@ static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
 }
 
 static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
-                                   int8_t cos_bit, int bit) {
-  (void)bit;
+                                   int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
   int32x4_t t32[2];
@@ -519,8 +514,7 @@ static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
 }
 
 static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
   int32x4_t t32[2];
@@ -552,9 +546,7 @@ static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
   out[15] = step1;
 }
 
-static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                               int bit) {
-  (void)bit;
+static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[16], step2[16];
 
@@ -676,8 +668,7 @@ static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
 }
 
 static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[16], step2[16];
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
@@ -787,8 +778,7 @@ static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
 }
 
 static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
-                                int8_t cos_bit, int bit) {
-  (void)bit;
+                                int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
 
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
@@ -945,8 +935,7 @@ static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
 }
 
 static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
+                                     int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                       (int16_t)cospi[40], (int16_t)cospi[24]);
@@ -1037,8 +1026,7 @@ static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
 }
 
 static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
+                                     int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
 
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
@@ -1179,9 +1167,7 @@ static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
   out[15] = vqnegq_s16(x[1]);
 }
 
-static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                               int bit) {
-  (void)bit;
+static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
 
@@ -1465,8 +1451,7 @@ static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
 }
 
 static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
   int32x4_t t32[2];
@@ -1522,8 +1507,7 @@ static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
 }
 
 static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
   int32x4_t t32[16];
@@ -1769,8 +1753,7 @@ static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
 }
 
 static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
+                                     int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1[32], step2[32];
   int32x4_t t32[16];
@@ -2166,8 +2149,7 @@ static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
 }
 
 static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
+                                     int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
@@ -2647,8 +2629,7 @@ static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
 }
 
 static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step1;
   int32x4_t t32[2];
@@ -2737,8 +2718,7 @@ static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
 }
 
 static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
+                                    int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
 
@@ -3115,8 +3095,7 @@ static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
 }
 
 static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
+                                     int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   int16x8_t step2[64], step1[64];
 
@@ -3661,7 +3640,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -3693,7 +3671,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
       int y = i * txfm_size_col;
       round_shift_for_rect(&a[y], &a[y], txfm_size_col);
     }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], INV_COS_BIT);
     av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
                                   -shift[0]);
     if (lr_flip == 1) {
@@ -3736,7 +3714,6 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -3777,7 +3754,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
     temp_b += 8;
   }
   for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
     av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
                                   -shift[1]);
   }
@@ -3803,8 +3780,6 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
@@ -3822,7 +3797,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
   for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
 
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
@@ -3838,7 +3813,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
@@ -3866,8 +3841,6 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
@@ -3889,7 +3862,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
     for (int j = 0; j < txfm_size_col; j++)
       temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
 
-    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
   }
@@ -3904,7 +3877,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
@@ -3932,8 +3905,6 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
@@ -3955,7 +3926,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
     for (int j = 0; j < txfm_size_col; j++)
       temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
 
-    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
   }
@@ -3970,7 +3941,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
@@ -3998,8 +3969,6 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
@@ -4018,7 +3987,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
   for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
     av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
@@ -4034,7 +4003,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
@@ -4062,8 +4031,6 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
@@ -4082,7 +4049,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
   for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+    row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
     av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
@@ -4098,7 +4065,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
 
     if (ud_flip == 0) {
@@ -4126,8 +4093,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
@@ -4163,7 +4128,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
       int y = i * txfm_size_col;
       round_shift_for_rect(&a[y], &a[y], input_stride);
     }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], INV_COS_BIT);
     av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
                                   -shift[0]);
     if (lr_flip == 1) {
@@ -4183,7 +4148,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
     }
   }
   for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
     av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
                                   -shift[1]);
   }
diff --git a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.h b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.h
index 9ec658291c..97099c2042 100644
--- a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.h
+++ b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.h
@@ -24,7 +24,7 @@ typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
                                   const int8_t cos_bit,
                                   const int8_t *stage_ptr);
 typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
-                               int8_t cos_bit, int bit);
+                               int8_t cos_bit);
 
 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
diff --git a/media/libaom/src/av1/common/arm/av1_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_txfm_neon.c
index 7e3a05ab7e..f955a379f7 100644
--- a/media/libaom/src/av1/common/arm/av1_txfm_neon.c
+++ b/media/libaom/src/av1/common/arm/av1_txfm_neon.c
@@ -14,8 +14,8 @@
 
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 
 void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
   assert(!(size % 4));
diff --git a/media/libaom/src/av1/common/arm/blend_a64_hmask_neon.c b/media/libaom/src/av1/common/arm/blend_a64_hmask_neon.c
index 7134f183e3..4639d4c416 100644
--- a/media/libaom/src/av1/common/arm/blend_a64_hmask_neon.c
+++ b/media/libaom/src/av1/common/arm/blend_a64_hmask_neon.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "aom_dsp/aom_dsp_common.h"
 #include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
diff --git a/media/libaom/src/av1/common/arm/blend_a64_vmask_neon.c b/media/libaom/src/av1/common/arm/blend_a64_vmask_neon.c
index 194e94c8c0..061af74055 100644
--- a/media/libaom/src/av1/common/arm/blend_a64_vmask_neon.c
+++ b/media/libaom/src/av1/common/arm/blend_a64_vmask_neon.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "aom_dsp/aom_dsp_common.h"
 #include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
diff --git a/media/libaom/src/av1/common/arm/cdef_block_neon.c b/media/libaom/src/av1/common/arm/cdef_block_neon.c
new file mode 100644
index 0000000000..7a8fed50f1
--- /dev/null
+++ b/media/libaom/src/av1/common/arm/cdef_block_neon.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
+                                        const uint8_t *src, int sstride, int v,
+                                        int h) {
+  int j;
+  for (int i = 0; i < v; i++) {
+    for (j = 0; j < (h & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < h; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
+
+void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
+                             int stride, int32_t *var_out_1st,
+                             int32_t *var_out_2nd, int coeff_shift,
+                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Process first 8x8.
+  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+  // Process second 8x8.
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
diff --git a/media/libaom/src/av1/common/arm/convolve_neon.c b/media/libaom/src/av1/common/arm/convolve_neon.c
index 51c96961cf..f0e4bedccb 100644
--- a/media/libaom/src/av1/common/arm/convolve_neon.c
+++ b/media/libaom/src/av1/common/arm/convolve_neon.c
@@ -16,12 +16,12 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
@@ -194,16 +194,16 @@ static INLINE uint8x8_t convolve8_vert_8x4_s32(
 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
+                            const int subpel_x_qn,
                             ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    return;
+  }
   const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
   const int8_t bits = FILTER_BITS - conv_params->round_0;
 
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)filter_params_y;
-
   uint8x8_t t0;
 #if defined(__aarch64__)
   uint8x8_t t1, t2, t3;
@@ -401,26 +401,36 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                         0);  // 10 11 12 13
           dst += dst_stride;
         } else if ((w == 2) && (h > 4)) {
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
+                        0);  // 00 01
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
+                        0);  // 10 11
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2),
+                        0);  // 20 21
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3),
+                        0);  // 30 31
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
+                        2);  // 40 41
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
+                        2);  // 50 51
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2),
+                        2);  // 60 61
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3),
+                        2);  // 70 71
           dst += dst_stride;
         } else if ((w == 2) && (h == 2)) {
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0),
+                        0);  // 00 01
           dst += dst_stride;
-          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1),
+                        0);  // 10 11
           dst += dst_stride;
         }
         h -= 8;
@@ -601,22 +611,17 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
+                            const int subpel_y_qn) {
+  if (filter_params_y->taps > 8) {
+    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                        subpel_y_qn);
+    return;
+  }
   const int vert_offset = filter_params_y->taps / 2 - 1;
 
   src -= vert_offset * src_stride;
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
@@ -942,6 +947,12 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
   int im_dst_stride;
   int width, height;
 #if defined(__aarch64__)
@@ -1543,51 +1554,307 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
     }
   }
 }
-void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  const uint8_t *src1;
-  uint8_t *dst1;
-  int y;
 
-  if (!(w & 0x0F)) {
-    for (y = 0; y < h; ++y) {
-      src1 = src;
-      dst1 = dst;
-      for (int x = 0; x < (w >> 4); ++x) {
-        vst1q_u8(dst1, vld1q_u8(src1));
-        src1 += 16;
-        dst1 += 16;
+static INLINE void scaledconvolve_horiz_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  y = h;
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+          uint8x8_t s[8], d;
+          int16x8_t ss[4];
+          int16x4_t t[8], tt;
+
+          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+          t[0] = vget_low_s16(ss[0]);
+          t[1] = vget_low_s16(ss[1]);
+          t[2] = vget_low_s16(ss[2]);
+          t[3] = vget_low_s16(ss[3]);
+          t[4] = vget_high_s16(ss[0]);
+          t[5] = vget_high_s16(ss[1]);
+          t[6] = vget_high_s16(ss[2]);
+          t[7] = vget_high_s16(ss[3]);
+
+          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+                           filters, filter3, filter4);
+          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      {
+        const uint8x8x4_t d4 = vld4_u8(temp);
+        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[0]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[1]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[2]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[3]), 0);
       }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else if (!(w & 0x07)) {
-    for (y = 0; y < h; ++y) {
-      vst1_u8(dst, vld1_u8(src));
-      src += src_stride;
-      dst += dst_stride;
+      x += 4;
+    } while (x < w);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    y -= 4;
+  } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = (h + 7) & ~7;
+
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      uint8x8_t d[8];
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8];
+          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+                      &s[5], &s[6], &s[7]);
+          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                           &s[7]);
+          d[0] = scale_filter_8(s, filters);
+          vst1_u8(&temp[8 * z], d[0]);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+                  &d[7]);
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
+      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
+      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
+      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
+      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
+      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
+      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
+      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
+      x += 8;
+    } while (x < w);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
+      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
+      uint8x8_t s[8], d;
+      int16x4_t t[8], tt;
+
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
+                       filter3, filter4);
+      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
-  } else if (!(w & 0x03)) {
-    for (y = 0; y < h; ++y) {
-      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
-      src += src_stride;
-      dst += dst_stride;
+
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      d = scale_filter_8(s, filters);
+      vst1_u8(dst, d);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
-  } else if (!(w & 0x01)) {
-    for (y = 0; y < h; ++y) {
-      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
-      src += src_stride;
-      dst += dst_stride;
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  int x, y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      x = 0;
+      do {
+        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+        uint8x16_t ss[8];
+        uint8x8_t s[8], d[2];
+        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+                     &ss[5], &ss[6], &ss[7]);
+        s[0] = vget_low_u8(ss[0]);
+        s[1] = vget_low_u8(ss[1]);
+        s[2] = vget_low_u8(ss[2]);
+        s[3] = vget_low_u8(ss[3]);
+        s[4] = vget_low_u8(ss[4]);
+        s[5] = vget_low_u8(ss[5]);
+        s[6] = vget_low_u8(ss[6]);
+        s[7] = vget_low_u8(ss[7]);
+        d[0] = scale_filter_8(s, filters);
+
+        s[0] = vget_high_u8(ss[0]);
+        s[1] = vget_high_u8(ss[1]);
+        s[2] = vget_high_u8(ss[2]);
+        s[3] = vget_high_u8(ss[3]);
+        s[4] = vget_high_u8(ss[4]);
+        s[5] = vget_high_u8(ss[5]);
+        s[6] = vget_high_u8(ss[6]);
+        s[7] = vget_high_u8(ss[7]);
+        d[1] = scale_filter_8(s, filters);
+        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+        src_y += 16;
+        x += 16;
+      } while (x < w);
+    } else {
+      memcpy(dst, &src_y[3 * src_stride], w);
     }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
   }
 }
diff --git a/media/libaom/src/av1/common/arm/convolve_neon.h b/media/libaom/src/av1/common/arm/convolve_neon.h
index dbcfab6318..27a996ce9e 100644
--- a/media/libaom/src/av1/common/arm/convolve_neon.h
+++ b/media/libaom/src/av1/common/arm/convolve_neon.h
@@ -15,6 +15,69 @@
 
 #define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
 
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filters,
+                                    const int16x4_t filter3,
+                                    const int16x4_t filter4) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x4_t sum;
+
+  sum = vmul_lane_s16(s0, filters_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
+  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filters,
+                                    const int16x8_t filter3,
+                                    const int16x8_t filter4) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x8_t sum;
+
+  sum = vmulq_lane_s16(s0, filters_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
+  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+  return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+                                       const int16x8_t filters) {
+  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
+  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
+  int16x8_t ss[8];
+
+  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
+                     filters, filter3, filter4);
+}
+
 static INLINE uint8x8_t wiener_convolve8_vert_4x8(
     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
diff --git a/media/libaom/src/av1/common/arm/highbd_inv_txfm_neon.c b/media/libaom/src/av1/common/arm/highbd_inv_txfm_neon.c
new file mode 100644
index 0000000000..dd8dee338e
--- /dev/null
+++ b/media/libaom/src/av1/common/arm/highbd_inv_txfm_neon.c
@@ -0,0 +1,6052 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you canzip
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#if defined(__aarch64__)
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
+  do {                                                        \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
+    y0 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y1 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+    y2 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y3 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+  } while (0)
+#else
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
+  do {                                                                   \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
+    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
+                   swap_high.val[0], 2);                                 \
+    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
+                   swap_high.val[1], 2);                                 \
+    y2 = vextq_s32(swap_low.val[0],                                      \
+                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
+    y3 = vextq_s32(swap_low.val[1],                                      \
+                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
+  } while (0)
+#endif  // (__aarch64__)
+
+static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size,
+                                                 const int bit) {
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int i;
+  for (i = 0; i < size; i++) {
+    int32x4_t vradd = vaddq_s32(input[i], rnding);
+    output[i] = vshlq_s32(vradd, v_bit);
+  }
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size,
+                                                      const int bit,
+                                                      const int val) {
+  const int32x4_t sqrt2 = vdupq_n_s32(val);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int32x4_t rnding2 = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+  int i;
+  if (bit > 0) {
+    for (i = 0; i < size; i++) {
+      int32x4_t vradd = vshlq_s32(input[i], rnding);
+      const int32x4_t r0 = vshlq_s32(vradd, v_bit);
+      const int32x4_t r1 = vmlaq_s32(rnding2, sqrt2, r0);
+      output[i] = vshrq_n_s32(r1, NewSqrt2Bits);
+    }
+  } else {
+    for (i = 0; i < size; i++) {
+      const int32x4_t r0 = vshlq_s32(input[i], v_bit);
+      const int32x4_t r1 = vmlaq_s32(rnding2, sqrt2, r0);
+      output[i] = vshrq_n_s32(r1, NewSqrt2Bits);
+    }
+  }
+}
+
+static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
+                                        const int32_t *n1, const int32x4_t *w1,
+                                        const int32x4_t *v_bit,
+                                        const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vmlaq_n_s32(x, *w1, *n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode11_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, -*n0);
+  x = vmlaq_n_s32(x, *w1, -*n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode01_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vmlsq_n_s32(x, *w1, *n1);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_mode10_r(
+    const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+    const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w1, *n1);
+  x = vmlsq_n_s32(x, *w0, *n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0,
+                                          const int32x4_t *w0,
+                                          const int32x4_t *v_bit,
+                                          const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, *n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
+                                            const int32x4_t *w0,
+                                            const int32x4_t *v_bit,
+                                            const int32x4_t *rnding) {
+  int32x4_t x;
+  x = vmlaq_n_s32(*rnding, *w0, -*n0);
+  x = vshlq_s32(x, *v_bit);
+  return x;
+}
+
+static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
+                                      const int num_cols);
+
+typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
+                                  int32_t do_cols, int32_t bd,
+                                  int32_t out_shift);
+
+static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
+                                          const uint16x8_t *max) {
+  int16x8_t clamped;
+  clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max));
+  clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min));
+  return vreinterpretq_u16_s16(clamped);
+}
+
+static INLINE void round_shift_4x4(int32x4_t *in, int shift,
+                                   const int32x4_t *rnding) {
+  if (shift != 0) {
+    const int32x4_t v_shift = vdupq_n_s32(-shift);
+    int32x4_t vradd = vaddq_s32(in[0], *rnding);
+    in[0] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[1], *rnding);
+    in[1] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[2], *rnding);
+    in[2] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[3], *rnding);
+    in[3] = vshlq_s32(vradd, v_shift);
+  }
+}
+
+static void round_shift_8x8(int32x4_t *in, int shift, const int32x4_t *rnding) {
+  if (shift != 0) {
+    const int32x4_t v_shift = vdupq_n_s32(-shift);
+    int32x4_t vradd = vaddq_s32(in[0], *rnding);
+    in[0] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[1], *rnding);
+    in[1] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[2], *rnding);
+    in[2] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[3], *rnding);
+    in[3] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[4], *rnding);
+    in[4] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[5], *rnding);
+    in[5] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[6], *rnding);
+    in[6] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[7], *rnding);
+    in[7] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[8], *rnding);
+    in[8] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[9], *rnding);
+    in[9] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[10], *rnding);
+    in[10] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[11], *rnding);
+    in[11] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[12], *rnding);
+    in[12] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[13], *rnding);
+    in[13] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[14], *rnding);
+    in[14] = vshlq_s32(vradd, v_shift);
+    vradd = vaddq_s32(in[15], *rnding);
+    in[15] = vshlq_s32(vradd, v_shift);
+  }
+}
+
+static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
+                                  const int32x4_t *clamp_lo,
+                                  const int32x4_t *clamp_hi, int size) {
+  int32x4_t a0, a1;
+  for (int i = 0; i < size; i += 4) {
+    a0 = vmaxq_s32(in[i], *clamp_lo);
+    out[i] = vminq_s32(a0, *clamp_hi);
+
+    a1 = vmaxq_s32(in[i + 1], *clamp_lo);
+    out[i + 1] = vminq_s32(a1, *clamp_hi);
+
+    a0 = vmaxq_s32(in[i + 2], *clamp_lo);
+    out[i + 2] = vminq_s32(a0, *clamp_hi);
+
+    a1 = vmaxq_s32(in[i + 3], *clamp_lo);
+    out[i + 3] = vminq_s32(a1, *clamp_hi);
+  }
+}
+
+static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
+                                                   int32x4_t res0,
+                                                   int32x4_t res1,
+                                                   const int bd) {
+  const uint16x8_t v_zero = vdupq_n_u16(0);
+  int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero);
+  int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1);
+  uint16x8x2_t x;
+  x.val[0] = vreinterpretq_u16_s32(
+      vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred))));
+  x.val[1] = vreinterpretq_u16_s32(
+      vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred))));
+  x.val[0] = vreinterpretq_u16_s32(
+      vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val));
+  x.val[0] = vreinterpretq_u16_s32(
+      vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val));
+  x.val[1] = vreinterpretq_u16_s32(
+      vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val));
+  x.val[1] = vreinterpretq_u16_s32(
+      vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val));
+  uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
+                                vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
+  return res;
+}
+
+static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
+                                                   int32x4_t res0,
+                                                   const int bd) {
+  uint16x4_t x0_ = vreinterpret_u16_s16(
+      vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred))));
+  uint16x8_t x0 = vcombine_u16(x0_, x0_);
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  x0 = highbd_clamp_u16(&x0, &vmin, &vmax);
+  return vget_low_u16(x0);
+}
+
+static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
+                                                int stride, int flipud,
+                                                int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    uint16x4_t v = vld1_u16(output + i * stride);
+    uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd);
+
+    vst1_u16(output + i * stride, u);
+  }
+}
+
+static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
+                                                int stride, int flipud,
+                                                int height, const int bd) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    uint16x8_t v = vld1q_u16(output + i * stride);
+    uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd);
+
+    vst1q_u16(output + i * stride, u);
+  }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           int32x4_t *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1q_s32(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
+  in[0] = vld1q_s32(coeff + 0);
+  in[1] = vld1q_s32(coeff + 4);
+  in[2] = vld1q_s32(coeff + 8);
+  in[3] = vld1q_s32(coeff + 12);
+}
+
+static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
+                        int32x4_t *out0, int32x4_t *out1,
+                        const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
+  int32x4_t a0 = vaddq_s32(in0, in1);
+  int32x4_t a1 = vsubq_s32(in0, in1);
+
+  a0 = vmaxq_s32(a0, *clamp_lo);
+  a0 = vminq_s32(a0, *clamp_hi);
+  a1 = vmaxq_s32(a1, *clamp_lo);
+  a1 = vminq_s32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
+                                 const int32x4_t *clamp_lo,
+                                 const int32x4_t *clamp_hi,
+                                 const int32x4_t *v_shift) {
+  int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift);
+  int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift);
+
+  in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo);
+  in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi);
+  in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo);
+  in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi);
+
+  *in0 = in0_w_offset;
+  *in1 = in1_w_offset;
+}
+
+static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
+                                 v_bit, rnding);
+  bf1[30] =
+      half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
+  bf1[17] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
+                                 v_bit, rnding);
+  bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+                                   v_bit, rnding);
+  bf1[18] = temp2;
+
+  temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
+                            rnding);
+  bf1[21] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
+                                 v_bit, rnding);
+  bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+                                   v_bit, rnding);
+  bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14],
+                                 v_bit, rnding);
+  bf1[14] =
+      half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
+  bf1[9] = temp1;
+
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13],
+                                 v_bit, rnding);
+  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
+                                   v_bit, rnding);
+  bf1[10] = temp2;
+
+  addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+  addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+  addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+  addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+  addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
+                                 v_bit, rnding);
+  bf1[6] =
+      half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
+  bf1[5] = temp1;
+
+  addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
+                                 v_bit, rnding);
+  bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit,
+                            rnding);
+  bf1[18] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
+                                 v_bit, rnding);
+  bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit,
+                            rnding);
+  bf1[19] = temp2;
+  temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
+                                 v_bit, rnding);
+  bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
+                                   v_bit, rnding);
+  bf1[20] = temp1;
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
+                                   v_bit, rnding);
+  bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13],
+                                 v_bit, rnding);
+  bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit,
+                            rnding);
+  bf1[10] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12],
+                                 v_bit, rnding);
+  bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit,
+                            rnding);
+  bf1[11] = temp2;
+
+  addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+  addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+  addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+  addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+  addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int32x4_t temp1, temp2;
+  addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+  addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+  addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+  addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+  addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
+                                 v_bit, rnding);
+  bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit,
+                            rnding);
+  bf1[20] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
+                                 v_bit, rnding);
+  bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit,
+                            rnding);
+  bf1[21] = temp2;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
+                                 v_bit, rnding);
+  bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit,
+                            rnding);
+  bf1[22] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
+                                 v_bit, rnding);
+  bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit,
+                            rnding);
+  bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi) {
+  addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+  addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+  addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+  addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+  addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+  addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+  addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+  addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+  addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+  addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+  addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+  addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+  addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+  addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+  addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+  addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    for (int i = 0; i < 32; i += 8) {
+      round_shift_4x4(out + i, out_shift, &rnding);
+      round_shift_4x4(out + i + 4, out_shift, &rnding);
+    }
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
+                           int32x4_t *out0, int32x4_t *out1,
+                           const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
+                           const int32x4_t *v_shift, int32x4_t *offset) {
+  int32x4_t a0 = vaddq_s32(*offset, *in0);
+  int32x4_t a1 = vsubq_s32(*offset, *in1);
+
+  a0 = vshlq_s32(a0, *v_shift);
+  a1 = vshlq_s32(a1, *v_shift);
+
+  a0 = vmaxq_s32(a0, *clamp_lo);
+  a0 = vminq_s32(a0, *clamp_hi);
+  a1 = vmaxq_s32(a1, *clamp_lo);
+  a1 = vminq_s32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  int32x4_t u0, u1, u2, u3;
+  int32x4_t v0, v1, v2, v3, x, y;
+
+  // Stage 0-1-2
+
+  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], u0, u1, u2, u3);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  x = vmlaq_n_s32(rnding, u0, cospi[32]);
+  y = vmulq_n_s32(u2, cospi[32]);
+  v0 = vaddq_s32(x, y);
+  v0 = vshlq_s32(v0, v_bit);
+
+  v1 = vsubq_s32(x, y);
+  v1 = vshlq_s32(v1, v_bit);
+
+  x = vmlaq_n_s32(rnding, u1, cospi[48]);
+  v2 = vmlsq_n_s32(x, u3, cospi[16]);
+  v2 = vshlq_s32(v2, v_bit);
+
+  x = vmlaq_n_s32(rnding, u1, cospi[16]);
+  v3 = vmlaq_n_s32(x, u3, cospi[48]);
+  v3 = vshlq_s32(v3, v_bit);
+  // Stage 3
+  addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift);
+    shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift);
+  }
+}
+
+static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const int32x4_t zero = vdupq_n_s32(0);
+  int64x2_t rnding = vdupq_n_s64(1 << (bit + 4 - 1));
+  const int32x2_t mul = vdup_n_s32(1 << 4);
+  int32x4_t t;
+  int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int32x4_t x0, x1, x2, x3;
+  int32x4_t u0, u1, u2, u3;
+
+  TRANSPOSE_4X4(in[0], in[1], in[2], in[3], x0, x1, x2, x3);
+
+  s0 = vmulq_n_s32(x0, sinpi[1]);
+  s1 = vmulq_n_s32(x0, sinpi[2]);
+  s2 = vmulq_n_s32(x1, sinpi[3]);
+  s3 = vmulq_n_s32(x2, sinpi[4]);
+  s4 = vmulq_n_s32(x2, sinpi[1]);
+  s5 = vmulq_n_s32(x3, sinpi[2]);
+  s6 = vmulq_n_s32(x3, sinpi[4]);
+  t = vsubq_s32(x0, x2);
+  s7 = vaddq_s32(t, x3);
+
+  t = vaddq_s32(s0, s3);
+  s0 = vaddq_s32(t, s5);
+  t = vsubq_s32(s1, s4);
+  s1 = vsubq_s32(t, s6);
+  s3 = s2;
+  s2 = vmulq_n_s32(s7, sinpi[3]);
+
+  u0 = vaddq_s32(s0, s3);
+  u1 = vaddq_s32(s1, s3);
+  u2 = s2;
+  t = vaddq_s32(s0, s1);
+  u3 = vsubq_s32(t, s3);
+
+  // u0
+  int32x4x2_t u0x;
+  u0x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
+  u0x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding));
+
+  u0 = vextq_s32(u0, zero, 1);
+  u0x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
+  u0x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding));
+
+  u0x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u0x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
+#if defined(__aarch64__)
+  u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
+                                        vreinterpretq_s64_s32(u0x.val[1])));
+#else
+  u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
+#endif  // (__aarch64__)
+  // u1
+  int32x4x2_t u1x;
+  u1x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
+  u1x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding));
+
+  u1 = vextq_s32(u1, zero, 1);
+  u1x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
+  u1x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding));
+
+  u1x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u1x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
+#if defined(__aarch64__)
+  u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
+                                        vreinterpretq_s64_s32(u1x.val[1])));
+#else
+  u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
+#endif  // (__aarch64__)
+
+  // u2
+  int32x4x2_t u2x;
+  u2x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
+  u2x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding));
+
+  u2 = vextq_s32(u2, zero, 1);
+  u2x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
+  u2x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding));
+
+  u2x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u2x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
+#if defined(__aarch64__)
+  u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
+                                        vreinterpretq_s64_s32(u2x.val[1])));
+#else
+  u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
+#endif  // (__aarch64__)
+
+  // u3
+  int32x4x2_t u3x;
+  u3x.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
+  u3x.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding));
+
+  u3 = vextq_s32(u3, zero, 1);
+  u3x.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
+  u3x.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding));
+
+  u3x.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1));
+  u3x.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
+#if defined(__aarch64__)
+  u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
+                                        vreinterpretq_s64_s32(u3x.val[1])));
+#else
+  u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
+#endif  // (__aarch64__)
+
+  out[0] = u0;
+  out[1] = u1;
+  out[2] = u2;
+  out[3] = u3;
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+}
+
+static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint32x4_t u0, u1, u2, u3;
+  uint16x4_t v0, v1, v2, v3;
+  const int32x4_t rnding = vdupq_n_s32(1 << (shift - 1));
+  round_shift_4x4(in, shift, &rnding);
+
+  v0 = vld1_u16(output + 0 * stride);
+  v1 = vld1_u16(output + 1 * stride);
+  v2 = vld1_u16(output + 2 * stride);
+  v3 = vld1_u16(output + 3 * stride);
+
+  if (fliplr) {
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0]));
+    in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1]));
+    in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2]));
+    in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+    u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3]));
+    in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
+  }
+
+  if (flipud) {
+    u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0);
+    u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1);
+    u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2);
+    u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3);
+  } else {
+    u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0);
+    u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1);
+    u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2);
+    u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3);
+  }
+
+  uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
+  uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  u4 = highbd_clamp_u16(&u4, &vmin, &vmax);
+  u5 = highbd_clamp_u16(&u5, &vmin, &vmax);
+
+  vst1_u16(output + 0 * stride, vget_low_u16(u4));
+  vst1_u16(output + 1 * stride, vget_high_u16(u4));
+  vst1_u16(output + 2 * stride, vget_low_u16(u5));
+  vst1_u16(output + 3 * stride, vget_high_u16(u5));
+}
+
+static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  int32x4_t v[4];
+  int32x4_t zero = vdupq_n_s32(0);
+  int32x2_t fact = vdup_n_s32(NewSqrt2);
+  int32x4x2_t a0;
+  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+
+  for (int i = 0; i < 4; i++) {
+    a0.val[0] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
+    a0.val[0] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
+    a0.val[1] = vextq_s32(in[i], zero, 1);
+    a0.val[1] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
+    a0.val[1] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
+
+    a0 = vzipq_s32(a0.val[0], a0.val[1]);
+#if defined(__aarch64__)
+    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
+        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
+#else
+    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
+#endif
+  }
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+  v[0] = out[0];
+  v[1] = out[1];
+  v[2] = out[2];
+  v[3] = out[3];
+
+  // Transpose for 4x4
+  TRANSPOSE_4X4(v[0], v[1], v[2], v[3], out[0], out[1], out[2], out[3]);
+}
+
+void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[4];
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in);
+      idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in);
+      iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in);
+      iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    default: assert(0);
+  }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
+  in[0] = vld1q_s32(coeff + 0);
+  in[1] = vld1q_s32(coeff + 4);
+  in[2] = vld1q_s32(coeff + 8);
+  in[3] = vld1q_s32(coeff + 12);
+  in[4] = vld1q_s32(coeff + 16);
+  in[5] = vld1q_s32(coeff + 20);
+  in[6] = vld1q_s32(coeff + 24);
+  in[7] = vld1q_s32(coeff + 28);
+  in[8] = vld1q_s32(coeff + 32);
+  in[9] = vld1q_s32(coeff + 36);
+  in[10] = vld1q_s32(coeff + 40);
+  in[11] = vld1q_s32(coeff + 44);
+  in[12] = vld1q_s32(coeff + 48);
+  in[13] = vld1q_s32(coeff + 52);
+  in[14] = vld1q_s32(coeff + 56);
+  in[15] = vld1q_s32(coeff + 60);
+}
+
+static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  int col;
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  // Note:
+  //  Even column: 0, 2, ..., 14
+  //  Odd column: 1, 3, ..., 15
+  //  one even column plus one odd column constructs one row (8 coeffs)
+  //  total we have 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    // stage 2
+    u0 = in[0 * 2 + col];
+    u1 = in[4 * 2 + col];
+    u2 = in[2 * 2 + col];
+    u3 = in[6 * 2 + col];
+
+    x = vmulq_n_s32(in[1 * 2 + col], cospi[56]);
+    u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]);
+    u4 = vaddq_s32(u4, rnding);
+    u4 = vshlq_s32(u4, v_bit);
+
+    x = vmulq_n_s32(in[1 * 2 + col], cospi[8]);
+    u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]);
+    u7 = vaddq_s32(u7, rnding);
+    u7 = vshlq_s32(u7, v_bit);
+
+    x = vmulq_n_s32(in[5 * 2 + col], cospi[24]);
+    u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]);
+    u5 = vaddq_s32(u5, rnding);
+    u5 = vshlq_s32(u5, v_bit);
+
+    x = vmulq_n_s32(in[5 * 2 + col], cospi[40]);
+    u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]);
+    u6 = vaddq_s32(u6, rnding);
+    u6 = vshlq_s32(u6, v_bit);
+
+    // stage 3
+    x = vmulq_n_s32(u0, cospi[32]);
+    y = vmulq_n_s32(u1, cospi[32]);
+    v0 = vaddq_s32(x, y);
+    v0 = vaddq_s32(v0, rnding);
+    v0 = vshlq_s32(v0, v_bit);
+
+    v1 = vsubq_s32(x, y);
+    v1 = vaddq_s32(v1, rnding);
+    v1 = vshlq_s32(v1, v_bit);
+
+    x = vmulq_n_s32(u2, cospi[48]);
+    v2 = vmlaq_n_s32(x, u3, -cospi[16]);
+    v2 = vaddq_s32(v2, rnding);
+    v2 = vshlq_s32(v2, v_bit);
+
+    x = vmulq_n_s32(u2, cospi[16]);
+    v3 = vmlaq_n_s32(x, u3, cospi[48]);
+    v3 = vaddq_s32(v3, rnding);
+    v3 = vshlq_s32(v3, v_bit);
+
+    addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+    addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+    // stage 4
+    addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+    addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+    u4 = v4;
+    u7 = v7;
+
+    x = vmulq_n_s32(v5, cospi[32]);
+    y = vmulq_n_s32(v6, cospi[32]);
+    u6 = vaddq_s32(y, x);
+    u6 = vaddq_s32(u6, rnding);
+    u6 = vshlq_s32(u6, v_bit);
+
+    u5 = vsubq_s32(y, x);
+    u5 = vaddq_s32(u5, rnding);
+    u5 = vshlq_s32(u5, v_bit);
+
+    // stage 5
+    addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+                &clamp_hi);
+    addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+                &clamp_hi);
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+  }
+}
+
+static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t kZero = vdupq_n_s32(0);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[8], v[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1-2
+  // (1)
+  u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
+  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
+  u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[2] = vsubq_s32(kZero, u[4]);
+    out[4] = u[6];
+    out[6] = vsubq_s32(kZero, u[2]);
+    out[8] = u[3];
+    out[10] = vsubq_s32(kZero, u[7]);
+    out[12] = u[5];
+    out[14] = vsubq_s32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+
+  // Odd 8 points: 1, 3, ..., 15
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
+  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
+  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[1] = u[0];
+    out[3] = vsubq_s32(kZero, u[4]);
+    out[5] = u[6];
+    out[7] = vsubq_s32(kZero, u[2]);
+    out[9] = u[3];
+    out[11] = vsubq_s32(kZero, u[7]);
+    out[13] = u[5];
+    out[15] = vsubq_s32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  out[0] = vaddq_s32(in[0], in[0]);
+  out[1] = vaddq_s32(in[1], in[1]);
+  out[2] = vaddq_s32(in[2], in[2]);
+  out[3] = vaddq_s32(in[3], in[3]);
+  out[4] = vaddq_s32(in[4], in[4]);
+  out[5] = vaddq_s32(in[5], in[5]);
+  out[6] = vaddq_s32(in[6], in[6]);
+  out[7] = vaddq_s32(in[7], in[7]);
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding);
+    round_shift_4x4(out + 4, out_shift, &rnding);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8);
+  }
+}
+
+static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
+                                int32x4_t res_hi, int fliplr, int bd) {
+  uint16x8x2_t x;
+
+  if (fliplr) {
+    res_lo = vrev64q_s32(res_lo);
+    res_lo = vextq_s32(res_lo, res_lo, 2);
+    res_hi = vrev64q_s32(res_hi);
+    res_hi = vextq_s32(res_hi, res_hi, 2);
+    x.val[0] = vreinterpretq_u16_s32(
+        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred))));
+    x.val[1] = vreinterpretq_u16_s32(
+        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred))));
+
+  } else {
+    x.val[0] = vreinterpretq_u16_s32(
+        vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred))));
+    x.val[1] = vreinterpretq_u16_s32(
+        vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred))));
+  }
+
+  uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
+                               vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
+  const uint16x8_t vmin = vdupq_n_u16(0);
+  const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
+  return highbd_clamp_u16(&x2, &vmin, &vmax);
+}
+
+static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7;
+  uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
+  const int32x4_t rnding = vdupq_n_s32(1 << (shift - 1));
+  round_shift_8x8(in, shift, &rnding);
+
+  v0 = vld1q_u16(output + 0 * stride);
+  v1 = vld1q_u16(output + 1 * stride);
+  v2 = vld1q_u16(output + 2 * stride);
+  v3 = vld1q_u16(output + 3 * stride);
+  v4 = vld1q_u16(output + 4 * stride);
+  v5 = vld1q_u16(output + 5 * stride);
+  v6 = vld1q_u16(output + 6 * stride);
+  v7 = vld1q_u16(output + 7 * stride);
+
+  if (flipud) {
+    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+  } else {
+    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+  }
+
+  vst1q_u16(output + 0 * stride, u0);
+  vst1q_u16(output + 1 * stride, u1);
+  vst1q_u16(output + 2 * stride, u2);
+  vst1q_u16(output + 3 * stride, u3);
+  vst1q_u16(output + 4 * stride, u4);
+  vst1q_u16(output + 5 * stride, u5);
+  vst1q_u16(output + 6 * stride, u6);
+  vst1q_u16(output + 7 * stride, u7);
+}
+
+void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[16], out[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      idct8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 0, bd, -shift[0]);
+      transpose_8x8(in, out);
+      iadst8x8_neon(out, in, INV_COS_BIT, 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+      break;
+    default: assert(0);
+  }
+}
+
+static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                              int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1-2-3
+  x = vmulq_n_s32(in[0], cospi[32]);
+  x = vaddq_s32(vshlq_s32(x, v_bit), rnding);
+
+  // stage 4-5
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    x = vaddq_s32(x, offset);
+    x = vshlq_s32(x, vdupq_n_s32(-out_shift));
+  }
+
+  x = vmaxq_s32(x, clamp_lo);
+  x = vminq_s32(x, clamp_hi);
+  out[0] = x;
+  out[1] = x;
+  out[2] = x;
+  out[3] = x;
+  out[4] = x;
+  out[5] = x;
+  out[6] = x;
+  out[7] = x;
+}
+
+static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0
+  // stage 1
+  // stage 2
+  u0 = in[0];
+  u1 = in[4];
+  u2 = in[2];
+  u3 = in[6];
+
+  x = vmlaq_n_s32(rnding, in[1], cospi[56]);
+  u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
+  u4 = vshlq_s32(u4, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[1], cospi[8]);
+  u7 = vmlaq_n_s32(x, in[7], cospi[56]);
+  u7 = vshlq_s32(u7, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[5], cospi[24]);
+  u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
+  u5 = vshlq_s32(u5, v_bit);
+
+  x = vmlaq_n_s32(rnding, in[5], cospi[40]);
+  u6 = vmlaq_n_s32(x, in[3], cospi[24]);
+  u6 = vshlq_s32(u6, v_bit);
+
+  // stage 3
+  x = vmlaq_n_s32(rnding, u0, cospi[32]);
+  y = vmulq_n_s32(u1, cospi[32]);
+  v0 = vaddq_s32(x, y);
+  v0 = vshlq_s32(v0, v_bit);
+
+  v1 = vsubq_s32(x, y);
+  v1 = vshlq_s32(v1, v_bit);
+
+  x = vmlaq_n_s32(rnding, u2, cospi[48]);
+  v2 = vmlaq_n_s32(x, u3, -cospi[16]);
+  v2 = vshlq_s32(v2, v_bit);
+
+  x = vmlaq_n_s32(rnding, u2, cospi[16]);
+  v3 = vmlaq_n_s32(x, u3, cospi[48]);
+  v3 = vshlq_s32(v3, v_bit);
+
+  addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+  addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+  // stage 4
+  addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+  addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+  u4 = v4;
+  u7 = v7;
+
+  x = vmulq_n_s32(v5, cospi[32]);
+  y = vmlaq_n_s32(rnding, v6, cospi[32]);
+  u6 = vaddq_s32(y, x);
+  u6 = vshlq_s32(u6, v_bit);
+
+  u5 = vsubq_s32(y, x);
+  u5 = vshlq_s32(u5, v_bit);
+
+  // stage 5
+  addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+  addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+  addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+    round_shift_4x4(out, out_shift, &rnding32);
+    round_shift_4x4(out + 4, out_shift, &rnding32);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+  }
+}
+
+static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                               int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int32x4_t u[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]);
+  u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit);
+
+  // stage 3-4
+  int32x4_t temp1, temp2;
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+  u[4] = temp1;
+
+  temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
+  u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // stage 5-6
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
+  x = vmulq_n_s32(u[1], cospi[32]);
+  u[2] = vaddq_s32(temp1, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(temp1, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
+  x = vmulq_n_s32(u[5], cospi[32]);
+  u[6] = vaddq_s32(temp1, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(temp1, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[4]);
+    out[2] = u[6];
+    out[3] = vnegq_s32(u[2]);
+    out[4] = u[3];
+    out[5] = vnegq_s32(u[7]);
+    out[6] = u[5];
+    out[7] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+  }
+}
+
+static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+                              int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  // const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[8], v[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+
+  u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2)
+  u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3)
+  u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4)
+  u[6] = vmulq_n_s32(in[1], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
+  u[6] = vaddq_s32(u[6], rnding);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmulq_n_s32(in[1], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
+  u[7] = vaddq_s32(u[7], rnding);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
+  u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
+  u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[4]);
+    out[2] = u[6];
+    out[3] = vnegq_s32(u[2]);
+    out[4] = u[3];
+    out[5] = vnegq_s32(u[7]);
+    out[6] = u[5];
+    out[7] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+  }
+}
+
+static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-4
+  in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]);
+  in[0] = vshlq_s32(in[0], v_bit);
+
+  // stage 5-7
+  if (!do_cols) {
+    log_range = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    if (out_shift != 0) {
+      int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+      in[0] = vaddq_s32(in[0], offset);
+      in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift));
+    }
+  }
+
+  in[0] = vmaxq_s32(in[0], clamp_lo);
+  in[0] = vminq_s32(in[0], clamp_hi);
+  out[0] = in[0];
+  out[1] = in[0];
+  out[2] = in[0];
+  out[3] = in[0];
+  out[4] = in[0];
+  out[5] = in[0];
+  out[6] = in[0];
+  out[7] = in[0];
+  out[8] = in[0];
+  out[9] = in[0];
+  out[10] = in[0];
+  out[11] = in[0];
+  out[12] = in[0];
+  out[13] = in[0];
+  out[14] = in[0];
+  out[15] = in[0];
+}
+
+static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int32x4_t u[16], x, y;
+  // stage 0-1
+  u[0] = in[0];
+  u[2] = in[4];
+  u[4] = in[2];
+  u[6] = in[6];
+  u[8] = in[1];
+  u[10] = in[5];
+  u[12] = in[3];
+  u[14] = in[7];
+
+  // stage 2
+  u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+  u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+
+  u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+  u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+
+  u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+  u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+
+  u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+  u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+  // stage 3
+  u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+  u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+  u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
+  u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);
+
+  addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+  u[0] = vshlq_s32(x, v_bit);
+  u[1] = u[0];
+
+  u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+  u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+
+  addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+  x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                             &rnding);
+  u[14] =
+      half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+  u[9] = x;
+  y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
+                             &rnding);
+  u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
+                                 &rnding);
+  u[10] = y;
+
+  // stage 5
+  addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+  x = vmulq_n_s32(u[5], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  u[5] = vsubq_s32(y, x);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vaddq_s32(y, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+  x = vmulq_n_s32(u[10], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+  u[10] = vsubq_s32(y, x);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[13] = vaddq_s32(x, y);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  x = vmulq_n_s32(u[11], cospi[32]);
+  y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+  u[11] = vsubq_s32(y, x);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  u[12] = vaddq_s32(x, y);
+  u[12] = vshlq_s32(u[12], v_bit);
+  // stage 7
+  addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+  }
+}
+
+static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int32x4_t v[16], x, y, temp1, temp2;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0
+  // stage 1
+  // stage 2
+  v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+  v[0] = vshlq_s32(v[0], v_bit);
+
+  v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+  v[1] = vshlq_s32(v[1], v_bit);
+
+  // stage 3
+  v[8] = v[0];
+  v[9] = v[1];
+
+  // stage 4
+  temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
+  temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
+  temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[8] = temp1;
+  v[9] = temp2;
+
+  // stage 5
+  v[4] = v[0];
+  v[5] = v[1];
+  v[12] = v[8];
+  v[13] = v[9];
+
+  // stage 6
+  temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[4] = temp1;
+  v[5] = temp2;
+
+  temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+
+  temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
+  temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
+  temp2 = vshlq_s32(temp2, v_bit);
+  v[12] = temp1;
+  v[13] = temp2;
+
+  // stage 7
+  v[2] = v[0];
+  v[3] = v[1];
+  v[6] = v[4];
+  v[7] = v[5];
+  v[10] = v[8];
+  v[11] = v[9];
+  v[14] = v[12];
+  v[15] = v[13];
+
+  // stage 8
+  y = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  v[2] = vaddq_s32(y, x);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vsubq_s32(y, x);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  v[6] = vaddq_s32(y, x);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vsubq_s32(y, x);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[10], cospi[32]);
+  x = vmulq_n_s32(v[11], cospi[32]);
+  v[10] = vaddq_s32(y, x);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vsubq_s32(y, x);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, v[14], cospi[32]);
+  x = vmulq_n_s32(v[15], cospi[32]);
+  v[14] = vaddq_s32(y, x);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vsubq_s32(y, x);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = vnegq_s32(v[8]);
+    out[2] = v[12];
+    out[3] = vnegq_s32(v[4]);
+    out[4] = v[6];
+    out[5] = vnegq_s32(v[14]);
+    out[6] = v[10];
+    out[7] = vnegq_s32(v[2]);
+    out[8] = v[3];
+    out[9] = vnegq_s32(v[11]);
+    out[10] = v[15];
+    out[11] = vnegq_s32(v[7]);
+    out[12] = v[5];
+    out[13] = vnegq_s32(v[13]);
+    out[14] = v[9];
+    out[15] = vnegq_s32(v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t u[16], x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-2
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  y = vmlaq_n_s32(rnding, u[8], cospi[56]);
+  u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+  u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[24]);
+  u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+  u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[12], cospi[8]);
+  u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[40]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  y = vmlaq_n_s32(rnding, u[4], cospi[48]);
+  u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[16]);
+  u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
+  u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[12], cospi[48]);
+  u[12] = vmulq_n_s32(u[12], cospi[16]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[16]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 7
+  addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 8
+  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+  x = vmulq_n_s32(u[3], cospi[32]);
+  u[2] = vaddq_s32(y, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(y, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  x = vmulq_n_s32(u[7], cospi[32]);
+  u[6] = vaddq_s32(y, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(y, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+  x = vmulq_n_s32(u[11], cospi[32]);
+  u[10] = vaddq_s32(y, x);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vsubq_s32(y, x);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+  x = vmulq_n_s32(u[15], cospi[32]);
+  u[14] = vaddq_s32(y, x);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vsubq_s32(y, x);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vsubq_s32(zero, u[8]);
+    out[2] = u[12];
+    out[3] = vsubq_s32(zero, u[4]);
+    out[4] = u[6];
+    out[5] = vsubq_s32(zero, u[14]);
+    out[6] = u[10];
+    out[7] = vsubq_s32(zero, u[2]);
+    out[8] = u[3];
+    out[9] = vsubq_s32(zero, u[11]);
+    out[10] = u[15];
+    out[11] = vsubq_s32(zero, u[7]);
+    out[12] = u[5];
+    out[13] = vsubq_s32(zero, u[13]);
+    out[14] = u[9];
+    out[15] = vsubq_s32(zero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[16], v[16], x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  {
+    // stage 0-1
+    u[0] = in[0];
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
+                                  &rnding);
+    v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
+                                   &v_bit, &rnding);
+    v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
+                            &rnding);
+    v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
+                            &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
+    v[15] =
+        half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
+                                  &rnding);
+    u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
+    u[7] =
+        half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
+    addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+    y = vmulq_n_s32(u[1], cospi[32]);
+    v[0] = vaddq_s32(x, y);
+    v[0] = vshlq_s32(v[0], v_bit);
+
+    v[1] = vsubq_s32(x, y);
+    v[1] = vshlq_s32(v[1], v_bit);
+
+    v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
+                                  &rnding);
+    v[3] =
+        half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
+    addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    v[15] = u[15];
+
+    // stage 5
+    addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = vmulq_n_s32(v[5], cospi[32]);
+    y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+    u[5] = vsubq_s32(y, x);
+    u[5] = vshlq_s32(u[5], v_bit);
+
+    u[6] = vaddq_s32(y, x);
+    u[6] = vshlq_s32(u[6], v_bit);
+
+    u[7] = v[7];
+    addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = vmulq_n_s32(u[10], cospi[32]);
+    y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+    v[10] = vsubq_s32(y, x);
+    v[10] = vshlq_s32(v[10], v_bit);
+
+    v[13] = vaddq_s32(x, y);
+    v[13] = vshlq_s32(v[13], v_bit);
+
+    x = vmulq_n_s32(u[11], cospi[32]);
+    y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+    v[11] = vsubq_s32(y, x);
+    v[11] = vshlq_s32(v[11], v_bit);
+
+    v[12] = vaddq_s32(x, y);
+    v[12] = vshlq_s32(v[12], v_bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      const int32x4_t clamp_hi_out =
+          vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8(out, out_shift, &rnding_shift);
+      highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  int32x4_t u[16], v[16], x, y;
+  // Calculate the column 0, 1, 2, 3
+  // stage 0
+  // stage 1
+  // stage 2
+  v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
+  v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
+  v[0] = vshlq_s32(v[0], v_bit);
+
+  v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
+  v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
+  v[1] = vshlq_s32(v[1], v_bit);
+
+  v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
+  v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
+  v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
+  v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
+  v[4] = vshlq_s32(v[4], v_bit);
+
+  v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
+  v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
+  v[5] = vshlq_s32(v[5], v_bit);
+
+  v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
+  v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
+  v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+  v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
+  v[8] = vshlq_s32(v[8], v_bit);
+
+  v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+  v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
+  v[9] = vshlq_s32(v[9], v_bit);
+
+  v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+  v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+  v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+  v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+  v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+  v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+  v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 3
+  addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+  v[4] = u[4];
+  v[5] = u[5];
+  v[6] = u[6];
+  v[7] = u[7];
+
+  v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+  v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
+  v[8] = vshlq_s32(v[8], v_bit);
+
+  v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
+  v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
+  v[9] = vshlq_s32(v[9], v_bit);
+
+  v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+  v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
+  v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
+  v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
+  v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
+  v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
+  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 5
+  addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  v[0] = u[0];
+  v[1] = u[1];
+  v[2] = u[2];
+  v[3] = u[3];
+
+  v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+  v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
+  v[4] = vshlq_s32(v[4], v_bit);
+
+  v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
+  v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
+  v[5] = vshlq_s32(v[5], v_bit);
+
+  v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
+  v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
+  v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = u[8];
+  v[9] = u[9];
+  v[10] = u[10];
+  v[11] = u[11];
+
+  v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
+  v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
+  v[12] = vshlq_s32(v[12], v_bit);
+
+  v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
+  v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
+  v[13] = vshlq_s32(v[13], v_bit);
+
+  v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
+  v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
+  v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 7
+  addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 8
+  v[0] = u[0];
+  v[1] = u[1];
+
+  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+  x = vmulq_n_s32(u[3], cospi[32]);
+  v[2] = vaddq_s32(y, x);
+  v[2] = vshlq_s32(v[2], v_bit);
+
+  v[3] = vsubq_s32(y, x);
+  v[3] = vshlq_s32(v[3], v_bit);
+
+  v[4] = u[4];
+  v[5] = u[5];
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  x = vmulq_n_s32(u[7], cospi[32]);
+  v[6] = vaddq_s32(y, x);
+  v[6] = vshlq_s32(v[6], v_bit);
+
+  v[7] = vsubq_s32(y, x);
+  v[7] = vshlq_s32(v[7], v_bit);
+
+  v[8] = u[8];
+  v[9] = u[9];
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+  x = vmulq_n_s32(u[11], cospi[32]);
+  v[10] = vaddq_s32(y, x);
+  v[10] = vshlq_s32(v[10], v_bit);
+
+  v[11] = vsubq_s32(y, x);
+  v[11] = vshlq_s32(v[11], v_bit);
+
+  v[12] = u[12];
+  v[13] = u[13];
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+  x = vmulq_n_s32(u[15], cospi[32]);
+  v[14] = vaddq_s32(y, x);
+  v[14] = vshlq_s32(v[14], v_bit);
+
+  v[15] = vsubq_s32(y, x);
+  v[15] = vshlq_s32(v[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = v[0];
+    out[1] = vsubq_s32(zero, v[8]);
+    out[2] = v[12];
+    out[3] = vsubq_s32(zero, v[4]);
+    out[4] = v[6];
+    out[5] = vsubq_s32(zero, v[14]);
+    out[6] = v[10];
+    out[7] = vsubq_s32(zero, v[2]);
+    out[8] = v[3];
+    out[9] = vsubq_s32(zero, v[11]);
+    out[10] = v[15];
+    out[11] = vsubq_s32(zero, v[7]);
+    out[12] = v[5];
+    out[13] = vsubq_s32(zero, v[13]);
+    out[14] = v[9];
+    out[15] = vsubq_s32(zero, v[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  (void)bit;
+  int32x2_t fact = vdup_n_s32(2 * NewSqrt2);
+  int32x4x2_t a0;
+  int32x4_t zero = vdupq_n_s32(0);
+  const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+  for (int i = 0; i < 16; i++) {
+    a0.val[0] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
+    a0.val[0] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
+    a0.val[1] = vextq_s32(in[i], zero, 1);
+    a0.val[1] = vreinterpretq_s32_s64(
+        vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
+    a0.val[1] = vreinterpretq_s32_s64(
+        vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
+    a0 = vzipq_s32(a0.val[0], a0.val[1]);
+#if defined(__aarch64__)
+    out[i] = vreinterpretq_s32_s64(vzip1q_s64(
+        vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
+#else
+    out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
+#endif
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16);
+  }
+}
+static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int i;
+  int32x4_t temp1, temp2, temp3, temp4;
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit,
+                                 rnding);
+  u[13] =
+      half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding);
+  u[10] = temp1;
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit,
+                                 rnding);
+  u[12] =
+      half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding);
+  u[11] = temp2;
+
+  for (i = 16; i < 20; ++i) {
+    addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+    addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
+  }
+
+  temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit,
+                                 rnding);
+  u[56] =
+      half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding);
+  u[57] =
+      half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding);
+  u[58] =
+      half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding);
+  u[59] =
+      half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding);
+  u[36] = temp1;
+  u[37] = temp2;
+  u[38] = temp3;
+  u[39] = temp4;
+
+  temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit,
+                                 rnding);
+  u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit,
+                                 rnding);
+  u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit,
+                                 rnding);
+  u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit,
+                                 rnding);
+  u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit,
+                                 rnding);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+}
+
+static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int i;
+  int32x4_t temp1, temp2, temp3, temp4;
+  for (i = 0; i < 8; ++i) {
+    addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+  }
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit,
+                                 rnding);
+  u[24] =
+      half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding);
+  u[25] =
+      half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding);
+  u[26] =
+      half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding);
+  u[27] =
+      half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding);
+  u[20] = temp1;
+  u[21] = temp2;
+  u[22] = temp3;
+  u[23] = temp4;
+  for (i = 32; i < 40; i++) {
+    addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+  }
+
+  for (i = 48; i < 56; i++) {
+    addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+  }
+}
+
+static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+  int32x4_t temp1, temp2, temp3, temp4;
+  for (int i = 0; i < 16; i++) {
+    addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+  }
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit,
+                                 rnding);
+  u[52] =
+      half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding);
+  u[53] =
+      half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding);
+  u[54] =
+      half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding);
+  u[55] =
+      half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding);
+  u[40] = temp1;
+  u[41] = temp2;
+  u[42] = temp3;
+  u[43] = temp4;
+
+  temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit,
+                                 rnding);
+  temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit,
+                                 rnding);
+  temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit,
+                                 rnding);
+  temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit,
+                                 rnding);
+  u[48] =
+      half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding);
+  u[49] =
+      half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding);
+  u[50] =
+      half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding);
+  u[51] =
+      half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding);
+  u[44] = temp1;
+  u[45] = temp2;
+  u[46] = temp3;
+  u[47] = temp4;
+}
+
+static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
+                                       int do_cols, int bd, int out_shift,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi) {
+  for (int i = 0; i < 32; i++) {
+    addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
+  }
+
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t rnding = vdupq_n_s32(1 << (out_shift - 1));
+    for (int i = 0; i < 64; i += 4) {
+      round_shift_4x4(out + i, out_shift, &rnding);
+      highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4);
+    }
+  }
+}
+
+static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  {
+    int32x4_t x;
+
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    // stage 5
+    // stage 6
+    x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
+
+    // stage 8
+    // stage 9
+    // stage 10
+    // stage 11
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+        x = vaddq_s32(x, offset);
+        x = vshlq_s32(x, vdupq_n_s32(-out_shift));
+      }
+    }
+    x = vmaxq_s32(x, clamp_lo);
+    x = vminq_s32(x, clamp_hi);
+    out[0] = x;
+    out[1] = x;
+    out[2] = x;
+    out[3] = x;
+    out[4] = x;
+    out[5] = x;
+    out[6] = x;
+    out[7] = x;
+    out[8] = x;
+    out[9] = x;
+    out[10] = x;
+    out[11] = x;
+    out[12] = x;
+    out[13] = x;
+    out[14] = x;
+    out[15] = x;
+    out[16] = x;
+    out[17] = x;
+    out[18] = x;
+    out[19] = x;
+    out[20] = x;
+    out[21] = x;
+    out[22] = x;
+    out[23] = x;
+    out[24] = x;
+    out[25] = x;
+    out[26] = x;
+    out[27] = x;
+    out[28] = x;
+    out[29] = x;
+    out[30] = x;
+    out[31] = x;
+    out[32] = x;
+    out[33] = x;
+    out[34] = x;
+    out[35] = x;
+    out[36] = x;
+    out[37] = x;
+    out[38] = x;
+    out[39] = x;
+    out[40] = x;
+    out[41] = x;
+    out[42] = x;
+    out[43] = x;
+    out[44] = x;
+    out[45] = x;
+    out[46] = x;
+    out[47] = x;
+    out[48] = x;
+    out[49] = x;
+    out[50] = x;
+    out[51] = x;
+    out[52] = x;
+    out[53] = x;
+    out[54] = x;
+    out[55] = x;
+    out[56] = x;
+    out[57] = x;
+    out[58] = x;
+    out[59] = x;
+    out[60] = x;
+    out[61] = x;
+    out[62] = x;
+    out[63] = x;
+  }
+}
+
+static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  {
+    int32x4_t u[64];
+
+    // stage 1
+    u[0] = in[0];
+    u[8] = in[4];
+    u[16] = in[2];
+    u[24] = in[6];
+    u[32] = in[1];
+    u[40] = in[5];
+    u[48] = in[3];
+    u[56] = in[7];
+
+    // stage 2
+    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+
+    // stage 3
+    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+    u[33] = u[32];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[62] = u[63];
+
+    // stage 4
+    int32x4_t temp1, temp2;
+    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    u[17] = u[16];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[30] = u[31];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+                                   &v_bit, &rnding);
+    u[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+    u[33] = temp1;
+
+    temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                   &v_bit, &rnding);
+    u[57] = temp2;
+
+    temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                   &v_bit, &rnding);
+    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    u[41] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                   &v_bit, &rnding);
+    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    u[46] = temp2;
+
+    // stage 5
+    u[9] = u[8];
+    u[14] = u[15];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+    u[17] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+                                   &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+                                   &v_bit, &rnding);
+    u[22] = temp2;
+
+    u[35] = u[32];
+    u[34] = u[33];
+    u[36] = u[39];
+    u[37] = u[38];
+    u[43] = u[40];
+    u[42] = u[41];
+    u[44] = u[47];
+    u[45] = u[46];
+    u[51] = u[48];
+    u[50] = u[49];
+    u[52] = u[55];
+    u[53] = u[54];
+    u[59] = u[56];
+    u[58] = u[57];
+    u[60] = u[63];
+    u[61] = u[62];
+
+    // stage 6
+    temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[0] = temp1;
+
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
+                                   &v_bit, &rnding);
+    u[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    u[9] = temp2;
+    u[19] = u[16];
+    u[18] = u[17];
+    u[20] = u[23];
+    u[21] = u[22];
+    u[27] = u[24];
+    u[26] = u[25];
+    u[28] = u[31];
+    u[29] = u[30];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+                                   &v_bit, &rnding);
+    u[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+    u[34] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+                                   &v_bit, &rnding);
+    u[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    u[35] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+                                   &v_bit, &rnding);
+    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    u[36] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+                                   &v_bit, &rnding);
+    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    u[37] = temp2;
+    temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                   &v_bit, &rnding);
+    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    u[42] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                   &v_bit, &rnding);
+    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    u[43] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                   &v_bit, &rnding);
+    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    u[44] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                   &v_bit, &rnding);
+    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    u[45] = temp2;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    u[11] = u[8];
+    u[10] = u[9];
+    u[12] = u[15];
+    u[13] = u[14];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+                                   &v_bit, &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+                            &rnding);
+    u[18] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+                            &rnding);
+    u[19] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+                                   &v_bit, &rnding);
+    u[20] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+                                   &v_bit, &rnding);
+    u[21] = temp2;
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+    u[9] = u[9];
+
+    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 9
+    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 10
+    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 11
+    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  {
+    int32x4_t u[64];
+    int32x4_t tmp1, tmp2, tmp3, tmp4;
+    // stage 1
+    u[0] = in[0];
+    u[32] = in[1];
+    u[36] = in[9];
+    u[40] = in[5];
+    u[44] = in[13];
+    u[48] = in[3];
+    u[52] = in[11];
+    u[56] = in[7];
+    u[60] = in[15];
+    u[16] = in[2];
+    u[20] = in[10];
+    u[24] = in[6];
+    u[28] = in[14];
+    u[4] = in[8];
+    u[8] = in[4];
+    u[12] = in[12];
+
+    // stage 2
+    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+    u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+    u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+    u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+    u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+    u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+    u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+
+    // stage 3
+    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+    u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
+    u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
+    u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
+    u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+    u[33] = u[32];
+    u[34] = u[35];
+    u[37] = u[36];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[42] = u[43];
+    u[45] = u[44];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[50] = u[51];
+    u[53] = u[52];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[58] = u[59];
+    u[61] = u[60];
+    u[62] = u[63];
+
+    // stage 4
+    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+    u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+    u[17] = u[16];
+    u[18] = u[19];
+    u[21] = u[20];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[26] = u[27];
+    u[29] = u[28];
+    u[30] = u[31];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                  &v_bit, &rnding);
+    u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+                            &rnding);
+    u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+                                   &v_bit, &rnding);
+    u[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+    u[33] = tmp1;
+    u[34] = tmp2;
+    u[37] = tmp3;
+    u[38] = tmp4;
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit,
+                           &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                  &v_bit, &rnding);
+    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+                            &rnding);
+    u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+                                   &v_bit, &rnding);
+    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    u[41] = tmp1;
+    u[42] = tmp2;
+    u[45] = tmp3;
+    u[46] = tmp4;
+
+    // stage 5
+    u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+    u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+
+    u[9] = u[8];
+    u[10] = u[11];
+    u[13] = u[12];
+    u[14] = u[15];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+                                  &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+    u[17] = tmp1;
+    u[18] = tmp2;
+    u[21] = tmp3;
+    u[22] = tmp4;
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 6
+    tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[0] = tmp1;
+    u[5] = u[4];
+    u[6] = u[7];
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    u[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    u[9] = tmp1;
+    tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                  &v_bit, &rnding);
+    u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    u[10] = tmp2;
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
+                                  &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
+                                  &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
+                                  &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
+                                  &rnding);
+    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    u[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    u[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+    u[34] = tmp1;
+    u[35] = tmp2;
+    u[36] = tmp3;
+    u[37] = tmp4;
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                  &v_bit, &rnding);
+    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    u[42] = tmp1;
+    u[43] = tmp2;
+    u[44] = tmp3;
+    u[45] = tmp4;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
+    u[5] = tmp1;
+    addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+                                  &v_bit, &rnding);
+    tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+                                  &v_bit, &rnding);
+    tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+                                  &v_bit, &rnding);
+    tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+                                  &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+                            &rnding);
+    u[18] = tmp1;
+    u[19] = tmp2;
+    u[20] = tmp3;
+    u[21] = tmp4;
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 9
+    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 10
+    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 11
+    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+
+  {
+    int32x4_t u[64], v[64];
+
+    // stage 1
+    u[32] = in[1];
+    u[34] = in[17];
+    u[36] = in[9];
+    u[38] = in[25];
+    u[40] = in[5];
+    u[42] = in[21];
+    u[44] = in[13];
+    u[46] = in[29];
+    u[48] = in[3];
+    u[50] = in[19];
+    u[52] = in[11];
+    u[54] = in[27];
+    u[56] = in[7];
+    u[58] = in[23];
+    u[60] = in[15];
+    u[62] = in[31];
+
+    v[16] = in[2];
+    v[18] = in[18];
+    v[20] = in[10];
+    v[22] = in[26];
+    v[24] = in[6];
+    v[26] = in[22];
+    v[28] = in[14];
+    v[30] = in[30];
+
+    u[8] = in[4];
+    u[10] = in[20];
+    u[12] = in[12];
+    u[14] = in[28];
+
+    v[4] = in[8];
+    v[6] = in[24];
+
+    u[0] = in[0];
+    u[2] = in[16];
+
+    // stage 2
+    v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
+    v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
+    v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+    v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+    v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
+    v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
+    v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
+    v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
+    v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+    v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+    v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
+    v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
+    v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+    v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
+    v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
+    v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+    v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+    v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
+    v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
+    v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
+    v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
+    v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+    v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+    v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
+    v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
+    v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+
+    // stage 3
+    u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
+    u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
+    u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
+    u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
+    u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
+    u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
+    u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
+    u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
+    u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
+    u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
+    u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
+    u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
+    u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
+    u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);
+
+    for (i = 32; i < 64; i += 4) {
+      addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 4
+    v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+    v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+    v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+    v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+    v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+    v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+    v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+
+    for (i = 16; i < 32; i += 4) {
+      addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+                                   &v_bit, &rnding);
+    v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+                                   &v_bit, &rnding);
+    v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
+                                   &v_bit, &rnding);
+    v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                   &v_bit, &rnding);
+    v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+                                   &v_bit, &rnding);
+    v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+                            &rnding);
+    v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+                                   &v_bit, &rnding);
+    v[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+
+    // stage 5
+    u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
+    u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
+    u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
+    u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);
+
+    for (i = 8; i < 16; i += 4) {
+      addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 16; i < 32; i += 4) {
+      u[i + 0] = v[i + 0];
+      u[i + 3] = v[i + 3];
+    }
+
+    u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
+                                   &v_bit, &rnding);
+    u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
+                                   &v_bit, &rnding);
+    u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
+                                   &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    // stage 6
+    v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+    v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+
+    addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+    for (i = 8; i < 16; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+                                  &rnding);
+    v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+                                   &v_bit, &rnding);
+    v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+                                   &v_bit, &rnding);
+    v[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+                  &clamp_hi);
+
+      addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+                  &clamp_hi);
+      addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 8) {
+      v[i + 0] = u[i + 0];
+      v[i + 1] = u[i + 1];
+      v[i + 6] = u[i + 6];
+      v[i + 7] = u[i + 7];
+    }
+
+    v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+                                   &v_bit, &rnding);
+    v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+                                   &v_bit, &rnding);
+    v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                   &v_bit, &rnding);
+    v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                   &v_bit, &rnding);
+    v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    v[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    v[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+
+    // stage 7
+    addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    u[4] = v[4];
+    u[7] = v[7];
+    u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
+                                  &rnding);
+    u[6] =
+        half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);
+
+    addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    for (i = 16; i < 32; i += 8) {
+      u[i + 0] = v[i + 0];
+      u[i + 1] = v[i + 1];
+      u[i + 6] = v[i + 6];
+      u[i + 7] = v[i + 7];
+    }
+
+    u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
+                                   &v_bit, &rnding);
+    u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
+                                   &v_bit, &rnding);
+    u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
+                            &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
+                            &rnding);
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[14] = u[14];
+    v[15] = u[15];
+
+    v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
+                                   &v_bit, &rnding);
+    v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
+                                   &v_bit, &rnding);
+    v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
+                            &rnding);
+    v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
+                            &rnding);
+
+    for (i = 16; i < 20; ++i) {
+      addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+      addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    for (i = 32; i < 36; ++i) {
+      v[i] = u[i];
+      v[i + 12] = u[i + 12];
+      v[i + 16] = u[i + 16];
+      v[i + 28] = u[i + 28];
+    }
+
+    v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
+                                   &v_bit, &rnding);
+    v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
+                                   &v_bit, &rnding);
+    v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
+                                   &v_bit, &rnding);
+    v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
+                                   &v_bit, &rnding);
+    v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
+                                   &v_bit, &rnding);
+    v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
+                                   &v_bit, &rnding);
+    v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
+                                   &v_bit, &rnding);
+    v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
+                                   &v_bit, &rnding);
+    v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
+                                   &v_bit, &rnding);
+    v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
+                            &rnding);
+    v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
+                            &rnding);
+    v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
+                            &rnding);
+    v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
+                            &rnding);
+
+    // stage 9
+    for (i = 0; i < 8; ++i) {
+      addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 16; i < 20; ++i) {
+      u[i] = v[i];
+      u[i + 12] = v[i + 12];
+    }
+
+    u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
+                                   &v_bit, &rnding);
+    u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
+                                   &v_bit, &rnding);
+    u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
+                                   &v_bit, &rnding);
+    u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
+                                   &v_bit, &rnding);
+    u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
+                            &rnding);
+    u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
+                            &rnding);
+    u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
+                            &rnding);
+    u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
+                            &rnding);
+
+    for (i = 32; i < 40; i++) {
+      addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 48; i < 56; i++) {
+      addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+    }
+
+    // stage 10
+    for (i = 0; i < 16; i++) {
+      addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 32; i < 40; i++) v[i] = u[i];
+
+    v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
+                                   &v_bit, &rnding);
+    v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
+                                   &v_bit, &rnding);
+    v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
+                                   &v_bit, &rnding);
+    v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
+                                   &v_bit, &rnding);
+    v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
+                                   &v_bit, &rnding);
+    v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
+                                   &v_bit, &rnding);
+    v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
+                                   &v_bit, &rnding);
+    v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
+                                   &v_bit, &rnding);
+    v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
+                            &rnding);
+    v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
+                            &rnding);
+    v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
+                            &rnding);
+    v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
+                            &rnding);
+    v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
+                            &rnding);
+    v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
+                            &rnding);
+    v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
+                            &rnding);
+    v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
+                            &rnding);
+
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    for (i = 0; i < 32; i++) {
+      addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+                  &clamp_hi);
+    }
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+      const int32x4_t clamp_hi_out =
+          vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+      const int32x4_t rnding32 = vdupq_n_s32(1 << (out_shift - 1));
+      for (i = 0; i < 64; i += 4) {
+        round_shift_4x4(out + i, out_shift, &rnding32);
+        highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+                              4);
+      }
+    }
+  }
+}
+
+static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0-1
+  bf1 = in[0];
+
+  // stage 2-5
+  bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
+
+  // stage 6-9
+  if (do_cols) {
+    bf1 = vmaxq_s32(bf1, clamp_lo);
+    bf1 = vminq_s32(bf1, clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    if (out_shift != 0) {
+      bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
+    }
+  }
+
+  bf1 = vmaxq_s32(bf1, clamp_lo);
+  bf1 = vminq_s32(bf1, clamp_hi);
+
+  for (int i = 0; i < 32; i++) out[i] = bf1;
+}
+
+static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1
+  bf1[0] = in[0];
+  bf1[4] = in[4];
+  bf1[8] = in[2];
+  bf1[12] = in[6];
+  bf1[16] = in[1];
+  bf1[20] = in[5];
+  bf1[24] = in[3];
+  bf1[28] = in[7];
+
+  // stage 2
+  bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+  bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+  bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+  bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+  bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+  bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+  bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+  bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+  // stage 3
+  bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+  bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+
+  bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+  bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+  bf1[17] = bf1[16];
+  bf1[18] = bf1[19];
+  bf1[21] = bf1[20];
+  bf1[22] = bf1[23];
+  bf1[25] = bf1[24];
+  bf1[26] = bf1[27];
+  bf1[29] = bf1[28];
+  bf1[30] = bf1[31];
+
+  // stage 4 :
+  bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+  bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+
+  bf1[9] = bf1[8];
+  bf1[10] = bf1[11];
+  bf1[13] = bf1[12];
+  bf1[14] = bf1[15];
+
+  idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+  // stage 5
+  bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+  bf1[1] = bf1[0];
+  bf1[5] = bf1[4];
+  bf1[6] = bf1[7];
+
+  idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 6
+  bf1[3] = bf1[0];
+  bf1[2] = bf1[1];
+
+  idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 7
+  idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 8
+  idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 9
+  idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1
+
+  bf1[0] = in[0];
+  bf1[2] = in[8];
+  bf1[4] = in[4];
+  bf1[6] = in[12];
+  bf1[8] = in[2];
+  bf1[10] = in[10];
+  bf1[12] = in[6];
+  bf1[14] = in[14];
+  bf1[16] = in[1];
+  bf1[18] = in[9];
+  bf1[20] = in[5];
+  bf1[22] = in[13];
+  bf1[24] = in[3];
+  bf1[26] = in[11];
+  bf1[28] = in[7];
+  bf1[30] = in[15];
+
+  // stage 2
+  bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+  bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+  bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
+  bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
+  bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
+  bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
+  bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+  bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+  bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+  bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+  bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
+  bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
+  bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
+  bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
+  bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+  bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+  // stage 3
+  bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+  bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+  bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
+  bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
+  bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
+  bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
+  bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+  bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+
+  addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+  // stage 4
+  bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+  bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+  bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
+  bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);
+
+  addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+  idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+  // stage 5
+  bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+  bf1[1] = bf1[0];
+  bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
+  bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);
+
+  addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+  idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 6
+  addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+  idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 7
+  idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+  // stage 8
+  idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+  // stage 9
+  idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t bf1[32], bf0[32];
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  // stage 0
+  // stage 1
+  bf1[0] = in[0];
+  bf1[1] = in[16];
+  bf1[2] = in[8];
+  bf1[3] = in[24];
+  bf1[4] = in[4];
+  bf1[5] = in[20];
+  bf1[6] = in[12];
+  bf1[7] = in[28];
+  bf1[8] = in[2];
+  bf1[9] = in[18];
+  bf1[10] = in[10];
+  bf1[11] = in[26];
+  bf1[12] = in[6];
+  bf1[13] = in[22];
+  bf1[14] = in[14];
+  bf1[15] = in[30];
+  bf1[16] = in[1];
+  bf1[17] = in[17];
+  bf1[18] = in[9];
+  bf1[19] = in[25];
+  bf1[20] = in[5];
+  bf1[21] = in[21];
+  bf1[22] = in[13];
+  bf1[23] = in[29];
+  bf1[24] = in[3];
+  bf1[25] = in[19];
+  bf1[26] = in[11];
+  bf1[27] = in[27];
+  bf1[28] = in[7];
+  bf1[29] = in[23];
+  bf1[30] = in[15];
+  bf1[31] = in[31];
+
+  // stage 2
+  for (int i = 0; i < 16; i++) bf0[i] = bf1[i];
+
+  bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
+                                   &v_bit, &rnding);
+  bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
+                                   &v_bit, &rnding);
+  bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
+                                   &v_bit, &rnding);
+  bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
+                                   &v_bit, &rnding);
+  bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
+                            &rnding);
+  bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
+                            &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
+                            &rnding);
+  bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
+                            &rnding);
+  bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
+                            &rnding);
+  bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
+                            &rnding);
+  bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
+                            &rnding);
+
+  // stage 3
+  for (int i = 0; i < 8; i++) bf1[i] = bf0[i];
+
+  bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
+                                  &v_bit, &rnding);
+  bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
+                                  &v_bit, &rnding);
+  bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
+                                   &v_bit, &rnding);
+  bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
+                            &rnding);
+  bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
+                            &rnding);
+  bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
+                            &rnding);
+  bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
+                            &rnding);
+
+  addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+  // stage 4
+  bf0[0] = bf1[0];
+  bf0[1] = bf1[1];
+  bf0[2] = bf1[2];
+  bf0[3] = bf1[3];
+  bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
+                                  &v_bit, &rnding);
+  bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
+                                  &v_bit, &rnding);
+  bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
+                           &rnding);
+  bf0[7] =
+      half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);
+
+  addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+  bf0[16] = bf1[16];
+  bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
+                                   &v_bit, &rnding);
+  bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = bf1[19];
+  bf0[20] = bf1[20];
+  bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = bf1[27];
+  bf0[28] = bf1[28];
+  bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
+                            &rnding);
+  bf0[31] = bf1[31];
+
+  // stage 5
+  bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
+                           &rnding);
+  bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
+                                  &v_bit, &rnding);
+  bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
+                                  &v_bit, &rnding);
+  bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
+                           &rnding);
+  addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
+                                  &v_bit, &rnding);
+  bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
+                            &rnding);
+  bf1[15] = bf0[15];
+  addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+  // stage 6
+  addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+  bf0[4] = bf1[4];
+  bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
+                                  &v_bit, &rnding);
+  bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
+                           &rnding);
+  bf0[7] = bf1[7];
+  addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
+                                   &v_bit, &rnding);
+  bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
+                                   &v_bit, &rnding);
+  bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = bf1[22];
+  bf0[23] = bf1[23];
+  bf0[24] = bf1[24];
+  bf0[25] = bf1[25];
+  bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
+                            &rnding);
+  bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
+                            &rnding);
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 7
+  addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
+                                   &v_bit, &rnding);
+  bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
+                                   &v_bit, &rnding);
+  bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
+                            &rnding);
+  bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
+                            &rnding);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+  // stage 8
+  addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+  addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+  bf0[16] = bf1[16];
+  bf0[17] = bf1[17];
+  bf0[18] = bf1[18];
+  bf0[19] = bf1[19];
+  bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
+                                   &v_bit, &rnding);
+  bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
+                                   &v_bit, &rnding);
+  bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
+                                   &v_bit, &rnding);
+  bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
+                                   &v_bit, &rnding);
+  bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
+                            &rnding);
+  bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
+                            &rnding);
+  bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
+                            &rnding);
+  bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
+                            &rnding);
+  bf0[28] = bf1[28];
+  bf0[29] = bf1[29];
+  bf0[30] = bf1[30];
+  bf0[31] = bf1[31];
+
+  // stage 9
+  addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+  addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    round_shift_8x8(out + 16, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  (void)bit;
+  for (int i = 0; i < 32; i += 16) {
+    out[i] = vshlq_n_s32(in[i], 2);
+    out[i + 1] = vshlq_n_s32(in[i + 1], 2);
+    out[i + 2] = vshlq_n_s32(in[i + 2], 2);
+    out[i + 3] = vshlq_n_s32(in[i + 3], 2);
+    out[i + 4] = vshlq_n_s32(in[i + 4], 2);
+    out[i + 5] = vshlq_n_s32(in[i + 5], 2);
+    out[i + 6] = vshlq_n_s32(in[i + 6], 2);
+    out[i + 7] = vshlq_n_s32(in[i + 7], 2);
+    out[i + 8] = vshlq_n_s32(in[i + 8], 2);
+    out[i + 9] = vshlq_n_s32(in[i + 9], 2);
+    out[i + 10] = vshlq_n_s32(in[i + 10], 2);
+    out[i + 11] = vshlq_n_s32(in[i + 11], 2);
+    out[i + 12] = vshlq_n_s32(in[i + 12], 2);
+    out[i + 13] = vshlq_n_s32(in[i + 13], 2);
+    out[i + 14] = vshlq_n_s32(in[i + 14], 2);
+    out[i + 15] = vshlq_n_s32(in[i + 15], 2);
+  }
+
+  if (!do_cols) {
+    const int32x4_t rnding_shift = vdupq_n_s32(1 << (out_shift - 1));
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    round_shift_8x8(out, out_shift, &rnding_shift);
+    round_shift_8x8(out + 16, out_shift, &rnding_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+static const transform_1d_neon
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { idct4x4_neon, NULL, NULL, NULL },
+          { iadst4x4_neon, NULL, NULL, NULL },
+          { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },
+      },
+      { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },
+        { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
+        { iidentity8_neon, iidentity8_neon, NULL, NULL } },
+      {
+          { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
+          { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
+          { iidentity16_neon, NULL, iidentity16_neon, NULL },
+      },
+      { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,
+          idct32x32_neon },
+        { NULL, NULL, NULL, NULL },
+        { iidentity32_neon, NULL, NULL, NULL } },
+      { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,
+          idct64x64_neon },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_4X8;
+  int32x4_t buf1[32] = { vdupq_n_s32(0) };
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[8];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  av1_round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row, 0, NewInvSqrt2);
+  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
+
+  if (lr_flip) {
+    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
+                  buf1[3]);
+
+    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
+                  buf1[7]);
+  } else {
+    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
+                  buf1[3]);
+
+    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
+                  buf1[7]);
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_8X4;
+  int32x4_t buf1[8];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[8];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
+                buf1[3]);
+  TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
+                buf1[7]);
+
+  av1_round_shift_rect_array_32_neon(buf1, buf0, txfm_size_col, 0, NewInvSqrt2);
+  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < 2; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             INV_COS_BIT, 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+  // write to buffer
+  highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_4X16;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_h_div8 = txfm_size_row >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  for (int i = 0; i < (txfm_size_row >> 2); i++) {
+    row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
+  }
+
+  if (lr_flip) {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+                    buf1[4 * j + 3]);
+    }
+  } else {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+                    buf1[4 * j + 2], buf1[4 * j + 3]);
+    }
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  TX_SIZE tx_size = TX_16X4;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  for (int j = 0; j < buf_size_w_div8; j++) {
+    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+  }
+  row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             INV_COS_BIT, 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+                                 output + 8 * i, stride, ud_flip, txfm_size_row,
+                                 bd);
+  }
+}
+
+void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type, int eob,
+                                     const int bd) {
+  (void)eob;
+  TX_SIZE tx_size = TX_4X16;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_h_div8 = txfm_size_row >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  int32x4_t *buf0_cur = buf0;
+  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+  for (int i = 0; i < (txfm_size_row >> 2); i++) {
+    row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
+  }
+
+  if (lr_flip) {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+                    buf1[4 * j + 3]);
+    }
+  } else {
+    for (int j = 0; j < buf_size_h_div8; ++j) {
+      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+                    buf1[4 * j + 2], buf1[4 * j + 3]);
+    }
+  }
+
+  // 2nd stage: column transform
+  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+  av1_round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+  // write to buffer
+  highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+                               bd);
+}
+
+void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type, int eob,
+                                     const int bd) {
+  (void)eob;
+  TX_SIZE tx_size = TX_16X4;
+  int32x4_t buf1[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: column transform
+  int32x4_t buf0[16];
+  const int32_t *input_row = input;
+  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+  for (int j = 0; j < buf_size_w_div8; j++) {
+    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+  }
+  row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+  int32x4_t *buf1_ptr;
+  if (lr_flip) {
+    flip_buf_neon(buf0, buf1, txfm_size_col);
+    buf1_ptr = buf1;
+  } else {
+    buf1_ptr = buf0;
+  }
+
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+             INV_COS_BIT, 1, bd, 0);
+  }
+  av1_round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+                                 output + 8 * i, stride, ud_flip, txfm_size_row,
+                                 bd);
+  }
+}
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                                     TX_SIZE tx_size, int eob) {
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size) {
+  if (tx_size == 2) {
+    *eoby = 15, *eobx = 15;
+  } else if (tx_size == 3) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 4) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 7) {
+    *eoby = 15, *eobx = 7;
+  } else if (tx_size == 8) {
+    *eoby = 7, *eobx = 15;
+  } else if (tx_size == 9) {
+    *eoby = 31, *eobx = 15;
+  } else if (tx_size == 10) {
+    *eoby = 15, *eobx = 31;
+  } else if (tx_size == 11) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 12) {
+    *eoby = 31, *eobx = 31;
+  } else if (tx_size == 15) {
+    *eoby = 31, *eobx = 7;
+  } else if (tx_size == 16) {
+    *eoby = 7, *eobx = 31;
+  } else if (tx_size == 17) {
+    *eoby = 31, *eobx = 15;
+  } else if (tx_size == 18) {
+    *eoby = 15, *eobx = 31;
+  } else {
+    *eoby = 0, *eobx = 0;
+  }
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int txfm_size_row = tx_size_high[tx_size];
+  *eoby = AOMMIN(32, txfm_size_row) - 1;
+  *eobx = 0;
+}
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int txfm_size_col = tx_size_wide[tx_size];
+  *eobx = AOMMIN(32, txfm_size_col) - 1;
+  *eoby = 0;
+}
+
+static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
+                                           uint16_t *output, int stride,
+                                           TX_TYPE tx_type, TX_SIZE tx_size,
+                                           const int bd) {
+  int32x4_t buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div4 = input_stride >> 2;
+  const int buf_size_h_div8 = (eoby + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+    int32x4_t buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(buf0, buf0, input_stride, 0,
+                                         NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+
+    for (int j = 0; j < buf_size_w_div4; ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < buf_size_w_div4; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  for (int i = 0; i < (txfm_size_col >> 3); i++) {
+    highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                 stride, ud_flip, txfm_size_row, bd);
+  }
+}
+
+static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
+                                           uint16_t *output, int stride,
+                                           TX_TYPE tx_type, TX_SIZE tx_size,
+                                           const int bd) {
+  int32x4_t buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div8 = input_stride >> 2;
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    int32x4_t buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size, const int bd) {
+  int32x4_t buf1[64 * 4];
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    int32x4_t buf0[32];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      int32x4_t *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(buf0, buf0, input_stride, 0,
+                                         NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+    int32x4_t *_buf1 = buf1 + i * 4;
+    for (int j = 0; j < (input_stride >> 2); ++j) {
+      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+    }
+  }
+  for (int i = 0; i < (input_stride >> 2); i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, 0, txfm_size_row, bd);
+    }
+  }
+}
+void inv_txfm2d_add_no_identity_neon(const int32_t *input, uint16_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size, const int bd) {
+  int32x4_t buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // 1st stage: column transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    int32x4_t buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      int32x4_t *buf0_cur = &buf0[j * 4];
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+    int32x4_t *_buf1 = &buf1[i * 4];
+
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+
+void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
+                                            uint16_t *output, int stride,
+                                            TX_TYPE tx_type, TX_SIZE tx_size,
+                                            int eob, const int bd) {
+  int32x4_t buf1[64 * 16];
+  int eobx, eoby;
+  highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 2;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_neon row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_neon col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // 1st stage: column transform
+  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+    int32x4_t buf0[64];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+      int32x4_t *buf0_cur = &buf0[j * 4];
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_neon(
+          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+    int32x4_t *_buf1 = &buf1[i * 4];
+
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  // 2nd stage: column transform
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
+
+    av1_round_shift_array_32_neon(buf1 + i * txfm_size_row,
+                                  buf1 + i * txfm_size_row, txfm_size_row,
+                                  -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+                                   stride, ud_flip, txfm_size_row, bd);
+    }
+  }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
+                                             stride, tx_type, tx_size, eob, bd);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                               tx_type, tx_size, bd);
+      break;
+    default: assert(0); break;
+  }
+}
+
+void av1_inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
+                                      int stride, TX_TYPE tx_type,
+                                      TX_SIZE tx_size, const int bd) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
+                                      stride, tx_type, tx_size, bd);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                                     tx_type, tx_size, bd);
+      break;
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
+                               tx_type, tx_size, bd);
+      break;
+    default: assert(0); break;
+  }
+}
+
+void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+  switch (tx_type) {
+    case IDTX:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
+                                              txfm_param->tx_size,
+                                              txfm_param->eob, bd);
+      break;
+    default:
+      av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, bd);
+      break;
+  }
+}
+void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+  av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                              bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_8X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  int eob = txfm_param->eob;
+  highbd_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  int eob = txfm_param->eob;
+  highbd_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_8X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X8,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X8, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_64X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_64X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X64,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X64, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_16X16,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_16X16, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_32X8,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_32X8, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  av1_highbd_inv_txfm2d_add_universe_neon(input, dest, stride,
+                                          txfm_param->tx_type, TX_8X32,
+                                          txfm_param->eob, txfm_param->bd);
+}
+
+void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  av1_inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
+                                   TX_8X32, bd);
+}
+
+void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+
+  TX_TYPE tx_type = txfm_param->tx_type;
+  int bd = txfm_param->bd;
+  switch (tx_size) {
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X4:
+      av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X16:
+      av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X16:
+      av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X8:
+      av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X32:
+      av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X16:
+      av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X16:
+      av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X32:
+      av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X64:
+      av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X64:
+      av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X32:
+      av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X64:
+      av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X16:
+      av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X8:
+      av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_8X32:
+      av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+  }
+}
diff --git a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
index 92112fb856..e0b76a87bc 100644
--- a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
+++ b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
@@ -16,11 +16,11 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/common.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 #if !defined(__aarch64__)
 static INLINE void compute_avg_4x1(
@@ -751,11 +751,9 @@ void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
                                  conv_params, y_filter, h, w);
 }
 
-void av1_dist_wtd_convolve_2d_copy_neon(
-    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                        uint8_t *dst8, int dst8_stride, int w,
+                                        int h, ConvolveParams *conv_params) {
   uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
       tmp_shift3;
   uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -776,11 +774,6 @@ void av1_dist_wtd_convolve_2d_copy_neon(
   const int16x4_t dup_bits16x4 = vdup_n_s16(bits);
   const int16x8_t dup_bits16x8 = vdupq_n_s16(bits);
 
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-
   if (!(w & 0x07)) {
     for (y = 0; y < (h >> 2); ++y) {
       src1 = src;
@@ -879,8 +872,7 @@ void av1_dist_wtd_convolve_2d_copy_neon(
 void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -899,9 +891,6 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
   const uint16_t bck_offset = conv_params->bck_offset;
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_x, subpel_x_qn & SUBPEL_MASK);
@@ -1341,9 +1330,8 @@ void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
 
 void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   assert(!(w % 4));
   assert(!(h % 4));
@@ -1363,9 +1351,6 @@ void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
   const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   const int shift_value = (conv_params->round_1 - 1 - bits);
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
diff --git a/media/libaom/src/av1/common/arm/reconinter_neon.c b/media/libaom/src/av1/common/arm/reconinter_neon.c
index 44e064195e..3694763d0f 100644
--- a/media/libaom/src/av1/common/arm/reconinter_neon.c
+++ b/media/libaom/src/av1/common/arm/reconinter_neon.c
@@ -15,8 +15,8 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "av1/common/blockd.h"
 #include "config/av1_rtcd.h"
 
diff --git a/media/libaom/src/av1/common/arm/reconintra_neon.c b/media/libaom/src/av1/common/arm/reconintra_neon.c
new file mode 100644
index 0000000000..43c470f9e1
--- /dev/null
+++ b/media/libaom/src/av1/common/arm/reconintra_neon.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+DECLARE_ALIGNED(16, const int8_t,
+                av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = {
+  {
+      { -6, 0, 0, 0, -5, 10, 0, 0 },
+      { 10, 0, 12, 0, 2, 0, 9, 0 },
+      { -3, 1, 0, 0, -3, 1, 10, 0 },
+      { 1, 10, 7, 0, 1, 2, 5, 0 },
+      { -4, 0, 0, 12, -3, 6, 0, 9 },
+      { 6, 0, 2, 0, 2, 0, 2, 0 },
+      { -3, 2, 0, 7, -3, 2, 6, 5 },
+      { 2, 6, 2, 0, 1, 2, 3, 0 },
+  },
+  {
+      { -10, 0, 0, 0, -6, 16, 0, 0 },
+      { 16, 0, 10, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 0, -2, 0, 16, 0 },
+      { 0, 16, 4, 0, 0, 0, 2, 0 },
+      { -10, 0, 0, 10, -6, 16, 0, 6 },
+      { 16, 0, 0, 0, 0, 0, 0, 0 },
+      { -4, 0, 0, 4, -2, 0, 16, 2 },
+      { 0, 16, 0, 0, 0, 0, 0, 0 },
+  },
+  {
+      { -8, 0, 0, 0, -8, 8, 0, 0 },
+      { 8, 0, 16, 0, 0, 0, 16, 0 },
+      { -8, 0, 0, 0, -8, 0, 8, 0 },
+      { 0, 8, 16, 0, 0, 0, 16, 0 },
+      { -4, 0, 0, 16, -4, 4, 0, 16 },
+      { 4, 0, 0, 0, 0, 0, 0, 0 },
+      { -4, 0, 0, 16, -4, 0, 4, 16 },
+      { 0, 4, 0, 0, 0, 0, 0, 0 },
+  },
+  {
+      { -2, 0, 0, 0, -1, 8, 0, 0 },
+      { 8, 0, 10, 0, 3, 0, 6, 0 },
+      { -1, 3, 0, 0, 0, 2, 8, 0 },
+      { 2, 8, 4, 0, 1, 3, 2, 0 },
+      { -1, 0, 0, 10, -1, 4, 0, 6 },
+      { 4, 0, 3, 0, 3, 0, 4, 0 },
+      { -1, 3, 0, 4, -1, 2, 4, 3 },
+      { 2, 4, 4, 0, 2, 3, 3, 0 },
+  },
+  {
+      { -12, 0, 0, 0, -10, 14, 0, 0 },
+      { 14, 0, 14, 0, 0, 0, 12, 0 },
+      { -9, 0, 0, 0, -8, 0, 14, 0 },
+      { 0, 14, 11, 0, 0, 0, 10, 0 },
+      { -10, 0, 0, 14, -9, 12, 0, 12 },
+      { 12, 0, 0, 0, 1, 0, 0, 0 },
+      { -8, 0, 0, 11, -7, 0, 12, 9 },
+      { 0, 12, 1, 0, 0, 1, 1, 0 },
+  },
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+#define SHIFT_INTRA_SCALE_BITS 15 - FILTER_INTRA_SCALE_BITS
+
+#define MASK_LOW \
+  0x604020006040200  // (0 | (2 << 8) | (4 << 16) | (6 << 24)) x 2
+#define MASK_HIGH \
+  0x705030107050301  // (1 | (3 << 8) | (5 << 16) | (7 << 24)) x 2
+
+void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
+                                     TX_SIZE tx_size, const uint8_t *above,
+                                     const uint8_t *left, int mode) {
+  int r, c;
+  uint8_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  const int8x16_t f1f0 = vld1q_s8(av1_filter_intra_taps_neon[mode][0]);
+  const int8x16_t f3f2 = vld1q_s8(av1_filter_intra_taps_neon[mode][2]);
+  const int8x16_t f5f4 = vld1q_s8(av1_filter_intra_taps_neon[mode][4]);
+  const int8x16_t f7f6 = vld1q_s8(av1_filter_intra_taps_neon[mode][6]);
+  const int16x8_t f1f0_lo = vmovl_s8(vget_low_s8(f1f0));
+  const int16x8_t f1f0_hi = vmovl_s8(vget_high_s8(f1f0));
+  const int16x8_t f3f2_lo = vmovl_s8(vget_low_s8(f3f2));
+  const int16x8_t f3f2_hi = vmovl_s8(vget_high_s8(f3f2));
+  const int16x8_t f5f4_lo = vmovl_s8(vget_low_s8(f5f4));
+  const int16x8_t f5f4_hi = vmovl_s8(vget_high_s8(f5f4));
+  const int16x8_t f7f6_lo = vmovl_s8(vget_low_s8(f7f6));
+  const int16x8_t f7f6_hi = vmovl_s8(vget_high_s8(f7f6));
+  const uint8x8_t vmask_low = vcreate_u8(MASK_LOW);
+  const uint8x8_t vmask_high = vcreate_u8(MASK_HIGH);
+
+  assert(bw <= 32 && bh <= 32);
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+  for (r = 1; r < bh + 1; r += 2) {
+    for (c = 1; c < bw + 1; c += 4) {
+      DECLARE_ALIGNED(16, uint8_t, p[8]);
+      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+      p[5] = buffer[r][c - 1];
+      p[6] = buffer[r + 1][c - 1];
+      p[7] = 0;
+
+      const uint8x8_t p_b = vld1_u8(p);
+
+      const uint16x8_t p_b_lo = vmovl_u8(vtbl1_u8(p_b, vmask_low));
+      const uint16x8_t p_b_hi = vmovl_u8(vtbl1_u8(p_b, vmask_high));
+
+      int16x8_t out_01 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f1f0_lo);
+      out_01 = vmlaq_s16(out_01, vreinterpretq_s16_u16(p_b_hi), f1f0_hi);
+      int16x8_t out_23 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f3f2_lo);
+      out_23 = vmlaq_s16(out_23, vreinterpretq_s16_u16(p_b_hi), f3f2_hi);
+      int16x8_t out_45 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f5f4_lo);
+      out_45 = vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi);
+      int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo);
+      out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi);
+#if defined(__aarch64__)
+      const int16x8_t out_0123 = vpaddq_s16(out_01, out_23);
+      const int16x8_t out_4567 = vpaddq_s16(out_45, out_67);
+      const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567);
+#else
+      const int16x8_t out_0123 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_01)),
+                                              vqmovn_s32(vpaddlq_s16(out_23)));
+      const int16x8_t out_4567 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_45)),
+                                              vqmovn_s32(vpaddlq_s16(out_67)));
+      const int16x8_t out_01234567 = vcombine_s16(
+          vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567)));
+#endif  // (__aarch64__)
+      const uint32x2_t out_r =
+          vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4)));
+      // Storing
+      vst1_lane_u32((uint32_t *)&buffer[r][c], out_r, 0);
+      vst1_lane_u32((uint32_t *)&buffer[r + 1][c], out_r, 1);
+    }
+  }
+
+  for (r = 0; r < bh; ++r) {
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+    dst += stride;
+  }
+}
diff --git a/media/libaom/src/av1/common/arm/resize_neon.c b/media/libaom/src/av1/common/arm/resize_neon.c
new file mode 100644
index 0000000000..190a3b2894
--- /dev/null
+++ b/media/libaom/src/av1/common/arm/resize_neon.c
@@ -0,0 +1,805 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/resize.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x2_t s = vld2q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x4_t s = vld4q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_bilinear_kernel(
+    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+    uint8_t *const dst) {
+  const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
+  const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
+  const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
+  const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
+  const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
+  const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
+  const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
+  const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
+
+  const uint8x8_t hor0 = vrshrn_n_u16(h4, 7);  // temp: 00 01 02 03 04 05 06 07
+  const uint8x8_t hor1 = vrshrn_n_u16(h5, 7);  // temp: 08 09 0A 0B 0C 0D 0E 0F
+  const uint8x8_t hor2 = vrshrn_n_u16(h6, 7);  // temp: 10 11 12 13 14 15 16 17
+  const uint8x8_t hor3 = vrshrn_n_u16(h7, 7);  // temp: 18 19 1A 1B 1C 1D 1E 1F
+  const uint16x8_t v0 = vmull_u8(hor0, coef0);
+  const uint16x8_t v1 = vmull_u8(hor1, coef0);
+  const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
+  const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
+  // dst: 0 1 2 3 4 5 6 7  8 9 A B C D E F
+  const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
+  vst1q_u8(dst, d);
+}
+
+static INLINE void scale_plane_2_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // 000 002 004 006 008 00A 00C 00E  010 012 014 016 018 01A 01C 01E
+      // 001 003 005 007 009 00B 00D 00F  011 013 015 017 019 01B 01D 01F
+      const uint8x16x2_t s0 = vld2q_u8(src0);
+      // 100 102 104 106 108 10A 10C 10E  110 112 114 116 118 11A 11C 11E
+      // 101 103 105 107 109 10B 10D 10F  111 113 115 117 119 11B 11D 11F
+      const uint8x16x2_t s1 = vld2q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 32;
+      src1 += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 2 * (src_stride - max_width);
+    src1 += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // (*) -- useless
+      // 000 004 008 00C 010 014 018 01C  020 024 028 02C 030 034 038 03C
+      // 001 005 009 00D 011 015 019 01D  021 025 029 02D 031 035 039 03D
+      // 002 006 00A 00E 012 016 01A 01E  022 026 02A 02E 032 036 03A 03E (*)
+      // 003 007 00B 00F 013 017 01B 01F  023 027 02B 02F 033 037 03B 03F (*)
+      const uint8x16x4_t s0 = vld4q_u8(src0);
+      // 100 104 108 10C 110 114 118 11C  120 124 128 12C 130 134 138 13C
+      // 101 105 109 10D 111 115 119 11D  121 125 129 12D 131 135 139 13D
+      // 102 106 10A 10E 112 116 11A 11E  122 126 12A 12E 132 136 13A 13E (*)
+      // 103 107 10B 10F 113 117 11B 11F  123 127 12B 12F 133 137 13B 13F (*)
+      const uint8x16x4_t s1 = vld4q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 64;
+      src1 += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 4 * (src_stride - max_width);
+    src1 += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[14], d[4];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  // Note: processing 4x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                       &s[13]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
+      d[2] = scale_filter_8(&s[4], filters);  // 02 12 22 32 42 52 62 72
+      d[3] = scale_filter_8(&s[6], filters);  // 03 13 23 33 43 53 63 73
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+      vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+                    1);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      t += 4;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
+      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
+      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[12], d[2];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  // Note: processing 2x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+    x = width_hor;
+
+    do {
+      uint8x8x2_t dd;
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
+                       &s[11]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
+      // dd.val[0]: 00 01 20 21 40 41 60 61
+      // dd.val[1]: 10 11 30 31 50 51 70 71
+      dd = vtrn_u8(d[0], d[1]);
+      vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 0);
+      vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 0);
+      vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 1);
+      vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 1);
+      vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 2);
+      vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 2);
+      vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 3);
+      vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 3);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      t += 2;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+                                              const uint8x8_t *const coef) {
+  const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
+  const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
+
+  return vrshrn_n_u16(h1, 7);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+// multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+// multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+// vertical scaling, so height_hor must be multiple of 8, and no less than
+// 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+// be multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and are always less than 1 pixel below the last row
+// of the original image.
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+                                        const int src_stride, uint8_t *dst,
+                                        const int dst_stride, const int w,
+                                        const int h, const int phase_scaler,
+                                        uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We only need 1 extra row below because there are only 2 bilinear
+  // coefficients.
+  const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[9], d[8], c[6];
+  const InterpKernel *interp_kernel =
+      (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr;
+  assert(w && h);
+
+  c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]);
+  c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]);
+  c[2] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]);
+  c[3] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]);
+  c[4] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]);
+  c[5] = vdup_n_u8(
+      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]);
+
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    src += 1;
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      src += 8;
+      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3 - 1;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 1);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  const int16x8_t filters0 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters1 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters2 = vld1q_s16(
+      (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[15], d[8];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
+                       &s[14]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 7 * stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 7);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst,
+                                      const InterpFilter filter,
+                                      const int phase, const int num_planes) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  int scaled = 0;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+      // 4 to 3
+      const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scaled = 1;
+        if (filter == BILINEAR) {
+          scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
+                                      dst->buffers[i], dst->strides[is_uv],
+                                      dst_w, dst_h, phase, temp_buffer);
+        } else {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel, phase,
+                                     temp_buffer);
+        }
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  }
+  if (!scaled) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+  } else {
+    aom_extend_frame_borders(dst, num_planes);
+  }
+}
diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c
index fc404a64ab..f5eb36cce9 100644
--- a/media/libaom/src/av1/common/arm/selfguided_neon.c
+++ b/media/libaom/src/av1/common/arm/selfguided_neon.c
@@ -17,14 +17,14 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 #include "av1/common/restoration.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 // Constants used for right shift in final_filter calculation.
 #define NB_EVEN 5
diff --git a/media/libaom/src/av1/common/arm/warp_plane_neon.c b/media/libaom/src/av1/common/arm/warp_plane_neon.c
index c10a34fcd3..03b6db8a96 100644
--- a/media/libaom/src/av1/common/arm/warp_plane_neon.c
+++ b/media/libaom/src/av1/common/arm/warp_plane_neon.c
@@ -519,14 +519,16 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
     for (int j = 0; j < p_width; j += 8) {
       const int32_t src_x = (p_col + j + 4) << subsampling_x;
       const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
diff --git a/media/libaom/src/av1/common/arm/wiener_convolve_neon.c b/media/libaom/src/av1/common/arm/wiener_convolve_neon.c
index a9bb5bcf00..06e7555f54 100644
--- a/media/libaom/src/av1/common/arm/wiener_convolve_neon.c
+++ b/media/libaom/src/av1/common/arm/wiener_convolve_neon.c
@@ -16,11 +16,11 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/common.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 /* Wiener filter 2D
    Apply horizontal filter and store in a temporary buffer. When applying
diff --git a/media/libaom/src/av1/common/av1_common_int.h b/media/libaom/src/av1/common/av1_common_int.h
index 0403405e9c..fd2ec069fa 100644
--- a/media/libaom/src/av1/common/av1_common_int.h
+++ b/media/libaom/src/av1/common/av1_common_int.h
@@ -29,10 +29,9 @@
 #include "av1/common/restoration.h"
 #include "av1/common/tile_common.h"
 #include "av1/common/timing.h"
-#include "av1/common/odintrin.h"
-#include "av1/encoder/hash_motion.h"
-#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_params.h"
 #include "aom_dsp/grain_table.h"
+#include "aom_dsp/odintrin.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -79,6 +78,8 @@ extern "C" {
 #define TXCOEFF_TIMER 0
 #define TXCOEFF_COST_TIMER 0
 
+/*!\cond */
+
 enum {
   SINGLE_REFERENCE = 0,
   COMPOUND_REFERENCE = 1,
@@ -133,7 +134,8 @@ typedef struct RefCntBuffer {
   // distance when a very old frame is used as a reference.
   unsigned int display_order_hint;
   unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
-
+  // Frame's level within the hierarchical structure.
+  unsigned int pyramid_level;
   MV_REF *mvs;
   uint8_t *seg_map;
   struct segmentation seg;
@@ -149,6 +151,8 @@ typedef struct RefCntBuffer {
   aom_film_grain_t film_grain_params;
   aom_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
+  int temporal_id;  // Temporal layer ID of the frame
+  int spatial_id;   // Spatial layer ID of the frame
   FRAME_TYPE frame_type;
 
   // This is only used in the encoder but needs to be indexed per ref frame
@@ -186,14 +190,40 @@ typedef struct BufferPool {
   InternalFrameBufferList int_frame_buffers;
 } BufferPool;
 
+/*!\endcond */
+
+/*!\brief Parameters related to CDEF */
 typedef struct {
+  //! CDEF column line buffer
+  uint16_t *colbuf[MAX_MB_PLANE];
+  //! CDEF top & bottom line buffer
+  uint16_t *linebuf[MAX_MB_PLANE];
+  //! CDEF intermediate buffer
+  uint16_t *srcbuf;
+  //! CDEF column line buffer sizes
+  size_t allocated_colbuf_size[MAX_MB_PLANE];
+  //! CDEF top and bottom line buffer sizes
+  size_t allocated_linebuf_size[MAX_MB_PLANE];
+  //! CDEF intermediate buffer size
+  size_t allocated_srcbuf_size;
+  //! CDEF damping factor
   int cdef_damping;
+  //! Number of CDEF strength values
   int nb_cdef_strengths;
+  //! CDEF strength values for luma
   int cdef_strengths[CDEF_MAX_STRENGTHS];
+  //! CDEF strength values for chroma
   int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+  //! Number of CDEF strength values in bits
   int cdef_bits;
+  //! Number of rows in the frame in 4 pixel
+  int allocated_mi_rows;
+  //! Number of CDEF workers
+  int allocated_num_workers;
 } CdefInfo;
 
+/*!\cond */
+
 typedef struct {
   int delta_q_present_flag;
   // Resolution of delta quant
@@ -230,6 +260,10 @@ typedef struct SequenceHeader {
   int num_bits_height;
   int max_frame_width;
   int max_frame_height;
+  // Whether current and reference frame IDs are signaled in the bitstream.
+  // Frame id numbers are additional information that do not affect the
+  // decoding process, but provide decoders with a way of detecting missing
+  // reference frames so that appropriate action can be taken.
   uint8_t frame_id_numbers_present_flag;
   int frame_id_length;
   int delta_frame_id_length;
@@ -308,416 +342,710 @@ typedef struct {
 
   unsigned int order_hint;
   unsigned int display_order_hint;
+  // Frame's level within the hierarchical structure.
+  unsigned int pyramid_level;
   unsigned int frame_number;
   SkipModeInfo skip_mode_info;
   int refresh_frame_flags;  // Which ref frames are overwritten by this frame
   int frame_refs_short_signaling;
 } CurrentFrame;
 
-// Struct containing some frame level features.
+/*!\endcond */
+
+/*!
+ * \brief Frame level features.
+ */
 typedef struct {
+  /*!
+   * If true, CDF update in the symbol encoding/decoding process is disabled.
+   */
   bool disable_cdf_update;
+  /*!
+   * If true, motion vectors are specified to eighth pel precision; and
+   * if false, motion vectors are specified to quarter pel precision.
+   */
   bool allow_high_precision_mv;
-  bool cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
+  /*!
+   * If true, force integer motion vectors; if false, use the default.
+   */
+  bool cur_frame_force_integer_mv;
+  /*!
+   * If true, palette tool and/or intra block copy tools may be used.
+   */
   bool allow_screen_content_tools;
-  bool allow_intrabc;
-  bool allow_warped_motion;
-  // Whether to use previous frames' motion vectors for prediction.
+  bool allow_intrabc;       /*!< If true, intra block copy tool may be used. */
+  bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */
+  /*!
+   * If true, using previous frames' motion vectors for prediction is allowed.
+   */
   bool allow_ref_frame_mvs;
-  bool coded_lossless;  // frame is fully lossless at the coded resolution.
-  bool all_lossless;    // frame is fully lossless at the upscaled resolution.
+  /*!
+   * If true, frame is fully lossless at coded resolution.
+   * */
+  bool coded_lossless;
+  /*!
+   * If true, frame is fully lossless at upscaled resolution.
+   */
+  bool all_lossless;
+  /*!
+   * If true, the frame is restricted to a reduced subset of the full set of
+   * transform types.
+   */
   bool reduced_tx_set_used;
+  /*!
+   * If true, error resilient mode is enabled.
+   * Note: Error resilient mode allows the syntax of a frame to be parsed
+   * independently of previously decoded frames.
+   */
   bool error_resilient_mode;
+  /*!
+   * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION;
+   * if true, all MOTION_MODES may be used.
+   */
   bool switchable_motion_mode;
-  TX_MODE tx_mode;
-  InterpFilter interp_filter;
+  TX_MODE tx_mode;            /*!< Transform mode at frame level. */
+  InterpFilter interp_filter; /*!< Interpolation filter at frame level. */
+  /*!
+   * The reference frame that contains the CDF values and other state that
+   * should be loaded at the start of the frame.
+   */
   int primary_ref_frame;
+  /*!
+   * Byte alignment of the planes in the reference buffers.
+   */
   int byte_alignment;
-  // Flag signaling how frame contexts should be updated at the end of
-  // a frame decode
+  /*!
+   * Flag signaling how frame contexts should be updated at the end of
+   * a frame decode.
+   */
   REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
 } FeatureFlags;
 
-// Struct containing params related to tiles.
+/*!
+ * \brief Params related to tiles.
+ */
 typedef struct CommonTileParams {
-  int cols;           // number of tile columns that frame is divided into
-  int rows;           // number of tile rows that frame is divided into
-  int max_width_sb;   // maximum tile width in superblock units.
-  int max_height_sb;  // maximum tile height in superblock units.
-  // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+  int cols;          /*!< number of tile columns that frame is divided into */
+  int rows;          /*!< number of tile rows that frame is divided into */
+  int max_width_sb;  /*!< maximum tile width in superblock units. */
+  int max_height_sb; /*!< maximum tile height in superblock units. */
+
+  /*!
+   * Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+   */
   int min_inner_width;
 
-  // If true, tiles are uniformly spaced with power-of-two number of rows and
-  // columns.
-  // If false, tiles have explicitly configured widths and heights.
+  /*!
+   * If true, tiles are uniformly spaced with power-of-two number of rows and
+   * columns.
+   * If false, tiles have explicitly configured widths and heights.
+   */
   int uniform_spacing;
 
-  // Following members are only valid when uniform_spacing == 1
-  int log2_cols;  // log2 of 'cols'.
-  int log2_rows;  // log2 of 'rows'.
-  int width;      // tile width in MI units
-  int height;     // tile height in MI units
-  // End of members that are only valid when uniform_spacing == 1
-
-  // Min num of tile columns possible based on 'max_width_sb' and frame width.
+  /**
+   * \name Members only valid when uniform_spacing == 1
+   */
+  /**@{*/
+  int log2_cols; /*!< log2 of 'cols'. */
+  int log2_rows; /*!< log2 of 'rows'. */
+  int width;     /*!< tile width in MI units */
+  int height;    /*!< tile height in MI units */
+  /**@}*/
+
+  /*!
+   * Min num of tile columns possible based on 'max_width_sb' and frame width.
+   */
   int min_log2_cols;
-  // Min num of tile rows possible based on 'max_height_sb' and frame height.
+  /*!
+   * Min num of tile rows possible based on 'max_height_sb' and frame height.
+   */
   int min_log2_rows;
-  // Min num of tile columns possible based on frame width.
+  /*!
+   * Min num of tile columns possible based on frame width.
+   */
   int max_log2_cols;
-  // Max num of tile columns possible based on frame width.
+  /*!
+   * Max num of tile columns possible based on frame width.
+   */
   int max_log2_rows;
-  // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+  /*!
+   * log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+   */
   int min_log2;
-  // col_start_sb[i] is the start position of tile column i in superblock units.
-  // valid for 0 <= i <= cols
+  /*!
+   * col_start_sb[i] is the start position of tile column i in superblock units.
+   * valid for 0 <= i <= cols
+   */
   int col_start_sb[MAX_TILE_COLS + 1];
-  // row_start_sb[i] is the start position of tile row i in superblock units.
-  // valid for 0 <= i <= rows
+  /*!
+   * row_start_sb[i] is the start position of tile row i in superblock units.
+   * valid for 0 <= i <= rows
+   */
   int row_start_sb[MAX_TILE_ROWS + 1];
-  // If true, we are using large scale tile mode.
+  /*!
+   * If true, we are using large scale tile mode.
+   */
   unsigned int large_scale;
-  // Only relevant when large_scale == 1.
-  // If true, the independent decoding of a single tile or a section of a frame
-  // is allowed.
+  /*!
+   * Only relevant when large_scale == 1.
+   * If true, the independent decoding of a single tile or a section of a frame
+   * is allowed.
+   */
   unsigned int single_tile_decoding;
 } CommonTileParams;
 
-// Struct containing params related to MB_MODE_INFO arrays and related info.
 typedef struct CommonModeInfoParams CommonModeInfoParams;
+/*!
+ * \brief Params related to MB_MODE_INFO arrays and related info.
+ */
 struct CommonModeInfoParams {
-  // Number of rows/cols in the frame in 16 pixel units.
-  // This is computed from frame width and height aligned to a multiple of 8.
+  /*!
+   * Number of rows in the frame in 16 pixel units.
+   * This is computed from frame height aligned to a multiple of 8.
+   */
   int mb_rows;
+  /*!
+   * Number of cols in the frame in 16 pixel units.
+   * This is computed from frame width aligned to a multiple of 8.
+   */
   int mb_cols;
-  // Total MBs = mb_rows * mb_cols.
+
+  /*!
+   * Total MBs = mb_rows * mb_cols.
+   */
   int MBs;
 
-  // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
-  // This is computed from frame width and height aligned to a multiple of 8.
+  /*!
+   * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units.
+   * This is computed from frame height aligned to a multiple of 8.
+   */
   int mi_rows;
+  /*!
+   * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units.
+   * This is computed from frame width aligned to a multiple of 8.
+   */
   int mi_cols;
 
-  // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
-  // in the frame.
-  // Note: This array should be treated like a scratch memory, and should NOT be
-  // accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+  /*!
+   * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+   * in the frame.
+   * Note: This array should be treated like a scratch memory, and should NOT be
+   * accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+   */
   MB_MODE_INFO *mi_alloc;
-  // Number of allocated elements in 'mi_alloc'.
+  /*!
+   * Number of allocated elements in 'mi_alloc'.
+   */
   int mi_alloc_size;
-  // Stride for 'mi_alloc' array.
+  /*!
+   * Stride for 'mi_alloc' array.
+   */
   int mi_alloc_stride;
-  // The minimum block size that each element in 'mi_alloc' can correspond to.
-  // For decoder, this is always BLOCK_4X4.
-  // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
-  // and BLOCK_8X8 for resolution >= 4k.
+  /*!
+   * The minimum block size that each element in 'mi_alloc' can correspond to.
+   * For decoder, this is always BLOCK_4X4.
+   * For encoder, this is BLOCK_8X8 for resolution >= 4k case or REALTIME mode
+   * case. Otherwise, this is BLOCK_4X4.
+   */
   BLOCK_SIZE mi_alloc_bsize;
 
-  // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
-  // It's possible that:
-  // - Multiple pointers in the grid point to the same element in 'mi_alloc'
-  // (for example, for all 4x4 blocks that belong to the same partition block).
-  // - Some pointers can be NULL (for example, for blocks outside visible area).
+  /*!
+   * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+   * It's possible that:
+   * - Multiple pointers in the grid point to the same element in 'mi_alloc'
+   * (for example, for all 4x4 blocks that belong to the same partition block).
+   * - Some pointers can be NULL (for example, for blocks outside visible area).
+   */
   MB_MODE_INFO **mi_grid_base;
-  // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+  /*!
+   * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+   */
   int mi_grid_size;
-  // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+  /*!
+   * Stride for 'mi_grid_base' (and 'tx_type_map' also).
+   */
   int mi_stride;
 
-  // An array of tx types for each 4x4 block in the frame.
-  // Number of allocated elements is same as 'mi_grid_size', and stride is
-  // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
-  // 'mi_grid_base'.
+  /*!
+   * An array of tx types for each 4x4 block in the frame.
+   * Number of allocated elements is same as 'mi_grid_size', and stride is
+   * same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
+   * 'mi_grid_base'.
+   */
   TX_TYPE *tx_type_map;
 
-  // Function pointers to allow separate logic for encoder and decoder.
+  /**
+   * \name Function pointers to allow separate logic for encoder and decoder.
+   */
+  /**@{*/
+  /*!
+   * Free the memory allocated to arrays in 'mi_params'.
+   * \param[in,out]   mi_params   object containing common mode info parameters
+   */
   void (*free_mi)(struct CommonModeInfoParams *mi_params);
+  /*!
+   * Initialize / reset appropriate arrays in 'mi_params'.
+   * \param[in,out]   mi_params   object containing common mode info parameters
+   */
   void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+  /*!
+   * Allocate required memory for arrays in 'mi_params'.
+   * \param[in,out]   mi_params           object containing common mode info
+   *                                      parameters
+   * \param           width               frame width
+   * \param           height              frame height
+   * \param           min_partition_size  minimum partition size allowed while
+   *                                      encoding
+   */
   void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
-                    int height);
+                    int height, BLOCK_SIZE min_partition_size);
+  /**@}*/
 };
 
-// Parameters related to quantization at the frame level.
 typedef struct CommonQuantParams CommonQuantParams;
+/*!
+ * \brief Parameters related to quantization at the frame level.
+ */
 struct CommonQuantParams {
-  // Base qindex of the frame in the range 0 to 255.
+  /*!
+   * Base qindex of the frame in the range 0 to 255.
+   */
   int base_qindex;
 
-  // Delta of qindex (from base_qindex) for Y plane DC coefficient.
-  // Note: y_ac_delta_q is implicitly 0.
+  /*!
+   * Delta of qindex (from base_qindex) for Y plane DC coefficient.
+   * Note: y_ac_delta_q is implicitly 0.
+   */
   int y_dc_delta_q;
 
-  // Delta of qindex (from base_qindex) for U plane DC and AC coefficients.
+  /*!
+   * Delta of qindex (from base_qindex) for U plane DC coefficients.
+   */
   int u_dc_delta_q;
+  /*!
+   * Delta of qindex (from base_qindex) for U plane AC coefficients.
+   */
   int v_dc_delta_q;
 
-  // Delta of qindex (from base_qindex) for V plane DC and AC coefficients.
-  // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+  /*!
+   * Delta of qindex (from base_qindex) for V plane DC coefficients.
+   * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
+   */
   int u_ac_delta_q;
+  /*!
+   * Delta of qindex (from base_qindex) for V plane AC coefficients.
+   * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
+   */
   int v_ac_delta_q;
 
-  // Note: The qindex per superblock may have a delta from the qindex obtained
-  // at frame level from parameters above, based on 'cm->delta_q_info'.
+  /*
+   * Note: The qindex per superblock may have a delta from the qindex obtained
+   * at frame level from parameters above, based on 'cm->delta_q_info'.
+   */
 
-  // The dequantizers below are true dequantizers used only in the
-  // dequantization process.  They have the same coefficient
-  // shift/scale as TX.
-  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
-  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+  /**
+   * \name True dequantizers.
+   * The dequantizers below are true dequantizers used only in the
+   * dequantization process.  They have the same coefficient
+   * shift/scale as TX.
+   */
+  /**@{*/
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */
+  /**@}*/
 
-  // Global quant matrix tables
+  /**
+   * \name Global quantization matrix tables.
+   */
+  /**@{*/
+  /*!
+   * Global dquantization matrix table.
+   */
   const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  /*!
+   * Global quantization matrix table.
+   */
   const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  /**@}*/
 
-  // Local quant matrix tables for each frame
+  /**
+   * \name Local dequantization matrix tables for each frame.
+   */
+  /**@{*/
+  /*!
+   * Local dequant matrix for Y plane.
+   */
   const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /*!
+   * Local dequant matrix for U plane.
+   */
   const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /*!
+   * Local dequant matrix for V plane.
+   */
   const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  /**@}*/
 
-  // Flag indicating whether quantization matrices are being used:
-  //  - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
-  //    indices to be used to access appropriate global quant matrix tables.
-  //  - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+  /*!
+   * Flag indicating whether quantization matrices are being used:
+   *  - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
+   *    indices to be used to access appropriate global quant matrix tables.
+   *  - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+   */
   bool using_qmatrix;
-  int qmatrix_level_y;
-  int qmatrix_level_u;
-  int qmatrix_level_v;
+  /**
+   * \name Valid only when using_qmatrix == true
+   * Indicate the level indices to be used to access appropriate global quant
+   * matrix tables.
+   */
+  /**@{*/
+  int qmatrix_level_y; /*!< Level index for Y plane */
+  int qmatrix_level_u; /*!< Level index for U plane */
+  int qmatrix_level_v; /*!< Level index for V plane */
+  /**@}*/
 };
 
-// Context used for transmitting various symbols in the bistream.
 typedef struct CommonContexts CommonContexts;
+/*!
+ * \brief Contexts used for transmitting various symbols in the bitstream.
+ */
 struct CommonContexts {
-  // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
-  // partition[i][j] is the context for ith tile row, jth mi_col.
+  /*!
+   * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+   * partition[i][j] is the context for ith tile row, jth mi_col.
+   */
   PARTITION_CONTEXT **partition;
 
-  // Context used to derive context for multiple symbols:
-  // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
-  // to transmit skip_txfm flag.
-  // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
-  // sign.
-  // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+  /*!
+   * Context used to derive context for multiple symbols:
+   * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+   * to transmit skip_txfm flag.
+   * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+   * sign.
+   * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+   */
   ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
 
-  // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
-  // transmit 'is_split' flag to indicate if this transform block should be
-  // split into smaller sub-blocks.
-  // txfm[i][j] is the context for ith tile row, jth mi_col.
+  /*!
+   * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+   * transmit 'is_split' flag to indicate if this transform block should be
+   * split into smaller sub-blocks.
+   * txfm[i][j] is the context for ith tile row, jth mi_col.
+   */
   TXFM_CONTEXT **txfm;
 
-  // Dimensions that were used to allocate the arrays above.
-  // If these dimensions change, the arrays may have to be re-allocated.
-  int num_planes;     // Corresponds to av1_num_planes(cm)
-  int num_tile_rows;  // Corresponds to cm->tiles.row
-  int num_mi_cols;    // Corresponds to cm->mi_params.mi_cols
+  /*!
+   * Dimensions that were used to allocate the arrays above.
+   * If these dimensions change, the arrays may have to be re-allocated.
+   */
+  int num_planes;    /*!< Corresponds to av1_num_planes(cm) */
+  int num_tile_rows; /*!< Corresponds to cm->tiles.row */
+  int num_mi_cols;   /*!< Corresponds to cm->mi_params.mi_cols */
 };
 
+/*!
+ * \brief Top level common structure used by both encoder and decoder.
+ */
 typedef struct AV1Common {
-  // Information about the current frame that is being coded.
+  /*!
+   * Information about the current frame that is being coded.
+   */
   CurrentFrame current_frame;
-  // Code and details about current error status.
-  struct aom_internal_error_info error;
-
-  // AV1 allows two types of frame scaling operations:
-  // (1) Frame super-resolution: that allows coding a frame at lower resolution
-  // and after decoding the frame, normatively uscales and restores the frame --
-  // inside the coding loop.
-  // (2) Frame resize: that allows coding frame at lower/higher resolution, and
-  // then non-normatively upscale the frame at the time of rendering -- outside
-  // the coding loop.
-  // Hence, the need for 3 types of dimensions.
-
-  // Coded frame dimensions.
-  int width;
-  int height;
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info *error;
+
+  /*!
+   * AV1 allows two types of frame scaling operations:
+   * 1. Frame super-resolution: that allows coding a frame at lower resolution
+   * and after decoding the frame, normatively uscales and restores the frame --
+   * inside the coding loop.
+   * 2. Frame resize: that allows coding frame at lower/higher resolution, and
+   * then non-normatively upscale the frame at the time of rendering -- outside
+   * the coding loop.
+   * Hence, the need for 3 types of dimensions.
+   */
+
+  /**
+   * \name Coded frame dimensions.
+   */
+  /**@{*/
+  int width;  /*!< Coded frame width */
+  int height; /*!< Coded frame height */
+  /**@}*/
+
+  /**
+   * \name Rendered frame dimensions.
+   * Dimensions after applying both super-resolution and resize to the coded
+   * frame. Different from coded dimensions if super-resolution and/or resize
+   * are being used for this frame.
+   */
+  /**@{*/
+  int render_width;  /*!< Rendered frame width */
+  int render_height; /*!< Rendered frame height */
+  /**@}*/
 
-  // Rendered frame dimensions, after applying both super-resolution and resize
-  // to the coded frame.
-  // Different from coded dimensions if super-resolution and/or resize are
-  // being used for this frame.
-  int render_width;
-  int render_height;
-
-  // Frame dimensions after applying super-resolution to the coded frame (if
-  // present), but before applying resize.
-  // Larger than the coded dimensions if super-resolution is being used for
-  // this frame.
-  // Different from rendered dimensions if resize is being used for this frame.
-  int superres_upscaled_width;
-  int superres_upscaled_height;
-
-  // The denominator of the superres scale used by this frame.
-  // Note: The numerator is fixed to be SCALE_NUMERATOR.
+  /**
+   * \name Super-resolved frame dimensions.
+   * Frame dimensions after applying super-resolution to the coded frame (if
+   * present), but before applying resize.
+   * Larger than the coded dimensions if super-resolution is being used for
+   * this frame.
+   * Different from rendered dimensions if resize is being used for this frame.
+   */
+  /**@{*/
+  int superres_upscaled_width;  /*!< Super-resolved frame width */
+  int superres_upscaled_height; /*!< Super-resolved frame height */
+  /**@}*/
+
+  /*!
+   * The denominator of the superres scale used by this frame.
+   * Note: The numerator is fixed to be SCALE_NUMERATOR.
+   */
   uint8_t superres_scale_denominator;
 
-  // If true, buffer removal times are present.
-  bool buffer_removal_time_present;
-  // buffer_removal_times[op_num] specifies the frame removal time in units of
-  // DecCT clock ticks counted from the removal time of the last random access
-  // point for operating point op_num.
-  // TODO(urvang): We probably don't need the +1 here.
+  /*!
+   * buffer_removal_times[op_num] specifies the frame removal time in units of
+   * DecCT clock ticks counted from the removal time of the last random access
+   * point for operating point op_num.
+   * TODO(urvang): We probably don't need the +1 here.
+   */
   uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
-  // Presentation time of the frame in clock ticks DispCT counted from the
-  // removal time of the last random access point for the operating point that
-  // is being decoded.
+  /*!
+   * Presentation time of the frame in clock ticks DispCT counted from the
+   * removal time of the last random access point for the operating point that
+   * is being decoded.
+   */
   uint32_t frame_presentation_time;
 
-  // Buffer where previous frame is stored.
+  /*!
+   * Buffer where previous frame is stored.
+   */
   RefCntBuffer *prev_frame;
 
-  // Buffer into which the current frame will be stored and other related info.
-  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  /*!
+   * Buffer into which the current frame will be stored and other related info.
+   * TODO(hkuang): Combine this with cur_buf in macroblockd.
+   */
   RefCntBuffer *cur_frame;
 
-  // For encoder, we have a two-level mapping from reference frame type to the
-  // corresponding buffer in the buffer pool:
-  // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
-  // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
-  // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
-  // the reference counted buffer structure RefCntBuffer, taken from the buffer
-  // pool cm->buffer_pool->frame_bufs.
-  //
-  // LAST_FRAME,                        ...,      EXTREF_FRAME
-  //      |                                           |
-  //      v                                           v
-  // remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
-  //      |                                           |
-  //      v                                           v
-  // ref_frame_map[],                   ...,     ref_frame_map[]
-  //
-  // Note: INTRA_FRAME always refers to the current frame, so there's no need to
-  // have a remapped index for the same.
+  /*!
+   * For encoder, we have a two-level mapping from reference frame type to the
+   * corresponding buffer in the buffer pool:
+   * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+   * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+   * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+   * the reference counted buffer structure RefCntBuffer, taken from the buffer
+   * pool cm->buffer_pool->frame_bufs.
+   *
+   * LAST_FRAME,                        ...,      EXTREF_FRAME
+   *      |                                           |
+   *      v                                           v
+   * remapped_ref_idx[LAST_FRAME - 1],  ...,  remapped_ref_idx[EXTREF_FRAME - 1]
+   *      |                                           |
+   *      v                                           v
+   * ref_frame_map[],                   ...,     ref_frame_map[]
+   *
+   * Note: INTRA_FRAME always refers to the current frame, so there's no need to
+   * have a remapped index for the same.
+   */
   int remapped_ref_idx[REF_FRAMES];
 
-  // Scale of the current frame with respect to itself.
-  // This is currently used for intra block copy, which behaves like an inter
-  // prediction mode, where the reference frame is the current frame itself.
+  /*!
+   * Scale of the current frame with respect to itself.
+   * This is currently used for intra block copy, which behaves like an inter
+   * prediction mode, where the reference frame is the current frame itself.
+   */
   struct scale_factors sf_identity;
 
-  // Scale factors of the reference frame with respect to the current frame.
-  // This is required for generating inter prediction and will be non-identity
-  // for a reference frame, if it has different dimensions than the coded
-  // dimensions of the current frame.
+  /*!
+   * Scale factors of the reference frame with respect to the current frame.
+   * This is required for generating inter prediction and will be non-identity
+   * for a reference frame, if it has different dimensions than the coded
+   * dimensions of the current frame.
+   */
   struct scale_factors ref_scale_factors[REF_FRAMES];
 
-  // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
-  // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
-  // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
-  // remapped reference index 'j' (that is, original reference type 'i') to
-  // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+  /*!
+   * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+   * the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+   * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
+   * remapped reference index 'j' (that is, original reference type 'i') to
+   * a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+   */
   RefCntBuffer *ref_frame_map[REF_FRAMES];
 
-  // If true, this frame is actually shown after decoding.
-  // If false, this frame is coded in the bitstream, but not shown. It is only
-  // used as a reference for other frames coded later.
+  /*!
+   * If true, this frame is actually shown after decoding.
+   * If false, this frame is coded in the bitstream, but not shown. It is only
+   * used as a reference for other frames coded later.
+   */
   int show_frame;
 
-  // If true, this frame can be used as a show-existing frame for other frames
-  // coded later.
-  // When 'show_frame' is true, this is always true for all non-keyframes.
-  // When 'show_frame' is false, this value is transmitted in the bitstream.
+  /*!
+   * If true, this frame can be used as a show-existing frame for other frames
+   * coded later.
+   * When 'show_frame' is true, this is always true for all non-keyframes.
+   * When 'show_frame' is false, this value is transmitted in the bitstream.
+   */
   int showable_frame;
 
-  // If true, show an existing frame coded before, instead of actually coding a
-  // frame. The existing frame comes from one of the existing reference buffers,
-  // as signaled in the bitstream.
+  /*!
+   * If true, show an existing frame coded before, instead of actually coding a
+   * frame. The existing frame comes from one of the existing reference buffers,
+   * as signaled in the bitstream.
+   */
   int show_existing_frame;
 
-  // Whether some features are allowed or not.
+  /*!
+   * Whether some features are allowed or not.
+   */
   FeatureFlags features;
 
-  // Params related to MB_MODE_INFO arrays and related info.
+  /*!
+   * Params related to MB_MODE_INFO arrays and related info.
+   */
   CommonModeInfoParams mi_params;
 
 #if CONFIG_ENTROPY_STATS
+  /*!
+   * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1).
+   */
   int coef_cdf_category;
-#endif
-  // Quantization params.
+#endif  // CONFIG_ENTROPY_STATS
+
+  /*!
+   * Quantization params.
+   */
   CommonQuantParams quant_params;
 
-  // Segmentation info for current frame.
+  /*!
+   * Segmentation info for current frame.
+   */
   struct segmentation seg;
 
-  // Segmentation map for previous frame.
+  /*!
+   * Segmentation map for previous frame.
+   */
   uint8_t *last_frame_seg_map;
 
-  // Deblocking filter parameters.
-  loop_filter_info_n lf_info;
-  struct loopfilter lf;
-
-  // Loop Restoration filter parameters.
-  RestorationInfo rst_info[MAX_MB_PLANE];  // Loop Restoration filter info.
-  int32_t *rst_tmpbuf;  // Scratch buffer for self-guided restoration filter.
-  RestorationLineBuffers *rlbs;  // Line buffers required by loop restoration.
-  YV12_BUFFER_CONFIG rst_frame;  // Stores the output of loop restoration.
+  /**
+   * \name Deblocking filter parameters.
+   */
+  /**@{*/
+  loop_filter_info_n lf_info; /*!< Loop filter info */
+  struct loopfilter lf;       /*!< Loop filter parameters */
+  /**@}*/
 
-  // CDEF (Constrained Directional Enhancement Filter) parameters.
+  /**
+   * \name Loop Restoration filter parameters.
+   */
+  /**@{*/
+  RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */
+  int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */
+  RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */
+  YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */
+  /**@}*/
+
+  /*!
+   * CDEF (Constrained Directional Enhancement Filter) parameters.
+   */
   CdefInfo cdef_info;
 
-  // Parameters for film grain synthesis.
+  /*!
+   * Parameters for film grain synthesis.
+   */
   aom_film_grain_t film_grain_params;
 
-  // Parameters for delta quantization and delta loop filter level.
+  /*!
+   * Parameters for delta quantization and delta loop filter level.
+   */
   DeltaQInfo delta_q_info;
 
-  // Global motion parameters for each reference frame.
+  /*!
+   * Global motion parameters for each reference frame.
+   */
   WarpedMotionParams global_motion[REF_FRAMES];
 
-  // Elements part of the sequence header, that are applicable for all the
-  // frames in the video.
-  SequenceHeader seq_params;
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
+  SequenceHeader *seq_params;
 
-  // Current CDFs of all the symbols for the current frame.
+  /*!
+   * Current CDFs of all the symbols for the current frame.
+   */
   FRAME_CONTEXT *fc;
-  // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
-  // (e.g. for a keyframe). These default CDFs are defined by the bitstream and
-  // copied from default CDF tables for each symbol.
+  /*!
+   * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+   * (e.g. for a keyframe). These default CDFs are defined by the bitstream and
+   * copied from default CDF tables for each symbol.
+   */
   FRAME_CONTEXT *default_frame_context;
 
-  // Parameters related to tiling.
+  /*!
+   * Parameters related to tiling.
+   */
   CommonTileParams tiles;
 
-  // External BufferPool passed from outside.
+  /*!
+   * External BufferPool passed from outside.
+   */
   BufferPool *buffer_pool;
 
-  // Above context buffers and their sizes.
-  // Note: above contexts are allocated in this struct, as their size is
-  // dependent on frame width, while left contexts are declared and allocated in
-  // MACROBLOCKD struct, as they have a fixed size.
+  /*!
+   * Above context buffers and their sizes.
+   * Note: above contexts are allocated in this struct, as their size is
+   * dependent on frame width, while left contexts are declared and allocated in
+   * MACROBLOCKD struct, as they have a fixed size.
+   */
   CommonContexts above_contexts;
 
-  // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
-  // reference frame IDs are signaled in the bitstream.
-  int current_frame_id;
-  int ref_frame_id[REF_FRAMES];
-
-  // Motion vectors provided by motion field estimation.
-  // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
-  // mi_row = 2 * row,
-  // mi_col = 2 * col, and
-  // stride = cm->mi_params.mi_stride / 2
+  /**
+   * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1
+   */
+  /**@{*/
+  int current_frame_id;         /*!< frame ID for the current frame. */
+  int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */
+  /**@}*/
+
+  /*!
+   * Motion vectors provided by motion field estimation.
+   * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+   * mi_row = 2 * row,
+   * mi_col = 2 * col, and
+   * stride = cm->mi_params.mi_stride / 2
+   */
   TPL_MV_REF *tpl_mvs;
-  // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+  /*!
+   * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+   */
   int tpl_mvs_mem_size;
-  // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
-  // current frame is positive; and 0 otherwise.
+  /*!
+   * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
+   * current frame is positive; and 0 otherwise.
+   */
   int ref_frame_sign_bias[REF_FRAMES];
-  // ref_frame_side[k] is 1 if relative distance between reference 'k' and
-  // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
-  // TODO(jingning): This can be combined with sign_bias later.
+  /*!
+   * ref_frame_side[k] is 1 if relative distance between reference 'k' and
+   * current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+   * TODO(jingning): This can be combined with sign_bias later.
+   */
   int8_t ref_frame_side[REF_FRAMES];
 
-  // Number of temporal layers: may be > 1 for SVC (scalable vector coding).
-  unsigned int number_temporal_layers;
-  // Temporal layer ID of this frame
-  // (in the range 0 ... (number_temporal_layers - 1)).
+  /*!
+   * Temporal layer ID of this frame
+   * (in the range 0 ... (number_temporal_layers - 1)).
+   */
   int temporal_layer_id;
 
-  // Number of spatial layers: may be > 1 for SVC (scalable vector coding).
-  unsigned int number_spatial_layers;
-  // Spatial layer ID of this frame
-  // (in the range 0 ... (number_spatial_layers - 1)).
+  /*!
+   * Spatial layer ID of this frame
+   * (in the range 0 ... (number_spatial_layers - 1)).
+   */
   int spatial_layer_id;
 
 #if TXCOEFF_TIMER
@@ -731,12 +1059,10 @@ typedef struct AV1Common {
   int64_t txcoeff_cost_timer;
   int64_t txcoeff_cost_count;
 #endif  // TXCOEFF_COST_TIMER
-
-#if CONFIG_LPF_MASK
-  int is_decoding;
-#endif  // CONFIG_LPF_MASK
 } AV1_COMMON;
 
+/*!\cond */
+
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
 // frame reference count.
 static void lock_buffer_pool(BufferPool *const pool) {
@@ -877,15 +1203,15 @@ static INLINE RefCntBuffer *get_primary_ref_frame_buf(
 // Returns 1 if this frame might allow mvs from some reference frame.
 static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
   return !cm->features.error_resilient_mode &&
-         cm->seq_params.order_hint_info.enable_ref_frame_mvs &&
-         cm->seq_params.order_hint_info.enable_order_hint &&
+         cm->seq_params->order_hint_info.enable_ref_frame_mvs &&
+         cm->seq_params->order_hint_info.enable_order_hint &&
          !frame_is_intra_only(cm);
 }
 
 // Returns 1 if this frame might use warped_motion
 static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
   return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
-         cm->seq_params.enable_warped_motion;
+         cm->seq_params->enable_warped_motion;
 }
 
 static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
@@ -925,7 +1251,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
 void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
 
 static INLINE int av1_num_planes(const AV1_COMMON *cm) {
-  return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
 }
 
 static INLINE void av1_init_above_context(CommonContexts *above_contexts,
@@ -938,14 +1264,11 @@ static INLINE void av1_init_above_context(CommonContexts *above_contexts,
   xd->above_txfm_context = above_contexts->txfm[tile_row];
 }
 
-static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                        tran_low_t *dqcoeff) {
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) {
   const int num_planes = av1_num_planes(cm);
   const CommonQuantParams *const quant_params = &cm->quant_params;
 
   for (int i = 0; i < num_planes; ++i) {
-    xd->plane[i].dqcoeff = dqcoeff;
-
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
              sizeof(quant_params->y_dequant_QTX));
@@ -967,8 +1290,8 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
     }
   }
   xd->mi_stride = cm->mi_params.mi_stride;
-  xd->error_info = &cm->error;
-  cfl_init(&xd->cfl, &cm->seq_params);
+  xd->error_info = cm->error;
+  cfl_init(&xd->cfl, cm->seq_params);
 }
 
 static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -979,7 +1302,7 @@ static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
   for (i = 0; i < num_planes; ++i) {
     struct macroblockd_plane *const pd = &xd->plane[i];
     // Offset the buffer pointer
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->bsize;
     if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
       row_offset = mi_row - 1;
     if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
@@ -1072,16 +1395,17 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
 
   xd->height = bh;
   xd->width = bw;
-  xd->is_sec_rect = 0;
+
+  xd->is_last_vertical_rect = 0;
   if (xd->width < xd->height) {
-    // Only mark is_sec_rect as 1 for the last block.
-    // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
-    // For other partitions, it would be (0, 1).
-    if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
+    if (!((mi_col + xd->width) & (xd->height - 1))) {
+      xd->is_last_vertical_rect = 1;
+    }
   }
 
+  xd->is_first_horizontal_rect = 0;
   if (xd->width > xd->height)
-    if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
+    if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1;
 }
 
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1249,7 +1573,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
                                           const MACROBLOCKD *xd,
                                           int mi_col_start, int mi_col_end,
                                           const int tile_row) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   const int width = mi_col_end - mi_col_start;
   const int aligned_width =
@@ -1447,7 +1771,9 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
 
   const int offset = mi_row * mi_params->mi_stride + mi_col;
   MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
-  const BLOCK_SIZE subsize = mi[0]->sb_type;
+  const BLOCK_SIZE subsize = mi[0]->bsize;
+
+  assert(bsize < BLOCK_SIZES_ALL);
 
   if (subsize == bsize) return PARTITION_NONE;
 
@@ -1470,7 +1796,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
       if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
       assert(sshigh * 2 == bhigh);
 
-      if (mbmi_below->sb_type == subsize)
+      if (mbmi_below->bsize == subsize)
         return PARTITION_HORZ;
       else
         return PARTITION_HORZ_B;
@@ -1481,7 +1807,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
       if (sswide * 4 == bwide) return PARTITION_VERT_4;
       assert(sswide * 2 == bhigh);
 
-      if (mbmi_right->sb_type == subsize)
+      if (mbmi_right->bsize == subsize)
         return PARTITION_VERT;
       else
         return PARTITION_VERT_B;
@@ -1495,8 +1821,8 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
       // it's PARTITION_SPLIT.
       if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
 
-      if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
-      if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+      if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A;
 
       return PARTITION_SPLIT;
     }
@@ -1550,6 +1876,8 @@ static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
           seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
index 47fedbd2ae..b4f7801295 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
+++ b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
@@ -41,7 +41,5 @@ extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL];
 // Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12
 // for each valid row and col combination
 #define INV_COS_BIT 12
-extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/];
-extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/];
 
 #endif  // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/media/libaom/src/av1/common/av1_inv_txfm2d.c b/media/libaom/src/av1/common/av1_inv_txfm2d.c
index 559d12129e..154c9d2fdf 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm2d.c
+++ b/media/libaom/src/av1/common/av1_inv_txfm2d.c
@@ -157,26 +157,6 @@ const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = {
   inv_shift_32x8,  inv_shift_16x64, inv_shift_64x16,
 };
 
-/* clang-format off */
-const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX]      // txw_idx
-                            [MAX_TXWH_IDX] = {  // txh_idx
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
-    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
-    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
-  };
-
-const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX]      // txw_idx
-                            [MAX_TXWH_IDX] = {  // txh_idx
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
-    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
-    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
-    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
-  };
-/* clang-format on */
-
 static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
 
 void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
@@ -191,8 +171,8 @@ void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
   cfg->shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+  cfg->cos_bit_col = INV_COS_BIT;
+  cfg->cos_bit_row = INV_COS_BIT;
   cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
   if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
     memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
diff --git a/media/libaom/src/av1/common/av1_loopfilter.c b/media/libaom/src/av1/common/av1_loopfilter.c
index c756760def..5dede53d33 100644
--- a/media/libaom/src/av1/common/av1_loopfilter.c
+++ b/media/libaom/src/av1/common/av1_loopfilter.c
@@ -22,6 +22,12 @@
 #include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 
+enum {
+  USE_SINGLE,
+  USE_DUAL,
+  USE_QUAD,
+} UENUM1BYTE(USE_FILTER_TYPE);
+
 static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
   { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
   { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
@@ -106,8 +112,6 @@ void av1_loop_filter_init(AV1_COMMON *cm) {
   struct loopfilter *lf = &cm->lf;
   int lvl;
 
-  lf->combine_vert_horz_lf = 1;
-
   // init limits for given sharpness
   update_sharpness(lfi, lf->sharpness_level);
 
@@ -188,22 +192,19 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
   }
 }
 
-static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
-                                  const MB_MODE_INFO *const mbmi,
-                                  const EDGE_DIR edge_dir, const int mi_row,
-                                  const int mi_col, const int plane,
-                                  const struct macroblockd_plane *plane_ptr) {
+static AOM_FORCE_INLINE TX_SIZE
+get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi,
+                   const int mi_row, const int mi_col, const int plane,
+                   const int ss_x, const int ss_y) {
   assert(mbmi != NULL);
   if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
 
-  TX_SIZE tx_size =
-      (plane == AOM_PLANE_Y)
-          ? mbmi->tx_size
-          : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
-                                  plane_ptr->subsampling_y);
+  TX_SIZE tx_size = (plane == AOM_PLANE_Y)
+                        ? mbmi->tx_size
+                        : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y);
   assert(tx_size < TX_SIZES_ALL);
-  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) {
-    const BLOCK_SIZE sb_type = mbmi->sb_type;
+  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) {
+    const BLOCK_SIZE sb_type = mbmi->bsize;
     const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
     const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
     const TX_SIZE mb_tx_size =
@@ -212,23 +213,10 @@ static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
     tx_size = mb_tx_size;
   }
 
-  // since in case of chrominance or non-square transform need to convert
-  // transform size into transform size in particular direction.
-  // for vertical edge, filter direction is horizontal, for horizontal
-  // edge, filter direction is vertical.
-  tx_size = (VERT_EDGE == edge_dir) ? txsize_horz_map[tx_size]
-                                    : txsize_vert_map[tx_size];
   return tx_size;
 }
 
-typedef struct AV1_DEBLOCKING_PARAMETERS {
-  // length of the filter applied to the outer edge
-  uint32_t filter_length;
-  // deblocking limits
-  const uint8_t *lim;
-  const uint8_t *mblim;
-  const uint8_t *hev_thr;
-} AV1_DEBLOCKING_PARAMETERS;
+static const int tx_dim_to_filter_length[TX_SIZES] = { 4, 8, 14, 14, 14 };
 
 // Return TX_SIZE from get_transform_size(), so it is plane and direction
 // aware
@@ -264,8 +252,8 @@ static TX_SIZE set_lpf_parameters(
   // it not set up.
   if (mbmi == NULL) return TX_INVALID;
 
-  const TX_SIZE ts =
-      get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr);
+  const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
+                                        scale_horz, scale_vert);
 
   {
     const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
@@ -279,7 +267,7 @@ static TX_SIZE set_lpf_parameters(
     {
       const uint32_t curr_level =
           av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
-      const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
+      const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
       uint32_t level = curr_level;
       if (coord) {
         {
@@ -290,15 +278,15 @@ static TX_SIZE set_lpf_parameters(
           const int pv_col =
               (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
           const TX_SIZE pv_ts = get_transform_size(
-              xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
+              xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert);
 
           const uint32_t pv_lvl =
               av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
 
-          const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
-          const BLOCK_SIZE bsize =
-              get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
-                                   plane_ptr->subsampling_y);
+          const int pv_skip_txfm =
+              mi_prev->skip_txfm && is_inter_block(mi_prev);
+          const BLOCK_SIZE bsize = get_plane_block_size(
+              mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
           assert(bsize < BLOCK_SIZES_ALL);
           const int prediction_masks = edge_dir == VERT_EDGE
                                            ? block_size_wide[bsize] - 1
@@ -307,21 +295,18 @@ static TX_SIZE set_lpf_parameters(
           // if the current and the previous blocks are skipped,
           // deblock the edge if the edge belongs to a PU's edge only.
           if ((curr_level || pv_lvl) &&
-              (!pv_skip || !curr_skipped || pu_edge)) {
-            const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
-            if (TX_4X4 >= min_ts) {
-              params->filter_length = 4;
-            } else if (TX_8X8 == min_ts) {
-              if (plane != 0)
-                params->filter_length = 6;
-              else
-                params->filter_length = 8;
+              (!pv_skip_txfm || !curr_skipped || pu_edge)) {
+            const int dim = (VERT_EDGE == edge_dir)
+                                ? AOMMIN(tx_size_wide_unit_log2[ts],
+                                         tx_size_wide_unit_log2[pv_ts])
+                                : AOMMIN(tx_size_high_unit_log2[ts],
+                                         tx_size_high_unit_log2[pv_ts]);
+            if (plane) {
+              params->filter_length = (dim == 0) ? 4 : 6;
             } else {
-              params->filter_length = 14;
-              // No wide filtering for chroma plane
-              if (plane != 0) {
-                params->filter_length = 6;
-              }
+              assert(dim < TX_SIZES);
+              assert(dim >= 0);
+              params->filter_length = tx_dim_to_filter_length[dim];
             }
 
             // update the level if the current block is skipped,
@@ -333,9 +318,7 @@ static TX_SIZE set_lpf_parameters(
       // prepare common parameters
       if (params->filter_length) {
         const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
-        params->lim = limits->lim;
-        params->mblim = limits->mblim;
-        params->hev_thr = limits->hev_thr;
+        params->lfthr = limits;
       }
     }
   }
@@ -343,247 +326,998 @@ static TX_SIZE set_lpf_parameters(
   return ts;
 }
 
-void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
-                                 const MACROBLOCKD *const xd, const int plane,
-                                 const MACROBLOCKD_PLANE *const plane_ptr,
-                                 const uint32_t mi_row, const uint32_t mi_col) {
-  const uint32_t scale_horz = plane_ptr->subsampling_x;
-  const uint32_t scale_vert = plane_ptr->subsampling_y;
-  uint8_t *const dst_ptr = plane_ptr->dst.buf;
-  const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = (MAX_MIB_SIZE >> scale_vert);
-  const int x_range = (MAX_MIB_SIZE >> scale_horz);
-  for (int y = 0; y < y_range; y++) {
-    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
-    for (int x = 0; x < x_range;) {
-      // inner loop always filter vertical edges in a MI block. If MI size
-      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
-      // If 4x4 transform is used, it will then filter the internal edge
-      //  aligned with a 4x4 block
-      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
-      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
-      uint32_t advance_units;
-      TX_SIZE tx_size;
-      AV1_DEBLOCKING_PARAMETERS params;
-      memset(&params, 0, sizeof(params));
+static const uint32_t vert_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = {
+  // TX_4X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X8
+  {
+      4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+  },
+  // TX_16X16
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_32X32
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_64X64
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_4X8
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X4
+  {
+      4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+  },
+  // TX_8X16
+  {
+      4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+  },
+  // TX_16X8
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_16X32
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_32X16
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_32X64
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_64X32
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_4X16
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_16X4
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_8X32
+  {
+      4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+  },
+  // TX_32X8
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_16X64
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+  // TX_64X16
+  {
+      4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+  },
+};
 
-      tx_size =
-          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
-                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
-      if (tx_size == TX_INVALID) {
-        params.filter_length = 0;
-        tx_size = TX_4X4;
+static const uint32_t horz_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = {
+  // TX_4X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X8
+  {
+      4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+  },
+  // TX_16X16
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_32X32
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_64X64
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_4X8
+  {
+      4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+  },
+  // TX_8X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X16
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_16X8
+  {
+      4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+  },
+  // TX_16X32
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_32X16
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_32X64
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_64X32
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_4X16
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_16X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X32
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_32X8
+  {
+      4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+  },
+  // TX_16X64
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+  // TX_64X16
+  {
+      4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+  },
+};
+
+static const uint32_t vert_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = {
+  // TX_4X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X8
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_16X16
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_32X32
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_64X64
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_4X8
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X4
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_8X16
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_16X8
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_16X32
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_32X16
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_32X64
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_64X32
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_4X16
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_16X4
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_8X32
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_32X8
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_16X64
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+  // TX_64X16
+  {
+      4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+  },
+};
+
+static const uint32_t horz_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = {
+  // TX_4X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X8
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_16X16
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_32X32
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_64X64
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_4X8
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_8X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X16
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_16X8
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_16X32
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_32X16
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_32X64
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_64X32
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_4X16
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_16X4
+  {
+      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  },
+  // TX_8X32
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_32X8
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_16X64
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+  // TX_64X16
+  {
+      4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+  },
+};
+
+static AOM_FORCE_INLINE void set_one_param_for_line_luma(
+    AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size,
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+    const struct macroblockd_plane *const plane_ptr, int coord,
+    bool is_first_block, TX_SIZE prev_tx_size, const ptrdiff_t mode_step,
+    int *min_dim) {
+  (void)plane_ptr;
+  assert(mi_col << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.width &&
+         mi_row << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.height);
+  const int is_vert = edge_dir == VERT_EDGE;
+  // reset to initial values
+  params->filter_length = 0;
+
+  MB_MODE_INFO **mi =
+      cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
+  const MB_MODE_INFO *mbmi = mi[0];
+  assert(mbmi);
+
+  const TX_SIZE ts =
+      get_transform_size(xd, mi[0], mi_row, mi_col, AOM_PLANE_Y, 0, 0);
+
+#ifndef NDEBUG
+  const uint32_t transform_masks =
+      is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+  const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1);
+  assert(tu_edge);
+#endif  // NDEBUG
+  // If we are not the first block, then coord is always true, so
+  // !is_first_block is technically redundant. But we are keeping it here so the
+  // compiler can compile away this conditional if we pass in is_first_block :=
+  // false
+  bool curr_skipped = false;
+  if (!is_first_block || coord) {
+    const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+    const int pv_row = is_vert ? mi_row : (mi_row - 1);
+    const int pv_col = is_vert ? (mi_col - 1) : mi_col;
+    const TX_SIZE pv_ts =
+        is_first_block
+            ? get_transform_size(xd, mi_prev, pv_row, pv_col, AOM_PLANE_Y, 0, 0)
+            : prev_tx_size;
+    if (is_first_block) {
+      *min_dim = is_vert ? block_size_high[mi_prev->bsize]
+                         : block_size_wide[mi_prev->bsize];
+    }
+    assert(mi_prev);
+    uint8_t level =
+        av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi);
+    if (!level) {
+      level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y,
+                                   mi_prev);
+    }
+
+    const int32_t pu_edge = mi_prev != mbmi;
+
+    // The quad loop filter assumes that all the transform blocks within a
+    // 8x16/16x8/16x16 prediction block are of the same size.
+    assert(IMPLIES(
+        !pu_edge && (mbmi->bsize >= BLOCK_8X16 && mbmi->bsize <= BLOCK_16X16),
+        pv_ts == ts));
+
+    if (!pu_edge) {
+      curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
+    }
+    if ((pu_edge || !curr_skipped) && level) {
+      params->filter_length = is_vert ? vert_filter_length_luma[ts][pv_ts]
+                                      : horz_filter_length_luma[ts][pv_ts];
+
+      // prepare common parameters
+      const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+      params->lfthr = limits;
+    }
+  }
+  const int block_dim =
+      is_vert ? block_size_high[mbmi->bsize] : block_size_wide[mbmi->bsize];
+  *min_dim = AOMMIN(*min_dim, block_dim);
+
+  *tx_size = ts;
+}
+
+// Similar to set_lpf_parameters, but does so one row/col at a time to reduce
+// calls to \ref get_transform_size and \ref av1_get_filter_level
+static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma(
+    AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf,
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+    const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range,
+    const ptrdiff_t mode_step, int *min_dim) {
+  const int is_vert = edge_dir == VERT_EDGE;
+
+  AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+  TX_SIZE *tx_size = tx_buf;
+  uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row;
+  TX_SIZE prev_tx_size = TX_INVALID;
+
+  // Unroll the first iteration of the loop
+  set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row,
+                              plane_ptr, *counter_ptr, true, prev_tx_size,
+                              mode_step, min_dim);
+
+  // Advance
+  int advance_units =
+      is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+  prev_tx_size = *tx_size;
+  *counter_ptr += advance_units;
+  params += advance_units;
+  tx_size += advance_units;
+
+  while (*counter_ptr < mi_range) {
+    set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col,
+                                mi_row, plane_ptr, *counter_ptr, false,
+                                prev_tx_size, mode_step, min_dim);
+
+    // Advance
+    advance_units =
+        is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+    prev_tx_size = *tx_size;
+    *counter_ptr += advance_units;
+    params += advance_units;
+    tx_size += advance_units;
+  }
+}
+
+static AOM_FORCE_INLINE void set_one_param_for_line_chroma(
+    AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size,
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, int coord,
+    bool is_first_block, TX_SIZE prev_tx_size,
+    const struct macroblockd_plane *const plane_ptr, const ptrdiff_t mode_step,
+    const int scale_horz, const int scale_vert, int *min_dim, int plane,
+    int joint_filter_chroma) {
+  const int is_vert = edge_dir == VERT_EDGE;
+  (void)plane_ptr;
+  assert((mi_col << MI_SIZE_LOG2) <
+             (uint32_t)(plane_ptr->dst.width << scale_horz) &&
+         (mi_row << MI_SIZE_LOG2) <
+             (uint32_t)(plane_ptr->dst.height << scale_vert));
+  // reset to initial values
+  params->filter_length = 0;
+
+  // for sub8x8 block, chroma prediction mode is obtained from the
+  // bottom/right mi structure of the co-located 8x8 luma block. so for chroma
+  // plane, mi_row and mi_col should map to the bottom/right mi structure,
+  // i.e, both mi_row and mi_col should be odd number for chroma plane.
+  mi_row |= scale_vert;
+  mi_col |= scale_horz;
+  MB_MODE_INFO **mi =
+      cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
+  const MB_MODE_INFO *mbmi = mi[0];
+  assert(mbmi);
+
+  const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
+                                        scale_horz, scale_vert);
+  *tx_size = ts;
+
+#ifndef NDEBUG
+  const uint32_t transform_masks =
+      is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+  const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1);
+  assert(tu_edge);
+#endif  // NDEBUG
+
+  // If we are not the first block, then coord is always true, so
+  // !is_first_block is technically redundant. But we are keeping it here so the
+  // compiler can compile away this conditional if we pass in is_first_block :=
+  // false
+  bool curr_skipped = false;
+  if (!is_first_block || coord) {
+    const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+    assert(mi_prev);
+    const int pv_row = is_vert ? (mi_row) : (mi_row - (1 << scale_vert));
+    const int pv_col = is_vert ? (mi_col - (1 << scale_horz)) : (mi_col);
+    const TX_SIZE pv_ts =
+        is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, plane,
+                                            scale_horz, scale_vert)
+                       : prev_tx_size;
+    if (is_first_block) {
+      *min_dim = is_vert ? tx_size_high[pv_ts] : tx_size_wide[pv_ts];
+    }
+
+    uint8_t level =
+        av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+    if (!level) {
+      level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+    }
+#ifndef NDEBUG
+    if (joint_filter_chroma) {
+      uint8_t v_level =
+          av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi);
+      if (!v_level) {
+        v_level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V,
+                                       mi_prev);
       }
+      assert(level == v_level);
+    }
+#else
+    (void)joint_filter_chroma;
+#endif  // NDEBUG
+    const int32_t pu_edge = mi_prev != mbmi;
+
+    if (!pu_edge) {
+      curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
+    }
+    // For realtime mode, u and v have the same level
+    if ((!curr_skipped || pu_edge) && level) {
+      params->filter_length = is_vert ? vert_filter_length_chroma[ts][pv_ts]
+                                      : horz_filter_length_chroma[ts][pv_ts];
+
+      const loop_filter_thresh *const limits = cm->lf_info.lfthr;
+      params->lfthr = limits + level;
+    }
+  }
+  const int tx_dim = is_vert ? tx_size_high[ts] : tx_size_wide[ts];
+  *min_dim = AOMMIN(*min_dim, tx_dim);
+}
 
+static AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma(
+    AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf,
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+    const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range,
+    const ptrdiff_t mode_step, const int scale_horz, const int scale_vert,
+    int *min_dim, int plane, int joint_filter_chroma) {
+  const int is_vert = edge_dir == VERT_EDGE;
+
+  AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+  TX_SIZE *tx_size = tx_buf;
+  uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row;
+  const uint32_t scale = is_vert ? scale_horz : scale_vert;
+  TX_SIZE prev_tx_size = TX_INVALID;
+
+  // Unroll the first iteration of the loop
+  set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col,
+                                mi_row, *counter_ptr, true, prev_tx_size,
+                                plane_ptr, mode_step, scale_horz, scale_vert,
+                                min_dim, plane, joint_filter_chroma);
+
+  // Advance
+  int advance_units =
+      is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+  prev_tx_size = *tx_size;
+  *counter_ptr += advance_units << scale;
+  params += advance_units;
+  tx_size += advance_units;
+
+  while (*counter_ptr < mi_range) {
+    set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col,
+                                  mi_row, *counter_ptr, false, prev_tx_size,
+                                  plane_ptr, mode_step, scale_horz, scale_vert,
+                                  min_dim, plane, joint_filter_chroma);
+
+    // Advance
+    advance_units =
+        is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+    prev_tx_size = *tx_size;
+    *counter_ptr += advance_units << scale;
+    params += advance_units;
+    tx_size += advance_units;
+  }
+}
+
+static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride,
+                                   const AV1_DEBLOCKING_PARAMETERS *params,
+                                   const SequenceHeader *seq_params,
+                                   USE_FILTER_TYPE use_filter_type) {
+  const loop_filter_thresh *limits = params->lfthr;
 #if CONFIG_AV1_HIGHBITDEPTH
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-      switch (params.filter_length) {
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
+    if (use_filter_type == USE_QUAD) {
+      switch (params->filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_4_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+              limits->lim, limits->hev_thr, bit_depth);
           break;
         case 6:  // apply 6-tap filter for chroma plane only
-          assert(plane != 0);
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_6_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+              limits->lim, limits->hev_thr, bit_depth);
           break;
         // apply 8-tap filtering
         case 8:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_8_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_vertical_8_dual(
+              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+              limits->lim, limits->hev_thr, bit_depth);
           break;
         // apply 14-tap filtering
         case 14:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                       params.mblim, params.lim, params.hev_thr,
-                                       bit_depth);
-          else
-            aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
-                                params.hev_thr);
+          aom_highbd_lpf_vertical_14_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_vertical_14_dual(
+              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+              limits->lim, limits->hev_thr, bit_depth);
           break;
         // no filtering
         default: break;
       }
-#else
-      switch (params.filter_length) {
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_vertical_4_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_highbd_lpf_vertical_8_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_highbd_lpf_vertical_14_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
         // apply 4-tap filtering
         case 4:
-          aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
-                             params.hev_thr);
+          aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim,
+                                    limits->lim, limits->hev_thr, bit_depth);
           break;
         case 6:  // apply 6-tap filter for chroma plane only
-          assert(plane != 0);
-          aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
-                             params.hev_thr);
+          aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim,
+                                    limits->lim, limits->hev_thr, bit_depth);
           break;
         // apply 8-tap filtering
         case 8:
-          aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
-                             params.hev_thr);
+          aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim,
+                                    limits->lim, limits->hev_thr, bit_depth);
           break;
         // apply 14-tap filtering
         case 14:
-          aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
-                              params.hev_thr);
+          aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim,
+                                     limits->lim, limits->hev_thr, bit_depth);
           break;
         // no filtering
         default: break;
       }
+    }
+    return;
+  }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
-      // advance the destination pointer
-      advance_units = tx_size_wide_unit[tx_size];
-      x += advance_units;
-      p += advance_units * MI_SIZE;
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                 limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                limits->hev_thr, limits->mblim, limits->lim,
+                                limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                 limits->hev_thr, limits->mblim, limits->lim,
+                                 limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim,
+                           limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim,
+                           limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim,
+                           limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim,
+                            limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
     }
   }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
 }
 
-void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
-                                 const MACROBLOCKD *const xd, const int plane,
-                                 const MACROBLOCKD_PLANE *const plane_ptr,
-                                 const uint32_t mi_row, const uint32_t mi_col) {
-  const uint32_t scale_horz = plane_ptr->subsampling_x;
-  const uint32_t scale_vert = plane_ptr->subsampling_y;
-  uint8_t *const dst_ptr = plane_ptr->dst.buf;
-  const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = (MAX_MIB_SIZE >> scale_vert);
-  const int x_range = (MAX_MIB_SIZE >> scale_horz);
-  for (int x = 0; x < x_range; x++) {
-    uint8_t *p = dst_ptr + x * MI_SIZE;
-    for (int y = 0; y < y_range;) {
-      // inner loop always filter vertical edges in a MI block. If MI size
-      // is 8x8, it will first filter the vertical edge aligned with a 8x8
-      // block. If 4x4 transform is used, it will then filter the internal
-      // edge aligned with a 4x4 block
-      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
-      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
-      uint32_t advance_units;
-      TX_SIZE tx_size;
-      AV1_DEBLOCKING_PARAMETERS params;
-      memset(&params, 0, sizeof(params));
-
-      tx_size = set_lpf_parameters(
-          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
-          curr_x, curr_y, plane, plane_ptr);
-      if (tx_size == TX_INVALID) {
-        params.filter_length = 0;
-        tx_size = TX_4X4;
-      }
-
+static AOM_INLINE void filter_vert_chroma(
+    uint8_t *u_dst, uint8_t *v_dst, int dst_stride,
+    const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params,
+    USE_FILTER_TYPE use_filter_type) {
+  const loop_filter_thresh *u_limits = params->lfthr;
+  const loop_filter_thresh *v_limits = params->lfthr;
 #if CONFIG_AV1_HIGHBITDEPTH
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-      switch (params.filter_length) {
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst);
+    uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst);
+    if (use_filter_type == USE_QUAD) {
+      switch (params->filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
-          break;
-        // apply 6-tap filtering
-        case 6:
-          assert(plane != 0);
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
-          break;
-        // apply 8-tap filtering
-        case 8:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth);
           break;
-        // apply 14-tap filtering
-        case 14:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                         params.mblim, params.lim,
-                                         params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
-                                  params.hev_thr);
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth);
           break;
+        case 8:
+        case 14: assert(0);
         // no filtering
         default: break;
       }
-#else
-      switch (params.filter_length) {
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
         // apply 4-tap filtering
         case 4:
-          aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
           break;
-        // apply 6-tap filtering
-        case 6:
-          assert(plane != 0);
-          aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
           break;
-        // apply 8-tap filtering
         case 8:
-          aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+        case 14: assert(0);
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_vertical_4(u_dst_shortptr, dst_stride, u_limits->mblim,
+                                    u_limits->lim, u_limits->hev_thr,
+                                    bit_depth);
+          aom_highbd_lpf_vertical_4(v_dst_shortptr, dst_stride, v_limits->mblim,
+                                    v_limits->lim, v_limits->hev_thr,
+                                    bit_depth);
           break;
-        // apply 14-tap filtering
-        case 14:
-          aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
-                                params.hev_thr);
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6(u_dst_shortptr, dst_stride, u_limits->mblim,
+                                    u_limits->lim, u_limits->hev_thr,
+                                    bit_depth);
+          aom_highbd_lpf_vertical_6(v_dst_shortptr, dst_stride, v_limits->mblim,
+                                    v_limits->lim, v_limits->hev_thr,
+                                    bit_depth);
           break;
+        case 8:
+        case 14: assert(0); break;
         // no filtering
         default: break;
       }
+    }
+    return;
+  }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
-
-      // advance the destination pointer
-      advance_units = tx_size_high_unit[tx_size];
-      y += advance_units;
-      p += advance_units * dst_stride * MI_SIZE;
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_quad(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr);
+        aom_lpf_vertical_4_quad(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_quad(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr);
+        aom_lpf_vertical_6_quad(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0);
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_dual(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr,
+                                u_limits->mblim, u_limits->lim,
+                                u_limits->hev_thr);
+        aom_lpf_vertical_4_dual(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr,
+                                v_limits->mblim, v_limits->lim,
+                                v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_dual(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr,
+                                u_limits->mblim, u_limits->lim,
+                                u_limits->hev_thr);
+        aom_lpf_vertical_6_dual(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr,
+                                v_limits->mblim, v_limits->lim,
+                                v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0);
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                           u_limits->hev_thr);
+        aom_lpf_vertical_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                           u_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                           u_limits->hev_thr);
+        aom_lpf_vertical_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                           v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
     }
   }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
 }
 
-void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
-                                      const MACROBLOCKD *const xd,
-                                      const int plane,
-                                      const MACROBLOCKD_PLANE *const plane_ptr,
-                                      const uint32_t mi_row,
-                                      const uint32_t mi_col) {
+void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_params.mi_rows >> scale_vert;
-  const int x_range = cm->mi_params.mi_cols >> scale_horz;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+
   for (int y = 0; y < y_range; y++) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
     for (int x = 0; x < x_range;) {
@@ -606,6 +1340,8 @@ void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
         tx_size = TX_4X4;
       }
 
+      filter_vert(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
+
       // advance the destination pointer
       advance_units = tx_size_wide_unit[tx_size];
       x += advance_units;
@@ -614,18 +1350,567 @@ void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
   }
 }
 
-void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
-                                      const MACROBLOCKD *const xd,
-                                      const int plane,
-                                      const MACROBLOCKD_PLANE *const plane_ptr,
-                                      const uint32_t mi_row,
-                                      const uint32_t mi_col) {
+void av1_filter_block_plane_vert_opt(const AV1_COMMON *const cm,
+                                     const MACROBLOCKD *const xd,
+                                     const MACROBLOCKD_PLANE *const plane_ptr,
+                                     const uint32_t mi_row,
+                                     const uint32_t mi_col,
+                                     AV1_DEBLOCKING_PARAMETERS *params_buf,
+                                     TX_SIZE *tx_buf) {
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int plane_mi_cols =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
+  const int plane_mi_rows =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
+  const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), MAX_MIB_SIZE);
+  const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
+  const ptrdiff_t mode_step = 1;
+  for (int y = 0; y < y_range; y++) {
+    const uint32_t curr_y = mi_row + y;
+    const uint32_t x_start = mi_col;
+    const uint32_t x_end = mi_col + x_range;
+    int min_block_height = block_size_high[BLOCK_128X128];
+    set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, VERT_EDGE,
+                                     x_start, curr_y, plane_ptr, x_end,
+                                     mode_step, &min_block_height);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    USE_FILTER_TYPE use_filter_type = USE_SINGLE;
+
+    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+
+    if ((y & 3) == 0 && (y + 3) < y_range && min_block_height >= 16) {
+      // If we are on a row which is a multiple of 4, and the minimum height is
+      // 16 pixels, then the current and right 3 cols must contain the same
+      // prediction block. This is because dim 16 can only happen every unit of
+      // 4 mi's.
+      use_filter_type = USE_QUAD;
+      y += 3;
+    } else if ((y + 1) < y_range && min_block_height >= 8) {
+      use_filter_type = USE_DUAL;
+      y += 1;
+    }
+
+    for (int x = 0; x < x_range;) {
+      if (*tx_size == TX_INVALID) {
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
+      }
+
+      filter_vert(p, dst_stride, params, cm->seq_params, use_filter_type);
+
+      // advance the destination pointer
+      const uint32_t advance_units = tx_size_wide_unit[*tx_size];
+      x += advance_units;
+      p += advance_units * MI_SIZE;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+  }
+}
+
+void av1_filter_block_plane_vert_opt_chroma(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int mi_cols =
+      ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int mi_rows =
+      ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
+  const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz;
+
+  for (int y = 0; y < y_range; y++) {
+    const uint32_t curr_y = mi_row + (y << scale_vert);
+    const uint32_t x_start = mi_col + (0 << scale_horz);
+    const uint32_t x_end = mi_col + (x_range << scale_horz);
+    int min_height = tx_size_high[TX_64X64];
+    set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, VERT_EDGE,
+                                       x_start, curr_y, plane_ptr, x_end,
+                                       mode_step, scale_horz, scale_vert,
+                                       &min_height, plane, joint_filter_chroma);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    int use_filter_type = USE_SINGLE;
+    int y_inc = 0;
+
+    if ((y & 3) == 0 && (y + 3) < y_range && min_height >= 16) {
+      // If we are on a row which is a multiple of 4, and the minimum height is
+      // 16 pixels, then the current and below 3 rows must contain the same tx
+      // block. This is because dim 16 can only happen every unit of 4 mi's.
+      use_filter_type = USE_QUAD;
+      y_inc = 3;
+    } else if (y % 2 == 0 && (y + 1) < y_range && min_height >= 8) {
+      // If we are on an even row, and the minimum height is 8 pixels, then the
+      // current and below rows must contain the same tx block. This is because
+      // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1,
+      // etc.
+      use_filter_type = USE_DUAL;
+      y_inc = 1;
+    }
+
+    for (int x = 0; x < x_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
+      // If 4x4 transform is used, it will then filter the internal edge
+      //  aligned with a 4x4 block
+      if (*tx_size == TX_INVALID) {
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
+      }
+
+      const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE;
+      if (joint_filter_chroma) {
+        uint8_t *u_dst = plane_ptr[0].dst.buf + offset;
+        uint8_t *v_dst = plane_ptr[1].dst.buf + offset;
+        filter_vert_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params,
+                           use_filter_type);
+      } else {
+        uint8_t *dst_ptr = plane_ptr->dst.buf + offset;
+        filter_vert(dst_ptr, dst_stride, params, cm->seq_params,
+                    use_filter_type);
+      }
+
+      // advance the destination pointer
+      const uint32_t advance_units = tx_size_wide_unit[*tx_size];
+      x += advance_units;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+    y += y_inc;
+  }
+}
+
+static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride,
+                                   const AV1_DEBLOCKING_PARAMETERS *params,
+                                   const SequenceHeader *seq_params,
+                                   USE_FILTER_TYPE use_filter_type) {
+  const loop_filter_thresh *limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
+    if (use_filter_type == USE_QUAD) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+              limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+              limits->hev_thr, bit_depth);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_highbd_lpf_horizontal_8_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_horizontal_8_dual(
+              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+              limits->hev_thr, bit_depth);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_highbd_lpf_horizontal_14_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          aom_highbd_lpf_horizontal_14_dual(
+              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+              limits->hev_thr, bit_depth);
+          break;
+        // no filtering
+        default: break;
+      }
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_highbd_lpf_horizontal_8_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_highbd_lpf_horizontal_14_dual(
+              dst_shortptr, dst_stride, limits->mblim, limits->lim,
+              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+              bit_depth);
+          break;
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim,
+                                      limits->lim, limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim,
+                                      limits->lim, limits->hev_thr, bit_depth);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim,
+                                      limits->lim, limits->hev_thr, bit_depth);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim,
+                                       limits->lim, limits->hev_thr, bit_depth);
+          break;
+        // no filtering
+        default: break;
+      }
+    }
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim,
+                                   limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                  limits->hev_thr, limits->mblim, limits->lim,
+                                  limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim,
+                                   limits->hev_thr, limits->mblim, limits->lim,
+                                   limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim,
+                             limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim,
+                             limits->hev_thr);
+        break;
+      // apply 8-tap filtering
+      case 8:
+        aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim,
+                             limits->hev_thr);
+        break;
+      // apply 14-tap filtering
+      case 14:
+        aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim,
+                              limits->hev_thr);
+        break;
+      // no filtering
+      default: break;
+    }
+  }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+static AOM_INLINE void filter_horz_chroma(
+    uint8_t *u_dst, uint8_t *v_dst, int dst_stride,
+    const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params,
+    USE_FILTER_TYPE use_filter_type) {
+  const loop_filter_thresh *u_limits = params->lfthr;
+  const loop_filter_thresh *v_limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst);
+    uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst);
+    if (use_filter_type == USE_QUAD) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim,
+              u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim,
+              v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim,
+              u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim,
+              v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 8:
+        case 14: assert(0);
+        // no filtering
+        default: break;
+      }
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 8:
+        case 14: assert(0);
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4(u_dst_shortptr, dst_stride,
+                                      u_limits->mblim, u_limits->lim,
+                                      u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4(v_dst_shortptr, dst_stride,
+                                      v_limits->mblim, v_limits->lim,
+                                      v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6(u_dst_shortptr, dst_stride,
+                                      u_limits->mblim, u_limits->lim,
+                                      u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6(v_dst_shortptr, dst_stride,
+                                      v_limits->mblim, v_limits->lim,
+                                      v_limits->hev_thr, bit_depth);
+          break;
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    }
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_quad(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr);
+        aom_lpf_horizontal_4_quad(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_quad(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr);
+        aom_lpf_horizontal_6_quad(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0);
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_dual(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr,
+                                  u_limits->mblim, u_limits->lim,
+                                  u_limits->hev_thr);
+        aom_lpf_horizontal_4_dual(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr,
+                                  v_limits->mblim, v_limits->lim,
+                                  v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_dual(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr,
+                                  u_limits->mblim, u_limits->lim,
+                                  u_limits->hev_thr);
+        aom_lpf_horizontal_6_dual(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr,
+                                  v_limits->mblim, v_limits->lim,
+                                  v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0);
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                             u_limits->hev_thr);
+        aom_lpf_horizontal_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                             u_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                             u_limits->hev_thr);
+        aom_lpf_horizontal_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                             v_limits->hev_thr);
+        break;
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_params.mi_rows >> scale_vert;
-  const int x_range = cm->mi_params.mi_cols >> scale_horz;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
   for (int x = 0; x < x_range; x++) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
@@ -648,6 +1933,8 @@ void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
         tx_size = TX_4X4;
       }
 
+      filter_horz(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
+
       // advance the destination pointer
       advance_units = tx_size_high_unit[tx_size];
       y += advance_units;
@@ -656,135 +1943,149 @@ void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
   }
 }
 
-static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
-                             MACROBLOCKD *xd, int start, int stop,
-#if CONFIG_LPF_MASK
-                             int is_decoding,
-#endif
-                             int plane_start, int plane_end) {
-  struct macroblockd_plane *pd = xd->plane;
-  const int col_start = 0;
-  const int col_end = cm->mi_params.mi_cols;
-  int mi_row, mi_col;
-  int plane;
+void av1_filter_block_plane_horz_opt(const AV1_COMMON *const cm,
+                                     const MACROBLOCKD *const xd,
+                                     const MACROBLOCKD_PLANE *const plane_ptr,
+                                     const uint32_t mi_row,
+                                     const uint32_t mi_col,
+                                     AV1_DEBLOCKING_PARAMETERS *params_buf,
+                                     TX_SIZE *tx_buf) {
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int plane_mi_cols =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
+  const int plane_mi_rows =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
+  const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), MAX_MIB_SIZE);
+  const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
+
+  const ptrdiff_t mode_step = cm->mi_params.mi_stride;
+  for (int x = 0; x < x_range; x++) {
+    const uint32_t curr_x = mi_col + x;
+    const uint32_t y_start = mi_row;
+    const uint32_t y_end = mi_row + y_range;
+    int min_block_width = block_size_high[BLOCK_128X128];
+    set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, HORZ_EDGE,
+                                     curr_x, y_start, plane_ptr, y_end,
+                                     mode_step, &min_block_width);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    USE_FILTER_TYPE filter_type = USE_SINGLE;
 
-#if CONFIG_LPF_MASK
-  if (is_decoding) {
-    cm->is_decoding = is_decoding;
-    for (plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
-                           plane, plane + 1);
-
-      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
-      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
-
-      // apply loop filtering which only goes through buffer once
-      for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
-          av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col,
-                               plane, plane + 1);
-          av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
-                                              mi_col);
-          if (mi_col - MI_SIZE_64X64 >= 0) {
-            av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                                 mi_col - MI_SIZE_64X64, plane, plane + 1);
-            av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
-                                                mi_col - MI_SIZE_64X64);
-          }
-        }
-        av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                             mi_col - MI_SIZE_64X64, plane, plane + 1);
-        av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
-                                            mi_col - MI_SIZE_64X64);
+    uint8_t *p = dst_ptr + x * MI_SIZE;
+
+    if ((x & 3) == 0 && (x + 3) < x_range && min_block_width >= 16) {
+      // If we are on a col which is a multiple of 4, and the minimum width is
+      // 16 pixels, then the current and right 3 cols must contain the same
+      // prediction block. This is because dim 16 can only happen every unit of
+      // 4 mi's.
+      filter_type = USE_QUAD;
+      x += 3;
+    } else if ((x + 1) < x_range && min_block_width >= 8) {
+      filter_type = USE_DUAL;
+      x += 1;
+    }
+
+    for (int y = 0; y < y_range;) {
+      if (*tx_size == TX_INVALID) {
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
       }
+
+      filter_horz(p, dst_stride, params, cm->seq_params, filter_type);
+
+      // advance the destination pointer
+      const uint32_t advance_units = tx_size_high_unit[*tx_size];
+      y += advance_units;
+      p += advance_units * dst_stride * MI_SIZE;
+      params += advance_units;
+      tx_size += advance_units;
     }
-    return;
   }
-#endif
+}
 
-  for (plane = plane_start; plane < plane_end; plane++) {
-    if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-      break;
-    else if (plane == 1 && !(cm->lf.filter_level_u))
-      continue;
-    else if (plane == 2 && !(cm->lf.filter_level_v))
-      continue;
+void av1_filter_block_plane_horz_opt_chroma(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int mi_cols =
+      ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int mi_rows =
+      ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
+  const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert;
+  for (int x = 0; x < x_range; x++) {
+    const uint32_t y_start = mi_row + (0 << scale_vert);
+    const uint32_t curr_x = mi_col + (x << scale_horz);
+    const uint32_t y_end = mi_row + (y_range << scale_vert);
+    int min_width = tx_size_wide[TX_64X64];
+    set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, HORZ_EDGE,
+                                       curr_x, y_start, plane_ptr, y_end,
+                                       mode_step, scale_horz, scale_vert,
+                                       &min_width, plane, joint_filter_chroma);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    USE_FILTER_TYPE use_filter_type = USE_SINGLE;
+    int x_inc = 0;
+
+    if ((x & 3) == 0 && (x + 3) < x_range && min_width >= 16) {
+      // If we are on a col which is a multiple of 4, and the minimum width is
+      // 16 pixels, then the current and right 3 cols must contain the same tx
+      // block. This is because dim 16 can only happen every unit of 4 mi's.
+      use_filter_type = USE_QUAD;
+      x_inc = 3;
+    } else if (x % 2 == 0 && (x + 1) < x_range && min_width >= 8) {
+      // If we are on an even col, and the minimum width is 8 pixels, then the
+      // current and left cols must contain the same tx block. This is because
+      // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1,
+      // etc.
+      use_filter_type = USE_DUAL;
+      x_inc = 1;
+    }
 
-    if (cm->lf.combine_vert_horz_lf) {
-      // filter all vertical and horizontal edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          // filter vertical edges
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-          // filter horizontal edges
-          if (mi_col - MAX_MIB_SIZE >= 0) {
-            av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
-                                 mi_row, mi_col - MAX_MIB_SIZE, plane,
-                                 plane + 1);
-            av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                        mi_col - MAX_MIB_SIZE);
-          }
-        }
-        // filter horizontal edges
-        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                             mi_col - MAX_MIB_SIZE, plane, plane + 1);
-        av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                    mi_col - MAX_MIB_SIZE);
-      }
-    } else {
-      // filter all vertical edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-        }
+    for (int y = 0; y < y_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will first filter the vertical edge aligned with a 8x8
+      // block. If 4x4 transform is used, it will then filter the internal
+      // edge aligned with a 4x4 block
+      if (*tx_size == TX_INVALID) {
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
       }
 
-      // filter all horizontal edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-        }
+      const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE;
+      if (joint_filter_chroma) {
+        uint8_t *u_dst = plane_ptr[0].dst.buf + offset;
+        uint8_t *v_dst = plane_ptr[1].dst.buf + offset;
+        filter_horz_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params,
+                           use_filter_type);
+      } else {
+        uint8_t *dst_ptr = plane_ptr->dst.buf + offset;
+        filter_horz(dst_ptr, dst_stride, params, cm->seq_params,
+                    use_filter_type);
       }
-    }
-  }
-}
 
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                           MACROBLOCKD *xd,
-#if CONFIG_LPF_MASK
-                           int is_decoding,
-#endif
-                           int plane_start, int plane_end, int partial_frame) {
-  int start_mi_row, end_mi_row, mi_rows_to_filter;
-
-  start_mi_row = 0;
-  mi_rows_to_filter = cm->mi_params.mi_rows;
-  if (partial_frame && cm->mi_params.mi_rows > 8) {
-    start_mi_row = cm->mi_params.mi_rows >> 1;
-    start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
+      // advance the destination pointer
+      const int advance_units = tx_size_high_unit[*tx_size];
+      y += advance_units;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+    x += x_inc;
   }
-  end_mi_row = start_mi_row + mi_rows_to_filter;
-  av1_loop_filter_frame_init(cm, plane_start, plane_end);
-  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
-#if CONFIG_LPF_MASK
-                   is_decoding,
-#endif
-                   plane_start, plane_end);
 }
diff --git a/media/libaom/src/av1/common/av1_loopfilter.h b/media/libaom/src/av1/common/av1_loopfilter.h
index ce26d16477..43bd780eb5 100644
--- a/media/libaom/src/av1/common/av1_loopfilter.h
+++ b/media/libaom/src/av1/common/av1_loopfilter.h
@@ -33,52 +33,12 @@ enum lf_path {
   LF_PATH_SLOW,
 };
 
+/*!\cond */
 enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
 typedef struct {
   uint64_t bits[4];
 } FilterMask;
 
-#if CONFIG_LPF_MASK
-// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
-// Each 1 bit represents a position in which we want to apply the loop filter.
-// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
-// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use
-// a uint64_t to represent bitmask.
-// Left_ entries refer to whether we apply a filter on the border to the
-// left of the block.   Above_ entries refer to whether or not to apply a
-// filter on the above border.
-// Since each transform is accompanied by a potentially different type of
-// loop filter there is a different entry in the array for each transform size.
-typedef struct {
-  FilterMask left_y[TX_SIZES];
-  FilterMask above_y[TX_SIZES];
-  FilterMask left_u[TX_SIZES];
-  FilterMask above_u[TX_SIZES];
-  FilterMask left_v[TX_SIZES];
-  FilterMask above_v[TX_SIZES];
-
-  // Y plane vertical edge and horizontal edge filter level
-  uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // U plane filter level
-  uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // V plane filter level
-  uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // other info
-  FilterMask skip;
-  FilterMask is_vert_border;
-  FilterMask is_horz_border;
-  // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
-  FilterMask tx_size_ver[2][5];
-  FilterMask tx_size_hor[2][5];
-} LoopFilterMask;
-#endif  // CONFIG_LPF_MASK
-
 struct loopfilter {
   int filter_level[2];
   int filter_level_u;
@@ -95,14 +55,6 @@ struct loopfilter {
 
   // 0 = ZERO_MV, MV
   int8_t mode_deltas[MAX_MODE_LF_DELTAS];
-
-  int combine_vert_horz_lf;
-
-#if CONFIG_LPF_MASK
-  LoopFilterMask *lfm;
-  size_t lfm_num;
-  int lfm_stride;
-#endif  // CONFIG_LPF_MASK
 };
 
 // Need to align this structure so when it is declared and
@@ -118,6 +70,26 @@ typedef struct {
   uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
 } loop_filter_info_n;
 
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+  // length of the filter applied to the outer edge
+  uint8_t filter_length;
+  // deblocking limits
+  const loop_filter_thresh *lfthr;
+} AV1_DEBLOCKING_PARAMETERS;
+
+typedef struct LoopFilterWorkerData {
+  YV12_BUFFER_CONFIG *frame_buffer;
+  struct AV1Common *cm;
+  struct macroblockd_plane planes[MAX_MB_PLANE];
+  // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+  // add lossless as a member here.
+  MACROBLOCKD *xd;
+
+  AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
+  TX_SIZE tx_buf[MAX_MIB_SIZE];
+} LFWorkerData;
+/*!\endcond */
+
 /* assorted loopfilter functions which get used elsewhere */
 struct AV1Common;
 struct macroblockd;
@@ -128,16 +100,6 @@ void av1_loop_filter_init(struct AV1Common *cm);
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
-#if CONFIG_LPF_MASK
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *xd, int is_decoding,
-                           int plane_start, int plane_end, int partial_frame);
-#else
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *xd, int plane_start,
-                           int plane_end, int partial_frame);
-#endif
-
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
@@ -148,58 +110,37 @@ void av1_filter_block_plane_horz(const struct AV1Common *const cm,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col);
 
-typedef struct LoopFilterWorkerData {
-  YV12_BUFFER_CONFIG *frame_buffer;
-  struct AV1Common *cm;
-  struct macroblockd_plane planes[MAX_MB_PLANE];
-  // TODO(Ranjit): When the filter functions are modified to use xd->lossless
-  // add lossless as a member here.
-  MACROBLOCKD *xd;
-} LFWorkerData;
+void av1_filter_block_plane_vert_opt(const struct AV1Common *const cm,
+                                     const MACROBLOCKD *const xd,
+                                     const MACROBLOCKD_PLANE *const plane_ptr,
+                                     const uint32_t mi_row,
+                                     const uint32_t mi_col,
+                                     AV1_DEBLOCKING_PARAMETERS *params_buf,
+                                     TX_SIZE *tx_buf);
+
+void av1_filter_block_plane_vert_opt_chroma(
+    const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma);
+
+void av1_filter_block_plane_horz_opt(const struct AV1Common *const cm,
+                                     const MACROBLOCKD *const xd,
+                                     const MACROBLOCKD_PLANE *const plane_ptr,
+                                     const uint32_t mi_row,
+                                     const uint32_t mi_col,
+                                     AV1_DEBLOCKING_PARAMETERS *params_buf,
+                                     TX_SIZE *tx_buf);
+
+void av1_filter_block_plane_horz_opt_chroma(
+    const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma);
 
 uint8_t av1_get_filter_level(const struct AV1Common *cm,
                              const loop_filter_info_n *lfi_n, const int dir_idx,
                              int plane, const MB_MODE_INFO *mbmi);
-#if CONFIG_LPF_MASK
-void av1_filter_block_plane_ver(struct AV1Common *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col);
-
-void av1_filter_block_plane_hor(struct AV1Common *const cm,
-                                struct macroblockd_plane *const plane, int pl,
-                                int mi_row, int mi_col);
-
-int get_index_shift(int mi_col, int mi_row, int *index);
-
-void av1_build_bitmask_vert_info(
-    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane);
-
-void av1_build_bitmask_horz_info(
-    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane);
-
-void av1_filter_block_plane_bitmask_vert(
-    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
-    int pl, int mi_row, int mi_col);
-
-void av1_filter_block_plane_bitmask_horz(
-    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
-    int pl, int mi_row, int mi_col);
-
-void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row,
-                                     int mi_col, BLOCK_SIZE bsize,
-                                     MB_MODE_INFO *mbmi);
-
-void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col,
-                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
-                                  int is_horz_coding_block_border,
-                                  int is_vert_coding_block_border);
-
-void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             MB_MODE_INFO *mbmi);
-#endif  // CONFIG_LPF_MASK
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/common/av1_rtcd.c b/media/libaom/src/av1/common/av1_rtcd.c
index a77a4d2541..c484166047 100644
--- a/media/libaom/src/av1/common/av1_rtcd.c
+++ b/media/libaom/src/av1/common/av1_rtcd.c
@@ -15,8 +15,4 @@
 
 #include "aom_ports/aom_once.h"
 
-void av1_rtcd() {
-  // TODO(JBB): Remove this aom_once, by insuring that both the encoder and
-  // decoder setup functions are protected by aom_once();
-  aom_once(setup_rtcd_internal);
-}
+void av1_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/media/libaom/src/av1/common/av1_rtcd_defs.pl b/media/libaom/src/av1/common/av1_rtcd_defs.pl
index 296c6c572d..c9e87e3db3 100644
--- a/media/libaom/src/av1/common/av1_rtcd_defs.pl
+++ b/media/libaom/src/av1/common/av1_rtcd_defs.pl
@@ -15,6 +15,7 @@ print <<EOF
  */
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/txfm_common.h"
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
@@ -22,7 +23,6 @@ print <<EOF
 #include "av1/common/filter.h"
 #include "av1/common/convolve.h"
 #include "av1/common/av1_txfm.h"
-#include "av1/common/odintrin.h"
 #include "av1/common/restoration.h"
 
 struct macroblockd;
@@ -103,56 +103,120 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
 
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
-specialize qw/av1_dr_prediction_z1 avx2/;
+specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
-specialize qw/av1_dr_prediction_z2 avx2/;
+specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
 add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
-specialize qw/av1_dr_prediction_z3 avx2/;
+specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/;
 
 # FILTER_INTRA predictor functions
 add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
-specialize qw/av1_filter_intra_predictor sse4_1/;
+specialize qw/av1_filter_intra_predictor sse4_1 neon/;
 
 # High bitdepth functions
 
 #
 # Sub Pixel Filters
 #
-add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
 
-add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
 
-add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
+  add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
 
-add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
+  add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
 
-add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+  add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+}
 
 #inv txfm
 add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
 
 add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
 
 add_proto qw/void av1_highbd_inv_txfm_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1 neon/;
 add_proto qw/void av1_highbd_inv_txfm_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x16  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x32/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x8/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x64/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32  neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64  neon/;
+
+add_proto qw/void av1_inv_txfm2d_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x16 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x16  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x8  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x16  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x32  neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x8  neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x64  neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x16  neon/;
 
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/av1_highbd_iwht4x4_16_add  sse4_1/;
 
 add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -183,6 +247,7 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z1 avx2/;
   add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+
   specialize qw/av1_highbd_dr_prediction_z2 avx2/;
   add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
   specialize qw/av1_highbd_dr_prediction_z3 avx2/;
@@ -192,8 +257,10 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
 add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
 specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
 
-add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
+  specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+}
 
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
 specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
@@ -202,12 +269,59 @@ specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
 add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
 specialize "av1_round_shift_array", qw/sse4_1 neon/;
 
+# Resize functions.
+add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+specialize qw/av1_resize_and_extend_frame ssse3 neon/;
+
 #
 # Encoder functions below this point.
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   # ENCODEMB INVOKE
+  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
+  specialize qw/aom_upsampled_pred sse2/;
+  #
+  #
+  #
+  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                   int ref_stride, int subpel_search";
+  specialize qw/aom_comp_avg_upsampled_pred sse2/;
+
+  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
+
+  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+                                                       int subpel_search";
+  specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+                                                   int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_upsampled_pred sse2/;
+
+    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                            const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                            int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+
+    add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                                const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                                int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                                int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
+  }
 
   # the transform coefficients are held in 32-bit
   # values, so the assembler code for  av1_block_error can no longer be used.
@@ -215,85 +329,96 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/av1_block_error sse2 avx2 neon/;
 
   add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
-  specialize qw/av1_block_error_lp avx2 neon/;
+  specialize qw/av1_block_error_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp sse2 avx2 neon/;
 
-  add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan";
-  specialize qw/av1_quantize_lp avx2 neon/;
-
+  add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/av1_quantize_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_fp_32x32 avx2/;
+  specialize qw/av1_quantize_fp_32x32 neon avx2/;
 
   add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_fp_64x64 avx2/;
+  specialize qw/av1_quantize_fp_64x64 neon avx2/;
+
+  add_proto qw/void aom_quantize_b_helper/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale";
+  specialize qw/aom_quantize_b_helper neon/;
 
   # fdct functions
 
   add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_fwht4x4 sse4_1 neon/;
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
-  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
+  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/;
 
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_8x4 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x32 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x16 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x16 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x4 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x32 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x8 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
+
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_4x4 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
 
   add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
+  specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2 neon/;
   add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x64 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_32x64 sse4_1 neon/;
   add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x32 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x64 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x16 sse4_1/;
+  specialize qw/av1_fwd_txfm2d_64x32 sse4_1 neon/;
+  add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  specialize qw/av1_fwd_txfm2d_16x4 sse4_1 neon/;
 
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+    specialize qw/av1_fwd_txfm2d_4x16 sse4_1 neon/;
+    add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+    specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
+    add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+    specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
+    add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+    specialize qw/av1_fwd_txfm2d_16x64 sse4_1 neon/;
+    add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+    specialize qw/av1_fwd_txfm2d_64x16 sse4_1 neon/;
+  }
   #
   # Motion search
   #
-  add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
-
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
+    add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_apply_temporal_filter sse2 avx2/;
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+      specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
+    }
   }
 
-  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
-  }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
+  add_proto qw/void av1_calc_indices_dim1/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+  specialize qw/av1_calc_indices_dim1 sse2 avx2/;
+
+  # TODO(any): Disable av1_calc_indices_dim2 sse2 version due to c/SIMD mismatch. Re-enable it after mismatch is fixed.
+  add_proto qw/void av1_calc_indices_dim2/, "const int *data, const int *centroids, uint8_t *indices, int n, int k";
+  specialize qw/av1_calc_indices_dim2 avx2/;
+
   # ENCODEMB INVOKE
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
@@ -302,18 +427,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-    specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+    specialize qw/av1_highbd_quantize_fp sse4_1 avx2 neon/;
   }
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_highbd_fwht4x4 sse4_1 neon/;
 
   # End av1_high encoder functions
 
   # txb
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
-  specialize qw/av1_get_nz_map_contexts sse2/;
+  specialize qw/av1_get_nz_map_contexts sse2 neon/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
-  specialize qw/av1_txb_init_levels sse4_1 avx2/;
+  specialize qw/av1_txb_init_levels sse4_1 avx2 neon/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -326,45 +452,72 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
-  add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
-  specialize qw/av1_compute_stats sse4_1 avx2/;
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
-    specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
+    specialize qw/av1_compute_stats sse4_1 avx2/;
+    add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+    specialize qw/av1_calc_proj_params sse4_1 avx2/;
+    add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+    specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
+
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+      specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2/;
+      add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+      specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
+      add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+      specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+    }
   }
 
-  add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
-  specialize qw/av1_calc_proj_params avx2/;
+  add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
+  specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
 
-  add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-  specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
+  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
 
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-    specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
+  add_proto qw/void av1_nn_fast_softmax_16/, " const float *input_nodes, float *output";
+  if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
+    specialize qw/av1_nn_predict sse3 neon/;
+    specialize qw/av1_nn_fast_softmax_16 sse3/;
   }
-  add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
-  specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
 
-  add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
-  specialize qw/av1_nn_predict sse3/;
+  # CNN functions
+  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+    add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
+    add_proto qw/bool av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+    add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
+    if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
+      specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/;
+    }
+    add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+    add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
+  }
+
+  # Temporal Denoiser
+  if (aom_config("CONFIG_AV1_TEMPORAL_DENOISING") eq "yes") {
+    add_proto qw/int av1_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+    specialize qw/av1_denoiser_filter neon sse2/;
+  }
 }
 # end encoder functions
 
-# CNN functions
-
-add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
-add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
-add_proto qw/void av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
-add_proto qw/void av1_cnn_convolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step";
-add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
-add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
 
 # Deringing Functions
 
 add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
-add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift";
+add_proto qw/void cdef_find_dir_dual/, "const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2";
+
+# 8 bit dst
+add_proto qw/void cdef_filter_8_0/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_1/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_2/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_3/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+# 16 bit dst
+add_proto qw/void cdef_filter_16_0/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_1/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_2/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_3/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
 
 add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
 add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
@@ -374,21 +527,31 @@ add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride,
 # hard to support, so optimizations for this target are disabled.
 if ($opts{config} !~ /libs-x86-win32-vs.*/) {
   specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
-  specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_find_dir_dual sse2 ssse3 sse4_1 avx2 neon/;
+
+  specialize qw/cdef_filter_8_0 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_8_1 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_8_2 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_8_3 sse2 ssse3 sse4_1 avx2 neon/;
+
+  specialize qw/cdef_filter_16_0 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_16_1 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_16_2 sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_16_3 sse2 ssse3 sse4_1 avx2 neon/;
+
   specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
   specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
-
-add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 avx2 neon/;
-
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_highbd_warp_affine sse4_1/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
 }
 
+add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_warp_affine sse4_1 avx2 neon/;
+
 add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
 specialize qw/av1_calc_frame_error sse2 avx2/;
 
@@ -398,41 +561,37 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 }
 
 # LOOP_RESTORATION functions
-
 add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
 specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
-                                 int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
-                                 int sgr_params_idx, int bit_depth, int highbd";
+                                int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                int sgr_params_idx, int bit_depth, int highbd";
 specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
 add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
 add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
 if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
   add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
-  add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+  add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd";
   add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
 }
 
   add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
 
   specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
-  specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
   specialize qw/av1_convolve_2d_scale sse4_1/;
@@ -445,7 +604,6 @@ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
     specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
     specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
-    specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
     specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
     specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
     specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
diff --git a/media/libaom/src/av1/common/av1_txfm.h b/media/libaom/src/av1/common/av1_txfm.h
index 20049b6806..be1164f8bb 100644
--- a/media/libaom/src/av1/common/av1_txfm.h
+++ b/media/libaom/src/av1/common/av1_txfm.h
@@ -81,7 +81,7 @@ static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                                int bit) {
   int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
   int64_t intermediate = result_64 + (1LL << (bit - 1));
-  // NOTE(david.barker): The value 'result_64' may not necessarily fit
+  // NOTE(rachelbarker): The value 'result_64' may not necessarily fit
   // into 32 bits. However, the result of this function is nominally
   // ROUND_POWER_OF_TWO_64(result_64, bit)
   // and that is required to fit into stage_range[stage] many bits
diff --git a/media/libaom/src/av1/common/blockd.c b/media/libaom/src/av1/common/blockd.c
index 00725ea2d6..1d597502ce 100644
--- a/media/libaom/src/av1/common/blockd.c
+++ b/media/libaom/src/av1/common/blockd.c
@@ -11,8 +11,6 @@
 
 #include <math.h>
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 
diff --git a/media/libaom/src/av1/common/blockd.h b/media/libaom/src/av1/common/blockd.h
index 47597bc834..b2e72d2e46 100644
--- a/media/libaom/src/av1/common/blockd.h
+++ b/media/libaom/src/av1/common/blockd.h
@@ -39,6 +39,14 @@ extern "C" {
 
 #define INTERINTRA_WEDGE_SIGN 0
 
+#define DEFAULT_INTER_TX_TYPE DCT_DCT
+
+#define MAX_PALETTE_BLOCK_WIDTH 64
+
+#define MAX_PALETTE_BLOCK_HEIGHT 64
+
+/*!\cond */
+
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
 enum {
   DIFFWTD_38 = 0,
@@ -188,15 +196,10 @@ typedef struct RD_STATS {
   // rate/dist.
   int64_t rdcost;
   int64_t sse;
-  int skip;  // sse should equal to dist when skip == 1
+  int skip_txfm;  // sse should equal to dist when skip_txfm == 1
   int zero_rate;
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
-  // TODO(jingning): Temporary solution to silence stack over-size warning
-  // in handle_inter_mode. This should be fixed after rate-distortion
-  // optimization refactoring.
-  int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
-                            [TXB_COEFF_COST_MAP_SIZE];
 #endif  // CONFIG_RD_DEBUG
 } RD_STATS;
 
@@ -212,64 +215,136 @@ typedef struct {
 
 #define INTER_TX_SIZE_BUF_LEN 16
 #define TXK_TYPE_BUF_LEN 64
-// This structure now relates to 4x4 block regions.
+/*!\endcond */
+
+/*! \brief Stores the prediction/txfm mode of the current coding block
+ */
 typedef struct MB_MODE_INFO {
-  // interinter members
-  INTERINTER_COMPOUND_DATA interinter_comp;
-  WarpedMotionParams wm_params;
-  int_mv mv[2];
-  int current_qindex;
-  // Only for INTER blocks
-  int_interpfilters interp_filters;
-  // TODO(debargha): Consolidate these flags
-#if CONFIG_RD_DEBUG
-  RD_STATS rd_stats;
-  int mi_row;
-  int mi_col;
-#endif
-#if CONFIG_INSPECTION
-  int16_t tx_skip[TXK_TYPE_BUF_LEN];
-#endif
-  PALETTE_MODE_INFO palette_mode_info;
-  // Common for both INTER and INTRA blocks
-  BLOCK_SIZE sb_type;
+  /*****************************************************************************
+   * \name General Info of the Coding Block
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The block size of the current coding block */
+  BLOCK_SIZE bsize;
+  /*! \brief The partition type of the current coding block. */
+  PARTITION_TYPE partition;
+  /*! \brief The prediction mode used */
   PREDICTION_MODE mode;
-  // Only for INTRA blocks
+  /*! \brief The UV mode when intra is used */
   UV_PREDICTION_MODE uv_mode;
-  // interintra members
-  INTERINTRA_MODE interintra_mode;
-  MOTION_MODE motion_mode;
-  PARTITION_TYPE partition;
+  /*! \brief The q index for the current coding block. */
+  int current_qindex;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The motion vectors used by the current inter mode */
+  int_mv mv[2];
+  /*! \brief The reference frames for the MV */
   MV_REFERENCE_FRAME ref_frame[2];
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  int8_t skip;
-  uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
-  TX_SIZE tx_size;
-  int8_t delta_lf_from_base;
-  int8_t delta_lf[FRAME_LF_COUNT];
+  /*! \brief Filter used in subpel interpolation. */
+  int_interpfilters interp_filters;
+  /*! \brief The motion mode used by the inter prediction. */
+  MOTION_MODE motion_mode;
+  /*! \brief Number of samples used by warp causal */
+  uint8_t num_proj_ref;
+  /*! \brief The number of overlapped neighbors above/left for obmc/warp motion
+   * mode. */
+  uint8_t overlappable_neighbors;
+  /*! \brief The parameters used in warp motion mode. */
+  WarpedMotionParams wm_params;
+  /*! \brief The type of intra mode used by inter-intra */
+  INTERINTRA_MODE interintra_mode;
+  /*! \brief The type of wedge used in interintra mode. */
   int8_t interintra_wedge_index;
-  // The actual prediction angle is the base angle + (angle_delta * step).
+  /*! \brief Struct that stores the data used in interinter compound mode. */
+  INTERINTER_COMPOUND_DATA interinter_comp;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Directional mode delta: the angle is base angle + (angle_delta *
+   * step). */
   int8_t angle_delta[PLANE_TYPES];
-  /* deringing gain *per-superblock* */
-  // Joint sign of alpha Cb and alpha Cr
+  /*! \brief The type of filter intra mode used (if applicable). */
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */
   int8_t cfl_alpha_signs;
-  // Index of the alpha Cb and alpha Cr combination
+  /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */
   uint8_t cfl_alpha_idx;
-  uint8_t num_proj_ref;
-  uint8_t overlappable_neighbors[2];
-  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
-  uint8_t compound_idx;
-  uint8_t use_wedge_interintra : 1;
+  /*! \brief Stores the size and colors of palette mode */
+  PALETTE_MODE_INFO palette_mode_info;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Transform Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Whether to skip transforming and sending. */
+  int8_t skip_txfm;
+  /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
+  TX_SIZE tx_size;
+  /*! \brief Transform size when recursive txfm tree is on. */
+  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Loop Filter Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \copydoc MACROBLOCKD::delta_lf_from_base */
+  int8_t delta_lf_from_base;
+  /*! \copydoc MACROBLOCKD::delta_lf */
+  int8_t delta_lf[FRAME_LF_COUNT];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Bitfield for Memory Reduction
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The segment id */
   uint8_t segment_id : 3;
-  uint8_t seg_id_predicted : 1;  // valid only when temporal_update is enabled
+  /*! \brief Only valid when temporal update if off. */
+  uint8_t seg_id_predicted : 1;
+  /*! \brief Which ref_mv to use */
+  uint8_t ref_mv_idx : 2;
+  /*! \brief Inter skip mode */
   uint8_t skip_mode : 1;
+  /*! \brief Whether intrabc is used. */
   uint8_t use_intrabc : 1;
-  uint8_t ref_mv_idx : 2;
-  // Indicate if masked compound is used(1) or not(0).
+  /*! \brief Indicates if masked compound is used(1) or not (0). */
   uint8_t comp_group_idx : 1;
+  /*! \brief Indicates whether dist_wtd_comp(0) is used or not (0). */
+  uint8_t compound_idx : 1;
+  /*! \brief Whether to use interintra wedge */
+  uint8_t use_wedge_interintra : 1;
+  /*! \brief CDEF strength per BLOCK_64X64 */
   int8_t cdef_strength : 4;
+  /**@}*/
+
+  /*! \brief Skip CDEF for this superblock */
+  uint8_t skip_cdef_curr_sb;
+
+#if CONFIG_RD_DEBUG
+  /*! \brief RD info used for debugging */
+  RD_STATS rd_stats;
+  /*! \brief The current row in unit of 4x4 blocks for debugging */
+  int mi_row;
+  /*! \brief The current col in unit of 4x4 blocks for debugging */
+  int mi_col;
+#endif
+#if CONFIG_INSPECTION
+  /*! \brief Whether we are skipping the current rows or columns. */
+  int16_t tx_skip[TXK_TYPE_BUF_LEN];
+#endif
 } MB_MODE_INFO;
 
+/*!\cond */
+
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
   return mbmi->use_intrabc;
 }
@@ -349,7 +424,7 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi);
 static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
                                      TransformationType type) {
   const PREDICTION_MODE mode = mbmi->mode;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int block_size_allowed =
       AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
   return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
@@ -390,9 +465,6 @@ typedef struct {
 } CB_BUFFER;
 
 typedef struct macroblockd_plane {
-  tran_low_t *dqcoeff;
-  tran_low_t *dqcoeff_block;
-  eob_info *eob_data;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -405,6 +477,9 @@ typedef struct macroblockd_plane {
   // dequantization process.  They have the same coefficient
   // shift/scale as TX.
   int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+  // Pointer to color index map of:
+  // - Current coding block, on encoder side.
+  // - Current superblock, on decoder side.
   uint8_t *color_index_map;
 
   // block size in pixels
@@ -416,16 +491,36 @@ typedef struct macroblockd_plane {
 
 #define BLOCK_OFFSET(i) ((i) << 4)
 
+/*!\endcond */
+
+/*!\brief Parameters related to Wiener Filter */
 typedef struct {
+  /*!
+   * Vertical filter kernel.
+   */
   DECLARE_ALIGNED(16, InterpKernel, vfilter);
+
+  /*!
+   * Horizontal filter kernel.
+   */
   DECLARE_ALIGNED(16, InterpKernel, hfilter);
 } WienerInfo;
 
+/*!\brief Parameters related to Sgrproj Filter */
 typedef struct {
+  /*!
+   * Parameter index.
+   */
   int ep;
+
+  /*!
+   * Weights for linear combination of filtered versions
+   */
   int xqd[2];
 } SgrprojInfo;
 
+/*!\cond */
+
 #if CONFIG_DEBUG
 #define CFL_SUB8X8_VAL_MI_SIZE (4)
 #define CFL_SUB8X8_VAL_MI_SQUARE \
@@ -461,10 +556,6 @@ typedef struct cfl_ctx {
 
   // Whether the reconstructed luma pixels need to be stored
   int store_y;
-
-#if CONFIG_DEBUG
-  int rate;
-#endif  // CONFIG_DEBUG
 } CFL_CTX;
 
 typedef struct dist_wtd_comp_params {
@@ -475,212 +566,413 @@ typedef struct dist_wtd_comp_params {
 
 struct scale_factors;
 
-// Most/all of the pointers are mere pointers to actual arrays are allocated
-// elsewhere. This is mostly for coding convenience.
+/*!\endcond */
+
+/*! \brief Variables related to current coding block.
+ *
+ * This is a common set of variables used by both encoder and decoder.
+ * Most/all of the pointers are mere pointers to actual arrays are allocated
+ * elsewhere. This is mostly for coding convenience.
+ */
 typedef struct macroblockd {
-  // Row and column position of current macroblock in mi units.
-  int mi_row;
-  int mi_col;
-  // Same as cm->mi_params.mi_stride, copied here for convenience.
+  /**
+   * \name Position of current macroblock in mi units
+   */
+  /**@{*/
+  int mi_row; /*!< Row position in mi units. */
+  int mi_col; /*!< Column position in mi units. */
+  /**@}*/
+
+  /*!
+   * Same as cm->mi_params.mi_stride, copied here for convenience.
+   */
   int mi_stride;
 
-  // True if current block transmits chroma information.
-  // More detail:
-  // Smallest supported block size for both luma and chroma plane is 4x4. Hence,
-  // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
-  // blocks smaller than 8x8 maybe combined into one chroma block.
-  // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
-  // luma blocks. Then, a single chroma block of size 4x4 will cover the area of
-  // these four luma blocks. This is implemented in bitstream as follows:
-  // - There are four MB_MODE_INFO structs for the four luma blocks.
-  // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
-  // any information for chroma planes.
-  // - Last block will have is_chroma_ref = true and transmits chroma
-  // information for the 4x4 chroma block that covers whole 8x8 area covered by
-  // four luma blocks.
-  // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+  /*!
+   * True if current block transmits chroma information.
+   * More detail:
+   * Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+   * in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+   * blocks smaller than 8x8 maybe combined into one chroma block.
+   * For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+   * luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+   * these four luma blocks. This is implemented in bitstream as follows:
+   * - There are four MB_MODE_INFO structs for the four luma blocks.
+   * - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
+   * any information for chroma planes.
+   * - Last block will have is_chroma_ref = true and transmits chroma
+   * information for the 4x4 chroma block that covers whole 8x8 area covered by
+   * four luma blocks.
+   * Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+   */
   bool is_chroma_ref;
 
+  /*!
+   * Info specific to each plane.
+   */
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
+  /*!
+   * Tile related info.
+   */
   TileInfo tile;
 
-  // Appropriate offset inside cm->mi_params.mi_grid_base based on current
-  // mi_row and mi_col.
+  /*!
+   * Appropriate offset inside cm->mi_params.mi_grid_base based on current
+   * mi_row and mi_col.
+   */
   MB_MODE_INFO **mi;
 
-  // True if 4x4 block above the current block is available.
+  /*!
+   * True if 4x4 block above the current block is available.
+   */
   bool up_available;
-  // True if 4x4 block to the left of the current block is available.
+  /*!
+   * True if 4x4 block to the left of the current block is available.
+   */
   bool left_available;
-  // True if the above chrome reference block is available.
+  /*!
+   * True if the above chrome reference block is available.
+   */
   bool chroma_up_available;
-  // True if the left chrome reference block is available.
+  /*!
+   * True if the left chrome reference block is available.
+   */
   bool chroma_left_available;
 
-  // MB_MODE_INFO for 4x4 block to the left of the current block, if
-  // left_available == true; otherwise NULL.
+  /*!
+   * MB_MODE_INFO for 4x4 block to the left of the current block, if
+   * left_available == true; otherwise NULL.
+   */
   MB_MODE_INFO *left_mbmi;
-  // MB_MODE_INFO for 4x4 block above the current block, if
-  // up_available == true; otherwise NULL.
+  /*!
+   * MB_MODE_INFO for 4x4 block above the current block, if
+   * up_available == true; otherwise NULL.
+   */
   MB_MODE_INFO *above_mbmi;
-  // Above chroma reference block if is_chroma_ref == true for the current block
-  // and chroma_up_available == true; otherwise NULL.
-  // See also: the special case logic when current chroma block covers more than
-  // one luma blocks in set_mi_row_col().
+  /*!
+   * Above chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_up_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma blocks in set_mi_row_col().
+   */
   MB_MODE_INFO *chroma_left_mbmi;
-  // Left chroma reference block if is_chroma_ref == true for the current block
-  // and chroma_left_available == true; otherwise NULL.
-  // See also: the special case logic when current chroma block covers more than
-  // one luma blocks in set_mi_row_col().
+  /*!
+   * Left chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_left_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma blocks in set_mi_row_col().
+   */
   MB_MODE_INFO *chroma_above_mbmi;
 
-  // Appropriate offset based on current 'mi_row' and 'mi_col', inside
-  // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
-  // 'MACROBLOCK' structs.
+  /*!
+   * Appropriate offset based on current 'mi_row' and 'mi_col', inside
+   * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+   * 'MACROBLOCK' structs.
+   */
   uint8_t *tx_type_map;
-  // Stride for 'tx_type_map'. Note that this may / may not be same as
-  // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+  /*!
+   * Stride for 'tx_type_map'. Note that this may / may not be same as
+   * 'mi_stride', depending on which actual array 'tx_type_map' points to.
+   */
   int tx_type_map_stride;
 
-  // Distance of this macroblock from frame edges in 1/8th pixel units.
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  // Scale factors for reference frames of the current block.
-  // These are pointers into 'cm->ref_scale_factors'.
+  /**
+   * \name Distance of this macroblock from frame edges in 1/8th pixel units.
+   */
+  /**@{*/
+  int mb_to_left_edge;   /*!< Distance from left edge */
+  int mb_to_right_edge;  /*!< Distance from right edge */
+  int mb_to_top_edge;    /*!< Distance from top edge */
+  int mb_to_bottom_edge; /*!< Distance from bottom edge */
+  /**@}*/
+
+  /*!
+   * Scale factors for reference frames of the current block.
+   * These are pointers into 'cm->ref_scale_factors'.
+   */
   const struct scale_factors *block_ref_scale_factors[2];
 
+  /*!
+   * - On encoder side: points to cpi->source, which is the buffer containing
+   * the current *source* frame (maybe filtered).
+   * - On decoder side: points to cm->cur_frame->buf, which is the buffer into
+   * which current frame is being *decoded*.
+   */
   const YV12_BUFFER_CONFIG *cur_buf;
 
-  // Entropy contexts for the above blocks.
-  // above_entropy_context[i][j] corresponds to above entropy context for ith
-  // plane and jth mi column of this *frame*, wrt current 'mi_row'.
-  // These are pointers into 'cm->above_contexts.entropy'.
+  /*!
+   * Entropy contexts for the above blocks.
+   * above_entropy_context[i][j] corresponds to above entropy context for ith
+   * plane and jth mi column of this *frame*, wrt current 'mi_row'.
+   * These are pointers into 'cm->above_contexts.entropy'.
+   */
   ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
-  // Entropy contexts for the left blocks.
-  // left_entropy_context[i][j] corresponds to left entropy context for ith
-  // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
-  // Note: These contain actual data, NOT pointers.
+  /*!
+   * Entropy contexts for the left blocks.
+   * left_entropy_context[i][j] corresponds to left entropy context for ith
+   * plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+   * Note: These contain actual data, NOT pointers.
+   */
   ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
 
-  // Partition contexts for the above blocks.
-  // above_partition_context[i] corresponds to above partition context for ith
-  // mi column of this *frame*, wrt current 'mi_row'.
-  // These are pointers into 'cm->above_contexts.partition'.
+  /*!
+   * Partition contexts for the above blocks.
+   * above_partition_context[i] corresponds to above partition context for ith
+   * mi column of this *frame*, wrt current 'mi_row'.
+   * This is a pointer into 'cm->above_contexts.partition'.
+   */
   PARTITION_CONTEXT *above_partition_context;
-  // Partition contexts for the left blocks.
-  // left_partition_context[i] corresponds to left partition context for ith
-  // mi row of this *superblock*, wrt current 'mi_col'.
-  // Note: These contain actual data, NOT pointers.
+  /*!
+   * Partition contexts for the left blocks.
+   * left_partition_context[i] corresponds to left partition context for ith
+   * mi row of this *superblock*, wrt current 'mi_col'.
+   * Note: These contain actual data, NOT pointers.
+   */
   PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
 
-  // Transform contexts for the above blocks.
-  // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
-  // code currently. Need to make it consistent / document why.
+  /*!
+   * Transform contexts for the above blocks.
+   * above_txfm_context[i] corresponds to above transform context for ith mi col
+   * from the current position (mi row and mi column) for this *frame*.
+   * This is a pointer into 'cm->above_contexts.txfm'.
+   */
   TXFM_CONTEXT *above_txfm_context;
-  // Transform contexts for the left blocks.
+  /*!
+   * Transform contexts for the left blocks.
+   * left_txfm_context[i] corresponds to left transform context for ith mi row
+   * from the current position (mi_row and mi_col) for this *superblock*.
+   * This is a pointer into 'left_txfm_context_buffer'.
+   */
   TXFM_CONTEXT *left_txfm_context;
-  // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
-  // Can we remove this indirection?
+  /*!
+   * left_txfm_context_buffer[i] is the left transform context for ith mi_row
+   * in this *superblock*.
+   * Behaves like an internal actual buffer which 'left_txt_context' points to,
+   * and never accessed directly except to fill in initial default values.
+   */
   TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
 
-  // Default values for the two restoration filters for each plane.
-  // These values are used as reference values when writing the bitstream. That
-  // is, we transmit the delta between the actual values in
-  // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
-  WienerInfo wiener_info[MAX_MB_PLANE];
-  SgrprojInfo sgrproj_info[MAX_MB_PLANE];
-
-  // Block dimensions in MB_MODE_INFO units.
-  uint8_t width;
-  uint8_t height;
-
-  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  /**
+   * \name Default values for the two restoration filters for each plane.
+   * Default values for the two restoration filters for each plane.
+   * These values are used as reference values when writing the bitstream. That
+   * is, we transmit the delta between the actual values in
+   * cm->rst_info[plane].unit_info[unit_idx] and these reference values.
+   */
+  /**@{*/
+  WienerInfo wiener_info[MAX_MB_PLANE];   /*!< Defaults for Wiener filter*/
+  SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */
+  /**@}*/
+
+  /**
+   * \name Block dimensions in MB_MODE_INFO units.
+   */
+  /**@{*/
+  uint8_t width;  /*!< Block width in MB_MODE_INFO units */
+  uint8_t height; /*!< Block height in MB_MODE_INFO units */
+  /**@}*/
+
+  /*!
+   * Contains the motion vector candidates found during motion vector prediction
+   * process. ref_mv_stack[i] contains the candidates for ith type of
+   * reference frame (single/compound). The actual number of candidates found in
+   * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side)
+   * or mbmi_ext->ref_mv_count[i] (encoder side).
+   */
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  /*!
+   * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the
+   * DRL (dynamic reference list) mode contexts.
+   */
   uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
-  uint8_t is_sec_rect;
 
-  // Counts of each reference frame in the above and left neighboring blocks.
-  // NOTE: Take into account both single and comp references.
+  /*!
+   * True if this is the last vertical rectangular block in a VERTICAL or
+   * VERTICAL_4 partition.
+   */
+  bool is_last_vertical_rect;
+  /*!
+   * True if this is the 1st horizontal rectangular block in a HORIZONTAL or
+   * HORIZONTAL_4 partition.
+   */
+  bool is_first_horizontal_rect;
+
+  /*!
+   * Counts of each reference frame in the above and left neighboring blocks.
+   * NOTE: Take into account both single and comp references.
+   */
   uint8_t neighbors_ref_counts[REF_FRAMES];
 
+  /*!
+   * Current CDFs of all the symbols for the current tile.
+   */
   FRAME_CONTEXT *tile_ctx;
-  // Bit depth: copied from cm->seq_params.bit_depth for convenience.
+
+  /*!
+   * Bit depth: copied from cm->seq_params->bit_depth for convenience.
+   */
   int bd;
 
+  /*!
+   * Quantizer index for each segment (base qindex + delta for each segment).
+   */
   int qindex[MAX_SEGMENTS];
+  /*!
+   * lossless[s] is true if segment 's' is coded losslessly.
+   */
   int lossless[MAX_SEGMENTS];
-  // TODO(urvang): Move to decoder.
-  int corrupted;
-  // Same as cm->features.cur_frame_force_integer_mv.
+  /*!
+   * Q index for the coding blocks in this superblock will be stored in
+   * mbmi->current_qindex. Now, when cm->delta_q_info.delta_q_present_flag is
+   * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as
+   * the base, and adding any transmitted delta qindex on top of it.
+   * Precisely, this is the latest qindex used by the first coding block of a
+   * non-skip superblock in the current tile; OR
+   * same as cm->quant_params.base_qindex (if not explicitly set yet).
+   * Note: This is 'CurrentQIndex' in the AV1 spec.
+   */
+  int current_base_qindex;
+
+  /*!
+   * Same as cm->features.cur_frame_force_integer_mv.
+   */
   int cur_frame_force_integer_mv;
-  // Pointer to cm->error.
+
+  /*!
+   * Pointer to cm->error.
+   */
   struct aom_internal_error_info *error_info;
-  // Same as cm->global_motion.
+
+  /*!
+   * Same as cm->global_motion.
+   */
   const WarpedMotionParams *global_motion;
-  int delta_qindex;
-  int current_qindex;
-  // Since actual frame level loop filtering level value is not available
-  // at the beginning of the tile (only available during actual filtering)
-  // at encoder side.we record the delta_lf (against the frame level loop
-  // filtering level) and code the delta between previous superblock's delta
-  // lf and current delta lf. It is equivalent to the delta between previous
-  // superblock's actual lf and current lf.
+
+  /*!
+   * Since actual frame level loop filtering level value is not available
+   * at the beginning of the tile (only available during actual filtering)
+   * at encoder side.we record the delta_lf (against the frame level loop
+   * filtering level) and code the delta between previous superblock's delta
+   * lf and current delta lf. It is equivalent to the delta between previous
+   * superblock's actual lf and current lf.
+   */
   int8_t delta_lf_from_base;
-  // For this experiment, we have four frame filter levels for different plane
-  // and direction. So, to support the per superblock update, we need to add
-  // a few more params as below.
-  // 0: delta loop filter level for y plane vertical
-  // 1: delta loop filter level for y plane horizontal
-  // 2: delta loop filter level for u plane
-  // 3: delta loop filter level for v plane
-  // To make it consistent with the reference to each filter level in segment,
-  // we need to -1, since
-  // SEG_LVL_ALT_LF_Y_V = 1;
-  // SEG_LVL_ALT_LF_Y_H = 2;
-  // SEG_LVL_ALT_LF_U   = 3;
-  // SEG_LVL_ALT_LF_V   = 4;
+  /*!
+   * We have four frame filter levels for different plane and direction. So, to
+   * support the per superblock update, we need to add a few more params:
+   * 0. delta loop filter level for y plane vertical
+   * 1. delta loop filter level for y plane horizontal
+   * 2. delta loop filter level for u plane
+   * 3. delta loop filter level for v plane
+   * To make it consistent with the reference to each filter level in segment,
+   * we need to -1, since
+   * - SEG_LVL_ALT_LF_Y_V = 1;
+   * - SEG_LVL_ALT_LF_Y_H = 2;
+   * - SEG_LVL_ALT_LF_U   = 3;
+   * - SEG_LVL_ALT_LF_V   = 4;
+   */
   int8_t delta_lf[FRAME_LF_COUNT];
-  // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
-  // current superblock has already been read from (decoder) / written to
-  // (encoder) the bitstream; and false otherwise.
-  // More detail:
-  // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
-  // non-skip coding block. So, we need this array to keep track of whether CDEF
-  // strengths for the given CDEF units have been transmitted yet or not.
-  // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is
-  // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
-  // superblock size is 128x128). Hence the array size is 4.
-  // (3) In the current implementation, CDEF strength for this CDEF unit is
-  // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
-  // cm->mi_params.mi_grid_base).
+  /*!
+   * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+   * current superblock has already been read from (decoder) / written to
+   * (encoder) the bitstream; and false otherwise.
+   * More detail:
+   * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st
+   * non-skip coding block. So, we need this array to keep track of whether CDEF
+   * strengths for the given CDEF units have been transmitted yet or not.
+   * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+   * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+   * superblock size is 128x128). Hence the array size is 4.
+   * 3. In the current implementation, CDEF strength for this CDEF unit is
+   * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+   * cm->mi_params.mi_grid_base).
+   */
   bool cdef_transmitted[4];
 
-  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
-  uint8_t *mc_buf[2];
-  CFL_CTX cfl;
+  /*!
+   * Mask for this block used for compound prediction.
+   */
+  uint8_t *seg_mask;
 
-  DIST_WTD_COMP_PARAMS jcp_param;
+  /*!
+   * CFL (chroma from luma) related parameters.
+   */
+  CFL_CTX cfl;
 
-  uint16_t cb_offset[MAX_MB_PLANE];
-  uint16_t txb_offset[MAX_MB_PLANE];
+  /*!
+   * Offset to plane[p].color_index_map.
+   * Currently:
+   * - On encoder side, this is always 0 as 'color_index_map' is allocated per
+   * *coding block* there.
+   * - On decoder side, this may be non-zero, as 'color_index_map' is a (static)
+   * memory pointing to the base of a *superblock* there, and we need an offset
+   * to it to get the color index map for current coding block.
+   */
   uint16_t color_index_map_offset[2];
 
+  /*!
+   * Temporary buffer used for convolution in case of compound reference only
+   * for (weighted or uniform) averaging operation.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_conv_dst' or
+   * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+   * - In encoder, 'x->tmp_conv_dst' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'.
+   */
   CONV_BUF_TYPE *tmp_conv_dst;
+  /*!
+   * Temporary buffers used to build OBMC prediction by above (index 0) and left
+   * (index 1) predictors respectively.
+   * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_obmc_bufs' or
+   * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+   * -In encoder, 'x->tmp_pred_bufs' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
+   */
   uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;
 
+/*!\cond */
+
 static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+#if CONFIG_AV1_HIGHBITDEPTH
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+#else
+  (void)xd;
+  return 0;
+#endif
 }
 
 static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+#if CONFIG_AV1_HIGHBITDEPTH
   return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
              ? CONVERT_TO_BYTEPTR(buf16)
              : buf16;
+#else
+  (void)xd;
+  return buf16;
+#endif
+}
+
+typedef struct BitDepthInfo {
+  int bit_depth;
+  /*! Is the image buffer high bit depth?
+   * Low bit depth buffer uses uint8_t.
+   * High bit depth buffer uses uint16_t.
+   * Equivalent to cm->seq_params->use_highbitdepth
+   */
+  int use_highbitdepth_buf;
+} BitDepthInfo;
+
+static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) {
+  BitDepthInfo bit_depth_info;
+  bit_depth_info.bit_depth = xd->bd;
+  bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd);
+  assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf,
+                 bit_depth_info.bit_depth == 8));
+  return bit_depth_info;
 }
 
 static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
@@ -757,6 +1049,28 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };
 
+// The bitmask corresponds to the transform types as defined in
+// enums.h TX_TYPE enumeration type. Setting the bit 0 means to disable
+// the use of the corresponding transform type in that table.
+// The av1_derived_intra_tx_used_flag table is used when
+// use_reduced_intra_txset is set to 2, where one only searches
+// the transform types derived from residual statistics.
+static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = {
+  0x0209,  // DC_PRED:       0000 0010 0000 1001
+  0x0403,  // V_PRED:        0000 0100 0000 0011
+  0x0805,  // H_PRED:        0000 1000 0000 0101
+  0x020F,  // D45_PRED:      0000 0010 0000 1111
+  0x0009,  // D135_PRED:     0000 0000 0000 1001
+  0x0009,  // D113_PRED:     0000 0000 0000 1001
+  0x0009,  // D157_PRED:     0000 0000 0000 1001
+  0x0805,  // D203_PRED:     0000 1000 0000 0101
+  0x0403,  // D67_PRED:      0000 0100 0000 0011
+  0x0205,  // SMOOTH_PRED:   0000 0010 0000 1001
+  0x0403,  // SMOOTH_V_PRED: 0000 0100 0000 0011
+  0x0805,  // SMOOTH_H_PRED: 0000 1000 0000 0101
+  0x0209,  // PAETH_PRED:    0000 0010 0000 1001
+};
+
 static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
   0x080F,  // DC_PRED:       0000 1000 0000 1111
   0x040F,  // V_PRED:        0000 0100 0000 1111
@@ -863,13 +1177,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
 static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
                                           const MACROBLOCKD *xd,
                                           TX_SIZE tx_size,
-                                          int is_screen_content_type) {
+                                          int use_screen_content_tools) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
       xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
-      is_screen_content_type)
-    return DCT_DCT;
+      use_screen_content_tools)
+    return DEFAULT_INTER_TX_TYPE;
 
   return intra_mode_to_tx_type(mbmi, plane_type);
 }
@@ -1076,7 +1390,7 @@ static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
   if (xd->lossless[mbmi->segment_id]) return TX_4X4;
   if (plane == 0) return mbmi->tx_size;
   const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
-  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+  return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                pd->subsampling_y);
 }
 
@@ -1116,7 +1430,7 @@ static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
 }
 
 static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
-  return is_interintra_allowed_bsize(mbmi->sb_type) &&
+  return is_interintra_allowed_bsize(mbmi->bsize) &&
          is_interintra_allowed_mode(mbmi->mode) &&
          is_interintra_allowed_ref(mbmi->ref_frame);
 }
@@ -1159,34 +1473,29 @@ static INLINE int is_motion_variation_allowed_compound(
 static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
 
 static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
-  return !(mbmi->overlappable_neighbors[0] == 0 &&
-           mbmi->overlappable_neighbors[1] == 0);
+  return mbmi->overlappable_neighbors != 0;
 }
 
 static INLINE MOTION_MODE
 motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
                     const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+  if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
   if (xd->cur_frame_force_integer_mv == 0) {
     const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
     if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
   }
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+  if (is_motion_variation_allowed_bsize(mbmi->bsize) &&
       is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
       is_motion_variation_allowed_compound(mbmi)) {
-    if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
     assert(!has_second_ref(mbmi));
-    if (mbmi->num_proj_ref >= 1 &&
-        (allow_warped_motion &&
-         !av1_is_scaled(xd->block_ref_scale_factors[0]))) {
-      if (xd->cur_frame_force_integer_mv) {
-        return OBMC_CAUSAL;
-      }
+    if (mbmi->num_proj_ref >= 1 && allow_warped_motion &&
+        !xd->cur_frame_force_integer_mv &&
+        !av1_is_scaled(xd->block_ref_scale_factors[0])) {
       return WARPED_CAUSAL;
     }
     return OBMC_CAUSAL;
-  } else {
-    return SIMPLE_TRANSLATION;
   }
+  return SIMPLE_TRANSLATION;
 }
 
 static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
@@ -1196,8 +1505,10 @@ static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
 static INLINE int av1_allow_palette(int allow_screen_content_tools,
                                     BLOCK_SIZE sb_type) {
   assert(sb_type < BLOCK_SIZES_ALL);
-  return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
-         block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
+  return allow_screen_content_tools &&
+         block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH &&
+         block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT &&
+         sb_type >= BLOCK_8X8;
 }
 
 // Returns sub-sampled dimensions of the given block.
@@ -1228,23 +1539,33 @@ static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
   // Special handling for chroma sub8x8.
   const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
   const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
-  if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
-  if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+  if (width) {
+    *width = plane_block_width + 2 * is_chroma_sub8_x;
+    assert(*width >= 0);
+  }
+  if (height) {
+    *height = plane_block_height + 2 * is_chroma_sub8_y;
+    assert(*height >= 0);
+  }
   if (rows_within_bounds) {
     *rows_within_bounds =
         (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+    assert(*rows_within_bounds >= 0);
   }
   if (cols_within_bounds) {
     *cols_within_bounds =
         (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+    assert(*cols_within_bounds >= 0);
   }
 }
 
 /* clang-format off */
+// Pointer to a three-dimensional array whose first dimension is PALETTE_SIZES.
 typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS]
                               [CDF_SIZE(PALETTE_COLORS)];
-typedef const int (*ColorCost)[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                              [PALETTE_COLORS];
+// Pointer to a const three-dimensional array whose first dimension is
+// PALETTE_SIZES.
+typedef const int (*ColorCost)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS];
 /* clang-format on */
 
 typedef struct {
@@ -1265,7 +1586,7 @@ static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd,
   // First check if all modes are GLOBALMV
   if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
 
-  if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2)
+  if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2)
     return 0;
 
   // Now check if all global motion is non translational
@@ -1289,6 +1610,8 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
   return tx_size_2d[tx_size];
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/cdef.c b/media/libaom/src/av1/common/cdef.c
index ef7b866b5d..7807bb7398 100644
--- a/media/libaom/src/av1/common/cdef.c
+++ b/media/libaom/src/av1/common/cdef.c
@@ -26,7 +26,7 @@ static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
   MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
   for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
     for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
-      if (!mbmi[c]->skip) return 0;
+      if (!mbmi[c]->skip_txfm) return 0;
     }
   }
 
@@ -87,10 +87,10 @@ void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
   }
 }
 
-static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride,
-                        const uint8_t *src, int src_voffset, int src_hoffset,
-                        int sstride, int vsize, int hsize) {
-  if (cm->seq_params.use_highbitdepth) {
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize) {
+  if (cm->seq_params->use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
     cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@@ -118,271 +118,327 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
   }
 }
 
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                    MACROBLOCKD *xd) {
-  const CdefInfo *const cdef_info = &cm->cdef_info;
+// Prepares intermediate input buffer for CDEF.
+// Inputs:
+//   cm: Pointer to common structure.
+//   fb_info: Pointer to the CDEF block-level parameter structure.
+//   colbuf: Left column buffer for CDEF.
+//   cdef_left: Left block is filtered or not.
+//   fbc, fbr: col and row index of a block.
+//   plane: plane index Y/CB/CR.
+// Returns:
+//   Nothing will be returned.
+static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info,
+                            uint16_t **const colbuf, const int cdef_left,
+                            int fbc, int fbr, int plane) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
-  uint16_t *linebuf[3];
-  uint16_t *colbuf[3];
-  cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
-  unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef;
-  int cdef_count;
-  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int mi_wide_l2[3];
-  int mi_high_l2[3];
-  int xdec[3];
-  int ydec[3];
-  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  uint16_t *src = fb_info->src;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4);
   const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
-  row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
-  memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
-  prev_row_cdef = row_cdef + 1;
-  curr_row_cdef = prev_row_cdef + nhfb + 2;
-  for (int pli = 0; pli < num_planes; pli++) {
-    xdec[pli] = xd->plane[pli].subsampling_x;
-    ydec[pli] = xd->plane[pli].subsampling_y;
-    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
-    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+  int cstart = 0;
+  if (!cdef_left) cstart = -CDEF_HBORDER;
+  int rend, cend;
+  const int nhb =
+      AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+  const int nvb =
+      AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+  const int hsize = nhb << fb_info->mi_wide_l2;
+  const int vsize = nvb << fb_info->mi_high_l2;
+  const uint16_t *top_linebuf = fb_info->top_linebuf[plane];
+  const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane];
+  const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
+  const int stride =
+      luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x);
+
+  if (fbc == nhfb - 1)
+    cend = hsize;
+  else
+    cend = hsize + CDEF_HBORDER;
+
+  if (fbr == nvfb - 1)
+    rend = vsize;
+  else
+    rend = vsize + CDEF_VBORDER;
+
+  /* Copy in the pixels we need from the current superblock for
+  deringing.*/
+  av1_cdef_copy_sb8_16(
+      cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
+      CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart,
+      fb_info->dst_stride, vsize, cend - cstart);
+
+  /* Copy in the pixels we need for the current superblock from bottom buffer.*/
+  if (fbr < nvfb - 1) {
+    copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize);
+  } else {
+    fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+              hsize, CDEF_VERY_LARGE);
   }
-  const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
-  for (int pli = 0; pli < num_planes; pli++) {
-    linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
-    colbuf[pli] =
-        aom_malloc(sizeof(*colbuf) *
-                   ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
-                   CDEF_HBORDER);
+  if (fbr < nvfb - 1 && fbc > 0) {
+    copy_rect(&src[bot_offset], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride,
+              CDEF_VBORDER, CDEF_HBORDER);
+  } else {
+    fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+              CDEF_VERY_LARGE);
+  }
+  if (fbr < nvfb - 1 && fbc < nhfb - 1) {
+    copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
+              CDEF_HBORDER);
+  } else {
+    fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
   }
-  for (int fbr = 0; fbr < nvfb; fbr++) {
-    for (int pli = 0; pli < num_planes; pli++) {
-      const int block_height =
-          (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
-      fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
-                CDEF_VERY_LARGE);
-    }
-    int cdef_left = 1;
-    for (int fbc = 0; fbc < nhfb; fbc++) {
-      int level, sec_strength;
-      int uv_level, uv_sec_strength;
-      int nhb, nvb;
-      int cstart = 0;
-      curr_row_cdef[fbc] = 0;
-      if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
-                                  MI_SIZE_64X64 * fbc] == NULL ||
-          mi_params
-                  ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
-                                 MI_SIZE_64X64 * fbc]
-                  ->cdef_strength == -1) {
-        cdef_left = 0;
-        continue;
-      }
-      if (!cdef_left) cstart = -CDEF_HBORDER;
-      nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-      nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-      int frame_top, frame_left, frame_bottom, frame_right;
-
-      int mi_row = MI_SIZE_64X64 * fbr;
-      int mi_col = MI_SIZE_64X64 * fbc;
-      // for the current filter block, it's top left corner mi structure (mi_tl)
-      // is first accessed to check whether the top and left boundaries are
-      // frame boundaries. Then bottom-left and top-right mi structures are
-      // accessed to check whether the bottom and right boundaries
-      // (respectively) are frame boundaries.
-      //
-      // Note that we can't just check the bottom-right mi structure - eg. if
-      // we're at the right-hand edge of the frame but not the bottom, then
-      // the bottom-right mi is NULL but the bottom-left is not.
-      frame_top = (mi_row == 0) ? 1 : 0;
-      frame_left = (mi_col == 0) ? 1 : 0;
-
-      if (fbr != nvfb - 1)
-        frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0;
-      else
-        frame_bottom = 1;
-
-      if (fbc != nhfb - 1)
-        frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ? 1 : 0;
-      else
-        frame_right = 1;
-
-      const int mbmi_cdef_strength =
-          mi_params
-              ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
-                             MI_SIZE_64X64 * fbc]
-              ->cdef_strength;
-      level =
-          cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
-      sec_strength =
-          cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
-      sec_strength += sec_strength == 3;
-      uv_level =
-          cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
-      uv_sec_strength =
-          cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
-      uv_sec_strength += uv_sec_strength == 3;
-      if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
-           uv_sec_strength == 0) ||
-          (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
-                                                 fbc * MI_SIZE_64X64, dlist,
-                                                 BLOCK_64X64)) == 0) {
-        cdef_left = 0;
-        continue;
-      }
 
-      curr_row_cdef[fbc] = 1;
-      for (int pli = 0; pli < num_planes; pli++) {
-        int coffset;
-        int rend, cend;
-        int damping = cdef_info->cdef_damping;
-        int hsize = nhb << mi_wide_l2[pli];
-        int vsize = nvb << mi_high_l2[pli];
-
-        if (pli) {
-          level = uv_level;
-          sec_strength = uv_sec_strength;
-        }
-
-        if (fbc == nhfb - 1)
-          cend = hsize;
-        else
-          cend = hsize + CDEF_HBORDER;
-
-        if (fbr == nvfb - 1)
-          rend = vsize;
-        else
-          rend = vsize + CDEF_VBORDER;
-
-        coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
-        if (fbc == nhfb - 1) {
-          /* On the last superblock column, fill in the right border with
-             CDEF_VERY_LARGE to avoid filtering with the outside. */
-          fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE,
-                    rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend,
-                    CDEF_VERY_LARGE);
-        }
-        if (fbr == nvfb - 1) {
-          /* On the last superblock row, fill in the bottom border with
-             CDEF_VERY_LARGE to avoid filtering with the outside. */
-          fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
-                    CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
-        }
-        /* Copy in the pixels we need from the current superblock for
-           deringing.*/
-        copy_sb8_16(cm,
-                    &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
-                    CDEF_BSTRIDE, xd->plane[pli].dst.buf,
-                    (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart,
-                    xd->plane[pli].dst.stride, rend, cend - cstart);
-        if (!prev_row_cdef[fbc]) {
-          copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE,
-                      xd->plane[pli].dst.buf,
-                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
-                      coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
-        } else if (fbr > 0) {
-          copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset],
-                    stride, CDEF_VBORDER, hsize);
-        } else {
-          fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
-                    CDEF_VERY_LARGE);
-        }
-        if (!prev_row_cdef[fbc - 1]) {
-          copy_sb8_16(cm, src, CDEF_BSTRIDE, xd->plane[pli].dst.buf,
-                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
-                      coffset - CDEF_HBORDER, xd->plane[pli].dst.stride,
-                      CDEF_VBORDER, CDEF_HBORDER);
-        } else if (fbr > 0 && fbc > 0) {
-          copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER],
-                    stride, CDEF_VBORDER, CDEF_HBORDER);
-        } else {
-          fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
-                    CDEF_VERY_LARGE);
-        }
-        if (!prev_row_cdef[fbc + 1]) {
-          copy_sb8_16(cm, &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
-                      CDEF_BSTRIDE, xd->plane[pli].dst.buf,
-                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
-                      coffset + hsize, xd->plane[pli].dst.stride, CDEF_VBORDER,
-                      CDEF_HBORDER);
-        } else if (fbr > 0 && fbc < nhfb - 1) {
-          copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
-                    &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER,
-                    CDEF_HBORDER);
-        } else {
-          fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
-                    CDEF_HBORDER, CDEF_VERY_LARGE);
-        }
-        if (cdef_left) {
-          /* If we deringed the superblock on the left then we need to copy in
-             saved pixels. */
-          copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER,
-                    rend + CDEF_VBORDER, CDEF_HBORDER);
-        }
-        /* Saving pixels in case we need to dering the superblock on the
-            right. */
-        copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
-                  rend + CDEF_VBORDER, CDEF_HBORDER);
-        copy_sb8_16(
-            cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
-            (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
-            coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
-
-        if (frame_top) {
-          fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
-                    CDEF_VERY_LARGE);
-        }
-        if (frame_left) {
-          fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
-                    CDEF_VERY_LARGE);
-        }
-        if (frame_bottom) {
-          fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
-                    CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
-        }
-        if (frame_right) {
-          fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
-                    vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
-        }
-
-        if (cm->seq_params.use_highbitdepth) {
-          av1_cdef_filter_fb(
-              NULL,
-              &CONVERT_TO_SHORTPTR(
-                  xd->plane[pli]
-                      .dst.buf)[xd->plane[pli].dst.stride *
-                                    (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
-                                (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-              xd->plane[pli].dst.stride,
-              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
-              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, damping, coeff_shift);
-        } else {
-          av1_cdef_filter_fb(
-              &xd->plane[pli]
-                   .dst.buf[xd->plane[pli].dst.stride *
-                                (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
-                            (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-              NULL, xd->plane[pli].dst.stride,
-              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
-              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, damping, coeff_shift);
-        }
-      }
-      cdef_left = 1;
-    }
-    {
-      unsigned char *tmp = prev_row_cdef;
-      prev_row_cdef = curr_row_cdef;
-      curr_row_cdef = tmp;
+  /* Copy in the pixels we need from the current superblock from top buffer.*/
+  if (fbr > 0) {
+    copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
+              stride, CDEF_VBORDER, hsize);
+  } else {
+    fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
+              CDEF_VERY_LARGE);
+  }
+  if (fbr > 0 && fbc > 0) {
+    copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
+              stride, CDEF_VBORDER, CDEF_HBORDER);
+  } else {
+    fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+  }
+  if (fbr > 0 && fbc < nhfb - 1) {
+    copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
+              CDEF_HBORDER);
+  } else {
+    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+              CDEF_HBORDER, CDEF_VERY_LARGE);
+  }
+  if (cdef_left) {
+    /* If we deringed the superblock on the left then we need to copy in
+    saved pixels. */
+    copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
+              rend + CDEF_VBORDER, CDEF_HBORDER);
+  }
+  /* Saving pixels in case we need to dering the superblock on the
+  right. */
+  copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
+            rend + CDEF_VBORDER, CDEF_HBORDER);
+
+  if (fb_info->frame_boundary[LEFT]) {
+    fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
+              CDEF_VERY_LARGE);
+  }
+  if (fb_info->frame_boundary[RIGHT]) {
+    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+  }
+}
+
+static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane,
+                                  uint8_t use_highbitdepth) {
+  int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset;
+  if (use_highbitdepth) {
+    av1_cdef_filter_fb(
+        NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride,
+        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
+        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
+        fb_info->dlist, fb_info->cdef_count, fb_info->level,
+        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
+  } else {
+    av1_cdef_filter_fb(
+        fb_info->dst + offset, NULL, fb_info->dst_stride,
+        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
+        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
+        fb_info->dlist, fb_info->cdef_count, fb_info->level,
+        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
+  }
+}
+
+// Initializes block-level parameters for CDEF.
+static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd,
+                                    CdefBlockInfo *const fb_info, int *level,
+                                    int *sec_strength, int fbc, int fbr,
+                                    int plane) {
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  fb_info->level = level[plane_type];
+  fb_info->sec_strength = sec_strength[plane_type];
+  fb_info->dst = xd->plane[plane].dst.buf;
+  fb_info->dst_stride = xd->plane[plane].dst.stride;
+
+  fb_info->xdec = xd->plane[plane].subsampling_x;
+  fb_info->ydec = xd->plane[plane].subsampling_y;
+  fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x;
+  fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+  fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2;
+  fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2;
+}
+
+static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                        CdefBlockInfo *const fb_info, uint16_t **const colbuf,
+                        int *cdef_left, int fbc, int fbr) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mbmi_cdef_strength =
+      mi_params
+          ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                         MI_SIZE_64X64 * fbc]
+          ->cdef_strength;
+  const int num_planes = av1_num_planes(cm);
+  int is_zero_level[PLANE_TYPES] = { 1, 1 };
+  int level[PLANE_TYPES] = { 0 };
+  int sec_strength[PLANE_TYPES] = { 0 };
+  const CdefInfo *const cdef_info = &cm->cdef_info;
+
+  if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                              MI_SIZE_64X64 * fbc] == NULL ||
+      mbmi_cdef_strength == -1) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  // Compute level and secondary strength for planes
+  level[PLANE_TYPE_Y] =
+      cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+  sec_strength[PLANE_TYPE_Y] =
+      cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+  sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3;
+  is_zero_level[PLANE_TYPE_Y] =
+      (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 0);
+
+  if (num_planes > 1) {
+    level[PLANE_TYPE_UV] =
+        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+    sec_strength[PLANE_TYPE_UV] =
+        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+    sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3;
+    is_zero_level[PLANE_TYPE_UV] =
+        (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0);
+  }
+
+  if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
+                                                 fbc * MI_SIZE_64X64,
+                                                 fb_info->dlist, BLOCK_64X64);
+  if (!fb_info->cdef_count) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    // Do not skip cdef filtering for luma plane as filter direction is
+    // computed based on luma.
+    if (plane && is_zero_level[get_plane_type(plane)]) {
+      cdef_left[plane] = 0;
+      continue;
     }
+    cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane);
+    cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane);
+    cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth);
+    cdef_left[plane] = 1;
+  }
+}
+
+// Initializes row-level parameters for CDEF frame.
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          CdefBlockInfo *const fb_info,
+                          uint16_t **const linebuf, uint16_t *const src,
+                          struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  (void)cdef_sync;
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+  const bool ping_pong = fbr & 1;
+  // for the current filter block, it's top left corner mi structure (mi_tl)
+  // is first accessed to check whether the top and left boundaries are
+  // frame boundaries. Then bottom-left and top-right mi structures are
+  // accessed to check whether the bottom and right boundaries
+  // (respectively) are frame boundaries.
+  //
+  // Note that we can't just check the bottom-right mi structure - eg. if
+  // we're at the right-hand edge of the frame but not the bottom, then
+  // the bottom-right mi is NULL but the bottom-left is not.
+  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
+  if (fbr != nvfb - 1)
+    fb_info->frame_boundary[BOTTOM] =
+        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
+  else
+    fb_info->frame_boundary[BOTTOM] = 1;
+
+  fb_info->src = src;
+  fb_info->damping = cm->cdef_info.cdef_damping;
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+    const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    // here ping-pong buffers are maintained for top linebuf
+    // to avoid linebuf over-write by consecutive row.
+    uint16_t *const top_linebuf =
+        &linebuf[plane][ping_pong * CDEF_VBORDER * stride];
+    fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride];
+
+    if (fbr != nvfb - 1)  // top line buffer copy
+      av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf,
+                           offset - CDEF_VBORDER, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    fb_info->top_linebuf[plane] =
+        &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride];
+
+    if (fbr != nvfb - 1)  // bottom line buffer copy
+      av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride,
+                           xd->plane[plane].dst.buf, offset, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
   }
-  aom_free(row_cdef);
-  for (int pli = 0; pli < num_planes; pli++) {
-    aom_free(linebuf[pli]);
-    aom_free(colbuf[pli]);
+}
+
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                     uint16_t **const linebuf, uint16_t **const colbuf,
+                     uint16_t *const src, int fbr,
+                     cdef_init_fb_row_t cdef_init_fb_row_fn,
+                     struct AV1CdefSyncData *const cdef_sync) {
+  CdefBlockInfo fb_info;
+  int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
+  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
+  for (int fbc = 0; fbc < nhfb; fbc++) {
+    fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
+    if (fbc != nhfb - 1)
+      fb_info.frame_boundary[RIGHT] =
+          (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0;
+    else
+      fb_info.frame_boundary[RIGHT] = 1;
+    cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr);
   }
 }
+
+// Perform CDEF on input frame.
+// Inputs:
+//   frame: Pointer to input frame buffer.
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+
+  for (int fbr = 0; fbr < nvfb; fbr++)
+    av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
+                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL);
+}
diff --git a/media/libaom/src/av1/common/cdef.h b/media/libaom/src/av1/common/cdef.h
index c36fd135a4..5bf40e4710 100644
--- a/media/libaom/src/av1/common/cdef.h
+++ b/media/libaom/src/av1/common/cdef.h
@@ -23,6 +23,39 @@
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cdef_block.h"
 
+enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY);
+
+struct AV1CdefSyncData;
+
+/*!\brief Parameters related to CDEF Block */
+typedef struct {
+  uint16_t *src;                       /*!< CDEF intermediate buffer */
+  uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */
+  uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */
+  uint8_t *dst;                        /*!< CDEF destination buffer */
+  cdef_list
+      dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */
+
+  int xdec;                       /*!< Sub-sampling X */
+  int ydec;                       /*!< Sub-sampling X */
+  int mi_wide_l2;                 /*!< Pixels per mi unit in width */
+  int mi_high_l2;                 /*!< Pixels per mi unit in height */
+  int frame_boundary[BOUNDARIES]; /*!< frame boundaries */
+
+  int damping;     /*!< CDEF damping factor */
+  int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */
+  int level;       /*!< CDEF filtering level */
+  int sec_strength; /*!< CDEF secondary strength */
+  int cdef_count;   /*!< Number of CDEF sub-blocks in superblock */
+  int dir[CDEF_NBLOCKS]
+         [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */
+
+  int dst_stride; /*!< CDEF destination buffer stride */
+  int coffset;    /*!< current superblock offset in a row */
+  int roffset;    /*!< current row offset */
+} CdefBlockInfo;
+
 static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
 
 static INLINE int constrain(int diff, int threshold, int damping) {
@@ -40,11 +73,37 @@ extern "C" {
 int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
                              int mi_row, int mi_col, cdef_list *dlist,
                              BLOCK_SIZE bsize);
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
-void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
-                     int rdmult);
+typedef void (*cdef_init_fb_row_t)(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src,
+    struct AV1CdefSyncData *const cdef_sync, int fbr);
+
+/*!\brief Function for applying CDEF to a frame
+ *
+ * \ingroup in_loop_cdef
+ * This function applies CDEF to a frame.
+ *
+ * \param[in, out]  frame     Compressed frame buffer
+ * \param[in, out]  cm        Pointer to top level common structure
+ * \param[in]       xd        Pointer to common current coding block structure
+ * \param[in]       cdef_init_fb_row_fn   Function Pointer
+ *
+ * \return Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn);
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                     uint16_t **const linebuf, uint16_t **const colbuf,
+                     uint16_t *const src, int fbr,
+                     cdef_init_fb_row_t cdef_init_fb_row_fn,
+                     struct AV1CdefSyncData *const cdef_sync);
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          CdefBlockInfo *const fb_info,
+                          uint16_t **const linebuf, uint16_t *const src,
+                          struct AV1CdefSyncData *const cdef_sync, int fbr);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/common/cdef_block.c b/media/libaom/src/av1/common/cdef_block.c
index 7120705d3b..0731678075 100644
--- a/media/libaom/src/av1/common/cdef_block.c
+++ b/media/libaom/src/av1/common/cdef_block.c
@@ -16,9 +16,19 @@
 #include "config/av1_rtcd.h"
 
 #include "av1/common/cdef.h"
+/*
+This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+beginning and end of the table. The cdef direction range is [0, 7] and the
+first index is offset +/-2. This removes the need to constrain the first
+index to the same range using e.g., & 7.
+*/
+DECLARE_ALIGNED(16, const int, cdef_directions_padded[12][2]) = {
+  /* Padding: cdef_directions[6] */
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+  /* Padding: cdef_directions[7] */
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 },
 
-/* Generated from gen_filter_tables.c. */
-DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
+  /* Begin cdef_directions */
   { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
   { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
   { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
@@ -26,9 +36,17 @@ DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
   { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
-  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 },
+  /* End cdef_directions */
+
+  /* Padding: cdef_directions[0] */
+  { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+  /* Padding: cdef_directions[1] */
+  { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
 };
 
+const int (*const cdef_directions)[2] = cdef_directions_padded + 2;
+
 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
    The search minimizes the weighted variance along all the lines in a
    particular direction, i.e. the squared error between the input and a
@@ -107,52 +125,73 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
   return best_dir;
 }
 
+void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2,
+                          int stride, int32_t *var1, int32_t *var2,
+                          int coeff_shift, int *out1, int *out2) {
+  *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift);
+  *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift);
+}
+
 const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
 const int cdef_sec_taps[2] = { 2, 1 };
 
 /* Smooth in the direction detected. */
-void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
-                         const uint16_t *in, int pri_strength, int sec_strength,
-                         int dir, int pri_damping, int sec_damping, int bsize,
-                         int coeff_shift) {
+static void cdef_filter_block_internal(
+    uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in,
+    int pri_strength, int sec_strength, int dir, int pri_damping,
+    int sec_damping, int coeff_shift, int block_width, int block_height,
+    int enable_primary, int enable_secondary) {
+  const int clipping_required = (enable_primary && enable_secondary);
   int i, j, k;
   const int s = CDEF_BSTRIDE;
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
   const int *sec_taps = cdef_sec_taps;
-  for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
-    for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
+  for (i = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
       int16_t sum = 0;
       int16_t y;
       int16_t x = in[i * s + j];
       int max = x;
       int min = x;
       for (k = 0; k < 2; k++) {
-        int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
-        int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
-        sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
-        sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
-        if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
-        if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
-        min = AOMMIN(p0, min);
-        min = AOMMIN(p1, min);
-        int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
-        int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
-        int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
-        int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
-        if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
-        if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
-        if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
-        if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
-        min = AOMMIN(s0, min);
-        min = AOMMIN(s1, min);
-        min = AOMMIN(s2, min);
-        min = AOMMIN(s3, min);
-        sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
-        sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
-        sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
-        sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+        if (enable_primary) {
+          int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+          int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+          sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+          sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+          if (clipping_required) {
+            if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+            if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+            min = AOMMIN(p0, min);
+            min = AOMMIN(p1, min);
+          }
+        }
+        if (enable_secondary) {
+          int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]];
+          int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]];
+          int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]];
+          int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]];
+          if (clipping_required) {
+            if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+            if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+            if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+            if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+            min = AOMMIN(s0, min);
+            min = AOMMIN(s1, min);
+            min = AOMMIN(s2, min);
+            min = AOMMIN(s3, min);
+          }
+          sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+          sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+          sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+          sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+        }
+      }
+      y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4));
+      if (clipping_required) {
+        y = clamp(y, min, max);
       }
-      y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
+
       if (dst8)
         dst8[i * dstride + j] = (uint8_t)y;
       else
@@ -161,6 +200,86 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
   }
 }
 
+void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in,
+                       int pri_strength, int sec_strength, int dir,
+                       int pri_damping, int sec_damping, int coeff_shift,
+                       int block_width, int block_height) {
+  cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/1, /*enable_secondary=*/1);
+}
+
+void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in,
+                       int pri_strength, int sec_strength, int dir,
+                       int pri_damping, int sec_damping, int coeff_shift,
+                       int block_width, int block_height) {
+  cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/1, /*enable_secondary=*/0);
+}
+
+void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in,
+                       int pri_strength, int sec_strength, int dir,
+                       int pri_damping, int sec_damping, int coeff_shift,
+                       int block_width, int block_height) {
+  cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/0, /*enable_secondary=*/1);
+}
+
+void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in,
+                       int pri_strength, int sec_strength, int dir,
+                       int pri_damping, int sec_damping, int coeff_shift,
+                       int block_width, int block_height) {
+  cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/0, /*enable_secondary=*/0);
+}
+
+void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in,
+                        int pri_strength, int sec_strength, int dir,
+                        int pri_damping, int sec_damping, int coeff_shift,
+                        int block_width, int block_height) {
+  cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/1, /*enable_secondary=*/1);
+}
+
+void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in,
+                        int pri_strength, int sec_strength, int dir,
+                        int pri_damping, int sec_damping, int coeff_shift,
+                        int block_width, int block_height) {
+  cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/1, /*enable_secondary=*/0);
+}
+
+void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in,
+                        int pri_strength, int sec_strength, int dir,
+                        int pri_damping, int sec_damping, int coeff_shift,
+                        int block_width, int block_height) {
+  cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/0, /*enable_secondary=*/1);
+}
+
+void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in,
+                        int pri_strength, int sec_strength, int dir,
+                        int pri_damping, int sec_damping, int coeff_shift,
+                        int block_width, int block_height) {
+  cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+                             sec_strength, dir, pri_damping, sec_damping,
+                             coeff_shift, block_width, block_height,
+                             /*enable_primary=*/0, /*enable_secondary=*/0);
+}
+
 /* Compute the primary filter strength for an 8x8 block based on the
    directional variance difference. A high variance difference means
    that we have a highly directional pattern (e.g. a high contrast
@@ -173,6 +292,34 @@ static INLINE int adjust_strength(int strength, int32_t var) {
   return var ? (strength * (4 + i) + 8) >> 4 : 0;
 }
 
+static AOM_INLINE void aom_cdef_find_dir(uint16_t *in, cdef_list *dlist,
+                                         int var[CDEF_NBLOCKS][CDEF_NBLOCKS],
+                                         int cdef_count, int coeff_shift,
+                                         int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
+  int bi;
+
+  // Find direction of two 8x8 blocks together.
+  for (bi = 0; bi < cdef_count - 1; bi += 2) {
+    const int by = dlist[bi].by;
+    const int bx = dlist[bi].bx;
+    const int by2 = dlist[bi + 1].by;
+    const int bx2 = dlist[bi + 1].bx;
+    const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx;
+    const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2;
+    cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx],
+                       &var[by2][bx2], coeff_shift, &dir[by][bx],
+                       &dir[by2][bx2]);
+  }
+
+  // Process remaining 8x8 blocks here. One 8x8 at a time.
+  if (cdef_count % 2) {
+    const int by = dlist[bi].by;
+    const int bx = dlist[bi].bx;
+    dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
+                                CDEF_BSTRIDE, &var[by][bx], coeff_shift);
+  }
+}
+
 void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
                         uint16_t *in, int xdec, int ydec,
                         int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
@@ -207,12 +354,7 @@ void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
 
   if (pli == 0) {
     if (!dirinit || !*dirinit) {
-      for (bi = 0; bi < cdef_count; bi++) {
-        by = dlist[bi].by;
-        bx = dlist[bi].bx;
-        dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
-                                    CDEF_BSTRIDE, &var[by][bx], coeff_shift);
-      }
+      aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
       if (dirinit) *dirinit = 1;
     }
   }
@@ -226,28 +368,59 @@ void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
     }
   }
 
-  const int bsize =
-      ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
-  const int t = pri_strength;
-  const int s = sec_strength;
-  for (bi = 0; bi < cdef_count; bi++) {
-    by = dlist[bi].by;
-    bx = dlist[bi].bx;
-    if (dst8) {
-      cdef_filter_block(
-          &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride,
-          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
-          (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          damping, damping, bsize, coeff_shift);
-    } else {
-      cdef_filter_block(
-          NULL,
+  if (dst8) {
+    const int block_width = 8 >> xdec;
+    const int block_height = 8 >> ydec;
+    /*
+     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
+     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
+     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
+     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
+     */
+    const cdef_filter_block_func cdef_filter_fn[4] = {
+      cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3
+    };
+
+    for (bi = 0; bi < cdef_count; bi++) {
+      by = dlist[bi].by;
+      bx = dlist[bi].bx;
+      const int t =
+          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
+      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
+
+      cdef_filter_fn[strength_index](
+          &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride,
+          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
+          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
+          coeff_shift, block_width, block_height);
+    }
+  } else {
+    const int block_width = 8 >> xdec;
+    const int block_height = 8 >> ydec;
+    /*
+     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
+     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
+     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
+     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
+     */
+    const cdef_filter_block_func cdef_filter_fn[4] = {
+      cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3
+    };
+
+    for (bi = 0; bi < cdef_count; bi++) {
+      by = dlist[bi].by;
+      bx = dlist[bi].bx;
+      const int t =
+          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
+      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
+
+      cdef_filter_fn[strength_index](
           &dst16[dirinit ? bi << (bw_log2 + bh_log2)
                          : (by << bh_log2) * dstride + (bx << bw_log2)],
           dirinit ? 1 << bw_log2 : dstride,
-          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
-          (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          damping, damping, bsize, coeff_shift);
+          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
+          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
+          coeff_shift, block_width, block_height);
     }
   }
 }
diff --git a/media/libaom/src/av1/common/cdef_block.h b/media/libaom/src/av1/common/cdef_block.h
index 6b0ae0a9db..679f1ef2ab 100644
--- a/media/libaom/src/av1/common/cdef_block.h
+++ b/media/libaom/src/av1/common/cdef_block.h
@@ -12,40 +12,41 @@
 #ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
 #define AOM_AV1_COMMON_CDEF_BLOCK_H_
 
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 #define CDEF_BLOCKSIZE 64
 #define CDEF_BLOCKSIZE_LOG2 6
 #define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
 #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
 
-/* We need to buffer three vertical lines. */
-#define CDEF_VBORDER (3)
+/* We need to buffer two vertical lines. */
+#define CDEF_VBORDER (2)
 /* We only need to buffer three horizontal pixels too, but let's align to
    16 bytes (8 x 16 bits) to make vectorization easier. */
 #define CDEF_HBORDER (8)
 #define CDEF_BSTRIDE \
   ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
 
-#define CDEF_VERY_LARGE (30000)
+#define CDEF_VERY_LARGE (0x4000)
 #define CDEF_INBUF_SIZE \
   (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
 
 extern const int cdef_pri_taps[2][2];
 extern const int cdef_sec_taps[2];
-DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
+extern const int (*const cdef_directions)[2];
 
 typedef struct {
   uint8_t by;
   uint8_t bx;
 } cdef_list;
 
-typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
-                                       int dstride, const uint16_t *in,
-                                       int pri_strength, int sec_strength,
-                                       int dir, int pri_damping,
-                                       int sec_damping, int bsize,
-                                       int coeff_shift);
+typedef void (*cdef_filter_block_func)(void *dest, int dstride,
+                                       const uint16_t *in, int pri_strength,
+                                       int sec_strength, int dir,
+                                       int pri_damping, int sec_damping,
+                                       int coeff_shift, int block_width,
+                                       int block_height);
+
 void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
                               cdef_list *dlist, int cdef_count, int bsize);
 
diff --git a/media/libaom/src/av1/common/cdef_block_simd.h b/media/libaom/src/av1/common/cdef_block_simd.h
index 5a52bc1e4f..73119e2faf 100644
--- a/media/libaom/src/av1/common/cdef_block_simd.h
+++ b/media/libaom/src/av1/common/cdef_block_simd.h
@@ -208,692 +208,609 @@ SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
   return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
 }
 
-// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
-SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
-                           unsigned int adjdamp) {
-  const v256 diff16 = v256_sub_16(a, b);
-  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
-  const v128 sign = v128_cmplt_s8(diff, v128_zero());
-  diff = v128_abs_s8(diff);
-  return v128_xor(
-      v128_add_8(sign,
-                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
-                                                v128_shr_u8(diff, adjdamp)))),
-      sign);
-}
-
-void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
-                                        const uint16_t *in, int pri_strength,
-                                        int sec_strength, int dir,
-                                        int pri_damping, int sec_damping,
-                                        int coeff_shift) {
-  v128 p0, p1, p2, p3;
-  v256 sum, row, tap, res;
-  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-  int po1 = cdef_directions[dir][0];
-  int po2 = cdef_directions[dir][1];
-  int s1o1 = cdef_directions[(dir + 2) & 7][0];
-  int s1o2 = cdef_directions[(dir + 2) & 7][1];
-  int s2o1 = cdef_directions[(dir + 6) & 7][0];
-  int s2o2 = cdef_directions[(dir + 6) & 7][1];
-
-  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps;
-
-  if (pri_strength)
-    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
-  if (sec_strength)
-    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
-
-  sum = v256_zero();
-  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
-                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
-                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
-                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
-  max = min = row;
-
-  if (pri_strength) {
-    // Primary near taps
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[0] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-
-    // Primary far taps
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[1] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-  }
-
-  if (sec_strength) {
-    // Secondary near taps
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p2 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p3 = constrain(tap, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    p0 = v128_add_8(p0, p1);
-    p2 = v128_add_8(p2, p3);
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
-                                         v256_from_v128(v128_ziphi_8(p0, p2),
-                                                        v128_ziplo_8(p0, p2))));
-
-    // Secondary far taps
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p2 = constrain(tap, row, sec_strength, sec_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p3 = constrain(tap, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    p0 = v128_add_8(p0, p1);
-    p2 = v128_add_8(p2, p3);
-
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
-                                         v256_from_v128(v128_ziphi_8(p0, p2),
-                                                        v128_ziplo_8(p0, p2))));
+SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max,
+                                 v256 cdef_large_value_mask) {
+  if (is_lowbd) {
+    v256 max_u8;
+    max_u8 = tap[0];
+    max_u8 = v256_max_u8(max_u8, tap[1]);
+    max_u8 = v256_max_u8(max_u8, tap[2]);
+    max_u8 = v256_max_u8(max_u8, tap[3]);
+    /* The source is 16 bits, however, we only really care about the lower
+    8 bits.  The upper 8 bits contain the "large" flag.  After the final
+    primary max has been calculated, zero out the upper 8 bits.  Use this
+    to find the "16 bit" max. */
+    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
+  } else {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
   }
-
-  // res = row + ((sum - (sum < 0) + 8) >> 4)
-  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
-  res = v256_add_16(sum, v256_dup_16(8));
-  res = v256_shr_n_s16(res, 4);
-  res = v256_add_16(row, res);
-  res = v256_min_s16(v256_max_s16(res, min), max);
-  res = v256_pack_s16_u8(res, res);
-
-  p0 = v256_low_v128(res);
-  u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
-  u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
-  u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
-  u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
+  return max;
 }
 
-void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
-                                        const uint16_t *in, int pri_strength,
-                                        int sec_strength, int dir,
-                                        int pri_damping, int sec_damping,
-                                        int coeff_shift) {
-  int i;
-  v128 p0, p1, p2, p3;
-  v256 sum, row, res, tap;
-  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-  int po1 = cdef_directions[dir][0];
-  int po2 = cdef_directions[dir][1];
-  int s1o1 = cdef_directions[(dir + 2) & 7][0];
-  int s1o2 = cdef_directions[(dir + 2) & 7][1];
-  int s2o1 = cdef_directions[(dir + 6) & 7][0];
-  int s2o2 = cdef_directions[(dir + 6) & 7][1];
-
-  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
-  const int *sec_taps = cdef_sec_taps;
-
-  if (pri_strength)
-    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
-  if (sec_strength)
-    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
-  for (i = 0; i < 8; i += 2) {
-    sum = v256_zero();
-    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
-                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
-
-    max = min = row;
-    // Primary near taps
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[0] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-
-    // Primary far taps
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[1] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-
-    // Secondary near taps
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p2 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p3 = constrain(tap, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    p0 = v128_add_8(p0, p1);
-    p2 = v128_add_8(p2, p3);
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
-                                         v256_from_v128(v128_ziphi_8(p0, p2),
-                                                        v128_ziplo_8(p0, p2))));
-
-    // Secondary far taps
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p0 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p1 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p2 = constrain(tap, row, sec_strength, sec_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-    p3 = constrain(tap, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    p0 = v128_add_8(p0, p1);
-    p2 = v128_add_8(p2, p3);
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
-                                         v256_from_v128(v128_ziphi_8(p0, p2),
-                                                        v128_ziplo_8(p0, p2))));
-
-    // res = row + ((sum - (sum < 0) + 8) >> 4)
-    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
-    res = v256_add_16(sum, v256_dup_16(8));
-    res = v256_shr_n_s16(res, 4);
-    res = v256_add_16(row, res);
-    res = v256_min_s16(v256_max_s16(res, min), max);
-    res = v256_pack_s16_u8(res, res);
-
-    p0 = v256_low_v128(res);
-    v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
-    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
+SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max,
+                                   v256 cdef_large_value_mask) {
+  if (is_lowbd) {
+    v256 max_u8;
+    max_u8 = tap[0];
+    max_u8 = v256_max_u8(max_u8, tap[1]);
+    max_u8 = v256_max_u8(max_u8, tap[2]);
+    max_u8 = v256_max_u8(max_u8, tap[3]);
+    max_u8 = v256_max_u8(max_u8, tap[4]);
+    max_u8 = v256_max_u8(max_u8, tap[5]);
+    max_u8 = v256_max_u8(max_u8, tap[6]);
+    max_u8 = v256_max_u8(max_u8, tap[7]);
+    /* The source is 16 bits, however, we only really care about the lower
+    8 bits.  The upper 8 bits contain the "large" flag.  After the final
+    primary max has been calculated, zero out the upper 8 bits.  Use this
+    to find the "16 bit" max. */
+    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
+  } else {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask));
+    max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask));
   }
+  return max;
 }
 
-void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
-                                         const uint16_t *in, int pri_strength,
-                                         int sec_strength, int dir,
-                                         int pri_damping, int sec_damping,
-                                         int coeff_shift) {
-  int i;
-  v256 p0, p1, p2, p3, sum, row, res;
-  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-  int po1 = cdef_directions[dir][0];
-  int po2 = cdef_directions[dir][1];
-  int s1o1 = cdef_directions[(dir + 2) & 7][0];
-  int s1o2 = cdef_directions[(dir + 2) & 7][1];
-  int s2o1 = cdef_directions[(dir + 6) & 7][0];
-  int s2o2 = cdef_directions[(dir + 6) & 7][1];
-
+SIMD_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
+                                  const uint16_t *in, int pri_strength,
+                                  int sec_strength, int dir, int pri_damping,
+                                  int sec_damping, int coeff_shift, int height,
+                                  int enable_primary, int enable_secondary) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  const int clipping_required = enable_primary && enable_secondary;
+  v256 p0, p1, p2, p3;
+  v256 sum, row, res;
+  v256 max, min;
+  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
   const int *sec_taps = cdef_sec_taps;
+  int i;
 
-  if (pri_strength)
+  if (enable_primary && pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
-  if (sec_strength)
+  if (enable_secondary && sec_strength)
     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
-  for (i = 0; i < 4; i += 4) {
+
+  for (i = 0; i < height; i += 4) {
     sum = v256_zero();
-    row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+    row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
                         v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
                         v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
                         v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
-    min = max = row;
+    max = min = row;
 
-    // Primary near taps
-    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
-    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    min = v256_min_s16(v256_min_s16(min, p0), p1);
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[0] * (p0 + p1)
-    sum = v256_add_16(
-        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
-
-    // Primary far taps
-    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
-    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    min = v256_min_s16(v256_min_s16(min, p0), p1);
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[1] * (p0 + p1)
-    sum = v256_add_16(
-        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
-
-    // Secondary near taps
-    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
-    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
-    p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
-    p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
-                     v256_andn(p3, v256_cmpeq_16(p3, large)));
-    min = v256_min_s16(
-        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
-    p0 = constrain16(p0, row, sec_strength, sec_damping);
-    p1 = constrain16(p1, row, sec_strength, sec_damping);
-    p2 = constrain16(p2, row, sec_strength, sec_damping);
-    p3 = constrain16(p3, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
-                                          v256_add_16(v256_add_16(p0, p1),
-                                                      v256_add_16(p2, p3))));
-
-    // Secondary far taps
-    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
-    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
-    p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
-    p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
-                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
-                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
-                     v256_andn(p3, v256_cmpeq_16(p3, large)));
-    min = v256_min_s16(
-        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
-    p0 = constrain16(p0, row, sec_strength, sec_damping);
-    p1 = constrain16(p1, row, sec_strength, sec_damping);
-    p2 = constrain16(p2, row, sec_strength, sec_damping);
-    p3 = constrain16(p3, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
-                                          v256_add_16(v256_add_16(p0, p1),
-                                                      v256_add_16(p2, p3))));
+    if (enable_primary) {
+      v256 tap[4];
+      // Primary near taps
+      tap[0] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
+      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
+      tap[1] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
+      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
+
+      // sum += pri_taps[0] * (p0 + p1)
+      sum = v256_add_16(
+          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+      // Primary far taps
+      tap[2] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
+      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
+      tap[3] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
+      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
+
+      // sum += pri_taps[1] * (p0 + p1)
+      sum = v256_add_16(
+          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+      if (clipping_required) {
+        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = v256_min_s16(min, tap[0]);
+        min = v256_min_s16(min, tap[1]);
+        min = v256_min_s16(min, tap[2]);
+        min = v256_min_s16(min, tap[3]);
+      }
+    }
+
+    if (enable_secondary) {
+      v256 tap[8];
+      // Secondary near taps
+      tap[0] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
+      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
+      tap[1] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
+      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
+      tap[2] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
+      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
+      tap[3] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
+      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
+
+      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+                                            v256_add_16(v256_add_16(p0, p1),
+                                                        v256_add_16(p2, p3))));
+
+      // Secondary far taps
+      tap[4] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
+      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
+      tap[5] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
+      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
+      tap[6] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
+      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
+      tap[7] =
+          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
+      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
+
+      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+                                            v256_add_16(v256_add_16(p0, p1),
+                                                        v256_add_16(p2, p3))));
+
+      if (clipping_required) {
+        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = v256_min_s16(min, tap[0]);
+        min = v256_min_s16(min, tap[1]);
+        min = v256_min_s16(min, tap[2]);
+        min = v256_min_s16(min, tap[3]);
+        min = v256_min_s16(min, tap[4]);
+        min = v256_min_s16(min, tap[5]);
+        min = v256_min_s16(min, tap[6]);
+        min = v256_min_s16(min, tap[7]);
+      }
+    }
 
     // res = row + ((sum - (sum < 0) + 8) >> 4)
     sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
     res = v256_add_16(sum, v256_dup_16(8));
     res = v256_shr_n_s16(res, 4);
     res = v256_add_16(row, res);
-    res = v256_min_s16(v256_max_s16(res, min), max);
-
-    v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res)));
-    v64_store_aligned(&dst[(i + 1) * dstride],
-                      v128_low_v64(v256_high_v128(res)));
-    v64_store_aligned(&dst[(i + 2) * dstride],
-                      v128_high_v64(v256_low_v128(res)));
-    v64_store_aligned(&dst[(i + 3) * dstride],
-                      v128_low_v64(v256_low_v128(res)));
+    if (clipping_required) {
+      res = v256_min_s16(v256_max_s16(res, min), max);
+    }
+
+    if (is_lowbd) {
+      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
+      u32_store_aligned(&dst8[(i + 0) * dstride],
+                        v64_high_u32(v128_high_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 1) * dstride],
+                        v64_low_u32(v128_high_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 2) * dstride],
+                        v64_high_u32(v128_low_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 3) * dstride],
+                        v64_low_u32(v128_low_v64(res_128)));
+    } else {
+      v64_store_aligned(&dst16[(i + 0) * dstride],
+                        v128_high_v64(v256_high_v128(res)));
+      v64_store_aligned(&dst16[(i + 1) * dstride],
+                        v128_low_v64(v256_high_v128(res)));
+      v64_store_aligned(&dst16[(i + 2) * dstride],
+                        v128_high_v64(v256_low_v128(res)));
+      v64_store_aligned(&dst16[(i + 3) * dstride],
+                        v128_low_v64(v256_low_v128(res)));
+    }
   }
 }
 
-void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
-                                         const uint16_t *in, int pri_strength,
-                                         int sec_strength, int dir,
-                                         int pri_damping, int sec_damping,
-                                         int coeff_shift) {
+SIMD_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
+                                  const uint16_t *in, int pri_strength,
+                                  int sec_strength, int dir, int pri_damping,
+                                  int sec_damping, int coeff_shift, int height,
+                                  int enable_primary, int enable_secondary) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  const int clipping_required = enable_primary && enable_secondary;
   int i;
   v256 sum, p0, p1, p2, p3, row, res;
-  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-  int po1 = cdef_directions[dir][0];
-  int po2 = cdef_directions[dir][1];
-  int s1o1 = cdef_directions[(dir + 2) & 7][0];
-  int s1o2 = cdef_directions[(dir + 2) & 7][1];
-  int s2o1 = cdef_directions[(dir + 6) & 7][0];
-  int s2o2 = cdef_directions[(dir + 6) & 7][1];
-
+  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
+  v256 max, min;
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
   const int *sec_taps = cdef_sec_taps;
 
-  if (pri_strength)
+  if (enable_primary && pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
-  if (sec_strength)
+  if (enable_secondary && sec_strength)
     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
 
-  for (i = 0; i < 8; i += 2) {
+  for (i = 0; i < height; i += 2) {
+    v256 tap[8];
     sum = v256_zero();
     row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                          v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
 
     min = max = row;
-    // Primary near taps
-    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
-                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
-    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
-                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    min = v256_min_s16(v256_min_s16(min, p0), p1);
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[0] * (p0 + p1)
-    sum = v256_add_16(
-        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
-
-    // Primary far taps
-    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
-                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
-    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
-                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    min = v256_min_s16(v256_min_s16(min, p0), p1);
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[1] * (p0 + p1)
-    sum = v256_add_16(
-        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
-
-    // Secondary near taps
-    p0 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
-    p1 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
-    p2 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
-    p3 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
-                     v256_andn(p3, v256_cmpeq_16(p3, large)));
-    min = v256_min_s16(
-        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
-    p0 = constrain16(p0, row, sec_strength, sec_damping);
-    p1 = constrain16(p1, row, sec_strength, sec_damping);
-    p2 = constrain16(p2, row, sec_strength, sec_damping);
-    p3 = constrain16(p3, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
-                                          v256_add_16(v256_add_16(p0, p1),
-                                                      v256_add_16(p2, p3))));
-
-    // Secondary far taps
-    p0 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
-    p1 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
-    p2 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
-    p3 =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
-                     v256_andn(p1, v256_cmpeq_16(p1, large)));
-    max =
-        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
-                     v256_andn(p3, v256_cmpeq_16(p3, large)));
-    min = v256_min_s16(
-        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
-    p0 = constrain16(p0, row, sec_strength, sec_damping);
-    p1 = constrain16(p1, row, sec_strength, sec_damping);
-    p2 = constrain16(p2, row, sec_strength, sec_damping);
-    p3 = constrain16(p3, row, sec_strength, sec_damping);
-
-    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
-                                          v256_add_16(v256_add_16(p0, p1),
-                                                      v256_add_16(p2, p3))));
+    if (enable_primary) {
+      // Primary near taps
+      tap[0] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+      tap[1] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
+      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
+
+      // sum += pri_taps[0] * (p0 + p1)
+      sum = v256_add_16(
+          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+      // Primary far taps
+      tap[2] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+      tap[3] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
+      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
+
+      // sum += pri_taps[1] * (p0 + p1)
+      sum = v256_add_16(
+          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+      if (clipping_required) {
+        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = v256_min_s16(min, tap[0]);
+        min = v256_min_s16(min, tap[1]);
+        min = v256_min_s16(min, tap[2]);
+        min = v256_min_s16(min, tap[3]);
+      }
+      // End primary
+    }
+
+    if (enable_secondary) {
+      // Secondary near taps
+      tap[0] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+      tap[1] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+      tap[2] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+      tap[3] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
+      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
+      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
+      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
+
+      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+                                            v256_add_16(v256_add_16(p0, p1),
+                                                        v256_add_16(p2, p3))));
+
+      // Secondary far taps
+      tap[4] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+      tap[5] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+      tap[6] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+      tap[7] = v256_from_v128(
+          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
+      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
+      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
+      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
+
+      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+                                            v256_add_16(v256_add_16(p0, p1),
+                                                        v256_add_16(p2, p3))));
+
+      if (clipping_required) {
+        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+        min = v256_min_s16(min, tap[0]);
+        min = v256_min_s16(min, tap[1]);
+        min = v256_min_s16(min, tap[2]);
+        min = v256_min_s16(min, tap[3]);
+        min = v256_min_s16(min, tap[4]);
+        min = v256_min_s16(min, tap[5]);
+        min = v256_min_s16(min, tap[6]);
+        min = v256_min_s16(min, tap[7]);
+      }
+      // End secondary
+    }
 
     // res = row + ((sum - (sum < 0) + 8) >> 4)
     sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
     res = v256_add_16(sum, v256_dup_16(8));
     res = v256_shr_n_s16(res, 4);
     res = v256_add_16(row, res);
-    res = v256_min_s16(v256_max_s16(res, min), max);
-    v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
-    v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
+    if (clipping_required) {
+      res = v256_min_s16(v256_max_s16(res, min), max);
+    }
+
+    if (is_lowbd) {
+      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
+      v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128));
+      v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128));
+    } else {
+      v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res));
+      v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res));
+    }
   }
 }
 
-void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
-                                  const uint16_t *in, int pri_strength,
-                                  int sec_strength, int dir, int pri_damping,
-                                  int sec_damping, int bsize, int coeff_shift) {
-  if (dst8) {
-    if (bsize == BLOCK_8X8) {
-      SIMD_FUNC(cdef_filter_block_8x8_8)
-      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-    } else if (bsize == BLOCK_4X8) {
-      SIMD_FUNC(cdef_filter_block_4x4_8)
-      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-      SIMD_FUNC(cdef_filter_block_4x4_8)
-      (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
-       sec_strength, dir, pri_damping, sec_damping, coeff_shift);
-    } else if (bsize == BLOCK_8X4) {
-      SIMD_FUNC(cdef_filter_block_4x4_8)
-      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-      SIMD_FUNC(cdef_filter_block_4x4_8)
-      (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
+SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride,
+                                const uint16_t *in, int height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  int i;
+  for (i = 0; i < height; i += 4) {
+    const v128 row0 =
+        v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+    const v128 row1 =
+        v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
+    if (is_lowbd) {
+      /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
+      const v128 res_128 = v128_pack_s16_u8(row1, row0);
+      u32_store_aligned(&dst8[(i + 0) * dstride],
+                        v64_high_u32(v128_low_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 1) * dstride],
+                        v64_low_u32(v128_low_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 2) * dstride],
+                        v64_high_u32(v128_high_v64(res_128)));
+      u32_store_aligned(&dst8[(i + 3) * dstride],
+                        v64_low_u32(v128_high_v64(res_128)));
     } else {
-      SIMD_FUNC(cdef_filter_block_4x4_8)
-      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
+      v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0));
+      v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0));
+      v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1));
+      v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1));
     }
-  } else {
-    if (bsize == BLOCK_8X8) {
-      SIMD_FUNC(cdef_filter_block_8x8_16)
-      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-    } else if (bsize == BLOCK_4X8) {
-      SIMD_FUNC(cdef_filter_block_4x4_16)
-      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-      SIMD_FUNC(cdef_filter_block_4x4_16)
-      (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
-       sec_strength, dir, pri_damping, sec_damping, coeff_shift);
-    } else if (bsize == BLOCK_8X4) {
-      SIMD_FUNC(cdef_filter_block_4x4_16)
-      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
-      SIMD_FUNC(cdef_filter_block_4x4_16)
-      (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
+  }
+}
+
+SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride,
+                                const uint16_t *in, int height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  uint16_t *dst16 = (uint16_t *)dest;
+  int i;
+  for (i = 0; i < height; i += 2) {
+    const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
+    const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]);
+    if (is_lowbd) {
+      /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
+      const v128 res_128 = v128_pack_s16_u8(row1, row0);
+      v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128));
+      v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128));
     } else {
-      assert(bsize == BLOCK_4X4);
-      SIMD_FUNC(cdef_filter_block_4x4_16)
-      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-       sec_damping, coeff_shift);
+      v128_store_unaligned(&dst16[i * dstride], row0);
+      v128_store_unaligned(&dst16[(i + 1) * dstride], row1);
     }
   }
 }
 
-void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
-                                              const uint8_t *src, int sstride,
-                                              int v, int h) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < (h & ~0x7); j += 8) {
-      v64 row = v64_load_unaligned(&src[i * sstride + j]);
-      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
-    }
-    for (; j < h; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
-    }
+void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  }
+}
+void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/1, dst8, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  uint8_t *dst8 = (uint8_t *)dest;
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+  (void)block_width;
+
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/1, dst8, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/1, dst8, dstride, in, block_height);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  uint16_t *dst16 = (uint16_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  uint16_t *dst16 = (uint16_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+  }
+}
+void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  uint16_t *dst16 = (uint16_t *)dest;
+  if (block_width == 8) {
+    filter_block_8x8(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  } else {
+    filter_block_4x4(/*is_lowbd=*/0, dst16, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  uint16_t *dst16 = (uint16_t *)dest;
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+  (void)block_width;
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/0, dst16, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/0, dst16, dstride, in, block_height);
   }
 }
 
diff --git a/media/libaom/src/av1/common/cdef_block_ssse3.c b/media/libaom/src/av1/common/cdef_block_ssse3.c
deleted file mode 100644
index 3a93b150f3..0000000000
--- a/media/libaom/src/av1/common/cdef_block_ssse3.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_ssse3
-#include "av1/common/cdef_block_simd.h"
diff --git a/media/libaom/src/av1/common/cfl.h b/media/libaom/src/av1/common/cfl.h
index a1d6dc2eaa..0d53764f28 100644
--- a/media/libaom/src/av1/common/cfl.h
+++ b/media/libaom/src/av1/common/cfl.h
@@ -18,7 +18,7 @@
 // Can we use CfL for the current block?
 static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   if (xd->lossless[mbmi->segment_id]) {
     // In lossless, CfL is available when the partition size is equal to the
@@ -39,7 +39,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
                                                   const MACROBLOCKD *xd) {
   const MB_MODE_INFO *mbmi = xd->mi[0];
 
-  if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
 
   if (!xd->is_chroma_ref) {
     // For non-chroma-reference blocks, we should always store the luma pixels,
diff --git a/media/libaom/src/av1/common/common.h b/media/libaom/src/av1/common/common.h
index bed6083db2..ccb45b68ce 100644
--- a/media/libaom/src/av1/common/common.h
+++ b/media/libaom/src/av1/common/common.h
@@ -26,21 +26,19 @@
 extern "C" {
 #endif
 
-#define PI 3.141592653589793238462643383279502884
-
 // Only need this for fixed-size arrays, for structs just assign.
 #define av1_copy(dest, src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(dest) == sizeof(src)); \
     memcpy(dest, src, sizeof(src));      \
-  }
+  } while (0)
 
 // Use this for variably-sized arrays.
 #define av1_copy_array(dest, src, n)           \
-  {                                            \
+  do {                                         \
     assert(sizeof(*(dest)) == sizeof(*(src))); \
     memcpy(dest, src, n * sizeof(*(src)));     \
-  }
+  } while (0)
 
 #define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
 #define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
@@ -50,7 +48,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 }
 
 #define CHECK_MEM_ERROR(cm, lval, expr) \
-  AOM_CHECK_MEM_ERROR(&cm->error, lval, expr)
+  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
 
 #define AOM_FRAME_MARKER 0x2
 
diff --git a/media/libaom/src/av1/common/common_data.h b/media/libaom/src/av1/common/common_data.h
index 402845cafe..6ab7af4174 100644
--- a/media/libaom/src/av1/common/common_data.h
+++ b/media/libaom/src/av1/common/common_data.h
@@ -257,11 +257,21 @@ static const int tx_size_wide_log2[TX_SIZES_ALL] = {
   2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
 };
 
+// Transform block width in log2 unit
+static const int tx_size_wide_unit_log2[TX_SIZES_ALL] = {
+  0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4,
+};
+
 // Transform block height in log2
 static const int tx_size_high_log2[TX_SIZES_ALL] = {
   2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
 };
 
+// Transform block height in log2 unit
+static const int tx_size_high_unit_log2[TX_SIZES_ALL] = {
+  0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2,
+};
+
 static const int tx_size_2d[TX_SIZES_ALL + 1] = {
   16,  64,   256,  1024, 4096, 32,  32,  128,  128,  512,
   512, 2048, 2048, 64,   64,   256, 256, 1024, 1024,
@@ -434,9 +444,12 @@ static const int intra_mode_context[INTRA_MODES] = {
 static const int quant_dist_weight[4][2] = {
   { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
 };
-static const int quant_dist_lookup_table[2][4][2] = {
-  { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
-  { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+
+static const int quant_dist_lookup_table[4][2] = {
+  { 9, 7 },
+  { 11, 5 },
+  { 12, 4 },
+  { 13, 3 },
 };
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/common/convolve.c b/media/libaom/src/av1/common/convolve.c
index e177e3cad3..63dda39daa 100644
--- a/media/libaom/src/av1/common/convolve.c
+++ b/media/libaom/src/av1/common/convolve.c
@@ -166,18 +166,9 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
-                         const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
-                         const int subpel_x_qn, const int subpel_y_qn,
-                         ConvolveParams *conv_params) {
+                         const int subpel_y_qn) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -197,14 +188,9 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
-                         const InterpFilterParams *filter_params_y,
-                         const int subpel_x_qn, const int subpel_y_qn,
-                         ConvolveParams *conv_params) {
+                         const int subpel_x_qn, ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-  (void)conv_params;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -226,23 +212,6 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
   }
 }
 
-void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_x,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
-                               ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  for (int y = 0; y < h; ++y) {
-    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
-  }
-}
-
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
@@ -311,9 +280,8 @@ void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
 
 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
+                               const int subpel_y_qn,
                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -325,8 +293,6 @@ void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                            (1 << (offset_bits - conv_params->round_1 - 1));
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -362,8 +328,7 @@ void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_x_qn, const int subpel_y_qn,
+                               const int subpel_x_qn,
                                ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -375,8 +340,6 @@ void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                            (1 << (offset_bits - conv_params->round_1 - 1));
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
@@ -411,10 +374,6 @@ void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
 
 void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w, int h,
-                                     const InterpFilterParams *filter_params_x,
-                                     const InterpFilterParams *filter_params_y,
-                                     const int subpel_x_qn,
-                                     const int subpel_y_qn,
                                      ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
@@ -424,10 +383,6 @@ void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1));
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -552,13 +507,58 @@ static void convolve_2d_scale_wrapper(
                         y_step_qn, conv_params);
 }
 
+static void convolve_2d_facade_compound(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                  conv_params);
+  } else if (need_x && !need_y) {
+    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, subpel_x_qn, conv_params);
+  } else if (!need_x && need_y) {
+    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_y, subpel_y_qn, conv_params);
+  } else {
+    assert(need_y && need_x);
+    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, filter_params_y, subpel_x_qn,
+                             subpel_y_qn, conv_params);
+  }
+}
+
+static void convolve_2d_facade_single(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+  } else if (need_x && !need_y) {
+    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                      subpel_x_qn, conv_params);
+  } else if (!need_x && need_y) {
+    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                      subpel_y_qn);
+  } else {
+    assert(need_x && need_y);
+    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
+  }
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *interp_filters[2],
                             const int subpel_x_qn, int x_step_q4,
                             const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params,
-                            const struct scale_factors *sf) {
+                            ConvolveParams *conv_params) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
@@ -580,13 +580,11 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
       return;
     } else if (subpel_x_qn) {
       av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, filter_params_y, subpel_x_qn,
-                          subpel_y_qn, conv_params);
+                          filter_params_x, subpel_x_qn, conv_params);
       return;
     } else if (subpel_y_qn) {
       av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, filter_params_y, subpel_x_qn,
-                          subpel_y_qn, conv_params);
+                          filter_params_y, subpel_y_qn);
       return;
     }
   }
@@ -595,41 +593,25 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               x_step_q4, subpel_y_qn, y_step_q4, conv_params);
+  } else if (conv_params->is_compound) {
+    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params);
   } else {
-    sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
-        src, src_stride, dst, dst_stride, w, h, filter_params_x,
-        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
+    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                              filter_params_x, filter_params_y, subpel_x_qn,
+                              subpel_y_qn, conv_params);
   }
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-void av1_highbd_convolve_2d_copy_sr_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
-
-  for (int y = 0; y < h; ++y) {
-    memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
-  }
-}
-
 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
-                                const InterpFilterParams *filter_params_y,
-                                const int subpel_x_qn, const int subpel_y_qn,
+                                const int subpel_x_qn,
                                 ConvolveParams *conv_params, int bd) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -653,18 +635,9 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
 
 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
-                                const int subpel_x_qn, const int subpel_y_qn,
-                                ConvolveParams *conv_params, int bd) {
+                                const int subpel_y_qn, int bd) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       filter_params_y, subpel_y_qn & SUBPEL_MASK);
@@ -800,11 +773,12 @@ void av1_highbd_dist_wtd_convolve_2d_c(
   }
 }
 
-void av1_highbd_dist_wtd_convolve_x_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_x,
+                                      const int subpel_x_qn,
+                                      ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -815,8 +789,6 @@ void av1_highbd_dist_wtd_convolve_x_c(
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
-  (void)filter_params_y;
-  (void)subpel_y_qn;
   assert(bits >= 0);
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
@@ -849,11 +821,12 @@ void av1_highbd_dist_wtd_convolve_x_c(
   }
 }
 
-void av1_highbd_dist_wtd_convolve_y_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_y,
+                                      const int subpel_y_qn,
+                                      ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -864,8 +837,6 @@ void av1_highbd_dist_wtd_convolve_y_c(
   const int round_bits =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   assert(round_bits >= 0);
-  (void)filter_params_x;
-  (void)subpel_x_qn;
   assert(bits >= 0);
   // vertical filter
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
@@ -898,11 +869,11 @@ void av1_highbd_dist_wtd_convolve_y_c(
   }
 }
 
-void av1_highbd_dist_wtd_convolve_2d_copy_c(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
+                                            uint16_t *dst, int dst_stride,
+                                            int w, int h,
+                                            ConvolveParams *conv_params,
+                                            int bd) {
   CONV_BUF_TYPE *dst16 = conv_params->dst;
   int dst16_stride = conv_params->dst_stride;
   const int bits =
@@ -911,10 +882,6 @@ void av1_highbd_dist_wtd_convolve_2d_copy_c(
   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1));
   assert(bits >= 0);
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
@@ -1025,13 +992,63 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
   }
 }
 
+static void highbd_convolve_2d_facade_compound(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                         conv_params, bd);
+  } else if (need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params,
+                                   bd);
+  } else if (!need_x && need_y) {
+    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn, conv_params,
+                                   bd);
+  } else {
+    assert(need_x && need_y);
+    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
+  }
+}
+
+static void highbd_convolve_2d_facade_single(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+
+  if (!need_x && !need_y) {
+    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+  } else if (need_x && !need_y) {
+    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, subpel_x_qn, conv_params, bd);
+  } else if (!need_x && need_y) {
+    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_y, subpel_y_qn, bd);
+  } else {
+    assert(need_x && need_y);
+    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                              filter_params_x, filter_params_y, subpel_x_qn,
+                              subpel_y_qn, conv_params, bd);
+  }
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    const InterpFilterParams *interp_filters[2],
                                    const int subpel_x_qn, int x_step_q4,
                                    const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd) {
+                                   int bd) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst_stride;
@@ -1044,8 +1061,8 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
   const InterpFilterParams *filter_params_y =
       need_filter_params_y ? interp_filters[1] : NULL;
 
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   if (scaled) {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
     if (conv_params->is_compound) {
       assert(conv_params->dst != NULL);
     }
@@ -1053,13 +1070,14 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                  filter_params_x, filter_params_y, subpel_x_qn,
                                  x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                  bd);
-  } else {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
-    sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
-                                          0][conv_params->is_compound](
+  } else if (conv_params->is_compound) {
+    highbd_convolve_2d_facade_compound(
         src, src_stride, dst, dst_stride, w, h, filter_params_x,
         filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+  } else {
+    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_x, filter_params_y,
+                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/av1/common/convolve.h b/media/libaom/src/av1/common/convolve.h
index 04df86c42f..5f3e59625b 100644
--- a/media/libaom/src/av1/common/convolve.h
+++ b/media/libaom/src/av1/common/convolve.h
@@ -26,7 +26,6 @@ typedef struct ConvolveParams {
   int round_1;
   int plane;
   int is_compound;
-  int compound_index;  // 0: the first single in compound mode, 1: the second.
   int use_dist_wtd_comp_avg;
   int fwd_offset;
   int bck_offset;
@@ -59,18 +58,17 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             const InterpFilterParams *interp_filters[2],
                             const int subpel_x_qn, int x_step_q4,
                             const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params,
-                            const struct scale_factors *sf);
+                            ConvolveParams *conv_params);
 
 static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
                                                       CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
-  conv_params.compound_index = cmp_index;
   assert(IMPLIES(cmp_index, is_compound));
 
   conv_params.is_compound = is_compound;
+  conv_params.use_dist_wtd_comp_avg = 0;
   conv_params.round_0 = ROUND0_BITS;
   conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
                                     : 2 * FILTER_BITS - conv_params.round_0;
@@ -122,7 +120,7 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    const int subpel_x_qn, int x_step_q4,
                                    const int subpel_y_qn, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   const struct scale_factors *sf, int bd);
+                                   int bd);
 
 // TODO(sarahparker) This will need to be integerized and optimized
 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
diff --git a/media/libaom/src/av1/common/debugmodes.c b/media/libaom/src/av1/common/debugmodes.c
index ff02ddde0b..7e6160f9a5 100644
--- a/media/libaom/src/av1/common/debugmodes.c
+++ b/media/libaom/src/av1/common/debugmodes.c
@@ -17,7 +17,7 @@
 
 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
   fprintf(f, "%s", str);
-  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+  fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
           cm->show_frame, cm->quant_params.base_qindex);
 }
 /* This function dereferences a pointer to the mbmi structure
@@ -52,7 +52,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
   const int rows = mi_params->mi_rows;
   const int cols = mi_params->mi_cols;
 
-  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
@@ -63,7 +63,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
   for (int mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "S ");
     for (int mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[0]->skip);
+      fprintf(mvs, "%2d ", mi[0]->skip_txfm);
       mi++;
     }
     fprintf(mvs, "\n");
diff --git a/media/libaom/src/av1/common/entropy.c b/media/libaom/src/av1/common/entropy.c
index 1f7a0efe08..97d95ea394 100644
--- a/media/libaom/src/av1/common/entropy.c
+++ b/media/libaom/src/av1/common/entropy.c
@@ -130,12 +130,11 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
   RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
   RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
   RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
-  RESET_CDF_COUNTER(fc->skip_cdfs, 2);
+  RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2);
   RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
   reset_nmv_counter(&fc->nmvc);
   reset_nmv_counter(&fc->ndvc);
   RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
-  RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS);
   RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
   RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
   RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
diff --git a/media/libaom/src/av1/common/entropy.h b/media/libaom/src/av1/common/entropy.h
index ee78f56a37..53ef3b1c89 100644
--- a/media/libaom/src/av1/common/entropy.h
+++ b/media/libaom/src/av1/common/entropy.h
@@ -73,6 +73,7 @@ struct AV1Common;
 struct frame_contexts;
 void av1_reset_cdf_symbol_counters(struct frame_contexts *fc);
 void av1_default_coef_probs(struct AV1Common *cm);
+void av1_init_mode_probs(struct frame_contexts *fc);
 
 struct frame_contexts;
 
diff --git a/media/libaom/src/av1/common/entropymode.c b/media/libaom/src/av1/common/entropymode.c
index 5f061be35e..7582d54d48 100644
--- a/media/libaom/src/av1/common/entropymode.c
+++ b/media/libaom/src/av1/common/entropymode.c
@@ -793,7 +793,7 @@ static const aom_cdf_prob
       { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
     };
 
-static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
   { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
 };
 
@@ -850,11 +850,6 @@ static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
   AOM_CDF4(28160, 32120, 32677)
 };
 
-// FIXME(someone) need real defaults here
-static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
-  AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672)
-};
-
 static const aom_cdf_prob
     default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = {
       { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }
@@ -973,10 +968,117 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
   assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
   return color_index_ctx;
 }
+
+int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
+                                         int r, int c, int *color_idx) {
+  assert(r > 0 || c > 0);
+
+  // This goes in the order of left, top, and top-left. This has the advantage
+  // that unless anything here are not distinct or invalid, this will already
+  // be in sorted order. Furthermore, if either of the first two are not
+  // invalid, we know the last one is also invalid.
+  int color_neighbors[NUM_PALETTE_NEIGHBORS];
+  color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
+  color_neighbors[1] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
+  color_neighbors[2] =
+      (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
+
+  // Since our array is so small, using a couple if statements is faster
+  int scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+  if (color_neighbors[0] == color_neighbors[1]) {
+    scores[0] += scores[1];
+    color_neighbors[1] = -1;
+
+    if (color_neighbors[0] == color_neighbors[2]) {
+      scores[0] += scores[2];
+      color_neighbors[2] = -1;
+    }
+  } else if (color_neighbors[0] == color_neighbors[2]) {
+    scores[0] += scores[2];
+    color_neighbors[2] = -1;
+  } else if (color_neighbors[1] == color_neighbors[2]) {
+    scores[1] += scores[2];
+    color_neighbors[2] = -1;
+  }
+
+  int color_rank[NUM_PALETTE_NEIGHBORS] = { -1, -1, -1 };
+  int score_rank[NUM_PALETTE_NEIGHBORS] = { 0, 0, 0 };
+  int num_valid_colors = 0;
+  for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
+    if (color_neighbors[idx] != -1) {
+      score_rank[num_valid_colors] = scores[idx];
+      color_rank[num_valid_colors] = color_neighbors[idx];
+      num_valid_colors++;
+    }
+  }
+
+  // Sort everything
+  // We need to swap the first two elements if they have the same score but
+  // the color indices are not in the right order
+  if (score_rank[0] < score_rank[1] ||
+      (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+    const int tmp_score = score_rank[0];
+    const int tmp_color = color_rank[0];
+    score_rank[0] = score_rank[1];
+    color_rank[0] = color_rank[1];
+    score_rank[1] = tmp_score;
+    color_rank[1] = tmp_color;
+  }
+  if (score_rank[0] < score_rank[2]) {
+    const int tmp_score = score_rank[0];
+    const int tmp_color = color_rank[0];
+    score_rank[0] = score_rank[2];
+    color_rank[0] = color_rank[2];
+    score_rank[2] = tmp_score;
+    color_rank[2] = tmp_color;
+  }
+  if (score_rank[1] < score_rank[2]) {
+    const int tmp_score = score_rank[1];
+    const int tmp_color = color_rank[1];
+    score_rank[1] = score_rank[2];
+    color_rank[1] = color_rank[2];
+    score_rank[2] = tmp_score;
+    color_rank[2] = tmp_color;
+  }
+
+  if (color_idx != NULL) {
+    // If any of the neighbor color has higher index than current color index,
+    // then we move up by 1 unless the current color is the same as one of the
+    // neighbor
+    const int current_color = *color_idx = color_map[r * stride + c];
+    int same_neighbor = -1;
+    for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; idx++) {
+      if (color_rank[idx] > current_color) {
+        (*color_idx)++;
+      } else if (color_rank[idx] == current_color) {
+        same_neighbor = idx;
+      }
+    }
+    if (same_neighbor != -1) {
+      *color_idx = same_neighbor;
+    }
+  }
+
+  // Get hash value of context.
+  int color_index_ctx_hash = 0;
+  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+  for (int idx = 0; idx < NUM_PALETTE_NEIGHBORS; ++idx) {
+    color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+  }
+  assert(color_index_ctx_hash > 0);
+  assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+  // Lookup context from hash.
+  const int color_index_ctx =
+      palette_color_index_context_lookup[color_index_ctx_hash];
+  assert(color_index_ctx >= 0);
+  assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+  return color_index_ctx;
+}
 #undef NUM_PALETTE_NEIGHBORS
 #undef MAX_COLOR_CONTEXT_HASH
 
-static void init_mode_probs(FRAME_CONTEXT *fc) {
+void av1_init_mode_probs(FRAME_CONTEXT *fc) {
   av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
   av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
   av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
@@ -1007,7 +1109,6 @@ static void init_mode_probs(FRAME_CONTEXT *fc) {
   av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
   av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
   av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
-  av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
   av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
   av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
   av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
@@ -1020,7 +1121,7 @@ static void init_mode_probs(FRAME_CONTEXT *fc) {
   av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
   av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
   av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
-  av1_copy(fc->skip_cdfs, default_skip_cdfs);
+  av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs);
   av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
   for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
     av1_copy(fc->seg.spatial_pred_seg_cdf[i],
@@ -1086,9 +1187,10 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
   // Features disabled, 0, with delta coding (Default state).
   av1_clearall_segfeatures(&cm->seg);
 
-  if (cm->cur_frame->seg_map)
+  if (cm->cur_frame->seg_map) {
     memset(cm->cur_frame->seg_map, 0,
-           (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
+           (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+  }
 
   // reset mode ref deltas
   av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
@@ -1096,7 +1198,7 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
   set_default_lf_deltas(&cm->lf);
 
   av1_default_coef_probs(cm);
-  init_mode_probs(cm->fc);
+  av1_init_mode_probs(cm->fc);
   av1_init_mv_probs(cm);
   cm->fc->initialized = 1;
   av1_setup_frame_contexts(cm);
diff --git a/media/libaom/src/av1/common/entropymode.h b/media/libaom/src/av1/common/entropymode.h
index bbbf55dc85..59f249b118 100644
--- a/media/libaom/src/av1/common/entropymode.h
+++ b/media/libaom/src/av1/common/entropymode.h
@@ -121,8 +121,8 @@ typedef struct frame_contexts {
   aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
   nmv_context nmvc;
   nmv_context ndvc;
@@ -205,6 +205,11 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
                                         int r, int c, int palette_size,
                                         uint8_t *color_order, int *color_idx);
 
+// A faster version of av1_get_palette_color_index_context used by the encoder
+// exploiting the fact that the encoder does not need to maintain a color order.
+int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride,
+                                         int r, int c, int *color_idx);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/enums.h b/media/libaom/src/av1/common/enums.h
index 0c09a1bc7a..eb655c9b97 100644
--- a/media/libaom/src/av1/common/enums.h
+++ b/media/libaom/src/av1/common/enums.h
@@ -16,12 +16,17 @@
 
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
 #include "aom_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
+
 #undef MAX_SB_SIZE
 
 // Max superblock size
@@ -167,33 +172,6 @@ typedef char PARTITION_CONTEXT;
 #define PARTITION_BLOCK_SIZES 5
 #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
 
-// block transform size
-enum {
-  TX_4X4,             // 4x4 transform
-  TX_8X8,             // 8x8 transform
-  TX_16X16,           // 16x16 transform
-  TX_32X32,           // 32x32 transform
-  TX_64X64,           // 64x64 transform
-  TX_4X8,             // 4x8 transform
-  TX_8X4,             // 8x4 transform
-  TX_8X16,            // 8x16 transform
-  TX_16X8,            // 16x8 transform
-  TX_16X32,           // 16x32 transform
-  TX_32X16,           // 32x16 transform
-  TX_32X64,           // 32x64 transform
-  TX_64X32,           // 64x32 transform
-  TX_4X16,            // 4x16 transform
-  TX_16X4,            // 16x4 transform
-  TX_8X32,            // 8x32 transform
-  TX_32X8,            // 32x8 transform
-  TX_16X64,           // 16x64 transform
-  TX_64X16,           // 64x16 transform
-  TX_SIZES_ALL,       // Includes rectangular transforms
-  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
-  TX_SIZES_LARGEST = TX_64X64,
-  TX_INVALID = 255  // Invalid transform size
-} UENUM1BYTE(TX_SIZE);
-
 #define TX_SIZE_LUMA_MIN (TX_4X4)
 /* We don't need to code a transform size unless the allowed size is at least
    one more than the minimum. */
@@ -243,27 +221,6 @@ enum {
 } UENUM1BYTE(TX_TYPE_1D);
 
 enum {
-  DCT_DCT,            // DCT in both horizontal and vertical
-  ADST_DCT,           // ADST in vertical, DCT in horizontal
-  DCT_ADST,           // DCT in vertical, ADST in horizontal
-  ADST_ADST,          // ADST in both directions
-  FLIPADST_DCT,       // FLIPADST in vertical, DCT in horizontal
-  DCT_FLIPADST,       // DCT in vertical, FLIPADST in horizontal
-  FLIPADST_FLIPADST,  // FLIPADST in both directions
-  ADST_FLIPADST,      // ADST in vertical, FLIPADST in horizontal
-  FLIPADST_ADST,      // FLIPADST in vertical, ADST in horizontal
-  IDTX,               // Identity in both directions
-  V_DCT,              // DCT in vertical, identity in horizontal
-  H_DCT,              // Identity in vertical, DCT in horizontal
-  V_ADST,             // ADST in vertical, identity in horizontal
-  H_ADST,             // Identity in vertical, ADST in horizontal
-  V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
-  H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
-  TX_TYPES,
-  DCT_ADST_TX_MASK = 0x000F,  // Either DCT or ADST in each direction
-} UENUM1BYTE(TX_TYPE);
-
-enum {
   REG_REG,
   REG_SMOOTH,
   REG_SHARP,
@@ -275,22 +232,6 @@ enum {
   SHARP_SHARP,
 } UENUM1BYTE(DUAL_FILTER_TYPE);
 
-enum {
-  // DCT only
-  EXT_TX_SET_DCTONLY,
-  // DCT + Identity only
-  EXT_TX_SET_DCT_IDTX,
-  // Discrete Trig transforms w/o flip (4) + Identity (1)
-  EXT_TX_SET_DTT4_IDTX,
-  // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
-  EXT_TX_SET_DTT4_IDTX_1DDCT,
-  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
-  EXT_TX_SET_DTT9_IDTX_1DDCT,
-  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
-  EXT_TX_SET_ALL16,
-  EXT_TX_SET_TYPES
-} UENUM1BYTE(TxSetType);
-
 #define EXT_TX_SIZES 4       // number of sizes that use extended transforms
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
@@ -317,6 +258,7 @@ enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
 #define CFL_ALPHABET_SIZE_LOG2 4
 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
 #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
+#define CFL_INDEX_ZERO CFL_ALPHABET_SIZE
 #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
 #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
 
@@ -408,6 +350,7 @@ enum {
   GLOBAL_GLOBALMV,
   NEW_NEWMV,
   MB_MODE_COUNT,
+  PRED_MODE_INVALID = MB_MODE_COUNT,
   INTRA_MODE_START = DC_PRED,
   INTRA_MODE_END = NEARESTMV,
   DIR_MODE_START = V_PRED,
@@ -446,6 +389,13 @@ enum {
   UV_MODE_INVALID,  // For uv_mode in inter blocks
 } UENUM1BYTE(UV_PREDICTION_MODE);
 
+// Number of top model rd to store for pruning y modes in intra mode decision
+#define TOP_INTRA_MODEL_COUNT 4
+// Total number of luma intra prediction modes (include both directional and
+// non-directional modes)
+// Because there are 8 directional modes, each has additional 6 delta angles.
+#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8)
+
 enum {
   SIMPLE_TRANSLATION,
   OBMC_CAUSAL,    // 2-sided OBMC
@@ -607,6 +557,9 @@ enum {
 #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
 #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
 
+// Select all the decoded frame buffer slots
+#define SELECT_ALL_BUF_SLOTS 0xFF
+
 enum {
   LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
   LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
@@ -636,15 +589,21 @@ enum {
 // NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
 typedef int8_t MV_REFERENCE_FRAME;
 
-enum {
-  RESTORE_NONE,
-  RESTORE_WIENER,
-  RESTORE_SGRPROJ,
-  RESTORE_SWITCHABLE,
-  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
-  RESTORE_TYPES = 4,
-} UENUM1BYTE(RestorationType);
+/*!\endcond */
 
+/*!\enum RestorationType
+ * \brief This enumeration defines various restoration types supported
+ */
+typedef enum {
+  RESTORE_NONE,       /**< No restoration */
+  RESTORE_WIENER,     /**< Separable Wiener restoration */
+  RESTORE_SGRPROJ,    /**< Selfguided restoration */
+  RESTORE_SWITCHABLE, /**< Switchable restoration */
+  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */
+  RESTORE_TYPES = 4,                             /**< Num Restore types */
+} RestorationType;
+
+/*!\cond */
 // Picture prediction structures (0-12 are predefined) in scalability metadata.
 enum {
   SCALABILITY_L1T2 = 0,
@@ -671,6 +630,8 @@ enum {
 #define MAX_EXTERNAL_REFERENCES 128
 #define MAX_TILES 512
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/filter.h b/media/libaom/src/av1/common/filter.h
index 91791d3dcb..ded5ce5ae5 100644
--- a/media/libaom/src/av1/common/filter.h
+++ b/media/libaom/src/av1/common/filter.h
@@ -25,13 +25,16 @@
 extern "C" {
 #endif
 
-#define MAX_FILTER_TAP 8
+#define MAX_FILTER_TAP 12
 
 typedef enum ATTRIBUTE_PACKED {
   EIGHTTAP_REGULAR,
   EIGHTTAP_SMOOTH,
   MULTITAP_SHARP,
   BILINEAR,
+  // Encoder side only filters
+  MULTITAP_SHARP2,
+
   INTERP_FILTERS_ALL,
   SWITCHABLE_FILTERS = BILINEAR,
   SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
@@ -102,7 +105,6 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
 typedef struct InterpFilterParams {
   const int16_t *filter_ptr;
   uint16_t taps;
-  uint16_t subpel_shifts;
   InterpFilter interp_filter;
 } InterpFilterParams;
 
@@ -154,16 +156,38 @@ DECLARE_ALIGNED(256, static const InterpKernel,
   { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
 };
 
+DECLARE_ALIGNED(256, static const int16_t,
+                av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
+  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+  { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+  { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 },
+  { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 },
+  { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 },
+  { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 },
+  { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 },
+  { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 },
+  { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 },
+  { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 },
+  { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 },
+  { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 },
+  { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
+  { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }
+};
+
 static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
-      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_REGULAR },
+    av1_interp_filter_params_list[INTERP_FILTERS_ALL] = {
+      { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR },
       { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
-        SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
-      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+        EIGHTTAP_SMOOTH },
+      { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS,
         MULTITAP_SHARP },
-      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        BILINEAR }
+      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+
+      // The following filters are for encoder only, and now they are used in
+      // temporal filtering. The predictor block size >= 16 in temporal filter.
+      { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 },
     };
 
 // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
@@ -175,7 +199,7 @@ DECLARE_ALIGNED(256, static const int16_t,
 };
 
 static const InterpFilterParams av1_intrabc_filter_params = {
-  av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+  av1_intrabc_bilinear_filter, 2, BILINEAR
 };
 
 DECLARE_ALIGNED(256, static const InterpKernel,
@@ -213,20 +237,18 @@ static const uint16_t
 
 // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
 static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
-  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS,
     EIGHTTAP_SMOOTH },
-  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    EIGHTTAP_REGULAR },
-  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-    BILINEAR },
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
 };
 
 static INLINE const InterpFilterParams *
 av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
                                              const int w) {
-  if (w <= 4) return &av1_interp_4tap[interp_filter];
+  if (w <= 4 && interp_filter != MULTITAP_SHARP2)
+    return &av1_interp_4tap[interp_filter];
   return &av1_interp_filter_params_list[interp_filter];
 }
 
diff --git a/media/libaom/src/av1/common/loopfiltermask.c b/media/libaom/src/av1/common/loopfiltermask.c
deleted file mode 100644
index 157310f2df..0000000000
--- a/media/libaom/src/av1/common/loopfiltermask.c
+++ /dev/null
@@ -1,1458 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/av1_common_int.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/reconinter.h"
-#include "av1/common/seg_common.h"
-
-// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the left border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this (-- and | are used for better view)
-//
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    -----------------
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the top border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this
-//
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    -----------------
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-#if CONFIG_LPF_MASK
-static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
-};
-
-static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
-};
-
-static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
-};
-
-static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,
-  2,  3,  -1, -1, -1, -1, -1, -1, -1, -1, -1
-};
-static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = {
-  0,  47, 49, 19, 51, 53, 33, 55, 57, 42, 59,
-  60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66
-};
-
-static const FilterMask left_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
-      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
-      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
-      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
-      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
-      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 32X64, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-static const FilterMask above_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4x4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
-      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
-      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
-      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
-      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
-      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
-      0x000000000000000fULL } },  // block size 16X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
-                                            int mi_row, int mi_col) {
-  assert(cm->lf.lfm != NULL);
-  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
-  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
-  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
-}
-
-typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
-                        const uint8_t *limit, const uint8_t *thresh);
-
-typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
-                            const uint8_t *limit0, const uint8_t *thresh0,
-                            const uint8_t *blimit1, const uint8_t *limit1,
-                            const uint8_t *thresh1);
-
-typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh, int bd);
-
-typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1, int bd);
-// A 64x64 tx block requires 256 bits to represent each 4x4 tx block.
-// Every 4 rows is represented by one uint64_t mask. Hence,
-// there are 4 uint64_t bitmask[4] to represent the 64x64 block.
-//
-// Given a location by (mi_col, mi_row), This function returns the index
-// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value.
-//
-// For example, mi_row is the offset of pixels in mi size (4),
-// (mi_row / 4) returns which uint64_t.
-// After locating which uint64_t, mi_row % 4 is the
-// row offset, and each row has 16 = 1 << stride_log2 4x4 units.
-// Therefore, shift = (row << stride_log2) + mi_col;
-int get_index_shift(int mi_col, int mi_row, int *index) {
-  // *index = mi_row >> 2;
-  // rows = mi_row % 4;
-  // stride_log2 = 4;
-  // shift = (rows << stride_log2) + mi_col;
-  *index = mi_row >> 2;
-  return ((mi_row & 3) << 4) | mi_col;
-}
-
-static void filter_selectively_vert_row2(
-    int subsampling_factor, uint8_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                     lfi1->hev_thr);
-          }
-        } else if (mask_16x16_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          }
-        } else if (mask_8x8_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void highbd_filter_selectively_vert_row2(
-    int subsampling_factor, uint16_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                            lfi0->hev_thr, lfi1->mblim,
-                                            lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_16x16_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_8x8_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
-        } else {
-          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
-                                     int subsampling, uint64_t mask_16x16,
-                                     uint64_t mask_8x8, uint64_t mask_4x4,
-                                     const loop_filter_info_n *lfi_n,
-                                     const uint8_t *lfl) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, lfin->mblim, lfin->lim,
-                                       lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_8x8 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          count = 2;
-        } else {
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void highbd_filter_selectively_horiz(
-    uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
-    uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
-    uint8_t *lfl, int bd) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                              lfi->hev_thr, lfin->mblim,
-                                              lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_8x8 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                             lfi->hev_thr, lfin->mblim,
-                                             lfin->lim, lfin->hev_thr, bd);
-          count = 2;
-        } else {
-          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-void av1_build_bitmask_vert_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) {
-    const int mi_row = r << subsampling_y;
-    const int row = mi_row % MI_SIZE_64X64;
-    const int row_uv = row | subsampling_y;
-    int index = 0;
-    const int shift = get_index_shift(0, row, &index);
-
-    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
-         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
-      const int mi_col = c << subsampling_x;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int col_in_unit = 0;
-           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
-        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
-        if (x >= plane_ptr->dst.width) break;
-        const int col = col_in_unit << subsampling_x;
-        const int col_uv = col | subsampling_x;
-        const uint64_t mask = ((uint64_t)1 << (shift | col));
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((c + col_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-          switch (plane) {
-            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        col_in_unit += tx_size_wide_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_build_bitmask_horz_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) {
-    const int mi_col = c << subsampling_x;
-    const int col = mi_col % MI_SIZE_64X64;
-    const int col_uv = col | subsampling_x;
-
-    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
-         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
-      const int mi_row = r << subsampling_y;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int r_in_unit = 0;
-           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
-        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
-        if (y >= plane_ptr->dst.height) break;
-        const int row = r_in_unit << subsampling_y;
-        const int row_uv = row | subsampling_y;
-        int index = 0;
-        const int shift = get_index_shift(col, row, &index);
-        const uint64_t mask = ((uint64_t)1 << shift);
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((r + r_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-
-          switch (plane) {
-            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        r_in_unit += tx_size_high_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_filter_block_plane_bitmask_vert(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int two_row_step = 2 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  const int two_row_stride = row_stride << 1;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-
-  // 1. vertical filtering. filter two rows at a time
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += two_row_step) {
-    const int row = r | ssy;
-    const int row_next = row + row_step;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    int index_next = 0;
-    const int shift_next = get_index_shift(col, row_next, &index_next);
-    const int has_next_row = row_next < cm->mi_params.mi_rows;
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_ver[row][col];
-        lfl2 = &lfm->lfl_y_ver[row_next][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_ver[row][col];
-        lfl2 = &lfm->lfl_u_ver[row_next][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_ver[row][col];
-        lfl2 = &lfm->lfl_v_ver[row_next][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-    if (!has_next_row) {
-      mask_16x16_1 = 0;
-      mask_8x8_1 = 0;
-      mask_4x4_1 = 0;
-    }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_vert_row2(
-          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#else
-    filter_selectively_vert_row2(
-        ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-        mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#endif
-    dst->buf += two_row_stride;
-  }
-  // reset buf pointer for horizontal filtering
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_bitmask_horz(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += row_step) {
-    if (mi_row + r == 0) {
-      dst->buf += row_stride;
-      continue;
-    }
-    const int row = r | ssy;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_hor[row][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_hor[row][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_hor[row][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                             mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#endif
-    dst->buf += row_stride;
-  }
-  // reset buf pointer for next block
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_ver(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int single_step = 1 << ssy;
-  const int r_step = 2 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-
-  // filter two rows at a time
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      // current and next row should belong to the same mask_idx and index
-      // next row's shift
-      const int row_next = row + single_step;
-      int index_next = 0;
-      const int shift_next = get_index_shift(col, row_next, &index_next);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_ver[row][col];
-          lfl2 = &lfm->lfl_y_ver[row_next][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_ver[row][col];
-          lfl2 = &lfm->lfl_u_ver[row_next][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_ver[row][col];
-          lfl2 = &lfm->lfl_v_ver[row_next][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-      uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-      uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-      uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_vert_row2(
-            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
-                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
-                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
-                                     &cm->lf_info, lfl, lfl2);
-#else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#endif
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += 2 * MI_SIZE * dst->stride;
-  }
-}
-
-void av1_filter_block_plane_hor(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int r_step = 1 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      if (mi_row + r == 0) continue;
-
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_hor[row][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_hor[row][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_hor[row][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride, pl, ssx, mask_16x16,
-                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
-                                        (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#endif
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += MI_SIZE * dst->stride;
-  }
-}
-
-void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             MB_MODE_INFO *mbmi) {
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (tx_size > TX_64X64) & (tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
-    mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (tx_size - TX_4X16);
-  }
-  int index = 0;
-  const int row = mi_row % MI_SIZE_64X64;
-  const int col = mi_col % MI_SIZE_64X64;
-  const int shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
-  // Use a lookup table that provides one bitmask for a given block size and
-  // a univariant transform size.
-  int index;
-  int shift;
-  int row;
-  int col;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->sb_type, cm->seq_params.subsampling_x,
-      cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (mbmi->tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
-    mask_id =
-        47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (mbmi->tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (mbmi->tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (mbmi->tx_size - TX_4X16);
-  }
-  row = mi_row % MI_SIZE_64X64;
-  col = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
-                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
-                                  int is_horz_coding_block_border,
-                                  int is_vert_coding_block_border) {
-  int index;
-  int shift;
-  int row;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const int row_start = mi_row % MI_SIZE_64X64;
-  const int col_start = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col_start, row_start, &index);
-  if (is_horz_coding_block_border) {
-    const int block_shift = shift + mi_size_wide[bsize];
-    assert(block_shift <= 64);
-    const uint64_t right_edge_shift =
-        (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift);
-    const uint64_t left_edge_shift = (block_shift == 64)
-                                         ? (((uint64_t)1 << shift) - 1)
-                                         : ((uint64_t)1 << shift);
-    assert(right_edge_shift > left_edge_shift);
-    const uint64_t top_edge_mask = right_edge_shift - left_edge_shift;
-    lfm->is_horz_border.bits[index] |= top_edge_mask;
-  }
-  if (is_vert_coding_block_border) {
-    const int is_vert_border = mask_id_table_vert_border[bsize];
-    const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->is_vert_border.bits[i + index] |=
-          (left_mask_univariant_reordered[is_vert_border].bits[i]
-           << vert_shift);
-    }
-  }
-  const int is_skip = mbmi->skip && is_inter_block(mbmi);
-  if (is_skip) {
-    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->skip.bits[i + index] |=
-          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
-    }
-  }
-  const uint8_t level_vert_y =
-      av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
-  const uint8_t level_horz_y =
-      av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
-  const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
-  const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
-  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
-    index = 0;
-    row = r % MI_SIZE_64X64;
-    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_ver[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_hor[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_ver[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_hor[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-  }
-}
-#endif  // CONFIG_LPF_MASK
diff --git a/media/libaom/src/av1/common/mv.h b/media/libaom/src/av1/common/mv.h
index be539e8201..c7eaf76d08 100644
--- a/media/libaom/src/av1/common/mv.h
+++ b/media/libaom/src/av1/common/mv.h
@@ -12,6 +12,8 @@
 #ifndef AOM_AV1_COMMON_MV_H_
 #define AOM_AV1_COMMON_MV_H_
 
+#include <stdlib.h>
+
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "aom_dsp/aom_filter.h"
@@ -21,13 +23,14 @@ extern "C" {
 #endif
 
 #define INVALID_MV 0x80008000
+#define INVALID_MV_ROW_COL -32768
 #define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3)
 #define GET_MV_SUBPEL(x) ((x)*8)
 
 #define MARK_MV_INVALID(mv)                \
   do {                                     \
     ((int_mv *)(mv))->as_int = INVALID_MV; \
-  } while (0);
+  } while (0)
 #define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col))
 
 // The motion vector in units of full pixel
@@ -136,7 +139,7 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
 //  z .  y'  =   m4 m5 m1 *  y
 //       1]      m6 m7 1)    1]
 typedef struct {
-  int32_t wmmat[8];
+  int32_t wmmat[6];
   int16_t alpha, beta, gamma, delta;
   TransformationType wmtype;
   int8_t invalid;
@@ -144,8 +147,7 @@ typedef struct {
 
 /* clang-format off */
 static const WarpedMotionParams default_warp_params = {
-  { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
-    0 },
+  { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS) },
   0, 0, 0, 0,
   IDENTITY,
   0,
diff --git a/media/libaom/src/av1/common/mvref_common.c b/media/libaom/src/av1/common/mvref_common.c
index db3098cc0e..d8889f3eb3 100644
--- a/media/libaom/src/av1/common/mvref_common.c
+++ b/media/libaom/src/av1/common/mvref_common.c
@@ -160,7 +160,7 @@ static AOM_INLINE void scan_row_mbmi(
 
   for (int i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
-    const int candidate_bsize = candidate->sb_type;
+    const int candidate_bsize = candidate->bsize;
     const int n4_w = mi_size_wide[candidate_bsize];
     int len = AOMMIN(xd->width, n4_w);
     if (use_step_16)
@@ -207,7 +207,7 @@ static AOM_INLINE void scan_col_mbmi(
   for (i = 0; i < end_mi;) {
     const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
-    const int candidate_bsize = candidate->sb_type;
+    const int candidate_bsize = candidate->bsize;
     const int n4_h = mi_size_high[candidate_bsize];
     int len = AOMMIN(xd->height, n4_h);
     if (use_step_16)
@@ -258,7 +258,7 @@ static AOM_INLINE void scan_blk_mbmi(
 
 static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                          int mi_row, int mi_col, int bs) {
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   const int mask_row = mi_row & (sb_mi_size - 1);
   const int mask_col = mi_col & (sb_mi_size - 1);
 
@@ -285,15 +285,17 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     bs <<= 1;
   }
 
-  // The left hand of two vertical rectangles always has a top right (as the
-  // block above will have been decoded)
-  if (xd->width < xd->height)
-    if (!xd->is_sec_rect) has_tr = 1;
+  // In a VERTICAL or VERTICAL_4 partition, all partition before the last one
+  // always have a top right (as the block above will have been decoded).
+  if (xd->width < xd->height) {
+    if (!xd->is_last_vertical_rect) has_tr = 1;
+  }
 
-  // The bottom of two horizontal rectangles never has a top right (as the block
-  // to the right won't have been decoded)
-  if (xd->width > xd->height)
-    if (xd->is_sec_rect) has_tr = 0;
+  // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one
+  // never have a top right (as the block to the right won't have been decoded).
+  if (xd->width > xd->height) {
+    if (!xd->is_first_horizontal_rect) has_tr = 0;
+  }
 
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
@@ -345,7 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   const int cur_frame_index = cm->cur_frame->order_hint;
   const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
   const int frame0_index = buf_0->order_hint;
-  const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
+  const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info,
                                              cur_frame_index, frame0_index);
   int idx;
   const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
@@ -378,7 +380,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     // Process compound inter mode
     const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
     const int frame1_index = buf_1->order_hint;
-    const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
+    const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info,
                                                cur_frame_index, frame1_index);
     int_mv comp_refmv;
     get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
@@ -686,14 +688,14 @@ static AOM_INLINE void setup_ref_mv_list(
         const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
         process_compound_ref_mv_candidate(
             candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
-        idx += mi_size_wide[candidate->sb_type];
+        idx += mi_size_wide[candidate->bsize];
       }
 
       for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
         const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
         process_compound_ref_mv_candidate(
             candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
-        idx += mi_size_high[candidate->sb_type];
+        idx += mi_size_high[candidate->bsize];
       }
 
       // Build up the compound mv predictor
@@ -750,7 +752,7 @@ static AOM_INLINE void setup_ref_mv_list(
       const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
                                       ref_mv_stack, ref_mv_weight);
-      idx += mi_size_wide[candidate->sb_type];
+      idx += mi_size_wide[candidate->bsize];
     }
 
     for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
@@ -758,7 +760,7 @@ static AOM_INLINE void setup_ref_mv_list(
       const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
       process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
                                       ref_mv_stack, ref_mv_weight);
-      idx += mi_size_high[candidate->sb_type];
+      idx += mi_size_high[candidate->bsize];
     }
 
     for (int idx = 0; idx < *refmv_count; ++idx) {
@@ -795,7 +797,7 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       global_mvs[ref_frame].as_int = INVALID_MV;
     }
   } else {
-    const BLOCK_SIZE bsize = mi->sb_type;
+    const BLOCK_SIZE bsize = mi->bsize;
     const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
     const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
     if (ref_frame < REF_FRAMES) {
@@ -836,7 +838,7 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
 void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
   cm->cur_frame->order_hint = cm->current_frame.order_hint;
   cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
-
+  cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level;
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
@@ -852,10 +854,10 @@ void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) {
+    if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) {
       const int ref_order_hint = buf->order_hint;
       cm->ref_frame_sign_bias[ref_frame] =
-          (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
+          (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint,
                              (int)cm->current_frame.order_hint) <= 0)
               ? 0
               : 1;
@@ -928,10 +930,10 @@ static int motion_field_projection(AV1_COMMON *cm,
       &start_frame_buf->ref_order_hints[0];
   const int cur_order_hint = cm->cur_frame->order_hint;
   int start_to_current_frame_offset = get_relative_dist(
-      &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint);
+      &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint);
 
   for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
-    ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info,
+    ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info,
                                        start_frame_order_hint,
                                        ref_order_hints[rf - LAST_FRAME]);
   }
@@ -978,12 +980,34 @@ static int motion_field_projection(AV1_COMMON *cm,
   return 1;
 }
 
-void av1_setup_motion_field(AV1_COMMON *cm) {
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+// cm->ref_frame_side is calculated here, and will be used in
+// av1_copy_frame_mvs() to affect how mvs are copied.
+void av1_calculate_ref_frame_side(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 
   memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
   if (!order_hint_info->enable_order_hint) return;
 
+  const int cur_order_hint = cm->cur_frame->order_hint;
+
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    int order_hint = 0;
+
+    if (buf != NULL) order_hint = buf->order_hint;
+
+    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
+      cm->ref_frame_side[ref_frame] = 1;
+    else if (order_hint == cur_order_hint)
+      cm->ref_frame_side[ref_frame] = -1;
+  }
+}
+
+void av1_setup_motion_field(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
+
+  if (!order_hint_info->enable_order_hint) return;
+
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
   int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
              (cm->mi_params.mi_stride >> 1);
@@ -993,7 +1017,6 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
   }
 
   const int cur_order_hint = cm->cur_frame->order_hint;
-
   const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
   int ref_order_hint[INTER_REFS_PER_FRAME];
 
@@ -1006,11 +1029,6 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
 
     ref_buf[ref_idx] = buf;
     ref_order_hint[ref_idx] = order_hint;
-
-    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
-      cm->ref_frame_side[ref_frame] = 1;
-    else if (order_hint == cur_order_hint)
-      cm->ref_frame_side[ref_frame] = -1;
   }
 
   int ref_stamp = MFMV_STACK_SIZE - 1;
@@ -1050,15 +1068,15 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
 static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts,
                                   int *pts_inref, int row_offset, int sign_r,
                                   int col_offset, int sign_c) {
-  int bw = block_size_wide[mbmi->sb_type];
-  int bh = block_size_high[mbmi->sb_type];
-  int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
-  int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
+  const int bw = block_size_wide[mbmi->bsize];
+  const int bh = block_size_high[mbmi->bsize];
+  const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1;
+  const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1;
 
   pts[0] = GET_MV_SUBPEL(x);
   pts[1] = GET_MV_SUBPEL(y);
-  pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col;
-  pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row;
+  pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col;
+  pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row;
 }
 
 // Select samples according to the motion vector difference.
@@ -1067,44 +1085,22 @@ uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
-  int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
-  int i, j, k, l = len;
   uint8_t ret = 0;
   assert(len <= LEAST_SQUARES_SAMPLES_MAX);
 
-  // Obtain the motion vector difference.
-  for (i = 0; i < len; ++i) {
-    pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
-                 abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
-
-    if (pts_mvd[i] > thresh)
-      pts_mvd[i] = -1;
-    else
-      ret++;
+  // Only keep the samples with MV differences within threshold.
+  for (int i = 0; i < len; ++i) {
+    const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+                     abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+    if (diff > thresh) continue;
+    if (ret != i) {
+      memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
+      memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
+    }
+    ++ret;
   }
-
   // Keep at least 1 sample.
-  if (!ret) return 1;
-
-  i = 0;
-  j = l - 1;
-  for (k = 0; k < l - ret; k++) {
-    while (pts_mvd[i] != -1) i++;
-    while (pts_mvd[j] == -1) j--;
-    assert(i != j);
-    if (i > j) break;
-
-    // Replace the discarded samples;
-    pts_mvd[i] = pts_mvd[j];
-    pts[2 * i] = pts[2 * j];
-    pts[2 * i + 1] = pts[2 * j + 1];
-    pts_inref[2 * i] = pts_inref[2 * j];
-    pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
-    i++;
-    j--;
-  }
-
-  return ret;
+  return AOMMAX(ret, 1);
 }
 
 // Note: Samples returned are at 1/8-pel precision
@@ -1116,7 +1112,6 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
   const int ref_frame = mbmi0->ref_frame[0];
   const int up_available = xd->up_available;
   const int left_available = xd->left_available;
-  int i, mi_step;
   uint8_t np = 0;
   int do_tl = 1;
   int do_tr = 1;
@@ -1128,7 +1123,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
   if (up_available) {
     const int mi_row_offset = -1;
     const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
-    uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
+    uint8_t superblock_width = mi_size_wide[mbmi->bsize];
 
     if (xd->width <= superblock_width) {
       // Handle "current block width <= above block width" case.
@@ -1141,24 +1136,22 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
         record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
         pts += 2;
         pts_inref += 2;
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     } else {
       // Handle "current block width > above block width" case.
-      for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
-           i += mi_step) {
+      for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += superblock_width) {
         mbmi = xd->mi[i + mi_row_offset * mi_stride];
-        superblock_width = mi_size_wide[mbmi->sb_type];
-        mi_step = AOMMIN(xd->width, superblock_width);
+        superblock_width = mi_size_wide[mbmi->bsize];
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
           pts += 2;
           pts_inref += 2;
-          np++;
-          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
@@ -1169,7 +1162,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
   if (left_available) {
     const int mi_col_offset = -1;
     const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
-    uint8_t superblock_height = mi_size_high[mbmi->sb_type];
+    uint8_t superblock_height = mi_size_high[mbmi->bsize];
 
     if (xd->height <= superblock_height) {
       // Handle "current block height <= above block height" case.
@@ -1186,19 +1179,18 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
       }
     } else {
       // Handle "current block height > above block height" case.
-      for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
-           i += mi_step) {
+      for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+           i += superblock_height) {
         mbmi = xd->mi[mi_col_offset + i * mi_stride];
-        superblock_height = mi_size_high[mbmi->sb_type];
-        mi_step = AOMMIN(xd->height, superblock_height);
+        superblock_height = mi_size_high[mbmi->bsize];
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
           pts += 2;
           pts_inref += 2;
-          np++;
-          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
@@ -1215,8 +1207,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
       record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
       pts += 2;
       pts_inref += 2;
-      np++;
-      if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
     }
   }
   assert(np <= LEAST_SQUARES_SAMPLES_MAX);
@@ -1234,8 +1225,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
         record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     }
   }
@@ -1245,7 +1235,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
 }
 
 void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
   SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
 
   skip_mode_info->skip_mode_allowed = 0;
@@ -1349,11 +1339,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
   int lst_frame_sort_idx = -1;
   int gld_frame_sort_idx = -1;
 
-  assert(cm->seq_params.order_hint_info.enable_order_hint);
-  assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0);
+  assert(cm->seq_params->order_hint_info.enable_order_hint);
+  assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
   const int cur_order_hint = (int)cm->current_frame.order_hint;
   const int cur_frame_sort_idx =
-      1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1;
+      1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
 
   REF_FRAME_INFO ref_frame_info[REF_FRAMES];
   int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
@@ -1375,7 +1365,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
     ref_frame_info[i].sort_idx =
         (offset == -1) ? -1
                        : cur_frame_sort_idx +
-                             get_relative_dist(&cm->seq_params.order_hint_info,
+                             get_relative_dist(&cm->seq_params->order_hint_info,
                                                offset, cur_order_hint);
     assert(ref_frame_info[i].sort_idx >= -1);
 
@@ -1386,11 +1376,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
   // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
   // frames.
   if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Inter frame requests a look-ahead frame as LAST");
   }
   if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Inter frame requests a look-ahead frame as GOLDEN");
   }
 
diff --git a/media/libaom/src/av1/common/mvref_common.h b/media/libaom/src/av1/common/mvref_common.h
index 05a0dbc041..3ab784c1ed 100644
--- a/media/libaom/src/av1/common/mvref_common.h
+++ b/media/libaom/src/av1/common/mvref_common.h
@@ -201,6 +201,7 @@ static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
 void av1_setup_frame_buf_refs(AV1_COMMON *cm);
 void av1_setup_frame_sign_bias(AV1_COMMON *cm);
 void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_calculate_ref_frame_side(AV1_COMMON *cm);
 void av1_setup_motion_field(AV1_COMMON *cm);
 void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
                         int lst_map_idx, int gld_map_idx);
diff --git a/media/libaom/src/av1/common/obmc.h b/media/libaom/src/av1/common/obmc.h
index cc97b6bb12..b84034541e 100644
--- a/media/libaom/src/av1/common/obmc.h
+++ b/media/libaom/src/av1/common/obmc.h
@@ -35,7 +35,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
        above_mi_col += mi_step) {
     MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
     mi_step =
-        AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]);
+        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
     // If we're considering a block with width 4, it should be treated as
     // half of a pair of blocks with chroma information in the second. Move
     // above_mi_col back to the start of the pair if needed, set above_mbmi
@@ -72,7 +72,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
        left_mi_row += mi_step) {
     MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
     mi_step =
-        AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]);
+        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
     if (mi_step == 1) {
       left_mi_row &= ~1;
       left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
diff --git a/media/libaom/src/av1/common/obu_util.c b/media/libaom/src/av1/common/obu_util.c
index 7d2694b89b..cfca03bb4d 100644
--- a/media/libaom/src/av1/common/obu_util.c
+++ b/media/libaom/src/av1/common/obu_util.c
@@ -14,24 +14,6 @@
 
 #include "aom_dsp/bitreader_buffer.h"
 
-// Returns 1 when OBU type is valid, and 0 otherwise.
-static int valid_obu_type(int obu_type) {
-  int valid_type = 0;
-  switch (obu_type) {
-    case OBU_SEQUENCE_HEADER:
-    case OBU_TEMPORAL_DELIMITER:
-    case OBU_FRAME_HEADER:
-    case OBU_TILE_GROUP:
-    case OBU_METADATA:
-    case OBU_FRAME:
-    case OBU_REDUNDANT_FRAME_HEADER:
-    case OBU_TILE_LIST:
-    case OBU_PADDING: valid_type = 1; break;
-    default: break;
-  }
-  return valid_type;
-}
-
 static aom_codec_err_t read_obu_size(const uint8_t *data,
                                      size_t bytes_available,
                                      size_t *const obu_size,
@@ -63,9 +45,6 @@ static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
   }
 
   header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
-
-  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
-
   header->has_extension = aom_rb_read_bit(rb);
   header->has_size_field = aom_rb_read_bit(rb);
 
@@ -74,10 +53,8 @@ static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
     return AOM_CODEC_UNSUP_BITSTREAM;
   }
 
-  if (aom_rb_read_bit(rb) != 0) {
-    // obu_reserved_1bit must be set to 0.
-    return AOM_CODEC_CORRUPT_FRAME;
-  }
+  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
+  aom_rb_read_bit(rb);
 
   if (header->has_extension) {
     if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
@@ -85,10 +62,12 @@ static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
     header->size += 1;
     header->temporal_layer_id = aom_rb_read_literal(rb, 3);
     header->spatial_layer_id = aom_rb_read_literal(rb, 2);
-    if (aom_rb_read_literal(rb, 3) != 0) {
-      // extension_header_reserved_3bits must be set to 0.
-      return AOM_CODEC_CORRUPT_FRAME;
-    }
+    // extension_header_reserved_3bits must be set to 0. The value is ignored by
+    // a decoder.
+    aom_rb_read_literal(rb, 3);
+  } else {
+    header->temporal_layer_id = 0;
+    header->spatial_layer_id = 0;
   }
 
   return AOM_CODEC_OK;
diff --git a/media/libaom/src/av1/common/obu_util.h b/media/libaom/src/av1/common/obu_util.h
index 7c56904c84..adf3568e15 100644
--- a/media/libaom/src/av1/common/obu_util.h
+++ b/media/libaom/src/av1/common/obu_util.h
@@ -22,9 +22,9 @@ typedef struct {
                 // optional OBU extension header) in the bitstream.
   OBU_TYPE type;
   int has_size_field;
-  int has_extension;
-  // The following fields come from the OBU extension header and therefore are
-  // only used if has_extension is true.
+  int has_extension;  // Whether the optional OBU extension header is present.
+  // The following fields come from the OBU extension header. They are set to 0
+  // if has_extension is false.
   int temporal_layer_id;
   int spatial_layer_id;
 } ObuHeader;
diff --git a/media/libaom/src/av1/common/pred_common.h b/media/libaom/src/av1/common/pred_common.h
index d1dab97e72..3db9dd69ef 100644
--- a/media/libaom/src/av1/common/pred_common.h
+++ b/media/libaom/src/av1/common/pred_common.h
@@ -107,9 +107,9 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm,
   if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
   if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
 
-  int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                   fwd_frame_index, cur_frame_index));
-  int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                   cur_frame_index, bck_frame_index));
 
   const MB_MODE_INFO *const above_mi = xd->above_mbmi;
@@ -169,12 +169,12 @@ static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
   return above_skip_mode + left_skip_mode;
 }
 
-static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) {
   const MB_MODE_INFO *const above_mi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mi = xd->left_mbmi;
-  const int above_skip = above_mi ? above_mi->skip : 0;
-  const int left_skip = left_mi ? left_mi->skip : 0;
-  return above_skip + left_skip;
+  const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0;
+  const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0;
+  return above_skip_txfm + left_skip_txfm;
 }
 
 int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
@@ -208,8 +208,8 @@ static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
   return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
 }
 
-static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) {
-  return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)];
+static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) {
+  return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)];
 }
 
 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
@@ -340,7 +340,7 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize];
   const int max_tx_wide = tx_size_wide[max_tx_size];
   const int max_tx_high = tx_size_high[max_tx_size];
   const int has_above = xd->up_available;
@@ -351,11 +351,11 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
 
   if (has_above)
     if (is_inter_block(above_mbmi))
-      above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+      above = block_size_wide[above_mbmi->bsize] >= max_tx_wide;
 
   if (has_left)
     if (is_inter_block(left_mbmi))
-      left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+      left = block_size_high[left_mbmi->bsize] >= max_tx_high;
 
   if (has_above && has_left)
     return (above + left);
diff --git a/media/libaom/src/av1/common/quant_common.h b/media/libaom/src/av1/common/quant_common.h
index 9c30204ff0..8f36eb105b 100644
--- a/media/libaom/src/av1/common/quant_common.h
+++ b/media/libaom/src/av1/common/quant_common.h
@@ -36,6 +36,7 @@ extern "C" {
 #define DEFAULT_QM_V 12
 #define DEFAULT_QM_FIRST 5
 #define DEFAULT_QM_LAST 9
+#define LOSSLESS_Q_STEP 4  // this should equal to dc/ac_qlookup_QTX[0]
 
 struct AV1Common;
 struct CommonQuantParams;
diff --git a/media/libaom/src/av1/common/reconinter.c b/media/libaom/src/av1/common/reconinter.c
index 287adddcc0..5dc1de228a 100644
--- a/media/libaom/src/av1/common/reconinter.c
+++ b/media/libaom/src/av1/common/reconinter.c
@@ -19,6 +19,7 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
+#include "aom_ports/aom_once.h"
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
@@ -104,18 +105,8 @@ void av1_init_warp_params(InterPredParams *inter_pred_params,
 
   if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
                      inter_pred_params->scale_factors,
-                     &inter_pred_params->warp_params))
+                     &inter_pred_params->warp_params)) {
     inter_pred_params->mode = WARP_PRED;
-}
-
-void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
-                        const INTERINTER_COMPOUND_DATA *mask_comp) {
-  inter_pred_params->sb_type = bsize;
-  inter_pred_params->mask_comp = *mask_comp;
-
-  if (inter_pred_params->conv_params.compound_index == 1) {
-    inter_pred_params->conv_params.do_average = 0;
-    inter_pred_params->comp_mode = MASK_COMP;
   }
 }
 
@@ -126,8 +117,32 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
   assert(IMPLIES(inter_pred_params->conv_params.is_compound,
                  inter_pred_params->conv_params.dst != NULL));
 
+  if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                             inter_pred_params->block_width,
+                             inter_pred_params->block_height,
+                             &inter_pred_params->conv_params,
+                             inter_pred_params->interp_filter_params,
+                             inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                      inter_pred_params->block_width,
+                      inter_pred_params->block_height,
+                      &inter_pred_params->conv_params,
+                      inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                    inter_pred_params->block_width,
+                    inter_pred_params->block_height,
+                    &inter_pred_params->conv_params,
+                    inter_pred_params->interp_filter_params);
+#endif
+  }
   // TODO(jingning): av1_warp_plane() can be further cleaned up.
-  if (inter_pred_params->mode == WARP_PRED) {
+  else if (inter_pred_params->mode == WARP_PRED) {
     av1_warp_plane(
         &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
         inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
@@ -138,29 +153,8 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
         inter_pred_params->block_width, inter_pred_params->block_height,
         dst_stride, inter_pred_params->subsampling_x,
         inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
-  } else if (inter_pred_params->mode == TRANSLATION_PRED) {
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (inter_pred_params->use_hbd_buf) {
-      highbd_inter_predictor(
-          src, src_stride, dst, dst_stride, subpel_params,
-          inter_pred_params->scale_factors, inter_pred_params->block_width,
-          inter_pred_params->block_height, &inter_pred_params->conv_params,
-          inter_pred_params->interp_filter_params,
-          inter_pred_params->bit_depth);
-    } else {
-      inter_predictor(
-          src, src_stride, dst, dst_stride, subpel_params,
-          inter_pred_params->scale_factors, inter_pred_params->block_width,
-          inter_pred_params->block_height, &inter_pred_params->conv_params,
-          inter_pred_params->interp_filter_params);
-    }
-#else
-    inter_predictor(
-        src, src_stride, dst, dst_stride, subpel_params,
-        inter_pred_params->scale_factors, inter_pred_params->block_width,
-        inter_pred_params->block_height, &inter_pred_params->conv_params,
-        inter_pred_params->interp_filter_params);
-#endif
+  } else {
+    assert(0 && "Unsupported inter_pred_params->mode");
   }
 }
 
@@ -328,14 +322,12 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
 
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
-  assert(is_masked_compound_type(comp_data->type));
   (void)sb_type;
   switch (comp_data->type) {
     case COMPOUND_WEDGE:
       return av1_get_contiguous_soft_mask(comp_data->wedge_index,
                                           comp_data->wedge_sign, sb_type);
-    case COMPOUND_DIFFWTD: return comp_data->seg_mask;
-    default: assert(0); return NULL;
+    default: return comp_data->seg_mask;
   }
 }
 
@@ -484,7 +476,7 @@ void av1_build_compound_diffwtd_mask_highbd_c(
   }
 }
 
-static AOM_INLINE void init_wedge_master_masks() {
+static AOM_INLINE void init_wedge_master_masks(void) {
   int i, j;
   const int w = MASK_MASTER_SIZE;
   const int h = MASK_MASTER_SIZE;
@@ -529,7 +521,7 @@ static AOM_INLINE void init_wedge_master_masks() {
   }
 }
 
-static AOM_INLINE void init_wedge_masks() {
+static AOM_INLINE void init_wedge_masks(void) {
   uint8_t *dst = wedge_mask_buf;
   BLOCK_SIZE bsize;
   memset(wedge_masks, 0, sizeof(wedge_masks));
@@ -543,13 +535,13 @@ static AOM_INLINE void init_wedge_masks() {
     int w;
     for (w = 0; w < wtypes; ++w) {
       mask = get_wedge_mask_inplace(w, 0, bsize);
-      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
                         bh);
       wedge_params->masks[0][w] = dst;
       dst += bw * bh;
 
       mask = get_wedge_mask_inplace(w, 1, bsize);
-      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
                         bh);
       wedge_params->masks[1][w] = dst;
       dst += bw * bh;
@@ -616,7 +608,7 @@ static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride,
   }
 }
 
-static AOM_INLINE void init_smooth_interintra_masks() {
+static AOM_INLINE void init_smooth_interintra_masks(void) {
   for (int m = 0; m < INTERINTRA_MODES; ++m) {
     for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
       const int bw = block_size_wide[bs];
@@ -629,12 +621,14 @@ static AOM_INLINE void init_smooth_interintra_masks() {
 }
 
 // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
-void av1_init_wedge_masks() {
+static void init_all_wedge_masks(void) {
   init_wedge_master_masks();
   init_wedge_masks();
   init_smooth_interintra_masks();
 }
 
+void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); }
+
 static AOM_INLINE void build_masked_compound_no_round(
     uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
     const CONV_BUF_TYPE *src1, int src1_stride,
@@ -662,10 +656,10 @@ static AOM_INLINE void build_masked_compound_no_round(
 #endif
 }
 
-void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
-                                     uint8_t *dst, int dst_stride,
-                                     InterPredParams *inter_pred_params,
-                                     const SubpelParams *subpel_params) {
+static void make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+                                        uint8_t *dst, int dst_stride,
+                                        InterPredParams *inter_pred_params,
+                                        const SubpelParams *subpel_params) {
   const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
   BLOCK_SIZE sb_type = inter_pred_params->sb_type;
 
@@ -705,28 +699,77 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
 void av1_build_one_inter_predictor(
     uint8_t *dst, int dst_stride, const MV *const src_mv,
     InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
-    int ref, CalcSubpelParamsFunc calc_subpel_params_func) {
+    int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func) {
   SubpelParams subpel_params;
   uint8_t *src;
   int src_stride;
-  calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src,
-                          &subpel_params, &src_stride);
+  calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref,
+                          mc_buf, &src, &subpel_params, &src_stride);
 
   if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
       inter_pred_params->comp_mode == UNIFORM_COMP) {
     av1_make_inter_predictor(src, src_stride, dst, dst_stride,
                              inter_pred_params, &subpel_params);
   } else {
-    av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
-                                    inter_pred_params, &subpel_params);
+    make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+                                inter_pred_params, &subpel_params);
   }
 }
 
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int *fwd_offset,
+                                     int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound) {
+  assert(fwd_offset != NULL && bck_offset != NULL);
+  if (!is_compound || mbmi->compound_idx) {
+    *fwd_offset = 8;
+    *bck_offset = 8;
+    *use_dist_wtd_comp_avg = 0;
+    return;
+  }
+
+  *use_dist_wtd_comp_avg = 1;
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+  const int cur_frame_index = cm->cur_frame->order_hint;
+  int bck_frame_index = 0, fwd_frame_index = 0;
+
+  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
+
+  int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
+                                       fwd_frame_index, cur_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+  int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
+                                       cur_frame_index, bck_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+
+  const int order = d0 <= d1;
+
+  if (d0 == 0 || d1 == 0) {
+    *fwd_offset = quant_dist_lookup_table[3][order];
+    *bck_offset = quant_dist_lookup_table[3][1 - order];
+    return;
+  }
+
+  int i;
+  for (i = 0; i < 3; ++i) {
+    int c0 = quant_dist_weight[i][order];
+    int c1 = quant_dist_weight[i][!order];
+    int d0_c0 = d0 * c0;
+    int d1_c1 = d1 * c1;
+    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+  }
+
+  *fwd_offset = quant_dist_lookup_table[i][order];
+  *bck_offset = quant_dist_lookup_table[i][1 - order];
+}
+
 // True if the following hold:
 //  1. Not intrabc and not build_for_obmc
-//  2. A U or V plane
-//  3. If the block size differs from the base block size
-//  4. If sub-sampled, none of the previous blocks around the sub-sample
+//  2. At least one dimension is size 4 with subsampling
+//  3. If sub-sampled, none of the previous blocks around the sub-sample
 //     are intrabc or inter-blocks
 static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
                             int is_intrabc, int build_for_obmc) {
@@ -737,8 +780,9 @@ static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
-  if ((block_size_wide[bsize] >= 8 || !ss_x) &&
-      (block_size_high[bsize] >= 8 || !ss_y)) {
+  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
+  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
+  if (!is_sub4_x && !is_sub4_y) {
     return false;
   }
 
@@ -746,8 +790,8 @@ static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
   // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
   // the top-left corner of the prediction source - the correct top-left corner
   // is at (pre_x, pre_y).
-  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
-  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+  const int row_start = is_sub4_y ? -1 : 0;
+  const int col_start = is_sub4_x ? -1 : 0;
 
   for (int row = row_start; row <= 0; ++row) {
     for (int col = col_start; col <= 0; ++col) {
@@ -761,9 +805,9 @@ static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
 
 static void build_inter_predictors_sub8x8(
     const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
-    int bw, int bh, int mi_x, int mi_y,
+    int mi_x, int mi_y, uint8_t **mc_buf,
     CalcSubpelParamsFunc calc_subpel_params_func) {
-  const BLOCK_SIZE bsize = mi->sb_type;
+  const BLOCK_SIZE bsize = mi->bsize;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const bool ss_x = pd->subsampling_x;
   const bool ss_y = pd->subsampling_y;
@@ -790,10 +834,6 @@ static void build_inter_predictors_sub8x8(
     int col = col_start;
     for (int x = 0; x < b8_w; x += b4_w) {
       MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
-      int tmp_dst_stride = 8;
-      assert(bw < 8 || bh < 8);
-      (void)bw;
-      (void)bh;
       struct buf_2d *const dst_buf = &pd->dst;
       uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
       int ref = 0;
@@ -817,13 +857,12 @@ static void build_inter_predictors_sub8x8(
                             pre_x + x, pd->subsampling_x, pd->subsampling_y,
                             xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
                             &pre_buf, this_mbmi->interp_filters);
-      inter_pred_params.conv_params = get_conv_params_no_round(
-          ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
-      inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+      inter_pred_params.conv_params =
+          get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
 
       av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
                                     &inter_pred_params, xd, mi_x + x, mi_y + y,
-                                    ref, calc_subpel_params_func);
+                                    ref, mc_buf, calc_subpel_params_func);
 
       ++col;
     }
@@ -833,7 +872,7 @@ static void build_inter_predictors_sub8x8(
 
 static void build_inter_predictors_8x8_and_bigger(
     const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
-    int build_for_obmc, int bw, int bh, int mi_x, int mi_y,
+    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf,
     CalcSubpelParamsFunc calc_subpel_params_func) {
   const int is_compound = has_second_ref(mi);
   const int is_intrabc = is_intrabc_block(mi);
@@ -848,7 +887,7 @@ static void build_inter_predictors_8x8_and_bigger(
     is_global[ref] = is_global_mv_block(mi, wm->wmtype);
   }
 
-  const BLOCK_SIZE bsize = mi->sb_type;
+  const BLOCK_SIZE bsize = mi->bsize;
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
   const int row_start =
@@ -876,7 +915,7 @@ static void build_inter_predictors_8x8_and_bigger(
         ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
 
     av1_dist_wtd_comp_weight_assign(
-        cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
+        cm, mi, &inter_pred_params.conv_params.fwd_offset,
         &inter_pred_params.conv_params.bck_offset,
         &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
 
@@ -884,80 +923,38 @@ static void build_inter_predictors_8x8_and_bigger(
       av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
 
     if (is_masked_compound_type(mi->interinter_comp.type)) {
-      av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp);
+      inter_pred_params.sb_type = mi->bsize;
+      inter_pred_params.mask_comp = mi->interinter_comp;
+      if (ref == 1) {
+        inter_pred_params.conv_params.do_average = 0;
+        inter_pred_params.comp_mode = MASK_COMP;
+      }
       // Assign physical buffer.
       inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
     }
 
     av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
-                                  xd, mi_x, mi_y, ref, calc_subpel_params_func);
+                                  xd, mi_x, mi_y, ref, mc_buf,
+                                  calc_subpel_params_func);
   }
 }
 
 void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 int plane, const MB_MODE_INFO *mi,
                                 int build_for_obmc, int bw, int bh, int mi_x,
-                                int mi_y,
+                                int mi_y, uint8_t **mc_buf,
                                 CalcSubpelParamsFunc calc_subpel_params_func) {
-  if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi),
+  if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
                       build_for_obmc)) {
-    build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y,
+    assert(bw < 8 || bh < 8);
+    build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf,
                                   calc_subpel_params_func);
   } else {
     build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
-                                          bh, mi_x, mi_y,
+                                          bh, mi_x, mi_y, mc_buf,
                                           calc_subpel_params_func);
   }
 }
-
-void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
-                                     const MB_MODE_INFO *mbmi, int order_idx,
-                                     int *fwd_offset, int *bck_offset,
-                                     int *use_dist_wtd_comp_avg,
-                                     int is_compound) {
-  assert(fwd_offset != NULL && bck_offset != NULL);
-  if (!is_compound || mbmi->compound_idx) {
-    *use_dist_wtd_comp_avg = 0;
-    return;
-  }
-
-  *use_dist_wtd_comp_avg = 1;
-  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
-  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
-  const int cur_frame_index = cm->cur_frame->order_hint;
-  int bck_frame_index = 0, fwd_frame_index = 0;
-
-  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
-  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
-
-  int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
-                                       fwd_frame_index, cur_frame_index)),
-                 0, MAX_FRAME_DISTANCE);
-  int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
-                                       cur_frame_index, bck_frame_index)),
-                 0, MAX_FRAME_DISTANCE);
-
-  const int order = d0 <= d1;
-
-  if (d0 == 0 || d1 == 0) {
-    *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
-    *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
-    return;
-  }
-
-  int i;
-  for (i = 0; i < 3; ++i) {
-    int c0 = quant_dist_weight[i][order];
-    int c1 = quant_dist_weight[i][!order];
-    int d0_c0 = d0 * c0;
-    int d1_c1 = d1 * c1;
-    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
-  }
-
-  *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
-  *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
-}
-
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const int plane_start, const int plane_end) {
@@ -982,7 +979,7 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
     for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
       struct macroblockd_plane *const pd = &xd->plane[i];
       const int is_uv = i > 0;
-      setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i],
+      setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i],
                        src->crop_widths[is_uv], src->crop_heights[is_uv],
                        src->strides[is_uv], mi_row, mi_col, sf,
                        pd->subsampling_x, pd->subsampling_y);
@@ -1043,15 +1040,15 @@ static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row,
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
 
-  mbmi->overlappable_neighbors[0] = 0;
-  mbmi->overlappable_neighbors[1] = 0;
+  mbmi->overlappable_neighbors = 0;
 
-  if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
+  if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return;
 
   foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
-                                &mbmi->overlappable_neighbors[0]);
+                                &mbmi->overlappable_neighbors);
+  if (mbmi->overlappable_neighbors) return;
   foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
-                               &mbmi->overlappable_neighbors[1]);
+                               &mbmi->overlappable_neighbors);
 }
 
 // HW does not support < 4x4 prediction. To limit the bandwidth requirement, if
@@ -1098,7 +1095,7 @@ static INLINE void build_obmc_inter_pred_above(
   (void)rel_mi_row;
   (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int overlap =
       AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
 
@@ -1137,7 +1134,7 @@ static INLINE void build_obmc_inter_pred_left(
   (void)rel_mi_col;
   (void)dir;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int overlap =
       AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
 
@@ -1179,7 +1176,7 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int above_stride[MAX_MB_PLANE],
                                      uint8_t *left[MAX_MB_PLANE],
                                      int left_stride[MAX_MB_PLANE]) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   // handle above row
   struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
@@ -1194,42 +1191,35 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                build_obmc_inter_pred_left, &ctxt_left);
 }
 
-void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
-                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
-                                struct build_prediction_ctxt *ctxt,
-                                const int num_planes) {
-  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type);
-  const int ref_mi_row = xd->mi_row + mi_row_offset;
-  const int ref_mi_col = xd->mi_col + mi_col_offset;
-
-  for (int plane = 0; plane < num_planes; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
-                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
-                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                             uint8_t **dst_buf2) {
+  if (is_cur_buf_hbd(xd)) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+    dst_buf1[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+    dst_buf1[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+    dst_buf2[1] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+    dst_buf2[2] =
+        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+  } else {
+    dst_buf1[0] = xd->tmp_obmc_bufs[0];
+    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+    dst_buf2[0] = xd->tmp_obmc_bufs[1];
+    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
   }
-
-  const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
-
-  const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
-  const struct scale_factors *const sf =
-      get_ref_scale_factors_const(ctxt->cm, frame);
-
-  xd->block_ref_scale_factors[0] = sf;
-  if ((!av1_is_valid_scale(sf)))
-    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Reference frame has invalid dimensions");
-
-  av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
-                       num_planes);
 }
 
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
     const int num_planes) {
-  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize);
   const int above_mi_col = xd->mi_col + rel_mi_col;
 
   av1_modify_neighbor_predictor_for_obmc(above_mbmi);
@@ -1268,7 +1258,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
                                              MB_MODE_INFO *left_mbmi,
                                              struct build_prediction_ctxt *ctxt,
                                              const int num_planes) {
-  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize);
   const int left_mi_row = xd->mi_row + rel_mi_row;
 
   av1_modify_neighbor_predictor_for_obmc(left_mbmi);
@@ -1373,10 +1363,12 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
   assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
   assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
   assert(xd->mi[0]->use_intrabc == 0);
+  const SequenceHeader *seq_params = cm->seq_params;
 
-  av1_predict_intra_block(cm, xd, pd->width, pd->height,
-                          max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
-                          FILTER_INTRA_MODES, ctx->plane[plane],
+  av1_predict_intra_block(xd, seq_params->sb_size,
+                          seq_params->enable_intra_edge_filter, pd->width,
+                          pd->height, max_txsize_rect_lookup[plane_bsize], mode,
+                          0, 0, FILTER_INTRA_MODES, ctx->plane[plane],
                           ctx->stride[plane], dst, dst_stride, 0, 0, plane);
 }
 
diff --git a/media/libaom/src/av1/common/reconinter.h b/media/libaom/src/av1/common/reconinter.h
index fe3c6a6217..056dc67d07 100644
--- a/media/libaom/src/av1/common/reconinter.h
+++ b/media/libaom/src/av1/common/reconinter.h
@@ -88,6 +88,7 @@ struct build_prediction_ctxt {
   int *tmp_height;
   int *tmp_stride;
   int mb_to_far_edge;
+  void *dcb;  // Decoder-only coding block.
 };
 
 typedef enum InterPredMode {
@@ -136,9 +137,6 @@ void av1_init_warp_params(InterPredParams *inter_pred_params,
                           const WarpTypesAllowed *warp_types, int ref,
                           const MACROBLOCKD *xd, const MB_MODE_INFO *mi);
 
-void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
-                        const INTERINTER_COMPOUND_DATA *mask_comp);
-
 static INLINE int has_scale(int xs, int ys) {
   return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
 }
@@ -156,45 +154,42 @@ static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
 
 static INLINE void inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params,
-    const InterpFilterParams *interp_filters[2]) {
+    const SubpelParams *subpel_params, int w, int h,
+    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
   if (is_scaled) {
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, subpel_params->subpel_x,
                            subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params, sf);
+                           subpel_params->ys, 1, conv_params);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                            interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params, sf);
+                           sp.ys, 0, conv_params);
   }
 }
 
 static INLINE void highbd_inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
-    int h, ConvolveParams *conv_params,
-    const InterpFilterParams *interp_filters[2], int bd) {
+    const SubpelParams *subpel_params, int w, int h,
+    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2],
+    int bd) {
   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  assert(sf);
   const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
   if (is_scaled) {
     av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                                   interp_filters, subpel_params->subpel_x,
                                   subpel_params->xs, subpel_params->subpel_y,
-                                  subpel_params->ys, 1, conv_params, sf, bd);
+                                  subpel_params->ys, 1, conv_params, bd);
   } else {
     SubpelParams sp = *subpel_params;
     revert_scale_extra_bits(&sp);
     av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                                   interp_filters, sp.subpel_x, sp.xs,
-                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+                                  sp.subpel_y, sp.ys, 0, conv_params, bd);
   }
 }
 
@@ -241,27 +236,22 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                               InterPredParams *inter_pred_params,
                               const SubpelParams *subpel_params);
 
-void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
-                                     uint8_t *dst, int dst_stride,
-                                     InterPredParams *inter_pred_params,
-                                     const SubpelParams *subpel_params);
-
 typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv,
                                      InterPredParams *const inter_pred_params,
                                      MACROBLOCKD *xd, int mi_x, int mi_y,
-                                     int ref, uint8_t **pre,
+                                     int ref, uint8_t **mc_buf, uint8_t **pre,
                                      SubpelParams *subpel_params,
                                      int *src_stride);
 
 void av1_build_one_inter_predictor(
     uint8_t *dst, int dst_stride, const MV *const src_mv,
     InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
-    int ref, CalcSubpelParamsFunc calc_subpel_params_func);
+    int ref, uint8_t **mc_buf, CalcSubpelParamsFunc calc_subpel_params_func);
 
 void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 int plane, const MB_MODE_INFO *mi,
                                 int build_for_obmc, int bw, int bh, int mi_x,
-                                int mi_y,
+                                int mi_y, uint8_t **mc_buf,
                                 CalcSubpelParamsFunc calc_subpel_params_func);
 
 // TODO(jkoleszar): yet another mv clamping function :-(
@@ -343,10 +333,10 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
   return 1;
 }
 
-void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
-                                int mi_col_offset, MB_MODE_INFO *ref_mbmi,
-                                struct build_prediction_ctxt *ctxt,
-                                const int num_planes);
+// Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for
+// subsequent use in OBMC prediction.
+void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                             uint8_t **dst_buf2);
 
 void av1_setup_build_prediction_by_above_pred(
     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
@@ -377,6 +367,12 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index,
   return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
 }
 
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int *fwd_offset,
+                                     int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound);
+
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
 
@@ -396,11 +392,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
 
-void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
-                                     const MB_MODE_INFO *mbmi, int order_idx,
-                                     int *fwd_offset, int *bck_offset,
-                                     int *use_dist_wtd_comp_avg,
-                                     int is_compound);
 int av1_allow_warp(const MB_MODE_INFO *const mbmi,
                    const WarpTypesAllowed *const warp_types,
                    const WarpedMotionParams *const gm_params,
diff --git a/media/libaom/src/av1/common/reconintra.c b/media/libaom/src/av1/common/reconintra.c
index 1307a03138..6b1cf36f67 100644
--- a/media/libaom/src/av1/common/reconintra.c
+++ b/media/libaom/src/av1/common/reconintra.c
@@ -19,7 +19,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_once.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
 #include "av1/common/reconintra.h"
@@ -35,6 +34,7 @@ enum {
 #define INTRA_EDGE_FILT 3
 #define INTRA_EDGE_TAPS 5
 #define MAX_UPSAMPLE_SZ 16
+#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
 
 static const uint8_t extend_modes[INTRA_MODES] = {
   NEED_ABOVE | NEED_LEFT,                   // DC
@@ -192,7 +192,7 @@ static const uint8_t *get_has_tr_table(PARTITION_TYPE partition,
   return ret;
 }
 
-static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
                          int mi_col, int top_available, int right_available,
                          PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
                          int col_off, int ss_x, int ss_y) {
@@ -222,7 +222,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
 
     const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
     const int bh_in_mi_log2 = mi_size_high_log2[bsize];
-    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int sb_mi_size = mi_size_high[sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -377,7 +377,7 @@ static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
   return ret;
 }
 
-static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
                            int mi_col, int bottom_available, int left_available,
                            PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
                            int col_off, int ss_x, int ss_y) {
@@ -414,7 +414,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
 
     const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
     const int bh_in_mi_log2 = mi_size_high_log2[bsize];
-    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int sb_mi_size = mi_size_high[sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -490,27 +490,27 @@ static void init_intra_predictors_internal(void) {
   p[TX_4X4] = aom_##type##_predictor_4x4; \
   INIT_NO_4X4(p, type)
 
-  INIT_ALL_SIZES(pred[V_PRED], v);
-  INIT_ALL_SIZES(pred[H_PRED], h);
-  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
-  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
-  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
-  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
-  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
-  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
-  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
-  INIT_ALL_SIZES(dc_pred[1][1], dc);
+  INIT_ALL_SIZES(pred[V_PRED], v)
+  INIT_ALL_SIZES(pred[H_PRED], h)
+  INIT_ALL_SIZES(pred[PAETH_PRED], paeth)
+  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth)
+  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v)
+  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h)
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128)
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top)
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left)
+  INIT_ALL_SIZES(dc_pred[1][1], dc)
 #if CONFIG_AV1_HIGHBITDEPTH
-  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
-  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
-  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
-  INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth);
-  INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v);
-  INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h);
-  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
-  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
-  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
-  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v)
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h)
+  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth)
+  INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth)
+  INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v)
+  INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h)
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128)
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top)
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left)
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc)
 #endif
 #undef intra_pred_allsizes
 }
@@ -869,16 +869,19 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
       for (int k = 0; k < 8; ++k) {
         int r_offset = k >> 2;
         int c_offset = k & 0x03;
+        int pr = av1_filter_intra_taps[mode][k][0] * p0 +
+                 av1_filter_intra_taps[mode][k][1] * p1 +
+                 av1_filter_intra_taps[mode][k][2] * p2 +
+                 av1_filter_intra_taps[mode][k][3] * p3 +
+                 av1_filter_intra_taps[mode][k][4] * p4 +
+                 av1_filter_intra_taps[mode][k][5] * p5 +
+                 av1_filter_intra_taps[mode][k][6] * p6;
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
         buffer[r + r_offset][c + c_offset] =
-            clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
-                av1_filter_intra_taps[mode][k][0] * p0 +
-                    av1_filter_intra_taps[mode][k][1] * p1 +
-                    av1_filter_intra_taps[mode][k][2] * p2 +
-                    av1_filter_intra_taps[mode][k][3] * p3 +
-                    av1_filter_intra_taps[mode][k][4] * p4 +
-                    av1_filter_intra_taps[mode][k][5] * p5 +
-                    av1_filter_intra_taps[mode][k][6] * p6,
-                FILTER_INTRA_SCALE_BITS));
+            clip_pixel(ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS));
       }
     }
 
@@ -916,17 +919,19 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
       for (int k = 0; k < 8; ++k) {
         int r_offset = k >> 2;
         int c_offset = k & 0x03;
-        buffer[r + r_offset][c + c_offset] =
-            clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED(
-                                  av1_filter_intra_taps[mode][k][0] * p0 +
-                                      av1_filter_intra_taps[mode][k][1] * p1 +
-                                      av1_filter_intra_taps[mode][k][2] * p2 +
-                                      av1_filter_intra_taps[mode][k][3] * p3 +
-                                      av1_filter_intra_taps[mode][k][4] * p4 +
-                                      av1_filter_intra_taps[mode][k][5] * p5 +
-                                      av1_filter_intra_taps[mode][k][6] * p6,
-                                  FILTER_INTRA_SCALE_BITS),
-                              bd);
+        int pr = av1_filter_intra_taps[mode][k][0] * p0 +
+                 av1_filter_intra_taps[mode][k][1] * p1 +
+                 av1_filter_intra_taps[mode][k][2] * p2 +
+                 av1_filter_intra_taps[mode][k][3] * p3 +
+                 av1_filter_intra_taps[mode][k][4] * p4 +
+                 av1_filter_intra_taps[mode][k][5] * p5 +
+                 av1_filter_intra_taps[mode][k][6] * p6;
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
+        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
+            ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd);
       }
     }
 
@@ -953,7 +958,7 @@ static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
   }
 }
 
-static int get_filt_type(const MACROBLOCKD *xd, int plane) {
+static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) {
   int ab_sm, le_sm;
 
   if (plane == 0) {
@@ -1126,16 +1131,16 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
 }
 #if CONFIG_AV1_HIGHBITDEPTH
 static void build_intra_predictors_high(
-    const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
-    int dst_stride, PREDICTION_MODE mode, int angle_delta,
-    FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
-    int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
-    int n_bottomleft_px, int plane) {
+    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+    PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
+    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
+    int bit_depth) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]);
-  DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]);
+  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint16_t *const above_row = above_data + 16;
   uint16_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
@@ -1145,10 +1150,15 @@ static void build_intra_predictors_high(
   int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
   const uint16_t *above_ref = ref - ref_stride;
   const uint16_t *left_ref = ref - 1;
-  int p_angle = 0;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  int base = 128 << (xd->bd - 8);
+  int base = 128 << (bit_depth - 8);
+  // The left_data, above_data buffers must be zeroed to fix some intermittent
+  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
+  // seen to be the potential reason for this issue.
+  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
 
   // The default values if ref pixels are not available:
   // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
@@ -1158,7 +1168,6 @@ static void build_intra_predictors_high(
   // base+1   G      H  ..     S      T      T      T      T      T
 
   if (is_dr_mode) {
-    p_angle = mode_to_angle_map[mode] + angle_delta;
     if (p_angle <= 90)
       need_above = 1, need_left = 0, need_above_left = 1;
     else if (p_angle < 180)
@@ -1169,9 +1178,9 @@ static void build_intra_predictors_high(
   if (use_filter_intra) need_left = need_above = need_above_left = 1;
 
   assert(n_top_px >= 0);
-  assert(n_topright_px >= 0);
+  assert(n_topright_px >= -1);
   assert(n_left_px >= 0);
-  assert(n_bottomleft_px >= 0);
+  assert(n_bottomleft_px >= -1);
 
   if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
     int val;
@@ -1189,39 +1198,30 @@ static void build_intra_predictors_high(
 
   // NEED_LEFT
   if (need_left) {
-    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
-    if (use_filter_intra) need_bottom = 0;
-    if (is_dr_mode) need_bottom = p_angle > 180;
-    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+    const int num_left_pixels_needed =
+        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
-      if (need_bottom && n_bottomleft_px > 0) {
+      if (n_bottomleft_px > 0) {
         assert(i == txhpx);
         for (; i < txhpx + n_bottomleft_px; i++)
           left_col[i] = left_ref[i * ref_stride];
       }
       if (i < num_left_pixels_needed)
         aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else {
-      if (n_top_px > 0) {
-        aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
-      } else {
-        aom_memset16(left_col, base + 1, num_left_pixels_needed);
-      }
+    } else if (n_top_px > 0) {
+      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
     }
   }
 
   // NEED_ABOVE
   if (need_above) {
-    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
-    if (use_filter_intra) need_right = 0;
-    if (is_dr_mode) need_right = p_angle < 90;
-    const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
       i = n_top_px;
-      if (need_right && n_topright_px > 0) {
+      if (n_topright_px > 0) {
         assert(n_top_px == txwpx);
         memcpy(above_row + txwpx, above_ref + txwpx,
                n_topright_px * sizeof(above_ref[0]));
@@ -1230,12 +1230,8 @@ static void build_intra_predictors_high(
       if (i < num_top_pixels_needed)
         aom_memset16(&above_row[i], above_row[i - 1],
                      num_top_pixels_needed - i);
-    } else {
-      if (n_left_px > 0) {
-        aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
-      } else {
-        aom_memset16(above_row, base - 1, num_top_pixels_needed);
-      }
+    } else if (n_left_px > 0) {
+      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
     }
   }
 
@@ -1254,7 +1250,7 @@ static void build_intra_predictors_high(
 
   if (use_filter_intra) {
     highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                                  filter_intra_mode, xd->bd);
+                                  filter_intra_mode, bit_depth);
     return;
   }
 
@@ -1264,66 +1260,62 @@ static void build_intra_predictors_high(
     if (!disable_edge_filter) {
       const int need_right = p_angle < 90;
       const int need_bottom = p_angle > 180;
-      const int filt_type = get_filt_type(xd, plane);
       if (p_angle != 90 && p_angle != 180) {
         const int ab_le = need_above_left ? 1 : 0;
         if (need_above && need_left && (txwpx + txhpx >= 24)) {
           filter_intra_edge_corner_high(above_row, left_col);
         }
         if (need_above && n_top_px > 0) {
-          const int strength =
-              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int strength = intra_edge_filter_strength(
+              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
           const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
           av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
         }
         if (need_left && n_left_px > 0) {
           const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, filt_type);
+              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
           const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
           av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
         }
       }
-      upsample_above =
-          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                   intra_edge_filter_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
+        av1_upsample_intra_edge_high(above_row, n_px, bit_depth);
       }
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                  intra_edge_filter_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
+        av1_upsample_intra_edge_high(left_col, n_px, bit_depth);
       }
     }
     highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                        upsample_above, upsample_left, p_angle, xd->bd);
+                        upsample_above, upsample_left, p_angle, bit_depth);
     return;
   }
 
   // predict
   if (mode == DC_PRED) {
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
-        dst, dst_stride, above_row, left_col, xd->bd);
+        dst, dst_stride, above_row, left_col, bit_depth);
   } else {
-    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
+    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
-                                   int ref_stride, uint8_t *dst, int dst_stride,
-                                   PREDICTION_MODE mode, int angle_delta,
-                                   FILTER_INTRA_MODE filter_intra_mode,
-                                   TX_SIZE tx_size, int disable_edge_filter,
-                                   int n_top_px, int n_topright_px,
-                                   int n_left_px, int n_bottomleft_px,
-                                   int plane) {
+static void build_intra_predictors(
+    const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
+    PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
+    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
   int i;
   const uint8_t *above_ref = ref - ref_stride;
   const uint8_t *left_ref = ref - 1;
-  DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint8_t *const above_row = above_data + 16;
   uint8_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
@@ -1331,9 +1323,14 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
   int need_left = extend_modes[mode] & NEED_LEFT;
   int need_above = extend_modes[mode] & NEED_ABOVE;
   int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
-  int p_angle = 0;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+  // The left_data, above_data buffers must be zeroed to fix some intermittent
+  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+  // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
+  // be the potential reason for this issue.
+  memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
+  memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
 
   // The default values if ref pixels are not available:
   // 128 127 127 .. 127 127 127 127 127 127
@@ -1344,7 +1341,6 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
   // ..
 
   if (is_dr_mode) {
-    p_angle = mode_to_angle_map[mode] + angle_delta;
     if (p_angle <= 90)
       need_above = 1, need_left = 0, need_above_left = 1;
     else if (p_angle < 180)
@@ -1355,9 +1351,9 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
   if (use_filter_intra) need_left = need_above = need_above_left = 1;
 
   assert(n_top_px >= 0);
-  assert(n_topright_px >= 0);
+  assert(n_topright_px >= -1);
   assert(n_left_px >= 0);
-  assert(n_bottomleft_px >= 0);
+  assert(n_bottomleft_px >= -1);
 
   if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
     int val;
@@ -1375,54 +1371,38 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
 
   // NEED_LEFT
   if (need_left) {
-    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
-    if (use_filter_intra) need_bottom = 0;
-    if (is_dr_mode) need_bottom = p_angle > 180;
-    // the avx2 dr_prediction_z2 may read at most 3 extra bytes,
-    // due to the avx2 mask load is with dword granularity.
-    // so we initialize 3 extra bytes to silence valgrind complain.
-    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
+    const int num_left_pixels_needed =
+        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
-      if (need_bottom && n_bottomleft_px > 0) {
+      if (n_bottomleft_px > 0) {
         assert(i == txhpx);
         for (; i < txhpx + n_bottomleft_px; i++)
           left_col[i] = left_ref[i * ref_stride];
       }
       if (i < num_left_pixels_needed)
         memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else {
-      if (n_top_px > 0) {
-        memset(left_col, above_ref[0], num_left_pixels_needed);
-      } else {
-        memset(left_col, 129, num_left_pixels_needed);
-      }
+    } else if (n_top_px > 0) {
+      memset(left_col, above_ref[0], num_left_pixels_needed);
     }
   }
 
   // NEED_ABOVE
   if (need_above) {
-    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
-    if (use_filter_intra) need_right = 0;
-    if (is_dr_mode) need_right = p_angle < 90;
-    const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px);
       i = n_top_px;
-      if (need_right && n_topright_px > 0) {
+      if (n_topright_px > 0) {
         assert(n_top_px == txwpx);
         memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
         i += n_topright_px;
       }
       if (i < num_top_pixels_needed)
         memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
-    } else {
-      if (n_left_px > 0) {
-        memset(above_row, left_ref[0], num_top_pixels_needed);
-      } else {
-        memset(above_row, 127, num_top_pixels_needed);
-      }
+    } else if (n_left_px > 0) {
+      memset(above_row, left_ref[0], num_top_pixels_needed);
     }
   }
 
@@ -1451,33 +1431,32 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
     if (!disable_edge_filter) {
       const int need_right = p_angle < 90;
       const int need_bottom = p_angle > 180;
-      const int filt_type = get_filt_type(xd, plane);
       if (p_angle != 90 && p_angle != 180) {
         const int ab_le = need_above_left ? 1 : 0;
         if (need_above && need_left && (txwpx + txhpx >= 24)) {
           filter_intra_edge_corner(above_row, left_col);
         }
         if (need_above && n_top_px > 0) {
-          const int strength =
-              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int strength = intra_edge_filter_strength(
+              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
           const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
           av1_filter_intra_edge(above_row - ab_le, n_px, strength);
         }
         if (need_left && n_left_px > 0) {
           const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, filt_type);
+              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
           const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
           av1_filter_intra_edge(left_col - ab_le, n_px, strength);
         }
       }
-      upsample_above =
-          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                   intra_edge_filter_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
         av1_upsample_intra_edge(above_row, n_px);
       }
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                  intra_edge_filter_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
         av1_upsample_intra_edge(left_col, n_px);
@@ -1548,11 +1527,14 @@ static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
   return bs;
 }
 
-void av1_predict_intra_block(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
-    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
-    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
-    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) {
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+                             int enable_intra_edge_filter, int wpx, int hpx,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             int angle_delta, int use_palette,
+                             FILTER_INTRA_MODE filter_intra_mode,
+                             const uint8_t *ref, int ref_stride, uint8_t *dst,
+                             int dst_stride, int col_off, int row_off,
+                             int plane) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int txwpx = tx_size_wide[tx_size];
   const int txhpx = tx_size_high[tx_size];
@@ -1594,17 +1576,13 @@ void av1_predict_intra_block(
       col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  const int xr_chr_offset = 0;
-  const int yd_chr_offset = 0;
 
   // Distance between the right edge of this prediction block to
   // the frame right edge
-  const int xr =
-      (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset;
+  const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
   // Distance between the bottom edge of this prediction block to
   // the frame bottom edge
-  const int yd =
-      (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset;
+  const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
   const int right_available =
       mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
   const int bottom_available =
@@ -1612,39 +1590,64 @@ void av1_predict_intra_block(
 
   const PARTITION_TYPE partition = mbmi->partition;
 
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  BLOCK_SIZE bsize = mbmi->bsize;
   // force 4x4 chroma component block size.
   if (ss_x || ss_y) {
     bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
   }
 
+  const int is_dr_mode = av1_is_directional_mode(mode);
+  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+  int p_angle = 0;
+  int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT;
+  int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT;
+
+  if (use_filter_intra) {
+    need_top_right = 0;
+    need_bottom_left = 0;
+  }
+  if (is_dr_mode) {
+    p_angle = mode_to_angle_map[mode] + angle_delta;
+    need_top_right = p_angle < 90;
+    need_bottom_left = p_angle > 180;
+  }
+
+  // Possible states for have_top_right(TR) and have_bottom_left(BL)
+  // -1 : TR and BL are not needed
+  //  0 : TR and BL are needed but not available
+  // > 0 : TR and BL are needed and pixels are available
   const int have_top_right =
-      has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
-                    partition, tx_size, row_off, col_off, ss_x, ss_y);
+      need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top,
+                                     right_available, partition, tx_size,
+                                     row_off, col_off, ss_x, ss_y)
+                     : -1;
   const int have_bottom_left =
-      has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
-                      partition, tx_size, row_off, col_off, ss_x, ss_y);
+      need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col,
+                                         bottom_available, have_left, partition,
+                                         tx_size, row_off, col_off, ss_x, ss_y)
+                       : -1;
 
-  const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
+  const int disable_edge_filter = !enable_intra_edge_filter;
+  const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     build_intra_predictors_high(
-        xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
-        filter_intra_mode, tx_size, disable_edge_filter,
-        have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_top_right ? AOMMIN(txwpx, xr) : 0,
+        ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
+        tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+        have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+        have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
+        intra_edge_filter_type, xd->bd);
     return;
   }
 #endif
-  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
-                         angle_delta, filter_intra_mode, tx_size,
-                         disable_edge_filter,
-                         have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-                         have_top_right ? AOMMIN(txwpx, xr) : 0,
-                         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-                         have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+  build_intra_predictors(
+      ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
+      tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
+      have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
+      intra_edge_filter_type);
 }
 
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1662,12 +1665,13 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
           ? mbmi->filter_intra_mode_info.filter_intra_mode
           : FILTER_INTRA_MODES;
   const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+  const SequenceHeader *seq_params = cm->seq_params;
 
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
 #if CONFIG_DEBUG
     assert(is_cfl_allowed(xd));
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(
-        mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     (void)plane_bsize;
     assert(plane_bsize < BLOCK_SIZES_ALL);
     if (!xd->lossless[mbmi->segment_id]) {
@@ -1680,10 +1684,11 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
     CFL_CTX *const cfl = &xd->cfl;
     CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
     if (cfl->dc_pred_is_cached[pred_plane] == 0) {
-      av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
-                              angle_delta, use_palette, filter_intra_mode, dst,
-                              dst_stride, dst, dst_stride, blk_col, blk_row,
-                              plane);
+      av1_predict_intra_block(xd, seq_params->sb_size,
+                              seq_params->enable_intra_edge_filter, pd->width,
+                              pd->height, tx_size, mode, angle_delta,
+                              use_palette, filter_intra_mode, dst, dst_stride,
+                              dst, dst_stride, blk_col, blk_row, plane);
       if (cfl->use_dc_pred_cache) {
         cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
         cfl->dc_pred_is_cached[pred_plane] = 1;
@@ -1694,9 +1699,10 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
     cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
     return;
   }
-  av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
-                          angle_delta, use_palette, filter_intra_mode, dst,
-                          dst_stride, dst, dst_stride, blk_col, blk_row, plane);
+  av1_predict_intra_block(
+      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+      pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode,
+      dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 }
 
 void av1_init_intra_predictors(void) {
diff --git a/media/libaom/src/av1/common/reconintra.h b/media/libaom/src/av1/common/reconintra.h
index 9d203569c0..fa66ccd541 100644
--- a/media/libaom/src/av1/common/reconintra.h
+++ b/media/libaom/src/av1/common/reconintra.h
@@ -26,11 +26,14 @@ void av1_init_intra_predictors(void);
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int plane, int blk_col, int blk_row,
                                     TX_SIZE tx_size);
-void av1_predict_intra_block(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
-    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
-    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
-    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+                             int enable_intra_edge_filter, int wpx, int hpx,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             int angle_delta, int use_palette,
+                             FILTER_INTRA_MODE filter_intra_mode,
+                             const uint8_t *ref, int ref_stride, uint8_t *dst,
+                             int dst_stride, int col_off, int row_off,
+                             int plane);
 
 // Mapping of interintra to intra mode for use in the intra component
 static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
@@ -49,6 +52,10 @@ static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
   return mode >= V_PRED && mode <= D67_PRED;
 }
 
+static INLINE int av1_is_diagonal_mode(PREDICTION_MODE mode) {
+  return mode >= D45_PRED && mode <= D67_PRED;
+}
+
 static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
   return bsize >= BLOCK_8X8;
 }
@@ -60,7 +67,7 @@ static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
 
 static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
                                                  BLOCK_SIZE bs) {
-  if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+  if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0;
 
   return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
 }
@@ -69,7 +76,7 @@ static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
                                            const MB_MODE_INFO *mbmi) {
   return mbmi->mode == DC_PRED &&
          mbmi->palette_mode_info.palette_size[0] == 0 &&
-         av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+         av1_filter_intra_allowed_bsize(cm, mbmi->bsize);
 }
 
 extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
diff --git a/media/libaom/src/av1/common/resize.c b/media/libaom/src/av1/common/resize.c
index 98f28f7b56..fe9d1dc58e 100644
--- a/media/libaom/src/av1/common/resize.c
+++ b/media/libaom/src/av1/common/resize.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <math.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -24,6 +25,7 @@
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 
+#include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
@@ -699,7 +701,7 @@ Error:
   aom_free(arrbuf2);
 }
 
-static void upscale_normative_rect(const uint8_t *const input, int height,
+static bool upscale_normative_rect(const uint8_t *const input, int height,
                                    int width, int in_stride, uint8_t *output,
                                    int height2, int width2, int out_stride,
                                    int x_step_qn, int x0_qn, int pad_left,
@@ -724,6 +726,7 @@ static void upscale_normative_rect(const uint8_t *const input, int height,
   uint8_t *const in_tr = (uint8_t *)(input + width);
   if (pad_left) {
     tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (!tmp_left) return false;
     for (int i = 0; i < height; i++) {
       memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
       memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
@@ -732,6 +735,10 @@ static void upscale_normative_rect(const uint8_t *const input, int height,
   if (pad_right) {
     tmp_right =
         (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (!tmp_right) {
+      aom_free(tmp_left);
+      return false;
+    }
     for (int i = 0; i < height; i++) {
       memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
       memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
@@ -756,6 +763,7 @@ static void upscale_normative_rect(const uint8_t *const input, int height,
     }
     aom_free(tmp_right);
   }
+  return true;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1044,7 +1052,7 @@ Error:
   aom_free(arrbuf2);
 }
 
-static void highbd_upscale_normative_rect(const uint8_t *const input,
+static bool highbd_upscale_normative_rect(const uint8_t *const input,
                                           int height, int width, int in_stride,
                                           uint8_t *output, int height2,
                                           int width2, int out_stride,
@@ -1072,6 +1080,7 @@ static void highbd_upscale_normative_rect(const uint8_t *const input,
   uint16_t *const in_tr = input16 + width;
   if (pad_left) {
     tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (!tmp_left) return false;
     for (int i = 0; i < height; i++) {
       memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
       aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
@@ -1080,6 +1089,10 @@ static void highbd_upscale_normative_rect(const uint8_t *const input,
   if (pad_right) {
     tmp_right =
         (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (!tmp_right) {
+      aom_free(tmp_left);
+      return false;
+    }
     for (int i = 0; i < height; i++) {
       memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
       aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
@@ -1105,6 +1118,7 @@ static void highbd_upscale_normative_rect(const uint8_t *const input,
     }
     aom_free(tmp_right);
   }
+  return true;
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
@@ -1188,9 +1202,48 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes) {
+void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst,
+                                   const InterpFilter filter,
+                                   const int phase_scaler,
+                                   const int num_planes) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+  assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
+         filter == EIGHTTAP_REGULAR);
+  const InterpKernel *const kernel =
+      filter == BILINEAR ? av1_bilinear_filters : av1_sub_pel_filters_8smooth;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    for (int y = 0; y < dst_h; y += 16) {
+      const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+      for (int x = 0; x < dst_w; x += 16) {
+        const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+        const uint8_t *src_ptr = srcs[i] +
+                                 (y / factor) * src_h / dst_h * src_stride +
+                                 (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                      x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                      16 * src_h / dst_h, 16 / factor, 16 / factor);
+      }
+    }
+  }
+}
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes) {
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
 
   // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
@@ -1223,7 +1276,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
                                 int plane, int rows) {
   const int is_uv = (plane > 0);
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
   const int upscaled_plane_width =
       ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
@@ -1264,21 +1317,26 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
     const int pad_left = (j == 0);
     const int pad_right = (j == cm->tiles.cols - 1);
 
+    bool success;
 #if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
-      highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
-                                    dst_ptr, rows, dst_width, dst_stride,
-                                    x_step_qn, x0_qn, pad_left, pad_right,
-                                    cm->seq_params.bit_depth);
+    if (cm->seq_params->use_highbitdepth)
+      success = highbd_upscale_normative_rect(
+          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
+          dst_stride, x_step_qn, x0_qn, pad_left, pad_right,
+          cm->seq_params->bit_depth);
     else
-      upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
-                             rows, dst_width, dst_stride, x_step_qn, x0_qn,
-                             pad_left, pad_right);
+      success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+                                       dst_ptr, rows, dst_width, dst_stride,
+                                       x_step_qn, x0_qn, pad_left, pad_right);
 #else
-    upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows,
-                           dst_width, dst_stride, x_step_qn, x0_qn, pad_left,
-                           pad_right);
+    success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+                                     dst_ptr, rows, dst_width, dst_stride,
+                                     x_step_qn, x0_qn, pad_left, pad_right);
 #endif
+    if (!success) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error upscaling frame");
+    }
     // Update the fractional pixel offset to prepare for the next tile column.
     x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
   }
@@ -1298,14 +1356,49 @@ void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
   aom_extend_frame_borders(dst, num_planes);
 }
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled) {
-  const int num_planes = av1_num_planes(cm);
-  if (cm->width != unscaled->y_crop_width ||
-      cm->height != unscaled->y_crop_height) {
-    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
-                                num_planes);
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr, const int border_in_pixels,
+    const bool alloc_y_buffer_8bit) {
+  // If scaling is performed for the sole purpose of calculating PSNR, then our
+  // target dimensions are superres upscaled width/height. Otherwise our target
+  // dimensions are coded width/height.
+  const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width;
+  const int scaled_height =
+      for_psnr ? cm->superres_upscaled_height : cm->height;
+  const bool scaling_required = (scaled_width != unscaled->y_crop_width) ||
+                                (scaled_height != unscaled->y_crop_height);
+
+  if (scaling_required) {
+    const int num_planes = av1_num_planes(cm);
+    const SequenceHeader *seq_params = cm->seq_params;
+
+    // Reallocate the frame buffer based on the target dimensions when scaling
+    // is required.
+    if (aom_realloc_frame_buffer(
+            scaled, scaled_width, scaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL,
+            alloc_y_buffer_8bit, 0))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled buffer");
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) {
+      av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+    } else {
+      av1_resize_and_extend_frame_nonnormative(
+          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+    }
+#else
+    if (use_optimized_scaler) {
+      av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+    } else {
+      av1_resize_and_extend_frame_nonnormative(
+          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
+    }
+#endif
     return scaled;
   } else {
     return unscaled;
@@ -1370,7 +1463,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
   const int num_planes = av1_num_planes(cm);
   if (!av1_superres_scaled(cm)) return;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int byte_alignment = cm->features.byte_alignment;
 
   YV12_BUFFER_CONFIG copy_buffer;
@@ -1382,8 +1475,8 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
   if (aom_alloc_frame_buffer(
           &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, byte_alignment))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+          AOM_BORDER_IN_PIXELS, byte_alignment, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
   // Copy function assumes the frames are the same size.
@@ -1406,7 +1499,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
     if (release_fb_cb(cb_priv, fb)) {
       unlock_buffer_pool(pool);
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to free current frame buffer before superres upscaling");
     }
     // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
@@ -1414,10 +1507,10 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
             seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, 0, 0)) {
       unlock_buffer_pool(pool);
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to allocate current frame buffer for superres upscaling");
     }
     unlock_buffer_pool(pool);
@@ -1431,9 +1524,9 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
             seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, byte_alignment))
+            AOM_BORDER_IN_PIXELS, byte_alignment, 0))
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
 
     // Restore config data back to frame_to_show
diff --git a/media/libaom/src/av1/common/resize.h b/media/libaom/src/av1/common/resize.h
index 8ee859e5c0..75abe6274e 100644
--- a/media/libaom/src/av1/common/resize.h
+++ b/media/libaom/src/av1/common/resize.h
@@ -63,9 +63,6 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
                                 uint8_t *oy, int oy_stride, uint8_t *ou,
                                 uint8_t *ov, int ouv_stride, int oheight,
                                 int owidth, int bd);
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes);
 
 void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
@@ -74,9 +71,15 @@ void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
                                             const YV12_BUFFER_CONFIG *src,
                                             YV12_BUFFER_CONFIG *dst);
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled);
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr, const int border_in_pixels,
+    const bool alloc_y_buffer_8bit);
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes);
 
 // Calculates the scaled dimensions from the given original dimensions and the
 // resize scale denominator.
diff --git a/media/libaom/src/av1/common/restoration.c b/media/libaom/src/av1/common/restoration.c
index a0f37ad637..dbfd1cc0de 100644
--- a/media/libaom/src/av1/common/restoration.c
+++ b/media/libaom/src/av1/common/restoration.c
@@ -42,8 +42,8 @@ const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
   AV1PixelRect rect;
 
-  int ss_x = is_uv && cm->seq_params.subsampling_x;
-  int ss_y = is_uv && cm->seq_params.subsampling_y;
+  int ss_x = is_uv && cm->seq_params->subsampling_x;
+  int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   rect.top = 0;
   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
@@ -1107,7 +1107,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                             YV12_BUFFER_CONFIG *frame,
                                             AV1_COMMON *cm, int optimized_lr,
                                             int num_planes) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int bit_depth = seq_params->bit_depth;
   const int highbd = seq_params->use_highbitdepth;
   lr_ctxt->dst = &cm->rst_frame;
@@ -1117,8 +1117,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
   if (aom_realloc_frame_buffer(
           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
-          cm->features.byte_alignment, NULL, NULL, NULL) < 0)
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+          cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) < 0)
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
 
   lr_ctxt->on_rest_unit = filter_frame_on_unit;
@@ -1299,7 +1299,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                     int32_t *tmpbuf,
                                     RestorationLineBuffers *rlbs) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   const RestorationInfo *rsi = &cm->rst_info[plane];
 
@@ -1315,7 +1315,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int *rrow1) {
   assert(rcol0 && rcol1 && rrow0 && rrow1);
 
-  if (bsize != cm->seq_params.sb_size) return 0;
+  if (bsize != cm->seq_params->sb_size) return 0;
   if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
 
   assert(!cm->features.all_lossless);
@@ -1345,8 +1345,8 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
   const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
 
   // The size of an MI-unit on this plane of the image
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int mi_size_x = MI_SIZE >> ss_x;
   const int mi_size_y = MI_SIZE >> ss_y;
 
@@ -1427,7 +1427,7 @@ static void save_deblock_boundary_lines(
   int upscaled_width;
   int line_bytes;
   if (av1_superres_scaled(cm)) {
-    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    const int ss_x = is_uv && cm->seq_params->subsampling_x;
     upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
     line_bytes = upscaled_width << use_highbd;
     if (use_highbd)
@@ -1474,7 +1474,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   // At the point where this function is called, we've already applied
   // superres. So we don't need to extend the lines here, we can just
   // pull directly from the topmost row of the upscaled frame.
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   const int upscaled_width = av1_superres_scaled(cm)
                                  ? (cm->superres_upscaled_width + ss_x) >> ss_x
                                  : src_width;
@@ -1494,7 +1494,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                          int use_highbd, int plane,
                                          AV1_COMMON *cm, int after_cdef) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
 
@@ -1559,7 +1559,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                               AV1_COMMON *cm, int after_cdef) {
   const int num_planes = av1_num_planes(cm);
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   for (int p = 0; p < num_planes; ++p) {
     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   }
diff --git a/media/libaom/src/av1/common/restoration.h b/media/libaom/src/av1/common/restoration.h
index 3b80dd5a97..65ccd0900c 100644
--- a/media/libaom/src/av1/common/restoration.h
+++ b/media/libaom/src/av1/common/restoration.h
@@ -22,6 +22,10 @@
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
+
 // Border for Loop restoration buffer
 #define AOM_RESTORATION_FRAME_BORDER 32
 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
@@ -124,6 +128,7 @@ extern "C" {
 #define WIENER_WIN_CHROMA (WIENER_WIN - 2)
 #define WIENER_WIN_REDUCED (WIENER_WIN - 2)
 #define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+#define WIENER_STATS_DOWNSAMPLE_FACTOR 4
 
 #define WIENER_FILT_PREC_BITS 7
 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
@@ -183,13 +188,28 @@ typedef struct {
   int r[2];  // radii
   int s[2];  // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
 } sgr_params_type;
+/*!\endcond */
 
+/*!\brief Parameters related to Restoration Unit Info */
 typedef struct {
+  /*!
+   * restoration type
+   */
   RestorationType restoration_type;
+
+  /*!
+   * Wiener filter parameters if restoration_type indicates Wiener
+   */
   WienerInfo wiener_info;
+
+  /*!
+   * Sgrproj filter parameters if restoration_type indicates Sgrproj
+   */
   SgrprojInfo sgrproj_info;
 } RestorationUnitInfo;
 
+/*!\cond */
+
 // A restoration line buffer needs space for two lines plus a horizontal filter
 // margin of RESTORATION_EXTRA_HORZ on each side.
 #define RESTORATION_LINEBUFFER_WIDTH \
@@ -207,33 +227,89 @@ typedef struct {
   uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
   uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
 } RestorationLineBuffers;
+/*!\endcond */
 
+/*!\brief Parameters related to Restoration Stripe boundaries */
 typedef struct {
+  /*!
+   * stripe boundary above
+   */
   uint8_t *stripe_boundary_above;
+
+  /*!
+   * stripe boundary below
+   */
   uint8_t *stripe_boundary_below;
+
+  /*!
+   * strides for stripe boundaries above and below
+   */
   int stripe_boundary_stride;
+
+  /*!
+   * size of stripe boundaries above and below
+   */
   int stripe_boundary_size;
 } RestorationStripeBoundaries;
 
+/*!\brief Parameters related to Restoration Info */
 typedef struct {
+  /*!
+   * Restoration type for frame
+   */
   RestorationType frame_restoration_type;
+
+  /*!
+   * Restoration unit size
+   */
   int restoration_unit_size;
 
-  // Fields below here are allocated and initialised by
-  // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of
-  // restoration units in (one row of) the largest tile in the frame. The data
-  // in unit_info is laid out with units_per_tile entries for each tile, which
-  // have stride horz_units_per_tile.
-  //
-  // Even if there are tiles of different sizes, the data in unit_info is laid
-  // out as if all tiles are of full size.
+  /**
+   * \name Fields allocated and initialised by av1_alloc_restoration_struct.
+   * (horz_)units_per_tile give the number of restoration units in
+   * (one row of) the largest tile in the frame.
+   */
+  /**@{*/
+  /*!
+   * Number of units per tile for the largest tile in the frame
+   */
   int units_per_tile;
-  int vert_units_per_tile, horz_units_per_tile;
+
+  /*!
+   * Number of vertical units per tile
+   */
+  int vert_units_per_tile;
+
+  /*!
+   * Number of horizontal units per tile for the largest tile in the frame
+   */
+  int horz_units_per_tile;
+  /**@}*/
+
+  /*!
+   * List of info for units in tile.
+   * The data in unit_info is laid out with units_per_tile entries for each
+   * tile, which have stride horz_units_per_tile.
+   * Even if there are tiles of different sizes, the data in unit_info is
+   * laid out as if all tiles are of full size.
+   */
   RestorationUnitInfo *unit_info;
+
+  /*!
+   * Restoration Stripe boundary info
+   */
   RestorationStripeBoundaries boundaries;
+
+  /*!
+   * Whether optimized lr can be used for speed.
+   * That includes cases of no cdef and no superres, or if fast trial runs
+   * are used on the encoder side.
+   */
   int optimized_lr;
 } RestorationInfo;
 
+/*!\cond */
+
 static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
   sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
   sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
@@ -291,25 +367,39 @@ void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                       int border_horz, int border_vert, int highbd);
 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
 
-// Filter a single loop restoration unit.
-//
-// limits is the limits of the unit. rui gives the mode to use for this unit
-// and its coefficients. If striped loop restoration is enabled, rsb contains
-// deblocked pixels to use for stripe boundaries; rlbs is just some space to
-// use as a scratch buffer. tile_rect gives the limits of the tile containing
-// this unit. tile_stripe0 is the index of the first stripe in this tile.
-//
-// ss_x and ss_y are flags which should be 1 if this is a plane with
-// horizontal/vertical subsampling, respectively. highbd is a flag which should
-// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
-//
-// data8 is the frame data (pointing at the top-left corner of the frame, not
-// the restoration unit) and stride is its stride. dst8 is the buffer where the
-// results will be written and has stride dst_stride. Like data8, dst8 should
-// point at the top-left corner of the frame.
-//
-// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should
-// be at least SGRPROJ_TMPBUF_SIZE big.
+/*!\endcond */
+
+/*!\brief Function for applying loop restoration filter to a single unit.
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a single
+ * loop restoration unit.
+ *
+ * \param[in]  limits        Limits of the unit
+ * \param[in]  rui           The parameters to use for this unit and its
+ *                           coefficients
+ * \param[in]  rsb           Deblocked pixels to use for stripe boundaries
+ * \param[in]  rlbs          Space to use as a scratch buffer
+ * \param[in]  tile_rect     Limits of the tile containing this unit
+ * \param[in]  tile_stripe0  Index of the first stripe in this tile
+ * \param[in]  ss_x          Horizontal subsampling for plane
+ * \param[in]  ss_y          Vertical subsampling for plane
+ * \param[in]  highbd        Whether high bitdepth pipeline is used
+ * \param[in]  bit_depth     Bit-depth of the video
+ * \param[in]  data8         Frame data (pointing at the top-left corner of
+ *                           the frame, not the restoration unit).
+ * \param[in]  stride        Stride of \c data8
+ * \param[out] dst8          Buffer where the results will be written. Like
+ *                           \c data8, \c dst8 should point at the top-left
+ *                           corner of the frame
+ * \param[in]  dst_stride    Stride of \c dst8
+ * \param[in]  tmpbuf        Scratch buffer used by the sgrproj filter which
+ *                           should be at least SGRPROJ_TMPBUF_SIZE big.
+ * \param[in]  optimized_lr  Whether to use fast optimized Loop Restoration
+ *
+ * \return Nothing is returned. Instead, the filtered unit is output in
+ * \c dst8 at the proper restoration unit offset.
+ */
 void av1_loop_restoration_filter_unit(
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
@@ -317,9 +407,24 @@ void av1_loop_restoration_filter_unit(
     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
     int dst_stride, int32_t *tmpbuf, int optimized_lr);
 
+/*!\brief Function for applying loop restoration filter to a frame
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a frame.
+ *
+ * \param[in, out]  frame         Compressed frame buffer
+ * \param[in, out]  cm            Pointer to top level common structure
+ * \param[in]       optimized_lr  Whether to use fast optimized Loop Restoration
+ * \param[in]       lr_ctxt       Loop restoration context
+ *
+ * \return Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                        struct AV1Common *cm, int optimized_lr,
                                        void *lr_ctxt);
+/*!\cond */
+
 void av1_loop_restoration_precal();
 
 typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
@@ -373,6 +478,9 @@ int av1_lr_count_units_in_tile(int unit_size, int tile_size);
 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                              const int sb_cols, int plane);
+
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/common/scale.c b/media/libaom/src/av1/common/scale.c
index 3b14c0a2c6..5bcd8df0cd 100644
--- a/media/libaom/src/av1/common/scale.c
+++ b/media/libaom/src/av1/common/scale.c
@@ -84,45 +84,4 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
     sf->scale_value_x = unscaled_value;
     sf->scale_value_y = unscaled_value;
   }
-
-  // AV1 convolve functions
-  // Special case convolve functions should produce the same result as
-  // av1_convolve_2d.
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
-  // subpel_x_qn == 0
-  sf->convolve[0][1][0] = av1_convolve_y_sr;
-  // subpel_y_qn == 0
-  sf->convolve[1][0][0] = av1_convolve_x_sr;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->convolve[1][1][0] = av1_convolve_2d_sr;
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
-  // subpel_x_qn == 0
-  sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
-  // subpel_y_qn == 0
-  sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
-#if CONFIG_AV1_HIGHBITDEPTH
-  // AV1 High BD convolve functions
-  // Special case convolve functions should produce the same result as
-  // av1_highbd_convolve_2d.
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
-  // subpel_x_qn == 0
-  sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
-  // subpel_y_qn == 0
-  sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
-  // subpel_x_qn == 0 && subpel_y_qn == 0
-  sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
-  // subpel_x_qn == 0
-  sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
-  // subpel_y_qn == 0
-  sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
-  // subpel_x_qn != 0 && subpel_y_qn != 0
-  sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
-#endif
 }
diff --git a/media/libaom/src/av1/common/scale.h b/media/libaom/src/av1/common/scale.h
index 16b40bde8f..fd30416dfa 100644
--- a/media/libaom/src/av1/common/scale.h
+++ b/media/libaom/src/av1/common/scale.h
@@ -33,10 +33,6 @@ struct scale_factors {
 
   int (*scale_value_x)(int val, const struct scale_factors *sf);
   int (*scale_value_y)(int val, const struct scale_factors *sf);
-
-  // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound]
-  aom_convolve_fn_t convolve[2][2][2];
-  aom_highbd_convolve_fn_t highbd_convolve[2][2][2];
 };
 
 MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
diff --git a/media/libaom/src/av1/common/scan.c b/media/libaom/src/av1/common/scan.c
index c1d4f35813..b86068da02 100644
--- a/media/libaom/src/av1/common/scan.c
+++ b/media/libaom/src/av1/common/scan.c
@@ -1663,16 +1663,6 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
   978,  979,  995,  996,  1008, 1009, 1017, 1018, 1022, 1023
 };
 
-const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
-  { default_scan_4x4, av1_default_iscan_4x4 },
-  { default_scan_8x8, av1_default_iscan_8x8 },
-  { default_scan_16x16, av1_default_iscan_16x16 },
-  { default_scan_32x32, av1_default_iscan_32x32 },
-  // Half of the coefficients of tx64 at higher frequencies are set to
-  // zeros. So tx32's scan order is used.
-  { default_scan_32x32, av1_default_iscan_32x32 },
-};
-
 const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
   {
       // TX_4X4
diff --git a/media/libaom/src/av1/common/scan.h b/media/libaom/src/av1/common/scan.h
index d9620e1c53..4f369786f2 100644
--- a/media/libaom/src/av1/common/scan.h
+++ b/media/libaom/src/av1/common/scan.h
@@ -34,7 +34,6 @@ enum {
   SCAN_MODES
 } UENUM1BYTE(SCAN_MODE);
 
-extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
 extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
diff --git a/media/libaom/src/av1/common/seg_common.h b/media/libaom/src/av1/common/seg_common.h
index aeb9c1768e..3ad058c291 100644
--- a/media/libaom/src/av1/common/seg_common.h
+++ b/media/libaom/src/av1/common/seg_common.h
@@ -53,7 +53,6 @@ struct segmentation {
 };
 
 struct segmentation_probs {
-  aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)];
   aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
   aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
                                    [CDF_SIZE(MAX_SEGMENTS)];
diff --git a/media/libaom/src/av1/common/thread_common.c b/media/libaom/src/av1/common/thread_common.c
index f3c8795f8d..49522ea9a6 100644
--- a/media/libaom/src/av1/common/thread_common.c
+++ b/media/libaom/src/av1/common/thread_common.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom/aom_image.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
@@ -52,8 +53,8 @@ static INLINE int get_lr_sync_range(int width) {
 }
 
 // Allocate memory for lf row synchronization
-static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
-                              int width, int num_workers) {
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                           int width, int num_workers) {
   lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -150,6 +151,61 @@ static void loop_filter_data_reset(LFWorkerData *lf_data,
   }
 }
 
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers) {
+  if (num_workers < 1) return;
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
+    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+  }
+#else
+  (void)cm;
+  (void)cdef_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync) {
+  if (cdef_sync == NULL) return;
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(cdef_sync->mutex_);
+    aom_free(cdef_sync->mutex_);
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync,
+                                         int row) {
+  if (!row) return;
+#if CONFIG_MULTITHREAD
+  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+  pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_);
+  while (cdef_row_mt[row - 1].is_row_done != 1)
+    pthread_cond_wait(cdef_row_mt[row - 1].row_cond_,
+                      cdef_row_mt[row - 1].row_mutex_);
+  cdef_row_mt[row - 1].is_row_done = 0;
+  pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_);
+#else
+  (void)cdef_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync,
+                                          int row) {
+#if CONFIG_MULTITHREAD
+  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+  pthread_mutex_lock(cdef_row_mt[row].row_mutex_);
+  pthread_cond_signal(cdef_row_mt[row].row_cond_);
+  cdef_row_mt[row].is_row_done = 1;
+  pthread_mutex_unlock(cdef_row_mt[row].row_mutex_);
+#else
+  (void)cdef_sync;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
 static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
                              int plane) {
 #if CONFIG_MULTITHREAD
@@ -204,38 +260,51 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
 #endif  // CONFIG_MULTITHREAD
 }
 
-static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
-                            int stop,
-#if CONFIG_LPF_MASK
-                            int is_decoding,
-#endif
-                            int plane_start, int plane_end) {
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(const int planes_to_lf[3],
+                                                    int plane,
+                                                    int lpf_opt_level) {
+  // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
+  // chroma planes together
+  if (lpf_opt_level == 2) {
+    if (plane == AOM_PLANE_Y) {
+      return !planes_to_lf[plane];
+    }
+    if (plane == AOM_PLANE_U) {
+      // U and V are handled together
+      return !planes_to_lf[1] && !planes_to_lf[2];
+    }
+    assert(plane == AOM_PLANE_V);
+    if (plane == AOM_PLANE_V) {
+      // V is handled when u is filtered
+      return true;
+    }
+  }
+
+  // Normal operation mode
+  return !planes_to_lf[plane];
+}
+
+static void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+                            const int planes_to_lf[3], int lpf_opt_level) {
   int mi_row, plane, dir;
   AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
   lf_sync->jobs_enqueued = 0;
   lf_sync->jobs_dequeued = 0;
 
-  for (dir = 0; dir < 2; dir++) {
-    for (plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-#if CONFIG_LPF_MASK
-      int step = MAX_MIB_SIZE;
-      if (is_decoding) {
-        step = MI_SIZE_64X64;
-      }
-      for (mi_row = start; mi_row < stop; mi_row += step)
-#else
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE)
-#endif
-      {
+  // Launch all vertical jobs first, as they are blocking the horizontal ones.
+  // Launch top row jobs for all planes first, in case the output can be
+  // partially reconstructed row by row.
+  for (dir = 0; dir < 2; ++dir) {
+    for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+      for (plane = 0; plane < 3; ++plane) {
+        if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
+          continue;
+        }
+        if (!planes_to_lf[plane]) continue;
         lf_job_queue->mi_row = mi_row;
         lf_job_queue->plane = plane;
         lf_job_queue->dir = dir;
+        lf_job_queue->lpf_opt_level = lpf_opt_level;
         lf_job_queue++;
         lf_sync->jobs_enqueued++;
       }
@@ -262,171 +331,106 @@ static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
   return cur_job_info;
 }
 
-// Implement row loopfiltering for each thread.
+// One job of row loopfiltering.
 static INLINE void thread_loop_filter_rows(
     const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane *planes, MACROBLOCKD *xd,
-    AV1LfSync *const lf_sync) {
+    struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+    int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+    AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf) {
   const int sb_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
-      MAX_MIB_SIZE_LOG2;
-  int mi_row, mi_col, plane, dir;
-  int r, c;
-
-  while (1) {
-    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
-
-    if (cur_job_info != NULL) {
-      mi_row = cur_job_info->mi_row;
-      plane = cur_job_info->plane;
-      dir = cur_job_info->dir;
-      r = mi_row >> MAX_MIB_SIZE_LOG2;
-
-      if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MAX_MIB_SIZE) {
-          c = mi_col >> MAX_MIB_SIZE_LOG2;
-
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
-
-          av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
-                                      mi_col);
-          sync_write(lf_sync, r, c, sb_cols, plane);
-        }
-      } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MAX_MIB_SIZE) {
-          c = mi_col >> MAX_MIB_SIZE_LOG2;
-
-          // Wait for vertical edge filtering of the top-right block to be
-          // completed
-          sync_read(lf_sync, r, c, plane);
-
-          // Wait for vertical edge filtering of the right block to be
-          // completed
-          sync_read(lf_sync, r + 1, c, plane);
-
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
-          av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
-                                      mi_col);
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
+  const int r = mi_row >> MAX_MIB_SIZE_LOG2;
+  int mi_col, c;
+
+  const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y;
+  const int num_planes = joint_filter_chroma ? 2 : 1;
+  assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U));
+
+  if (dir == 0) {
+    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+      c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+                           mi_row, mi_col, plane, plane + num_planes);
+      if (lpf_opt_level) {
+        if (plane == AOM_PLANE_Y) {
+          av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row,
+                                          mi_col, params_buf, tx_buf);
+        } else {
+          av1_filter_block_plane_vert_opt_chroma(cm, xd, &planes[plane], mi_row,
+                                                 mi_col, params_buf, tx_buf,
+                                                 plane, joint_filter_chroma);
         }
+      } else {
+        av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                    mi_col);
+      }
+      if (lf_sync != NULL) {
+        sync_write(lf_sync, r, c, sb_cols, plane);
       }
-    } else {
-      break;
     }
-  }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
-  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
-  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
-  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                          lf_data->xd, lf_sync);
-  return 1;
-}
-
-#if CONFIG_LPF_MASK
-static INLINE void thread_loop_filter_bitmask_rows(
-    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane *planes, MACROBLOCKD *xd,
-    AV1LfSync *const lf_sync) {
-  const int sb_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >>
-      MIN_MIB_SIZE_LOG2;
-  int mi_row, mi_col, plane, dir;
-  int r, c;
-  (void)xd;
-
-  while (1) {
-    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
-
-    if (cur_job_info != NULL) {
-      mi_row = cur_job_info->mi_row;
-      plane = cur_job_info->plane;
-      dir = cur_job_info->dir;
-      r = mi_row >> MIN_MIB_SIZE_LOG2;
+  } else if (dir == 1) {
+    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+      c = mi_col >> MAX_MIB_SIZE_LOG2;
 
-      if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MI_SIZE_64X64) {
-          c = mi_col >> MIN_MIB_SIZE_LOG2;
+      if (lf_sync != NULL) {
+        // Wait for vertical edge filtering of the top-right block to be
+        // completed
+        sync_read(lf_sync, r, c, plane);
 
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
+        // Wait for vertical edge filtering of the right block to be completed
+        sync_read(lf_sync, r + 1, c, plane);
+      }
 
-          av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row,
-                                              mi_col);
-          sync_write(lf_sync, r, c, sb_cols, plane);
-        }
-      } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MI_SIZE_64X64) {
-          c = mi_col >> MIN_MIB_SIZE_LOG2;
-
-          // Wait for vertical edge filtering of the top-right block to be
-          // completed
-          sync_read(lf_sync, r, c, plane);
-
-          // Wait for vertical edge filtering of the right block to be
-          // completed
-          sync_read(lf_sync, r + 1, c, plane);
-
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
-                                              mi_col);
+      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+                           mi_row, mi_col, plane, plane + num_planes);
+      if (lpf_opt_level) {
+        if (plane == AOM_PLANE_Y) {
+          av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row,
+                                          mi_col, params_buf, tx_buf);
+        } else {
+          av1_filter_block_plane_horz_opt_chroma(cm, xd, &planes[plane], mi_row,
+                                                 mi_col, params_buf, tx_buf,
+                                                 plane, joint_filter_chroma);
         }
+      } else {
+        av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                    mi_col);
       }
-    } else {
-      break;
     }
   }
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
   AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
   LFWorkerData *const lf_data = (LFWorkerData *)arg2;
-  thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm,
-                                  lf_data->planes, lf_data->xd, lf_sync);
+  AV1LfMTInfo *cur_job_info;
+  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+    const int lpf_opt_level = cur_job_info->lpf_opt_level;
+    thread_loop_filter_rows(
+        lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+        cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+        lpf_opt_level, lf_sync, lf_data->params_buf, lf_data->tx_buf);
+  }
   return 1;
 }
-#endif  // CONFIG_LPF_MASK
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 MACROBLOCKD *xd, int start, int stop,
-                                int plane_start, int plane_end,
-#if CONFIG_LPF_MASK
-                                int is_decoding,
-#endif
-                                AVxWorker *workers, int nworkers,
-                                AV1LfSync *lf_sync) {
+                                const int planes_to_lf[3], AVxWorker *workers,
+                                int num_workers, AV1LfSync *lf_sync,
+                                int lpf_opt_level) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-#if CONFIG_LPF_MASK
-  int sb_rows;
-  if (is_decoding) {
-    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >>
-              MIN_MIB_SIZE_LOG2;
-  } else {
-    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
-              MAX_MIB_SIZE_LOG2;
-  }
-#else
   // Number of superblock rows and cols
   const int sb_rows =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
-      MAX_MIB_SIZE_LOG2;
-#endif
-  const int num_workers = nworkers;
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2);
   int i;
 
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
       num_workers > lf_sync->num_workers) {
     av1_loop_filter_dealloc(lf_sync);
-    loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
   // Initialize cur_sb_col to -1 for all SB rows.
@@ -435,26 +439,14 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
            sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
   }
 
-  enqueue_lf_jobs(lf_sync, cm, start, stop,
-#if CONFIG_LPF_MASK
-                  is_decoding,
-#endif
-                  plane_start, plane_end);
+  enqueue_lf_jobs(lf_sync, start, stop, planes_to_lf, lpf_opt_level);
 
   // Set up loopfilter thread data.
-  for (i = 0; i < num_workers; ++i) {
+  for (i = num_workers - 1; i >= 0; --i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-#if CONFIG_LPF_MASK
-    if (is_decoding) {
-      worker->hook = loop_filter_bitmask_row_worker;
-    } else {
-      worker->hook = loop_filter_row_worker;
-    }
-#else
     worker->hook = loop_filter_row_worker;
-#endif
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -462,7 +454,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     loop_filter_data_reset(lf_data, frame, cm, xd);
 
     // Start loopfiltering
-    if (i == num_workers - 1) {
+    if (i == 0) {
       winterface->execute(worker);
     } else {
       winterface->launch(worker);
@@ -470,20 +462,52 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   }
 
   // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
+  for (i = 1; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
 }
 
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                             MACROBLOCKD *xd, int start, int stop,
+                             const int planes_to_lf[3], int lpf_opt_level) {
+  // Filter top rows of all planes first, in case the output can be partially
+  // reconstructed row by row.
+  int mi_row, plane, dir;
+
+  AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
+  TX_SIZE tx_buf[MAX_MIB_SIZE];
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    for (plane = 0; plane < 3; ++plane) {
+      if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
+        continue;
+      }
+
+      for (dir = 0; dir < 2; ++dir) {
+        thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, dir,
+                                lpf_opt_level, /*lf_sync=*/NULL, params_buf,
+                                tx_buf);
+      }
+    }
+  }
+}
+
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                               MACROBLOCKD *xd, int plane_start, int plane_end,
-                              int partial_frame,
-#if CONFIG_LPF_MASK
-                              int is_decoding,
-#endif
-                              AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync) {
+                              int partial_frame, AVxWorker *workers,
+                              int num_workers, AV1LfSync *lf_sync,
+                              int lpf_opt_level) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
+  int planes_to_lf[3];
+
+  // For each luma and chroma plane, whether to filter it or not.
+  planes_to_lf[0] = (cm->lf.filter_level[0] || cm->lf.filter_level[1]) &&
+                    plane_start <= 0 && 0 < plane_end;
+  planes_to_lf[1] = cm->lf.filter_level_u && plane_start <= 1 && 1 < plane_end;
+  planes_to_lf[2] = cm->lf.filter_level_v && plane_start <= 2 && 2 < plane_end;
+  // If the luma plane is purposely not filtered, neither are the chroma planes.
+  if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return;
+  // Early exit.
+  if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return;
 
   start_mi_row = 0;
   mi_rows_to_filter = cm->mi_params.mi_rows;
@@ -495,37 +519,15 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
 
-#if CONFIG_LPF_MASK
-  if (is_decoding) {
-    cm->is_decoding = is_decoding;
-    // TODO(chengchen): currently use one thread to build bitmasks for the
-    // frame. Make it support multi-thread later.
-    for (int plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-
-      // TODO(chengchen): can we remove this?
-      struct macroblockd_plane *pd = xd->plane;
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane,
-                           plane + 1);
-
-      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
-      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
-    }
-    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                        plane_end, 1, workers, num_workers, lf_sync);
+  if (num_workers > 1) {
+    // Enqueue and execute loopfiltering jobs.
+    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
+                        workers, num_workers, lf_sync, lpf_opt_level);
   } else {
-    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                        plane_end, 0, workers, num_workers, lf_sync);
+    // Directly filter in the main thread.
+    loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
+                     lpf_opt_level);
   }
-#else
-  loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                      plane_end, workers, num_workers, lf_sync);
-#endif
 }
 
 static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
@@ -584,9 +586,9 @@ static INLINE void lr_sync_write(void *const lr_sync, int r, int c,
 }
 
 // Allocate memory for loop restoration row synchronization
-static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
-                                   int num_workers, int num_rows_lr,
-                                   int num_planes, int width) {
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                int num_workers, int num_rows_lr,
+                                int num_planes, int width) {
   lr_sync->rows = num_rows_lr;
   lr_sync->num_planes = num_planes;
 #if CONFIG_MULTITHREAD
@@ -717,7 +719,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
-    const int ss_y = is_uv && cm->seq_params.subsampling_y;
+    const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
     AV1PixelRect tile_rect = ctxt[plane].tile_rect;
     const int unit_size = ctxt[plane].rsi->restoration_unit_size;
@@ -875,11 +877,11 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   int i;
   assert(MAX_MB_PLANE == 3);
 
-  if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows ||
-      num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) {
+  if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+      num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) {
     av1_loop_restoration_dealloc(lr_sync, num_workers);
-    loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes,
-                           cm->width);
+    av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr,
+                               num_planes, cm->width);
   }
 
   // Initialize cur_sb_col to -1 for all SB rows.
@@ -891,15 +893,15 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
 
   // Set up looprestoration thread data.
-  for (i = 0; i < num_workers; ++i) {
+  for (i = num_workers - 1; i >= 0; --i) {
     AVxWorker *const worker = &workers[i];
     lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
     worker->hook = loop_restoration_row_worker;
     worker->data1 = lr_sync;
     worker->data2 = &lr_sync->lrworkerdata[i];
 
-    // Start loopfiltering
-    if (i == num_workers - 1) {
+    // Start loop restoration
+    if (i == 0) {
       winterface->execute(worker);
     } else {
       winterface->launch(worker);
@@ -907,7 +909,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
   }
 
   // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
+  for (i = 1; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
 }
@@ -928,3 +930,198 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
   foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
                                  cm);
 }
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
+  cdef_sync->end_of_frame = 0;
+  cdef_sync->fbr = 0;
+  cdef_sync->fbc = 0;
+}
+
+static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
+                                           int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    if (i == 0)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+}
+
+static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers,
+                                         AV1_COMMON *const cm,
+                                         int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int had_error = 0;
+
+  // Wait for completion of Cdef frame.
+  for (int i = num_workers - 1; i > 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    had_error |= !winterface->sync(worker);
+  }
+  if (had_error)
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to process cdef frame");
+}
+
+// Updates the row index of the next job to be processed.
+// Also updates end_of_frame flag when the processing of all rows is complete.
+static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync,
+                                          const int nvfb) {
+  cdef_sync->fbr++;
+  if (cdef_sync->fbr == nvfb) {
+    cdef_sync->end_of_frame = 1;
+  }
+}
+
+// Checks if a job is available. If job is available,
+// populates next job information and returns 1, else returns 0.
+static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
+                                            int *cur_fbr, const int nvfb) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  int do_next_row = 0;
+  // Populates information needed for current job and update the row
+  // index of the next row to be processed.
+  if (cdef_sync->end_of_frame == 0) {
+    do_next_row = 1;
+    *cur_fbr = cdef_sync->fbr;
+    update_cdef_row_next_job_info(cdef_sync, nvfb);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  return do_next_row;
+}
+
+// Hook function for each thread in CDEF multi-threading.
+static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+  AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
+  const int nvfb =
+      (cdef_worker->cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int cur_fbr;
+  while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
+    av1_cdef_fb_row(cdef_worker->cm, cdef_worker->xd, cdef_worker->linebuf,
+                    cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr,
+                    cdef_worker->cdef_init_fb_row_fn, cdef_sync);
+  }
+  return 1;
+}
+
+// Assigns CDEF hook function and thread data to each worker.
+static void prepare_cdef_frame_workers(
+    AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker,
+    AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+    int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  const int num_planes = av1_num_planes(cm);
+
+  cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
+  for (int plane = 0; plane < num_planes; plane++)
+    cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane];
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    cdef_worker[i].cm = cm;
+    cdef_worker[i].xd = xd;
+    cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
+    for (int plane = 0; plane < num_planes; plane++)
+      cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
+
+    worker->hook = hook;
+    worker->data1 = cdef_sync;
+    worker->data2 = &cdef_worker[i];
+  }
+}
+
+// Initializes row-level parameters for CDEF frame.
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+
+  // for the current filter block, it's top left corner mi structure (mi_tl)
+  // is first accessed to check whether the top and left boundaries are
+  // frame boundaries. Then bottom-left and top-right mi structures are
+  // accessed to check whether the bottom and right boundaries
+  // (respectively) are frame boundaries.
+  //
+  // Note that we can't just check the bottom-right mi structure - eg. if
+  // we're at the right-hand edge of the frame but not the bottom, then
+  // the bottom-right mi is NULL but the bottom-left is not.
+  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
+  if (fbr != nvfb - 1)
+    fb_info->frame_boundary[BOTTOM] =
+        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
+  else
+    fb_info->frame_boundary[BOTTOM] = 1;
+
+  fb_info->src = src;
+  fb_info->damping = cm->cdef_info.cdef_damping;
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    uint16_t *top_linebuf = &linebuf[plane][0];
+    uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride];
+    {
+      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+      const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+      const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+
+      if (fbr != nvfb - 1)  // if (fbr != 0)  // top line buffer copy
+        av1_cdef_copy_sb8_16(
+            cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride,
+            xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0,
+            xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+      if (fbr != nvfb - 1)  // bottom line buffer copy
+        av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride],
+                             stride, xd->plane[plane].dst.buf, bot_offset, 0,
+                             xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    }
+
+    fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride];
+    fb_info->bot_linebuf[plane] =
+        &linebuf[plane]
+                [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
+  }
+
+  cdef_row_mt_sync_write(cdef_sync, fbr);
+  cdef_row_mt_sync_read(cdef_sync, fbr);
+}
+
+// Implements multi-threading for CDEF.
+// Perform CDEF on input frame.
+// Inputs:
+//   frame: Pointer to input frame buffer.
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers,
+                       cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
+  const int num_planes = av1_num_planes(cm);
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+
+  reset_cdef_job_info(cdef_sync);
+  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
+                             workers, cdef_sync, num_workers,
+                             cdef_init_fb_row_fn);
+  launch_cdef_workers(workers, num_workers);
+  sync_cdef_workers(workers, cm, num_workers);
+}
diff --git a/media/libaom/src/av1/common/thread_common.h b/media/libaom/src/av1/common/thread_common.h
index 7397f1c542..7cbae33174 100644
--- a/media/libaom/src/av1/common/thread_common.h
+++ b/media/libaom/src/av1/common/thread_common.h
@@ -15,6 +15,7 @@
 #include "config/aom_config.h"
 
 #include "av1/common/av1_loopfilter.h"
+#include "av1/common/cdef.h"
 #include "aom_util/aom_thread.h"
 
 #ifdef __cplusplus
@@ -27,6 +28,7 @@ typedef struct AV1LfMTInfo {
   int mi_row;
   int plane;
   int dir;
+  int lpf_opt_level;
 } AV1LfMTInfo;
 
 // Loopfilter row synchronization
@@ -97,23 +99,75 @@ typedef struct AV1LrSyncData {
   int jobs_dequeued;
 } AV1LrSync;
 
+typedef struct AV1CdefWorker {
+  AV1_COMMON *cm;
+  MACROBLOCKD *xd;
+  uint16_t *colbuf[MAX_MB_PLANE];
+  uint16_t *srcbuf;
+  uint16_t *linebuf[MAX_MB_PLANE];
+  cdef_init_fb_row_t cdef_init_fb_row_fn;
+} AV1CdefWorkerData;
+
+typedef struct AV1CdefRowSync {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mutex_;
+  pthread_cond_t *row_cond_;
+#endif  // CONFIG_MULTITHREAD
+  int is_row_done;
+} AV1CdefRowSync;
+
+// Data related to CDEF search multi-thread synchronization.
+typedef struct AV1CdefSyncData {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Data related to CDEF row mt sync information
+  AV1CdefRowSync *cdef_row_mt;
+  // Flag to indicate all blocks are processed and end of frame is reached
+  int end_of_frame;
+  // Row index in units of 64x64 block
+  int fbr;
+  // Column index in units of 64x64 block
+  int fbc;
+} AV1CdefSync;
+
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn);
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr);
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize);
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers);
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync);
+
 // Deallocate loopfilter synchronization related mutex and data.
 void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                           int width, int num_workers);
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                               struct macroblockd *xd, int plane_start,
                               int plane_end, int partial_frame,
-#if CONFIG_LPF_MASK
-                              int is_decoding,
-#endif
                               AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync);
+                              AV1LfSync *lf_sync, int lpf_opt_level);
+
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                                           struct AV1Common *cm,
                                           int optimized_lr, AVxWorker *workers,
                                           int num_workers, AV1LrSync *lr_sync,
                                           void *lr_ctxt);
 void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                int num_workers, int num_rows_lr,
+                                int num_planes, int width);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/common/tile_common.c b/media/libaom/src/av1/common/tile_common.c
index 1b11bd7606..6ecead8183 100644
--- a/media/libaom/src/av1/common/tile_common.c
+++ b/media/libaom/src/av1/common/tile_common.c
@@ -28,14 +28,12 @@ static int tile_log2(int blk_size, int target) {
 }
 
 void av1_get_tile_limits(AV1_COMMON *const cm) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
-  const int mi_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
-  const int mi_rows =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
-  const int sb_cols = mi_cols >> seq_params->mib_size_log2;
-  const int sb_rows = mi_rows >> seq_params->mib_size_log2;
+  const int sb_cols =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+  const int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
 
   const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
   tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
@@ -51,10 +49,8 @@ void av1_get_tile_limits(AV1_COMMON *const cm) {
 void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
                              int cm_mi_rows, int cm_mi_cols,
                              CommonTileParams *const tiles) {
-  int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
-  int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
-  int sb_cols = mi_cols >> seq_params->mib_size_log2;
-  int sb_rows = mi_rows >> seq_params->mib_size_log2;
+  int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
+  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
   int i;
 
   // This will be overridden if there is at least two columns of tiles
@@ -63,8 +59,7 @@ void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
 
   if (tiles->uniform_spacing) {
     int start_sb;
-    int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols);
-    size_sb >>= tiles->log2_cols;
+    int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols);
     assert(size_sb > 0);
     for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
       tiles->col_start_sb[i] = start_sb;
@@ -105,13 +100,11 @@ void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
 
 void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
                              int cm_mi_rows, CommonTileParams *const tiles) {
-  int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
-  int sb_rows = mi_rows >> seq_params->mib_size_log2;
+  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
   int start_sb, size_sb, i;
 
   if (tiles->uniform_spacing) {
-    size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows);
-    size_sb >>= tiles->log2_rows;
+    size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows);
     assert(size_sb > 0);
     for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
       tiles->row_start_sb[i] = start_sb;
@@ -130,9 +123,9 @@ void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
 void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
   assert(row < cm->tiles.rows);
   int mi_row_start = cm->tiles.row_start_sb[row]
-                     << cm->seq_params.mib_size_log2;
+                     << cm->seq_params->mib_size_log2;
   int mi_row_end = cm->tiles.row_start_sb[row + 1]
-                   << cm->seq_params.mib_size_log2;
+                   << cm->seq_params->mib_size_log2;
   tile->tile_row = row;
   tile->mi_row_start = mi_row_start;
   tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
@@ -142,29 +135,23 @@ void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
 void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
   assert(col < cm->tiles.cols);
   int mi_col_start = cm->tiles.col_start_sb[col]
-                     << cm->seq_params.mib_size_log2;
+                     << cm->seq_params->mib_size_log2;
   int mi_col_end = cm->tiles.col_start_sb[col + 1]
-                   << cm->seq_params.mib_size_log2;
+                   << cm->seq_params->mib_size_log2;
   tile->tile_col = col;
   tile->mi_col_start = mi_col_start;
   tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
   assert(tile->mi_col_end > tile->mi_col_start);
 }
 
-int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
-  int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  return sb_rows;
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+  return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
+                           cm->seq_params->mib_size_log2);
 }
 
-int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
-  int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
-  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  return sb_cols;
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, const TileInfo *tile) {
+  return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
+                           cm->seq_params->mib_size_log2);
 }
 
 AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
@@ -195,8 +182,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
   r.bottom = AOMMIN(r.bottom, frame_h);
 
   // Convert to coordinates in the appropriate plane
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
   r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
@@ -215,7 +202,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
     for (int i = 0; i < tiles->cols; ++i) {
       const int tile_width_sb =
           tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
-      const int tile_w = tile_width_sb * cm->seq_params.mib_size;
+      const int tile_w = tile_width_sb * cm->seq_params->mib_size;
       assert(i == 0 || tile_w == *w);  // ensure all tiles have same dimension
       *w = tile_w;
     }
@@ -223,7 +210,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
     for (int i = 0; i < tiles->rows; ++i) {
       const int tile_height_sb =
           tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
-      const int tile_h = tile_height_sb * cm->seq_params.mib_size;
+      const int tile_h = tile_height_sb * cm->seq_params->mib_size;
       assert(i == 0 || tile_h == *h);  // ensure all tiles have same dimension
       *h = tile_h;
     }
diff --git a/media/libaom/src/av1/common/tile_common.h b/media/libaom/src/av1/common/tile_common.h
index ca7c5f496e..5e90d95e77 100644
--- a/media/libaom/src/av1/common/tile_common.h
+++ b/media/libaom/src/av1/common/tile_common.h
@@ -39,8 +39,8 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
 void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
 void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
 
-int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
-int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, const TileInfo *tile);
 
 typedef struct {
   int left, top, right, bottom;
diff --git a/media/libaom/src/av1/common/txb_common.h b/media/libaom/src/av1/common/txb_common.h
index 5a62fa89b1..5ba3951e8b 100644
--- a/media/libaom/src/av1/common/txb_common.h
+++ b/media/libaom/src/av1/common/txb_common.h
@@ -84,7 +84,8 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
   int ctx_idx = -1;
 
   if (row == 0 && col == 0) {
-    if (sig_mag >= 2) return ctx_idx = 0;
+    if (sig_mag >= 2) return 0;
+
     if (sig_mag == 1) {
       if (count >= 2)
         ctx_idx = 1;
@@ -98,7 +99,7 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
     assert(ctx_idx <= 6);
     return ctx_idx;
   } else if (row == 0) {
-    if (sig_mag >= 2) return ctx_idx = 6;
+    if (sig_mag >= 2) return 6;
     if (sig_mag == 1) {
       if (count >= 2)
         ctx_idx = 7;
@@ -111,7 +112,7 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
     assert(ctx_idx <= 11);
     return ctx_idx;
   } else if (col == 0) {
-    if (sig_mag >= 2) return ctx_idx = 12;
+    if (sig_mag >= 2) return 12;
     if (sig_mag == 1) {
       if (count >= 2)
         ctx_idx = 13;
@@ -126,7 +127,7 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
     // TODO(angiebird): turn this on once the optimization is finalized
     // assert(ctx_idx < 28);
   } else {
-    if (sig_mag >= 2) return ctx_idx = 18;
+    if (sig_mag >= 2) return 18;
     if (sig_mag == 1) {
       if (count >= 2)
         ctx_idx = 19;
diff --git a/media/libaom/src/av1/common/warped_motion.c b/media/libaom/src/av1/common/warped_motion.c
index 4e9fab9bd8..4e5966e46b 100644
--- a/media/libaom/src/av1/common/warped_motion.c
+++ b/media/libaom/src/av1/common/warped_motion.c
@@ -350,14 +350,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       // then convert back to the original coordinates (if necessary)
       const int32_t src_x = (j + 4) << subsampling_x;
       const int32_t src_y = (i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4);
@@ -540,7 +542,7 @@ static int64_t highbd_segmented_frame_error(
    are set appropriately (if using a ROTZOOM model), and that alpha, beta,
    gamma, delta are all in range.
 
-   TODO(david.barker): Maybe support scaled references?
+   TODO(rachelbarker): Maybe support scaled references?
 */
 /* A note on hardware implementation:
     The warp filter is intended to be implementable using the same hardware as
@@ -621,14 +623,16 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
       // then convert back to the original coordinates (if necessary)
       const int32_t src_x = (j + 4) << subsampling_x;
       const int32_t src_y = (i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4);
@@ -1052,8 +1056,6 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
       clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
   wm->wmmat[1] =
       clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
-
-  wm->wmmat[6] = wm->wmmat[7] = 0;
   return 0;
 }
 
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
index 0fbd5eae4b..7993707dac 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
@@ -61,8 +61,7 @@ static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
   btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
 }
 
-static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
-  (void)(cos_bit);
+static void idct16_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -106,35 +105,43 @@ static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
   x1[15] = input[15];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+                  INV_COS_BIT);
 
   // stage 3
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+                  INV_COS_BIT);
   btf_16_adds_subs_avx2(&x1[8], &x1[9]);
   btf_16_adds_subs_avx2(&x1[11], &x1[10]);
   btf_16_adds_subs_avx2(&x1[12], &x1[13]);
   btf_16_adds_subs_avx2(&x1[15], &x1[14]);
 
   // stage 4
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+                  INV_COS_BIT);
   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
 
-  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
+  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
   idct16_stage7_avx2(output, x1);
 }
 
-static void idct16_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)(cos_bit);
+static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -172,17 +179,17 @@ static void idct16_low8_avx2(const __m256i *input, __m256i *output,
   btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
 
-  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
+  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
   idct16_stage7_avx2(output, x1);
 }
 
-static void idct16_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)(cos_bit);
+static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -302,9 +309,7 @@ static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
   output[15] = _mm256_subs_epi16(__zero, x1[1]);
 }
 
-static void iadst16_avx2(const __m256i *input, __m256i *output,
-                         int8_t cos_bit) {
-  (void)(cos_bit);
+static void iadst16_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -346,27 +351,33 @@ static void iadst16_avx2(const __m256i *input, __m256i *output,
   x1[15] = input[14];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
+                  INV_COS_BIT);
 
   iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage9_avx2(output, x1);
 }
 
-static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)(cos_bit);
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -392,17 +403,15 @@ static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
   btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
 
   iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage9_avx2(output, x1);
 }
 
-static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)(cos_bit);
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -423,7 +432,8 @@ static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
   x1[9] = x1[1];
 
   // stage 4
-  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
+                  INV_COS_BIT);
 
   // stage 5
   x1[4] = x1[0];
@@ -433,8 +443,10 @@ static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
   x1[13] = x1[9];
 
   // stage 6
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
+                  INV_COS_BIT);
 
   // stage 7
   x1[2] = x1[0];
@@ -446,7 +458,7 @@ static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
   x1[14] = x1[12];
   x1[15] = x1[13];
 
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
   iadst16_stage9_avx2(output, x1);
 }
 
@@ -567,9 +579,7 @@ static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
   btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
 }
 
-static void idct32_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -620,9 +630,7 @@ static void idct32_low1_avx2(const __m256i *input, __m256i *output,
   output[16] = x[0];
 }
 
-static void idct32_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -661,26 +669,24 @@ static void idct32_low8_avx2(const __m256i *input, __m256i *output,
   x[10] = x[11];
   x[13] = x[12];
   x[14] = x[15];
-  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
+  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   x[5] = x[4];
   x[6] = x[7];
-  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
+  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
   // stage 6
   x[3] = x[0];
   x[2] = x[1];
-  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
+  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
 
-  idct32_stage7_avx2(x, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x, cospi, _r, cos_bit);
+  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
   idct32_stage9_avx2(output, x);
 }
 
-static void idct32_low16_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -727,26 +733,25 @@ static void idct32_low16_avx2(const __m256i *input, __m256i *output,
   btf_16_adds_subs_avx2(&x[11], &x[10]);
   btf_16_adds_subs_avx2(&x[12], &x[13]);
   btf_16_adds_subs_avx2(&x[15], &x[14]);
-  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
+  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
   btf_16_adds_subs_avx2(&x[4], &x[5]);
   btf_16_adds_subs_avx2(&x[7], &x[6]);
-  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
+  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
 
   btf_16_adds_subs_avx2(&x[0], &x[3]);
   btf_16_adds_subs_avx2(&x[1], &x[2]);
-  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
+  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
 
-  idct32_stage7_avx2(x, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x, cospi, _r, cos_bit);
+  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
   idct32_stage9_avx2(output, x);
 }
 
-static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
-  (void)(cos_bit);
+static void idct32_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -819,45 +824,61 @@ static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
   x1[31] = input[31];
 
   // stage 2
-  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
+                  INV_COS_BIT);
 
   // stage 3
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+                  INV_COS_BIT);
   idct32_high16_stage3_avx2(x1);
 
   // stage 4
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+                  INV_COS_BIT);
   btf_16_adds_subs_avx2(&x1[8], &x1[9]);
   btf_16_adds_subs_avx2(&x1[11], &x1[10]);
   btf_16_adds_subs_avx2(&x1[12], &x1[13]);
   btf_16_adds_subs_avx2(&x1[15], &x1[14]);
-  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
+  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
 
   // stage 5
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+                  INV_COS_BIT);
   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
+  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
 
   // stage 6
   btf_16_adds_subs_avx2(&x1[0], &x1[3]);
   btf_16_adds_subs_avx2(&x1[1], &x1[2]);
-  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
+  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
 
-  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
+  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
   idct32_stage9_avx2(output, x1);
 }
 
@@ -1102,9 +1123,7 @@ static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
   btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
 }
 
-static void idct64_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -1189,9 +1208,7 @@ static void idct64_low1_avx2(const __m256i *input, __m256i *output,
   output[32] = x[0];
 }
 
-static void idct64_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
   const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
@@ -1246,16 +1263,22 @@ static void idct64_low8_avx2(const __m256i *input, __m256i *output,
   x[22] = x[23];
   x[25] = x[24];
   x[30] = x[31];
-  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
+                  INV_COS_BIT);
 
   // stage 5
   x[9] = x[8];
   x[14] = x[15];
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
+                  INV_COS_BIT);
   x[35] = x[32];
   x[34] = x[33];
   x[36] = x[39];
@@ -1275,7 +1298,7 @@ static void idct64_low8_avx2(const __m256i *input, __m256i *output,
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
   x[19] = x[16];
   x[18] = x[17];
   x[20] = x[23];
@@ -1284,7 +1307,7 @@ static void idct64_low8_avx2(const __m256i *input, __m256i *output,
   x[26] = x[25];
   x[28] = x[31];
   x[29] = x[30];
-  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 7
   x[3] = x[0];
@@ -1293,25 +1316,25 @@ static void idct64_low8_avx2(const __m256i *input, __m256i *output,
   x[10] = x[9];
   x[12] = x[15];
   x[13] = x[14];
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 8
   x[7] = x[0];
   x[6] = x[1];
   x[5] = x[2];
   x[4] = x[3];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
-
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+                  INV_COS_BIT);
+  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
   idct64_stage11_avx2(output, x);
 }
 
-static void idct64_low16_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -1383,7 +1406,7 @@ static void idct64_low16_avx2(const __m256i *input, __m256i *output,
   x[26] = x[27];
   x[29] = x[28];
   x[30] = x[31];
-  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
@@ -1391,43 +1414,44 @@ static void idct64_low16_avx2(const __m256i *input, __m256i *output,
   x[10] = x[11];
   x[13] = x[12];
   x[14] = x[15];
-  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   x[5] = x[4];
   x[6] = x[7];
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
-  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+                  INV_COS_BIT);
+  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 7
   x[3] = x[0];
   x[2] = x[1];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
   btf_16_adds_subs_avx2(&x[8], &x[11]);
   btf_16_adds_subs_avx2(&x[9], &x[10]);
   btf_16_adds_subs_avx2(&x[15], &x[12]);
   btf_16_adds_subs_avx2(&x[14], &x[13]);
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 8
   btf_16_adds_subs_avx2(&x[0], &x[7]);
   btf_16_adds_subs_avx2(&x[1], &x[6]);
   btf_16_adds_subs_avx2(&x[2], &x[5]);
   btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
-
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+                  INV_COS_BIT);
+  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
   idct64_stage11_avx2(output, x);
 }
 
-static void idct64_low32_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -1529,7 +1553,7 @@ static void idct64_low32_avx2(const __m256i *input, __m256i *output,
   btf_16_adds_subs_avx2(&x[27], &x[26]);
   btf_16_adds_subs_avx2(&x[28], &x[29]);
   btf_16_adds_subs_avx2(&x[31], &x[30]);
-  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 5
   btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
@@ -1538,44 +1562,46 @@ static void idct64_low32_avx2(const __m256i *input, __m256i *output,
   btf_16_adds_subs_avx2(&x[11], &x[10]);
   btf_16_adds_subs_avx2(&x[12], &x[13]);
   btf_16_adds_subs_avx2(&x[15], &x[14]);
-  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 6
   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
   btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
   btf_16_adds_subs_avx2(&x[4], &x[5]);
   btf_16_adds_subs_avx2(&x[7], &x[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
-  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+                  INV_COS_BIT);
+  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 7
   btf_16_adds_subs_avx2(&x[0], &x[3]);
   btf_16_adds_subs_avx2(&x[1], &x[2]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
   btf_16_adds_subs_avx2(&x[8], &x[11]);
   btf_16_adds_subs_avx2(&x[9], &x[10]);
   btf_16_adds_subs_avx2(&x[15], &x[12]);
   btf_16_adds_subs_avx2(&x[14], &x[13]);
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 8
   btf_16_adds_subs_avx2(&x[0], &x[7]);
   btf_16_adds_subs_avx2(&x[1], &x[6]);
   btf_16_adds_subs_avx2(&x[2], &x[5]);
   btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+                  INV_COS_BIT);
+  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
 
   // stage 9~11
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
+  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
   idct64_stage11_avx2(output, x);
 }
 
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit);
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);
 
 // 1D functions process 16 pixels at one time.
 static const transform_1d_avx2
@@ -1612,8 +1638,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
@@ -1647,7 +1671,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
     if (rect_type == 1 || rect_type == -1) {
       round_shift_avx2(buf0, buf0, input_stride);  // rect special code
     }
-    row_txfm(buf0, buf0, cos_bit_row);
+    row_txfm(buf0, buf0);
     for (int j = 0; j < txfm_size_col; ++j) {
       buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
     }
@@ -1669,7 +1693,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
   const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
   for (int i = 0; i < buf_size_w_div16; i++) {
     __m256i *buf1_cur = buf1 + i * txfm_size_row;
-    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
+    col_txfm(buf1_cur, buf1_cur);
     for (int j = 0; j < txfm_size_row; ++j) {
       buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
     }
@@ -1774,7 +1798,6 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
@@ -1794,7 +1817,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
     __m256i buf0[64];
     iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
                             eoby + 1, txw_idx, rect_type);
-    col_txfm(buf0, buf0, cos_bit_col);
+    col_txfm(buf0, buf0);
     __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
     int k = ud_flip ? (txfm_size_row - 1) : 0;
     const int step = ud_flip ? -1 : 1;
@@ -1814,7 +1837,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div16 = txfm_size_col >> 4;
@@ -1842,7 +1864,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
     if (rect_type == 1 || rect_type == -1) {
       round_shift_avx2(buf0, buf0, input_stride);  // rect special code
     }
-    row_txfm(buf0, buf0, cos_bit_row);
+    row_txfm(buf0, buf0);
     round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
     __m256i *_buf1 = buf1;
     if (lr_flip) {
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h
index f74cbaeaa5..a09dea389f 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h
@@ -27,13 +27,13 @@ extern "C" {
 
 // half input is zero
 #define btf_16_w16_0_avx2(w0, w1, in, out0, out1)  \
-  {                                                \
+  do {                                             \
     const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
     const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
     const __m256i _in = in;                        \
     out0 = _mm256_mulhrs_epi16(_in, _w0);          \
     out1 = _mm256_mulhrs_epi16(_in, _w1);          \
-  }
+  } while (0)
 
 static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
                                     int size) {
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
index 46c051ff8c..a2a43f8d8d 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -24,8 +24,8 @@ static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
 
 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
 
-static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void idct4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -50,9 +50,8 @@ static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
 }
 
-static void idct4_w4_sse2(const __m128i *input, __m128i *output,
-                          int8_t cos_bit) {
-  (void)cos_bit;
+static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -77,9 +76,7 @@ static void idct4_w4_sse2(const __m128i *input, __m128i *output,
   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
 }
 
-static void idct8_low1_ssse3(const __m128i *input, __m128i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void idct8_low1_ssse3(const __m128i *input, __m128i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -102,8 +99,8 @@ static void idct8_low1_ssse3(const __m128i *input, __m128i *output,
   output[4] = x[0];
 }
 
-static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void idct8_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -150,9 +147,8 @@ static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
 }
 
-static void idct8_w4_sse2(const __m128i *input, __m128i *output,
-                          int8_t cos_bit) {
-  (void)cos_bit;
+static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -237,9 +233,7 @@ static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
   btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
 }
 
-static void idct16_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -272,9 +266,8 @@ static void idct16_low1_ssse3(const __m128i *input, __m128i *output,
   output[8] = x[0];
 }
 
-static void idct16_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
@@ -319,8 +312,8 @@ static void idct16_low8_ssse3(const __m128i *input, __m128i *output,
   idct16_stage7_sse2(output, x);
 }
 
-static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void idct16_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -391,9 +384,8 @@ static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   idct16_stage7_sse2(output, x);
 }
 
-static void idct16_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
+static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -602,9 +594,7 @@ static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
   btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
 }
 
-static void idct32_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -655,9 +645,8 @@ static void idct32_low1_ssse3(const __m128i *input, __m128i *output,
   output[16] = x[0];
 }
 
-static void idct32_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -713,9 +702,8 @@ static void idct32_low8_ssse3(const __m128i *input, __m128i *output,
   idct32_stage9_sse2(output, x);
 }
 
-static void idct32_low16_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -780,8 +768,8 @@ static void idct32_low16_ssse3(const __m128i *input, __m128i *output,
   idct32_stage9_sse2(output, x);
 }
 
-static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void idct32_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -1139,9 +1127,7 @@ static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
   btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
 }
 
-static void idct64_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
 
   // stage 1
@@ -1226,9 +1212,8 @@ static void idct64_low1_ssse3(const __m128i *input, __m128i *output,
   output[32] = x[0];
 }
 
-static void idct64_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
   const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
@@ -1346,9 +1331,8 @@ static void idct64_low8_ssse3(const __m128i *input, __m128i *output,
   idct64_stage11_sse2(output, x);
 }
 
-static void idct64_low16_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -1462,9 +1446,8 @@ static void idct64_low16_ssse3(const __m128i *input, __m128i *output,
   idct64_stage11_sse2(output, x);
 }
 
-static void idct64_low32_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
+static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -1611,8 +1594,7 @@ static void idct64_low32_ssse3(const __m128i *input, __m128i *output,
   idct64_stage11_sse2(output, x);
 }
 
-static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst4_sse2(const __m128i *input, __m128i *output) {
   const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
   const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
@@ -1672,9 +1654,7 @@ static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   }
 }
 
-static void iadst4_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
   const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
   const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
   const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
@@ -1718,9 +1698,8 @@ static void iadst4_w4_sse2(const __m128i *input, __m128i *output,
   }
 }
 
-static void iadst8_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1765,8 +1744,8 @@ static void iadst8_low1_ssse3(const __m128i *input, __m128i *output,
   output[7] = _mm_subs_epi16(__zero, x[1]);
 }
 
-static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst8_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1833,9 +1812,8 @@ static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   output[7] = _mm_subs_epi16(__zero, x[1]);
 }
 
-static void iadst8_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __zero = _mm_setzero_si128();
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1993,9 +1971,8 @@ static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
   output[15] = _mm_subs_epi16(__zero, x[1]);
 }
 
-static void iadst16_low1_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -2042,9 +2019,8 @@ static void iadst16_low1_ssse3(const __m128i *input, __m128i *output,
   iadst16_stage9_ssse3(output, x);
 }
 
-static void iadst16_low8_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -2078,9 +2054,8 @@ static void iadst16_low8_ssse3(const __m128i *input, __m128i *output,
   iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
   iadst16_stage9_ssse3(output, x);
 }
-static void iadst16_sse2(const __m128i *input, __m128i *output,
-                         int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst16_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
   const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
@@ -2139,9 +2114,8 @@ static void iadst16_sse2(const __m128i *input, __m128i *output,
   iadst16_stage9_ssse3(output, x);
 }
 
-static void iadst16_w4_sse2(const __m128i *input, __m128i *output,
-                            int8_t cos_bit) {
-  (void)cos_bit;
+static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
+  const int8_t cos_bit = INV_COS_BIT;
   const int32_t *cospi = cospi_arr(INV_COS_BIT);
   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
 
@@ -2233,9 +2207,7 @@ static void iadst16_w4_sse2(const __m128i *input, __m128i *output,
   iadst16_stage9_ssse3(output, x);
 }
 
-static void iidentity4_ssse3(const __m128i *input, __m128i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
+static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
   const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
   for (int i = 0; i < 4; ++i) {
@@ -2244,17 +2216,13 @@ static void iidentity4_ssse3(const __m128i *input, __m128i *output,
   }
 }
 
-static void iidentity8_sse2(const __m128i *input, __m128i *output,
-                            int8_t cos_bit) {
-  (void)cos_bit;
+static void iidentity8_sse2(const __m128i *input, __m128i *output) {
   for (int i = 0; i < 8; ++i) {
     output[i] = _mm_adds_epi16(input[i], input[i]);
   }
 }
 
-static void iidentity16_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
+static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
   const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
   for (int i = 0; i < 16; ++i) {
@@ -2446,8 +2414,6 @@ static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2460,7 +2426,7 @@ static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
   transpose_16bit_4x4(buf, buf);
-  row_txfm(buf, buf, cos_bit_row);
+  row_txfm(buf, buf);
   if (lr_flip) {
     __m128i temp[4];
     flip_buf_sse2(buf, temp, txfm_size_col);
@@ -2468,7 +2434,7 @@ static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
   } else {
     transpose_16bit_4x4(buf, buf);
   }
-  col_txfm(buf, buf, cos_bit_col);
+  col_txfm(buf, buf);
   round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
@@ -2512,8 +2478,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2544,7 +2508,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
     if (rect_type == 1 || rect_type == -1) {
       round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
     }
-    row_txfm(buf0, buf0, cos_bit_row);
+    row_txfm(buf0, buf0);
     round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
     __m128i *_buf1 = buf1 + i * 8;
     if (lr_flip) {
@@ -2561,7 +2525,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
     }
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
     round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
   }
 
@@ -2584,7 +2548,6 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = (eobx + 8) >> 3;
@@ -2604,7 +2567,7 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
     __m128i buf0[64];
     iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
                             eoby + 1, txw_idx, rect_type);
-    col_txfm(buf0, buf0, cos_bit_col);
+    col_txfm(buf0, buf0);
     __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
     int k = ud_flip ? (txfm_size_row - 1) : 0;
     const int step = ud_flip ? -1 : 1;
@@ -2628,7 +2591,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2654,7 +2616,7 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
     if (rect_type == 1 || rect_type == -1) {
       round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
     }
-    row_txfm(buf0, buf0, cos_bit_row);
+    row_txfm(buf0, buf0);
     round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
     __m128i *_buf1 = buf1;
     if (lr_flip) {
@@ -2718,8 +2680,6 @@ static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2733,7 +2693,7 @@ static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
   load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
   transpose_16bit_4x8(buf, buf);
   round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
-  row_txfm(buf, buf, cos_bit_row);
+  row_txfm(buf, buf);
   // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
   if (lr_flip) {
     __m128i temp[4];
@@ -2742,7 +2702,7 @@ static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
   } else {
     transpose_16bit_8x4(buf, buf);
   }
-  col_txfm(buf, buf, cos_bit_col);
+  col_txfm(buf, buf);
   round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
@@ -2758,8 +2718,6 @@ static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2773,7 +2731,7 @@ static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
   load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
   transpose_16bit_8x4(buf, buf);
   round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
-  row_txfm(buf, buf, cos_bit_row);
+  row_txfm(buf, buf);
   // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
   if (lr_flip) {
     __m128i temp[8];
@@ -2782,7 +2740,7 @@ static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
   } else {
     transpose_16bit_4x8(buf, buf);
   }
-  col_txfm(buf, buf, cos_bit_col);
+  col_txfm(buf, buf);
   round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
@@ -2798,8 +2756,6 @@ static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
 
@@ -2831,7 +2787,7 @@ static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
         buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
       }
     } else {
-      row_txfm(buf_cur, buf_cur, cos_bit_row);
+      row_txfm(buf_cur, buf_cur);
       round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
     }
     if (lr_flip) {
@@ -2842,7 +2798,7 @@ static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
       transpose_16bit_8x4(buf_cur, buf_cur);
     }
   }
-  col_txfm(buf, buf, cos_bit_col);
+  col_txfm(buf, buf);
   round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
@@ -2858,8 +2814,6 @@ static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
   const int txfm_size_col = tx_size_wide[tx_size];
   const int txfm_size_row = tx_size_high[tx_size];
   const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2892,7 +2846,7 @@ static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
       buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
     }
   } else {
-    row_txfm(buf, buf, cos_bit_row);
+    row_txfm(buf, buf);
     round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
   }
   if (lr_flip) {
@@ -2905,7 +2859,7 @@ static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
     transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
     round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
   }
   lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
index 7d5055debe..b85bc9dd3b 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -219,8 +219,7 @@ static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
   *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
 }
 
-typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit);
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
 
 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                     int stride, TX_TYPE tx_type,
diff --git a/media/libaom/src/av1/common/x86/av1_txfm_sse2.h b/media/libaom/src/av1/common/x86/av1_txfm_sse2.h
index 77aeb6eb13..b67bf54572 100644
--- a/media/libaom/src/av1/common/x86/av1_txfm_sse2.h
+++ b/media/libaom/src/av1/common/x86/av1_txfm_sse2.h
@@ -42,7 +42,7 @@ static INLINE void btf_16_w4_sse2(
 }
 
 #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
-  {                                                  \
+  do {                                               \
     __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
     __m128i u0 = _mm_madd_epi16(t0, w0);             \
     __m128i v0 = _mm_madd_epi16(t0, w1);             \
@@ -55,10 +55,10 @@ static INLINE void btf_16_w4_sse2(
                                                      \
     out0 = _mm_packs_epi32(c0, c0);                  \
     out1 = _mm_packs_epi32(d0, d0);                  \
-  }
+  } while (0)
 
 #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
-  {                                               \
+  do {                                            \
     __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
     __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
     __m128i u0 = _mm_madd_epi16(t0, w0);          \
@@ -78,7 +78,7 @@ static INLINE void btf_16_w4_sse2(
                                                   \
     out0 = _mm_packs_epi32(c0, c1);               \
     out1 = _mm_packs_epi32(d0, d1);               \
-  }
+  } while (0)
 
 static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
   return _mm_load_si128((const __m128i *)a);
diff --git a/media/libaom/src/av1/common/x86/cdef_block_avx2.c b/media/libaom/src/av1/common/x86/cdef_block_avx2.c
new file mode 100644
index 0000000000..3396a51cf7
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/cdef_block_avx2.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "av1/common/cdef_block_simd.h"
+
+// Mask used to shuffle the elements present in 256bit register.
+const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
+                                    0x0f0e0100, 0x0b0a0d0c, 0x07060908,
+                                    0x03020504, 0x0f0e0100 };
+
+/* partial A is a 16-bit vector of the form:
+[x8 - - x1 | x16 - - x9] and partial B has the form:
+[0  y1 - y7 | 0 y9 - y15].
+This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+(x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants
+are in const1 and const2. */
+static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
+                                            __m256i *partialb,
+                                            const __m256i *const1,
+                                            const __m256i *const2) {
+  __m256i tmp;
+  /* Reverse partial B. */
+  *partialb = _mm256_shuffle_epi8(
+      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
+
+  /* Interleave the x and y values of identical indices and pair x8 with 0. */
+  tmp = *partiala;
+  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
+  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
+
+  /* Square and add the corresponding x and y values. */
+  *partiala = _mm256_madd_epi16(*partiala, *partiala);
+  *partialb = _mm256_madd_epi16(*partialb, *partialb);
+  /* Multiply by constant. */
+  *partiala = _mm256_mullo_epi32(*partiala, *const1);
+  *partialb = _mm256_mullo_epi32(*partialb, *const2);
+  /* Sum all results. */
+  *partiala = _mm256_add_epi32(*partiala, *partialb);
+  return *partiala;
+}
+
+static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
+                                 __m256i *x3) {
+  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
+  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
+  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
+  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);
+
+  *x0 = _mm256_unpacklo_epi64(t0, t1);
+  *x1 = _mm256_unpackhi_epi64(t0, t1);
+  *x2 = _mm256_unpacklo_epi64(t2, t3);
+  *x3 = _mm256_unpackhi_epi64(t2, t3);
+  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
+                          _mm256_add_epi32(*x2, *x3));
+}
+
+/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
+to compute the remaining directions. */
+static INLINE __m256i compute_directions_avx2(__m256i *lines,
+                                              int32_t cost_frist_8x8[4],
+                                              int32_t cost_second_8x8[4]) {
+  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+  __m256i partial6;
+  __m256i tmp;
+  /* Partial sums for lines 0 and 1. */
+  partial4a = _mm256_slli_si256(lines[0], 14);
+  partial4b = _mm256_srli_si256(lines[0], 2);
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
+  tmp = _mm256_add_epi16(lines[0], lines[1]);
+  partial5a = _mm256_slli_si256(tmp, 10);
+  partial5b = _mm256_srli_si256(tmp, 6);
+  partial7a = _mm256_slli_si256(tmp, 4);
+  partial7b = _mm256_srli_si256(tmp, 12);
+  partial6 = tmp;
+
+  /* Partial sums for lines 2 and 3. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
+  tmp = _mm256_add_epi16(lines[2], lines[3]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 4 and 5. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
+  tmp = _mm256_add_epi16(lines[4], lines[5]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 6 and 7. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
+  partial4a = _mm256_add_epi16(partial4a, lines[7]);
+  tmp = _mm256_add_epi16(lines[6], lines[7]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  const __m256i const_reg_1 =
+      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
+  const __m256i const_reg_2 =
+      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
+  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
+  const __m256i const_reg_4 =
+      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
+
+  /* Compute costs in terms of partial sums. */
+  partial4a =
+      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
+  partial7a =
+      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
+  partial5a =
+      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
+  partial6 = _mm256_madd_epi16(partial6, partial6);
+  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
+
+  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
+  _mm_storeu_si128((__m128i *)cost_frist_8x8,
+                   _mm256_castsi256_si128(partial4a));
+  _mm_storeu_si128((__m128i *)cost_second_8x8,
+                   _mm256_extractf128_si256(partial4a, 1));
+
+  return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
+  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
+  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
+  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
+  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
+  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+
+  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
+  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
+  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
+  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
+  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
+  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
+  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
+  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
+                             int stride, int32_t *var_out_1st,
+                             int32_t *var_out_2nd, int coeff_shift,
+                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  int32_t cost_first_8x8[8];
+  int32_t cost_second_8x8[8];
+  // Used to store the best cost for 2 8x8's.
+  int32_t best_cost[2] = { 0 };
+  // Best direction for 2 8x8's.
+  int best_dir[2] = { 0 };
+
+  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
+  const __m256i const_128_reg = _mm256_set1_epi16(128);
+  __m256i lines[8];
+  for (int i = 0; i < 8; i++) {
+    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
+    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
+
+    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
+    lines[i] = _mm256_sub_epi16(
+        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
+  }
+
+  /* Compute "mostly vertical" directions. */
+  const __m256i dir47 =
+      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
+
+  /* Transpose and reverse the order of the lines. */
+  array_reverse_transpose_8x8_avx2(lines, lines);
+
+  /* Compute "mostly horizontal" directions. */
+  const __m256i dir03 =
+      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
+
+  __m256i max = _mm256_max_epi32(dir03, dir47);
+  max =
+      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
+                                            _mm256_slli_si256(max, 16 - (8))));
+  max =
+      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
+                                            _mm256_slli_si256(max, 16 - (4))));
+
+  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
+  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
+  const __m128i cmpeg_res_00 =
+      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
+  const __m128i cmpeg_res_01 =
+      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
+  const __m128i cmpeg_res_10 =
+      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
+  const __m128i cmpeg_res_11 =
+      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
+  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
+  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
+
+  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
+  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
+  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
+  best_dir[0] =
+      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
+  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
+  best_dir[1] =
+      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
+
+  /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
+  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
+
+  /* We'd normally divide by 840, but dividing by 1024 is close enough
+  for what we're going to do with this. */
+  *var_out_1st >>= 10;
+  *var_out_2nd >>= 10;
+  *out_dir_1st_8x8 = best_dir[0];
+  *out_dir_2nd_8x8 = best_dir[1];
+}
+
+void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
+                                        const uint8_t *src, int sstride, int v,
+                                        int h) {
+  int i = 0, j = 0;
+  int remaining_width = h;
+
+  // Process multiple 16 pixels at a time.
+  if (h > 15) {
+    for (i = 0; i < v; i++) {
+      for (j = 0; j < h - 15; j += 16) {
+        __m128i row = _mm_loadu_si128((__m128i *)&src[i * sstride + j]);
+        _mm256_storeu_si256((__m256i *)&dst[i * dstride + j],
+                            _mm256_cvtepu8_epi16(row));
+      }
+    }
+    remaining_width = h & 0xe;
+  }
+
+  // Process multiple 8 pixels at a time.
+  if (remaining_width > 7) {
+    for (i = 0; i < v; i++) {
+      __m128i row = _mm_loadl_epi64((__m128i *)&src[i * sstride + j]);
+      _mm_storeu_si128((__m128i *)&dst[i * dstride + j],
+                       _mm_unpacklo_epi8(row, _mm_setzero_si128()));
+    }
+    remaining_width = h & 0x7;
+    j += 8;
+  }
+
+  // Process the remaining pixels.
+  if (remaining_width) {
+    for (i = 0; i < v; i++) {
+      for (int k = j; k < h; k++) {
+        dst[i * dstride + k] = src[i * sstride + k];
+      }
+    }
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/cdef_block_sse2.c b/media/libaom/src/av1/common/x86/cdef_block_sse2.c
new file mode 100644
index 0000000000..faf51fdd98
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/cdef_block_sse2.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2,
+                             int stride, int32_t *var_out_1st,
+                             int32_t *var_out_2nd, int coeff_shift,
+                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Process first 8x8.
+  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+  // Process second 8x8.
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride,
+                                        const uint8_t *src, int sstride, int v,
+                                        int h) {
+  int j = 0;
+  for (int i = 0; i < v; i++) {
+    for (j = 0; j < (h & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < h; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/cdef_block_sse4.c b/media/libaom/src/av1/common/x86/cdef_block_sse4.c
new file mode 100644
index 0000000000..f87d158456
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/cdef_block_sse4.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2,
+                               int stride, int32_t *var_out_1st,
+                               int32_t *var_out_2nd, int coeff_shift,
+                               int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Process first 8x8.
+  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+  // Process second 8x8.
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride,
+                                          const uint8_t *src, int sstride,
+                                          int v, int h) {
+  int j = 0;
+  for (int i = 0; i < v; i++) {
+    for (j = 0; j < (h & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < h; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/cdef_block_ssse3.c b/media/libaom/src/av1/common/x86/cdef_block_ssse3.c
new file mode 100644
index 0000000000..a2faf79e3e
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/cdef_block_ssse3.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2,
+                              int stride, int32_t *var_out_1st,
+                              int32_t *var_out_2nd, int coeff_shift,
+                              int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Process first 8x8.
+  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+  // Process second 8x8.
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride,
+                                         const uint8_t *src, int sstride, int v,
+                                         int h) {
+  int j;
+  for (int i = 0; i < v; i++) {
+    for (j = 0; j < (h & ~0x7); j += 8) {
+      v64 row = v64_load_unaligned(&src[i * sstride + j]);
+      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+    }
+    for (; j < h; j++) {
+      dst[i * dstride + j] = src[i * sstride + j];
+    }
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/cfl_avx2.c b/media/libaom/src/av1/common/x86/cfl_avx2.c
index d9c6f99d59..e1e187c4a6 100644
--- a/media/libaom/src/av1/common/x86/cfl_avx2.c
+++ b/media/libaom/src/av1/common/x86/cfl_avx2.c
@@ -271,9 +271,9 @@ static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
   } while ((row += CFL_BUF_LINE_I256) < row_end);
 }
 
-CFL_PREDICT_X(avx2, 32, 8, lbd);
-CFL_PREDICT_X(avx2, 32, 16, lbd);
-CFL_PREDICT_X(avx2, 32, 32, lbd);
+CFL_PREDICT_X(avx2, 32, 8, lbd)
+CFL_PREDICT_X(avx2, 32, 16, lbd)
+CFL_PREDICT_X(avx2, 32, 32, lbd)
 
 cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
   static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
diff --git a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
index e19575d725..04112ff9b9 100644
--- a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
+++ b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
@@ -26,292 +26,130 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
-  const int bd = 8;
-  int im_stride = 8;
-  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  if (filter_params_x->taps > 8) {
+    const int bd = 8;
+    int im_stride = 8, i;
+    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+    const int bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
 
-  assert(conv_params->round_0 > 0);
+    assert(conv_params->round_0 > 0);
 
-  const __m256i round_const_h = _mm256_set1_epi16(
-      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+    const __m256i round_const_h12 = _mm256_set1_epi32(
+        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
 
-  const __m256i sum_round_v = _mm256_set1_epi32(
-      (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
-  const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+    const __m256i sum_round_v = _mm256_set1_epi32(
+        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
 
-  const __m256i round_const_v = _mm256_set1_epi32(
-      ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
-      ((1 << (offset_bits - conv_params->round_1)) >> 1));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+    const __m256i round_const_v = _mm256_set1_epi32(
+        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+        ((1 << (offset_bits - conv_params->round_1)) >> 1));
+    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
 
-  __m256i filt[4], coeffs_h[4], coeffs_v[4];
+    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
 
-  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+    int horiz_tap = 12;
+    int vert_tap = 12;
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
+    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
 
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
-    is_horiz_4tap = 1;
-
-  // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
-    is_vert_4tap = 1;
-
-  // horz_filt as 4 tap and vert_filt as 8 tap
-  if (is_horiz_4tap) {
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = 1;
+    int im_h = h + vert_tap - 1;
+    const int fo_vert = vert_tap / 2 - 1;
+    const int fo_horiz = horiz_tap / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
-    // horz-filter
     for (int j = 0; j < w; j += 8) {
-      for (i = 0; i < (im_h - 2); i += 2) {
-        __m256i data = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
-        // Load the next line
-        data = _mm256_inserti128_si256(
-            data,
-            _mm_loadu_si128(
-                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-            1);
-        __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
-
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
-                               round_shift_h);
-        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-      }
-
-      __m256i data_1 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
-      __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
-      res =
-          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-
-      // vert filter
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
+      CONVOLVE_SR_VERTICAL_FILTER_12TAP
     }
-  } else if (is_vert_4tap) {
-    int im_h = h + 3;
-    const int fo_vert = 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
+  } else {
+    const int bd = 8;
+    int im_stride = 8, i;
+    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+    const int bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+    assert(conv_params->round_0 > 0);
+
+    const __m256i round_const_h =
+        _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
+                          (1 << (bd + FILTER_BITS - 2)));
+    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+    const __m256i sum_round_v = _mm256_set1_epi32(
+        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+    const __m256i round_const_v = _mm256_set1_epi32(
+        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+        ((1 << (offset_bits - conv_params->round_1)) >> 1));
+    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+    __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+    filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+    const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
+    const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+    int horiz_tap = SUBPEL_TAPS;
+    int vert_tap = SUBPEL_TAPS;
+
+    if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
+      horiz_tap = 4;
+    else if (!(filter_x[0] | filter_x[7]))
+      horiz_tap = 6;
+
+    if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
+      vert_tap = 4;
+    else if (!(filter_y[0] | filter_y[7]))
+      vert_tap = 6;
+
+    if (horiz_tap == 6)
+      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+    else
+      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+
+    if (vert_tap == 6)
+      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+    else
+      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+    int im_h = h + vert_tap - 1;
+    const int fo_vert = vert_tap / 2 - 1;
+    const int fo_horiz = horiz_tap / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
     filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
     filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
 
     for (int j = 0; j < w; j += 8) {
-      // horz_filter
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
-      // vert_filter
-      __m256i s[6];
-      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
-      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
-      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
-      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-
-      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
-      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
-      s[3] = _mm256_unpackhi_epi16(src_0, src_1);
-      s[4] = _mm256_unpackhi_epi16(src_2, src_3);
-
-      for (i = 0; i < h; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        const __m256i s4 =
-            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
-        const __m256i s5 =
-            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
-
-        s[2] = _mm256_unpacklo_epi16(s4, s5);
-        s[5] = _mm256_unpackhi_epi16(s4, s5);
-
-        __m256i res_a = convolve_4tap(s, coeffs_v + 1);
-        __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
-
-        // Combine V round and 2F-H-V round into a single rounding
-        res_a =
-            _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
-        res_b =
-            _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
-
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-        const __m256i res_b_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-        // 8 bit conversion and saturation to uint8
-        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
-        // Store values into the destination buffer
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-        if (w - j > 4) {
-          _mm_storel_epi64(p_0, res_0);
-          _mm_storel_epi64(p_1, res_1);
-        } else if (w == 4) {
-          xx_storel_32(p_0, res_0);
-          xx_storel_32(p_1, res_1);
-        } else {
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[3] = s[4];
-        s[4] = s[5];
+      if (horiz_tap == 4) {
+        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+      } else if (horiz_tap == 6) {
+        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+      } else {
+        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
       }
-    }
-  } else {
-    int j;
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    for (j = 0; j < w; j += 8) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
-
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+      if (vert_tap == 4) {
+        CONVOLVE_SR_VERTICAL_FILTER_4TAP
+      } else if (vert_tap == 6) {
+        CONVOLVE_SR_VERTICAL_FILTER_6TAP
+      } else {
+        CONVOLVE_SR_VERTICAL_FILTER_8TAP
+      }
     }
   }
 }
-
-static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
-  __m256i s[4];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
-}
-
-void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m256i s[2];
-      s[0] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      s[1] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[0]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      __m256i s[4];
-      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-      src += src_stride;
-      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
-      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  }
-}
diff --git a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
index 5376ea79bb..ca88bd7d5d 100644
--- a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
+++ b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
@@ -16,20 +16,21 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 #include "av1/common/convolve.h"
 
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int subpel_y_qn,
-                             ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params) {
   const int bd = 8;
 
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
+  int im_stride = w;
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -41,26 +42,11 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
 
   assert(conv_params->round_0 > 0);
+  __m128i coeffs[6];
 
   /* Horizontal filter */
   {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
 
     const __m128i round_const = _mm_set1_epi32(
         (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
@@ -70,34 +56,54 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
       for (j = 0; j < w; j += 8) {
         const __m128i data =
             _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data_2 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
 
         // Filter even-index pixels
         const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
         const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+        const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+        const __m128i src_6 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+        const __m128i src_8 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
+        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+        const __m128i src_10 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
+        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                               _mm_add_epi32(res_2, res_6));
+        __m128i res_even =
+            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
         res_even =
             _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
 
         // Filter odd-index pixels
         const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
         const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+        const __m128i src_5 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+        const __m128i src_7 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+        const __m128i src_9 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
+        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+        const __m128i src_11 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
+        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                               _mm_add_epi32(res_3, res_7));
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
         res_odd =
             _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
 
@@ -110,23 +116,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
 
   /* Vertical filter */
   {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
 
     const __m128i sum_round =
         _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
@@ -153,14 +143,24 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
         const __m128i src_6 =
             _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                                *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+        const __m128i src_8 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
+                               *(__m128i *)(data + 9 * im_stride));
+        const __m128i src_10 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
+                               *(__m128i *)(data + 11 * im_stride));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                                _mm_add_epi32(res_4, res_6));
+        __m128i res_even =
+            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
 
         // Filter odd-index pixels
         const __m128i src_1 =
@@ -175,14 +175,23 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
         const __m128i src_7 =
             _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                                *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
+        const __m128i src_9 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
+                               *(__m128i *)(data + 9 * im_stride));
+        const __m128i src_11 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
+                               *(__m128i *)(data + 11 * im_stride));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                               _mm_add_epi32(res_3, res_7));
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
 
         // Rearrange pixels back into the order 0 ... 7
         const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
@@ -204,168 +213,235 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
         // Accumulate values into the destination buffer
         __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
 
-        if (w == 2) {
-          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
-        } else if (w == 4) {
-          *(uint32_t *)p = _mm_cvtsi128_si32(res);
-        } else {
-          _mm_storel_epi64(p, res);
-        }
+        _mm_storel_epi64(p, res);
       }
     }
   }
 }
 
-static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
-  __m128i s[8];
-  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
-  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
-  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
-  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
-  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
-  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
-  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
-  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
-  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
-  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
-}
-
-void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 4 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m128i s[4];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      src += src_stride;
-      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      __m128i s[8];
-      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-      src += src_stride;
-      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
-      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
-      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
-      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
-      src += src_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
-      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
-      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
-      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
-      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
-      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_qn, const int subpel_y_qn,
+                             ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    if (w < 8) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params);
+    }
   } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
+    const int bd = 8;
+
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+    int im_h = h + filter_params_y->taps - 1;
+    int im_stride = MAX_SB_SIZE;
+    int i, j;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    const __m128i zero = _mm_setzero_si128();
+    const int bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+    assert(conv_params->round_0 > 0);
+
+    /* Horizontal filter */
+    {
+      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+          filter_params_x, subpel_x_qn & SUBPEL_MASK);
+      const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+      // coeffs 0 1 0 1 2 3 2 3
+      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+      // coeffs 4 5 4 5 6 7 6 7
+      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+      // coeffs 0 1 0 1 0 1 0 1
+      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+      // coeffs 2 3 2 3 2 3 2 3
+      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+      // coeffs 4 5 4 5 4 5 4 5
+      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+      // coeffs 6 7 6 7 6 7 6 7
+      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+      const __m128i round_const = _mm_set1_epi32(
+          (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+      const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+      for (i = 0; i < im_h; ++i) {
+        for (j = 0; j < w; j += 8) {
+          const __m128i data =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+          // Filter even-index pixels
+          const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+          const __m128i src_2 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+          const __m128i src_4 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+          const __m128i src_6 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                           _mm_add_epi32(res_2, res_6));
+          res_even =
+              _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+          // Filter odd-index pixels
+          const __m128i src_1 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+          const __m128i src_3 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+          const __m128i src_5 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+          const __m128i src_7 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                          _mm_add_epi32(res_3, res_7));
+          res_odd =
+              _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+          // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+          __m128i res = _mm_packs_epi32(res_even, res_odd);
+          _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+        }
+      }
+    }
+
+    /* Vertical filter */
+    {
+      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+          filter_params_y, subpel_y_qn & SUBPEL_MASK);
+      const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+      // coeffs 0 1 0 1 2 3 2 3
+      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+      // coeffs 4 5 4 5 6 7 6 7
+      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+      // coeffs 0 1 0 1 0 1 0 1
+      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+      // coeffs 2 3 2 3 2 3 2 3
+      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+      // coeffs 4 5 4 5 4 5 4 5
+      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+      // coeffs 6 7 6 7 6 7 6 7
+      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+      const __m128i sum_round = _mm_set1_epi32(
+          (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+      const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+      const __m128i round_const = _mm_set1_epi32(
+          ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+          ((1 << (offset_bits - conv_params->round_1)) >> 1));
+      const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+      for (i = 0; i < h; ++i) {
+        for (j = 0; j < w; j += 8) {
+          // Filter even-index pixels
+          const int16_t *data = &im_block[i * im_stride + j];
+          const __m128i src_0 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                                 *(__m128i *)(data + 1 * im_stride));
+          const __m128i src_2 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                                 *(__m128i *)(data + 3 * im_stride));
+          const __m128i src_4 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                                 *(__m128i *)(data + 5 * im_stride));
+          const __m128i src_6 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                                 *(__m128i *)(data + 7 * im_stride));
+
+          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+          const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                                 _mm_add_epi32(res_4, res_6));
+
+          // Filter odd-index pixels
+          const __m128i src_1 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                                 *(__m128i *)(data + 1 * im_stride));
+          const __m128i src_3 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                                 *(__m128i *)(data + 3 * im_stride));
+          const __m128i src_5 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                                 *(__m128i *)(data + 5 * im_stride));
+          const __m128i src_7 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                                 *(__m128i *)(data + 7 * im_stride));
+
+          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+          const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                                _mm_add_epi32(res_5, res_7));
+
+          // Rearrange pixels back into the order 0 ... 7
+          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+          __m128i res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+          __m128i res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                       round_shift);
+          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                       round_shift);
+
+          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          const __m128i res = _mm_packus_epi16(res16, res16);
+
+          // Accumulate values into the destination buffer
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+          if (w == 2) {
+            *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+          } else if (w == 4) {
+            *(uint32_t *)p = _mm_cvtsi128_si32(res);
+          } else {
+            _mm_storel_epi64(p, res);
+          }
+        }
+      }
+    }
   }
 }
 
-void av1_dist_wtd_convolve_2d_copy_sse2(
-    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/media/libaom/src/av1/common/x86/convolve_avx2.c b/media/libaom/src/av1/common/x86/convolve_avx2.c
index 1d5bc6fbd5..c7d1141a63 100644
--- a/media/libaom/src/av1/common/x86/convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/convolve_avx2.c
@@ -14,41 +14,45 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 
 void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
-  int i, j, is_vert_4tap = 0;
+                            const int subpel_y_qn) {
+  int i, j, vert_tap = SUBPEL_TAPS;
   // right shift is F-1 because we are already dividing
   // filter co-efficients by 2
   const int right_shift_bits = (FILTER_BITS - 1);
-  const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
-  const __m256i right_shift_const =
-      _mm256_set1_epi16((1 << right_shift_bits) >> 1);
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-  __m256i coeffs[4], s[8];
-  __m128i d[6];
+  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
 
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  __m256i coeffs[6], s[12];
+  __m128i d[10];
 
   // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_vert_4tap = 1;
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  if (filter_params_y->taps == 12) {
+    vert_tap = 12;
+  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+    vert_tap = 4;
+  } else if (!(filter[0] | filter[7])) {
+    vert_tap = 6;
+  }
+
+  if (vert_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  else if (vert_tap == 12) {
+    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
+  } else {
+    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  }
 
   // vert_filt as 4 tap
-  if (is_vert_4tap) {
+  if (vert_tap == 4) {
     const int fo_vert = 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
     for (j = 0; j < w; j += 16) {
@@ -142,6 +146,260 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
         s[4] = s[5];
       }
     }
+  } else if (vert_tap == 6) {
+    const int fo_vert = vert_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+    for (j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+      const __m256i src_34a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_45a = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+        const __m256i src_56a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            src6, 0x20);
+
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
+  } else if (vert_tap == 12) {  // vert_tap == 12
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    const __m256i v_zero = _mm256_setzero_si256();
+    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
+    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
+
+    for (j = 0; j < w; j += 8) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src10;
+
+      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
+      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
+      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
+      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+      const __m256i src_45a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+      const __m256i src_56a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
+
+      const __m256i src_67a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
+
+      const __m256i src_78a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
+
+      const __m256i src_89a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
+
+      src10 = _mm256_castsi128_si256(
+          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
+      const __m256i src_910a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
+
+      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
+      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
+      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
+      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
+      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
+      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
+      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
+      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
+      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
+      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
+
+      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
+      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
+      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
+      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
+      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
+
+      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
+      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
+      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
+      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
+      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_1011a = _mm256_permute2x128_si256(
+            src10,
+            _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+            0x20);
+
+        src10 = _mm256_castsi128_si256(
+            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
+
+        const __m256i src_1112a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+            src10, 0x20);
+
+        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
+        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
+
+        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
+        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
+
+        const __m256i res_lo = convolve_12taps(s, coeffs);
+
+        const __m256i res_32b_lo = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 4) {
+          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
+
+          const __m256i res_32b_hi = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
+          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 2) {
+            *(uint32_t *)&dst[i * dst_stride + j] =
+                (uint32_t)_mm_cvtsi128_si32(res_0);
+            *(uint32_t *)&dst[i * dst_stride + j + dst_stride] =
+                (uint32_t)_mm_cvtsi128_si32(res_1);
+          } else {
+            *(uint16_t *)&dst[i * dst_stride + j] =
+                (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
+                (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+        s[3] = s[4];
+        s[4] = s[5];
+
+        s[6] = s[7];
+        s[7] = s[8];
+        s[8] = s[9];
+        s[9] = s[10];
+        s[10] = s[11];
+      }
+    }
   } else {
     const int fo_vert = filter_params_y->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride;
@@ -263,37 +521,45 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
 void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
+                            const int subpel_x_qn,
                             ConvolveParams *conv_params) {
   const int bits = FILTER_BITS - conv_params->round_0;
-
-  const __m256i round_0_const =
-      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
-  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
-  int i, is_horiz_4tap = 0;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
+  __m256i round_0_const =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+  int i, horiz_tap = SUBPEL_TAPS;
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
   assert(conv_params->round_0 > 0);
 
-  __m256i coeffs[4], filt[4];
+  __m256i coeffs[6], filt[4];
   filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  if (filter_params_x->taps == 12) {
+    horiz_tap = 12;
+  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+    horiz_tap = 4;
+  } else if (!(filter[0] | filter[7])) {
+    horiz_tap = 6;
+  }
 
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_horiz_4tap = 1;
+  if (horiz_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  else if (horiz_tap == 12) {
+    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
+  } else {
+    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  }
 
   // horz_filt as 4 tap
-  if (is_horiz_4tap) {
+  if (horiz_tap == 4) {
     const int fo_horiz = 1;
     const uint8_t *const src_ptr = src - fo_horiz;
     if (w <= 8) {
@@ -363,6 +629,208 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
         }
       }
     }
+  } else if (horiz_tap == 6) {
+    const int fo_horiz = horiz_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+            0x20);
+
+        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+        if (w > 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+        } else if (w > 2) {
+          xx_storel_32(&dst[i * dst_stride], res_0);
+          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+        } else {
+          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+          // 18 19 20 21 22 23
+          const __m256i data = _mm256_inserti128_si256(
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+              1);
+
+          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                     round_0_shift);
+
+          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                     round_shift);
+
+          /* rounding code */
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+          // Store values into the destination buffer
+          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+          __m128i res = _mm256_castsi256_si128(res_8b);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        }
+      }
+    }
+  } else if (horiz_tap == 12) {  // horiz_tap == 12
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    const __m256i v_zero = _mm256_setzero_si256();
+    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
+    round_const = _mm256_set1_epi32((1 << bits) >> 1);
+    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+    __m256i s[6];
+
+    if (w <= 4) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+            0x20);
+        // row0 0..7 row1 0..7
+        const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
+        // row0 8..F row1 8..F
+        const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);
+
+        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
+        const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
+        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
+        const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);
+
+        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
+        const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
+        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
+        const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);
+
+        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
+        s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
+        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+        s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
+        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
+        s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
+        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
+        s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
+        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
+        s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
+        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
+        s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);
+
+        const __m256i res_lo = convolve_12taps(s, coeffs);
+
+        __m256i res_32b_lo = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
+
+        // 00 01 02 03 10 12 13 14
+        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
+                                      round_shift);
+        // 8 bit conversion and saturation to uint8
+        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
+        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
+        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
+        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+        if (w > 2) {
+          // 00 01 02 03
+          *(uint32_t *)&dst[i * dst_stride] =
+              (uint32_t)_mm_cvtsi128_si32(res_0);
+          // 10 11 12 13
+          *(uint32_t *)&dst[i * dst_stride + dst_stride] =
+              (uint32_t)_mm_cvtsi128_si32(res_1);
+        } else {
+          // 00 01
+          *(uint16_t *)&dst[i * dst_stride] =
+              (uint16_t)_mm_cvtsi128_si32(res_0);
+          // 10 11
+          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
+              (uint16_t)_mm_cvtsi128_si32(res_1);
+        }
+      }
+    } else {
+      for (i = 0; i < h; i++) {
+        for (int j = 0; j < w; j += 8) {
+          const __m256i data = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
+              0x20);
+          // row0 0..7 4..B
+          const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
+          // row0 8..F C..13
+          const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);
+
+          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
+          const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
+          // row0 04 04 .. 07 07 08 08 .. 0B 0B
+          const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);
+
+          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
+          const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
+          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
+          const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);
+
+          s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
+          s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
+          s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
+          s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
+          s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
+          s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);
+
+          const __m256i res_lo = convolve_12taps(s, coeffs);
+
+          __m256i res_32b_lo = _mm256_sra_epi32(
+              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
+
+          res_32b_lo = _mm256_sra_epi32(
+              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          *(uint32_t *)&dst[i * dst_stride + j] =
+              (uint32_t)_mm_cvtsi128_si32(res_0);
+          *(uint32_t *)&dst[i * dst_stride + j + 4] =
+              (uint32_t)_mm_cvtsi128_si32(res_1);
+        }
+      }
+    }
   } else {
     const int fo_horiz = filter_params_x->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_horiz;
diff --git a/media/libaom/src/av1/common/x86/convolve_sse2.c b/media/libaom/src/av1/common/x86/convolve_sse2.c
index 4323ac4d13..cd5521e333 100644
--- a/media/libaom/src/av1/common/x86/convolve_sse2.c
+++ b/media/libaom/src/av1/common/x86/convolve_sse2.c
@@ -75,82 +75,91 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
   return convolve(ss, coeffs);
 }
 
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  int subpel_y_qn) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_vert * src_stride;
   const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
-  __m128i coeffs[4];
-
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
-
-  if (w <= 4) {
-    __m128i s[8], src6, res, res_round, res16;
-    uint32_t res_int;
-    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
-    s[0] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
-    s[1] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
-    s[2] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
-    s[3] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
-    s[4] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
-    s[5] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+  __m128i coeffs[6];
+
+  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
+
+  int j = 0;
+  do {
+    __m128i s[12], src10, res_lo, res_hi;
+    __m128i res_lo_round, res_hi_round, res16, res;
+    const uint8_t *data = &src_ptr[j];
+
+    src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
+    s[0] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+    s[1] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+    s[2] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+    s[3] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+    s[4] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+    s[5] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
+    s[6] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+    s[7] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
+    s[8] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
+    s[9] = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
 
+    int i = 0;
     do {
-      s[6] = _mm_unpacklo_epi8(
-          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
-      s[7] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
-
-      res = convolve_lo_y(s + 0, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
-
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-
-      res = convolve_lo_y(s + 1, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
-
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
-
-      src_ptr += src_stride;
-      dst += dst_stride;
+      data = &src_ptr[i * src_stride + j];
+      s[10] = _mm_unpacklo_epi8(
+          src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
+      src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
+      s[11] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
+
+      res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
+      res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels
+
+      res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      res_hi_round =
+          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+      res = _mm_packus_epi16(res16, res16);
+
+      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+      i++;
+
+      res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
+      res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels
+
+      res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      res_hi_round =
+          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+      res = _mm_packus_epi16(res16, res16);
+
+      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+      i++;
 
       s[0] = s[2];
       s[1] = s[3];
@@ -158,71 +167,90 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
       s[3] = s[5];
       s[4] = s[6];
       s[5] = s[7];
-      h -= 2;
-    } while (h);
-  } else {
-    assert(!(w % 8));
-    int j = 0;
-    do {
-      __m128i s[8], src6, res_lo, res_hi;
-      __m128i res_lo_round, res_hi_round, res16, res;
-      const uint8_t *data = &src_ptr[j];
+      s[6] = s[8];
+      s[7] = s[9];
+      s[8] = s[10];
+      s[9] = s[11];
+    } while (i < h);
+    j += 8;
+  } while (j < w);
+}
 
-      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_y,
+                            const int subpel_y_qn) {
+  if (filter_params_y->taps > 8) {
+    if (w < 8) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn);
+    }
+  } else {
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *src_ptr = src - fo_vert * src_stride;
+    const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+    const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+    __m128i coeffs[4];
+
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+    if (w <= 4) {
+      __m128i s[8], src6, res, res_round, res16;
+      uint32_t res_int;
+      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
       s[0] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
       s[1] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
       s[2] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
       s[3] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
       s[4] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
       s[5] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
 
-      int i = 0;
       do {
-        data = &src_ptr[i * src_stride + j];
         s[6] = _mm_unpacklo_epi8(
-            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
-        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+            src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+        src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
         s[7] = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+            _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
 
-        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+        res = convolve_lo_y(s + 0, coeffs);
+        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+        res16 = _mm_packs_epi32(res_round, res_round);
+        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+        if (w == 2)
+          *(uint16_t *)dst = (uint16_t)res_int;
+        else
+          *(uint32_t *)dst = res_int;
 
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
+        src_ptr += src_stride;
+        dst += dst_stride;
 
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
+        res = convolve_lo_y(s + 1, coeffs);
+        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+        res16 = _mm_packs_epi32(res_round, res_round);
+        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
-        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+        if (w == 2)
+          *(uint16_t *)dst = (uint16_t)res_int;
+        else
+          *(uint32_t *)dst = res_int;
 
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
-
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
+        src_ptr += src_stride;
+        dst += dst_stride;
 
         s[0] = s[2];
         s[1] = s[3];
@@ -230,18 +258,90 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
         s[3] = s[5];
         s[4] = s[6];
         s[5] = s[7];
-      } while (i < h);
-      j += 8;
-    } while (j < w);
+        h -= 2;
+      } while (h);
+    } else {
+      assert(!(w % 8));
+      int j = 0;
+      do {
+        __m128i s[8], src6, res_lo, res_hi;
+        __m128i res_lo_round, res_hi_round, res16, res;
+        const uint8_t *data = &src_ptr[j];
+
+        src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+        s[0] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+        s[1] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+        s[2] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+        s[3] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+        s[4] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+        s[5] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+        int i = 0;
+        do {
+          data = &src_ptr[i * src_stride + j];
+          s[6] = _mm_unpacklo_epi8(
+              src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+          src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+          s[7] = _mm_unpacklo_epi8(
+              _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+          res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
+          res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+
+          res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+          res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          i++;
+
+          res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
+          res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+
+          res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+          res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          i++;
+
+          s[0] = s[2];
+          s[1] = s[3];
+          s[2] = s[4];
+          s[3] = s[5];
+          s[4] = s[6];
+          s[5] = s[7];
+        } while (i < h);
+        j += 8;
+      } while (j < w);
+    }
   }
 }
 
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_x_qn, const int subpel_y_qn,
-                            ConvolveParams *conv_params) {
+void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  int subpel_x_qn,
+                                  ConvolveParams *conv_params) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_0;
@@ -250,89 +350,151 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
   const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
   const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
   const __m128i round_shift = _mm_cvtsi32_si128(bits);
-  __m128i coeffs[4];
-
-  (void)filter_params_y;
-  (void)subpel_y_qn;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i coeffs[6];
 
   assert(bits >= 0);
   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
-  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
 
-  if (w <= 4) {
+  int i = 0;
+  do {
+    int j = 0;
     do {
-      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+      const __m128i data =
+          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
       __m128i s[4];
 
-      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
       s[1] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+          _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
       s[2] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+          _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
       s[3] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
-      const __m128i res_lo = convolve_lo_x(s, coeffs);
-      __m128i res_lo_round =
-          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
-      res_lo_round =
-          _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
+          _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+
+      const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero);
+
+      __m128i res32_round =
+          _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift);
+      res32_round =
+          _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift);
 
-      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
-      const __m128i res = _mm_packus_epi16(res16, res16);
+      const __m128i res16 = _mm_packs_epi32(res32_round, zero);
+      const __m128i res = _mm_packus_epi16(res16, zero);
 
-      uint32_t r = _mm_cvtsi128_si32(res);
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)r;
-      else
-        *(uint32_t *)dst = r;
+      const int val = _mm_cvtsi128_si32(res);
+      memcpy((dst + i * dst_stride + j), &val, sizeof(val));
+      j += 4;
+    } while (j < w);
+  } while (++i < h);
+}
 
-      src_ptr += src_stride;
-      dst += dst_stride;
-    } while (--h);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_x,
+                            const int subpel_x_qn,
+                            ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    if (w < 4) {
+      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, subpel_x_qn, conv_params);
+    } else {
+      av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params);
+    }
   } else {
-    assert(!(w % 8));
-    int i = 0;
-    do {
-      int j = 0;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *src_ptr = src - fo_horiz;
+    const int bits = FILTER_BITS - conv_params->round_0;
+    const __m128i round_0_const =
+        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+    const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+    const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+    const __m128i round_shift = _mm_cvtsi32_si128(bits);
+    __m128i coeffs[4];
+
+    assert(bits >= 0);
+    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+    if (w <= 4) {
       do {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
         __m128i s[4];
 
-        // Filter even-index pixels
-        s[0] = data;
-        s[1] = _mm_srli_si128(data, 2);
-        s[2] = _mm_srli_si128(data, 4);
-        s[3] = _mm_srli_si128(data, 6);
-        const __m128i res_even = convolve_lo_x(s, coeffs);
-
-        // Filter odd-index pixels
-        s[0] = _mm_srli_si128(data, 1);
-        s[1] = _mm_srli_si128(data, 3);
-        s[2] = _mm_srli_si128(data, 5);
-        s[3] = _mm_srli_si128(data, 7);
-        const __m128i res_odd = convolve_lo_x(s, coeffs);
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+        s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+        s[1] =
+            _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+        s[2] =
+            _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+        s[3] =
+            _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+        const __m128i res_lo = convolve_lo_x(s, coeffs);
         __m128i res_lo_round =
             _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
         res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                      round_shift);
-        __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
-        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
-                                     round_shift);
 
-        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
         const __m128i res = _mm_packus_epi16(res16, res16);
 
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        j += 8;
-      } while (j < w);
-    } while (++i < h);
+        uint32_t r = _mm_cvtsi128_si32(res);
+        if (w == 2)
+          *(uint16_t *)dst = (uint16_t)r;
+        else
+          *(uint32_t *)dst = r;
+
+        src_ptr += src_stride;
+        dst += dst_stride;
+      } while (--h);
+    } else {
+      assert(!(w % 8));
+      int i = 0;
+      do {
+        int j = 0;
+        do {
+          const __m128i data =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+          __m128i s[4];
+
+          // Filter even-index pixels
+          s[0] = data;
+          s[1] = _mm_srli_si128(data, 2);
+          s[2] = _mm_srli_si128(data, 4);
+          s[3] = _mm_srli_si128(data, 6);
+          const __m128i res_even = convolve_lo_x(s, coeffs);
+
+          // Filter odd-index pixels
+          s[0] = _mm_srli_si128(data, 1);
+          s[1] = _mm_srli_si128(data, 3);
+          s[2] = _mm_srli_si128(data, 5);
+          s[3] = _mm_srli_si128(data, 7);
+          const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+          // Rearrange pixels back into the order 0 ... 7
+          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+          __m128i res_lo_round = _mm_sra_epi32(
+              _mm_add_epi32(res_lo, round_0_const), round_0_shift);
+          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                       round_shift);
+          __m128i res_hi_round = _mm_sra_epi32(
+              _mm_add_epi32(res_hi, round_0_const), round_0_shift);
+          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                       round_shift);
+
+          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          const __m128i res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          j += 8;
+        } while (j < w);
+      } while (++i < h);
+    }
   }
 }
diff --git a/media/libaom/src/av1/common/x86/filterintra_sse4.c b/media/libaom/src/av1/common/x86/filterintra_sse4.c
index 99f4d99675..d05bb0e15f 100644
--- a/media/libaom/src/av1/common/x86/filterintra_sse4.c
+++ b/media/libaom/src/av1/common/x86/filterintra_sse4.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
 #include <smmintrin.h>
+#include <string.h>
 
 #include "config/av1_rtcd.h"
 
@@ -17,55 +19,332 @@
 #include "av1/common/enums.h"
 #include "av1/common/reconintra.h"
 
+//------------------------------------------------------------------------------
+// filter_intra_predictor_sse4_1
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+#define DUPLICATE_FIRST_HALF 0x44
+
+// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
+// at zero to preserve the sum.
+static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride,
+                                     const __m128i *pixels,
+                                     const __m128i *taps_0_1,
+                                     const __m128i *taps_2_3,
+                                     const __m128i *taps_4_5,
+                                     const __m128i *taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
+  // |output_half| contains 8 partial sums.
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+                       /* arbitrary pack arg */ output);
+  xx_storel_32(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+                       /* arbitrary pack arg */ output);
+  xx_storel_32(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because xx_loadl_64 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not
+static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride,
+                              const uint8_t *const top_ptr,
+                              const uint8_t *const left_ptr, int mode,
+                              const int height) {
+  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+  __m128i top = xx_loadl_32(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
+  __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+  // left[2], left[3], left[4], left[5], left[6], left[7]
+  pixels = _mm_or_si128(left, pixels);
+
+  // Duplicate first 8 bytes.
+  pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = xx_loadl_32(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+  // left[0], left[1], ...
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+  // byte is an unused value, which shall be multiplied by 0 when we apply the
+  // filter.
+  const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows.
+  // Because the common code below this block assumes that
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+    left = _mm_slli_si128(left, 1);
+    pixels = xx_loadl_32(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+    // the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // Clear all but final pixel in the first 8 of left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    pixels = _mm_or_si128(left, pixels);
+    left = xx_loadl_64(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = xx_loadl_32(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case, we assume that the left vector has the next TL
+  // at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = xx_loadl_32(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = xx_loadl_32(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+  }
+}
+
+static INLINE void filter_intra_predictor_sse4_1(void *const dest,
+                                                 ptrdiff_t stride,
+                                                 const void *const top_row,
+                                                 const void *const left_column,
+                                                 int mode, const int width,
+                                                 const int height) {
+  const uint8_t *const top_ptr = (const uint8_t *)top_row;
+  const uint8_t *const left_ptr = (const uint8_t *)left_column;
+  uint8_t *dst = (uint8_t *)dest;
+  if (width == 4) {
+    filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+  // the end is an unused value, which shall be multiplied by 0 when we apply
+  // the filter.
+  const int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+  // byte is unused as above.
+  const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section, to the front. Used when
+  // grabbing data from left_column and not top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes from
+  // top_row.
+  __m128i pixels = xx_loadl_64(top_ptr - 1);
+  __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                    &taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load
+  pixels = xx_loadl_32(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final of the first 8
+  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
+  // a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                    &taps_4_5, &taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = xx_loadl_32(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+    pixels = xx_loadl_32(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                      &taps_4_5, &taps_6_7);
+    dst += 4;
+  }
+
+  // Now we handle heights that reference previous blocks rather than top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // Top Left is not available by offset in these leftmost blocks.
+    pixels = xx_loadl_32(dst - stride);
+    left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                      &taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = xx_loadl_32(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                      &taps_4_5, &taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = xx_loadl_32(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+      filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+                        &taps_6_7);
+      pixels = xx_loadl_32(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+      filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+                        &taps_4_5, &taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
 void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        TX_SIZE tx_size, const uint8_t *above,
                                        const uint8_t *left, int mode) {
-  int r, c;
-  uint8_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
-
-  assert(bw <= 32 && bh <= 32);
-
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
-  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
-
-  const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
-  const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
-  const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
-  const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
-  const __m128i filter_intra_scale_bits =
-      _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
-
-  for (r = 1; r < bh + 1; r += 2) {
-    for (c = 1; c < bw + 1; c += 4) {
-      DECLARE_ALIGNED(16, uint8_t, p[8]);
-      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
-      p[5] = buffer[r][c - 1];
-      p[6] = buffer[r + 1][c - 1];
-      p[7] = 0;
-      const __m128i p_b = xx_loadl_64(p);
-      const __m128i in = _mm_unpacklo_epi64(p_b, p_b);
-      const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
-      const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
-      const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
-      const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
-      const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
-      const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
-      const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
-      // Rounding
-      const __m128i round_w =
-          _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
-      const __m128i out_r = _mm_packus_epi16(round_w, round_w);
-      const __m128i out_r1 = _mm_srli_si128(out_r, 4);
-      // Storing
-      xx_storel_32(&buffer[r][c], out_r);
-      xx_storel_32(&buffer[r + 1][c], out_r1);
-    }
-  }
-
-  for (r = 0; r < bh; ++r) {
-    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
-    dst += stride;
-  }
+  filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh);
 }
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
index 396aed01bc..12046e40c7 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -20,6 +20,12 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
+void av1_highbd_convolve_2d_sr_ssse3(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
 void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
@@ -27,6 +33,13 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params, int bd) {
+  if (filter_params_x->taps == 12) {
+    av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
+    return;
+  }
+
   DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = 8;
@@ -185,142 +198,3 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
     }
   }
 }
-
-static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
-  __m256i s[4];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
-}
-
-static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
-  __m256i s[8];
-  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
-  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
-  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
-  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
-  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
-  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
-
-  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
-  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
-  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
-  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
-  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
-  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
-}
-
-void av1_highbd_convolve_2d_copy_sr_avx2(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
-  (void)conv_params;
-  (void)bd;
-
-  if (w >= 16) {
-    assert(!((intptr_t)dst % 16));
-    assert(!(dst_stride % 16));
-  }
-
-  if (w == 2) {
-    do {
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      memmove(dst, src, 2 * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 4) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadl_epi64((__m128i *)src);
-      src += src_stride;
-      _mm_storel_epi64((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 8) {
-    do {
-      __m128i s[2];
-      s[0] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      s[1] = _mm_loadu_si128((__m128i *)src);
-      src += src_stride;
-      _mm_store_si128((__m128i *)dst, s[0]);
-      dst += dst_stride;
-      _mm_store_si128((__m128i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 16) {
-    do {
-      __m256i s[2];
-      s[0] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      s[1] = _mm256_loadu_si256((__m256i *)src);
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[0]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, s[1]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 32) {
-    do {
-      __m256i s[4];
-      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-      src += src_stride;
-      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
-      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
-      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else if (w == 64) {
-    do {
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_64(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  } else {
-    do {
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      copy_128(src, dst);
-      src += src_stride;
-      dst += dst_stride;
-      h -= 2;
-    } while (h);
-  }
-}
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
index d2ff47c1f9..b2c39cdd48 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,17 +21,13 @@
 #include "aom_dsp/x86/convolve_sse4_1.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
-    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
+                                                 int src_stride, uint16_t *dst0,
+                                                 int dst_stride0, int w, int h,
+                                                 ConvolveParams *conv_params,
+                                                 int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
index 5318fcaa8e..148543f667 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -18,6 +18,7 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "av1/common/convolve.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 
 void av1_highbd_convolve_2d_sr_ssse3(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
@@ -35,7 +36,6 @@ void av1_highbd_convolve_2d_sr_ssse3(
   // Check that, even with 12-bit input, the intermediate values will fit
   // into an unsigned 16-bit intermediate array.
   assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-  __m128i coeffs_x[4], coeffs_y[4], s[16];
 
   const __m128i round_const_x = _mm_set1_epi32(
       ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
@@ -54,163 +54,360 @@ void av1_highbd_convolve_2d_sr_ssse3(
       _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   const __m128i zero = _mm_setzero_si128();
 
-  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    {
-      for (i = 0; i < im_h; i += 1) {
-        const __m128i row00 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i row01 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
-
-        // even pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 0);
-        s[1] = _mm_alignr_epi8(row01, row00, 4);
-        s[2] = _mm_alignr_epi8(row01, row00, 8);
-        s[3] = _mm_alignr_epi8(row01, row00, 12);
-
-        __m128i res_even = convolve(s, coeffs_x);
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
-                                 round_shift_x);
-
-        // odd pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 2);
-        s[1] = _mm_alignr_epi8(row01, row00, 6);
-        s[2] = _mm_alignr_epi8(row01, row00, 10);
-        s[3] = _mm_alignr_epi8(row01, row00, 14);
-
-        __m128i res_odd = convolve(s, coeffs_x);
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
-
-        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
-        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
-        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
-
-        _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+  if (filter_params_x->taps == 12) {
+    __m128i coeffs_x[6], coeffs_y[6], s[24];
+    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+    for (j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      {
+        for (i = 0; i < im_h; i += 1) {
+          const __m128i row00 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+          const __m128i row01 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+          const __m128i row02 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+          // even pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 0);
+          s[1] = _mm_alignr_epi8(row01, row00, 4);
+          s[2] = _mm_alignr_epi8(row01, row00, 8);
+          s[3] = _mm_alignr_epi8(row01, row00, 12);
+          s[4] = _mm_alignr_epi8(row02, row01, 0);
+          s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+          __m128i res_even = convolve_12tap(s, coeffs_x);
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                   round_shift_x);
+
+          // odd pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 2);
+          s[1] = _mm_alignr_epi8(row01, row00, 6);
+          s[2] = _mm_alignr_epi8(row01, row00, 10);
+          s[3] = _mm_alignr_epi8(row01, row00, 14);
+          s[4] = _mm_alignr_epi8(row02, row01, 2);
+          s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+          __m128i res_odd = convolve_12tap(s, coeffs_x);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+                                  round_shift_x);
+
+          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+          _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+        }
       }
-    }
-    /* Vertical filter */
-    {
-      __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
-      __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
-      __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
-      __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
-      __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
-      __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
-      __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
-
-      s[0] = _mm_unpacklo_epi16(s0, s1);
-      s[1] = _mm_unpacklo_epi16(s2, s3);
-      s[2] = _mm_unpacklo_epi16(s4, s5);
-
-      s[4] = _mm_unpackhi_epi16(s0, s1);
-      s[5] = _mm_unpackhi_epi16(s2, s3);
-      s[6] = _mm_unpackhi_epi16(s4, s5);
-
-      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
-      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
-      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
-
-      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
-      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
-      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
-
-      for (i = 0; i < h; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
-        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
 
+      /* Vertical filter */
+      {
+        __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+        __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+        __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+        __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+        __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+        __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+        __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+        __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride));
+        __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride));
+        __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride));
+
+        s[0] = _mm_unpacklo_epi16(s0, s1);
+        s[1] = _mm_unpacklo_epi16(s2, s3);
+        s[2] = _mm_unpacklo_epi16(s4, s5);
         s[3] = _mm_unpacklo_epi16(s6, s7);
-        s[7] = _mm_unpackhi_epi16(s6, s7);
-
-        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
-        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
-
-        const __m128i res_a0 = convolve(s, coeffs_y);
-        __m128i res_a_round0 =
-            _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y);
-        res_a_round0 = _mm_sra_epi32(
-            _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
-
-        const __m128i res_a1 = convolve(s + 8, coeffs_y);
-        __m128i res_a_round1 =
-            _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y);
-        res_a_round1 = _mm_sra_epi32(
-            _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
-
-        if (w - j > 4) {
-          const __m128i res_b0 = convolve(s + 4, coeffs_y);
-          __m128i res_b_round0 = _mm_sra_epi32(
-              _mm_add_epi32(res_b0, round_const_y), round_shift_y);
-          res_b_round0 = _mm_sra_epi32(
-              _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits);
-
-          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
-          __m128i res_b_round1 = _mm_sra_epi32(
-              _mm_add_epi32(res_b1, round_const_y), round_shift_y);
-          res_b_round1 = _mm_sra_epi32(
-              _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits);
-
-          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
-          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
-          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
-
-          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
-          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
-          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_16bit1);
-        } else if (w == 4) {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_a_round1);
-        } else {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          *((uint32_t *)(&dst[i * dst_stride + j])) =
-              _mm_cvtsi128_si32(res_a_round0);
-
-          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
-              _mm_cvtsi128_si32(res_a_round1);
+        s[4] = _mm_unpacklo_epi16(s8, s9);
+
+        s[6] = _mm_unpackhi_epi16(s0, s1);
+        s[7] = _mm_unpackhi_epi16(s2, s3);
+        s[8] = _mm_unpackhi_epi16(s4, s5);
+        s[9] = _mm_unpackhi_epi16(s6, s7);
+        s[10] = _mm_unpackhi_epi16(s8, s9);
+
+        s[12] = _mm_unpacklo_epi16(s1, s2);
+        s[13] = _mm_unpacklo_epi16(s3, s4);
+        s[14] = _mm_unpacklo_epi16(s5, s6);
+        s[15] = _mm_unpacklo_epi16(s7, s8);
+        s[16] = _mm_unpacklo_epi16(s9, s10);
+
+        s[18] = _mm_unpackhi_epi16(s1, s2);
+        s[19] = _mm_unpackhi_epi16(s3, s4);
+        s[20] = _mm_unpackhi_epi16(s5, s6);
+        s[21] = _mm_unpackhi_epi16(s7, s8);
+        s[22] = _mm_unpackhi_epi16(s9, s10);
+
+        for (i = 0; i < h; i += 2) {
+          const int16_t *data = &im_block[i * im_stride];
+
+          __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride));
+          __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride));
+
+          s[5] = _mm_unpacklo_epi16(s10, s11);
+          s[11] = _mm_unpackhi_epi16(s10, s11);
+
+          s[17] = _mm_unpacklo_epi16(s11, s12);
+          s[23] = _mm_unpackhi_epi16(s11, s12);
+
+          const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+          __m128i res_a_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+          res_a_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+          const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+          __m128i res_a_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+          res_a_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+          if (w - j > 4) {
+            const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+            __m128i res_b_round0 = _mm_sra_epi32(
+                _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+            res_b_round0 =
+                _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+                              round_shift_bits);
+
+            const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+            __m128i res_b_round1 = _mm_sra_epi32(
+                _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+            res_b_round1 =
+                _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+                              round_shift_bits);
+
+            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+            res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+            res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_16bit1);
+          } else if (w == 4) {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_a_round1);
+          } else {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            *((uint32_t *)(&dst[i * dst_stride + j])) =
+                _mm_cvtsi128_si32(res_a_round0);
+
+            *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+                _mm_cvtsi128_si32(res_a_round1);
+          }
+          s[0] = s[1];
+          s[1] = s[2];
+          s[2] = s[3];
+          s[3] = s[4];
+          s[4] = s[5];
+
+          s[6] = s[7];
+          s[7] = s[8];
+          s[8] = s[9];
+          s[9] = s[10];
+          s[10] = s[11];
+
+          s[12] = s[13];
+          s[13] = s[14];
+          s[14] = s[15];
+          s[15] = s[16];
+          s[16] = s[17];
+
+          s[18] = s[19];
+          s[19] = s[20];
+          s[20] = s[21];
+          s[21] = s[22];
+          s[22] = s[23];
+
+          s10 = s12;
         }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-
-        s[0 + 8] = s[1 + 8];
-        s[1 + 8] = s[2 + 8];
-        s[2 + 8] = s[3 + 8];
-
-        s[4 + 8] = s[5 + 8];
-        s[5 + 8] = s[6 + 8];
-        s[6 + 8] = s[7 + 8];
+      }
+    }
+  } else {
+    __m128i coeffs_x[4], coeffs_y[4], s[16];
+    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+    for (j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      {
+        for (i = 0; i < im_h; i += 1) {
+          const __m128i row00 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+          const __m128i row01 =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+          // even pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 0);
+          s[1] = _mm_alignr_epi8(row01, row00, 4);
+          s[2] = _mm_alignr_epi8(row01, row00, 8);
+          s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+          __m128i res_even = convolve(s, coeffs_x);
+          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                   round_shift_x);
+
+          // odd pixels
+          s[0] = _mm_alignr_epi8(row01, row00, 2);
+          s[1] = _mm_alignr_epi8(row01, row00, 6);
+          s[2] = _mm_alignr_epi8(row01, row00, 10);
+          s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+          __m128i res_odd = convolve(s, coeffs_x);
+          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+                                  round_shift_x);
+
+          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+          _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+        }
+      }
 
-        s6 = s8;
+      /* Vertical filter */
+      {
+        __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+        __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+        __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+        __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+        __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+        __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+        __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+        s[0] = _mm_unpacklo_epi16(s0, s1);
+        s[1] = _mm_unpacklo_epi16(s2, s3);
+        s[2] = _mm_unpacklo_epi16(s4, s5);
+
+        s[4] = _mm_unpackhi_epi16(s0, s1);
+        s[5] = _mm_unpackhi_epi16(s2, s3);
+        s[6] = _mm_unpackhi_epi16(s4, s5);
+
+        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+        for (i = 0; i < h; i += 2) {
+          const int16_t *data = &im_block[i * im_stride];
+
+          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+          s[3] = _mm_unpacklo_epi16(s6, s7);
+          s[7] = _mm_unpackhi_epi16(s6, s7);
+
+          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+          const __m128i res_a0 = convolve(s, coeffs_y);
+          __m128i res_a_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+          res_a_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+          const __m128i res_a1 = convolve(s + 8, coeffs_y);
+          __m128i res_a_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+          res_a_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+          if (w - j > 4) {
+            const __m128i res_b0 = convolve(s + 4, coeffs_y);
+            __m128i res_b_round0 = _mm_sra_epi32(
+                _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+            res_b_round0 =
+                _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+                              round_shift_bits);
+
+            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+            __m128i res_b_round1 = _mm_sra_epi32(
+                _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+            res_b_round1 =
+                _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+                              round_shift_bits);
+
+            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+            res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+            res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_16bit1);
+          } else if (w == 4) {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_a_round1);
+          } else {
+            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+            *((uint32_t *)(&dst[i * dst_stride + j])) =
+                _mm_cvtsi128_si32(res_a_round0);
+
+            *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+                _mm_cvtsi128_si32(res_a_round1);
+          }
+          s[0] = s[1];
+          s[1] = s[2];
+          s[2] = s[3];
+
+          s[4] = s[5];
+          s[5] = s[6];
+          s[6] = s[7];
+
+          s[0 + 8] = s[1 + 8];
+          s[1 + 8] = s[2 + 8];
+          s[2 + 8] = s[3 + 8];
+
+          s[4 + 8] = s[5 + 8];
+          s[5 + 8] = s[6 + 8];
+          s[6 + 8] = s[7 + 8];
+
+          s6 = s8;
+        }
       }
     }
   }
diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
index 93e98e4b3d..0798c6d828 100644
--- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4146,11 +4146,10 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
       transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
     }
     if (rect_type == 1 || rect_type == -1) {
-      av1_round_shift_rect_array_32_avx2(
-          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w_div8 << 3,
+                                     0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-             -shift[0]);
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
     __m256i *_buf1 = buf1 + i * 8;
     if (lr_flip) {
@@ -4166,12 +4165,12 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
   }
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
 
-    av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
-                                  buf1 + i * txfm_size_row, txfm_size_row,
-                                  -shift[1]);
+    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+                              buf1 + i * txfm_size_row, txfm_size_row,
+                              -shift[1]);
   }
 
   // write to buffer
diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
index 03eaef832b..37f8f42b29 100644
--- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -145,6 +145,74 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
   in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
 }
 
+void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  __m128i op[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  load_buffer_4x4(input, op);
+
+  // Shift before-hand.
+  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
+  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
+  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
+  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
+
+  for (int i = 0; i < 2; ++i) {
+    transpose_32bit_4x4(op, op);
+
+    __m128i a1 = op[0];
+    __m128i c1 = op[1];
+    __m128i d1 = op[2];
+    __m128i b1 = op[3];
+    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
+    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
+    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
+    e1 = _mm_srai_epi32(e1, 1);
+    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
+    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
+    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
+    d1 = _mm_add_epi32(d1, c1);  // d1 += c1
+
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
+  }
+
+  // Convert to int16_t. The C code checks that we are in range.
+  op[0] = _mm_packs_epi32(op[0], op[1]);
+  op[1] = _mm_packs_epi32(op[2], op[3]);
+
+  // Load uint16_t.
+  __m128i dst[2];
+  __m128i tmp[4];
+  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
+  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
+
+  // Add to the previous results.
+  dst[0] = _mm_add_epi16(dst[0], op[0]);
+  dst[1] = _mm_add_epi16(dst[1], op[1]);
+
+  // Clamp.
+  dst[0] = highbd_clamp_epi16(dst[0], bd);
+  dst[1] = highbd_clamp_epi16(dst[1], bd);
+
+  // Store.
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
+  dst[0] = _mm_srli_si128(dst[0], 8);
+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
+  dst[1] = _mm_srli_si128(dst[1], 8);
+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
+}
+
 static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                           __m128i *out1, const __m128i *clamp_lo,
                           const __m128i *clamp_hi) {
@@ -676,112 +744,102 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
   const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
-  const int txw_idx = get_txw_idx(TX_4X4);
-  const int txh_idx = get_txh_idx(TX_4X4);
 
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_4x4(input, in);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
       load_buffer_4x4(input, in);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
       load_buffer_4x4(input, in);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case ADST_FLIPADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_ADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case IDTX:
       load_buffer_4x4(input, in);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        0);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
-                        0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_DCT:
       load_buffer_4x4(input, in);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        0);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case H_DCT:
       load_buffer_4x4(input, in);
-      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
-                        0);
+      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_ADST:
       load_buffer_4x4(input, in);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case H_ADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
-                        0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case V_FLIPADST:
       load_buffer_4x4(input, in);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                        0);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case H_FLIPADST:
       load_buffer_4x4(input, in);
-      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
-      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
-                        0);
+      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
+      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     default: assert(0);
@@ -1346,89 +1404,78 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
   const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
-  const int txw_idx = get_txw_idx(TX_8X8);
-  const int txh_idx = get_txh_idx(TX_8X8);
 
   switch (tx_type) {
     case DCT_DCT:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                     -shift[0]);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                     -shift[0]);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case FLIPADST_DCT:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                     -shift[0]);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      idct8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case ADST_FLIPADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case FLIPADST_ADST:
       load_buffer_8x8(input, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-                      -shift[0]);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 0, bd, -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      iadst8x8_sse4_1(out, in, INV_COS_BIT, 1, bd, 0);
       write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
     default: assert(0);
@@ -5227,8 +5274,7 @@ static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
       av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                            NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-             -shift[0]);
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
 
@@ -5240,8 +5286,8 @@ static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
     }
   }
   for (int i = 0; i < buf_size_w_div4; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5294,8 +5340,7 @@ static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
       av1_round_shift_rect_array_32_sse4_1(
           buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-             -shift[0]);
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     if (lr_flip) {
@@ -5317,8 +5362,8 @@ static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
     }
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5364,8 +5409,7 @@ static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
       av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                            NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-             -shift[0]);
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     for (int j = 0; j < (input_stride >> 2); ++j) {
@@ -5376,8 +5420,8 @@ static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
     }
   }
   for (int i = 0; i < (input_stride >> 2); i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5439,8 +5483,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
       av1_round_shift_rect_array_32_sse4_1(
           buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
     }
-    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-             -shift[0]);
+    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
     __m128i *_buf1 = buf1 + i * 4;
     if (lr_flip) {
@@ -5463,8 +5506,8 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
   }
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+             bd, 0);
 
     av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                     buf1 + i * txfm_size_row, txfm_size_row,
@@ -5510,9 +5553,8 @@ static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
   load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
   av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                        NewInvSqrt2);
-  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
-  row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-           -shift[0]);
+  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
 
   if (lr_flip) {
     TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
@@ -5529,7 +5571,7 @@ static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
   }
 
   // 2nd stage: column transform
-  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
 
   av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 
@@ -5571,7 +5613,7 @@ static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
 
   av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
                                        NewInvSqrt2);
-  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
   __m128i *buf1_ptr;
   if (lr_flip) {
@@ -5584,7 +5626,7 @@ static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
   // 2nd stage: column transform
   for (int i = 0; i < 2; i++) {
     col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             INV_COS_BIT, 1, bd, 0);
   }
   av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
   // write to buffer
@@ -5621,8 +5663,7 @@ static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
   __m128i *buf0_cur = buf0;
   load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
   for (int i = 0; i < (txfm_size_row >> 2); i++) {
-    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
-             av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+    row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
   }
 
   if (lr_flip) {
@@ -5640,7 +5681,7 @@ static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
   }
 
   // 2nd stage: column transform
-  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
 
   av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 
@@ -5680,7 +5721,7 @@ static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
     TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
                   buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
   }
-  row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+  row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]);
 
   __m128i *buf1_ptr;
   if (lr_flip) {
@@ -5693,7 +5734,7 @@ static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
   // 2nd stage: column transform
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
-             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+             INV_COS_BIT, 1, bd, 0);
   }
   av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 
diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
index 70f1ec7092..9cedd449a2 100644
--- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,17 +22,13 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
-    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src,
+                                               int src_stride, uint16_t *dst0,
+                                               int dst_stride0, int w, int h,
+                                               ConvolveParams *conv_params,
+                                               int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
@@ -466,16 +462,13 @@ void av1_highbd_dist_wtd_convolve_2d_avx2(
 
 void av1_highbd_dist_wtd_convolve_x_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   int i, j;
   __m256i s[4], coeffs_x[4];
@@ -635,16 +628,13 @@ void av1_highbd_dist_wtd_convolve_x_avx2(
 
 void av1_highbd_dist_wtd_convolve_y_avx2(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
index f033a6f940..af45764b27 100644
--- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -19,16 +19,13 @@
 
 void av1_highbd_dist_wtd_convolve_y_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_vert * src_stride;
   const int bits = FILTER_BITS - conv_params->round_0;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
 
   assert(bits >= 0);
   int i, j;
@@ -261,16 +258,13 @@ void av1_highbd_dist_wtd_convolve_y_sse4_1(
 
 void av1_highbd_dist_wtd_convolve_x_sse4_1(
     const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
   const uint16_t *const src_ptr = src - fo_horiz;
   const int bits = FILTER_BITS - conv_params->round_1;
-  (void)filter_params_y;
-  (void)subpel_y_qn;
 
   int i, j;
   __m128i s[4], coeffs_x[4];
diff --git a/media/libaom/src/av1/common/x86/highbd_warp_affine_avx2.c b/media/libaom/src/av1/common/x86/highbd_warp_affine_avx2.c
new file mode 100644
index 0000000000..87b1a66a4a
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/highbd_warp_affine_avx2.c
@@ -0,0 +1,654 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
+                                 int width, int height, int stride,
+                                 uint16_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y, int bd,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta) {
+  __m256i tmp[15];
+  const int reduce_bits_horiz =
+      conv_params->round_0 +
+      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  (void)max_bits_horiz;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+  const __m256i reduce_bits_vert_const =
+      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+  const __m256i res_sub_const =
+      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+                        (1 << (offset_bits - conv_params->round_1 - 1)));
+  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+
+  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
+  __m256i v_zeros = _mm256_setzero_si256();
+  int ohoriz = 1 << offset_bits_horiz;
+  int mhoriz = 1 << max_bits_horiz;
+  (void)mhoriz;
+  int sx;
+
+  for (int i = 0; i < p_height; i += 8) {
+    for (int j = 0; j < p_width; j += 8) {
+      // Calculate the center of this 8x8 block,
+      // project to luma coordinates (if in a subsampled chroma plane),
+      // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary)
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter
+      if (ix4 <= -7) {
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
+        }
+      } else if (ix4 >= width + 6) {
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm256_cvtepi16_epi32(
+              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                             ref[iy * stride + (width - 1)] *
+                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        int32_t tmp1[8];
+        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          const int iy = clamp(iy4 + k, 0, height - 1);
+
+          sx = sx4 + beta * (k + 4);
+          for (int l = -4; l < 4; ++l) {
+            int ix = ix4 + l - 3;
+            const int offs = sx >> WARPEDDIFF_PREC_BITS;
+            const int16_t *coeffs = av1_warped_filter[offs];
+
+            int32_t sum = 1 << offset_bits_horiz;
+            for (int m = 0; m < 8; ++m) {
+              const int sample_x = clamp(ix + m, 0, width - 1);
+              sum += ref[iy * stride + sample_x] * coeffs[m];
+            }
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
+            sx += alpha;
+          }
+          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
+        }
+      } else {
+        if (beta == 0 && alpha == 0) {
+          sx = sx4;
+          __m128i v_01 = _mm_loadu_si128(
+              (__m128i *)
+                  av1_warped_filter[sx >>
+                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
+          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
+          __m256i v_c23 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
+          __m256i v_c45 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
+          __m256i v_c67 = _mm256_broadcastd_epi32(
+              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum = _mm256_madd_epi16(
+                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+                                          0));  // R8R7R6..R1R7R6R5..R1R0
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum = _mm256_madd_epi16(
+                v_c23,
+                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+                                          8));  // R12R11..R5R11R10..R5R4
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+                                          12));  // R14R13..R7R13R12..R7R6
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        } else if (alpha == 0) {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            sx = sx4 + beta * (k + 4);
+
+            __m128i v_01 = _mm_loadu_si128(
+                (__m128i *)av1_warped_filter
+                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
+            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
+            __m256i v_c23 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
+            __m256i v_c45 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
+            __m256i v_c67 = _mm256_broadcastd_epi32(
+                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum =
+                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum =
+                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum =
+                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(v_c67,
+                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        } else if (beta == 0) {
+          sx = sx4;
+          __m256i v_coeff01 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff01 = _mm256_inserti128_si256(
+              v_coeff01,
+              _mm_loadu_si128(
+                  (__m128i *)
+                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
+              1);  // B7B6..B1B0A7A6..A1A0
+          __m256i v_coeff23 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff23 = _mm256_inserti128_si256(
+              v_coeff23,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // D7D6..D1D0C7C6..C1C0
+          __m256i v_coeff45 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff45 = _mm256_inserti128_si256(
+              v_coeff45,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // F7F6..F1F0E7E6..E1E0
+          __m256i v_coeff67 = _mm256_inserti128_si256(
+              v_zeros,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              0);
+          v_coeff67 = _mm256_inserti128_si256(
+              v_coeff67,
+              _mm_loadu_si128(
+                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+                                               WARPEDDIFF_PREC_BITS]),
+              1);  // H7H6..H1H0G7G6..G1G0
+
+          __m256i v_c0123 = _mm256_unpacklo_epi32(
+              v_coeff01,
+              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+          __m256i v_c0123u = _mm256_unpackhi_epi32(
+              v_coeff01,
+              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+          __m256i v_c4567 = _mm256_unpacklo_epi32(
+              v_coeff45,
+              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+          __m256i v_c4567u = _mm256_unpackhi_epi32(
+              v_coeff45,
+              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+          __m256i v_c01 = _mm256_unpacklo_epi64(
+              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+          __m256i v_c23 =
+              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+          __m256i v_c45 =
+              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+          __m256i v_c67 =
+              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum = _mm256_madd_epi16(
+                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
+                                          0));  // R8R7R6..R1R7R6R5..R1R0
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum = _mm256_madd_epi16(
+                v_c23,
+                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
+                                          8));  // R12R11..R5R11R10..R5R4
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(
+                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
+                                          12));  // R14R13..R7R13R12..R7R6
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+
+        } else {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            if (iy < 0)
+              iy = 0;
+            else if (iy > height - 1)
+              iy = height - 1;
+            iy = iy * stride;
+
+            sx = sx4 + beta * (k + 4);
+
+            __m256i v_coeff01 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff01 = _mm256_inserti128_si256(
+                v_coeff01,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // B7B6..B1B0A7A6..A1A0
+            __m256i v_coeff23 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff23 = _mm256_inserti128_si256(
+                v_coeff23,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // D7D6..D1D0C7C6..C1C0
+            __m256i v_coeff45 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff45 = _mm256_inserti128_si256(
+                v_coeff45,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // F7F6..F1F0E7E6..E1E0
+            __m256i v_coeff67 = _mm256_inserti128_si256(
+                v_zeros,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                0);
+            v_coeff67 = _mm256_inserti128_si256(
+                v_coeff67,
+                _mm_loadu_si128(
+                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
+                                                 WARPEDDIFF_PREC_BITS]),
+                1);  // H7H6..H1H0G7G6..G1G0
+
+            __m256i v_c0123 = _mm256_unpacklo_epi32(
+                v_coeff01,
+                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+            __m256i v_c0123u = _mm256_unpackhi_epi32(
+                v_coeff01,
+                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+            __m256i v_c4567 = _mm256_unpacklo_epi32(
+                v_coeff45,
+                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+            __m256i v_c4567u = _mm256_unpackhi_epi32(
+                v_coeff45,
+                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+            __m256i v_c01 = _mm256_unpacklo_epi64(
+                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+            __m256i v_c23 =
+                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+            __m256i v_c45 =
+                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+            __m256i v_c67 =
+                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+            __m256i v_refl = _mm256_inserti128_si256(
+                _mm256_set1_epi16(0),
+                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
+                1);  // R15 .. R0
+
+            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+            __m256i v_refu =
+                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+
+            v_refl = _mm256_inserti128_si256(
+                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+            v_refu = _mm256_inserti128_si256(
+                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+            __m256i v_sum = _mm256_set1_epi32(ohoriz);
+            __m256i parsum =
+                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
+
+            parsum =
+                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
+            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
+            parsum =
+                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
+            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
+            parsum = _mm256_madd_epi16(v_c67,
+                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
+            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
+
+            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
+                                           reduce_bits_horiz);
+          }
+        }
+      }
+
+      // Vertical filter
+      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        const __m256i *src = tmp + (k + 4);
+
+        __m256i v_coeff01 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128(
+                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff01 = _mm256_inserti128_si256(
+            v_coeff01,
+            _mm_loadu_si128(
+                (__m128i *)
+                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff23 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff23 = _mm256_inserti128_si256(
+            v_coeff23,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff45 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff45 = _mm256_inserti128_si256(
+            v_coeff45,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+        __m256i v_coeff67 = _mm256_inserti128_si256(
+            v_zeros,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            0);
+        v_coeff67 = _mm256_inserti128_si256(
+            v_coeff67,
+            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
+                                                         WARPEDDIFF_PREC_BITS]),
+            1);
+
+        __m256i v_c0123 = _mm256_unpacklo_epi32(
+            v_coeff01,
+            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+        __m256i v_c0123u = _mm256_unpackhi_epi32(
+            v_coeff01,
+            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+        __m256i v_c4567 = _mm256_unpacklo_epi32(
+            v_coeff45,
+            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+        __m256i v_c4567u = _mm256_unpackhi_epi32(
+            v_coeff45,
+            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+        __m256i v_c01 = _mm256_unpacklo_epi64(
+            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+        __m256i v_c23 =
+            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
+        __m256i v_c45 =
+            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+        __m256i v_c67 =
+            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+
+        __m256i v_src01l =
+            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
+        __m256i v_src01u =
+            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
+        __m256i v_sum =
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
+                              v_c01);  // S7S5S3S1S6S4S2S0
+
+        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
+        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
+
+        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
+        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
+
+        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
+        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
+        v_sum = _mm256_add_epi32(
+            v_sum,
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
+
+        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
+
+        __m256i v_suml =
+            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
+        __m256i v_sumh =
+            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
+        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
+
+        if (conv_params->is_compound) {
+          __m128i *const p =
+              (__m128i *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+          v_sum = _mm256_add_epi32(v_sum, res_add_const);
+          v_sum =
+              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
+                               reduce_bits_vert_shift);
+          if (conv_params->do_average) {
+            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
+
+            if (conv_params->use_dist_wtd_comp_avg) {
+              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
+                                       _mm256_mullo_epi32(v_sum, wt1));
+              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
+            } else {
+              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
+            }
+
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
+            v_sum1 = _mm256_sra_epi32(
+                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
+
+            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
+            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
+            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
+            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
+          } else {
+            v_sum = _mm256_packus_epi32(v_sum, v_sum);
+            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
+            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+          }
+        } else {
+          // Round and pack into 8 bits
+          const __m256i round_const =
+              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                ((1 << reduce_bits_vert) >> 1));
+
+          __m256i v_sum1 = _mm256_srai_epi32(
+              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
+
+          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
+          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
+          // Clamp res_16bit to the range [0, 2^bd - 1]
+          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
+          const __m256i zero = _mm256_setzero_si256();
+          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
+
+          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+        }
+      }
+    }
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
index 60a8193088..9df0ddc5e6 100644
--- a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
@@ -350,14 +350,16 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
     for (j = 0; j < p_width; j += 8) {
       const int32_t src_x = (p_col + j + 4) << subsampling_x;
       const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       // Add in all the constant terms, including rounding and offset
diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
index 6de61573ef..7a13d4a67b 100644
--- a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
@@ -38,8 +38,7 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
 void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -64,9 +63,6 @@ void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
       _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   __m256i filt[4], coeffs[4];
 
   filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
@@ -189,9 +185,8 @@ void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
 
 void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
@@ -222,10 +217,6 @@ void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
 
   prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
-  (void)conv_params;
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   // Condition for checking valid vert_filt taps
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
     is_vert_4tap = 1;
@@ -802,18 +793,12 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
   }
 }
 
-void av1_dist_wtd_convolve_2d_copy_avx2(
-    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
   int dst_stride = conv_params->dst_stride;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_qn;
-  (void)subpel_y_qn;
 
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
index f8f640a114..b8400c062d 100644
--- a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
+++ b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
@@ -19,8 +19,7 @@
 void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -47,9 +46,6 @@ void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
   const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
   __m128i coeffs[4];
 
-  (void)filter_params_y;
-  (void)subpel_y_qn;
-
   prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
 
   if (w == 4) {
@@ -152,9 +148,8 @@ void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
 
 void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
   const int bd = 8;
   CONV_BUF_TYPE *dst = conv_params->dst;
@@ -179,9 +174,6 @@ void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
   const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
   __m128i coeffs[4];
 
-  (void)filter_params_x;
-  (void)subpel_x_qn;
-
   prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
 
   if (w == 4) {
diff --git a/media/libaom/src/av1/common/x86/reconinter_avx2.c b/media/libaom/src/av1/common/x86/reconinter_avx2.c
index a38bd83177..71fab7a577 100644
--- a/media/libaom/src/av1/common/x86/reconinter_avx2.c
+++ b/media/libaom/src/av1/common/x86/reconinter_avx2.c
@@ -514,6 +514,8 @@ void av1_build_compound_diffwtd_mask_d16_avx2(
   }
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
+
 void av1_build_compound_diffwtd_mask_highbd_avx2(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
     int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
@@ -618,3 +620,5 @@ void av1_build_compound_diffwtd_mask_highbd_avx2(
     }
   }
 }
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/av1/common/x86/reconinter_ssse3.c b/media/libaom/src/av1/common/x86/reconinter_ssse3.c
index cf684447c5..c9a3709a62 100644
--- a/media/libaom/src/av1/common/x86/reconinter_ssse3.c
+++ b/media/libaom/src/av1/common/x86/reconinter_ssse3.c
@@ -9,10 +9,12 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <tmmintrin.h>
-
 #include "config/av1_rtcd.h"
 
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#include <tmmintrin.h>
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
@@ -114,3 +116,5 @@ void av1_build_compound_diffwtd_mask_highbd_ssse3(
     }
   }
 }
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/av1/common/x86/resize_ssse3.c b/media/libaom/src/av1/common/x86/resize_ssse3.c
new file mode 100644
index 0000000000..0d871de717
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/resize_ssse3.c
@@ -0,0 +1,947 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>  // SSSE3
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "av1/common/resize.h"
+
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+  const __m128i a_and = _mm_and_si128(a, *mask);
+  const __m128i b_and = _mm_and_si128(b, *mask);
+  return _mm_packus_epi16(a_and, b_and);
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
+  assert(filter[3] >= 0 && filter[3] < 256);
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+                                                    const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate the subtracted 64 in f[1]. x4 is always non negative.
+  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+  // add and saturate the results together
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x4);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+                                                   const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate the subtracted 64 in f[2]. x5 is always non negative.
+  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+  __m128i temp;
+
+  // add and saturate the results together
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x3);
+  temp = _mm_adds_epi16(temp, x4);
+  temp = _mm_adds_epi16(temp, x5);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi16(0x00FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+      _mm_storeu_si128((__m128i *)dst, d);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi32(0x000000FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+      const __m128i d2 = _mm_packus_epi16(d0, d1);
+      _mm_storeu_si128((__m128i *)dst, d2);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+  // round and shift by 7 bit each 16 bit
+  const __m128i t2 = _mm_adds_epi16(t0, k_64);
+  const __m128i t3 = _mm_adds_epi16(t1, k_64);
+  const __m128i t4 = _mm_srai_epi16(t2, 7);
+  const __m128i t5 = _mm_srai_epi16(t3, 7);
+  return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[2], d[2];
+
+      // Horizontal
+      // Even rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // odd rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Vertical
+      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+      //       Here we tried to not use shuffle instructions which would be slow
+      //       on some x86 CPUs.
+
+      // Horizontal
+      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
+      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
+      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
+      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
+      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
+      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
+      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
+      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
+      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
+      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
+      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
+      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
+      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
+      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
+      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
+      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
+      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
+      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
+      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
+      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
+      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
+      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
+      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
+      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
+      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
+      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
+      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
+      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
+      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+      // Vertical
+      d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  do {
+    load_8bit_8x8(src + 4, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[2]);
+      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      transpose_16bit_4x8(&s[2], &s[2]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
+
+      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+      d[0] = _mm_packus_epi16(d[0], d[0]);
+      d[1] = _mm_packus_epi16(d[1], d[1]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      t += 4;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  do {
+    load_8bit_8x8(src + 2, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[3]);
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      transpose_16bit_4x8(&s[3], &s[3]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      d[0] = _mm_packus_epi16(d[0], d[2]);
+      d[1] = _mm_packus_epi16(d[1], d[3]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      t += 8;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[1] = _mm_packus_epi16(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+                                     __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+                                   const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[12], d[6], dd[4];
+  __m128i f0[4], f1[5], f2[5];
+  // The offset of the first row is always less than 1 pixel.
+  const int offset1_q4 = phase + 1 * step_q4;
+  const int offset2_q4 = phase + 2 * step_q4;
+  // offset_idxx indicates the pixel offset is even (0) or odd (1).
+  // It's used to choose the src offset and filter coefficient offset.
+  const int offset_idx1 = (offset1_q4 >> 4) & 1;
+  const int offset_idx2 = (offset2_q4 >> 4) & 1;
+  static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
+    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+  };
+  static const convolve8_funcs convolve8_func_list[2] = {
+    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+  };
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
+  shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Sub 64 to avoid overflow.
+  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When filter phase idx is 1, the two biggest coefficients are shuffled
+  // together, and the sum of them are always no less than 128. Sub 64 here.
+  // After the subtraction, when the sum of all positive coefficients are no
+  // larger than 128, and the sum of all negative coefficients are no
+  // less than -128, there will be no overflow in the convolve8 functions.
+  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+  // horizontal 6x8
+  do {
+    load_8bit_8x8(src, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[4]);
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // OC 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
+      transpose_16bit_4x8(&s[4], &s[4]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+      dd[0] = _mm_packus_epi16(d[0], d[2]);
+      dd[1] = _mm_packus_epi16(d[1], d[3]);
+      dd[2] = _mm_packus_epi16(d[4], d[4]);
+      dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
+      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
+      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
+      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
+      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
+      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
+      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
+      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+      // store 4 extra pixels
+      storeu_8bit_16x4(d, t, stride_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      t += 12;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 3 * stride_hor + 4;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+    loadu_8bit_16x4(t, stride_hor, s);
+    y = height_ver;
+
+    do {
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
+      t += 4 * stride_hor;
+      loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[2] = _mm_packus_epi16(d[2], d[3]);
+      d[4] = _mm_packus_epi16(d[4], d[5]);
+
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * 2 * height_ver / 3;
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+                                                  const __m128i *const f) {
+  __m128i ss[4], temp;
+
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+  temp = convolve8_8_ssse3(ss, f);
+  return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate odd columns since even columns are just src pixels' copies.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+                                     const int w, const __m128i *const f) {
+  int x = w;
+
+  do {
+    __m128i s[8], temp;
+    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+    temp = scale_1_to_2_phase_0_kernel(s, f);
+    _mm_storel_epi64((__m128i *)dst, temp);
+    src += 8;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int src_w, const int src_h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  int max_width;
+  int y;
+  uint8_t *tmp[9];
+  __m128i f[4];
+
+  max_width = (src_w + 7) & ~7;
+  tmp[0] = temp_buffer + 0 * max_width;
+  tmp[1] = temp_buffer + 1 * max_width;
+  tmp[2] = temp_buffer + 2 * max_width;
+  tmp[3] = temp_buffer + 3 * max_width;
+  tmp[4] = temp_buffer + 4 * max_width;
+  tmp[5] = temp_buffer + 5 * max_width;
+  tmp[6] = temp_buffer + 6 * max_width;
+  tmp[7] = temp_buffer + 7 * max_width;
+
+  shuffle_filter_ssse3(coef, f);
+
+  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+  y = src_h;
+  do {
+    int x;
+    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+    for (x = 0; x < max_width; x += 8) {
+      __m128i s[8], C, D, CD;
+
+      // Even rows
+      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      const __m128i ab = _mm_unpacklo_epi8(a, b);
+      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+      // Odd rows
+      // Even columns
+      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+      C = scale_1_to_2_phase_0_kernel(s, f);
+
+      // Odd columns
+      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+      D = scale_1_to_2_phase_0_kernel(s, f);
+
+      CD = _mm_unpacklo_epi8(C, D);
+      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
+    }
+
+    src += src_stride;
+    dst += 2 * dst_stride;
+    tmp[8] = tmp[0];
+    tmp[0] = tmp[1];
+    tmp[1] = tmp[2];
+    tmp[2] = tmp[3];
+    tmp[3] = tmp[4];
+    tmp[4] = tmp[5];
+    tmp[5] = tmp[6];
+    tmp[6] = tmp[7];
+    tmp[7] = tmp[8];
+  } while (--y);
+}
+
+void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                       YV12_BUFFER_CONFIG *dst,
+                                       const InterpFilter filter,
+                                       const int phase, const int num_planes) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  int scaled = 0;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int src_y_w = (src->crop_widths[0] + 1) & ~1;
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+      // 2 to 1
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+      // 4 to 1
+      scaled = 1;
+      if (phase == 0) {
+        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        // When dst_w is 1 or 2, we need extra padding to avoid heap read
+        // overflow
+        const int extra_padding = 16;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+        if (temp_buffer) {
+          const InterpKernel *interp_kernel =
+              (const InterpKernel *)av1_interp_filter_params_list[filter]
+                  .filter_ptr;
+          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+                                     dst->buffers[i], dst->strides[is_uv],
+                                     dst_w, dst_h, interp_kernel[phase],
+                                     temp_buffer);
+          free(temp_buffer);
+        } else {
+          scaled = 0;
+        }
+      }
+    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+      // 4 to 3
+      const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_stride_ver = (dst_y_w + 7) & ~7;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      // When the vertical filter reads more pixels than the horizontal filter
+      // generated in each row, we need extra padding to avoid heap read
+      // overflow. For example, the horizontal filter generates 18 pixels but
+      // the vertical filter reads 24 pixels in a row. The difference is
+      // multiplied by 2 since two rows are interlaced together in the
+      // optimization.
+      const int extra_padding =
+          (buffer_stride_ver > buffer_stride_hor)
+              ? 2 * (buffer_stride_ver - buffer_stride_hor)
+              : 0;
+      const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+      uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+      if (temp_buffer) {
+        scaled = 1;
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h, interp_kernel, phase, temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
+      // 1 to 2
+      uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
+      if (temp_buffer) {
+        scaled = 1;
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], src_w,
+                                   src_h, interp_kernel[8], temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  }
+  if (!scaled) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+  } else {
+    aom_extend_frame_borders(dst, num_planes);
+  }
+}
diff --git a/media/libaom/src/av1/common/x86/warp_plane_avx2.c b/media/libaom/src/av1/common/x86/warp_plane_avx2.c
index 53a928d76b..f6aaa8887b 100644
--- a/media/libaom/src/av1/common/x86/warp_plane_avx2.c
+++ b/media/libaom/src/av1/common/x86/warp_plane_avx2.c
@@ -1193,14 +1193,16 @@ void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
     for (j = 0; j < p_width; j += 8) {
       const int32_t src_x = (p_col + j + 4) << subsampling_x;
       const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       // Add in all the constant terms, including rounding and offset
diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse4.c b/media/libaom/src/av1/common/x86/warp_plane_sse4.c
index 10ddf92d02..b1df486f47 100644
--- a/media/libaom/src/av1/common/x86/warp_plane_sse4.c
+++ b/media/libaom/src/av1/common/x86/warp_plane_sse4.c
@@ -875,14 +875,16 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
     for (j = 0; j < p_width; j += 8) {
       const int32_t src_x = (p_col + j + 4) << subsampling_x;
       const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       // Add in all the constant terms, including rounding and offset
diff --git a/media/libaom/src/av1/decoder/accounting.c b/media/libaom/src/av1/decoder/accounting.c
index 2e58d09e0d..1ded380ec3 100644
--- a/media/libaom/src/av1/decoder/accounting.c
+++ b/media/libaom/src/av1/decoder/accounting.c
@@ -47,6 +47,7 @@ int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
   accounting->hash_dictionary[hash] = dictionary->num_strs;
   len = strlen(str);
   dictionary->strs[dictionary->num_strs] = malloc(len + 1);
+  if (!dictionary->strs[dictionary->num_strs]) abort();
   snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str);
   dictionary->num_strs++;
   return dictionary->num_strs - 1;
@@ -57,6 +58,7 @@ void aom_accounting_init(Accounting *accounting) {
   accounting->num_syms_allocated = 1000;
   accounting->syms.syms =
       malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+  if (!accounting->syms.syms) abort();
   accounting->syms.dictionary.num_strs = 0;
   assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
   for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++)
@@ -116,7 +118,7 @@ void aom_accounting_record(Accounting *accounting, const char *str,
     accounting->syms.syms =
         realloc(accounting->syms.syms,
                 sizeof(AccountingSymbol) * accounting->num_syms_allocated);
-    assert(accounting->syms.syms != NULL);
+    if (!accounting->syms.syms) abort();
   }
   accounting->syms.syms[accounting->syms.num_syms++] = sym;
 }
diff --git a/media/libaom/src/av1/decoder/decodeframe.c b/media/libaom/src/av1/decoder/decodeframe.c
index 7abfac4aaa..5665082dd7 100644
--- a/media/libaom/src/av1/decoder/decodeframe.c
+++ b/media/libaom/src/av1/decoder/decodeframe.c
@@ -76,12 +76,11 @@
 // Checks that the remaining bits start with a 1 and ends with 0s.
 // It consumes an additional byte, if already byte aligned before the check.
 int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
-  AV1_COMMON *const cm = &pbi->common;
   // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
   int bits_before_alignment = 8 - rb->bit_offset % 8;
   int trailing = aom_rb_read_literal(rb, bits_before_alignment);
   if (trailing != (1 << (bits_before_alignment - 1))) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
   return 0;
@@ -110,7 +109,7 @@ static AOM_INLINE void set_planes_to_neutral_grey(
     for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
       const int is_uv = plane > 0;
       for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
-        memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
+        memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7,
                buf->crop_widths[is_uv]);
       }
     }
@@ -140,31 +139,30 @@ static REFERENCE_MODE read_frame_reference_mode(
   }
 }
 
-static AOM_INLINE void inverse_transform_block(MACROBLOCKD *xd, int plane,
-                                               const TX_TYPE tx_type,
+static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb,
+                                               int plane, const TX_TYPE tx_type,
                                                const TX_SIZE tx_size,
                                                uint8_t *dst, int stride,
                                                int reduced_tx_set) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane];
-  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
+  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
   uint16_t scan_line = eob_data->max_scan_line;
   uint16_t eob = eob_data->eob;
-  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
-                              eob, reduced_tx_set);
+  av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst,
+                              stride, eob, reduced_tx_set);
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
 static AOM_INLINE void read_coeffs_tx_intra_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  if (!mbmi->skip) {
+  MB_MODE_INFO *mbmi = dcb->xd.mi[0];
+  if (!mbmi->skip_txfm) {
 #if TXCOEFF_TIMER
     struct aom_usec_timer timer;
     aom_usec_timer_start(&timer);
 #endif
-    av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
+    av1_read_coeffs_txb_facade(cm, dcb, r, plane, row, col, tx_size);
 #if TXCOEFF_TIMER
     aom_usec_timer_mark(&timer);
     const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -175,12 +173,12 @@ static AOM_INLINE void read_coeffs_tx_intra_block(
 }
 
 static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
-                                         MACROBLOCKD *const xd,
+                                         DecoderCodingBlock *dcb,
                                          aom_reader *const r, const int plane,
                                          const int row, const int col,
                                          const TX_SIZE tx_size) {
   (void)cm;
-  (void)xd;
+  (void)dcb;
   (void)r;
   (void)plane;
   (void)row;
@@ -189,10 +187,10 @@ static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
 }
 
 static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm,
-                                                MACROBLOCKD *const xd,
+                                                DecoderCodingBlock *dcb,
                                                 BLOCK_SIZE bsize) {
   (void)cm;
-  (void)xd;
+  (void)dcb;
   (void)bsize;
 }
 
@@ -203,37 +201,39 @@ static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm,
 }
 
 static AOM_INLINE void predict_and_reconstruct_intra_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int row, const int col, const TX_SIZE tx_size) {
   (void)r;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   PLANE_TYPE plane_type = get_plane_type(plane);
 
   av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
 
-  if (!mbmi->skip) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  if (!mbmi->skip_txfm) {
+    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
     if (eob_data->eob) {
       const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
       // tx_type was read out in av1_read_coeffs_txb.
       const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
                                               reduced_tx_set_used);
+      struct macroblockd_plane *const pd = &xd->plane[plane];
       uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
-      inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+      inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
                               reduced_tx_set_used);
     }
   }
   if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
-    cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
+    cfl_store_tx(xd, row, col, tx_size, mbmi->bsize);
   }
 }
 
 static AOM_INLINE void inverse_transform_inter_block(
-    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
     const int plane, const int blk_row, const int blk_col,
     const TX_SIZE tx_size) {
   (void)r;
+  MACROBLOCKD *const xd = &dcb->xd;
   PLANE_TYPE plane_type = get_plane_type(plane);
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
@@ -243,7 +243,7 @@ static AOM_INLINE void inverse_transform_inter_block(
 
   uint8_t *dst =
       &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
-  inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+  inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
                           reduced_tx_set_used);
 #if CONFIG_MISMATCH_DEBUG
   int pixel_c, pixel_r;
@@ -260,21 +260,22 @@ static AOM_INLINE void inverse_transform_inter_block(
 #endif
 }
 
-static AOM_INLINE void set_cb_buffer_offsets(MACROBLOCKD *const xd,
+static AOM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb,
                                              TX_SIZE tx_size, int plane) {
-  xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
-  xd->txb_offset[plane] =
-      xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
+  dcb->txb_offset[plane] =
+      dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
 }
 
 static AOM_INLINE void decode_reconstruct_tx(
     AV1_COMMON *cm, ThreadData *const td, aom_reader *r,
     MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row,
     int blk_col, int block, TX_SIZE tx_size, int *eob_total) {
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -285,14 +286,14 @@ static AOM_INLINE void decode_reconstruct_tx(
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (tx_size == plane_tx_size || plane) {
-    td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+    td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
                                          tx_size);
 
-    td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+    td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
                                      tx_size);
-    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
     *eob_total += eob_data->eob;
-    set_cb_buffer_offsets(xd, tx_size, plane);
+    set_cb_buffer_offsets(dcb, tx_size, plane);
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
@@ -300,16 +301,18 @@ static AOM_INLINE void decode_reconstruct_tx(
     const int bsw = tx_size_wide_unit[sub_txs];
     const int bsh = tx_size_high_unit[sub_txs];
     const int sub_step = bsw * bsh;
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
 
     assert(bsw > 0 && bsh > 0);
 
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
+    for (int row = 0; row < row_end; row += bsh) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += bsw) {
         const int offsetc = blk_col + col;
 
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
         decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
                               offsetc, block, sub_txs, eob_total);
         block += sub_step;
@@ -326,7 +329,7 @@ static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   const TileInfo *const tile = &xd->tile;
 
   set_mi_offsets(mi_params, xd, mi_row, mi_col);
-  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->bsize = bsize;
 #if CONFIG_RD_DEBUG
   xd->mi[0]->mi_row = mi_row;
   xd->mi[0]->mi_col = mi_col;
@@ -353,23 +356,24 @@ static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
 }
 
 static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
-                                         MACROBLOCKD *const xd, int mi_row,
+                                         DecoderCodingBlock *dcb, int mi_row,
                                          int mi_col, aom_reader *r,
                                          PARTITION_TYPE partition,
                                          BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
+  MACROBLOCKD *const xd = &dcb->xd;
 
 #if CONFIG_ACCOUNTING
   aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
 #endif
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   xd->mi[0]->partition = partition;
-  av1_read_mode_info(pbi, xd, r, x_mis, y_mis);
+  av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
   if (bsize >= BLOCK_8X8 &&
       (seq_params->subsampling_x || seq_params->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -629,8 +633,8 @@ static void dec_calc_subpel_params(const MV *const src_mv,
 
 static void dec_calc_subpel_params_and_extend(
     const MV *const src_mv, InterPredParams *const inter_pred_params,
-    MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre,
-    SubpelParams *subpel_params, int *src_stride) {
+    MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf,
+    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
   PadBlock block;
   MV32 scaled_mv;
   int subpel_x_mv, subpel_y_mv;
@@ -641,26 +645,30 @@ static void dec_calc_subpel_params_and_extend(
       inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
       scaled_mv, block, subpel_x_mv, subpel_y_mv,
       inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
-      inter_pred_params->use_hbd_buf, xd->mc_buf[ref], pre, src_stride);
+      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
 }
 
-static void dec_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                       int plane, const MB_MODE_INFO *mi,
+static void dec_build_inter_predictors(const AV1_COMMON *cm,
+                                       DecoderCodingBlock *dcb, int plane,
+                                       const MB_MODE_INFO *mi,
                                        int build_for_obmc, int bw, int bh,
                                        int mi_x, int mi_y) {
-  av1_build_inter_predictors(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x,
-                             mi_y, dec_calc_subpel_params_and_extend);
+  av1_build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh,
+                             mi_x, mi_y, dcb->mc_buf,
+                             dec_calc_subpel_params_and_extend);
 }
 
 static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
-                                                 MACROBLOCKD *xd, int mi_row,
-                                                 int mi_col, BLOCK_SIZE bsize) {
+                                                 DecoderCodingBlock *dcb,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &dcb->xd;
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
     if (plane && !xd->is_chroma_ref) break;
     const int mi_x = mi_col * MI_SIZE;
     const int mi_y = mi_row * MI_SIZE;
-    dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0,
+    dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0,
                                xd->plane[plane].width, xd->plane[plane].height,
                                mi_x, mi_y);
     if (is_interintra_pred(xd->mi[0])) {
@@ -676,7 +684,7 @@ static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
 }
 
 static INLINE void dec_build_prediction_by_above_pred(
-    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
     int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int above_mi_col = xd->mi_col + rel_mi_col;
@@ -691,7 +699,7 @@ static INLINE void dec_build_prediction_by_above_pred(
   mi_x = above_mi_col << MI_SIZE_LOG2;
   mi_y = xd->mi_row << MI_SIZE_LOG2;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
@@ -700,15 +708,16 @@ static INLINE void dec_build_prediction_by_above_pred(
                    block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                               mi_y);
+    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
   }
 }
 
 static AOM_INLINE void dec_build_prediction_by_above_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
-    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
-    int tmp_stride[MAX_MB_PLANE]) {
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  MACROBLOCKD *const xd = &dcb->xd;
   if (!xd->up_available) return;
 
   // Adjust mb_to_bottom_edge to have the correct value for the OBMC
@@ -717,10 +726,10 @@ static AOM_INLINE void dec_build_prediction_by_above_preds(
   const int this_height = xd->height * MI_SIZE;
   const int pred_height = AOMMIN(this_height / 2, 32);
   xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb
+  };
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 dec_build_prediction_by_above_pred, &ctxt);
@@ -731,7 +740,7 @@ static AOM_INLINE void dec_build_prediction_by_above_preds(
 }
 
 static INLINE void dec_build_prediction_by_left_pred(
-    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
     int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int left_mi_row = xd->mi_row + rel_mi_row;
@@ -745,7 +754,7 @@ static INLINE void dec_build_prediction_by_left_pred(
                                           &backup_mbmi, ctxt, num_planes);
   mi_x = xd->mi_col << MI_SIZE_LOG2;
   mi_y = left_mi_row << MI_SIZE_LOG2;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
@@ -754,15 +763,16 @@ static INLINE void dec_build_prediction_by_left_pred(
     int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
 
     if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
-                               mi_y);
+    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
   }
 }
 
 static AOM_INLINE void dec_build_prediction_by_left_preds(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE],
-    int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE],
-    int tmp_stride[MAX_MB_PLANE]) {
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  MACROBLOCKD *const xd = &dcb->xd;
   if (!xd->left_available) return;
 
   // Adjust mb_to_right_edge to have the correct value for the OBMC
@@ -772,10 +782,10 @@ static AOM_INLINE void dec_build_prediction_by_left_preds(
   const int pred_width = AOMMIN(this_width / 2, 32);
   xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
 
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb
+  };
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                dec_build_prediction_by_left_pred, &ctxt);
@@ -785,33 +795,8 @@ static AOM_INLINE void dec_build_prediction_by_left_preds(
   xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
 }
 
-static void set_dst_buf(MACROBLOCKD *xd, uint8_t **dst_buf1,
-                        uint8_t **dst_buf2) {
-  dst_buf1[0] = xd->tmp_obmc_bufs[0];
-  dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-  dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-  dst_buf2[0] = xd->tmp_obmc_bufs[1];
-  dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-  dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void set_dst_buf_highbd(MACROBLOCKD *xd, uint8_t **dst_buf1,
-                               uint8_t **dst_buf2) {
-  int len = sizeof(uint16_t);
-  dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-  dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-  dst_buf1[2] =
-      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-  dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-  dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-  dst_buf2[2] =
-      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
-}
-#endif
-
-static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
-                                                          MACROBLOCKD *xd) {
+static AOM_INLINE void dec_build_obmc_inter_predictors_sb(
+    const AV1_COMMON *cm, DecoderCodingBlock *dcb) {
   const int num_planes = av1_num_planes(cm);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -821,24 +806,17 @@ static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
-    set_dst_buf_highbd(xd, dst_buf1, dst_buf2);
-  } else {
-    set_dst_buf(xd, dst_buf1, dst_buf2);
-  }
-#else
-  set_dst_buf(xd, dst_buf1, dst_buf2);
-#endif
+  MACROBLOCKD *const xd = &dcb->xd;
+  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
 
-  dec_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
-                                      dst_stride1);
-  dec_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+  dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1,
+                                      dst_height1, dst_stride1);
+  dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2,
                                      dst_stride2);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+                       mi_col, 0, num_planes);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
@@ -847,13 +825,14 @@ static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm,
                                              MACROBLOCKD *const xd) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   if (store_cfl_required(cm, xd)) {
-    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
   }
 }
 
 static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
-                                           MACROBLOCKD *const xd,
+                                           DecoderCodingBlock *dcb,
                                            BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
   const int mi_row = xd->mi_row;
@@ -875,9 +854,9 @@ static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
     }
   }
 
-  dec_build_inter_predictor(cm, xd, mi_row, mi_col, bsize);
+  dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize);
   if (mbmi->motion_mode == OBMC_CAUSAL) {
-    dec_build_obmc_inter_predictors_sb(cm, xd);
+    dec_build_obmc_inter_predictors_sb(cm, dcb);
   }
 #if CONFIG_MISMATCH_DEBUG
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -901,7 +880,7 @@ static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
   (void)r;
   Av1ColorMapParam params;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
                            &params.plane_height, NULL, NULL);
   xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
@@ -911,7 +890,8 @@ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
                                                 aom_reader *r,
                                                 BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mbmi = xd->mi[0];
 
@@ -945,20 +925,20 @@ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
                blk_row += stepr) {
             for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                  blk_col += stepc) {
-              td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
                                                    blk_col, tx_size);
-              td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
-                                                      blk_col, tx_size);
-              set_cb_buffer_offsets(xd, tx_size, plane);
+              td->predict_and_recon_intra_block_visit(
+                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
+              set_cb_buffer_offsets(dcb, tx_size, plane);
             }
           }
         }
       }
     }
   } else {
-    td->predict_inter_block_visit(cm, xd, bsize);
+    td->predict_inter_block_visit(cm, dcb, bsize);
     // Reconstruction
-    if (!mbmi->skip) {
+    if (!mbmi->skip_txfm) {
       int eobtotal = 0;
 
       const int max_blocks_wide = max_block_wide(xd, bsize, 0);
@@ -1034,15 +1014,11 @@ static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
 
 static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
                                           TX_SIZE tx_size, int depth,
-#if CONFIG_LPF_MASK
-                                          AV1_COMMON *cm, int mi_row,
-                                          int mi_col, int store_bitmask,
-#endif
                                           int blk_row, int blk_col,
                                           aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   int is_split = 0;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int max_blocks_high = max_block_high(xd, bsize, 0);
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
@@ -1066,7 +1042,7 @@ static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
 
   const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
                                          xd->left_txfm_context + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
 
   if (is_split) {
@@ -1080,32 +1056,15 @@ static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
       mbmi->tx_size = sub_txs;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, sub_txs, tx_size);
-#if CONFIG_LPF_MASK
-      if (store_bitmask) {
-        av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                                txsize_to_bsize[tx_size], TX_4X4, mbmi);
-      }
-#endif
       return;
     }
-#if CONFIG_LPF_MASK
-    if (depth + 1 == MAX_VARTX_DEPTH && store_bitmask) {
-      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                              txsize_to_bsize[tx_size], sub_txs, mbmi);
-      store_bitmask = 0;
-    }
-#endif
 
     assert(bsw > 0 && bsh > 0);
     for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
       for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
         int offsetr = blk_row + row;
         int offsetc = blk_col + col;
-        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
-#if CONFIG_LPF_MASK
-                           cm, mi_row, mi_col, store_bitmask,
-#endif
-                           offsetr, offsetc, r);
+        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
       }
     }
   } else {
@@ -1114,12 +1073,6 @@ static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
     mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
-#if CONFIG_LPF_MASK
-    if (store_bitmask) {
-      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                              txsize_to_bsize[tx_size], tx_size, mbmi);
-    }
-#endif
   }
 }
 
@@ -1127,7 +1080,7 @@ static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd,
                                      aom_reader *r) {
   // TODO(debargha): Clean up the logic here. This function should only
   // be called for intra.
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
   const int max_depths = bsize_to_max_depth(bsize);
   const int ctx = get_tx_size_context(xd);
@@ -1142,7 +1095,7 @@ static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd,
 static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode,
                             int is_inter, int allow_select_inter,
                             aom_reader *r) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
 
   if (block_signals_txsize(bsize)) {
@@ -1163,8 +1116,9 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
                                           int mi_col, aom_reader *r,
                                           PARTITION_TYPE partition,
                                           BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &td->xd;
-  decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
 
   av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 
@@ -1173,7 +1127,7 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
   MB_MODE_INFO *mbmi = xd->mi[0];
   int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
   if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
-      !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+      !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
     const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
     const int bh = tx_size_high_unit[max_tx_size];
     const int bw = tx_size_wide_unit[max_tx_size];
@@ -1182,52 +1136,20 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
 
     for (int idy = 0; idy < height; idy += bh)
       for (int idx = 0; idx < width; idx += bw)
-        read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
-#if CONFIG_LPF_MASK
-                           cm, mi_row, mi_col, 1,
-#endif
-                           idy, idx, r);
+        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
   } else {
-    mbmi->tx_size =
-        read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip, r);
+    mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
+                                 !mbmi->skip_txfm, r);
     if (inter_block_tx)
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
-                  mbmi->skip && is_inter_block(mbmi), xd);
-#if CONFIG_LPF_MASK
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-      av1_store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
-    } else {
-      for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
-        for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-          av1_store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
-                                          BLOCK_64X64, mbmi);
-        }
-      }
-    }
-#endif
+                  mbmi->skip_txfm && is_inter_block(mbmi), xd);
   }
-#if CONFIG_LPF_MASK
-  const int w = mi_size_wide[bsize];
-  const int h = mi_size_high[bsize];
-  if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-    av1_store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1);
-  } else {
-    for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
-      for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-        av1_store_bitmask_other_info(cm, mi_row + row, mi_col + col,
-                                     BLOCK_64X64, mbmi, row == 0, col == 0);
-      }
-    }
-  }
-#endif
 
   if (cm->delta_q_info.delta_q_present_flag) {
     for (int i = 0; i < MAX_SEGMENTS; i++) {
       const int current_qindex =
-          av1_get_qindex(&cm->seg, i, xd->current_qindex);
+          av1_get_qindex(&cm->seg, i, xd->current_base_qindex);
       const CommonQuantParams *const quant_params = &cm->quant_params;
       for (int j = 0; j < num_planes; ++j) {
         const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
@@ -1237,13 +1159,13 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
                                       : (j == 1 ? quant_params->u_ac_delta_q
                                                 : quant_params->v_ac_delta_q);
         xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
-            current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+            current_qindex, dc_delta_q, cm->seq_params->bit_depth);
         xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
-            current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+            current_qindex, ac_delta_q, cm->seq_params->bit_depth);
       }
     }
   }
-  if (mbmi->skip) av1_reset_entropy_context(xd, bsize, num_planes);
+  if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
 
   decode_token_recon_block(pbi, td, r, bsize);
 }
@@ -1254,7 +1176,8 @@ static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
                                                       BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int num_planes = av1_num_planes(cm);
@@ -1324,7 +1247,8 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
                                         int parse_decode_flag) {
   assert(bsize < BLOCK_SIZES_ALL);
   AV1_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &td->xd;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
   const int bw = mi_size_wide[bsize];
   const int hbs = bw >> 1;
   PARTITION_TYPE partition;
@@ -1369,6 +1293,10 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
   }
   subsize = get_partition_subsize(bsize, partition);
   if (subsize == BLOCK_INVALID) {
+    // When an internal error occurs ensure that xd->mi_row is set appropriately
+    // w.r.t. current tile, which is used to signal processing of current row is
+    // done.
+    xd->mi_row = mi_row;
     aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Partition is invalid for block size %dx%d",
                        block_size_wide[bsize], block_size_high[bsize]);
@@ -1378,6 +1306,10 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
   const struct macroblockd_plane *const pd_u = &xd->plane[1];
   if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
       BLOCK_INVALID) {
+    // When an internal error occurs ensure that xd->mi_row is set appropriately
+    // w.r.t. current tile, which is used to signal processing of current row is
+    // done.
+    xd->mi_row = mi_row;
     aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Block size %dx%d invalid with this subsampling mode",
                        block_size_wide[subsize], block_size_high[subsize]);
@@ -1455,19 +1387,30 @@ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
 }
 
 static AOM_INLINE void setup_bool_decoder(
-    const uint8_t *data, const uint8_t *data_end, const size_t read_size,
-    struct aom_internal_error_info *error_info, aom_reader *r,
-    uint8_t allow_update_cdf) {
+    MACROBLOCKD *const xd, const uint8_t *data, const uint8_t *data_end,
+    const size_t read_size, struct aom_internal_error_info *error_info,
+    aom_reader *r, uint8_t allow_update_cdf) {
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
-  if (!read_is_valid(data, read_size, data_end))
+  if (!read_is_valid(data, read_size, data_end)) {
+    // When internal error occurs ensure that xd->mi_row is set appropriately
+    // w.r.t. current tile, which is used to signal processing of current row is
+    // done in row-mt decoding.
+    xd->mi_row = xd->tile.mi_row_start;
+
     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
+  }
+  if (aom_reader_init(r, data, read_size)) {
+    // When internal error occurs ensure that xd->mi_row is set appropriately
+    // w.r.t. current tile, which is used to signal processing of current row is
+    // done in row-mt decoding.
+    xd->mi_row = xd->tile.mi_row_start;
 
-  if (aom_reader_init(r, data, read_size))
     aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
+  }
 
   r->allow_update_cdf = allow_update_cdf;
 }
@@ -1482,9 +1425,10 @@ static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm,
 
   seg->enabled = aom_rb_read_bit(rb);
   if (!seg->enabled) {
-    if (cm->cur_frame->seg_map)
+    if (cm->cur_frame->seg_map) {
       memset(cm->cur_frame->seg_map, 0,
-             (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
+             (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+    }
 
     memset(seg, 0, sizeof(*seg));
     segfeatures_copy(&cm->cur_frame->seg, seg);
@@ -1567,9 +1511,9 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
     }
   }
   if (!all_none) {
-    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-           cm->seq_params.sb_size == BLOCK_128X128);
-    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+           cm->seq_params->sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
 
     for (int p = 0; p < num_planes; ++p)
       cm->rst_info[p].restoration_unit_size = sb_size;
@@ -1589,7 +1533,8 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+    int s =
+        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
     if (s && !chroma_none) {
       cm->rst_info[1].restoration_unit_size =
           cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
@@ -1705,7 +1650,7 @@ static AOM_INLINE void loop_restoration_read_sb_coeffs(
     int runit_idx) {
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
-  if (rsi->frame_restoration_type == RESTORE_NONE) return;
+  assert(rsi->frame_restoration_type != RESTORE_NONE);
 
   assert(!cm->features.all_lossless);
 
@@ -1858,7 +1803,7 @@ static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params,
 // Build y/uv dequant values based on segmentation.
 static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm,
                                                   MACROBLOCKD *const xd) {
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
   const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
@@ -1920,7 +1865,7 @@ static AOM_INLINE void setup_superres(AV1_COMMON *const cm,
   cm->superres_upscaled_width = *width;
   cm->superres_upscaled_height = *height;
 
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!seq_params->enable_superres) return;
 
   if (aom_rb_read_bit(rb)) {
@@ -1941,31 +1886,29 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
                                               int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Dimensions of %dx%d beyond allowed size of %dx%d.",
                        width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 #endif
   if (cm->width != width || cm->height != height) {
-    const int new_mi_rows =
-        ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
-    const int new_mi_cols =
-        ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2);
+    const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2);
 
     // Allocations in av1_alloc_context_buffers() depend on individual
     // dimensions as well as the overall size.
     if (new_mi_cols > cm->mi_params.mi_cols ||
         new_mi_rows > cm->mi_params.mi_rows) {
-      if (av1_alloc_context_buffers(cm, width, height)) {
+      if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) {
         // The cm->mi_* values have been cleared and any existing context
         // buffers have been freed. Clear cm->width and cm->height to be
         // consistent and to force a realloc next time.
         cm->width = 0;
         cm->height = 0;
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");
       }
     } else {
-      cm->mi_params.set_mb_mi(&cm->mi_params, width, height);
+      cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4);
     }
     av1_init_mi_buffers(&cm->mi_params);
     cm->width = width;
@@ -1979,16 +1922,17 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
 
 static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
   BufferPool *const pool = cm->buffer_pool;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
-          &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+          &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0,
+          0)) {
     unlock_buffer_pool(pool);
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
   unlock_buffer_pool(pool);
@@ -2009,7 +1953,7 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
 static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
                                         int frame_size_override_flag,
                                         struct aom_read_bit_buffer *rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   int width, height;
 
   if (frame_size_override_flag) {
@@ -2018,7 +1962,7 @@ static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
     av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
     if (width > seq_params->max_frame_width ||
         height > seq_params->max_frame_height) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Frame dimensions are larger than the maximum values");
     }
   } else {
@@ -2059,7 +2003,7 @@ static AOM_INLINE void setup_frame_size_with_refs(
       // the middle of a stream, and static analysis will error if we don't do
       // a null check here.
       if (ref_buf == NULL) {
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Invalid condition: invalid reference buffer");
       } else {
         const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
@@ -2075,7 +2019,7 @@ static AOM_INLINE void setup_frame_size_with_refs(
     }
   }
 
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!found) {
     int num_bits_width = seq_params->num_bits_width;
     int num_bits_height = seq_params->num_bits_height;
@@ -2087,7 +2031,7 @@ static AOM_INLINE void setup_frame_size_with_refs(
   }
 
   if (width <= 0 || height <= 0)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Invalid frame size");
 
   // Check to make sure at least one of frames that this frame references
@@ -2099,7 +2043,7 @@ static AOM_INLINE void setup_frame_size_with_refs(
                              ref_frame->buf.y_crop_height, width, height);
   }
   if (!has_valid_ref_frame)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
   for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
@@ -2107,7 +2051,7 @@ static AOM_INLINE void setup_frame_size_with_refs(
             ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
             ref_frame->buf.subsampling_y, seq_params->bit_depth,
             seq_params->subsampling_x, seq_params->subsampling_y))
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
   setup_buffer_pool(cm);
@@ -2127,14 +2071,12 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
 
 static AOM_INLINE void read_tile_info_max_tile(
     AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
-  int width_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
-  int height_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
-  int width_sb = width_mi >> seq_params->mib_size_log2;
-  int height_sb = height_mi >> seq_params->mib_size_log2;
+  int width_sb =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+  int height_sb =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
 
   av1_get_tile_limits(cm);
   tiles->uniform_spacing = aom_rb_read_bit(rb);
@@ -2223,7 +2165,7 @@ static AOM_INLINE void read_tile_info(AV1Decoder *const pbi,
     pbi->context_update_tile_id =
         aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols);
     if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid context_update_tile_id");
     }
     // tile size magnitude
@@ -2376,7 +2318,7 @@ static const uint8_t *get_ls_tile_buffers(
 
       // Get the whole of the last column, otherwise stop at the required tile.
       for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
-        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
@@ -2388,7 +2330,7 @@ static const uint8_t *get_ls_tile_buffers(
       data = tile_col_data_end[c - 1];
 
       for (int r = 0; r < tile_rows; ++r) {
-        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
@@ -2456,31 +2398,32 @@ static AOM_INLINE void get_tile_buffers(
       if (tc < start_tile || tc > end_tile) continue;
 
       if (data + hdr_offset >= data_end)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Data ended before all tiles were read.");
       data += hdr_offset;
-      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
-                      &pbi->common.error, &data, buf);
+      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error,
+                      &data, buf);
     }
   }
 }
 
-static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
                                      CB_BUFFER *cb_buffer_base,
                                      const int num_planes, int mi_row,
                                      int mi_col) {
   AV1_COMMON *const cm = &pbi->common;
-  int mib_size_log2 = cm->seq_params.mib_size_log2;
+  int mib_size_log2 = cm->seq_params->mib_size_log2;
   int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
   int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
   CB_BUFFER *cb_buffer = cb_buffer_base + offset;
 
   for (int plane = 0; plane < num_planes; ++plane) {
-    xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
-    xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
-    xd->cb_offset[plane] = 0;
-    xd->txb_offset[plane] = 0;
+    dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane];
+    dcb->eob_data[plane] = cb_buffer->eob_data[plane];
+    dcb->cb_offset[plane] = 0;
+    dcb->txb_offset[plane] = 0;
   }
+  MACROBLOCKD *const xd = &dcb->xd;
   xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
   xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
   xd->color_index_map_offset[0] = 0;
@@ -2629,28 +2572,55 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
 #endif  // CONFIG_MULTITHREAD
 }
 
+static INLINE void signal_decoding_done_for_erroneous_row(
+    AV1Decoder *const pbi, const MACROBLOCKD *const xd) {
+  AV1_COMMON *const cm = &pbi->common;
+  const TileInfo *const tile = &xd->tile;
+  const int sb_row_in_tile =
+      ((xd->mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2);
+  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile);
+  TileDataDec *const tile_data =
+      pbi->tile_data + tile->tile_row * cm->tiles.cols + tile->tile_col;
+  AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+  sync_write(dec_row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1,
+             sb_cols_in_tile);
+}
+
 static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
-                                          TileInfo tile_info,
+                                          const TileInfo *tile_info,
                                           const int mi_row) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
-  TileDataDec *const tile_data =
-      pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col;
+  TileDataDec *const tile_data = pbi->tile_data +
+                                 tile_info->tile_row * cm->tiles.cols +
+                                 tile_info->tile_col;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
   const int sb_row_in_tile =
-      (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
   int sb_col_in_tile = 0;
+  int row_mt_exit = 0;
 
-  for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-       mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
-    set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+  for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+       mi_col += cm->seq_params->mib_size, sb_col_in_tile++) {
+    set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
                   mi_col);
 
     sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
 
-    // Decoding of the super-block
-    decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                     cm->seq_params.sb_size, 0x2);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+    row_mt_exit = pbi->frame_row_mt_info.row_mt_exit;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+
+    if (!row_mt_exit) {
+      // Decoding of the super-block
+      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                       cm->seq_params->sb_size, 0x2);
+    }
 
     sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
                sb_cols_in_tile);
@@ -2711,25 +2681,28 @@ static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td,
 
   av1_tile_set_row(&tile_info, cm, tile_row);
   av1_tile_set_col(&tile_info, cm, tile_col);
-  av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
-                         tile_info.mi_col_end, tile_row);
-  av1_reset_loop_filter_delta(&td->xd, num_planes);
-  av1_reset_loop_restoration(&td->xd, num_planes);
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+
+  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
-    av1_zero_left_context(&td->xd);
+       mi_row += cm->seq_params->mib_size) {
+    av1_zero_left_context(xd);
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
-      set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0);
+         mi_col += cm->seq_params->mib_size) {
+      set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 
       // Bit-stream parsing and decoding of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x3);
+                       cm->seq_params->sb_size, 0x3);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
-        aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
         return;
       }
     }
@@ -2737,7 +2710,7 @@ static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td,
 
   int corrupted =
       (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
-  aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 }
 
 static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
@@ -2807,6 +2780,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
     decoder_alloc_tile_data(pbi, n_tiles);
   }
+  if (pbi->dcb.xd.seg_mask == NULL)
+    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+                    (uint8_t *)aom_memalign(
+                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 #if CONFIG_ACCOUNTING
   if (pbi->acct_enabled) {
     aom_accounting_reset(&pbi->accounting);
@@ -2816,13 +2793,14 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
   set_decode_func_pointers(&pbi->td, 0x3);
 
   // Load all tile information into thread_data.
-  td->xd = pbi->mb;
-  td->xd.corrupted = 0;
-  td->xd.mc_buf[0] = td->mc_buf[0];
-  td->xd.mc_buf[1] = td->mc_buf[1];
-  td->xd.tmp_conv_dst = td->tmp_conv_dst;
+  td->dcb = pbi->dcb;
+
+  td->dcb.corrupted = 0;
+  td->dcb.mc_buf[0] = td->mc_buf[0];
+  td->dcb.mc_buf[1] = td->mc_buf[1];
+  td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
   for (int j = 0; j < 2; ++j) {
-    td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+    td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
   }
 
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
@@ -2839,10 +2817,11 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
 
       td->bit_reader = &tile_data->bit_reader;
       av1_zero(td->cb_buffer_base.dqcoeff);
-      av1_tile_init(&td->xd.tile, cm, row, col);
-      td->xd.current_qindex = cm->quant_params.base_qindex;
-      setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
-                         &cm->error, td->bit_reader, allow_update_cdf);
+      av1_tile_init(&td->dcb.xd.tile, cm, row, col);
+      td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
+      setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end,
+                         tile_bs_buf->size, &pbi->error, td->bit_reader,
+                         allow_update_cdf);
 #if CONFIG_ACCOUNTING
       if (pbi->acct_enabled) {
         td->bit_reader->accounting = &pbi->accounting;
@@ -2852,19 +2831,19 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
         td->bit_reader->accounting = NULL;
       }
 #endif
-      av1_init_macroblockd(cm, &td->xd, NULL);
+      av1_init_macroblockd(cm, &td->dcb.xd);
       av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
-                             &td->xd);
+                             &td->dcb.xd);
 
       // Initialise the tile context from the frame context
       tile_data->tctx = *cm->fc;
-      td->xd.tile_ctx = &tile_data->tctx;
+      td->dcb.xd.tile_ctx = &tile_data->tctx;
 
       // decode tile
       decode_tile(pbi, td, row, col);
-      aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
-      if (pbi->mb.corrupted)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
+      if (pbi->dcb.corrupted)
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Failed to decode tile data");
     }
   }
@@ -2910,9 +2889,12 @@ static AOM_INLINE void tile_worker_hook_init(
 
   td->bit_reader = &tile_data->bit_reader;
   av1_zero(td->cb_buffer_base.dqcoeff);
-  av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-  td->xd.current_qindex = cm->quant_params.base_qindex;
-  setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+
+  MACROBLOCKD *const xd = &td->dcb.xd;
+  av1_tile_init(&xd->tile, cm, tile_row, tile_col);
+  xd->current_base_qindex = cm->quant_params.base_qindex;
+
+  setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end,
                      tile_buffer->size, &thread_data->error_info,
                      td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
@@ -2924,14 +2906,13 @@ static AOM_INLINE void tile_worker_hook_init(
     td->bit_reader->accounting = NULL;
   }
 #endif
-  av1_init_macroblockd(cm, &td->xd, NULL);
-  td->xd.error_info = &thread_data->error_info;
-  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
-                         &td->xd);
+  av1_init_macroblockd(cm, xd);
+  xd->error_info = &thread_data->error_info;
+  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd);
 
   // Initialise the tile context from the frame context
   tile_data->tctx = *cm->fc;
-  td->xd.tile_ctx = &tile_data->tctx;
+  xd->tile_ctx = &tile_data->tctx;
 #if CONFIG_ACCOUNTING
   if (pbi->acct_enabled) {
     tile_data->bit_reader.accounting->last_tell_frac =
@@ -2952,7 +2933,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
   // before it returns.
   if (setjmp(thread_data->error_info.jmp)) {
     thread_data->error_info.setjmp = 0;
-    thread_data->td->xd.corrupted = 1;
+    thread_data->td->dcb.corrupted = 1;
     return 0;
   }
   thread_data->error_info.setjmp = 1;
@@ -2963,7 +2944,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
   set_decode_func_pointers(td, 0x3);
 
   assert(cm->tiles.cols > 0);
-  while (!td->xd.corrupted) {
+  while (!td->dcb.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
     if (cur_job_info != NULL) {
@@ -2980,11 +2961,11 @@ static int tile_worker_hook(void *arg1, void *arg2) {
     }
   }
   thread_data->error_info.setjmp = 0;
-  return !td->xd.corrupted;
+  return !td->dcb.corrupted;
 }
 
 static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
-                                                  TileInfo tile) {
+                                                  const TileInfo *tile) {
   // NOTE: Currently value of max workers is calculated based
   // on the parse and decode time. As per the theoretical estimate
   // when percentage of parse time is equal to percentage of decode
@@ -3014,14 +2995,13 @@ static int get_next_job_info(AV1Decoder *const pbi,
   TileDataDec *tile_data;
   AV1DecRowMTSync *dec_row_mt_sync;
   AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
-  TileInfo tile_info;
   const int tile_rows_start = frame_row_mt_info->tile_rows_start;
   const int tile_rows_end = frame_row_mt_info->tile_rows_end;
   const int tile_cols_start = frame_row_mt_info->tile_cols_start;
   const int tile_cols_end = frame_row_mt_info->tile_cols_end;
   const int start_tile = frame_row_mt_info->start_tile;
   const int end_tile = frame_row_mt_info->end_tile;
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   int num_mis_to_decode, num_threads_working;
   int num_mis_waiting_for_decode;
   int min_threads_working = INT_MAX;
@@ -3078,7 +3058,7 @@ static int get_next_job_info(AV1Decoder *const pbi,
         if (num_threads_working == min_threads_working &&
             num_mis_to_decode > max_mis_to_decode &&
             num_threads_working <
-                get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) {
+                get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) {
           max_mis_to_decode = num_mis_to_decode;
           tile_row = tile_row_idx;
           tile_col = tile_col_idx;
@@ -3090,13 +3070,12 @@ static int get_next_job_info(AV1Decoder *const pbi,
   if (tile_row == -1 || tile_col == -1) return 0;
 
   tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
-  tile_info = tile_data->tile_info;
   dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 
   next_job_info->tile_row = tile_row;
   next_job_info->tile_col = tile_col;
-  next_job_info->mi_row =
-      dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start;
+  next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started +
+                          tile_data->tile_info.mi_row_start;
 
   dec_row_mt_sync->num_threads_working++;
   dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
@@ -3139,31 +3118,32 @@ static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
 static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
                                          TileDataDec *const tile_data) {
   AV1_COMMON *const cm = &pbi->common;
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   const int num_planes = av1_num_planes(cm);
-  TileInfo tile_info = tile_data->tile_info;
-  int tile_row = tile_info.tile_row;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  int tile_row = tile_info->tile_row;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
 
-  av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
-                         tile_info.mi_col_end, tile_row);
-  av1_reset_loop_filter_delta(&td->xd, num_planes);
-  av1_reset_loop_restoration(&td->xd, num_planes);
+  av1_zero_above_context(cm, xd, tile_info->mi_col_start, tile_info->mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
 
-  for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
-    av1_zero_left_context(&td->xd);
+  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+       mi_row += cm->seq_params->mib_size) {
+    av1_zero_left_context(xd);
 
-    for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
-      set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
-                    mi_col);
+    for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+         mi_col += cm->seq_params->mib_size) {
+      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 
       // Bit-stream parsing of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x1);
+                       cm->seq_params->sb_size, 0x1);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
-        aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
         return;
       }
     }
@@ -3172,28 +3152,35 @@ static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
 
   int corrupted =
       (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
-  aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 }
 
 static int row_mt_worker_hook(void *arg1, void *arg2) {
   DecWorkerData *const thread_data = (DecWorkerData *)arg1;
   AV1Decoder *const pbi = (AV1Decoder *)arg2;
-  AV1_COMMON *cm = &pbi->common;
   ThreadData *const td = thread_data->td;
   uint8_t allow_update_cdf;
   AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
-  td->xd.corrupted = 0;
+  td->dcb.corrupted = 0;
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
   if (setjmp(thread_data->error_info.jmp)) {
     thread_data->error_info.setjmp = 0;
-    thread_data->td->xd.corrupted = 1;
+    thread_data->td->dcb.corrupted = 1;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(pbi->row_mt_mutex_);
 #endif
     frame_row_mt_info->row_mt_exit = 1;
+
+    // If any SB row (erroneous row) processed by a thread encounters an
+    // internal error, there is a need to indicate other threads that decoding
+    // of the erroneous row is complete. This ensures that other threads which
+    // wait upon the completion of SB's present in erroneous row are not waiting
+    // indefinitely.
+    signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd);
+
 #if CONFIG_MULTITHREAD
     pthread_cond_broadcast(pbi->row_mt_cond_);
     pthread_mutex_unlock(pbi->row_mt_mutex_);
@@ -3202,13 +3189,14 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
   }
   thread_data->error_info.setjmp = 1;
 
+  AV1_COMMON *cm = &pbi->common;
   allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
   allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
 
   set_decode_func_pointers(td, 0x1);
 
   assert(cm->tiles.cols > 0);
-  while (!td->xd.corrupted) {
+  while (!td->dcb.corrupted) {
     TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
     if (cur_job_info != NULL) {
@@ -3237,7 +3225,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
     }
   }
 
-  if (td->xd.corrupted) {
+  if (td->dcb.corrupted) {
     thread_data->error_info.setjmp = 0;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(pbi->row_mt_mutex_);
@@ -3277,13 +3265,12 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
     TileDataDec *tile_data =
         pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
     AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
-    TileInfo tile_info = tile_data->tile_info;
 
-    av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
-    av1_init_macroblockd(cm, &td->xd, NULL);
-    td->xd.error_info = &thread_data->error_info;
+    av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col);
+    av1_init_macroblockd(cm, &td->dcb.xd);
+    td->dcb.xd.error_info = &thread_data->error_info;
 
-    decode_tile_sb_row(pbi, td, tile_info, mi_row);
+    decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row);
 
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(pbi->row_mt_mutex_);
@@ -3294,7 +3281,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
 #endif
   }
   thread_data->error_info.setjmp = 0;
-  return !td->xd.corrupted;
+  return !td->dcb.corrupted;
 }
 
 // sorts in descending order
@@ -3360,6 +3347,8 @@ void av1_free_mc_tmp_buf(ThreadData *thread_data) {
 
   aom_free(thread_data->tmp_conv_dst);
   thread_data->tmp_conv_dst = NULL;
+  aom_free(thread_data->seg_mask);
+  thread_data->seg_mask = NULL;
   for (int i = 0; i < 2; ++i) {
     aom_free(thread_data->tmp_obmc_bufs[i]);
     thread_data->tmp_obmc_bufs[i] = NULL;
@@ -3370,13 +3359,20 @@ static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
                                            ThreadData *thread_data,
                                            int buf_size, int use_highbd) {
   for (int ref = 0; ref < 2; ref++) {
+    // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
+    // 'Conditional jump or move depends on uninitialised value' from the loop
+    // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
+    // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
+    // potential reason for this issue.
     if (use_highbd) {
       uint16_t *hbd_mc_buf;
       CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+      memset(hbd_mc_buf, 0, buf_size);
       thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
     } else {
       CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
                       (uint8_t *)aom_memalign(16, buf_size));
+      memset(thread_data->mc_buf[ref], 0, buf_size);
     }
   }
   thread_data->mc_buf_size = buf_size;
@@ -3385,6 +3381,10 @@ static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
   CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
                   aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
                                        sizeof(*thread_data->tmp_conv_dst)));
+  CHECK_MEM_ERROR(cm, thread_data->seg_mask,
+                  (uint8_t *)aom_memalign(
+                      16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask)));
+
   for (int i = 0; i < 2; ++i) {
     CHECK_MEM_ERROR(
         cm, thread_data->tmp_obmc_bufs[i],
@@ -3402,13 +3402,16 @@ static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi,
   for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
     AVxWorker *const worker = &pbi->tile_workers[worker_idx];
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
-    thread_data->td->xd = pbi->mb;
-    thread_data->td->xd.corrupted = 0;
-    thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
-    thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
-    thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+    thread_data->td->dcb = pbi->dcb;
+    thread_data->td->dcb.corrupted = 0;
+    thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
+    thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
+    thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+    if (worker_idx)
+      thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask;
     for (int j = 0; j < 2; ++j) {
-      thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+      thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
+          thread_data->td->tmp_obmc_bufs[j];
     }
     winterface->sync(worker);
 
@@ -3428,14 +3431,14 @@ static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi,
                                           int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
-  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+  for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) {
     AVxWorker *const worker = &pbi->tile_workers[worker_idx];
     DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
 
     thread_data->data_end = data_end;
 
     worker->had_error = 0;
-    if (worker_idx == num_workers - 1) {
+    if (worker_idx == 0) {
       winterface->execute(worker);
     } else {
       winterface->launch(worker);
@@ -3452,7 +3455,7 @@ static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
     aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
   }
 
-  pbi->mb.corrupted = corrupted;
+  pbi->dcb.corrupted = corrupted;
 }
 
 static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
@@ -3475,12 +3478,12 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
 
       winterface->init(worker);
       worker->thread_name = "aom tile worker";
-      if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+      if (worker_idx != 0 && !winterface->reset(worker)) {
+        aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
 
-      if (worker_idx < num_threads - 1) {
+      if (worker_idx != 0) {
         // Allocate thread data.
         CHECK_MEM_ERROR(cm, thread_data->td,
                         aom_memalign(32, sizeof(*thread_data->td)));
@@ -3493,9 +3496,9 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
       thread_data->error_info.setjmp = 0;
     }
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
-  for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
+  for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) {
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
     if (thread_data->td->mc_buf_size != buf_size) {
       av1_free_mc_tmp_buf(thread_data->td);
@@ -3585,6 +3588,10 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
     decoder_alloc_tile_data(pbi, n_tiles);
   }
+  if (pbi->dcb.xd.seg_mask == NULL)
+    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+                    (uint8_t *)aom_memalign(
+                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
@@ -3600,8 +3607,8 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
   launch_dec_workers(pbi, data_end, num_workers);
   sync_dec_workers(pbi, num_workers);
 
-  if (pbi->mb.corrupted)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+  if (pbi->dcb.corrupted)
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
   if (tiles->large_scale) {
@@ -3619,8 +3626,8 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
 
 static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
-  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
 
   if (pbi->cb_buffer_alloc_size < size) {
     av1_dec_free_cb_buf(pbi);
@@ -3657,17 +3664,17 @@ static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
 
       TileDataDec *const tile_data =
           pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
-      TileInfo tile_info = tile_data->tile_info;
+      const TileInfo *const tile_info = &tile_data->tile_info;
 
       tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
       tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
       tile_data->dec_row_mt_sync.num_threads_working = 0;
       tile_data->dec_row_mt_sync.mi_rows =
-          ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start,
-                             cm->seq_params.mib_size_log2);
+          ALIGN_POWER_OF_TWO(tile_info->mi_row_end - tile_info->mi_row_start,
+                             cm->seq_params->mib_size_log2);
       tile_data->dec_row_mt_sync.mi_cols =
-          ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start,
-                             cm->seq_params.mib_size_log2);
+          ALIGN_POWER_OF_TWO(tile_info->mi_col_end - tile_info->mi_col_start,
+                             cm->seq_params->mib_size_log2);
 
       frame_row_mt_info->mi_rows_to_decode +=
           tile_data->dec_row_mt_sync.mi_rows;
@@ -3771,6 +3778,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
     }
     decoder_alloc_tile_data(pbi, n_tiles);
   }
+  if (pbi->dcb.xd.seg_mask == NULL)
+    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+                    (uint8_t *)aom_memalign(
+                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
@@ -3778,8 +3789,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
       av1_tile_init(&tile_data->tile_info, cm, row, col);
 
       max_sb_rows = AOMMAX(max_sb_rows,
-                           av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
-      num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info);
+                           av1_get_sb_rows_in_tile(cm, &tile_data->tile_info));
+      num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info);
     }
   }
   num_workers = AOMMIN(num_workers, max_threads);
@@ -3805,8 +3816,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
   launch_dec_workers(pbi, data_end, num_workers);
   sync_dec_workers(pbi, num_workers);
 
-  if (pbi->mb.corrupted)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+  if (pbi->dcb.corrupted)
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
   if (tiles->large_scale) {
@@ -3824,7 +3835,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
 
 static AOM_INLINE void error_handler(void *data) {
   AV1_COMMON *const cm = (AV1_COMMON *)data;
-  aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+  aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
 // Reads the high_bitdepth and twelve_bit fields in color_config() and sets
@@ -3855,7 +3866,7 @@ static AOM_INLINE void read_bitdepth(
 void av1_read_film_grain_params(AV1_COMMON *cm,
                                 struct aom_read_bit_buffer *rb) {
   aom_film_grain_t *pars = &cm->film_grain_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
 
   pars->apply_grain = aom_rb_read_bit(rb);
   if (!pars->apply_grain) {
@@ -3885,7 +3896,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
       }
     }
     if (!found) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Invalid film grain reference idx %d. ref_frame_idx = "
                          "{%d, %d, %d, %d, %d, %d, %d}",
                          film_grain_params_ref_idx, cm->remapped_ref_idx[0],
@@ -3895,11 +3906,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
     }
     RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
     if (buf == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Invalid Film grain reference idx");
     }
     if (!buf->film_grain_params_present) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Film grain reference parameters not available");
     }
     uint16_t random_seed = pars->random_seed;
@@ -3911,13 +3922,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
   // Scaling functions parameters
   pars->num_y_points = aom_rb_read_literal(rb, 4);  // max 14
   if (pars->num_y_points > 14)
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+    aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                        "Number of points for film grain luma scaling function "
                        "exceeds the maximum value.");
   for (int i = 0; i < pars->num_y_points; i++) {
     pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
     if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "First coordinate of the scaling function points "
                          "shall be increasing.");
     pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
@@ -3936,14 +3947,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
   } else {
     pars->num_cb_points = aom_rb_read_literal(rb, 4);  // max 10
     if (pars->num_cb_points > 10)
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Number of points for film grain cb scaling function "
                          "exceeds the maximum value.");
     for (int i = 0; i < pars->num_cb_points; i++) {
       pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
       if (i &&
           pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "First coordinate of the scaling function points "
                            "shall be increasing.");
       pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
@@ -3951,14 +3962,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
 
     pars->num_cr_points = aom_rb_read_literal(rb, 4);  // max 10
     if (pars->num_cr_points > 10)
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Number of points for film grain cr scaling function "
                          "exceeds the maximum value.");
     for (int i = 0; i < pars->num_cr_points; i++) {
       pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
       if (i &&
           pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "First coordinate of the scaling function points "
                            "shall be increasing.");
       pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
@@ -3967,7 +3978,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
     if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
         (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
          ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "In YCbCr 4:2:0, film grain shall be applied "
                          "to both chroma components or neither.");
   }
@@ -4019,13 +4030,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
 
 static AOM_INLINE void read_film_grain(AV1_COMMON *cm,
                                        struct aom_read_bit_buffer *rb) {
-  if (cm->seq_params.film_grain_params_present &&
+  if (cm->seq_params->film_grain_params_present &&
       (cm->show_frame || cm->showable_frame)) {
     av1_read_film_grain_params(cm, rb);
   } else {
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
-  cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+  cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
   memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
          sizeof(aom_film_grain_t));
 }
@@ -4127,7 +4138,7 @@ void av1_read_timing_info_header(aom_timing_info_t *timing_info,
     if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
       aom_internal_error(
           error, AOM_CODEC_UNSUP_BITSTREAM,
-          "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
+          "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.");
     }
     timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
   }
@@ -4159,7 +4170,7 @@ void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
 static AOM_INLINE void read_temporal_point_info(
     AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) {
   cm->frame_presentation_time = aom_rb_read_unsigned_literal(
-      rb, cm->seq_params.decoder_model_info.frame_presentation_time_length);
+      rb, cm->seq_params->decoder_model_info.frame_presentation_time_length);
 }
 
 void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
@@ -4187,7 +4198,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
     seq_params->frame_id_length =
         aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
     if (seq_params->frame_id_length > 16)
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid frame_id_length");
   }
 
@@ -4437,15 +4448,18 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
 static int read_uncompressed_header(AV1Decoder *pbi,
                                     struct aom_read_bit_buffer *rb) {
   AV1_COMMON *const cm = &pbi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  aom_s_frame_info *sframe_info = &pbi->sframe_info;
+  sframe_info->is_s_frame = 0;
+  sframe_info->is_s_frame_at_altref = 0;
 
   if (!pbi->sequence_header_ready) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "No sequence header");
   }
 
@@ -4467,14 +4481,14 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     if (cm->show_existing_frame) {
       if (pbi->sequence_header_changed) {
         aom_internal_error(
-            &cm->error, AOM_CODEC_CORRUPT_FRAME,
+            &pbi->error, AOM_CODEC_CORRUPT_FRAME,
             "New sequence header starts with a show_existing_frame.");
       }
       // Show an existing frame directly.
       const int existing_frame_idx = aom_rb_read_literal(rb, 3);
       RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
       if (frame_to_show == NULL) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Buffer does not contain a decoded frame");
       }
       if (seq_params->decoder_model_info_present_flag &&
@@ -4488,7 +4502,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
          * referencing */
         if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
             pbi->valid_for_referencing[existing_frame_idx] == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Reference buffer frame ID mismatch");
       }
       lock_buffer_pool(pool);
@@ -4509,12 +4523,13 @@ static int read_uncompressed_header(AV1Decoder *pbi,
       cm->lf.filter_level[0] = 0;
       cm->lf.filter_level[1] = 0;
       cm->show_frame = 1;
+      current_frame->order_hint = frame_to_show->order_hint;
 
       // Section 6.8.2: It is a requirement of bitstream conformance that when
       // show_existing_frame is used to show a previous frame, that the value
       // of showable_frame for the previous frame was equal to 1.
       if (!frame_to_show->showable_frame) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Buffer does not contain a showable frame");
       }
       // Section 6.8.2: It is a requirement of bitstream conformance that when
@@ -4542,15 +4557,22 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         pbi->decoding_first_frame = 1;
         reset_frame_buffers(cm);
       } else {
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Sequence header has changed without a keyframe.");
       }
     }
 
     cm->show_frame = aom_rb_read_bit(rb);
+    if (cm->show_frame == 0) pbi->is_arf_frame_present = 1;
+    if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME)
+      pbi->is_fwd_kf_present = 1;
+    if (cm->current_frame.frame_type == S_FRAME) {
+      sframe_info->is_s_frame = 1;
+      sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1;
+    }
     if (seq_params->still_picture &&
         (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Still pictures must be coded as shown keyframes");
     }
     cm->showable_frame = current_frame->frame_type != KEY_FRAME;
@@ -4622,7 +4644,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         /* Check current_frame_id for conformance */
         if (prev_frame_id == cm->current_frame_id ||
             diff_frame_id >= (1 << (frame_id_length - 1))) {
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Invalid value of current_frame_id");
         }
       }
@@ -4645,7 +4667,9 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
     current_frame->order_hint = aom_rb_read_literal(
         rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
-    current_frame->frame_number = current_frame->order_hint;
+
+    if (seq_params->order_hint_info.enable_order_hint)
+      current_frame->frame_number = current_frame->order_hint;
 
     if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
       features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
@@ -4653,18 +4677,18 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   }
 
   if (seq_params->decoder_model_info_present_flag) {
-    cm->buffer_removal_time_present = aom_rb_read_bit(rb);
-    if (cm->buffer_removal_time_present) {
+    pbi->buffer_removal_time_present = aom_rb_read_bit(rb);
+    if (pbi->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
-          if ((((seq_params->operating_point_idc[op_num] >>
+          if (seq_params->operating_point_idc[op_num] == 0 ||
+              (((seq_params->operating_point_idc[op_num] >>
                  cm->temporal_layer_id) &
                 0x1) &&
                ((seq_params->operating_point_idc[op_num] >>
                  (cm->spatial_layer_id + 8)) &
-                0x1)) ||
-              seq_params->operating_point_idc[op_num] == 0) {
+                0x1))) {
             cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
                 rb, seq_params->decoder_model_info.buffer_removal_time_length);
           } else {
@@ -4694,7 +4718,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
       if (current_frame->refresh_frame_flags == 0xFF) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Intra only frames cannot have refresh flags 0xFF");
       }
       if (pbi->need_resync) {
@@ -4728,7 +4752,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
           // pixels set to neutral grey.
           int buf_idx = get_free_fb(cm);
           if (buf_idx == INVALID_IDX) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                                "Unable to find free frame buffer");
           }
           buf = &frame_bufs[buf_idx];
@@ -4738,10 +4762,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
                   seq_params->max_frame_height, seq_params->subsampling_x,
                   seq_params->subsampling_y, seq_params->use_highbitdepth,
                   AOM_BORDER_IN_PIXELS, features->byte_alignment,
-                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0,
+                  0)) {
             decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
           unlock_buffer_pool(pool);
@@ -4808,10 +4833,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         // reference to a slot that hasn't been set yet. That's what we are
         // checking here.
         if (lst_buf == NULL)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
         if (gld_buf == NULL)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
 
         av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
@@ -4829,7 +4854,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
           // reference to a slot that hasn't been set yet. That's what we are
           // checking here.
           if (cm->ref_frame_map[ref] == NULL)
-            aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                                "Inter frame requests nonexistent reference");
           cm->remapped_ref_idx[i] = ref;
         } else {
@@ -4837,7 +4862,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
         }
         // Check valid for referencing
         if (pbi->valid_for_referencing[ref] == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Reference frame not valid for referencing");
 
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
@@ -4853,7 +4878,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
           // Compare values derived from delta_frame_id_minus_1 and
           // refresh_frame_flags.
           if (ref_frame_id != cm->ref_frame_id[ref])
-            aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                                "Reference buffer frame ID mismatch");
         }
       }
@@ -4876,7 +4901,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     cm->prev_frame = get_primary_ref_frame_buf(cm);
     if (features->primary_ref_frame != PRIMARY_REF_NONE &&
         get_primary_ref_frame_buf(cm) == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Reference frame containing this frame's initial "
                          "frame context is unavailable.");
     }
@@ -4896,7 +4921,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
             ref_scale_factors, ref_buf->buf.y_crop_width,
             ref_buf->buf.y_crop_height, cm->width, cm->height);
         if ((!av1_is_valid_scale(ref_scale_factors)))
-          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "Reference frame has invalid dimensions");
       }
     }
@@ -4933,7 +4958,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   cm->cur_frame->buf.render_height = cm->render_height;
 
   if (pbi->need_resync) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Keyframe / intra-only frame required to reset decoder"
                        " state");
   }
@@ -4954,13 +4979,13 @@ static int read_uncompressed_header(AV1Decoder *pbi,
 
   read_tile_info(pbi, rb);
   if (!av1_is_min_tile_width_satisfied(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Minimum tile width requirement not satisfied");
   }
 
   CommonQuantParams *const quant_params = &cm->quant_params;
   setup_quantization(quant_params, av1_num_planes(cm),
-                     cm->seq_params.separate_uv_delta_q, rb);
+                     cm->seq_params->separate_uv_delta_q, rb);
   xd->bd = (int)seq_params->bit_depth;
 
   CommonContexts *const above_contexts = &cm->above_contexts;
@@ -4971,7 +4996,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
     if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
                                         cm->mi_params.mi_cols,
                                         av1_num_planes(cm))) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
     }
   }
@@ -4989,7 +5014,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   cm->delta_q_info.delta_q_present_flag =
       quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
   if (cm->delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = quant_params->base_qindex;
+    xd->current_base_qindex = quant_params->base_qindex;
     cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
     if (!features->allow_intrabc)
       cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
@@ -5051,7 +5076,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
   features->reduced_tx_set_used = aom_rb_read_bit(rb);
 
   if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Frame wrongly requests reference frame MVs");
   }
 
@@ -5104,16 +5129,20 @@ static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) {
 
 uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
                                             struct aom_read_bit_buffer *rb,
-                                            const uint8_t *data,
-                                            const uint8_t **p_data_end,
                                             int trailing_bits_present) {
   AV1_COMMON *const cm = &pbi->common;
   const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
 
 #if CONFIG_BITSTREAM_DEBUG
-  aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 +
-                                     cm->show_frame);
+  if (cm->seq_params->order_hint_info.enable_order_hint) {
+    aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 +
+                                       cm->show_frame);
+  } else {
+    // This is currently used in RTC encoding. cm->show_frame is always 1.
+    assert(cm->show_frame);
+    aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number);
+  }
 #endif
 #if CONFIG_MISMATCH_DEBUG
   mismatch_move_frame_idx_r();
@@ -5145,14 +5174,13 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
         xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
   }
 
+  // Showing a frame directly.
   if (cm->show_existing_frame) {
-    // showing a frame directly
-    *p_data_end = data + uncomp_hdr_size;
     if (pbi->reset_decoder_state) {
       // Use the default frame context values.
       *cm->fc = *cm->default_frame_context;
       if (!cm->fc->initialized)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Uninitialized entropy context.");
     }
     return uncomp_hdr_size;
@@ -5160,10 +5188,11 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
 
   cm->mi_params.setup_mi(&cm->mi_params);
 
-  av1_setup_motion_field(cm);
+  av1_calculate_ref_frame_side(cm);
+  if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
   if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
     // use the default frame context values
     *cm->fc = *cm->default_frame_context;
@@ -5171,10 +5200,10 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
     *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
   }
   if (!cm->fc->initialized)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Uninitialized entropy context.");
 
-  xd->corrupted = 0;
+  pbi->dcb.corrupted = 0;
   return uncomp_hdr_size;
 }
 
@@ -5187,7 +5216,8 @@ static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td);
@@ -5201,14 +5231,11 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
                                     int end_tile, int initialize_flag) {
   AV1_COMMON *const cm = &pbi->common;
   CommonTileParams *const tiles = &cm->tiles;
-  MACROBLOCKD *const xd = &pbi->mb;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
   const int tile_count_tg = end_tile - start_tile + 1;
 
   if (initialize_flag) setup_frame_info(pbi);
   const int num_planes = av1_num_planes(cm);
-#if CONFIG_LPF_MASK
-  av1_loop_filter_frame_init(cm, 0, num_planes);
-#endif
 
   if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) &&
       pbi->row_mt)
@@ -5222,48 +5249,49 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
 
   // If the bit stream is monochrome, set the U and V buffers to a constant.
   if (num_planes < 3) {
-    set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+    set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1);
   }
 
   if (end_tile != tiles->rows * tiles->cols - 1) {
     return;
   }
 
+  av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync,
+                         pbi->num_workers, 1);
+  av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers);
+
   if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) {
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
-      if (pbi->num_workers > 1) {
-        av1_loop_filter_frame_mt(
-            &cm->cur_frame->buf, cm, &pbi->mb, 0, num_planes, 0,
-#if CONFIG_LPF_MASK
-            1,
-#endif
-            pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
-      } else {
-        av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->mb,
-#if CONFIG_LPF_MASK
-                              1,
-#endif
-                              0, num_planes, 0);
-      }
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0,
+                               num_planes, 0, pbi->tile_workers,
+                               pbi->num_workers, &pbi->lf_row_sync, 0);
     }
 
-    const int do_loop_restoration =
-        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
     const int do_cdef =
         !pbi->skip_loop_filter && !cm->features.coded_lossless &&
         (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
          cm->cdef_info.cdef_uv_strengths[0]);
     const int do_superres = av1_superres_scaled(cm);
     const int optimized_loop_restoration = !do_cdef && !do_superres;
-
+    const int do_loop_restoration =
+        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
     if (!optimized_loop_restoration) {
       if (do_loop_restoration)
         av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
                                                  cm, 0);
 
-      if (do_cdef) av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->mb);
+      if (do_cdef) {
+        if (pbi->num_workers > 1) {
+          av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
+                            pbi->tile_workers, &pbi->cdef_sync,
+                            pbi->num_workers, av1_cdef_init_fb_row_mt);
+        } else {
+          av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
+                         av1_cdef_init_fb_row);
+        }
+      }
 
       superres_post_decode(pbi);
 
@@ -5298,18 +5326,15 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
       }
     }
   }
-#if CONFIG_LPF_MASK
-  av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
-#endif
 
-  if (!xd->corrupted) {
+  if (!pbi->dcb.corrupted) {
     if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
       assert(pbi->context_update_tile_id < pbi->allocated_tiles);
       *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
       av1_reset_cdf_symbol_counters(cm->fc);
     }
   } else {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Decode failed. Frame data is corrupted.");
   }
 
@@ -5323,4 +5348,8 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
   if (!tiles->large_scale) {
     cm->cur_frame->frame_context = *cm->fc;
   }
+
+  if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) {
+    ++cm->current_frame.frame_number;
+  }
 }
diff --git a/media/libaom/src/av1/decoder/decodeframe.h b/media/libaom/src/av1/decoder/decodeframe.h
index 95b3c9f22c..46ae475ff5 100644
--- a/media/libaom/src/av1/decoder/decodeframe.h
+++ b/media/libaom/src/av1/decoder/decodeframe.h
@@ -37,11 +37,8 @@ int av1_check_trailing_bits(struct AV1Decoder *pbi,
 
 // On success, returns the frame header size. On failure, calls
 // aom_internal_error and does not return.
-// TODO(wtc): Figure out and document the p_data_end parameter.
 uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
                                             struct aom_read_bit_buffer *rb,
-                                            const uint8_t *data,
-                                            const uint8_t **p_data_end,
                                             int trailing_bits_present);
 
 void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
diff --git a/media/libaom/src/av1/decoder/decodemv.c b/media/libaom/src/av1/decoder/decodemv.c
index e97cec42cb..d6743b3c09 100644
--- a/media/libaom/src/av1/decoder/decodemv.c
+++ b/media/libaom/src/av1/decoder/decodemv.c
@@ -37,7 +37,7 @@ static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
 }
 
 static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
-  const int skip = xd->mi[0]->skip;
+  const int skip_txfm = xd->mi[0]->skip_txfm;
   if (cm->features.coded_lossless) return;
   if (cm->features.allow_intrabc) {
     assert(cm->cdef_info.cdef_bits == 0);
@@ -46,7 +46,7 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
 
   // At the start of a superblock, mark that we haven't yet read CDEF strengths
   // for any of the CDEF units contained in this superblock.
-  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int sb_mask = (cm->seq_params->mib_size - 1);
   const int mi_row_in_sb = (xd->mi_row & sb_mask);
   const int mi_col_in_sb = (xd->mi_col & sb_mask);
   if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
@@ -61,12 +61,12 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
   const int index_mask = cdef_size;
   const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
-  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
 
   // Read CDEF strength from the first non-skip coding block in this CDEF unit.
-  if (!xd->cdef_transmitted[index] && !skip) {
+  if (!xd->cdef_transmitted[index] && !skip_txfm) {
     // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
     // of the 1st block in this CDEF unit.
     const int first_block_mask = ~(cdef_size - 1);
@@ -84,13 +84,13 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
 static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
                              aom_reader *r, MB_MODE_INFO *const mbmi) {
   int sign, abs, reduced_delta_qindex = 0;
-  BLOCK_SIZE bsize = mbmi->sb_type;
-  const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1);
-  const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1);
+  BLOCK_SIZE bsize = mbmi->bsize;
+  const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1);
+  const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1);
   const int read_delta_q_flag = (b_col == 0 && b_row == 0);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
       read_delta_q_flag) {
     abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_Q_SMALL);
@@ -116,12 +116,12 @@ static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
                               const MB_MODE_INFO *const mbmi, int mi_col,
                               int mi_row) {
   int reduced_delta_lflevel = 0;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int b_col = mi_col & (cm->seq_params.mib_size - 1);
-  const int b_row = mi_row & (cm->seq_params.mib_size - 1);
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int b_col = mi_col & (cm->seq_params->mib_size - 1);
+  const int b_row = mi_row & (cm->seq_params->mib_size - 1);
   const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
       read_delta_lf_flag) {
     int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_LF_SMALL);
@@ -193,13 +193,14 @@ static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r,
     return NEARMV;
 }
 
-static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
+static void read_drl_idx(FRAME_CONTEXT *ec_ctx, DecoderCodingBlock *dcb,
                          MB_MODE_INFO *mbmi, aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
     for (int idx = 0; idx < 2; ++idx) {
-      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx;
@@ -212,7 +213,7 @@ static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
     // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
     // mode is factored in.
     for (int idx = 1; idx < 3; ++idx) {
-      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
         mbmi->ref_mv_idx = idx + drl_idx - 1;
@@ -235,12 +236,11 @@ static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
 
   if (last_motion_mode_allowed == OBMC_CAUSAL) {
     motion_mode =
-        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
+        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR);
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   } else {
-    motion_mode =
-        aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
-                        MOTION_MODES, ACCT_STR);
+    motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
+                                  MOTION_MODES, ACCT_STR);
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   }
 }
@@ -367,8 +367,8 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
-  const int bw = mi_size_wide[mbmi->sb_type];
-  const int bh = mi_size_high[mbmi->sb_type];
+  const int bw = mi_size_wide[mbmi->bsize];
+  const int bh = mi_size_high[mbmi->bsize];
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
   const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
@@ -386,7 +386,7 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
   if (preskip) {
     if (!seg->segid_preskip) return 0;
   } else {
-    if (mbmi->skip) {
+    if (mbmi->skip_txfm) {
       if (seg->temporal_update) {
         mbmi->seg_id_predicted = 0;
       }
@@ -422,7 +422,7 @@ static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
     return 0;
   }
 
-  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0;
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0;
 
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
       segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
@@ -439,15 +439,16 @@ static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
   return skip_mode;
 }
 
-static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
-                     aom_reader *r) {
+static int read_skip_txfm(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int ctx = av1_get_skip_context(xd);
+    const int ctx = av1_get_skip_txfm_context(xd);
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR);
-    return skip;
+    const int skip_txfm =
+        aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR);
+    return skip_txfm;
   }
 }
 
@@ -563,7 +564,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                    aom_reader *r) {
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
@@ -578,7 +579,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
           aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
+      read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r);
     }
   }
   if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) {
@@ -590,7 +591,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
           aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r);
+      read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r);
     }
   }
 }
@@ -608,7 +609,7 @@ static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
 
   if (av1_filter_intra_allowed(cm, mbmi)) {
     filter_intra_mode_info->use_filter_intra = aom_read_symbol(
-        r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR);
+        r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR);
     if (filter_intra_mode_info->use_filter_intra) {
       filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
           r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
@@ -626,7 +627,8 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
   *tx_type = DCT_DCT;
 
   // No need to read transform type if block is skipped.
-  if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  if (mbmi->skip_txfm ||
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
     return;
 
   // No need to read transform type for lossless mode(qindex==0).
@@ -680,17 +682,18 @@ static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
   mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
   int valid = is_mv_valid(&mv->as_mv) &&
               av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
-                              cm->seq_params.mib_size_log2);
+                              cm->seq_params->mib_size_log2);
   return valid;
 }
 
-static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb,
                               aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
   if (mbmi->use_intrabc) {
-    BLOCK_SIZE bsize = mbmi->sb_type;
+    BLOCK_SIZE bsize = mbmi->bsize;
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = UV_DC_PRED;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
@@ -699,7 +702,7 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
     int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
     int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
 
-    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
+    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count,
                      xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
                      inter_mode_ctx);
 
@@ -708,7 +711,7 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
     av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
     int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
     if (dv_ref.as_int == 0)
-      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row);
+      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row);
     // Ref DV should not have sub-pel.
     int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
     dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
@@ -731,10 +734,10 @@ static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
 
   if (delta_q_info->delta_q_present_flag) {
     MB_MODE_INFO *const mbmi = xd->mi[0];
-    xd->current_qindex +=
+    xd->current_base_qindex +=
         read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
     /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
-    xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+    xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ);
     FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
     if (delta_q_info->delta_lf_present_flag) {
       const int mi_row = xd->mi_row;
@@ -764,11 +767,12 @@ static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
 }
 
 static void read_intra_frame_mode_info(AV1_COMMON *const cm,
-                                       MACROBLOCKD *const xd, aom_reader *r) {
+                                       DecoderCodingBlock *dcb, aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   struct segmentation *const seg = &cm->seg;
 
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -776,16 +780,16 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
   if (seg->segid_preskip)
     mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
 
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+  mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 
   if (!seg->segid_preskip)
-    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip);
+    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm);
 
   read_cdef(cm, r, xd);
 
   read_delta_q_params(cm, xd, r);
 
-  mbmi->current_qindex = xd->current_qindex;
+  mbmi->current_qindex = xd->current_base_qindex;
 
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
@@ -800,7 +804,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   if (av1_allow_intrabc(cm)) {
-    read_intrabc_info(cm, xd, r);
+    read_intrabc_info(cm, dcb, r);
     if (is_intrabc_block(mbmi)) return;
   }
 
@@ -812,7 +816,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
 
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -896,7 +900,7 @@ static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
 static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
                                                 const MACROBLOCKD *xd,
                                                 aom_reader *r) {
-  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE;
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE;
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = av1_get_reference_mode_context(xd);
     const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
@@ -1058,7 +1062,7 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm,
                                        MACROBLOCKD *const xd,
                                        MB_MODE_INFO *const mbmi,
                                        aom_reader *r) {
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int use_angle_delta = av1_use_angle_delta(bsize);
 
   mbmi->ref_frame[0] = INTRA_FRAME;
@@ -1072,7 +1076,7 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm,
       use_angle_delta && av1_is_directional_mode(mbmi->mode)
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -1111,7 +1115,7 @@ static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
                             aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  BLOCK_SIZE bsize = mbmi->sb_type;
+  BLOCK_SIZE bsize = mbmi->bsize;
   FeatureFlags *const features = &cm->features;
   if (features->cur_frame_force_integer_mv) {
     allow_hp = MV_SUBPEL_NONE;
@@ -1261,17 +1265,18 @@ static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row,
 #endif  // DEC_MISMATCH_DEBUG
 
 static void read_inter_block_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd,
+                                       DecoderCodingBlock *dcb,
                                        MB_MODE_INFO *const mbmi,
                                        aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
   FeatureFlags *const features = &cm->features;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int allow_hp = features->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
   int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
   int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
   int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  MACROBLOCKD *const xd = &dcb->xd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   mbmi->uv_mode = UV_DC_PRED;
@@ -1284,7 +1289,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   const int is_compound = has_second_ref(mbmi);
 
   const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
-  av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack,
                    xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
 
   mbmi->ref_mv_idx = 0;
@@ -1305,7 +1310,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
         mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
       if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
           have_nearmv_in_inter_mode(mbmi->mode))
-        read_drl_idx(ec_ctx, xd, mbmi, r);
+        read_drl_idx(ec_ctx, dcb, mbmi, r);
     }
   }
 
@@ -1357,7 +1362,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
       ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
   } else {
     if (mbmi->mode == NEWMV) {
-      if (xd->ref_mv_count[ref_frame] > 1)
+      if (dcb->ref_mv_count[ref_frame] > 1)
         ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
     }
   }
@@ -1367,10 +1372,10 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   const int mv_corrupted_flag =
       !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
                  nearestmv, nearmv, is_compound, allow_hp, r);
-  aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
+  aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
 
   mbmi->use_wedge_interintra = 0;
-  if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
+  if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode &&
       is_interintra_allowed(mbmi)) {
     const int bsize_group = size_group_lookup[bsize];
     const int interintra =
@@ -1401,7 +1406,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   }
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-  if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
+  if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode &&
       !has_second_ref(mbmi)) {
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
   }
@@ -1418,7 +1423,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   if (has_second_ref(mbmi) && !mbmi->skip_mode) {
     // Read idx to indicate current compound inter prediction mode group
     const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                     cm->seq_params.enable_masked_compound;
+                                     cm->seq_params->enable_masked_compound;
 
     if (masked_compound_used) {
       const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
@@ -1427,7 +1432,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
     }
 
     if (mbmi->comp_group_idx == 0) {
-      if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
+      if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
         const int comp_index_ctx = get_comp_index_context(cm, xd);
         mbmi->compound_idx = (uint8_t)aom_read_symbol(
             r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
@@ -1468,12 +1473,11 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
   }
 
   read_mb_interp_filter(xd, features->interp_filter,
-                        cm->seq_params.enable_dual_filter, mbmi, r);
-
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
+                        cm->seq_params->enable_dual_filter, mbmi, r);
 
   if (mbmi->motion_mode == WARPED_CAUSAL) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
     mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
     mbmi->wm_params.invalid = 0;
 
@@ -1500,8 +1504,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
 }
 
 static void read_inter_frame_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd, aom_reader *r) {
+                                       DecoderCodingBlock *dcb, aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int inter_block = 1;
 
@@ -1512,9 +1517,9 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
   mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
 
   if (mbmi->skip_mode)
-    mbmi->skip = 1;
+    mbmi->skip_txfm = 1;
   else
-    mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+    mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 
   if (!cm->seg.segid_preskip)
     mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
@@ -1526,7 +1531,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
   if (!mbmi->skip_mode)
     inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 
-  mbmi->current_qindex = xd->current_qindex;
+  mbmi->current_qindex = xd->current_base_qindex;
 
   xd->above_txfm_context =
       cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
@@ -1534,7 +1539,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
       xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, mbmi, r);
+    read_inter_block_mode_info(pbi, dcb, mbmi, r);
   else
     read_intra_block_mode_info(cm, xd, mbmi, r);
 }
@@ -1557,19 +1562,20 @@ static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
   }
 }
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
-                        int x_mis, int y_mis) {
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis) {
   AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mi = xd->mi[0];
   mi->use_intrabc = 0;
 
   if (frame_is_intra_only(cm)) {
-    read_intra_frame_mode_info(cm, xd, r);
-    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+    read_intra_frame_mode_info(cm, dcb, r);
+    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
       intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd, r);
-    if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs)
+    read_inter_frame_mode_info(pbi, dcb, r);
+    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
       av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
   }
 }
diff --git a/media/libaom/src/av1/decoder/decodemv.h b/media/libaom/src/av1/decoder/decodemv.h
index 289e66ae1e..3d8629c9a5 100644
--- a/media/libaom/src/av1/decoder/decodemv.h
+++ b/media/libaom/src/av1/decoder/decodemv.h
@@ -20,8 +20,8 @@
 extern "C" {
 #endif
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r,
-                        int x_mis, int y_mis);
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/decoder/decoder.c b/media/libaom/src/av1/decoder/decoder.c
index fc5f2cd20d..2553ffb79f 100644
--- a/media/libaom/src/av1/decoder/decoder.c
+++ b/media/libaom/src/av1/decoder/decoder.c
@@ -19,8 +19,6 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_ports/aom_once.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_util/aom_thread.h"
@@ -46,7 +44,8 @@ static void initialize_dec(void) {
 }
 
 static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width,
-                          int height) {
+                          int height, BLOCK_SIZE min_partition_size) {
+  (void)min_partition_size;
   // Ensure that the decoded width and height are both multiples of
   // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
   // subsampling is used).
@@ -68,10 +67,6 @@ static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width,
 
   assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
          mi_size_high[mi_params->mi_alloc_bsize]);
-
-#if CONFIG_LPF_MASK
-  av1_alloc_loop_filter_mask(mi_params);
-#endif
 }
 
 static void dec_setup_mi(CommonModeInfoParams *mi_params) {
@@ -97,17 +92,19 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   av1_zero(*pbi);
 
   AV1_COMMON *volatile const cm = &pbi->common;
+  cm->seq_params = &pbi->seq_params;
+  cm->error = &pbi->error;
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
-    cm->error.setjmp = 0;
+  if (setjmp(pbi->error.jmp)) {
+    pbi->error.setjmp = 0;
     av1_decoder_remove(pbi);
     return NULL;
   }
 
-  cm->error.setjmp = 1;
+  pbi->error.setjmp = 1;
 
   CHECK_MEM_ERROR(cm, cm->fc,
                   (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
@@ -118,7 +115,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
 
   pbi->need_resync = 1;
-  aom_once(initialize_dec);
+  initialize_dec();
 
   // Initialize the references to not point to any frame buffers.
   for (int i = 0; i < REF_FRAMES; i++) {
@@ -129,7 +126,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   pbi->decoding_first_frame = 1;
   pbi->common.buffer_pool = pool;
 
-  cm->seq_params.bit_depth = AOM_BITS_8;
+  cm->seq_params->bit_depth = AOM_BITS_8;
 
   cm->mi_params.free_mi = dec_free_mi;
   cm->mi_params.setup_mi = dec_setup_mi;
@@ -139,12 +136,13 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
 
   av1_qm_init(&cm->quant_params, av1_num_planes(cm));
   av1_loop_restoration_precal();
+
 #if CONFIG_ACCOUNTING
   pbi->acct_enabled = 1;
   aom_accounting_init(&pbi->accounting);
 #endif
 
-  cm->error.setjmp = 0;
+  pbi->error.setjmp = 0;
 
   aom_get_worker_interface()->init(&pbi->lf_worker);
   pbi->lf_worker.thread_name = "aom lf worker";
@@ -185,13 +183,14 @@ void av1_decoder_remove(AV1Decoder *pbi) {
   aom_free(pbi->lf_worker.data1);
 
   if (pbi->thread_data) {
-    for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
+    for (int worker_idx = 1; worker_idx < pbi->max_threads; worker_idx++) {
       DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
       av1_free_mc_tmp_buf(thread_data->td);
       aom_free(thread_data->td);
     }
     aom_free(pbi->thread_data);
   }
+  aom_free(pbi->dcb.xd.seg_mask);
 
   for (i = 0; i < pbi->num_workers; ++i) {
     AVxWorker *const worker = &pbi->tile_workers[i];
@@ -257,16 +256,16 @@ aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
 
   const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx);
   if (cfg == NULL) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame");
     return AOM_CODEC_ERROR;
   }
   if (!equal_dimensions(cfg, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(cfg, sd, num_planes);
 
-  return cm->error.error_code;
+  return pbi->error.error_code;
 }
 
 static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
@@ -289,13 +288,13 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
   ref_buf = get_ref_frame(cm, idx);
 
   if (ref_buf == NULL) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame");
     return AOM_CODEC_ERROR;
   }
 
   if (!use_external_ref) {
     if (!equal_dimensions(ref_buf, sd)) {
-      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_ERROR,
                          "Incorrect buffer dimensions");
     } else {
       // Overwrite the reference frame buffer.
@@ -303,7 +302,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
     }
   } else {
     if (!equal_dimensions_and_border(ref_buf, sd)) {
-      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_ERROR,
                          "Incorrect buffer dimensions");
     } else {
       // Overwrite the reference frame buffer pointers.
@@ -319,7 +318,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
     }
   }
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
@@ -328,12 +327,12 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
   const int num_planes = av1_num_planes(cm);
 
   if (!equal_dimensions_and_border(new_frame, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(new_frame, sd, num_planes);
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 static void release_current_frame(AV1Decoder *pbi) {
@@ -351,7 +350,7 @@ static void release_current_frame(AV1Decoder *pbi) {
 // Consumes a reference to cm->cur_frame.
 //
 // This functions returns void. It reports failure by setting
-// cm->error.error_code.
+// pbi->error.error_code.
 static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
   int ref_index = 0, mask;
   AV1_COMMON *const cm = &pbi->common;
@@ -384,7 +383,7 @@ static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
           // error
           cm->cur_frame->buf.corrupted = 1;
           decrease_ref_count(cm->cur_frame, pool);
-          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         } else {
           pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
           pbi->num_output_frames++;
@@ -423,8 +422,8 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
                                 const uint8_t **psource) {
   AV1_COMMON *volatile const cm = &pbi->common;
   const uint8_t *source = *psource;
-  cm->error.error_code = AOM_CODEC_OK;
-  cm->error.has_detail = 0;
+  pbi->error.error_code = AOM_CODEC_OK;
+  pbi->error.has_detail = 0;
 
   if (size == 0) {
     // This is used to signal that we are missing frames.
@@ -440,18 +439,18 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   }
 
   if (assign_cur_frame_new_fb(cm) == NULL) {
-    cm->error.error_code = AOM_CODEC_MEM_ERROR;
+    pbi->error.error_code = AOM_CODEC_MEM_ERROR;
     return 1;
   }
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
+  if (setjmp(pbi->error.jmp)) {
     const AVxWorkerInterface *const winterface = aom_get_worker_interface();
     int i;
 
-    cm->error.setjmp = 0;
+    pbi->error.setjmp = 0;
 
     // Synchronize all threads immediately as a subsequent decode call may
     // cause a resize invalidating some allocations.
@@ -461,19 +460,18 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
     }
 
     release_current_frame(pbi);
-    aom_clear_system_state();
     return -1;
   }
 
-  cm->error.setjmp = 1;
+  pbi->error.setjmp = 1;
 
   int frame_decoded =
       aom_decode_frame_from_obus(pbi, source, source + size, psource);
 
   if (frame_decoded < 0) {
-    assert(cm->error.error_code != AOM_CODEC_OK);
+    assert(pbi->error.error_code != AOM_CODEC_OK);
     release_current_frame(pbi);
-    cm->error.setjmp = 0;
+    pbi->error.setjmp = 0;
     return 1;
   }
 
@@ -494,13 +492,11 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
     pbi->decoding_first_frame = 0;
   }
 
-  if (cm->error.error_code != AOM_CODEC_OK) {
-    cm->error.setjmp = 0;
+  if (pbi->error.error_code != AOM_CODEC_OK) {
+    pbi->error.setjmp = 0;
     return 1;
   }
 
-  aom_clear_system_state();
-
   if (!cm->show_existing_frame) {
     if (cm->seg.enabled) {
       if (cm->prev_frame &&
@@ -514,7 +510,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   }
 
   // Update progress in frame parallel decode.
-  cm->error.setjmp = 0;
+  pbi->error.setjmp = 0;
 
   return 0;
 }
@@ -525,12 +521,11 @@ int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
   if (index >= pbi->num_output_frames) return -1;
   *sd = &pbi->output_frames[index]->buf;
   *grain_params = &pbi->output_frames[index]->film_grain_params;
-  aom_clear_system_state();
   return 0;
 }
 
 // Get the highest-spatial-layer output
-// TODO(david.barker): What should this do?
+// TODO(rachelbarker): What should this do?
 int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
   if (pbi->num_output_frames == 0) return -1;
 
diff --git a/media/libaom/src/av1/decoder/decoder.h b/media/libaom/src/av1/decoder/decoder.h
index 4580de2ac7..226b9dca85 100644
--- a/media/libaom/src/av1/decoder/decoder.h
+++ b/media/libaom/src/av1/decoder/decoder.h
@@ -33,25 +33,90 @@
 extern "C" {
 #endif
 
+/*!
+ * \brief Contains coding block data required by the decoder.
+ *
+ * This includes:
+ * - Coding block info that is common between encoder and decoder.
+ * - Other coding block info only needed by the decoder.
+ * Contrast this with a similar struct MACROBLOCK on encoder side.
+ * This data is also common between ThreadData and AV1Decoder structs.
+ */
+typedef struct DecoderCodingBlock {
+  /*!
+   * Coding block info that is common between encoder and decoder.
+   */
+  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+  /*!
+   * True if the at least one of the coding blocks decoded was corrupted.
+   */
+  int corrupted;
+  /*!
+   * Pointer to 'mc_buf' inside 'pbi->td' (single-threaded decoding) or
+   * 'pbi->thread_data[i].td' (multi-threaded decoding).
+   */
+  uint8_t *mc_buf[2];
+  /*!
+   * Pointer to 'dqcoeff' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+   * with appropriate offset for the current superblock, for each plane.
+   */
+  tran_low_t *dqcoeff_block[MAX_MB_PLANE];
+  /*!
+   * cb_offset[p] is the offset into the dqcoeff_block[p] for the current coding
+   * block, for each plane 'p'.
+   */
+  uint16_t cb_offset[MAX_MB_PLANE];
+  /*!
+   * Pointer to 'eob_data' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+   * with appropriate offset for the current superblock, for each plane.
+   */
+  eob_info *eob_data[MAX_MB_PLANE];
+  /*!
+   * txb_offset[p] is the offset into the eob_data[p] for the current coding
+   * block, for each plane 'p'.
+   */
+  uint16_t txb_offset[MAX_MB_PLANE];
+  /*!
+   * ref_mv_count[i] specifies the number of number of motion vector candidates
+   * in xd->ref_mv_stack[i].
+   */
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+} DecoderCodingBlock;
+
+/*!\cond */
+
 typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
-                                          MACROBLOCKD *const xd,
+                                          DecoderCodingBlock *dcb,
                                           aom_reader *const r, const int plane,
                                           const int row, const int col,
                                           const TX_SIZE tx_size);
 
 typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
-                                                 MACROBLOCKD *const xd,
+                                                 DecoderCodingBlock *dcb,
                                                  BLOCK_SIZE bsize);
 
 typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
                                                    MACROBLOCKD *const xd);
 
 typedef struct ThreadData {
-  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+  DecoderCodingBlock dcb;
+
+  // Coding block buffer for the current superblock.
+  // Used only for single-threaded decoding and multi-threaded decoding with
+  // row_mt == 1 cases.
+  // See also: similar buffer in 'AV1Decoder'.
   CB_BUFFER cb_buffer_base;
+
   aom_reader *bit_reader;
+
+  // Motion compensation buffer used to get a prediction buffer with extended
+  // borders. One buffer for each of the two possible references.
   uint8_t *mc_buf[2];
+  // Mask for this block used for compound prediction.
+  uint8_t *seg_mask;
+  // Allocated size of 'mc_buf'.
   int32_t mc_buf_size;
+  // If true, the pointers in 'mc_buf' were converted from highbd pointers.
   int mc_buf_use_highbd;  // Boolean: whether the byte pointers stored in
                           // mc_buf were converted from highbd pointers.
 
@@ -156,7 +221,7 @@ typedef struct AV1DecTileMTData {
 } AV1DecTileMT;
 
 typedef struct AV1Decoder {
-  DECLARE_ALIGNED(32, MACROBLOCKD, mb);
+  DecoderCodingBlock dcb;
 
   DECLARE_ALIGNED(32, AV1_COMMON, common);
 
@@ -164,6 +229,8 @@ typedef struct AV1Decoder {
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
   AV1LrStruct lr_ctxt;
+  AV1CdefSync cdef_sync;
+  AV1CdefWorkerData *cdef_worker;
   AVxWorker *tile_workers;
   int num_workers;
   DecWorkerData *thread_data;
@@ -228,11 +295,24 @@ typedef struct AV1Decoder {
   int tile_count_minus_1;
   uint32_t coded_tile_data_size;
   unsigned int ext_tile_debug;  // for ext-tile software debug & testing
+
+  // Decoder has 3 modes of operation:
+  // (1) Single-threaded decoding.
+  // (2) Multi-threaded decoding with each tile decoded in parallel.
+  // (3) In addition to (2), each thread decodes 1 superblock row in parallel.
+  // row_mt = 1 triggers mode (3) above, while row_mt = 0, will trigger mode (1)
+  // or (2) depending on 'max_threads'.
   unsigned int row_mt;
+
   EXTERNAL_REFERENCES ext_refs;
   YV12_BUFFER_CONFIG tile_list_outbuf;
 
+  // Coding block buffer for the current frame.
+  // Allocated and used only for multi-threaded decoding with 'row_mt == 0'.
+  // See also: similar buffer in 'ThreadData' struct.
   CB_BUFFER *cb_buffer_base;
+  // Allocated size of 'cb_buffer_base'. Currently same as the number of
+  // superblocks in the coded frame.
   int cb_buffer_alloc_size;
 
   int allocated_row_mt_sync_rows;
@@ -250,6 +330,36 @@ typedef struct AV1Decoder {
   int skip_film_grain;
   int is_annexb;
   int valid_for_referencing[REF_FRAMES];
+  int is_fwd_kf_present;
+  int is_arf_frame_present;
+  int num_tile_groups;
+  aom_s_frame_info sframe_info;
+
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
+  SequenceHeader seq_params;
+
+  /*!
+   * If true, buffer removal times are present.
+   */
+  bool buffer_removal_time_present;
+
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info error;
+
+  /*!
+   * Number of temporal layers: may be > 1 for SVC (scalable vector coding).
+   */
+  unsigned int number_temporal_layers;
+
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+   */
+  unsigned int number_spatial_layers;
 } AV1Decoder;
 
 // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
@@ -324,6 +434,8 @@ typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
                                    int mi_row, int mi_col, aom_reader *r,
                                    PARTITION_TYPE partition, BLOCK_SIZE bsize);
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/decoder/decodetxb.c b/media/libaom/src/av1/decoder/decodetxb.c
index 541f4c9846..0ec1487625 100644
--- a/media/libaom/src/av1/decoder/decodetxb.c
+++ b/media/libaom/src/av1/decoder/decodetxb.c
@@ -107,11 +107,12 @@ static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size,
   }
 }
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb,
                             aom_reader *const r, const int blk_row,
                             const int blk_col, const int plane,
                             const TXB_CTX *const txb_ctx,
                             const TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &dcb->xd;
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   const int32_t max_value = (1 << (7 + xd->bd)) - 1;
   const int32_t min_value = -(1 << (7 + xd->bd));
@@ -120,7 +121,7 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
-  tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane];
+  tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
   const int shift = av1_get_tx_scale(tx_size);
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
@@ -131,7 +132,7 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
   uint8_t *const levels = set_levels(levels_buf, width);
   const int all_zero = aom_read_symbol(
       r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
-  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
   uint16_t *const eob = &(eob_data->eob);
   uint16_t *const max_scan_line = &(eob_data->max_scan_line);
   *max_scan_line = 0;
@@ -140,7 +141,7 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
 #if CONFIG_INSPECTION
   if (plane == 0) {
     const int txk_type_idx =
-        av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+        av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col);
     mbmi->tx_skip[txk_type_idx] = all_zero;
   }
 #endif
@@ -321,17 +322,18 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
 }
 
 void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                MACROBLOCKD *const xd, aom_reader *const r,
+                                DecoderCodingBlock *dcb, aom_reader *const r,
                                 const int plane, const int row, const int col,
                                 const TX_SIZE tx_size) {
 #if TXCOEFF_TIMER
   struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 #endif
+  MACROBLOCKD *const xd = &dcb->xd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
 
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
@@ -340,7 +342,7 @@ void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
               pd->left_entropy_context + row, &txb_ctx);
   const uint8_t cul_level =
-      av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
+      av1_read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size);
   av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
                            row);
 
diff --git a/media/libaom/src/av1/decoder/decodetxb.h b/media/libaom/src/av1/decoder/decodetxb.h
index 39bf0bf78f..fd34d40341 100644
--- a/media/libaom/src/av1/decoder/decodetxb.h
+++ b/media/libaom/src/av1/decoder/decodetxb.h
@@ -12,21 +12,23 @@
 #ifndef AOM_AV1_DECODER_DECODETXB_H_
 #define AOM_AV1_DECODER_DECODETXB_H_
 
-#include "config/aom_config.h"
+#include "av1/common/enums.h"
 
-#include "av1/common/av1_common_int.h"
-#include "av1/common/blockd.h"
-#include "av1/common/txb_common.h"
-#include "aom_dsp/bitreader.h"
+struct aom_reader;
+struct AV1Common;
+struct DecoderCodingBlock;
+struct txb_ctx;
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                            aom_reader *const r, const int blk_row,
+uint8_t av1_read_coeffs_txb(const struct AV1Common *const cm,
+                            struct DecoderCodingBlock *dcb,
+                            struct aom_reader *const r, const int blk_row,
                             const int blk_col, const int plane,
-                            const TXB_CTX *const txb_ctx,
+                            const struct txb_ctx *const txb_ctx,
                             const TX_SIZE tx_size);
 
-void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
-                                MACROBLOCKD *const xd, aom_reader *const r,
-                                const int plane, const int row, const int col,
+void av1_read_coeffs_txb_facade(const struct AV1Common *const cm,
+                                struct DecoderCodingBlock *dcb,
+                                struct aom_reader *const r, const int plane,
+                                const int row, const int col,
                                 const TX_SIZE tx_size);
 #endif  // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/media/libaom/src/av1/decoder/detokenize.c b/media/libaom/src/av1/decoder/detokenize.c
index 9d54bd13dd..3c6a006eaf 100644
--- a/media/libaom/src/av1/decoder/detokenize.c
+++ b/media/libaom/src/av1/decoder/detokenize.c
@@ -72,7 +72,7 @@ void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
                          : xd->tile_ctx->palette_y_color_index_cdf;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   params.n_colors = mbmi->palette_mode_info.palette_size[plane];
-  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
                            &params.plane_height, &params.rows, &params.cols);
   decode_color_map_tokens(&params, r);
 }
diff --git a/media/libaom/src/aom_dsp/grain_synthesis.c b/media/libaom/src/av1/decoder/grain_synthesis.c
index 626eb76af0..d276f6f90e 100644
--- a/media/libaom/src/aom_dsp/grain_synthesis.c
+++ b/media/libaom/src/av1/decoder/grain_synthesis.c
@@ -14,12 +14,14 @@
  *
  */
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <assert.h>
-#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
+#include "av1/decoder/grain_synthesis.h"
 
 // Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
 // with zero mean and standard deviation of about 512.
@@ -237,7 +239,61 @@ static int grain_max;
 
 static uint16_t random_register = 0;  // random number generator register
 
-static void init_arrays(const aom_film_grain_t *params, int luma_stride,
+static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
+                           int ***pred_pos_chroma, int **luma_grain_block,
+                           int **cb_grain_block, int **cr_grain_block,
+                           int **y_line_buf, int **cb_line_buf,
+                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
+                           int **cr_col_buf) {
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (params->num_y_points > 0) ++num_pos_chroma;
+
+  if (*pred_pos_luma) {
+    for (int row = 0; row < num_pos_luma; row++) {
+      aom_free((*pred_pos_luma)[row]);
+    }
+    aom_free(*pred_pos_luma);
+    *pred_pos_luma = NULL;
+  }
+
+  if (*pred_pos_chroma) {
+    for (int row = 0; row < num_pos_chroma; row++) {
+      aom_free((*pred_pos_chroma)[row]);
+    }
+    aom_free(*pred_pos_chroma);
+    *pred_pos_chroma = NULL;
+  }
+
+  aom_free(*y_line_buf);
+  *y_line_buf = NULL;
+
+  aom_free(*cb_line_buf);
+  *cb_line_buf = NULL;
+
+  aom_free(*cr_line_buf);
+  *cr_line_buf = NULL;
+
+  aom_free(*y_col_buf);
+  *y_col_buf = NULL;
+
+  aom_free(*cb_col_buf);
+  *cb_col_buf = NULL;
+
+  aom_free(*cr_col_buf);
+  *cr_col_buf = NULL;
+
+  aom_free(*luma_grain_block);
+  *luma_grain_block = NULL;
+
+  aom_free(*cb_grain_block);
+  *cb_grain_block = NULL;
+
+  aom_free(*cr_grain_block);
+  *cr_grain_block = NULL;
+}
+
+static bool init_arrays(const aom_film_grain_t *params, int luma_stride,
                         int chroma_stride, int ***pred_pos_luma_p,
                         int ***pred_pos_chroma_p, int **luma_grain_block,
                         int **cb_grain_block, int **cr_grain_block,
@@ -245,6 +301,18 @@ static void init_arrays(const aom_film_grain_t *params, int luma_stride,
                         int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
                         int luma_grain_samples, int chroma_grain_samples,
                         int chroma_subsamp_y, int chroma_subsamp_x) {
+  *pred_pos_luma_p = NULL;
+  *pred_pos_chroma_p = NULL;
+  *luma_grain_block = NULL;
+  *cb_grain_block = NULL;
+  *cr_grain_block = NULL;
+  *y_line_buf = NULL;
+  *cb_line_buf = NULL;
+  *cr_line_buf = NULL;
+  *y_col_buf = NULL;
+  *cb_col_buf = NULL;
+  *cr_col_buf = NULL;
+
   memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
   memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
   memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
@@ -256,17 +324,38 @@ static void init_arrays(const aom_film_grain_t *params, int luma_stride,
   int **pred_pos_luma;
   int **pred_pos_chroma;
 
-  pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
+  pred_pos_luma = (int **)aom_calloc(num_pos_luma, sizeof(*pred_pos_luma));
+  if (!pred_pos_luma) return false;
 
   for (int row = 0; row < num_pos_luma; row++) {
     pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
+    if (!pred_pos_luma[row]) {
+      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
+                     luma_grain_block, cb_grain_block, cr_grain_block,
+                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
+                     cb_col_buf, cr_col_buf);
+      return false;
+    }
   }
 
   pred_pos_chroma =
-      (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
+      (int **)aom_calloc(num_pos_chroma, sizeof(*pred_pos_chroma));
+  if (!pred_pos_chroma) {
+    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
+                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
+                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
+    return false;
+  }
 
   for (int row = 0; row < num_pos_chroma; row++) {
     pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
+    if (!pred_pos_chroma[row]) {
+      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
+                     luma_grain_block, cb_grain_block, cr_grain_block,
+                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
+                     cb_col_buf, cr_col_buf);
+      return false;
+    }
   }
 
   int pos_ar_index = 0;
@@ -329,45 +418,15 @@ static void init_arrays(const aom_film_grain_t *params, int luma_stride,
       (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
   *cr_grain_block =
       (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
-}
-
-static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
-                           int ***pred_pos_chroma, int **luma_grain_block,
-                           int **cb_grain_block, int **cr_grain_block,
-                           int **y_line_buf, int **cb_line_buf,
-                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
-                           int **cr_col_buf) {
-  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
-  int num_pos_chroma = num_pos_luma;
-  if (params->num_y_points > 0) ++num_pos_chroma;
-
-  for (int row = 0; row < num_pos_luma; row++) {
-    aom_free((*pred_pos_luma)[row]);
+  if (!(*pred_pos_luma_p && *pred_pos_chroma_p && *y_line_buf && *cb_line_buf &&
+        *cr_line_buf && *y_col_buf && *cb_col_buf && *cr_col_buf &&
+        *luma_grain_block && *cb_grain_block && *cr_grain_block)) {
+    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
+                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
+                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
+    return false;
   }
-  aom_free(*pred_pos_luma);
-
-  for (int row = 0; row < num_pos_chroma; row++) {
-    aom_free((*pred_pos_chroma)[row]);
-  }
-  aom_free((*pred_pos_chroma));
-
-  aom_free(*y_line_buf);
-
-  aom_free(*cb_line_buf);
-
-  aom_free(*cr_line_buf);
-
-  aom_free(*y_col_buf);
-
-  aom_free(*cb_col_buf);
-
-  aom_free(*cr_col_buf);
-
-  aom_free(*luma_grain_block);
-
-  aom_free(*cb_grain_block);
-
-  aom_free(*cr_grain_block);
+  return true;
 }
 
 // get a number between 0 and 2^bits - 1
@@ -395,15 +454,14 @@ static void init_random_generator(int luma_line, uint16_t seed) {
   random_register ^= ((luma_num * 173 + 105) & 255);
 }
 
-// Return 0 for success, -1 for failure
-static int generate_luma_grain_block(
+static void generate_luma_grain_block(
     const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
     int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
     int left_pad, int top_pad, int right_pad, int bottom_pad) {
   if (params->num_y_points == 0) {
     memset(luma_grain_block, 0,
            sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
-    return 0;
+    return;
   }
 
   int bit_depth = params->bit_depth;
@@ -433,17 +491,14 @@ static int generate_luma_grain_block(
                     ((wsum + rounding_offset) >> params->ar_coeff_shift),
                 grain_min, grain_max);
     }
-  return 0;
 }
 
-// Return 0 for success, -1 for failure
-static int generate_chroma_grain_blocks(
-    const aom_film_grain_t *params,
-    //                                  int** pred_pos_luma,
-    int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
-    int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
-    int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad,
-    int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
+static bool generate_chroma_grain_blocks(
+    const aom_film_grain_t *params, int **pred_pos_chroma,
+    int *luma_grain_block, int *cb_grain_block, int *cr_grain_block,
+    int luma_grain_stride, int chroma_block_size_y, int chroma_block_size_x,
+    int chroma_grain_stride, int left_pad, int top_pad, int right_pad,
+    int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
   int bit_depth = params->bit_depth;
   int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
 
@@ -516,7 +571,7 @@ static int generate_chroma_grain_blocks(
               stderr,
               "Grain synthesis: prediction between two chroma components is "
               "not supported!");
-          return -1;
+          return false;
         }
       }
       if (params->num_cb_points || params->chroma_scaling_from_luma)
@@ -530,7 +585,7 @@ static int generate_chroma_grain_blocks(
                       ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
                   grain_min, grain_max);
     }
-  return 0;
+  return true;
 }
 
 static void init_scaling_function(const int scaling_points[][2], int num_points,
@@ -1080,27 +1135,25 @@ int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
   grain_min = 0 - grain_center;
   grain_max = grain_center - 1;
 
-  init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
-              &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
-              &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
-              &y_col_buf, &cb_col_buf, &cr_col_buf,
-              luma_block_size_y * luma_block_size_x,
-              chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
-              chroma_subsamp_x);
-
-  if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
-                                luma_block_size_y, luma_block_size_x,
-                                luma_grain_stride, left_pad, top_pad, right_pad,
-                                bottom_pad))
+  if (!init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
+                   &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
+                   &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
+                   &y_col_buf, &cb_col_buf, &cr_col_buf,
+                   luma_block_size_y * luma_block_size_x,
+                   chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
+                   chroma_subsamp_x))
     return -1;
 
-  if (generate_chroma_grain_blocks(
-          params,
-          //                               pred_pos_luma,
-          pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
-          luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
-          chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
-          chroma_subsamp_y, chroma_subsamp_x))
+  generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+                            luma_block_size_y, luma_block_size_x,
+                            luma_grain_stride, left_pad, top_pad, right_pad,
+                            bottom_pad);
+
+  if (!generate_chroma_grain_blocks(
+          params, pred_pos_chroma, luma_grain_block, cb_grain_block,
+          cr_grain_block, luma_grain_stride, chroma_block_size_y,
+          chroma_block_size_x, chroma_grain_stride, left_pad, top_pad,
+          right_pad, bottom_pad, chroma_subsamp_y, chroma_subsamp_x))
     return -1;
 
   init_scaling_function(params->scaling_points_y, params->num_y_points,
diff --git a/media/libaom/src/av1/decoder/grain_synthesis.h b/media/libaom/src/av1/decoder/grain_synthesis.h
new file mode 100644
index 0000000000..9858ce0013
--- /dev/null
+++ b/media/libaom/src/av1/decoder/grain_synthesis.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain synthesis
+ *
+ */
+#ifndef AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
+#define AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom_dsp/grain_params.h"
+#include "aom/aom_image.h"
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    luma             luma plane
+ * \param[in]    cb               cb plane
+ * \param[in]    cr               cr plane
+ * \param[in]    height           luma plane height
+ * \param[in]    width            luma plane width
+ * \param[in]    luma_stride      luma plane stride
+ * \param[in]    chroma_stride    chroma plane stride
+ */
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+                           uint8_t *cb, uint8_t *cr, int height, int width,
+                           int luma_stride, int chroma_stride,
+                           int use_high_bit_depth, int chroma_subsamp_y,
+                           int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    src              Source image
+ * \param[out]   dst              Resulting image with grain
+ */
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+                       const aom_image_t *src, aom_image_t *dst);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
diff --git a/media/libaom/src/av1/decoder/inspection.c b/media/libaom/src/av1/decoder/inspection.c
index d121a70348..288d69a224 100644
--- a/media/libaom/src/av1/decoder/inspection.c
+++ b/media/libaom/src/av1/decoder/inspection.c
@@ -8,6 +8,10 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/inspection.h"
 #include "av1/common/enums.h"
@@ -18,6 +22,10 @@ static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) {
   fd->mi_rows = mi_rows;
   fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows *
                                            fd->mi_cols);
+  if (!fd->mi_grid) {
+    fprintf(stderr, "Error allocating inspection data\n");
+    abort();
+  }
 }
 
 void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) {
@@ -99,9 +107,9 @@ int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
       mi->compound_type = mbmi->interinter_comp.type;
 
       // Block Size
-      mi->sb_type = mbmi->sb_type;
+      mi->bsize = mbmi->bsize;
       // Skip Flag
-      mi->skip = mbmi->skip;
+      mi->skip = mbmi->skip_txfm;
       mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
       mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
       mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
@@ -109,7 +117,7 @@ int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
       // Transform
       // TODO(anyone): extract tx type info from mbmi->txk_type[].
 
-      const BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE bsize = mbmi->bsize;
       const int c = i % mi_size_wide[bsize];
       const int r = j % mi_size_high[bsize];
       if (is_inter_block(mbmi) || is_intrabc_block(mbmi))
diff --git a/media/libaom/src/av1/decoder/inspection.h b/media/libaom/src/av1/decoder/inspection.h
index b963f6ac61..70b1c80fab 100644
--- a/media/libaom/src/av1/decoder/inspection.h
+++ b/media/libaom/src/av1/decoder/inspection.h
@@ -38,7 +38,7 @@ struct insp_mi_data {
   int16_t ref_frame[2];
   int16_t mode;
   int16_t uv_mode;
-  int16_t sb_type;
+  int16_t bsize;
   int16_t skip;
   int16_t segment_id;
   int16_t dual_filter_type;
diff --git a/media/libaom/src/av1/decoder/obu.c b/media/libaom/src/av1/decoder/obu.c
index 791e5965b5..d589f000bc 100644
--- a/media/libaom/src/av1/decoder/obu.c
+++ b/media/libaom/src/av1/decoder/obu.c
@@ -52,13 +52,13 @@ aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
 }
 
 static int is_obu_in_current_operating_point(AV1Decoder *pbi,
-                                             ObuHeader obu_header) {
-  if (!pbi->current_operating_point) {
+                                             const ObuHeader *obu_header) {
+  if (!pbi->current_operating_point || !obu_header->has_extension) {
     return 1;
   }
 
-  if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 &&
-      (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) &
+  if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 &&
+      (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) &
           0x1) {
     return 1;
   }
@@ -69,7 +69,7 @@ static int byte_alignment(AV1_COMMON *const cm,
                           struct aom_read_bit_buffer *const rb) {
   while (rb->bit_offset & 7) {
     if (aom_rb_read_bit(rb)) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
   }
@@ -110,12 +110,12 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
 
   // Use a local variable to store the information as we decode. At the end,
   // if no errors have occurred, cm->seq_params is updated.
-  SequenceHeader sh = cm->seq_params;
+  SequenceHeader sh = *cm->seq_params;
   SequenceHeader *const seq_params = &sh;
 
   seq_params->profile = av1_read_profile(rb);
   if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
-    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
 
@@ -124,7 +124,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
   // Video must have reduced_still_picture_hdr = 0
   if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
-    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
 
@@ -135,7 +135,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
     seq_params->operating_points_cnt_minus_1 = 0;
     seq_params->operating_point_idc[0] = 0;
     if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
-      cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+      pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
       return 0;
     }
     seq_params->tier[0] = 0;
@@ -144,7 +144,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   } else {
     seq_params->timing_info_present = aom_rb_read_bit(rb);
     if (seq_params->timing_info_present) {
-      av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb);
+      av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb);
 
       seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
       if (seq_params->decoder_model_info_present_flag)
@@ -159,7 +159,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
       seq_params->operating_point_idc[i] =
           aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
       if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
-        cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+        pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         return 0;
       }
       // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
@@ -188,7 +188,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
         // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
         // the check
         if (seq_params->op_params[i].bitrate == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "AV1 does not support this combination of "
                              "profile, level, and tier.");
         // Buffer size in bits/s is bitrate in bits/s * 1 s
@@ -212,7 +212,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
               aom_rb_read_literal(rb, 4) + 1;
           if (seq_params->op_params[i].initial_display_delay > 10)
             aom_internal_error(
-                &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                 "AV1 does not support more than 10 decoded frames delay");
         } else {
           seq_params->op_params[i].initial_display_delay = 10;
@@ -232,19 +232,19 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   pbi->current_operating_point =
       seq_params->operating_point_idc[operating_point];
   if (aom_get_num_layers_from_operating_point_idc(
-          pbi->current_operating_point, &cm->number_spatial_layers,
-          &cm->number_temporal_layers) != AOM_CODEC_OK) {
-    cm->error.error_code = AOM_CODEC_ERROR;
+          pbi->current_operating_point, &pbi->number_spatial_layers,
+          &pbi->number_temporal_layers) != AOM_CODEC_OK) {
+    pbi->error.error_code = AOM_CODEC_ERROR;
     return 0;
   }
 
   av1_read_sequence_header(cm, rb, seq_params);
 
-  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error);
   if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+    aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                        "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
                        "%d %d subsampling is not supported.\n",
                        seq_params->subsampling_x, seq_params->subsampling_y);
@@ -253,32 +253,38 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
   seq_params->film_grain_params_present = aom_rb_read_bit(rb);
 
   if (av1_check_trailing_bits(pbi, rb) != 0) {
-    // cm->error.error_code is already set.
+    // pbi->error.error_code is already set.
     return 0;
   }
 
   // If a sequence header has been decoded before, we check if the new
   // one is consistent with the old one.
   if (pbi->sequence_header_ready) {
-    if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+    if (!are_seq_headers_consistent(cm->seq_params, seq_params))
       pbi->sequence_header_changed = 1;
   }
 
-  cm->seq_params = *seq_params;
+  *cm->seq_params = *seq_params;
   pbi->sequence_header_ready = 1;
 
   return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
 }
 
 // On success, returns the frame header size. On failure, calls
-// aom_internal_error and does not return.
+// aom_internal_error and does not return. If show existing frame,
+// also marks the data processing to end after the frame header.
 static uint32_t read_frame_header_obu(AV1Decoder *pbi,
                                       struct aom_read_bit_buffer *rb,
                                       const uint8_t *data,
                                       const uint8_t **p_data_end,
                                       int trailing_bits_present) {
-  return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
-                                            trailing_bits_present);
+  const uint32_t hdr_size =
+      av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present);
+  const AV1_COMMON *cm = &pbi->common;
+  if (cm->show_existing_frame) {
+    *p_data_end = data + hdr_size;
+  }
+  return hdr_size;
 }
 
 // On success, returns the tile group header size. On failure, calls
@@ -297,7 +303,7 @@ static int32_t read_tile_group_header(AV1Decoder *pbi,
     tile_start_and_end_present_flag = aom_rb_read_bit(rb);
     if (tile_start_implicit && tile_start_and_end_present_flag) {
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
           "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
       return -1;
     }
@@ -312,20 +318,20 @@ static int32_t read_tile_group_header(AV1Decoder *pbi,
     *end_tile = aom_rb_read_literal(rb, tile_bits);
   }
   if (*start_tile != pbi->next_start_tile) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "tg_start (%d) must be equal to %d", *start_tile,
                        pbi->next_start_tile);
     return -1;
   }
   if (*start_tile > *end_tile) {
     aom_internal_error(
-        &cm->error, AOM_CODEC_CORRUPT_FRAME,
+        &pbi->error, AOM_CODEC_CORRUPT_FRAME,
         "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
         *start_tile);
     return -1;
   }
   if (*end_tile >= num_tiles) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
                        num_tiles);
     return -1;
@@ -382,15 +388,16 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) {
              (pbi->output_frame_height_in_tiles_minus_1 + 1));
 
   // Allocate the tile list output buffer.
-  // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth
-  // is 8, we could allocate less memory, namely, 8 bits/pixel.
+  // Note: if cm->seq_params->use_highbitdepth is 1 and
+  // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8
+  // bits/pixel.
   if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width,
-                             output_frame_height, cm->seq_params.subsampling_x,
-                             cm->seq_params.subsampling_y,
-                             (cm->seq_params.use_highbitdepth &&
-                              (cm->seq_params.bit_depth > AOM_BITS_8)),
-                             0, cm->features.byte_alignment))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                             output_frame_height, cm->seq_params->subsampling_x,
+                             cm->seq_params->subsampling_y,
+                             (cm->seq_params->use_highbitdepth &&
+                              (cm->seq_params->bit_depth > AOM_BITS_8)),
+                             0, cm->features.byte_alignment, 0))
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate the tile list output buffer");
 }
 
@@ -424,8 +431,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
   av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
   const int tile_width_in_pixels = tile_width * MI_SIZE;
   const int tile_height_in_pixels = tile_height * MI_SIZE;
-  const int ssy = cm->seq_params.subsampling_y;
-  const int ssx = cm->seq_params.subsampling_x;
+  const int ssy = cm->seq_params->subsampling_y;
+  const int ssx = cm->seq_params->subsampling_x;
   const int num_planes = av1_num_planes(cm);
 
   YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf;
@@ -449,8 +456,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
     int vstart2 = tr * h;
     int hstart2 = tc * w;
 
-    if (cm->seq_params.use_highbitdepth &&
-        cm->seq_params.bit_depth == AOM_BITS_8) {
+    if (cm->seq_params->use_highbitdepth &&
+        cm->seq_params->bit_depth == AOM_BITS_8) {
       yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1,
                      &pbi->tile_list_outbuf, hstart2, vstart2, plane);
     } else {
@@ -495,7 +502,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
   pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
   pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
   if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return 0;
   }
 
@@ -518,7 +525,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
     // Set reference for each tile.
     int ref_idx = aom_rb_read_literal(rb, 8);
     if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1,
@@ -529,14 +536,14 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
     if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
         pbi->dec_tile_row >= cm->tiles.rows ||
         pbi->dec_tile_col >= cm->tiles.cols) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
 
     pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
     data += tile_info_bytes;
     if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
 
@@ -575,103 +582,92 @@ static void alloc_read_metadata(AV1Decoder *const pbi,
                                 OBU_METADATA_TYPE metadata_type,
                                 const uint8_t *data, size_t sz,
                                 aom_metadata_insert_flags_t insert_flag) {
-  AV1_COMMON *const cm = &pbi->common;
+  if (!pbi->metadata) {
+    pbi->metadata = aom_img_metadata_array_alloc(0);
+    if (!pbi->metadata) {
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate metadata array");
+    }
+  }
   aom_metadata_t *metadata =
       aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
   if (!metadata) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                        "Error allocating metadata");
   }
-  if (!pbi->metadata) {
-    pbi->metadata = aom_img_metadata_array_alloc(1);
-    if (!pbi->metadata) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate metadata array");
-    }
-  } else {
-    aom_metadata_t **metadata_array =
-        (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
-                                   (pbi->metadata->sz + 1) * sizeof(metadata));
-    if (!metadata_array) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Error allocating metadata");
-    }
-    pbi->metadata->metadata_array = metadata_array;
-    pbi->metadata->sz++;
+  aom_metadata_t **metadata_array =
+      (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
+                                 (pbi->metadata->sz + 1) * sizeof(metadata));
+  if (!metadata_array) {
+    aom_img_metadata_free(metadata);
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+                       "Error growing metadata array");
   }
-  pbi->metadata->metadata_array[pbi->metadata->sz - 1] = metadata;
+  pbi->metadata->metadata_array = metadata_array;
+  pbi->metadata->metadata_array[pbi->metadata->sz] = metadata;
+  pbi->metadata->sz++;
 }
 
-// On success, returns the number of bytes read from 'data'. On failure, calls
-// aom_internal_error() and does not return.
-static size_t read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
-                                     size_t sz) {
-  const int kMinItuT35PayloadSize = 2;
-  AV1_COMMON *const cm = &pbi->common;
+// On failure, calls aom_internal_error() and does not return.
+static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
+                                   size_t sz) {
   if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "itu_t_t35_country_code is missing");
   }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on metadata");
+  int country_code_size = 1;
+  if (*data == 0xFF) {
+    if (sz == 1) {
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                         "itu_t_t35_country_code_extension_byte is missing");
+    }
+    ++country_code_size;
   }
-  if (*data == 0xFF && bytes_read < kMinItuT35PayloadSize) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "itu_t_t35_country_code_extension_byte is missing");
+  int end_index = get_last_nonzero_byte_index(data, sz);
+  if (end_index < country_code_size) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "No trailing bits found in ITU-T T.35 metadata OBU");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, (size_t)bytes_read,
+  // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
+  //   itu_t_t35_payload_bytes shall be bytes containing data registered as
+  //   specified in Recommendation ITU-T T.35.
+  // Therefore the first trailing byte should be 0x80.
+  if (data[end_index] != 0x80) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "The last nonzero byte of the ITU-T T.35 metadata OBU "
+                       "is 0x%02x, should be 0x80.",
+                       data[end_index]);
+  }
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
 }
 
 // On success, returns the number of bytes read from 'data'. On failure, calls
 // aom_internal_error() and does not return.
 static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data,
                                     size_t sz) {
-  const int kHdrCllPayloadSize = 4;
-  AV1_COMMON *const cm = &pbi->common;
-  if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "HDR CLL metadata payload is missing");
-  }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on metadata");
-  }
-  if (bytes_read != kHdrCllPayloadSize) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+  const size_t kHdrCllPayloadSize = 4;
+  if (sz < kHdrCllPayloadSize) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR CLL metadata payload size");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, (size_t)bytes_read,
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
+  return kHdrCllPayloadSize;
 }
 
 // On success, returns the number of bytes read from 'data'. On failure, calls
 // aom_internal_error() and does not return.
 static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data,
                                      size_t sz) {
-  const int kMdcvPayloadSize = 24;
-  AV1_COMMON *const cm = &pbi->common;
-  if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "HDR MDCV metadata payload is missing");
-  }
-  int bytes_read = get_last_nonzero_byte_index(data, sz);
-  if (bytes_read < 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "No trailing bits found on HDR MDCV metadata");
-  }
-  if (bytes_read != kMdcvPayloadSize) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+  const size_t kMdcvPayloadSize = 24;
+  if (sz < kMdcvPayloadSize) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR MDCV metadata payload size");
   }
-  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, (size_t)bytes_read,
+  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
                       AOM_MIF_ANY_FRAME);
-  return (size_t)bytes_read;
+  return kMdcvPayloadSize;
 }
 
 static void scalability_structure(struct aom_read_bit_buffer *rb) {
@@ -679,7 +675,9 @@ static void scalability_structure(struct aom_read_bit_buffer *rb) {
   const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
   const int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
   const int temporal_group_description_present_flag = aom_rb_read_bit(rb);
-  aom_rb_read_literal(rb, 3);  // reserved
+  // scalability_structure_reserved_3bits must be set to zero and be ignored by
+  // decoders.
+  aom_rb_read_literal(rb, 3);
 
   if (spatial_layer_dimensions_present_flag) {
     for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
@@ -769,11 +767,10 @@ static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) {
 // pbi->common.error.error_code and returns 0, or calls aom_internal_error()
 // and does not return.
 static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
-  AV1_COMMON *const cm = &pbi->common;
   size_t type_length;
   uint64_t type_value;
   if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return 0;
   }
   const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
@@ -781,28 +778,21 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
     // If metadata_type is reserved for future use or a user private value,
     // ignore the entire OBU and just check trailing bits.
     if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
   }
   if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
-    size_t bytes_read =
-        type_length +
-        read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
-    // itu_t_t35_payload_bytes is byte aligned and the first
-    // trailing byte should be 0x80.
-    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
-      return 0;
-    }
+    // read_metadata_itut_t35() checks trailing bits.
+    read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
     return sz;
   } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
     size_t bytes_read =
         type_length +
         read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -811,7 +801,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
         type_length +
         read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -826,7 +816,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
     read_metadata_timecode(&rb);
   }
   if (av1_check_trailing_bits(pbi, &rb) != 0) {
-    // cm->error.error_code is already set.
+    // pbi->error.error_code is already set.
     return 0;
   }
   assert((rb.bit_offset & 7) == 0);
@@ -844,7 +834,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data,
     // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
     const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
     if (last_nonzero_byte != 0x80) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
   }
@@ -852,7 +842,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data,
 }
 
 // On success, returns a boolean that indicates whether the decoding of the
-// current frame is finished. On failure, sets cm->error.error_code and
+// current frame is finished. On failure, sets pbi->error.error_code and
 // returns -1.
 int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
                                const uint8_t *data_end,
@@ -860,14 +850,25 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
   AV1_COMMON *const cm = &pbi->common;
   int frame_decoding_finished = 0;
   int is_first_tg_obu_received = 1;
+  // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the
+  // beginning of the frame_header_obu and frame_header_size is set to its
+  // size. This allows us to check if a redundant frame_header_obu is a copy
+  // of the previous frame_header_obu.
+  //
+  // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang
+  // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is
+  // passed as an argument to a 'nonnull' parameter of memcmp(). The initial
+  // value will not be used.
+  const uint8_t *frame_header = data;
   uint32_t frame_header_size = 0;
   ObuHeader obu_header;
   memset(&obu_header, 0, sizeof(obu_header));
   pbi->seen_frame_header = 0;
   pbi->next_start_tile = 0;
+  pbi->num_tile_groups = 0;
 
   if (data_end < data) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
 
@@ -875,7 +876,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
   if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
 
   // decode frame as a series of OBUs
-  while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) {
+  while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) {
     struct aom_read_bit_buffer rb;
     size_t payload_size = 0;
     size_t decoded_payload_size = 0;
@@ -885,7 +886,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
 
     if (bytes_available == 0 && !pbi->seen_frame_header) {
       *p_data_end = data;
-      cm->error.error_code = AOM_CODEC_OK;
+      pbi->error.error_code = AOM_CODEC_OK;
       break;
     }
 
@@ -894,7 +895,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
                                      &obu_header, &payload_size, &bytes_read);
 
     if (status != AOM_CODEC_OK) {
-      cm->error.error_code = status;
+      pbi->error.error_code = status;
       return -1;
     }
 
@@ -907,7 +908,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
     data += bytes_read;
 
     if ((size_t)(data_end - data) < payload_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
 
@@ -915,10 +916,9 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
     cm->spatial_layer_id = obu_header.spatial_layer_id;
 
     if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
-        obu_header.type != OBU_SEQUENCE_HEADER &&
-        obu_header.type != OBU_PADDING) {
+        obu_header.type != OBU_SEQUENCE_HEADER) {
       // don't decode obu if it's not in current operating mode
-      if (!is_obu_in_current_operating_point(pbi, obu_header)) {
+      if (!is_obu_in_current_operating_point(pbi, &obu_header)) {
         data += payload_size;
         continue;
       }
@@ -929,15 +929,19 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
     switch (obu_header.type) {
       case OBU_TEMPORAL_DELIMITER:
         decoded_payload_size = read_temporal_delimiter_obu();
-        pbi->seen_frame_header = 0;
-        pbi->next_start_tile = 0;
+        if (pbi->seen_frame_header) {
+          // A new temporal unit has started, but the frame in the previous
+          // temporal unit is incomplete.
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
         break;
       case OBU_SEQUENCE_HEADER:
         decoded_payload_size = read_sequence_header_obu(pbi, &rb);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         // The sequence header should not change in the middle of a frame.
         if (pbi->sequence_header_changed && pbi->seen_frame_header) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         break;
@@ -946,13 +950,13 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
       case OBU_FRAME:
         if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
           if (!pbi->seen_frame_header) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
         } else {
           // OBU_FRAME_HEADER or OBU_FRAME.
           if (pbi->seen_frame_header) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
         }
@@ -961,15 +965,16 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
             (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
           frame_header_size = read_frame_header_obu(
               pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+          frame_header = data;
           pbi->seen_frame_header = 1;
           if (!pbi->ext_tile_debug && cm->tiles.large_scale)
             pbi->camera_frame_header_ready = 1;
         } else {
-          // TODO(wtc): Verify that the frame_header_obu is identical to the
-          // original frame_header_obu. For now just skip frame_header_size
-          // bytes in the bit buffer.
-          if (frame_header_size > payload_size) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          // Verify that the frame_header_obu is identical to the original
+          // frame_header_obu.
+          if (frame_header_size > payload_size ||
+              memcmp(data, frame_header, frame_header_size) != 0) {
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
           assert(rb.bit_offset == 0);
@@ -978,14 +983,21 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
 
         decoded_payload_size = frame_header_size;
         pbi->frame_header_size = frame_header_size;
+        cm->cur_frame->temporal_id = obu_header.temporal_layer_id;
+        cm->cur_frame->spatial_id = obu_header.spatial_layer_id;
 
         if (cm->show_existing_frame) {
           if (obu_header.type == OBU_FRAME) {
-            cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+            pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
             return -1;
           }
           frame_decoding_finished = 1;
           pbi->seen_frame_header = 0;
+
+          if (cm->show_frame &&
+              !cm->seq_params->order_hint_info.enable_order_hint) {
+            ++cm->current_frame.frame_number;
+          }
           break;
         }
 
@@ -1003,40 +1015,44 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
         if (obu_header.type != OBU_FRAME) break;
         obu_payload_offset = frame_header_size;
         // Byte align the reader before reading the tile group.
-        // byte_alignment() has set cm->error.error_code if it returns -1.
+        // byte_alignment() has set pbi->error.error_code if it returns -1.
         if (byte_alignment(cm, &rb)) return -1;
         AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
       case OBU_TILE_GROUP:
         if (!pbi->seen_frame_header) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         if (obu_payload_offset > payload_size) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         decoded_payload_size += read_one_tile_group_obu(
             pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
             data + payload_size, p_data_end, &frame_decoding_finished,
             obu_header.type == OBU_FRAME);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         is_first_tg_obu_received = 0;
-        if (frame_decoding_finished) pbi->seen_frame_header = 0;
+        if (frame_decoding_finished) {
+          pbi->seen_frame_header = 0;
+          pbi->next_start_tile = 0;
+        }
+        pbi->num_tile_groups++;
         break;
       case OBU_METADATA:
         decoded_payload_size = read_metadata(pbi, data, payload_size);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_TILE_LIST:
         if (CONFIG_NORMAL_TILE_MODE) {
-          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
           return -1;
         }
 
         // This OBU type is purely for the large scale tile coding mode.
         // The common camera frame header has to be already decoded.
         if (!pbi->camera_frame_header_ready) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
 
@@ -1045,17 +1061,17 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
         decoded_payload_size =
             read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
                                           p_data_end, &frame_decoding_finished);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_PADDING:
-        decoded_payload_size = read_padding(&pbi->common, data, payload_size);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        decoded_payload_size = read_padding(cm, data, payload_size);
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       default:
         // Skip unrecognized OBUs
         if (payload_size > 0 &&
             get_last_nonzero_byte(data, payload_size) == 0) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         decoded_payload_size = payload_size;
@@ -1064,7 +1080,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
 
     // Check that the signalled OBU size matches the actual amount of data read
     if (decoded_payload_size > payload_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
 
@@ -1072,7 +1088,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
     while (decoded_payload_size < payload_size) {
       uint8_t padding_byte = data[decoded_payload_size++];
       if (padding_byte != 0) {
-        cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+        pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
         return -1;
       }
     }
@@ -1080,6 +1096,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
     data += payload_size;
   }
 
-  if (cm->error.error_code != AOM_CODEC_OK) return -1;
+  if (pbi->error.error_code != AOM_CODEC_OK) return -1;
   return frame_decoding_finished;
 }
diff --git a/media/libaom/src/av1/encoder/allintra_vis.c b/media/libaom/src/av1/encoder/allintra_vis.c
new file mode 100644
index 0000000000..86913aa536
--- /dev/null
+++ b/media/libaom/src/av1/encoder/allintra_vis.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#if CONFIG_TFLITE
+#include "tensorflow/lite/c/c_api.h"
+#include "av1/encoder/deltaq4_model.c"
+#endif
+
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Process the wiener variance in 16x16 block basis.
+static int qsort_comp(const void *elem1, const void *elem2) {
+  int a = *((const int *)elem1);
+  int b = *((const int *)elem2);
+  if (a > b) return 1;
+  if (a < b) return -1;
+  return 0;
+}
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  cpi->weber_bsize = BLOCK_8X8;
+
+  if (cpi->mb_weber_stats) return;
+
+  CHECK_MEM_ERROR(cm, cpi->mb_weber_stats,
+                  aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+                             sizeof(*cpi->mb_weber_stats)));
+}
+
+static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                        int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  int64_t satd = 0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+                  .satd;
+      ++mb_count;
+    }
+  }
+
+  if (mb_count) satd = (int)(satd / mb_count);
+  satd = AOMMAX(1, satd);
+
+  return (int)satd;
+}
+
+static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  int64_t distortion = 0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      distortion +=
+          cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+              .distortion;
+      ++mb_count;
+    }
+  }
+
+  if (mb_count) distortion = (int)(distortion / mb_count);
+  distortion = AOMMAX(1, distortion);
+
+  return (int)distortion;
+}
+
+static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  double min_max_scale = 10.0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+      if (weber_stats->max_scale < 1.0) continue;
+      if (weber_stats->max_scale < min_max_scale)
+        min_max_scale = weber_stats->max_scale;
+    }
+  }
+  return min_max_scale;
+}
+
+static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int sb_wiener_var = 0;
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  double base_num = 1;
+  double base_den = 1;
+  double base_reg = 1;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+
+      base_num += ((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_variance) *
+                  weber_stats->rec_pix_max;
+
+      base_den += fabs(
+          weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) -
+          weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance));
+
+      base_reg += sqrt((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_pix_max) * 0.1;
+      ++mb_count;
+    }
+  }
+
+  sb_wiener_var =
+      (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count);
+  sb_wiener_var = AOMMAX(1, sb_wiener_var);
+
+  return (int)sb_wiener_var;
+}
+
+static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col);
+
+  if (mi_row >= (mi_high / 2)) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col));
+  }
+  if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col));
+  }
+  if (mi_col >= (mi_wide / 2)) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2));
+  }
+  if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2));
+  }
+
+  return sb_wiener_var;
+}
+
+static double calc_src_mean_var(const uint8_t *const src_buffer,
+                                const int buf_stride, const int block_size,
+                                const int use_hbd, double *mean) {
+  double src_mean = 0.0;
+  double src_variance = 0.0;
+  for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+    for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+      int src_pix;
+      if (use_hbd) {
+        const uint16_t *src = CONVERT_TO_SHORTPTR(src_buffer);
+        src_pix = src[pix_row * buf_stride + pix_col];
+      } else {
+        src_pix = src_buffer[pix_row * buf_stride + pix_col];
+      }
+      src_mean += src_pix;
+      src_variance += src_pix * src_pix;
+    }
+  }
+  const int pix_num = block_size * block_size;
+  src_variance -= (src_mean * src_mean) / pix_num;
+  src_variance /= pix_num;
+  *mean = src_mean / pix_num;
+  return src_variance;
+}
+
+static BLOCK_SIZE pick_block_size(AV1_COMP *cpi,
+                                  const BLOCK_SIZE orig_block_size) {
+  const BLOCK_SIZE sub_block_size =
+      get_partition_subsize(orig_block_size, PARTITION_SPLIT);
+  const int mb_step = mi_size_wide[orig_block_size];
+  const int sub_step = mb_step >> 1;
+  const TX_SIZE tx_size = max_txsize_lookup[orig_block_size];
+  const int block_size = tx_size_wide[tx_size];
+  const int split_block_size = block_size >> 1;
+  assert(split_block_size >= 8);
+  const uint8_t *const buffer = cpi->source->y_buffer;
+  const int buf_stride = cpi->source->y_stride;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  double vote = 0.0;
+  for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+    for (int mi_col = 0; mi_col < cpi->frame_info.mi_cols; mi_col += mb_step) {
+      const uint8_t *mb_buffer =
+          buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+      // (1). Calculate mean and var using the original block size
+      double mean = 0.0;
+      const double orig_var =
+          calc_src_mean_var(mb_buffer, buf_stride, block_size, use_hbd, &mean);
+      // (2). Calculate mean and var using the split block size
+      double split_var[4] = { 0 };
+      double split_mean[4] = { 0 };
+      int sub_idx = 0;
+      for (int row = mi_row; row < mi_row + mb_step; row += sub_step) {
+        for (int col = mi_col; col < mi_col + mb_step; col += sub_step) {
+          mb_buffer = buffer + row * MI_SIZE * buf_stride + col * MI_SIZE;
+          split_var[sub_idx] =
+              calc_src_mean_var(mb_buffer, buf_stride, split_block_size,
+                                use_hbd, &split_mean[sub_idx]);
+          ++sub_idx;
+        }
+      }
+      // (3). Determine whether to use the original or the split block size.
+      // If use original, vote += 1.0.
+      // If use split, vote -= 1.0.
+      double max_split_mean = 0.0;
+      double max_split_var = 0.0;
+      double geo_split_var = 0.0;
+      for (int i = 0; i < 4; ++i) {
+        max_split_mean = AOMMAX(max_split_mean, split_mean[i]);
+        max_split_var = AOMMAX(max_split_var, split_var[i]);
+        geo_split_var += log(0.1 + split_var[i]);
+      }
+      geo_split_var = exp(geo_split_var / 4);
+      const double param_1 = 1.5;
+      const double param_2 = 1.0;
+      // If the variance of the large block size is considerably larger than the
+      // geometric mean of vars of small blocks;
+      // Or if the variance of the large block size is larger than the local
+      // variance;
+      // Or if the variance of the large block size is considerably larger
+      // than the mean.
+      // It indicates that the source block is not a flat area, therefore we
+      // might want to split into smaller block sizes to capture the
+      // local characteristics.
+      if (orig_var > param_1 * geo_split_var || orig_var > max_split_var ||
+          sqrt(orig_var) > param_2 * mean) {
+        vote -= 1.0;
+      } else {
+        vote += 1.0;
+      }
+    }
+  }
+
+  return vote > 0.0 ? orig_block_size : sub_block_size;
+}
+
+static int64_t pick_norm_factor_and_block_size(AV1_COMP *const cpi,
+                                               BLOCK_SIZE *best_block_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  BLOCK_SIZE last_block_size;
+  BLOCK_SIZE this_block_size = sb_size;
+  *best_block_size = sb_size;
+  // Pick from block size 128x128, 64x64, 32x32 and 16x16.
+  do {
+    last_block_size = this_block_size;
+    assert(this_block_size >= BLOCK_16X16 && this_block_size <= BLOCK_128X128);
+    const int block_size = block_size_wide[this_block_size];
+    if (block_size < 32) break;
+    this_block_size = pick_block_size(cpi, last_block_size);
+  } while (this_block_size != last_block_size);
+  *best_block_size = this_block_size;
+
+  int64_t norm_factor = 1;
+  const BLOCK_SIZE norm_block_size = this_block_size;
+  assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128);
+  const int norm_step = mi_size_wide[norm_block_size];
+  double sb_wiener_log = 0;
+  double sb_count = 0;
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+      const int sb_wiener_var =
+          get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+      const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+      const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+      const double scaled_satd = (double)satd / sqrt((double)sse);
+      sb_wiener_log += scaled_satd * log(sb_wiener_var);
+      sb_count += scaled_satd;
+    }
+  }
+  if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count));
+  norm_factor = AOMMAX(1, norm_factor);
+
+  return norm_factor;
+}
+
+static void automatic_intra_tools_off(AV1_COMP *cpi,
+                                      const double sum_rec_distortion,
+                                      const double sum_est_rate) {
+  if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return;
+
+  // Thresholds
+  const int high_quality_qindex = 128;
+  const double high_quality_bpp = 2.0;
+  const double high_quality_dist_per_pix = 4.0;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int qindex = cm->quant_params.base_qindex;
+  const double dist_per_pix =
+      (double)sum_rec_distortion / (cm->width * cm->height);
+  // The estimate bpp is not accurate, an empirical constant 100 is divided.
+  const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100);
+
+  if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp &&
+      dist_per_pix < high_quality_dist_per_pix) {
+    cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0;
+  }
+}
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  uint8_t *buffer = cpi->source->y_buffer;
+  int buf_stride = cpi->source->y_stride;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO mbmi;
+  memset(&mbmi, 0, sizeof(mbmi));
+  MB_MODE_INFO *mbmi_ptr = &mbmi;
+  xd->mi = &mbmi_ptr;
+  xd->cur_buf = cpi->source;
+
+  const SequenceHeader *const seq_params = cm->seq_params;
+  if (aom_realloc_frame_buffer(
+          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL, cpi->oxcf.tool_cfg.enable_global_motion, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+  av1_frame_init_quantizer(cpi);
+
+  DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+
+  int mi_row, mi_col;
+
+  BLOCK_SIZE bsize = cpi->weber_bsize;
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int block_size = tx_size_wide[tx_size];
+  const int coeff_count = block_size * block_size;
+
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  cpi->norm_wiener_variance = 0;
+  int mb_step = mi_size_wide[bsize];
+
+  double sum_rec_distortion = 0.0;
+  double sum_est_rate = 0.0;
+  for (mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+    for (mi_col = 0; mi_col < cpi->frame_info.mi_cols; mi_col += mb_step) {
+      PREDICTION_MODE best_mode = DC_PRED;
+      int best_intra_cost = INT_MAX;
+
+      xd->up_available = mi_row > 0;
+      xd->left_available = mi_col > 0;
+
+      const int mi_width = mi_size_wide[bsize];
+      const int mi_height = mi_size_high[bsize];
+      set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                            mi_row, mi_col);
+      set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                     cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+                   av1_num_planes(cm));
+      xd->mi[0]->bsize = bsize;
+      xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+      av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+                           mi_col, 0, av1_num_planes(cm));
+
+      int dst_buffer_stride = xd->plane[0].dst.stride;
+      uint8_t *dst_buffer = xd->plane[0].dst.buf;
+      uint8_t *mb_buffer =
+          buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+
+      for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+           ++mode) {
+        av1_predict_intra_block(
+            xd, cm->seq_params->sb_size,
+            cm->seq_params->enable_intra_edge_filter, block_size, block_size,
+            tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+            dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+
+        av1_subtract_block(bd_info, block_size, block_size, src_diff,
+                           block_size, mb_buffer, buf_stride, dst_buffer,
+                           dst_buffer_stride);
+        av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+        int intra_cost = aom_satd(coeff, coeff_count);
+        if (intra_cost < best_intra_cost) {
+          best_intra_cost = intra_cost;
+          best_mode = mode;
+        }
+      }
+
+      int idx;
+      av1_predict_intra_block(xd, cm->seq_params->sb_size,
+                              cm->seq_params->enable_intra_edge_filter,
+                              block_size, block_size, tx_size, best_mode, 0, 0,
+                              FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
+                              dst_buffer, dst_buffer_stride, 0, 0, 0);
+      av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+                         mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+      av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+      const struct macroblock_plane *const p = &x->plane[0];
+      uint16_t eob;
+      const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+      QUANT_PARAM quant_param;
+      int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+      av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (is_cur_buf_hbd(xd)) {
+        av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                                      scan_order, &quant_param);
+      } else {
+        av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                               scan_order, &quant_param);
+      }
+#else
+      av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                             scan_order, &quant_param);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+      av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+                                  dst_buffer_stride, eob, 0);
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+                               (mi_col / mb_step)];
+
+      weber_stats->rec_pix_max = 1;
+      weber_stats->rec_variance = 0;
+      weber_stats->src_pix_max = 1;
+      weber_stats->src_variance = 0;
+      weber_stats->distortion = 0;
+
+      int64_t src_mean = 0;
+      int64_t rec_mean = 0;
+      int64_t dist_mean = 0;
+
+      for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+        for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+          int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (is_cur_buf_hbd(xd)) {
+            uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+            uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+            src_pix = src[pix_row * buf_stride + pix_col];
+            rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+          } else {
+            src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+            rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+          }
+#else
+          src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+          rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+          src_mean += src_pix;
+          rec_mean += rec_pix;
+          dist_mean += src_pix - rec_pix;
+          weber_stats->src_variance += src_pix * src_pix;
+          weber_stats->rec_variance += rec_pix * rec_pix;
+          weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+          weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+          weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
+        }
+      }
+
+      sum_rec_distortion += weber_stats->distortion;
+      int est_block_rate = 0;
+      int64_t est_block_dist = 0;
+      model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
+                                      pix_num, &est_block_rate,
+                                      &est_block_dist);
+      sum_est_rate += est_block_rate;
+
+      weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+      weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+      weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+      weber_stats->satd = best_intra_cost;
+
+      qcoeff[0] = 0;
+      for (idx = 1; idx < coeff_count; ++idx) qcoeff[idx] = abs(qcoeff[idx]);
+      qsort(qcoeff, coeff_count, sizeof(*coeff), qsort_comp);
+
+      weber_stats->max_scale = (double)qcoeff[coeff_count - 1];
+    }
+  }
+
+  // Determine whether to turn off several intra coding tools.
+  automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate);
+
+  BLOCK_SIZE norm_block_size = BLOCK_16X16;
+  cpi->norm_wiener_variance =
+      pick_norm_factor_and_block_size(cpi, &norm_block_size);
+  const int norm_step = mi_size_wide[norm_block_size];
+
+  double sb_wiener_log = 0;
+  double sb_count = 0;
+  for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
+    sb_wiener_log = 0;
+    sb_count = 0;
+    for (mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+      for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+        int sb_wiener_var =
+            get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+
+        double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+        double min_max_scale = AOMMAX(
+            1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+        beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+        beta = AOMMIN(beta, 4);
+        beta = AOMMAX(beta, 0.25);
+
+        sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
+
+        int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+        int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+        double scaled_satd = (double)satd / sqrt((double)sse);
+        sb_wiener_log += scaled_satd * log(sb_wiener_var);
+        sb_count += scaled_satd;
+      }
+    }
+
+    if (sb_count > 0)
+      cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+    cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+  }
+
+  aom_free_frame_buffer(&cm->cur_frame->buf);
+}
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col);
+  int offset = 0;
+  double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+  double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+  beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+
+  // Cap beta such that the delta q value is not much far away from the base q.
+  beta = AOMMIN(beta, 4);
+  beta = AOMMAX(beta, 0.25);
+  offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+  if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+  return qindex;
+}
+
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (cpi->mb_delta_q) return;
+
+  CHECK_MEM_ERROR(cm, cpi->mb_delta_q,
+                  aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols,
+                             sizeof(*cpi->mb_delta_q)));
+}
+
+#if CONFIG_TFLITE
+static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows,
+                         int bit_depth, uint8_t *y_buffer, int y_stride,
+                         float *predicts0, float *predicts1) {
+  // Create the model and interpreter options.
+  TfLiteModel *model =
+      TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize);
+  if (model == NULL) return 1;
+
+  TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
+  TfLiteInterpreterOptionsSetNumThreads(options, 2);
+  if (options == NULL) {
+    TfLiteModelDelete(model);
+    return 1;
+  }
+
+  // Create the interpreter.
+  TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options);
+  if (interpreter == NULL) {
+    TfLiteInterpreterOptionsDelete(options);
+    TfLiteModelDelete(model);
+    return 1;
+  }
+
+  // Allocate tensors and populate the input tensor data.
+  TfLiteInterpreterAllocateTensors(interpreter);
+  TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0);
+  if (input_tensor == NULL) {
+    TfLiteInterpreterDelete(interpreter);
+    TfLiteInterpreterOptionsDelete(options);
+    TfLiteModelDelete(model);
+    return 1;
+  }
+
+  size_t input_size = TfLiteTensorByteSize(input_tensor);
+  float *input_data = aom_calloc(input_size, 1);
+  if (input_data == NULL) {
+    TfLiteInterpreterDelete(interpreter);
+    TfLiteInterpreterOptionsDelete(options);
+    TfLiteModelDelete(model);
+    return 1;
+  }
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int row_offset = (row * num_mi_h) << 2;
+      const int col_offset = (col * num_mi_w) << 2;
+
+      uint8_t *buf = y_buffer + row_offset * y_stride + col_offset;
+      int r = row_offset, pos = 0;
+      const float base = (float)((1 << bit_depth) - 1);
+      while (r < row_offset + (num_mi_h << 2)) {
+        for (int c = 0; c < (num_mi_w << 2); ++c) {
+          input_data[pos++] = bit_depth > 8
+                                  ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base
+                                  : (float)*(buf + c) / base;
+        }
+        buf += y_stride;
+        ++r;
+      }
+      TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size);
+
+      // Execute inference.
+      if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        return 1;
+      }
+
+      // Extract the output tensor data.
+      const TfLiteTensor *output_tensor =
+          TfLiteInterpreterGetOutputTensor(interpreter, 0);
+      if (output_tensor == NULL) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        return 1;
+      }
+
+      size_t output_size = TfLiteTensorByteSize(output_tensor);
+      float output_data[2];
+
+      TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size);
+      predicts0[row * num_cols + col] = output_data[0];
+      predicts1[row * num_cols + col] = output_data[1];
+    }
+  }
+
+  // Dispose of the model and interpreter objects.
+  TfLiteInterpreterDelete(interpreter);
+  TfLiteInterpreterOptionsDelete(options);
+  TfLiteModelDelete(model);
+  aom_free(input_data);
+  return 0;
+}
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = cpi->common.seq_params->sb_size;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+  // TODO(sdeng): fit a better model_1; disable it at this time.
+  float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f;
+  CHECK_MEM_ERROR(cm, mb_delta_q0,
+                  aom_calloc(num_rows * num_cols, sizeof(float)));
+  CHECK_MEM_ERROR(cm, mb_delta_q1,
+                  aom_calloc(num_rows * num_cols, sizeof(float)));
+
+  if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer,
+                    y_stride, mb_delta_q0, mb_delta_q1)) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to call TFlite functions.");
+  }
+
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      delta_q_avg0 += mb_delta_q0[index];
+    }
+  }
+
+  delta_q_avg0 /= (float)(num_rows * num_cols);
+
+  float scaling_factor;
+  const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ;
+  if (cq_level < delta_q_avg0) {
+    scaling_factor = cq_level / delta_q_avg0;
+  } else {
+    scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0);
+  }
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->mb_delta_q[index] =
+          RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ *
+               scaling_factor * (mb_delta_q0[index] - delta_q_avg0));
+    }
+  }
+
+  aom_free(mb_delta_q0);
+  aom_free(mb_delta_q1);
+}
+#else  // !CONFIG_TFLITE
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = cpi->common.seq_params->sb_size;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  int *mb_delta_q[2];
+  CHECK_MEM_ERROR(cm, mb_delta_q[0],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+  CHECK_MEM_ERROR(cm, mb_delta_q[1],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between current version (Spet 2021) and the
+  // baseline (July 2021).
+  const double model_change[] = { 3.0, 3.0 };
+  // The following parameters are fitted from user labeled data.
+  const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+  const double b[] = { 0.004898, 0.003093 };
+  const double c[] = { (29.932 + model_change[0]) * 4.0,
+                       (42.100 + model_change[1]) * 4.0 };
+  int delta_q_avg[2] = { 0, 0 };
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          unsigned int block_variance;
+          if (use_hbd) {
+            block_variance = av1_high_get_sby_perpixel_variance(
+                cpi, &buf, BLOCK_8X8, xd->bd);
+          } else {
+            block_variance =
+                av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+          }
+
+          block_variance = AOMMAX(block_variance, 1);
+          var += log((double)block_variance);
+          num_of_var += 1.0;
+        }
+      }
+      var = exp(var / num_of_var);
+      mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]);
+      mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]);
+      delta_q_avg[0] += mb_delta_q[0][index];
+      delta_q_avg[1] += mb_delta_q[1][index];
+    }
+  }
+
+  delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+  delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
+
+  int model_idx;
+  double scaling_factor;
+  const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+  if (cq_level < delta_q_avg[0]) {
+    model_idx = 0;
+    scaling_factor = (double)cq_level / delta_q_avg[0];
+  } else if (cq_level < delta_q_avg[1]) {
+    model_idx = 2;
+    scaling_factor =
+        (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+  } else {
+    model_idx = 1;
+    scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+  }
+
+  const double new_delta_q_avg =
+      delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      if (model_idx == 2) {
+        const double delta_q =
+            mb_delta_q[0][index] +
+            scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+        cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength /
+                                      100.0 * (delta_q - new_delta_q_avg));
+      } else {
+        cpi->mb_delta_q[index] = RINT(
+            (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor *
+            (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+      }
+    }
+  }
+
+  aom_free(mb_delta_q[0]);
+  aom_free(mb_delta_q[1]);
+}
+#endif
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {
+  const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex;
+
+  const int num_mi_w = mi_size_wide[bsize];
+  const int num_mi_h = mi_size_high[bsize];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w);
+  const int delta_q = cpi->mb_delta_q[index];
+
+  int qindex = base_qindex + delta_q;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ + 1);
+
+  return qindex;
+}
diff --git a/media/libaom/src/av1/encoder/allintra_vis.h b/media/libaom/src/av1/encoder/allintra_vis.h
new file mode 100644
index 0000000000..6f60cdb6ae
--- /dev/null
+++ b/media/libaom/src/av1/encoder/allintra_vis.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col);
+
+// User rating based mode
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col);
+
+#endif  // AOM_AV1_ENCODER_ALLINTRA_VIS_H_
diff --git a/media/libaom/src/av1/encoder/aq_complexity.c b/media/libaom/src/av1/encoder/aq_complexity.c
index 36580063d6..37bc309ec8 100644
--- a/media/libaom/src/av1/encoder/aq_complexity.c
+++ b/media/libaom/src/av1/encoder/aq_complexity.c
@@ -18,7 +18,6 @@
 #include "av1/common/seg_common.h"
 #include "av1/encoder/segmentation.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/system_state.h"
 
 #define AQ_C_SEGMENTS 5
 #define DEFAULT_AQ2_SEG 3  // Neutral Q segment
@@ -47,10 +46,11 @@ static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
 
 static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-         cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+         refresh_frame->alt_ref_frame ||
+         (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
 // Segmentation only makes sense if the target bits per SB is above a threshold.
@@ -68,7 +68,6 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
                          cm->height != cm->prev_frame->height);
 
   // Make SURE use of floating point in this function is safe.
-  aom_clear_system_state();
 
   if (resolution_change) {
     memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
@@ -80,7 +79,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
   if (is_frame_aq_enabled(cpi)) {
     int segment;
     const int aq_strength =
-        get_aq_c_strength(base_qindex, cm->seq_params.bit_depth);
+        get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
 
     // Clear down the segment map.
     memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
@@ -106,7 +105,8 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
 
       qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
-          aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
+          aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type,
+          cm->seq_params->bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -148,18 +148,17 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
     // It is converted to bits << AV1_PROB_COST_SHIFT units.
     const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
                         << AV1_PROB_COST_SHIFT;
-    const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size;
+    const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
     const int target_rate = (int)(num / denom);
     double logvar;
     double low_var_thresh;
     const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
-                                              cm->seq_params.bit_depth);
+                                              cm->seq_params->bit_depth);
 
-    aom_clear_system_state();
-    low_var_thresh =
-        (is_stat_consumption_stage_twopass(cpi))
-            ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
-            : DEFAULT_LV_THRESH;
+    low_var_thresh = (is_stat_consumption_stage_twopass(cpi))
+                         ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy),
+                                  MIN_DEFAULT_LV_THRESH)
+                         : DEFAULT_LV_THRESH;
 
     av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
     logvar = av1_log_block_var(cpi, mb, bs);
diff --git a/media/libaom/src/av1/encoder/aq_cyclicrefresh.c b/media/libaom/src/av1/encoder/aq_cyclicrefresh.c
index b8884942ad..d00e5011a7 100644
--- a/media/libaom/src/av1/encoder/aq_cyclicrefresh.c
+++ b/media/libaom/src/av1/encoder/aq_cyclicrefresh.c
@@ -12,39 +12,30 @@
 #include <limits.h>
 #include <math.h>
 
+#include "av1/common/pred_common.h"
 #include "av1/common/seg_common.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/system_state.h"
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
-  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
   if (cr == NULL) return NULL;
 
   cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  cr->counter_encode_maxq_scene_change = 0;
   if (cr->map == NULL) {
     av1_cyclic_refresh_free(cr);
     return NULL;
   }
-  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
-  cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
-  if (cr->last_coded_q_map == NULL) {
-    av1_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  assert(MAXQ <= 255);
-  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-  cr->avg_frame_low_motion = 0.0;
   return cr;
 }
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   if (cr != NULL) {
     aom_free(cr->map);
-    aom_free(cr->last_coded_q_map);
     aom_free(cr);
   }
 }
@@ -55,21 +46,24 @@ void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
 // mode, and rate/distortion.
 static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
                                 const MB_MODE_INFO *mbmi, int64_t rate,
-                                int64_t dist, int bsize) {
+                                int64_t dist, int bsize, int noise_level) {
   MV mv = mbmi->mv[0].as_mv;
-  // Reject the block for lower-qp coding if projected distortion
-  // is above the threshold, and any of the following is true:
+  int is_compound = has_second_ref(mbmi);
+  // Reject the block for lower-qp coding for non-compound mode if
+  // projected distortion is above the threshold, and any of the following
+  // is true:
   // 1) mode uses large mv
   // 2) mode is an intra-mode
   // Otherwise accept for refresh.
-  if (dist > cr->thresh_dist_sb &&
+  if (!is_compound && dist > cr->thresh_dist_sb &&
       (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
        mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
        !is_inter_block(mbmi)))
     return CR_SEGMENT_ID_BASE;
-  else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
-           is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
-           cr->rate_boost_fac > 10)
+  else if ((is_compound && noise_level < kMedium) ||
+           (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+            is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+            cr->rate_boost_fac > 10))
     // More aggressive delta-q for bigger blocks with zero motion.
     return CR_SEGMENT_ID_BOOST2;
   else
@@ -80,51 +74,51 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
 static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int deltaq =
-      av1_compute_qdelta_by_rate(rc, cpi->common.current_frame.frame_type, q,
-                                 rate_factor, cpi->common.seq_params.bit_depth);
+  int deltaq = av1_compute_qdelta_by_rate(
+      rc, cpi->common.current_frame.frame_type, q, rate_factor,
+      cpi->is_screen_content_type, cpi->common.seq_params->bit_depth);
   if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
     deltaq = -cr->max_qdelta_perc * q / 100;
   }
   return deltaq;
 }
 
-// For the just encoded frame, estimate the bits, incorporating the delta-q
-// from non-base segment. For now ignore effect of multiple segments
-// (with different delta-q). Note this function is called in the postencode
-// (called from rc_update_rate_correction_factors()).
 int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
                                           double correction_factor) {
   const AV1_COMMON *const cm = &cpi->common;
   const FRAME_TYPE frame_type = cm->current_frame.frame_type;
   const int base_qindex = cm->quant_params.base_qindex;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int mbs = cm->mi_params.MBs;
   const int num4x4bl = mbs << 4;
   // Weight for non-base segments: use actual number of blocks refreshed in
   // previous/just encoded frame. Note number of blocks here is in 4x4 units.
-  const double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
-  const double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+  if (cpi->rc.rtc_external_ratectrl) {
+    weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows *
+                               cm->mi_params.mi_cols / 100) /
+                      num4x4bl;
+    weight_segment2 = 0;
+  }
   // Take segment weighted average for estimated bits.
   const int estimated_bits =
       (int)((1.0 - weight_segment1 - weight_segment2) *
                 av1_estimate_bits_at_q(frame_type, base_qindex, mbs,
-                                       correction_factor, bit_depth) +
+                                       correction_factor, bit_depth,
+                                       cpi->is_screen_content_type) +
             weight_segment1 * av1_estimate_bits_at_q(
                                   frame_type, base_qindex + cr->qindex_delta[1],
-                                  mbs, correction_factor, bit_depth) +
+                                  mbs, correction_factor, bit_depth,
+                                  cpi->is_screen_content_type) +
             weight_segment2 * av1_estimate_bits_at_q(
                                   frame_type, base_qindex + cr->qindex_delta[2],
-                                  mbs, correction_factor, bit_depth));
+                                  mbs, correction_factor, bit_depth,
+                                  cpi->is_screen_content_type));
   return estimated_bits;
 }
 
-// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
-// a corresponding delta-q (for segment 1). This function is called in the
-// rc_regulate_q() to set the base qp index.
-// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
-// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
 int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
                                       double correction_factor) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -138,36 +132,79 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
                 cr->actual_num_seg2_blocks) >>
                1) /
       num4x4bl;
+  if (cpi->rc.rtc_external_ratectrl) {
+    weight_segment = (double)((cr->target_num_seg_blocks +
+                               cr->percent_refresh * cm->mi_params.mi_rows *
+                                   cm->mi_params.mi_cols / 100) >>
+                              1) /
+                     num4x4bl;
+  }
   // Compute delta-q corresponding to qindex i.
   int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
   // Take segment weighted average for bits per mb.
   bits_per_mb =
       (int)((1.0 - weight_segment) *
                 av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
-                                   correction_factor,
-                                   cm->seq_params.bit_depth) +
+                                   correction_factor, cm->seq_params->bit_depth,
+                                   cpi->is_screen_content_type) +
             weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type,
                                                 i + deltaq, correction_factor,
-                                                cm->seq_params.bit_depth));
+                                                cm->seq_params->bit_depth,
+                                                cpi->is_screen_content_type));
   return bits_per_mb;
 }
 
-// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
-// check if we should reset the segment_id, and update the cyclic_refresh map
-// and segmentation map.
-void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
-                                       MB_MODE_INFO *const mbmi, int mi_row,
-                                       int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip) {
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  int cdf_num;
   const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int prev_segment_id = mbmi->segment_id;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+  if (!cr->skip_over4x4) {
+    mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
+    if (prev_segment_id != mbmi->segment_id) {
+      const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+      for (int mi_y = 0; mi_y < ymis; mi_y++) {
+        for (int mi_x = 0; mi_x < xmis; mi_x++) {
+          const int map_offset =
+              block_index + mi_y * cm->mi_params.mi_cols + mi_x;
+          cr->map[map_offset] = 0;
+          cpi->enc_seg.map[map_offset] = mbmi->segment_id;
+          cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
+        }
+      }
+    }
+  }
+  if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+    x->actual_num_seg1_blocks -= xmis * ymis;
+  else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+    x->actual_num_seg2_blocks -= xmis * ymis;
+}
+
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip,
+                                       RUN_TYPE dry_run) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
   const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+  int noise_level = 0;
+  if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level;
   const int refresh_this_block =
-      candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+      candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level);
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
 
@@ -197,60 +234,98 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  for (int y = 0; y < ymis; y++)
-    for (int x = 0; x < xmis; x++) {
-      int map_offset = block_index + y * cm->mi_params.mi_cols + x;
-      cr->map[map_offset] = new_map_value;
-      cpi->enc_seg.map[map_offset] = mbmi->segment_id;
+  if (sh == 1) {
+    for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+      const int map_offset = block_index + mi_y * cm->mi_params.mi_cols;
+      memset(&cr->map[map_offset], new_map_value, xmis);
+      memset(&cpi->enc_seg.map[map_offset], mbmi->segment_id, xmis);
+      memset(&cm->cur_frame->seg_map[map_offset], mbmi->segment_id, xmis);
     }
+  } else {
+    for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+      for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
+        const int map_offset =
+            block_index + mi_y * cm->mi_params.mi_cols + mi_x;
+        cr->map[map_offset] = new_map_value;
+        cpi->enc_seg.map[map_offset] = mbmi->segment_id;
+        cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
+      }
+    }
+  }
+  // Accumulate cyclic refresh update counters.
+  if (!dry_run) {
+    if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1)
+      x->actual_num_seg1_blocks += xmis * ymis;
+    else if (cyclic_refresh_segment_id(mbmi->segment_id) ==
+             CR_SEGMENT_ID_BOOST2)
+      x->actual_num_seg2_blocks += xmis * ymis;
+  }
+}
+
+// Initializes counters used for cyclic refresh.
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) {
+  x->actual_num_seg1_blocks = 0;
+  x->actual_num_seg2_blocks = 0;
+  x->cnt_zeromv = 0;
+}
+
+// Accumulate cyclic refresh counters.
+void av1_accumulate_cyclic_refresh_counters(
+    CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) {
+  cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks;
+  cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks;
+  cyclic_refresh->cnt_zeromv += x->cnt_zeromv;
 }
 
-// Update the some stats after encode frame is done.
 void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  unsigned char *const seg_map = cpi->enc_seg.map;
-  cr->cnt_zeromv = 0;
-  cr->actual_num_seg1_blocks = 0;
-  cr->actual_num_seg2_blocks = 0;
-  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
-    for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
-      MB_MODE_INFO **mi =
-          mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col;
-      MV mv = mi[0]->mv[0].as_mv;
-      if (cm->seg.enabled) {
-        int map_index = mi_row * mi_params->mi_cols + mi_col;
-        if (cyclic_refresh_segment_id(seg_map[map_index]) ==
-            CR_SEGMENT_ID_BOOST1)
-          cr->actual_num_seg1_blocks++;
-        else if (cyclic_refresh_segment_id(seg_map[map_index]) ==
-                 CR_SEGMENT_ID_BOOST2)
-          cr->actual_num_seg2_blocks++;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  const int avg_cnt_zeromv =
+      100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
+
+  if (!cpi->ppi->use_svc ||
+      (cpi->ppi->use_svc &&
+       !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+       cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+    rc->avg_frame_low_motion =
+        (rc->avg_frame_low_motion == 0)
+            ? avg_cnt_zeromv
+            : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
+    // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+    // to all lower spatial layers.
+    if (cpi->ppi->use_svc &&
+        svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+      for (int i = 0; i < svc->number_spatial_layers - 1; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+        RATE_CONTROL *const lrc = &lc->rc;
+        lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
       }
-      // Accumulate low_content_frame.
-      if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16)
-        cr->cnt_zeromv++;
     }
   }
-  cr->cnt_zeromv =
-      100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
-  cr->avg_frame_low_motion =
-      (3 * cr->avg_frame_low_motion + (double)cr->cnt_zeromv) / 4;
 }
 
-// Set golden frame update interval, for 1 pass CBR mode.
 void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   // Set minimum gf_interval for GF update to a multiple of the refresh period,
   // with some max limit. Depending on past encoding stats, GF flag may be
   // reset and update may not occur until next baseline_gf_interval.
+  const int gf_length_mult[2] = { 8, 4 };
   if (cr->percent_refresh > 0)
-    rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
+    p_rc->baseline_gf_interval =
+        AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] *
+                   (100 / cr->percent_refresh),
+               MAX_GF_INTERVAL_RT);
   else
-    rc->baseline_gf_interval = 20;
-  if (cr->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8;
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+  if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+    p_rc->baseline_gf_interval = 16;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -267,10 +342,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
   memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols);
-  sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) /
-            cm->seq_params.mib_size;
-  sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) /
-            cm->seq_params.mib_size;
+  sb_cols = (mi_params->mi_cols + cm->seq_params->mib_size - 1) /
+            cm->seq_params->mib_size;
+  sb_rows = (mi_params->mi_rows + cm->seq_params->mib_size - 1) /
+            cm->seq_params->mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
   block_count =
@@ -287,29 +362,23 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
-    int mi_row = sb_row_index * cm->seq_params.mib_size;
-    int mi_col = sb_col_index * cm->seq_params.mib_size;
-    // TODO(any): Ensure the population of
-    // cpi->common.features.allow_screen_content_tools and use the same instead
-    // of cpi->oxcf.content == AOM_CONTENT_SCREEN
-    int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN
-                            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
-                                             cm->quant_params.base_qindex)
-                            : 0;
+    int mi_row = sb_row_index * cm->seq_params->mib_size;
+    int mi_col = sb_col_index * cm->seq_params->mib_size;
     assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
     assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
     bl_index = mi_row * mi_params->mi_cols + mi_col;
     // Loop through all MI blocks in superblock and update map.
-    xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size);
-    ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size);
-    for (y = 0; y < ymis; y++) {
-      for (x = 0; x < xmis; x++) {
+    xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params->mib_size);
+    ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params->mib_size);
+    // cr_map only needed at 8x8 blocks.
+    for (y = 0; y < ymis; y += 2) {
+      for (x = 0; x < xmis; x += 2) {
         const int bl_index2 = bl_index + y * mi_params->mi_cols + x;
         // If the block is as a candidate for clean up then mark it
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+          sum_map += 4;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -317,7 +386,7 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
     }
     // Enforce constant segment over superblock.
     // If segment is at least half of superblock, set to 1.
-    if (sum_map >= xmis * ymis / 2) {
+    if (sum_map >= (xmis * ymis) >> 1) {
       for (y = 0; y < ymis; y++)
         for (x = 0; x < xmis; x++) {
           seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
@@ -336,6 +405,7 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
 void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   // TODO(marpan): Parameters need to be tuned.
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int num4x4bl = cm->mi_params.MBs << 4;
@@ -343,27 +413,53 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   double weight_segment_target = 0;
   double weight_segment = 0;
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+    qp_thresh = AOMMIN(35, rc->best_quality << 1);
   int qp_max_thresh = 118 * MAXQ >> 7;
+  const int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe);
+  // Although this segment feature for RTC is only used for
+  // blocks >= 8X8, for more efficient coding of the seg map
+  // cur_frame->seg_map needs to set at 4x4 along with the
+  // function av1_cyclic_reset_segment_skip(). Skipping over
+  // 4x4 will therefore have small bdrate loss (~0.2%), so
+  // we use it only for speed > 9 for now.
+  // Also if loop-filter deltas is applied via segment, then
+  // we need to set cr->skip_over4x4 = 1.
+  cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
   cr->apply_cyclic_refresh = 1;
-  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) ||
-      cpi->svc.temporal_layer_id > 0 ||
-      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
+      scene_change_detected || cpi->svc.temporal_layer_id > 0 ||
+      p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (cpi->svc.number_spatial_layers > 1 &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
       (rc->frames_since_key > 20 &&
-       rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
-      (cr->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
+       p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+      (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 &&
+       rc->frames_since_key > 40)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
   cr->percent_refresh = 10;
+  // Increase the amount of refresh for #temporal_layers > 2, and for some
+  // frames after scene change that is encoded at high Q.
+  if (cpi->svc.number_temporal_layers > 2)
+    cr->percent_refresh = 15;
+  else if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+           cr->counter_encode_maxq_scene_change < 20)
+    cr->percent_refresh = 15;
   cr->max_qdelta_perc = 60;
   cr->time_for_refresh = 0;
   cr->motion_thresh = 32;
-  cr->rate_boost_fac = 15;
+  cr->rate_boost_fac =
+      (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
   // periods of the refresh cycle, after a key frame.
   // Account for larger interval on base layer for temporal layers.
   if (cr->percent_refresh > 0 &&
-      rc->frames_since_key < 400 / cr->percent_refresh) {
+      rc->frames_since_key <
+          (4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) {
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
@@ -374,18 +470,18 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
       cr->motion_thresh = 16;
       cr->rate_boost_fac = 13;
     } else {
-      cr->max_qdelta_perc = 70;
-      cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5);
+      cr->max_qdelta_perc = 50;
+      cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0);
     }
   }
-  if (cpi->oxcf.rc_mode == AOM_VBR) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
     // To be adjusted for VBR mode, e.g., based on gf period and boost.
     // For now use smaller qp-delta (than CBR), no second boosted seg, and
     // turn-off (no refresh) on golden refresh (since it's already boosted).
     cr->percent_refresh = 10;
     cr->rate_ratio_qdelta = 1.5;
     cr->rate_boost_fac = 10;
-    if (cpi->refresh_golden_frame == 1) {
+    if (cpi->refresh_frame.golden_frame) {
       cr->percent_refresh = 0;
       cr->rate_ratio_qdelta = 1.0;
     }
@@ -404,6 +500,12 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   if (weight_segment_target < 7 * weight_segment / 8)
     weight_segment = weight_segment_target;
   cr->weight_segment = weight_segment;
+  if (rc->rtc_external_ratectrl) {
+    cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows *
+                                 cm->mi_params.mi_cols / 100;
+    cr->actual_num_seg2_blocks = 0;
+    cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num4x4bl;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -412,27 +514,27 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  int resolution_change =
+  const int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe);
+  const int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
   if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
-  if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0;
   if (!cr->apply_cyclic_refresh) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->enc_seg.map;
     memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_disable_segmentation(&cm->seg);
-    if (cm->current_frame.frame_type == KEY_FRAME) {
-      memset(cr->last_coded_q_map, MAXQ,
-             cm->mi_params.mi_rows * cm->mi_params.mi_cols *
-                 sizeof(*cr->last_coded_q_map));
+    if (cm->current_frame.frame_type == KEY_FRAME || scene_change_detected) {
       cr->sb_index = 0;
+      cr->counter_encode_maxq_scene_change = 0;
     }
     return;
   } else {
+    cr->counter_encode_maxq_scene_change++;
     const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
-                                             cm->seq_params.bit_depth);
-    aom_clear_system_state();
+                                             cm->seq_params->bit_depth);
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
     cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
@@ -440,7 +542,12 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
     // q will not exceed 457, so (q * q) is within 32bit; see:
     // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
     cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
-
+    // For low-resoln or lower speeds, the rate/dist thresholds need to be
+    // tuned/updated.
+    if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) {
+      cr->thresh_dist_sb = 0;
+      cr->thresh_rate_sb = INT64_MAX;
+    }
     // Set up segmentation.
     // Clear down the segment map.
     av1_enable_segmentation(&cm->seg);
@@ -497,5 +604,17 @@ void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
   cr->sb_index = 0;
-  cpi->refresh_golden_frame = 1;
+  cpi->refresh_frame.golden_frame = true;
+  cr->apply_cyclic_refresh = 0;
+  cr->counter_encode_maxq_scene_change = 0;
+}
+
+int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // TODO(marpan): Tune these conditons, add QP dependence.
+  if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
+      cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
+      cpi->rc.frame_source_sad < 1000)
+    return 1;
+  return 0;
 }
diff --git a/media/libaom/src/av1/encoder/aq_cyclicrefresh.h b/media/libaom/src/av1/encoder/aq_cyclicrefresh.h
index ee62f6aaa1..85da647eed 100644
--- a/media/libaom/src/av1/encoder/aq_cyclicrefresh.h
+++ b/media/libaom/src/av1/encoder/aq_cyclicrefresh.h
@@ -13,6 +13,8 @@
 #define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
 
 #include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,46 +29,87 @@ extern "C" {
 // Maximum rate target ratio for setting segment delta-qp.
 #define CR_MAX_RATE_TARGET_RATIO 4.0
 
+/*!
+ * \brief The stucture of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
 struct CYCLIC_REFRESH {
-  // Percentage of blocks per frame that are targeted as candidates
-  // for cyclic refresh.
+  /*!
+   * Percentage of blocks per frame that are targeted as candidates
+   * for cyclic refresh.
+   */
   int percent_refresh;
-  // Maximum q-delta as percentage of base q.
+  /*!
+   * Maximum q-delta as percentage of base q.
+   */
   int max_qdelta_perc;
-  // Superblock starting index for cycling through the frame.
+  /*!
+   *Superblock starting index for cycling through the frame.
+   */
   int sb_index;
-  // Controls how long block will need to wait to be refreshed again, in
-  // excess of the cycle time, i.e., in the case of all zero motion, block
-  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  /*!
+   * Controls how long block will need to wait to be refreshed again, in
+   * excess of the cycle time, i.e., in the case of all zero motion, block
+   * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+   */
   int time_for_refresh;
-  // Target number of (4x4) blocks that are set for delta-q.
+  /*!
+   * Target number of (4x4) blocks that are set for delta-q.
+   */
   int target_num_seg_blocks;
-  // Actual number of (4x4) blocks that were applied delta-q.
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 1.
+   */
   int actual_num_seg1_blocks;
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 2.
+   */
   int actual_num_seg2_blocks;
-  // RD mult. parameters for segment 1.
+  /*!
+   * RD mult. parameters for segment 1.
+   */
   int rdmult;
-  // Cyclic refresh map.
+  /*!
+   * Count of zero motion vectors
+   */
+  int cnt_zeromv;
+  /*!
+   * Cyclic refresh map.
+   */
   int8_t *map;
-  // Map of the last q a block was coded at.
-  uint8_t *last_coded_q_map;
-  // Thresholds applied to the projected rate/distortion of the coding block,
-  // when deciding whether block should be refreshed.
+  /*!
+   * Threshold applied to the projected rate of the coding block,
+   * when deciding whether block should be refreshed.
+   */
   int64_t thresh_rate_sb;
+  /*!
+   * Threshold applied to the projected distortion of the coding block,
+   * when deciding whether block should be refreshed.
+   */
   int64_t thresh_dist_sb;
-  // Threshold applied to the motion vector (in units of 1/8 pel) of the
-  // coding block, when deciding whether block should be refreshed.
+  /*!
+   * Threshold applied to the motion vector (in units of 1/8 pel) of the
+   * coding block, when deciding whether block should be refreshed.
+   */
   int16_t motion_thresh;
-  // Rate target ratio to set q delta.
+  /*!
+   * Rate target ratio to set q delta.
+   */
   double rate_ratio_qdelta;
-  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  /*!
+   * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+   */
   int rate_boost_fac;
-  double low_content_avg;
+
+  /*!\cond */
   int qindex_delta[3];
   double weight_segment;
   int apply_cyclic_refresh;
-  int cnt_zeromv;
-  double avg_frame_low_motion;
+  int skip_over4x4;
+  int counter_encode_maxq_scene_change;
+  /*!\endcond */
 };
 
 struct AV1_COMP;
@@ -77,40 +120,199 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
 
-// Estimate the bits, incorporating the delta-q from segment 1, after encoding
-// the frame.
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       correction_factor rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
 int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
                                           double correction_factor);
 
-// Estimate the bits per mb, for a given q = i and a corresponding delta-q
-// (for segment 1), prior to encoding the frame.
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       i                 q index
+ * \param[in]       correction_factor rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
 int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
                                       double correction_factor);
 
-// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
-// check if we should reset the segment_id, and update the cyclic_refresh map
-// and segmentation map.
+/*!\brief Update segment_id for blocks are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ *
+ * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+                                   MACROBLOCK *const x, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
+
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ * \param[in]   rate      Projected block rate from pickmode
+ * \param[in]   dist      Projected block dist from pickmode
+ * \param[in]   skip      Skip flag set from picmode
+ * \param[in]   dry_run   A code indicating whether it is part of the final
+ *                         pass for reconstructing the superblock
+ *
+ * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
 void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
-                                       MB_MODE_INFO *const mbmi, int mi_row,
+                                       MACROBLOCK *const x, int mi_row,
                                        int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip);
+                                       int64_t rate, int64_t dist, int skip,
+                                       RUN_TYPE dry_run);
+
+/*!\brief Initialize counters used for cyclic refresh.
+ *
+ * Initializes cyclic refresh counters cnt_zeromv, actual_num_seg1_blocks and
+ * actual_num_seg2_blocks.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ *
+ * \return Update the \c x->cnt_zeromv, the \c x->actual_num_seg1_blocks and
+ * the \c x->actual_num_seg1_blocks.
+ */
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
 
-// Update the some stats after encode frame is done.
+/*!\brief Accumulate cyclic refresh counters.
+ *
+ * Accumulates cyclic refresh counters cnt_zeromv, actual_num_seg1_blocks and
+ * actual_num_seg2_blocks from MACROBLOCK strcture to CYCLIC_REFRESH strcture.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cyclic_refresh Pointer to CYCLIC_REFRESH structure
+ * \param[in]   x              Pointer to MACROBLOCK structure
+ *
+ * \return Update the \c cyclic_refresh->cnt_zeromv, the \c
+ * cyclic_refresh->actual_num_seg1_blocks and the \c
+ * cyclic_refresh->actual_num_seg1_blocks.
+ */
+void av1_accumulate_cyclic_refresh_counters(
+    CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x);
+
+/*!\brief Update stats after encoding frame.
+ *
+ * Update the number of block encoded with segment 1 and 2,
+ * and update the number of blocks encoded with small/zero motion.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the new stats.
+ */
 void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
 
-// Set golden frame update interval, for 1 pass CBR mode.
+/*!\brief Set golden frame update interval nased on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ *
+ * \return Returns the interval in \c cpi->rc.baseline_gf_interval.
+ */
 void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
 
-// Set/update global/frame level refresh parameters.
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets
+ * the amount/percent of refresh, and the amount of boost applied to
+ * the two segments (set by rate_ratio_qdelta and rate_boost_fac).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the settings.
+ */
 void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
 
-// Setup cyclic background refresh: set delta q and segmentation map.
+/*!\brief Setup the cyclic background refresh.
+ *
+ * Set the delta q for the segment(s), and set the segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * parameters and the \c cm->seg with the segmentation data.
+ */
 void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
 
 int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
 
 void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
 
+int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi);
+
 static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
   return segment_id == CR_SEGMENT_ID_BOOST1 ||
          segment_id == CR_SEGMENT_ID_BOOST2;
diff --git a/media/libaom/src/av1/encoder/aq_variance.c b/media/libaom/src/av1/encoder/aq_variance.c
index 4176da292c..3273ef8ed2 100644
--- a/media/libaom/src/av1/encoder/aq_variance.c
+++ b/media/libaom/src/av1/encoder/aq_variance.c
@@ -20,7 +20,6 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/dwt.h"
-#include "aom_ports/system_state.h"
 
 static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
                                                  0.9, .8,  .7,  .6 };
@@ -44,6 +43,7 @@ static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *seg = &cm->seg;
   int i;
@@ -51,7 +51,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
   int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
-  int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+  int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2);
   double avg_ratio;
   if (avg_energy > 7) avg_energy = 7;
   if (avg_energy < 0) avg_energy = 0;
@@ -60,26 +60,24 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
   if (resolution_change) {
     memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_clearall_segfeatures(seg);
-    aom_clear_system_state();
     av1_disable_segmentation(seg);
     return;
   }
   if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->refresh_alt_ref_frame ||
-      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      refresh_frame->alt_ref_frame ||
+      (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     cpi->vaq_refresh = 1;
 
     av1_enable_segmentation(seg);
     av1_clearall_segfeatures(seg);
 
-    aom_clear_system_state();
-
     for (i = 0; i < MAX_SEGMENTS; ++i) {
       // Set up avg segment id to be 1.0 and adjust the other segments around
       // it.
       int qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
-          rate_ratio[i] / avg_ratio, cm->seq_params.bit_depth);
+          rate_ratio[i] / avg_ratio, cpi->is_screen_content_type,
+          cm->seq_params->bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -118,23 +116,21 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
   const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
 
-  aom_clear_system_state();
-
   for (i = 0; i < bh; i += 4) {
     for (j = 0; j < bw; j += 4) {
       if (is_cur_buf_hbd(xd)) {
         var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride,
                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
-                          16);
+                          16.0);
       } else {
         var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
-                          16);
+                          16.0);
       }
     }
   }
@@ -142,41 +138,63 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   var /= (bw / 4 * bh / 4);
   if (var > 7) var = 7;
 
-  aom_clear_system_state();
   return (int)(var);
 }
 
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col) {
+  // This functions returns the block average of luma block
+  unsigned int sum, avg, num_pix;
+  int r, c;
+  const int pic_w = cpi->common.width;
+  const int pic_h = cpi->common.height;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+  sum = 0;
+  num_pix = 0;
+  avg = 0;
+  int row = mi_row << MI_SIZE_LOG2;
+  int col = mi_col << MI_SIZE_LOG2;
+  for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+    for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+      sum += *(x16 + r * x->plane[0].src.stride + c);
+      num_pix++;
+    }
+  }
+  if (num_pix != 0) {
+    avg = sum / num_pix;
+  }
+  return avg;
+}
+
 #define DEFAULT_E_MIDPOINT 10.0
 
 static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   MACROBLOCKD *xd = &x->e_mbd;
   int stride = x->plane[0].src.stride;
   uint8_t *buf = x->plane[0].src.buf;
-  const int bw = MI_SIZE * mi_size_wide[bs];
-  const int bh = MI_SIZE * mi_size_high[bs];
+  const int num_8x8_cols = block_size_wide[bs] / 8;
+  const int num_8x8_rows = block_size_high[bs] / 8;
   const int hbd = is_cur_buf_hbd(xd);
 
-  int var = 0;
-  for (int r = 0; r < bh; r += 8)
-    for (int c = 0; c < bw; c += 8) {
-      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
-    }
+  int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+                                                num_8x8_cols);
 
   return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
 }
 
 double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   unsigned int haar_sad = haar_ac_energy(x, bs);
-  aom_clear_system_state();
   return log(haar_sad + 1.0);
 }
 
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bs) {
   double energy, energy_midpoint;
-  aom_clear_system_state();
   energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
-                        ? cpi->twopass.frame_avg_haar_energy
+                        ? cpi->twopass_frame.frame_avg_haar_energy
                         : DEFAULT_E_MIDPOINT;
   energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
@@ -196,7 +214,8 @@ int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
   const int base_qindex = cm->quant_params.base_qindex;
   int qindex_delta = av1_compute_qdelta_by_rate(
       &cpi->rc, cm->current_frame.frame_type, base_qindex,
-      deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth);
+      deltaq_rate_ratio[rate_level], cpi->is_screen_content_type,
+      cm->seq_params->bit_depth);
 
   if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
     qindex_delta = -base_qindex + 1;
diff --git a/media/libaom/src/av1/encoder/aq_variance.h b/media/libaom/src/av1/encoder/aq_variance.h
index 543eb0b511..aa0535ad72 100644
--- a/media/libaom/src/av1/encoder/aq_variance.h
+++ b/media/libaom/src/av1/encoder/aq_variance.h
@@ -21,6 +21,8 @@ extern "C" {
 void av1_vaq_frame_setup(AV1_COMP *cpi);
 
 int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col);
 int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
                                                 int block_var_level);
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
diff --git a/media/libaom/src/av1/encoder/arm/neon/av1_error_neon.c b/media/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
index 22da1a8d66..124c1fdda1 100644
--- a/media/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
+++ b/media/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
@@ -11,8 +11,8 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "av1/common/arm/mem_neon.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
 
 int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
diff --git a/media/libaom/src/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/media/libaom/src/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000000..153a2382b8
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,4402 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1))
+
+static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+#if defined(__aarch64__)
+  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
+  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
+#else
+  int16x4x2_t temp;
+  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
+  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
+  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+  int32x4x2_t a01 =
+      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+  out[0] = vreinterpretq_s16_s32(a01.val[0]);
+  out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
+  out[2] = vreinterpretq_s16_s32(a01.val[1]);
+  out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
+}
+
+static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+#if defined(__aarch64__)
+  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
+  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
+  const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
+  const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
+#else
+  int16x4x2_t temp;
+  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
+  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
+  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
+  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+  temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
+  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+  const int32x4x2_t b02 =
+      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+  const int32x4x2_t b13 =
+      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+                                            vreinterpretq_s64_s32(b13.val[0])));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+                                            vreinterpretq_s64_s32(b13.val[1])));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+  out[1] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
+  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
+
+  const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
+                                    vreinterpretq_s32_s16(a15.val[0]));
+  const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
+                                    vreinterpretq_s32_s16(a15.val[1]));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
+                                            vreinterpretq_s64_s32(zeros)));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
+  out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
+  out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
+  out[4] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
+  out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
+  out[6] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
+  out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
+#endif
+}
+
+static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
+                                       int16x8_t *const out) {
+  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
+  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
+  const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
+  const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);
+
+  const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
+                                    vreinterpretq_s32_s16(a15.val[0]));
+  const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
+                                    vreinterpretq_s32_s16(a37.val[0]));
+  const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
+                                    vreinterpretq_s32_s16(a15.val[1]));
+  const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
+                                    vreinterpretq_s32_s16(a37.val[1]));
+
+#if defined(__aarch64__)
+  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
+                                            vreinterpretq_s64_s32(b15.val[0])));
+  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
+                                            vreinterpretq_s64_s32(b15.val[0])));
+  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
+                                            vreinterpretq_s64_s32(b15.val[1])));
+  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
+                                            vreinterpretq_s64_s32(b15.val[1])));
+  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
+                                            vreinterpretq_s64_s32(b37.val[0])));
+  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
+                                            vreinterpretq_s64_s32(b37.val[0])));
+  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
+                                            vreinterpretq_s64_s32(b37.val[1])));
+  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
+                                            vreinterpretq_s64_s32(b37.val[1])));
+#else
+  out[0] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
+  out[1] = vreinterpretq_s16_s32(
+      vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
+  out[2] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
+  out[4] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
+  out[5] = vreinterpretq_s16_s32(
+      vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
+  out[6] = vreinterpretq_s16_s32(
+      vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
+  out[7] = vreinterpretq_s16_s32(
+      vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
+#endif
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size) {
+  int i;
+  for (i = 0; i < size; i++) {
+    output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
+                             NewSqrt2Bits);
+  }
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size) {
+  int i;
+  for (i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
+}
+
+#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                       \
+    out0 = vmulq_n_s32(in0, w0);                             \
+    out0 = vmlaq_n_s32(out0, in1, w1);                       \
+    out0 = vrshlq_s32(out0, v_cos_bit);                      \
+    out1 = vmulq_n_s32(in0, w1);                             \
+    out1 = vmlsq_n_s32(out1, in1, w0);                       \
+    out1 = vrshlq_s32(out1, v_cos_bit);                      \
+  } while (0)
+
+#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit);          \
+  } while (0)
+
+#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    out0 = vmulq_n_s32(in1, w1);                                   \
+    out0 = vmlsq_n_s32(out0, in0, w0);                             \
+    out0 = vrshlq_s32(out0, v_cos_bit);                            \
+    out1 = vmulq_n_s32(in0, w1);                                   \
+    out1 = vmlaq_n_s32(out1, in1, w0);                             \
+    out1 = vrshlq_s32(out1, v_cos_bit);                            \
+  } while (0)
+
+#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                              \
+    out0 = vmulq_n_s32(in1, w1);                                    \
+    out0 = vmlaq_n_s32(out0, in0, w0);                              \
+    out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit);                  \
+    out1 = vmulq_n_s32(in1, w0);                                    \
+    out1 = vmlsq_n_s32(out1, in0, w1);                              \
+    out1 = vrshlq_s32(out1, v_cos_bit);                             \
+  } while (0)
+
+static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
+                                           int32_t *const b) {
+  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
+}
+
+static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
+  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
+  vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(
+    const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
+    const int32x4_t *v_newsqrt2bits) {
+  const int32x4_t b_lo =
+      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  vst1q_s32(b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
+                                             int32_t *const b,
+                                             const int16x4_t *v_newsqrt2,
+                                             const int32x4_t *v_newsqrt2bits) {
+  const int32x4_t b_lo =
+      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  const int32x4_t b_hi =
+      vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
+  vst1q_s32(b, b_lo);
+  vst1q_s32((b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+                                                 const int stride,
+                                                 int16x8_t *const out,
+                                                 const int out_size) {
+  for (int i = 0; i < out_size; ++i)
+    out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
+        (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+                                                      const int stride,
+                                                      int16x8_t *const out,
+                                                      const int out_size) {
+  for (int i = 0; i < out_size; ++i)
+    out[out_size - i - 1] = vreinterpretq_s16_u64(
+        vld1q_lane_u64((uint64_t *)(in + i * stride),
+                       vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+                                              int16x8_t *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1q_s16(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, int16x8_t *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = vld1q_s16(in + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit_w4(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(
+    const int16x8_t *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
+                                 &v_newsqrt2bits);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(
+    const int16x8_t *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
+                              &v_newsqrt2bits);
+  }
+}
+
+static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
+  const int16x8_t vbit = vdupq_n_s16(bit);
+  for (int i = 0; i < size; ++i) {
+    in[i] = vrshlq_s16(in[i], vbit);
+  }
+}
+
+static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
+                                            const int16x8_t *v_bit) {
+  for (int i = 0; i < size; ++i) {
+    in[i] = vrshlq_s16(in[i], *v_bit);
+  }
+}
+
+void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+
+  int32x4_t u[6], v[6];
+
+  u[0] = vmovl_s16(vget_low_s16(input[0]));
+  u[1] = vmovl_s16(vget_low_s16(input[1]));
+  u[2] = vmovl_s16(vget_low_s16(input[2]));
+  u[3] = vmovl_s16(vget_low_s16(input[3]));
+  u[4] = vaddq_s32(u[0], u[1]);
+  v[5] = vmulq_n_s32(u[2], sinpi[3]);
+  v[0] = vmulq_n_s32(u[1], sinpi[2]);
+  v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
+  v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
+  v[2] = vmulq_n_s32(u[4], sinpi[3]);
+  v[3] = vmulq_n_s32(u[0], sinpi[4]);
+  v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
+  v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);
+
+  u[0] = vaddq_s32(v[0], v[1]);
+  u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
+  u[2] = vsubq_s32(v[3], v[4]);
+  u[3] = vsubq_s32(u[2], u[0]);
+  u[5] = vmlaq_n_s32(u[3], v[5], 3);
+
+  int32x4_t vshift = vdupq_n_s32(-cos_bit);
+  u[0] = vrshlq_s32(u[0], vshift);
+  u[1] = vrshlq_s32(u[1], vshift);
+  u[2] = vrshlq_s32(u[2], vshift);
+  u[3] = vrshlq_s32(u[5], vshift);
+
+  output[0] = custom_packs_s32(u[0], u[2]);
+
+  output[1] = custom_packs_s32(u[1], u[3]);
+  output[2] = vextq_s16(output[0], output[0], 4);
+  output[3] = vextq_s16(output[1], output[1], 4);
+}
+
+#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
+                       v_cos_bit)                                    \
+  do {                                                               \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                         \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                               \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w1_l);                         \
+    v0 = vmlaq_n_s32(v0, in1_l, w1_h);                               \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                        \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                        \
+    const int16x4_t c1 = vqmovn_s32(c0);                             \
+    const int16x4_t d1 = vqmovn_s32(d0);                             \
+    out0 = vcombine_s16(c1, c1);                                     \
+    out1 = vcombine_s16(d1, c1);                                     \
+  } while (0)
+
+#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                    \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in1_l, w0_h);                              \
+    u0 = vmlsq_n_s32(u0, in0_l, w0_l);                                    \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
+    v0 = vmlaq_n_s32(v0, in1_l, w0_l);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  } while (0)
+
+#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                    \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
+    int32x4_t v0 = vmulq_n_s32(in1_l, w0_l);                              \
+    v0 = vmlsq_n_s32(v0, in0_l, w0_h);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  } while (0)
+
+#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                    \
+    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
+    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
+    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
+    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
+    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
+    v0 = vmlsq_n_s32(v0, in1_l, w0_l);                                    \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
+    const int16x4_t c1 = vqmovn_s32(c0);                                  \
+    const int16x4_t d1 = vqmovn_s32(d0);                                  \
+    out0 = vcombine_s16(c1, c1);                                          \
+    out1 = vcombine_s16(d1, c1);                                          \
+  } while (0)
+
+static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1-2
+  int16x8_t x2[8];
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
+                       x2[2], x2[3], v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
+                       x2[6], x2[7], v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[2] = vqsubq_s16(input[0], x2[2]);
+  x3[1] = vqsubq_s16(x2[3], input[7]);
+  x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
+  x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
+  x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
+  x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[7] = vqsubq_s16(input[6], x2[7]);
+
+  // stage 4
+  int16x8_t x4[8];
+
+  btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
+                       v_cos_bit);
+  btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
+                       v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[8];
+  x5[0] = vqaddq_s16(x3[0], x4[4]);
+  x5[4] = vqsubq_s16(x3[0], x4[4]);
+  x5[1] = vqaddq_s16(x3[1], x4[5]);
+  x5[5] = vqsubq_s16(x3[1], x4[5]);
+  x5[2] = vqaddq_s16(x3[2], x4[6]);
+  x5[6] = vqsubq_s16(x3[2], x4[6]);
+  x5[3] = vqaddq_s16(x3[3], x4[7]);
+  x5[7] = vqsubq_s16(x3[3], x4[7]);
+
+  // stage 6-7
+  btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
+                       v_cos_bit);
+  btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
+                       v_cos_bit);
+}
+
+static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+
+  const int16x8_t in7 = vaddq_s16(input[0], input[1]);
+  int32x4_t u_lo[8], u_hi[8], v_hi[8];
+
+  int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
+  int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
+  int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
+  int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
+  int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
+  int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
+  int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
+  int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
+  int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
+  int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));
+
+  u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);
+
+  u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);
+
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
+  u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);
+
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
+  u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);
+
+  u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);
+
+  v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
+  u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
+  u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);
+
+  u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
+  u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);
+
+  u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
+  u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);
+
+  u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
+  u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);
+
+  u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);
+
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);
+
+  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
+  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
+
+  u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
+  u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);
+
+  u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
+  u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
+  u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
+  u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
+  u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
+  u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
+  u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
+  u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);
+
+  output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
+  output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
+  output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
+  output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
+}
+
+void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int32x4_t u[4];
+
+  int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
+  int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
+  int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
+  int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
+
+  int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
+  int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
+  u[0] = vaddq_s32(u0ad1, u0ad2);
+  u[1] = vsubq_s32(u0ad2, u0ad1);
+  u[2] = vmulq_n_s32(in12s, cospi[48]);
+  u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);
+
+  u[3] = vmulq_n_s32(in03s, cospi[48]);
+  u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);
+
+  u[0] = vrshlq_s32(u[0], v_cos_bit);
+  u[1] = vrshlq_s32(u[1], v_cos_bit);
+  u[2] = vrshlq_s32(u[2], v_cos_bit);
+  u[3] = vrshlq_s32(u[3], v_cos_bit);
+
+  output[0] = custom_packs_s32(u[0], u[1]);
+  output[1] = custom_packs_s32(u[2], u[3]);
+  output[2] = vextq_s16(output[0], output[0], 4);
+  output[3] = vextq_s16(output[1], output[1], 4);
+}
+
+#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
+  do {                                                            \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));             \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));           \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));             \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));           \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                    \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                          \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                   \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                         \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w1_h);                    \
+    v0 = vmlaq_n_s32(v0, in_low0, w1_l);                          \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w1_h);                   \
+    v1 = vmlaq_n_s32(v1, in_high0, w1_l);                         \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                     \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                     \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                     \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                     \
+    out0 = custom_packs_s32(c0, c1);                              \
+    out1 = custom_packs_s32(d0, d1);                              \
+  } while (0)
+
+#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                 \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  } while (0)
+
+#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                 \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low0, w0_l);                         \
+    u0 = vmlsq_n_s32(u0, in_low1, w0_h);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high0, w0_l);                        \
+    u1 = vmlsq_n_s32(u1, in_high1, w0_h);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  } while (0)
+
+#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                  \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                   \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                 \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                   \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                 \
+    int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h);                         \
+    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                                \
+    int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h);                        \
+    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                               \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                          \
+    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                                \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                         \
+    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                               \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                           \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                           \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                           \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                           \
+    out0 = custom_packs_s32(c0, c1);                                    \
+    out1 = custom_packs_s32(d0, d1);                                    \
+  } while (0)
+
+#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                 \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
+    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
+    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  } while (0)
+
+#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                                 \
+    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
+    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
+    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
+    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
+    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
+    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
+    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
+    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
+    int32x4_t v0 = vmulq_n_s32(in_low0, w0_h);                         \
+    v0 = vmlsq_n_s32(v0, in_low1, w0_l);                               \
+    int32x4_t v1 = vmulq_n_s32(in_high0, w0_h);                        \
+    v1 = vmlsq_n_s32(v1, in_high1, w0_l);                              \
+    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
+    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
+    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
+    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
+    out0 = custom_packs_s32(c0, c1);                                   \
+    out1 = custom_packs_s32(d0, d1);                                   \
+  } while (0)
+
+static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
+                         int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[4];
+  x1[0] = vqaddq_s16(input[0], input[3]);
+  x1[3] = vqsubq_s16(input[0], input[3]);
+  x1[1] = vqaddq_s16(input[1], input[2]);
+  x1[2] = vqsubq_s16(input[1], input[2]);
+
+  // stage 2
+  int16x8_t x2[4];
+  btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
+                    v_cos_bit);
+
+  // stage 3
+  output[0] = x2[0];
+  output[1] = x2[2];
+  output[2] = x2[1];
+  output[3] = x2[3];
+}
+
+static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
+                         int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[8];
+  x1[0] = vqaddq_s16(input[0], input[7]);
+  x1[7] = vqsubq_s16(input[0], input[7]);
+  x1[1] = vqaddq_s16(input[1], input[6]);
+  x1[6] = vqsubq_s16(input[1], input[6]);
+  x1[2] = vqaddq_s16(input[2], input[5]);
+  x1[5] = vqsubq_s16(input[2], input[5]);
+  x1[3] = vqaddq_s16(input[3], input[4]);
+  x1[4] = vqsubq_s16(input[3], input[4]);
+
+  // stage 2
+  int16x8_t x2[8];
+  x2[0] = vqaddq_s16(x1[0], x1[3]);
+  x2[3] = vqsubq_s16(x1[0], x1[3]);
+  x2[1] = vqaddq_s16(x1[1], x1[2]);
+  x2[2] = vqsubq_s16(x1[1], x1[2]);
+
+  btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
+                       v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
+                       v_cos_bit);
+
+  btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
+                       v_cos_bit);
+  x3[4] = vqaddq_s16(x1[4], x2[5]);
+  x3[5] = vqsubq_s16(x1[4], x2[5]);
+  x3[6] = vqsubq_s16(x1[7], x2[6]);
+  x3[7] = vqaddq_s16(x1[7], x2[6]);
+
+  // stage 4-5
+  btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
+                       v_cos_bit);
+  btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
+                       v_cos_bit);
+}
+
+void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                  const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[8];
+  x1[0] = vqaddq_s16(input[0], input[7]);
+  x1[7] = vqsubq_s16(input[0], input[7]);
+  x1[1] = vqaddq_s16(input[1], input[6]);
+  x1[6] = vqsubq_s16(input[1], input[6]);
+  x1[2] = vqaddq_s16(input[2], input[5]);
+  x1[5] = vqsubq_s16(input[2], input[5]);
+  x1[3] = vqaddq_s16(input[3], input[4]);
+  x1[4] = vqsubq_s16(input[3], input[4]);
+
+  // stage 2
+  int16x8_t x2[8];
+  x2[0] = vqaddq_s16(x1[0], x1[3]);
+  x2[3] = vqsubq_s16(x1[0], x1[3]);
+  x2[1] = vqaddq_s16(x1[1], x1[2]);
+  x2[2] = vqsubq_s16(x1[1], x1[2]);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[8];
+  btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
+                    v_cos_bit);
+  x3[4] = vqaddq_s16(x1[4], x2[5]);
+  x3[5] = vqsubq_s16(x1[4], x2[5]);
+  x3[6] = vqsubq_s16(x1[7], x2[6]);
+  x3[7] = vqaddq_s16(x1[7], x2[6]);
+
+  // stage 4-5
+  btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
+                    v_cos_bit);
+}
+
+static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
+                          int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[16];
+  x1[0] = vqaddq_s16(input[0], input[15]);
+  x1[15] = vqsubq_s16(input[0], input[15]);
+  x1[1] = vqaddq_s16(input[1], input[14]);
+  x1[14] = vqsubq_s16(input[1], input[14]);
+  x1[2] = vqaddq_s16(input[2], input[13]);
+  x1[13] = vqsubq_s16(input[2], input[13]);
+  x1[3] = vqaddq_s16(input[3], input[12]);
+  x1[12] = vqsubq_s16(input[3], input[12]);
+  x1[4] = vqaddq_s16(input[4], input[11]);
+  x1[11] = vqsubq_s16(input[4], input[11]);
+  x1[5] = vqaddq_s16(input[5], input[10]);
+  x1[10] = vqsubq_s16(input[5], input[10]);
+  x1[6] = vqaddq_s16(input[6], input[9]);
+  x1[9] = vqsubq_s16(input[6], input[9]);
+  x1[7] = vqaddq_s16(input[7], input[8]);
+  x1[8] = vqsubq_s16(input[7], input[8]);
+
+  // stage 2
+  int16x8_t x2[16];
+  x2[0] = vqaddq_s16(x1[0], x1[7]);
+  x2[7] = vqsubq_s16(x1[0], x1[7]);
+  x2[1] = vqaddq_s16(x1[1], x1[6]);
+  x2[6] = vqsubq_s16(x1[1], x1[6]);
+  x2[2] = vqaddq_s16(x1[2], x1[5]);
+  x2[5] = vqsubq_s16(x1[2], x1[5]);
+  x2[3] = vqaddq_s16(x1[3], x1[4]);
+  x2[4] = vqsubq_s16(x1[3], x1[4]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[16];
+  x3[0] = vqaddq_s16(x2[0], x2[3]);
+  x3[3] = vqsubq_s16(x2[0], x2[3]);
+  x3[1] = vqaddq_s16(x2[1], x2[2]);
+  x3[2] = vqsubq_s16(x2[1], x2[2]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
+                    v_cos_bit);
+
+  x3[8] = vqaddq_s16(x1[8], x2[11]);
+  x3[11] = vqsubq_s16(x1[8], x2[11]);
+  x3[9] = vqaddq_s16(x1[9], x2[10]);
+  x3[10] = vqsubq_s16(x1[9], x2[10]);
+  x3[12] = vqsubq_s16(x1[15], x2[12]);
+  x3[15] = vqaddq_s16(x1[15], x2[12]);
+  x3[13] = vqsubq_s16(x1[14], x2[13]);
+  x3[14] = vqaddq_s16(x1[14], x2[13]);
+
+  // stage 4
+  int16x8_t x4[16];
+  btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
+              output[0], output[8]);
+  btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
+              output[4], output[12]);
+  x4[4] = vqaddq_s16(x2[4], x3[5]);
+  x4[5] = vqsubq_s16(x2[4], x3[5]);
+  x4[6] = vqsubq_s16(x2[7], x3[6]);
+  x4[7] = vqaddq_s16(x2[7], x3[6]);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[16];
+
+  btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
+                    v_cos_bit);
+  x5[8] = vqaddq_s16(x3[8], x4[9]);
+  x5[9] = vqsubq_s16(x3[8], x4[9]);
+  x5[10] = vqsubq_s16(x3[11], x4[10]);
+  x5[11] = vqaddq_s16(x3[11], x4[10]);
+  x5[12] = vqaddq_s16(x3[12], x4[13]);
+  x5[13] = vqsubq_s16(x3[12], x4[13]);
+  x5[14] = vqsubq_s16(x3[15], x4[14]);
+  x5[15] = vqaddq_s16(x3[15], x4[14]);
+
+  // stage 6-7
+  btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
+                    v_cos_bit);
+}
+
+void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[32];
+  x1[0] = vqaddq_s16(input[0], input[31]);
+  x1[31] = vqsubq_s16(input[0], input[31]);
+  x1[1] = vqaddq_s16(input[1], input[30]);
+  x1[30] = vqsubq_s16(input[1], input[30]);
+  x1[2] = vqaddq_s16(input[2], input[29]);
+  x1[29] = vqsubq_s16(input[2], input[29]);
+  x1[3] = vqaddq_s16(input[3], input[28]);
+  x1[28] = vqsubq_s16(input[3], input[28]);
+  x1[4] = vqaddq_s16(input[4], input[27]);
+  x1[27] = vqsubq_s16(input[4], input[27]);
+  x1[5] = vqaddq_s16(input[5], input[26]);
+  x1[26] = vqsubq_s16(input[5], input[26]);
+  x1[6] = vqaddq_s16(input[6], input[25]);
+  x1[25] = vqsubq_s16(input[6], input[25]);
+  x1[7] = vqaddq_s16(input[7], input[24]);
+  x1[24] = vqsubq_s16(input[7], input[24]);
+  x1[8] = vqaddq_s16(input[8], input[23]);
+  x1[23] = vqsubq_s16(input[8], input[23]);
+  x1[9] = vqaddq_s16(input[9], input[22]);
+  x1[22] = vqsubq_s16(input[9], input[22]);
+  x1[10] = vqaddq_s16(input[10], input[21]);
+  x1[21] = vqsubq_s16(input[10], input[21]);
+  x1[11] = vqaddq_s16(input[11], input[20]);
+  x1[20] = vqsubq_s16(input[11], input[20]);
+  x1[12] = vqaddq_s16(input[12], input[19]);
+  x1[19] = vqsubq_s16(input[12], input[19]);
+  x1[13] = vqaddq_s16(input[13], input[18]);
+  x1[18] = vqsubq_s16(input[13], input[18]);
+  x1[14] = vqaddq_s16(input[14], input[17]);
+  x1[17] = vqsubq_s16(input[14], input[17]);
+  x1[15] = vqaddq_s16(input[15], input[16]);
+  x1[16] = vqsubq_s16(input[15], input[16]);
+
+  // stage 2
+  int16x8_t x2[32];
+  x2[0] = vqaddq_s16(x1[0], x1[15]);
+  x2[15] = vqsubq_s16(x1[0], x1[15]);
+  x2[1] = vqaddq_s16(x1[1], x1[14]);
+  x2[14] = vqsubq_s16(x1[1], x1[14]);
+  x2[2] = vqaddq_s16(x1[2], x1[13]);
+  x2[13] = vqsubq_s16(x1[2], x1[13]);
+  x2[3] = vqaddq_s16(x1[3], x1[12]);
+  x2[12] = vqsubq_s16(x1[3], x1[12]);
+  x2[4] = vqaddq_s16(x1[4], x1[11]);
+  x2[11] = vqsubq_s16(x1[4], x1[11]);
+  x2[5] = vqaddq_s16(x1[5], x1[10]);
+  x2[10] = vqsubq_s16(x1[5], x1[10]);
+  x2[6] = vqaddq_s16(x1[6], x1[9]);
+  x2[9] = vqsubq_s16(x1[6], x1[9]);
+  x2[7] = vqaddq_s16(x1[7], x1[8]);
+  x2[8] = vqsubq_s16(x1[7], x1[8]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
+                    v_cos_bit);
+
+  // stage 3
+  int16x8_t x3[32];
+  x3[0] = vqaddq_s16(x2[0], x2[7]);
+  x3[7] = vqsubq_s16(x2[0], x2[7]);
+  x3[1] = vqaddq_s16(x2[1], x2[6]);
+  x3[6] = vqsubq_s16(x2[1], x2[6]);
+  x3[2] = vqaddq_s16(x2[2], x2[5]);
+  x3[5] = vqsubq_s16(x2[2], x2[5]);
+  x3[3] = vqaddq_s16(x2[3], x2[4]);
+  x3[4] = vqsubq_s16(x2[3], x2[4]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
+                    v_cos_bit);
+
+  x3[16] = vqaddq_s16(x1[16], x2[23]);
+  x3[23] = vqsubq_s16(x1[16], x2[23]);
+  x3[17] = vqaddq_s16(x1[17], x2[22]);
+  x3[22] = vqsubq_s16(x1[17], x2[22]);
+  x3[18] = vqaddq_s16(x1[18], x2[21]);
+  x3[21] = vqsubq_s16(x1[18], x2[21]);
+  x3[19] = vqaddq_s16(x1[19], x2[20]);
+  x3[20] = vqsubq_s16(x1[19], x2[20]);
+  x3[24] = vqsubq_s16(x1[31], x2[24]);
+  x3[31] = vqaddq_s16(x1[31], x2[24]);
+  x3[25] = vqsubq_s16(x1[30], x2[25]);
+  x3[30] = vqaddq_s16(x1[30], x2[25]);
+  x3[26] = vqsubq_s16(x1[29], x2[26]);
+  x3[29] = vqaddq_s16(x1[29], x2[26]);
+  x3[27] = vqsubq_s16(x1[28], x2[27]);
+  x3[28] = vqaddq_s16(x1[28], x2[27]);
+
+  // stage 4
+  int16x8_t x4[32];
+  x4[0] = vqaddq_s16(x3[0], x3[3]);
+  x4[3] = vqsubq_s16(x3[0], x3[3]);
+  x4[1] = vqaddq_s16(x3[1], x3[2]);
+  x4[2] = vqsubq_s16(x3[1], x3[2]);
+  btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
+                    v_cos_bit);
+  x4[8] = vqaddq_s16(x2[8], x3[11]);
+  x4[11] = vqsubq_s16(x2[8], x3[11]);
+  x4[9] = vqaddq_s16(x2[9], x3[10]);
+  x4[10] = vqsubq_s16(x2[9], x3[10]);
+  x4[12] = vqsubq_s16(x2[15], x3[12]);
+  x4[15] = vqaddq_s16(x2[15], x3[12]);
+  x4[13] = vqsubq_s16(x2[14], x3[13]);
+  x4[14] = vqaddq_s16(x2[14], x3[13]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[32];
+  btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
+                    v_cos_bit);
+  x5[4] = vqaddq_s16(x3[4], x4[5]);
+  x5[5] = vqsubq_s16(x3[4], x4[5]);
+  x5[6] = vqsubq_s16(x3[7], x4[6]);
+  x5[7] = vqaddq_s16(x3[7], x4[6]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
+                     v_cos_bit);
+
+  x5[16] = vqaddq_s16(x3[16], x4[19]);
+  x5[19] = vqsubq_s16(x3[16], x4[19]);
+  x5[17] = vqaddq_s16(x3[17], x4[18]);
+  x5[18] = vqsubq_s16(x3[17], x4[18]);
+  x5[20] = vqsubq_s16(x3[23], x4[20]);
+  x5[23] = vqaddq_s16(x3[23], x4[20]);
+  x5[21] = vqsubq_s16(x3[22], x4[21]);
+  x5[22] = vqaddq_s16(x3[22], x4[21]);
+  x5[24] = vqaddq_s16(x3[24], x4[27]);
+  x5[27] = vqsubq_s16(x3[24], x4[27]);
+  x5[25] = vqaddq_s16(x3[25], x4[26]);
+  x5[26] = vqsubq_s16(x3[25], x4[26]);
+  x5[28] = vqsubq_s16(x3[31], x4[28]);
+  x5[31] = vqaddq_s16(x3[31], x4[28]);
+  x5[29] = vqsubq_s16(x3[30], x4[29]);
+  x5[30] = vqaddq_s16(x3[30], x4[29]);
+
+  // stage 6
+  int16x8_t x6[32];
+  btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
+                    v_cos_bit);
+  x6[8] = vqaddq_s16(x4[8], x5[9]);
+  x6[9] = vqsubq_s16(x4[8], x5[9]);
+  x6[10] = vqsubq_s16(x4[11], x5[10]);
+  x6[11] = vqaddq_s16(x4[11], x5[10]);
+  x6[12] = vqaddq_s16(x4[12], x5[13]);
+  x6[13] = vqsubq_s16(x4[12], x5[13]);
+  x6[14] = vqsubq_s16(x4[15], x5[14]);
+  x6[15] = vqaddq_s16(x4[15], x5[14]);
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
+                     v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[32];
+  btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
+                    output[22], v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
+                    v_cos_bit);
+  x7[16] = vqaddq_s16(x5[16], x6[17]);
+  x7[17] = vqsubq_s16(x5[16], x6[17]);
+  x7[18] = vqsubq_s16(x5[19], x6[18]);
+  x7[19] = vqaddq_s16(x5[19], x6[18]);
+  x7[20] = vqaddq_s16(x5[20], x6[21]);
+  x7[21] = vqsubq_s16(x5[20], x6[21]);
+  x7[22] = vqsubq_s16(x5[23], x6[22]);
+  x7[23] = vqaddq_s16(x5[23], x6[22]);
+  x7[24] = vqaddq_s16(x5[24], x6[25]);
+  x7[25] = vqsubq_s16(x5[24], x6[25]);
+  x7[26] = vqsubq_s16(x5[27], x6[26]);
+  x7[27] = vqaddq_s16(x5[27], x6[26]);
+  x7[28] = vqaddq_s16(x5[28], x6[29]);
+  x7[29] = vqsubq_s16(x5[28], x6[29]);
+  x7[30] = vqsubq_s16(x5[31], x6[30]);
+  x7[31] = vqaddq_s16(x5[31], x6[30]);
+
+  btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
+                    output[15], v_cos_bit);
+  btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
+                    output[11], v_cos_bit);
+  btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
+                    output[19], v_cos_bit);
+  btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
+                    v_cos_bit);
+}
+
+void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
+                                  int16x8_t *x4, const int32_t *cospi32,
+                                  const int32x4_t *v_cos_bit) {
+  int16x8_t x1[64];
+  int16x8_t x2[64];
+  x1[0] = vqaddq_s16(input[0], input[63]);
+  x1[63] = vqsubq_s16(input[0], input[63]);
+  x1[1] = vqaddq_s16(input[1], input[62]);
+  x1[62] = vqsubq_s16(input[1], input[62]);
+  x1[2] = vqaddq_s16(input[2], input[61]);
+  x1[61] = vqsubq_s16(input[2], input[61]);
+  x1[3] = vqaddq_s16(input[3], input[60]);
+  x1[60] = vqsubq_s16(input[3], input[60]);
+  x1[4] = vqaddq_s16(input[4], input[59]);
+  x1[59] = vqsubq_s16(input[4], input[59]);
+  x1[5] = vqaddq_s16(input[5], input[58]);
+  x1[58] = vqsubq_s16(input[5], input[58]);
+  x1[6] = vqaddq_s16(input[6], input[57]);
+  x1[57] = vqsubq_s16(input[6], input[57]);
+  x1[7] = vqaddq_s16(input[7], input[56]);
+  x1[56] = vqsubq_s16(input[7], input[56]);
+  x1[8] = vqaddq_s16(input[8], input[55]);
+  x1[55] = vqsubq_s16(input[8], input[55]);
+  x1[9] = vqaddq_s16(input[9], input[54]);
+  x1[54] = vqsubq_s16(input[9], input[54]);
+  x1[10] = vqaddq_s16(input[10], input[53]);
+  x1[53] = vqsubq_s16(input[10], input[53]);
+  x1[11] = vqaddq_s16(input[11], input[52]);
+  x1[52] = vqsubq_s16(input[11], input[52]);
+  x1[12] = vqaddq_s16(input[12], input[51]);
+  x1[51] = vqsubq_s16(input[12], input[51]);
+  x1[13] = vqaddq_s16(input[13], input[50]);
+  x1[50] = vqsubq_s16(input[13], input[50]);
+  x1[14] = vqaddq_s16(input[14], input[49]);
+  x1[49] = vqsubq_s16(input[14], input[49]);
+  x1[15] = vqaddq_s16(input[15], input[48]);
+  x1[48] = vqsubq_s16(input[15], input[48]);
+  x1[16] = vqaddq_s16(input[16], input[47]);
+  x1[47] = vqsubq_s16(input[16], input[47]);
+  x1[17] = vqaddq_s16(input[17], input[46]);
+  x1[46] = vqsubq_s16(input[17], input[46]);
+  x1[18] = vqaddq_s16(input[18], input[45]);
+  x1[45] = vqsubq_s16(input[18], input[45]);
+  x1[19] = vqaddq_s16(input[19], input[44]);
+  x1[44] = vqsubq_s16(input[19], input[44]);
+  x1[20] = vqaddq_s16(input[20], input[43]);
+  x1[43] = vqsubq_s16(input[20], input[43]);
+  x1[21] = vqaddq_s16(input[21], input[42]);
+  x1[42] = vqsubq_s16(input[21], input[42]);
+  x1[22] = vqaddq_s16(input[22], input[41]);
+  x1[41] = vqsubq_s16(input[22], input[41]);
+  x1[23] = vqaddq_s16(input[23], input[40]);
+  x1[40] = vqsubq_s16(input[23], input[40]);
+  x1[24] = vqaddq_s16(input[24], input[39]);
+  x1[39] = vqsubq_s16(input[24], input[39]);
+  x1[25] = vqaddq_s16(input[25], input[38]);
+  x1[38] = vqsubq_s16(input[25], input[38]);
+  x1[26] = vqaddq_s16(input[26], input[37]);
+  x1[37] = vqsubq_s16(input[26], input[37]);
+  x1[27] = vqaddq_s16(input[27], input[36]);
+  x1[36] = vqsubq_s16(input[27], input[36]);
+  x1[28] = vqaddq_s16(input[28], input[35]);
+  x1[35] = vqsubq_s16(input[28], input[35]);
+  x1[29] = vqaddq_s16(input[29], input[34]);
+  x1[34] = vqsubq_s16(input[29], input[34]);
+  x1[30] = vqaddq_s16(input[30], input[33]);
+  x1[33] = vqsubq_s16(input[30], input[33]);
+  x1[31] = vqaddq_s16(input[31], input[32]);
+  x1[32] = vqsubq_s16(input[31], input[32]);
+
+  x2[0] = vqaddq_s16(x1[0], x1[31]);
+  x2[31] = vqsubq_s16(x1[0], x1[31]);
+  x2[1] = vqaddq_s16(x1[1], x1[30]);
+  x2[30] = vqsubq_s16(x1[1], x1[30]);
+  x2[2] = vqaddq_s16(x1[2], x1[29]);
+  x2[29] = vqsubq_s16(x1[2], x1[29]);
+  x2[3] = vqaddq_s16(x1[3], x1[28]);
+  x2[28] = vqsubq_s16(x1[3], x1[28]);
+  x2[4] = vqaddq_s16(x1[4], x1[27]);
+  x2[27] = vqsubq_s16(x1[4], x1[27]);
+  x2[5] = vqaddq_s16(x1[5], x1[26]);
+  x2[26] = vqsubq_s16(x1[5], x1[26]);
+  x2[6] = vqaddq_s16(x1[6], x1[25]);
+  x2[25] = vqsubq_s16(x1[6], x1[25]);
+  x2[7] = vqaddq_s16(x1[7], x1[24]);
+  x2[24] = vqsubq_s16(x1[7], x1[24]);
+  x2[8] = vqaddq_s16(x1[8], x1[23]);
+  x2[23] = vqsubq_s16(x1[8], x1[23]);
+  x2[9] = vqaddq_s16(x1[9], x1[22]);
+  x2[22] = vqsubq_s16(x1[9], x1[22]);
+  x2[10] = vqaddq_s16(x1[10], x1[21]);
+  x2[21] = vqsubq_s16(x1[10], x1[21]);
+  x2[11] = vqaddq_s16(x1[11], x1[20]);
+  x2[20] = vqsubq_s16(x1[11], x1[20]);
+  x2[12] = vqaddq_s16(x1[12], x1[19]);
+  x2[19] = vqsubq_s16(x1[12], x1[19]);
+  x2[13] = vqaddq_s16(x1[13], x1[18]);
+  x2[18] = vqsubq_s16(x1[13], x1[18]);
+  x2[14] = vqaddq_s16(x1[14], x1[17]);
+  x2[17] = vqsubq_s16(x1[14], x1[17]);
+  x2[15] = vqaddq_s16(x1[15], x1[16]);
+  x2[16] = vqsubq_s16(x1[15], x1[16]);
+
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+
+  // stage 3
+  x3[0] = vqaddq_s16(x2[0], x2[15]);
+  x3[15] = vqsubq_s16(x2[0], x2[15]);
+  x3[1] = vqaddq_s16(x2[1], x2[14]);
+  x3[14] = vqsubq_s16(x2[1], x2[14]);
+  x3[2] = vqaddq_s16(x2[2], x2[13]);
+  x3[13] = vqsubq_s16(x2[2], x2[13]);
+  x3[3] = vqaddq_s16(x2[3], x2[12]);
+  x3[12] = vqsubq_s16(x2[3], x2[12]);
+  x3[4] = vqaddq_s16(x2[4], x2[11]);
+  x3[11] = vqsubq_s16(x2[4], x2[11]);
+  x3[5] = vqaddq_s16(x2[5], x2[10]);
+  x3[10] = vqsubq_s16(x2[5], x2[10]);
+  x3[6] = vqaddq_s16(x2[6], x2[9]);
+  x3[9] = vqsubq_s16(x2[6], x2[9]);
+  x3[7] = vqaddq_s16(x2[7], x2[8]);
+  x3[8] = vqsubq_s16(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = vqaddq_s16(x1[32], x2[47]);
+  x3[47] = vqsubq_s16(x1[32], x2[47]);
+  x3[33] = vqaddq_s16(x1[33], x2[46]);
+  x3[46] = vqsubq_s16(x1[33], x2[46]);
+  x3[34] = vqaddq_s16(x1[34], x2[45]);
+  x3[45] = vqsubq_s16(x1[34], x2[45]);
+  x3[35] = vqaddq_s16(x1[35], x2[44]);
+  x3[44] = vqsubq_s16(x1[35], x2[44]);
+  x3[36] = vqaddq_s16(x1[36], x2[43]);
+  x3[43] = vqsubq_s16(x1[36], x2[43]);
+  x3[37] = vqaddq_s16(x1[37], x2[42]);
+  x3[42] = vqsubq_s16(x1[37], x2[42]);
+  x3[38] = vqaddq_s16(x1[38], x2[41]);
+  x3[41] = vqsubq_s16(x1[38], x2[41]);
+  x3[39] = vqaddq_s16(x1[39], x2[40]);
+  x3[40] = vqsubq_s16(x1[39], x2[40]);
+  x3[48] = vqsubq_s16(x1[63], x2[48]);
+  x3[63] = vqaddq_s16(x1[63], x2[48]);
+  x3[49] = vqsubq_s16(x1[62], x2[49]);
+  x3[62] = vqaddq_s16(x1[62], x2[49]);
+  x3[50] = vqsubq_s16(x1[61], x2[50]);
+  x3[61] = vqaddq_s16(x1[61], x2[50]);
+  x3[51] = vqsubq_s16(x1[60], x2[51]);
+  x3[60] = vqaddq_s16(x1[60], x2[51]);
+  x3[52] = vqsubq_s16(x1[59], x2[52]);
+  x3[59] = vqaddq_s16(x1[59], x2[52]);
+  x3[53] = vqsubq_s16(x1[58], x2[53]);
+  x3[58] = vqaddq_s16(x1[58], x2[53]);
+  x3[54] = vqsubq_s16(x1[57], x2[54]);
+  x3[57] = vqaddq_s16(x1[57], x2[54]);
+  x3[55] = vqsubq_s16(x1[56], x2[55]);
+  x3[56] = vqaddq_s16(x1[56], x2[55]);
+
+  // stage 4
+  x4[0] = vqaddq_s16(x3[0], x3[7]);
+  x4[7] = vqsubq_s16(x3[0], x3[7]);
+  x4[1] = vqaddq_s16(x3[1], x3[6]);
+  x4[6] = vqsubq_s16(x3[1], x3[6]);
+  x4[2] = vqaddq_s16(x3[2], x3[5]);
+  x4[5] = vqsubq_s16(x3[2], x3[5]);
+  x4[3] = vqaddq_s16(x3[3], x3[4]);
+  x4[4] = vqsubq_s16(x3[3], x3[4]);
+
+  btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+
+  x4[16] = vqaddq_s16(x3[16], x3[23]);
+  x4[23] = vqsubq_s16(x3[16], x3[23]);
+  x4[17] = vqaddq_s16(x3[17], x3[22]);
+  x4[22] = vqsubq_s16(x3[17], x3[22]);
+  x4[18] = vqaddq_s16(x3[18], x3[21]);
+  x4[21] = vqsubq_s16(x3[18], x3[21]);
+  x4[19] = vqaddq_s16(x3[19], x3[20]);
+  x4[20] = vqsubq_s16(x3[19], x3[20]);
+  x4[24] = vqsubq_s16(x3[31], x3[24]);
+  x4[31] = vqaddq_s16(x3[31], x3[24]);
+  x4[25] = vqsubq_s16(x3[30], x3[25]);
+  x4[30] = vqaddq_s16(x3[30], x3[25]);
+  x4[26] = vqsubq_s16(x3[29], x3[26]);
+  x4[29] = vqaddq_s16(x3[29], x3[26]);
+  x4[27] = vqsubq_s16(x3[28], x3[27]);
+  x4[28] = vqaddq_s16(x3[28], x3[27]);
+}
+
+void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int16x8_t x3[64];
+  int16x8_t x4[64];
+
+  av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
+                     v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[64];
+  x5[0] = vqaddq_s16(x4[0], x4[3]);
+  x5[3] = vqsubq_s16(x4[0], x4[3]);
+  x5[1] = vqaddq_s16(x4[1], x4[2]);
+  x5[2] = vqsubq_s16(x4[1], x4[2]);
+
+  btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    v_cos_bit);
+
+  x5[8] = vqaddq_s16(x3[8], x4[11]);
+  x5[11] = vqsubq_s16(x3[8], x4[11]);
+  x5[9] = vqaddq_s16(x3[9], x4[10]);
+  x5[10] = vqsubq_s16(x3[9], x4[10]);
+  x5[12] = vqsubq_s16(x3[15], x4[12]);
+  x5[15] = vqaddq_s16(x3[15], x4[12]);
+  x5[13] = vqsubq_s16(x3[14], x4[13]);
+  x5[14] = vqaddq_s16(x3[14], x4[13]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
+                     v_cos_bit);
+
+  x5[32] = vqaddq_s16(x3[32], x4[39]);
+  x5[39] = vqsubq_s16(x3[32], x4[39]);
+  x5[33] = vqaddq_s16(x3[33], x4[38]);
+  x5[38] = vqsubq_s16(x3[33], x4[38]);
+  x5[34] = vqaddq_s16(x3[34], x4[37]);
+  x5[37] = vqsubq_s16(x3[34], x4[37]);
+  x5[35] = vqaddq_s16(x3[35], x4[36]);
+  x5[36] = vqsubq_s16(x3[35], x4[36]);
+  x5[40] = vqsubq_s16(x3[47], x4[40]);
+  x5[47] = vqaddq_s16(x3[47], x4[40]);
+  x5[41] = vqsubq_s16(x3[46], x4[41]);
+  x5[46] = vqaddq_s16(x3[46], x4[41]);
+  x5[42] = vqsubq_s16(x3[45], x4[42]);
+  x5[45] = vqaddq_s16(x3[45], x4[42]);
+  x5[43] = vqsubq_s16(x3[44], x4[43]);
+  x5[44] = vqaddq_s16(x3[44], x4[43]);
+  x5[48] = vqaddq_s16(x3[48], x4[55]);
+  x5[55] = vqsubq_s16(x3[48], x4[55]);
+  x5[49] = vqaddq_s16(x3[49], x4[54]);
+  x5[54] = vqsubq_s16(x3[49], x4[54]);
+  x5[50] = vqaddq_s16(x3[50], x4[53]);
+  x5[53] = vqsubq_s16(x3[50], x4[53]);
+  x5[51] = vqaddq_s16(x3[51], x4[52]);
+  x5[52] = vqsubq_s16(x3[51], x4[52]);
+  x5[56] = vqsubq_s16(x3[63], x4[56]);
+  x5[63] = vqaddq_s16(x3[63], x4[56]);
+  x5[57] = vqsubq_s16(x3[62], x4[57]);
+  x5[62] = vqaddq_s16(x3[62], x4[57]);
+  x5[58] = vqsubq_s16(x3[61], x4[58]);
+  x5[61] = vqaddq_s16(x3[61], x4[58]);
+  x5[59] = vqsubq_s16(x3[60], x4[59]);
+  x5[60] = vqaddq_s16(x3[60], x4[59]);
+
+  // stage 6
+  int16x8_t x6[64];
+  btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vqaddq_s16(x4[4], x5[5]);
+  x6[5] = vqsubq_s16(x4[4], x5[5]);
+  x6[6] = vqsubq_s16(x4[7], x5[6]);
+  x6[7] = vqaddq_s16(x4[7], x5[6]);
+
+  btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
+                     v_cos_bit);
+
+  x6[16] = vqaddq_s16(x4[16], x5[19]);
+  x6[19] = vqsubq_s16(x4[16], x5[19]);
+  x6[17] = vqaddq_s16(x4[17], x5[18]);
+  x6[18] = vqsubq_s16(x4[17], x5[18]);
+  x6[20] = vqsubq_s16(x4[23], x5[20]);
+  x6[23] = vqaddq_s16(x4[23], x5[20]);
+  x6[21] = vqsubq_s16(x4[22], x5[21]);
+  x6[22] = vqaddq_s16(x4[22], x5[21]);
+  x6[24] = vqaddq_s16(x4[24], x5[27]);
+  x6[27] = vqsubq_s16(x4[24], x5[27]);
+  x6[25] = vqaddq_s16(x4[25], x5[26]);
+  x6[26] = vqsubq_s16(x4[25], x5[26]);
+  x6[28] = vqsubq_s16(x4[31], x5[28]);
+  x6[31] = vqaddq_s16(x4[31], x5[28]);
+  x6[29] = vqsubq_s16(x4[30], x5[29]);
+  x6[30] = vqaddq_s16(x4[30], x5[29]);
+
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
+                     v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
+                     v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[64];
+
+  btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+  x7[8] = vqaddq_s16(x5[8], x6[9]);
+  x7[9] = vqsubq_s16(x5[8], x6[9]);
+  x7[10] = vqsubq_s16(x5[11], x6[10]);
+  x7[11] = vqaddq_s16(x5[11], x6[10]);
+  x7[12] = vqaddq_s16(x5[12], x6[13]);
+  x7[13] = vqsubq_s16(x5[12], x6[13]);
+  x7[14] = vqsubq_s16(x5[15], x6[14]);
+  x7[15] = vqaddq_s16(x5[15], x6[14]);
+
+  btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
+                     v_cos_bit);
+
+  btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
+                     v_cos_bit);
+
+  x7[32] = vqaddq_s16(x5[32], x6[35]);
+  x7[35] = vqsubq_s16(x5[32], x6[35]);
+  x7[33] = vqaddq_s16(x5[33], x6[34]);
+  x7[34] = vqsubq_s16(x5[33], x6[34]);
+  x7[36] = vqsubq_s16(x5[39], x6[36]);
+  x7[39] = vqaddq_s16(x5[39], x6[36]);
+  x7[37] = vqsubq_s16(x5[38], x6[37]);
+  x7[38] = vqaddq_s16(x5[38], x6[37]);
+  x7[40] = vqaddq_s16(x5[40], x6[43]);
+  x7[43] = vqsubq_s16(x5[40], x6[43]);
+  x7[41] = vqaddq_s16(x5[41], x6[42]);
+  x7[42] = vqsubq_s16(x5[41], x6[42]);
+  x7[44] = vqsubq_s16(x5[47], x6[44]);
+  x7[47] = vqaddq_s16(x5[47], x6[44]);
+  x7[45] = vqsubq_s16(x5[46], x6[45]);
+  x7[46] = vqaddq_s16(x5[46], x6[45]);
+  x7[48] = vqaddq_s16(x5[48], x6[51]);
+  x7[51] = vqsubq_s16(x5[48], x6[51]);
+  x7[49] = vqaddq_s16(x5[49], x6[50]);
+  x7[50] = vqsubq_s16(x5[49], x6[50]);
+  x7[52] = vqsubq_s16(x5[55], x6[52]);
+  x7[55] = vqaddq_s16(x5[55], x6[52]);
+  x7[53] = vqsubq_s16(x5[54], x6[53]);
+  x7[54] = vqaddq_s16(x5[54], x6[53]);
+  x7[56] = vqaddq_s16(x5[56], x6[59]);
+  x7[59] = vqsubq_s16(x5[56], x6[59]);
+  x7[57] = vqaddq_s16(x5[57], x6[58]);
+  x7[58] = vqsubq_s16(x5[57], x6[58]);
+  x7[60] = vqsubq_s16(x5[63], x6[60]);
+  x7[63] = vqaddq_s16(x5[63], x6[60]);
+  x7[61] = vqsubq_s16(x5[62], x6[61]);
+  x7[62] = vqaddq_s16(x5[62], x6[61]);
+
+  // stage 8
+  int16x8_t x8[64];
+
+  btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vqaddq_s16(x6[16], x7[17]);
+  x8[17] = vqsubq_s16(x6[16], x7[17]);
+  x8[18] = vqsubq_s16(x6[19], x7[18]);
+  x8[19] = vqaddq_s16(x6[19], x7[18]);
+  x8[20] = vqaddq_s16(x6[20], x7[21]);
+  x8[21] = vqsubq_s16(x6[20], x7[21]);
+  x8[22] = vqsubq_s16(x6[23], x7[22]);
+  x8[23] = vqaddq_s16(x6[23], x7[22]);
+  x8[24] = vqaddq_s16(x6[24], x7[25]);
+  x8[25] = vqsubq_s16(x6[24], x7[25]);
+  x8[26] = vqsubq_s16(x6[27], x7[26]);
+  x8[27] = vqaddq_s16(x6[27], x7[26]);
+  x8[28] = vqaddq_s16(x6[28], x7[29]);
+  x8[29] = vqsubq_s16(x6[28], x7[29]);
+  x8[30] = vqsubq_s16(x6[31], x7[30]);
+  x8[31] = vqaddq_s16(x6[31], x7[30]);
+
+  btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
+                     v_cos_bit);
+  btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
+                     v_cos_bit);
+
+  // stage 9
+  int16x8_t x9[64];
+
+  btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+  x9[32] = vqaddq_s16(x7[32], x8[33]);
+  x9[33] = vqsubq_s16(x7[32], x8[33]);
+  x9[34] = vqsubq_s16(x7[35], x8[34]);
+  x9[35] = vqaddq_s16(x7[35], x8[34]);
+  x9[36] = vqaddq_s16(x7[36], x8[37]);
+  x9[37] = vqsubq_s16(x7[36], x8[37]);
+  x9[38] = vqsubq_s16(x7[39], x8[38]);
+  x9[39] = vqaddq_s16(x7[39], x8[38]);
+  x9[40] = vqaddq_s16(x7[40], x8[41]);
+  x9[41] = vqsubq_s16(x7[40], x8[41]);
+  x9[42] = vqsubq_s16(x7[43], x8[42]);
+  x9[43] = vqaddq_s16(x7[43], x8[42]);
+  x9[44] = vqaddq_s16(x7[44], x8[45]);
+  x9[45] = vqsubq_s16(x7[44], x8[45]);
+  x9[46] = vqsubq_s16(x7[47], x8[46]);
+  x9[47] = vqaddq_s16(x7[47], x8[46]);
+  x9[48] = vqaddq_s16(x7[48], x8[49]);
+  x9[49] = vqsubq_s16(x7[48], x8[49]);
+  x9[50] = vqsubq_s16(x7[51], x8[50]);
+  x9[51] = vqaddq_s16(x7[51], x8[50]);
+  x9[52] = vqaddq_s16(x7[52], x8[53]);
+  x9[53] = vqsubq_s16(x7[52], x8[53]);
+  x9[54] = vqsubq_s16(x7[55], x8[54]);
+  x9[55] = vqaddq_s16(x7[55], x8[54]);
+  x9[56] = vqaddq_s16(x7[56], x8[57]);
+  x9[57] = vqsubq_s16(x7[56], x8[57]);
+  x9[58] = vqsubq_s16(x7[59], x8[58]);
+  x9[59] = vqaddq_s16(x7[59], x8[58]);
+  x9[60] = vqaddq_s16(x7[60], x8[61]);
+  x9[61] = vqsubq_s16(x7[60], x8[61]);
+  x9[62] = vqsubq_s16(x7[63], x8[62]);
+  x9[63] = vqaddq_s16(x7[63], x8[62]);
+
+  // stage 10
+  btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
+                    output[31], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
+                    output[47], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
+                    output[15], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
+                    output[23], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
+                    output[39], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
+                    v_cos_bit);
+
+  btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
+                    output[27], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
+                    output[43], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
+                    output[11], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
+                    output[51], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
+                    output[19], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
+                    output[35], v_cos_bit);
+
+  btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
+                    v_cos_bit);
+
+  // stage 11
+  output[0] = x6[0];
+  output[2] = x9[16];
+  output[4] = x8[8];
+  output[6] = x9[24];
+  output[8] = x7[4];
+  output[10] = x9[20];
+  output[12] = x8[12];
+  output[14] = x9[28];
+  output[16] = x6[2];
+  output[18] = x9[18];
+  output[20] = x8[10];
+  output[22] = x9[26];
+  output[24] = x7[6];
+  output[26] = x9[22];
+  output[28] = x8[14];
+  output[30] = x9[30];
+  output[32] = x6[1];
+  output[34] = x9[17];
+  output[36] = x8[9];
+  output[38] = x9[25];
+  output[40] = x7[5];
+  output[42] = x9[21];
+  output[44] = x8[13];
+  output[46] = x9[29];
+  output[48] = x6[3];
+  output[52] = x8[11];
+  output[54] = x9[27];
+  output[56] = x7[7];
+  output[58] = x9[23];
+  output[60] = x8[15];
+  output[62] = x9[31];
+}
+
+void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[4];
+
+  x1[0] = vqnegq_s16(input[7]);
+  x1[1] = vqnegq_s16(input[3]);
+  x1[2] = vqnegq_s16(input[1]);
+  x1[3] = vqnegq_s16(input[5]);
+
+  // stage 2
+  int16x8_t x2[8];
+
+  btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
+                    v_cos_bit);
+  // stage 3
+  int16x8_t x3[8];
+  x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[2] = vqsubq_s16(input[0], x2[2]);
+  x3[1] = vqaddq_s16(x1[0], x2[3]);
+  x3[3] = vqsubq_s16(x1[0], x2[3]);
+  x3[4] = vqaddq_s16(x1[2], x2[6]);
+  x3[6] = vqsubq_s16(x1[2], x2[6]);
+  x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[7] = vqsubq_s16(input[6], x2[7]);
+
+  // stage 4
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
+                    v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[8];
+  x5[0] = vqaddq_s16(x3[0], x3[4]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[1] = vqaddq_s16(x3[1], x3[5]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
+  x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[6] = vqsubq_s16(x3[2], x3[6]);
+  x5[3] = vqaddq_s16(x3[3], x3[7]);
+  x5[7] = vqsubq_s16(x3[3], x3[7]);
+
+  // stage 6
+  btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
+                    v_cos_bit);
+}
+
+static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
+                           int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  // stage 1
+  int16x8_t x1[12];
+  x1[0] = vqnegq_s16(input[15]);
+  x1[1] = vqnegq_s16(input[3]);
+  x1[2] = vqnegq_s16(input[1]);
+  x1[3] = vqnegq_s16(input[13]);
+
+  // stage 2
+  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
+              x1[4], x1[5]);
+  btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
+                    v_cos_bit);
+  btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
+                    v_cos_bit);
+  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
+              input[10], x1[10], x1[11]);
+  // stage 3
+  int16x8_t x3[16];
+  x3[0] = vqaddq_s16(input[0], x1[4]);
+  x3[2] = vqsubq_s16(input[0], x1[4]);
+  x3[1] = vqaddq_s16(x1[0], x1[5]);
+  x3[3] = vqsubq_s16(x1[0], x1[5]);
+  x3[4] = vqaddq_s16(x1[1], x1[6]);
+  x3[6] = vqsubq_s16(x1[1], x1[6]);
+  x3[5] = vqaddq_s16(input[12], x1[7]);
+  x3[7] = vqsubq_s16(input[12], x1[7]);
+  x3[8] = vqaddq_s16(x1[2], x1[8]);
+  x3[10] = vqsubq_s16(x1[2], x1[8]);
+  x3[9] = vqaddq_s16(input[14], x1[9]);
+  x3[11] = vqsubq_s16(input[14], x1[9]);
+  x3[12] = vqaddq_s16(input[2], x1[10]);
+  x3[14] = vqsubq_s16(input[2], x1[10]);
+  x3[13] = vqaddq_s16(x1[3], x1[11]);
+  x3[15] = vqsubq_s16(x1[3], x1[11]);
+
+  // stage 4
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
+                    v_cos_bit);
+
+  // stage 5
+  int16x8_t x5[16];
+  x5[0] = vqaddq_s16(x3[0], x3[4]);
+  x5[4] = vqsubq_s16(x3[0], x3[4]);
+  x5[1] = vqaddq_s16(x3[1], x3[5]);
+  x5[5] = vqsubq_s16(x3[1], x3[5]);
+  x5[2] = vqaddq_s16(x3[2], x3[6]);
+  x5[6] = vqsubq_s16(x3[2], x3[6]);
+  x5[3] = vqaddq_s16(x3[3], x3[7]);
+  x5[7] = vqsubq_s16(x3[3], x3[7]);
+  x5[8] = vqaddq_s16(x3[8], x3[12]);
+  x5[12] = vqsubq_s16(x3[8], x3[12]);
+  x5[9] = vqaddq_s16(x3[9], x3[13]);
+  x5[13] = vqsubq_s16(x3[9], x3[13]);
+  x5[10] = vqaddq_s16(x3[10], x3[14]);
+  x5[14] = vqsubq_s16(x3[10], x3[14]);
+  x5[11] = vqaddq_s16(x3[11], x3[15]);
+  x5[15] = vqsubq_s16(x3[11], x3[15]);
+
+  // stage 6
+  btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
+  btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
+                    v_cos_bit);
+  btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
+                    v_cos_bit);
+
+  // stage 7
+  int16x8_t x7[16];
+  x7[0] = vqaddq_s16(x5[0], x5[8]);
+  x7[8] = vqsubq_s16(x5[0], x5[8]);
+  x7[1] = vqaddq_s16(x5[1], x5[9]);
+  x7[9] = vqsubq_s16(x5[1], x5[9]);
+  x7[2] = vqaddq_s16(x5[2], x5[10]);
+  x7[10] = vqsubq_s16(x5[2], x5[10]);
+  x7[3] = vqaddq_s16(x5[3], x5[11]);
+  x7[11] = vqsubq_s16(x5[3], x5[11]);
+  x7[4] = vqaddq_s16(x5[4], x5[12]);
+  x7[12] = vqsubq_s16(x5[4], x5[12]);
+  x7[5] = vqaddq_s16(x5[5], x5[13]);
+  x7[13] = vqsubq_s16(x5[5], x5[13]);
+  x7[6] = vqaddq_s16(x5[6], x5[14]);
+  x7[14] = vqsubq_s16(x5[6], x5[14]);
+  x7[7] = vqaddq_s16(x5[7], x5[15]);
+  x7[15] = vqsubq_s16(x5[7], x5[15]);
+
+  // stage 8
+  btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
+                    v_cos_bit);
+  btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
+                    v_cos_bit);
+}
+
+void av1_fidentity4x4_neon(const int16x8_t *const input,
+                           int16x8_t *const output, const int8_t cos_bit,
+                           const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  for (int i = 0; i < 4; ++i) {
+    const int16x4_t b = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b, b);
+  }
+}
+
+static INLINE void fidentity8x4_neon(const int16x8_t *const input,
+                                     int16x8_t *const output,
+                                     const int8_t cos_bit,
+                                     const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
+  for (int i = 0; i < 4; ++i) {
+    const int16x4_t b_lo = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    const int16x4_t b_hi = vqrshrn_n_s32(
+        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b_lo, b_hi);
+  }
+}
+
+void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
+                       int8_t cos_bit, const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  int16x8_t one = vdupq_n_s16(1);
+  output[0] = vqrshlq_s16(input[0], one);
+  output[1] = vqrshlq_s16(input[1], one);
+  output[2] = vqrshlq_s16(input[2], one);
+  output[3] = vqrshlq_s16(input[3], one);
+  output[4] = vqrshlq_s16(input[4], one);
+  output[5] = vqrshlq_s16(input[5], one);
+  output[6] = vqrshlq_s16(input[6], one);
+  output[7] = vqrshlq_s16(input[7], one);
+}
+
+static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
+  for (int i = 0; i < 16; ++i) {
+    const int16x4_t b_lo = vqrshrn_n_s32(
+        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    const int16x4_t b_hi = vqrshrn_n_s32(
+        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
+    output[i] = vcombine_s16(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range) {
+  (void)stage_range;
+  (void)cos_bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = vshlq_n_s16(input[i], 2);
+  }
+}
+
+typedef void (*transform_1d_lbd_neon)(const int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit,
+                                      const int8_t *stage_range);
+
+static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
+  av1_fdct4x4_neon,       // DCT_DCT
+  av1_fadst4x4_neon,      // ADST_DCT
+  av1_fdct4x4_neon,       // DCT_ADST
+  av1_fadst4x4_neon,      // ADST_ADST
+  av1_fadst4x4_neon,      // FLIPADST_DCT
+  av1_fdct4x4_neon,       // DCT_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
+  av1_fadst4x4_neon,      // ADST_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_ADST
+  av1_fidentity4x4_neon,  // IDTX
+  av1_fdct4x4_neon,       // V_DCT
+  av1_fidentity4x4_neon,  // H_DCT
+  av1_fadst4x4_neon,      // V_ADST
+  av1_fidentity4x4_neon,  // H_ADST
+  av1_fadst4x4_neon,      // V_FLIPADST
+  av1_fidentity4x4_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
+  av1_fdct4x4_neon,       // DCT_DCT
+  av1_fdct4x4_neon,       // ADST_DCT
+  av1_fadst4x4_neon,      // DCT_ADST
+  av1_fadst4x4_neon,      // ADST_ADST
+  av1_fdct4x4_neon,       // FLIPADST_DCT
+  av1_fadst4x4_neon,      // DCT_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
+  av1_fadst4x4_neon,      // ADST_FLIPADST
+  av1_fadst4x4_neon,      // FLIPADST_ADST
+  av1_fidentity4x4_neon,  // IDTX
+  av1_fidentity4x4_neon,  // V_DCT
+  av1_fdct4x4_neon,       // H_DCT
+  av1_fidentity4x4_neon,  // V_ADST
+  av1_fadst4x4_neon,      // H_ADST
+  av1_fidentity4x4_neon,  // V_FLIPADST
+  av1_fadst4x4_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,       // DCT_DCT
+  fadst4x8_neon,      // ADST_DCT
+  fdct4x8_neon,       // DCT_ADST
+  fadst4x8_neon,      // ADST_ADST
+  fadst4x8_neon,      // FLIPADST_DCT
+  fdct4x8_neon,       // DCT_FLIPADST
+  fadst4x8_neon,      // FLIPADST_FLIPADST
+  fadst4x8_neon,      // ADST_FLIPADST
+  fadst4x8_neon,      // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fdct4x8_neon,       // V_DCT
+  fidentity8x8_neon,  // H_DCT
+  fadst4x8_neon,      // V_ADST
+  fidentity8x8_neon,  // H_ADST
+  fadst4x8_neon,      // V_FLIPADST
+  fidentity8x8_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_neon,       // DCT_DCT
+  fdct8x4_neon,       // ADST_DCT
+  fadst8x4_neon,      // DCT_ADST
+  fadst8x4_neon,      // ADST_ADST
+  fdct8x4_neon,       // FLIPADST_DCT
+  fadst8x4_neon,      // DCT_FLIPADST
+  fadst8x4_neon,      // FLIPADST_FLIPADST
+  fadst8x4_neon,      // ADST_FLIPADST
+  fadst8x4_neon,      // FLIPADST_ADST
+  fidentity8x4_neon,  // IDTX
+  fidentity8x4_neon,  // V_DCT
+  fdct8x4_neon,       // H_DCT
+  fidentity8x4_neon,  // V_ADST
+  fadst8x4_neon,      // H_ADST
+  fidentity8x4_neon,  // V_FLIPADST
+  fadst8x4_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_neon,       // DCT_DCT
+  fadst8x4_neon,      // ADST_DCT
+  fdct8x4_neon,       // DCT_ADST
+  fadst8x4_neon,      // ADST_ADST
+  fadst8x4_neon,      // FLIPADST_DCT
+  fdct8x4_neon,       // DCT_FLIPADST
+  fadst8x4_neon,      // FLIPADST_FLIPADST
+  fadst8x4_neon,      // ADST_FLIPADST
+  fadst8x4_neon,      // FLIPADST_ADST
+  fidentity8x4_neon,  // IDTX
+  fdct8x4_neon,       // V_DCT
+  fidentity8x4_neon,  // H_DCT
+  fadst8x4_neon,      // V_ADST
+  fidentity8x4_neon,  // H_ADST
+  fadst8x4_neon,      // V_FLIPADST
+  fidentity8x4_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,       // DCT_DCT
+  fdct4x8_neon,       // ADST_DCT
+  fadst4x8_neon,      // DCT_ADST
+  fadst4x8_neon,      // ADST_ADST
+  fdct4x8_neon,       // FLIPADST_DCT
+  fadst4x8_neon,      // DCT_FLIPADST
+  fadst4x8_neon,      // FLIPADST_FLIPADST
+  fadst4x8_neon,      // ADST_FLIPADST
+  fadst4x8_neon,      // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fidentity8x8_neon,  // V_DCT
+  fdct4x8_neon,       // H_DCT
+  fidentity8x8_neon,  // V_ADST
+  fadst4x8_neon,      // H_ADST
+  fidentity8x8_neon,  // V_FLIPADST
+  fadst4x8_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,       // DCT_DCT
+  fadst_8x8_neon,     // ADST_DCT
+  fdct8x8_neon,       // DCT_ADST
+  fadst_8x8_neon,     // ADST_ADST
+  fadst_8x8_neon,     // FLIPADST_DCT
+  fdct8x8_neon,       // DCT_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_FLIPADST
+  fadst_8x8_neon,     // ADST_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fdct8x8_neon,       // V_DCT
+  fidentity8x8_neon,  // H_DCT
+  fadst_8x8_neon,     // V_ADST
+  fidentity8x8_neon,  // H_ADST
+  fadst_8x8_neon,     // V_FLIPADST
+  fidentity8x8_neon,  // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,       // DCT_DCT
+  fdct8x8_neon,       // ADST_DCT
+  fadst_8x8_neon,     // DCT_ADST
+  fadst_8x8_neon,     // ADST_ADST
+  fdct8x8_neon,       // FLIPADST_DCT
+  fadst_8x8_neon,     // DCT_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_FLIPADST
+  fadst_8x8_neon,     // ADST_FLIPADST
+  fadst_8x8_neon,     // FLIPADST_ADST
+  fidentity8x8_neon,  // IDTX
+  fidentity8x8_neon,  // V_DCT
+  fdct8x8_neon,       // H_DCT
+  fidentity8x8_neon,  // V_ADST
+  fadst_8x8_neon,     // H_ADST
+  fidentity8x8_neon,  // V_FLIPADST
+  fadst_8x8_neon      // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_neon,       // DCT_DCT
+  fadst8x16_neon,      // ADST_DCT
+  fdct8x16_neon,       // DCT_ADST
+  fadst8x16_neon,      // ADST_ADST
+  fadst8x16_neon,      // FLIPADST_DCT
+  fdct8x16_neon,       // DCT_FLIPADST
+  fadst8x16_neon,      // FLIPADST_FLIPADST
+  fadst8x16_neon,      // ADST_FLIPADST
+  fadst8x16_neon,      // FLIPADST_ADST
+  fidentity8x16_neon,  // IDTX
+  fdct8x16_neon,       // V_DCT
+  fidentity8x16_neon,  // H_DCT
+  fadst8x16_neon,      // V_ADST
+  fidentity8x16_neon,  // H_ADST
+  fadst8x16_neon,      // V_FLIPADST
+  fidentity8x16_neon   // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_neon,       // DCT_DCT
+  fdct8x16_neon,       // ADST_DCT
+  fadst8x16_neon,      // DCT_ADST
+  fadst8x16_neon,      // ADST_ADST
+  fdct8x16_neon,       // FLIPADST_DCT
+  fadst8x16_neon,      // DCT_FLIPADST
+  fadst8x16_neon,      // FLIPADST_FLIPADST
+  fadst8x16_neon,      // ADST_FLIPADST
+  fadst8x16_neon,      // FLIPADST_ADST
+  fidentity8x16_neon,  // IDTX
+  fidentity8x16_neon,  // V_DCT
+  fdct8x16_neon,       // H_DCT
+  fidentity8x16_neon,  // V_ADST
+  fadst8x16_neon,      // H_ADST
+  fidentity8x16_neon,  // V_FLIPADST
+  fadst8x16_neon       // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_neon,   // DCT_DCT
+  NULL,                // ADST_DCT
+  NULL,                // DCT_ADST
+  NULL,                // ADST_ADST
+  NULL,                // FLIPADST_DCT
+  NULL,                // DCT_FLIPADST
+  NULL,                // FLIPADST_FLIPADST
+  NULL,                // ADST_FLIPADST
+  NULL,                // FLIPADST_ADST
+  fidentity8x32_neon,  // IDTX
+  fidentity8x32_neon,  // V_DCT
+  av1_fdct8x32_neon,   // H_DCT
+  NULL,                // V_ADST
+  NULL,                // H_ADST
+  NULL,                // V_FLIPADST
+  NULL                 // H_FLIPADST
+};
+
+static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_neon,   // DCT_DCT
+  NULL,                // ADST_DCT
+  NULL,                // DCT_ADST
+  NULL,                // ADST_ADST
+  NULL,                // FLIPADST_DCT
+  NULL,                // DCT_FLIPADST
+  NULL,                // FLIPADST_FLIPADST
+  NULL,                // ADST_FLIPADST
+  NULL,                // FLIPADST_ADST
+  fidentity8x32_neon,  // IDTX
+  av1_fdct8x32_neon,   // V_DCT
+  fidentity8x32_neon,  // H_DCT
+  NULL,                // V_ADST
+  NULL,                // H_ADST
+  NULL,                // V_FLIPADST
+  NULL                 // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[4], buf1[4], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x4(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+
+  transpose_16bit_4x4(buf, buf);
+  store_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)stride;
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x4(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_4x8(buf0, buf1);
+  transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + 8 * i, buf, width);
+    } else {
+      buf = buf1 + 8 * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x4(buf, buf);
+    store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit_vector(buf0, height, &v_shift0);
+  col_txfm(buf0, buf0, cos_bit_col, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift1);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+  transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+  for (int i = 0; i < 4; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 4;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int16x8_t *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x4(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_4x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_4x8(buf + 8, buf + 8);
+  store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int16x8_t *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x8(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_neon(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row, NULL);
+  round_shift_16bit_vector(buf0, height, &v_shift2);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[16], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit_vector(buf0, height, &v_shift0);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift1);
+    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    int16x8_t *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_neon(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit_vector(buf0, height, &v_shift2);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+    transpose_16bit_8x8(buf + 8, buf + 8);
+    store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                   8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+    for (int i = 0; i < 2; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 1; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf, width, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                     height);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     height);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, height);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, height);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
+    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
+    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
+    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit_vector(buf0, height, &v_shift0);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit_vector(buf0, height, &v_shift1);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 2; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit_vector(buf, width, &v_shift2);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
+  const int txw_idx = get_txw_idx(TX_32X32);
+  const int txh_idx = get_txh_idx(TX_32X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 32;
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col, NULL);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      int16x8_t *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_neon(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row, NULL);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  int16x8_t buf0[64], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
+  const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < height_div8; i++) {
+    int16x8_t *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      int16x8_t *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  int16x8_t buf0[64], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int16x8_t *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row, NULL);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * width * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int16x8_t *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
+    }
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3)      \
+  do {                                                         \
+    int32x4x2_t temp01 = vzipq_s32(x0, x1);                    \
+    int32x4x2_t temp23 = vzipq_s32(x2, x3);                    \
+    int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]); \
+    int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]); \
+    y0 = y01.val[0];                                           \
+    y1 = y01.val[1];                                           \
+    y2 = y23.val[0];                                           \
+    y3 = y23.val[1];                                           \
+  } while (0)
+
+static INLINE void transpose_32_4x4x2(int stride, const int32x4_t *inputA,
+                                      const int32x4_t *inputB,
+                                      int32x4_t *output) {
+  TRANSPOSE_4X4_L32(inputA[0], inputA[2], inputA[1], inputA[3],
+                    output[0 * stride], output[1 * stride], output[2 * stride],
+                    output[3 * stride]);
+  TRANSPOSE_4X4_L32(inputB[0], inputB[2], inputB[1], inputB[3],
+                    output[4 * stride], output[5 * stride], output[6 * stride],
+                    output[7 * stride]);
+}
+
+static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
+                                int cos_bit, const int stride,
+                                const int8_t *stage_range) {
+  (void)stage_range;
+  int32x4_t buf0[32];
+  int32x4_t buf1[32];
+  const int32_t *cospi;
+  cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * stride;
+  int endidx = 31 * stride;
+  // stage 0
+  // stage 1
+  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+
+  // stage 2
+  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
+  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
+  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
+  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
+  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
+  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
+  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
+  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
+  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
+  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
+  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                    buf0[24], v_cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
+  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
+  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
+  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
+  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
+  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
+  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
+  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
+  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
+  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
+  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
+  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
+  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
+  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
+  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
+  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
+  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
+  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
+  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
+  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
+  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
+                     buf0[27], v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
+                     buf0[26], v_cos_bit);
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  cospi = cospi_arr(cos_bit);
+  btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+              v_cos_bit);
+  btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+                    v_cos_bit);
+  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
+  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
+  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
+                     buf1[13], v_cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
+  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
+  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
+  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
+  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
+  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
+  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
+  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
+  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
+  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
+  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
+  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
+  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
+  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
+  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
+                     buf0[29], v_cos_bit);
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+  btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
+                     buf0[25], v_cos_bit);
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+
+  btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
+  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
+  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
+  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
+  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
+  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
+  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
+  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
+  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+
+  btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
+                    v_cos_bit);
+
+  startidx = 0 * stride;
+  endidx = 31 * stride;
+  // stage 9
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
+}
+
+static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
+                                          int32x4_t *x3, int32x4_t *x4,
+                                          const int32_t *cospi,
+                                          const int32x4_t *v_cos_bit,
+                                          int *startidx, int *endidx) {
+  // stage 1
+  int32x4_t x1[64];
+  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+
+  // stage 2
+  int32x4_t x2[64];
+  x2[0] = vaddq_s32(x1[0], x1[31]);
+  x2[31] = vsubq_s32(x1[0], x1[31]);
+  x2[1] = vaddq_s32(x1[1], x1[30]);
+  x2[30] = vsubq_s32(x1[1], x1[30]);
+  x2[2] = vaddq_s32(x1[2], x1[29]);
+  x2[29] = vsubq_s32(x1[2], x1[29]);
+  x2[3] = vaddq_s32(x1[3], x1[28]);
+  x2[28] = vsubq_s32(x1[3], x1[28]);
+  x2[4] = vaddq_s32(x1[4], x1[27]);
+  x2[27] = vsubq_s32(x1[4], x1[27]);
+  x2[5] = vaddq_s32(x1[5], x1[26]);
+  x2[26] = vsubq_s32(x1[5], x1[26]);
+  x2[6] = vaddq_s32(x1[6], x1[25]);
+  x2[25] = vsubq_s32(x1[6], x1[25]);
+  x2[7] = vaddq_s32(x1[7], x1[24]);
+  x2[24] = vsubq_s32(x1[7], x1[24]);
+  x2[8] = vaddq_s32(x1[8], x1[23]);
+  x2[23] = vsubq_s32(x1[8], x1[23]);
+  x2[9] = vaddq_s32(x1[9], x1[22]);
+  x2[22] = vsubq_s32(x1[9], x1[22]);
+  x2[10] = vaddq_s32(x1[10], x1[21]);
+  x2[21] = vsubq_s32(x1[10], x1[21]);
+  x2[11] = vaddq_s32(x1[11], x1[20]);
+  x2[20] = vsubq_s32(x1[11], x1[20]);
+  x2[12] = vaddq_s32(x1[12], x1[19]);
+  x2[19] = vsubq_s32(x1[12], x1[19]);
+  x2[13] = vaddq_s32(x1[13], x1[18]);
+  x2[18] = vsubq_s32(x1[13], x1[18]);
+  x2[14] = vaddq_s32(x1[14], x1[17]);
+  x2[17] = vsubq_s32(x1[14], x1[17]);
+  x2[15] = vaddq_s32(x1[15], x1[16]);
+  x2[16] = vsubq_s32(x1[15], x1[16]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+
+  // stage 3
+  x3[0] = vaddq_s32(x2[0], x2[15]);
+  x3[15] = vsubq_s32(x2[0], x2[15]);
+  x3[1] = vaddq_s32(x2[1], x2[14]);
+  x3[14] = vsubq_s32(x2[1], x2[14]);
+  x3[2] = vaddq_s32(x2[2], x2[13]);
+  x3[13] = vsubq_s32(x2[2], x2[13]);
+  x3[3] = vaddq_s32(x2[3], x2[12]);
+  x3[12] = vsubq_s32(x2[3], x2[12]);
+  x3[4] = vaddq_s32(x2[4], x2[11]);
+  x3[11] = vsubq_s32(x2[4], x2[11]);
+  x3[5] = vaddq_s32(x2[5], x2[10]);
+  x3[10] = vsubq_s32(x2[5], x2[10]);
+  x3[6] = vaddq_s32(x2[6], x2[9]);
+  x3[9] = vsubq_s32(x2[6], x2[9]);
+  x3[7] = vaddq_s32(x2[7], x2[8]);
+  x3[8] = vsubq_s32(x2[7], x2[8]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+
+  x3[32] = vaddq_s32(x1[32], x2[47]);
+  x3[47] = vsubq_s32(x1[32], x2[47]);
+  x3[33] = vaddq_s32(x1[33], x2[46]);
+  x3[46] = vsubq_s32(x1[33], x2[46]);
+  x3[34] = vaddq_s32(x1[34], x2[45]);
+  x3[45] = vsubq_s32(x1[34], x2[45]);
+  x3[35] = vaddq_s32(x1[35], x2[44]);
+  x3[44] = vsubq_s32(x1[35], x2[44]);
+  x3[36] = vaddq_s32(x1[36], x2[43]);
+  x3[43] = vsubq_s32(x1[36], x2[43]);
+  x3[37] = vaddq_s32(x1[37], x2[42]);
+  x3[42] = vsubq_s32(x1[37], x2[42]);
+  x3[38] = vaddq_s32(x1[38], x2[41]);
+  x3[41] = vsubq_s32(x1[38], x2[41]);
+  x3[39] = vaddq_s32(x1[39], x2[40]);
+  x3[40] = vsubq_s32(x1[39], x2[40]);
+  x3[48] = vsubq_s32(x1[63], x2[48]);
+  x3[63] = vaddq_s32(x1[63], x2[48]);
+  x3[49] = vsubq_s32(x1[62], x2[49]);
+  x3[62] = vaddq_s32(x1[62], x2[49]);
+  x3[50] = vsubq_s32(x1[61], x2[50]);
+  x3[61] = vaddq_s32(x1[61], x2[50]);
+  x3[51] = vsubq_s32(x1[60], x2[51]);
+  x3[60] = vaddq_s32(x1[60], x2[51]);
+  x3[52] = vsubq_s32(x1[59], x2[52]);
+  x3[59] = vaddq_s32(x1[59], x2[52]);
+  x3[53] = vsubq_s32(x1[58], x2[53]);
+  x3[58] = vaddq_s32(x1[58], x2[53]);
+  x3[54] = vsubq_s32(x1[57], x2[54]);
+  x3[57] = vaddq_s32(x1[57], x2[54]);
+  x3[55] = vsubq_s32(x1[56], x2[55]);
+  x3[56] = vaddq_s32(x1[56], x2[55]);
+
+  // stage 4
+  x4[0] = vaddq_s32(x3[0], x3[7]);
+  x4[7] = vsubq_s32(x3[0], x3[7]);
+  x4[1] = vaddq_s32(x3[1], x3[6]);
+  x4[6] = vsubq_s32(x3[1], x3[6]);
+  x4[2] = vaddq_s32(x3[2], x3[5]);
+  x4[5] = vsubq_s32(x3[2], x3[5]);
+  x4[3] = vaddq_s32(x3[3], x3[4]);
+  x4[4] = vsubq_s32(x3[3], x3[4]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+
+  x4[16] = vaddq_s32(x2[16], x3[23]);
+  x4[23] = vsubq_s32(x2[16], x3[23]);
+  x4[17] = vaddq_s32(x2[17], x3[22]);
+  x4[22] = vsubq_s32(x2[17], x3[22]);
+  x4[18] = vaddq_s32(x2[18], x3[21]);
+  x4[21] = vsubq_s32(x2[18], x3[21]);
+  x4[19] = vaddq_s32(x2[19], x3[20]);
+  x4[20] = vsubq_s32(x2[19], x3[20]);
+  x4[24] = vsubq_s32(x2[31], x3[24]);
+  x4[31] = vaddq_s32(x2[31], x3[24]);
+  x4[25] = vsubq_s32(x2[30], x3[25]);
+  x4[30] = vaddq_s32(x2[30], x3[25]);
+  x4[26] = vsubq_s32(x2[29], x3[26]);
+  x4[29] = vaddq_s32(x2[29], x3[26]);
+  x4[27] = vsubq_s32(x2[28], x3[27]);
+  x4[28] = vaddq_s32(x2[28], x3[27]);
+
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    *v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
+                     *v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
+                     *v_cos_bit);
+}
+
+static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                                int8_t cos_bit, const int instride,
+                                const int outstride,
+                                const int8_t *stage_range) {
+  (void)stage_range;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
+
+  // stage 1-2-3-4
+  int32x4_t x3[64], x4[64];
+  av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
+                                &startidx, &endidx);
+
+  // stage 5
+  int32x4_t x5[64];
+  x5[0] = vaddq_s32(x4[0], x4[3]);
+  x5[3] = vsubq_s32(x4[0], x4[3]);
+  x5[1] = vaddq_s32(x4[1], x4[2]);
+  x5[2] = vsubq_s32(x4[1], x4[2]);
+
+  btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    v_cos_bit);
+
+  x5[8] = vaddq_s32(x3[8], x4[11]);
+  x5[11] = vsubq_s32(x3[8], x4[11]);
+  x5[9] = vaddq_s32(x3[9], x4[10]);
+  x5[10] = vsubq_s32(x3[9], x4[10]);
+  x5[12] = vsubq_s32(x3[15], x4[12]);
+  x5[15] = vaddq_s32(x3[15], x4[12]);
+  x5[13] = vsubq_s32(x3[14], x4[13]);
+  x5[14] = vaddq_s32(x3[14], x4[13]);
+
+  btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
+                     v_cos_bit);
+
+  x5[32] = vaddq_s32(x3[32], x4[39]);
+  x5[39] = vsubq_s32(x3[32], x4[39]);
+  x5[33] = vaddq_s32(x3[33], x4[38]);
+  x5[38] = vsubq_s32(x3[33], x4[38]);
+  x5[34] = vaddq_s32(x3[34], x4[37]);
+  x5[37] = vsubq_s32(x3[34], x4[37]);
+  x5[35] = vaddq_s32(x3[35], x4[36]);
+  x5[36] = vsubq_s32(x3[35], x4[36]);
+  x5[40] = vsubq_s32(x3[47], x4[40]);
+  x5[47] = vaddq_s32(x3[47], x4[40]);
+  x5[41] = vsubq_s32(x3[46], x4[41]);
+  x5[46] = vaddq_s32(x3[46], x4[41]);
+  x5[42] = vsubq_s32(x3[45], x4[42]);
+  x5[45] = vaddq_s32(x3[45], x4[42]);
+  x5[43] = vsubq_s32(x3[44], x4[43]);
+  x5[44] = vaddq_s32(x3[44], x4[43]);
+  x5[48] = vaddq_s32(x3[48], x4[55]);
+  x5[55] = vsubq_s32(x3[48], x4[55]);
+  x5[49] = vaddq_s32(x3[49], x4[54]);
+  x5[54] = vsubq_s32(x3[49], x4[54]);
+  x5[50] = vaddq_s32(x3[50], x4[53]);
+  x5[53] = vsubq_s32(x3[50], x4[53]);
+  x5[51] = vaddq_s32(x3[51], x4[52]);
+  x5[52] = vsubq_s32(x3[51], x4[52]);
+  x5[56] = vsubq_s32(x3[63], x4[56]);
+  x5[63] = vaddq_s32(x3[63], x4[56]);
+  x5[57] = vsubq_s32(x3[62], x4[57]);
+  x5[62] = vaddq_s32(x3[62], x4[57]);
+  x5[58] = vsubq_s32(x3[61], x4[58]);
+  x5[61] = vaddq_s32(x3[61], x4[58]);
+  x5[59] = vsubq_s32(x3[60], x4[59]);
+  x5[60] = vaddq_s32(x3[60], x4[59]);
+
+  // stage 6
+  int32x4_t x6[64];
+  btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
+  btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vaddq_s32(x4[4], x5[5]);
+  x6[5] = vsubq_s32(x4[4], x5[5]);
+  x6[6] = vsubq_s32(x4[7], x5[6]);
+  x6[7] = vaddq_s32(x4[7], x5[6]);
+  btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
+                     v_cos_bit);
+
+  x6[16] = vaddq_s32(x4[16], x5[19]);
+  x6[19] = vsubq_s32(x4[16], x5[19]);
+  x6[17] = vaddq_s32(x4[17], x5[18]);
+  x6[18] = vsubq_s32(x4[17], x5[18]);
+  x6[20] = vsubq_s32(x4[23], x5[20]);
+  x6[23] = vaddq_s32(x4[23], x5[20]);
+  x6[21] = vsubq_s32(x4[22], x5[21]);
+  x6[22] = vaddq_s32(x4[22], x5[21]);
+  x6[24] = vaddq_s32(x4[24], x5[27]);
+  x6[27] = vsubq_s32(x4[24], x5[27]);
+  x6[25] = vaddq_s32(x4[25], x5[26]);
+  x6[26] = vsubq_s32(x4[25], x5[26]);
+  x6[28] = vsubq_s32(x4[31], x5[28]);
+  x6[31] = vaddq_s32(x4[31], x5[28]);
+  x6[29] = vsubq_s32(x4[30], x5[29]);
+  x6[30] = vaddq_s32(x4[30], x5[29]);
+
+  btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
+                     v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
+                     v_cos_bit);
+
+  // stage 7
+  int32x4_t x7[64];
+
+  btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+  x7[8] = vaddq_s32(x5[8], x6[9]);
+  x7[9] = vsubq_s32(x5[8], x6[9]);
+  x7[10] = vsubq_s32(x5[11], x6[10]);
+  x7[11] = vaddq_s32(x5[11], x6[10]);
+  x7[12] = vaddq_s32(x5[12], x6[13]);
+  x7[13] = vsubq_s32(x5[12], x6[13]);
+  x7[14] = vsubq_s32(x5[15], x6[14]);
+  x7[15] = vaddq_s32(x5[15], x6[14]);
+
+  btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
+                     v_cos_bit);
+
+  btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
+                     v_cos_bit);
+
+  x7[32] = vaddq_s32(x5[32], x6[35]);
+  x7[35] = vsubq_s32(x5[32], x6[35]);
+  x7[33] = vaddq_s32(x5[33], x6[34]);
+  x7[34] = vsubq_s32(x5[33], x6[34]);
+  x7[36] = vsubq_s32(x5[39], x6[36]);
+  x7[39] = vaddq_s32(x5[39], x6[36]);
+  x7[37] = vsubq_s32(x5[38], x6[37]);
+  x7[38] = vaddq_s32(x5[38], x6[37]);
+  x7[40] = vaddq_s32(x5[40], x6[43]);
+  x7[43] = vsubq_s32(x5[40], x6[43]);
+  x7[41] = vaddq_s32(x5[41], x6[42]);
+  x7[42] = vsubq_s32(x5[41], x6[42]);
+  x7[44] = vsubq_s32(x5[47], x6[44]);
+  x7[47] = vaddq_s32(x5[47], x6[44]);
+  x7[45] = vsubq_s32(x5[46], x6[45]);
+  x7[46] = vaddq_s32(x5[46], x6[45]);
+  x7[48] = vaddq_s32(x5[48], x6[51]);
+  x7[51] = vsubq_s32(x5[48], x6[51]);
+  x7[49] = vaddq_s32(x5[49], x6[50]);
+  x7[50] = vsubq_s32(x5[49], x6[50]);
+  x7[52] = vsubq_s32(x5[55], x6[52]);
+  x7[55] = vaddq_s32(x5[55], x6[52]);
+  x7[53] = vsubq_s32(x5[54], x6[53]);
+  x7[54] = vaddq_s32(x5[54], x6[53]);
+  x7[56] = vaddq_s32(x5[56], x6[59]);
+  x7[59] = vsubq_s32(x5[56], x6[59]);
+  x7[57] = vaddq_s32(x5[57], x6[58]);
+  x7[58] = vsubq_s32(x5[57], x6[58]);
+  x7[60] = vsubq_s32(x5[63], x6[60]);
+  x7[63] = vaddq_s32(x5[63], x6[60]);
+  x7[61] = vsubq_s32(x5[62], x6[61]);
+  x7[62] = vaddq_s32(x5[62], x6[61]);
+
+  // stage 8
+  int32x4_t x8[64];
+
+  btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vaddq_s32(x6[16], x7[17]);
+  x8[17] = vsubq_s32(x6[16], x7[17]);
+  x8[18] = vsubq_s32(x6[19], x7[18]);
+  x8[19] = vaddq_s32(x6[19], x7[18]);
+  x8[20] = vaddq_s32(x6[20], x7[21]);
+  x8[21] = vsubq_s32(x6[20], x7[21]);
+  x8[22] = vsubq_s32(x6[23], x7[22]);
+  x8[23] = vaddq_s32(x6[23], x7[22]);
+  x8[24] = vaddq_s32(x6[24], x7[25]);
+  x8[25] = vsubq_s32(x6[24], x7[25]);
+  x8[26] = vsubq_s32(x6[27], x7[26]);
+  x8[27] = vaddq_s32(x6[27], x7[26]);
+  x8[28] = vaddq_s32(x6[28], x7[29]);
+  x8[29] = vsubq_s32(x6[28], x7[29]);
+  x8[30] = vsubq_s32(x6[31], x7[30]);
+  x8[31] = vaddq_s32(x6[31], x7[30]);
+
+  btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
+                     v_cos_bit);
+  btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
+                     v_cos_bit);
+
+  // stage 9
+  int32x4_t x9[64];
+
+  btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+  x9[32] = vaddq_s32(x7[32], x8[33]);
+  x9[33] = vsubq_s32(x7[32], x8[33]);
+  x9[34] = vsubq_s32(x7[35], x8[34]);
+  x9[35] = vaddq_s32(x7[35], x8[34]);
+  x9[36] = vaddq_s32(x7[36], x8[37]);
+  x9[37] = vsubq_s32(x7[36], x8[37]);
+  x9[38] = vsubq_s32(x7[39], x8[38]);
+  x9[39] = vaddq_s32(x7[39], x8[38]);
+  x9[40] = vaddq_s32(x7[40], x8[41]);
+  x9[41] = vsubq_s32(x7[40], x8[41]);
+  x9[42] = vsubq_s32(x7[43], x8[42]);
+  x9[43] = vaddq_s32(x7[43], x8[42]);
+  x9[44] = vaddq_s32(x7[44], x8[45]);
+  x9[45] = vsubq_s32(x7[44], x8[45]);
+  x9[46] = vsubq_s32(x7[47], x8[46]);
+  x9[47] = vaddq_s32(x7[47], x8[46]);
+  x9[48] = vaddq_s32(x7[48], x8[49]);
+  x9[49] = vsubq_s32(x7[48], x8[49]);
+  x9[50] = vsubq_s32(x7[51], x8[50]);
+  x9[51] = vaddq_s32(x7[51], x8[50]);
+  x9[52] = vaddq_s32(x7[52], x8[53]);
+  x9[53] = vsubq_s32(x7[52], x8[53]);
+  x9[54] = vsubq_s32(x7[55], x8[54]);
+  x9[55] = vaddq_s32(x7[55], x8[54]);
+  x9[56] = vaddq_s32(x7[56], x8[57]);
+  x9[57] = vsubq_s32(x7[56], x8[57]);
+  x9[58] = vsubq_s32(x7[59], x8[58]);
+  x9[59] = vaddq_s32(x7[59], x8[58]);
+  x9[60] = vaddq_s32(x7[60], x8[61]);
+  x9[61] = vsubq_s32(x7[60], x8[61]);
+  x9[62] = vsubq_s32(x7[63], x8[62]);
+  x9[63] = vaddq_s32(x7[63], x8[62]);
+
+  // stage 10
+  int32x4_t x10[64];
+
+  btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
+                    v_cos_bit);
+  btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
+                    v_cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
+  // stage 11
+  output[startidx] = x6[0];
+  output[endidx] = x10[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[32];
+  output[endidx] = x9[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[16];
+  output[endidx] = x10[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[48];
+  output[endidx] = x8[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[8];
+  output[endidx] = x10[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[40];
+  output[endidx] = x9[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[24];
+  output[endidx] = x10[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[56];
+  output[endidx] = x7[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x7[4];
+  output[endidx] = x10[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[36];
+  output[endidx] = x9[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[20];
+  output[endidx] = x10[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[52];
+  output[endidx] = x8[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[12];
+  output[endidx] = x10[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[44];
+  output[endidx] = x9[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[28];
+  output[endidx] = x10[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[60];
+  output[endidx] = x6[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x6[2];
+  output[endidx] = x10[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[34];
+  output[endidx] = x9[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[18];
+  output[endidx] = x10[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[50];
+  output[endidx] = x8[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[10];
+  output[endidx] = x10[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[42];
+  output[endidx] = x9[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[26];
+  output[endidx] = x10[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[58];
+  output[endidx] = x7[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x7[6];
+  output[endidx] = x10[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[38];
+  output[endidx] = x9[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[22];
+  output[endidx] = x10[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[54];
+  output[endidx] = x8[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x8[14];
+  output[endidx] = x10[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[46];
+  output[endidx] = x9[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x9[30];
+  output[endidx] = x10[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[62];
+  output[endidx] = x6[1];
+}
+
+static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X64;
+  int16x8_t buf0[64], buf1[512];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
+    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
+    av1_round_shift_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_64X32;
+  int16x8_t buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  assert(tx_type == DCT_DCT);
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
+    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
+    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
+                                            int32_t *output, int stride,
+                                            TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_32X64;
+  int16x8_t buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col, NULL);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    int32x4_t bufA[32];
+    int32x4_t bufB[32];
+    int16x8_t *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
+    av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
+    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
+    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < (32 / 4); ++j) {
+      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
+  av1_lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
+  av1_lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
+  av1_lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
+  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+  }
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/media/libaom/src/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
new file mode 100644
index 0000000000..197eae09b3
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    int32x4_t v_quant_s32,
+                                    int32x4_t v_dequant_s32,
+                                    int32x4_t v_round_s32, int log_scale) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+  const int32x4_t v_abs_coeff_scaled =
+      vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale));
+  const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+  const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+                                    vreinterpretq_s32_u32(v_mask));
+  //  const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+  const int32x4_t v_abs_qcoeff =
+      vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32);
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  // vshlq_s32 will shift right if shift value is negative.
+  const int32x4_t v_abs_dqcoeff =
+      vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Used to find eob.
+  const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+  return vmovn_u32(nz_qcoeff_mask);
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#ifdef __aarch64__
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+void av1_highbd_quantize_fp_neon(
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  const int16x4_t v_quant = vld1_s16(quant_ptr);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+  const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+  const int16x4_t v_round_log_scale =
+      vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t v_round =
+      vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+  int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+  int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+  // DC and first 3 AC
+  v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+                         v_dequant_s32, v_round_s32, log_scale);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+  // 4 more AC
+  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                         v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+
+  // Find the max lane eob for the first 8 coeffs.
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  count -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+                           v_dequant_s32, v_round_s32, log_scale);
+    v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                           v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+    count -= 8;
+  } while (count);
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/media/libaom/src/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
new file mode 100644
index 0000000000..ad81f40c3e
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+  const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+  const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+  const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+  const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+                                vget_low_s64(fedcba98_76543210));
+  const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+  return sum_diff;
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+    const uint8x16_t v_delta_level_1_and_2,
+    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level that put us in. */
+  const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+  const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+  const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+  /* Calculate absolute adjustments for level 1, 2 and 3. */
+  const uint8x16_t v_level2_adjustment =
+      vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+  const uint8x16_t v_level3_adjustment =
+      vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+  const uint8x16_t v_level1and2_adjustment =
+      vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+  const uint8x16_t v_level1and2and3_adjustment =
+      vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Figure adjustment absolute value by selecting between the absolute
+   * difference if in level0 or the value for level 1, 2 and 3.
+   */
+  const uint8x16_t v_abs_adjustment =
+      vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+  /* Calculate positive and negative adjustments. Apply them to the signal
+   * and accumulate them. Adjustments are less than eight and the maximum
+   * sum of them (7 * 16) can fit in a signed char.
+   */
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to have the sum of all pixel differences
+   * for this macroblock.
+   */
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                  vreinterpretq_s8_u8(v_neg_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+  uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+  // Clamp absolute difference to delta to get the adjustment.
+  const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+  v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                  vreinterpretq_s8_u8(v_pos_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude,
+                                 int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_height = block_size_high[bs] >> 1;
+
+  int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+  for (r = 0; r < b_height; ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+           width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    v_sum_diff_total = denoiser_16x1_neon(
+        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+        v_level1_threshold, v_level2_threshold, v_level3_threshold,
+        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+        v_sum_diff_total);
+    {
+      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+      vst1_u8(running_avg_y, v_running_buffer_low);
+      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+    }
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  {
+    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vmovq_n_u8(delta);
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
+          v_sum_diff_total = denoiser_adjust_16x1_neon(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+              v_sum_diff_total);
+          {
+            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+            const uint8x8_t v_running_buffer_high =
+                vget_high_u8(v_running_buffer);
+            const uint8x8_t v_running_buffer_low =
+                vget_low_u8(v_running_buffer);
+            vst1_u8(running_avg_y, v_running_buffer_low);
+            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+          }
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 1);
+        }
+        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+
+  return FILTER_BLOCK;
+}
+
+// Denoise 16x16, to 128x128 blocks.
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude) {
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_width = block_size_wide[bs];
+  const int b_height = block_size_high[bs];
+  const int b_width_shift4 = b_width >> 4;
+
+  int8x16_t v_sum_diff_total[8][8];
+  int r, c, sum_diff = 0;
+
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r] = vdupq_n_s8(0);
+    }
+  }
+
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+          sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+          v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+          v_delta_level_1_and_2, v_delta_level_2_and_3,
+          v_sum_diff_total[c][r >> 4]);
+
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
+  }
+
+  {
+    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vdupq_n_u8(delta);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
+        sum_diff = 0;
+
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            v_sum_diff_total[c][r >> 4] =
+                denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+                                          k_delta, v_sum_diff_total[c][r >> 4]);
+
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+            }
+          }
+
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
+        }
+
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+    return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude, 8);
+  }
+  return COPY_BLOCK;
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/encodetxb_neon.c b/media/libaom/src/av1/encoder/arm/neon/encodetxb_neon.c
new file mode 100644
index 0000000000..9bb822afa0
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/encodetxb_neon.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + width);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const int8x8_t absABs = vqmovn_s16(absAB);
+#if defined(__aarch64__)
+      const int8x16_t absAB8 =
+          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+      const int32x2x2_t absAB8 =
+          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+      vst1q_u8(ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + 4);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+      vst1q_u8(ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const int32x4_t coeffA = vld1q_s32(cf);
+        const int32x4_t coeffB = vld1q_s32(cf + 4);
+        const int32x4_t coeffC = vld1q_s32(cf + 8);
+        const int32x4_t coeffD = vld1q_s32(cf + 12);
+        const int16x8_t coeffAB =
+            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+        const int16x8_t coeffCD =
+            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
+        const int16x8_t absAB = vqabsq_s16(coeffAB);
+        const int16x8_t absCD = vqabsq_s16(coeffCD);
+        const uint8x16_t absABCD = vreinterpretq_u8_s8(
+            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+        vst1q_u8((ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
+
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+  { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_hor coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010                        \
+  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (height == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (height < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (height > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+  { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+  { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// end of coefficients declaration area
+
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#ifdef __aarch64__
+  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+  return vreinterpretq_u8_u32(v_data);
+#else
+  return load_unaligned_u8q(src, byte_stride);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#ifdef __aarch64__
+  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+  return vreinterpretq_u8_u64(v_data);
+#else
+  uint8x8_t v_data_low = vld1_u8(src);
+  uint8x8_t v_data_high = vld1_u8(src + byte_stride);
+
+  return vcombine_u8(v_data_low, v_data_high);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+                                                 const int byte_stride) {
+  (void)byte_stride;
+  return vld1q_u8(src);
+}
+
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+                                      const int stride,
+                                      const ptrdiff_t *const offsets,
+                                      uint8x16_t *const level) {
+  level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+  const uint8x16_t const_3 = vdupq_n_u8(3);
+  const uint8x16_t const_4 = vdupq_n_u8(4);
+  uint8x16_t count;
+
+  count = vminq_u8(level[0], const_3);
+  level[1] = vminq_u8(level[1], const_3);
+  level[2] = vminq_u8(level[2], const_3);
+  level[3] = vminq_u8(level[3], const_3);
+  level[4] = vminq_u8(level[4], const_3);
+  count = vaddq_u8(count, level[1]);
+  count = vaddq_u8(count, level[2]);
+  count = vaddq_u8(count, level[3]);
+  count = vaddq_u8(count, level[4]);
+
+  count = vrshrq_n_u8(count, 1);
+  count = vminq_u8(count, const_4);
+  return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *const coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
+
+  uint8x16_t pos_to_offset =
+      vld1q_u8((height == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8_t *cc = coeff_contexts;
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(cc, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    cc += 16;
+    row -= 4;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset =
+      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_ver);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int height,
+                                           const ptrdiff_t *const offsets,
+                                           uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8x16_t pos_to_offset[3];
+
+  assert(!(height % 2));
+
+  if (height == 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
+  } else if (height < 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
+  } else {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
+  }
+  pos_to_offset[2] = vdupq_n_u8(21);
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset[0]);
+    vst1q_u8(cc, count);
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += 2 * stride;
+    cc += 16;
+    row -= 2;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_hor);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
+                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  int row = height;
+  uint8x16_t pos_to_offset[5];
+  uint8x16_t pos_to_offset_large[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset_large[2] = vdupq_n_u8(21);
+  if (real_width == real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+        pos_to_offset_large[2];
+  } else if (real_width > real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+        vld1q_u8(c_16_po_2d_g[2]);
+    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+  } else {  // real_width < real_height
+    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+    pos_to_offset[4] = pos_to_offset_large[2];
+    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
+  }
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(cc, count);
+      levels += 16;
+      cc += 16;
+      w -= 16;
+      pos_to_offset[0] = pos_to_offset_large[0];
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    pos_to_offset[2] = pos_to_offset[3];
+    pos_to_offset[3] = pos_to_offset[4];
+    pos_to_offset_large[0] = pos_to_offset_large[1];
+    pos_to_offset_large[1] = pos_to_offset_large[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  int row = height;
+  do {
+    uint8x16_t pos_to_offset = vld1q_u8(c_16_po_hor);
+
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset);
+      vst1q_u8(coeff_contexts, count);
+      pos_to_offset = pos_to_offset_large;
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  uint8x16_t pos_to_offset[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+  pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+  pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  int row = height;
+  do {
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(coeff_contexts, count);
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (!last_idx) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = width + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    if (width == 4) {
+      get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coefficients);
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+    if (width == 4) {
+      get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+    }
+  } else {  // TX_CLASS_VERT
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+    if (width == 4) {
+      get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+    }
+  }
+
+  const int bwl = get_txb_bwl(tx_size);
+  const int pos = scan[last_idx];
+  if (last_idx <= (height << bwl) / 8)
+    coeff_contexts[pos] = 1;
+  else if (last_idx <= (height << bwl) / 4)
+    coeff_contexts[pos] = 2;
+  else
+    coeff_contexts[pos] = 3;
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/media/libaom/src/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
new file mode 100644
index 0000000000..273712a9d4
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -0,0 +1,4047 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+
+static INLINE int32x4_t half_btf_neon(const int32_t *w0, const int32x4_t *n0,
+                                      const int32_t *w1, const int32x4_t *n1,
+                                      const int32x4_t v_bit) {
+  int32x4_t x;
+  x = vmulq_n_s32(*n0, *w0);
+  x = vmlaq_n_s32(x, *n1, *w1);
+  x = vrshlq_s32(x, v_bit);
+  return x;
+}
+
+static INLINE int32x4_t half_btf_neon_m(const int32_t *w0, const int32x4_t *n0,
+                                        const int32_t *w1, const int32x4_t *n1,
+                                        const int32x4_t v_bit) {
+  int32x4_t x;
+  x = vmulq_n_s32(*n0, *w0);
+  x = vmlsq_n_s32(x, *n1, *w1);
+  x = vrshlq_s32(x, v_bit);
+  return x;
+}
+
+#if defined(__aarch64__)
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
+  do {                                                        \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
+    y0 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y1 = vreinterpretq_s32_s64(                               \
+        vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+    y2 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
+                   vreinterpretq_s64_s32(swap_high.val[0]))); \
+    y3 = vreinterpretq_s32_s64(                               \
+        vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
+                   vreinterpretq_s64_s32(swap_high.val[1]))); \
+  } while (0)
+#else
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
+  do {                                                                   \
+    int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
+    int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
+    y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
+                   swap_high.val[0], 2);                                 \
+    y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
+                   swap_high.val[1], 2);                                 \
+    y2 = vextq_s32(swap_low.val[0],                                      \
+                   vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
+    y3 = vextq_s32(swap_low.val[1],                                      \
+                   vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
+  } while (0)
+#endif  // (__aarch64__)
+
+static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void transpose_16x16(const int32x4_t *in, int32x4_t *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      const int size,
+                                                      const int bit,
+                                                      const int val) {
+  const int32x4_t sqrt2 = vdupq_n_s32(val);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int i;
+  for (i = 0; i < size; i++) {
+    const int32x4_t r0 = vrshlq_s32(input[i], v_bit);
+    const int32x4_t r1 = vmulq_s32(sqrt2, r0);
+    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+  }
+}
+
+#define btf_32_neon_type0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
+  do {                                                             \
+    out0 = vmulq_n_s32(in0, w0);                                   \
+    out0 = vmlaq_n_s32(out0, in1, w1);                             \
+    out0 = vrshlq_s32(out0, v_cos_bit);                            \
+    out1 = vmulq_n_s32(in0, w1);                                   \
+    out1 = vmlsq_n_s32(out1, in1, w0);                             \
+    out1 = vrshlq_s32(out1, v_cos_bit);                            \
+  } while (0)
+
+#define btf_32_neon_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                       \
+    btf_32_neon_type0(w1, w0, in1, in0, out0, out1, bit);    \
+  } while (0)
+
+static INLINE void load_buffer_4x4(const int16_t *input, int32x4_t *in,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  int16x4_t v0, v1, v2, v3;
+
+  if (!flipud) {
+    v0 = vld1_s16(input + 0 * stride);
+    v1 = vld1_s16(input + 1 * stride);
+    v2 = vld1_s16(input + 2 * stride);
+    v3 = vld1_s16(input + 3 * stride);
+  } else {
+    v0 = vld1_s16(input + 3 * stride);
+    v1 = vld1_s16(input + 2 * stride);
+    v2 = vld1_s16(input + 1 * stride);
+    v3 = vld1_s16(input + 0 * stride);
+  }
+
+  if (fliplr) {
+    v0 = vrev64_s16(v0);
+    v1 = vrev64_s16(v1);
+    v2 = vrev64_s16(v2);
+    v3 = vrev64_s16(v3);
+  }
+  in[0] = vshlq_s32(vmovl_s16(v0), *v_shift);
+  in[1] = vshlq_s32(vmovl_s16(v1), *v_shift);
+  in[2] = vshlq_s32(vmovl_s16(v2), *v_shift);
+  in[3] = vshlq_s32(vmovl_s16(v3), *v_shift);
+}
+
+static void fdct4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int num_col) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t cospi32 = vdupq_n_s32(cospi[32]);
+  const int32x4_t cospi48 = vdupq_n_s32(cospi[48]);
+  const int32x4_t cospi16 = vdupq_n_s32(cospi[16]);
+  int32x4_t s0, s1, s2, s3;
+  int32x4_t u0, u1, u2, u3;
+  int32x4_t v0, v2;
+
+  int endidx = 3 * num_col;
+  s0 = vaddq_s32(in[0], in[endidx]);
+  s3 = vsubq_s32(in[0], in[endidx]);
+  endidx -= num_col;
+  s1 = vaddq_s32(in[num_col], in[endidx]);
+  s2 = vsubq_s32(in[num_col], in[endidx]);
+
+  u0 = vmulq_s32(s0, cospi32);
+  u1 = vmulq_s32(s1, cospi32);
+  u2 = vaddq_s32(u0, u1);
+  v0 = vsubq_s32(u0, u1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  u0 = vrshlq_s32(u2, v_bit);
+  u2 = vrshlq_s32(v0, v_bit);
+
+  v0 = vmulq_s32(s2, cospi48);
+  v2 = vmlaq_s32(v0, s3, cospi16);
+
+  u1 = vrshlq_s32(v2, v_bit);
+
+  v0 = vmulq_s32(s3, cospi48);
+  v2 = vmlsq_s32(v0, s2, cospi16);
+
+  u3 = vrshlq_s32(v2, v_bit);
+
+  TRANSPOSE_4X4(u0, u1, u2, u3, out[0], out[1], out[2], out[3]);
+}
+
+static INLINE void write_buffer_4x4(int32x4_t *res, int32_t *output) {
+  vst1q_s32((output + 0 * 4), res[0]);
+  vst1q_s32((output + 1 * 4), res[1]);
+  vst1q_s32((output + 2 * 4), res[2]);
+  vst1q_s32((output + 3 * 4), res[3]);
+}
+
+static void fadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit,
+                          const int num_col) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const int32x4_t sinpi4x = vld1q_s32(&sinpi[1]);
+
+  const int32x4_t sinpi1 = vdupq_lane_s32(vget_low_s32(sinpi4x), 0);
+  const int32x4_t sinpi2 = vdupq_lane_s32(vget_low_s32(sinpi4x), 1);
+  const int32x4_t sinpi3 = vdupq_lane_s32(vget_high_s32(sinpi4x), 0);
+  const int32x4_t sinpi4 = vdupq_lane_s32(vget_high_s32(sinpi4x), 1);
+  int32x4_t t;
+  int32x4_t s0, s1, s2, s3, s7;
+  int32x4_t x0, x1, x2, x3;
+  int32x4_t u0, u1, u2, u3;
+
+  int idx = 0 * num_col;
+  s0 = vmulq_s32(in[idx], sinpi1);
+  s1 = vmulq_s32(in[idx], sinpi4);
+  t = vaddq_s32(in[idx], in[idx + num_col]);
+  idx += 2 * num_col;
+  x3 = vmulq_s32(in[idx], sinpi3);
+  idx += num_col;
+  s7 = vsubq_s32(t, in[idx]);
+
+  t = vmlaq_s32(s0, in[idx - 2 * num_col], sinpi2);
+  x0 = vmlaq_s32(t, in[idx], sinpi4);
+  x1 = vmulq_s32(s7, sinpi3);
+  t = vmlsq_s32(s1, in[idx - 2 * num_col], sinpi1);
+  x2 = vmlaq_s32(t, in[idx], sinpi2);
+
+  s0 = vaddq_s32(x0, x3);
+  s1 = x1;
+  s2 = vsubq_s32(x2, x3);
+  t = vsubq_s32(x2, x0);
+  s3 = vaddq_s32(t, x3);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  u0 = vrshlq_s32(s0, v_bit);
+  u1 = vrshlq_s32(s1, v_bit);
+  u2 = vrshlq_s32(s2, v_bit);
+  u3 = vrshlq_s32(s3, v_bit);
+
+  TRANSPOSE_4X4(u0, u1, u2, u3, out[0], out[1], out[2], out[3]);
+}
+static void idtx4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+  int32x4_t fact = vdupq_n_s32(NewSqrt2);
+  int32x4_t a_low;
+
+  int i;
+  for (i = 0; i < 4; i++) {
+    a_low = vmulq_s32(in[i * col_num], fact);
+    out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
+  }
+
+  TRANSPOSE_4X4(out[0], out[1], out[2], out[3], out[0], out[1], out[2], out[3]);
+}
+void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
+                             int input_stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 1, 1, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 1, 0, &v_shift0);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, &v_shift0);
+      idtx4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_neon(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int32x4_t *in,
+                                   int stride, int flipud, int fliplr,
+                                   const int shift) {
+  if (!flipud) {
+    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
+    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
+    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
+    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
+    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
+    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
+    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
+    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
+  } else {
+    in[0] = vreinterpretq_s32_s16(vld1q_s16((input + 7 * stride)));
+    in[1] = vreinterpretq_s32_s16(vld1q_s16((input + 6 * stride)));
+    in[2] = vreinterpretq_s32_s16(vld1q_s16((input + 5 * stride)));
+    in[3] = vreinterpretq_s32_s16(vld1q_s16((input + 4 * stride)));
+    in[4] = vreinterpretq_s32_s16(vld1q_s16((input + 3 * stride)));
+    in[5] = vreinterpretq_s32_s16(vld1q_s16((input + 2 * stride)));
+    in[6] = vreinterpretq_s32_s16(vld1q_s16((input + 1 * stride)));
+    in[7] = vreinterpretq_s32_s16(vld1q_s16((input + 0 * stride)));
+  }
+
+  if (fliplr) {
+    in[0] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[0])));
+    in[0] = vextq_s32(in[0], in[0], 2);
+    in[1] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[1])));
+    in[1] = vextq_s32(in[1], in[1], 2);
+    in[2] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[2])));
+    in[2] = vextq_s32(in[2], in[2], 2);
+    in[3] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[3])));
+    in[3] = vextq_s32(in[3], in[3], 2);
+    in[4] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[4])));
+    in[4] = vextq_s32(in[4], in[4], 2);
+    in[5] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[5])));
+    in[5] = vextq_s32(in[5], in[5], 2);
+    in[6] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[6])));
+    in[6] = vextq_s32(in[6], in[6], 2);
+    in[7] = vreinterpretq_s32_s16(vrev64q_s16(vreinterpretq_s16_s32(in[7])));
+    in[7] = vextq_s32(in[7], in[7], 2);
+  }
+
+  int16x4_t u = vget_high_s16(vreinterpretq_s16_s32(in[4]));
+  in[8] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[4])));
+  in[9] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[5]));
+  in[10] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[5])));
+  in[11] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[6]));
+  in[12] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[6])));
+  in[13] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[7]));
+  in[14] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[7])));
+  in[15] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[3]));
+  in[6] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[3])));
+  in[7] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[2]));
+  in[4] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[2])));
+  in[5] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[1]));
+  in[2] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[1])));
+  in[3] = vmovl_s16(u);
+
+  u = vget_high_s16(vreinterpretq_s16_s32(in[0]));
+  in[0] = vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(in[0])));
+  in[1] = vmovl_s16(u);
+
+  const int32x4_t v_shift = vdupq_n_s32(shift);
+
+  in[0] = vshlq_s32(in[0], v_shift);
+  in[1] = vshlq_s32(in[1], v_shift);
+  in[2] = vshlq_s32(in[2], v_shift);
+  in[3] = vshlq_s32(in[3], v_shift);
+  in[4] = vshlq_s32(in[4], v_shift);
+  in[5] = vshlq_s32(in[5], v_shift);
+  in[6] = vshlq_s32(in[6], v_shift);
+  in[7] = vshlq_s32(in[7], v_shift);
+
+  in[8] = vshlq_s32(in[8], v_shift);
+  in[9] = vshlq_s32(in[9], v_shift);
+  in[10] = vshlq_s32(in[10], v_shift);
+  in[11] = vshlq_s32(in[11], v_shift);
+  in[12] = vshlq_s32(in[12], v_shift);
+  in[13] = vshlq_s32(in[13], v_shift);
+  in[14] = vshlq_s32(in[14], v_shift);
+  in[15] = vshlq_s32(in[15], v_shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(int32x4_t *in,
+                                         const int32x4_t *v_shift) {
+  in[0] = vrshlq_s32(in[0], *v_shift);
+  in[1] = vrshlq_s32(in[1], *v_shift);
+  in[2] = vrshlq_s32(in[2], *v_shift);
+  in[3] = vrshlq_s32(in[3], *v_shift);
+  in[4] = vrshlq_s32(in[4], *v_shift);
+  in[5] = vrshlq_s32(in[5], *v_shift);
+  in[6] = vrshlq_s32(in[6], *v_shift);
+  in[7] = vrshlq_s32(in[7], *v_shift);
+  in[8] = vrshlq_s32(in[8], *v_shift);
+  in[9] = vrshlq_s32(in[9], *v_shift);
+  in[10] = vrshlq_s32(in[10], *v_shift);
+  in[11] = vrshlq_s32(in[11], *v_shift);
+  in[12] = vrshlq_s32(in[12], *v_shift);
+  in[13] = vrshlq_s32(in[13], *v_shift);
+  in[14] = vrshlq_s32(in[14], *v_shift);
+  in[15] = vrshlq_s32(in[15], *v_shift);
+}
+
+static INLINE void col_txfm_4x8_rounding(int32x4_t *in,
+                                         const int32x4_t *v_shift) {
+  in[0] = vrshlq_s32(in[0], *v_shift);
+  in[1] = vrshlq_s32(in[1], *v_shift);
+  in[2] = vrshlq_s32(in[2], *v_shift);
+  in[3] = vrshlq_s32(in[3], *v_shift);
+  in[4] = vrshlq_s32(in[4], *v_shift);
+  in[5] = vrshlq_s32(in[5], *v_shift);
+  in[6] = vrshlq_s32(in[6], *v_shift);
+  in[7] = vrshlq_s32(in[7], *v_shift);
+}
+
+static INLINE void write_buffer_8x8(const int32x4_t *res, int32_t *output) {
+  vst1q_s32(output + 0 * 4, res[0]);
+  vst1q_s32(output + 1 * 4, res[1]);
+  vst1q_s32(output + 2 * 4, res[2]);
+  vst1q_s32(output + 3 * 4, res[3]);
+
+  vst1q_s32(output + 4 * 4, res[4]);
+  vst1q_s32(output + 5 * 4, res[5]);
+  vst1q_s32(output + 6 * 4, res[6]);
+  vst1q_s32(output + 7 * 4, res[7]);
+
+  vst1q_s32(output + 8 * 4, res[8]);
+  vst1q_s32(output + 9 * 4, res[9]);
+  vst1q_s32(output + 10 * 4, res[10]);
+  vst1q_s32(output + 11 * 4, res[11]);
+
+  vst1q_s32(output + 12 * 4, res[12]);
+  vst1q_s32(output + 13 * 4, res[13]);
+  vst1q_s32(output + 14 * 4, res[14]);
+  vst1q_s32(output + 15 * 4, res[15]);
+}
+
+static INLINE void write_buffer_16x8(const int32x4_t *res, int32_t *output,
+                                     const int stride) {
+  vst1q_s32(output, res[0]);
+  vst1q_s32(output + 4, res[1]);
+  vst1q_s32(output + stride, res[2]);
+  vst1q_s32(output + stride + 4, res[3]);
+
+  vst1q_s32(output + (stride * 2), res[4]);
+  vst1q_s32(output + (stride * 2) + 4, res[5]);
+  vst1q_s32(output + (stride * 3), res[6]);
+  vst1q_s32(output + (stride * 3) + 4, res[7]);
+
+  vst1q_s32(output + (stride * 4), res[8]);
+  vst1q_s32(output + (stride * 4) + 4, res[9]);
+  vst1q_s32(output + (stride * 5), res[10]);
+  vst1q_s32(output + (stride * 5) + 4, res[11]);
+
+  vst1q_s32(output + (stride * 6), res[12]);
+  vst1q_s32(output + (stride * 6) + 4, res[13]);
+  vst1q_s32(output + (stride * 7), res[14]);
+  vst1q_s32(output + (stride * 7) + 4, res[15]);
+}
+
+static void fdct4x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u[8], v[8];
+
+  int startidx = 0 * col_num;
+  int endidx = 7 * col_num;
+  // stage 0-1
+  u[0] = vaddq_s32(in[startidx], in[endidx]);
+  v[7] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[1] = vaddq_s32(in[startidx], in[endidx]);
+  u[6] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[2] = vaddq_s32(in[startidx], in[endidx]);
+  u[5] = vsubq_s32(in[startidx], in[endidx]);
+  startidx += col_num;
+  endidx -= col_num;
+  u[3] = vaddq_s32(in[startidx], in[endidx]);
+  v[4] = vsubq_s32(in[startidx], in[endidx]);
+
+  // stage 2
+  v[0] = vaddq_s32(u[0], u[3]);
+  v[3] = vsubq_s32(u[0], u[3]);
+  v[1] = vaddq_s32(u[1], u[2]);
+  v[2] = vsubq_s32(u[1], u[2]);
+
+  v[5] = vmulq_n_s32(u[6], cospi[32]);
+  v[5] = vmlsq_n_s32(v[5], u[5], cospi[32]);
+  v[5] = vrshlq_s32(v[5], v_bit);
+
+  u[0] = vmulq_n_s32(u[5], cospi[32]);
+  v[6] = vmlaq_n_s32(u[0], u[6], cospi[32]);
+  v[6] = vrshlq_s32(v[6], v_bit);
+
+  // stage 3
+  // type 0
+  v[0] = vmulq_n_s32(v[0], cospi[32]);
+  v[1] = vmulq_n_s32(v[1], cospi[32]);
+  u[0] = vaddq_s32(v[0], v[1]);
+  u[0] = vrshlq_s32(u[0], v_bit);
+
+  u[1] = vsubq_s32(v[0], v[1]);
+  u[1] = vrshlq_s32(u[1], v_bit);
+
+  // type 1
+  v[0] = vmulq_n_s32(v[2], cospi[48]);
+  u[2] = vmlaq_n_s32(v[0], v[3], cospi[16]);
+  u[2] = vrshlq_s32(u[2], v_bit);
+
+  v[1] = vmulq_n_s32(v[3], cospi[48]);
+  u[3] = vmlsq_n_s32(v[1], v[2], cospi[16]);
+  u[3] = vrshlq_s32(u[3], v_bit);
+
+  u[4] = vaddq_s32(v[4], v[5]);
+  u[5] = vsubq_s32(v[4], v[5]);
+  u[6] = vsubq_s32(v[7], v[6]);
+  u[7] = vaddq_s32(v[7], v[6]);
+
+  // stage 4-5
+  v[0] = vmulq_n_s32(u[4], cospi[56]);
+  v[0] = vmlaq_n_s32(v[0], u[7], cospi[8]);
+  out[1 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[1] = vmulq_n_s32(u[7], cospi[56]);
+  v[0] = vmlsq_n_s32(v[1], u[4], cospi[8]);
+  out[7 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[0] = vmulq_n_s32(u[5], cospi[24]);
+  v[0] = vmlaq_n_s32(v[0], u[6], cospi[40]);
+  out[5 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  v[1] = vmulq_n_s32(u[6], cospi[24]);
+  v[0] = vmlsq_n_s32(v[1], u[5], cospi[40]);
+  out[3 * col_num] = vrshlq_s32(v[0], v_bit);
+
+  out[0 * col_num] = u[0];
+  out[4 * col_num] = u[1];
+  out[2 * col_num] = u[2];
+  out[6 * col_num] = u[3];
+}
+
+static void fdct8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                         const int col_num) {
+  fdct4x8_neon(in, out, bit, col_num);
+  fdct4x8_neon(in + 1, out + 1, bit, col_num);
+}
+
+static void fadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                          const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+  int32x4_t x, y;
+  int col;
+
+  for (col = 0; col < col_num; ++col) {
+    // stage 0-1
+    u0 = in[col_num * 0 + col];
+    u1 = vnegq_s32(in[col_num * 7 + col]);
+    u2 = vnegq_s32(in[col_num * 3 + col]);
+    u3 = in[col_num * 4 + col];
+    u4 = vnegq_s32(in[col_num * 1 + col]);
+    u5 = in[col_num * 6 + col];
+    u6 = in[col_num * 2 + col];
+    u7 = vnegq_s32(in[col_num * 5 + col]);
+
+    // stage 2
+    v0 = u0;
+    v1 = u1;
+
+    x = vmulq_n_s32(u2, cospi[32]);
+    y = vmulq_n_s32(u3, cospi[32]);
+    v2 = vaddq_s32(x, y);
+    v2 = vrshlq_s32(v2, v_bit);
+
+    v3 = vsubq_s32(x, y);
+    v3 = vrshlq_s32(v3, v_bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = vmulq_n_s32(u6, cospi[32]);
+    y = vmulq_n_s32(u7, cospi[32]);
+    v6 = vaddq_s32(x, y);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vsubq_s32(x, y);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 3
+    u0 = vaddq_s32(v0, v2);
+    u1 = vaddq_s32(v1, v3);
+    u2 = vsubq_s32(v0, v2);
+    u3 = vsubq_s32(v1, v3);
+    u4 = vaddq_s32(v4, v6);
+    u5 = vaddq_s32(v5, v7);
+    u6 = vsubq_s32(v4, v6);
+    u7 = vsubq_s32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    v4 = vmulq_n_s32(u4, cospi[16]);
+    v4 = vmlaq_n_s32(v4, u5, cospi[48]);
+    v4 = vrshlq_s32(v4, v_bit);
+
+    v5 = vmulq_n_s32(u4, cospi[48]);
+    v5 = vmlsq_n_s32(v5, u5, cospi[16]);
+    v5 = vrshlq_s32(v5, v_bit);
+
+    v6 = vmulq_n_s32(u7, cospi[16]);
+    v6 = vmlsq_n_s32(v6, u6, cospi[48]);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vmulq_n_s32(u6, cospi[16]);
+    v7 = vmlaq_n_s32(v7, u7, cospi[48]);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 5
+    u0 = vaddq_s32(v0, v4);
+    u1 = vaddq_s32(v1, v5);
+    u2 = vaddq_s32(v2, v6);
+    u3 = vaddq_s32(v3, v7);
+    u4 = vsubq_s32(v0, v4);
+    u5 = vsubq_s32(v1, v5);
+    u6 = vsubq_s32(v2, v6);
+    u7 = vsubq_s32(v3, v7);
+
+    // stage 6
+    v0 = vmulq_n_s32(u0, cospi[4]);
+    v0 = vmlaq_n_s32(v0, u1, cospi[60]);
+    v0 = vrshlq_s32(v0, v_bit);
+
+    v1 = vmulq_n_s32(u0, cospi[60]);
+    v1 = vmlsq_n_s32(v1, u1, cospi[4]);
+    v1 = vrshlq_s32(v1, v_bit);
+
+    v2 = vmulq_n_s32(u2, cospi[20]);
+    v2 = vmlaq_n_s32(v2, u3, cospi[44]);
+    v2 = vrshlq_s32(v2, v_bit);
+
+    v3 = vmulq_n_s32(u2, cospi[44]);
+    v3 = vmlsq_n_s32(v3, u3, cospi[20]);
+    v3 = vrshlq_s32(v3, v_bit);
+
+    v4 = vmulq_n_s32(u4, cospi[36]);
+    v4 = vmlaq_n_s32(v4, u5, cospi[28]);
+    v4 = vrshlq_s32(v4, v_bit);
+
+    v5 = vmulq_n_s32(u4, cospi[28]);
+    v5 = vmlsq_n_s32(v5, u5, cospi[36]);
+    v5 = vrshlq_s32(v5, v_bit);
+
+    x = vmulq_n_s32(u6, cospi[52]);
+    v6 = vmlaq_n_s32(x, u7, cospi[12]);
+    v6 = vrshlq_s32(v6, v_bit);
+
+    v7 = vmulq_n_s32(u6, cospi[12]);
+    v7 = vmlsq_n_s32(v7, u7, cospi[52]);
+    v7 = vrshlq_s32(v7, v_bit);
+
+    // stage 7
+    out[col_num * 0 + col] = v1;
+    out[col_num * 1 + col] = v6;
+    out[col_num * 2 + col] = v3;
+    out[col_num * 3 + col] = v4;
+    out[col_num * 4 + col] = v5;
+    out[col_num * 5 + col] = v2;
+    out[col_num * 6 + col] = v7;
+    out[col_num * 7 + col] = v0;
+  }
+}
+static void idtx8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+
+  for (int i = 0; i < col_num; i += 1) {
+    out[0 + 8 * i] = vshlq_n_s32(in[0 + 8 * i], 1);
+    out[1 + 8 * i] = vshlq_n_s32(in[1 + 8 * i], 1);
+    out[2 + 8 * i] = vshlq_n_s32(in[2 + 8 * i], 1);
+    out[3 + 8 * i] = vshlq_n_s32(in[3 + 8 * i], 1);
+    out[4 + 8 * i] = vshlq_n_s32(in[4 + 8 * i], 1);
+    out[5 + 8 * i] = vshlq_n_s32(in[5 + 8 * i], 1);
+    out[6 + 8 * i] = vshlq_n_s32(in[6 + 8 * i], 1);
+    out[7 + 8 * i] = vshlq_n_s32(in[7 + 8 * i], 1);
+  }
+}
+#if !CONFIG_REALTIME_ONLY
+static void idtx32x8_neon(int32x4_t *in, int32x4_t *out, int bit, int col_num) {
+  (void)bit;
+  (void)col_num;
+  for (int j = 0; j < 2; j++) {
+    out[j + 8 * 0] = vshlq_n_s32(in[j + 8 * 0], 1);
+    out[j + 8 * 1] = vshlq_n_s32(in[j + 8 * 1], 1);
+    out[j + 8 * 2] = vshlq_n_s32(in[j + 8 * 2], 1);
+    out[j + 8 * 3] = vshlq_n_s32(in[j + 8 * 3], 1);
+    out[j + 8 * 4] = vshlq_n_s32(in[j + 8 * 4], 1);
+    out[j + 8 * 5] = vshlq_n_s32(in[j + 8 * 5], 1);
+    out[j + 8 * 6] = vshlq_n_s32(in[j + 8 * 6], 1);
+    out[j + 8 * 7] = vshlq_n_s32(in[j + 8 * 7], 1);
+  }
+}
+#endif
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[16], out[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fdct8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      idtx8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, &v_shift1);
+      transpose_8x8(out, in);
+      fadst8x8_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const int32x4_t *in, int32x4_t *out) {
+  int row_index = 0;
+  int dst_index = 0;
+  int src_index = 0;
+
+  // row 0, 1, .., 7
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 8);
+
+  // row 8, 9, ..., 15
+  src_index += 16;
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int32x4_t *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  int32x4_t in[64];
+  // Load 4 8x8 blocks
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Swap left columns
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+    // Swap right columns
+    tmp = topR;
+    topR = botR;
+    botR = tmp;
+  }
+
+  if (fliplr) {
+    // Swap top rows
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+    // Swap bottom rows
+    tmp = botL;
+    botL = botR;
+    botR = tmp;
+  }
+
+  // load first 8 columns
+  load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+  // load second 8 columns
+  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+  convert_8x8_to_16x16(in, out);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, int32x4_t *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 8 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+
+  load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_8x4(const int16_t *input, int32x4_t *out,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *topR = input + 4;
+
+  const int16_t *tmp;
+
+  if (fliplr) {
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+  }
+  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_16x4(const int16_t *input, int32x4_t *out,
+                                    int stride, int flipud, int fliplr,
+                                    const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+
+  const int16_t *tmp;
+
+  if (fliplr) {
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+  }
+
+  load_buffer_8x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, v_shift);
+}
+
+static INLINE void load_buffer_4x8(const int16_t *input, int32x4_t *out,
+                                   int stride, int flipud, int fliplr,
+                                   const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 4 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+
+  load_buffer_4x4(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, v_shift);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void load_buffer_4x16(const int16_t *input, int32x4_t *out,
+                                    const int stride, const int flipud,
+                                    const int fliplr,
+                                    const int32x4_t *v_shift) {
+  const int16_t *topL = input;
+  const int16_t *botL = input + 8 * stride;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+  }
+  load_buffer_4x8(topL, out, stride, flipud, fliplr, v_shift);
+  load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, v_shift);
+}
+#endif
+
+static INLINE void load_buffer_32x8n(const int16_t *input, int32x4_t *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift, const int height) {
+  const int16_t *in = input;
+  int32x4_t *output = out;
+  for (int col = 0; col < height; col++) {
+    in = input + col * stride;
+    output = out + col * 8;
+    int32x4_t v_shift = vdupq_n_s32(shift);
+    load_buffer_4x4(in, output, 4, flipud, fliplr, &v_shift);
+    load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, &v_shift);
+  }
+}
+
+static void fdct16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                           const int col_num) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  int32x4_t u[16], v[16];
+  int col;
+
+  // Calculate the column 0, 1, 2, 3
+  for (col = 0; col < col_num; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = vaddq_s32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[15] = vsubq_s32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[1] = vaddq_s32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[14] = vsubq_s32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[2] = vaddq_s32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[13] = vsubq_s32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[3] = vaddq_s32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[12] = vsubq_s32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[4] = vaddq_s32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[11] = vsubq_s32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[5] = vaddq_s32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[10] = vsubq_s32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[6] = vaddq_s32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[9] = vsubq_s32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[7] = vaddq_s32(in[7 * col_num + col], in[8 * col_num + col]);
+    u[8] = vsubq_s32(in[7 * col_num + col], in[8 * col_num + col]);
+
+    // stage 2
+    v[0] = vaddq_s32(u[0], u[7]);
+    v[7] = vsubq_s32(u[0], u[7]);
+    v[1] = vaddq_s32(u[1], u[6]);
+    v[6] = vsubq_s32(u[1], u[6]);
+    v[2] = vaddq_s32(u[2], u[5]);
+    v[5] = vsubq_s32(u[2], u[5]);
+    v[3] = vaddq_s32(u[3], u[4]);
+    v[4] = vsubq_s32(u[3], u[4]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    v[10] = vmulq_n_s32(u[13], cospi[32]);
+    v[10] = vmlsq_n_s32(v[10], u[10], cospi[32]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[10], cospi[32]);
+    v[13] = vmlaq_n_s32(v[13], u[13], cospi[32]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = vmulq_n_s32(u[12], cospi[32]);
+    v[11] = vmlsq_n_s32(v[11], u[11], cospi[32]);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = vmulq_n_s32(u[11], cospi[32]);
+    v[12] = vmlaq_n_s32(v[12], u[12], cospi[32]);
+    v[12] = vrshlq_s32(v[12], v_bit);
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 3
+    u[0] = vaddq_s32(v[0], v[3]);
+    u[3] = vsubq_s32(v[0], v[3]);
+    u[1] = vaddq_s32(v[1], v[2]);
+    u[2] = vsubq_s32(v[1], v[2]);
+    u[4] = v[4];
+
+    u[5] = vmulq_n_s32(v[6], cospi[32]);
+    u[5] = vmlsq_n_s32(u[5], v[5], cospi[32]);
+    u[5] = vrshlq_s32(u[5], v_bit);
+
+    u[6] = vmulq_n_s32(v[5], cospi[32]);
+    u[6] = vmlaq_n_s32(u[6], v[6], cospi[32]);
+    u[6] = vrshlq_s32(u[6], v_bit);
+
+    u[7] = v[7];
+    u[8] = vaddq_s32(v[8], v[11]);
+    u[11] = vsubq_s32(v[8], v[11]);
+    u[9] = vaddq_s32(v[9], v[10]);
+    u[10] = vsubq_s32(v[9], v[10]);
+    u[12] = vsubq_s32(v[15], v[12]);
+    u[15] = vaddq_s32(v[15], v[12]);
+    u[13] = vsubq_s32(v[14], v[13]);
+    u[14] = vaddq_s32(v[14], v[13]);
+
+    // stage 4
+    u[0] = vmulq_n_s32(u[0], cospi[32]);
+    u[1] = vmulq_n_s32(u[1], cospi[32]);
+    v[0] = vaddq_s32(u[0], u[1]);
+    v[0] = vrshlq_s32(v[0], v_bit);
+
+    v[1] = vsubq_s32(u[0], u[1]);
+    v[1] = vrshlq_s32(v[1], v_bit);
+
+    v[2] = vmulq_n_s32(u[2], cospi[48]);
+    v[2] = vmlaq_n_s32(v[2], u[3], cospi[16]);
+    v[2] = vrshlq_s32(v[2], v_bit);
+
+    v[3] = vmulq_n_s32(u[3], cospi[48]);
+    v[3] = vmlsq_n_s32(v[3], u[2], cospi[16]);
+    v[3] = vrshlq_s32(v[3], v_bit);
+
+    v[4] = vaddq_s32(u[4], u[5]);
+    v[5] = vsubq_s32(u[4], u[5]);
+    v[6] = vsubq_s32(u[7], u[6]);
+    v[7] = vaddq_s32(u[7], u[6]);
+    v[8] = u[8];
+
+    v[9] = vmulq_n_s32(u[14], cospi[48]);
+    v[9] = vmlsq_n_s32(v[9], u[9], cospi[16]);
+    v[9] = vrshlq_s32(v[9], v_bit);
+
+    v[14] = vmulq_n_s32(u[9], cospi[48]);
+    v[14] = vmlaq_n_s32(v[14], u[14], cospi[16]);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[10] = vmulq_n_s32(u[13], -cospi[16]);
+    v[10] = vmlsq_n_s32(v[10], u[10], cospi[48]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[10], -cospi[16]);
+    v[13] = vmlaq_n_s32(v[13], u[13], cospi[48]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = u[11];
+    v[12] = u[12];
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+
+    u[4] = vmulq_n_s32(v[4], cospi[56]);
+    u[4] = vmlaq_n_s32(u[4], v[7], cospi[8]);
+    u[4] = vrshlq_s32(u[4], v_bit);
+
+    u[7] = vmulq_n_s32(v[7], cospi[56]);
+    u[7] = vmlsq_n_s32(u[7], v[4], cospi[8]);
+    u[7] = vrshlq_s32(u[7], v_bit);
+
+    u[5] = vmulq_n_s32(v[5], cospi[24]);
+    u[5] = vmlaq_n_s32(u[5], v[6], cospi[40]);
+    u[5] = vrshlq_s32(u[5], v_bit);
+
+    u[6] = vmulq_n_s32(v[6], cospi[24]);
+    u[6] = vmlsq_n_s32(u[6], v[5], cospi[40]);
+    u[6] = vrshlq_s32(u[6], v_bit);
+
+    u[8] = vaddq_s32(v[8], v[9]);
+    u[9] = vsubq_s32(v[8], v[9]);
+    u[10] = vsubq_s32(v[11], v[10]);
+    u[11] = vaddq_s32(v[11], v[10]);
+    u[12] = vaddq_s32(v[12], v[13]);
+    u[13] = vsubq_s32(v[12], v[13]);
+    u[14] = vsubq_s32(v[15], v[14]);
+    u[15] = vaddq_s32(v[15], v[14]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = vmulq_n_s32(u[8], cospi[60]);
+    v[8] = vmlaq_n_s32(v[8], u[15], cospi[4]);
+    v[8] = vrshlq_s32(v[8], v_bit);
+
+    v[15] = vmulq_n_s32(u[15], cospi[60]);
+    v[15] = vmlsq_n_s32(v[15], u[8], cospi[4]);
+    v[15] = vrshlq_s32(v[15], v_bit);
+
+    v[9] = vmulq_n_s32(u[9], cospi[28]);
+    v[9] = vmlaq_n_s32(v[9], u[14], cospi[36]);
+    v[9] = vrshlq_s32(v[9], v_bit);
+
+    v[14] = vmulq_n_s32(u[14], cospi[28]);
+    v[14] = vmlsq_n_s32(v[14], u[9], cospi[36]);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[10] = vmulq_n_s32(u[10], cospi[44]);
+    v[10] = vmlaq_n_s32(v[10], u[13], cospi[20]);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[13] = vmulq_n_s32(u[13], cospi[44]);
+    v[13] = vmlsq_n_s32(v[13], u[10], cospi[20]);
+    v[13] = vrshlq_s32(v[13], v_bit);
+
+    v[11] = vmulq_n_s32(u[11], cospi[12]);
+    v[11] = vmlaq_n_s32(v[11], u[12], cospi[52]);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = vmulq_n_s32(u[12], cospi[12]);
+    v[12] = vmlsq_n_s32(v[12], u[11], cospi[52]);
+    v[12] = vrshlq_s32(v[12], v_bit);
+
+    out[0 * col_num + col] = v[0];
+    out[1 * col_num + col] = v[8];
+    out[2 * col_num + col] = v[4];
+    out[3 * col_num + col] = v[12];
+    out[4 * col_num + col] = v[2];
+    out[5 * col_num + col] = v[10];
+    out[6 * col_num + col] = v[6];
+    out[7 * col_num + col] = v[14];
+    out[8 * col_num + col] = v[1];
+    out[9 * col_num + col] = v[9];
+    out[10 * col_num + col] = v[5];
+    out[11 * col_num + col] = v[13];
+    out[12 * col_num + col] = v[3];
+    out[13 * col_num + col] = v[11];
+    out[14 * col_num + col] = v[7];
+    out[15 * col_num + col] = v[15];
+  }
+}
+
+static void fadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                            const int num_cols) {
+  const int32_t *cospi = cospi_arr(bit);
+
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+  int32x4_t u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < num_cols; ++col) {
+    // stage 0-1
+    u[0] = in[0 * num_cols + col];
+    u[1] = vnegq_s32(in[15 * num_cols + col]);
+    u[2] = vnegq_s32(in[7 * num_cols + col]);
+    u[3] = in[8 * num_cols + col];
+    u[4] = vnegq_s32(in[3 * num_cols + col]);
+    u[5] = in[12 * num_cols + col];
+    u[6] = in[4 * num_cols + col];
+    u[7] = vnegq_s32(in[11 * num_cols + col]);
+    u[8] = vnegq_s32(in[1 * num_cols + col]);
+    u[9] = in[14 * num_cols + col];
+    u[10] = in[6 * num_cols + col];
+    u[11] = vnegq_s32(in[9 * num_cols + col]);
+    u[12] = in[2 * num_cols + col];
+    u[13] = vnegq_s32(in[13 * num_cols + col]);
+    u[14] = vnegq_s32(in[5 * num_cols + col]);
+    u[15] = in[10 * num_cols + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+
+    x = vmulq_n_s32(u[2], cospi[32]);
+    y = vmulq_n_s32(u[3], cospi[32]);
+    v[2] = vaddq_s32(x, y);
+    v[2] = vrshlq_s32(v[2], v_bit);
+
+    v[3] = vsubq_s32(x, y);
+    v[3] = vrshlq_s32(v[3], v_bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = vmulq_n_s32(u[6], cospi[32]);
+    y = vmulq_n_s32(u[7], cospi[32]);
+    v[6] = vaddq_s32(x, y);
+    v[6] = vrshlq_s32(v[6], v_bit);
+
+    v[7] = vsubq_s32(x, y);
+    v[7] = vrshlq_s32(v[7], v_bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = vmulq_n_s32(u[10], cospi[32]);
+    y = vmulq_n_s32(u[11], cospi[32]);
+    v[10] = vaddq_s32(x, y);
+    v[10] = vrshlq_s32(v[10], v_bit);
+
+    v[11] = vsubq_s32(x, y);
+    v[11] = vrshlq_s32(v[11], v_bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    x = vmulq_n_s32(u[14], cospi[32]);
+    y = vmulq_n_s32(u[15], cospi[32]);
+    v[14] = vaddq_s32(x, y);
+    v[14] = vrshlq_s32(v[14], v_bit);
+
+    v[15] = vsubq_s32(x, y);
+    v[15] = vrshlq_s32(v[15], v_bit);
+
+    // stage 3
+    u[0] = vaddq_s32(v[0], v[2]);
+    u[1] = vaddq_s32(v[1], v[3]);
+    u[2] = vsubq_s32(v[0], v[2]);
+    u[3] = vsubq_s32(v[1], v[3]);
+    u[4] = vaddq_s32(v[4], v[6]);
+    u[5] = vaddq_s32(v[5], v[7]);
+    u[6] = vsubq_s32(v[4], v[6]);
+    u[7] = vsubq_s32(v[5], v[7]);
+    u[8] = vaddq_s32(v[8], v[10]);
+    u[9] = vaddq_s32(v[9], v[11]);
+    u[10] = vsubq_s32(v[8], v[10]);
+    u[11] = vsubq_s32(v[9], v[11]);
+    u[12] = vaddq_s32(v[12], v[14]);
+    u[13] = vaddq_s32(v[13], v[15]);
+    u[14] = vsubq_s32(v[12], v[14]);
+    u[15] = vsubq_s32(v[13], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = half_btf_neon(&cospi[16], &u[4], &cospi[48], &u[5], v_bit);
+    v[7] = half_btf_neon(&cospi[16], &u[6], &cospi[48], &u[7], v_bit);
+    v[5] = half_btf_neon_m(&cospi[48], &u[4], &cospi[16], &u[5], v_bit);
+    v[6] = half_btf_neon_m(&cospi[16], &u[7], &cospi[48], &u[6], v_bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = half_btf_neon(&cospi[16], &u[12], &cospi[48], &u[13], v_bit);
+    v[15] = half_btf_neon(&cospi[16], &u[14], &cospi[48], &u[15], v_bit);
+    v[13] = half_btf_neon_m(&cospi[48], &u[12], &cospi[16], &u[13], v_bit);
+    v[14] = half_btf_neon_m(&cospi[16], &u[15], &cospi[48], &u[14], v_bit);
+
+    // stage 5
+    u[0] = vaddq_s32(v[0], v[4]);
+    u[1] = vaddq_s32(v[1], v[5]);
+    u[2] = vaddq_s32(v[2], v[6]);
+    u[3] = vaddq_s32(v[3], v[7]);
+    u[4] = vsubq_s32(v[0], v[4]);
+    u[5] = vsubq_s32(v[1], v[5]);
+    u[6] = vsubq_s32(v[2], v[6]);
+    u[7] = vsubq_s32(v[3], v[7]);
+    u[8] = vaddq_s32(v[8], v[12]);
+    u[9] = vaddq_s32(v[9], v[13]);
+    u[10] = vaddq_s32(v[10], v[14]);
+    u[11] = vaddq_s32(v[11], v[15]);
+    u[12] = vsubq_s32(v[8], v[12]);
+    u[13] = vsubq_s32(v[9], v[13]);
+    u[14] = vsubq_s32(v[10], v[14]);
+    u[15] = vsubq_s32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_neon(&cospi[8], &u[8], &cospi[56], &u[9], v_bit);
+    v[13] = half_btf_neon(&cospi[8], &u[12], &cospi[56], &u[13], v_bit);
+    v[9] = half_btf_neon_m(&cospi[56], &u[8], &cospi[8], &u[9], v_bit);
+    v[12] = half_btf_neon_m(&cospi[8], &u[13], &cospi[56], &u[12], v_bit);
+
+    v[10] = half_btf_neon(&cospi[40], &u[10], &cospi[24], &u[11], v_bit);
+    v[15] = half_btf_neon(&cospi[40], &u[14], &cospi[24], &u[15], v_bit);
+    v[11] = half_btf_neon_m(&cospi[24], &u[10], &cospi[40], &u[11], v_bit);
+    v[14] = half_btf_neon_m(&cospi[40], &u[15], &cospi[24], &u[14], v_bit);
+
+    // stage 7
+    u[0] = vaddq_s32(v[0], v[8]);
+    u[1] = vaddq_s32(v[1], v[9]);
+    u[2] = vaddq_s32(v[2], v[10]);
+    u[3] = vaddq_s32(v[3], v[11]);
+    u[4] = vaddq_s32(v[4], v[12]);
+    u[5] = vaddq_s32(v[5], v[13]);
+    u[6] = vaddq_s32(v[6], v[14]);
+    u[7] = vaddq_s32(v[7], v[15]);
+    u[8] = vsubq_s32(v[0], v[8]);
+    u[9] = vsubq_s32(v[1], v[9]);
+    u[10] = vsubq_s32(v[2], v[10]);
+    u[11] = vsubq_s32(v[3], v[11]);
+    u[12] = vsubq_s32(v[4], v[12]);
+    u[13] = vsubq_s32(v[5], v[13]);
+    u[14] = vsubq_s32(v[6], v[14]);
+    u[15] = vsubq_s32(v[7], v[15]);
+
+    // stage 8
+    v[0] = half_btf_neon(&cospi[2], &u[0], &cospi[62], &u[1], v_bit);
+    v[1] = half_btf_neon_m(&cospi[62], &u[0], &cospi[2], &u[1], v_bit);
+    v[2] = half_btf_neon(&cospi[10], &u[2], &cospi[54], &u[3], v_bit);
+    v[3] = half_btf_neon_m(&cospi[54], &u[2], &cospi[10], &u[3], v_bit);
+    v[4] = half_btf_neon(&cospi[18], &u[4], &cospi[46], &u[5], v_bit);
+    v[5] = half_btf_neon_m(&cospi[46], &u[4], &cospi[18], &u[5], v_bit);
+    v[6] = half_btf_neon(&cospi[26], &u[6], &cospi[38], &u[7], v_bit);
+    v[7] = half_btf_neon_m(&cospi[38], &u[6], &cospi[26], &u[7], v_bit);
+    v[8] = half_btf_neon(&cospi[34], &u[8], &cospi[30], &u[9], v_bit);
+    v[9] = half_btf_neon_m(&cospi[30], &u[8], &cospi[34], &u[9], v_bit);
+    v[10] = half_btf_neon(&cospi[42], &u[10], &cospi[22], &u[11], v_bit);
+    v[11] = half_btf_neon_m(&cospi[22], &u[10], &cospi[42], &u[11], v_bit);
+    v[12] = half_btf_neon(&cospi[50], &u[12], &cospi[14], &u[13], v_bit);
+    v[13] = half_btf_neon_m(&cospi[14], &u[12], &cospi[50], &u[13], v_bit);
+    v[14] = half_btf_neon(&cospi[58], &u[14], &cospi[6], &u[15], v_bit);
+    v[15] = half_btf_neon_m(&cospi[6], &u[14], &cospi[58], &u[15], v_bit);
+
+    // stage 9
+    out[0 * num_cols + col] = v[1];
+    out[1 * num_cols + col] = v[14];
+    out[2 * num_cols + col] = v[3];
+    out[3 * num_cols + col] = v[12];
+    out[4 * num_cols + col] = v[5];
+    out[5 * num_cols + col] = v[10];
+    out[6 * num_cols + col] = v[7];
+    out[7 * num_cols + col] = v[8];
+    out[8 * num_cols + col] = v[9];
+    out[9 * num_cols + col] = v[6];
+    out[10 * num_cols + col] = v[11];
+    out[11 * num_cols + col] = v[4];
+    out[12 * num_cols + col] = v[13];
+    out[13 * num_cols + col] = v[2];
+    out[14 * num_cols + col] = v[15];
+    out[15 * num_cols + col] = v[0];
+  }
+}
+
+static void col_txfm_16x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
+  // Note:
+  //  We split 16x16 rounding into 4 sections of 8x8 rounding,
+  //  instead of 4 columns
+  col_txfm_8x8_rounding(&in[0], v_shift);
+  col_txfm_8x8_rounding(&in[16], v_shift);
+  col_txfm_8x8_rounding(&in[32], v_shift);
+  col_txfm_8x8_rounding(&in[48], v_shift);
+}
+
+static void col_txfm_8x16_rounding(int32x4_t *in, const int32x4_t *v_shift) {
+  col_txfm_8x8_rounding(&in[0], v_shift);
+  col_txfm_8x8_rounding(&in[16], v_shift);
+}
+
+static void write_buffer_16x16(const int32x4_t *in, int32_t *output) {
+  const int size_8x8 = 16 * 4;
+  write_buffer_8x8(&in[0], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[16], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[32], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[48], output);
+}
+static void idtx16x16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                           int col_num) {
+  (void)bit;
+  int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+  int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+  int32x4_t a_low;
+
+  int num_iters = 16 * col_num;
+  for (int i = 0; i < num_iters; i++) {
+    a_low = vmulq_s32(in[i], fact);
+    a_low = vaddq_s32(a_low, offset);
+    out[i] = vshrq_n_s32(a_low, NewSqrt2Bits);
+  }
+}
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[64], out[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int col_num = 4;
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case IDTX:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fdct16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      idtx16x16_neon(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, &v_shift);
+      transpose_16x16(out, in);
+      fadst16x16_neon(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    default: assert(0);
+  }
+  (void)bd;
+}
+
+static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
+  for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
+                                      const int num_cols);
+
+static const fwd_transform_1d_neon col_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  fadst8x8_neon,  // ADST_DCT
+  fdct8x8_neon,   // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fadst8x8_neon,  // FLIPADST_DCT
+  fdct8x8_neon,   // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  fdct8x8_neon,   // V_DCT
+  idtx8x8_neon,   // H_DCT
+  fadst8x8_neon,  // V_ADST
+  idtx8x8_neon,   // H_ADST
+  fadst8x8_neon,  // V_FLIPADST
+  idtx8x8_neon    // H_FLIPADST
+};
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_neon row_highbd_txfm32x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  NULL,           // ADST_DCT
+  NULL,           // DCT_ADST
+  NULL,           // ADST_ADST
+  NULL,           // FLIPADST_DCT
+  NULL,           // DCT_FLIPADST
+  NULL,           // FLIPADST_FLIPADST
+  NULL,           // ADST_FLIPADST
+  NULL,           // FLIPADST-ADST
+  idtx32x8_neon,  // IDTX
+  NULL,           // V_DCT
+  NULL,           // H_DCT
+  NULL,           // V_ADST
+  NULL,           // H_ADST
+  NULL,           // V_FLIPADST
+  NULL,           // H_FLIPADST
+};
+#endif
+static const fwd_transform_1d_neon col_highbd_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,   // DCT_DCT
+  fadst8x8_neon,  // ADST_DCT
+  fdct4x8_neon,   // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fadst8x8_neon,  // FLIPADST_DCT
+  fdct4x8_neon,   // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  fdct4x8_neon,   // V_DCT
+  idtx8x8_neon,   // H_DCT
+  fadst8x8_neon,  // V_ADST
+  idtx8x8_neon,   // H_ADST
+  fadst8x8_neon,  // V_FLIPADST
+  idtx8x8_neon    // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_neon,   // DCT_DCT
+  fdct16x16_neon,   // ADST_DCT
+  fadst16x16_neon,  // DCT_ADST
+  fadst16x16_neon,  // ADST_ADST
+  fdct16x16_neon,   // FLIPADST_DCT
+  fadst16x16_neon,  // DCT_FLIPADST
+  fadst16x16_neon,  // FLIPADST_FLIPADST
+  fadst16x16_neon,  // ADST_FLIPADST
+  fadst16x16_neon,  // FLIPADST_ADST
+  idtx16x16_neon,   // IDTX
+  idtx16x16_neon,   // V_DCT
+  fdct16x16_neon,   // H_DCT
+  idtx16x16_neon,   // V_ADST
+  fadst16x16_neon,  // H_ADST
+  idtx16x16_neon,   // V_FLIPADST
+  fadst16x16_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon col_highbd_txfm8x16_arr[TX_TYPES] = {
+  fdct16x16_neon,   // DCT_DCT
+  fadst16x16_neon,  // ADST_DCT
+  fdct16x16_neon,   // DCT_ADST
+  fadst16x16_neon,  // ADST_ADST
+  fadst16x16_neon,  // FLIPADST_DCT
+  fdct16x16_neon,   // DCT_FLIPADST
+  fadst16x16_neon,  // FLIPADST_FLIPADST
+  fadst16x16_neon,  // ADST_FLIPADST
+  fadst16x16_neon,  // FLIPADST_ADST
+  idtx16x16_neon,   // IDTX
+  fdct16x16_neon,   // V_DCT
+  idtx16x16_neon,   // H_DCT
+  fadst16x16_neon,  // V_ADST
+  idtx16x16_neon,   // H_ADST
+  fadst16x16_neon,  // V_FLIPADST
+  idtx16x16_neon    // H_FLIPADST
+};
+static const fwd_transform_1d_neon row_highbd_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_neon,   // DCT_DCT
+  fdct8x8_neon,   // ADST_DCT
+  fadst8x8_neon,  // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fdct8x8_neon,   // FLIPADST_DCT
+  fadst8x8_neon,  // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  idtx8x8_neon,   // V_DCT
+  fdct8x8_neon,   // H_DCT
+  idtx8x8_neon,   // V_ADST
+  fadst8x8_neon,  // H_ADST
+  idtx8x8_neon,   // V_FLIPADST
+  fadst8x8_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_neon,   // DCT_DCT
+  fdct4x8_neon,   // ADST_DCT
+  fadst8x8_neon,  // DCT_ADST
+  fadst8x8_neon,  // ADST_ADST
+  fdct4x8_neon,   // FLIPADST_DCT
+  fadst8x8_neon,  // DCT_FLIPADST
+  fadst8x8_neon,  // FLIPADST_FLIPADST
+  fadst8x8_neon,  // ADST_FLIPADST
+  fadst8x8_neon,  // FLIPADST_ADST
+  idtx8x8_neon,   // IDTX
+  idtx8x8_neon,   // V_DCT
+  fdct4x8_neon,   // H_DCT
+  idtx8x8_neon,   // V_ADST
+  fadst8x8_neon,  // H_ADST
+  idtx8x8_neon,   // V_FLIPADST
+  fadst8x8_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_neon,   // DCT_DCT
+  fdct4x4_neon,   // ADST_DCT
+  fadst4x4_neon,  // DCT_ADST
+  fadst4x4_neon,  // ADST_ADST
+  fdct4x4_neon,   // FLIPADST_DCT
+  fadst4x4_neon,  // DCT_FLIPADST
+  fadst4x4_neon,  // FLIPADST_FLIPADST
+  fadst4x4_neon,  // ADST_FLIPADST
+  fadst4x4_neon,  // FLIPADST_ADST
+  idtx4x4_neon,   // IDTX
+  idtx4x4_neon,   // V_DCT
+  fdct4x4_neon,   // H_DCT
+  idtx4x4_neon,   // V_ADST
+  fadst4x4_neon,  // H_ADST
+  idtx4x4_neon,   // V_FLIPADST
+  fadst4x4_neon   // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon col_highbd_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_neon,   // DCT_DCT
+  fadst4x4_neon,  // ADST_DCT
+  fdct4x4_neon,   // DCT_ADST
+  fadst4x4_neon,  // ADST_ADST
+  fadst4x4_neon,  // FLIPADST_DCT
+  fdct4x4_neon,   // DCT_FLIPADST
+  fadst4x4_neon,  // FLIPADST_FLIPADST
+  fadst4x4_neon,  // ADST_FLIPADST
+  fadst4x4_neon,  // FLIPADST_ADST
+  idtx4x4_neon,   // IDTX
+  fdct4x4_neon,   // V_DCT
+  idtx4x4_neon,   // H_DCT
+  fadst4x4_neon,  // V_ADST
+  idtx4x4_neon,   // H_ADST
+  fadst4x4_neon,  // V_FLIPADST
+  idtx4x4_neon    // H_FLIPADST
+};
+
+void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
+                         const int stride) {
+  int32x4_t buf0[32];
+  int32x4_t buf1[32];
+  const int32_t *cospi;
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * stride;
+  int endidx = 31 * stride;
+  // stage 0
+  // stage 1
+  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
+  startidx += stride;
+  endidx -= stride;
+  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
+  buf1[16] = vsubq_s32(input[startidx], input[endidx]);
+
+  // stage 2
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
+  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
+  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
+  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
+  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
+  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
+  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
+  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
+  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
+  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
+  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                    buf0[24], v_cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
+  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
+  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
+  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
+  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
+  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
+  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
+  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
+  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
+  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
+  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
+  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
+  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
+  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
+  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
+  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
+  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
+  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_neon_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
+  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
+  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
+  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
+  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
+  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
+  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  btf_32_neon_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+                    v_cos_bit);
+
+  btf_32_neon_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+                    v_cos_bit);
+  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
+  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
+  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
+  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_neon_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
+  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
+  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
+  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
+  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
+  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
+  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
+  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
+  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
+  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
+  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
+  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
+  buf1[30] = vaddq_s32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+
+  btf_32_neon_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+
+  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
+  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
+  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
+  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
+  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
+  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
+  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
+  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+
+  btf_32_neon_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+  btf_32_neon_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                    buf1[13], v_cos_bit);
+  btf_32_neon_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                    buf1[12], v_cos_bit);
+  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
+  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
+  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
+  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
+  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
+  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
+  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
+  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
+  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
+  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
+  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
+  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
+  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
+  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
+  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
+  buf1[31] = vaddq_s32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+  btf_32_neon_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                    buf0[30], v_cos_bit);
+  btf_32_neon_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                    buf0[29], v_cos_bit);
+  btf_32_neon_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                    buf0[28], v_cos_bit);
+  btf_32_neon_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                    buf0[27], v_cos_bit);
+  btf_32_neon_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                    buf0[26], v_cos_bit);
+  btf_32_neon_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                    buf0[25], v_cos_bit);
+  btf_32_neon_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
+                    v_cos_bit);
+
+  startidx = 0 * stride;
+  endidx = 31 * stride;
+  // stage 9
+  output[startidx] = buf0[0];
+  output[endidx] = buf0[31];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[16];
+  output[endidx] = buf0[15];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[8];
+  output[endidx] = buf0[23];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[24];
+  output[endidx] = buf0[7];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[4];
+  output[endidx] = buf0[27];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[20];
+  output[endidx] = buf0[11];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[12];
+  output[endidx] = buf0[19];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[28];
+  output[endidx] = buf0[3];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[2];
+  output[endidx] = buf0[29];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[18];
+  output[endidx] = buf0[13];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[10];
+  output[endidx] = buf0[21];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[26];
+  output[endidx] = buf0[5];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[6];
+  output[endidx] = buf0[25];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[22];
+  output[endidx] = buf0[9];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[14];
+  output[endidx] = buf0[17];
+  startidx += stride;
+  endidx -= stride;
+  output[startidx] = buf0[30];
+  output[endidx] = buf0[1];
+}
+
+void av1_fadst4_new_neon(const int32x4_t *input, int32x4_t *output,
+                         const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t *cospi;
+  int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+  int32x4_t buf0[4];
+  int32x4_t buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  cospi = cospi_arr(cos_bit);
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    int j;
+    for (j = 0; j < 4; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
+
+    // stage 1
+    stage_idx++;
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2
+    stage_idx++;
+
+    btf_32_neon_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                      v_cos_bit);
+    btf_32_neon_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
+                      v_cos_bit);
+
+    // stage 3
+    stage_idx++;
+    buf1[0] = vaddq_s32(buf0[0], buf0[2]);
+    buf1[2] = vsubq_s32(buf0[0], buf0[2]);
+    buf1[1] = vaddq_s32(buf0[1], buf0[3]);
+    buf1[3] = vsubq_s32(buf0[1], buf0[3]);
+
+    // stage 4
+    stage_idx++;
+
+    cospi = cospi_arr(cos_bit);
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+
+    btf_32_neon_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
+                      v_cos_bit);
+
+    // stage 5
+    stage_idx++;
+    buf1[0] = buf0[0];
+    buf1[1] = vnegq_s32(buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = vnegq_s32(buf0[1]);
+
+    for (j = 0; j < 4; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
+  }
+}
+
+static void av1_fdct64_new_stage12345_neon(int32x4_t *input, const int instride,
+                                           int32x4_t *x5, const int32_t *cospi,
+                                           const int32x4_t *v_cos_bit,
+                                           int *startidx, int *endidx) {
+  int32x4_t x1[64];
+  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
+  *startidx += instride;
+  *endidx -= instride;
+  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
+  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
+
+  // stage 2
+  int32x4_t x2[64];
+  x2[0] = vaddq_s32(x1[0], x1[31]);
+  x2[31] = vsubq_s32(x1[0], x1[31]);
+  x2[1] = vaddq_s32(x1[1], x1[30]);
+  x2[30] = vsubq_s32(x1[1], x1[30]);
+  x2[2] = vaddq_s32(x1[2], x1[29]);
+  x2[29] = vsubq_s32(x1[2], x1[29]);
+  x2[3] = vaddq_s32(x1[3], x1[28]);
+  x2[28] = vsubq_s32(x1[3], x1[28]);
+  x2[4] = vaddq_s32(x1[4], x1[27]);
+  x2[27] = vsubq_s32(x1[4], x1[27]);
+  x2[5] = vaddq_s32(x1[5], x1[26]);
+  x2[26] = vsubq_s32(x1[5], x1[26]);
+  x2[6] = vaddq_s32(x1[6], x1[25]);
+  x2[25] = vsubq_s32(x1[6], x1[25]);
+  x2[7] = vaddq_s32(x1[7], x1[24]);
+  x2[24] = vsubq_s32(x1[7], x1[24]);
+  x2[8] = vaddq_s32(x1[8], x1[23]);
+  x2[23] = vsubq_s32(x1[8], x1[23]);
+  x2[9] = vaddq_s32(x1[9], x1[22]);
+  x2[22] = vsubq_s32(x1[9], x1[22]);
+  x2[10] = vaddq_s32(x1[10], x1[21]);
+  x2[21] = vsubq_s32(x1[10], x1[21]);
+  x2[11] = vaddq_s32(x1[11], x1[20]);
+  x2[20] = vsubq_s32(x1[11], x1[20]);
+  x2[12] = vaddq_s32(x1[12], x1[19]);
+  x2[19] = vsubq_s32(x1[12], x1[19]);
+  x2[13] = vaddq_s32(x1[13], x1[18]);
+  x2[18] = vsubq_s32(x1[13], x1[18]);
+  x2[14] = vaddq_s32(x1[14], x1[17]);
+  x2[17] = vsubq_s32(x1[14], x1[17]);
+  x2[15] = vaddq_s32(x1[15], x1[16]);
+  x2[16] = vsubq_s32(x1[15], x1[16]);
+  x2[32] = x1[32];
+  x2[33] = x1[33];
+  x2[34] = x1[34];
+  x2[35] = x1[35];
+  x2[36] = x1[36];
+  x2[37] = x1[37];
+  x2[38] = x1[38];
+  x2[39] = x1[39];
+
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
+                    *v_cos_bit);
+  x2[56] = x1[56];
+  x2[57] = x1[57];
+  x2[58] = x1[58];
+  x2[59] = x1[59];
+  x2[60] = x1[60];
+  x2[61] = x1[61];
+  x2[62] = x1[62];
+  x2[63] = x1[63];
+
+  // stage 3
+  int32x4_t x3[64];
+  x3[0] = vaddq_s32(x2[0], x2[15]);
+  x3[15] = vsubq_s32(x2[0], x2[15]);
+  x3[1] = vaddq_s32(x2[1], x2[14]);
+  x3[14] = vsubq_s32(x2[1], x2[14]);
+  x3[2] = vaddq_s32(x2[2], x2[13]);
+  x3[13] = vsubq_s32(x2[2], x2[13]);
+  x3[3] = vaddq_s32(x2[3], x2[12]);
+  x3[12] = vsubq_s32(x2[3], x2[12]);
+  x3[4] = vaddq_s32(x2[4], x2[11]);
+  x3[11] = vsubq_s32(x2[4], x2[11]);
+  x3[5] = vaddq_s32(x2[5], x2[10]);
+  x3[10] = vsubq_s32(x2[5], x2[10]);
+  x3[6] = vaddq_s32(x2[6], x2[9]);
+  x3[9] = vsubq_s32(x2[6], x2[9]);
+  x3[7] = vaddq_s32(x2[7], x2[8]);
+  x3[8] = vsubq_s32(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
+                    *v_cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = vaddq_s32(x2[32], x2[47]);
+  x3[47] = vsubq_s32(x2[32], x2[47]);
+  x3[33] = vaddq_s32(x2[33], x2[46]);
+  x3[46] = vsubq_s32(x2[33], x2[46]);
+  x3[34] = vaddq_s32(x2[34], x2[45]);
+  x3[45] = vsubq_s32(x2[34], x2[45]);
+  x3[35] = vaddq_s32(x2[35], x2[44]);
+  x3[44] = vsubq_s32(x2[35], x2[44]);
+  x3[36] = vaddq_s32(x2[36], x2[43]);
+  x3[43] = vsubq_s32(x2[36], x2[43]);
+  x3[37] = vaddq_s32(x2[37], x2[42]);
+  x3[42] = vsubq_s32(x2[37], x2[42]);
+  x3[38] = vaddq_s32(x2[38], x2[41]);
+  x3[41] = vsubq_s32(x2[38], x2[41]);
+  x3[39] = vaddq_s32(x2[39], x2[40]);
+  x3[40] = vsubq_s32(x2[39], x2[40]);
+  x3[48] = vsubq_s32(x2[63], x2[48]);
+  x3[63] = vaddq_s32(x2[63], x2[48]);
+  x3[49] = vsubq_s32(x2[62], x2[49]);
+  x3[62] = vaddq_s32(x2[62], x2[49]);
+  x3[50] = vsubq_s32(x2[61], x2[50]);
+  x3[61] = vaddq_s32(x2[61], x2[50]);
+  x3[51] = vsubq_s32(x2[60], x2[51]);
+  x3[60] = vaddq_s32(x2[60], x2[51]);
+  x3[52] = vsubq_s32(x2[59], x2[52]);
+  x3[59] = vaddq_s32(x2[59], x2[52]);
+  x3[53] = vsubq_s32(x2[58], x2[53]);
+  x3[58] = vaddq_s32(x2[58], x2[53]);
+  x3[54] = vsubq_s32(x2[57], x2[54]);
+  x3[57] = vaddq_s32(x2[57], x2[54]);
+  x3[55] = vsubq_s32(x2[56], x2[55]);
+  x3[56] = vaddq_s32(x2[56], x2[55]);
+
+  // stage 4
+  int32x4_t x4[64];
+  x4[0] = vaddq_s32(x3[0], x3[7]);
+  x4[7] = vsubq_s32(x3[0], x3[7]);
+  x4[1] = vaddq_s32(x3[1], x3[6]);
+  x4[6] = vsubq_s32(x3[1], x3[6]);
+  x4[2] = vaddq_s32(x3[2], x3[5]);
+  x4[5] = vsubq_s32(x3[2], x3[5]);
+  x4[3] = vaddq_s32(x3[3], x3[4]);
+  x4[4] = vsubq_s32(x3[3], x3[4]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  btf_32_neon_type0(-cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
+                    *v_cos_bit);
+  x4[14] = x3[14];
+  x4[15] = x3[15];
+  x4[16] = vaddq_s32(x3[16], x3[23]);
+  x4[23] = vsubq_s32(x3[16], x3[23]);
+  x4[17] = vaddq_s32(x3[17], x3[22]);
+  x4[22] = vsubq_s32(x3[17], x3[22]);
+  x4[18] = vaddq_s32(x3[18], x3[21]);
+  x4[21] = vsubq_s32(x3[18], x3[21]);
+  x4[19] = vaddq_s32(x3[19], x3[20]);
+  x4[20] = vsubq_s32(x3[19], x3[20]);
+  x4[24] = vsubq_s32(x3[31], x3[24]);
+  x4[31] = vaddq_s32(x3[31], x3[24]);
+  x4[25] = vsubq_s32(x3[30], x3[25]);
+  x4[30] = vaddq_s32(x3[30], x3[25]);
+  x4[26] = vsubq_s32(x3[29], x3[26]);
+  x4[29] = vaddq_s32(x3[29], x3[26]);
+  x4[27] = vsubq_s32(x3[28], x3[27]);
+  x4[28] = vaddq_s32(x3[28], x3[27]);
+  x4[32] = x3[32];
+  x4[33] = x3[33];
+  x4[34] = x3[34];
+  x4[35] = x3[35];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[40], x3[55], x4[40], x4[55],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[41], x3[54], x4[41], x4[54],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[42], x3[53], x4[42], x4[53],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x3[43], x3[52], x4[43], x4[52],
+                    *v_cos_bit);
+  x4[44] = x3[44];
+  x4[45] = x3[45];
+  x4[46] = x3[46];
+  x4[47] = x3[47];
+  x4[48] = x3[48];
+  x4[49] = x3[49];
+  x4[50] = x3[50];
+  x4[51] = x3[51];
+  x4[60] = x3[60];
+  x4[61] = x3[61];
+  x4[62] = x3[62];
+  x4[63] = x3[63];
+
+  // stage 5
+  x5[0] = vaddq_s32(x4[0], x4[3]);
+  x5[3] = vsubq_s32(x4[0], x4[3]);
+  x5[1] = vaddq_s32(x4[1], x4[2]);
+  x5[2] = vsubq_s32(x4[1], x4[2]);
+  x5[4] = x4[4];
+
+  btf_32_neon_type0(-cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
+                    *v_cos_bit);
+  x5[7] = x4[7];
+  x5[8] = vaddq_s32(x4[8], x4[11]);
+  x5[11] = vsubq_s32(x4[8], x4[11]);
+  x5[9] = vaddq_s32(x4[9], x4[10]);
+  x5[10] = vsubq_s32(x4[9], x4[10]);
+  x5[12] = vsubq_s32(x4[15], x4[12]);
+  x5[15] = vaddq_s32(x4[15], x4[12]);
+  x5[13] = vsubq_s32(x4[14], x4[13]);
+  x5[14] = vaddq_s32(x4[14], x4[13]);
+  x5[16] = x4[16];
+  x5[17] = x4[17];
+
+  btf_32_neon_type0(-cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x4[20], x4[27], x5[20], x5[27],
+                    *v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x4[21], x4[26], x5[21], x5[26],
+                    *v_cos_bit);
+  x5[22] = x4[22];
+  x5[23] = x4[23];
+  x5[24] = x4[24];
+  x5[25] = x4[25];
+  x5[30] = x4[30];
+  x5[31] = x4[31];
+  x5[32] = vaddq_s32(x4[32], x4[39]);
+  x5[39] = vsubq_s32(x4[32], x4[39]);
+  x5[33] = vaddq_s32(x4[33], x4[38]);
+  x5[38] = vsubq_s32(x4[33], x4[38]);
+  x5[34] = vaddq_s32(x4[34], x4[37]);
+  x5[37] = vsubq_s32(x4[34], x4[37]);
+  x5[35] = vaddq_s32(x4[35], x4[36]);
+  x5[36] = vsubq_s32(x4[35], x4[36]);
+  x5[40] = vsubq_s32(x4[47], x4[40]);
+  x5[47] = vaddq_s32(x4[47], x4[40]);
+  x5[41] = vsubq_s32(x4[46], x4[41]);
+  x5[46] = vaddq_s32(x4[46], x4[41]);
+  x5[42] = vsubq_s32(x4[45], x4[42]);
+  x5[45] = vaddq_s32(x4[45], x4[42]);
+  x5[43] = vsubq_s32(x4[44], x4[43]);
+  x5[44] = vaddq_s32(x4[44], x4[43]);
+  x5[48] = vaddq_s32(x4[48], x4[55]);
+  x5[55] = vsubq_s32(x4[48], x4[55]);
+  x5[49] = vaddq_s32(x4[49], x4[54]);
+  x5[54] = vsubq_s32(x4[49], x4[54]);
+  x5[50] = vaddq_s32(x4[50], x4[53]);
+  x5[53] = vsubq_s32(x4[50], x4[53]);
+  x5[51] = vaddq_s32(x4[51], x4[52]);
+  x5[52] = vsubq_s32(x4[51], x4[52]);
+  x5[56] = vsubq_s32(x4[63], x4[56]);
+  x5[63] = vaddq_s32(x4[63], x4[56]);
+  x5[57] = vsubq_s32(x4[62], x4[57]);
+  x5[62] = vaddq_s32(x4[62], x4[57]);
+  x5[58] = vsubq_s32(x4[61], x4[58]);
+  x5[61] = vaddq_s32(x4[61], x4[58]);
+  x5[59] = vsubq_s32(x4[60], x4[59]);
+  x5[60] = vaddq_s32(x4[60], x4[59]);
+}
+
+static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                                int8_t cos_bit, const int instride,
+                                const int outstride) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+  int startidx = 0 * instride;
+  int endidx = 63 * instride;
+
+  // stage 1-2-3-4-5
+  int32x4_t x5[64];
+  av1_fdct64_new_stage12345_neon(input, instride, x5, cospi, &v_cos_bit,
+                                 &startidx, &endidx);
+
+  // stage 6
+  int32x4_t x6[64];
+  btf_32_neon_type0(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
+                    v_cos_bit);
+  x6[4] = vaddq_s32(x5[4], x5[5]);
+  x6[5] = vsubq_s32(x5[4], x5[5]);
+  x6[6] = vsubq_s32(x5[7], x5[6]);
+  x6[7] = vaddq_s32(x5[7], x5[6]);
+  x6[8] = x5[8];
+  btf_32_neon_type0(-cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[48], -cospi[16], x5[10], x5[13], x6[10], x6[13],
+                    v_cos_bit);
+  x6[11] = x5[11];
+  x6[12] = x5[12];
+  x6[15] = x5[15];
+  x6[16] = vaddq_s32(x5[16], x5[19]);
+  x6[19] = vsubq_s32(x5[16], x5[19]);
+  x6[17] = vaddq_s32(x5[17], x5[18]);
+  x6[18] = vsubq_s32(x5[17], x5[18]);
+  x6[20] = vsubq_s32(x5[23], x5[20]);
+  x6[23] = vaddq_s32(x5[23], x5[20]);
+  x6[21] = vsubq_s32(x5[22], x5[21]);
+  x6[22] = vaddq_s32(x5[22], x5[21]);
+  x6[24] = vaddq_s32(x5[24], x5[27]);
+  x6[27] = vsubq_s32(x5[24], x5[27]);
+  x6[25] = vaddq_s32(x5[25], x5[26]);
+  x6[26] = vsubq_s32(x5[25], x5[26]);
+  x6[28] = vsubq_s32(x5[31], x5[28]);
+  x6[31] = vaddq_s32(x5[31], x5[28]);
+  x6[29] = vsubq_s32(x5[30], x5[29]);
+  x6[30] = vaddq_s32(x5[30], x5[29]);
+  x6[32] = x5[32];
+  x6[33] = x5[33];
+
+  btf_32_neon_type0(-cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x5[44], x5[51], x6[44], x6[51],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x5[45], x5[50], x6[45], x6[50],
+                    v_cos_bit);
+
+  x6[46] = x5[46];
+  x6[47] = x5[47];
+  x6[48] = x5[48];
+  x6[49] = x5[49];
+  x6[54] = x5[54];
+  x6[55] = x5[55];
+  x6[56] = x5[56];
+  x6[57] = x5[57];
+  x6[62] = x5[62];
+  x6[63] = x5[63];
+
+  // stage 7
+  int32x4_t x7[64];
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  btf_32_neon_type1(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
+                    v_cos_bit);
+
+  x7[8] = vaddq_s32(x6[8], x6[9]);
+  x7[9] = vsubq_s32(x6[8], x6[9]);
+  x7[10] = vsubq_s32(x6[11], x6[10]);
+  x7[11] = vaddq_s32(x6[11], x6[10]);
+  x7[12] = vaddq_s32(x6[12], x6[13]);
+  x7[13] = vsubq_s32(x6[12], x6[13]);
+  x7[14] = vsubq_s32(x6[15], x6[14]);
+  x7[15] = vaddq_s32(x6[15], x6[14]);
+  x7[16] = x6[16];
+
+  btf_32_neon_type0(-cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[24], -cospi[40], x6[22], x6[25], x7[22], x7[25],
+                    v_cos_bit);
+  x7[23] = x6[23];
+  x7[24] = x6[24];
+  x7[27] = x6[27];
+  x7[28] = x6[28];
+  x7[31] = x6[31];
+
+  btf_32_neon_type0(-cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x5[36], x5[59], x6[36], x6[59],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x5[37], x5[58], x6[37], x6[58],
+                    v_cos_bit);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+
+  btf_32_neon_type1(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
+  btf_32_neon_type0(-cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[56], -cospi[8], x6[18], x6[29], x7[18], x7[29],
+                    v_cos_bit);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+
+  x7[32] = vaddq_s32(x6[32], x6[35]);
+  x7[35] = vsubq_s32(x6[32], x6[35]);
+  x7[33] = vaddq_s32(x6[33], x6[34]);
+  x7[34] = vsubq_s32(x6[33], x6[34]);
+  x7[36] = vsubq_s32(x6[39], x6[36]);
+  x7[39] = vaddq_s32(x6[39], x6[36]);
+  x7[37] = vsubq_s32(x6[38], x6[37]);
+  x7[38] = vaddq_s32(x6[38], x6[37]);
+  x7[40] = vaddq_s32(x6[40], x6[43]);
+  x7[43] = vsubq_s32(x6[40], x6[43]);
+  x7[41] = vaddq_s32(x6[41], x6[42]);
+  x7[42] = vsubq_s32(x6[41], x6[42]);
+  x7[44] = vsubq_s32(x6[47], x6[44]);
+  x7[47] = vaddq_s32(x6[47], x6[44]);
+  x7[45] = vsubq_s32(x6[46], x6[45]);
+  x7[46] = vaddq_s32(x6[46], x6[45]);
+  x7[48] = vaddq_s32(x6[48], x6[51]);
+  x7[51] = vsubq_s32(x6[48], x6[51]);
+  x7[49] = vaddq_s32(x6[49], x6[50]);
+  x7[50] = vsubq_s32(x6[49], x6[50]);
+  x7[52] = vsubq_s32(x6[55], x6[52]);
+  x7[55] = vaddq_s32(x6[55], x6[52]);
+  x7[53] = vsubq_s32(x6[54], x6[53]);
+  x7[54] = vaddq_s32(x6[54], x6[53]);
+  x7[56] = vaddq_s32(x6[56], x6[59]);
+  x7[59] = vsubq_s32(x6[56], x6[59]);
+  x7[57] = vaddq_s32(x6[57], x6[58]);
+  x7[58] = vsubq_s32(x6[57], x6[58]);
+  x7[60] = vsubq_s32(x6[63], x6[60]);
+  x7[63] = vaddq_s32(x6[63], x6[60]);
+  x7[61] = vsubq_s32(x6[62], x6[61]);
+  x7[62] = vaddq_s32(x6[62], x6[61]);
+
+  // stage 8
+  int32x4_t x8[64];
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+
+  btf_32_neon_type1(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
+                    v_cos_bit);
+  x8[16] = vaddq_s32(x7[16], x7[17]);
+  x8[17] = vsubq_s32(x7[16], x7[17]);
+  x8[18] = vsubq_s32(x7[19], x7[18]);
+  x8[19] = vaddq_s32(x7[19], x7[18]);
+  x8[20] = vaddq_s32(x7[20], x7[21]);
+  x8[21] = vsubq_s32(x7[20], x7[21]);
+  x8[22] = vsubq_s32(x7[23], x7[22]);
+  x8[23] = vaddq_s32(x7[23], x7[22]);
+  x8[24] = vaddq_s32(x7[24], x7[25]);
+  x8[25] = vsubq_s32(x7[24], x7[25]);
+  x8[26] = vsubq_s32(x7[27], x7[26]);
+  x8[27] = vaddq_s32(x7[27], x7[26]);
+  x8[28] = vaddq_s32(x7[28], x7[29]);
+  x8[29] = vsubq_s32(x7[28], x7[29]);
+  x8[30] = vsubq_s32(x7[31], x7[30]);
+  x8[31] = vaddq_s32(x7[31], x7[30]);
+  x8[32] = x7[32];
+
+  btf_32_neon_type0(-cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[60], -cospi[4], x7[34], x7[61], x8[34], x8[61],
+                    v_cos_bit);
+  x8[35] = x7[35];
+  x8[36] = x7[36];
+  btf_32_neon_type0(-cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[28], -cospi[36], x7[38], x7[57], x8[38], x8[57],
+                    v_cos_bit);
+  x8[39] = x7[39];
+  x8[40] = x7[40];
+  btf_32_neon_type0(-cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[44], -cospi[20], x7[42], x7[53], x8[42], x8[53],
+                    v_cos_bit);
+  x8[43] = x7[43];
+  x8[44] = x7[44];
+  btf_32_neon_type0(-cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
+                    v_cos_bit);
+  btf_32_neon_type0(-cospi[12], -cospi[52], x7[46], x7[49], x8[46], x8[49],
+                    v_cos_bit);
+  x8[47] = x7[47];
+  x8[48] = x7[48];
+  x8[51] = x7[51];
+  x8[52] = x7[52];
+  x8[55] = x7[55];
+  x8[56] = x7[56];
+  x8[59] = x7[59];
+  x8[60] = x7[60];
+  x8[63] = x7[63];
+
+  // stage 9
+  int32x4_t x9[64];
+  x9[0] = x8[0];
+  x9[1] = x8[1];
+  x9[2] = x8[2];
+  x9[3] = x8[3];
+  x9[4] = x8[4];
+  x9[5] = x8[5];
+  x9[6] = x8[6];
+  x9[7] = x8[7];
+  x9[8] = x8[8];
+  x9[9] = x8[9];
+  x9[10] = x8[10];
+  x9[11] = x8[11];
+  x9[12] = x8[12];
+  x9[13] = x8[13];
+  x9[14] = x8[14];
+  x9[15] = x8[15];
+
+  btf_32_neon_type1(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
+                    v_cos_bit);
+
+  x9[32] = vaddq_s32(x8[32], x8[33]);
+  x9[33] = vsubq_s32(x8[32], x8[33]);
+  x9[34] = vsubq_s32(x8[35], x8[34]);
+  x9[35] = vaddq_s32(x8[35], x8[34]);
+  x9[36] = vaddq_s32(x8[36], x8[37]);
+  x9[37] = vsubq_s32(x8[36], x8[37]);
+  x9[38] = vsubq_s32(x8[39], x8[38]);
+  x9[39] = vaddq_s32(x8[39], x8[38]);
+  x9[40] = vaddq_s32(x8[40], x8[41]);
+  x9[41] = vsubq_s32(x8[40], x8[41]);
+  x9[42] = vsubq_s32(x8[43], x8[42]);
+  x9[43] = vaddq_s32(x8[43], x8[42]);
+  x9[44] = vaddq_s32(x8[44], x8[45]);
+  x9[45] = vsubq_s32(x8[44], x8[45]);
+  x9[46] = vsubq_s32(x8[47], x8[46]);
+  x9[47] = vaddq_s32(x8[47], x8[46]);
+  x9[48] = vaddq_s32(x8[48], x8[49]);
+  x9[49] = vsubq_s32(x8[48], x8[49]);
+  x9[50] = vsubq_s32(x8[51], x8[50]);
+  x9[51] = vaddq_s32(x8[51], x8[50]);
+  x9[52] = vaddq_s32(x8[52], x8[53]);
+  x9[53] = vsubq_s32(x8[52], x8[53]);
+  x9[54] = vsubq_s32(x8[55], x8[54]);
+  x9[55] = vaddq_s32(x8[55], x8[54]);
+  x9[56] = vaddq_s32(x8[56], x8[57]);
+  x9[57] = vsubq_s32(x8[56], x8[57]);
+  x9[58] = vsubq_s32(x8[59], x8[58]);
+  x9[59] = vaddq_s32(x8[59], x8[58]);
+  x9[60] = vaddq_s32(x8[60], x8[61]);
+  x9[61] = vsubq_s32(x8[60], x8[61]);
+  x9[62] = vsubq_s32(x8[63], x8[62]);
+  x9[63] = vaddq_s32(x8[63], x8[62]);
+
+  // stage 10
+  int32x4_t x10[64];
+  x10[0] = x9[0];
+  x10[1] = x9[1];
+  x10[2] = x9[2];
+  x10[3] = x9[3];
+  x10[4] = x9[4];
+  x10[5] = x9[5];
+  x10[6] = x9[6];
+  x10[7] = x9[7];
+  x10[8] = x9[8];
+  x10[9] = x9[9];
+  x10[10] = x9[10];
+  x10[11] = x9[11];
+  x10[12] = x9[12];
+  x10[13] = x9[13];
+  x10[14] = x9[14];
+  x10[15] = x9[15];
+  x10[16] = x9[16];
+  x10[17] = x9[17];
+  x10[18] = x9[18];
+  x10[19] = x9[19];
+  x10[20] = x9[20];
+  x10[21] = x9[21];
+  x10[22] = x9[22];
+  x10[23] = x9[23];
+  x10[24] = x9[24];
+  x10[25] = x9[25];
+  x10[26] = x9[26];
+  x10[27] = x9[27];
+  x10[28] = x9[28];
+  x10[29] = x9[29];
+  x10[30] = x9[30];
+  x10[31] = x9[31];
+  btf_32_neon_type1(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
+                    v_cos_bit);
+  btf_32_neon_type1(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
+                    v_cos_bit);
+
+  startidx = 0 * outstride;
+  endidx = 63 * outstride;
+  // stage 11
+  output[startidx] = x10[0];
+  output[endidx] = x10[63];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[32];
+  output[endidx] = x10[31];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[16];
+  output[endidx] = x10[47];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[48];
+  output[endidx] = x10[15];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[8];
+  output[endidx] = x10[55];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[40];
+  output[endidx] = x10[23];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[24];
+  output[endidx] = x10[39];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[56];
+  output[endidx] = x10[7];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[4];
+  output[endidx] = x10[59];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[36];
+  output[endidx] = x10[27];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[20];
+  output[endidx] = x10[43];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[52];
+  output[endidx] = x10[11];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[12];
+  output[endidx] = x10[51];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[44];
+  output[endidx] = x10[19];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[28];
+  output[endidx] = x10[35];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[60];
+  output[endidx] = x10[3];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[2];
+  output[endidx] = x10[61];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[34];
+  output[endidx] = x10[29];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[18];
+  output[endidx] = x10[45];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[50];
+  output[endidx] = x10[13];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[10];
+  output[endidx] = x10[53];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[42];
+  output[endidx] = x10[21];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[26];
+  output[endidx] = x10[37];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[58];
+  output[endidx] = x10[5];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[6];
+  output[endidx] = x10[57];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[38];
+  output[endidx] = x10[25];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[22];
+  output[endidx] = x10[41];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[54];
+  output[endidx] = x10[9];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[14];
+  output[endidx] = x10[49];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[46];
+  output[endidx] = x10[17];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[30];
+  output[endidx] = x10[33];
+  startidx += outstride;
+  endidx -= outstride;
+  output[startidx] = x10[62];
+  output[endidx] = x10[1];
+}
+
+void av1_idtx32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit,
+                         const int col_num) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; i++) {
+    output[i * col_num] = vshlq_n_s32(input[i * col_num], 2);
+  }
+}
+
+static const fwd_transform_1d_neon col_highbd_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct32_new_neon,  // DCT_DCT
+  NULL,                 // ADST_DCT
+  NULL,                 // DCT_ADST
+  NULL,                 // ADST_ADST
+  NULL,                 // FLIPADST_DCT
+  NULL,                 // DCT_FLIPADST
+  NULL,                 // FLIPADST_FLIPADST
+  NULL,                 // ADST_FLIPADST
+  NULL,                 // FLIPADST_ADST
+  av1_idtx32_new_neon,  // IDTX
+  NULL,                 // V_DCT
+  NULL,                 // H_DCT
+  NULL,                 // V_ADST
+  NULL,                 // H_ADST
+  NULL,                 // V_FLIPADST
+  NULL                  // H_FLIPADST
+};
+
+static const fwd_transform_1d_neon row_highbd_txfm8x32_arr[TX_TYPES] = {
+  fdct16x16_neon,  // DCT_DCT
+  NULL,            // ADST_DCT
+  NULL,            // DCT_ADST
+  NULL,            // ADST_ADST
+  NULL,            // FLIPADST_DCT
+  NULL,            // DCT_FLIPADST
+  NULL,            // FLIPADST_FLIPADST
+  NULL,            // ADST_FLIPADST
+  NULL,            // FLIPADST_ADST
+  idtx16x16_neon,  // IDTX
+  NULL,            // V_DCT
+  NULL,            // H_DCT
+  NULL,            // V_ADST
+  NULL,            // H_ADST
+  NULL,            // V_FLIPADST
+  NULL             // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int32x4_t in[32], out[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < 2; i++) {
+    load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+    col_txfm(in, in, bit, 2);
+    col_txfm_8x8_rounding(in, &v_shift1);
+    transpose_8x8(in, out + i * 16);
+  }
+
+  if (lr_flip) {
+    flip_buf_neon(in, out, 32);
+    row_txfm(in, out, bit, 2);
+  } else {
+    row_txfm(out, out, bit, 2);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    transpose_8x8(out + i * 16, in);
+    av1_round_shift_rect_array_32_neon(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_16x8(in, coeff + i * 8, 16);
+  }
+}
+
+void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int32x4_t in[32], out[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x8_arr[tx_type];
+  int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+  col_txfm(in, in, bit, 2);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x16_rounding(in, &v_shift1);
+  transpose_8x8(in, out);
+  transpose_8x8(in + 16, out + 16);
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(out + i * 16, out, bit, 2);
+    transpose_8x8(out, in);
+    av1_round_shift_rect_array_32_neon(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_8x8(in, coeff + i * 64);
+  }
+}
+
+static INLINE void transpose_8nx8n(const int32x4_t *input, int32x4_t *output,
+                                   const int width, const int height) {
+  const int numcol = height >> 2;
+  const int numrow = width >> 2;
+  for (int j = 0; j < numrow; j++) {
+    for (int i = 0; i < numcol; i++) {
+      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+                    input[i * width + j + (numrow * 1)],
+                    input[i * width + j + (numrow * 2)],
+                    input[i * width + j + (numrow * 3)],
+                    output[j * height + i + (numcol * 0)],
+                    output[j * height + i + (numcol * 1)],
+                    output[j * height + i + (numcol * 2)],
+                    output[j * height + i + (numcol * 3)]);
+    }
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[16];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int txfm_size_col = tx_size_wide[TX_4X16];
+  const int txfm_size_row = tx_size_high[TX_4X16];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col transform
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_4x16(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  col_txfm(in, outcoeff128, bitcol, 1);
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x8_rounding(outcoeff128, &v_shift1);
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < txfm_size_col; i++) {
+    row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col);
+  }
+}
+#endif
+
+void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[16];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int txfm_size_col = tx_size_wide[TX_16X4];
+  const int txfm_size_row = tx_size_high[TX_16X4];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // col transform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_16x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
+
+  for (int i = 0; i < txfm_size_row; i++) {
+    col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol,
+             1);
+  }
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_8x8_rounding(outcoeff128, &v_shift1);
+
+  // row transform
+  row_txfm(outcoeff128, in, bitrow, 1);
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
+}
+
+void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int32x4_t in[128];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  // column transform
+  load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+  load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+
+  for (int i = 0; i < 4; i++) {
+    col_txfm((in + i), (in + i), bitcol, 4);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  col_txfm_16x16_rounding(&in[64], &v_shift);
+  transpose_8nx8n(in, outcoef128, 16, 32);
+
+  // row transform
+  row_txfm(outcoef128, in, bitrow, 8);
+  transpose_8nx8n(in, outcoef128, 32, 16);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 128, -shift[2],
+                                     NewSqrt2);
+}
+
+void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)tx_type;
+  (void)bd;
+
+  int32x4_t in[512];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
+  const int txw_idx = get_txw_idx(TX_32X64);
+  const int txh_idx = get_txh_idx(TX_32X64);
+  const int txfm_size_col = tx_size_wide[TX_32X64];
+  const int txfm_size_row = tx_size_high[TX_32X64];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int num_row = txfm_size_row >> 2;
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct64_new_neon((in + i), (in + i), bitcol, num_col, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < num_col; i++) {
+    col_txfm_16x16_rounding((in + i * txfm_size_row), &v_shift);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_row; i++) {
+    av1_fdct32_new_neon((outcoef128 + i), (in + i), bitrow, num_row);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 512, -shift[2],
+                                     NewSqrt2);
+}
+
+void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)tx_type;
+  int32x4_t in[512];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
+  const int txw_idx = get_txw_idx(TX_64X32);
+  const int txh_idx = get_txh_idx(TX_64X32);
+  const int txfm_size_col = tx_size_wide[TX_64X32];
+  const int txfm_size_row = tx_size_high[TX_64X32];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int num_row = txfm_size_row >> 2;
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < 32; i++) {
+    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
+                    &v_shift0);
+    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
+                    &v_shift0);
+  }
+
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct32_new_neon((in + i), (in + i), bitcol, num_col);
+  }
+
+  const int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  for (int i = 0; i < num_row; i++) {
+    col_txfm_16x16_rounding((in + i * txfm_size_col), &v_shift1);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_row; i++) {
+    av1_fdct64_new_neon((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 512 >> 1,
+                                     -shift[2], NewSqrt2);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[128];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const fwd_transform_1d_neon col_txfm = row_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
+  col_txfm(in, in, bitcol, 8);
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  col_txfm_16x16_rounding(&in[64], &v_shift);
+  transpose_8nx8n(in, outcoef128, 32, 16);
+
+  // row transform
+  for (int i = 0; i < 4; i++) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, 4);
+  }
+  transpose_8nx8n(in, outcoef128, 16, 32);
+  av1_round_shift_rect_array_32_neon(outcoef128, outcoef128, 128, -shift[2],
+                                     NewSqrt2);
+  (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  int32x4_t in[64];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm32x8_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  const int txfm_size_col = tx_size_wide[TX_8X32];
+  const int txfm_size_row = tx_size_high[TX_8X32];
+  const int num_col = txfm_size_col >> 2;
+
+  // column transform
+  load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
+  load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
+                   stride, 0, 0, shift[0]);
+
+  for (int i = 0; i < num_col; i++) {
+    col_txfm((in + i), (in + i), bitcol, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(in, &v_shift);
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < txfm_size_col; i += 2) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  int32x4_t in[64];
+  int32x4_t *outcoef128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const fwd_transform_1d_neon col_txfm = row_highbd_txfm32x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = col_highbd_txfm8x32_arr[tx_type];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+  const int txfm_size_col = tx_size_wide[TX_32X8];
+  const int txfm_size_row = tx_size_high[TX_32X8];
+  const int num_col = txfm_size_row >> 2;
+
+  // column transform
+  load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
+  for (int i = 0; i < txfm_size_row; i += 2) {
+    col_txfm((in + i), (in + i), bitcol, txfm_size_row);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(&in[0], &v_shift);
+  transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+  // row transform
+  for (int i = 0; i < num_col; i++) {
+    row_txfm((outcoef128 + i), (in + i), bitrow, num_col);
+  }
+  transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+#endif
+
+void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[8];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int txfm_size_col = tx_size_wide[TX_4X8];
+  const int txfm_size_row = tx_size_high[TX_4X8];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x8_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_4x8(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  col_txfm(in, in, bitcol, 1);
+  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_4x8_rounding(in, &v_shift1);
+  transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row);
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2);
+  }
+  av1_round_shift_rect_array_32_neon(in, outcoeff128, txfm_size_row, -shift[2],
+                                     NewSqrt2);
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  int32x4_t in[8];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int txfm_size_col = tx_size_wide[TX_8X4];
+  const int txfm_size_row = tx_size_high[TX_8X4];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const fwd_transform_1d_neon col_txfm = col_highbd_txfm4x4_arr[tx_type];
+  const fwd_transform_1d_neon row_txfm = row_highbd_txfm4x8_arr[tx_type];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col tranform
+  int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  load_buffer_8x4(input, in, stride, ud_flip, lr_flip, &v_shift0);
+  for (int i = 0; i < 2; i++) {
+    col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1);
+  }
+  int32x4_t v_shift1 = vdupq_n_s32(shift[1]);
+  col_txfm_4x8_rounding(in, &v_shift1);
+
+  // row tranform
+  row_txfm(in, outcoeff128, bitrow, 1);
+  av1_round_shift_rect_array_32_neon(outcoeff128, in, txfm_size_col, -shift[2],
+                                     NewSqrt2);
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
+  (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[256];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+  const int txw_idx = get_txw_idx(TX_16X64);
+  const int txh_idx = get_txh_idx(TX_16X64);
+  const int txfm_size_col = tx_size_wide[TX_16X64];
+  const int txfm_size_row = tx_size_high[TX_16X64];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int num_col = txfm_size_col >> 2;
+  // col tranform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < txfm_size_row; i += num_col) {
+    load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+                    ud_flip, lr_flip, &v_shift0);
+  }
+
+  for (int i = 0; i < num_col; i++) {
+    av1_fdct64_new_neon(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+  }
+
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(outcoeff128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+  fdct16x16_neon(in, in, bitrow, 8);
+  transpose_8nx8n(in, outcoeff128, 32, txfm_size_col);
+  memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff));
+  (void)bd;
+}
+
+void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  int32x4_t in[256];
+  int32x4_t *outcoeff128 = (int32x4_t *)coeff;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+  const int txw_idx = get_txw_idx(TX_64X16);
+  const int txh_idx = get_txh_idx(TX_64X16);
+  const int txfm_size_col = tx_size_wide[TX_64X16];
+  const int txfm_size_row = tx_size_high[TX_64X16];
+  int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // col tranform
+  const int32x4_t v_shift0 = vdupq_n_s32(shift[0]);
+  for (int i = 0; i < txfm_size_row; i++) {
+    load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+    load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+                    ud_flip, lr_flip, &v_shift0);
+  }
+
+  fdct16x16_neon(in, outcoeff128, bitcol, txfm_size_row);
+  const int32x4_t v_shift = vdupq_n_s32(shift[1]);
+  col_txfm_16x16_rounding(outcoeff128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 64, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 128, &v_shift);
+  col_txfm_16x16_rounding(outcoeff128 + 192, &v_shift);
+
+  transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+  for (int i = 0; i < 4; i++) {
+    av1_fdct64_new_neon(in + i, in + i, bitrow, 4, 4);
+  }
+  transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
+  (void)bd;
+}
+#endif
+
+static void fdct64_new_neon(int32x4_t *input, int32x4_t *output,
+                            const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 64;
+  const int num_per_128 = 4;
+  int col_num = txfm_size / num_per_128;
+  (void)stage_range;
+  for (int col = 0; col < col_num; col++) {
+    av1_fdct64_new_neon((input + col), (output + col), cos_bit, col_num,
+                        col_num);
+  }
+}
+
+static void fdct32_new_neon(int32x4_t *input, int32x4_t *output,
+                            const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    av1_fdct32_new_neon((input + col), (output + col), cos_bit, col_num);
+  }
+}
+
+static void idtx32x32_neon(int32x4_t *input, int32x4_t *output,
+                           const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+
+  for (int i = 0; i < 8; i++) {
+    av1_idtx32_new_neon(&input[i * 32], &output[i * 32], cos_bit, 1);
+  }
+}
+
+typedef void (*TxfmFuncNEON)(int32x4_t *input, int32x4_t *output,
+                             const int8_t cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncNEON fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT32: return fdct32_new_neon; break;
+    case TXFM_TYPE_DCT64: return fdct64_new_neon; break;
+    case TXFM_TYPE_IDENTITY32: return idtx32x32_neon; break;
+    default: assert(0);
+  }
+  return NULL;
+}
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  int r, c;
+  for (r = 0; r < txfm1d_size; r++) {
+    for (c = 0; c < txfm1d_size; c++) {
+      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+    }
+  }
+}
+
+static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
+                                                 int32x4_t *output,
+                                                 const int size,
+                                                 const int bit) {
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  for (int i = 0; i < size; i++) output[i] = vrshlq_s32(input[i], v_bit);
+}
+
+static INLINE void transpose_32_4x4(int stride, const int32x4_t *input,
+                                    int32x4_t *output) {
+  int32x4x2_t temp01 = vzipq_s32(input[0 * stride], input[2 * stride]);
+  int32x4x2_t temp23 = vzipq_s32(input[1 * stride], input[3 * stride]);
+
+  const int32x4x2_t output01 = vzipq_s32(temp01.val[0], temp23.val[0]);
+  const int32x4x2_t output23 = vzipq_s32(temp01.val[1], temp23.val[1]);
+
+  output[0 * stride] = output01.val[0];
+  output[1 * stride] = output01.val[1];
+  output[2 * stride] = output23.val[0];
+  output[3 * stride] = output23.val[1];
+}
+
+static INLINE void transpose_32(int txfm_size, const int32x4_t *input,
+                                int32x4_t *output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
+
+static INLINE void fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                                         const int stride,
+                                         const TXFM_2D_FLIP_CFG *cfg,
+                                         int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
+  int32x4_t *out_128 = (int32x4_t *)output;
+
+  const int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+  int col_num = txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+                                                        txfm_size);
+  /*col wise transform*/
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+
+  /*row wise transform*/
+  for (int col = 0; col < (col_num >> 1); col++) {
+    av1_fdct64_new_neon((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+                        (col_num >> 1));
+  }
+
+  txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+  av1_round_shift_array_32_neon(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_8nx8n(buf_128, out_128, 32, 32);
+}
+
+static INLINE void fwd_txfm2d_neon(const int16_t *input, int32_t *output,
+                                   const int stride,
+                                   const TXFM_2D_FLIP_CFG *cfg,
+                                   int32_t *txfm_buf) {
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncNEON txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncNEON txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  int32x4_t *buf_128 = (int32x4_t *)txfm_buf;
+  int32x4_t *out_128 = (int32x4_t *)output;
+  int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        txfm_size);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_neon(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+  av1_round_shift_array_32_neon(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+  (void)bd;
+  fwd_txfm2d_neon(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  (void)bd;
+  fwd_txfm2d_64x64_neon(input, output, stride, &cfg, txfm_buf);
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/media/libaom/src/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..0ad11315d4
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+  int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+  out[0] = c0.val[0];
+  out[1] = c0.val[1];
+  out[2] = c1.val[0];
+  out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  // Load the 4x4 source in transposed form.
+  int16x4_t a1, b1, c1, d1, e;
+  a1 = vld1_s16(&input[0]);
+  b1 = vld1_s16(&input[1 * stride]);
+  c1 = vld1_s16(&input[2 * stride]);
+  d1 = vld1_s16(&input[3 * stride]);
+
+  // WHT.
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  int16x8_t x[2];
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  int16x4_t s[4];
+  transpose4x4(x, s);
+
+  a1 = s[0];
+  b1 = s[1];
+  c1 = s[2];
+  d1 = s[3];
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  transpose4x4(x, s);
+
+  vst1q_s32(&output[0], vshll_n_s16(s[0], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[4], vshll_n_s16(s[1], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[8], vshll_n_s16(s[2], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[12], vshll_n_s16(s[3], UNIT_QUANT_SHIFT));
+}
+
+void av1_highbd_fwht4x4_neon(const int16_t *input, tran_low_t *output,
+                             int stride) {
+  av1_fwht4x4_neon(input, output, stride);
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/ml_neon.c b/media/libaom/src/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000000..fcff3a95e8
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+                         const float32x4_t *zero) {
+  *out_h = vmaxq_f32(*out_h, *zero);
+  *out_l = vmaxq_f32(*out_l, *zero);
+}
+
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+  *x = vmaxq_f32(*x, *zero);
+}
+
+#define CLAMP_0(x) (x = x > 0 ? x : 0)
+
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t vadd = zero;
+  float total = *layer_bias;
+
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+  }
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes) {
+  float32x4_t vadd = vdupq_n_f32(0);
+
+  float total = *layer_bias;
+  int j = num_inputs;
+  int in = 0;
+  while (j > 7) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+    in += 8;
+    j -= 8;
+  }
+
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+  for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+  *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_xsto1(int num_inputs, const float *const inputs,
+                               const float *const weights,
+                               const float *layer_bias,
+                               float *const output_nodes) {
+  float total = *layer_bias;
+#if defined(__aarch64__)
+  const float32x4_t v_inputs = vld1q_f32(inputs);
+  const float32x4_t v_weights = vld1q_f32(weights);
+  const float32x4_t vadd = vmulq_f32(v_inputs, v_weights);
+  total += vaddvq_f32(vadd);
+  int in = 4;
+#else
+  int in = 0;
+#endif
+  for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+  *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_4to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t vadd = zero;
+  float total = *layer_bias;
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_inputs = vld1q_f32(&inputs[in]);
+    const float32x4_t v_weights = vld1q_f32(&weights[in]);
+    vadd = vmlaq_f32(vadd, v_inputs, v_weights);
+  }
+
+#if defined(__aarch64__)
+  total += vaddvq_f32(vadd);
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+static void nn_propagate_4to4(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t outputs = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+
+  float32x4_t mul0[2] = { zero, zero };
+  float32x4_t mul1[2] = { zero, zero };
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(&inputs[in]);
+
+    for (int i = 0; i < 2; i++) {
+      const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+      mul0[i] = vmlaq_f32(mul0[i], weight0, v_input);
+      const float32x4_t weight1 =
+          vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+      mul1[i] = vmlaq_f32(mul1[i], weight1, v_input);
+    }
+  }
+  for (int i = 0; i < 2; i++)
+#if defined(__aarch64__)
+    mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+  const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]);
+#else
+    mul0[i] =
+        vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+                     vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+  const float32x4_t hh =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+                   vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+#endif
+
+  outputs = vaddq_f32(outputs, hh);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+static void nn_propagate_4to8(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t out_h = vld1q_f32(&layer_bias[4]);
+  float32x4_t out_l = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t mul0[4] = { zero, zero, zero, zero };
+  float32x4_t mul1[4] = { zero, zero, zero, zero };
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(&inputs[in]);
+    for (int i = 0; i < 4; i++) {
+      const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+      const float32x4_t weight1 =
+          vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+      mul0[i] = vmlaq_f32(mul0[i], v_input, weight0);
+      mul1[i] = vmlaq_f32(mul1[i], v_input, weight1);
+    }
+  }
+  for (int i = 0; i < 4; i++)
+#if defined(__aarch64__)
+    mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+  const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]);
+  const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]);
+#else
+    mul0[i] =
+        vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+                     vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+  const float32x4_t hh0 =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+                   vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+  const float32x4_t hh1 =
+      vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])),
+                   vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3])));
+#endif
+
+  out_h = vaddq_f32(out_h, hh1);
+  out_l = vaddq_f32(out_l, hh0);
+
+  if (!output_layer) nn_activate8(&out_h, &out_l, &zero);
+  vst1q_f32(&output_nodes[4], out_h);
+  vst1q_f32(output_nodes, out_l);
+}
+
+static void nn_propagate_8to4(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t outputs = vld1q_f32(layer_bias);
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t add[4] = { zero, zero, zero, zero };
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+
+    for (int i = 0; i < 4; i++) {
+      const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]);
+      const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]);
+      add[i] = vmlaq_f32(add[i], inputs_l, weight_l);
+      add[i] = vmlaq_f32(add[i], inputs_h, weight_h);
+    }
+  }
+#if defined(__aarch64__)
+  const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]);
+  const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]);
+  const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h);
+#else
+  const float32x4_t hadd_h =
+      vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])),
+                   vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3])));
+  const float32x4_t hadd_l =
+      vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])),
+                   vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1])));
+  const float32x4_t haddhadd =
+      vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)),
+                   vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h)));
+#endif
+
+  outputs = vaddq_f32(outputs, haddhadd);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_neon(const float *input_nodes,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
+                         float *const output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int buf_index = 0;
+  int num_inputs = nn_config->num_inputs;
+  // Hidden layers, except the final iteration is the output layer.
+  for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    bool output_layer = (layer == nn_config->num_hidden_layers);
+    float *const output_nodes = output_layer ? output : buf[buf_index];
+    const int num_outputs = output_layer ? nn_config->num_outputs
+                                         : nn_config->num_hidden_nodes[layer];
+
+    if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out += 8) {
+        nn_propagate_4to8(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_8to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_4to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_8to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_4to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs > 8) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xto1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out]);
+      }
+    } else if (num_inputs >= 4) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xsto1(num_inputs, input_nodes,
+                           &layer_weights[out * num_inputs], &layer_bias[out],
+                           &output_nodes[out]);
+      }
+    } else {
+      for (int node = 0; node < num_outputs; ++node) {
+        float val = layer_bias[node];
+        for (int i = 0; i < num_inputs; ++i)
+          val += layer_weights[node * num_inputs + i] * input_nodes[i];
+        // ReLU as activation function.
+        val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+        output_nodes[node] = val;
+      }
+    }
+    input_nodes = output_nodes;
+    num_inputs = num_outputs;
+    buf_index = 1 - buf_index;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/picksrt_neon.c b/media/libaom/src/av1/encoder/arm/neon/picksrt_neon.c
new file mode 100644
index 0000000000..d69c44166c
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/picksrt_neon.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/restoration.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const int32x4_t zero = vdupq_n_s32(0);
+  uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const int16x8_t flt0_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
+                         vqmovn_s32(vld1q_s32(&flt0[j + 4])));
+        const int16x8_t flt1_16b =
+            vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
+                         vqmovn_s32(vld1q_s32(&flt1[j + 4])));
+        const int16x8_t u0 =
+            vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
+        const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
+        const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
+        const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
+        const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
+        const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
+
+        int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
+        v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
+        int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
+        v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
+                                       vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    for (i = 0; i < height; ++i) {
+      int32x4_t err0 = zero;
+      for (j = 0; j <= width - 8; j += 8) {
+        const uint8x8_t d0 = vld1_u8(&dat[j]);
+        const uint8x8_t s0 = vld1_u8(&src[j]);
+        const uint16x8_t d0s0 = vsubl_u8(d0, s0);
+        const uint16x8x2_t d0w =
+            vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
+
+        const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
+        const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
+
+        int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
+        v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
+                         xq_active << SGRPROJ_RST_BITS);
+        int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
+        v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
+                         xq_active << SGRPROJ_RST_BITS);
+        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
+        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
+        const int16x8_t e0 =
+            vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
+        const int16x4_t e0_lo = vget_low_s16(e0);
+        const int16x4_t e0_hi = vget_high_s16(e0);
+        err0 = vmlal_s16(err0, e0_lo, e0_lo);
+        err0 = vmlal_s16(err0, e0_hi, e0_hi);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq_active * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
+    }
+  } else {
+    uint32x4_t err0 = vreinterpretq_u32_s32(zero);
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j <= width - 16; j += 16) {
+        const uint8x16_t d = vld1q_u8(&dat[j]);
+        const uint8x16_t s = vld1q_u8(&src[j]);
+        const uint8x16_t diff = vabdq_u8(d, s);
+        const uint8x8_t diff0 = vget_low_u8(diff);
+        const uint8x8_t diff1 = vget_high_u8(diff);
+        err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
+        err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t e = dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+    sum64 = vpaddlq_u32(err0);
+  }
+#if defined(__aarch64__)
+  err += vaddvq_u64(sum64);
+#else
+  err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
+#endif  // __aarch64__
+  return err;
+}
diff --git a/media/libaom/src/av1/encoder/arm/neon/quantize_neon.c b/media/libaom/src/av1/encoder/arm/neon/quantize_neon.c
index c2f50a2173..289218dc5d 100644
--- a/media/libaom/src/av1/encoder/arm/neon/quantize_neon.c
+++ b/media/libaom/src/av1/encoder/arm/neon/quantize_neon.c
@@ -13,16 +13,62 @@
 
 #include <math.h>
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_mem/aom_mem.h"
 
 #include "av1/common/quant_common.h"
 #include "av1/common/seg_common.h"
-#include "av1/common/arm/mem_neon.h"
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/rd.h"
 
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#ifdef __aarch64__
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
+                                       tran_low_t *qcoeff_ptr,
+                                       tran_low_t *dqcoeff_ptr,
+                                       int16x8_t v_quant, int16x8_t v_dequant,
+                                       int16x8_t v_round, int16x8_t v_zero) {
+  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+  return v_nz_mask;
+}
+
 void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                           const int16_t *zbin_ptr, const int16_t *round_ptr,
                           const int16_t *quant_ptr,
@@ -38,178 +84,874 @@ void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
 
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
-  int i;
   const int16x8_t v_zero = vdupq_n_s16(0);
-  const int16x8_t v_one = vdupq_n_s16(1);
+  int16x8_t v_quant = vld1q_s16(quant_ptr);
+  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+  int16x8_t v_round = vld1q_s16(round_ptr);
   int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
-  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
-  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
-  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-  // adjust for dc
-  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
-  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
-  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+  uint16x8_t v_nz_mask;
   // process dc and the first seven ac coeffs
-  {
-    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-    const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
-    store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
-    v_round = vmovq_n_s16(round_ptr[1]);
-    v_quant = vmovq_n_s16(quant_ptr[1]);
-    v_dequant = vmovq_n_s16(dequant_ptr[1]);
-  }
+  v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                            v_dequant, v_round, v_zero);
+  v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
+  // overwrite the dc constants with ac constants
+  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+  count -= 8;
   // now process the rest of the ac coeffs
-  for (i = 8; i < count; i += 8) {
-    const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-    const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    store_s16q_to_tran_low(&qcoeff_ptr[i], v_qcoeff);
-    store_s16q_to_tran_low(&dqcoeff_ptr[i], v_dqcoeff);
-  }
-#ifdef __aarch64__
-  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
-#else
-  {
-    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
-                                             vget_high_s16(v_eobmax_76543210));
-    const int64x1_t v_eobmax_xx32 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
-    const int16x4_t v_eobmax_tmp =
-        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
-    const int64x1_t v_eobmax_xxx3 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
-    const int16x4_t v_eobmax_final =
-        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
-    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-  }
-#endif  // __aarch64__
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                              v_dequant, v_round, v_zero);
+    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+    count -= 8;
+  } while (count > 0);
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
-static INLINE void calculate_dqcoeff_lp_and_store(const int16x8_t qcoeff,
-                                                  const int16x8_t dequant,
-                                                  int16_t *dqcoeff) {
-  const int32x4_t dqcoeff_0 =
-      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
-  const int32x4_t dqcoeff_1 =
-      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
-  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
+                                       int16_t *qcoeff_ptr,
+                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
+                                       int16x8_t v_dequant, int16x8_t v_round,
+                                       int16x8_t v_zero) {
+  const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+  vst1q_s16(qcoeff_ptr, v_qcoeff);
+  vst1q_s16(dqcoeff_ptr, v_dqcoeff);
+  return v_nz_mask;
 }
 
-void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t count,
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan) {
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
   const int16x8_t v_zero = vdupq_n_s16(0);
-  const int16x8_t v_one = vdupq_n_s16(1);
+  int16x8_t v_quant = vld1q_s16(quant_ptr);
+  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+  int16x8_t v_round = vld1q_s16(round_ptr);
   int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
-  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
-  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
-  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-
-  // adjust for dc
-  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
-  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
-  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+  uint16x8_t v_nz_mask;
+  intptr_t count = n_coeffs;
+
   // process dc and the first seven ac coeffs
-  {
-    const int16x8_t v_iscan = vld1q_s16(&scan[0]);
-    const int16x8_t v_coeff = vld1q_s16(coeff_ptr);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    vst1q_s16(qcoeff_ptr, v_qcoeff);
-    v_round = vmovq_n_s16(round_ptr[1]);
-    v_quant = vmovq_n_s16(quant_ptr[1]);
-    v_dequant = vmovq_n_s16(dequant_ptr[1]);
-  }
+  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                            v_dequant, v_round, v_zero);
+  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+  // overwrite the dc constants with ac constants
+  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+  count -= 8;
   // now process the rest of the ac coeffs
-  for (int i = 8; i < count; i += 8) {
-    const int16x8_t v_iscan = vld1q_s16(&scan[i]);
-    const int16x8_t v_coeff = vld1q_s16(coeff_ptr + i);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    vst1q_s16(qcoeff_ptr + i, v_qcoeff);
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                              v_dequant, v_round, v_zero);
+    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+    count -= 8;
+  } while (count != 0);
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  int16x8_t round = vdupq_n_s16(rounding[1]);
+  int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+  dequant = vsetq_lane_s16(dequant_ptr[0], dequant, 0);
+
+  int16x8_t coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+
+  int16x8_t abs = vabsq_s16(coeff);
+  uint16x8_t check = vcgeq_s16(abs, vshrq_n_s16(dequant, 2));
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+  if (nz_check) {
+    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    round = vsetq_lane_s16(rounding[0], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[0], quant, 0);
+
+    abs = vqaddq_s16(abs, round);
+    int16x8_t temp = vqdmulhq_s16(abs, quant);
+    int16x8_t qcoeff_temp = vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+    abs = vreinterpretq_s16_u16(
+        vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), 1));
+    int16x8_t dqcoeff_temp = vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    coeff_nz_mask =
+        vbslq_s16(check, dqcoeff_temp, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    round = vsetq_lane_s16(rounding[1], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[1], quant, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+    check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
   }
-#ifdef __aarch64__
-  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
-#else
-  {
-    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
-                                             vget_high_s16(v_eobmax_76543210));
-    const int64x1_t v_eobmax_xx32 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
-    const int16x4_t v_eobmax_tmp =
-        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
-    const int64x1_t v_eobmax_xxx3 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
-    const int16x4_t v_eobmax_final =
-        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
-    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-  }
-#endif  // __aarch64__
+
+  dequant = vsetq_lane_s16(dequant_ptr[1], dequant, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    abs = vabsq_s16(coeff);
+    check = vcgeq_s16(abs, vshrq_n_s16(dequant, 2));
+
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+    if (nz_check) {
+      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+
+      abs = vqaddq_s16(abs, round);
+      int16x8_t temp = vqdmulhq_s16(abs, quant);
+      int16x8_t qcoeff_temp =
+          vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+      abs = vreinterpretq_s16_u16(
+          vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), 1));
+      int16x8_t dqcoeff_temp =
+          vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      coeff_nz_mask = vbslq_s16(check, dqcoeff_temp,
+                                load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+      check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+  const int16x8_t v_log_scale =
+      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t round = vdupq_n_s16(rounding[1]);
+  int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+  dequant = vsetq_lane_s16(dequant_ptr[0], dequant, 0);
+
+  int16x8_t coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t abs = vabsq_s16(coeff);
+  uint16x8_t check = vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(abs), 1),
+                               vshrq_n_u16(vreinterpretq_u16_s16(dequant), 2));
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+  if (nz_check) {
+    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    round = vsetq_lane_s16(rounding[0], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[0], quant, 0);
+    abs = vqaddq_s16(abs, round);
+    int16x8_t temp =
+        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(abs, quant), 1),
+                  vreinterpretq_s16_u16(vshrq_n_u16(
+                      vreinterpretq_u16_s16(vmulq_s16(abs, quant)), 14)));
+    int16x8_t qcoeff_temp = vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+
+    abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), v_log_scale));
+    abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(temp, dequant), 13), abs);
+    int16x8_t dqcoeff_temp = vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    coeff_nz_mask =
+        vbslq_s16(check, dqcoeff_temp, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    round = vsetq_lane_s16(rounding[1], round, 0);
+    quant = vsetq_lane_s16(quant_ptr[1], quant, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+    check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+  }
+
+  dequant = vsetq_lane_s16(dequant_ptr[1], dequant, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    abs = vabsq_s16(coeff);
+    check = vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(abs), 1),
+                      vshrq_n_u16(vreinterpretq_u16_s16(dequant), 2));
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(check)), 0);
+    if (nz_check) {
+      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      abs = vqaddq_s16(abs, round);
+      int16x8_t temp =
+          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(abs, quant), 1),
+                    vreinterpretq_s16_u16(vshrq_n_u16(
+                        vreinterpretq_u16_s16(vmulq_s16(abs, quant)), 14)));
+
+      int16x8_t qcoeff_temp =
+          vsubq_s16(veorq_s16(temp, coeff_sign), coeff_sign);
+
+      abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(temp, dequant)), v_log_scale));
+      abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(temp, dequant), 13), abs);
+
+      int16x8_t dqcoeff_temp =
+          vsubq_s16(veorq_s16(abs, coeff_sign), coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(check, qcoeff_temp, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      coeff_nz_mask = vbslq_s16(check, dqcoeff_temp,
+                                load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(abs, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, check);
+
+      check = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(check, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+
+  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+    vcond = vcgeq_s16(v_abs, vzbins);
+
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+
+      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+#define QM_MULL_SHIFT(x0, x1)                                              \
+  vreinterpretq_s16_u16(vorrq_u16(                                         \
+      vreinterpretq_u16_s16(vshlq_n_s16(                                   \
+          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
+      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
+
+static void aom_quantize_b_helper_16x16_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+
+      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int log_scale = 1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  const int16x8_t v_log_scale = v_eobmax_76543210;
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_64x64_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt, viwt;
+  const int log_scale = 2;
+  const int16x8_t v_log_scale =
+      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    int16x8_t ones =
+        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+    vtmp2 =
+        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    if (iqm_ptr != NULL) {
+      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+    v_deq_abs =
+        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+
+      int16x8_t ones =
+          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+      vtmp2 =
+          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      if (iqm_ptr != NULL) {
+        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+      v_deq_abs =
+          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+void aom_quantize_b_helper_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
+    case 0:
+      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+    case 1:
+      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+    case 2:
+      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                       iscan, qm_ptr, iqm_ptr);
+      break;
+  }
+}
+
+void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 2);
 }
diff --git a/media/libaom/src/av1/encoder/arm/neon/rdopt_neon.c b/media/libaom/src/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000000..25df6b46ea
--- /dev/null
+++ b/media/libaom/src/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include "av1/encoder/rdopt.h"
+#include "config/av1_rtcd.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          int32x4_t *xy_sum_32,
+                                          int32x4_t *xz_sum_32,
+                                          int32x4_t *x_sum_32,
+                                          int32x4_t *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+
+  const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
+  const int16x4_t pixelsa_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
+  const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
+  const int16x4_t pixelsb_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
+  const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
+  const int16x4_t pixelsa_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
+  const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
+  const int16x4_t pixelsb_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));
+
+  const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);
+
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);
+
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);
+
+  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+  // (sum up every element in slli_a and swap_b)
+  *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
+  *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);
+
+  // Also sum their squares
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
+}
+
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+                                          int width, int height, float *hcorr,
+                                          float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y - right neighbour pixel
+  // z - below neighbour pixel
+  // w - down-right neighbour pixel
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t x_sum = 0, x2_sum = 0;
+  int32x4_t zero = vdupq_n_s32(0);
+  int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
+  // Process horizontal and vertical correlations through the body in 4x4
+  // blocks.  This excludes the final row and column and possibly one extra
+  // column depending how 3 divides into width and height
+
+  for (int i = 0; i <= height - 4; i += 3) {
+    int32x4_t xy_sum_32 = zero;
+    int32x4_t xz_sum_32 = zero;
+    int32x4_t x_sum_32 = zero;
+    int32x4_t x2_sum_32 = zero;
+    for (int j = 0; j <= width - 4; j += 3) {
+      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+                             &xz_sum_32, &x_sum_32, &x2_sum_32);
+    }
+    v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
+    v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
+    v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
+    v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
+  }
+#if defined(__aarch64__)
+  xy_sum = vaddvq_s64(v_xy_sum);
+  xz_sum = vaddvq_s64(v_xz_sum);
+  x2_sum = vaddvq_s64(v_x2_sum);
+  x_sum = vaddvq_s64(v_x_sum);
+#else
+  xy_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
+  xz_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
+  x2_sum = vget_lane_s64(
+      vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
+  x_sum =
+      vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
+#endif
+  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+  // Do we have 2 rows remaining or just the one?  Note that width and height
+  // are powers of 2, so each modulo 3 must be 1 or 2.
+  if (height % 3 == 1) {  // Just horiz corrs on the final row
+    const int16_t x0 = diff[(height - 1) * stride];
+    x_sum += x0;
+    x_finalrow += x0;
+    x2_sum += x0 * x0;
+    x2_finalrow += x0 * x0;
+    if (width >= 8) {
+      int32x4_t v_y_sum = zero;
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int k = width - 1;
+      int j = 0;
+      while ((k - 8) > 0) {
+        const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
+        const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+        v_y_sum = vpadalq_s16(v_y_sum, v_y);
+        k -= 8;
+        j += 8;
+      }
+
+      const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j);
+      const int16x8_t v_x =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+      const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
+      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+#if defined(__aarch64__)
+      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+      xy_sum += vaddvq_s64(v_xy_sum2);
+      const int32_t y = vaddvq_s32(v_y_sum_a);
+      const int64_t y2 = vaddvq_s64(v_y2_sum_a);
+#else
+      xy_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+      const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
+      const int64_t y =
+          vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
+      const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
+      int64_t y2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
+#endif
+      x_sum += y;
+      x2_sum += y2;
+      x_finalrow += y;
+      x2_finalrow += y2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = diff[(height - 1) * stride + j];
+        const int16_t y = diff[(height - 1) * stride + j + 1];
+        xy_sum += x * y;
+        x_sum += y;
+        x2_sum += y * y;
+        x_finalrow += y;
+        x2_finalrow += y * y;
+      }
+    }
+  } else {  // Two rows remaining to do
+    const int16_t x0 = diff[(height - 2) * stride];
+    const int16_t z0 = diff[(height - 1) * stride];
+    x_sum += x0 + z0;
+    x2_sum += x0 * x0 + z0 * z0;
+    x_finalrow += z0;
+    x2_finalrow += z0 * z0;
+    if (width >= 8) {
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_w2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int32x4_t v_xz_sum_a = zero;
+      int32x4_t v_x_sum_a = zero;
+      int32x4_t v_w_sum = zero;
+      int k = width - 1;
+      int j = 0;
+      while ((k - 8) > 0) {
+        const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
+        const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
+        const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
+        const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_z_lo = vget_low_s16(v_z);
+        const int16x4_t v_w_lo = vget_low_s16(v_w);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        const int16x4_t v_z_hi = vget_high_s16(v_z);
+        const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+        v_w_sum = vpadalq_s16(v_w_sum, v_w);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+        k -= 8;
+        j += 8;
+      }
+      const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j);
+      const int16x8_t v_x =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j);
+      const int16x8_t v_z =
+          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
+                    vreinterpretq_s16_s32(zero), 1);
+      const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);
+
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_z_lo = vget_low_s16(v_z);
+      const int16x4_t v_w_lo = vget_low_s16(v_w);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      const int16x4_t v_z_hi = vget_high_s16(v_z);
+      const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+      v_w_sum = vpadalq_s16(v_w_sum, v_w);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+#if defined(__aarch64__)
+      xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
+      xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
+      x_sum += vaddvq_s32(v_x_sum_a);
+      x_finalrow += vaddvq_s32(v_w_sum);
+      int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
+      int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
+#else
+      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+      xy_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+      const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
+      xz_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
+      const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
+      x_sum += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
+      const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
+      x_finalrow += vget_lane_s64(
+          vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
+      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+      int64_t y2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
+      const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
+      int64_t w2 = vget_lane_s64(
+          vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
+#endif
+      x2_sum += y2 + w2;
+      x2_finalrow += w2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = diff[(height - 2) * stride + j];
+        const int16_t y = diff[(height - 2) * stride + j + 1];
+        const int16_t z = diff[(height - 1) * stride + j];
+        const int16_t w = diff[(height - 1) * stride + j + 1];
+
+        // Horizontal and vertical correlations for the penultimate row:
+        xy_sum += x * y;
+        xz_sum += x * z;
+
+        // Now just horizontal correlations for the final row:
+        xy_sum += z * w;
+
+        x_sum += y + w;
+        x2_sum += y * y + w * w;
+        x_finalrow += w;
+        x2_finalrow += w * w;
+      }
+    }
+  }
+
+  // Do we have 2 columns remaining or just the one?
+  if (width % 3 == 1) {  // Just vert corrs on the final col
+    const int16_t x0 = diff[width - 1];
+    x_sum += x0;
+    x_finalcol += x0;
+    x2_sum += x0 * x0;
+    x2_finalcol += x0 * x0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 1];
+      xz_sum += x * z;
+      x_finalcol += z;
+      x2_finalcol += z * z;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z;
+        x2_sum += z * z;
+      }
+    }
+  } else {  // Two cols remaining
+    const int16_t x0 = diff[width - 2];
+    const int16_t y0 = diff[width - 1];
+    x_sum += x0 + y0;
+    x2_sum += x0 * x0 + y0 * y0;
+    x_finalcol += y0;
+    x2_finalcol += y0 * y0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 2];
+      const int16_t y = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 2];
+      const int16_t w = diff[(i + 1) * stride + width - 1];
+
+      // Horizontal and vertical correlations for the penultimate col:
+      // Skip these on the last iteration of this loop if we also had two
+      // rows remaining, otherwise the final horizontal and vertical correlation
+      // get erroneously processed twice
+      if (i < height - 2 || height % 3 == 1) {
+        xy_sum += x * y;
+        xz_sum += x * z;
+      }
+
+      x_finalcol += w;
+      x2_finalcol += w * w;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z + w;
+        x2_sum += z * z + w * w;
+      }
+
+      // Now just vertical correlations for the final column:
+      xz_sum += y * w;
+    }
+  }
+
+  // Calculate the simple sums and squared-sums
+  int64_t x_firstrow = 0, x_firstcol = 0;
+  int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+  if (width >= 8) {
+    int32x4_t v_x_firstrow = zero;
+    int32x4_t v_x2_firstrow = zero;
+    for (int j = 0; j < width; j += 8) {
+      const int16x8_t v_diff = vld1q_s16(diff + j);
+      const int16x4_t v_diff_lo = vget_low_s16(v_diff);
+      const int16x4_t v_diff_hi = vget_high_s16(v_diff);
+      v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
+    }
+#if defined(__aarch64__)
+    x_firstrow += vaddvq_s32(v_x_firstrow);
+    x2_firstrow += vaddvq_s32(v_x2_firstrow);
+#else
+    const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
+    x_firstrow += vget_lane_s64(
+        vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
+        0);
+    const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
+    x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
+                                          vget_high_s64(v_x2_firstrow_64)),
+                                 0);
+#endif
+  } else {
+    for (int j = 0; j < width; ++j) {
+      x_firstrow += diff[j];
+      x2_firstrow += diff[j] * diff[j];
+    }
+  }
+  for (int i = 0; i < height; ++i) {
+    x_firstcol += diff[i * stride];
+    x2_firstcol += diff[i * stride] * diff[i * stride];
+  }
+
+  int64_t xhor_sum = x_sum - x_finalcol;
+  int64_t xver_sum = x_sum - x_finalrow;
+  int64_t y_sum = x_sum - x_firstcol;
+  int64_t z_sum = x_sum - x_firstrow;
+  int64_t x2hor_sum = x2_sum - x2_finalcol;
+  int64_t x2ver_sum = x2_sum - x2_finalrow;
+  int64_t y2_sum = x2_sum - x2_firstcol;
+  int64_t z2_sum = x2_sum - x2_firstrow;
+
+  const float num_hor = (float)(height * (width - 1));
+  const float num_ver = (float)((height - 1) * width);
+
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  } else {
+    *hcorr = 1.0;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  } else {
+    *vcorr = 1.0;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/av1_ml_partition_models.h b/media/libaom/src/av1/encoder/av1_ml_partition_models.h
new file mode 100644
index 0000000000..2572b138d5
--- /dev/null
+++ b/media/libaom/src/av1/encoder/av1_ml_partition_models.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(kyslov): Replace with proper weights after training AV1 models
+
+#define FEATURES 6
+static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+  0.35755366f,  0.86281112f,  -0.20871686f, 0.0409634f,   0.97305766f,
+  0.75510254f,  0.04860447f,  0.77095283f,  -0.44105278f, -0.3755049f,
+  -0.08456618f, 1.1821136f,   -0.73956301f, 1.30016453f,  0.45566902f,
+  0.4742967f,   0.44213975f,  0.4876028f,   0.26720522f,  -0.34429858f,
+  -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f,
+  0.60262819f,  -0.54788715f, -0.27272022f, 1.0995462f,   -0.36338376f,
+  -0.64836313f, 0.16057039f,  1.02782791f,  0.9985311f,   0.90607883f,
+  0.80570411f,  -0.07750863f, -0.74006402f, 1.72839526f,  1.72355343f,
+  1.69288916f,  1.59102043f,  0.14140216f,  -1.47262839f, 0.4262519f,
+  -0.33805936f, -0.02449707f, 0.67203692f
+};
+
+static const float av1_var_part_nn_bias_64_layer0[8] = {
+  0.39995694f, 0.65593756f, 1.12876737f,  1.28790576f,
+  0.53468556f, 0.3177908f,  -0.74388266f, -1.81131248f
+};
+
+static const float av1_var_part_nn_weights_64_layer1[8] = {
+  -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f,
+  0.79258322f,  1.74626188f, -5.41831f,   3.33887435f
+};
+
+static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f };
+
+static const float av1_var_part_means_64[FEATURES] = {
+  5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f
+};
+static const float av1_var_part_vars_64[FEATURES] = {
+  0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_64_layer0,
+      av1_var_part_nn_weights_64_layer1,
+  },
+  {
+      av1_var_part_nn_bias_64_layer0,
+      av1_var_part_nn_bias_64_layer1,
+  },
+};
+
+static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+  0.97886049f,  -1.66262011f, 0.94902798f,  0.7080922f,   0.91181186f,
+  0.35222601f,  -0.04428585f, 0.42086472f,  -0.0206325f,  -0.77937809f,
+  -0.70947522f, -1.24463119f, 0.23739497f,  -1.34327359f, 0.01024804f,
+  0.4544633f,   -0.96907661f, 0.67279522f,  0.23180693f,  1.54063368f,
+  -0.15700707f, 0.18597331f,  0.34167589f,  0.40736558f,  0.69213366f,
+  -1.33584593f, 1.21190814f,  1.26725267f,  1.21284802f,  1.26611399f,
+  0.17546514f,  -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f,
+  -1.26890855f, 0.12166347f,  -0.94565678f, -1.47475267f, -0.69279948f,
+  -0.10166587f, -0.23489881f, 0.57123565f,  0.80051137f,  -1.28411946f,
+  -1.36576732f, -1.30257508f, -1.30575106f
+};
+
+static const float av1_var_part_nn_bias_32_layer0[8] = {
+  -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f,
+  -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f
+};
+
+static const float av1_var_part_nn_weights_32_layer1[8] = {
+  1.99257161f,  0.7331492f,  1.33539961f,  1.13501456f,
+  -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f
+};
+
+static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f };
+
+static const float av1_var_part_means_32[FEATURES] = {
+  5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f
+};
+
+static const float av1_var_part_vars_32[FEATURES] = {
+  0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_32_layer0,
+      av1_var_part_nn_weights_32_layer1,
+  },
+  {
+      av1_var_part_nn_bias_32_layer0,
+      av1_var_part_nn_bias_32_layer1,
+  },
+};
+
+static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+  0.45118305f,  -0.22068295f, 0.4604435f,   -0.1446326f,  -0.15765035f,
+  0.42260198f,  -0.0945916f,  0.49544996f,  0.62781567f,  -0.41564372f,
+  -0.39103292f, 0.44407624f,  0.48382613f,  -0.85424238f, -0.00961433f,
+  0.25383582f,  0.14403897f,  0.00901859f,  -0.83201967f, -0.19323284f,
+  0.59271213f,  0.69487457f,  0.6897112f,   0.62768521f,  0.9204492f,
+  -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f,  -0.11289049f,
+  0.26290832f,  -0.41850393f, 0.17239733f,  0.41770622f,  0.43725942f,
+  0.19362467f,  -0.35955731f, -0.899446f,   0.49726389f,  0.66569571f,
+  0.65893982f,  0.53199654f,  -0.1158694f,  -0.26472603f, 0.4155923f,
+  0.15059544f,  0.09596755f,  0.26247133f
+};
+
+static const float av1_var_part_nn_bias_16_layer0[8] = {
+  1.64486321f, -0.11851574f, 1.29322833f,  -0.61193136f,
+  0.33027532f, 1.04197232f,  -0.80716674f, 0.88681233f
+};
+
+static const float av1_var_part_nn_weights_16_layer1[8] = {
+  -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f,
+  -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f
+};
+
+static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f };
+
+static const float av1_var_part_means_16[FEATURES] = {
+  5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f
+};
+
+static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f,
+                                                      0.01958579f, 0.02437927f,
+                                                      0.02420755f, 0.0192003f };
+
+static const NN_CONFIG av1_var_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_var_part_nn_weights_16_layer0,
+      av1_var_part_nn_weights_16_layer1,
+  },
+  {
+      av1_var_part_nn_bias_16_layer0,
+      av1_var_part_nn_bias_16_layer1,
+  },
+};
+
+#undef FEATURES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
diff --git a/media/libaom/src/av1/encoder/av1_multi_thread.c b/media/libaom/src/av1/encoder/av1_multi_thread.c
deleted file mode 100644
index d170b0c282..0000000000
--- a/media/libaom/src/av1/encoder/av1_multi_thread.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/ethread.h"
-#include "av1/encoder/av1_multi_thread.h"
-
-void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) {
-  struct AV1Common *cm = &cpi->common;
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int tile_row, tile_col;
-  const int tile_cols = cm->tiles.cols;
-  const int tile_rows = cm->tiles.rows;
-
-  multi_thread_ctxt->allocated_tile_cols = tile_cols;
-  multi_thread_ctxt->allocated_tile_rows = tile_rows;
-  multi_thread_ctxt->allocated_sb_rows = max_sb_rows;
-
-  // Allocate memory for row based multi-threading
-  for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
-       tile_row++) {
-    for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
-         tile_col++) {
-      TileDataEnc *this_tile =
-          &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
-                          tile_col];
-      av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows);
-      if (cpi->oxcf.cdf_update_mode)
-        CHECK_MEM_ERROR(
-            cm, this_tile->row_ctx,
-            (FRAME_CONTEXT *)aom_memalign(
-                16,
-                AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) -
-                           1)) *
-                    sizeof(*this_tile->row_ctx)));
-    }
-  }
-}
-
-void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int tile_col;
-  int tile_row;
-
-  // Free row based multi-threading sync memory
-  for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
-       tile_row++) {
-    for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
-         tile_col++) {
-      TileDataEnc *this_tile =
-          &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
-                          tile_col];
-      av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
-      if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx);
-    }
-  }
-  multi_thread_ctxt->allocated_sb_rows = 0;
-  multi_thread_ctxt->allocated_tile_cols = 0;
-  multi_thread_ctxt->allocated_tile_rows = 0;
-}
diff --git a/media/libaom/src/av1/encoder/av1_noise_estimate.c b/media/libaom/src/av1/encoder/av1_noise_estimate.c
new file mode 100644
index 0000000000..4419085be4
--- /dev/null
+++ b/media/libaom/src/av1/encoder/av1_noise_estimate.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
+  return (!cpi->ppi->use_svc ||
+          (cpi->ppi->use_svc &&
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+  ne->enabled = 0;
+  ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
+  ne->value = 0;
+  ne->count = 0;
+  ne->thresh = 90;
+  ne->last_w = 0;
+  ne->last_h = 0;
+  if (width * height >= 1920 * 1080) {
+    ne->thresh = 200;
+  } else if (width * height >= 1280 * 720) {
+    ne->thresh = 140;
+  } else if (width * height >= 640 * 360) {
+    ne->thresh = 115;
+  }
+  ne->num_frames_estimate = 15;
+  ne->adapt_thresh = (3 * ne->thresh) >> 1;
+}
+
+static int enable_noise_estimation(AV1_COMP *const cpi) {
+  const int resize_pending = is_frame_resize_pending(cpi);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (cpi->common.seq_params->use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+      cpi->common.width >= 320 && cpi->common.height >= 180)
+    return 1;
+#endif
+  // Only allow noise estimate under certain encoding mode.
+  // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
+  // Not enabled for SVC mode and screen_content_mode.
+  // Not enabled for low resolutions.
+  if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+      resize_pending == 0 && !cpi->ppi->use_svc &&
+      cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+      cpi->common.width * cpi->common.height >= 640 * 360)
+    return 1;
+  else
+    return 0;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+                       const YV12_BUFFER_CONFIG *const src) {
+  const uint8_t *srcbuf = src->y_buffer;
+  uint8_t *destbuf = dest->y_buffer;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  for (int r = 0; r < dest->y_height; ++r) {
+    memcpy(destbuf, srcbuf, dest->y_width);
+    destbuf += dest->y_stride;
+    srcbuf += src->y_stride;
+  }
+}
+#endif  // CONFIG_AV1_TEMPORAL_DENOISING
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  int noise_level = kLowLow;
+  if (ne->value > (ne->thresh << 1)) {
+    noise_level = kHigh;
+  } else {
+    if (ne->value > ne->thresh)
+      noise_level = kMedium;
+    else if (ne->value > (ne->thresh >> 1))
+      noise_level = kLow;
+    else
+      noise_level = kLowLow;
+  }
+  return noise_level;
+}
+
+void av1_update_noise_estimate(AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  // Estimate of noise level every frame_period frames.
+  int frame_period = 8;
+  int thresh_consec_zeromv = 2;
+  int frame_counter = cm->current_frame.frame_number;
+  // Estimate is between current source and last source.
+  YV12_BUFFER_CONFIG *last_source = cpi->last_source;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+    last_source = &cpi->denoiser.last_source;
+    // Tune these thresholds for different resolutions when denoising is
+    // enabled.
+    if (cm->width > 640 && cm->width <= 1920) {
+      thresh_consec_zeromv = 2;
+    }
+  }
+#endif
+  ne->enabled = enable_noise_estimation(cpi);
+  if (cpi->svc.number_spatial_layers > 1)
+    frame_counter = cpi->svc.current_superframe;
+  if (!ne->enabled || frame_counter % frame_period != 0 ||
+      last_source == NULL ||
+      (cpi->svc.number_spatial_layers == 1 &&
+       (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+      copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+    if (last_source != NULL) {
+      ne->last_w = cm->width;
+      ne->last_h = cm->height;
+    }
+    return;
+  } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+             cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+             cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+             cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+    // Force noise estimation to 0 and denoiser off if content has high motion.
+    ne->level = kLowLow;
+    ne->count = 0;
+    ne->num_frames_estimate = 10;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+        cpi->svc.current_superframe > 1) {
+      av1_denoiser_set_noise_level(cpi, ne->level);
+      copy_frame(&cpi->denoiser.last_source, cpi->source);
+    }
+#endif
+    return;
+  } else {
+    unsigned int bin_size = 100;
+    unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+    unsigned int hist_avg[MAX_VAR_HIST_BINS];
+    unsigned int max_bin = 0;
+    unsigned int max_bin_count = 0;
+    unsigned int bin_cnt;
+    int bsize = BLOCK_16X16;
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->source->y_buffer;
+    const int src_ystride = cpi->source->y_stride;
+    const uint8_t *last_src_y = last_source->y_buffer;
+    const int last_src_ystride = last_source->y_stride;
+    int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;
+    for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+      for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) {
+        int bl_index =
+            (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+        if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion <
+        (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3))
+      frame_low_motion = 0;
+    for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+        // 16x16 blocks, 1/4 sample of frame.
+        if (mi_row % 8 == 0 && mi_col % 8 == 0 &&
+            mi_row < mi_params->mi_rows - 3 &&
+            mi_col < mi_params->mi_cols - 3) {
+          int bl_index =
+              (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + (mi_params->mi_cols >> 1);
+          int bl_index3 = bl_index2 + 1;
+          int consec_zeromv =
+              AOMMIN(cpi->consec_zero_mv[bl_index],
+                     AOMMIN(cpi->consec_zero_mv[bl_index1],
+                            AOMMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+          // Only consider blocks that are likely steady background. i.e, have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. And exclude this frame if
+          // high_source_sad is true (i.e., scene/content change).
+          if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+              !cpi->rc.high_source_sad) {
+            unsigned int sse;
+            // Compute variance between co-located blocks from current and
+            // last input frames.
+            unsigned int variance = cpi->ppi->fn_ptr[bsize].vf(
+                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+            unsigned int hist_index = variance / bin_size;
+            if (hist_index < MAX_VAR_HIST_BINS)
+              hist[hist_index]++;
+            else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+              hist[MAX_VAR_HIST_BINS - 1]++;  // Account for the tail
+          }
+        }
+        src_y += 4;
+        last_src_y += 4;
+      }
+      src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
+      last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
+    }
+    ne->last_w = cm->width;
+    ne->last_h = cm->height;
+    // Adjust histogram to account for effect that histogram flattens
+    // and shifts to zero as scene darkens.
+    if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+      hist[0] = 0;
+      hist[1] >>= 2;
+      hist[2] >>= 2;
+      hist[3] >>= 2;
+      hist[4] >>= 1;
+      hist[5] >>= 1;
+      hist[6] = 3 * hist[6] >> 1;
+      hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+    }
+
+    // Average hist[] and find largest bin
+    for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+      if (bin_cnt == 0)
+        hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+        hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+        hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+                             (hist[bin_cnt + 1] >> 1) + 2) >>
+                            2;
+      else
+        hist_avg[bin_cnt] =
+            (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+            2;
+
+      if (hist_avg[bin_cnt] > max_bin_count) {
+        max_bin_count = hist_avg[bin_cnt];
+        max_bin = bin_cnt;
+      }
+    }
+    // Scale by 40 to work with existing thresholds
+    ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+    // Quickly increase VNR strength when the noise level increases suddenly.
+    if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+      ne->count = ne->num_frames_estimate;
+    } else {
+      ne->count++;
+    }
+    if (ne->count == ne->num_frames_estimate) {
+      // Reset counter and check noise level condition.
+      ne->num_frames_estimate = 30;
+      ne->count = 0;
+      ne->level = av1_noise_estimate_extract_level(ne);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+      if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+        av1_denoiser_set_noise_level(cpi, ne->level);
+#endif
+    }
+  }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+    copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+}
diff --git a/media/libaom/src/av1/encoder/av1_noise_estimate.h b/media/libaom/src/av1/encoder/av1_noise_estimate.h
new file mode 100644
index 0000000000..85530666f6
--- /dev/null
+++ b/media/libaom/src/av1/encoder/av1_noise_estimate.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VAR_HIST_BINS 20
+
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+typedef struct noise_estimate {
+  int enabled;
+  NOISE_LEVEL level;
+  int value;
+  int thresh;
+  int adapt_thresh;
+  int count;
+  int last_w;
+  int last_h;
+  int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct AV1_COMP;
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void av1_update_noise_estimate(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
diff --git a/media/libaom/src/av1/encoder/av1_quantize.c b/media/libaom/src/av1/encoder/av1_quantize.c
index 569784a2af..256558e818 100644
--- a/media/libaom/src/av1/encoder/av1_quantize.c
+++ b/media/libaom/src/av1/encoder/av1_quantize.c
@@ -33,6 +33,40 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
   *eob_ptr = 0;
 }
 
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+                               const int16_t dequant_ptr[2],
+                               const int16_t round_ptr[2], int log_scale,
+                               const int16_t *scan, int coeff_count,
+                               const tran_low_t *coeff_ptr,
+                               tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr) {
+  memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr));
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  int eob = 0;
+  for (int i = 0; i < coeff_count; i++) {
+    const int rc = scan[i];
+    const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32 = 0;
+    if ((abs_coeff << (1 + log_scale)) >= thresh) {
+      abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+      tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+      if (tmp32) {
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        const tran_low_t abs_dqcoeff =
+            (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+        dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+      }
+    }
+    if (tmp32) eob = i + 1;
+  }
+  return eob;
+}
+
 static void quantize_fp_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -53,26 +87,9 @@ static void quantize_fp_helper_c(
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (qm_ptr == NULL && iqm_ptr == NULL) {
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = AOMSIGN(coeff);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int tmp32 = 0;
-      if ((abs_coeff << (1 + log_scale)) >= thresh) {
-        abs_coeff =
-            clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
-        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
-        if (tmp32) {
-          qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff =
-              (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
-          dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-        }
-      }
-      if (tmp32) eob = i;
-    }
+    *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+                                          log_scale, scan, (int)n_coeffs,
+                                          coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
   } else {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
@@ -100,8 +117,8 @@ static void quantize_fp_helper_c(
 
       if (tmp32) eob = i;
     }
+    *eob_ptr = eob + 1;
   }
-  *eob_ptr = eob + 1;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -196,7 +213,8 @@ void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                       const int16_t *scan) {
+                       const int16_t *scan, const int16_t *iscan) {
+  (void)iscan;
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -586,7 +604,7 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
-      int qrounding_factor_fp = 64;
+      const int qrounding_factor_fp = 64;
       // y quantizer with TX scale
       quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
                          : av1_ac_quant_QTX(q, 0, bit_depth);
@@ -637,7 +655,8 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
       quants->u_zbin[q][i] = quants->u_zbin[q][1];
       quants->u_round[q][i] = quants->u_round[q][1];
       deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
-      quants->v_quant[q][i] = quants->u_quant[q][1];
+
+      quants->v_quant[q][i] = quants->v_quant[q][1];
       quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
       quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
       quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
@@ -659,27 +678,15 @@ void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
                       quants, dequants);
 }
 
-void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
-                               int segment_id) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonQuantParams *const quant_params = &cm->quant_params;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
-  const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
-
-  const int current_qindex =
-      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
-                       cm->delta_q_info.delta_q_present_flag
-                           ? quant_params->base_qindex + xd->delta_qindex
-                           : quant_params->base_qindex));
-  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
-  const int rdmult =
-      av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
-  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+                     int qindex, MACROBLOCK *x) {
+  const QUANTS *const quants = &enc_quant_dequant_params->quants;
+  const Dequants *const dequants = &enc_quant_dequant_params->dequants;
+  x->qindex = qindex;
+  x->seg_skip_block =
+      0;  // TODO(angiebird): Find a proper place to init this variable.
 
   // Y
-  const int qmlevel_y =
-      use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
   x->plane[0].quant_QTX = quants->y_quant[qindex];
   x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
   x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
@@ -687,16 +694,8 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
   x->plane[0].zbin_QTX = quants->y_zbin[qindex];
   x->plane[0].round_QTX = quants->y_round[qindex];
   x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
-  memcpy(&xd->plane[0].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_y][0],
-         sizeof(quant_params->gqmatrix[qmlevel_y][0]));
-  memcpy(&xd->plane[0].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_y][0],
-         sizeof(quant_params->giqmatrix[qmlevel_y][0]));
 
   // U
-  const int qmlevel_u =
-      use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
   x->plane[1].quant_QTX = quants->u_quant[qindex];
   x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
   x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
@@ -704,15 +703,8 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
   x->plane[1].zbin_QTX = quants->u_zbin[qindex];
   x->plane[1].round_QTX = quants->u_round[qindex];
   x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
-  memcpy(&xd->plane[1].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_u][1],
-         sizeof(quant_params->gqmatrix[qmlevel_u][1]));
-  memcpy(&xd->plane[1].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_u][1],
-         sizeof(quant_params->giqmatrix[qmlevel_u][1]));
+
   // V
-  const int qmlevel_v =
-      use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
   x->plane[2].quant_QTX = quants->v_quant[qindex];
   x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
   x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
@@ -720,44 +712,127 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
   x->plane[2].zbin_QTX = quants->v_zbin[qindex];
   x->plane[2].round_QTX = quants->v_round[qindex];
   x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
-  memcpy(&xd->plane[2].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_v][2],
-         sizeof(quant_params->gqmatrix[qmlevel_v][2]));
-  memcpy(&xd->plane[2].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_v][2],
-         sizeof(quant_params->giqmatrix[qmlevel_v][2]));
-  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
-  x->qindex = qindex;
+}
+
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+                     MACROBLOCKD *xd) {
+  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+  const int qmlevel_y =
+      use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
+  const int qmlevel_u =
+      use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+  const int qmlevel_v =
+      use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+  const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v };
+  for (int i = 0; i < MAX_MB_PLANE; ++i) {
+    const int qmlevel = qmlevel_ls[i];
+    memcpy(&xd->plane[i].seg_qmatrix[segment_id],
+           quant_params->gqmatrix[qmlevel][i],
+           sizeof(quant_params->gqmatrix[qmlevel][i]));
+    memcpy(&xd->plane[i].seg_iqmatrix[segment_id],
+           quant_params->giqmatrix[qmlevel][i],
+           sizeof(quant_params->giqmatrix[qmlevel][i]));
+  }
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+                               int segment_id, const int do_update) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *const quant_params = &cm->quant_params;
+  const int current_qindex = AOMMAX(
+      0,
+      AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+                                   ? quant_params->base_qindex + x->delta_qindex
+                                   : quant_params->base_qindex));
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+  const int rdmult =
+      av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
+  const int qindex_change = x->qindex != qindex;
+  if (qindex_change || do_update) {
+    av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if ((segment_id != x->prev_segment_id) ||
+      av1_use_qmatrix(quant_params, xd, segment_id)) {
+    av1_set_qmatrix(quant_params, segment_id, xd);
+  }
+
+  x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
 
-  set_error_per_bit(x, rdmult);
+  av1_set_error_per_bit(&x->errorperbit, rdmult);
+  av1_set_sad_per_bit(cpi, &x->sadperbit, qindex);
 
-  av1_initialize_me_consts(cpi, x, qindex);
+  x->prev_segment_id = segment_id;
 }
 
 void av1_frame_init_quantizer(AV1_COMP *cpi) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
+  x->prev_segment_id = -1;
+  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1);
+}
+
+static int adjust_hdr_cb_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
+  dqpCb = AOMMIN(0, dqpCb);
+  dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
+  dqpCr = AOMMIN(0, dqpCr);
+  dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCr;
 }
 
 void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
-                       int q) {
+                       int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
   CommonQuantParams *quant_params = &cm->quant_params;
   quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
   quant_params->y_dc_delta_q = 0;
-  quant_params->u_dc_delta_q = 0;
-  quant_params->u_ac_delta_q = 0;
-  quant_params->v_dc_delta_q = 0;
-  quant_params->v_ac_delta_q = 0;
+
+  if (enable_chroma_deltaq) {
+    // TODO(aomedia:2717): need to design better delta
+    quant_params->u_dc_delta_q = 2;
+    quant_params->u_ac_delta_q = 2;
+    quant_params->v_dc_delta_q = 2;
+    quant_params->v_ac_delta_q = 2;
+  } else {
+    quant_params->u_dc_delta_q = 0;
+    quant_params->u_ac_delta_q = 0;
+    quant_params->v_dc_delta_q = 0;
+    quant_params->v_ac_delta_q = 0;
+  }
+
+  // following section 8.3.2 in T-REC-H.Sup15 document
+  // to apply to AV1 qindex in the range of [0, 255]
+  if (enable_hdr_deltaq) {
+    int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+    int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+    quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+    quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+    if (dqpCb != dqpCr) {
+      cm->seq_params->separate_uv_delta_q = 1;
+    }
+  }
+
   quant_params->qmatrix_level_y =
       aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
   quant_params->qmatrix_level_u =
       aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
                       min_qmlevel, max_qmlevel);
 
-  if (!cm->seq_params.separate_uv_delta_q)
+  if (!cm->seq_params->separate_uv_delta_q)
     quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
   else
     quant_params->qmatrix_level_v =
diff --git a/media/libaom/src/av1/encoder/av1_quantize.h b/media/libaom/src/av1/encoder/av1_quantize.h
index 40fb4bee89..701e4cfcb3 100644
--- a/media/libaom/src/av1/encoder/av1_quantize.h
+++ b/media/libaom/src/av1/encoder/av1_quantize.h
@@ -22,9 +22,6 @@
 extern "C" {
 #endif
 
-#define EOB_FACTOR 325
-#define SKIP_EOB_FACTOR_ADJUST 200
-
 typedef struct QUANT_PARAM {
   int log_scale;
   TX_SIZE tx_size;
@@ -97,7 +94,7 @@ struct AV1Common;
 void av1_frame_init_quantizer(struct AV1_COMP *cpi);
 
 void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                               int segment_id);
+                               int segment_id, const int do_update);
 
 void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
                          int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
@@ -109,7 +106,8 @@ void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
                         aom_bit_depth_t bit_depth);
 
 void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
-                       int max_qmlevel, int q);
+                       int max_qmlevel, int q, int enable_chroma_deltaq,
+                       int enable_hdr_deltaq);
 
 int av1_quantizer_to_qindex(int quantizer);
 
@@ -118,6 +116,32 @@ int av1_qindex_to_quantizer(int qindex);
 void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
 
+/*!\brief Quantize transform coefficients without using qmatrix
+ *
+ * quant_ptr, dequant_ptr and round_ptr are size 2 arrays,
+ * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs.
+ *
+ * \param[in]  quant_ptr    16-bit fixed point representation of inverse
+ *                          quantize step size, i.e. 2^16/dequant
+ * \param[in]  dequant_ptr  quantize step size
+ * \param[in]  round_ptr    rounding
+ * \param[in]  log_scale    the relative log scale of the transform
+ *                          coefficients
+ * \param[in]  scan         scan[i] indicates the position of ith to-be-coded
+ *                          coefficient
+ * \param[in]  coeff_count  number of coefficients
+ * \param[out] qcoeff_ptr   quantized coefficients
+ * \param[out] dqcoeff_ptr  dequantized coefficients
+ *
+ * \return The last non-zero coefficient's scan index plus 1
+ */
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+                               const int16_t dequant_ptr[2],
+                               const int16_t round_ptr[2], int log_scale,
+                               const int16_t *scan, int coeff_count,
+                               const tran_low_t *coeff_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr);
+
 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
@@ -133,6 +157,29 @@ void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
+/*!\brief Update quantize parameters in MACROBLOCK
+ *
+ * \param[in]  enc_quant_dequant_params This parameter cached the quantize and
+ *                                      dequantize parameters for all q
+ *                                      indices.
+ * \param[in]  qindex                   Quantize index used for the current
+ *                                      superblock.
+ * \param[out] x                        A superblock data structure for
+ *                                      encoder.
+ */
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+                     int qindex, MACROBLOCK *x);
+
+/*!\brief Update quantize matrix in MACROBLOCKD based on segment id
+ *
+ * \param[in]  quant_params  Quantize parameters used by encoder and decoder
+ * \param[in]  segment_id    Segment id.
+ * \param[out] xd            A superblock data structure used by encoder and
+ * decoder.
+ */
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+                     MACROBLOCKD *xd);
+
 #if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
@@ -154,6 +201,7 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam);
+
 #endif
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/encoder/av1_temporal_denoiser.c b/media/libaom/src/av1/encoder/av1_temporal_denoiser.c
new file mode 100644
index 0000000000..27a12cb034
--- /dev/null
+++ b/media/libaom/src/av1/encoder/av1_temporal_denoiser.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  return 3 + (increase_denoising ? 1 : 0);
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 625;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+                           int motion_magnitude) {
+  if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) {
+    if (increase_denoising)
+      return (1 << num_pels_log2_lookup[bs]) << 2;
+    else
+      return 0;
+  } else {
+    return (1 << num_pels_log2_lookup[bs]) << 4;
+  }
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+
+// TODO(kyslov): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with corresponding sse2 code.
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+                          const uint8_t *mc_avg, int mc_avg_stride,
+                          uint8_t *avg, int avg_stride, int increase_denoising,
+                          BLOCK_SIZE bs, int motion_magnitude) {
+  int r, c;
+  const uint8_t *sig_start = sig;
+  const uint8_t *mc_avg_start = mc_avg;
+  uint8_t *avg_start = avg;
+  int diff, adj, absdiff, delta;
+  int adj_val[] = { 3, 4, 6 };
+  int total_adj = 0;
+  int shift_inc = 1;
+
+  // If motion_magnitude is small, making the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increase denoising.
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
+
+  // First attempt to apply a strong temporal denoising filter.
+  for (r = 0; r < block_size_high[bs]; ++r) {
+    for (c = 0; c < block_size_wide[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        switch (absdiff) {
+          case 4:
+          case 5:
+          case 6:
+          case 7: adj = adj_val[0]; break;
+          case 8:
+          case 9:
+          case 10:
+          case 11:
+          case 12:
+          case 13:
+          case 14:
+          case 15: adj = adj_val[1]; break;
+          default: adj = adj_val[2];
+        }
+        if (diff > 0) {
+          avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = AOMMAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // If the strong filter did not modify the signal too much, we're all set.
+  if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+
+  // Otherwise, we try to dampen the filter if the delta is not too high.
+  delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >>
+           num_pels_log2_lookup[bs]) +
+          1;
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
+    return COPY_BLOCK;
+  }
+
+  mc_avg = mc_avg_start;
+  avg = avg_start;
+  sig = sig_start;
+  for (r = 0; r < block_size_high[bs]; ++r) {
+    for (c = 0; c < block_size_wide[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      adj = abs(diff);
+      if (adj > delta) {
+        adj = delta;
+      }
+      if (diff > 0) {
+        // Diff positive means we made positive adjustment above
+        // (in first try/attempt), so now make negative adjustment to bring
+        // denoised signal down.
+        avg[c] = AOMMAX(0, avg[c] - adj);
+        total_adj -= adj;
+      } else {
+        // Diff negative means we made negative adjustment above
+        // (in first try/attempt), so now make positive adjustment to bring
+        // denoised signal up.
+        avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj);
+        total_adj += adj;
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // We can use the filter if it has been sufficiently dampened
+  if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+  return COPY_BLOCK;
+}
+
+static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
+                            int mi_col) {
+  return framebuf + (stride * mi_row << 2) + (mi_col << 2);
+}
+
+static AV1_DENOISER_DECISION perform_motion_compensation(
+    AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+    int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
+    int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width,
+    int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer,
+    int use_gf_temporal_ref) {
+  const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+                           ? 0
+                           : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+  int frame;
+  int denoise_layer_idx = 0;
+  MACROBLOCKD *filter_mbd = &mb->e_mbd;
+  MB_MODE_INFO *mi = filter_mbd->mi[0];
+  MB_MODE_INFO saved_mi;
+  int i;
+  struct buf_2d saved_dst[MAX_MB_PLANE];
+  struct buf_2d saved_pre[MAX_MB_PLANE];
+  // const RefBuffer *saved_block_refs[2];
+  MV_REFERENCE_FRAME saved_frame;
+
+  frame = ctx->best_reference_frame;
+
+  saved_mi = *mi;
+
+  // Avoid denoising small blocks. When noise > kDenLow or frame width > 480,
+  // denoise 16x16 blocks.
+  if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+      (bs == BLOCK_16X16 && width > 480 &&
+       denoiser->denoising_level <= kDenLow))
+    return COPY_BLOCK;
+
+  // If the best reference frame uses inter-prediction and there is enough of a
+  // difference in sum-squared-error, use it.
+  if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
+      sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+    mi->ref_frame[0] = ctx->best_reference_frame;
+    mi->mode = ctx->best_sse_inter_mode;
+    mi->mv[0] = ctx->best_sse_mv;
+  } else {
+    // Otherwise, use the zero reference frame.
+    frame = ctx->best_zeromv_reference_frame;
+    ctx->newmv_sse = ctx->zeromv_sse;
+    // Bias to last reference.
+    if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+        frame == ALTREF_FRAME ||
+        (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+        (frame != LAST_FRAME &&
+         ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
+          denoiser->denoising_level >= kDenHigh))) {
+      frame = LAST_FRAME;
+      ctx->newmv_sse = ctx->zeromv_lastref_sse;
+    }
+    mi->ref_frame[0] = frame;
+    mi->mode = GLOBALMV;
+    mi->mv[0].as_int = 0;
+    ctx->best_sse_inter_mode = GLOBALMV;
+    ctx->best_sse_mv.as_int = 0;
+    *zeromv_filter = 1;
+    if (denoiser->denoising_level > kDenMedium) {
+      motion_magnitude = 0;
+    }
+  }
+
+  saved_frame = frame;
+  // When using SVC, we need to map REF_FRAME to the frame buffer index.
+  if (use_svc) {
+    if (frame == LAST_FRAME)
+      frame = lst_fb_idx + 1;
+    else if (frame == GOLDEN_FRAME)
+      frame = gld_fb_idx + 1;
+    // Shift for the second spatial layer.
+    if (num_spatial_layers - spatial_layer == 2)
+      frame = frame + denoiser->num_ref_frames;
+    denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+  }
+
+  // Force copy (no denoise, copy source in denoised buffer) if
+  // running_avg_y[frame] is NULL.
+  if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+
+  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+  if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+
+  // We will restore these after motion compensation.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    saved_pre[i] = filter_mbd->plane[i].pre[0];
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
+
+  // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+  // struct.
+  set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+  av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row,
+                       mi_col, filter_mbd->block_ref_scale_factors[0], 1);
+  av1_setup_dst_planes(filter_mbd->plane, bs,
+                       &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row,
+                       mi_col, 0, 1);
+
+  av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col);
+
+  // Restore everything to its original state
+  *mi = saved_mi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    filter_mbd->plane[i].pre[0] = saved_pre[i];
+    filter_mbd->plane[i].dst = saved_dst[i];
+  }
+
+  return FILTER_BLOCK;
+}
+
+void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
+                          BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+                          AV1_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref) {
+  int mv_col, mv_row;
+  int motion_magnitude = 0;
+  int zeromv_filter = 0;
+  AV1_DENOISER *denoiser = &cpi->denoiser;
+  AV1_DENOISER_DECISION decision = COPY_BLOCK;
+
+  const int shift =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+          ? denoiser->num_ref_frames
+          : 0;
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+  const int denoise_layer_index =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
+  uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
+  uint8_t *mc_avg_start =
+      block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
+  struct buf_2d src = mb->plane[0].src;
+  int increase_denoising = 0;
+  int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG;
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+  if (denoiser->denoising_level == kDenHigh) increase_denoising = 1;
+
+  // Copy block if LAST_FRAME is not a reference.
+  // Last doesn't always exist when SVC layers are dynamically changed, e.g. top
+  // spatial layer doesn't have last reference when it's brought up for the
+  // first time on the fly.
+  if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+      !ctx->sb_skip_denoising)
+    decision = perform_motion_compensation(
+        &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+        motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
+        cpi->source->y_width, cpi->svc.ref_idx[0], cpi->svc.ref_idx[3],
+        cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+
+  if (decision == FILTER_BLOCK) {
+    decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
+                                   mc_avg.y_stride, avg_start, avg.y_stride,
+                                   increase_denoising, bs, motion_magnitude);
+  }
+
+  if (decision == FILTER_BLOCK) {
+    aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+                      block_size_wide[bs], block_size_high[bs]);
+  } else {  // COPY_BLOCK
+    aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+                      block_size_wide[bs], block_size_high[bs]);
+  }
+  *denoiser_decision = decision;
+  if (decision == FILTER_BLOCK && zeromv_filter == 1)
+    *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+                       const YV12_BUFFER_CONFIG *const src) {
+  int r;
+  const uint8_t *srcbuf = src->y_buffer;
+  uint8_t *destbuf = dest->y_buffer;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  for (r = 0; r < dest->y_height; ++r) {
+    memcpy(destbuf, srcbuf, dest->y_width);
+    destbuf += dest->y_stride;
+    srcbuf += src->y_stride;
+  }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
+                              YV12_BUFFER_CONFIG *const src) {
+  uint8_t *tmp_buf = dest->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src->y_buffer;
+  src->y_buffer = tmp_buf;
+}
+
+void av1_denoiser_update_frame_info(
+    AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+  const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
+  // Copy source into denoised reference buffers on KEY_FRAME or
+  // if the just encoded frame was resized. For SVC, copy source if the base
+  // spatial layer was key frame.
+  if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+      svc_refresh_denoiser_buffers) {
+    int i;
+    // Start at 1 so as not to overwrite the INTRA_FRAME
+    for (i = 1; i < denoiser->num_ref_frames; ++i) {
+      if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+        copy_frame(&denoiser->running_avg_y[i + shift], &src);
+    }
+    denoiser->reset = 0;
+    return;
+  }
+
+  if (svc->set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (svc->refresh[svc->spatial_layer_id] & (1 << i))
+        copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+    }
+  } else {
+    // If more than one refresh occurs, must copy frame buffer.
+    if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+        1) {
+      if (refresh_alt_ref_frame) {
+        copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+    } else {
+      if (refresh_alt_ref_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+    }
+  }
+}
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+  ctx->zeromv_sse = INT64_MAX;
+  ctx->newmv_sse = INT64_MAX;
+  ctx->zeromv_lastref_sse = INT64_MAX;
+  ctx->best_sse_mv.as_int = 0;
+}
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+                                     PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx) {
+  if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+    ctx->zeromv_sse = sse;
+    ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+    if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse;
+  }
+
+  if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+    ctx->newmv_sse = sse;
+    ctx->best_sse_inter_mode = mode;
+    ctx->best_sse_mv = mi->mv[0];
+    ctx->best_reference_frame = mi->ref_frame[0];
+  }
+}
+
+static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm,
+                                           AV1_DENOISER *denoiser, int fb_idx) {
+  int fail = 0;
+  if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+    fail = aom_alloc_frame_buffer(
+        &denoiser->running_avg_y[fb_idx], cm->width, cm->height,
+        cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+        cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+        cm->features.byte_alignment, 0);
+    if (fail) {
+      av1_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
+  int fail = 0;
+  if (svc->set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (cm->current_frame.frame_type == KEY_FRAME ||
+          svc->refresh[svc->spatial_layer_id] & (1 << i)) {
+        fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+                                               i + 1 + svc_buf_shift);
+      }
+    }
+  } else {
+    if (refresh_alt) {
+      // Increase the frame buffer index by 1 to map it to the buffer index in
+      // the denoiser.
+      fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+                                             alt_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_gld) {
+      fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+                                             gld_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_lst) {
+      fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+                                             lst_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+  }
+  return 0;
+}
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy, int use_highbitdepth, int border) {
+  int i, layer, fail, init_num_ref_frames;
+  const int legacy_byte_alignment = 0;
+  int num_layers = 1;
+  int scaled_width = width;
+  int scaled_height = height;
+  if (use_svc) {
+    LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+                                                svc->number_temporal_layers +
+                                            svc->temporal_layer_id];
+    av1_get_layer_resolution(width, height, lc->scaling_factor_num,
+                             lc->scaling_factor_den, &scaled_width,
+                             &scaled_height);
+    // For SVC: only denoise at most 2 spatial (highest) layers.
+    if (noise_sen >= 2)
+      // Denoise from one spatial layer below the top.
+      svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0);
+    else
+      // Only denoise the top spatial layer.
+      svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0);
+    num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+  }
+  assert(denoiser != NULL);
+  denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+  init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES;
+  denoiser->num_layers = num_layers;
+  CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+                  aom_calloc(denoiser->num_ref_frames * num_layers,
+                             sizeof(denoiser->running_avg_y[0])));
+  CHECK_MEM_ERROR(
+      cm, denoiser->mc_running_avg_y,
+      aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+  for (layer = 0; layer < num_layers; ++layer) {
+    const int denoise_width = (layer == 0) ? width : scaled_width;
+    const int denoise_height = (layer == 0) ? height : scaled_height;
+    for (i = 0; i < init_num_ref_frames; ++i) {
+      fail = aom_alloc_frame_buffer(
+          &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+          denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border,
+          legacy_byte_alignment, 0);
+      if (fail) {
+        av1_denoiser_free(denoiser);
+        return 1;
+      }
+#ifdef OUTPUT_YUV_DENOISED
+      make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+    }
+
+    fail = aom_alloc_frame_buffer(
+        &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx,
+        ssy, use_highbitdepth, border, legacy_byte_alignment, 0);
+    if (fail) {
+      av1_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+
+  // denoiser->last_source only used for noise_estimation, so only for top
+  // layer.
+  fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
+                                use_highbitdepth, border, legacy_byte_alignment,
+                                0);
+  if (fail) {
+    av1_denoiser_free(denoiser);
+    return 1;
+  }
+#ifdef OUTPUT_YUV_DENOISED
+  make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+  denoiser->frame_buffer_initialized = 1;
+  denoiser->denoising_level = kDenMedium;
+  denoiser->prev_denoising_level = kDenMedium;
+  denoiser->reset = 0;
+  denoiser->current_denoiser_frame = 0;
+  return 0;
+}
+
+void av1_denoiser_free(AV1_DENOISER *denoiser) {
+  int i;
+  if (denoiser == NULL) {
+    return;
+  }
+  denoiser->frame_buffer_initialized = 0;
+  for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
+    aom_free_frame_buffer(&denoiser->running_avg_y[i]);
+  }
+  aom_free(denoiser->running_avg_y);
+  denoiser->running_avg_y = NULL;
+
+  for (i = 0; i < denoiser->num_layers; ++i) {
+    aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+  }
+
+  aom_free(denoiser->mc_running_avg_y);
+  denoiser->mc_running_avg_y = NULL;
+  aom_free_frame_buffer(&denoiser->last_source);
+}
+
+// TODO(kyslov) Enable when SVC temporal denosing is implemented
+#if 0
+static void force_refresh_longterm_ref(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // If long term reference is used, force refresh of that slot, so
+  // denoiser buffer for long term reference stays in sync.
+  if (svc->use_gf_temporal_ref_current_layer) {
+    int index = svc->spatial_layer_id;
+    if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+    assert(index >= 0);
+    cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+    cpi->refresh_alt_ref_frame = 1;
+  }
+}
+#endif
+
+void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) {
+  AV1_DENOISER *const denoiser = &cpi->denoiser;
+  denoiser->denoising_level = noise_level;
+  if (denoiser->denoising_level > kDenLowLow &&
+      denoiser->prev_denoising_level == kDenLowLow) {
+    denoiser->reset = 1;
+// TODO(kyslov) Enable when SVC temporal denosing is implemented
+#if 0
+    force_refresh_longterm_ref(cpi);
+#endif
+  } else {
+    denoiser->reset = 0;
+  }
+  denoiser->prev_denoising_level = denoiser->denoising_level;
+}
+
+// Scale/increase the partition threshold
+// for denoiser speed-up.
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+                              CONTENT_STATE_SB content_state,
+                              int temporal_layer_id) {
+  if ((content_state.source_sad_nonrd == kLowSad &&
+       content_state.low_sumdiff) ||
+      (content_state.source_sad_nonrd == kHighSad &&
+       content_state.low_sumdiff) ||
+      (content_state.lighting_change && !content_state.low_sumdiff) ||
+      (noise_level == kDenHigh) || (temporal_layer_id != 0)) {
+    int64_t scaled_thr =
+        (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+    return scaled_thr;
+  } else {
+    return (5 * threshold) >> 2;
+  }
+}
+
+//  Scale/increase the ac skip threshold for
+//  denoiser speed-up.
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+                                AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+                                int temporal_layer_id) {
+  if (noise_level >= kDenLow && abs_sumdiff < 5)
+    return threshold *=
+           (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
+  else
+    return threshold;
+}
+
+void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
+  if (/*av1_denoise_svc_non_key(cpi) &&*/
+      cpi->denoiser.current_denoiser_frame == 0) {
+    cpi->denoiser.reset = 1;
+// TODO(kyslov) Enable when SVC temporal denosing is implemented
+#if 0
+    force_refresh_longterm_ref(cpi);
+#endif
+  }
+}
+
+void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->denoiser.denoising_level > kDenLowLow) {
+    int svc_refresh_denoiser_buffers = 0;
+    int denoise_svc_second_layer = 0;
+    FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME
+                                ? KEY_FRAME
+                                : cm->current_frame.frame_type;
+    cpi->denoiser.current_denoiser_frame++;
+    const int resize_pending = is_frame_resize_pending(cpi);
+
+    if (cpi->ppi->use_svc) {
+// TODO(kyslov) Enable when SVC temporal denosing is implemented
+#if 0
+      const int svc_buf_shift =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2
+              ? cpi->denoiser.num_ref_frames
+              : 0;
+      int layer =
+          LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                           svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      svc_refresh_denoiser_buffers =
+          lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+      denoise_svc_second_layer =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+      // Check if we need to allocate extra buffers in the denoiser
+      // for refreshed frames.
+      if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame, cpi->alt_fb_idx,
+                                   cpi->gld_fb_idx, cpi->lst_fb_idx))
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to re-allocate denoiser for SVC");
+#endif
+    }
+    av1_denoiser_update_frame_info(
+        &cpi->denoiser, *cpi->source, svc, frame_type,
+        cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1,
+        svc->ref_idx[6], svc->ref_idx[3], svc->ref_idx[0], resize_pending,
+        svc_refresh_denoiser_buffers, denoise_svc_second_layer);
+  }
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+  int r, c;
+  uint8_t *u = yuv->u_buffer;
+  uint8_t *v = yuv->v_buffer;
+
+  for (r = 0; r < yuv->uv_height; ++r) {
+    for (c = 0; c < yuv->uv_width; ++c) {
+      u[c] = UINT8_MAX / 2;
+      v[c] = UINT8_MAX / 2;
+    }
+    u += yuv->uv_stride;
+    v += yuv->uv_stride;
+  }
+}
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+  unsigned char *src = s->y_buffer;
+  int h = s->y_crop_height;
+
+  do {
+    fwrite(src, s->y_width, 1, yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_crop_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_crop_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
diff --git a/media/libaom/src/av1/encoder/av1_temporal_denoiser.h b/media/libaom/src/av1/encoder/av1_temporal_denoiser.h
new file mode 100644
index 0000000000..71c8c1c0e9
--- /dev/null
+++ b/media/libaom/src/av1/encoder/av1_temporal_denoiser.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+// Denoiser is used in non svc real-time mode which does not use alt-ref, so no
+// need to allocate for it, and hence we need MAX_REF_FRAME - 1
+#define NONSVC_REF_FRAMES REF_FRAMES - 1
+
+// Number of frame buffers when SVC is used. [0] for current denoised buffer and
+// [1..8] for REF_FRAMES
+#define SVC_REF_FRAMES 9
+
+typedef enum av1_denoiser_decision {
+  COPY_BLOCK,
+  FILTER_BLOCK,
+  FILTER_ZEROMV_BLOCK
+} AV1_DENOISER_DECISION;
+
+typedef enum av1_denoiser_level {
+  kDenLowLow,
+  kDenLow,
+  kDenMedium,
+  kDenHigh
+} AV1_DENOISER_LEVEL;
+
+typedef struct av1_denoiser {
+  YV12_BUFFER_CONFIG *running_avg_y;
+  YV12_BUFFER_CONFIG *mc_running_avg_y;
+  YV12_BUFFER_CONFIG last_source;
+  int frame_buffer_initialized;
+  int reset;
+  int num_ref_frames;
+  int num_layers;
+  unsigned int current_denoiser_frame;
+  AV1_DENOISER_LEVEL denoising_level;
+  AV1_DENOISER_LEVEL prev_denoising_level;
+} AV1_DENOISER;
+
+typedef struct {
+  int64_t zero_last_cost_orig;
+  unsigned int *ref_frame_cost;
+  int_mv (*frame_mv)[REF_FRAMES];
+  int reuse_inter_pred;
+  TX_SIZE best_tx_size;
+  PREDICTION_MODE best_mode;
+  MV_REFERENCE_FRAME best_ref_frame;
+  int_interpfilters best_pred_filter;
+  uint8_t best_mode_skip_txfm;
+} AV1_PICKMODE_CTX_DEN;
+
+struct AV1_COMP;
+struct SVC;
+
+void av1_denoiser_update_frame_info(
+    AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer);
+
+void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row,
+                          int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+                          AV1_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref);
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+                                     PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx);
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy, int use_highbitdepth, int border);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// This function is used by both c and sse2 denoiser implementations.
+// Define it as a static function within the scope where av1_denoiser.h
+// is referenced.
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+                                          int increase_denoising) {
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
+
+void av1_denoiser_free(AV1_DENOISER *denoiser);
+
+void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level);
+
+void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi);
+
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+                              CONTENT_STATE_SB content_state,
+                              int temporal_layer_id);
+
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+                                AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+                                int temporal_layer_id);
+
+void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi);
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
diff --git a/media/libaom/src/av1/encoder/bitstream.c b/media/libaom/src/av1/encoder/bitstream.c
index daa8ce1fc4..38ef8c9149 100644
--- a/media/libaom/src/av1/encoder/bitstream.c
+++ b/media/libaom/src/av1/encoder/bitstream.c
@@ -20,7 +20,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem_ops.h"
-#include "aom_ports/system_state.h"
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG
@@ -41,12 +40,15 @@
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 
 #define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5     // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1  // Job dispatch time overhead per tile
 
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -60,9 +62,11 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) {
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void loop_restoration_write_sb_coeffs(
     const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
     aom_writer *const w, int plane, FRAME_COUNTS *counts);
+#endif
 
 static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
                                              const MB_MODE_INFO *mi,
@@ -145,8 +149,8 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
                                            int blk_row, int blk_col,
                                            aom_writer *w) {
   FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
-  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
-  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+  const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -158,9 +162,9 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
 
   const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
                                          xd->left_txfm_context + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   const int txb_size_index =
-      av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col);
+      av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col);
   const int write_txfm_partition =
       tx_size == mbmi->inter_tx_size[txb_size_index];
   if (write_txfm_partition) {
@@ -183,19 +187,20 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
     }
 
     assert(bsw > 0 && bsh > 0);
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh)
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      const int offsetr = blk_row + row;
       for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        int offsetr = blk_row + row;
-        int offsetc = blk_col + col;
+        const int offsetc = blk_col + col;
         write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
       }
+    }
   }
 }
 
 static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
                                               aom_writer *w) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   if (block_signals_txsize(bsize)) {
     const TX_SIZE tx_size = mbmi->tx_size;
@@ -218,11 +223,11 @@ static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = mi->skip;
-    const int ctx = av1_get_skip_context(xd);
+    const int skip_txfm = mi->skip_txfm;
+    const int ctx = av1_get_skip_txfm_context(xd);
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2);
-    return skip;
+    aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2);
+    return skip_txfm;
   }
 }
 
@@ -234,7 +239,7 @@ static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     return 0;
   }
   const int skip_mode = mi->skip_mode;
-  if (!is_comp_ref_allowed(mi->sb_type)) {
+  if (!is_comp_ref_allowed(mi->bsize)) {
     assert(!skip_mode);
     return 0;
   }
@@ -278,11 +283,11 @@ static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
     case SIMPLE_TRANSLATION: break;
     case OBMC_CAUSAL:
       aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
-                       xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+                       xd->tile_ctx->obmc_cdf[mbmi->bsize], 2);
       break;
     default:
       aom_write_symbol(w, mbmi->motion_mode,
-                       xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+                       xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
                        MOTION_MODES);
   }
 }
@@ -311,14 +316,16 @@ static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
 
 static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
                                            const MACROBLOCKD *xd, int lf_id,
-                                           int delta_lflevel, aom_writer *w) {
+                                           int delta_lflevel,
+                                           int delta_lf_multi, aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
   int rem_bits, thr;
   int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  (void)cm;
 
-  if (cm->delta_q_info.delta_lf_multi) {
+  if (delta_lf_multi) {
     assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
                                                          : FRAME_LF_COUNT - 2));
     aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
@@ -339,22 +346,26 @@ static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
   }
 }
 
-static AOM_INLINE void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp,
-                                       int n, int num) {
-  const TOKENEXTRA *p = *tp;
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
+                                       int n, int num, MapCdf map_pb_cdf) {
+  const TokenExtra *p = *tp;
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
   write_uniform(w, n, p->token);  // The first color index.
   ++p;
   --num;
   for (int i = 0; i < num; ++i) {
-    aom_write_symbol(w, p->token, p->color_map_cdf, n);
+    assert((p->color_ctx >= 0) &&
+           (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS));
+    aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx];
+    aom_write_symbol(w, p->token, color_map_cdf, n);
     ++p;
   }
   *tp = p;
 }
 
 static AOM_INLINE void pack_txb_tokens(
-    aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp,
-    const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+    aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp,
+    const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
     int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
     int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
@@ -364,7 +375,7 @@ static AOM_INLINE void pack_txb_tokens(
 
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -374,7 +385,6 @@ static AOM_INLINE void pack_txb_tokens(
 #if CONFIG_RD_DEBUG
     TOKEN_STATS tmp_token_stats;
     init_token_stats(&tmp_token_stats);
-    token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
     token_stats->cost += tmp_token_stats.cost;
 #endif
   } else {
@@ -382,14 +392,17 @@ static AOM_INLINE void pack_txb_tokens(
     const int bsw = tx_size_wide_unit[sub_txs];
     const int bsh = tx_size_high_unit[sub_txs];
     const int step = bsh * bsw;
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
 
     assert(bsw > 0 && bsh > 0);
 
-    for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
-      for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) {
-        const int offsetr = blk_row + r;
+    for (int r = 0; r < row_end; r += bsh) {
+      const int offsetr = blk_row + r;
+      for (int c = 0; c < col_end; c += bsw) {
         const int offsetc = blk_col + c;
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
         pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
                         bit_depth, block, offsetr, offsetc, sub_txs,
                         token_stats);
@@ -439,27 +452,29 @@ int av1_neg_interleave(int x, int ref, int max) {
   }
 }
 
-static AOM_INLINE void write_segment_id(
-    AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, aom_writer *w,
-    const struct segmentation *seg, struct segmentation_probs *segp, int skip) {
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
+                                        const MB_MODE_INFO *const mbmi,
+                                        aom_writer *w,
+                                        const struct segmentation *seg,
+                                        struct segmentation_probs *segp,
+                                        int skip_txfm) {
   if (!seg->enabled || !seg->update_map) return;
 
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int cdf_num;
   const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
-  if (skip) {
-    // Still need to transmit tx size for intra blocks even if skip is
+  if (skip_txfm) {
+    // Still need to transmit tx size for intra blocks even if skip_txfm is
     // true. Changing segment_id may make the tx size become invalid, e.g
     // changing from lossless to lossy.
     assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
 
-    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
-                           mbmi->sb_type, mi_row, mi_col, pred);
-    set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->sb_type,
+    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+                           mi_row, mi_col, pred);
+    set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize,
                            mi_row, mi_col, pred);
     /* mbmi is read only but we need to update segment_id */
     ((MB_MODE_INFO *)mbmi)->segment_id = pred;
@@ -470,7 +485,7 @@ static AOM_INLINE void write_segment_id(
       av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
   aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
   aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
-  set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->sb_type,
+  set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
                          mi_row, mi_col, mbmi->segment_id);
 }
 
@@ -498,7 +513,7 @@ static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-      if (is_comp_ref_allowed(mbmi->sb_type))
+      if (is_comp_ref_allowed(mbmi->bsize))
         aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
     } else {
       assert((!is_compound) ==
@@ -587,7 +602,7 @@ static AOM_INLINE void write_filter_intra_mode_info(
     aom_writer *w) {
   if (av1_filter_intra_allowed(cm, mbmi)) {
     aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
-                     xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
+                     xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2);
     if (mbmi->filter_intra_mode_info.use_filter_intra) {
       const FILTER_INTRA_MODE mode =
           mbmi->filter_intra_mode_info.filter_intra_mode;
@@ -604,8 +619,8 @@ static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta,
 }
 
 static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
-                                              const MACROBLOCKD *xd,
-                                              aom_writer *w) {
+                                              ThreadData *td, aom_writer *w) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
@@ -624,8 +639,8 @@ static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
           av1_extract_interp_filter(mbmi->interp_filters, dir);
       aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
                        SWITCHABLE_FILTERS);
-      ++cm->cur_frame->interp_filter_selected[filter];
-      if (cm->seq_params.enable_dual_filter == 0) return;
+      ++td->interp_filter_selected[filter];
+      if (cm->seq_params->enable_dual_filter == 0) return;
     }
   }
 }
@@ -753,7 +768,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
                                                const MB_MODE_INFO *const mbmi,
                                                aom_writer *w) {
   const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
@@ -768,7 +783,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
+      write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w);
     }
   }
 
@@ -783,7 +798,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
+      write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w);
     }
   }
 }
@@ -796,7 +811,7 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
   if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 &&
       ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) ||
        (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-      !mbmi->skip &&
+      !mbmi->skip_txfm &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
@@ -865,7 +880,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
 
   // At the start of a superblock, mark that we haven't yet written CDEF
   // strengths for any of the CDEF units contained in this superblock.
-  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int sb_mask = (cm->seq_params->mib_size - 1);
   const int mi_row_in_sb = (xd->mi_row & sb_mask);
   const int mi_col_in_sb = (xd->mi_col & sb_mask);
   if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
@@ -880,7 +895,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
   const int index_mask = cdef_size;
   const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
-  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
 
@@ -900,9 +915,9 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
 }
 
 static AOM_INLINE void write_inter_segment_id(
-    AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg,
-    struct segmentation_probs *const segp, int skip, int preskip) {
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+    AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w,
+    const struct segmentation *const seg, struct segmentation_probs *const segp,
+    int skip, int preskip) {
   MB_MODE_INFO *const mbmi = xd->mi[0];
   AV1_COMMON *const cm = &cpi->common;
   const int mi_row = xd->mi_row;
@@ -914,7 +929,7 @@ static AOM_INLINE void write_inter_segment_id(
     } else {
       if (seg->segid_preskip) return;
       if (skip) {
-        write_segment_id(cpi, mbmi, w, seg, segp, 1);
+        write_segment_id(cpi, xd, mbmi, w, seg, segp, 1);
         if (seg->temporal_update) mbmi->seg_id_predicted = 0;
         return;
       }
@@ -924,42 +939,40 @@ static AOM_INLINE void write_inter_segment_id(
       aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
       aom_write_symbol(w, pred_flag, pred_cdf, 2);
       if (!pred_flag) {
-        write_segment_id(cpi, mbmi, w, seg, segp, 0);
+        write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
       }
       if (pred_flag) {
         set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
-                               mbmi->sb_type, mi_row, mi_col, mbmi->segment_id);
+                               mbmi->bsize, mi_row, mi_col, mbmi->segment_id);
       }
     } else {
-      write_segment_id(cpi, mbmi, w, seg, segp, 0);
+      write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
     }
   }
 }
 
 // If delta q is present, writes delta_q index.
 // Also writes delta_q loop filter levels, if present.
-static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip,
+static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm,
+                                            MACROBLOCKD *const xd, int skip,
                                             aom_writer *w) {
-  AV1_COMMON *const cm = &cpi->common;
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
 
   if (delta_q_info->delta_q_present_flag) {
-    MACROBLOCK *const x = &cpi->td.mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
     const MB_MODE_INFO *const mbmi = xd->mi[0];
-    const BLOCK_SIZE bsize = mbmi->sb_type;
+    const BLOCK_SIZE bsize = mbmi->bsize;
     const int super_block_upper_left =
-        ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
+        ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+        ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
 
-    if ((bsize != cm->seq_params.sb_size || skip == 0) &&
+    if ((bsize != cm->seq_params->sb_size || skip == 0) &&
         super_block_upper_left) {
       assert(mbmi->current_qindex > 0);
       const int reduced_delta_qindex =
-          (mbmi->current_qindex - xd->current_qindex) /
+          (mbmi->current_qindex - xd->current_base_qindex) /
           delta_q_info->delta_q_res;
       write_delta_qindex(xd, reduced_delta_qindex, w);
-      xd->current_qindex = mbmi->current_qindex;
+      xd->current_base_qindex = mbmi->current_qindex;
       if (delta_q_info->delta_lf_present_flag) {
         if (delta_q_info->delta_lf_multi) {
           const int frame_lf_count =
@@ -968,14 +981,14 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip,
             int reduced_delta_lflevel =
                 (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
                 delta_q_info->delta_lf_res;
-            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
+            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w);
             xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
           }
         } else {
           int reduced_delta_lflevel =
               (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
               delta_q_info->delta_lf_res;
-          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
+          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w);
           xd->delta_lf_from_base = mbmi->delta_lf_from_base;
         }
       }
@@ -983,16 +996,14 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip,
   }
 }
 
-static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi,
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
+                                                    MACROBLOCKD *const xd,
                                                     int is_keyframe,
                                                     aom_writer *w) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const PREDICTION_MODE mode = mbmi->mode;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
 
   // Y mode.
   if (is_keyframe) {
@@ -1011,7 +1022,7 @@ static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi,
   }
 
   // UV mode and UV angle delta.
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
     write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
     if (uv_mode == UV_CFL_PRED)
@@ -1073,9 +1084,10 @@ static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) {
                                x->mbmi_ext_frame);
 }
 
-static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
+                                           aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const struct segmentation *const seg = &cm->seg;
@@ -1084,32 +1096,32 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
   const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int allow_hp = cm->features.allow_high_precision_mv;
   const int is_inter = is_inter_block(mbmi);
   const int is_compound = has_second_ref(mbmi);
   int ref;
 
-  write_inter_segment_id(cpi, w, seg, segp, 0, 1);
+  write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1);
 
   write_skip_mode(cm, xd, segment_id, mbmi, w);
 
-  assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+  assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm));
   const int skip =
       mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
 
-  write_inter_segment_id(cpi, w, seg, segp, skip, 0);
+  write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0);
 
   write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, skip, w);
+  write_delta_q_params(cm, xd, skip, w);
 
   if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
 
   if (mbmi->skip_mode) return;
 
   if (!is_inter) {
-    write_intra_prediction_modes(cpi, 0, w);
+    write_intra_prediction_modes(cm, xd, 0, w);
   } else {
     int16_t mode_ctx;
 
@@ -1137,21 +1149,23 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
       for (ref = 0; ref < 1 + is_compound; ++ref) {
         nmv_context *nmvc = &ec_ctx->nmvc;
         const int_mv ref_mv = get_ref_mv(x, ref);
-        av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+        av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
                       allow_hp);
       }
     } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
       const int_mv ref_mv = get_ref_mv(x, 1);
-      av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+      av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
+                    allow_hp);
     } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
       const int_mv ref_mv = get_ref_mv(x, 0);
-      av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+      av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
+                    allow_hp);
     }
 
     if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE &&
-        cpi->common.seq_params.enable_interintra_compound &&
+        cpi->common.seq_params->enable_interintra_compound &&
         is_interintra_allowed(mbmi)) {
       const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
       const int bsize_group = size_group_lookup[bsize];
@@ -1178,7 +1192,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
     // Group B (1): interintra, compound_diffwtd, wedge
     if (has_second_ref(mbmi)) {
       const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                       cm->seq_params.enable_masked_compound;
+                                       cm->seq_params->enable_masked_compound;
 
       if (masked_compound_used) {
         const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
@@ -1192,7 +1206,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
         if (mbmi->compound_idx)
           assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
 
-        if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
+        if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
           const int comp_index_ctx = get_comp_index_context(cm, xd);
           aom_write_symbol(w, mbmi->compound_idx,
                            ec_ctx->compound_index_cdf[comp_index_ctx], 2);
@@ -1225,7 +1239,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
         }
       }
     }
-    write_mb_interp_filter(cm, xd, w);
+    write_mb_interp_filter(cm, td, w);
   }
 }
 
@@ -1255,30 +1269,30 @@ static AOM_INLINE void write_mb_modes_kf(
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, 0);
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
 
   const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
 
   if (!seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, skip);
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, skip);
 
   write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, skip, w);
+  write_delta_q_params(cm, xd, skip, w);
 
   if (av1_allow_intrabc(cm)) {
     write_intrabc_info(xd, mbmi_ext_frame, w);
     if (is_intrabc_block(mbmi)) return;
   }
 
-  write_intra_prediction_modes(cpi, 1, w);
+  write_intra_prediction_modes(cm, xd, 1, w);
 }
 
 #if CONFIG_RD_DEBUG
 static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
   printf("\nmi->mi_row == %d\n", mi->mi_row);
   printf("&& mi->mi_col == %d\n", mi->mi_col);
-  printf("&& mi->sb_type == %d\n", mi->sb_type);
+  printf("&& mi->bsize == %d\n", mi->bsize);
   printf("&& mi->tx_size == %d\n", mi->tx_size);
   printf("&& mi->mode == %d\n", mi->mode);
 }
@@ -1286,24 +1300,8 @@ static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
 static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
                                    int plane) {
   if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
-    int r, c;
     printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
            plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
-    printf("rd txb_coeff_cost_map\n");
-    for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
-      for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
-        printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]);
-      }
-      printf("\n");
-    }
-
-    printf("pack txb_coeff_cost_map\n");
-    for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
-      for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
-        printf("%d ", token_stats->txb_coeff_cost_map[r][c]);
-      }
-      printf("\n");
-    }
     return 1;
   }
   return 0;
@@ -1324,7 +1322,7 @@ static AOM_INLINE void enc_dump_logs(
 #define FRAME_TO_CHECK 11
     if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
         cm->show_frame == 1) {
-      const BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE bsize = mbmi->bsize;
 
       int_mv mv[2] = { 0 };
       const int is_comp_ref = has_second_ref(mbmi);
@@ -1367,13 +1365,14 @@ static AOM_INLINE void enc_dump_logs(
 }
 #endif  // ENC_MISMATCH_DEBUG
 
-static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) {
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td,
+                                    aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   MB_MODE_INFO *m = xd->mi[0];
 
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w);
+    write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w);
   } else {
     // has_subpel_mv_component needs the ref frame buffers set up to look
     // up if they are scaled. has_subpel_mv_component is in turn needed by
@@ -1384,18 +1383,18 @@ static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) {
     enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
 #endif  // ENC_MISMATCH_DEBUG
 
-    pack_inter_mode_mvs(cpi, w);
+    pack_inter_mode_mvs(cpi, td, w);
   }
 }
 
 static AOM_INLINE void write_inter_txb_coeff(
     AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
-    aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+    aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end,
     TOKEN_STATS *token_stats, const int row, const int col, int *block,
     const int plane) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   assert(bsize < BLOCK_SIZES_ALL);
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
@@ -1417,27 +1416,26 @@ static AOM_INLINE void write_inter_txb_coeff(
   for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) {
     for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) {
       pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
-                      cm->seq_params.bit_depth, *block, blk_row, blk_col,
+                      cm->seq_params->bit_depth, *block, blk_row, blk_col,
                       max_tx_size, token_stats);
       *block += step;
     }
   }
 }
 
-static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
-                                      const TOKENEXTRA **tok,
-                                      const TOKENEXTRA *const tok_end) {
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x,
+                                      aom_writer *w, const TokenExtra **tok,
+                                      const TokenExtra *const tok_end) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
 
-  assert(!mbmi->skip);
+  assert(!mbmi->skip_txfm);
 
   const int is_inter = is_inter_block(mbmi);
   if (!is_inter) {
-    av1_write_coeffs_mb(cm, x, w, bsize);
+    av1_write_intra_coeffs_mb(cm, x, w, bsize);
   } else {
     int block[MAX_MB_PLANE] = { 0 };
     assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
@@ -1468,7 +1466,7 @@ static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
     }
 #if CONFIG_RD_DEBUG
     for (int plane = 0; plane < num_planes; ++plane) {
-      if (mbmi->sb_type >= BLOCK_8X8 &&
+      if (mbmi->bsize >= BLOCK_8X8 &&
           rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
         dump_mode_info(mbmi);
         assert(0);
@@ -1478,16 +1476,18 @@ static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
   }
 }
 
-static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
-                                     aom_writer *w, const TOKENEXTRA **tok,
-                                     const TOKENEXTRA *const tok_end,
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
+                                     const TileInfo *const tile, aom_writer *w,
+                                     const TokenExtra **tok,
+                                     const TokenExtra *const tok_end,
                                      int mi_row, int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *xd = &td->mb.e_mbd;
+  FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
   const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
   xd->mi = mi_params->mi_grid_base + grid_idx;
-  cpi->td.mb.mbmi_ext_frame =
+  td->mb.mbmi_ext_frame =
       cpi->mbmi_ext_info.frame_base +
       get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
                      cpi->mbmi_ext_info.stride);
@@ -1495,8 +1495,8 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
   xd->tx_type_map_stride = mi_params->mi_stride;
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(bsize <= cm->seq_params.sb_size ||
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  assert(bsize <= cm->seq_params->sb_size ||
          (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
 
   const int bh = mi_size_high[bsize];
@@ -1508,7 +1508,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
-  write_mbmi_b(cpi, w);
+  write_mbmi_b(cpi, td, w);
 
   for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
     const uint8_t palette_size_plane =
@@ -1517,21 +1517,23 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
     if (palette_size_plane > 0) {
       assert(mbmi->use_intrabc == 0);
       assert(av1_allow_palette(cm->features.allow_screen_content_tools,
-                               mbmi->sb_type));
+                               mbmi->bsize));
       assert(!plane || xd->is_chroma_ref);
       int rows, cols;
-      av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+      av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows,
                                &cols);
       assert(*tok < tok_end);
-      pack_map_tokens(w, tok, palette_size_plane, rows * cols);
+      MapCdf map_pb_cdf = plane ? tile_ctx->palette_uv_color_index_cdf
+                                : tile_ctx->palette_y_color_index_cdf;
+      pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf);
     }
   }
 
   const int is_inter_tx = is_inter_block(mbmi);
-  const int skip = mbmi->skip;
+  const int skip_txfm = mbmi->skip_txfm;
   const int segment_id = mbmi->segment_id;
   if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
-      !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
+      !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
     if (is_inter_tx) {  // This implies skip flag is 0.
       const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
       const int txbh = tx_size_high_unit[max_tx_size];
@@ -1548,12 +1550,17 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
       set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
     }
   } else {
-    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip && is_inter_tx,
-                  xd);
+    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+                  skip_txfm && is_inter_tx, xd);
   }
 
-  if (!mbmi->skip) {
-    write_tokens_b(cpi, w, tok, tok_end);
+  if (!mbmi->skip_txfm) {
+    int start = aom_tell_size(w);
+
+    write_tokens_b(cpi, &td->mb, w, tok, tok_end);
+
+    const int end = aom_tell_size(w);
+    td->coefficient_size += end - start;
   }
 }
 
@@ -1595,12 +1602,12 @@ static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
 }
 
 static AOM_INLINE void write_modes_sb(
-    AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w,
-    const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row,
-    int mi_col, BLOCK_SIZE bsize) {
+    AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile,
+    aom_writer *const w, const TokenExtra **tok,
+    const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   assert(bsize < BLOCK_SIZES_ALL);
   const int hbs = mi_size_wide[bsize] / 2;
   const int quarter_step = mi_size_wide[bsize] / 4;
@@ -1610,6 +1617,7 @@ static AOM_INLINE void write_modes_sb(
 
   if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
 
+#if !CONFIG_REALTIME_ONLY
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; ++plane) {
     int rcol0, rcol1, rrow0, rrow1;
@@ -1621,61 +1629,63 @@ static AOM_INLINE void write_modes_sb(
           const int runit_idx = rcol + rrow * rstride;
           const RestorationUnitInfo *rui =
               &cm->rst_info[plane].unit_info[runit_idx];
-          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
-                                           cpi->td.counts);
+          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts);
         }
       }
     }
   }
+#endif
 
   write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
   switch (partition) {
     case PARTITION_NONE:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       if (mi_row + hbs < mi_params->mi_rows)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
       break;
     case PARTITION_VERT:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       if (mi_col + hbs < mi_params->mi_cols)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
       break;
     case PARTITION_SPLIT:
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+                     subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+                     subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
                      subsize);
       break;
     case PARTITION_HORZ_A:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
       break;
     case PARTITION_HORZ_B:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
       break;
     case PARTITION_VERT_A:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
       break;
     case PARTITION_VERT_B:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
       break;
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
         if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
-        write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col);
       }
       break;
     case PARTITION_VERT_4:
@@ -1683,7 +1693,7 @@ static AOM_INLINE void write_modes_sb(
         int this_mi_col = mi_col + i * quarter_step;
         if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
 
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col);
       }
       break;
     default: assert(0);
@@ -1693,12 +1703,28 @@ static AOM_INLINE void write_modes_sb(
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static AOM_INLINE void write_modes(AV1_COMP *const cpi,
+// Populate token pointers appropriately based on token_info.
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info,
+                                          const int tile_row, int tile_col,
+                                          const int sb_row_in_tile,
+                                          const TokenExtra **tok,
+                                          const TokenExtra **tok_end) {
+  if (!is_token_info_allocated(token_info)) {
+    *tok = NULL;
+    *tok_end = NULL;
+    return;
+  }
+  *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start;
+  *tok_end =
+      *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count;
+}
+
+static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td,
                                    const TileInfo *const tile,
                                    aom_writer *const w, int tile_row,
                                    int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   const int mi_row_start = tile->mi_row_start;
   const int mi_row_end = tile->mi_row_end;
   const int mi_col_start = tile->mi_col_start;
@@ -1709,37 +1735,38 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi,
   av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
 
   if (cpi->common.delta_q_info.delta_q_present_flag) {
-    xd->current_qindex = cpi->common.quant_params.base_qindex;
+    xd->current_base_qindex = cpi->common.quant_params.base_qindex;
     if (cpi->common.delta_q_info.delta_lf_present_flag) {
       av1_reset_loop_filter_delta(xd, num_planes);
     }
   }
 
   for (int mi_row = mi_row_start; mi_row < mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     const int sb_row_in_tile =
-        (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
-    const TOKENEXTRA *tok =
-        cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
-    const TOKENEXTRA *tok_end =
-        tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+        (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2;
+    const TokenInfo *token_info = &cpi->token_info;
+    const TokenExtra *tok;
+    const TokenExtra *tok_end;
+    get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok,
+                       &tok_end);
 
     av1_zero_left_context(xd);
 
     for (int mi_col = mi_col_start; mi_col < mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
-      cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
-      write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
-                     cm->seq_params.sb_size);
+         mi_col += cm->seq_params->mib_size) {
+      td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+      write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col,
+                     cm->seq_params->sb_size);
     }
-    assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
+    assert(tok == tok_end);
   }
 }
 
 static AOM_INLINE void encode_restoration_mode(
     AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   assert(!cm->features.all_lossless);
-  if (!cm->seq_params.enable_restoration) return;
+  if (!cm->seq_params->enable_restoration) return;
   if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int all_none = 1, chroma_none = 1;
@@ -1770,9 +1797,9 @@ static AOM_INLINE void encode_restoration_mode(
     }
   }
   if (!all_none) {
-    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-           cm->seq_params.sb_size == BLOCK_128X128);
-    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+           cm->seq_params->sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
 
     RestorationInfo *rsi = &cm->rst_info[0];
 
@@ -1788,7 +1815,8 @@ static AOM_INLINE void encode_restoration_mode(
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+    int s =
+        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
     if (s && !chroma_none) {
       aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
                                cm->rst_info[0].restoration_unit_size);
@@ -1807,6 +1835,7 @@ static AOM_INLINE void encode_restoration_mode(
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void write_wiener_filter(int wiener_win,
                                            const WienerInfo *wiener_info,
                                            WienerInfo *ref_wiener_info,
@@ -1888,7 +1917,7 @@ static AOM_INLINE void loop_restoration_write_sb_coeffs(
     aom_writer *const w, int plane, FRAME_COUNTS *counts) {
   const RestorationInfo *rsi = cm->rst_info + plane;
   RestorationType frame_rtype = rsi->frame_restoration_type;
-  if (frame_rtype == RESTORE_NONE) return;
+  assert(frame_rtype != RESTORE_NONE);
 
   (void)counts;
   assert(!cm->features.all_lossless);
@@ -1933,6 +1962,7 @@ static AOM_INLINE void loop_restoration_write_sb_coeffs(
     }
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 // Only write out the ref delta section if any of the elements
 // will signal a delta.
@@ -2019,7 +2049,7 @@ static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
 static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
                                    struct aom_write_bit_buffer *wb) {
   assert(!cm->features.coded_lossless);
-  if (!cm->seq_params.enable_cdef) return;
+  if (!cm->seq_params->enable_cdef) return;
   if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int i;
@@ -2072,7 +2102,7 @@ static AOM_INLINE void encode_quantization(
   }
 }
 
-static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+static AOM_INLINE void encode_segmentation(AV1_COMMON *cm,
                                            struct aom_write_bit_buffer *wb) {
   int i, j;
   struct segmentation *seg = &cm->seg;
@@ -2081,17 +2111,9 @@ static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
   if (!seg->enabled) return;
 
   // Write update flags
-  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
-    assert(seg->update_map == 1);
-    seg->temporal_update = 0;
-    assert(seg->update_data == 1);
-  } else {
+  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
     aom_wb_write_bit(wb, seg->update_map);
-    if (seg->update_map) {
-      // Select the coding strategy (temporal or spatial)
-      av1_choose_segmap_coding_method(cm, xd);
-      aom_wb_write_bit(wb, seg->temporal_update);
-    }
+    if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update);
     aom_wb_write_bit(wb, seg->update_data);
   }
 
@@ -2141,12 +2163,10 @@ static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
 
 static AOM_INLINE void write_tile_info_max_tile(
     const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
-  int width_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2);
-  int height_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
-  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
-  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+  int width_sb =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+  int height_sb =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
   int size_sb, i;
   const CommonTileParams *const tiles = &cm->tiles;
 
@@ -2223,13 +2243,6 @@ static AOM_INLINE void write_ext_tile_info(
   }
 }
 
-// Stores the location and size of a tile's data in the bitstream.  Used for
-// later identifying identical tiles
-typedef struct TileBufferEnc {
-  uint8_t *data;
-  size_t size;
-} TileBufferEnc;
-
 static INLINE int find_identical_tile(
     const int tile_row, const int tile_col,
     TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
@@ -2293,7 +2306,7 @@ static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
 
 static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
                                             struct aom_write_bit_buffer *wb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!seq_params->enable_superres) {
     assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
     return;
@@ -2320,7 +2333,7 @@ static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
   const int coded_height = cm->superres_upscaled_height - 1;
 
   if (frame_size_override) {
-    const SequenceHeader *seq_params = &cm->seq_params;
+    const SequenceHeader *seq_params = cm->seq_params;
     int num_bits_width = seq_params->num_bits_width;
     int num_bits_height = seq_params->num_bits_height;
     aom_wb_write_literal(wb, coded_width, num_bits_width);
@@ -2478,14 +2491,13 @@ static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm,
                                          struct aom_write_bit_buffer *wb) {
   aom_wb_write_unsigned_literal(
       wb, cm->frame_presentation_time,
-      cm->seq_params.decoder_model_info.frame_presentation_time_length);
+      cm->seq_params->decoder_model_info.frame_presentation_time_length);
 }
 
 static AOM_INLINE void write_film_grain_params(
     const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) {
   const AV1_COMMON *const cm = &cpi->common;
   const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
-
   aom_wb_write_bit(wb, pars->apply_grain);
   if (!pars->apply_grain) return;
 
@@ -2501,7 +2513,7 @@ static AOM_INLINE void write_film_grain_params(
       assert(ref_idx != INVALID_IDX);
       const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
       if (buf->film_grain_params_present &&
-          av1_check_grain_params_equiv(pars, &buf->film_grain_params)) {
+          aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
         break;
       }
     }
@@ -2517,15 +2529,15 @@ static AOM_INLINE void write_film_grain_params(
     aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
   }
 
-  if (!cm->seq_params.monochrome) {
+  if (!cm->seq_params->monochrome) {
     aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
   } else {
     assert(!pars->chroma_scaling_from_luma);
   }
 
-  if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
-      ((cm->seq_params.subsampling_x == 1) &&
-       (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
+  if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
+      ((cm->seq_params->subsampling_x == 1) &&
+       (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
     assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
   } else {
     aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
@@ -2821,12 +2833,11 @@ static int check_frame_refs_short_signaling(AV1_COMMON *const cm) {
 
 // New function based on HLS R18
 static AOM_INLINE void write_uncompressed_header_obu(
-    AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb,
+    AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb,
     struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const CommonQuantParams *quant_params = &cm->quant_params;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
 
@@ -2905,7 +2916,7 @@ static AOM_INLINE void write_uncompressed_header_obu(
 
     if (cm->superres_upscaled_width > seq_params->max_frame_width ||
         cm->superres_upscaled_height > seq_params->max_frame_height) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Frame dimensions are larger than the maximum values");
     }
 
@@ -2927,24 +2938,24 @@ static AOM_INLINE void write_uncompressed_header_obu(
   }
 
   if (seq_params->decoder_model_info_present_flag) {
-    aom_wb_write_bit(wb, cm->buffer_removal_time_present);
-    if (cm->buffer_removal_time_present) {
+    aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present);
+    if (cpi->ppi->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
-          if (((seq_params->operating_point_idc[op_num] >>
+          if (seq_params->operating_point_idc[op_num] == 0 ||
+              ((seq_params->operating_point_idc[op_num] >>
                 cm->temporal_layer_id) &
                    0x1 &&
                (seq_params->operating_point_idc[op_num] >>
                 (cm->spatial_layer_id + 8)) &
-                   0x1) ||
-              seq_params->operating_point_idc[op_num] == 0) {
+                   0x1)) {
             aom_wb_write_unsigned_literal(
                 wb, cm->buffer_removal_times[op_num],
                 seq_params->decoder_model_info.buffer_removal_time_length);
             cm->buffer_removal_times[op_num]++;
             if (cm->buffer_removal_times[op_num] == 0) {
-              aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+              aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                                  "buffer_removal_time overflowed");
             }
           }
@@ -3031,7 +3042,7 @@ static AOM_INLINE void write_uncompressed_header_obu(
               1;
           if (delta_frame_id_minus_1 < 0 ||
               delta_frame_id_minus_1 >= (1 << diff_len)) {
-            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+            aom_internal_error(cm->error, AOM_CODEC_ERROR,
                                "Invalid delta_frame_id_minus_1");
           }
           aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
@@ -3068,8 +3079,8 @@ static AOM_INLINE void write_uncompressed_header_obu(
 
   write_tile_info(cm, saved_wb, wb);
   encode_quantization(quant_params, av1_num_planes(cm),
-                      cm->seq_params.separate_uv_delta_q, wb);
-  encode_segmentation(cm, xd, wb);
+                      cm->seq_params->separate_uv_delta_q, wb);
+  encode_segmentation(cm, wb);
 
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
@@ -3077,7 +3088,7 @@ static AOM_INLINE void write_uncompressed_header_obu(
     aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
     if (delta_q_info->delta_q_present_flag) {
       aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
-      xd->current_qindex = quant_params->base_qindex;
+      xd->current_base_qindex = quant_params->base_qindex;
       if (features->allow_intrabc)
         assert(delta_q_info->delta_lf_present_flag == 0);
       else
@@ -3268,11 +3279,11 @@ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst,
 }
 
 uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
-                              OBU_TYPE obu_type, int obu_extension,
-                              uint8_t *const dst) {
+                              int *frame_header_count, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst) {
   if (level_params->keep_level_stats &&
       (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
-    ++level_params->frame_header_count;
+    ++(*frame_header_count);
 
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
@@ -3306,8 +3317,8 @@ int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
   return AOM_CODEC_OK;
 }
 
-static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size,
-                          uint8_t *data) {
+size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size,
+                       uint8_t *data) {
   const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
   const size_t move_dst_offset = length_field_size + obu_header_size;
   const size_t move_src_offset = obu_header_size;
@@ -3406,12 +3417,12 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
   return size;
 }
 
-static uint32_t write_frame_header_obu(AV1_COMP *cpi,
+static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd,
                                        struct aom_write_bit_buffer *saved_wb,
                                        uint8_t *const dst,
                                        int append_trailing_bits) {
   struct aom_write_bit_buffer wb = { dst, 0 };
-  write_uncompressed_header_obu(cpi, saved_wb, &wb);
+  write_uncompressed_header_obu(cpi, xd, saved_wb, &wb);
   if (append_trailing_bits) add_trailing_bits(&wb);
   return aom_wb_bytes_written(&wb);
 }
@@ -3435,356 +3446,583 @@ static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile,
   return size;
 }
 
-typedef struct {
-  uint8_t *frame_header;
-  size_t obu_header_byte_offset;
-  size_t total_length;
-} FrameHeaderInfo;
-
 extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
                                                 const char *filename);
 
-static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
-                                       struct aom_write_bit_buffer *saved_wb,
-                                       uint8_t obu_extension_header,
-                                       const FrameHeaderInfo *fh_info,
-                                       int *const largest_tile_id) {
+typedef struct {
+  uint32_t tg_hdr_size;
+  uint32_t frame_header_size;
+} LargeTileFrameOBU;
+
+// Initialize OBU header for large scale tile case.
+static uint32_t init_large_scale_tile_obu_header(
+    AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb,
+    LargeTileFrameOBU *lst_obu) {
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
+  CurrentFrame *const current_frame = &cpi->common.current_frame;
+  // For large_scale_tile case, we always have only one tile group, so it can
+  // be written as an OBU_FRAME.
+  const OBU_TYPE obu_type = OBU_FRAME;
+  lst_obu->tg_hdr_size = av1_write_obu_header(
+      level_params, &cpi->frame_header_count, obu_type, 0, *data);
+  *data += lst_obu->tg_hdr_size;
+
+  const uint32_t frame_header_size =
+      write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0);
+  *data += frame_header_size;
+  lst_obu->frame_header_size = frame_header_size;
+  // (yunqing) This test ensures the correctness of large scale tile coding.
+  if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) {
+    char fn[20] = "./fh";
+    fn[4] = current_frame->frame_number / 100 + '0';
+    fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+    fn[6] = (current_frame->frame_number % 10) + '0';
+    fn[7] = '\0';
+    av1_print_uncompressed_frame_header(*data - frame_header_size,
+                                        frame_header_size, fn);
+  }
+  return frame_header_size;
+}
+
+// Write total buffer size and related information into the OBU header for large
+// scale tile case.
+static void write_large_scale_tile_obu_size(
+    const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data,
+    struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu,
+    int have_tiles, uint32_t *total_size, int max_tile_size,
+    int max_tile_col_size) {
+  int tile_size_bytes = 0;
+  int tile_col_size_bytes = 0;
+  if (have_tiles) {
+    *total_size = remux_tiles(
+        tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size,
+        max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes);
+    *total_size += lst_obu->frame_header_size;
+  }
+
+  // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write
+  // current tile group size before tile data(include tile column header).
+  // Tile group size doesn't include the bytes storing tg size.
+  *total_size += lst_obu->tg_hdr_size;
+  const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size;
+  const size_t length_field_size =
+      av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst);
+  if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) !=
+      AOM_CODEC_OK)
+    assert(0);
+
+  *total_size += (uint32_t)length_field_size;
+  saved_wb->bit_buffer += length_field_size;
+
+  // Now fill in the gaps in the uncompressed header.
+  if (have_tiles) {
+    assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+    aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+    assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+    aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+  }
+}
+
+// Store information on each large scale tile in the OBU header.
+static void write_large_scale_tile_obu(
+    AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu,
+    int *const largest_tile_id, uint32_t *total_size, const int have_tiles,
+    unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonTileParams *const tiles = &cm->tiles;
-  AV1LevelParams *const level_params = &cpi->level_params;
-  aom_writer mode_bc;
-  int tile_row, tile_col;
-  // Store the location and size of each tile's data in the bitstream:
+
   TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
-  uint32_t total_size = 0;
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
   unsigned int tile_size = 0;
-  unsigned int max_tile_size = 0;
-  unsigned int max_tile_col_size = 0;
-  const int n_log2_tiles = tiles->log2_rows + tiles->log2_cols;
-  // Fixed size tile groups for the moment
-  const int num_tg_hdrs = cpi->num_tg;
-  const int tg_size =
-      (tiles->large_scale)
-          ? 1
-          : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
-  int tile_count = 0;
-  int curr_tg_data_size = 0;
-  uint8_t *data = dst;
-  int new_tg = 1;
-  const int have_tiles = tile_cols * tile_rows > 1;
-  int first_tg = 1;
 
-  *largest_tile_id = 0;
+  av1_reset_pack_bs_thread_data(&cpi->td);
+  for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileInfo tile_info;
+    const int is_last_col = (tile_col == tile_cols - 1);
+    const uint32_t col_offset = *total_size;
 
-  if (tiles->large_scale) {
-    // For large_scale_tile case, we always have only one tile group, so it can
-    // be written as an OBU_FRAME.
-    const OBU_TYPE obu_type = OBU_FRAME;
-    const uint32_t tg_hdr_size =
-        av1_write_obu_header(level_params, obu_type, 0, data);
-    data += tg_hdr_size;
-
-    const uint32_t frame_header_size =
-        write_frame_header_obu(cpi, saved_wb, data, 0);
-    data += frame_header_size;
-    total_size += frame_header_size;
-
-    // (yunqing) This test ensures the correctness of large scale tile coding.
-    if (cpi->oxcf.ext_tile_debug) {
-      char fn[20] = "./fh";
-      fn[4] = cm->current_frame.frame_number / 100 + '0';
-      fn[5] = (cm->current_frame.frame_number % 100) / 10 + '0';
-      fn[6] = (cm->current_frame.frame_number % 10) + '0';
-      fn[7] = '\0';
-      av1_print_uncompressed_frame_header(data - frame_header_size,
-                                          frame_header_size, fn);
-    }
-
-    int tile_size_bytes = 0;
-    int tile_col_size_bytes = 0;
-
-    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      TileInfo tile_info;
-      const int is_last_col = (tile_col == tile_cols - 1);
-      const uint32_t col_offset = total_size;
-
-      av1_tile_set_col(&tile_info, cm, tile_col);
-
-      // The last column does not have a column header
-      if (!is_last_col) total_size += 4;
-
-      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
-        const int data_offset = have_tiles ? 4 : 0;
-        const int tile_idx = tile_row * tile_cols + tile_col;
-        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-        av1_tile_set_row(&tile_info, cm, tile_row);
-
-        buf->data = dst + total_size + tg_hdr_size;
-
-        // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
-        // even for the last one, unless no tiling is used at all.
-        total_size += data_offset;
-        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-        mode_bc.allow_update_cdf = !tiles->large_scale;
-        mode_bc.allow_update_cdf =
-            mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
-        aom_start_encode(&mode_bc, buf->data + data_offset);
-        write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
-        aom_stop_encode(&mode_bc);
-        tile_size = mode_bc.pos;
-        buf->size = tile_size;
-
-        // Record the maximum tile size we see, so we can compact headers later.
-        if (tile_size > max_tile_size) {
-          max_tile_size = tile_size;
-          *largest_tile_id = tile_cols * tile_row + tile_col;
-        }
+    av1_tile_set_col(&tile_info, cm, tile_col);
 
-        if (have_tiles) {
-          // tile header: size of this tile, or copy offset
-          uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
-          const int tile_copy_mode =
-              ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256)
-                  ? 1
-                  : 0;
-
-          // If tile_copy_mode = 1, check if this tile is a copy tile.
-          // Very low chances to have copy tiles on the key frames, so don't
-          // search on key frames to reduce unnecessary search.
-          if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
-            const int identical_tile_offset =
-                find_identical_tile(tile_row, tile_col, tile_buffers);
-
-            // Indicate a copy-tile by setting the most significant bit.
-            // The row-offset to copy from is stored in the highest byte.
-            // remux_tiles will move these around later
-            if (identical_tile_offset > 0) {
-              tile_size = 0;
-              tile_header = identical_tile_offset | 0x80;
-              tile_header <<= 24;
-            }
-          }
+    // The last column does not have a column header
+    if (!is_last_col) *total_size += 4;
 
-          mem_put_le32(buf->data, tile_header);
-        }
+    for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+      const int data_offset = have_tiles ? 4 : 0;
+      const int tile_idx = tile_row * tile_cols + tile_col;
+      TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+      av1_tile_set_row(&tile_info, cm, tile_row);
+      aom_writer mode_bc;
 
-        total_size += tile_size;
+      buf->data = dst + *total_size + lst_obu->tg_hdr_size;
+
+      // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
+      // even for the last one, unless no tiling is used at all.
+      *total_size += data_offset;
+      cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+      mode_bc.allow_update_cdf = !tiles->large_scale;
+      mode_bc.allow_update_cdf =
+          mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+      aom_start_encode(&mode_bc, buf->data + data_offset);
+      write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
+      aom_stop_encode(&mode_bc);
+      tile_size = mode_bc.pos;
+      buf->size = tile_size;
+
+      // Record the maximum tile size we see, so we can compact headers later.
+      if (tile_size > *max_tile_size) {
+        *max_tile_size = tile_size;
+        *largest_tile_id = tile_cols * tile_row + tile_col;
       }
 
-      if (!is_last_col) {
-        uint32_t col_size = total_size - col_offset - 4;
-        mem_put_le32(dst + col_offset + tg_hdr_size, col_size);
+      if (have_tiles) {
+        // tile header: size of this tile, or copy offset
+        uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+        const int tile_copy_mode =
+            ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1
+                                                                           : 0;
+
+        // If tile_copy_mode = 1, check if this tile is a copy tile.
+        // Very low chances to have copy tiles on the key frames, so don't
+        // search on key frames to reduce unnecessary search.
+        if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
+          const int identical_tile_offset =
+              find_identical_tile(tile_row, tile_col, tile_buffers);
+
+          // Indicate a copy-tile by setting the most significant bit.
+          // The row-offset to copy from is stored in the highest byte.
+          // remux_tiles will move these around later
+          if (identical_tile_offset > 0) {
+            tile_size = 0;
+            tile_header = identical_tile_offset | 0x80;
+            tile_header <<= 24;
+          }
+        }
 
-        // Record the maximum tile column size we see.
-        max_tile_col_size = AOMMAX(max_tile_col_size, col_size);
+        mem_put_le32(buf->data, tile_header);
       }
-    }
 
-    if (have_tiles) {
-      total_size = remux_tiles(tiles, data, total_size - frame_header_size,
-                               max_tile_size, max_tile_col_size,
-                               &tile_size_bytes, &tile_col_size_bytes);
-      total_size += frame_header_size;
+      *total_size += tile_size;
     }
+    if (!is_last_col) {
+      uint32_t col_size = *total_size - col_offset - 4;
+      mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size);
 
-    // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write
-    // current tile group size before tile data(include tile column header).
-    // Tile group size doesn't include the bytes storing tg size.
-    total_size += tg_hdr_size;
-    const uint32_t obu_payload_size = total_size - tg_hdr_size;
-    const size_t length_field_size =
-        obu_memmove(tg_hdr_size, obu_payload_size, dst);
-    if (av1_write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
-        AOM_CODEC_OK) {
-      assert(0);
+      // Record the maximum tile column size we see.
+      *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
     }
-    total_size += (uint32_t)length_field_size;
+  }
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
+
+// Packs information in the obu header for large scale tiles.
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus(
+    AV1_COMP *const cpi, uint8_t *const dst,
+    struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  uint32_t total_size = 0;
+  unsigned int max_tile_size = 0;
+  unsigned int max_tile_col_size = 0;
+  const int have_tiles = tiles->cols * tiles->rows > 1;
+  uint8_t *data = dst;
+
+  LargeTileFrameOBU lst_obu;
+
+  total_size +=
+      init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu);
+
+  write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size,
+                             have_tiles, &max_tile_size, &max_tile_col_size);
+
+  write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu,
+                                  have_tiles, &total_size, max_tile_size,
+                                  max_tile_col_size);
+
+  return total_size;
+}
+
+// Writes obu, tile group and uncompressed headers to bitstream.
+void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd,
+                                   PackBSParams *const pack_bs_params,
+                                   const int tile_idx) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size;
+  const int tg_size =
+      (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg;
+
+  // Write Tile group, frame and OBU header
+  // A new tile group begins at this tile.  Write the obu header and
+  // tile group header
+  const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+  *curr_tg_hdr_size = av1_write_obu_header(
+      &cpi->ppi->level_params, &cpi->frame_header_count, obu_type,
+      pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr);
+  pack_bs_params->obu_header_size = *curr_tg_hdr_size;
+
+  if (cpi->num_tg == 1)
+    *curr_tg_hdr_size += write_frame_header_obu(
+        cpi, xd, pack_bs_params->saved_wb,
+        pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0);
+  *curr_tg_hdr_size += write_tile_group_header(
+      pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx,
+      AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1),
+      (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
+  *pack_bs_params->total_size += *curr_tg_hdr_size;
+}
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td,
+                        PackBSParams *const pack_bs_params) {
+  aom_writer mode_bc;
+  AV1_COMMON *const cm = &cpi->common;
+  int tile_row = pack_bs_params->tile_row;
+  int tile_col = pack_bs_params->tile_col;
+  uint32_t *const total_size = pack_bs_params->total_size;
+  TileInfo tile_info;
+  av1_tile_set_col(&tile_info, cm, tile_col);
+  av1_tile_set_row(&tile_info, cm, tile_row);
+  mode_bc.allow_update_cdf = 1;
+  mode_bc.allow_update_cdf =
+      mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+
+  unsigned int tile_size;
+
+  const int num_planes = av1_num_planes(cm);
+  av1_reset_loop_restoration(&td->mb.e_mbd, num_planes);
+
+  pack_bs_params->buf.data = pack_bs_params->dst + *total_size;
+
+  // The last tile of the tile group does not have a header.
+  if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4;
+
+  // Pack tile data
+  aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size);
+  write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col);
+  aom_stop_encode(&mode_bc);
+  tile_size = mode_bc.pos;
+  assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+
+  pack_bs_params->buf.size = tile_size;
+
+  // Write tile size
+  if (!pack_bs_params->is_last_tile_in_tg) {
+    // size of this tile
+    mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+  }
+}
+
+void av1_write_last_tile_info(
+    AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+    struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+    uint8_t *curr_tg_start, uint32_t *const total_size,
+    uint8_t **tile_data_start, int *const largest_tile_id,
+    int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) {
+  // write current tile group size
+  const uint32_t obu_payload_size =
+      (uint32_t)(*curr_tg_data_size) - obu_header_size;
+  const size_t length_field_size =
+      av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start);
+  if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                              curr_tg_start) != AOM_CODEC_OK) {
+    assert(0);
+  }
+  *curr_tg_data_size += (int)length_field_size;
+  *total_size += (uint32_t)length_field_size;
+  *tile_data_start += length_field_size;
+  if (cpi->num_tg == 1) {
+    // if this tg is combined with the frame header then update saved
+    // frame header base offset according to length field size
     saved_wb->bit_buffer += length_field_size;
+  }
 
-    // Now fill in the gaps in the uncompressed header.
-    if (have_tiles) {
-      assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
-      aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+  if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) {
+    // Make room for a duplicate Frame Header OBU.
+    memmove(curr_tg_start + fh_info->total_length, curr_tg_start,
+            *curr_tg_data_size);
 
-      assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
-      aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
-    }
-    return total_size;
+    // Insert a copy of the Frame Header OBU.
+    memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length);
+
+    // Force context update tile to be the first tile in error
+    // resilient mode as the duplicate frame headers will have
+    // context_update_tile_id set to 0
+    *largest_tile_id = 0;
+
+    // Rewrite the OBU header to change the OBU type to Redundant Frame
+    // Header.
+    av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+                         OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
+                         &curr_tg_start[fh_info->obu_header_byte_offset]);
+
+    *curr_tg_data_size += (int)(fh_info->total_length);
+    *total_size += (uint32_t)(fh_info->total_length);
   }
+  *is_first_tg = 0;
+}
 
-  uint32_t obu_header_size = 0;
-  uint8_t *tile_data_start = dst + total_size;
-  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-    TileInfo tile_info;
-    av1_tile_set_row(&tile_info, cm, tile_row);
+void av1_reset_pack_bs_thread_data(ThreadData *const td) {
+  td->coefficient_size = 0;
+  td->max_mv_magnitude = 0;
+  av1_zero(td->interp_filter_selected);
+}
+
+void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi,
+                                        ThreadData const *td) {
+  int do_max_mv_magnitude_update = 1;
+  cpi->rc.coefficient_size += td->coefficient_size;
+
+  // Disable max_mv_magnitude update for parallel frames based on update flag.
+  if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0;
+
+  if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update)
+    cpi->mv_search_params.max_mv_magnitude =
+        AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude);
+
+  for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++)
+    cpi->common.cur_frame->interp_filter_selected[filter] +=
+        td->interp_filter_selected[filter];
+}
 
-    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+// Store information related to each default tile in the OBU header.
+static void write_tile_obu(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cpi->num_tg;
+  const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+  int tile_count = 0;
+  size_t curr_tg_data_size = 0;
+  uint8_t *tile_data_curr = dst;
+  int new_tg = 1;
+  int is_first_tg = 1;
+
+  av1_reset_pack_bs_thread_data(&cpi->td);
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
       const int tile_idx = tile_row * tile_cols + tile_col;
-      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
       TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-      int is_last_tile_in_tg = 0;
 
+      int is_last_tile_in_tg = 0;
       if (new_tg) {
-        data = dst + total_size;
-
-        // A new tile group begins at this tile.  Write the obu header and
-        // tile group header
-        const OBU_TYPE obu_type =
-            (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
-        curr_tg_data_size = av1_write_obu_header(level_params, obu_type,
-                                                 obu_extension_header, data);
-        obu_header_size = curr_tg_data_size;
-
-        if (num_tg_hdrs == 1) {
-          curr_tg_data_size += write_frame_header_obu(
-              cpi, saved_wb, data + curr_tg_data_size, 0);
-        }
-        curr_tg_data_size += write_tile_group_header(
-            data + curr_tg_data_size, tile_idx,
-            AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
-            n_log2_tiles, cpi->num_tg > 1);
-        total_size += curr_tg_data_size;
-        tile_data_start += curr_tg_data_size;
-        new_tg = 0;
+        tile_data_curr = dst + *total_size;
         tile_count = 0;
       }
       tile_count++;
-      av1_tile_set_col(&tile_info, cm, tile_col);
 
-      if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
+      if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1))
         is_last_tile_in_tg = 1;
-        new_tg = 1;
-      } else {
-        is_last_tile_in_tg = 0;
-      }
 
-      buf->data = dst + total_size;
+      xd->tile_ctx = &this_tile->tctx;
 
-      // The last tile of the tile group does not have a header.
-      if (!is_last_tile_in_tg) total_size += 4;
+      // PackBSParams stores all parameters required to pack tile and header
+      // info.
+      PackBSParams pack_bs_params;
+      pack_bs_params.dst = dst;
+      pack_bs_params.curr_tg_hdr_size = 0;
+      pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg;
+      pack_bs_params.new_tg = new_tg;
+      pack_bs_params.obu_extn_header = obu_extn_header;
+      pack_bs_params.obu_header_size = 0;
+      pack_bs_params.saved_wb = saved_wb;
+      pack_bs_params.tile_col = tile_col;
+      pack_bs_params.tile_row = tile_row;
+      pack_bs_params.tile_data_curr = tile_data_curr;
+      pack_bs_params.total_size = total_size;
 
-      cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-      mode_bc.allow_update_cdf = 1;
-      mode_bc.allow_update_cdf =
-          mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
-      const int num_planes = av1_num_planes(cm);
-      av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
+      if (new_tg)
+        av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx);
 
-      aom_start_encode(&mode_bc, dst + total_size);
-      write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
-      aom_stop_encode(&mode_bc);
-      tile_size = mode_bc.pos;
-      assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+      av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params);
 
-      curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
-      buf->size = tile_size;
-      if (tile_size > max_tile_size) {
-        *largest_tile_id = tile_cols * tile_row + tile_col;
-        max_tile_size = tile_size;
+      if (new_tg) {
+        curr_tg_data_size = pack_bs_params.curr_tg_hdr_size;
+        *tile_data_start += pack_bs_params.curr_tg_hdr_size;
+        *obu_header_size = pack_bs_params.obu_header_size;
+        new_tg = 0;
       }
+      if (is_last_tile_in_tg) new_tg = 1;
 
-      if (!is_last_tile_in_tg) {
-        // size of this tile
-        mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
-      } else {
-        // write current tile group size
-        const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size;
-        const size_t length_field_size =
-            obu_memmove(obu_header_size, obu_payload_size, data);
-        if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
-            AOM_CODEC_OK) {
-          assert(0);
-        }
-        curr_tg_data_size += (int)length_field_size;
-        total_size += (uint32_t)length_field_size;
-        tile_data_start += length_field_size;
-        if (num_tg_hdrs == 1) {
-          // if this tg is combined with the frame header then update saved
-          // frame header base offset accroding to length field size
-          saved_wb->bit_buffer += length_field_size;
-        }
+      curr_tg_data_size +=
+          (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4));
 
-        if (!first_tg && cm->features.error_resilient_mode) {
-          // Make room for a duplicate Frame Header OBU.
-          memmove(data + fh_info->total_length, data, curr_tg_data_size);
+      if (pack_bs_params.buf.size > *max_tile_size) {
+        *largest_tile_id = tile_idx;
+        *max_tile_size = (unsigned int)pack_bs_params.buf.size;
+      }
 
-          // Insert a copy of the Frame Header OBU.
-          memcpy(data, fh_info->frame_header, fh_info->total_length);
+      if (is_last_tile_in_tg)
+        av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size,
+                                 tile_data_curr, total_size, tile_data_start,
+                                 largest_tile_id, &is_first_tg,
+                                 *obu_header_size, obu_extn_header);
+      *total_size += (uint32_t)pack_bs_params.buf.size;
+    }
+  }
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
 
-          // Force context update tile to be the first tile in error
-          // resiliant mode as the duplicate frame headers will have
-          // context_update_tile_id set to 0
-          *largest_tile_id = 0;
+// Write total buffer size and related information into the OBU header for
+// default tile case.
+static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst,
+                                struct aom_write_bit_buffer *saved_wb,
+                                int largest_tile_id, uint32_t *const total_size,
+                                unsigned int max_tile_size,
+                                uint32_t obu_header_size,
+                                uint8_t *tile_data_start) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+
+  // Fill in context_update_tile_id indicating the tile to use for the
+  // cdf update. The encoder currently sets it to the largest tile
+  // (but is up to the encoder)
+  aom_wb_overwrite_literal(saved_wb, largest_tile_id,
+                           (tiles->log2_cols + tiles->log2_rows));
+  // If more than one tile group. tile_size_bytes takes the default value 4
+  // and does not need to be set. For a single tile group it is set in the
+  // section below.
+  if (cpi->num_tg != 1) return;
+  int tile_size_bytes = 4, unused;
+  const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+  const uint32_t tile_data_size = *total_size - tile_data_offset;
+
+  *total_size = remux_tiles(tiles, tile_data_start, tile_data_size,
+                            max_tile_size, 0, &tile_size_bytes, &unused);
+  *total_size += tile_data_offset;
+  assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+  aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+  // Update the OBU length if remux_tiles() reduced the size.
+  uint64_t payload_size;
+  size_t length_field_size;
+  int res =
+      aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size,
+                      &payload_size, &length_field_size);
+  assert(res == 0);
+  (void)res;
+
+  const uint64_t new_payload_size =
+      *total_size - obu_header_size - length_field_size;
+  if (new_payload_size != payload_size) {
+    size_t new_length_field_size;
+    res = aom_uleb_encode(new_payload_size, length_field_size,
+                          dst + obu_header_size, &new_length_field_size);
+    assert(res == 0);
+    if (new_length_field_size < length_field_size) {
+      const size_t src_offset = obu_header_size + length_field_size;
+      const size_t dst_offset = obu_header_size + new_length_field_size;
+      memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+      *total_size -= (int)(length_field_size - new_length_field_size);
+    }
+  }
+}
 
-          // Rewrite the OBU header to change the OBU type to Redundant Frame
-          // Header.
-          av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER,
-                               obu_extension_header,
-                               &data[fh_info->obu_header_byte_offset]);
+// As per the experiments, single-thread bitstream packing is better for
+// frames with a smaller bitstream size. This behavior is due to setup time
+// overhead of multithread function would be more than that of time required
+// to pack the smaller bitstream of such frames. This function computes the
+// number of required number of workers based on setup time overhead and job
+// dispatch time overhead for given tiles and available workers.
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+                            int avail_workers, bool pack_bs_mt_enabled) {
+  if (!pack_bs_mt_enabled) return 1;
+
+  uint64_t frame_abs_sum_level = 0;
+
+  for (int idx = 0; idx < num_tiles; idx++)
+    frame_abs_sum_level += tile_data[idx].abs_sum_level;
+
+  int ideal_num_workers = 1;
+  const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+  float max_sum = 0.0;
+
+  for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+    const float fas_per_worker_const =
+        ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+    const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+    const float this_sum = fas_per_worker_const - setup_time_const -
+                           job_disp_time_const / num_workers;
+
+    if (this_sum > max_sum) {
+      max_sum = this_sum;
+      ideal_num_workers = num_workers;
+    }
+  }
+  return ideal_num_workers;
+}
 
-          data += fh_info->total_length;
+static INLINE uint32_t pack_tiles_in_tg_obus(
+    AV1_COMP *const cpi, uint8_t *const dst,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+  uint32_t total_size = 0;
+  unsigned int max_tile_size = 0;
+  uint32_t obu_header_size = 0;
+  uint8_t *tile_data_start = dst;
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
+  const int num_tiles = tile_rows * tile_cols;
 
-          curr_tg_data_size += (int)(fh_info->total_length);
-          total_size += (uint32_t)(fh_info->total_length);
-        }
-        first_tg = 0;
-      }
+  const int num_workers = calc_pack_bs_mt_workers(
+      cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS],
+      cpi->mt_info.pack_bs_mt_enabled);
 
-      total_size += tile_size;
-    }
+  if (num_workers > 1) {
+    av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
+                          fh_info, largest_tile_id, &max_tile_size,
+                          &obu_header_size, &tile_data_start, num_workers);
+  } else {
+    write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
+                   fh_info, largest_tile_id, &max_tile_size, &obu_header_size,
+                   &tile_data_start);
   }
 
-  if (have_tiles) {
-    // Fill in context_update_tile_id indicating the tile to use for the
-    // cdf update. The encoder currently sets it to the largest tile
-    // (but is up to the encoder)
-    aom_wb_overwrite_literal(saved_wb, *largest_tile_id,
-                             tiles->log2_cols + tiles->log2_rows);
-    // If more than one tile group. tile_size_bytes takes the default value 4
-    // and does not need to be set. For a single tile group it is set in the
-    // section below.
-    if (num_tg_hdrs == 1) {
-      int tile_size_bytes = 4, unused;
-      const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
-      const uint32_t tile_data_size = total_size - tile_data_offset;
-
-      total_size =
-          remux_tiles(tiles, tile_data_start, tile_data_size, max_tile_size,
-                      max_tile_col_size, &tile_size_bytes, &unused);
-      total_size += tile_data_offset;
-      assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
-
-      aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
-
-      // Update the OBU length if remux_tiles() reduced the size.
-      uint64_t payload_size;
-      size_t length_field_size;
-      int res =
-          aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size,
-                          &payload_size, &length_field_size);
-      assert(res == 0);
-      (void)res;
-
-      const uint64_t new_payload_size =
-          total_size - obu_header_size - length_field_size;
-      if (new_payload_size != payload_size) {
-        size_t new_length_field_size;
-        res = aom_uleb_encode(new_payload_size, length_field_size,
-                              dst + obu_header_size, &new_length_field_size);
-        assert(res == 0);
-        if (new_length_field_size < length_field_size) {
-          const size_t src_offset = obu_header_size + length_field_size;
-          const size_t dst_offset = obu_header_size + new_length_field_size;
-          memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
-          total_size -= (int)(length_field_size - new_length_field_size);
-        }
-      }
+  if (num_tiles > 1)
+    write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size,
+                        max_tile_size, obu_header_size, tile_data_start);
+  return total_size;
+}
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       uint8_t obu_extension_header,
+                                       const FrameHeaderInfo *fh_info,
+                                       int *const largest_tile_id) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  *largest_tile_id = 0;
+
+  // Select the coding strategy (temporal or spatial)
+  if (cm->seg.enabled && cm->seg.update_map) {
+    if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
+      cm->seg.temporal_update = 0;
+    } else {
+      cm->seg.temporal_update = 1;
+      if (cpi->td.rd_counts.seg_tmp_pred_cost[0] <
+          cpi->td.rd_counts.seg_tmp_pred_cost[1])
+        cm->seg.temporal_update = 0;
     }
   }
-  return total_size;
+
+  if (tiles->large_scale)
+    return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb,
+                                             largest_tile_id);
+
+  return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header,
+                               fh_info, largest_tile_id);
 }
 
 static size_t av1_write_metadata_obu(const aom_metadata_t *metadata,
@@ -3818,18 +4056,20 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
           (cm->current_frame.frame_type != KEY_FRAME &&
            current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
           current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
-        obu_header_size =
-            av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst);
+        obu_header_size = av1_write_obu_header(&cpi->ppi->level_params,
+                                               &cpi->frame_header_count,
+                                               OBU_METADATA, 0, dst);
         obu_payload_size =
             av1_write_metadata_obu(current_metadata, dst + obu_header_size);
-        length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst);
+        length_field_size =
+            av1_obu_memmove(obu_header_size, obu_payload_size, dst);
         if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
             AOM_CODEC_OK) {
           const size_t obu_size = obu_header_size + obu_payload_size;
           dst += obu_size + length_field_size;
           total_bytes_written += obu_size + length_field_size;
         } else {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+          aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
                              "Error writing metadata OBU size");
         }
       }
@@ -3843,7 +4083,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
   uint8_t *data = dst;
   uint32_t data_size;
   AV1_COMMON *const cm = &cpi->common;
-  AV1LevelParams *const level_params = &cpi->level_params;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
   uint32_t obu_header_size = 0;
   uint32_t obu_payload_size = 0;
   FrameHeaderInfo fh_info = { NULL, 0, 0 };
@@ -3859,19 +4099,22 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
   bitstream_queue_reset_write();
 #endif
 
-  level_params->frame_header_count = 0;
+  cpi->frame_header_count = 0;
 
   // The TD is now written outside the frame encode loop
 
-  // write sequence header obu if KEY_FRAME, preceded by 4-byte size
-  if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-    obu_header_size =
-        av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data);
+  // write sequence header obu at each key frame or intra_only frame,
+  // preceded by 4-byte size
+  if (cm->current_frame.frame_type == INTRA_ONLY_FRAME ||
+      (cm->current_frame.frame_type == KEY_FRAME &&
+       cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET)) {
+    obu_header_size = av1_write_obu_header(
+        level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
 
     obu_payload_size =
-        av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size);
+        av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
     const size_t length_field_size =
-        obu_memmove(obu_header_size, obu_payload_size, data);
+        av1_obu_memmove(obu_header_size, obu_payload_size, data);
     if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
@@ -3885,35 +4128,37 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
 
   const int write_frame_header =
       (cpi->num_tg > 1 || encode_show_existing_frame(cm));
-  struct aom_write_bit_buffer saved_wb;
+  struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+  size_t length_field = 0;
   if (write_frame_header) {
     // Write Frame Header OBU.
     fh_info.frame_header = data;
-    obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER,
-                                           obu_extension_header, data);
-    obu_payload_size =
-        write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
+    obu_header_size =
+        av1_write_obu_header(level_params, &cpi->frame_header_count,
+                             OBU_FRAME_HEADER, obu_extension_header, data);
+    obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+                                              data + obu_header_size, 1);
 
-    const size_t length_field_size =
-        obu_memmove(obu_header_size, obu_payload_size, data);
+    length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data);
     if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
 
     fh_info.obu_header_byte_offset = 0;
-    fh_info.total_length =
-        obu_header_size + obu_payload_size + length_field_size;
+    fh_info.total_length = obu_header_size + obu_payload_size + length_field;
     data += fh_info.total_length;
-
-    // Since length_field_size is determined adaptively after frame header
-    // encoding, saved_wb must be adjusted accordingly.
-    saved_wb.bit_buffer += length_field_size;
   }
 
   if (encode_show_existing_frame(cm)) {
     data_size = 0;
   } else {
+    // Since length_field is determined adaptively after frame header
+    // encoding, saved_wb must be adjusted accordingly.
+    if (saved_wb.bit_buffer != NULL) {
+      saved_wb.bit_buffer += length_field;
+    }
+
     //  Each tile group obu will be preceded by 4-byte size of the tile group
     //  obu
     data_size = write_tiles_in_tg_obus(
diff --git a/media/libaom/src/av1/encoder/bitstream.h b/media/libaom/src/av1/encoder/bitstream.h
index 45151e25e4..5999f9e3c1 100644
--- a/media/libaom/src/av1/encoder/bitstream.h
+++ b/media/libaom/src/av1/encoder/bitstream.h
@@ -16,9 +16,67 @@
 extern "C" {
 #endif
 
-#include "av1/encoder/encoder.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
 
 struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream.  Used for
+// later identifying identical tiles
+typedef struct {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
+typedef struct {
+  uint8_t *frame_header;
+  size_t obu_header_byte_offset;
+  size_t total_length;
+} FrameHeaderInfo;
+
+typedef struct {
+  struct aom_write_bit_buffer *saved_wb;  // Bit stream buffer writer structure
+  TileBufferEnc buf;     // Structure to hold bitstream buffer and size
+  uint32_t *total_size;  // Size of the bitstream buffer for the tile in bytes
+  uint8_t *dst;          // Base address of tile bitstream buffer
+  uint8_t *tile_data_curr;   // Base address of tile-group bitstream buffer
+  size_t tile_buf_size;      // Available bitstream buffer for the tile in bytes
+  uint8_t obu_extn_header;   // Presence of OBU extension header
+  uint32_t obu_header_size;  // Size of the OBU header
+  int curr_tg_hdr_size;      // Size of the obu, tg, frame headers
+  int tile_size_mi;          // Tile size in mi units
+  int tile_row;              // Number of tile rows
+  int tile_col;              // Number of tile columns
+  int is_last_tile_in_tg;    // Flag to indicate last tile in a tile-group
+  int new_tg;                // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+  uint64_t abs_sum_level;
+  uint16_t tile_idx;
+} PackBSTileOrder;
+
+// Pack bitstream data for pack bitstream multi-threading.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif
+  // Tile order structure of pack bitstream multithreading.
+  PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+  // Index of next job to be processed.
+  int next_job_idx;
+} AV1EncPackBSSync;
+
+/*!\endcond */
 
 // Writes only the OBU Sequence Header payload, and returns the size of the
 // payload written to 'dst'. This function does not write the OBU header, the
@@ -29,18 +87,46 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
 // Writes the OBU header byte, and the OBU header extension byte when
 // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
 uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
-                              OBU_TYPE obu_type, int obu_extension,
-                              uint8_t *const dst);
+                              int *frame_header_count, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst);
 
 int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
                             uint8_t *dest);
 
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td,
+                        PackBSParams *const pack_bs_params);
+
+void av1_write_last_tile_info(
+    struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+    struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+    uint8_t *curr_tg_start, uint32_t *const total_size,
+    uint8_t **tile_data_start, int *const largest_tile_id,
+    int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header);
+
+/*!\brief Pack the bitstream for one frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ */
+int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id);
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
                        TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
 
+void av1_reset_pack_bs_thread_data(struct ThreadData *const td);
+
+void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi,
+                                        struct ThreadData const *td);
+
+void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi,
+                                   MACROBLOCKD *const xd,
+                                   PackBSParams *const pack_bs_params,
+                                   const int tile_idx);
+
+int av1_neg_interleave(int x, int ref, int max);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/block.h b/media/libaom/src/av1/encoder/block.h
index 5a74567a46..d073fd129d 100644
--- a/media/libaom/src/av1/encoder/block.h
+++ b/media/libaom/src/av1/encoder/block.h
@@ -9,11 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*! \file
+ * Declares various structs used to encode the current partition block.
+ */
 #ifndef AOM_AV1_ENCODER_BLOCK_H_
 #define AOM_AV1_ENCODER_BLOCK_H_
 
+#include "av1/common/blockd.h"
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
+#include "av1/common/enums.h"
 #include "av1/common/mvref_common.h"
 
 #include "av1/encoder/enc_enums.h"
@@ -21,475 +26,1261 @@
 #include "av1/encoder/partition_cnn_weights.h"
 #endif
 
-#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define MC_FLOW_BSIZE_1D 16
-#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D)
-#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D)
-#define MAX_WINNER_MODE_COUNT_INTRA 3
-#define MAX_WINNER_MODE_COUNT_INTER 1
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl block in a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
+//! Number of txfm hash records kept for the partition block.
+#define RD_RECORD_BUFFER_LEN 8
+
+/*! Maximum value taken by transform type probabilities */
+#define MAX_TX_TYPE_PROB 1024
+/*! \brief Superblock level encoder info
+ *
+ * SuperblockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
 typedef struct {
+  //! Maximum partition size for the sb.
+  BLOCK_SIZE min_partition_size;
+  //! Minimum partition size for the sb.
+  BLOCK_SIZE max_partition_size;
+
+  /*****************************************************************************
+   * \name TPL Info
+   *
+   * Information gathered from tpl_model at tpl block precision for the
+   * superblock to speed up the encoding process..
+   ****************************************************************************/
+  /**@{*/
+  //! Number of TPL blocks in this superblock.
+  int tpl_data_count;
+  //! TPL's estimate of inter cost for each tpl block.
+  int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! TPL's estimate of tpl cost for each tpl block.
+  int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! Motion vectors found by TPL model for each tpl block.
+  int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+  //! TPL's stride for the arrays in this struct.
+  int tpl_stride;
+  /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+  //! The mbmi used to reconstruct the winner mode.
   MB_MODE_INFO mbmi;
+  //! Rdstats of the winner mode.
   RD_STATS rd_cost;
+  //! Rdcost of the winner mode
   int64_t rd;
+  //! Luma rate of the winner mode.
   int rate_y;
+  //! Chroma rate of the winner mode.
   int rate_uv;
-  uint8_t color_index_map[64 * 64];
+  //! The color map needed to reconstruct palette mode.
+  uint8_t color_index_map[MAX_SB_SQUARE];
+  //! The current winner mode.
   THR_MODES mode_index;
 } WinnerModeStats;
 
-typedef struct {
-  unsigned int sse;
-  int sum;
-  unsigned int var;
-} DIFF;
-
-enum {
-  NO_TRELLIS_OPT,          // No trellis optimization
-  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
-  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
-  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
-} UENUM1BYTE(TRELLIS_OPT_TYPE);
-
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
 typedef struct macroblock_plane {
-  DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]);
+  //! Stores source - pred so the txfm can be computed later
+  int16_t *src_diff;
+  //! Dequantized coefficients
+  tran_low_t *dqcoeff;
+  //! Quantized coefficients
   tran_low_t *qcoeff;
+  //! Transformed coefficients
   tran_low_t *coeff;
+  //! Location of the end of qcoeff (end of block).
   uint16_t *eobs;
+  //! Contexts used to code the transform coefficients.
   uint8_t *txb_entropy_ctx;
+  //! A buffer containing the source frame.
   struct buf_2d src;
 
-  // Quantizer setings
-  // These are used/accessed only in the quantization process
-  // RDO does not / must not depend on any of these values
-  // All values below share the coefficient scale/shift used in TX
+  /*! \name Quantizer Settings
+   *
+   * \attention These are used/accessed only in the quantization process.
+   * RDO does not and *must not* depend on any of these values.
+   * All values below share the coefficient scale/shift used in TX.
+   */
+  /**@{*/
+  //! Quantization step size used by AV1_XFORM_QUANT_FP.
   const int16_t *quant_fp_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
   const int16_t *round_fp_QTX;
+  //! Quantization step size used by AV1_XFORM_QUANT_B.
   const int16_t *quant_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
+  const int16_t *round_QTX;
+  //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
   const int16_t *quant_shift_QTX;
+  //! Size of the quantization bin around 0. Only Used by QUANT_B
   const int16_t *zbin_QTX;
-  const int16_t *round_QTX;
+  //! Dequantizer
   const int16_t *dequant_QTX;
+  /**@}*/
 } MACROBLOCK_PLANE;
 
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, dc_sign,
+ */
 typedef struct {
+  //! Cost to skip txfm for the current txfm block.
   int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+  /*! \brief Cost for encoding the base_eob of a level.
+   *
+   * Decoder uses base_eob to derive the base_level as base_eob := base_eob+1.
+   */
   int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+  /*! \brief Cost for encoding the base level of a coefficient.
+   *
+   * Decoder derives coeff_base as coeff_base := base_eob + 1.
+   */
   int base_cost[SIG_COEF_CONTEXTS][8];
+  /*! \brief Cost for encoding the last non-zero coefficient.
+   *
+   * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1
+   */
   int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+  //! Cost for encoding the dc_sign
   int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+  //! Cost for encoding an increment to the coefficient
   int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
 } LV_MAP_COEFF_COST;
 
+/*! \brief Costs for encoding the eob.
+ */
 typedef struct {
+  //! eob_cost.
   int eob_cost[2][11];
 } LV_MAP_EOB_COST;
 
+/*! \brief Stores the transforms coefficients for the whole superblock.
+ */
 typedef struct {
-  tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
-  uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-  // Transform block entropy contexts.
-  // Bits 0~3: txb_skip_ctx; bits 4~5: dc_sign_ctx.
-  uint8_t entropy_ctx[MAX_MB_PLANE]
-                     [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+  //! The transformed coefficients.
+  tran_low_t *tcoeff[MAX_MB_PLANE];
+  //! Where the transformed coefficients end.
+  uint16_t *eobs[MAX_MB_PLANE];
+  /*! \brief Transform block entropy contexts.
+   *
+   * Each element is used as a bit field.
+   * - Bits 0~3: txb_skip_ctx
+   * - Bits 4~5: dc_sign_ctx.
+   */
+  uint8_t *entropy_ctx[MAX_MB_PLANE];
 } CB_COEFF_BUFFER;
 
+/*! \brief Extended mode info derived from mbmi.
+ */
 typedef struct {
   // TODO(angiebird): Reduce the buffer size according to sb_type
+  //! The reference mv list for the current block.
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  //! The weights used to compute the ref mvs.
   uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  //! Number of ref mvs in the drl.
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  //! Global mvs
   int_mv global_mvs[REF_FRAMES];
+  //! Context used to encode the current mode.
   int16_t mode_context[MODE_CTX_REF_FRAMES];
-  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
-// Structure to store best mode information at frame level. This
-// frame level information will be used during bitstream preparation stage.
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame level in here is used in bitstream preparation stage. The
+ * information in \ref MB_MODE_INFO_EXT are copied to this struct to save
+ * memory.
+ */
 typedef struct {
+  //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
   CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+  //! \copydoc MB_MODE_INFO_EXT::weight
   uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+  //! \copydoc MB_MODE_INFO_EXT::ref_mv_count
+  uint8_t ref_mv_count;
   // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+  //! \copydoc MB_MODE_INFO_EXT::global_mvs
   int_mv global_mvs[REF_FRAMES];
-  int cb_offset;
+  //! \copydoc MB_MODE_INFO_EXT::mode_context
   int16_t mode_context;
-  uint8_t ref_mv_count;
+  //! Offset of current coding block's coeff buffer relative to the sb.
+  uint16_t cb_offset[PLANE_TYPES];
 } MB_MODE_INFO_EXT_FRAME;
 
+/*! \brief Inter-mode txfm results for a partition block.
+ */
 typedef struct {
-  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
-  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
-} PALETTE_BUFFER;
-
-typedef struct {
+  //! Txfm size used if the current mode is intra mode.
   TX_SIZE tx_size;
+  //! Txfm sizes used if the current mode is inter mode.
   TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  //! Map showing which txfm block skips the txfm process.
   uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Map showing the txfm types for each block.
   uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Rd_stats for the whole partition block.
   RD_STATS rd_stats;
+  //! Hash value of the current record.
   uint32_t hash_value;
 } MB_RD_INFO;
 
-#define RD_RECORD_BUFFER_LEN 8
+/*! \brief Hash records of the inter-mode transform results
+ *
+ * Hash records of the inter-mode transform results for a whole partition block
+ * based on the residue. Since this operates on the partition block level, this
+ * can give us a whole txfm partition tree.
+ */
 typedef struct {
-  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
+  /*! Circular buffer that stores the inter-mode txfm results of a partition
+   *  block.
+   */
+  MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN];
+  //! Index to insert the newest rd record.
   int index_start;
+  //! Number of info stored in this record.
   int num;
-  CRC32C crc_calculator;  // Hash function.
+  //! Hash function
+  CRC32C crc_calculator;
 } MB_RD_RECORD;
 
-typedef struct {
-  int64_t dist;
-  int64_t sse;
-  int rate;
-  uint16_t eob;
-  TX_TYPE tx_type;
-  uint16_t entropy_context;
-  uint8_t txb_entropy_ctx;
-  uint8_t valid;
-  uint8_t fast;  // This is not being used now.
-  uint8_t perform_block_coeff_opt;
-} TXB_RD_INFO;
-
-#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
-typedef struct {
-  uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  int index_start;
-  int num;
-} TXB_RD_RECORD;
-
-typedef struct tx_size_rd_info_node {
-  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
-  struct tx_size_rd_info_node *children[4];
-} TXB_RD_INFO_NODE;
-
-// Simple translation rd state for prune_comp_search_by_single_result
-typedef struct {
-  RD_STATS rd_stats;
-  RD_STATS rd_stats_y;
-  RD_STATS rd_stats_uv;
-  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t skip;
-  uint8_t disable_skip;
-  uint8_t early_skipped;
-} SimpleRDState;
-
-// 4: NEAREST, NEW, NEAR, GLOBAL
-#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
-
+//! Number of compound rd stats
 #define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
 typedef struct {
+  //! Rate of the compound modes.
   int32_t rate[COMPOUND_TYPES];
+  //! Distortion of the compound modes.
   int64_t dist[COMPOUND_TYPES];
+  //! Estimated rate of the compound modes.
   int32_t model_rate[COMPOUND_TYPES];
+  //! Estimated distortion of the compound modes.
   int64_t model_dist[COMPOUND_TYPES];
+  //! Rate need to send the mask type.
   int comp_rs2[COMPOUND_TYPES];
+  //! Motion vector for each predictor.
   int_mv mv[2];
+  //! Ref frame for each predictor.
   MV_REFERENCE_FRAME ref_frames[2];
+  //! Current prediction mode.
   PREDICTION_MODE mode;
+  //! Current interpolation filter.
   int_interpfilters filter;
+  //! Refmv index in the drl.
   int ref_mv_idx;
+  //! Whether the predictors are GLOBALMV.
   int is_global[2];
+  //! Current parameters for interinter mode.
   INTERINTER_COMPOUND_DATA interinter_comp;
 } COMP_RD_STATS;
 
-// Struct for buffers used by av1_compound_type_rd() function.
-// For sizes and alignment of these arrays, refer to
-// alloc_compound_type_rd_buffers() function.
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
+typedef struct {
+  /*! \brief A new source weighted with the above and left predictors.
+   *
+   * Used to efficiently construct multiple obmc predictors during rdopt.
+   */
+  int32_t *wsrc;
+  /*! \brief A new mask constructed from the original horz/vert mask.
+   *
+   * \copydetails wsrc
+   */
+  int32_t *mask;
+  /*! \brief Prediction from the up predictor.
+   *
+   * Used to build the obmc predictor.
+   */
+  uint8_t *above_pred;
+  /*! \brief Prediction from the up predictor.
+   *
+   * \copydetails above_pred
+   */
+  uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+  //! The best color map found.
+  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+  //! A temporary buffer used for k-means clustering.
+  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
 typedef struct {
+  //! First prediction.
   uint8_t *pred0;
+  //! Second prediction.
   uint8_t *pred1;
-  int16_t *residual1;          // src - pred1
-  int16_t *diff10;             // pred1 - pred0
-  uint8_t *tmp_best_mask_buf;  // backup of the best segmentation mask
+  //! Source - first prediction.
+  int16_t *residual1;
+  //! Second prediction - first prediction.
+  int16_t *diff10;
+  //! Backup of the best segmentation mask.
+  uint8_t *tmp_best_mask_buf;
 } CompoundTypeRdBuffers;
 
-enum {
-  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
-  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
-  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
-  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
-  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
-} UENUM1BYTE(MV_COST_TYPE);
-
-struct inter_modes_info;
-typedef struct macroblock MACROBLOCK;
-struct macroblock {
-  struct macroblock_plane plane[MAX_MB_PLANE];
-
-  // Determine if one would go with reduced complexity transform block
-  // search model to select prediction modes, or full complexity model
-  // to select transform kernel.
-  int rd_model;
-
-  // prune_comp_search_by_single_result (3:MAX_REF_MV_SEARCH)
-  SimpleRDState simple_rd_state[SINGLE_REF_MODES][3];
-
-  // Inter macroblock RD search info.
-  MB_RD_RECORD mb_rd_record;
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
+#if !CONFIG_REALTIME_ONLY
+  // The following 4 parameters are used for cnn-based partitioning on intra
+  // frame.
+  /*! \brief Current index on the partition block quad tree.
+   *
+   * Used to index into the cnn buffer for partition decision.
+   */
+  int quad_tree_idx;
+  //! Whether the CNN buffer contains valid output.
+  int cnn_output_valid;
+  //! A buffer used by our segmentation CNN for intra-frame partitioning.
+  float cnn_buffer[CNN_OUT_BUF_SIZE];
+  //! log of the quantization parameter of the ancestor BLOCK_64X64.
+  float log_q;
+#endif
 
-  // Inter transform block RD search info. for square TX sizes.
-  TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
-  TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
-  TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
-  TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
+  /*! \brief Variance of the subblocks in the superblock.
+   *
+   * This is used by rt mode for variance based partitioning.
+   * The indices corresponds to the following block sizes:
+   * -   0    - 128x128
+   * -  1-2   - 128x64
+   * -  3-4   -  64x128
+   * -  5-8   -  64x64
+   * -  9-16  -  64x32
+   * - 17-24  -  32x64
+   * - 25-40  -  32x32
+   * - 41-104 -  16x16
+   */
+  uint8_t variance_low[105];
+} PartitionSearchInfo;
 
-  // Intra transform block RD search info. for square TX sizes.
-  TXB_RD_RECORD txb_rd_record_intra;
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+  /*! \brief Whether to limit the intra txfm search type to the default txfm.
+   *
+   * This could either be a result of either sequence parameter or speed
+   * features.
+   */
+  int use_default_intra_tx_type;
 
-  MACROBLOCKD e_mbd;
-  MB_MODE_INFO_EXT *mbmi_ext;
-  MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
-  // Array of mode stats for winner mode processing
-  WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA,
-                                           MAX_WINNER_MODE_COUNT_INTER)];
-  int winner_mode_count;
-  int skip_block;
-  int qindex;
+  /*! Probability threshold used for conditionally forcing tx type*/
+  int default_inter_tx_type_prob_thresh;
+
+  //! Whether to prune 2d transforms based on 1d transform results.
+  int prune_2d_txfm_mode;
+
+  /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+   *
+   * See the documentation for \ref WinnerModeParams for more detail.
+   */
+  unsigned int coeff_opt_thresholds[2];
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int tx_domain_dist_threshold;
+  /*! \copydoc coeff_opt_thresholds */
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int use_transform_domain_distortion;
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int skip_txfm_level;
+
+  /*! \brief How to search for the optimal tx_size
+   *
+   * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+   * the current partition block; if TX_MODE_SELECT, search through the whole
+   * tree.
+   *
+   * \attention
+   * Although this looks suspicious similar to a bitstream element, this
+   * tx_mode_search_type is only used internally by the encoder, and is *not*
+   * written to the bitstream. It determines what kind of tx_mode would be
+   * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+   * candidate, then code it as TX_MODE_SELECT.
+   */
+  TX_MODE tx_mode_search_type;
 
-  // The equivalent error at the current rdmult of one whole bit (not one
-  // bitcost unit).
-  int errorperbit;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
-  // for large blocks.
-  int sadperbit;
-  int rdmult;
-  int mb_energy;
-  int sb_energy_level;
+  /*!
+   * Flag to enable/disable DC block prediction.
+   */
+  unsigned int predict_dc_level;
+
+  /*!
+   * Whether or not we should use the quantization matrix as weights for PSNR
+   * during RD search.
+   */
+  int use_qm_dist_metric;
+
+  /*!
+   * Keep track of previous mode evaluation stage type. This will be used to
+   * reset mb rd hash record when mode evaluation type changes.
+   */
+  int mode_eval_type;
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+  //! Whether to skip transform and quantization on a partition block level.
+  int skip_txfm;
+
+  /*! \brief Whether to skip transform and quantization on a txfm block level.
+   *
+   * Skips transform and quantization on a transform block level inside the
+   * current partition block. Each element of this array is used as a bit-field.
+   * So for example, the we are skipping on the luma plane, then the last bit
+   * would be set to 1.
+   */
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
+  /*! \brief Transform types inside the partition block
+   *
+   * Keeps a record of what kind of transform to use for each of the transform
+   * block inside the partition block.
+   * \attention The buffer here is *never* directly used. Instead, this just
+   * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+   * partition block. So if we need to save memory, we could move the allocation
+   * to pick_sb_mode instead.
+   */
+  uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  //! Txfm hash records of inter-modes.
+  MB_RD_RECORD *mb_rd_record;
+
+  /*! \brief Number of txb splits.
+   *
+   * Keep track of how many times we've used split tx partition for transform
+   * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count of the current block. Instead, it's a cumulative count across
+   * of the whole frame. The main usage is that if txb_split_count is zero, then
+   * we can signal TX_MODE_LARGEST at frame level.
+   */
+  // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+  // as ThreadData.
   unsigned int txb_split_count;
 #if CONFIG_SPEED_STATS
+  //! For debugging. Used to check how many txfm searches we are doing.
   unsigned int tx_search_count;
 #endif  // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
 
-  // These are set to their default values at the beginning, and then adjusted
-  // further in the encoding process.
-  BLOCK_SIZE min_partition_size;
-  BLOCK_SIZE max_partition_size;
-
-  unsigned int max_mv_context[REF_FRAMES];
-  unsigned int source_variance;
-  unsigned int simple_motion_pred_sse;
-  unsigned int pred_sse[REF_FRAMES];
-  int pred_mv_sad[REF_FRAMES];
-  int best_pred_mv_sad;
-
-  int nmv_vec_cost[MV_JOINTS];
-  int nmv_costs[2][MV_VALS];
-  int nmv_costs_hp[2][MV_VALS];
-  int *nmvcost[2];
-  int *nmvcost_hp[2];
-  int **mv_cost_stack;
-
-  int32_t *wsrc_buf;
-  int32_t *mask_buf;
-  uint8_t *above_pred_buf;
-  uint8_t *left_pred_buf;
-
-  PALETTE_BUFFER *palette_buffer;
-  CompoundTypeRdBuffers comp_rd_buffer;
-
-  CONV_BUF_TYPE *tmp_conv_dst;
-  uint8_t *tmp_obmc_bufs[2];
-
-  FRAME_CONTEXT *row_ctx;
-  // This context will be used to update color_map_cdf pointer which would be
-  // used during pack bitstream. For single thread and tile-multithreading case
-  // this ponter will be same as xd->tile_ctx, but for the case of row-mt:
-  // xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
-  // to the accurate tile context.
-  FRAME_CONTEXT *tile_pb_ctx;
-
-  struct inter_modes_info *inter_modes_info;
-
-  // Contains the hash table, hash function, and buffer used for intrabc
-  IntraBCHashInfo intrabc_hash_info;
-
-  // These define limits to motion vector components to prevent them
-  // from extending outside the UMV borders
-  FullMvLimits mv_limits;
-
-  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  // Force the coding block to skip transform and quantization.
-  int force_skip;
-  int skip_cost[SKIP_CONTEXTS][2];
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Partition Costs
+   ****************************************************************************/
+  /**@{*/
+  //! Cost for coding the partition.
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+  /**@}*/
 
-  int skip_mode;  // 0: off; 1: on
-  int skip_mode_cost[SKIP_CONTEXTS][2];
+  /*****************************************************************************
+   * \name Intra Costs: General
+   ****************************************************************************/
+  /**@{*/
+  //! Luma mode cost for inter frame.
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  //! Luma mode cost for intra frame.
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  //! Chroma mode cost
+  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+  //! filter_intra_cost
+  int filter_intra_cost[BLOCK_SIZES_ALL][2];
+  //! filter_intra_mode_cost
+  int filter_intra_mode_cost[FILTER_INTRA_MODES];
+  //! angle_delta_cost
+  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
 
-  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
-  LV_MAP_EOB_COST eob_costs[7][2];
-  uint16_t cb_offset;
+  //! Rate rate associated with each alpha codeword
+  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+  /**@}*/
 
-  // mode costs
-  int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+  /*****************************************************************************
+   * \name Intra Costs: Screen Contents
+   ****************************************************************************/
+  /**@{*/
+  //! intrabc_cost
+  int intrabc_cost[2];
 
-  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  //! palette_y_size_cost
+  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_uv_size_cost
+  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_y_color_cost
+  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                          [PALETTE_COLORS];
+  //! palette_uv_color_cost
+  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                           [PALETTE_COLORS];
+  //! palette_y_mode_cost
+  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+  //! palette_uv_mode_cost
+  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: MV Modes
+   ****************************************************************************/
+  /**@{*/
+  //! skip_mode_cost
+  int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+  //! newmv_mode_cost
   int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  //! zeromv_mode_cost
   int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+  //! refmv_mode_cost
   int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+  //! drl_mode_cost0
   int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+  /**@}*/
 
-  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+  /*****************************************************************************
+   * \name Inter Costs: Ref Frame Types
+   ****************************************************************************/
+  /**@{*/
+  //! single_ref_cost
   int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  //! comp_inter_cost
+  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+  //! comp_ref_type_cost
   int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
                         [CDF_SIZE(COMP_REFERENCE_TYPES)];
+  //! uni_comp_ref_cost
   int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
                        [CDF_SIZE(2)];
-  // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or
-  // GOLDEN_FRAME) in bidir-comp mode.
+  /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode
+   *
+   * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME.
+   */
   int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
-  // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or
-  // BWDREF_FRAME) in bidir-comp mode.
+  /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode
+   *
+   * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME.
+   */
   int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Compound Types
+   ****************************************************************************/
+  /**@{*/
+  //! intra_inter_cost
+  int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+  //! inter_compound_mode_cost
   int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  //! compound_type_cost
   int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+  //! wedge_idx_cost
   int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+  //! interintra_cost
   int interintra_cost[BLOCK_SIZE_GROUPS][2];
+  //! wedge_interintra_cost
   int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+  //! interintra_mode_cost
   int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Compound Masks
+   ****************************************************************************/
+  /**@{*/
+  //! comp_idx_cost
+  int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+  //! comp_group_idx_cost
+  int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Motion Modes/Filters
+   ****************************************************************************/
+  /**@{*/
+  //! motion_mode_cost
   int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+  //! motion_mode_cost1
   int motion_mode_cost1[BLOCK_SIZES_ALL][2];
-  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
-  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int filter_intra_cost[BLOCK_SIZES_ALL][2];
-  int filter_intra_mode_cost[FILTER_INTRA_MODES];
+  //! switchable_interp_costs
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
-  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
-  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
-  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                          [PALETTE_COLORS];
-  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                           [PALETTE_COLORS];
-  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
-  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
-  // The rate associated with each alpha codeword
-  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Txfm Mode Costs
+   ****************************************************************************/
+  /**@{*/
+  //! skip_txfm_cost
+  int skip_txfm_cost[SKIP_CONTEXTS][2];
+  //! tx_size_cost
   int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+  //! txfm_partition_cost
   int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+  //! inter_tx_type_costs
   int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  //! intra_tx_type_costs
   int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
                          [TX_TYPES];
-  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Restoration Mode Costs
+   ****************************************************************************/
+  /**@{*/
+  //! switchable_restore_cost
   int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+  //! wiener_restore_cost
   int wiener_restore_cost[2];
+  //! sgrproj_restore_cost
   int sgrproj_restore_cost[2];
-  int intrabc_cost[2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Segmentation Mode Costs
+   ****************************************************************************/
+  /**@{*/
+  //! tmp_pred_cost
+  int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2];
+  //! spatial_pred_cost
+  int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS];
+  /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Encoding Costs
+   * Here are the entropy costs needed to encode a given mv.
+   * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that holds
+   * the memory for holding the mv cost. But since the motion vectors can be
+   * negative, we shift them to the middle and store the resulting pointer in
+   * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref
+   * mv_cost_stack points to the \ref nmv_cost with the mv precision we are
+   * currently working with. In essence, only \ref mv_cost_stack is needed for
+   * motion search, the other can be considered private.
+   ****************************************************************************/
+  /**@{*/
+  //! Costs for coding the zero components.
+  int nmv_joint_cost[MV_JOINTS];
+
+  //! Allocates memory for 1/4-pel motion vector costs.
+  int nmv_cost_alloc[2][MV_VALS];
+  //! Allocates memory for 1/8-pel motion vector costs.
+  int nmv_cost_hp_alloc[2][MV_VALS];
+  //! Points to the middle of \ref nmv_cost_alloc
+  int *nmv_cost[2];
+  //! Points to the middle of \ref nmv_cost_hp_alloc
+  int *nmv_cost_hp[2];
+  //! Points to the nmv_cost_hp in use.
+  int **mv_cost_stack;
+  /**@}*/
+} MvCosts;
 
-  // Used to store sub partition's choices.
-  MV pred_mv[REF_FRAMES];
+/*! \brief Holds mv costs for intrabc.
+ */
+typedef struct {
+  /*! Costs for coding the joint mv. */
+  int joint_mv[MV_JOINTS];
 
-  // Ref frames that are selected by square partition blocks within a super-
-  // block, in MI resolution. They can be used to prune ref frames for
-  // rectangular blocks.
-  int picked_ref_frames_mask[32 * 32];
+  /*! \brief Cost of transmitting the actual motion vector.
+   *  dv_costs_alloc[0][i] is the cost of motion vector with horizontal
+   * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of
+   * motion vector with vertical component (mv_col) equal to i - MV_MAX.
+   */
+  int dv_costs_alloc[2][MV_VALS];
 
-  // use default transform and skip transform type search for intra modes
-  int use_default_intra_tx_type;
-  // use default transform and skip transform type search for inter modes
-  int use_default_inter_tx_type;
-  int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
-  int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+  /*! Points to the middle of \ref dv_costs_alloc. */
+  int *dv_costs[2];
+} IntraBCMVCosts;
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+  //! Costs for coding the coefficients.
+  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+  //! Costs for coding the eobs.
+  LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+  //! Number of samples.
+  int num;
+  //! Sample locations in current frame.
+  int pts[16];
+  //! Sample location in the reference frame.
+  int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+  kZeroSad = 0,
+  kLowSad = 1,
+  kMedSad = 2,
+  kHighSad = 3
+} SOURCE_SAD;
+
+typedef struct {
+  //! SAD levels in non-rd path for var-based part and inter-mode search
+  SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in rd-path for var-based part qindex thresholds
+  SOURCE_SAD source_sad_rd;
+  int lighting_change;
+  int low_sumdiff;
+} CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+  uint16_t abs_dx_abs_dy_sum;
+  int8_t hist_bin_idx;
+  bool is_dx_zero;
+} PixelLevelGradientInfo;
+
+// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks.
+typedef struct {
+  double log_var;
+  int var;
+} Block4x4VarInfo;
+
+#ifndef NDEBUG
+typedef struct SetOffsetsLoc {
+  int mi_row;
+  int mi_col;
+  BLOCK_SIZE bsize;
+} SetOffsetsLoc;
+#endif  // NDEBUG
+
+/*!\endcond */
+
+/*! \brief Encoder's parameters related to the current coding block.
+ *
+ * This struct contains most of the information the encoder needs to encode the
+ * current coding block. This includes the src and pred buffer, a copy of the
+ * decoder's view of the current block, the txfm coefficients. This struct also
+ * contains various buffers and data used to speed up the encoding process.
+ */
+typedef struct macroblock {
+  /*****************************************************************************
+   * \name Source, Buffers and Decoder
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Each of the encoding plane.
+   *
+   * An array holding the src buffer for each of plane of the current block. It
+   * also contains the txfm and quantized txfm coefficients.
+   */
+  struct macroblock_plane plane[MAX_MB_PLANE];
+
+  /*! \brief Decoder's view of current coding block.
+   *
+   * Contains the encoder's copy of what the decoder sees in the current block.
+   * Most importantly, this struct contains pointers to mbmi that is used in
+   * final bitstream packing.
+   */
+  MACROBLOCKD e_mbd;
+
+  /*! \brief Derived coding information.
+   *
+   * Contains extra information not transmitted in the bitstream but are
+   * derived. For example, this contains the stack of ref_mvs.
+   */
+  MB_MODE_INFO_EXT mbmi_ext;
+
+  /*! \brief Finalized mbmi_ext for the whole frame.
+   *
+   * Contains the finalized info in mbmi_ext that gets used at the frame level
+   * for bitstream packing.
+   */
+  MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+
+  //! Entropy context for the current row.
+  FRAME_CONTEXT *row_ctx;
+  /*! \brief Entropy context for the current tile.
+   *
+   * This context will be used to update color_map_cdf pointer which would be
+   * used during pack bitstream. For single thread and tile-multithreading case
+   * this pointer will be same as xd->tile_ctx, but for the case of row-mt:
+   * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
+   * to the accurate tile context.
+   */
+  FRAME_CONTEXT *tile_pb_ctx;
+
+  /*! \brief Buffer of transformed coefficients
+   *
+   * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized
+   * coefficients. This is here to conveniently copy the best coefficients to
+   * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a
+   * superblock level, we need to combine it with cb_offset to get the proper
+   * position for the current coding block.
+   */
+  CB_COEFF_BUFFER *cb_coef_buff;
+  //! Offset of current coding block's coeff buffer relative to the sb.
+  uint16_t cb_offset[PLANE_TYPES];
+
+  //! Modified source and masks used for fast OBMC search.
+  OBMCBuffer obmc_buffer;
+  //! Buffer to store the best palette map.
+  PALETTE_BUFFER *palette_buffer;
+  //! Buffer used for compound_type_rd().
+  CompoundTypeRdBuffers comp_rd_buffer;
+  //! Buffer to store convolution during averaging process in compound mode.
+  CONV_BUF_TYPE *tmp_conv_dst;
+
+  /*! \brief Temporary buffer to hold prediction.
+   *
+   * Points to a buffer that is used to hold temporary prediction results. This
+   * is used in two ways:
+   * - This is a temporary buffer used to ping-pong the prediction in
+   *   handle_inter_mode.
+   * - xd->tmp_obmc_bufs also points to this buffer, and is used in ombc
+   *   prediction.
+   */
+  uint8_t *tmp_pred_bufs[2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Rdopt Costs
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Quantization index for the current partition block.
+   *
+   * This is used to as the index to find quantization parameter for luma and
+   * chroma transformed coefficients.
+   */
+  int qindex;
+
+  /*! \brief Difference between frame-level qindex and current qindex.
+   *
+   *  This is used to track whether a non-zero delta for qindex is used at least
+   *  once in the current frame.
+   */
+  int delta_qindex;
+
+  /*! \brief Rate-distortion multiplier.
+   *
+   * The rd multiplier used to determine the rate-distortion trade-off. This is
+   * roughly proportional to the inverse of q-index for a given frame, but this
+   * can be manipulated for better rate-control. For example, in tune_ssim
+   * mode, this is scaled by a factor related to the variance of the current
+   * block.
+   */
+  int rdmult;
+
+  //! Intra only, per sb rd adjustment.
+  int intra_sb_rdmult_modifier;
+
+  //! Superblock level distortion propagation factor.
+  double rb;
+
+  //! Energy in the current source coding block. Used to calculate \ref rdmult
+  int mb_energy;
+  //! Energy in the current source superblock. Used to calculate \ref rdmult
+  int sb_energy_level;
+
+  //! The rate needed to signal a mode to the bitstream.
+  ModeCosts mode_costs;
+
+  //! The rate needed to encode a new motion vector to the bitstream and some
+  //! multipliers for motion search.
+  MvCosts *mv_costs;
+
+  /*! The rate needed to encode a new motion vector to the bitstream in intrabc
+   *  mode.
+   */
+  IntraBCMVCosts *dv_costs;
+
+  //! The rate needed to signal the txfm coefficients to the bitstream.
+  CoeffCosts coeff_costs;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Rate to Distortion Multipliers
+   ****************************************************************************/
+  /**@{*/
+  //! A multiplier that converts mv cost to l2 error.
+  int errorperbit;
+  //! A multiplier that converts mv cost to l1 error.
+  int sadperbit;
+  /**@}*/
+
+  /******************************************************************************
+   * \name Segmentation
+   *****************************************************************************/
+  /**@{*/
+  /*! \brief Skip mode for the segment
+   *
+   * A syntax element of the segmentation mode. In skip_block mode, all mvs are
+   * set 0 and all txfms are skipped.
+   */
+  int seg_skip_block;
+
+  /*! \brief Number of segment 1 blocks
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 1.
+   */
+  int actual_num_seg1_blocks;
+
+  /*!\brief Number of segment 2 blocks
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 2.
+   */
+  int actual_num_seg2_blocks;
+
+  /*!\brief Number of zero motion vectors
+   */
+  int cnt_zeromv;
+
+  /*!\brief Flag to force zeromv-skip block, for nonrd path.
+   */
+  int force_zeromv_skip;
+
+  /*! \brief Previous segment id for which qmatrices were updated.
+   * This is used to bypass setting of qmatrices if no change in qindex.
+   */
+  int prev_segment_id;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Superblock
+   ****************************************************************************/
+  /**@{*/
+  //! Information on a whole superblock level.
+  // TODO(chiyotsai@google.com): Refactor this out of macroblock
+  SuperBlockEnc sb_enc;
+
+  /*! \brief Characteristics of the current superblock.
+   *
+   *  Characteristics like whether the block has high sad, low sad, etc. This is
+   *  only used by av1 realtime mode.
+   */
+  CONTENT_STATE_SB content_state_sb;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Reference Frame Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Sum absolute distortion of the predicted mv for each ref frame.
+   *
+   * This is used to measure how viable a reference frame is.
+   */
+  int pred_mv_sad[REF_FRAMES];
+  /*! \brief The minimum of \ref pred_mv_sad.
+   *
+   * Index 0 stores the minimum \ref pred_mv_sad across past reference frames.
+   * Index 1 stores the minimum \ref pred_mv_sad across future reference frames.
+   */
+  int best_pred_mv_sad[2];
+  //! The sad of the 1st mv ref (nearest).
+  int pred_mv0_sad[REF_FRAMES];
+  //! The sad of the 2nd mv ref (near).
+  int pred_mv1_sad[REF_FRAMES];
+
+  /*! \brief Disables certain ref frame pruning based on tpl.
+   *
+   * Determines whether a given ref frame is "good" based on data from the TPL
+   * model. If so, this stops selective_ref frame from pruning the given ref
+   * frame at block level.
+   */
+  uint8_t tpl_keep_ref_frame[REF_FRAMES];
+
+  /*! \brief Warp motion samples buffer.
+   *
+   * Store the motion samples used for warp motion.
+   */
+  WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES];
+
+  /*! \brief Reference frames picked by the square subblocks in a superblock.
+   *
+   * Keeps track of ref frames that are selected by square partition blocks
+   * within a superblock, in MI resolution. They can be used to prune ref frames
+   * for rectangular blocks.
+   */
+  int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  /*! \brief Prune ref frames in real-time mode.
+   *
+   * Determines whether to prune reference frames in real-time mode. For the
+   * most part, this is the same as nonrd_prune_ref_frame_search in
+   * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively
+   * turned off if the only frame available is GOLDEN_FRAME.
+   */
+  int nonrd_prune_ref_frame_search;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Partition Search
+   ****************************************************************************/
+  /**@{*/
+  //! Stores some partition-search related buffers.
+  PartitionSearchInfo part_search_info;
+
+  /*! \brief Whether to disable some features to force a mode in current block.
+   *
+   * In some cases, our speed features can be overly aggressive and remove all
+   * modes search in the superblock. When this happens, we set
+   * must_find_valid_partition to 1 to reduce the number of speed features, and
+   * recode the superblock again.
+   */
   int must_find_valid_partition;
-  int recalc_luma_mc_data;  // Flag to indicate recalculation of MC data during
-                            // interpolation filter search
-  int prune_mode;
-  uint32_t tx_domain_dist_threshold;
-  int use_transform_domain_distortion;
-  // The likelihood of an edge existing in the block (using partial Canny edge
-  // detection). For reference, 556 is the value returned for a solid
-  // vertical black/white edge.
-  uint16_t edge_strength;
-  // The strongest edge strength seen along the x/y axis.
-  uint16_t edge_strength_x;
-  uint16_t edge_strength_y;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Prediction Mode Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Inter skip mode.
+   *
+   * Skip mode tries to use the closest forward and backward references for
+   * inter prediction. Skip here means to skip transmitting the reference
+   * frames, not to be confused with skip_txfm.
+   */
+  int skip_mode;
+
+  /*! \brief Factors used for rd-thresholding.
+   *
+   * Determines a rd threshold to determine whether to continue searching the
+   * current mode. If the current best rd is already <= threshold, then we skip
+   * the current mode.
+   */
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+  /*! \brief Tracks the winner modes in the current coding block.
+   *
+   * Winner mode is a two-pass strategy to find the best prediction mode. In the
+   * first pass, we search the prediction modes with a limited set of txfm
+   * options, and keep the top modes. These modes are called the winner modes.
+   * In the second pass, we retry the winner modes with more thorough txfm
+   * options.
+   */
+  WinnerModeStats *winner_mode_stats;
+  //! Tracks how many winner modes there are.
+  int winner_mode_count;
+
+  /*! \brief The model used for rd-estimation to avoid txfm
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass find the best transform for the mode.
+   * Determines if one would go with reduced complexity transform block
+   * search model to select prediction modes, or full complexity model
+   * to select transform kernel.
+   */
+  TXFM_RD_MODEL rd_model;
+
+  /*! \brief Stores the inter mode information needed to build an rd model.
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass find the best transform for the mode.
+   */
+  // TODO(any): try to consolidate this speed feature with winner mode
+  // processing.
+  struct inter_modes_info *inter_modes_info;
+
+  //! How to blend the compound predictions.
   uint8_t compound_idx;
 
-  // [Saved stat index]
+  //! A caches of results of compound type search so they can be reused later.
   COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+  //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
   int comp_rd_stats_idx;
 
-  CB_COEFF_BUFFER *cb_coef_buff;
+  /*! \brief Whether to recompute the luma prediction.
+   *
+   * In interpolation search, we can usually skip recalculating the luma
+   * prediction because it is already calculated by a previous predictor. This
+   * flag signifies that some modes might have been skipped, so we need to
+   * rebuild the prediction.
+   */
+  int recalc_luma_mc_data;
+
+  /*! \brief Data structure to speed up intrabc search.
+   *
+   * Contains the hash table, hash function, and buffer used for intrabc.
+   */
+  IntraBCHashInfo intrabc_hash_info;
 
-  // Threshold used to decide the applicability of R-D optimization of
-  // quantized coeffs
-  uint32_t coeff_opt_dist_threshold;
+  /*! \brief Whether to reuse the mode stored in mb_mode_cache. */
+  int use_mb_mode_cache;
+  /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
+   *  \ref av1_rd_pick_inter_mode. */
+  const MB_MODE_INFO *mb_mode_cache;
+  /*! \brief Pointer to the buffer which caches gradient information.
+   *
+   * Pointer to the array of structures to store gradient information of each
+   * pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level
+   * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+   */
+  PixelLevelGradientInfo *pixel_gradient_info;
+  /*! \brief Flags indicating the availability of cached gradient info. */
+  bool is_sb_gradient_cached[PLANE_TYPES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name MV Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Context used to determine the initial step size in motion search.
+   *
+   * This context is defined as the \f$l_\inf\f$ norm of the best ref_mvs for
+   * each frame.
+   */
+  unsigned int max_mv_context[REF_FRAMES];
 
-#if !CONFIG_REALTIME_ONLY
-  int quad_tree_idx;
-  int cnn_output_valid;
-  float cnn_buffer[CNN_OUT_BUF_SIZE];
-  float log_q;
-#endif
-  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  // 0 - 128x128
-  // 1-2 - 128x64
-  // 3-4 - 64x128
-  // 5-8 - 64x64
-  // 9-16 - 64x32
-  // 17-24 - 32x64
-  // 25-40 - 32x32
-  // 41-104 - 16x16
-  uint8_t variance_low[105];
-  uint8_t content_state_sb;
-  // Strong color activity detection. Used in REALTIME coding mode to enhance
-  // the visual quality at the boundary of moving color objects.
+  /*! \brief Limit for the range of motion vectors.
+   *
+   * These define limits to motion vector components to prevent them from
+   * extending outside the UMV borders
+   */
+  FullMvLimits mv_limits;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Txfm Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Parameters that control how motion search is done.
+   *
+   * Stores various txfm search related parameters such as txfm_type, txfm_size,
+   * trellis eob search, etc.
+   */
+  TxfmSearchParams txfm_search_params;
+
+  /*! \brief Results of the txfm searches that have been done.
+   *
+   * Caches old txfm search results and keeps the current txfm decisions to
+   * facilitate rdopt.
+   */
+  TxfmSearchInfo txfm_search_info;
+
+  /*! \brief Whether there is a strong color activity.
+   *
+   * Used in REALTIME coding mode to enhance the visual quality at the boundary
+   * of moving color objects.
+   */
+  uint8_t color_sensitivity_sb[2];
+  //! Color sensitivity flag for the coding block.
   uint8_t color_sensitivity[2];
-  int nonrd_prune_ref_frame_search;
-
-  // Used to control the tx size search evaluation for mode processing
-  // (normal/winner mode)
-  int tx_size_search_method;
-  // This tx_mode_search_type is used internally by the encoder, and is not
-  // written to the bitstream. It determines what kind of tx_mode should be
-  // searched. For example, we might set it to TX_MODE_LARGEST to find a good
-  // candidate, then use TX_MODE_SELECT on it
-  TX_MODE tx_mode_search_type;
+  /**@}*/
 
-  // Used to control aggressiveness of skip flag prediction for mode processing
-  // (normal/winner mode)
-  unsigned int predict_skip_level;
-
-  // Copy out this SB's TPL block stats.
-  int valid_cost_b;
-  int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
-  int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
-  int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]
-             [INTER_REFS_PER_FRAME];
-  int cost_stride;
-
-  // The type of mv cost used during motion search
-  MV_COST_TYPE mv_cost_type;
-
-  uint8_t search_ref_frame[REF_FRAMES];
-
-#if CONFIG_AV1_HIGHBITDEPTH
-  void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
-  void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
-                       int eob);
-#else
-  void (*fwd_txfm4x4)(const int16_t *input, int16_t *output, int stride);
-  void (*inv_txfm_add)(const int16_t *input, uint8_t *dest, int stride,
-                       int eob);
+  /*****************************************************************************
+   * \name Misc
+   ****************************************************************************/
+  /**@{*/
+  //! Variance of the source frame.
+  unsigned int source_variance;
+  //! SSE of the current predictor.
+  unsigned int pred_sse[REF_FRAMES];
+  //! Prediction for ML based partition.
+#if CONFIG_RT_ML_PARTITIONING
+  DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
 #endif
-};
-
-// Only consider full SB, MC_FLOW_BSIZE_1D = 16.
-static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) {
-  switch (bsize) {
-    case BLOCK_64X64: return 16;
-    case BLOCK_128X128: return 64;
-    default: assert(0);
+  /**@}*/
+
+  /*! \brief NONE partition evaluated for merge.
+   *
+   * In variance based partitioning scheme, NONE & SPLIT partitions are
+   * evaluated to check the SPLIT can be merged as NONE. This flag signifies the
+   * partition is evaluated in the scheme.
+   */
+  int try_merge_partition;
+
+  /*! \brief Pointer to buffer which caches sub-block variances in a superblock.
+   *
+   *  Pointer to the array of structures to store source variance information of
+   *  each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+   *  store source variance and log of source variance of each 4x4 sub-block.
+   */
+  Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+#ifndef NDEBUG
+  /*! \brief A hash to make sure av1_set_offsets is called */
+  SetOffsetsLoc last_set_offsets_loc;
+#endif  // NDEBUG
+} MACROBLOCK;
+#undef SINGLE_REF_MODES
+
+/*!\cond */
+// Zeroes out 'n_stats' elements in the array x->winner_mode_stats.
+// It only zeroes out what is necessary in 'color_index_map' (just the block
+// size, not the whole array).
+static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats,
+                                          WinnerModeStats *stats) {
+  // When winner mode stats are not required, the memory allocation is avoided
+  // for x->winner_mode_stats. The stats pointer will be NULL in such cases.
+  if (stats == NULL) return;
+
+  const int block_height = block_size_high[bsize];
+  const int block_width = block_size_wide[bsize];
+  for (int i = 0; i < n_stats; ++i) {
+    WinnerModeStats *const stat = &stats[i];
+    memset(&stat->mbmi, 0, sizeof(stat->mbmi));
+    memset(&stat->rd_cost, 0, sizeof(stat->rd_cost));
+    memset(&stat->rd, 0, sizeof(stat->rd));
+    memset(&stat->rate_y, 0, sizeof(stat->rate_y));
+    memset(&stat->rate_uv, 0, sizeof(stat->rate_uv));
+    // Do not reset the whole array as it is CPU intensive.
+    memset(&stat->color_index_map, 0,
+           block_width * block_height * sizeof(stat->color_index_map[0]));
+    memset(&stat->mode_index, 0, sizeof(stat->mode_index));
   }
-  return -1;
 }
 
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
@@ -523,7 +1314,7 @@ static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
 
 static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
                                      const MB_MODE_INFO *mbmi) {
-  return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+  return is_rect_tx_allowed_bsize(mbmi->bsize) &&
          !xd->lossless[mbmi->segment_id];
 }
 
@@ -538,36 +1329,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
   return depth;
 }
 
-static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx,
                                 int skip) {
   if (skip)
-    x->blk_skip[blk_idx] |= 1UL << plane;
+    txb_skip[blk_idx] |= 1UL << plane;
   else
-    x->blk_skip[blk_idx] &= ~(1UL << plane);
+    txb_skip[blk_idx] &= ~(1UL << plane);
 #ifndef NDEBUG
   // Set chroma planes to uninitialized states when luma is set to check if
   // it will be set later
   if (plane == 0) {
-    x->blk_skip[blk_idx] |= 1UL << (1 + 4);
-    x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+    txb_skip[blk_idx] |= 1UL << (1 + 4);
+    txb_skip[blk_idx] |= 1UL << (2 + 4);
   }
 
   // Clear the initialization checking bit
-  x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+  txb_skip[blk_idx] &= ~(1UL << (plane + 4));
 #endif
 }
 
-static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) {
 #ifndef NDEBUG
   // Check if this is initialized
-  assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+  assert(!(txb_skip[blk_idx] & (1UL << (plane + 4))));
 
   // The magic number is 0x77, this is to test if there is garbage data
-  assert((x->blk_skip[blk_idx] & 0x88) == 0);
+  assert((txb_skip[blk_idx] & 0x88) == 0);
 #endif
-  return (x->blk_skip[blk_idx] >> plane) & 1;
+  return (txb_skip[blk_idx] >> plane) & 1;
 }
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/blockiness.c b/media/libaom/src/av1/encoder/blockiness.c
index f7cff9e532..6ad2ddaf25 100644
--- a/media/libaom/src/av1/encoder/blockiness.c
+++ b/media/libaom/src/av1/encoder/blockiness.c
@@ -18,7 +18,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
@@ -125,7 +124,6 @@ double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
                           int height) {
   double blockiness = 0;
   int i, j;
-  aom_clear_system_state();
   for (i = 0; i < height;
        i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
     for (j = 0; j < width; j += 4) {
diff --git a/media/libaom/src/av1/encoder/cnn.c b/media/libaom/src/av1/encoder/cnn.c
index 5d8a236a06..83e2c457b6 100644
--- a/media/libaom/src/av1/encoder/cnn.c
+++ b/media/libaom/src/av1/encoder/cnn.c
@@ -11,10 +11,11 @@
 
 #include <assert.h>
 #include <math.h>
+#include <stdbool.h>
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "av1/encoder/cnn.h"
 #include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
 
 #define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
 
@@ -55,13 +56,14 @@ static void free_tensor(TENSOR *tensor) {
   }
 }
 
-static void realloc_tensor(TENSOR *tensor, int channels, int width,
+static bool realloc_tensor(TENSOR *tensor, int channels, int width,
                            int height) {
   const int newallocsize = channels * width * height;
   if (tensor->allocsize < newallocsize) {
     free_tensor(tensor);
     tensor->buf[0] =
         (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
+    if (!tensor->buf[0]) return false;
     tensor->allocsize = newallocsize;
   }
   tensor->width = width;
@@ -70,6 +72,7 @@ static void realloc_tensor(TENSOR *tensor, int channels, int width,
   tensor->channels = channels;
   for (int c = 1; c < channels; ++c)
     tensor->buf[c] = &tensor->buf[0][c * width * height];
+  return true;
 }
 
 static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
@@ -115,7 +118,7 @@ static void swap_tensor(TENSOR *t1, TENSOR *t2) {
 
 // The concatenated tensor goes into dst with first the channels in
 // original dst followed by the channels in the src
-static void concat_tensor(const TENSOR *src, TENSOR *dst) {
+static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
   assert(src->width == dst->width);
   assert(src->height == dst->height);
 
@@ -126,7 +129,7 @@ static void concat_tensor(const TENSOR *src, TENSOR *dst) {
     TENSOR t;
     init_tensor(&t);
     // allocate new buffers and copy first the dst channels
-    realloc_tensor(&t, channels, dst->width, dst->height);
+    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
     copy_tensor(dst, dst->channels, 0, &t);
     // Swap the tensors and free the old buffers
     swap_tensor(dst, &t);
@@ -136,6 +139,7 @@ static void concat_tensor(const TENSOR *src, TENSOR *dst) {
     dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
   // Copy the channels in src after the first dst_channels channels.
   copy_tensor(src, src->channels, dst_channels, dst);
+  return true;
 }
 
 int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
@@ -147,10 +151,12 @@ int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
           t1->height == t2->height);
 }
 
-static void find_layer_output_size(int in_width, int in_height,
-                                   const CNN_LAYER_CONFIG *layer_config,
-                                   int *out_width, int *out_height) {
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+                                    const CNN_LAYER_CONFIG *layer_config,
+                                    int *out_width, int *out_height) {
   if (!layer_config->deconvolve) {
+    assert(layer_config->skip_width > 0);
+    assert(layer_config->skip_height > 0);
     switch (layer_config->pad) {
       case PADDING_SAME_ZERO:
       case PADDING_SAME_REPLICATE:
@@ -260,8 +266,8 @@ void av1_find_cnn_output_size(int in_width, int in_height,
       }
     }
 
-    find_layer_output_size(i_width[branch], i_height[branch], layer_config,
-                           &o_width, &o_height);
+    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
+                                   layer_config, &o_width, &o_height);
     i_width[branch] = o_width;
     i_height[branch] = o_height;
 
@@ -324,7 +330,7 @@ void av1_cnn_activate_c(float **output, int channels, int width, int height,
   }
 }
 
-static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
+static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                            const CNN_LAYER_CONFIG *layer_config,
                                            int branch, TENSOR branch_output[]) {
   const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
@@ -336,62 +342,318 @@ static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
       int copy_channels = branch_config->channels_to_copy > 0
                               ? branch_config->channels_to_copy
                               : layer_active_tensor->channels;
-      realloc_tensor(&branch_output[b], copy_channels,
-                     layer_active_tensor->width, layer_active_tensor->height);
+      if (!realloc_tensor(&branch_output[b], copy_channels,
+                          layer_active_tensor->width,
+                          layer_active_tensor->height)) {
+        return false;
+      }
       copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
     }
   }
+  return true;
 }
 
-static int convolve_layer(void *arg1, void *arg2) {
-  const CONVOLVE_OPS *convolve_ops = arg1;
-  (void)arg2;
-  av1_cnn_convolve(
-      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
-      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
-      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
-  return 1;
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_SAME_ZERO.
+static void convolve_maxpool_padding_zero(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep, const int filter_width_half,
+    const int filter_height_half) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {
+    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii = hh + l - filter_height_half;
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj = ww + m - filter_width_half;
+                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+                    continue;
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
 }
 
-static void convolve_layer_mt(const float **input, int in_width, int in_height,
-                              int in_stride,
-                              const CNN_LAYER_CONFIG *layer_config,
-                              const CNN_THREAD_DATA *thread_data,
-                              float **output, int out_stride) {
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  const int num_workers = thread_data->num_workers;
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
+static void convolve_maxpool_padding_replicate(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep, const int filter_width_half,
+    const int filter_height_half) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {
+    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii =
+                    CLAMPINDEX(hh + l - filter_height_half, in_height);
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj =
+                      CLAMPINDEX(ww + m - filter_width_half, in_width);
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
 
-  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
-  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
-    AVxWorker *const worker = &thread_data->workers[th];
-    winterface->reset(worker);
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_VALID.
+static void convolve_maxpool_padding_valid(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {
+    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+         h += layer_config->skip_height, ++u) {
+      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
+           w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii = hh + l;
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj = ww + m;
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
 
-    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
-                                 in_stride,  layer_config, output,
-                                 out_stride, th,           num_workers };
-    convolve_ops[th] = convolve_op;
-    worker->hook = convolve_layer;
-    worker->data1 = &(convolve_ops[th]);
-    worker->data2 = NULL;
+// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
+// equal to 1.
+static void convolve_element_wise(const float **input, int in_width,
+                                  int in_height, int in_stride,
+                                  const CNN_LAYER_CONFIG *const layer_config,
+                                  float **output, int out_stride, int start_idx,
+                                  int step) {
+  const int start_h = get_start_shift_convolve(
+      in_height, layer_config->filter_height, layer_config->skip_height);
+  const int start_w =
+      get_start_shift_convolve(in_width, layer_config->filter_width,
+                               layer_config->skip_width) +
+      start_idx * layer_config->skip_width;
+  const int out_w_step = AOMMAX(step, 1);
+  const int in_w_step = layer_config->skip_width * out_w_step;
+  for (int i = 0; i < layer_config->out_channels; ++i) {
+    for (int h = start_h, u = 0; h < in_height;
+         h += layer_config->skip_height, ++u) {
+      const int in_h = h * in_stride;
+      const int out_h = u * out_stride + start_idx;
+      for (int w = start_w, out_index = out_h; w < in_width;
+           w += in_w_step, out_index += out_w_step) {
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          sum += layer_config->weights[k * layer_config->out_channels + i] *
+                 input[k][in_h + w];
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
 
-    // Start convolving.
-    if (th == num_workers - 1) {
-      winterface->execute(worker);
-    } else {
-      winterface->launch(worker);
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_SAME_ZERO.
+static void convolve_no_maxpool_padding_zero(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int filter_width_half,
+    const int filter_height_half, const int ii_shift, const int jj_shift,
+    const int channel_step) {
+  const int start_h = get_start_shift_convolve(
+      in_height, layer_config->filter_height, layer_config->skip_height);
+  const int start_w = get_start_shift_convolve(
+      in_width, layer_config->filter_width, layer_config->skip_width);
+  const int end_ii_shift = filter_height_half + 1;
+  const int end_jj_shift = filter_width_half + 1;
+  // *_filter_margin stores the number of pixels along a dimension in the
+  // intersection of the complement of the image in the extended image
+  // and the filter.
+  const int top_filter_margin = layer_config->filter_width * ii_shift;
+  const int right_filter_margin = end_jj_shift - in_width;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    for (int h = start_h, u = 0; h < in_height;
+         h += layer_config->skip_height, ++u) {
+      const int out_h = u * out_stride;
+      const int top_cstep =
+          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
+              cstep +
+          i;
+      const int start_ii = AOMMAX(0, h - ii_shift);
+      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
+      for (int w = start_w, out_index = out_h; w < in_width;
+           w += layer_config->skip_width, ++out_index) {
+        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
+        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
+        const int start_jj = AOMMAX(0, w - jj_shift);
+        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + top_cstep;
+          for (int ii = start_ii; ii < end_ii; ++ii) {
+            off += left_cstep;
+            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
+              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+            }
+            off += right_cstep;
+          }
+        }
+        output[i][out_index] = sum;
+      }
     }
   }
+}
 
-  // Wait until all workers have finished.
-  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
-    winterface->sync(&thread_data->workers[th]);
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_SAME_REPLICATE.
+static void convolve_no_maxpool_padding_replicate(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
+    const int channel_step) {
+  // h and w are shifted to an offset coordinate system to reduce in-loop
+  // computation.
+  const int start_h =
+      get_start_shift_convolve(in_height, layer_config->filter_height,
+                               layer_config->skip_height) -
+      ii_shift;
+  const int start_w =
+      get_start_shift_convolve(in_width, layer_config->filter_width,
+                               layer_config->skip_width) -
+      jj_shift;
+  const int end_h = in_height - ii_shift;
+  const int end_w = in_width - jj_shift;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    for (int h = start_h, u = 0; h < end_h;
+         h += layer_config->skip_height, ++u) {
+      const int out_h = u * out_stride;
+      const int upper_ii_index = layer_config->filter_height + h;
+      for (int w = start_w, out_index = out_h; w < end_w;
+           w += layer_config->skip_width, ++out_index) {
+        const int upper_jj_index = layer_config->filter_width + w;
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + i;
+          for (int ii = h; ii < upper_ii_index; ++ii) {
+            const int clamped_ii = CLAMPINDEX(ii, in_height);
+            for (int jj = w; jj < upper_jj_index; ++jj) {
+              const int clamped_jj = CLAMPINDEX(jj, in_width);
+              assert(clamped_ii >= 0 && clamped_ii < in_height &&
+                     clamped_jj >= 0 && clamped_jj < in_width);
+              sum += layer_config->weights[off] *
+                     input[k][clamped_ii * in_stride + clamped_jj];
+              off += cstep;
+            }
+          }
+        }
+        output[i][out_index] = sum;
+      }
+    }
   }
 }
 
-void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
-                        int in_stride, const CNN_LAYER_CONFIG *layer_config,
-                        float **output, int out_stride, int start_idx,
-                        int step) {
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_VALID.
+void av1_cnn_convolve_no_maxpool_padding_valid_c(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+         !layer_config->maxpool);
+  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+  assert(layer_config->pad == PADDING_VALID);
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+         h += layer_config->skip_height, ++u) {
+      const int out_h = u * out_stride;
+      const int upper_ii_index = layer_config->filter_height + h;
+      for (int w = 0, out_index = out_h;
+           w < in_width - layer_config->filter_width + 1;
+           w += layer_config->skip_width, ++out_index) {
+        const int upper_jj_index = layer_config->filter_width + w;
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + i;
+          for (int ii = h; ii < upper_ii_index; ++ii) {
+            for (int jj = w; jj < upper_jj_index; ++jj) {
+              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+              off += cstep;
+            }
+          }
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
+
+static void av1_cnn_convolve(const float **input, int in_width, int in_height,
+                             int in_stride,
+                             const CNN_LAYER_CONFIG *layer_config,
+                             float **output, int out_stride, int start_idx,
+                             int step) {
   assert(!layer_config->deconvolve);
   const int cstep = layer_config->in_channels * layer_config->out_channels;
   const int filter_height_half = layer_config->filter_height >> 1;
@@ -402,156 +664,26 @@ void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
       (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
     switch (layer_config->pad) {
       case PADDING_SAME_ZERO:
-        for (int i = 0; i < layer_config->out_channels; ++i) {
-          for (int h = 0, u = 0; h < in_height;
-               h += layer_config->skip_height, ++u) {
-            for (int w = 0, v = 0; w < in_width;
-                 w += layer_config->skip_width, ++v) {
-              for (int hh = h;
-                   hh < AOMMIN(in_height, h + layer_config->skip_height);
-                   ++hh) {
-                for (int ww = w;
-                     ww < AOMMIN(in_width, w + layer_config->skip_width);
-                     ++ww) {
-                  float sum = layer_config->bias[i];
-                  for (int k = 0; k < layer_config->in_channels; ++k) {
-                    int off = k * layer_config->out_channels + i;
-                    for (int l = 0; l < layer_config->filter_height; ++l) {
-                      const int ii = hh + l - filter_height_half;
-                      for (int m = 0; m < layer_config->filter_width;
-                           ++m, off += cstep) {
-                        const int jj = ww + m - filter_width_half;
-                        if (ii < 0 || ii >= in_height || jj < 0 ||
-                            jj >= in_width)
-                          continue;
-                        sum += layer_config->weights[off] *
-                               input[k][ii * in_stride + jj];
-                      }
-                    }
-                  }
-                  const float a = sum;
-                  if (h == hh && w == ww)
-                    output[i][u * out_stride + v] = a;
-                  else
-                    output[i][u * out_stride + v] =
-                        AOMMAX(output[i][u * out_stride + v], a);
-                }
-              }
-            }
-          }
-        }
+        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
+                                      layer_config, output, out_stride, cstep,
+                                      filter_width_half, filter_height_half);
         break;
       case PADDING_SAME_REPLICATE:
-        for (int i = 0; i < layer_config->out_channels; ++i) {
-          for (int h = 0, u = 0; h < in_height;
-               h += layer_config->skip_height, ++u) {
-            for (int w = 0, v = 0; w < in_width;
-                 w += layer_config->skip_width, ++v) {
-              for (int hh = h;
-                   hh < AOMMIN(in_height, h + layer_config->skip_height);
-                   ++hh) {
-                for (int ww = w;
-                     ww < AOMMIN(in_width, w + layer_config->skip_width);
-                     ++ww) {
-                  float sum = layer_config->bias[i];
-                  for (int k = 0; k < layer_config->in_channels; ++k) {
-                    int off = k * layer_config->out_channels + i;
-                    for (int l = 0; l < layer_config->filter_height; ++l) {
-                      const int ii =
-                          CLAMPINDEX(hh + l - filter_height_half, in_height);
-                      for (int m = 0; m < layer_config->filter_width;
-                           ++m, off += cstep) {
-                        const int jj =
-                            CLAMPINDEX(ww + m - filter_width_half, in_width);
-                        assert(ii >= 0 && ii < in_height && jj >= 0 &&
-                               jj < in_width);
-                        sum += layer_config->weights[off] *
-                               input[k][ii * in_stride + jj];
-                      }
-                    }
-                  }
-                  const float a = sum;
-                  if (h == hh && w == ww)
-                    output[i][u * out_stride + v] = a;
-                  else
-                    output[i][u * out_stride + v] =
-                        AOMMAX(output[i][u * out_stride + v], a);
-                }
-              }
-            }
-          }
-        }
+        convolve_maxpool_padding_replicate(
+            input, in_width, in_height, in_stride, layer_config, output,
+            out_stride, cstep, filter_width_half, filter_height_half);
         break;
       case PADDING_VALID:
-        for (int i = 0; i < layer_config->out_channels; ++i) {
-          for (int h = 0, u = 0;
-               h < in_height - layer_config->filter_height + 1;
-               h += layer_config->skip_height, ++u) {
-            for (int w = 0, v = 0;
-                 w < in_width - layer_config->filter_width + 1;
-                 w += layer_config->skip_width, ++v) {
-              for (int hh = h;
-                   hh < AOMMIN(in_height, h + layer_config->skip_height);
-                   ++hh) {
-                for (int ww = w;
-                     ww < AOMMIN(in_width, w + layer_config->skip_width);
-                     ++ww) {
-                  float sum = layer_config->bias[i];
-                  for (int k = 0; k < layer_config->in_channels; ++k) {
-                    int off = k * layer_config->out_channels + i;
-                    for (int l = 0; l < layer_config->filter_height; ++l) {
-                      const int ii = hh + l;
-                      for (int m = 0; m < layer_config->filter_width;
-                           ++m, off += cstep) {
-                        const int jj = ww + m;
-                        assert(ii >= 0 && ii < in_height && jj >= 0 &&
-                               jj < in_width);
-                        sum += layer_config->weights[off] *
-                               input[k][ii * in_stride + jj];
-                      }
-                    }
-                  }
-                  const float a = sum;
-                  if (h == hh && w == ww)
-                    output[i][u * out_stride + v] = a;
-                  else
-                    output[i][u * out_stride + v] =
-                        AOMMAX(output[i][u * out_stride + v], a);
-                }
-              }
-            }
-          }
-        }
+        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
+                                       layer_config, output, out_stride, cstep);
         break;
       default: assert(0 && "Unknown padding type");
     }
   } else {
     // Results in element-wise matrix multiplication.
     if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
-      const int start_h = get_start_shift_convolve(
-          in_height, layer_config->filter_height, layer_config->skip_height);
-      const int start_w =
-          get_start_shift_convolve(in_width, layer_config->filter_width,
-                                   layer_config->skip_width) +
-          start_idx * layer_config->skip_width;
-      const int out_w_step = AOMMAX(step, 1);
-      const int in_w_step = layer_config->skip_width * out_w_step;
-      for (int i = 0; i < layer_config->out_channels; ++i) {
-        for (int h = start_h, u = 0; h < in_height;
-             h += layer_config->skip_height, ++u) {
-          const int in_h = h * in_stride;
-          const int out_h = u * out_stride + start_idx;
-          for (int w = start_w, out_index = out_h; w < in_width;
-               w += in_w_step, out_index += out_w_step) {
-            float sum = layer_config->bias[i];
-            for (int k = 0; k < layer_config->in_channels; ++k) {
-              sum += layer_config->weights[k * layer_config->out_channels + i] *
-                     input[k][in_h + w];
-            }
-            output[i][out_index] = sum;
-          }
-        }
-      }
+      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
+                            output, out_stride, start_idx, step);
       return;
     }
     const int ii_shift =
@@ -559,133 +691,73 @@ void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
     const int jj_shift =
         filter_width_half - (layer_config->filter_width - 1) % 2;
     switch (layer_config->pad) {
-      case PADDING_SAME_ZERO: {
-        const int start_h = get_start_shift_convolve(
-            in_height, layer_config->filter_height, layer_config->skip_height);
-        const int start_w = get_start_shift_convolve(
-            in_width, layer_config->filter_width, layer_config->skip_width);
-        const int end_ii_shift = filter_height_half + 1;
-        const int end_jj_shift = filter_width_half + 1;
-        // *_filter_margin stores the number of pixels along a dimension in the
-        // intersection of the complement of the image in the extended image
-        // and the filter.
-        const int top_filter_margin = layer_config->filter_width * ii_shift;
-        const int right_filter_margin = end_jj_shift - in_width;
-        for (int i = start_idx; i < layer_config->out_channels;
-             i += channel_step) {
-          for (int h = start_h, u = 0; h < in_height;
-               h += layer_config->skip_height, ++u) {
-            const int out_h = u * out_stride;
-            const int top_cstep =
-                AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
-                    cstep +
-                i;
-            const int start_ii = AOMMAX(0, h - ii_shift);
-            const int end_ii = AOMMIN(in_height, h + end_ii_shift);
-            for (int w = start_w, out_index = out_h; w < in_width;
-                 w += layer_config->skip_width, ++out_index) {
-              const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
-              const int right_cstep =
-                  AOMMAX(0, right_filter_margin + w) * cstep;
-              const int start_jj = AOMMAX(0, w - jj_shift);
-              const int end_jj = AOMMIN(in_width, w + end_jj_shift);
-              float sum = layer_config->bias[i];
-              for (int k = 0; k < layer_config->in_channels; ++k) {
-                int off = k * layer_config->out_channels + top_cstep;
-                for (int ii = start_ii; ii < end_ii; ++ii) {
-                  off += left_cstep;
-                  for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
-                    sum += layer_config->weights[off] *
-                           input[k][ii * in_stride + jj];
-                  }
-                  off += right_cstep;
-                }
-              }
-              output[i][out_index] = sum;
-            }
-          }
-        }
+      case PADDING_SAME_ZERO:
+        convolve_no_maxpool_padding_zero(
+            input, in_width, in_height, in_stride, layer_config, output,
+            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
+            ii_shift, jj_shift, channel_step);
         break;
-      }
-      case PADDING_SAME_REPLICATE: {
-        // h and w are shifted to an offset coordinate system to reduce in-loop
-        // computation.
-        const int start_h =
-            get_start_shift_convolve(in_height, layer_config->filter_height,
-                                     layer_config->skip_height) -
-            ii_shift;
-        const int start_w =
-            get_start_shift_convolve(in_width, layer_config->filter_width,
-                                     layer_config->skip_width) -
-            jj_shift;
-        const int end_h = in_height - ii_shift;
-        const int end_w = in_width - jj_shift;
-        for (int i = start_idx; i < layer_config->out_channels;
-             i += channel_step) {
-          for (int h = start_h, u = 0; h < end_h;
-               h += layer_config->skip_height, ++u) {
-            const int out_h = u * out_stride;
-            const int upper_ii_index = layer_config->filter_height + h;
-            for (int w = start_w, out_index = out_h; w < end_w;
-                 w += layer_config->skip_width, ++out_index) {
-              const int upper_jj_index = layer_config->filter_width + w;
-              float sum = layer_config->bias[i];
-              for (int k = 0; k < layer_config->in_channels; ++k) {
-                int off = k * layer_config->out_channels + i;
-                for (int ii = h; ii < upper_ii_index; ++ii) {
-                  const int clamped_ii = CLAMPINDEX(ii, in_height);
-                  for (int jj = w; jj < upper_jj_index; ++jj) {
-                    const int clamped_jj = CLAMPINDEX(jj, in_width);
-                    assert(clamped_ii >= 0 && clamped_ii < in_height &&
-                           clamped_jj >= 0 && clamped_jj < in_width);
-                    sum += layer_config->weights[off] *
-                           input[k][clamped_ii * in_stride + clamped_jj];
-                    off += cstep;
-                  }
-                }
-              }
-              output[i][out_index] = sum;
-            }
-          }
-        }
+      case PADDING_SAME_REPLICATE:
+        convolve_no_maxpool_padding_replicate(
+            input, in_width, in_height, in_stride, layer_config, output,
+            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
         break;
-      }
-      case PADDING_VALID: {
-        for (int i = start_idx; i < layer_config->out_channels;
-             i += channel_step) {
-          for (int h = 0, u = 0;
-               h < in_height - layer_config->filter_height + 1;
-               h += layer_config->skip_height, ++u) {
-            const int out_h = u * out_stride;
-            const int upper_ii_index = layer_config->filter_height + h;
-            for (int w = 0, out_index = out_h;
-                 w < in_width - layer_config->filter_width + 1;
-                 w += layer_config->skip_width, ++out_index) {
-              const int upper_jj_index = layer_config->filter_width + w;
-              float sum = layer_config->bias[i];
-              for (int k = 0; k < layer_config->in_channels; ++k) {
-                int off = k * layer_config->out_channels + i;
-                for (int ii = h; ii < upper_ii_index; ++ii) {
-                  for (int jj = w; jj < upper_jj_index; ++jj) {
-                    assert(ii >= 0 && ii < in_height && jj >= 0 &&
-                           jj < in_width);
-                    sum += layer_config->weights[off] *
-                           input[k][ii * in_stride + jj];
-                    off += cstep;
-                  }
-                }
-              }
-              output[i][out_index] = sum;
-            }
-          }
-        }
+      case PADDING_VALID:
+        av1_cnn_convolve_no_maxpool_padding_valid(
+            input, in_width, in_height, in_stride, layer_config, output,
+            out_stride, start_idx, cstep, channel_step);
         break;
-      }
       default: assert(0 && "Unknown padding type");
     }
   }
 }
 
+static int convolve_layer(void *arg1, void *arg2) {
+  const CONVOLVE_OPS *convolve_ops = arg1;
+  (void)arg2;
+  av1_cnn_convolve(
+      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
+      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
+      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
+  return 1;
+}
+
+static void convolve_layer_mt(const float **input, int in_width, int in_height,
+                              int in_stride,
+                              const CNN_LAYER_CONFIG *layer_config,
+                              const CNN_THREAD_DATA *thread_data,
+                              float **output, int out_stride) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  const int num_workers = thread_data->num_workers;
+  assert(thread_data->workers);
+
+  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
+  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+    AVxWorker *const worker = &thread_data->workers[th];
+    winterface->reset(worker);
+
+    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
+                                 in_stride,  layer_config, output,
+                                 out_stride, th,           num_workers };
+    convolve_ops[th] = convolve_op;
+    worker->hook = convolve_layer;
+    worker->data1 = &(convolve_ops[th]);
+    worker->data2 = NULL;
+
+    // Start convolving.
+    if (th == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait until all workers have finished.
+  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+    winterface->sync(&thread_data->workers[th]);
+  }
+}
+
 static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
   const int dif = AOMMAX(filt_width - stride, 0);
   return dif / 2;
@@ -721,8 +793,8 @@ void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
 
   int out_width = 0;
   int out_height = 0;
-  find_layer_output_size(in_width, in_height, layer_config, &out_width,
-                         &out_height);
+  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
+                                 &out_height);
   switch (layer_config->pad) {
     case PADDING_SAME_ZERO:
       for (int i = 0; i < layer_config->out_channels; ++i) {
@@ -785,7 +857,6 @@ void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                   const int jj =
                       CLAMPINDEX(w / layer_config->skip_width, in_width);
                   assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
-                  continue;
                   sum += layer_config->weights[off] *
                          input[k][ii * in_stride + jj];
                 }
@@ -829,12 +900,13 @@ void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
   }
 }
 
-void av1_cnn_predict_c(const float **input, int in_width, int in_height,
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
                        int in_stride, const CNN_CONFIG *cnn_config,
                        const CNN_THREAD_DATA *thread_data,
                        CNN_MULTI_OUT *output_struct) {
-  TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 };
-  TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 };
+  bool success = false;
+  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
 
   float **output[CNN_MAX_BRANCHES];
   const int *out_chs = output_struct->output_channels;
@@ -871,12 +943,14 @@ void av1_cnn_predict_c(const float **input, int in_width, int in_height,
     }
 
     // Allocate output tensor
-    find_layer_output_size(i_width, i_height, layer_config, &o_width,
-                           &o_height);
+    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
+                                   &o_height);
     const int output_num = layer_config->output_num;
     if (output_num == -1) {  // Non-output layer
-      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
-                     o_height);
+      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
+                          o_height)) {
+        goto Error;
+      }
     } else {  // Output layer
       free_tensor(&tensor2[branch]);
       assign_tensor(&tensor2[branch], output[output_num],
@@ -890,8 +964,10 @@ void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                    !(branch_config->branches_to_combine & (1 << branch))));
 
     if (layer_config->branch_copy_type == BRANCH_INPUT) {
-      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
-                                     tensor2);
+      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
     }
     // Check consistency of input and output channels
     assert(tensor1[branch].channels == layer_config->in_channels);
@@ -918,8 +994,10 @@ void av1_cnn_predict_c(const float **input, int in_width, int in_height,
     }
 
     if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
-      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
-                                     tensor2);
+      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
     }
 
     // Add tensors from other branches if needed
@@ -955,7 +1033,7 @@ void av1_cnn_predict_c(const float **input, int in_width, int in_height,
           if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
             assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
             assert(tensor2[b].channels > 0);
-            concat_tensor(&tensor2[b], &tensor2[branch]);
+            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
           }
         }
       } else {  // Output layer
@@ -985,20 +1063,25 @@ void av1_cnn_predict_c(const float **input, int in_width, int in_height,
     }
 
     if (layer_config->branch_copy_type == BRANCH_COMBINED) {
-      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
-                                     tensor2);
+      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
     }
   }
 
+  success = true;
+Error:
   for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
     free_tensor(&tensor1[b]);
     free_tensor(&tensor2[b]);
   }
+  return success;
 }
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                    int stride, const CNN_CONFIG *cnn_config,
                                    const CNN_THREAD_DATA *thread_data,
                                    CNN_MULTI_OUT *output) {
@@ -1010,6 +1093,7 @@ void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
   float *inputs[CNN_MAX_CHANNELS];
   float *input_ =
       (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+  if (!input_) return false;
   const int in_stride = in_width;
 
   for (int c = 0; c < in_channels; ++c) {
@@ -1044,15 +1128,16 @@ void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
           input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
     }
   }
-  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
-                  cnn_config, thread_data, output);
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
 
   aom_free(input_);
+  return success;
 }
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                           int stride,
                                           const CNN_CONFIG *cnn_config,
                                           const CNN_THREAD_DATA *thread_data,
@@ -1066,6 +1151,7 @@ void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
   float *inputs[CNN_MAX_CHANNELS];
   float *input_ =
       (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+  if (!input_) return false;
   const int in_stride = in_width;
 
   for (int c = 0; c < in_channels; ++c) {
@@ -1101,15 +1187,16 @@ void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
     }
   }
 
-  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
-                  cnn_config, thread_data, output);
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
 
   aom_free(input_);
+  return success;
 }
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
+bool av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                          const CNN_CONFIG *cnn_config,
                          const CNN_THREAD_DATA *thread_data, float **output,
                          int out_stride) {
@@ -1121,13 +1208,13 @@ void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
   CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                   .output_strides = output_strides,
                                   .output_buffer = output };
-  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
-                                thread_data, &output_struct);
+  return av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
+                                       thread_data, &output_struct);
 }
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
+bool av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                 int stride, const CNN_CONFIG *cnn_config,
                                 const CNN_THREAD_DATA *thread_data,
                                 int bit_depth, float **output, int out_stride) {
@@ -1139,6 +1226,7 @@ void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
   CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                   .output_strides = output_strides,
                                   .output_buffer = output };
-  av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
-                                       thread_data, bit_depth, &output_struct);
+  return av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride,
+                                              cnn_config, thread_data,
+                                              bit_depth, &output_struct);
 }
diff --git a/media/libaom/src/av1/encoder/cnn.h b/media/libaom/src/av1/encoder/cnn.h
index 706be44474..1a6c03a4c9 100644
--- a/media/libaom/src/av1/encoder/cnn.h
+++ b/media/libaom/src/av1/encoder/cnn.h
@@ -17,6 +17,7 @@ extern "C" {
 #endif
 
 #include <math.h>
+#include <stdbool.h>
 
 #include "aom_util/aom_thread.h"
 #include "config/av1_rtcd.h"
@@ -167,13 +168,18 @@ void av1_find_cnn_output_size(int in_width, int in_height,
                               const CNN_CONFIG *cnn_config, int *out_width,
                               int *out_height, int *out_channels);
 
+// Function to return output width and output height of given layer.
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+                                    const CNN_LAYER_CONFIG *layer_config,
+                                    int *out_width, int *out_height);
+
 // Prediction functions from set of input image buffers. This function supports
 // CNN with multiple outputs.
-void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                    int stride, const CNN_CONFIG *cnn_config,
                                    const CNN_THREAD_DATA *thread_data,
                                    struct CNN_MULTI_OUT *output);
-void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                           int stride,
                                           const CNN_CONFIG *cnn_config,
                                           const CNN_THREAD_DATA *thread_data,
@@ -181,11 +187,11 @@ void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
 
 // Prediction functions from set of input image buffers. This function only
 // supports a single output.
-void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
+bool av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                          const CNN_CONFIG *cnn_config,
                          const CNN_THREAD_DATA *thread_data, float **output,
                          int out_stride);
-void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
+bool av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                 int stride, const CNN_CONFIG *cnn_config,
                                 const CNN_THREAD_DATA *thread_data,
                                 int bit_depth, float **output, int out_stride);
diff --git a/media/libaom/src/av1/encoder/compound_type.c b/media/libaom/src/av1/encoder/compound_type.c
index 42095b79e3..4f762b93ed 100644
--- a/media/libaom/src/av1/encoder/compound_type.c
+++ b/media/libaom/src/av1/encoder/compound_type.c
@@ -11,6 +11,7 @@
 
 #include "av1/common/pred_common.h"
 #include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/rdopt_utils.h"
@@ -47,31 +48,31 @@ static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
     if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
   }
 
-  // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD
-  for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
-       comp_type++) {
-    comp_rate[comp_type] = st->rate[comp_type];
-    comp_dist[comp_type] = st->dist[comp_type];
-    comp_model_rate[comp_type] = st->model_rate[comp_type];
-    comp_model_dist[comp_type] = st->model_dist[comp_type];
-    comp_rs2[comp_type] = st->comp_rs2[comp_type];
-  }
-
-  // For compound wedge/segment, reuse data only if NEWMV is not present in
-  // either of the directions
+  int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 };
+  // For compound wedge, reuse data if newmv search is disabled when NEWMV is
+  // present or if NEWMV is not present in either of the directions
   if ((!have_newmv_in_inter_mode(mi->mode) &&
        !have_newmv_in_inter_mode(st->mode)) ||
-      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) {
-    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
-           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
-           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE],
-           sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE],
-           sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2);
-    memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE],
-           sizeof(comp_rs2[COMPOUND_WEDGE]) * 2);
+      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search))
+    reuse_data[COMPOUND_WEDGE] = 1;
+  // For compound diffwtd, reuse data if fast search is enabled (no newmv search
+  // when NEWMV is present) or if NEWMV is not present in either of the
+  // directions
+  if (cpi->sf.inter_sf.enable_fast_compound_mode_search ||
+      (!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)))
+    reuse_data[COMPOUND_DIFFWTD] = 1;
+
+  // Store the stats for the different compound types
+  for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES;
+       comp_type++) {
+    if (reuse_data[comp_type]) {
+      comp_rate[comp_type] = st->rate[comp_type];
+      comp_dist[comp_type] = st->dist[comp_type];
+      comp_model_rate[comp_type] = st->model_rate[comp_type];
+      comp_model_dist[comp_type] = st->model_dist[comp_type];
+      comp_rs2[comp_type] = st->comp_rs2[comp_type];
+    }
   }
   return 1;
 }
@@ -96,25 +97,25 @@ static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
   return 0;  // no match result found
 }
 
-static INLINE bool enable_wedge_search(MACROBLOCK *const x,
-                                       const AV1_COMP *const cpi) {
+static INLINE bool enable_wedge_search(
+    MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) {
   // Enable wedge search if source variance and edge strength are above
   // the thresholds.
-  return x->source_variance >
-             cpi->sf.inter_sf.disable_wedge_search_var_thresh &&
-         x->edge_strength > cpi->sf.inter_sf.disable_wedge_search_edge_thresh;
+  return x->source_variance > disable_wedge_var_thresh;
 }
 
 static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
                                                   const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge &&
-         !cpi->sf.inter_sf.disable_interinter_wedge;
+  return enable_wedge_search(
+             x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) &&
+         cpi->oxcf.comp_type_cfg.enable_interinter_wedge;
 }
 
 static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
                                                   const AV1_COMP *const cpi) {
-  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
-         !cpi->sf.inter_sf.disable_wedge_interintra_search;
+  return enable_wedge_search(
+             x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) &&
+         cpi->oxcf.comp_type_cfg.enable_interintra_wedge;
 }
 
 static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
@@ -165,14 +166,14 @@ static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
   // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
   // for all codebooks; experiment with other quadrant combinations for
   // 0, 90 and 135 degrees also.
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
-  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
-                          pred0 + bh_by2 * stride0 + bw_by2, stride0,
-                          &esq[0][1]);
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
-  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
-                          pred1 + bh_by2 * stride1 + bw_by2, stride0,
-                          &esq[1][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                               &esq[0][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride0,
+                               &esq[1][1]);
 
   tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
   br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
@@ -207,7 +208,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
 #if CONFIG_AV1_HIGHBITDEPTH
   if (hbd) {
     aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+                              CONVERT_TO_BYTEPTR(p0), bw);
   } else {
     aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
   }
@@ -241,7 +242,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
     // sse, rate, dist, rate2, dist2); dist = dist2;
     // rate = rate2;
 
-    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -253,7 +254,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   }
 
   return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 // Choose the best wedge index the specified sign
@@ -284,7 +286,7 @@ static int64_t pick_wedge_fixed_sign(
 
     model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
                                                   &rate, &dist);
-    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -294,7 +296,8 @@ static int64_t pick_wedge_fixed_sign(
     }
   }
   return best_rd -
-         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 static int64_t pick_interinter_wedge(
@@ -311,7 +314,7 @@ static int64_t pick_interinter_wedge(
   int8_t wedge_sign = 0;
 
   assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-  assert(cpi->common.seq_params.enable_masked_compound);
+  assert(cpi->common.seq_params->enable_masked_compound);
 
   if (cpi->sf.inter_sf.fast_wedge_sign_estimate) {
     wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
@@ -351,6 +354,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   // try each mask type and its inverse
   for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
     // build mask and inverse
+#if CONFIG_AV1_HIGHBITDEPTH
     if (hbd)
       av1_build_compound_diffwtd_mask_highbd(
           tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
@@ -358,6 +362,11 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
     else
       av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
                                       p0, bw, p1, bw, bh, bw);
+#else
+    (void)hbd;
+    av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0,
+                                    bw, p1, bw, bh, bw);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
     // compute rd for mask
     uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
@@ -389,7 +398,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
   const MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(av1_is_wedge_used(bsize));
-  assert(cpi->common.seq_params.enable_interintra_compound);
+  assert(cpi->common.seq_params->enable_interintra_compound);
 
   const struct buf_2d *const src = &x->plane[0].src;
   const int bw = block_size_wide[bsize];
@@ -399,9 +408,9 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+                              CONVERT_TO_BYTEPTR(p1), bw);
     aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+                              CONVERT_TO_BYTEPTR(p0), bw);
   } else {
     aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
     aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
@@ -434,9 +443,9 @@ static AOM_INLINE void get_inter_predictors_masked_compound(
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+                              CONVERT_TO_BYTEPTR(*preds1), bw);
     aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
-                              bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+                              bw, CONVERT_TO_BYTEPTR(*preds0), bw);
   } else {
     aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
                        bw);
@@ -480,20 +489,15 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
   MACROBLOCKD *const xd = &x->e_mbd;
   if (ref_best_rd < 0) return INT64_MAX;
   av1_subtract_plane(x, bs, 0);
-  x->rd_model = LOW_TXFM_RD;
-  const int skip_trellis = (cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
-                            NO_ESTIMATE_YRD_TRELLIS_OPT);
-  const int64_t rd =
-      av1_uniform_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
-                           max_txsize_rect_lookup[bs], FTXS_NONE, skip_trellis);
-  x->rd_model = FULL_TXFM_RD;
+  const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
+                                           max_txsize_rect_lookup[bs]);
   if (rd != INT64_MAX) {
-    const int skip_ctx = av1_get_skip_context(xd);
-    if (rd_stats->skip) {
-      const int s1 = x->skip_cost[skip_ctx][1];
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+    if (rd_stats->skip_txfm) {
+      const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1];
       rd_stats->rate = s1;
     } else {
-      const int s0 = x->skip_cost[skip_ctx][0];
+      const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0];
       rd_stats->rate += s0;
     }
   }
@@ -529,7 +533,7 @@ static AOM_INLINE int64_t compute_best_wedge_interintra(
     int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
     const int rate_overhead =
         interintra_mode_cost[mode] +
-        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+        x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
     const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
     if (total_rd < best_total_rd) {
       best_total_rd = total_rd;
@@ -541,28 +545,214 @@ static AOM_INLINE int64_t compute_best_wedge_interintra(
   return best_interintra_rd_wedge;
 }
 
+static int handle_smooth_inter_intra_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv,
+    INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd,
+    int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf,
+    uint8_t *intrapred, HandleInterModeArgs *args) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const interintra_mode_cost =
+      mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+
+  mbmi->use_wedge_interintra = 0;
+
+  if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+      *best_interintra_mode == INTERINTRA_MODES) {
+    int64_t best_interintra_rd = INT64_MAX;
+    for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+         ++cur_mode) {
+      if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+           cpi->sf.intra_sf.disable_smooth_intra) &&
+          cur_mode == II_SMOOTH_PRED)
+        continue;
+      compute_best_interintra_mode(
+          cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf,
+          best_interintra_mode, &best_interintra_rd, cur_mode, bsize);
+    }
+    args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+  }
+  assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra,
+                 *best_interintra_mode != II_SMOOTH_PRED));
+  // Recompute prediction if required
+  bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode ||
+                               *best_interintra_mode != INTERINTRA_MODES;
+  if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
+    mbmi->interintra_mode = *best_interintra_mode;
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                              intrapred, bw);
+    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  }
+
+  // Compute rd cost for best smooth_interintra
+  RD_STATS rd_stats;
+  const int is_wedge_used = av1_is_wedge_used(bsize);
+  const int rmode =
+      interintra_mode_cost[*best_interintra_mode] +
+      (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0);
+  const int total_mode_rate = rmode + *rate_mv;
+  const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+  int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+  if (rd != INT64_MAX) {
+    rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+  } else {
+    return IGNORE_MODE;
+  }
+  *best_rd = rd;
+  *best_mode_rate = rmode;
+  // Return early if best rd not good enough
+  if (ref_best_rd < INT64_MAX &&
+      (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE >
+          ref_best_rd) {
+    return IGNORE_MODE;
+  }
+  return 0;
+}
+
+static int handle_wedge_inter_intra_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode,
+    int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_,
+    uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred,
+    HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead,
+    int_mv *tmp_mv, int64_t best_rd_no_wedge) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const interintra_mode_cost =
+      mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  const int try_smooth_interintra =
+      cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+  mbmi->use_wedge_interintra = 1;
+
+  if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+    // Exhaustive search of all wedge and mode combinations.
+    int best_mode = 0;
+    int best_wedge_index = 0;
+    *best_rd = compute_best_wedge_interintra(
+        cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_,
+        &best_mode, &best_wedge_index, bsize);
+    mbmi->interintra_mode = best_mode;
+    mbmi->interintra_wedge_index = best_wedge_index;
+    if (best_mode != INTERINTRA_MODES - 1) {
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+    }
+  } else if (!try_smooth_interintra) {
+    if (*best_interintra_mode == INTERINTRA_MODES) {
+      mbmi->interintra_mode = INTERINTRA_MODES - 1;
+      *best_interintra_mode = INTERINTRA_MODES - 1;
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      // Pick wedge mask based on INTERINTRA_MODES - 1
+      *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+      // Find the best interintra mode for the chosen wedge mask
+      for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+           ++cur_mode) {
+        compute_best_interintra_mode(
+            cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+            tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize);
+      }
+      args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+      mbmi->interintra_mode = *best_interintra_mode;
+
+      // Recompute prediction if required
+      if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+      }
+    } else {
+      // Pick wedge mask for the best interintra mode (reused)
+      mbmi->interintra_mode = *best_interintra_mode;
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+    }
+  } else {
+    // Pick wedge mask for the best interintra mode from smooth_interintra
+    *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+  }
+
+  *rate_overhead =
+      interintra_mode_cost[mbmi->interintra_mode] +
+      mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+      mode_costs->wedge_interintra_cost[bsize][1];
+  *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0);
+
+  int64_t rd = INT64_MAX;
+  const int_mv mv0 = mbmi->mv[0];
+  // Refine motion vector for NEWMV case.
+  if (have_newmv_in_inter_mode(mbmi->mode)) {
+    int rate_sum, skip_txfm_sb;
+    int64_t dist_sum, skip_sse_sb;
+    // get negative of mask
+    const uint8_t *mask =
+        av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+    av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred,
+                                      mask, bw, tmp_rate_mv, 0);
+    if (mbmi->mv[0].as_int != tmp_mv->as_int) {
+      mbmi->mv[0].as_int = tmp_mv->as_int;
+      // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+      // predictor is not calculated again in av1_enc_build_inter_predictor().
+      mbmi->ref_frame[1] = NONE_FRAME;
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+                             xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw);
+      model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+          cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+          &skip_sse_sb, NULL, NULL, NULL);
+      rd =
+          RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum);
+    }
+  }
+  if (rd >= *best_rd) {
+    tmp_mv->as_int = mv0.as_int;
+    *tmp_rate_mv = *rate_mv;
+    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  }
+  // Evaluate closer to true rd
+  RD_STATS rd_stats;
+  const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0);
+  const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd;
+  rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+  if (rd != INT64_MAX) {
+    rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate,
+                rd_stats.dist);
+  } else {
+    if (*best_rd == INT64_MAX) return IGNORE_MODE;
+  }
+  *best_rd = rd;
+  return 0;
+}
+
 int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                 BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
                                 HandleInterModeArgs *args, int64_t ref_best_rd,
                                 int *rate_mv, int *tmp_rate2,
                                 const BUFFER_SET *orig_dst) {
-  const int try_smooth_interintra = cpi->oxcf.enable_smooth_interintra &&
-                                    !cpi->sf.inter_sf.disable_smooth_interintra;
+  const int try_smooth_interintra =
+      cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
   const int is_wedge_used = av1_is_wedge_used(bsize);
   const int try_wedge_interintra =
       is_wedge_used && enable_wedge_interintra_search(x, cpi);
-  if (!try_smooth_interintra && !try_wedge_interintra) return -1;
 
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  int64_t rd = INT64_MAX;
   const int bw = block_size_wide[bsize];
   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
   uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
   uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
-  const int *const interintra_mode_cost =
-      x->interintra_mode_cost[size_group_lookup[bsize]];
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
@@ -584,189 +774,50 @@ int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
   int64_t best_interintra_rd_nowedge = INT64_MAX;
   int best_mode_rate = INT_MAX;
   if (try_smooth_interintra) {
-    mbmi->use_wedge_interintra = 0;
-    int interintra_mode_reuse = 1;
-    if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
-        best_interintra_mode == INTERINTRA_MODES) {
-      interintra_mode_reuse = 0;
-      int64_t best_interintra_rd = INT64_MAX;
-      for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
-           ++cur_mode) {
-        if ((!cpi->oxcf.enable_smooth_intra ||
-             cpi->sf.intra_sf.disable_smooth_intra) &&
-            cur_mode == II_SMOOTH_PRED)
-          continue;
-        compute_best_interintra_mode(cpi, mbmi, xd, x, interintra_mode_cost,
-                                     orig_dst, intrapred, tmp_buf,
-                                     &best_interintra_mode, &best_interintra_rd,
-                                     cur_mode, bsize);
-      }
-      args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-    }
-    assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
-                       cpi->sf.inter_sf.disable_smooth_interintra,
-                   best_interintra_mode != II_SMOOTH_PRED));
-    // Recompute prediction if required
-    if (interintra_mode_reuse || best_interintra_mode != INTERINTRA_MODES - 1) {
-      mbmi->interintra_mode = best_interintra_mode;
-      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                intrapred, bw);
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    }
-
-    // Compute rd cost for best smooth_interintra
-    RD_STATS rd_stats;
-    const int rmode = interintra_mode_cost[best_interintra_mode] +
-                      (is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0);
-    const int total_mode_rate = rmode + *rate_mv;
-    const int64_t rd_thresh =
-        compute_rd_thresh(x, total_mode_rate, ref_best_rd);
-    rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
-    } else {
-      return -1;
-    }
-    best_interintra_rd_nowedge = rd;
-    best_mode_rate = rmode;
-    // Return early if best_interintra_rd_nowedge not good enough
-    if (ref_best_rd < INT64_MAX &&
-        (best_interintra_rd_nowedge >> INTER_INTRA_RD_THRESH_SHIFT) *
-                INTER_INTRA_RD_THRESH_SCALE >
-            ref_best_rd) {
-      return -1;
+    int ret = handle_smooth_inter_intra_mode(
+        cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode,
+        &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf,
+        intrapred, args);
+    if (ret == IGNORE_MODE) {
+      return IGNORE_MODE;
     }
   }
 
   // Compute wedge interintra
   int64_t best_interintra_rd_wedge = INT64_MAX;
+  const int_mv mv0 = mbmi->mv[0];
+  int_mv tmp_mv = mv0;
+  int tmp_rate_mv = 0;
+  int rate_overhead = 0;
   if (try_wedge_interintra) {
-    mbmi->use_wedge_interintra = 1;
-    if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
-      // Exhaustive search of all wedge and mode combinations.
-      int best_mode = 0;
-      int best_wedge_index = 0;
-      best_interintra_rd_wedge = compute_best_wedge_interintra(
-          cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_,
-          tmp_buf_, &best_mode, &best_wedge_index, bsize);
-      mbmi->interintra_mode = best_mode;
-      mbmi->interintra_wedge_index = best_wedge_index;
-      if (best_mode != INTERINTRA_MODES - 1) {
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-      }
-    } else if (!try_smooth_interintra) {
-      if (best_interintra_mode == INTERINTRA_MODES) {
-        mbmi->interintra_mode = INTERINTRA_MODES - 1;
-        best_interintra_mode = INTERINTRA_MODES - 1;
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        // Pick wedge mask based on INTERINTRA_MODES - 1
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-        // Find the best interintra mode for the chosen wedge mask
-        for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
-             ++cur_mode) {
-          compute_best_interintra_mode(
-              cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
-              tmp_buf, &best_interintra_mode, &best_interintra_rd_wedge,
-              cur_mode, bsize);
-        }
-        args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
-        mbmi->interintra_mode = best_interintra_mode;
-
-        // Recompute prediction if required
-        if (best_interintra_mode != INTERINTRA_MODES - 1) {
-          av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                    intrapred, bw);
-        }
-      } else {
-        // Pick wedge mask for the best interintra mode (reused)
-        mbmi->interintra_mode = best_interintra_mode;
-        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
-                                                  intrapred, bw);
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-      }
-    } else {
-      // Pick wedge mask for the best interintra mode from smooth_interintra
-      best_interintra_rd_wedge =
-          pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-    }
-
-    const int rate_overhead =
-        interintra_mode_cost[mbmi->interintra_mode] +
-        x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
-        x->wedge_interintra_cost[bsize][1];
-    best_interintra_rd_wedge += RDCOST(x->rdmult, rate_overhead + *rate_mv, 0);
-
-    const int_mv mv0 = mbmi->mv[0];
-    int_mv tmp_mv = mv0;
-    rd = INT64_MAX;
-    int tmp_rate_mv = 0;
-    // Refine motion vector for NEWMV case.
-    if (have_newmv_in_inter_mode(mbmi->mode)) {
-      int rate_sum, skip_txfm_sb;
-      int64_t dist_sum, skip_sse_sb;
-      // get negative of mask
-      const uint8_t *mask =
-          av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
-      av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, intrapred,
-                                        mask, bw, &tmp_rate_mv, 0);
-      if (mbmi->mv[0].as_int != tmp_mv.as_int) {
-        mbmi->mv[0].as_int = tmp_mv.as_int;
-        // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
-        // predictor is not calculated again in av1_enc_build_inter_predictor().
-        mbmi->ref_frame[1] = NONE_FRAME;
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      AOM_PLANE_Y, AOM_PLANE_Y);
-        mbmi->ref_frame[1] = INTRA_FRAME;
-        av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
-                               xd->plane[AOM_PLANE_Y].dst.stride, intrapred,
-                               bw);
-        model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
-            cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
-            &skip_sse_sb, NULL, NULL, NULL);
-        rd =
-            RDCOST(x->rdmult, tmp_rate_mv + rate_overhead + rate_sum, dist_sum);
-      }
-    }
-    if (rd >= best_interintra_rd_wedge) {
-      tmp_mv.as_int = mv0.as_int;
-      tmp_rate_mv = *rate_mv;
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    }
-    // Evaluate closer to true rd
-    RD_STATS rd_stats;
-    const int64_t mode_rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv, 0);
-    const int64_t tmp_rd_thresh = best_interintra_rd_nowedge - mode_rd;
-    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
-    if (rd != INT64_MAX) {
-      rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv + rd_stats.rate,
-                  rd_stats.dist);
-    } else {
-      if (best_interintra_rd_nowedge == INT64_MAX) return -1;
-    }
-    best_interintra_rd_wedge = rd;
-    if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
-      mbmi->mv[0].as_int = tmp_mv.as_int;
-      *tmp_rate2 += tmp_rate_mv - *rate_mv;
-      *rate_mv = tmp_rate_mv;
-      best_mode_rate = rate_overhead;
-    } else {
-      mbmi->use_wedge_interintra = 0;
-      mbmi->interintra_mode = best_interintra_mode;
-      mbmi->mv[0].as_int = mv0.as_int;
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    int ret = handle_wedge_inter_intra_mode(
+        cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode,
+        &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_,
+        intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv,
+        best_interintra_rd_nowedge);
+    if (ret == IGNORE_MODE) {
+      return IGNORE_MODE;
     }
   }
 
   if (best_interintra_rd_nowedge == INT64_MAX &&
       best_interintra_rd_wedge == INT64_MAX) {
-    return -1;
+    return IGNORE_MODE;
+  }
+  if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+    mbmi->mv[0].as_int = tmp_mv.as_int;
+    *tmp_rate2 += tmp_rate_mv - *rate_mv;
+    *rate_mv = tmp_rate_mv;
+    best_mode_rate = rate_overhead;
+  } else if (try_smooth_interintra && try_wedge_interintra) {
+    // If smooth was best, but we over-wrote the values when evaluating the
+    // wedge mode, we need to recompute the smooth values.
+    mbmi->use_wedge_interintra = 0;
+    mbmi->interintra_mode = best_interintra_mode;
+    mbmi->mv[0].as_int = mv0.as_int;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
   }
-
   *tmp_rate2 += best_mode_rate;
 
   if (num_planes > 1) {
@@ -776,25 +827,13 @@ int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
   return 0;
 }
 
-static void alloc_compound_type_rd_buffers_no_check(
-    CompoundTypeRdBuffers *const bufs) {
-  bufs->pred0 =
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0));
-  bufs->pred1 =
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1));
-  bufs->residual1 =
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1));
-  bufs->diff10 =
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10));
-  bufs->tmp_best_mask_buf = (uint8_t *)aom_malloc(
-      2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf));
-}
-
 // Computes the valid compound_types to be evaluated
-static INLINE int compute_valid_comp_types(
-    MACROBLOCK *x, const AV1_COMP *const cpi, int *try_average_and_distwtd_comp,
-    BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask,
-    COMPOUND_TYPE *valid_comp_types) {
+static INLINE int compute_valid_comp_types(MACROBLOCK *x,
+                                           const AV1_COMP *const cpi,
+                                           BLOCK_SIZE bsize,
+                                           int masked_compound_used,
+                                           int mode_search_mask,
+                                           COMPOUND_TYPE *valid_comp_types) {
   const AV1_COMMON *cm = &cpi->common;
   int valid_type_count = 0;
   int comp_type, valid_check;
@@ -803,17 +842,15 @@ static INLINE int compute_valid_comp_types(
   const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
   const int try_distwtd_comp =
       ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
-       cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
+       cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 &&
        cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
-  *try_average_and_distwtd_comp = try_average_comp && try_distwtd_comp;
 
   // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
   for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
        comp_type++) {
     valid_check =
         (comp_type == COMPOUND_AVERAGE) ? try_average_comp : try_distwtd_comp;
-    if (!*try_average_and_distwtd_comp && valid_check &&
-        is_interinter_compound_used(comp_type, bsize))
+    if (valid_check && is_interinter_compound_used(comp_type, bsize))
       valid_comp_types[valid_type_count++] = comp_type;
   }
   // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases
@@ -821,7 +858,7 @@ static INLINE int compute_valid_comp_types(
     // enable_masked_type[0] corresponds to COMPOUND_WEDGE
     // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD
     enable_masked_type[0] = enable_wedge_interinter_search(x, cpi);
-    enable_masked_type[1] = cpi->oxcf.enable_diff_wtd_comp;
+    enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp;
     for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD;
          comp_type++) {
       if ((mode_search_mask & (1 << comp_type)) &&
@@ -834,11 +871,9 @@ static INLINE int compute_valid_comp_types(
 }
 
 // Calculates the cost for compound type mask
-static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize,
-                                         int comp_group_idx_ctx,
-                                         int comp_index_ctx,
-                                         int masked_compound_used,
-                                         int *masked_type_cost) {
+static INLINE void calc_masked_type_cost(
+    const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx,
+    int comp_index_ctx, int masked_compound_used, int *masked_type_cost) {
   av1_zero_array(masked_type_cost, COMPOUND_TYPES);
   // Account for group index cost when wedge and/or diffwtd prediction are
   // enabled
@@ -846,18 +881,21 @@ static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize,
     // Compound group index of average and distwtd is 0
     // Compound group index of wedge and diffwtd is 1
     masked_type_cost[COMPOUND_AVERAGE] +=
-        x->comp_group_idx_cost[comp_group_idx_ctx][0];
+        mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0];
     masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE];
     masked_type_cost[COMPOUND_WEDGE] +=
-        x->comp_group_idx_cost[comp_group_idx_ctx][1];
+        mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1];
     masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE];
   }
 
   // Compute the cost to signal compound index/type
-  masked_type_cost[COMPOUND_AVERAGE] += x->comp_idx_cost[comp_index_ctx][1];
-  masked_type_cost[COMPOUND_DISTWTD] += x->comp_idx_cost[comp_index_ctx][0];
-  masked_type_cost[COMPOUND_WEDGE] += x->compound_type_cost[bsize][0];
-  masked_type_cost[COMPOUND_DIFFWTD] += x->compound_type_cost[bsize][1];
+  masked_type_cost[COMPOUND_AVERAGE] +=
+      mode_costs->comp_idx_cost[comp_index_ctx][1];
+  masked_type_cost[COMPOUND_DISTWTD] +=
+      mode_costs->comp_idx_cost[comp_index_ctx][0];
+  masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0];
+  masked_type_cost[COMPOUND_DIFFWTD] +=
+      mode_costs->compound_type_cost[bsize][1];
 }
 
 // Updates mbmi structure with the relevant compound type info
@@ -904,39 +942,11 @@ static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd,
 
 // Updates best_mv for masked compound types
 static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
-                                       int_mv *best_mv, int_mv *cur_mv,
-                                       const COMPOUND_TYPE cur_type,
-                                       int *best_tmp_rate_mv, int tmp_rate_mv,
-                                       const SPEED_FEATURES *const sf) {
-  if (cur_type == COMPOUND_WEDGE ||
-      (sf->inter_sf.enable_interinter_diffwtd_newmv_search &&
-       cur_type == COMPOUND_DIFFWTD)) {
-    *best_tmp_rate_mv = tmp_rate_mv;
-    best_mv[0].as_int = mbmi->mv[0].as_int;
-    best_mv[1].as_int = mbmi->mv[1].as_int;
-  } else {
-    best_mv[0].as_int = cur_mv[0].as_int;
-    best_mv[1].as_int = cur_mv[1].as_int;
-  }
-}
-
-// Choose the better of the two COMPOUND_AVERAGE,
-// COMPOUND_DISTWTD based on modeled cost
-static int find_best_avg_distwtd_comp_type(MACROBLOCK *x, int *comp_model_rate,
-                                           int64_t *comp_model_dist,
-                                           int rate_mv, int64_t *best_rd) {
-  int64_t est_rd[2];
-  est_rd[COMPOUND_AVERAGE] =
-      RDCOST(x->rdmult, comp_model_rate[COMPOUND_AVERAGE] + rate_mv,
-             comp_model_dist[COMPOUND_AVERAGE]);
-  est_rd[COMPOUND_DISTWTD] =
-      RDCOST(x->rdmult, comp_model_rate[COMPOUND_DISTWTD] + rate_mv,
-             comp_model_dist[COMPOUND_DISTWTD]);
-  int best_type = (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD])
-                      ? COMPOUND_AVERAGE
-                      : COMPOUND_DISTWTD;
-  *best_rd = est_rd[best_type];
-  return best_type;
+                                       int_mv *best_mv, int *best_tmp_rate_mv,
+                                       int tmp_rate_mv) {
+  *best_tmp_rate_mv = tmp_rate_mv;
+  best_mv[0].as_int = mbmi->mv[0].as_int;
+  best_mv[1].as_int = mbmi->mv[1].as_int;
 }
 
 static INLINE void save_comp_rd_search_stat(
@@ -969,14 +979,15 @@ static INLINE void save_comp_rd_search_stat(
 }
 
 static INLINE int get_interinter_compound_mask_rate(
-    const MACROBLOCK *const x, const MB_MODE_INFO *const mbmi) {
+    const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) {
   const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
   // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
   if (compound_type == COMPOUND_WEDGE) {
-    return av1_is_wedge_used(mbmi->sb_type)
+    return av1_is_wedge_used(mbmi->bsize)
                ? av1_cost_literal(1) +
-                     x->wedge_idx_cost[mbmi->sb_type]
-                                      [mbmi->interinter_comp.wedge_index]
+                     mode_costs
+                         ->wedge_idx_cost[mbmi->bsize]
+                                         [mbmi->interinter_comp.wedge_index]
                : 0;
   } else {
     assert(compound_type == COMPOUND_DIFFWTD);
@@ -997,6 +1008,29 @@ static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate,
   comp_rs2[cur_type] = rs2;
 }
 
+static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode,
+                                           const int reuse_level) {
+  if (reuse_level || (this_mode == NEW_NEWMV))
+    return 1;
+  else
+    return 0;
+}
+
+static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
+                                        MACROBLOCK *x, MACROBLOCKD *xd,
+                                        const BLOCK_SIZE bsize,
+                                        int64_t ref_skip_rd, int mode_rate) {
+  int eval_txfm = 1;
+  // Check if the mode is good enough based on skip rd
+  if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+    int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+    int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4));
+    eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
+                                cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+  }
+  return eval_txfm;
+}
+
 static int64_t masked_compound_type_rd(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
     const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
@@ -1027,14 +1061,15 @@ static int64_t masked_compound_type_rd(
                                          diff10, strides);
     *calc_pred_masked_compound = 0;
   }
-  if (cpi->sf.inter_sf.prune_wedge_pred_diff_based &&
-      compound_type == COMPOUND_WEDGE) {
+  if (compound_type == COMPOUND_WEDGE) {
     unsigned int sse;
     if (is_cur_buf_hbd(xd))
-      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
-                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
+      (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                       CONVERT_TO_BYTEPTR(*preds1), *strides,
+                                       &sse);
     else
-      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
+      (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+                                       &sse);
     const unsigned int mse =
         ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
     // If two predictors are very similar, skip wedge compound mode search
@@ -1049,7 +1084,7 @@ static int64_t masked_compound_type_rd(
   uint64_t cur_sse = UINT64_MAX;
   best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
       cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
-  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
   best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
   assert(cur_sse != UINT64_MAX);
   int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
@@ -1082,10 +1117,6 @@ static int64_t masked_compound_type_rd(
         have_newmv_in_inter_mode(this_mode) &&
         (compound_type == COMPOUND_WEDGE) &&
         (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
-    int diffwtd_newmv_search =
-        cpi->sf.inter_sf.enable_interinter_diffwtd_newmv_search &&
-        compound_type == COMPOUND_DIFFWTD &&
-        have_newmv_in_inter_mode(this_mode);
 
     // Search for new MV if needed and build predictor
     if (wedge_newmv_search) {
@@ -1095,40 +1126,6 @@ static int64_t masked_compound_type_rd(
       const int mi_col = xd->mi_col;
       av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
                                     AOM_PLANE_Y, AOM_PLANE_Y);
-    } else if (diffwtd_newmv_search) {
-      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
-                                                           bsize, this_mode);
-      // we need to update the mask according to the new motion vector
-      CompoundTypeRdBuffers tmp_buf;
-      int64_t tmp_rd = INT64_MAX;
-      alloc_compound_type_rd_buffers_no_check(&tmp_buf);
-
-      uint8_t *tmp_preds0[1] = { tmp_buf.pred0 };
-      uint8_t *tmp_preds1[1] = { tmp_buf.pred1 };
-
-      get_inter_predictors_masked_compound(x, bsize, tmp_preds0, tmp_preds1,
-                                           tmp_buf.residual1, tmp_buf.diff10,
-                                           strides);
-
-      tmp_rd = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
-          cpi, x, bsize, *tmp_preds0, *tmp_preds1, tmp_buf.residual1,
-          tmp_buf.diff10, &cur_sse);
-      // we can reuse rs2 here
-      tmp_rd += RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
-
-      if (tmp_rd >= best_rd_cur) {
-        // restore the motion vector
-        mbmi->mv[0].as_int = cur_mv[0].as_int;
-        mbmi->mv[1].as_int = cur_mv[1].as_int;
-        *out_rate_mv = rate_mv;
-        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
-                                                 strides, preds1, strides);
-      } else {
-        // build the final prediciton using the updated mv
-        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, tmp_preds0,
-                                                 strides, tmp_preds1, strides);
-      }
-      av1_release_compound_type_rd_buffers(&tmp_buf);
     } else {
       *out_rate_mv = rate_mv;
       av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
@@ -1175,7 +1172,8 @@ static int64_t masked_compound_type_rd(
     assert(comp_dist[compound_type] != INT64_MAX);
     // When disable_interinter_wedge_newmv_search is set, motion refinement is
     // disabled. Hence rate and distortion can be reused in this case as well
-    assert(IMPLIES(have_newmv_in_inter_mode(this_mode),
+    assert(IMPLIES((have_newmv_in_inter_mode(this_mode) &&
+                    (compound_type == COMPOUND_WEDGE)),
                    cpi->sf.inter_sf.disable_interinter_wedge_newmv_search));
     assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
     assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
@@ -1197,7 +1195,8 @@ static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
 static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
@@ -1208,6 +1207,7 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const PREDICTION_MODE this_mode = mbmi->mode;
+  int ref_frame = av1_ref_frame_type(mbmi->ref_frame);
   const int bw = block_size_wide[bsize];
   int rs2;
   int_mv best_mv[2];
@@ -1222,8 +1222,6 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   uint8_t *preds1[1] = { buffers->pred1 };
   int strides[1] = { bw };
   int tmp_rate_mv;
-  const int num_pix = 1 << num_pels_log2_lookup[bsize];
-  const int mask_len = 2 * num_pix * sizeof(uint8_t);
   COMPOUND_TYPE cur_type;
   // Local array to store the mask cost for different compound types
   int masked_type_cost[COMPOUND_TYPES];
@@ -1244,8 +1242,6 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   best_mv[0].as_int = cur_mv[0].as_int;
   best_mv[1].as_int = cur_mv[1].as_int;
   *rd = INT64_MAX;
-  int rate_sum, tmp_skip_txfm_sb;
-  int64_t dist_sum, tmp_skip_sse_sb;
 
   // Local array to store the valid compound types to be evaluated in the core
   // loop
@@ -1253,24 +1249,22 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
     COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
   };
   int valid_type_count = 0;
-  int try_average_and_distwtd_comp = 0;
   // compute_valid_comp_types() returns the number of valid compound types to be
   // evaluated and populates the same in the local array valid_comp_types[].
   // It also sets the flag 'try_average_and_distwtd_comp'
   valid_type_count = compute_valid_comp_types(
-      x, cpi, &try_average_and_distwtd_comp, bsize, masked_compound_used,
-      mode_search_mask, valid_comp_types);
+      x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
 
   // The following context indices are independent of compound type
   const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
   const int comp_index_ctx = get_comp_index_context(cm, xd);
 
   // Populates masked_type_cost local array for the 4 compound types
-  calc_masked_type_cost(x, bsize, comp_group_idx_ctx, comp_index_ctx,
-                        masked_compound_used, masked_type_cost);
+  calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+                        comp_index_ctx, masked_compound_used, masked_type_cost);
 
   int64_t comp_model_rd_cur = INT64_MAX;
-  int64_t best_rd_cur = INT64_MAX;
+  int64_t best_rd_cur = ref_best_rd;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
@@ -1281,98 +1275,6 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                          comp_rate, comp_dist, comp_rs2,
                                          rate_mv, rd, match_index);
   }
-  // Special handling if both compound_average and compound_distwtd
-  // are to be searched. In this case, first estimate between the two
-  // modes and then call estimate_yrd_for_sb() only for the better of
-  // the two.
-  if (try_average_and_distwtd_comp) {
-    int est_rate[2];
-    int64_t est_dist[2], est_rd;
-    COMPOUND_TYPE best_type;
-    // Since modelled rate and dist are separately stored,
-    // compute better of COMPOUND_AVERAGE and COMPOUND_DISTWTD
-    // using the stored stats.
-    if ((comp_model_rate[COMPOUND_AVERAGE] != INT_MAX) &&
-        comp_model_rate[COMPOUND_DISTWTD] != INT_MAX) {
-      // Choose the better of the COMPOUND_AVERAGE,
-      // COMPOUND_DISTWTD on modeled cost.
-      best_type = find_best_avg_distwtd_comp_type(
-          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
-      update_mbmi_for_compound_type(mbmi, best_type);
-      if (comp_rate[best_type] != INT_MAX)
-        best_rd_cur = RDCOST(
-            x->rdmult,
-            masked_type_cost[best_type] + *rate_mv + comp_rate[best_type],
-            comp_dist[best_type]);
-      comp_model_rd_cur = est_rd;
-      // Update stats for best compound type
-      if (best_rd_cur < *rd) {
-        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
-                         comp_model_rd_cur, masked_type_cost[best_type]);
-      }
-      restore_dst_buf(xd, *tmp_dst, 1);
-    } else {
-      int64_t sse_y[COMPOUND_DISTWTD + 1];
-      // Calculate model_rd for COMPOUND_AVERAGE and COMPOUND_DISTWTD
-      for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
-           comp_type++) {
-        update_mbmi_for_compound_type(mbmi, comp_type);
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
-                                      AOM_PLANE_Y, AOM_PLANE_Y);
-        model_rd_sb_fn[MODELRD_CURVFIT](
-            cpi, bsize, x, xd, 0, 0, &est_rate[comp_type], &est_dist[comp_type],
-            NULL, NULL, NULL, NULL, NULL);
-        est_rate[comp_type] += masked_type_cost[comp_type];
-        comp_model_rate[comp_type] = est_rate[comp_type];
-        comp_model_dist[comp_type] = est_dist[comp_type];
-        sse_y[comp_type] = x->pred_sse[xd->mi[0]->ref_frame[0]];
-        if (comp_type == COMPOUND_AVERAGE) {
-          *is_luma_interp_done = 1;
-          restore_dst_buf(xd, *tmp_dst, 1);
-        }
-      }
-      // Choose the better of the two based on modeled cost and call
-      // estimate_yrd_for_sb() for that one.
-      best_type = find_best_avg_distwtd_comp_type(
-          x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd);
-      update_mbmi_for_compound_type(mbmi, best_type);
-      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *orig_dst, 1);
-      rs2 = masked_type_cost[best_type];
-      RD_STATS est_rd_stats;
-      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + *rate_mv, 0);
-      const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
-      int64_t est_rd_ = INT64_MAX;
-      int eval_txfm = 1;
-      // Check if the mode is good enough based on skip rd
-      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-        int64_t skip_rd =
-            RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4));
-        eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
-                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
-      }
-      // Evaluate further if skip rd is low enough
-      if (eval_txfm) {
-        est_rd_ =
-            estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats);
-      }
-
-      if (est_rd_ != INT64_MAX) {
-        best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
-                             est_rd_stats.dist);
-        // Backup rate and distortion for future reuse
-        backup_stats(best_type, comp_rate, comp_dist, comp_model_rate,
-                     comp_model_dist, est_rate[best_type], est_dist[best_type],
-                     &est_rd_stats, comp_rs2, rs2);
-        comp_model_rd_cur = est_rd;
-      }
-      if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
-      // Update stats for best compound type
-      if (best_rd_cur < *rd) {
-        update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
-                         comp_model_rd_cur, rs2);
-      }
-    }
-  }
 
   // If COMPOUND_AVERAGE is not valid, use the spare buffer
   if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
@@ -1380,40 +1282,43 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   // Loop over valid compound types
   for (int i = 0; i < valid_type_count; i++) {
     cur_type = valid_comp_types[i];
+
+    if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+      if (cur_type == COMPOUND_WEDGE) continue;
+    }
+
     comp_model_rd_cur = INT64_MAX;
     tmp_rate_mv = *rate_mv;
     best_rd_cur = INT64_MAX;
+    ref_best_rd = AOMMIN(ref_best_rd, *rd);
+    update_mbmi_for_compound_type(mbmi, cur_type);
+    rs2 = masked_type_cost[cur_type];
+
+    int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+    if (mode_rd >= ref_best_rd) continue;
 
     // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
     if (cur_type < COMPOUND_WEDGE) {
-      update_mbmi_for_compound_type(mbmi, cur_type);
-      rs2 = masked_type_cost[cur_type];
-      const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
-      if (mode_rd < ref_best_rd) {
+      if (cpi->sf.inter_sf.enable_fast_compound_mode_search == 2) {
+        int rate_sum, tmp_skip_txfm_sb;
+        int64_t dist_sum, tmp_skip_sse_sb;
+
         // Reuse data if matching record is found
         if (comp_rate[cur_type] == INT_MAX) {
           av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
                                         AOM_PLANE_Y, AOM_PLANE_Y);
           if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
-
           // Compute RD cost for the current type
           RD_STATS est_rd_stats;
           const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
           int64_t est_rd = INT64_MAX;
-          int eval_txfm = 1;
-          // Check if the mode is good enough based on skip rd
-          if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-            int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
-            int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4));
-            eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
-                                        cpi->sf.inter_sf.txfm_rd_gate_level, 1);
-          }
+          int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                                rs2 + *rate_mv);
           // Evaluate further if skip rd is low enough
           if (eval_txfm) {
             est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
                                          &est_rd_stats);
           }
-
           if (est_rd != INT64_MAX) {
             best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
                                  est_rd_stats.dist);
@@ -1422,7 +1327,6 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
                 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
             comp_model_rd_cur =
                 RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
-
             // Backup rate and distortion for future reuse
             backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
                          comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
@@ -1438,13 +1342,257 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
               RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
                      comp_model_dist[cur_type]);
         }
+      } else {
+        tmp_rate_mv = *rate_mv;
+        if (have_newmv_in_inter_mode(this_mode)) {
+          InterPredParams inter_pred_params;
+          av1_dist_wtd_comp_weight_assign(
+              &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset,
+              &inter_pred_params.conv_params.bck_offset,
+              &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
+          int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
+          memset(xd->seg_mask, mask_value,
+                 sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+        int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                              rs2 + *rate_mv);
+        if (eval_txfm) {
+          RD_STATS est_rd_stats;
+          estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+
+          best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+        }
       }
+
       // use spare buffer for following compound type try
       if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+    } else if (cur_type == COMPOUND_WEDGE) {
+      int best_mask_index = 0;
+      int best_wedge_sign = 0;
+      int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
+      int best_rs2 = 0;
+      int best_rate_mv = *rate_mv;
+      int wedge_mask_size = get_wedge_types_lookup(bsize);
+      int need_mask_search = args->wedge_index == -1;
+      int wedge_newmv_search =
+          have_newmv_in_inter_mode(this_mode) &&
+          !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search;
+
+      if (need_mask_search && !wedge_newmv_search) {
+        // short cut repeated single reference block build
+        av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0,
+                                                         preds0, strides);
+        av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1,
+                                                         preds1, strides);
+      }
+
+      for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+           ++wedge_mask) {
+        for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+          tmp_rate_mv = *rate_mv;
+          mbmi->interinter_comp.wedge_index = wedge_mask;
+          mbmi->interinter_comp.wedge_sign = wedge_sign;
+          rs2 = masked_type_cost[cur_type];
+          rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+          mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+          if (mode_rd >= ref_best_rd / 2) continue;
+
+          if (wedge_newmv_search) {
+            tmp_rate_mv = av1_interinter_compound_motion_search(
+                cpi, x, cur_mv, bsize, this_mode);
+            av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+                                          bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+          } else {
+            av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                     strides, preds1, strides);
+          }
+
+          RD_STATS est_rd_stats;
+          int64_t this_rd_cur = INT64_MAX;
+          int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                                rs2 + *rate_mv);
+          if (eval_txfm) {
+            this_rd_cur = estimate_yrd_for_sb(
+                cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats);
+          }
+          if (this_rd_cur < INT64_MAX) {
+            this_rd_cur =
+                RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                       est_rd_stats.dist);
+          }
+          if (this_rd_cur < best_rd_cur) {
+            best_mask_index = wedge_mask;
+            best_wedge_sign = wedge_sign;
+            best_rd_cur = this_rd_cur;
+            tmp_mv[0] = mbmi->mv[0];
+            tmp_mv[1] = mbmi->mv[1];
+            best_rate_mv = tmp_rate_mv;
+            best_rs2 = rs2;
+          }
+        }
+        // Consider the asymmetric partitions for oblique angle only if the
+        // corresponding symmetric partition is the best so far.
+        // Note: For horizontal and vertical types, both symmetric and
+        // asymmetric partitions are always considered.
+        if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) {
+          // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16]
+          // correspond to symmetric partitions of the 4 oblique angles, the
+          // next 4 entries correspond to the vertical/horizontal
+          // symmetric/asymmetric partitions and the last 8 entries correspond
+          // to the asymmetric partitions of oblique types.
+          const int idx_before_asym_oblique = 7;
+          const int last_oblique_sym_idx = 3;
+          if (wedge_mask == idx_before_asym_oblique) {
+            if (best_mask_index > last_oblique_sym_idx) {
+              break;
+            } else {
+              // Asymmetric (Index-1) map for the corresponding oblique masks.
+              // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9
+              // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13
+              // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15
+              // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11
+              const int asym_mask_idx[4] = { 7, 11, 13, 9 };
+              wedge_mask = asym_mask_idx[best_mask_index];
+              wedge_mask_size = wedge_mask + 3;
+            }
+          }
+        }
+      }
+
+      if (need_mask_search) {
+        if (save_mask_search_results(
+                this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) {
+          args->wedge_index = best_mask_index;
+          args->wedge_sign = best_wedge_sign;
+        }
+      } else {
+        mbmi->interinter_comp.wedge_index = args->wedge_index;
+        mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+        rs2 = masked_type_cost[cur_type];
+        rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+        if (wedge_newmv_search) {
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+
+        best_mask_index = args->wedge_index;
+        best_wedge_sign = args->wedge_sign;
+        tmp_mv[0] = mbmi->mv[0];
+        tmp_mv[1] = mbmi->mv[1];
+        best_rate_mv = tmp_rate_mv;
+        best_rs2 = masked_type_cost[cur_type];
+        best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                              best_rs2 + *rate_mv);
+        if (eval_txfm) {
+          RD_STATS est_rd_stats;
+          estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+          best_rd_cur =
+              RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate,
+                     est_rd_stats.dist);
+        }
+      }
+
+      mbmi->interinter_comp.wedge_index = best_mask_index;
+      mbmi->interinter_comp.wedge_sign = best_wedge_sign;
+      mbmi->mv[0] = tmp_mv[0];
+      mbmi->mv[1] = tmp_mv[1];
+      tmp_rate_mv = best_rate_mv;
+      rs2 = best_rs2;
+    } else if (!cpi->sf.inter_sf.enable_fast_compound_mode_search &&
+               cur_type == COMPOUND_DIFFWTD) {
+      int_mv tmp_mv[2];
+      int best_mask_index = 0;
+      rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+      int need_mask_search = args->diffwtd_index == -1;
+
+      for (int mask_index = 0; mask_index < 2 && need_mask_search;
+           ++mask_index) {
+        tmp_rate_mv = *rate_mv;
+        mbmi->interinter_comp.mask_type = mask_index;
+        if (have_newmv_in_inter_mode(this_mode)) {
+          // hard coded number for diff wtd
+          int mask_value = mask_index == 0 ? 38 : 26;
+          memset(xd->seg_mask, mask_value,
+                 sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        RD_STATS est_rd_stats;
+        int64_t this_rd_cur = INT64_MAX;
+        int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                              rs2 + *rate_mv);
+        if (eval_txfm) {
+          this_rd_cur =
+              estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        }
+        if (this_rd_cur < INT64_MAX) {
+          this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+        }
+
+        if (this_rd_cur < best_rd_cur) {
+          best_rd_cur = this_rd_cur;
+          best_mask_index = mbmi->interinter_comp.mask_type;
+          tmp_mv[0] = mbmi->mv[0];
+          tmp_mv[1] = mbmi->mv[1];
+        }
+      }
+
+      if (need_mask_search) {
+        if (save_mask_search_results(this_mode, 0))
+          args->diffwtd_index = best_mask_index;
+      } else {
+        mbmi->interinter_comp.mask_type = args->diffwtd_index;
+        rs2 = masked_type_cost[cur_type];
+        rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+        int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26;
+        memset(xd->seg_mask, mask_value,
+               sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+
+        if (have_newmv_in_inter_mode(this_mode)) {
+          tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                              bsize, this_mode);
+        }
+        best_mask_index = mbmi->interinter_comp.mask_type;
+        tmp_mv[0] = mbmi->mv[0];
+        tmp_mv[1] = mbmi->mv[1];
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                      AOM_PLANE_Y, AOM_PLANE_Y);
+        RD_STATS est_rd_stats;
+        int64_t this_rd_cur = INT64_MAX;
+        int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+                                              rs2 + *rate_mv);
+        if (eval_txfm) {
+          this_rd_cur =
+              estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+        }
+        if (this_rd_cur < INT64_MAX) {
+          best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+                               est_rd_stats.dist);
+        }
+      }
+
+      mbmi->interinter_comp.mask_type = best_mask_index;
+      mbmi->mv[0] = tmp_mv[0];
+      mbmi->mv[1] = tmp_mv[1];
     } else {
       // Handle masked compound types
-      update_mbmi_for_compound_type(mbmi, cur_type);
-      rs2 = masked_type_cost[cur_type];
       // Factors to control gating of compound type selection based on best
       // approximate rd so far
       const int max_comp_type_rd_threshold_mul =
@@ -1469,37 +1617,35 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
             ref_skip_rd);
       }
     }
+
     // Update stats for best compound type
     if (best_rd_cur < *rd) {
       update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
                        comp_model_rd_cur, rs2);
-      if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
-        memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
-        if (have_newmv_in_inter_mode(this_mode))
-          update_mask_best_mv(mbmi, best_mv, cur_mv, cur_type,
-                              &best_tmp_rate_mv, tmp_rate_mv, &cpi->sf);
-      }
+      if (have_newmv_in_inter_mode(this_mode))
+        update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv);
     }
     // reset to original mvs for next iteration
     mbmi->mv[0].as_int = cur_mv[0].as_int;
     mbmi->mv[1].as_int = cur_mv[1].as_int;
   }
-  if (mbmi->interinter_comp.type != best_type_stats.best_compound_data.type) {
-    mbmi->comp_group_idx =
-        (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
-    mbmi->compound_idx =
-        !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
-    mbmi->interinter_comp = best_type_stats.best_compound_data;
-    memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
-  }
+
+  mbmi->comp_group_idx =
+      (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+  mbmi->compound_idx =
+      !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+  mbmi->interinter_comp = best_type_stats.best_compound_data;
+
   if (have_newmv_in_inter_mode(this_mode)) {
     mbmi->mv[0].as_int = best_mv[0].as_int;
     mbmi->mv[1].as_int = best_mv[1].as_int;
-    if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
-      rd_stats->rate += best_tmp_rate_mv - *rate_mv;
-      *rate_mv = best_tmp_rate_mv;
-    }
+    rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+    *rate_mv = best_tmp_rate_mv;
   }
+
+  if (this_mode == NEW_NEWMV)
+    args->cmp_mode[ref_frame] = mbmi->interinter_comp.type;
+
   restore_dst_buf(xd, *orig_dst, 1);
   if (!match_found)
     save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
diff --git a/media/libaom/src/av1/encoder/compound_type.h b/media/libaom/src/av1/encoder/compound_type.h
index f2bd857c9a..a028a35093 100644
--- a/media/libaom/src/av1/encoder/compound_type.h
+++ b/media/libaom/src/av1/encoder/compound_type.h
@@ -26,6 +26,9 @@ typedef struct {
   int best_compmode_interinter_cost;
 } BEST_COMP_TYPE_STATS;
 
+#define IGNORE_MODE -1
+// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode
+// is found, 0 otherwise.
 int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                 BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
                                 HandleInterModeArgs *args, int64_t ref_best_rd,
@@ -33,7 +36,8 @@ int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                 const BUFFER_SET *orig_dst);
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
diff --git a/media/libaom/src/av1/encoder/context_tree.c b/media/libaom/src/av1/encoder/context_tree.c
index 9b5b1cbf9d..7153ceb5d5 100644
--- a/media/libaom/src/av1/encoder/context_tree.c
+++ b/media/libaom/src/av1/encoder/context_tree.c
@@ -11,191 +11,257 @@
 
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
 
-static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
-  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
-};
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                           PICK_MODE_CONTEXT *src_ctx) {
+  dst_ctx->mic = src_ctx->mic;
+  dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
+
+  dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+  dst_ctx->skippable = src_ctx->skippable;
+#if CONFIG_INTERNAL_STATS
+  dst_ctx->best_mode_index = src_ctx->best_mode_index;
+#endif  // CONFIG_INTERNAL_STATS
 
-typedef struct {
-  tran_low_t *coeff_buf[MAX_MB_PLANE];
-  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
-  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
-} PC_TREE_SHARED_BUFFERS;
+  memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+         sizeof(uint8_t) * src_ctx->num_4x4_blk);
+  av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
+                 src_ctx->num_4x4_blk);
+
+  dst_ctx->rd_stats = src_ctx->rd_stats;
+  dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+}
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error) {
+  const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+  const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+  const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x +
+                                                   seq_params->subsampling_y);
+  for (int i = 0; i < num_planes; i++) {
+    const int max_num_pix =
+        (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv;
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+  }
+}
+
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  for (int i = 0; i < 3; i++) {
+    aom_free(shared_bufs->coeff_buf[i]);
+    aom_free(shared_bufs->qcoeff_buf[i]);
+    aom_free(shared_bufs->dqcoeff_buf[i]);
+    shared_bufs->coeff_buf[i] = NULL;
+    shared_bufs->qcoeff_buf[i] = NULL;
+    shared_bufs->dqcoeff_buf[i] = NULL;
+  }
+}
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+                                 BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  PICK_MODE_CONTEXT *ctx = NULL;
+  const AV1_COMMON *const cm = &cpi->common;
+  struct aom_internal_error_info error;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+  ctx->rd_mode_is_ready = 0;
 
-static AOM_INLINE void alloc_mode_context(AV1_COMMON *cm, int num_pix,
-                                          PICK_MODE_CONTEXT *ctx,
-                                          PC_TREE_SHARED_BUFFERS *shared_bufs) {
   const int num_planes = av1_num_planes(cm);
-  int i;
+  const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
   const int num_blk = num_pix / 16;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+                      aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+  AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+                      aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
   ctx->num_4x4_blk = num_blk;
 
-  CHECK_MEM_ERROR(cm, ctx->blk_skip,
-                  aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
-  CHECK_MEM_ERROR(cm, ctx->tx_type_map,
-                  aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
-  for (i = 0; i < num_planes; ++i) {
+  for (int i = 0; i < num_planes; ++i) {
     ctx->coeff[i] = shared_bufs->coeff_buf[i];
     ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
     ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
-    CHECK_MEM_ERROR(cm, ctx->eobs[i],
-                    aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
-    CHECK_MEM_ERROR(
-        cm, ctx->txb_entropy_ctx[i],
+    AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+                        aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+    AOM_CHECK_MEM_ERROR(
+        &error, ctx->txb_entropy_ctx[i],
         aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
   }
 
   if (num_pix <= MAX_PALETTE_SQUARE) {
-    for (i = 0; i < 2; ++i) {
-      CHECK_MEM_ERROR(
-          cm, ctx->color_index_map[i],
-          aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+    for (int i = 0; i < 2; ++i) {
+      if (cm->features.allow_screen_content_tools) {
+        AOM_CHECK_MEM_ERROR(
+            &error, ctx->color_index_map[i],
+            aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+      } else {
+        ctx->color_index_map[i] = NULL;
+      }
     }
   }
+
+  av1_invalid_rd_stats(&ctx->rd_stats);
+
+  return ctx;
 }
 
-static AOM_INLINE void free_mode_context(PICK_MODE_CONTEXT *ctx,
-                                         const int num_planes) {
-  int i;
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) {
+  av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk);
+  av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk);
+  av1_invalid_rd_stats(&ctx->rd_stats);
+}
+
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) {
+  if (ctx == NULL) return;
+
   aom_free(ctx->blk_skip);
-  ctx->blk_skip = 0;
+  ctx->blk_skip = NULL;
   aom_free(ctx->tx_type_map);
-  ctx->tx_type_map = 0;
-  for (i = 0; i < num_planes; ++i) {
-    ctx->coeff[i] = 0;
-    ctx->qcoeff[i] = 0;
-    ctx->dqcoeff[i] = 0;
+  for (int i = 0; i < num_planes; ++i) {
+    ctx->coeff[i] = NULL;
+    ctx->qcoeff[i] = NULL;
+    ctx->dqcoeff[i] = NULL;
     aom_free(ctx->eobs[i]);
-    ctx->eobs[i] = 0;
+    ctx->eobs[i] = NULL;
     aom_free(ctx->txb_entropy_ctx[i]);
-    ctx->txb_entropy_ctx[i] = 0;
+    ctx->txb_entropy_ctx[i] = NULL;
   }
 
-  for (i = 0; i < 2; ++i) {
-    aom_free(ctx->color_index_map[i]);
-    ctx->color_index_map[i] = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (ctx->color_index_map[i]) {
+      aom_free(ctx->color_index_map[i]);
+      ctx->color_index_map[i] = NULL;
+    }
   }
+
+  aom_free(ctx);
 }
 
-static AOM_INLINE void alloc_tree_contexts(
-    AV1_COMMON *cm, PC_TREE *tree, int num_pix, int is_leaf,
-    PC_TREE_SHARED_BUFFERS *shared_bufs) {
-  alloc_mode_context(cm, num_pix, &tree->none, shared_bufs);
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+  PC_TREE *pc_tree = NULL;
+  struct aom_internal_error_info error;
 
-  if (is_leaf) return;
+  AOM_CHECK_MEM_ERROR(&error, pc_tree, aom_calloc(1, sizeof(*pc_tree)));
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[0], shared_bufs);
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->block_size = bsize;
+  pc_tree->index = 0;
+
+  pc_tree->none = NULL;
+  for (int i = 0; i < 2; ++i) {
+    pc_tree->horizontal[i] = NULL;
+    pc_tree->vertical[i] = NULL;
+  }
+  for (int i = 0; i < 3; ++i) {
+    pc_tree->horizontala[i] = NULL;
+    pc_tree->horizontalb[i] = NULL;
+    pc_tree->verticala[i] = NULL;
+    pc_tree->verticalb[i] = NULL;
+  }
+  for (int i = 0; i < 4; ++i) {
+    pc_tree->horizontal4[i] = NULL;
+    pc_tree->vertical4[i] = NULL;
+    pc_tree->split[i] = NULL;
+  }
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[1], shared_bufs);
+  return pc_tree;
+}
 
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2], shared_bufs);
+#define FREE_PMC_NODE(CTX)         \
+  do {                             \
+    av1_free_pmc(CTX, num_planes); \
+    CTX = NULL;                    \
+  } while (0)
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2], shared_bufs);
+void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best,
+                                int keep_none) {
+  if (pc_tree == NULL) return;
 
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 2, &tree->verticala[2], shared_bufs);
+  const PARTITION_TYPE partition = pc_tree->partitioning;
 
-  alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1], shared_bufs);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2], shared_bufs);
+  if (!keep_none && (!keep_best || (partition != PARTITION_NONE)))
+    FREE_PMC_NODE(pc_tree->none);
 
+  for (int i = 0; i < 2; ++i) {
+    if (!keep_best || (partition != PARTITION_HORZ))
+      FREE_PMC_NODE(pc_tree->horizontal[i]);
+    if (!keep_best || (partition != PARTITION_VERT))
+      FREE_PMC_NODE(pc_tree->vertical[i]);
+  }
+  for (int i = 0; i < 3; ++i) {
+    if (!keep_best || (partition != PARTITION_HORZ_A))
+      FREE_PMC_NODE(pc_tree->horizontala[i]);
+    if (!keep_best || (partition != PARTITION_HORZ_B))
+      FREE_PMC_NODE(pc_tree->horizontalb[i]);
+    if (!keep_best || (partition != PARTITION_VERT_A))
+      FREE_PMC_NODE(pc_tree->verticala[i]);
+    if (!keep_best || (partition != PARTITION_VERT_B))
+      FREE_PMC_NODE(pc_tree->verticalb[i]);
+  }
   for (int i = 0; i < 4; ++i) {
-    alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i], shared_bufs);
-    alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i], shared_bufs);
+    if (!keep_best || (partition != PARTITION_HORZ_4))
+      FREE_PMC_NODE(pc_tree->horizontal4[i]);
+    if (!keep_best || (partition != PARTITION_VERT_4))
+      FREE_PMC_NODE(pc_tree->vertical4[i]);
   }
-}
 
-static AOM_INLINE void free_tree_contexts(PC_TREE *tree, const int num_planes) {
-  int i;
-  for (i = 0; i < 3; i++) {
-    free_mode_context(&tree->horizontala[i], num_planes);
-    free_mode_context(&tree->horizontalb[i], num_planes);
-    free_mode_context(&tree->verticala[i], num_planes);
-    free_mode_context(&tree->verticalb[i], num_planes);
-  }
-  for (i = 0; i < 4; ++i) {
-    free_mode_context(&tree->horizontal4[i], num_planes);
-    free_mode_context(&tree->vertical4[i], num_planes);
+  if (!keep_best || (partition != PARTITION_SPLIT)) {
+    for (int i = 0; i < 4; ++i) {
+      if (pc_tree->split[i] != NULL) {
+        av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0);
+        pc_tree->split[i] = NULL;
+      }
+    }
   }
-  free_mode_context(&tree->none, num_planes);
-  free_mode_context(&tree->horizontal[0], num_planes);
-  free_mode_context(&tree->horizontal[1], num_planes);
-  free_mode_context(&tree->vertical[0], num_planes);
-  free_mode_context(&tree->vertical[1], num_planes);
-}
 
-// This function will compute the number of pc_tree nodes to be allocated
-// or freed as per the super block size of BLOCK_128X128 or BLOCK_64X64
-static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
-                                        int stat_generation_stage) {
-  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
-  const int tree_nodes =
-      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
-  return tree_nodes;
+  if (!keep_best && !keep_none) aom_free(pc_tree);
 }
 
-// This function sets up a tree of contexts such that at each square
-// partition level. There are contexts for none, horizontal, vertical, and
-// split.  Along with a block_size value and a selected block_size which
-// represents the state of our search.
-void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+  // The structure 'sms_tree' is used to store the simple motion search data for
+  // partition pruning in inter frames. Hence, the memory allocations and
+  // initializations related to it are avoided for allintra encoding mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max == 0) return;
+
   AV1_COMMON *const cm = &cpi->common;
-  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
-  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
   const int tree_nodes =
-      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
-  int pc_tree_index = 0;
-  PC_TREE *this_pc;
-  PC_TREE_SHARED_BUFFERS shared_bufs;
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int sms_tree_index = 0;
+  SIMPLE_MOTION_DATA_TREE *this_sms;
   int square_index = 1;
   int nodes;
 
-  aom_free(td->pc_tree);
-  CHECK_MEM_ERROR(cm, td->pc_tree,
-                  aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
-  this_pc = &td->pc_tree[0];
-
-  for (i = 0; i < 3; i++) {
-    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
-    CHECK_MEM_ERROR(cm, td->tree_coeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, td->tree_qcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, td->tree_dqcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    shared_bufs.coeff_buf[i] = td->tree_coeff_buf[i];
-    shared_bufs.qcoeff_buf[i] = td->tree_qcoeff_buf[i];
-    shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
-  }
+  aom_free(td->sms_tree);
+  CHECK_MEM_ERROR(cm, td->sms_tree,
+                  aom_calloc(tree_nodes, sizeof(*td->sms_tree)));
+  this_sms = &td->sms_tree[0];
 
   if (!stat_generation_stage) {
     const int leaf_factor = is_sb_size_128 ? 4 : 1;
     const int leaf_nodes = 256 * leaf_factor;
 
     // Sets up all the leaf nodes in the tree.
-    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+      SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
       tree->block_size = square[0];
-      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
     }
 
     // Each node has 4 leaf nodes, fill each block_size level of the tree
     // from leafs to the root.
     for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
-      for (i = 0; i < nodes; ++i) {
-        PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-        alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0,
-                            &shared_bufs);
+      for (int i = 0; i < nodes; ++i) {
+        SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
         tree->block_size = square[square_index];
-        for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
-        ++pc_tree_index;
+        for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+        ++sms_tree_index;
       }
       ++square_index;
     }
@@ -203,66 +269,18 @@ void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
     // Allocation for firstpass/LAP stage
     // TODO(Mufaddal): refactor square_index to use a common block_size macro
     // from firstpass.c
-    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
     square_index = 2;
-    alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs);
     tree->block_size = square[square_index];
   }
 
-  // Set up the root node for the applicable superblock size
-  td->pc_root = &td->pc_tree[tree_nodes - 1];
-#if CONFIG_INTERNAL_STATS
-  td->pc_root->none.best_mode_index = THR_INVALID;
-#endif  // CONFIG_INTERNAL_STATS
+  // Set up the root node for the largest superblock size
+  td->sms_root = &td->sms_tree[tree_nodes - 1];
 }
 
-void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
-                      const int num_planes, BLOCK_SIZE sb_size) {
-  int stat_generation_stage = is_stat_generation_stage(cpi);
-  if (td->pc_tree != NULL) {
-    const int is_sb_size_128 = sb_size == BLOCK_128X128;
-    const int tree_nodes =
-        get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
-    for (int i = 0; i < tree_nodes; ++i) {
-      free_tree_contexts(&td->pc_tree[i], num_planes);
-    }
-    for (int i = 0; i < 3; ++i) {
-      aom_free(td->tree_coeff_buf[i]);
-      aom_free(td->tree_qcoeff_buf[i]);
-      aom_free(td->tree_dqcoeff_buf[i]);
-      td->tree_coeff_buf[i] = NULL;
-      td->tree_qcoeff_buf[i] = NULL;
-      td->tree_dqcoeff_buf[i] = NULL;
-    }
-    aom_free(td->pc_tree);
-    td->pc_tree = NULL;
+void av1_free_sms_tree(ThreadData *td) {
+  if (td->sms_tree != NULL) {
+    aom_free(td->sms_tree);
+    td->sms_tree = NULL;
   }
 }
-
-void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
-                           PICK_MODE_CONTEXT *src_ctx) {
-  dst_ctx->mic = src_ctx->mic;
-  dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
-
-  dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
-  dst_ctx->skippable = src_ctx->skippable;
-#if CONFIG_INTERNAL_STATS
-  dst_ctx->best_mode_index = src_ctx->best_mode_index;
-#endif  // CONFIG_INTERNAL_STATS
-
-  memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
-         sizeof(uint8_t) * src_ctx->num_4x4_blk);
-  av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
-                 src_ctx->num_4x4_blk);
-
-  dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff;
-  dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff;
-  dst_ctx->single_pred_diff = src_ctx->single_pred_diff;
-
-  dst_ctx->rd_stats = src_ctx->rd_stats;
-  dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
-
-  memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
-
-  dst_ctx->partition = src_ctx->partition;
-}
diff --git a/media/libaom/src/av1/encoder/context_tree.h b/media/libaom/src/av1/encoder/context_tree.h
index a39979413e..34305c3487 100644
--- a/media/libaom/src/av1/encoder/context_tree.h
+++ b/media/libaom/src/av1/encoder/context_tree.h
@@ -21,12 +21,19 @@
 extern "C" {
 #endif
 
+struct AV1_PRIMARY;
 struct AV1_COMP;
 struct AV1Common;
 struct ThreadData;
 
-// Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
+  tran_low_t *coeff_buf[MAX_MB_PLANE];
+  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct PICK_MODE_CONTEXT {
   MB_MODE_INFO mic;
   MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
   uint8_t *color_index_map[2];
@@ -46,35 +53,42 @@ typedef struct {
 #if CONFIG_INTERNAL_STATS
   THR_MODES best_mode_index;
 #endif  // CONFIG_INTERNAL_STATS
-  int hybrid_pred_diff;
-  int comp_pred_diff;
-  int single_pred_diff;
-
   RD_STATS rd_stats;
 
   int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
                          // been made.
-
-  // motion vector cache for adaptive motion search control in partition
-  // search loop
-  MV pred_mv[REF_FRAMES];
-  PARTITION_TYPE partition;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  int64_t newmv_sse;
+  int64_t zeromv_sse;
+  int64_t zeromv_lastref_sse;
+  PREDICTION_MODE best_sse_inter_mode;
+  int_mv best_sse_mv;
+  MV_REFERENCE_FRAME best_reference_frame;
+  MV_REFERENCE_FRAME best_zeromv_reference_frame;
+  int sb_skip_denoising;
+#endif
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
   PARTITION_TYPE partitioning;
   BLOCK_SIZE block_size;
-  PICK_MODE_CONTEXT none;
-  PICK_MODE_CONTEXT horizontal[2];
-  PICK_MODE_CONTEXT vertical[2];
-  PICK_MODE_CONTEXT horizontala[3];
-  PICK_MODE_CONTEXT horizontalb[3];
-  PICK_MODE_CONTEXT verticala[3];
-  PICK_MODE_CONTEXT verticalb[3];
-  PICK_MODE_CONTEXT horizontal4[4];
-  PICK_MODE_CONTEXT vertical4[4];
+  PICK_MODE_CONTEXT *none;
+  PICK_MODE_CONTEXT *horizontal[2];
+  PICK_MODE_CONTEXT *vertical[2];
+  PICK_MODE_CONTEXT *horizontala[3];
+  PICK_MODE_CONTEXT *horizontalb[3];
+  PICK_MODE_CONTEXT *verticala[3];
+  PICK_MODE_CONTEXT *verticalb[3];
+  PICK_MODE_CONTEXT *horizontal4[4];
+  PICK_MODE_CONTEXT *vertical4[4];
   struct PC_TREE *split[4];
   int index;
+} PC_TREE;
+
+typedef struct SIMPLE_MOTION_DATA_TREE {
+  BLOCK_SIZE block_size;
+  PARTITION_TYPE partitioning;
+  struct SIMPLE_MOTION_DATA_TREE *split[4];
 
   // Simple motion search_features
   FULLPEL_MV start_mvs[REF_FRAMES];
@@ -82,14 +96,40 @@ typedef struct PC_TREE {
   unsigned int sms_rect_feat[8];
   int sms_none_valid;
   int sms_rect_valid;
-} PC_TREE;
-
-void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
-void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
-                      const int num_planes, BLOCK_SIZE sb_size);
+} SIMPLE_MOTION_DATA_TREE;
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+                                int keep_none);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+                                 BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+                                            int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
+void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/corner_match.c b/media/libaom/src/av1/encoder/corner_match.c
index 12f633b4fb..3631be9011 100644
--- a/media/libaom/src/av1/encoder/corner_match.c
+++ b/media/libaom/src/av1/encoder/corner_match.c
@@ -15,7 +15,6 @@
 
 #include "config/av1_rtcd.h"
 
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 #define SEARCH_SZ 9
@@ -66,7 +65,6 @@ double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1,
     }
   var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   cov = cross * MATCH_SZ_SQ - sum1 * sum2;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
 
@@ -141,20 +139,20 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref,
   }
 }
 
-int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
-                                 int num_frm_corners, unsigned char *ref,
+int av1_determine_correspondence(unsigned char *src, int *src_corners,
+                                 int num_src_corners, unsigned char *ref,
                                  int *ref_corners, int num_ref_corners,
-                                 int width, int height, int frm_stride,
+                                 int width, int height, int src_stride,
                                  int ref_stride, int *correspondence_pts) {
   // TODO(sarahparker) Improve this to include 2-way match
   int i, j;
   Correspondence *correspondences = (Correspondence *)correspondence_pts;
   int num_correspondences = 0;
-  for (i = 0; i < num_frm_corners; ++i) {
+  for (i = 0; i < num_src_corners; ++i) {
     double best_match_ncc = 0.0;
     double template_norm;
     int best_match_j = -1;
-    if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+    if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width,
                            height))
       continue;
     for (j = 0; j < num_ref_corners; ++j) {
@@ -162,12 +160,12 @@ int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
       if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
                              height))
         continue;
-      if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+      if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1],
                                 ref_corners[2 * j], ref_corners[2 * j + 1],
                                 width, height))
         continue;
       match_ncc = av1_compute_cross_correlation(
-          frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+          src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref,
           ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
       if (match_ncc > best_match_ncc) {
         best_match_ncc = match_ncc;
@@ -177,18 +175,18 @@ int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
     // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
     // but need to account for the normalization in
     // av1_compute_cross_correlation.
-    template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
-                                     frm_corners[2 * i + 1]);
+    template_norm = compute_variance(src, src_stride, src_corners[2 * i],
+                                     src_corners[2 * i + 1]);
     if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
-      correspondences[num_correspondences].x = frm_corners[2 * i];
-      correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+      correspondences[num_correspondences].x = src_corners[2 * i];
+      correspondences[num_correspondences].y = src_corners[2 * i + 1];
       correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
       correspondences[num_correspondences].ry =
           ref_corners[2 * best_match_j + 1];
       num_correspondences++;
     }
   }
-  improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+  improve_correspondence(src, ref, width, height, src_stride, ref_stride,
                          correspondences, num_correspondences);
   return num_correspondences;
 }
diff --git a/media/libaom/src/av1/encoder/corner_match.h b/media/libaom/src/av1/encoder/corner_match.h
index 3cf6de159d..45c90f32f0 100644
--- a/media/libaom/src/av1/encoder/corner_match.h
+++ b/media/libaom/src/av1/encoder/corner_match.h
@@ -24,10 +24,10 @@ typedef struct {
   int rx, ry;
 } Correspondence;
 
-int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
-                                 int num_frm_corners, unsigned char *ref,
+int av1_determine_correspondence(unsigned char *src, int *src_corners,
+                                 int num_src_corners, unsigned char *ref,
                                  int *ref_corners, int num_ref_corners,
-                                 int width, int height, int frm_stride,
+                                 int width, int height, int src_stride,
                                  int ref_stride, int *correspondence_pts);
 
 #endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/media/libaom/src/av1/encoder/deltaq4_model.c b/media/libaom/src/av1/encoder/deltaq4_model.c
new file mode 100644
index 0000000000..60a7e6d2cf
--- /dev/null
+++ b/media/libaom/src/av1/encoder/deltaq4_model.c
@@ -0,0 +1,7776 @@
+/* Embedded file: model.tflite */
+const int av1_deltaq4_model_fsize = 101032;
+const unsigned char av1_deltaq4_model_file[101032] = {
+  0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c,
+  0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00,
+  0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00,
+  0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00,
+  0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04,
+  0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00,
+  0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72,
+  0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14,
+  0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65,
+  0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+  0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+  0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
+  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04,
+  0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69,
+  0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00,
+  0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01,
+  0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00,
+  0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc,
+  0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00,
+  0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00,
+  0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
+  0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04,
+  0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00,
+  0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff,
+  0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64,
+  0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77,
+  0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00,
+  0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd,
+  0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60,
+  0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a,
+  0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7,
+  0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd,
+  0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb,
+  0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76,
+  0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02,
+  0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c,
+  0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9,
+  0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68,
+  0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a,
+  0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d,
+  0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31,
+  0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00,
+  0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba,
+  0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd,
+  0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf,
+  0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5,
+  0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7,
+  0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d,
+  0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d,
+  0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab,
+  0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e,
+  0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d,
+  0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71,
+  0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63,
+  0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45,
+  0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd,
+  0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37,
+  0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c,
+  0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3,
+  0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c,
+  0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74,
+  0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32,
+  0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16,
+  0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd,
+  0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee,
+  0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31,
+  0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26,
+  0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 0x7b, 0xef, 0xc8, 0x3d,
+  0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7,
+  0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94,
+  0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3,
+  0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd,
+  0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf,
+  0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c,
+  0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a,
+  0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba,
+  0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90,
+  0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a,
+  0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf,
+  0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc,
+  0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66,
+  0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79,
+  0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17,
+  0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc,
+  0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1,
+  0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c,
+  0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f,
+  0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c,
+  0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4,
+  0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a,
+  0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68,
+  0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd,
+  0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb,
+  0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88,
+  0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54,
+  0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d,
+  0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b,
+  0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58,
+  0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20,
+  0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d,
+  0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f,
+  0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03,
+  0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3,
+  0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc,
+  0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40,
+  0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29,
+  0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb,
+  0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d,
+  0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b,
+  0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c,
+  0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5,
+  0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d,
+  0x30, 0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda,
+  0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88,
+  0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96,
+  0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c,
+  0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72,
+  0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad,
+  0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54,
+  0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d,
+  0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47,
+  0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27,
+  0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd,
+  0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd,
+  0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45,
+  0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd,
+  0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1,
+  0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d,
+  0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e,
+  0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc,
+  0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7,
+  0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d,
+  0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad,
+  0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38,
+  0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b,
+  0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d,
+  0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76,
+  0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f,
+  0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde,
+  0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd,
+  0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf,
+  0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a,
+  0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68,
+  0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d,
+  0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d,
+  0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3,
+  0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f,
+  0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c,
+  0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21,
+  0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82,
+  0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2,
+  0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d,
+  0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6,
+  0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8,
+  0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2,
+  0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d,
+  0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55,
+  0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24,
+  0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba,
+  0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d,
+  0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32,
+  0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59,
+  0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99,
+  0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd,
+  0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16,
+  0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f,
+  0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52,
+  0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d,
+  0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e,
+  0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8,
+  0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef,
+  0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d,
+  0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06,
+  0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e,
+  0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f,
+  0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d,
+  0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a,
+  0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69,
+  0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5,
+  0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d,
+  0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac,
+  0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0,
+  0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5,
+  0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e,
+  0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe,
+  0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54,
+  0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c,
+  0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d,
+  0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe,
+  0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea,
+  0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94,
+  0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d,
+  0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38,
+  0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68,
+  0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f,
+  0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd,
+  0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea,
+  0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b,
+  0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d,
+  0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d,
+  0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 0x3d, 0xd6,
+  0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8,
+  0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73,
+  0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd,
+  0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89,
+  0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68,
+  0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a,
+  0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd,
+  0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0,
+  0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7,
+  0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c,
+  0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc,
+  0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a,
+  0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12,
+  0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04,
+  0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9,
+  0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f,
+  0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55,
+  0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05,
+  0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd,
+  0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0,
+  0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47,
+  0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90,
+  0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd,
+  0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30,
+  0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34,
+  0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99,
+  0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d,
+  0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26,
+  0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a,
+  0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2,
+  0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe,
+  0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94,
+  0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa,
+  0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac,
+  0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd,
+  0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2,
+  0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40,
+  0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2,
+  0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d,
+  0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c,
+  0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4,
+  0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85,
+  0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d,
+  0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16,
+  0x46, 0x04, 0xbd, 0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4,
+  0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70,
+  0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd,
+  0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7,
+  0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc,
+  0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6,
+  0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d,
+  0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f,
+  0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c,
+  0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f,
+  0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d,
+  0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7,
+  0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0,
+  0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e,
+  0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd,
+  0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4,
+  0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89,
+  0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f,
+  0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d,
+  0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9,
+  0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b,
+  0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb,
+  0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d,
+  0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3,
+  0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b,
+  0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac,
+  0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd,
+  0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a,
+  0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e,
+  0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde,
+  0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba,
+  0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06,
+  0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d,
+  0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1,
+  0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d,
+  0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c,
+  0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf,
+  0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1,
+  0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d,
+  0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a,
+  0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76,
+  0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73,
+  0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc,
+  0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a,
+  0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 0xd6, 0x4a, 0xbd, 0xb2, 0xb3,
+  0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63,
+  0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d,
+  0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c,
+  0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e,
+  0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92,
+  0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd,
+  0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8,
+  0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5,
+  0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf,
+  0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d,
+  0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17,
+  0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19,
+  0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1,
+  0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c,
+  0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b,
+  0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd,
+  0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba,
+  0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd,
+  0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0,
+  0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c,
+  0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4,
+  0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd,
+  0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e,
+  0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22,
+  0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c,
+  0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d,
+  0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7,
+  0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19,
+  0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7,
+  0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c,
+  0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d,
+  0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37,
+  0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf,
+  0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd,
+  0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13,
+  0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19,
+  0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf,
+  0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd,
+  0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26,
+  0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe,
+  0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5,
+  0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc,
+  0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c,
+  0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29,
+  0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56,
+  0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc,
+  0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22,
+  0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf,
+  0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04,
+  0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc,
+  0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e,
+  0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95,
+  0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c,
+  0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd,
+  0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb,
+  0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2,
+  0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b,
+  0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d,
+  0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52,
+  0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97,
+  0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f,
+  0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c,
+  0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd,
+  0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca,
+  0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15,
+  0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc,
+  0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35,
+  0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf,
+  0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c,
+  0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd,
+  0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f,
+  0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6,
+  0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48,
+  0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd,
+  0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d,
+  0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16,
+  0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45,
+  0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d,
+  0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda,
+  0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb,
+  0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4,
+  0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd,
+  0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01,
+  0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f,
+  0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7,
+  0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d,
+  0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b,
+  0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32,
+  0x8f, 0xbd, 0xed, 0x11, 0xbe, 0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee,
+  0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d,
+  0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a,
+  0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba,
+  0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc,
+  0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd,
+  0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce,
+  0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4,
+  0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a,
+  0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd,
+  0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44,
+  0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31,
+  0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e,
+  0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d,
+  0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71,
+  0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f,
+  0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e,
+  0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b,
+  0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f,
+  0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99,
+  0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f,
+  0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd,
+  0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97,
+  0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe,
+  0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf,
+  0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d,
+  0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9,
+  0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85,
+  0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55,
+  0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd,
+  0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8,
+  0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71,
+  0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a,
+  0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b,
+  0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51,
+  0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2,
+  0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53,
+  0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d,
+  0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa,
+  0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb,
+  0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96,
+  0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d,
+  0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f,
+  0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce,
+  0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 0x35, 0x5e, 0x9e,
+  0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd,
+  0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2,
+  0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d,
+  0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22,
+  0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d,
+  0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16,
+  0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda,
+  0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0,
+  0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd,
+  0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46,
+  0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6,
+  0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3,
+  0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc,
+  0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f,
+  0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9,
+  0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d,
+  0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d,
+  0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d,
+  0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0,
+  0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0,
+  0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9,
+  0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c,
+  0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8,
+  0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f,
+  0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d,
+  0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1,
+  0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18,
+  0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11,
+  0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d,
+  0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06,
+  0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67,
+  0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68,
+  0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b,
+  0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae,
+  0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde,
+  0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a,
+  0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d,
+  0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed,
+  0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f,
+  0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62,
+  0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d,
+  0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5,
+  0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd,
+  0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a,
+  0xbd, 0xfe, 0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd,
+  0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77,
+  0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1,
+  0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5,
+  0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd,
+  0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b,
+  0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62,
+  0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97,
+  0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd,
+  0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea,
+  0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b,
+  0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17,
+  0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd,
+  0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20,
+  0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc,
+  0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08,
+  0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d,
+  0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57,
+  0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36,
+  0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e,
+  0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe,
+  0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6,
+  0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40,
+  0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04,
+  0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd,
+  0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43,
+  0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23,
+  0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24,
+  0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd,
+  0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88,
+  0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8,
+  0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e,
+  0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd,
+  0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4,
+  0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0,
+  0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd,
+  0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd,
+  0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a,
+  0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e,
+  0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce,
+  0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d,
+  0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc,
+  0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85,
+  0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd,
+  0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd,
+  0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd,
+  0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc,
+  0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a,
+  0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d,
+  0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54,
+  0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61,
+  0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94,
+  0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd,
+  0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8,
+  0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52,
+  0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a,
+  0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d,
+  0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c,
+  0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81,
+  0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa,
+  0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd,
+  0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b,
+  0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96,
+  0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88,
+  0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd,
+  0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad,
+  0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01,
+  0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46,
+  0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d,
+  0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f,
+  0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6,
+  0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc,
+  0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd,
+  0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24,
+  0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08,
+  0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5,
+  0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd,
+  0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43,
+  0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc,
+  0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51,
+  0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c,
+  0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5,
+  0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f,
+  0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88,
+  0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d,
+  0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60,
+  0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e,
+  0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b,
+  0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 0xbc,
+  0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0,
+  0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04,
+  0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98,
+  0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd,
+  0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68,
+  0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83,
+  0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a,
+  0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d,
+  0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f,
+  0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21,
+  0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93,
+  0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d,
+  0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6,
+  0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea,
+  0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3,
+  0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e,
+  0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1,
+  0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15,
+  0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c,
+  0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d,
+  0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5,
+  0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35,
+  0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf,
+  0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd,
+  0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19,
+  0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d,
+  0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5,
+  0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd,
+  0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85,
+  0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c,
+  0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88,
+  0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd,
+  0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d,
+  0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15,
+  0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb,
+  0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd,
+  0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15,
+  0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2,
+  0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d,
+  0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e,
+  0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51,
+  0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7,
+  0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2,
+  0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd,
+  0x8c, 0xfc, 0xca, 0xbc, 0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa,
+  0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca,
+  0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c,
+  0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe,
+  0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a,
+  0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa,
+  0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e,
+  0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d,
+  0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f,
+  0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf,
+  0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57,
+  0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d,
+  0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31,
+  0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24,
+  0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03,
+  0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d,
+  0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85,
+  0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47,
+  0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7,
+  0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe,
+  0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11,
+  0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75,
+  0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61,
+  0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d,
+  0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a,
+  0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f,
+  0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa,
+  0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc,
+  0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35,
+  0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57,
+  0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07,
+  0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d,
+  0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92,
+  0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01,
+  0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f,
+  0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d,
+  0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89,
+  0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e,
+  0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff,
+  0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d,
+  0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63,
+  0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d,
+  0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6,
+  0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc,
+  0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 0xe2, 0x4f, 0xbd, 0x54,
+  0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39,
+  0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56,
+  0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd,
+  0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51,
+  0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75,
+  0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad,
+  0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc,
+  0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5,
+  0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a,
+  0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5,
+  0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd,
+  0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63,
+  0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2,
+  0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5,
+  0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd,
+  0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e,
+  0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6,
+  0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67,
+  0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d,
+  0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff,
+  0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf,
+  0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b,
+  0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d,
+  0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0,
+  0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f,
+  0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7,
+  0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b,
+  0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0,
+  0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93,
+  0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2,
+  0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d,
+  0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24,
+  0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53,
+  0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16,
+  0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc,
+  0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72,
+  0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6,
+  0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb,
+  0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd,
+  0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0,
+  0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42,
+  0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f,
+  0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb,
+  0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43,
+  0x2a, 0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8,
+  0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4,
+  0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d,
+  0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7,
+  0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb,
+  0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef,
+  0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c,
+  0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea,
+  0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22,
+  0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8,
+  0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb,
+  0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e,
+  0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45,
+  0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2,
+  0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e,
+  0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25,
+  0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec,
+  0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53,
+  0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd,
+  0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7,
+  0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd,
+  0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76,
+  0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d,
+  0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0,
+  0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed,
+  0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7,
+  0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d,
+  0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3,
+  0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf,
+  0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21,
+  0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc,
+  0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89,
+  0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe,
+  0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0,
+  0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d,
+  0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3,
+  0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36,
+  0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29,
+  0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d,
+  0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e,
+  0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75,
+  0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca,
+  0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb,
+  0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63,
+  0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a,
+  0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f,
+  0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc,
+  0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41,
+  0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63,
+  0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78,
+  0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd,
+  0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4,
+  0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4,
+  0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8,
+  0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd,
+  0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09,
+  0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11,
+  0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7,
+  0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c,
+  0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4,
+  0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc,
+  0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a,
+  0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb,
+  0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2,
+  0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d,
+  0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97,
+  0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d,
+  0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2,
+  0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e,
+  0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46,
+  0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d,
+  0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea,
+  0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0,
+  0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f,
+  0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c,
+  0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6,
+  0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01,
+  0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce,
+  0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c,
+  0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5,
+  0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15,
+  0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16,
+  0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc,
+  0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b,
+  0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d,
+  0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe,
+  0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc,
+  0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9,
+  0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 0x88, 0x2f,
+  0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93,
+  0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc,
+  0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87,
+  0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f,
+  0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90,
+  0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c,
+  0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3,
+  0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16,
+  0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f,
+  0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d,
+  0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1,
+  0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87,
+  0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5,
+  0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd,
+  0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac,
+  0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95,
+  0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c,
+  0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd,
+  0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba,
+  0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1,
+  0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c,
+  0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d,
+  0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90,
+  0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23,
+  0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e,
+  0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd,
+  0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32,
+  0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b,
+  0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b,
+  0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d,
+  0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b,
+  0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17,
+  0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5,
+  0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc,
+  0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e,
+  0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9,
+  0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92,
+  0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d,
+  0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76,
+  0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28,
+  0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32,
+  0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd,
+  0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2,
+  0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2,
+  0x23, 0x3d, 0x17, 0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08,
+  0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc,
+  0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f,
+  0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b,
+  0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16,
+  0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc,
+  0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9,
+  0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef,
+  0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34,
+  0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d,
+  0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf,
+  0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6,
+  0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde,
+  0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd,
+  0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec,
+  0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78,
+  0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1,
+  0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe,
+  0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79,
+  0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e,
+  0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa,
+  0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd,
+  0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89,
+  0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d,
+  0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24,
+  0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd,
+  0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81,
+  0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c,
+  0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41,
+  0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d,
+  0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20,
+  0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5,
+  0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c,
+  0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d,
+  0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed,
+  0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf,
+  0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9,
+  0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd,
+  0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e,
+  0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb,
+  0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d,
+  0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc,
+  0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96,
+  0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b,
+  0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 0x12, 0x3d, 0x14, 0x07, 0x96,
+  0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d,
+  0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07,
+  0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6,
+  0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1,
+  0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d,
+  0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e,
+  0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88,
+  0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0,
+  0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc,
+  0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60,
+  0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3,
+  0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce,
+  0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c,
+  0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e,
+  0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2,
+  0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00,
+  0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c,
+  0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49,
+  0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24,
+  0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5,
+  0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d,
+  0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d,
+  0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01,
+  0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00,
+  0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b,
+  0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0,
+  0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b,
+  0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f,
+  0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d,
+  0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b,
+  0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66,
+  0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48,
+  0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd,
+  0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26,
+  0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23,
+  0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc,
+  0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d,
+  0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6,
+  0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18,
+  0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a,
+  0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc,
+  0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05,
+  0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58,
+  0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00,
+  0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd,
+  0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81,
+  0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac,
+  0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0,
+  0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc,
+  0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27,
+  0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d,
+  0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c,
+  0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d,
+  0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91,
+  0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e,
+  0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8,
+  0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc,
+  0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d,
+  0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7,
+  0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e,
+  0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd,
+  0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d,
+  0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33,
+  0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42,
+  0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d,
+  0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9,
+  0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45,
+  0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2,
+  0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd,
+  0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7,
+  0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90,
+  0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06,
+  0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d,
+  0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64,
+  0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1,
+  0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7,
+  0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d,
+  0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e,
+  0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69,
+  0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64,
+  0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd,
+  0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a,
+  0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8,
+  0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d,
+  0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc,
+  0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38,
+  0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f,
+  0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb,
+  0xbd, 0x75, 0xd3, 0x5f, 0xbd, 0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb,
+  0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65,
+  0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d,
+  0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1,
+  0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd,
+  0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0,
+  0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a,
+  0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce,
+  0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe,
+  0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10,
+  0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f,
+  0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf,
+  0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd,
+  0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8,
+  0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49,
+  0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f,
+  0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c,
+  0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c,
+  0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99,
+  0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03,
+  0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d,
+  0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a,
+  0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c,
+  0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2,
+  0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd,
+  0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe,
+  0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78,
+  0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b,
+  0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c,
+  0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88,
+  0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26,
+  0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb,
+  0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe,
+  0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5,
+  0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc,
+  0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c,
+  0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd,
+  0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41,
+  0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55,
+  0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96,
+  0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d,
+  0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8,
+  0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5,
+  0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18,
+  0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 0x72, 0xf3, 0x3d,
+  0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44,
+  0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1,
+  0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0,
+  0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d,
+  0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52,
+  0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45,
+  0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b,
+  0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d,
+  0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21,
+  0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1,
+  0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf,
+  0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d,
+  0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68,
+  0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b,
+  0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09,
+  0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d,
+  0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8,
+  0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24,
+  0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd,
+  0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd,
+  0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44,
+  0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86,
+  0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26,
+  0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd,
+  0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6,
+  0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13,
+  0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14,
+  0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d,
+  0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec,
+  0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4,
+  0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99,
+  0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d,
+  0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3,
+  0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06,
+  0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac,
+  0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd,
+  0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5,
+  0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63,
+  0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb,
+  0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd,
+  0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a,
+  0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e,
+  0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6,
+  0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d,
+  0x14, 0xc4, 0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42,
+  0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d,
+  0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1,
+  0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd,
+  0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4,
+  0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a,
+  0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4,
+  0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d,
+  0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9,
+  0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12,
+  0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93,
+  0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c,
+  0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09,
+  0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42,
+  0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b,
+  0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d,
+  0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92,
+  0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65,
+  0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01,
+  0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d,
+  0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33,
+  0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4,
+  0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc,
+  0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc,
+  0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37,
+  0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27,
+  0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96,
+  0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe,
+  0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92,
+  0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f,
+  0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3,
+  0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd,
+  0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe,
+  0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe,
+  0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75,
+  0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd,
+  0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca,
+  0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d,
+  0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3,
+  0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb,
+  0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc,
+  0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73,
+  0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab,
+  0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c,
+  0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c,
+  0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27,
+  0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee,
+  0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8,
+  0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45,
+  0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab,
+  0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b,
+  0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c,
+  0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2,
+  0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7,
+  0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40,
+  0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd,
+  0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf,
+  0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a,
+  0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31,
+  0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d,
+  0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c,
+  0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5,
+  0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3,
+  0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d,
+  0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b,
+  0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12,
+  0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14,
+  0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b,
+  0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21,
+  0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb,
+  0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85,
+  0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d,
+  0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e,
+  0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e,
+  0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3,
+  0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb,
+  0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18,
+  0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b,
+  0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45,
+  0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d,
+  0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62,
+  0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46,
+  0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95,
+  0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d,
+  0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d,
+  0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4,
+  0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12,
+  0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd,
+  0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 0x94,
+  0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25,
+  0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1,
+  0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b,
+  0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23,
+  0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24,
+  0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe,
+  0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c,
+  0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39,
+  0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63,
+  0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50,
+  0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd,
+  0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50,
+  0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97,
+  0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96,
+  0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e,
+  0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b,
+  0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65,
+  0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf,
+  0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd,
+  0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0,
+  0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83,
+  0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b,
+  0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd,
+  0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf,
+  0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46,
+  0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30,
+  0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d,
+  0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3,
+  0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14,
+  0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b,
+  0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd,
+  0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda,
+  0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b,
+  0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84,
+  0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd,
+  0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28,
+  0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34,
+  0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42,
+  0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c,
+  0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed,
+  0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9,
+  0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab,
+  0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d,
+  0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57,
+  0xf4, 0xdc, 0x3d, 0x25, 0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7,
+  0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb,
+  0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd,
+  0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32,
+  0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6,
+  0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79,
+  0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd,
+  0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b,
+  0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58,
+  0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf,
+  0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d,
+  0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc,
+  0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb,
+  0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24,
+  0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd,
+  0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4,
+  0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8,
+  0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96,
+  0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d,
+  0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62,
+  0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e,
+  0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4,
+  0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc,
+  0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf,
+  0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54,
+  0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1,
+  0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d,
+  0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d,
+  0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a,
+  0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84,
+  0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d,
+  0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07,
+  0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5,
+  0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba,
+  0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d,
+  0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f,
+  0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b,
+  0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a,
+  0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb,
+  0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b,
+  0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6,
+  0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86,
+  0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d,
+  0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83,
+  0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 0xd5, 0x3c, 0x25, 0xec,
+  0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91,
+  0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c,
+  0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89,
+  0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd,
+  0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b,
+  0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c,
+  0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80,
+  0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1,
+  0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9,
+  0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c,
+  0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6,
+  0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f,
+  0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97,
+  0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd,
+  0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3,
+  0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb,
+  0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca,
+  0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba,
+  0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7,
+  0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1,
+  0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20,
+  0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b,
+  0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda,
+  0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27,
+  0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82,
+  0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd,
+  0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d,
+  0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c,
+  0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70,
+  0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc,
+  0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e,
+  0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80,
+  0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4,
+  0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d,
+  0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf,
+  0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d,
+  0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73,
+  0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d,
+  0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9,
+  0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae,
+  0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf,
+  0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc,
+  0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81,
+  0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27,
+  0xa9, 0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51,
+  0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd,
+  0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57,
+  0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b,
+  0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20,
+  0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d,
+  0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4,
+  0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71,
+  0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01,
+  0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d,
+  0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e,
+  0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d,
+  0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84,
+  0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e,
+  0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5,
+  0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3,
+  0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d,
+  0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd,
+  0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98,
+  0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28,
+  0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f,
+  0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e,
+  0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b,
+  0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba,
+  0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1,
+  0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d,
+  0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88,
+  0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6,
+  0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0,
+  0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd,
+  0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6,
+  0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33,
+  0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09,
+  0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d,
+  0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4,
+  0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff,
+  0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02,
+  0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc,
+  0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12,
+  0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54,
+  0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23,
+  0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d,
+  0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d,
+  0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d,
+  0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde,
+  0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd,
+  0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9,
+  0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c,
+  0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89,
+  0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc,
+  0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4,
+  0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98,
+  0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e,
+  0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d,
+  0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52,
+  0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95,
+  0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5,
+  0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d,
+  0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea,
+  0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba,
+  0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32,
+  0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd,
+  0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55,
+  0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde,
+  0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83,
+  0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc,
+  0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae,
+  0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2,
+  0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6,
+  0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d,
+  0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85,
+  0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0,
+  0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6,
+  0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d,
+  0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a,
+  0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b,
+  0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12,
+  0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b,
+  0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec,
+  0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f,
+  0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59,
+  0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d,
+  0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71,
+  0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c,
+  0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53,
+  0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd,
+  0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3,
+  0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc,
+  0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 0x69, 0xa1,
+  0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d,
+  0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb,
+  0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e,
+  0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae,
+  0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc,
+  0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab,
+  0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07,
+  0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8,
+  0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d,
+  0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29,
+  0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a,
+  0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d,
+  0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d,
+  0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa,
+  0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24,
+  0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9,
+  0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d,
+  0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87,
+  0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9,
+  0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab,
+  0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd,
+  0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b,
+  0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7,
+  0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7,
+  0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd,
+  0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0,
+  0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04,
+  0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c,
+  0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d,
+  0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f,
+  0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b,
+  0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7,
+  0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d,
+  0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e,
+  0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91,
+  0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e,
+  0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd,
+  0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88,
+  0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a,
+  0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9,
+  0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d,
+  0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13,
+  0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc,
+  0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80,
+  0xbd, 0x65, 0x2e, 0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d,
+  0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca,
+  0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf,
+  0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab,
+  0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d,
+  0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06,
+  0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06,
+  0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc,
+  0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc,
+  0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d,
+  0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4,
+  0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea,
+  0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d,
+  0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2,
+  0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94,
+  0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6,
+  0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd,
+  0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d,
+  0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83,
+  0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf,
+  0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c,
+  0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb,
+  0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0,
+  0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4,
+  0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e,
+  0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a,
+  0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95,
+  0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5,
+  0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c,
+  0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13,
+  0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66,
+  0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d,
+  0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d,
+  0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3,
+  0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9,
+  0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a,
+  0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d,
+  0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48,
+  0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f,
+  0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8,
+  0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d,
+  0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95,
+  0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79,
+  0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88,
+  0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 0x3d, 0xb4, 0xe9, 0x59, 0x3d,
+  0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca,
+  0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02,
+  0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef,
+  0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd,
+  0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9,
+  0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d,
+  0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f,
+  0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd,
+  0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1,
+  0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37,
+  0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02,
+  0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd,
+  0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0,
+  0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42,
+  0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2,
+  0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d,
+  0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda,
+  0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed,
+  0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70,
+  0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd,
+  0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b,
+  0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e,
+  0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29,
+  0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d,
+  0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc,
+  0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3,
+  0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a,
+  0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd,
+  0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9,
+  0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3,
+  0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10,
+  0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd,
+  0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8,
+  0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4,
+  0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a,
+  0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd,
+  0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe,
+  0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84,
+  0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83,
+  0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd,
+  0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd,
+  0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36,
+  0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83,
+  0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd,
+  0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25,
+  0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc,
+  0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a,
+  0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd,
+  0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00,
+  0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9,
+  0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43,
+  0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb,
+  0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35,
+  0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2,
+  0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a,
+  0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d,
+  0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe,
+  0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc,
+  0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88,
+  0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d,
+  0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f,
+  0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40,
+  0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f,
+  0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe,
+  0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80,
+  0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92,
+  0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50,
+  0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e,
+  0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad,
+  0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b,
+  0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9,
+  0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd,
+  0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74,
+  0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5,
+  0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab,
+  0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d,
+  0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb,
+  0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f,
+  0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b,
+  0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e,
+  0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1,
+  0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a,
+  0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f,
+  0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d,
+  0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81,
+  0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb,
+  0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3,
+  0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d,
+  0xf1, 0xf8, 0x16, 0x3e, 0x24, 0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d,
+  0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4,
+  0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb,
+  0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe,
+  0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc,
+  0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4,
+  0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7,
+  0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d,
+  0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20,
+  0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92,
+  0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5,
+  0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d,
+  0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34,
+  0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43,
+  0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13,
+  0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d,
+  0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20,
+  0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea,
+  0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98,
+  0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e,
+  0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f,
+  0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88,
+  0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b,
+  0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e,
+  0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01,
+  0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c,
+  0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e,
+  0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe,
+  0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8,
+  0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd,
+  0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e,
+  0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd,
+  0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f,
+  0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0,
+  0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b,
+  0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d,
+  0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e,
+  0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa,
+  0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde,
+  0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe,
+  0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda,
+  0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f,
+  0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27,
+  0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd,
+  0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 0x0f, 0xbe, 0x21,
+  0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0,
+  0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b,
+  0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe,
+  0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28,
+  0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0,
+  0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc,
+  0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd,
+  0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37,
+  0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2,
+  0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b,
+  0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e,
+  0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54,
+  0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6,
+  0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2,
+  0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d,
+  0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89,
+  0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2,
+  0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0,
+  0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d,
+  0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c,
+  0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa,
+  0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82,
+  0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd,
+  0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f,
+  0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0,
+  0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5,
+  0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe,
+  0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9,
+  0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b,
+  0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea,
+  0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc,
+  0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6,
+  0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49,
+  0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12,
+  0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e,
+  0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80,
+  0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33,
+  0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22,
+  0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe,
+  0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68,
+  0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64,
+  0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0,
+  0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e,
+  0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97,
+  0x05, 0x1c, 0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30,
+  0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c,
+  0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d,
+  0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51,
+  0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb,
+  0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8,
+  0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d,
+  0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21,
+  0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a,
+  0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf,
+  0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e,
+  0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86,
+  0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78,
+  0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99,
+  0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb,
+  0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37,
+  0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb,
+  0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95,
+  0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d,
+  0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1,
+  0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6,
+  0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46,
+  0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd,
+  0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83,
+  0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84,
+  0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23,
+  0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d,
+  0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde,
+  0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5,
+  0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07,
+  0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d,
+  0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4,
+  0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81,
+  0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76,
+  0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e,
+  0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93,
+  0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7,
+  0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21,
+  0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc,
+  0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1,
+  0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3,
+  0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b,
+  0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd,
+  0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05,
+  0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90,
+  0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c,
+  0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd,
+  0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9,
+  0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9,
+  0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7,
+  0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d,
+  0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3,
+  0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6,
+  0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29,
+  0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd,
+  0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9,
+  0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31,
+  0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff,
+  0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d,
+  0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a,
+  0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f,
+  0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b,
+  0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d,
+  0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12,
+  0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb,
+  0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d,
+  0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd,
+  0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53,
+  0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01,
+  0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce,
+  0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe,
+  0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d,
+  0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07,
+  0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a,
+  0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb,
+  0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48,
+  0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90,
+  0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7,
+  0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d,
+  0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11,
+  0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26,
+  0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c,
+  0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e,
+  0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a,
+  0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e,
+  0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48,
+  0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c,
+  0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8,
+  0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 0x87,
+  0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a,
+  0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd,
+  0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d,
+  0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84,
+  0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26,
+  0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe,
+  0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce,
+  0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8,
+  0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b,
+  0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd,
+  0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74,
+  0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18,
+  0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde,
+  0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd,
+  0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78,
+  0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce,
+  0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda,
+  0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d,
+  0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a,
+  0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84,
+  0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6,
+  0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d,
+  0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0,
+  0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d,
+  0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9,
+  0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd,
+  0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17,
+  0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38,
+  0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20,
+  0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd,
+  0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d,
+  0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5,
+  0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5,
+  0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe,
+  0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27,
+  0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde,
+  0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab,
+  0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd,
+  0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7,
+  0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24,
+  0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01,
+  0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd,
+  0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50,
+  0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c,
+  0x76, 0x3b, 0x99, 0x36, 0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39,
+  0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a,
+  0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5,
+  0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74,
+  0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4,
+  0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d,
+  0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c,
+  0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a,
+  0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f,
+  0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d,
+  0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16,
+  0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a,
+  0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09,
+  0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e,
+  0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74,
+  0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1,
+  0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10,
+  0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb,
+  0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e,
+  0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5,
+  0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4,
+  0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe,
+  0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc,
+  0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29,
+  0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e,
+  0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c,
+  0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b,
+  0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4,
+  0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe,
+  0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a,
+  0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4,
+  0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf,
+  0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25,
+  0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d,
+  0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19,
+  0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1,
+  0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d,
+  0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d,
+  0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef,
+  0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34,
+  0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75,
+  0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc,
+  0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40,
+  0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62,
+  0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 0x3a, 0x54, 0x99, 0xea,
+  0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd,
+  0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20,
+  0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4,
+  0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1,
+  0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e,
+  0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a,
+  0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b,
+  0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9,
+  0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe,
+  0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48,
+  0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9,
+  0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88,
+  0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe,
+  0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53,
+  0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd,
+  0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4,
+  0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e,
+  0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33,
+  0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b,
+  0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd,
+  0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd,
+  0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81,
+  0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2,
+  0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45,
+  0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+  0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c,
+  0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68,
+  0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83,
+  0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe,
+  0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c,
+  0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f,
+  0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe,
+  0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e,
+  0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12,
+  0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7,
+  0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01,
+  0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c,
+  0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8,
+  0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83,
+  0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30,
+  0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc,
+  0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20,
+  0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd,
+  0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12,
+  0x3b, 0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb,
+  0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa,
+  0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d,
+  0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe,
+  0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c,
+  0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1,
+  0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c,
+  0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29,
+  0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd,
+  0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7,
+  0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97,
+  0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f,
+  0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b,
+  0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79,
+  0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9,
+  0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b,
+  0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd,
+  0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0,
+  0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c,
+  0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c,
+  0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd,
+  0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91,
+  0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4,
+  0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83,
+  0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c,
+  0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95,
+  0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74,
+  0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb,
+  0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd,
+  0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c,
+  0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1,
+  0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64,
+  0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c,
+  0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4,
+  0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06,
+  0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e,
+  0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c,
+  0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0,
+  0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79,
+  0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d,
+  0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd,
+  0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68,
+  0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d,
+  0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d,
+  0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c,
+  0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95,
+  0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6,
+  0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03,
+  0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d,
+  0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8,
+  0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b,
+  0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66,
+  0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd,
+  0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc,
+  0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1,
+  0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f,
+  0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c,
+  0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1,
+  0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab,
+  0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26,
+  0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd,
+  0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47,
+  0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf,
+  0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a,
+  0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd,
+  0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e,
+  0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68,
+  0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47,
+  0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd,
+  0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4,
+  0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2,
+  0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c,
+  0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c,
+  0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde,
+  0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6,
+  0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed,
+  0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc,
+  0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd,
+  0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf,
+  0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58,
+  0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c,
+  0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48,
+  0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f,
+  0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52,
+  0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd,
+  0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9,
+  0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90,
+  0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8,
+  0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 0xe3, 0xbc,
+  0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4,
+  0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d,
+  0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09,
+  0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a,
+  0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d,
+  0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a,
+  0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44,
+  0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc,
+  0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55,
+  0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59,
+  0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62,
+  0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c,
+  0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34,
+  0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4,
+  0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f,
+  0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d,
+  0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71,
+  0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a,
+  0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79,
+  0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d,
+  0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3,
+  0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e,
+  0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80,
+  0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d,
+  0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd,
+  0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43,
+  0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb,
+  0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d,
+  0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61,
+  0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf,
+  0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80,
+  0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd,
+  0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54,
+  0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde,
+  0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43,
+  0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b,
+  0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8,
+  0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c,
+  0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd,
+  0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba,
+  0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57,
+  0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4,
+  0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c,
+  0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd,
+  0x68, 0xc2, 0x24, 0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43,
+  0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3,
+  0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70,
+  0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd,
+  0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41,
+  0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3,
+  0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80,
+  0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c,
+  0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a,
+  0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a,
+  0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8,
+  0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba,
+  0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc,
+  0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5,
+  0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b,
+  0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d,
+  0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2,
+  0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b,
+  0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91,
+  0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c,
+  0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52,
+  0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a,
+  0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1,
+  0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d,
+  0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97,
+  0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce,
+  0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01,
+  0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d,
+  0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10,
+  0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09,
+  0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78,
+  0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d,
+  0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98,
+  0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17,
+  0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f,
+  0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc,
+  0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8,
+  0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62,
+  0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e,
+  0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd,
+  0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42,
+  0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3,
+  0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb,
+  0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d,
+  0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 0xf8, 0x65, 0xdd, 0x3b, 0x1c,
+  0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4,
+  0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10,
+  0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd,
+  0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d,
+  0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9,
+  0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83,
+  0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d,
+  0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b,
+  0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0,
+  0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06,
+  0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd,
+  0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc,
+  0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1,
+  0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c,
+  0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c,
+  0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc,
+  0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57,
+  0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f,
+  0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d,
+  0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f,
+  0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6,
+  0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23,
+  0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc,
+  0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd,
+  0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1,
+  0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c,
+  0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d,
+  0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4,
+  0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d,
+  0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6,
+  0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc,
+  0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30,
+  0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f,
+  0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b,
+  0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd,
+  0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02,
+  0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3,
+  0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18,
+  0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c,
+  0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84,
+  0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99,
+  0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9,
+  0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb,
+  0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa,
+  0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f,
+  0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20,
+  0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d,
+  0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd,
+  0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53,
+  0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9,
+  0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc,
+  0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68,
+  0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a,
+  0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56,
+  0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd,
+  0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89,
+  0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47,
+  0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66,
+  0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d,
+  0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e,
+  0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09,
+  0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85,
+  0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d,
+  0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd,
+  0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87,
+  0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3,
+  0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb,
+  0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11,
+  0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91,
+  0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c,
+  0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d,
+  0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c,
+  0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b,
+  0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f,
+  0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc,
+  0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4,
+  0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b,
+  0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91,
+  0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb,
+  0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29,
+  0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90,
+  0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde,
+  0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d,
+  0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd,
+  0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41,
+  0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51,
+  0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd,
+  0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58,
+  0x94, 0x53, 0xbd, 0xa0, 0xcf, 0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40,
+  0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49,
+  0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd,
+  0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75,
+  0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56,
+  0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05,
+  0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd,
+  0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9,
+  0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64,
+  0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90,
+  0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b,
+  0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e,
+  0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf,
+  0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0,
+  0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b,
+  0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1,
+  0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6,
+  0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96,
+  0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c,
+  0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9,
+  0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59,
+  0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99,
+  0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb,
+  0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70,
+  0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4,
+  0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3,
+  0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba,
+  0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd,
+  0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7,
+  0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87,
+  0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd,
+  0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb,
+  0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22,
+  0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70,
+  0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd,
+  0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf,
+  0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b,
+  0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7,
+  0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c,
+  0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40,
+  0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2,
+  0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d,
+  0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c,
+  0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84,
+  0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 0x3d, 0x90, 0xf8,
+  0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65,
+  0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc,
+  0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5,
+  0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb,
+  0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83,
+  0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d,
+  0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2,
+  0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc,
+  0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d,
+  0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc,
+  0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e,
+  0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83,
+  0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29,
+  0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc,
+  0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd,
+  0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06,
+  0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e,
+  0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d,
+  0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a,
+  0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf,
+  0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9,
+  0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb,
+  0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31,
+  0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e,
+  0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7,
+  0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd,
+  0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90,
+  0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc,
+  0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59,
+  0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd,
+  0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10,
+  0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8,
+  0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f,
+  0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc,
+  0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0,
+  0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec,
+  0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6,
+  0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c,
+  0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1,
+  0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7,
+  0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44,
+  0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc,
+  0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0,
+  0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54,
+  0x10, 0xbd, 0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d,
+  0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d,
+  0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4,
+  0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98,
+  0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f,
+  0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b,
+  0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9,
+  0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4,
+  0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd,
+  0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d,
+  0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c,
+  0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02,
+  0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75,
+  0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd,
+  0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40,
+  0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e,
+  0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50,
+  0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c,
+  0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93,
+  0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e,
+  0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47,
+  0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d,
+  0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88,
+  0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4,
+  0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15,
+  0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd,
+  0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2,
+  0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26,
+  0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53,
+  0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd,
+  0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8,
+  0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3,
+  0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a,
+  0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd,
+  0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16,
+  0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf,
+  0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b,
+  0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd,
+  0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c,
+  0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15,
+  0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83,
+  0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b,
+  0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53,
+  0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f,
+  0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09,
+  0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c,
+  0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6,
+  0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb,
+  0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d,
+  0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c,
+  0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c,
+  0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22,
+  0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82,
+  0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d,
+  0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb,
+  0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6,
+  0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92,
+  0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc,
+  0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e,
+  0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81,
+  0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3,
+  0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c,
+  0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80,
+  0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4,
+  0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b,
+  0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd,
+  0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f,
+  0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f,
+  0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00,
+  0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd,
+  0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46,
+  0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3,
+  0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85,
+  0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d,
+  0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb,
+  0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe,
+  0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7,
+  0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d,
+  0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84,
+  0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e,
+  0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c,
+  0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c,
+  0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11,
+  0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f,
+  0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e,
+  0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd,
+  0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a,
+  0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d,
+  0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 0x30,
+  0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd,
+  0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76,
+  0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a,
+  0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46,
+  0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc,
+  0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a,
+  0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a,
+  0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb,
+  0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c,
+  0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2,
+  0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4,
+  0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34,
+  0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd,
+  0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9,
+  0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46,
+  0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86,
+  0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc,
+  0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46,
+  0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e,
+  0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f,
+  0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd,
+  0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c,
+  0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79,
+  0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48,
+  0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c,
+  0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f,
+  0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8,
+  0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88,
+  0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd,
+  0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a,
+  0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc,
+  0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71,
+  0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd,
+  0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3,
+  0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18,
+  0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28,
+  0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc,
+  0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47,
+  0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67,
+  0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45,
+  0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c,
+  0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02,
+  0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11,
+  0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96,
+  0xbd, 0xa4, 0xbe, 0x57, 0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb,
+  0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e,
+  0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8,
+  0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76,
+  0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d,
+  0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2,
+  0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8,
+  0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe,
+  0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c,
+  0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d,
+  0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb,
+  0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05,
+  0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd,
+  0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58,
+  0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c,
+  0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9,
+  0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9,
+  0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2,
+  0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7,
+  0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd,
+  0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc,
+  0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9,
+  0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf,
+  0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89,
+  0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d,
+  0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e,
+  0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e,
+  0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c,
+  0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd,
+  0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf,
+  0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b,
+  0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d,
+  0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd,
+  0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54,
+  0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe,
+  0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c,
+  0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd,
+  0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf,
+  0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f,
+  0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62,
+  0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c,
+  0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a,
+  0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8,
+  0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c,
+  0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 0xdc, 0xa1, 0x10, 0xbd,
+  0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75,
+  0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c,
+  0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35,
+  0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc,
+  0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65,
+  0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd,
+  0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b,
+  0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb,
+  0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05,
+  0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9,
+  0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69,
+  0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd,
+  0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96,
+  0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d,
+  0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe,
+  0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc,
+  0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2,
+  0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63,
+  0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4,
+  0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb,
+  0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62,
+  0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d,
+  0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a,
+  0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d,
+  0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27,
+  0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf,
+  0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a,
+  0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd,
+  0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27,
+  0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b,
+  0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e,
+  0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c,
+  0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4,
+  0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6,
+  0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59,
+  0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c,
+  0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8,
+  0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f,
+  0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42,
+  0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d,
+  0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69,
+  0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04,
+  0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77,
+  0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd,
+  0x8b, 0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5,
+  0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69,
+  0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60,
+  0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc,
+  0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40,
+  0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e,
+  0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09,
+  0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc,
+  0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28,
+  0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65,
+  0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa,
+  0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd,
+  0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12,
+  0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25,
+  0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e,
+  0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd,
+  0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b,
+  0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7,
+  0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92,
+  0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d,
+  0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8,
+  0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4,
+  0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee,
+  0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd,
+  0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4,
+  0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35,
+  0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f,
+  0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d,
+  0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a,
+  0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5,
+  0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8,
+  0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d,
+  0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67,
+  0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c,
+  0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54,
+  0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d,
+  0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18,
+  0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a,
+  0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23,
+  0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd,
+  0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde,
+  0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c,
+  0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c,
+  0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d,
+  0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e,
+  0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a,
+  0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4,
+  0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c,
+  0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5,
+  0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0,
+  0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a,
+  0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d,
+  0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e,
+  0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71,
+  0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91,
+  0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b,
+  0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62,
+  0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57,
+  0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3,
+  0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd,
+  0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c,
+  0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45,
+  0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45,
+  0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b,
+  0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d,
+  0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac,
+  0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94,
+  0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d,
+  0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e,
+  0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20,
+  0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95,
+  0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd,
+  0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94,
+  0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b,
+  0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad,
+  0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c,
+  0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba,
+  0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11,
+  0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb,
+  0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd,
+  0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10,
+  0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc,
+  0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15,
+  0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c,
+  0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34,
+  0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12,
+  0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0,
+  0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d,
+  0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 0xbc, 0x78,
+  0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03,
+  0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84,
+  0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd,
+  0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9,
+  0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1,
+  0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91,
+  0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb,
+  0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70,
+  0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa,
+  0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96,
+  0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c,
+  0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d,
+  0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13,
+  0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56,
+  0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c,
+  0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa,
+  0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c,
+  0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6,
+  0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd,
+  0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80,
+  0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20,
+  0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef,
+  0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd,
+  0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38,
+  0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82,
+  0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a,
+  0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd,
+  0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3,
+  0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6,
+  0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94,
+  0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd,
+  0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3,
+  0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99,
+  0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52,
+  0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd,
+  0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30,
+  0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f,
+  0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3,
+  0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d,
+  0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17,
+  0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b,
+  0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b,
+  0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d,
+  0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a,
+  0x60, 0x70, 0x3d, 0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83,
+  0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e,
+  0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd,
+  0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02,
+  0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b,
+  0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59,
+  0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd,
+  0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf,
+  0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49,
+  0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff,
+  0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c,
+  0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee,
+  0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e,
+  0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20,
+  0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d,
+  0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87,
+  0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94,
+  0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c,
+  0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb,
+  0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42,
+  0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2,
+  0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b,
+  0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd,
+  0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c,
+  0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a,
+  0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54,
+  0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd,
+  0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6,
+  0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47,
+  0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f,
+  0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c,
+  0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee,
+  0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1,
+  0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45,
+  0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd,
+  0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68,
+  0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97,
+  0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9,
+  0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd,
+  0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8,
+  0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae,
+  0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f,
+  0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc,
+  0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea,
+  0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 0xd6, 0x6a, 0x3c, 0xe0, 0x3d,
+  0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3,
+  0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd,
+  0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0,
+  0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33,
+  0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08,
+  0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d,
+  0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00,
+  0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf,
+  0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1,
+  0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb,
+  0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca,
+  0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc,
+  0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03,
+  0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd,
+  0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c,
+  0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a,
+  0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe,
+  0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c,
+  0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0,
+  0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92,
+  0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a,
+  0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b,
+  0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8,
+  0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14,
+  0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e,
+  0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd,
+  0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50,
+  0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b,
+  0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63,
+  0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd,
+  0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29,
+  0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97,
+  0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5,
+  0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c,
+  0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a,
+  0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17,
+  0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f,
+  0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd,
+  0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48,
+  0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b,
+  0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10,
+  0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd,
+  0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31,
+  0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d,
+  0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24,
+  0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c,
+  0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6,
+  0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6,
+  0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45,
+  0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd,
+  0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e,
+  0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e,
+  0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83,
+  0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc,
+  0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0,
+  0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28,
+  0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f,
+  0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd,
+  0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4,
+  0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e,
+  0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29,
+  0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c,
+  0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8,
+  0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41,
+  0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f,
+  0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd,
+  0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0,
+  0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf,
+  0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84,
+  0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd,
+  0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0,
+  0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6,
+  0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb,
+  0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd,
+  0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2,
+  0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05,
+  0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47,
+  0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d,
+  0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8,
+  0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a,
+  0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf,
+  0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c,
+  0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae,
+  0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f,
+  0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06,
+  0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c,
+  0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0,
+  0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f,
+  0xaf, 0xbc, 0x14, 0x1b, 0x4f, 0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18,
+  0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c,
+  0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a,
+  0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8,
+  0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b,
+  0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd,
+  0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0,
+  0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b,
+  0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a,
+  0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd,
+  0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59,
+  0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16,
+  0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08,
+  0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d,
+  0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2,
+  0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f,
+  0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33,
+  0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd,
+  0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc,
+  0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27,
+  0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3,
+  0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c,
+  0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c,
+  0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e,
+  0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd,
+  0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c,
+  0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6,
+  0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04,
+  0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21,
+  0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d,
+  0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0,
+  0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78,
+  0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18,
+  0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd,
+  0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66,
+  0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19,
+  0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef,
+  0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a,
+  0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64,
+  0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf,
+  0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a,
+  0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c,
+  0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4,
+  0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a,
+  0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 0x3b, 0x40, 0x91,
+  0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc,
+  0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28,
+  0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1,
+  0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67,
+  0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d,
+  0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87,
+  0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5,
+  0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f,
+  0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd,
+  0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e,
+  0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1,
+  0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12,
+  0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d,
+  0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24,
+  0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86,
+  0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8,
+  0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd,
+  0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c,
+  0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12,
+  0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f,
+  0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc,
+  0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58,
+  0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e,
+  0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5,
+  0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39,
+  0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec,
+  0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a,
+  0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6,
+  0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c,
+  0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e,
+  0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa,
+  0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c,
+  0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d,
+  0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62,
+  0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94,
+  0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65,
+  0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd,
+  0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce,
+  0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a,
+  0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40,
+  0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd,
+  0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c,
+  0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62,
+  0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b,
+  0xbd, 0xfa, 0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c,
+  0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66,
+  0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d,
+  0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68,
+  0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d,
+  0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60,
+  0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32,
+  0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc,
+  0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc,
+  0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b,
+  0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92,
+  0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a,
+  0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc,
+  0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4,
+  0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b,
+  0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6,
+  0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c,
+  0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00,
+  0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e,
+  0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c,
+  0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a,
+  0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40,
+  0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb,
+  0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a,
+  0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c,
+  0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d,
+  0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5,
+  0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a,
+  0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd,
+  0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17,
+  0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85,
+  0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21,
+  0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc,
+  0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1,
+  0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d,
+  0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f,
+  0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b,
+  0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80,
+  0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31,
+  0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08,
+  0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd,
+  0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98,
+  0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd,
+  0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e,
+  0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d,
+  0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72,
+  0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5,
+  0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f,
+  0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc,
+  0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae,
+  0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a,
+  0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f,
+  0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc,
+  0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf,
+  0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65,
+  0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40,
+  0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb,
+  0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b,
+  0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67,
+  0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee,
+  0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d,
+  0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50,
+  0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5,
+  0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90,
+  0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb,
+  0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b,
+  0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5,
+  0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05,
+  0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc,
+  0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61,
+  0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e,
+  0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84,
+  0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d,
+  0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a,
+  0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81,
+  0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87,
+  0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc,
+  0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5,
+  0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7,
+  0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79,
+  0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c,
+  0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38,
+  0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1,
+  0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b,
+  0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c,
+  0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54,
+  0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19,
+  0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42,
+  0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 0xbd,
+  0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0,
+  0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf,
+  0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c,
+  0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c,
+  0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc,
+  0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16,
+  0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5,
+  0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc,
+  0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1,
+  0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea,
+  0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a,
+  0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d,
+  0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00,
+  0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb,
+  0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81,
+  0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c,
+  0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8,
+  0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5,
+  0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e,
+  0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c,
+  0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82,
+  0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a,
+  0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78,
+  0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd,
+  0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0,
+  0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08,
+  0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f,
+  0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b,
+  0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05,
+  0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3,
+  0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b,
+  0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd,
+  0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa,
+  0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e,
+  0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8,
+  0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb,
+  0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76,
+  0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45,
+  0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2,
+  0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba,
+  0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc,
+  0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc,
+  0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31,
+  0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc,
+  0xce, 0x5b, 0x7a, 0xbd, 0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90,
+  0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a,
+  0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19,
+  0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc,
+  0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b,
+  0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17,
+  0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f,
+  0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc,
+  0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12,
+  0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55,
+  0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44,
+  0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc,
+  0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa,
+  0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8,
+  0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90,
+  0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd,
+  0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7,
+  0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56,
+  0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0,
+  0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d,
+  0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53,
+  0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3,
+  0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c,
+  0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd,
+  0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3,
+  0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb,
+  0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d,
+  0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d,
+  0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb,
+  0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7,
+  0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91,
+  0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd,
+  0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e,
+  0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4,
+  0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64,
+  0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d,
+  0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22,
+  0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e,
+  0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80,
+  0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc,
+  0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88,
+  0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab,
+  0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04,
+  0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d,
+  0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 0x93, 0x8e, 0x3d, 0x9b,
+  0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3,
+  0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9,
+  0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c,
+  0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90,
+  0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b,
+  0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71,
+  0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d,
+  0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6,
+  0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78,
+  0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4,
+  0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d,
+  0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c,
+  0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e,
+  0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35,
+  0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc,
+  0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6,
+  0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36,
+  0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae,
+  0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb,
+  0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2,
+  0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad,
+  0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69,
+  0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd,
+  0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d,
+  0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b,
+  0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a,
+  0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d,
+  0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d,
+  0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf,
+  0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02,
+  0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d,
+  0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02,
+  0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f,
+  0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14,
+  0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a,
+  0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4,
+  0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5,
+  0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23,
+  0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc,
+  0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf,
+  0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0,
+  0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b,
+  0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb,
+  0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4,
+  0xaf, 0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99,
+  0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17,
+  0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d,
+  0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93,
+  0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e,
+  0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48,
+  0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d,
+  0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e,
+  0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42,
+  0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a,
+  0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc,
+  0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e,
+  0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13,
+  0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47,
+  0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c,
+  0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58,
+  0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca,
+  0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43,
+  0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd,
+  0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23,
+  0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31,
+  0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49,
+  0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c,
+  0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50,
+  0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56,
+  0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13,
+  0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc,
+  0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37,
+  0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41,
+  0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7,
+  0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc,
+  0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42,
+  0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb,
+  0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62,
+  0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc,
+  0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa,
+  0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1,
+  0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99,
+  0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd,
+  0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70,
+  0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8,
+  0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02,
+  0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d,
+  0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3,
+  0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41,
+  0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70,
+  0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d,
+  0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11,
+  0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc,
+  0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0,
+  0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc,
+  0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89,
+  0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c,
+  0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb,
+  0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc,
+  0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9,
+  0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3,
+  0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d,
+  0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc,
+  0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12,
+  0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37,
+  0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21,
+  0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c,
+  0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff,
+  0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39,
+  0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d,
+  0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c,
+  0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba,
+  0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d,
+  0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e,
+  0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c,
+  0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41,
+  0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95,
+  0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c,
+  0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c,
+  0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99,
+  0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8,
+  0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15,
+  0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc,
+  0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99,
+  0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e,
+  0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52,
+  0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c,
+  0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e,
+  0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05,
+  0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83,
+  0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd,
+  0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00,
+  0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 0x8b, 0x0e,
+  0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5,
+  0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd,
+  0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7,
+  0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07,
+  0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a,
+  0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd,
+  0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3,
+  0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42,
+  0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a,
+  0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d,
+  0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8,
+  0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98,
+  0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f,
+  0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd,
+  0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c,
+  0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc,
+  0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90,
+  0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc,
+  0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69,
+  0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23,
+  0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e,
+  0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c,
+  0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82,
+  0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34,
+  0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a,
+  0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd,
+  0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba,
+  0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a,
+  0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49,
+  0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c,
+  0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e,
+  0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9,
+  0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9,
+  0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c,
+  0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d,
+  0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40,
+  0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27,
+  0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd,
+  0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53,
+  0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80,
+  0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23,
+  0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd,
+  0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5,
+  0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b,
+  0x58, 0x3d, 0x86, 0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d,
+  0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c,
+  0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a,
+  0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9,
+  0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29,
+  0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc,
+  0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d,
+  0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4,
+  0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e,
+  0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd,
+  0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca,
+  0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81,
+  0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23,
+  0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd,
+  0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03,
+  0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a,
+  0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a,
+  0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c,
+  0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb,
+  0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb,
+  0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89,
+  0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc,
+  0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2,
+  0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f,
+  0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83,
+  0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c,
+  0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30,
+  0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8,
+  0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d,
+  0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd,
+  0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90,
+  0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b,
+  0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40,
+  0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc,
+  0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce,
+  0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49,
+  0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70,
+  0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd,
+  0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e,
+  0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc,
+  0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d,
+  0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d,
+  0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c,
+  0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17,
+  0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 0x5f, 0xbc, 0x8a, 0xde, 0x57,
+  0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc,
+  0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a,
+  0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f,
+  0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc,
+  0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d,
+  0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a,
+  0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f,
+  0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e,
+  0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d,
+  0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16,
+  0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f,
+  0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42,
+  0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc,
+  0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32,
+  0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14,
+  0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74,
+  0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc,
+  0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6,
+  0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f,
+  0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c,
+  0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd,
+  0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5,
+  0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf,
+  0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19,
+  0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d,
+  0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b,
+  0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f,
+  0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc,
+  0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c,
+  0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07,
+  0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3,
+  0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79,
+  0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d,
+  0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73,
+  0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44,
+  0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91,
+  0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd,
+  0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91,
+  0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a,
+  0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91,
+  0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b,
+  0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78,
+  0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38,
+  0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5,
+  0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d,
+  0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55,
+  0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19,
+  0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c,
+  0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c,
+  0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b,
+  0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a,
+  0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57,
+  0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d,
+  0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3,
+  0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e,
+  0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05,
+  0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d,
+  0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e,
+  0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65,
+  0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b,
+  0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd,
+  0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b,
+  0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c,
+  0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f,
+  0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd,
+  0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38,
+  0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6,
+  0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f,
+  0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d,
+  0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86,
+  0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80,
+  0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78,
+  0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc,
+  0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79,
+  0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46,
+  0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8,
+  0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd,
+  0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38,
+  0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01,
+  0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f,
+  0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d,
+  0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda,
+  0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52,
+  0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff,
+  0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd,
+  0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c,
+  0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1,
+  0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32,
+  0x3d, 0xec, 0xf1, 0x87, 0xbd, 0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d,
+  0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6,
+  0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83,
+  0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e,
+  0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb,
+  0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f,
+  0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f,
+  0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb,
+  0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd,
+  0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7,
+  0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62,
+  0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97,
+  0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba,
+  0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02,
+  0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6,
+  0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba,
+  0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d,
+  0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99,
+  0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2,
+  0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3,
+  0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d,
+  0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b,
+  0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2,
+  0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59,
+  0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d,
+  0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18,
+  0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc,
+  0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70,
+  0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc,
+  0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c,
+  0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e,
+  0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e,
+  0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd,
+  0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd,
+  0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1,
+  0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a,
+  0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd,
+  0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33,
+  0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21,
+  0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a,
+  0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd,
+  0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15,
+  0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95,
+  0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99,
+  0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 0xea, 0xb6, 0x3d,
+  0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7,
+  0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c,
+  0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b,
+  0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc,
+  0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53,
+  0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe,
+  0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80,
+  0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d,
+  0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a,
+  0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a,
+  0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb,
+  0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d,
+  0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb,
+  0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32,
+  0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77,
+  0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd,
+  0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61,
+  0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f,
+  0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7,
+  0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d,
+  0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed,
+  0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52,
+  0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8,
+  0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d,
+  0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4,
+  0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b,
+  0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f,
+  0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd,
+  0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4,
+  0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14,
+  0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb,
+  0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d,
+  0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2,
+  0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62,
+  0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8,
+  0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c,
+  0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e,
+  0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca,
+  0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80,
+  0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc,
+  0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52,
+  0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67,
+  0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a,
+  0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd,
+  0xa5, 0x08, 0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07,
+  0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99,
+  0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44,
+  0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9,
+  0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77,
+  0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29,
+  0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15,
+  0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd,
+  0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7,
+  0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e,
+  0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82,
+  0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd,
+  0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa,
+  0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90,
+  0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e,
+  0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c,
+  0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf,
+  0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59,
+  0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0,
+  0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb,
+  0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c,
+  0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7,
+  0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf,
+  0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c,
+  0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9,
+  0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf,
+  0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d,
+  0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c,
+  0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a,
+  0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20,
+  0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28,
+  0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd,
+  0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71,
+  0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8,
+  0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad,
+  0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc,
+  0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92,
+  0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4,
+  0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4,
+  0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c,
+  0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd,
+  0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6,
+  0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0,
+  0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c,
+  0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82,
+  0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52,
+  0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c,
+  0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc,
+  0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba,
+  0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72,
+  0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f,
+  0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c,
+  0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1,
+  0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10,
+  0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35,
+  0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d,
+  0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00,
+  0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9,
+  0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa,
+  0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d,
+  0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40,
+  0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2,
+  0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b,
+  0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d,
+  0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7,
+  0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06,
+  0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f,
+  0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc,
+  0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00,
+  0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65,
+  0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95,
+  0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c,
+  0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13,
+  0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7,
+  0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0,
+  0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd,
+  0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42,
+  0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe,
+  0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99,
+  0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d,
+  0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92,
+  0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81,
+  0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28,
+  0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd,
+  0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd,
+  0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82,
+  0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a,
+  0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb,
+  0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 0xa5,
+  0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19,
+  0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb,
+  0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d,
+  0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73,
+  0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30,
+  0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82,
+  0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd,
+  0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0,
+  0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd,
+  0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e,
+  0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d,
+  0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55,
+  0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00,
+  0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a,
+  0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c,
+  0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e,
+  0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54,
+  0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30,
+  0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39,
+  0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17,
+  0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea,
+  0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f,
+  0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c,
+  0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1,
+  0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a,
+  0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25,
+  0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d,
+  0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32,
+  0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6,
+  0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96,
+  0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c,
+  0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f,
+  0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e,
+  0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99,
+  0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d,
+  0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c,
+  0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06,
+  0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee,
+  0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd,
+  0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12,
+  0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed,
+  0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25,
+  0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc,
+  0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f,
+  0x74, 0xea, 0x3c, 0xfc, 0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1,
+  0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76,
+  0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d,
+  0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c,
+  0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c,
+  0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f,
+  0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd,
+  0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54,
+  0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b,
+  0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0,
+  0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb,
+  0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7,
+  0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20,
+  0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6,
+  0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd,
+  0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e,
+  0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a,
+  0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65,
+  0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d,
+  0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd,
+  0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6,
+  0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10,
+  0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d,
+  0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52,
+  0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc,
+  0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc,
+  0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb,
+  0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e,
+  0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05,
+  0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0,
+  0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc,
+  0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5,
+  0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d,
+  0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75,
+  0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd,
+  0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc,
+  0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a,
+  0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3,
+  0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd,
+  0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0,
+  0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18,
+  0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f,
+  0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d,
+  0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39,
+  0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 0x20, 0xbd, 0x1a, 0x64,
+  0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b,
+  0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd,
+  0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d,
+  0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00,
+  0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23,
+  0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d,
+  0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c,
+  0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d,
+  0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3,
+  0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8,
+  0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3,
+  0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28,
+  0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11,
+  0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc,
+  0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32,
+  0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb,
+  0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90,
+  0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d,
+  0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73,
+  0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c,
+  0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0,
+  0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d,
+  0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2,
+  0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25,
+  0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0,
+  0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb,
+  0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f,
+  0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8,
+  0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86,
+  0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d,
+  0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f,
+  0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9,
+  0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a,
+  0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd,
+  0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c,
+  0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff,
+  0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc,
+  0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc,
+  0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67,
+  0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12,
+  0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7,
+  0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc,
+  0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae,
+  0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef,
+  0xe9, 0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37,
+  0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c,
+  0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2,
+  0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7,
+  0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf,
+  0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d,
+  0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe,
+  0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48,
+  0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d,
+  0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c,
+  0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0,
+  0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31,
+  0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91,
+  0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb,
+  0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5,
+  0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9,
+  0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0,
+  0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d,
+  0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a,
+  0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda,
+  0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b,
+  0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd,
+  0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0,
+  0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9,
+  0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d,
+  0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d,
+  0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16,
+  0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c,
+  0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d,
+  0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd,
+  0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4,
+  0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6,
+  0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09,
+  0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c,
+  0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6,
+  0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15,
+  0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6,
+  0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd,
+  0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32,
+  0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7,
+  0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9,
+  0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c,
+  0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95,
+  0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50,
+  0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81,
+  0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c,
+  0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96,
+  0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49,
+  0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07,
+  0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc,
+  0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec,
+  0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c,
+  0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3,
+  0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d,
+  0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e,
+  0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72,
+  0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e,
+  0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d,
+  0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2,
+  0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8,
+  0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32,
+  0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd,
+  0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99,
+  0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae,
+  0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56,
+  0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd,
+  0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90,
+  0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05,
+  0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a,
+  0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c,
+  0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7,
+  0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0,
+  0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16,
+  0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb,
+  0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec,
+  0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4,
+  0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45,
+  0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc,
+  0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2,
+  0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a,
+  0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f,
+  0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d,
+  0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d,
+  0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45,
+  0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86,
+  0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c,
+  0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08,
+  0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98,
+  0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 0x48, 0x17,
+  0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d,
+  0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa,
+  0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac,
+  0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55,
+  0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d,
+  0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0,
+  0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d,
+  0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae,
+  0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c,
+  0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98,
+  0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf,
+  0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86,
+  0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c,
+  0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c,
+  0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54,
+  0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46,
+  0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d,
+  0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f,
+  0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7,
+  0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a,
+  0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c,
+  0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec,
+  0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35,
+  0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17,
+  0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc,
+  0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1,
+  0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81,
+  0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a,
+  0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd,
+  0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06,
+  0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6,
+  0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f,
+  0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c,
+  0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7,
+  0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39,
+  0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8,
+  0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd,
+  0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7,
+  0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25,
+  0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38,
+  0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd,
+  0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc,
+  0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8,
+  0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13,
+  0xb9, 0xc0, 0x4b, 0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c,
+  0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90,
+  0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60,
+  0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b,
+  0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc,
+  0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc,
+  0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74,
+  0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42,
+  0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b,
+  0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80,
+  0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c,
+  0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52,
+  0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc,
+  0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b,
+  0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50,
+  0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e,
+  0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd,
+  0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70,
+  0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f,
+  0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf,
+  0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc,
+  0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55,
+  0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f,
+  0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a,
+  0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b,
+  0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b,
+  0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09,
+  0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c,
+  0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc,
+  0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab,
+  0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf,
+  0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f,
+  0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc,
+  0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02,
+  0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a,
+  0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc,
+  0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd,
+  0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1,
+  0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e,
+  0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68,
+  0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c,
+  0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf,
+  0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51,
+  0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98,
+  0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 0x3d, 0x48, 0xf3, 0x8b, 0xbc,
+  0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31,
+  0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53,
+  0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90,
+  0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd,
+  0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e,
+  0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c,
+  0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44,
+  0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d,
+  0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf,
+  0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51,
+  0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5,
+  0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd,
+  0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22,
+  0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5,
+  0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24,
+  0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc,
+  0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a,
+  0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae,
+  0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d,
+  0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb,
+  0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c,
+  0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8,
+  0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64,
+  0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a,
+  0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb,
+  0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83,
+  0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19,
+  0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd,
+  0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc,
+  0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a,
+  0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f,
+  0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd,
+  0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99,
+  0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42,
+  0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05,
+  0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c,
+  0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd,
+  0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62,
+  0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84,
+  0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c,
+  0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28,
+  0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1,
+  0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5,
+  0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb,
+  0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea,
+  0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06,
+  0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26,
+  0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc,
+  0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42,
+  0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e,
+  0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f,
+  0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd,
+  0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78,
+  0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4,
+  0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d,
+  0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d,
+  0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f,
+  0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4,
+  0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb,
+  0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd,
+  0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3,
+  0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64,
+  0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d,
+  0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb,
+  0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb,
+  0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd,
+  0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9,
+  0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d,
+  0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e,
+  0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57,
+  0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85,
+  0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d,
+  0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03,
+  0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac,
+  0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89,
+  0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d,
+  0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6,
+  0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36,
+  0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e,
+  0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d,
+  0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e,
+  0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2,
+  0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13,
+  0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc,
+  0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d,
+  0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40,
+  0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63,
+  0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d,
+  0xa8, 0x4f, 0xec, 0xbc, 0x8a, 0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff,
+  0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73,
+  0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d,
+  0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd,
+  0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9,
+  0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0,
+  0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78,
+  0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d,
+  0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc,
+  0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a,
+  0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58,
+  0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d,
+  0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae,
+  0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e,
+  0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7,
+  0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd,
+  0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2,
+  0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8,
+  0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c,
+  0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c,
+  0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59,
+  0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93,
+  0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9,
+  0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d,
+  0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d,
+  0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a,
+  0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56,
+  0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb,
+  0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f,
+  0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5,
+  0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e,
+  0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd,
+  0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2,
+  0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99,
+  0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2,
+  0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb,
+  0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5,
+  0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea,
+  0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f,
+  0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd,
+  0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d,
+  0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f,
+  0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f,
+  0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d,
+  0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 0x2d, 0xbd, 0xce,
+  0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c,
+  0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2,
+  0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c,
+  0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67,
+  0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7,
+  0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78,
+  0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd,
+  0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74,
+  0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a,
+  0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d,
+  0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd,
+  0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae,
+  0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab,
+  0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28,
+  0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d,
+  0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18,
+  0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72,
+  0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f,
+  0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc,
+  0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40,
+  0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12,
+  0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97,
+  0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd,
+  0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a,
+  0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc,
+  0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41,
+  0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d,
+  0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf,
+  0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06,
+  0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97,
+  0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d,
+  0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60,
+  0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9,
+  0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38,
+  0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d,
+  0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72,
+  0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed,
+  0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea,
+  0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d,
+  0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a,
+  0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27,
+  0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04,
+  0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd,
+  0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd,
+  0x0e, 0x4e, 0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5,
+  0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01,
+  0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c,
+  0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01,
+  0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68,
+  0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f,
+  0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d,
+  0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac,
+  0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e,
+  0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64,
+  0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d,
+  0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15,
+  0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02,
+  0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e,
+  0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd,
+  0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e,
+  0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba,
+  0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87,
+  0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc,
+  0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e,
+  0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02,
+  0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13,
+  0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c,
+  0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd,
+  0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9,
+  0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23,
+  0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc,
+  0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0,
+  0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80,
+  0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b,
+  0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc,
+  0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b,
+  0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3,
+  0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e,
+  0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d,
+  0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20,
+  0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6,
+  0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d,
+  0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d,
+  0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62,
+  0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa,
+  0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9,
+  0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d,
+  0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16,
+  0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa,
+  0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c,
+  0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd,
+  0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1,
+  0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c,
+  0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97,
+  0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd,
+  0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5,
+  0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15,
+  0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc,
+  0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c,
+  0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc,
+  0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3,
+  0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85,
+  0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d,
+  0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51,
+  0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30,
+  0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c,
+  0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c,
+  0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28,
+  0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24,
+  0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0,
+  0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d,
+  0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4,
+  0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7,
+  0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa,
+  0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd,
+  0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb,
+  0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c,
+  0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d,
+  0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd,
+  0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57,
+  0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d,
+  0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88,
+  0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd,
+  0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27,
+  0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18,
+  0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90,
+  0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc,
+  0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe,
+  0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e,
+  0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e,
+  0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb,
+  0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff,
+  0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 0x6e,
+  0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30,
+  0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c,
+  0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60,
+  0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0,
+  0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20,
+  0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd,
+  0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd,
+  0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5,
+  0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4,
+  0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd,
+  0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49,
+  0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3,
+  0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1,
+  0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c,
+  0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2,
+  0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f,
+  0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c,
+  0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d,
+  0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c,
+  0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf,
+  0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7,
+  0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c,
+  0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90,
+  0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38,
+  0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93,
+  0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd,
+  0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88,
+  0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88,
+  0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60,
+  0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd,
+  0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8,
+  0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79,
+  0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79,
+  0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d,
+  0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43,
+  0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41,
+  0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4,
+  0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd,
+  0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6,
+  0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34,
+  0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1,
+  0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c,
+  0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c,
+  0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff,
+  0x52, 0x3d, 0xa2, 0xf2, 0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58,
+  0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d,
+  0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92,
+  0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8,
+  0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d,
+  0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c,
+  0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90,
+  0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3,
+  0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c,
+  0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc,
+  0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2,
+  0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d,
+  0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b,
+  0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a,
+  0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa,
+  0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c,
+  0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e,
+  0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd,
+  0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3,
+  0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17,
+  0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88,
+  0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd,
+  0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7,
+  0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f,
+  0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79,
+  0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd,
+  0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4,
+  0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5,
+  0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87,
+  0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c,
+  0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50,
+  0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12,
+  0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b,
+  0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d,
+  0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55,
+  0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8,
+  0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82,
+  0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd,
+  0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60,
+  0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c,
+  0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66,
+  0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd,
+  0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e,
+  0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81,
+  0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 0xbd, 0x5f, 0xcd, 0x9b,
+  0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd,
+  0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26,
+  0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88,
+  0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72,
+  0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc,
+  0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc,
+  0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76,
+  0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98,
+  0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b,
+  0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5,
+  0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf,
+  0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd,
+  0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d,
+  0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00,
+  0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0,
+  0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14,
+  0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c,
+  0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3,
+  0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b,
+  0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a,
+  0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d,
+  0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b,
+  0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80,
+  0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29,
+  0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc,
+  0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07,
+  0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69,
+  0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf,
+  0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c,
+  0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc,
+  0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e,
+  0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84,
+  0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d,
+  0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40,
+  0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e,
+  0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e,
+  0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d,
+  0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92,
+  0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb,
+  0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29,
+  0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc,
+  0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd,
+  0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5,
+  0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22,
+  0xbc, 0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc,
+  0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b,
+  0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec,
+  0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32,
+  0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc,
+  0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce,
+  0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61,
+  0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda,
+  0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d,
+  0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2,
+  0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82,
+  0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b,
+  0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd,
+  0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a,
+  0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f,
+  0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c,
+  0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d,
+  0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee,
+  0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5,
+  0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17,
+  0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc,
+  0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56,
+  0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6,
+  0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08,
+  0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d,
+  0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35,
+  0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79,
+  0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29,
+  0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc,
+  0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46,
+  0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69,
+  0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19,
+  0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c,
+  0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8,
+  0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29,
+  0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a,
+  0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd,
+  0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c,
+  0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0,
+  0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6,
+  0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d,
+  0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c,
+  0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f,
+  0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c,
+  0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c,
+  0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a,
+  0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4,
+  0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0,
+  0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d,
+  0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4,
+  0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92,
+  0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0,
+  0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd,
+  0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf,
+  0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb,
+  0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd,
+  0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c,
+  0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1,
+  0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5,
+  0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53,
+  0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd,
+  0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c,
+  0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61,
+  0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23,
+  0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd,
+  0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5,
+  0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf,
+  0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6,
+  0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d,
+  0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe,
+  0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05,
+  0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20,
+  0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b,
+  0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c,
+  0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43,
+  0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85,
+  0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd,
+  0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4,
+  0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f,
+  0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13,
+  0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d,
+  0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6,
+  0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95,
+  0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba,
+  0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd,
+  0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03,
+  0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a,
+  0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52,
+  0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 0x7f, 0xbd,
+  0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1,
+  0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8,
+  0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8,
+  0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd,
+  0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90,
+  0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0,
+  0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10,
+  0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd,
+  0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68,
+  0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa,
+  0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48,
+  0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd,
+  0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b,
+  0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d,
+  0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81,
+  0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d,
+  0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78,
+  0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50,
+  0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63,
+  0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d,
+  0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7,
+  0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87,
+  0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b,
+  0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c,
+  0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4,
+  0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58,
+  0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff,
+  0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd,
+  0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30,
+  0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0,
+  0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab,
+  0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b,
+  0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c,
+  0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70,
+  0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71,
+  0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd,
+  0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6,
+  0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0,
+  0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb,
+  0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd,
+  0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf,
+  0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a,
+  0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76,
+  0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d,
+  0xba, 0x1a, 0x81, 0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4,
+  0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed,
+  0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f,
+  0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd,
+  0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10,
+  0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a,
+  0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c,
+  0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd,
+  0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c,
+  0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3,
+  0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2,
+  0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b,
+  0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e,
+  0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07,
+  0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d,
+  0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd,
+  0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57,
+  0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5,
+  0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f,
+  0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd,
+  0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81,
+  0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a,
+  0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5,
+  0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd,
+  0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e,
+  0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0,
+  0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95,
+  0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d,
+  0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd,
+  0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44,
+  0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05,
+  0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c,
+  0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4,
+  0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d,
+  0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34,
+  0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc,
+  0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c,
+  0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2,
+  0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99,
+  0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd,
+  0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb,
+  0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22,
+  0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf,
+  0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c,
+  0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 0xdb, 0x1f, 0x31, 0xbd, 0x91,
+  0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4,
+  0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53,
+  0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd,
+  0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a,
+  0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84,
+  0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5,
+  0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc,
+  0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c,
+  0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78,
+  0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84,
+  0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c,
+  0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9,
+  0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f,
+  0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b,
+  0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d,
+  0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64,
+  0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00,
+  0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb,
+  0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d,
+  0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc,
+  0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08,
+  0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a,
+  0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d,
+  0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf,
+  0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d,
+  0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00,
+  0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d,
+  0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20,
+  0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5,
+  0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b,
+  0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc,
+  0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67,
+  0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e,
+  0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a,
+  0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d,
+  0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f,
+  0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29,
+  0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55,
+  0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd,
+  0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9,
+  0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf,
+  0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a,
+  0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc,
+  0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85,
+  0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c,
+  0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44,
+  0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d,
+  0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19,
+  0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09,
+  0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6,
+  0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d,
+  0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40,
+  0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f,
+  0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d,
+  0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d,
+  0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9,
+  0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0,
+  0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7,
+  0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd,
+  0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12,
+  0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4,
+  0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90,
+  0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d,
+  0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5,
+  0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55,
+  0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51,
+  0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d,
+  0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92,
+  0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03,
+  0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03,
+  0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c,
+  0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e,
+  0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12,
+  0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63,
+  0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d,
+  0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2,
+  0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15,
+  0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58,
+  0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc,
+  0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b,
+  0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f,
+  0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7,
+  0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c,
+  0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d,
+  0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22,
+  0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86,
+  0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c,
+  0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d,
+  0xfa, 0x80, 0x3d, 0x6b, 0x86, 0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b,
+  0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7,
+  0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d,
+  0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a,
+  0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c,
+  0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e,
+  0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d,
+  0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43,
+  0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94,
+  0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a,
+  0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd,
+  0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f,
+  0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e,
+  0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf,
+  0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc,
+  0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85,
+  0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31,
+  0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f,
+  0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb,
+  0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51,
+  0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08,
+  0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88,
+  0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd,
+  0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b,
+  0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c,
+  0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47,
+  0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d,
+  0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39,
+  0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13,
+  0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82,
+  0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd,
+  0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7,
+  0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f,
+  0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09,
+  0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc,
+  0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b,
+  0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69,
+  0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad,
+  0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d,
+  0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf,
+  0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c,
+  0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde,
+  0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c,
+  0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40,
+  0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 0x3d, 0x9b, 0x3f,
+  0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3,
+  0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc,
+  0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15,
+  0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7,
+  0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15,
+  0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c,
+  0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16,
+  0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab,
+  0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85,
+  0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d,
+  0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde,
+  0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5,
+  0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87,
+  0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c,
+  0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42,
+  0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc,
+  0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7,
+  0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d,
+  0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae,
+  0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92,
+  0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87,
+  0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d,
+  0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21,
+  0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d,
+  0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40,
+  0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b,
+  0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff,
+  0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e,
+  0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a,
+  0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c,
+  0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34,
+  0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3,
+  0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4,
+  0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd,
+  0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f,
+  0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3,
+  0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82,
+  0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb,
+  0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27,
+  0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c,
+  0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f,
+  0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb,
+  0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8,
+  0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6,
+  0xbb, 0xba, 0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e,
+  0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc,
+  0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90,
+  0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c,
+  0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e,
+  0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd,
+  0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14,
+  0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a,
+  0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28,
+  0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d,
+  0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f,
+  0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d,
+  0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f,
+  0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d,
+  0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47,
+  0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9,
+  0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d,
+  0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d,
+  0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb,
+  0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77,
+  0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18,
+  0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd,
+  0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5,
+  0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88,
+  0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b,
+  0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc,
+  0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b,
+  0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8,
+  0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0,
+  0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d,
+  0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe,
+  0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa,
+  0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13,
+  0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d,
+  0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25,
+  0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8,
+  0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a,
+  0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d,
+  0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4,
+  0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e,
+  0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93,
+  0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc,
+  0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20,
+  0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20,
+  0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01,
+  0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc,
+  0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86,
+  0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f,
+  0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41,
+  0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd,
+  0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b,
+  0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a,
+  0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf,
+  0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d,
+  0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3,
+  0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae,
+  0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73,
+  0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c,
+  0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4,
+  0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50,
+  0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d,
+  0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd,
+  0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70,
+  0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15,
+  0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf,
+  0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd,
+  0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f,
+  0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e,
+  0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f,
+  0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c,
+  0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02,
+  0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf,
+  0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1,
+  0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d,
+  0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47,
+  0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21,
+  0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e,
+  0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d,
+  0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38,
+  0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6,
+  0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55,
+  0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd,
+  0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36,
+  0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a,
+  0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d,
+  0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d,
+  0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef,
+  0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7,
+  0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 0x0f,
+  0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d,
+  0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2,
+  0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c,
+  0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e,
+  0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c,
+  0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a,
+  0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4,
+  0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff,
+  0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d,
+  0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51,
+  0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58,
+  0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21,
+  0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b,
+  0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e,
+  0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd,
+  0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44,
+  0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc,
+  0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07,
+  0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e,
+  0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1,
+  0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c,
+  0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43,
+  0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90,
+  0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24,
+  0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc,
+  0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0,
+  0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45,
+  0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31,
+  0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd,
+  0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8,
+  0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51,
+  0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c,
+  0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd,
+  0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38,
+  0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45,
+  0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79,
+  0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc,
+  0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e,
+  0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92,
+  0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87,
+  0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd,
+  0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc,
+  0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91,
+  0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9,
+  0xbb, 0xc4, 0x0b, 0x56, 0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d,
+  0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c,
+  0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a,
+  0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f,
+  0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd,
+  0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c,
+  0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f,
+  0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b,
+  0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b,
+  0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c,
+  0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0,
+  0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f,
+  0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c,
+  0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22,
+  0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28,
+  0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31,
+  0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd,
+  0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69,
+  0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19,
+  0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e,
+  0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d,
+  0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda,
+  0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56,
+  0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f,
+  0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc,
+  0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18,
+  0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22,
+  0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90,
+  0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c,
+  0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66,
+  0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46,
+  0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09,
+  0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd,
+  0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e,
+  0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c,
+  0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f,
+  0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd,
+  0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0,
+  0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76,
+  0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c,
+  0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c,
+  0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00,
+  0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b,
+  0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05,
+  0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 0x48, 0x17, 0xe9, 0x3c,
+  0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40,
+  0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71,
+  0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31,
+  0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba,
+  0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68,
+  0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81,
+  0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde,
+  0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d,
+  0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14,
+  0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0,
+  0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0,
+  0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd,
+  0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f,
+  0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9,
+  0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a,
+  0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd,
+  0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24,
+  0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9,
+  0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83,
+  0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d,
+  0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3,
+  0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f,
+  0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11,
+  0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc,
+  0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00,
+  0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a,
+  0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32,
+  0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a,
+  0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce,
+  0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f,
+  0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70,
+  0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d,
+  0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8,
+  0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f,
+  0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a,
+  0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd,
+  0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13,
+  0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef,
+  0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29,
+  0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc,
+  0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc,
+  0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6,
+  0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b,
+  0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c,
+  0xb6, 0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0,
+  0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f,
+  0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85,
+  0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd,
+  0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1,
+  0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57,
+  0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8,
+  0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b,
+  0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c,
+  0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5,
+  0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90,
+  0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc,
+  0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e,
+  0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae,
+  0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84,
+  0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd,
+  0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4,
+  0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab,
+  0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54,
+  0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b,
+  0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8,
+  0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4,
+  0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac,
+  0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c,
+  0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8,
+  0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b,
+  0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48,
+  0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd,
+  0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e,
+  0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54,
+  0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59,
+  0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c,
+  0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79,
+  0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba,
+  0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f,
+  0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd,
+  0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90,
+  0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13,
+  0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2,
+  0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd,
+  0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28,
+  0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8,
+  0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a,
+  0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd,
+  0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0,
+  0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7,
+  0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68,
+  0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d,
+  0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0,
+  0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde,
+  0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19,
+  0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c,
+  0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52,
+  0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32,
+  0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b,
+  0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c,
+  0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa,
+  0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75,
+  0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3,
+  0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d,
+  0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81,
+  0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd,
+  0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea,
+  0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d,
+  0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6,
+  0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66,
+  0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e,
+  0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb,
+  0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84,
+  0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d,
+  0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80,
+  0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c,
+  0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3,
+  0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a,
+  0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c,
+  0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d,
+  0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70,
+  0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb,
+  0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8,
+  0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d,
+  0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd,
+  0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c,
+  0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c,
+  0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd,
+  0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c,
+  0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed,
+  0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1,
+  0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c,
+  0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 0x3b, 0x78,
+  0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1,
+  0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15,
+  0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd,
+  0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c,
+  0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa,
+  0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66,
+  0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d,
+  0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00,
+  0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32,
+  0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e,
+  0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d,
+  0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70,
+  0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf,
+  0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80,
+  0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b,
+  0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0,
+  0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b,
+  0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37,
+  0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd,
+  0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02,
+  0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad,
+  0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58,
+  0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d,
+  0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a,
+  0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04,
+  0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c,
+  0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd,
+  0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30,
+  0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42,
+  0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca,
+  0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d,
+  0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96,
+  0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0,
+  0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99,
+  0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d,
+  0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda,
+  0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a,
+  0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8,
+  0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c,
+  0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e,
+  0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f,
+  0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff,
+  0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd,
+  0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68,
+  0xd8, 0x3e, 0x3c, 0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26,
+  0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18,
+  0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d,
+  0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70,
+  0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6,
+  0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf,
+  0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c,
+  0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80,
+  0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28,
+  0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2,
+  0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd,
+  0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0,
+  0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c,
+  0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87,
+  0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c,
+  0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba,
+  0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde,
+  0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d,
+  0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc,
+  0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10,
+  0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f,
+  0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a,
+  0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d,
+  0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2,
+  0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69,
+  0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6,
+  0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc,
+  0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30,
+  0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37,
+  0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0,
+  0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc,
+  0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0,
+  0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e,
+  0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66,
+  0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d,
+  0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc,
+  0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6,
+  0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89,
+  0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c,
+  0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e,
+  0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3,
+  0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec,
+  0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd,
+  0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78,
+  0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 0x7e, 0xbd, 0x3a, 0x8c, 0x61,
+  0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b,
+  0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc,
+  0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96,
+  0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10,
+  0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81,
+  0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d,
+  0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80,
+  0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13,
+  0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60,
+  0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b,
+  0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0,
+  0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90,
+  0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6,
+  0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d,
+  0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb,
+  0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50,
+  0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79,
+  0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d,
+  0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40,
+  0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57,
+  0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85,
+  0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd,
+  0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85,
+  0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9,
+  0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70,
+  0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd,
+  0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02,
+  0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a,
+  0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a,
+  0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d,
+  0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96,
+  0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06,
+  0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05,
+  0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd,
+  0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68,
+  0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2,
+  0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61,
+  0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c,
+  0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8,
+  0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78,
+  0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c,
+  0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b,
+  0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4,
+  0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9,
+  0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb,
+  0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c,
+  0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8,
+  0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a,
+  0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01,
+  0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c,
+  0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90,
+  0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90,
+  0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11,
+  0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c,
+  0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40,
+  0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e,
+  0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d,
+  0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd,
+  0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2,
+  0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc,
+  0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75,
+  0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc,
+  0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca,
+  0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf,
+  0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85,
+  0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc,
+  0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0,
+  0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b,
+  0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6,
+  0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d,
+  0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5,
+  0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4,
+  0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17,
+  0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c,
+  0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc,
+  0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64,
+  0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8,
+  0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd,
+  0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3,
+  0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b,
+  0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87,
+  0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd,
+  0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6,
+  0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1,
+  0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c,
+  0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a,
+  0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c,
+  0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a,
+  0x53, 0x3d, 0xb3, 0x0a, 0x8b, 0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d,
+  0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd,
+  0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4,
+  0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7,
+  0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d,
+  0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd,
+  0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30,
+  0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd,
+  0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac,
+  0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d,
+  0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e,
+  0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38,
+  0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe,
+  0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d,
+  0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24,
+  0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde,
+  0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6,
+  0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b,
+  0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a,
+  0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d,
+  0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e,
+  0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba,
+  0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e,
+  0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40,
+  0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f,
+  0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c,
+  0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30,
+  0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8,
+  0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88,
+  0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b,
+  0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10,
+  0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92,
+  0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e,
+  0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d,
+  0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42,
+  0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1,
+  0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6,
+  0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc,
+  0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2,
+  0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b,
+  0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0,
+  0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c,
+  0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb,
+  0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf,
+  0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 0x32, 0x25, 0x5d,
+  0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c,
+  0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb,
+  0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d,
+  0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e,
+  0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d,
+  0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea,
+  0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8,
+  0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2,
+  0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc,
+  0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4,
+  0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b,
+  0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f,
+  0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c,
+  0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27,
+  0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32,
+  0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa,
+  0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c,
+  0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f,
+  0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad,
+  0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b,
+  0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd,
+  0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04,
+  0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15,
+  0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a,
+  0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b,
+  0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac,
+  0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92,
+  0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8,
+  0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd,
+  0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc,
+  0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51,
+  0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6,
+  0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc,
+  0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed,
+  0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9,
+  0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f,
+  0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc,
+  0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d,
+  0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb,
+  0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd,
+  0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b,
+  0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74,
+  0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e,
+  0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd,
+  0xbd, 0xc1, 0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc,
+  0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd,
+  0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c,
+  0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98,
+  0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d,
+  0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe,
+  0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b,
+  0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51,
+  0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c,
+  0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b,
+  0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15,
+  0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b,
+  0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd,
+  0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90,
+  0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14,
+  0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c,
+  0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd,
+  0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c,
+  0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61,
+  0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89,
+  0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc,
+  0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55,
+  0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4,
+  0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4,
+  0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c,
+  0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12,
+  0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd,
+  0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90,
+  0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc,
+  0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5,
+  0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d,
+  0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79,
+  0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd,
+  0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a,
+  0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f,
+  0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad,
+  0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd,
+  0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25,
+  0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97,
+  0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2,
+  0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d,
+  0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6,
+  0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a,
+  0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1,
+  0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d,
+  0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e,
+  0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e,
+  0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78,
+  0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c,
+  0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84,
+  0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0,
+  0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7,
+  0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb,
+  0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5,
+  0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10,
+  0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b,
+  0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d,
+  0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28,
+  0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1,
+  0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66,
+  0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d,
+  0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d,
+  0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3,
+  0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10,
+  0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a,
+  0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72,
+  0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda,
+  0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c,
+  0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd,
+  0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea,
+  0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd,
+  0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3,
+  0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c,
+  0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99,
+  0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56,
+  0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2,
+  0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc,
+  0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1,
+  0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec,
+  0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45,
+  0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd,
+  0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b,
+  0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48,
+  0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3,
+  0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d,
+  0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9,
+  0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec,
+  0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09,
+  0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 0x3c,
+  0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26,
+  0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0,
+  0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e,
+  0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c,
+  0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87,
+  0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97,
+  0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee,
+  0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd,
+  0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a,
+  0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2,
+  0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92,
+  0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c,
+  0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0,
+  0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88,
+  0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd,
+  0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c,
+  0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8,
+  0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f,
+  0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7,
+  0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc,
+  0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f,
+  0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e,
+  0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27,
+  0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe,
+  0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6,
+  0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37,
+  0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7,
+  0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc,
+  0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e,
+  0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4,
+  0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87,
+  0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c,
+  0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e,
+  0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99,
+  0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f,
+  0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd,
+  0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01,
+  0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59,
+  0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a,
+  0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb,
+  0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3,
+  0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52,
+  0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0,
+  0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd,
+  0xad, 0x80, 0xdf, 0x3c, 0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b,
+  0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b,
+  0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7,
+  0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d,
+  0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b,
+  0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7,
+  0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5,
+  0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd,
+  0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2,
+  0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47,
+  0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52,
+  0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c,
+  0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a,
+  0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85,
+  0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b,
+  0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d,
+  0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5,
+  0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30,
+  0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd,
+  0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd,
+  0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1,
+  0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26,
+  0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92,
+  0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c,
+  0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6,
+  0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f,
+  0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4,
+  0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b,
+  0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f,
+  0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16,
+  0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6,
+  0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc,
+  0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74,
+  0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6,
+  0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43,
+  0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c,
+  0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39,
+  0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9,
+  0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f,
+  0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d,
+  0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b,
+  0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac,
+  0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03,
+  0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b,
+  0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 0x10, 0x8c, 0x3c, 0x5e,
+  0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5,
+  0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1,
+  0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d,
+  0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f,
+  0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63,
+  0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3,
+  0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc,
+  0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6,
+  0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98,
+  0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8,
+  0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd,
+  0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95,
+  0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8,
+  0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b,
+  0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d,
+  0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a,
+  0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e,
+  0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53,
+  0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c,
+  0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f,
+  0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce,
+  0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec,
+  0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d,
+  0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90,
+  0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37,
+  0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7,
+  0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d,
+  0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0,
+  0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65,
+  0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29,
+  0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d,
+  0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02,
+  0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef,
+  0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3,
+  0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a,
+  0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc,
+  0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83,
+  0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c,
+  0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d,
+  0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54,
+  0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9,
+  0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8,
+  0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d,
+  0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e,
+  0x7e, 0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee,
+  0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc,
+  0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c,
+  0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f,
+  0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4,
+  0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f,
+  0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b,
+  0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65,
+  0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac,
+  0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60,
+  0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c,
+  0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a,
+  0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10,
+  0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e,
+  0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd,
+  0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45,
+  0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe,
+  0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92,
+  0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd,
+  0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96,
+  0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4,
+  0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde,
+  0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d,
+  0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52,
+  0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe,
+  0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86,
+  0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d,
+  0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1,
+  0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1,
+  0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48,
+  0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d,
+  0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46,
+  0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac,
+  0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15,
+  0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb,
+  0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0,
+  0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17,
+  0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c,
+  0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d,
+  0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9,
+  0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77,
+  0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d,
+  0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d,
+  0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53,
+  0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1,
+  0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05,
+  0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc,
+  0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd,
+  0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d,
+  0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06,
+  0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd,
+  0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff,
+  0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e,
+  0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d,
+  0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb,
+  0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6,
+  0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91,
+  0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85,
+  0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc,
+  0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b,
+  0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5,
+  0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33,
+  0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c,
+  0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99,
+  0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43,
+  0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6,
+  0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d,
+  0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41,
+  0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20,
+  0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38,
+  0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd,
+  0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d,
+  0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3,
+  0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22,
+  0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb,
+  0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08,
+  0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e,
+  0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04,
+  0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd,
+  0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe,
+  0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa,
+  0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89,
+  0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd,
+  0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8,
+  0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53,
+  0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b,
+  0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c,
+  0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14,
+  0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 0x8b, 0x8e,
+  0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88,
+  0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd,
+  0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4,
+  0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2,
+  0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1,
+  0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d,
+  0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e,
+  0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2,
+  0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c,
+  0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c,
+  0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe,
+  0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c,
+  0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a,
+  0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c,
+  0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b,
+  0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe,
+  0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e,
+  0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c,
+  0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d,
+  0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05,
+  0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44,
+  0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d,
+  0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c,
+  0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75,
+  0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85,
+  0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d,
+  0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39,
+  0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f,
+  0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98,
+  0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39,
+  0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb,
+  0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4,
+  0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78,
+  0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d,
+  0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8,
+  0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03,
+  0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b,
+  0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb,
+  0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67,
+  0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42,
+  0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b,
+  0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb,
+  0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92,
+  0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0,
+  0x8b, 0x3c, 0x4a, 0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39,
+  0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d,
+  0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30,
+  0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a,
+  0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30,
+  0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc,
+  0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00,
+  0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a,
+  0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4,
+  0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39,
+  0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c,
+  0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06,
+  0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59,
+  0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd,
+  0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d,
+  0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d,
+  0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91,
+  0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc,
+  0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8,
+  0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c,
+  0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33,
+  0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d,
+  0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0,
+  0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38,
+  0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4,
+  0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc,
+  0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42,
+  0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a,
+  0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12,
+  0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c,
+  0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24,
+  0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42,
+  0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01,
+  0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c,
+  0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22,
+  0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69,
+  0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36,
+  0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd,
+  0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43,
+  0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91,
+  0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53,
+  0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9,
+  0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75,
+  0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5,
+  0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 0x91, 0x3c, 0x29, 0x21, 0x40,
+  0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d,
+  0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49,
+  0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98,
+  0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91,
+  0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c,
+  0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25,
+  0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5,
+  0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54,
+  0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b,
+  0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd,
+  0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17,
+  0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19,
+  0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c,
+  0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67,
+  0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97,
+  0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e,
+  0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc,
+  0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32,
+  0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd,
+  0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a,
+  0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d,
+  0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea,
+  0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8,
+  0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a,
+  0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c,
+  0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e,
+  0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7,
+  0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48,
+  0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc,
+  0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85,
+  0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25,
+  0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a,
+  0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd,
+  0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2,
+  0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee,
+  0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35,
+  0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d,
+  0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74,
+  0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd,
+  0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0,
+  0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d,
+  0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03,
+  0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc,
+  0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d,
+  0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c,
+  0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4,
+  0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f,
+  0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2,
+  0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d,
+  0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15,
+  0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0,
+  0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf,
+  0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc,
+  0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3,
+  0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89,
+  0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda,
+  0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c,
+  0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba,
+  0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3,
+  0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab,
+  0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd,
+  0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9,
+  0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a,
+  0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b,
+  0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc,
+  0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1,
+  0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0,
+  0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26,
+  0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd,
+  0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd,
+  0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09,
+  0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79,
+  0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd,
+  0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd,
+  0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54,
+  0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31,
+  0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c,
+  0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0,
+  0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce,
+  0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12,
+  0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c,
+  0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7,
+  0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2,
+  0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a,
+  0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d,
+  0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c,
+  0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e,
+  0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab,
+  0xbc, 0x8a, 0x1a, 0x10, 0x3d, 0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c,
+  0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0,
+  0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81,
+  0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70,
+  0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd,
+  0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46,
+  0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06,
+  0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59,
+  0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c,
+  0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a,
+  0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e,
+  0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1,
+  0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c,
+  0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10,
+  0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22,
+  0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11,
+  0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c,
+  0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e,
+  0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5,
+  0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59,
+  0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c,
+  0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d,
+  0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5,
+  0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd,
+  0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c,
+  0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8,
+  0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62,
+  0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15,
+  0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc,
+  0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b,
+  0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd,
+  0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4,
+  0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd,
+  0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74,
+  0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7,
+  0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82,
+  0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc,
+  0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22,
+  0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa,
+  0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8,
+  0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd,
+  0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd,
+  0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f,
+  0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e,
+  0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 0xf8, 0x7e, 0xbd,
+  0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86,
+  0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca,
+  0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e,
+  0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc,
+  0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb,
+  0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58,
+  0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94,
+  0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c,
+  0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76,
+  0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72,
+  0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a,
+  0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c,
+  0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1,
+  0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0,
+  0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1,
+  0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd,
+  0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c,
+  0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde,
+  0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e,
+  0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd,
+  0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c,
+  0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a,
+  0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18,
+  0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd,
+  0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50,
+  0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8,
+  0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7,
+  0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d,
+  0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e,
+  0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b,
+  0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18,
+  0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c,
+  0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35,
+  0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8,
+  0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86,
+  0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d,
+  0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16,
+  0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf,
+  0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e,
+  0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d,
+  0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08,
+  0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d,
+  0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4,
+  0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd,
+  0x4b, 0xbd, 0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41,
+  0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae,
+  0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d,
+  0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd,
+  0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62,
+  0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19,
+  0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31,
+  0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b,
+  0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8,
+  0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0,
+  0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec,
+  0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc,
+  0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e,
+  0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7,
+  0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d,
+  0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b,
+  0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71,
+  0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c,
+  0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77,
+  0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc,
+  0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41,
+  0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14,
+  0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7,
+  0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d,
+  0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9,
+  0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42,
+  0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d,
+  0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c,
+  0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea,
+  0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe,
+  0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86,
+  0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c,
+  0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0,
+  0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6,
+  0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20,
+  0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d,
+  0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3,
+  0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81,
+  0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd,
+  0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c,
+  0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88,
+  0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22,
+  0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a,
+  0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c,
+  0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57,
+  0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01,
+  0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b,
+  0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c,
+  0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88,
+  0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf,
+  0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81,
+  0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb,
+  0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6,
+  0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda,
+  0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a,
+  0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc,
+  0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba,
+  0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05,
+  0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6,
+  0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb,
+  0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d,
+  0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7,
+  0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd,
+  0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d,
+  0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b,
+  0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1,
+  0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3,
+  0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd,
+  0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73,
+  0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d,
+  0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f,
+  0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c,
+  0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe,
+  0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2,
+  0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d,
+  0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39,
+  0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8,
+  0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02,
+  0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9,
+  0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd,
+  0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03,
+  0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28,
+  0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51,
+  0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d,
+  0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28,
+  0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f,
+  0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e,
+  0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d,
+  0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 0xa4,
+  0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7,
+  0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8,
+  0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d,
+  0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0,
+  0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68,
+  0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a,
+  0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d,
+  0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb,
+  0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9,
+  0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d,
+  0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd,
+  0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1,
+  0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c,
+  0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7,
+  0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc,
+  0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d,
+  0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d,
+  0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6,
+  0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d,
+  0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46,
+  0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20,
+  0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54,
+  0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc,
+  0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52,
+  0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda,
+  0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf,
+  0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d,
+  0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63,
+  0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b,
+  0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87,
+  0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd,
+  0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa,
+  0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39,
+  0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25,
+  0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc,
+  0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69,
+  0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84,
+  0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c,
+  0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d,
+  0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d,
+  0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89,
+  0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e,
+  0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb,
+  0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f,
+  0xb4, 0xf5, 0xbc, 0x5c, 0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff,
+  0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94,
+  0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b,
+  0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65,
+  0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64,
+  0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96,
+  0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd,
+  0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce,
+  0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45,
+  0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7,
+  0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d,
+  0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86,
+  0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f,
+  0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99,
+  0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb,
+  0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31,
+  0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9,
+  0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c,
+  0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd,
+  0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52,
+  0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40,
+  0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d,
+  0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a,
+  0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09,
+  0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65,
+  0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f,
+  0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc,
+  0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e,
+  0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4,
+  0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58,
+  0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd,
+  0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28,
+  0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96,
+  0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85,
+  0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd,
+  0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4,
+  0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64,
+  0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94,
+  0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd,
+  0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde,
+  0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23,
+  0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9,
+  0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d,
+  0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0,
+  0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 0x89, 0xbd, 0x61, 0x99,
+  0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f,
+  0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d,
+  0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86,
+  0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71,
+  0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21,
+  0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d,
+  0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f,
+  0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49,
+  0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4,
+  0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd,
+  0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84,
+  0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9,
+  0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86,
+  0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d,
+  0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83,
+  0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb,
+  0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b,
+  0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc,
+  0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79,
+  0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd,
+  0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0,
+  0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd,
+  0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb,
+  0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69,
+  0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78,
+  0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc,
+  0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26,
+  0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7,
+  0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88,
+  0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d,
+  0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59,
+  0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8,
+  0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b,
+  0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c,
+  0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1,
+  0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e,
+  0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07,
+  0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c,
+  0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19,
+  0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58,
+  0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91,
+  0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d,
+  0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66,
+  0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac,
+  0x59, 0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34,
+  0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d,
+  0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e,
+  0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7,
+  0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6,
+  0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d,
+  0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65,
+  0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1,
+  0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b,
+  0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d,
+  0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac,
+  0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71,
+  0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34,
+  0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc,
+  0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04,
+  0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c,
+  0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49,
+  0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d,
+  0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe,
+  0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9,
+  0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91,
+  0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d,
+  0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe,
+  0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8,
+  0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59,
+  0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc,
+  0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78,
+  0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7,
+  0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85,
+  0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d,
+  0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c,
+  0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc,
+  0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4,
+  0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c,
+  0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91,
+  0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e,
+  0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76,
+  0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d,
+  0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d,
+  0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e,
+  0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42,
+  0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd,
+  0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c,
+  0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86,
+  0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6,
+  0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c,
+  0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08,
+  0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf,
+  0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7,
+  0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d,
+  0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30,
+  0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01,
+  0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12,
+  0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c,
+  0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c,
+  0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4,
+  0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c,
+  0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d,
+  0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd,
+  0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3,
+  0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50,
+  0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc,
+  0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19,
+  0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76,
+  0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30,
+  0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd,
+  0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f,
+  0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b,
+  0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09,
+  0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb,
+  0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a,
+  0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e,
+  0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59,
+  0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b,
+  0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2,
+  0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec,
+  0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82,
+  0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd,
+  0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5,
+  0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5,
+  0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22,
+  0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d,
+  0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3,
+  0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e,
+  0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21,
+  0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd,
+  0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4,
+  0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee,
+  0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 0x65, 0x41,
+  0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd,
+  0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad,
+  0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06,
+  0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76,
+  0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d,
+  0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae,
+  0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2,
+  0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1,
+  0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc,
+  0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15,
+  0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd,
+  0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb,
+  0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd,
+  0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1,
+  0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82,
+  0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87,
+  0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d,
+  0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69,
+  0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b,
+  0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00,
+  0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd,
+  0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0,
+  0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd,
+  0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79,
+  0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd,
+  0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52,
+  0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b,
+  0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02,
+  0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd,
+  0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9,
+  0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0,
+  0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e,
+  0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc,
+  0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07,
+  0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9,
+  0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52,
+  0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d,
+  0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93,
+  0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9,
+  0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80,
+  0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc,
+  0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1,
+  0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3,
+  0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86,
+  0x3c, 0xbf, 0x14, 0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c,
+  0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff,
+  0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea,
+  0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30,
+  0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d,
+  0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb,
+  0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2,
+  0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85,
+  0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc,
+  0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7,
+  0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1,
+  0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4,
+  0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d,
+  0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40,
+  0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31,
+  0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80,
+  0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd,
+  0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10,
+  0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9,
+  0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c,
+  0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d,
+  0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48,
+  0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a,
+  0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e,
+  0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc,
+  0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04,
+  0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0,
+  0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f,
+  0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc,
+  0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b,
+  0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95,
+  0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84,
+  0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd,
+  0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28,
+  0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5,
+  0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad,
+  0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c,
+  0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2,
+  0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d,
+  0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06,
+  0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd,
+  0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6,
+  0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a,
+  0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38,
+  0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 0x3d, 0x61, 0x4d, 0x88, 0xbd,
+  0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40,
+  0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06,
+  0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70,
+  0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d,
+  0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49,
+  0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8,
+  0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0,
+  0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc,
+  0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70,
+  0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a,
+  0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40,
+  0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d,
+  0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee,
+  0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41,
+  0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c,
+  0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd,
+  0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6,
+  0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d,
+  0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec,
+  0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc,
+  0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1,
+  0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71,
+  0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c,
+  0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d,
+  0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0,
+  0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9,
+  0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b,
+  0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38,
+  0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2,
+  0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86,
+  0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84,
+  0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb,
+  0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99,
+  0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb,
+  0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88,
+  0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd,
+  0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40,
+  0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95,
+  0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf,
+  0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c,
+  0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48,
+  0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2,
+  0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a,
+  0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d,
+  0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69,
+  0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f,
+  0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a,
+  0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb,
+  0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8,
+  0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d,
+  0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a,
+  0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d,
+  0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e,
+  0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0,
+  0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40,
+  0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d,
+  0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12,
+  0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71,
+  0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6,
+  0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c,
+  0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75,
+  0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a,
+  0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60,
+  0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c,
+  0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78,
+  0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c,
+  0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e,
+  0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd,
+  0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91,
+  0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb,
+  0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a,
+  0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d,
+  0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84,
+  0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67,
+  0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a,
+  0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc,
+  0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0,
+  0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5,
+  0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0,
+  0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd,
+  0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47,
+  0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20,
+  0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47,
+  0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd,
+  0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8,
+  0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02,
+  0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f,
+  0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd,
+  0x59, 0x7e, 0x04, 0xbd, 0xf1, 0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a,
+  0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf,
+  0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3,
+  0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd,
+  0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a,
+  0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05,
+  0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd,
+  0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc,
+  0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe,
+  0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4,
+  0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6,
+  0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c,
+  0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd,
+  0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb,
+  0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87,
+  0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd,
+  0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e,
+  0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47,
+  0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3,
+  0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc,
+  0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44,
+  0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4,
+  0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b,
+  0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d,
+  0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5,
+  0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42,
+  0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82,
+  0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd,
+  0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44,
+  0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01,
+  0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32,
+  0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc,
+  0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c,
+  0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86,
+  0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2,
+  0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d,
+  0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc,
+  0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30,
+  0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d,
+  0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b,
+  0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf,
+  0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5,
+  0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42,
+  0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc,
+  0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 0x32, 0xbd, 0x1f,
+  0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1,
+  0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f,
+  0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d,
+  0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58,
+  0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63,
+  0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59,
+  0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc,
+  0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61,
+  0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f,
+  0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa,
+  0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d,
+  0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74,
+  0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec,
+  0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf,
+  0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb,
+  0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda,
+  0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84,
+  0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98,
+  0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d,
+  0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2,
+  0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89,
+  0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25,
+  0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b,
+  0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32,
+  0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b,
+  0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80,
+  0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc,
+  0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03,
+  0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde,
+  0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4,
+  0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc,
+  0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e,
+  0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99,
+  0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87,
+  0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d,
+  0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5,
+  0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43,
+  0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40,
+  0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d,
+  0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3,
+  0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f,
+  0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd,
+  0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc,
+  0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d,
+  0x1c, 0xed, 0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8,
+  0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78,
+  0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd,
+  0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4,
+  0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b,
+  0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18,
+  0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d,
+  0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68,
+  0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85,
+  0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2,
+  0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc,
+  0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b,
+  0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35,
+  0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d,
+  0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d,
+  0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76,
+  0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63,
+  0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19,
+  0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd,
+  0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03,
+  0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f,
+  0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98,
+  0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b,
+  0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6,
+  0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54,
+  0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28,
+  0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d,
+  0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81,
+  0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd,
+  0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82,
+  0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd,
+  0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce,
+  0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b,
+  0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a,
+  0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd,
+  0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a,
+  0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b,
+  0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61,
+  0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c,
+  0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c,
+  0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba,
+  0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77,
+  0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd,
+  0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17,
+  0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26,
+  0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1,
+  0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b,
+  0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0,
+  0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56,
+  0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d,
+  0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd,
+  0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb,
+  0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb,
+  0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b,
+  0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc,
+  0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68,
+  0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69,
+  0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57,
+  0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd,
+  0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2,
+  0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71,
+  0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd,
+  0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd,
+  0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a,
+  0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2,
+  0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21,
+  0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd,
+  0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4,
+  0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae,
+  0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40,
+  0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d,
+  0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d,
+  0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7,
+  0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99,
+  0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c,
+  0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90,
+  0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85,
+  0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65,
+  0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d,
+  0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79,
+  0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa,
+  0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b,
+  0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d,
+  0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d,
+  0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a,
+  0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa,
+  0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd,
+  0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69,
+  0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 0x79,
+  0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89,
+  0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c,
+  0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d,
+  0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2,
+  0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac,
+  0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc,
+  0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d,
+  0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3,
+  0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37,
+  0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc,
+  0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6,
+  0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb,
+  0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10,
+  0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c,
+  0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6,
+  0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b,
+  0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1,
+  0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd,
+  0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e,
+  0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28,
+  0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45,
+  0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd,
+  0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49,
+  0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40,
+  0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17,
+  0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b,
+  0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d,
+  0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f,
+  0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10,
+  0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd,
+  0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d,
+  0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77,
+  0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06,
+  0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd,
+  0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49,
+  0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd,
+  0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5,
+  0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c,
+  0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12,
+  0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4,
+  0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81,
+  0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d,
+  0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53,
+  0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d,
+  0x29, 0x3d, 0xd6, 0xf7, 0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7,
+  0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc,
+  0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2,
+  0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17,
+  0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72,
+  0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c,
+  0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f,
+  0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae,
+  0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3,
+  0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d,
+  0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65,
+  0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35,
+  0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6,
+  0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b,
+  0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67,
+  0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6,
+  0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02,
+  0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d,
+  0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d,
+  0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad,
+  0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94,
+  0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc,
+  0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7,
+  0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36,
+  0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5,
+  0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c,
+  0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72,
+  0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14,
+  0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85,
+  0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc,
+  0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82,
+  0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a,
+  0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91,
+  0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d,
+  0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f,
+  0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20,
+  0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a,
+  0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39,
+  0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f,
+  0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7,
+  0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d,
+  0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d,
+  0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a,
+  0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc,
+  0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 0x3b, 0xe9, 0xd7, 0x07,
+  0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c,
+  0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02,
+  0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e,
+  0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c,
+  0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d,
+  0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10,
+  0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27,
+  0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65,
+  0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d,
+  0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d,
+  0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9,
+  0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80,
+  0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d,
+  0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d,
+  0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b,
+  0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08,
+  0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc,
+  0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda,
+  0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3,
+  0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12,
+  0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c,
+  0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f,
+  0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8,
+  0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2,
+  0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d,
+  0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d,
+  0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba,
+  0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a,
+  0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc,
+  0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10,
+  0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc,
+  0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30,
+  0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd,
+  0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1,
+  0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4,
+  0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63,
+  0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc,
+  0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca,
+  0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83,
+  0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79,
+  0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc,
+  0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10,
+  0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39,
+  0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc,
+  0xbc, 0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd,
+  0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00,
+  0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7,
+  0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46,
+  0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc,
+  0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc,
+  0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89,
+  0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff,
+  0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d,
+  0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b,
+  0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22,
+  0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92,
+  0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb,
+  0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44,
+  0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4,
+  0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a,
+  0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc,
+  0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68,
+  0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9,
+  0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63,
+  0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba,
+  0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c,
+  0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d,
+  0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd,
+  0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d,
+  0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83,
+  0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61,
+  0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b,
+  0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c,
+  0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe,
+  0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23,
+  0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb,
+  0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d,
+  0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2,
+  0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e,
+  0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20,
+  0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc,
+  0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c,
+  0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e,
+  0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86,
+  0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb,
+  0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80,
+  0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8,
+  0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72,
+  0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd,
+  0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c,
+  0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20,
+  0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c,
+  0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d,
+  0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60,
+  0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea,
+  0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab,
+  0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd,
+  0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10,
+  0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c,
+  0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5,
+  0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd,
+  0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca,
+  0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a,
+  0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14,
+  0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd,
+  0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47,
+  0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00,
+  0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec,
+  0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b,
+  0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0,
+  0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f,
+  0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98,
+  0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d,
+  0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd,
+  0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0,
+  0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11,
+  0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c,
+  0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec,
+  0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7,
+  0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2,
+  0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd,
+  0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0,
+  0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3,
+  0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65,
+  0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd,
+  0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4,
+  0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8,
+  0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05,
+  0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d,
+  0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0,
+  0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1,
+  0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83,
+  0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 0x05, 0x3d,
+  0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24,
+  0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1,
+  0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75,
+  0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd,
+  0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30,
+  0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0,
+  0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10,
+  0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc,
+  0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8,
+  0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73,
+  0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91,
+  0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd,
+  0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00,
+  0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d,
+  0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4,
+  0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc,
+  0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13,
+  0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d,
+  0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e,
+  0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d,
+  0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35,
+  0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d,
+  0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19,
+  0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c,
+  0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16,
+  0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79,
+  0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35,
+  0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd,
+  0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c,
+  0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e,
+  0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8,
+  0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd,
+  0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84,
+  0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71,
+  0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37,
+  0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b,
+  0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10,
+  0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44,
+  0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94,
+  0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c,
+  0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50,
+  0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42,
+  0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69,
+  0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd,
+  0x00, 0xc4, 0x5e, 0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c,
+  0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82,
+  0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7,
+  0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd,
+  0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e,
+  0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e,
+  0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59,
+  0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a,
+  0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0,
+  0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68,
+  0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80,
+  0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c,
+  0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde,
+  0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f,
+  0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48,
+  0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd,
+  0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc,
+  0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c,
+  0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97,
+  0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d,
+  0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32,
+  0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29,
+  0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b,
+  0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d,
+  0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80,
+  0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b,
+  0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69,
+  0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c,
+  0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc,
+  0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61,
+  0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e,
+  0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d,
+  0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30,
+  0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b,
+  0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85,
+  0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a,
+  0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80,
+  0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6,
+  0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02,
+  0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d,
+  0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30,
+  0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18,
+  0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30,
+  0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d,
+  0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 0x98, 0x96, 0x21, 0xbc, 0xbe,
+  0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40,
+  0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05,
+  0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d,
+  0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0,
+  0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93,
+  0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d,
+  0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd,
+  0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3,
+  0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4,
+  0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82,
+  0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a,
+  0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb,
+  0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c,
+  0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21,
+  0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd,
+  0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12,
+  0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9,
+  0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08,
+  0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d,
+  0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53,
+  0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13,
+  0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44,
+  0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc,
+  0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f,
+  0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89,
+  0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d,
+  0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b,
+  0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8,
+  0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0,
+  0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54,
+  0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c,
+  0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0,
+  0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca,
+  0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71,
+  0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb,
+  0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a,
+  0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e,
+  0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62,
+  0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d,
+  0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e,
+  0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c,
+  0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b,
+  0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d,
+  0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8,
+  0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f,
+  0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62,
+  0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd,
+  0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50,
+  0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73,
+  0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77,
+  0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d,
+  0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c,
+  0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87,
+  0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d,
+  0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd,
+  0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08,
+  0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1,
+  0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab,
+  0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c,
+  0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a,
+  0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03,
+  0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34,
+  0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb,
+  0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa,
+  0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26,
+  0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52,
+  0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b,
+  0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2,
+  0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58,
+  0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f,
+  0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd,
+  0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e,
+  0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98,
+  0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27,
+  0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d,
+  0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70,
+  0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d,
+  0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7,
+  0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d,
+  0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41,
+  0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66,
+  0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a,
+  0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c,
+  0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66,
+  0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c,
+  0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf,
+  0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c,
+  0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd,
+  0x84, 0x22, 0xbd, 0xb2, 0x0a, 0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8,
+  0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71,
+  0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc,
+  0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0,
+  0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85,
+  0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9,
+  0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c,
+  0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0,
+  0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23,
+  0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a,
+  0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d,
+  0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00,
+  0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06,
+  0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a,
+  0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d,
+  0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2,
+  0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab,
+  0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44,
+  0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d,
+  0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94,
+  0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e,
+  0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6,
+  0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc,
+  0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c,
+  0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb,
+  0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2,
+  0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d,
+  0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50,
+  0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8,
+  0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83,
+  0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc,
+  0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81,
+  0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a,
+  0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f,
+  0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd,
+  0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a,
+  0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea,
+  0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84,
+  0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd,
+  0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0,
+  0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76,
+  0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a,
+  0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c,
+  0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa,
+  0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 0xbd, 0x9a, 0xd4,
+  0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92,
+  0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd,
+  0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78,
+  0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15,
+  0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f,
+  0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d,
+  0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5,
+  0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0,
+  0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38,
+  0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc,
+  0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8,
+  0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75,
+  0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6,
+  0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc,
+  0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0,
+  0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e,
+  0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e,
+  0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c,
+  0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80,
+  0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc,
+  0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82,
+  0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c,
+  0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02,
+  0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e,
+  0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a,
+  0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd,
+  0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10,
+  0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3,
+  0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0,
+  0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd,
+  0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02,
+  0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63,
+  0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69,
+  0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c,
+  0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae,
+  0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8,
+  0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40,
+  0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd,
+  0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a,
+  0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80,
+  0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0,
+  0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc,
+  0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7,
+  0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1,
+  0x5c, 0x3c, 0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1,
+  0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c,
+  0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84,
+  0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f,
+  0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05,
+  0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd,
+  0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b,
+  0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8,
+  0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33,
+  0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb,
+  0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c,
+  0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b,
+  0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b,
+  0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc,
+  0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7,
+  0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78,
+  0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1,
+  0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d,
+  0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed,
+  0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84,
+  0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb,
+  0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd,
+  0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe,
+  0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8,
+  0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a,
+  0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc,
+  0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69,
+  0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54,
+  0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9,
+  0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d,
+  0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a,
+  0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c,
+  0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99,
+  0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d,
+  0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8,
+  0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78,
+  0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b,
+  0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c,
+  0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc,
+  0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb,
+  0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d,
+  0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd,
+  0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09,
+  0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0,
+  0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8,
+  0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd,
+  0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e,
+  0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6,
+  0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29,
+  0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d,
+  0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21,
+  0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96,
+  0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04,
+  0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd,
+  0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38,
+  0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6,
+  0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84,
+  0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd,
+  0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac,
+  0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec,
+  0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03,
+  0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba,
+  0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae,
+  0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b,
+  0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce,
+  0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c,
+  0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31,
+  0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd,
+  0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21,
+  0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb,
+  0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32,
+  0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42,
+  0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0,
+  0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c,
+  0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47,
+  0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34,
+  0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3,
+  0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c,
+  0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a,
+  0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a,
+  0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43,
+  0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d,
+  0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9,
+  0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3,
+  0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28,
+  0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d,
+  0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e,
+  0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7,
+  0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 0xcd,
+  0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d,
+  0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6,
+  0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9,
+  0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50,
+  0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd,
+  0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa,
+  0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45,
+  0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c,
+  0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd,
+  0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1,
+  0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47,
+  0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62,
+  0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d,
+  0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb,
+  0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7,
+  0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77,
+  0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb,
+  0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73,
+  0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66,
+  0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82,
+  0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d,
+  0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07,
+  0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01,
+  0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34,
+  0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd,
+  0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3,
+  0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4,
+  0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95,
+  0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c,
+  0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0,
+  0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4,
+  0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f,
+  0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c,
+  0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd,
+  0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4,
+  0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16,
+  0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d,
+  0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92,
+  0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb,
+  0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95,
+  0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c,
+  0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58,
+  0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd,
+  0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc,
+  0xbd, 0x22, 0xf9, 0x61, 0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd,
+  0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24,
+  0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14,
+  0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43,
+  0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd,
+  0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42,
+  0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67,
+  0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5,
+  0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd,
+  0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27,
+  0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b,
+  0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12,
+  0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc,
+  0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52,
+  0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f,
+  0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f,
+  0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b,
+  0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50,
+  0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35,
+  0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65,
+  0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c,
+  0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c,
+  0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff,
+  0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31,
+  0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c,
+  0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00,
+  0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48,
+  0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce,
+  0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c,
+  0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8,
+  0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57,
+  0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e,
+  0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc,
+  0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec,
+  0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c,
+  0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d,
+  0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc,
+  0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67,
+  0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc,
+  0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29,
+  0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd,
+  0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02,
+  0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24,
+  0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46,
+  0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 0x53, 0x7b, 0xa6, 0x3d,
+  0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b,
+  0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23,
+  0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f,
+  0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc,
+  0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad,
+  0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6,
+  0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a,
+  0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d,
+  0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11,
+  0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49,
+  0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e,
+  0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc,
+  0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87,
+  0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02,
+  0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d,
+  0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb,
+  0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e,
+  0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd,
+  0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c,
+  0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d,
+  0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89,
+  0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08,
+  0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7,
+  0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc,
+  0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e,
+  0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f,
+  0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26,
+  0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b,
+  0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a,
+  0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52,
+  0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a,
+  0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd,
+  0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99,
+  0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98,
+  0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42,
+  0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd,
+  0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05,
+  0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca,
+  0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37,
+  0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c,
+  0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9,
+  0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e,
+  0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a,
+  0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d,
+  0x1b, 0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9,
+  0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb,
+  0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1,
+  0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd,
+  0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35,
+  0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1,
+  0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e,
+  0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc,
+  0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b,
+  0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70,
+  0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44,
+  0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc,
+  0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac,
+  0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34,
+  0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b,
+  0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb,
+  0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10,
+  0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1,
+  0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76,
+  0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd,
+  0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15,
+  0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b,
+  0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47,
+  0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd,
+  0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9,
+  0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50,
+  0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54,
+  0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd,
+  0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84,
+  0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e,
+  0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c,
+  0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c,
+  0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3,
+  0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9,
+  0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18,
+  0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd,
+  0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f,
+  0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01,
+  0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81,
+  0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d,
+  0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5,
+  0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4,
+  0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24,
+  0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc,
+  0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6,
+  0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b,
+  0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57,
+  0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd,
+  0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f,
+  0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85,
+  0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73,
+  0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc,
+  0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe,
+  0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0,
+  0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb,
+  0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd,
+  0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a,
+  0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1,
+  0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc,
+  0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d,
+  0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88,
+  0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37,
+  0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0,
+  0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c,
+  0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f,
+  0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70,
+  0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4,
+  0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd,
+  0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b,
+  0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84,
+  0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b,
+  0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b,
+  0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c,
+  0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69,
+  0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3,
+  0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c,
+  0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d,
+  0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70,
+  0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26,
+  0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b,
+  0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c,
+  0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa,
+  0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f,
+  0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d,
+  0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87,
+  0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1,
+  0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8,
+  0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc,
+  0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 0xbc, 0xb1,
+  0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4,
+  0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65,
+  0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc,
+  0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07,
+  0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9,
+  0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b,
+  0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd,
+  0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f,
+  0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69,
+  0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4,
+  0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d,
+  0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8,
+  0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae,
+  0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad,
+  0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d,
+  0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a,
+  0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c,
+  0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5,
+  0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d,
+  0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d,
+  0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d,
+  0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24,
+  0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d,
+  0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca,
+  0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44,
+  0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd,
+  0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d,
+  0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50,
+  0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05,
+  0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0,
+  0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c,
+  0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa,
+  0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d,
+  0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96,
+  0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c,
+  0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e,
+  0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55,
+  0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93,
+  0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d,
+  0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb,
+  0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab,
+  0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6,
+  0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc,
+  0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b,
+  0x90, 0x1a, 0xbb, 0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d,
+  0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83,
+  0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c,
+  0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03,
+  0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b,
+  0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39,
+  0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd,
+  0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15,
+  0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46,
+  0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00,
+  0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd,
+  0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64,
+  0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11,
+  0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a,
+  0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d,
+  0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09,
+  0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f,
+  0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e,
+  0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c,
+  0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47,
+  0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88,
+  0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9,
+  0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd,
+  0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84,
+  0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19,
+  0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8,
+  0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc,
+  0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d,
+  0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6,
+  0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e,
+  0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc,
+  0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f,
+  0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93,
+  0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25,
+  0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d,
+  0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2,
+  0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef,
+  0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8,
+  0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd,
+  0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d,
+  0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8,
+  0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82,
+  0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c,
+  0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54,
+  0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 0x95, 0x8d, 0x3d, 0xd1, 0x4f,
+  0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06,
+  0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc,
+  0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a,
+  0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44,
+  0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d,
+  0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d,
+  0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e,
+  0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d,
+  0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57,
+  0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d,
+  0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6,
+  0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2,
+  0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84,
+  0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd,
+  0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c,
+  0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76,
+  0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2,
+  0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd,
+  0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74,
+  0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a,
+  0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b,
+  0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd,
+  0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8,
+  0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4,
+  0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1,
+  0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd,
+  0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60,
+  0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46,
+  0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e,
+  0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc,
+  0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c,
+  0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26,
+  0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f,
+  0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc,
+  0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7,
+  0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d,
+  0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85,
+  0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb,
+  0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a,
+  0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0,
+  0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b,
+  0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d,
+  0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24,
+  0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66,
+  0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22,
+  0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9,
+  0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71,
+  0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5,
+  0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46,
+  0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c,
+  0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a,
+  0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70,
+  0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6,
+  0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd,
+  0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a,
+  0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc,
+  0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c,
+  0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c,
+  0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2,
+  0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a,
+  0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a,
+  0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c,
+  0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69,
+  0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a,
+  0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b,
+  0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc,
+  0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5,
+  0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6,
+  0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67,
+  0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d,
+  0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03,
+  0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a,
+  0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92,
+  0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd,
+  0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60,
+  0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78,
+  0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43,
+  0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc,
+  0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50,
+  0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46,
+  0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05,
+  0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d,
+  0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef,
+  0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2,
+  0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77,
+  0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc,
+  0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2,
+  0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36,
+  0x77, 0x3d, 0xd9, 0xb5, 0x27, 0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74,
+  0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d,
+  0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce,
+  0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b,
+  0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d,
+  0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc,
+  0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8,
+  0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7,
+  0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14,
+  0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c,
+  0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11,
+  0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84,
+  0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71,
+  0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d,
+  0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08,
+  0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92,
+  0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a,
+  0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c,
+  0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7,
+  0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7,
+  0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a,
+  0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c,
+  0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1,
+  0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d,
+  0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a,
+  0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd,
+  0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0,
+  0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe,
+  0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44,
+  0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd,
+  0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19,
+  0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7,
+  0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec,
+  0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc,
+  0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4,
+  0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3,
+  0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e,
+  0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd,
+  0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e,
+  0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e,
+  0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38,
+  0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd,
+  0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6,
+  0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d,
+  0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 0x0a, 0xad, 0x8d,
+  0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd,
+  0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab,
+  0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50,
+  0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30,
+  0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd,
+  0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9,
+  0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab,
+  0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b,
+  0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc,
+  0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8,
+  0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c,
+  0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca,
+  0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d,
+  0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c,
+  0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe,
+  0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45,
+  0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d,
+  0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda,
+  0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa,
+  0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90,
+  0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b,
+  0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72,
+  0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0,
+  0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee,
+  0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c,
+  0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77,
+  0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22,
+  0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62,
+  0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd,
+  0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81,
+  0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5,
+  0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94,
+  0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d,
+  0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09,
+  0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9,
+  0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24,
+  0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc,
+  0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4,
+  0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16,
+  0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86,
+  0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc,
+  0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a,
+  0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde,
+  0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb,
+  0x3c, 0x51, 0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd,
+  0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10,
+  0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c,
+  0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe,
+  0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d,
+  0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9,
+  0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4,
+  0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46,
+  0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d,
+  0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b,
+  0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01,
+  0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17,
+  0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd,
+  0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41,
+  0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03,
+  0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4,
+  0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd,
+  0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47,
+  0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe,
+  0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90,
+  0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc,
+  0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d,
+  0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c,
+  0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05,
+  0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b,
+  0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd,
+  0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1,
+  0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8,
+  0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c,
+  0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13,
+  0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e,
+  0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18,
+  0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc,
+  0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77,
+  0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34,
+  0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b,
+  0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d,
+  0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24,
+  0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63,
+  0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31,
+  0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d,
+  0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb,
+  0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f,
+  0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36,
+  0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d,
+  0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4,
+  0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78,
+  0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3,
+  0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd,
+  0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d,
+  0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87,
+  0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80,
+  0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd,
+  0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7,
+  0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d,
+  0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89,
+  0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc,
+  0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61,
+  0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26,
+  0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77,
+  0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd,
+  0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80,
+  0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f,
+  0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58,
+  0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c,
+  0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf,
+  0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf,
+  0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c,
+  0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc,
+  0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3,
+  0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6,
+  0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac,
+  0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd,
+  0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3,
+  0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39,
+  0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43,
+  0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d,
+  0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13,
+  0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87,
+  0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6,
+  0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d,
+  0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c,
+  0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04,
+  0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07,
+  0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd,
+  0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa,
+  0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4,
+  0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34,
+  0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 0x3d,
+  0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1,
+  0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55,
+  0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86,
+  0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d,
+  0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62,
+  0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5,
+  0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40,
+  0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc,
+  0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66,
+  0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda,
+  0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c,
+  0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c,
+  0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45,
+  0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55,
+  0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13,
+  0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd,
+  0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4,
+  0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93,
+  0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34,
+  0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d,
+  0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5,
+  0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6,
+  0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b,
+  0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc,
+  0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13,
+  0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50,
+  0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00,
+  0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00,
+  0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0,
+  0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20,
+  0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5,
+  0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+  0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4,
+  0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59,
+  0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65,
+  0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd,
+  0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf,
+  0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00,
+  0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a,
+  0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a,
+  0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1,
+  0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf,
+  0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20,
+  0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c,
+  0x2e, 0x73, 0x1a, 0xbc, 0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9,
+  0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16,
+  0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23,
+  0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c,
+  0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e,
+  0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a,
+  0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30,
+  0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d,
+  0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00,
+  0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00,
+  0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c,
+  0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d,
+  0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64,
+  0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e,
+  0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00,
+  0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+  0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00,
+  0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00,
+  0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00,
+  0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c,
+  0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe,
+  0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00,
+  0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff,
+  0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13,
+  0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+  0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00,
+  0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+  0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00,
+  0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a,
+  0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+  0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00,
+  0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02,
+  0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+  0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0,
+  0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00,
+  0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+  0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+  0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e,
+  0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff,
+  0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03,
+  0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c,
+  0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00,
+  0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+  0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01,
+  0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00,
+  0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00,
+  0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00,
+  0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08,
+  0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+  0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+  0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14,
+  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00,
+  0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00,
+  0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00,
+  0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c,
+  0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04,
+  0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00,
+  0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00,
+  0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14,
+  0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00,
+  0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff,
+  0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00,
+  0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69,
+  0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00,
+  0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+  0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02,
+  0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8,
+  0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+  0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+  0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65,
+  0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64,
+  0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75,
+  0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+  0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42,
+  0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00,
+  0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00,
+  0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
+  0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80,
+  0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65,
+  0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66,
+  0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73,
+  0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+  0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00,
+  0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00,
+  0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+  0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88,
+  0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+  0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70,
+  0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f,
+  0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01,
+  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00,
+  0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00,
+  0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+  0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00,
+  0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+  0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33,
+  0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+  0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64,
+  0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b,
+  0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+  0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+  0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+  0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24,
+  0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00,
+  0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00,
+  0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff,
+  0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+  0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c,
+  0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78,
+  0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34,
+  0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00,
+  0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73,
+  0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f,
+  0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65,
+  0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c,
+  0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+  0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71,
+  0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f,
+  0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76,
+  0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+  0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+  0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+  0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04,
+  0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00,
+  0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00,
+  0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+  0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67,
+  0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f,
+  0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00,
+  0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff,
+  0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d,
+  0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff,
+  0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+  0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75,
+  0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e,
+  0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b,
+  0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+  0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42,
+  0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+  0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32,
+  0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b,
+  0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69,
+  0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e,
+  0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd,
+  0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00,
+  0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00,
+  0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+  0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+  0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+  0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b,
+  0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 0x1f, 0x00,
+  0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+  0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+  0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10,
+  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00,
+  0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff,
+  0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+  0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32,
+  0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10,
+  0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd,
+  0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+  0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+  0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02,
+  0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+  0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+  0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00,
+  0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+  0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61,
+  0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00,
+  0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14,
+  0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00,
+  0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+  0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f,
+  0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00,
+  0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff,
+  0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+  0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00,
+  0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff,
+  0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+  0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+  0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff,
+  0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+  0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00,
+  0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+  0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
+  0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33,
+  0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c,
+  0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+  0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+  0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00,
+  0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14,
+  0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00,
+  0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22,
+  0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65,
+  0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+  0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40,
+  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00,
+  0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+  0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+  0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4,
+  0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00,
+  0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00,
+  0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
diff --git a/media/libaom/src/av1/encoder/dwt.c b/media/libaom/src/av1/encoder/dwt.c
index 04088b25f9..5dfbcb677b 100644
--- a/media/libaom/src/av1/encoder/dwt.c
+++ b/media/libaom/src/av1/encoder/dwt.c
@@ -70,14 +70,14 @@ static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
 }
 
 static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
-                                          uint8_t *x, int pitch_x,
+                                          const uint8_t *x, int pitch_x,
                                           tran_low_t *c, int pitch_c,
                                           int dwt_scale_bits, int hbd) {
   int lv, i, j, nh, nw, hh = height, hw = width;
   tran_low_t buffer[2 * DWT_MAX_LENGTH];
 
   if (hbd) {
-    uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
     for (i = 0; i < height; i++) {
       for (j = 0; j < width; j++) {
         c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
@@ -109,12 +109,12 @@ static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
   }
 }
 
-void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
-                               int hbd) {
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+                               int stride, int hbd) {
   dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
 }
 
-int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+int av1_haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
   int acsad = 0;
 
   for (int r = 0; r < bh; ++r)
@@ -147,9 +147,23 @@ uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
   return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
 }
 
-int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+                                       int hbd) {
   tran_low_t output[64];
 
   av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
   return av1_haar_ac_sad(output, 8, 8, 8);
 }
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols) {
+  int64_t wavelet_energy = 0;
+  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+      wavelet_energy += haar_ac_sad_8x8_uint8_input(
+          input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+    }
+  }
+  return wavelet_energy;
+}
diff --git a/media/libaom/src/av1/encoder/dwt.h b/media/libaom/src/av1/encoder/dwt.h
index 37306c6a5f..443b6bc12c 100644
--- a/media/libaom/src/av1/encoder/dwt.h
+++ b/media/libaom/src/av1/encoder/dwt.h
@@ -17,9 +17,11 @@
 
 #define DWT_MAX_LENGTH 64
 
-void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
-void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
-                               int hbd);
-int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+                               int stride, int hbd);
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols);
 
 #endif  // AOM_AV1_ENCODER_DWT_H_
diff --git a/media/libaom/src/av1/encoder/enc_enums.h b/media/libaom/src/av1/encoder/enc_enums.h
index 5a06514838..20cefa16a5 100644
--- a/media/libaom/src/av1/encoder/enc_enums.h
+++ b/media/libaom/src/av1/encoder/enc_enums.h
@@ -68,132 +68,132 @@ enum {
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
 
+  THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_GLOBAL_GLOBALLB,
+
   THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEWLA,
   THR_COMP_NEW_NEARESTLA,
   THR_COMP_NEAREST_NEWLA,
   THR_COMP_NEW_NEARLA,
   THR_COMP_NEAR_NEWLA,
-  THR_COMP_NEW_NEWLA,
   THR_COMP_GLOBAL_GLOBALLA,
 
   THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEWL2A,
   THR_COMP_NEW_NEARESTL2A,
   THR_COMP_NEAREST_NEWL2A,
   THR_COMP_NEW_NEARL2A,
   THR_COMP_NEAR_NEWL2A,
-  THR_COMP_NEW_NEWL2A,
   THR_COMP_GLOBAL_GLOBALL2A,
 
   THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEWL3A,
   THR_COMP_NEW_NEARESTL3A,
   THR_COMP_NEAREST_NEWL3A,
   THR_COMP_NEW_NEARL3A,
   THR_COMP_NEAR_NEWL3A,
-  THR_COMP_NEW_NEWL3A,
   THR_COMP_GLOBAL_GLOBALL3A,
 
   THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEWGA,
   THR_COMP_NEW_NEARESTGA,
   THR_COMP_NEAREST_NEWGA,
   THR_COMP_NEW_NEARGA,
   THR_COMP_NEAR_NEWGA,
-  THR_COMP_NEW_NEWGA,
   THR_COMP_GLOBAL_GLOBALGA,
 
-  THR_COMP_NEAR_NEARLB,
-  THR_COMP_NEW_NEARESTLB,
-  THR_COMP_NEAREST_NEWLB,
-  THR_COMP_NEW_NEARLB,
-  THR_COMP_NEAR_NEWLB,
-  THR_COMP_NEW_NEWLB,
-  THR_COMP_GLOBAL_GLOBALLB,
-
   THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEWL2B,
   THR_COMP_NEW_NEARESTL2B,
   THR_COMP_NEAREST_NEWL2B,
   THR_COMP_NEW_NEARL2B,
   THR_COMP_NEAR_NEWL2B,
-  THR_COMP_NEW_NEWL2B,
   THR_COMP_GLOBAL_GLOBALL2B,
 
   THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEWL3B,
   THR_COMP_NEW_NEARESTL3B,
   THR_COMP_NEAREST_NEWL3B,
   THR_COMP_NEW_NEARL3B,
   THR_COMP_NEAR_NEWL3B,
-  THR_COMP_NEW_NEWL3B,
   THR_COMP_GLOBAL_GLOBALL3B,
 
   THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEWGB,
   THR_COMP_NEW_NEARESTGB,
   THR_COMP_NEAREST_NEWGB,
   THR_COMP_NEW_NEARGB,
   THR_COMP_NEAR_NEWGB,
-  THR_COMP_NEW_NEWGB,
   THR_COMP_GLOBAL_GLOBALGB,
 
   THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEWLA2,
   THR_COMP_NEW_NEARESTLA2,
   THR_COMP_NEAREST_NEWLA2,
   THR_COMP_NEW_NEARLA2,
   THR_COMP_NEAR_NEWLA2,
-  THR_COMP_NEW_NEWLA2,
   THR_COMP_GLOBAL_GLOBALLA2,
 
   THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEWL2A2,
   THR_COMP_NEW_NEARESTL2A2,
   THR_COMP_NEAREST_NEWL2A2,
   THR_COMP_NEW_NEARL2A2,
   THR_COMP_NEAR_NEWL2A2,
-  THR_COMP_NEW_NEWL2A2,
   THR_COMP_GLOBAL_GLOBALL2A2,
 
   THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEWL3A2,
   THR_COMP_NEW_NEARESTL3A2,
   THR_COMP_NEAREST_NEWL3A2,
   THR_COMP_NEW_NEARL3A2,
   THR_COMP_NEAR_NEWL3A2,
-  THR_COMP_NEW_NEWL3A2,
   THR_COMP_GLOBAL_GLOBALL3A2,
 
   THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEWGA2,
   THR_COMP_NEW_NEARESTGA2,
   THR_COMP_NEAREST_NEWGA2,
   THR_COMP_NEW_NEARGA2,
   THR_COMP_NEAR_NEWGA2,
-  THR_COMP_NEW_NEWGA2,
   THR_COMP_GLOBAL_GLOBALGA2,
 
   THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEWLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
   THR_COMP_NEW_NEARLL2,
   THR_COMP_NEAR_NEWLL2,
-  THR_COMP_NEW_NEWLL2,
   THR_COMP_GLOBAL_GLOBALLL2,
 
   THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEWLL3,
   THR_COMP_NEW_NEARESTLL3,
   THR_COMP_NEAREST_NEWLL3,
   THR_COMP_NEW_NEARLL3,
   THR_COMP_NEAR_NEWLL3,
-  THR_COMP_NEW_NEWLL3,
   THR_COMP_GLOBAL_GLOBALLL3,
 
   THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEWLG,
   THR_COMP_NEW_NEARESTLG,
   THR_COMP_NEAREST_NEWLG,
   THR_COMP_NEW_NEARLG,
   THR_COMP_NEAR_NEWLG,
-  THR_COMP_NEW_NEWLG,
   THR_COMP_GLOBAL_GLOBALLG,
 
   THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
   THR_COMP_NEW_NEARESTBA,
   THR_COMP_NEAREST_NEWBA,
   THR_COMP_NEW_NEARBA,
   THR_COMP_NEAR_NEWBA,
-  THR_COMP_NEW_NEWBA,
   THR_COMP_GLOBAL_GLOBALBA,
 
   THR_DC,
@@ -216,6 +216,8 @@ enum {
   NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
   THR_MODE_START = THR_NEARESTMV,
   THR_MODE_END = MAX_MODES,
+  THR_INTER_MODE_START = THR_MODE_START,
+  THR_INTER_MODE_END = THR_DC,
   THR_INVALID = 255
 } UENUM1BYTE(THR_MODES);
 
@@ -248,6 +250,17 @@ enum {
   MAX_REFS
 } UENUM1BYTE(THR_MODES_SUB8X8);
 
+enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  USE_FULL_RD = 0,
+  USE_FAST_RD,
+  USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encode_strategy.c b/media/libaom/src/av1/encoder/encode_strategy.c
index 8eb73d8d3a..73da66a54f 100644
--- a/media/libaom/src/av1/encoder/encode_strategy.c
+++ b/media/libaom/src/av1/encoder/encode_strategy.c
@@ -11,14 +11,13 @@
 
 #include <stdint.h>
 
+#include "av1/common/blockd.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #if CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_MISMATCH_DEBUG
@@ -29,86 +28,101 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
 #include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif  // CONFIG_THREE_PASS
 #include "av1/encoder/tpl_model.h"
 
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
 #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
 
+static INLINE void set_refresh_frame_flags(
+    RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+    bool refresh_arf) {
+  refresh_frame->golden_frame = refresh_gf;
+  refresh_frame->bwd_ref_frame = refresh_bwdref;
+  refresh_frame->alt_ref_frame = refresh_arf;
+}
+
 void av1_configure_buffer_updates(AV1_COMP *const cpi,
-                                  EncodeFrameParams *const frame_params,
+                                  RefreshFrameInfo *const refresh_frame,
                                   const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
                                   int force_refresh_all) {
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
-
-  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &cpi->ext_flags.refresh_frame;
   cpi->rc.is_src_frame_alt_ref = 0;
 
   switch (type) {
     case KF_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 1;
-      frame_params->refresh_alt_ref_frame = 1;
+      set_refresh_frame_flags(refresh_frame, true, true, true);
       break;
 
     case LF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       break;
 
     case GF_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame, true, false, false);
       break;
 
     case OVERLAY_UPDATE:
-      frame_params->refresh_golden_frame = 1;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
+      if (refbuf_state == REFBUF_RESET)
+        set_refresh_frame_flags(refresh_frame, true, true, true);
+      else
+        set_refresh_frame_flags(refresh_frame, true, false, false);
 
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case ARF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 1;
+      if (refbuf_state == REFBUF_RESET)
+        set_refresh_frame_flags(refresh_frame, true, true, true);
+      else
+        set_refresh_frame_flags(refresh_frame, false, false, true);
+
       break;
 
     case INTNL_OVERLAY_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 0;
-      frame_params->refresh_alt_ref_frame = 0;
-
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case INTNL_ARF_UPDATE:
-      frame_params->refresh_golden_frame = 0;
-      frame_params->refresh_bwd_ref_frame = 1;
-      frame_params->refresh_alt_ref_frame = 0;
+      set_refresh_frame_flags(refresh_frame, false, true, false);
       break;
 
     default: assert(0); break;
   }
 
-  if (ext_flags->refresh_frame_flags_pending &&
+  if (ext_refresh_frame_flags->update_pending &&
       (!is_stat_generation_stage(cpi))) {
-    frame_params->refresh_golden_frame = ext_flags->refresh_golden_frame;
-    frame_params->refresh_alt_ref_frame = ext_flags->refresh_alt_ref_frame;
-    frame_params->refresh_bwd_ref_frame = ext_flags->refresh_bwd_ref_frame;
-  }
-
-  if (force_refresh_all) {
-    frame_params->refresh_golden_frame = 1;
-    frame_params->refresh_bwd_ref_frame = 1;
-    frame_params->refresh_alt_ref_frame = 1;
-  }
+    set_refresh_frame_flags(refresh_frame,
+                            ext_refresh_frame_flags->golden_frame,
+                            ext_refresh_frame_flags->bwd_ref_frame,
+                            ext_refresh_frame_flags->alt_ref_frame);
+    GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    if (ext_refresh_frame_flags->golden_frame)
+      gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE;
+    if (ext_refresh_frame_flags->alt_ref_frame)
+      gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE;
+    if (ext_refresh_frame_flags->bwd_ref_frame)
+      gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE;
+  }
+
+  if (force_refresh_all)
+    set_refresh_frame_flags(refresh_frame, true, true, true);
 }
 
 static void set_additional_frame_flags(const AV1_COMMON *const cm,
@@ -124,64 +138,6 @@ static void set_additional_frame_flags(const AV1_COMMON *const cm,
   }
 }
 
-static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
-  if (cpi->common.show_frame) {
-    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-        cpi->common.current_frame.frame_type == KEY_FRAME) {
-      // If this is a show_existing_frame with a source other than altref,
-      // or if it is not a displayed forward keyframe, the keyframe update
-      // counters were incremented when it was originally encoded.
-      cpi->rc.frames_since_key++;
-      cpi->rc.frames_to_key--;
-    }
-  }
-}
-
-static INLINE int is_frame_droppable(const SVC *const svc,
-                                     const ExternalFlags *const ext_flags) {
-  // Droppable frame is only used by external refresh flags. VoD setting won't
-  // trigger its use case.
-  if (svc->external_ref_frame_config)
-    return svc->non_reference_frame;
-  else if (ext_flags->refresh_frame_flags_pending)
-    return !(ext_flags->refresh_alt_ref_frame ||
-             ext_flags->refresh_alt2_ref_frame ||
-             ext_flags->refresh_bwd_ref_frame ||
-             ext_flags->refresh_golden_frame || ext_flags->refresh_last_frame);
-  else
-    return 0;
-}
-
-static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
-  // TODO(weitinglin): Updating this counter for is_frame_droppable
-  // is a work-around to handle the condition when a frame is drop.
-  // We should fix the cpi->common.show_frame flag
-  // instead of checking the other condition to update the counter properly.
-  if (cpi->common.show_frame ||
-      is_frame_droppable(&cpi->svc, &cpi->ext_flags)) {
-    // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
-  }
-}
-
-static INLINE void update_gf_group_index(AV1_COMP *cpi) {
-  // Increment the gf group index ready for the next frame. If this is
-  // a show_existing_frame with a source other than altref, or if it is not
-  // a displayed forward keyframe, the index was incremented when it was
-  // originally encoded.
-  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
-      cpi->common.current_frame.frame_type == KEY_FRAME) {
-    ++cpi->gf_group.index;
-  }
-}
-
-static void update_rc_counts(AV1_COMP *cpi) {
-  update_keyframe_counters(cpi);
-  update_frames_till_gf_update(cpi);
-  update_gf_group_index(cpi);
-}
-
 static void set_ext_overrides(AV1_COMMON *const cm,
                               EncodeFrameParams *const frame_params,
                               ExternalFlags *const ext_flags) {
@@ -209,49 +165,50 @@ static void set_ext_overrides(AV1_COMMON *const cm,
   frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
 }
 
-static int get_current_frame_ref_type(
-    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
-  // We choose the reference "type" of this frame from the flags which indicate
-  // which reference frames will be refreshed by it.  More than one  of these
-  // flags may be set, so the order here implies an order of precedence. This is
-  // just used to choose the primary_ref_frame (as the most recent reference
-  // buffer of the same reference-type as the current frame)
-
-  (void)frame_params;
-  // TODO(jingning): This table should be a lot simpler with the new
-  // ARF system in place. Keep frame_params for the time being as we are
-  // still evaluating a few design options.
-  switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) {
-    case 0: return 0;
-    case 1: return 1;
-    case MAX_ARF_LAYERS:
-    case MAX_ARF_LAYERS + 1: return 4;
-    default: return 7;
-  }
-}
-
 static int choose_primary_ref_frame(
-    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+    AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
   const AV1_COMMON *const cm = &cpi->common;
 
   const int intra_only = frame_params->frame_type == KEY_FRAME ||
                          frame_params->frame_type == INTRA_ONLY_FRAME;
-  if (intra_only || frame_params->error_resilient_mode || cpi->use_svc ||
+  if (intra_only || frame_params->error_resilient_mode ||
       cpi->ext_flags.use_primary_ref_none) {
     return PRIMARY_REF_NONE;
   }
 
   // In large scale case, always use Last frame's frame contexts.
   // Note(yunqing): In other cases, primary_ref_frame is chosen based on
-  // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls
+  // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
   // frame bit allocation.
   if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
 
+  if (cpi->ppi->use_svc) return av1_svc_primary_ref_frame(cpi);
+
   // Find the most recent reference frame with the same reference type as the
   // current frame
-  const int current_ref_type = get_current_frame_ref_type(cpi, frame_params);
-  int wanted_fb = cpi->fb_of_context_type[current_ref_type];
-
+  const int current_ref_type = get_current_frame_ref_type(cpi);
+  int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type];
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2 && \
+    CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index];
+      // Book keep wanted_fb of frame_parallel_level 1 frame in an FP2 set.
+      if (frame_level == 1) {
+        cpi->wanted_fb = wanted_fb;
+      }
+      // Use the wanted_fb of level 1 frame in an FP2 for a level 2 frame in the
+      // set.
+      if (frame_level == 2 &&
+          gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) {
+        assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1);
+        wanted_fb = cpi->wanted_fb;
+      }
+    }
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2 &&
+        // CONFIG_FPMT_TEST
   int primary_ref_frame = PRIMARY_REF_NONE;
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
@@ -262,76 +219,27 @@ static int choose_primary_ref_frame(
   return primary_ref_frame;
 }
 
-static void update_fb_of_context_type(
-    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
-    int *const fb_of_context_type) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int current_frame_ref_type =
-      get_current_frame_ref_type(cpi, frame_params);
-
-  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->ext_flags.use_primary_ref_none) {
-    for (int i = 0; i < REF_FRAMES; i++) {
-      fb_of_context_type[i] = -1;
-    }
-    fb_of_context_type[current_frame_ref_type] =
-        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
-                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
-  }
-
-  if (!encode_show_existing_frame(cm)) {
-    // Refresh fb_of_context_type[]: see encoder.h for explanation
-    if (cm->current_frame.frame_type == KEY_FRAME) {
-      // All ref frames are refreshed, pick one that will live long enough
-      fb_of_context_type[current_frame_ref_type] = 0;
-    } else {
-      // If more than one frame is refreshed, it doesn't matter which one we
-      // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
-
-      for (int i = 0; i < REF_FRAMES; i++) {
-        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
-          fb_of_context_type[current_frame_ref_type] = i;
-          break;
-        }
-      }
-    }
-  }
-}
-
-static int get_order_offset(const GF_GROUP *const gf_group,
-                            const EncodeFrameParams *const frame_params) {
-  // shown frame by definition has order offset 0
-  // show_existing_frame ignores order_offset and simply takes the order_hint
-  // from the reference frame being shown.
-  if (frame_params->show_frame || frame_params->show_existing_frame) return 0;
-
-  const int arf_offset =
-      AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]);
-  return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset);
-}
-
 static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
   TimeStamps *time_stamps = &cpi->time_stamps;
   int64_t this_duration;
   int step = 0;
 
   // Clear down mmx registers
-  aom_clear_system_state();
 
-  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+  if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) {
     cpi->framerate = cpi->svc.base_framerate;
     av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
     return;
   }
 
-  if (ts_start == time_stamps->first_ever) {
+  if (ts_start == time_stamps->first_ts_start) {
     this_duration = ts_end - ts_start;
     step = 1;
   } else {
     int64_t last_duration =
-        time_stamps->prev_end_seen - time_stamps->prev_start_seen;
+        time_stamps->prev_ts_end - time_stamps->prev_ts_start;
 
-    this_duration = ts_end - time_stamps->prev_end_seen;
+    this_duration = ts_end - time_stamps->prev_ts_end;
 
     // do a step update if the duration changes by 10%
     if (last_duration)
@@ -340,80 +248,30 @@ static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
 
   if (this_duration) {
     if (step) {
-      av1_new_framerate(cpi, 10000000.0 / this_duration);
+      cpi->new_framerate = 10000000.0 / this_duration;
+      av1_new_framerate(cpi, cpi->new_framerate);
     } else {
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
       const double interval =
-          AOMMIN((double)(ts_end - time_stamps->first_ever), 10000000.0);
+          AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0);
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
-
-      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+      cpi->new_framerate = (10000000.0 / avg_duration);
+      // For parallel frames update cpi->framerate with new_framerate
+      // during av1_post_encode_updates()
+      double framerate =
+          (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+              ? cpi->framerate
+              : cpi->new_framerate;
+      av1_new_framerate(cpi, framerate);
     }
   }
-  time_stamps->prev_start_seen = ts_start;
-  time_stamps->prev_end_seen = ts_end;
-}
 
-// If this is an alt-ref, returns the offset of the source frame used
-// as the arf midpoint. Otherwise, returns 0.
-static int get_arf_src_index(GF_GROUP *gf_group, int pass) {
-  int arf_src_index = 0;
-  if (pass != 1) arf_src_index = gf_group->arf_src_offset[gf_group->index];
-  return arf_src_index;
-}
-
-// Called if this frame is an ARF or ARF2. Also handles forward-keyframes
-// For an ARF set arf2=0, for ARF2 set arf2=1
-// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
-// the correct post-filter buffer can be used.
-static struct lookahead_entry *setup_arf_frame(
-    AV1_COMP *const cpi, const int arf_src_index, int *code_arf,
-    EncodeFrameParams *const frame_params, int *show_existing_alt_ref) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-#if !CONFIG_REALTIME_ONLY
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-#endif
-
-  assert(arf_src_index <= rc->frames_to_key);
-  *code_arf = 0;
-
-  struct lookahead_entry *source =
-      av1_lookahead_peek(cpi->lookahead, arf_src_index, cpi->compressor_stage);
-
-  if (source != NULL) {
-    cm->showable_frame = 1;
-
-    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
-    if (arf_src_index == rc->frames_to_key) {
-      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
-      cpi->no_show_kf = 1;
-    } else {
-#if !CONFIG_REALTIME_ONLY
-      if (oxcf->arnr_max_frames > 0) {
-        // Produce the filtered ARF frame.
-        cm->current_frame.frame_type = INTER_FRAME;
-        FRAME_UPDATE_TYPE frame_update_type =
-            get_frame_update_type(&cpi->gf_group);
-        av1_configure_buffer_updates(cpi, frame_params, frame_update_type, 0);
-        *code_arf =
-            av1_temporal_filter(cpi, arf_src_index, show_existing_alt_ref);
-        if (*code_arf) {
-          aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-        }
-      }
-#else
-      (void)show_existing_alt_ref;
-#endif
-    }
-    frame_params->show_frame = 0;
-  }
-  rc->source_alt_ref_pending = 0;
-  return source;
+  time_stamps->prev_ts_start = ts_start;
+  time_stamps->prev_ts_end = ts_end;
 }
 
 // Determine whether there is a forced keyframe pending in the lookahead buffer
@@ -428,7 +286,7 @@ int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
       // so there isn't a forced key-frame pending.
       return -1;
     } else if (e->flags == AOM_EFLAG_FORCE_KF) {
-      return (i + 1);
+      return i;
     } else {
       continue;
     }
@@ -441,38 +299,81 @@ int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
 // temporal_filtered, flush, and frame_update_type are outputs.
 // Return the frame source, or NULL if we couldn't find one
 static struct lookahead_entry *choose_frame_source(
-    AV1_COMP *const cpi, int *const code_arf, int *const flush,
-    struct lookahead_entry **last_source, EncodeFrameParams *const frame_params,
-    int *show_existing_alt_ref) {
+    AV1_COMP *const cpi, int *const flush, int *pop_lookahead,
+    struct lookahead_entry **last_source,
+    EncodeFrameParams *const frame_params) {
   AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   struct lookahead_entry *source = NULL;
-  *code_arf = 0;
 
-  // Should we encode an alt-ref frame.
-  int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
+  // Source index in lookahead buffer.
+  int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+
   // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
-  if (arf_src_index &&
-      (is_forced_keyframe_pending(cpi->lookahead, arf_src_index,
+  if (src_index &&
+      (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index,
                                   cpi->compressor_stage) != -1) &&
-      cpi->oxcf.rc_mode != AOM_Q) {
-    arf_src_index = 0;
+      cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) {
+    src_index = 0;
     *flush = 1;
   }
 
-  if (arf_src_index)
-    source = setup_arf_frame(cpi, arf_src_index, code_arf, frame_params,
-                             show_existing_alt_ref);
+  // If the current frame is arf, then we should not pop from the lookahead
+  // buffer. If the current frame is not arf, then pop it. This assumes the
+  // first frame in the GF group is not arf. May need to change if it is not
+  // true.
+  *pop_lookahead = (src_index == 0);
+  // If this is a key frame and keyframe filtering is enabled with overlay,
+  // then do not pop.
+  if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 &&
+      gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+      !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) {
+    if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz &&
+        (*flush ||
+         cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz ==
+             cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) {
+      *pop_lookahead = 0;
+    }
+  }
+
+  // LAP stage does not have ARFs or forward key-frames,
+  // hence, always pop_lookahead here.
+  if (is_stat_generation_stage(cpi)) {
+    *pop_lookahead = 1;
+    src_index = 0;
+  }
 
-  if (!source) {
+  frame_params->show_frame = *pop_lookahead;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) {
+#else
+  {
+#endif  // CONFIG_FPMT_TEST
+    // Future frame in parallel encode set
+    if (gf_group->src_offset[cpi->gf_frame_index] != 0 &&
+        !is_stat_generation_stage(cpi))
+      src_index = gf_group->src_offset[cpi->gf_frame_index];
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  if (frame_params->show_frame) {
+    // show frame, pop from buffer
     // Get last frame source.
     if (cm->current_frame.frame_number > 0) {
-      *last_source =
-          av1_lookahead_peek(cpi->lookahead, -1, cpi->compressor_stage);
+      *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1,
+                                        cpi->compressor_stage);
     }
     // Read in the source frame.
-    source = av1_lookahead_pop(cpi->lookahead, *flush, cpi->compressor_stage);
-    if (source == NULL) return NULL;
-    frame_params->show_frame = 1;
+    source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+                                cpi->compressor_stage);
+  } else {
+    // no show frames are arf frames
+    source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+                                cpi->compressor_stage);
+    if (source != NULL) {
+      cm->showable_frame = 1;
+    }
   }
   return source;
 }
@@ -485,14 +386,14 @@ static int allow_show_existing(const AV1_COMP *const cpi,
   if (cpi->common.current_frame.frame_number == 0) return 0;
 
   const struct lookahead_entry *lookahead_src =
-      av1_lookahead_peek(cpi->lookahead, 0, cpi->compressor_stage);
+      av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
   if (lookahead_src == NULL) return 1;
 
   const int is_error_resilient =
-      cpi->oxcf.error_resilient_mode ||
+      cpi->oxcf.tool_cfg.error_resilient_mode ||
       (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
-  const int is_s_frame =
-      cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+  const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe ||
+                         (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
   const int is_key_frame =
       (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
   return !(is_error_resilient || is_s_frame) || is_key_frame;
@@ -500,8 +401,10 @@ static int allow_show_existing(const AV1_COMP *const cpi,
 
 // Update frame_flags to tell the encoder's caller what sort of frame was
 // encoded.
-static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) {
-  if (encode_show_existing_frame(&cpi->common)) {
+static void update_frame_flags(const AV1_COMMON *const cm,
+                               const RefreshFrameInfo *const refresh_frame,
+                               unsigned int *frame_flags) {
+  if (encode_show_existing_frame(cm)) {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
@@ -509,25 +412,25 @@ static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) {
     return;
   }
 
-  if (cpi->refresh_golden_frame == 1) {
+  if (refresh_frame->golden_frame) {
     *frame_flags |= FRAMEFLAGS_GOLDEN;
   } else {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
   }
 
-  if (cpi->refresh_alt_ref_frame == 1) {
+  if (refresh_frame->alt_ref_frame) {
     *frame_flags |= FRAMEFLAGS_ALTREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
   }
 
-  if (cpi->refresh_bwd_ref_frame == 1) {
+  if (refresh_frame->bwd_ref_frame) {
     *frame_flags |= FRAMEFLAGS_BWDREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
   }
 
-  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+  if (cm->current_frame.frame_type == KEY_FRAME) {
     *frame_flags |= FRAMEFLAGS_KEY;
   } else {
     *frame_flags &= ~FRAMEFLAGS_KEY;
@@ -587,167 +490,141 @@ static void dump_ref_frame_images(AV1_COMP *cpi) {
 #endif  // DUMP_REF_FRAME_IMAGES == 1
 
 int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
-  int ref_map_index = INVALID_IDX;
+  int ref_map_index;
 
   for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index)
     if ((refresh_frame_flags >> ref_map_index) & 1) break;
 
+  if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX;
   return ref_map_index;
 }
 
-static void update_arf_stack(int ref_map_index,
-                             RefBufferStack *ref_buffer_stack) {
-  if (ref_buffer_stack->arf_stack_size >= 0) {
-    if (ref_buffer_stack->arf_stack[0] == ref_map_index)
-      stack_pop(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size);
-  }
-
-  if (ref_buffer_stack->lst_stack_size) {
-    for (int i = ref_buffer_stack->lst_stack_size - 1; i >= 0; --i) {
-      if (ref_buffer_stack->lst_stack[i] == ref_map_index) {
-        for (int idx = i; idx < ref_buffer_stack->lst_stack_size - 1; ++idx)
-          ref_buffer_stack->lst_stack[idx] =
-              ref_buffer_stack->lst_stack[idx + 1];
-        ref_buffer_stack->lst_stack[ref_buffer_stack->lst_stack_size - 1] =
-            INVALID_IDX;
-        --ref_buffer_stack->lst_stack_size;
+static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) {
+  for (int idx = 0; idx < REF_FRAMES; ++idx)
+    if (ref_map_pairs[idx].disp_order == -1) return idx;
+  return INVALID_IDX;
+}
+
+static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                           int update_arf,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                           GF_GROUP *gf_group, int gf_index,
+                           int enable_refresh_skip,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                           int cur_frame_disp) {
+  int arf_count = 0;
+  int oldest_arf_order = INT32_MAX;
+  int oldest_arf_idx = -1;
+
+  int oldest_frame_order = INT32_MAX;
+  int oldest_idx = -1;
+
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+    if (ref_pair.disp_order == -1) continue;
+    const int frame_order = ref_pair.disp_order;
+    const int reference_frame_level = ref_pair.pyr_level;
+    // Keep future frames and three closest previous frames in output order.
+    if (frame_order > cur_frame_disp - 3) continue;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    if (enable_refresh_skip) {
+      int skip_frame = 0;
+      // Prevent refreshing a frame in gf_group->skip_frame_refresh.
+      for (int i = 0; i < REF_FRAMES; i++) {
+        int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i];
+        if (frame_to_skip == INVALID_IDX) break;
+        if (frame_order == frame_to_skip) {
+          skip_frame = 1;
+          break;
+        }
       }
+      if (skip_frame) continue;
     }
-  }
-
-  if (ref_buffer_stack->gld_stack_size) {
-    for (int i = ref_buffer_stack->gld_stack_size - 1; i >= 0; --i) {
-      if (ref_buffer_stack->gld_stack[i] == ref_map_index) {
-        for (int idx = i; idx < ref_buffer_stack->gld_stack_size - 1; ++idx)
-          ref_buffer_stack->gld_stack[idx] =
-              ref_buffer_stack->gld_stack[idx + 1];
-        ref_buffer_stack->gld_stack[ref_buffer_stack->gld_stack_size - 1] =
-            INVALID_IDX;
-        --ref_buffer_stack->gld_stack_size;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+    // Keep track of the oldest level 1 frame if the current frame is also level
+    // 1.
+    if (reference_frame_level == 1) {
+      // If there are more than 2 level 1 frames in the reference list,
+      // discard the oldest.
+      if (frame_order < oldest_arf_order) {
+        oldest_arf_order = frame_order;
+        oldest_arf_idx = map_idx;
       }
+      arf_count++;
+      continue;
     }
-  }
-}
 
-// Update reference frame stack info.
-void av1_update_ref_frame_map(AV1_COMP *cpi,
-                              FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
-                              RefBufferStack *ref_buffer_stack) {
-  AV1_COMMON *const cm = &cpi->common;
-  // TODO(jingning): Consider the S-frame same as key frame for the
-  // reference frame tracking purpose. The logic might be better
-  // expressed than converting the frame update type.
-  if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME;
-
-  if (is_frame_droppable(&cpi->svc, &cpi->ext_flags)) return;
-
-  switch (frame_update_type) {
-    case KEY_FRAME:
-      if (show_existing_frame)
-        ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
-                                  &ref_buffer_stack->arf_stack_size);
-      stack_reset(ref_buffer_stack->lst_stack,
-                  &ref_buffer_stack->lst_stack_size);
-      stack_reset(ref_buffer_stack->gld_stack,
-                  &ref_buffer_stack->gld_stack_size);
-      stack_reset(ref_buffer_stack->arf_stack,
-                  &ref_buffer_stack->arf_stack_size);
-      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
-                 ref_map_index);
-      break;
-    case GF_UPDATE:
-      update_arf_stack(ref_map_index, ref_buffer_stack);
-      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
-                 ref_map_index);
-      // For nonrd_mode: update LAST as well on GF_UPDATE frame.
-      if (cpi->sf.rt_sf.use_nonrd_pick_mode)
-        stack_push(ref_buffer_stack->lst_stack,
-                   &ref_buffer_stack->lst_stack_size, ref_map_index);
-      break;
-    case LF_UPDATE:
-      update_arf_stack(ref_map_index, ref_buffer_stack);
-      stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size,
-                 ref_map_index);
-      break;
-    case ARF_UPDATE:
-    case INTNL_ARF_UPDATE:
-      update_arf_stack(ref_map_index, ref_buffer_stack);
-      stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size,
-                 ref_map_index);
-      break;
-    case OVERLAY_UPDATE:
-      ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
-                                &ref_buffer_stack->arf_stack_size);
-      stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
-                 ref_map_index);
-      break;
-    case INTNL_OVERLAY_UPDATE:
-      ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
-                                &ref_buffer_stack->arf_stack_size);
-      stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size,
-                 ref_map_index);
-      break;
-    default: assert(0 && "unknown type");
+    // Update the overall oldest reference frame.
+    if (frame_order < oldest_frame_order) {
+      oldest_frame_order = frame_order;
+      oldest_idx = map_idx;
+    }
   }
-  return;
+  if (update_arf && arf_count > 2) return oldest_arf_idx;
+  if (oldest_idx >= 0) return oldest_idx;
+  if (oldest_arf_idx >= 0) return oldest_arf_idx;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (oldest_idx == -1) {
+    assert(arf_count > 2 && enable_refresh_skip);
+    return oldest_arf_idx;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+  assert(0 && "No valid refresh index found");
+  return -1;
 }
 
-static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) {
-  for (int idx = 0; idx < REF_FRAMES; ++idx) {
-    int is_free = 1;
-    for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) {
-      if (ref_buffer_stack->arf_stack[i] == idx) {
-        is_free = 0;
-        break;
-      }
-    }
-
-    for (int i = 0; i < ref_buffer_stack->lst_stack_size; ++i) {
-      if (ref_buffer_stack->lst_stack[i] == idx) {
-        is_free = 0;
-        break;
-      }
-    }
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2
+// Computes the reference refresh index for INTNL_ARF_UPDATE frame.
+int av1_calc_refresh_idx_for_intnl_arf(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+    int gf_index) {
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
 
-    for (int i = 0; i < ref_buffer_stack->gld_stack_size; ++i) {
-      if (ref_buffer_stack->gld_stack[i] == idx) {
-        is_free = 0;
-        break;
-      }
-    }
+  // Search for the open slot to store the current frame.
+  int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
 
-    if (is_free) return idx;
+  // Use a free slot if available.
+  if (free_fb_index != INVALID_IDX) {
+    return free_fb_index;
+  } else {
+    int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+    int refresh_idx =
+        get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index,
+                        enable_refresh_skip, gf_group->display_idx[gf_index]);
+    return refresh_idx;
   }
-  return INVALID_IDX;
 }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2
 
-int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
-                                const EncodeFrameParams *const frame_params,
-                                FRAME_UPDATE_TYPE frame_update_type,
-                                const RefBufferStack *const ref_buffer_stack) {
+int av1_get_refresh_frame_flags(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
   const AV1_COMMON *const cm = &cpi->common;
-  const ExternalFlags *const ext_flags = &cpi->ext_flags;
-  const SVC *const svc = &cpi->svc;
+  const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &cpi->ext_flags.refresh_frame;
+
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->refbuf_state[gf_index] == REFBUF_RESET)
+    return SELECT_ALL_BUF_SLOTS;
+
+  // TODO(jingning): Deprecate the following operations.
   // Switch frames and shown key-frames overwrite all reference slots
-  if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) ||
-      frame_params->frame_type == S_FRAME)
-    return 0xFF;
+  if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS;
 
   // show_existing_frames don't actually send refresh_frame_flags so set the
   // flags to 0 to keep things consistent.
-  if (frame_params->show_existing_frame &&
-      (!frame_params->error_resilient_mode ||
-       frame_params->frame_type == KEY_FRAME)) {
-    return 0;
-  }
+  if (frame_params->show_existing_frame) return 0;
 
-  if (is_frame_droppable(svc, ext_flags)) return 0;
+  const SVC *const svc = &cpi->svc;
+  if (is_frame_droppable(svc, ext_refresh_frame_flags)) return 0;
 
   int refresh_mask = 0;
 
-  if (ext_flags->refresh_frame_flags_pending) {
-    if (svc->external_ref_frame_config) {
+  if (ext_refresh_frame_flags->update_pending) {
+    if (svc->set_ref_frame_config) {
       for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
         int ref_frame_map_idx = svc->ref_idx[i];
         refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
@@ -759,97 +636,62 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
     // order to preserve the behaviour of the flag overrides.
     int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_last_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx;
 
     ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_bwd_ref_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame
+                      << ref_frame_map_idx;
 
     ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME);
     if (ref_frame_map_idx != INVALID_IDX)
-      refresh_mask |= ext_flags->refresh_alt2_ref_frame << ref_frame_map_idx;
+      refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame
+                      << ref_frame_map_idx;
 
     if (frame_update_type == OVERLAY_UPDATE) {
       ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->golden_frame
+                        << ref_frame_map_idx;
     } else {
       ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->golden_frame
+                        << ref_frame_map_idx;
 
       ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
       if (ref_frame_map_idx != INVALID_IDX)
-        refresh_mask |= ext_flags->refresh_alt_ref_frame << ref_frame_map_idx;
+        refresh_mask |= ext_refresh_frame_flags->alt_ref_frame
+                        << ref_frame_map_idx;
     }
     return refresh_mask;
   }
 
   // Search for the open slot to store the current frame.
-  int free_fb_index = get_free_ref_map_index(ref_buffer_stack);
-  switch (frame_update_type) {
-    case KF_UPDATE:
-    case GF_UPDATE:
-      if (free_fb_index != INVALID_IDX) {
-        refresh_mask = 1 << free_fb_index;
-      } else {
-        if (ref_buffer_stack->gld_stack_size)
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
-        else
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
-      }
-      break;
-    case LF_UPDATE:
-      if (free_fb_index != INVALID_IDX) {
-        refresh_mask = 1 << free_fb_index;
-      } else {
-        if (ref_buffer_stack->lst_stack_size >= 2)
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
-        else if (ref_buffer_stack->gld_stack_size >= 2)
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
-        else
-          assert(0 && "No ref map index found");
-      }
-      break;
-    case ARF_UPDATE:
-      if (free_fb_index != INVALID_IDX) {
-        refresh_mask = 1 << free_fb_index;
-      } else {
-        if (ref_buffer_stack->gld_stack_size >= 3)
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->gld_stack[ref_buffer_stack->gld_stack_size - 1];
-        else if (ref_buffer_stack->lst_stack_size >= 2)
-          refresh_mask =
-              1 << ref_buffer_stack
-                       ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
-        else
-          assert(0 && "No ref map index found");
-      }
-      break;
-    case INTNL_ARF_UPDATE:
-      if (free_fb_index != INVALID_IDX) {
-        refresh_mask = 1 << free_fb_index;
-      } else {
-        refresh_mask =
-            1 << ref_buffer_stack
-                     ->lst_stack[ref_buffer_stack->lst_stack_size - 1];
-      }
-      break;
-    case OVERLAY_UPDATE: break;
-    case INTNL_OVERLAY_UPDATE: break;
-    default: assert(0); break;
-  }
+  int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
+
+  // No refresh necessary for these frame types.
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE)
+    return refresh_mask;
 
-  return refresh_mask;
+  // If there is an open slot, refresh that one instead of replacing a
+  // reference.
+  if (free_fb_index != INVALID_IDX) {
+    refresh_mask = 1 << free_fb_index;
+    return refresh_mask;
+  }
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+  const int update_arf = frame_update_type == ARF_UPDATE;
+  const int refresh_idx =
+      get_refresh_idx(ref_frame_map_pairs, update_arf,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                      &cpi->ppi->gf_group, gf_index, enable_refresh_skip,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                      cur_disp_order);
+  return 1 << refresh_idx;
 }
 
 #if !CONFIG_REALTIME_ONLY
@@ -859,78 +701,177 @@ void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size);
+  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
 
   set_mi_offsets(&cm->mi_params, xd, 0, 0);
 }
 
-// Apply temporal filtering to key frames and encode the filtered frame.
-// If the current frame is not key frame, this function is identical to
-// av1_encode().
+// Apply temporal filtering to source frames and encode the filtered frame.
+// If the current frame does not require filtering, this function is identical
+// to av1_encode() except that tpl is not performed.
 static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
                               EncodeFrameInput *const frame_input,
                               EncodeFrameParams *const frame_params,
                               EncodeFrameResults *const frame_results) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time);
+#endif
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  const int is_second_arf =
+      av1_gop_is_second_arf(gf_group, cpi->gf_frame_index);
 
   // Decide whether to apply temporal filtering to the source frame.
   int apply_filtering =
-      frame_params->frame_type == KEY_FRAME &&
-      oxcf->enable_keyframe_filtering && !is_stat_generation_stage(cpi) &&
-      !frame_params->show_existing_frame &&
-      cpi->rc.frames_to_key > TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME &&
-      !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0;
+      av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi);
+  if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) {
+    apply_filtering = 0;
+  }
   if (apply_filtering) {
-    const double y_noise_level = av1_estimate_noise_from_single_plane(
-        frame_input->source, 0, cm->seq_params.bit_depth);
-    apply_filtering = y_noise_level > 0;
+    if (frame_params->frame_type == KEY_FRAME) {
+      // TODO(angiebird): Move the noise level check to av1_tf_info_filtering.
+      // Decide whether it is allowed to perform key frame filtering
+      int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering &&
+                               !frame_params->show_existing_frame &&
+                               !is_lossless_requested(&oxcf->rc_cfg);
+      if (allow_kf_filtering) {
+        const double y_noise_level = av1_estimate_noise_from_single_plane(
+            frame_input->source, 0, cm->seq_params->bit_depth,
+            NOISE_ESTIMATION_EDGE_THRESHOLD);
+        apply_filtering = y_noise_level > 0;
+      } else {
+        apply_filtering = 0;
+      }
+      // If we are doing kf filtering, set up a few things.
+      if (apply_filtering) {
+        av1_setup_past_independence(cm);
+      }
+    } else if (is_second_arf) {
+      apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering;
+    }
   }
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
+#endif
   // Save the pointer to the original source image.
-  YV12_BUFFER_CONFIG *source_kf_buffer = frame_input->source;
-
-  // Apply filtering to key frame.
+  YV12_BUFFER_CONFIG *source_buffer = frame_input->source;
+  // apply filtering to frame
   if (apply_filtering) {
-    // Initialization for frame motion estimation.
-    MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-    av1_init_mi_buffers(&cm->mi_params);
-    setup_mi(cpi, frame_input->source);
-    av1_init_macroblockd(cm, xd, NULL);
-    memset(
-        cpi->mbmi_ext_info.frame_base, 0,
-        cpi->mbmi_ext_info.alloc_size * sizeof(*cpi->mbmi_ext_info.frame_base));
-
-    av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
-    av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
-    av1_set_rd_speed_thresholds(cpi);
-    av1_setup_frame_buf_refs(cm);
-    av1_setup_frame_sign_bias(cm);
-    av1_frame_init_quantizer(cpi);
-    av1_setup_past_independence(cm);
-
-    if (!frame_params->show_frame) {
-      int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass);
-      av1_temporal_filter(cpi, -1 * arf_src_index, NULL);
-    } else {
-      av1_temporal_filter(cpi, -1, NULL);
+    int show_existing_alt_ref = 0;
+    FRAME_DIFF frame_diff;
+    int top_index = 0;
+    int bottom_index = 0;
+    const int q_index = av1_rc_pick_q_and_bounds(
+        cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+        cpi->gf_frame_index, &bottom_index, &top_index);
+
+    // TODO(bohanli): figure out why we need frame_type in cm here.
+    cm->current_frame.frame_type = frame_params->frame_type;
+    if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+      YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf(
+          &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff);
+      if (tf_buf != NULL) {
+        frame_input->source = tf_buf;
+        show_existing_alt_ref = av1_check_show_filtered_frame(
+            tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth);
+        if (show_existing_alt_ref) {
+          cpi->common.showable_frame |= 1;
+        }
+      }
+      if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) {
+        cpi->ppi->show_existing_alt_ref = show_existing_alt_ref;
+      }
+    }
+
+    if (is_second_arf) {
+      YV12_BUFFER_CONFIG *tf_buf_second_arf =
+          &cpi->ppi->tf_info.tf_buf_second_arf;
+      // We didn't apply temporal filtering for second arf ahead in
+      // av1_tf_info_filtering().
+      const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+      // Right now, we are still using tf_buf_second_arf due to
+      // implementation complexity.
+      // TODO(angiebird): Reuse tf_info->tf_buf here.
+      av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff,
+                          tf_buf_second_arf);
+      show_existing_alt_ref = av1_check_show_filtered_frame(
+          tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth);
+      if (show_existing_alt_ref) {
+        aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm));
+        frame_input->source = tf_buf_second_arf;
+        aom_copy_metadata_to_frame_buffer(frame_input->source,
+                                          source_buffer->metadata);
+      }
+      // Currently INTNL_ARF_UPDATE only do show_existing.
+      cpi->common.showable_frame |= 1;
     }
-    aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-    // Use the filtered frame for encoding.
-    frame_input->source = &cpi->alt_ref_buffer;
-    // Copy metadata info to alt-ref buffer.
-    aom_remove_metadata_from_frame_buffer(frame_input->source);
-    aom_copy_metadata_to_frame_buffer(frame_input->source,
-                                      source_kf_buffer->metadata);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time);
+#endif
 
-  if (frame_params->frame_type == KEY_FRAME && !is_stat_generation_stage(cpi) &&
-      oxcf->enable_tpl_model && oxcf->lag_in_frames > 0 &&
-      frame_params->show_frame) {
-    av1_tpl_setup_stats(cpi, 0, frame_params, frame_input);
+  int set_mv_params = frame_params->frame_type == KEY_FRAME ||
+                      update_type == ARF_UPDATE || update_type == GF_UPDATE;
+  cm->show_frame = frame_params->show_frame;
+  cm->current_frame.frame_type = frame_params->frame_type;
+  // TODO(bohanli): Why is this? what part of it is necessary?
+  av1_set_frame_size(cpi, cm->superres_upscaled_width,
+                     cm->superres_upscaled_height);
+  if (set_mv_params) av1_set_mv_search_params(cpi);
+
+#if CONFIG_RD_COMMAND
+  if (frame_params->frame_type == KEY_FRAME) {
+    char filepath[] = "rd_command.txt";
+    av1_read_rd_command(filepath, &cpi->rd_command);
+  }
+#endif  // CONFIG_RD_COMMAND
+  if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) {
+    // perform tpl after filtering
+    int allow_tpl =
+        oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model;
+    if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) {
+      allow_tpl = 0;
+    }
+    if (frame_params->frame_type == KEY_FRAME) {
+      // TODO(angiebird): handle disable_filtered_key_tpl properly
+      allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl;
+    } else {
+      // In rare case, it's possible to have non ARF/GF update_type here.
+      // We should set allow_tpl to zero in the situation
+      allow_tpl =
+          allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE);
+    }
+
+    if (allow_tpl) {
+      if (!cpi->skip_tpl_setup_stats) {
+        av1_tpl_preload_rc_estimate(cpi, frame_params);
+        av1_tpl_setup_stats(cpi, 0, frame_params);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+        assert(cpi->gf_frame_index == 0);
+        av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+                                       gf_group, cm->seq_params->bit_depth);
+#endif
+      }
+    } else {
+      av1_init_tpl_stats(&cpi->ppi->tpl_data);
+    }
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+    if (cpi->oxcf.pass == AOM_RC_SECOND_PASS &&
+        cpi->second_pass_log_stream != NULL) {
+      TPL_INFO *tpl_info;
+      AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+      av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data);
+      av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream,
+                         cpi->common.error);
+      aom_free(tpl_info);
+    }
+#endif  // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
   }
 
   if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
@@ -939,106 +880,299 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
   }
 
   // Set frame_input source to true source for psnr calculation.
-  if (apply_filtering) {
-    cpi->source = source_kf_buffer;
-    cpi->unscaled_source = source_kf_buffer;
-  }
-
+  if (apply_filtering && is_psnr_calc_enabled(cpi)) {
+    cpi->source = av1_realloc_and_scale_if_required(
+        cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0,
+        false, true, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
+    cpi->unscaled_source = source_buffer;
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time);
+#endif
   return AOM_CODEC_OK;
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
-static INLINE int find_unused_ref_frame(const int *used_ref_frames,
-                                        const int *stack, int stack_size) {
-  for (int i = 0; i < stack_size; ++i) {
-    const int this_ref = stack[i];
-    int ref_idx = 0;
-    for (ref_idx = 0; ref_idx <= ALTREF_FRAME - LAST_FRAME; ++ref_idx) {
-      if (this_ref == used_ref_frames[ref_idx]) break;
-    }
-
-    // not in use
-    if (ref_idx > ALTREF_FRAME - LAST_FRAME) return this_ref;
+/*!\cond */
+// Struct to keep track of relevant reference frame data.
+typedef struct {
+  int map_idx;
+  int disp_order;
+  int pyr_level;
+  int used;
+} RefBufMapData;
+/*!\endcond */
+
+// Comparison function to sort reference frames in ascending display order.
+static int compare_map_idx_pair_asc(const void *a, const void *b) {
+  if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) {
+    return 0;
+  } else if (((const RefBufMapData *)a)->disp_order >
+             ((const RefBufMapData *)b)->disp_order) {
+    return 1;
+  } else {
+    return -1;
   }
-
-  return INVALID_IDX;
 }
 
-void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) {
-  AV1_COMMON *cm = &cpi->common;
-  int *const remapped_ref_idx = cm->remapped_ref_idx;
-  int *const arf_stack = ref_buffer_stack->arf_stack;
-  int *const lst_stack = ref_buffer_stack->lst_stack;
-  int *const gld_stack = ref_buffer_stack->gld_stack;
-  const int arf_stack_size = ref_buffer_stack->arf_stack_size;
-  const int lst_stack_size = ref_buffer_stack->lst_stack_size;
-  const int gld_stack_size = ref_buffer_stack->gld_stack_size;
-
-  // Initialization
-  for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
-
-  if (arf_stack_size) {
-    remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] = arf_stack[arf_stack_size - 1];
-
-    if (arf_stack_size > 1)
-      remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = arf_stack[0];
-
-    if (arf_stack_size > 2)
-      remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = arf_stack[1];
-  }
-
-  if (lst_stack_size) {
-    remapped_ref_idx[LAST_FRAME - LAST_FRAME] = lst_stack[0];
-
-    if (lst_stack_size > 1)
-      remapped_ref_idx[LAST2_FRAME - LAST_FRAME] = lst_stack[1];
+// Checks to see if a particular reference frame is already in the reference
+// frame map.
+static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) {
+  for (int i = 0; i < n_frames; i++) {
+    if (disp_order == map[i].disp_order) return 1;
   }
+  return 0;
+}
 
-  if (gld_stack_size) {
-    remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = gld_stack[0];
+// Add a reference buffer index to a named reference slot.
+static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx,
+                            int frame) {
+  remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx;
+  ref->used = 1;
+}
 
-    if (gld_stack_size > 1) {
-      if (arf_stack_size <= 1)
-        remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1];
-      else
-        remapped_ref_idx[LAST3_FRAME - LAST_FRAME] = gld_stack[1];
+// Threshold dictating when we are allowed to start considering
+// leaving lowest level frames unmapped.
+#define LOW_LEVEL_FRAMES_TR 5
+
+// Find which reference buffer should be left out of the named mapping.
+// This is because there are 8 reference buffers and only 7 named slots.
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs,
+                             int n_min_level_refs, int min_level,
+                             int cur_frame_disp) {
+  int max_dist = 0;
+  int unmapped_idx = -1;
+  if (n_bufs <= ALTREF_FRAME) return;
+  for (int i = 0; i < n_bufs; i++) {
+    if (buffer_map[i].used) continue;
+    if (buffer_map[i].pyr_level != min_level ||
+        n_min_level_refs >= LOW_LEVEL_FRAMES_TR) {
+      int dist = abs(cur_frame_disp - buffer_map[i].disp_order);
+      if (dist > max_dist) {
+        max_dist = dist;
+        unmapped_idx = i;
+      }
     }
   }
+  assert(unmapped_idx >= 0 && "Unmapped reference not found");
+  buffer_map[unmapped_idx].used = 1;
+}
 
-  for (int idx = ALTREF_FRAME - LAST_FRAME; idx >= 0; --idx) {
-    int ref_map_index = remapped_ref_idx[idx];
-
-    if (ref_map_index != INVALID_IDX) continue;
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                        const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                        int remapped_ref_idx[REF_FRAMES]) {
+  int buf_map_idx = 0;
 
-    ref_map_index =
-        find_unused_ref_frame(remapped_ref_idx, arf_stack, arf_stack_size);
+  // Initialize reference frame mappings.
+  for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
 
-    if (ref_map_index == INVALID_IDX) {
-      ref_map_index =
-          find_unused_ref_frame(remapped_ref_idx, gld_stack, gld_stack_size);
+  RefBufMapData buffer_map[REF_FRAMES];
+  int n_bufs = 0;
+  memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0]));
+  int min_level = MAX_ARF_LAYERS;
+  int max_level = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int skip_ref_unmapping = 0;
+  int is_one_pass_rt = is_one_pass_rt_params(cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+  // Go through current reference buffers and store display order, pyr level,
+  // and map index.
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    // Get reference frame buffer.
+    RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+    if (ref_pair.disp_order == -1) continue;
+    const int frame_order = ref_pair.disp_order;
+    // Avoid duplicates.
+    if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue;
+    const int reference_frame_level = ref_pair.pyr_level;
+
+    // Keep track of the lowest and highest levels that currently exist.
+    if (reference_frame_level < min_level) min_level = reference_frame_level;
+    if (reference_frame_level > max_level) max_level = reference_frame_level;
+
+    buffer_map[n_bufs].map_idx = map_idx;
+    buffer_map[n_bufs].disp_order = frame_order;
+    buffer_map[n_bufs].pyr_level = reference_frame_level;
+    buffer_map[n_bufs].used = 0;
+    n_bufs++;
+  }
+
+  // Sort frames in ascending display order.
+  qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc);
+
+  int n_min_level_refs = 0;
+  int closest_past_ref = -1;
+  int golden_idx = -1;
+  int altref_idx = -1;
+
+  // Find the GOLDEN_FRAME and BWDREF_FRAME.
+  // Also collect various stats about the reference frames for the remaining
+  // mappings.
+  for (int i = n_bufs - 1; i >= 0; i--) {
+    if (buffer_map[i].pyr_level == min_level) {
+      // Keep track of the number of lowest level frames.
+      n_min_level_refs++;
+      if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+          remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for GOLDEN.
+        golden_idx = i;
+      } else if (buffer_map[i].disp_order > cur_frame_disp &&
+                 altref_idx == -1 &&
+                 remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for ALTREF.
+        altref_idx = i;
+      }
+    } else if (buffer_map[i].disp_order == cur_frame_disp) {
+      // Map the BWDREF_FRAME if this is the show_existing_frame.
+      add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
     }
 
-    if (ref_map_index == INVALID_IDX) {
-      ref_map_index =
-          find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size);
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // During parallel encodes of lower layer frames, exclude the first frame
+    // (frame_parallel_level 1) from being used for the reference assignment of
+    // the second frame (frame_parallel_level 2).
+    if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+        gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+        gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+      assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+#if CONFIG_FPMT_TEST
+      is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE)
+                               ? is_parallel_encode
+                               : 0;
+#endif  // CONFIG_FPMT_TEST
+      // If parallel cpis are active, use ref_idx_to_skip, else, use display
+      // index.
+      assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+      assert(IMPLIES(!is_parallel_encode,
+                     gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+      buffer_map[i].used = is_parallel_encode
+                               ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+                               : (buffer_map[i].disp_order ==
+                                  gf_group->skip_frame_as_ref[gf_index]);
+      // In case a ref frame is excluded from being used during assignment,
+      // skip the call to set_unmapped_ref(). Applicable in steady state.
+      if (buffer_map[i].used) skip_ref_unmapping = 1;
     }
-
-    if (ref_map_index != INVALID_IDX)
-      remapped_ref_idx[idx] = ref_map_index;
-    else
-      remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+    // Keep track of where the frames change from being past frames to future
+    // frames.
+    if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+      closest_past_ref = i;
+  }
+
+  // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+  // frames have the same level.
+  if (n_min_level_refs <= n_bufs) {
+    // Map the GOLDEN_FRAME.
+    if (golden_idx > -1)
+      add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+    // Map the ALTREF_FRAME.
+    if (altref_idx > -1)
+      add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+  }
+
+  // Find the buffer to be excluded from the mapping.
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (!skip_ref_unmapping)
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+    set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+                     cur_frame_disp);
+
+  // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+  for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in decreasing ouptut order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MIN;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order > next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+  for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in increasing ouptut order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MAX;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order < next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining past frames.
+  buf_map_idx = closest_past_ref;
+  for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining future frames.
+  buf_map_idx = n_bufs - 1;
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
   }
+
+  // Fill any slots that are empty (should only happen for the first 7 frames).
+  for (int i = 0; i < REF_FRAMES; ++i)
+    if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0;
 }
 
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
-                        int flush) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+                        int *const pop_lookahead, int flush) {
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
-  GF_GROUP *gf_group = &cpi->gf_group;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  GFConfig *const gf_cfg = &oxcf->gf_cfg;
 
   EncodeFrameInput frame_input;
   EncodeFrameParams frame_params;
@@ -1047,61 +1181,177 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
   memset(&frame_params, 0, sizeof(frame_params));
   memset(&frame_results, 0, sizeof(frame_results));
 
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+  VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info;
+  if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) {
+    THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+    av1_open_second_pass_log(cpi, 1);
+    FILE *second_pass_log_stream = cpi->second_pass_log_stream;
+    fseek(second_pass_log_stream, 0, SEEK_END);
+    size_t file_size = ftell(second_pass_log_stream);
+    rewind(second_pass_log_stream);
+    size_t read_size = 0;
+    while (read_size < file_size) {
+      THIRD_PASS_GOP_INFO gop_info;
+      struct aom_internal_error_info *error = cpi->common.error;
+      // Read in GOP information from the second pass file.
+      av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error);
+      TPL_INFO *tpl_info;
+      AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+      av1_read_tpl_info(tpl_info, second_pass_log_stream, error);
+      // Read in per-frame info from second-pass encoding
+      av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info,
+                                          gop_info.num_frames, error);
+      av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info);
+      read_size = ftell(second_pass_log_stream);
+      aom_free(tpl_info);
+    }
+    av1_close_second_pass_log(cpi);
+    if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+      vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level;
+      av1_vbr_rc_compute_q_indices(
+          vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count,
+          vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth,
+          vbr_rc_info->q_index_list);
+    } else {
+      vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+          vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth,
+          vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count,
+          vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list,
+          vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+    }
+    vbr_rc_info->ready = 1;
+#if CONFIG_RATECTRL_LOG
+    rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index,
+                             vbr_rc_info->total_frame_count);
+#endif  // CONFIG_RATECTRL_LOG
+  }
+#endif  // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+
+  // Check if we need to stuff more src frames
+  if (flush == 0) {
+    int srcbuf_size =
+        av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+    int pop_size =
+        av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage);
+
+    // Continue buffering look ahead buffer.
+    if (srcbuf_size < pop_size) return -1;
+  }
+
+  if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) {
+#if !CONFIG_REALTIME_ONLY
+    if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+        !cpi->ppi->twopass.first_pass_done) {
+      av1_end_first_pass(cpi); /* get last stats packet */
+      cpi->ppi->twopass.first_pass_done = 1;
+    }
+#endif
+    return -1;
+  }
+
   // TODO(sarahparker) finish bit allocation for one pass pyramid
-  if (has_no_stats_stage(cpi) && oxcf->rc_mode != AOM_Q) {
-    cpi->oxcf.gf_max_pyr_height =
-        AOMMIN(cpi->oxcf.gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
-    cpi->oxcf.gf_min_pyr_height =
-        AOMMIN(cpi->oxcf.gf_min_pyr_height, cpi->oxcf.gf_max_pyr_height);
+  if (has_no_stats_stage(cpi)) {
+    gf_cfg->gf_max_pyr_height =
+        AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+    gf_cfg->gf_min_pyr_height =
+        AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
   }
 
+  // Allocation of mi buffers.
+  alloc_mb_mode_info_buffers(cpi);
+
+  cpi->skip_tpl_setup_stats = 0;
+#if !CONFIG_REALTIME_ONLY
+  if (oxcf->pass != AOM_RC_FIRST_PASS) {
+    TplParams *const tpl_data = &cpi->ppi->tpl_data;
+    if (tpl_data->tpl_stats_pool[0] == NULL) {
+      av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width,
+                            oxcf->frm_dim_cfg.height, 0,
+                            oxcf->gf_cfg.lag_in_frames);
+    }
+  }
+  cpi->twopass_frame.this_frame = NULL;
+  const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
+  if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_get_second_pass_params_time);
+#endif
+
+    // Initialise frame_level_rate_correction_factors with value previous
+    // to the parallel frames.
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      for (int i = 0; i < RATE_FACTOR_LEVELS; i++) {
+        cpi->rc.frame_level_rate_correction_factors[i] =
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+            (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+                ? cpi->ppi->p_rc.temp_rate_correction_factors[i]
+                :
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+                cpi->ppi->p_rc.rate_correction_factors[i];
+      }
+    }
+
+    // copy mv_stats from ppi to frame_level cpi.
+    cpi->mv_stats = cpi->ppi->mv_stats;
+    av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_get_second_pass_params_time);
+#endif
+  }
+#endif
+
   if (!is_stat_generation_stage(cpi)) {
-    // If this is a forward keyframe, mark as a show_existing_frame
-    if (cpi->oxcf.fwd_kf_enabled && (gf_group->index == gf_group->size) &&
-        gf_group->update_type[1] == ARF_UPDATE && cpi->rc.frames_to_key == 0) {
+    // TODO(jingning): fwd key frame always uses show existing frame?
+    if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
+        gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
       frame_params.show_existing_frame = 1;
     } else {
       frame_params.show_existing_frame =
-          ((oxcf->enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames ||
-            cpi->show_existing_alt_ref) &&
-           gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) ||
-          gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+          (cpi->ppi->show_existing_alt_ref &&
+           gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) ||
+          gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE;
     }
     frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
 
     // Reset show_existing_alt_ref decision to 0 after it is used.
-    if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) {
-      cpi->show_existing_alt_ref = 0;
+    if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+      cpi->ppi->show_existing_alt_ref = 0;
     }
   } else {
     frame_params.show_existing_frame = 0;
   }
 
-  int code_arf = 0;
   struct lookahead_entry *source = NULL;
   struct lookahead_entry *last_source = NULL;
   if (frame_params.show_existing_frame) {
-    source = av1_lookahead_pop(cpi->lookahead, flush, cpi->compressor_stage);
+    source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+    *pop_lookahead = 1;
     frame_params.show_frame = 1;
   } else {
-    int show_existing_alt_ref = 0;
-    source = choose_frame_source(cpi, &code_arf, &flush, &last_source,
-                                 &frame_params, &show_existing_alt_ref);
-    if (gf_group->update_type[gf_group->index] == ARF_UPDATE)
-      cpi->show_existing_alt_ref = show_existing_alt_ref;
+    source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
+                                 &frame_params);
   }
 
   if (source == NULL) {  // If no source was found, we can't encode a frame.
 #if !CONFIG_REALTIME_ONLY
-    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+    if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+        !cpi->ppi->twopass.first_pass_done) {
       av1_end_first_pass(cpi); /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
+      cpi->ppi->twopass.first_pass_done = 1;
     }
 #endif
     return -1;
   }
 
-  frame_input.source = code_arf ? &cpi->alt_ref_buffer : &source->img;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // reset src_offset to allow actual encode call for this frame to get its
+  // source.
+  gf_group->src_offset[cpi->gf_frame_index] = 0;
+#endif
+
+  // Source may be changed if temporal filtered later.
+  frame_input.source = &source->img;
   frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
   frame_input.ts_duration = source->ts_end - source->ts_start;
   // Save unfiltered source. It is used in av1_get_second_pass_params().
@@ -1109,14 +1359,21 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
 
   *time_stamp = source->ts_start;
   *time_end = source->ts_end;
-  if (source->ts_start < cpi->time_stamps.first_ever) {
-    cpi->time_stamps.first_ever = source->ts_start;
-    cpi->time_stamps.prev_end_seen = source->ts_start;
+  if (source->ts_start < cpi->time_stamps.first_ts_start) {
+    cpi->time_stamps.first_ts_start = source->ts_start;
+    cpi->time_stamps.prev_ts_end = source->ts_start;
   }
 
   av1_apply_encoding_flags(cpi, source->flags);
-  if (!frame_params.show_existing_frame)
-    *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+  *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      cpi->framerate = cpi->temp_framerate;
+    }
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
 
   // Shown frames and arf-overlay frames need frame-rate considering
   if (frame_params.show_frame)
@@ -1129,7 +1386,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
           &cm->film_grain_params);
     } else {
       cm->cur_frame->film_grain_params_present =
-          cm->seq_params.film_grain_params_present;
+          cm->seq_params->film_grain_params_present;
     }
     // only one operating point supported now
     const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
@@ -1137,16 +1394,28 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
     cm->frame_presentation_time = (uint32_t)pts64;
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
 #if CONFIG_REALTIME_ONLY
   av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
+  if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 &&
+      cpi->ppi->number_temporal_layers == 1)
+    av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
 #else
-  if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
-      oxcf->lag_in_frames == 0)
+  if (use_one_pass_rt_params) {
     av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
-  else if (!is_stat_generation_stage(cpi))
-    av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags);
+    if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 &&
+        cpi->ppi->number_temporal_layers == 1)
+      av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
+  }
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_get_one_pass_rt_params_time);
 #endif
-  FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group);
+
+  FRAME_UPDATE_TYPE frame_update_type =
+      get_frame_update_type(gf_group, cpi->gf_frame_index);
 
   if (frame_params.show_existing_frame &&
       frame_params.frame_type != KEY_FRAME) {
@@ -1160,18 +1429,23 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
   // TODO(david.turner@argondesign.com): Change all the encode strategy to
   // modify frame_params instead of cm or cpi.
 
-  // Per-frame encode speed.  In theory this can vary, but things may have been
-  // written assuming speed-level will not change within a sequence, so this
-  // parameter should be used with caution.
+  // Per-frame encode speed.  In theory this can vary, but things may have
+  // been written assuming speed-level will not change within a sequence, so
+  // this parameter should be used with caution.
   frame_params.speed = oxcf->speed;
 
   // Work out some encoding parameters specific to the pass:
-  if (has_no_stats_stage(cpi) && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
     av1_cyclic_refresh_update_parameters(cpi);
   } else if (is_stat_generation_stage(cpi)) {
-    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
-    const int kf_requested = (cm->current_frame.frame_number == 0 ||
-                              (*frame_flags & FRAMEFLAGS_KEY));
+    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
+    // Current frame is coded as a key-frame for any of the following cases:
+    // 1) First frame of a video
+    // 2) For all-intra frame encoding
+    // 3) When a key-frame is forced
+    const int kf_requested =
+        (cm->current_frame.frame_number == 0 ||
+         oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY));
     if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
         frame_update_type != INTNL_OVERLAY_UPDATE) {
       frame_params.frame_type = KEY_FRAME;
@@ -1197,18 +1471,37 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
        frame_params.frame_type == S_FRAME) &&
       !frame_params.show_existing_frame;
 
-  av1_configure_buffer_updates(cpi, &frame_params, frame_update_type,
-                               force_refresh_all);
+  av1_configure_buffer_updates(
+      cpi, &frame_params.refresh_frame, frame_update_type,
+      gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
 
   if (!is_stat_generation_stage(cpi)) {
     const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
     const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
 
-    if (!ext_flags->refresh_frame_flags_pending) {
-      av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
-    } else if (cpi->svc.external_ref_frame_config) {
-      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
-        cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+    init_ref_map_pair(cpi, ref_frame_map_pairs);
+    const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+    const int cur_frame_disp =
+        cpi->common.current_frame.frame_number + order_offset;
+
+    int get_ref_frames = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    get_ref_frames =
+        (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    if (get_ref_frames ||
+        gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) {
+      if (!ext_flags->refresh_frame.update_pending) {
+        av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                           cpi, cpi->gf_frame_index, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                           cm->remapped_ref_idx);
+      } else if (cpi->svc.set_ref_frame_config) {
+        for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+          cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+      }
     }
 
     // Get the reference frames
@@ -1216,50 +1509,72 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
       ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]);
       ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL;
     }
+
     // Work out which reference frame slots may be used.
-    frame_params.ref_frame_flags = get_ref_frame_flags(
-        &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
+    frame_params.ref_frame_flags =
+        get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf,
+                            ext_flags->ref_frame_flags);
 
-    frame_params.primary_ref_frame =
-        choose_primary_ref_frame(cpi, &frame_params);
-    frame_params.order_offset = get_order_offset(&cpi->gf_group, &frame_params);
+    // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE.
+    if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) {
+      frame_params.primary_ref_frame = PRIMARY_REF_NONE;
+    } else {
+      frame_params.primary_ref_frame =
+          choose_primary_ref_frame(cpi, &frame_params);
+    }
 
-    frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack);
+    frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
 
-    frame_params.existing_fb_idx_to_show =
-        frame_params.show_existing_frame
-            ? (frame_update_type == INTNL_OVERLAY_UPDATE
-                   ? get_ref_frame_map_idx(cm, BWDREF_FRAME)
-                   : get_ref_frame_map_idx(cm, ALTREF_FRAME))
-            : INVALID_IDX;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // Call av1_get_refresh_frame_flags() if refresh index not available.
+    if (!cpi->refresh_idx_available) {
+#endif
+#endif
+      frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+          cpi, &frame_params, frame_update_type, cpi->gf_frame_index,
+          cur_frame_disp, ref_frame_map_pairs);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    } else {
+      assert(cpi->ref_refresh_index != INVALID_IDX);
+      frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index);
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+    // Make the frames marked as is_frame_non_ref to non-reference frames.
+    if (gf_group->is_frame_non_ref[cpi->gf_frame_index])
+      frame_params.refresh_frame_flags = 0;
+
+    frame_params.existing_fb_idx_to_show = INVALID_IDX;
+    // Find the frame buffer to show based on display order.
+    if (frame_params.show_existing_frame) {
+      for (int frame = 0; frame < REF_FRAMES; frame++) {
+        const RefCntBuffer *const buf = cm->ref_frame_map[frame];
+        if (buf == NULL) continue;
+        const int frame_order = (int)buf->display_order_hint;
+        if (frame_order == cur_frame_disp)
+          frame_params.existing_fb_idx_to_show = frame;
+      }
+    }
   }
 
   // The way frame_params->remapped_ref_idx is setup is a placeholder.
   // Currently, reference buffer assignment is done by update_ref_frame_map()
-  // which is called by high-level strategy AFTER encoding a frame.  It modifies
-  // cm->remapped_ref_idx.  If you want to use an alternative method to
-  // determine reference buffer assignment, just put your assignments into
+  // which is called by high-level strategy AFTER encoding a frame.  It
+  // modifies cm->remapped_ref_idx.  If you want to use an alternative method
+  // to determine reference buffer assignment, just put your assignments into
   // frame_params->remapped_ref_idx here and they will be used when encoding
   // this frame.  If frame_params->remapped_ref_idx is setup independently of
   // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
   memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
-  cpi->td.mb.e_mbd.delta_qindex = 0;
+  cpi->td.mb.delta_qindex = 0;
 
   if (!frame_params.show_existing_frame) {
-    cm->quant_params.using_qmatrix = cpi->oxcf.using_qm;
-#if !CONFIG_REALTIME_ONLY
-    if (oxcf->lag_in_frames > 0 && !is_stat_generation_stage(cpi)) {
-      if (cpi->gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
-        av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
-        av1_set_frame_size(cpi, cm->width, cm->height);
-        av1_tpl_setup_stats(cpi, 0, &frame_params, &frame_input);
-        assert(cpi->num_gf_group_show_frames == 1);
-      }
-    }
-#endif
+    cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
   }
 
 #if CONFIG_REALTIME_ONLY
@@ -1268,55 +1583,61 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
     return AOM_CODEC_ERROR;
   }
 #else
-  if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
-                         &frame_results) != AOM_CODEC_OK) {
+  if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+      gf_cfg->lag_in_frames == 0) {
+    if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+                                &frame_results) != AOM_CODEC_OK) {
     return AOM_CODEC_ERROR;
   }
 #endif  // CONFIG_REALTIME_ONLY
-  if (!is_stat_generation_stage(cpi))
-    cpi->num_gf_group_show_frames += frame_params.show_frame;
+
+  // This is used in rtc temporal filter case. Use true source in the PSNR
+  // calculation.
+  if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+      cpi->common.current_frame.frame_type != KEY_FRAME) {
+    assert(cpi->orig_source.buffer_alloc_sz > 0);
+    cpi->source = &cpi->orig_source;
+  }
 
   if (!is_stat_generation_stage(cpi)) {
     // First pass doesn't modify reference buffer assignment or produce frame
     // flags
-    update_frame_flags(cpi, frame_flags);
-    if (!ext_flags->refresh_frame_flags_pending) {
-      int ref_map_index =
-          av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags);
-      av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame,
-                               ref_map_index, &cpi->ref_buffer_stack);
-    }
+    update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+    set_additional_frame_flags(cm, frame_flags);
   }
 
 #if !CONFIG_REALTIME_ONLY
-  if (!is_stat_generation_stage(cpi)) {
 #if TXCOEFF_COST_TIMER
+  if (!is_stat_generation_stage(cpi)) {
     cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
     fprintf(stderr,
             "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
             "in us\n",
             cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
             cm->cum_txcoeff_cost_timer);
-#endif
-    av1_twopass_postencode_update(cpi);
   }
+#endif
 #endif  // !CONFIG_REALTIME_ONLY
 
-  if (!is_stat_generation_stage(cpi)) {
-    update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
-    set_additional_frame_flags(cm, frame_flags);
-    update_rc_counts(cpi);
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+       oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+    av1_update_vmaf_curve(cpi);
   }
+#endif
 
   // Unpack frame_results:
   *size = frame_results.size;
 
   // Leave a signal for a higher level caller about if this frame is droppable
   if (*size > 0) {
-    cpi->droppable = is_frame_droppable(&cpi->svc, ext_flags);
+    cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame);
   }
 
-  if (cpi->use_svc) av1_save_layer_context(cpi);
-
   return AOM_CODEC_OK;
 }
diff --git a/media/libaom/src/av1/encoder/encode_strategy.h b/media/libaom/src/av1/encoder/encode_strategy.h
index b05224ba11..a04c483f50 100644
--- a/media/libaom/src/av1/encoder/encode_strategy.h
+++ b/media/libaom/src/av1/encoder/encode_strategy.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares frame encoding functions.
+ */
 #ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
 #define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
 
@@ -23,40 +26,118 @@ extern "C" {
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 
-// This function will implement high-level encode strategy, choosing frame type,
-// frame placement, etc.  It populates an EncodeFrameParams struct with the
-// results of these decisions and then calls av1_encode()
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function will implement high-level encode strategy, choosing frame type,
+ * frame placement, etc. It populates an EncodeFrameParams struct with the
+ * results of these decisions and then encodes the frame. The caller should use
+ * the output parameters *time_stamp and *time_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in]    cpi         Top-level encoder structure
+ * \param[in]    size        Bitstream size
+ * \param[in]    dest        Bitstream output
+ * \param[in]    frame_flags Flags to decide how to encoding the frame
+ * \param[out]   time_stamp  Time stamp of the frame
+ * \param[out]   time_end    Time end
+ * \param[in]    timestamp_ratio Time base
+ * \param[in]    pop_lookahead Decide to pop the source frame from queue
+ * \param[in]    flush       Decide to encode one frame or the rest of frames
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
-                        int flush);
+                        int *const pop_lookahead, int flush);
 
+/*!\cond */
 // Set individual buffer update flags based on frame reference type.
 // force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
 // refresh_*_frame flags to be set, because we refresh all buffers in this case.
 void av1_configure_buffer_updates(AV1_COMP *const cpi,
-                                  EncodeFrameParams *const frame_params,
+                                  RefreshFrameInfo *const refresh_frame,
                                   const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
                                   int force_refresh_all);
 
-int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
-                                const EncodeFrameParams *const frame_params,
-                                FRAME_UPDATE_TYPE frame_update_type,
-                                const RefBufferStack *const ref_buffer_stack);
+int av1_get_refresh_frame_flags(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
 
 int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
 
-void av1_update_ref_frame_map(AV1_COMP *cpi,
-                              FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
-                              RefBufferStack *ref_buffer_stack);
-
-void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out]   remapped_ref_idx  An array for storing indices of reference
+ *                                 frames. The index is used to retrieve a
+ *                                 reference frame buffer from ref_frame_map
+ *                                 in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                        const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                        int remapped_ref_idx[REF_FRAMES]);
 
 int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
                                const int up_to_index,
                                const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+    const SVC *const svc,
+    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+  // Droppable frame is only used by external refresh flags. VoD setting won't
+  // trigger its use case.
+  if (svc->set_ref_frame_config)
+    return svc->non_reference_frame;
+  else if (ext_refresh_frame_flags->update_pending)
+    return !(ext_refresh_frame_flags->alt_ref_frame ||
+             ext_refresh_frame_flags->alt2_ref_frame ||
+             ext_refresh_frame_flags->bwd_ref_frame ||
+             ext_refresh_frame_flags->golden_frame ||
+             ext_refresh_frame_flags->last_frame);
+  else
+    return 0;
+}
+
+static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) {
+  // We choose the reference "type" of this frame from the flags which indicate
+  // which reference frames will be refreshed by it. More than one of these
+  // flags may be set, so the order here implies an order of precedence. This is
+  // just used to choose the primary_ref_frame (as the most recent reference
+  // buffer of the same reference-type as the current frame).
+
+  switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) {
+    case 0: return 0;
+    case 1: return 1;
+    case MAX_ARF_LAYERS:
+    case MAX_ARF_LAYERS + 1: return 4;
+    default: return 7;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+int av1_calc_refresh_idx_for_intnl_arf(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+    int gf_index);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encodeframe.c b/media/libaom/src/av1/encoder/encodeframe.c
index 53b47d49e9..73ef57411f 100644
--- a/media/libaom/src/av1/encoder/encodeframe.c
+++ b/media/libaom/src/av1/encoder/encodeframe.c
@@ -23,7 +23,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
-#include "aom_ports/system_state.h"
 
 #if CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
@@ -31,6 +30,7 @@
 
 #include "av1/common/cfl.h"
 #include "av1/common/common.h"
+#include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/idct.h"
@@ -44,23 +44,26 @@
 #include "av1/common/tile_common.h"
 #include "av1/common/warped_motion.h"
 
+#include "av1/encoder/allintra_vis.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
-#include "av1/encoder/corner_detect.h"
-#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/ml.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/partition_model_weights.h"
 #endif
+#include "av1/encoder/partition_search.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
@@ -73,11 +76,7 @@
 #include "av1/encoder/tune_vmaf.h"
 #endif
 
-static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
-                                         TileDataEnc *tile_data, ThreadData *td,
-                                         TOKENEXTRA **t, RUN_TYPE dry_run,
-                                         BLOCK_SIZE bsize, int *rate);
-
+/*!\cond */
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -146,50 +145,14 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16
 };
-
-typedef struct {
-  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
-  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
-  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
-  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
-  TXFM_CONTEXT *p_ta;
-  TXFM_CONTEXT *p_tl;
-  TXFM_CONTEXT ta[MAX_MIB_SIZE];
-  TXFM_CONTEXT tl[MAX_MIB_SIZE];
-} RD_SEARCH_MACROBLOCK_CONTEXT;
-
-enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
-
-enum {
-  SB_SINGLE_PASS,  // Single pass encoding: all ctxs get updated normally
-  SB_DRY_PASS,     // First pass of multi-pass: does not update the ctxs
-  SB_WET_PASS      // Second pass of multi-pass: finalize and update the ctx
-} UENUM1BYTE(SB_MULTI_PASS_MODE);
-
-// This struct is used to store the statistics used by sb-level multi-pass
-// encoding. Currently, this is only used to make a copy of the state before we
-// perform the first pass
-typedef struct SB_FIRST_PASS_STATS {
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_COUNTS rd_count;
-
-  int split_count;
-  FRAME_COUNTS fc;
-  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
-  int current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  unsigned int mode_chosen_counts[MAX_MODES];
-#endif  // CONFIG_INTERNAL_STATS
-} SB_FIRST_PASS_STATS;
+/*!\endcond */
 
 unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                            const struct buf_2d *ref,
                                            BLOCK_SIZE bs) {
   unsigned int sse;
   const unsigned int var =
-      cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
+      cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
@@ -202,472 +165,12 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
   const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8,
                                        AV1_HIGH_VAR_OFFS_10,
                                        AV1_HIGH_VAR_OFFS_12 };
-  var =
-      cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                         CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
-}
-
-static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
-                                                   const struct buf_2d *ref,
-                                                   int mi_row, int mi_col,
-                                                   BLOCK_SIZE bs) {
-  unsigned int sse, var;
-  uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last =
-      get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME);
-
-  assert(last != NULL);
-  last_y =
-      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
-  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
+  var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                                CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0,
+                                &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
-static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
-                                                   int mi_row, int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(
-      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
-  if (var < 8)
-    return BLOCK_64X64;
-  else if (var < 128)
-    return BLOCK_32X32;
-  else if (var < 2048)
-    return BLOCK_16X16;
-  else
-    return BLOCK_8X8;
-}
-
-static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonQuantParams *quant_params = &cm->quant_params;
-  return av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
-                                      quant_params->y_dc_delta_q);
-}
-
-static AOM_INLINE void set_ssim_rdmult(const AV1_COMP *const cpi,
-                                       MACROBLOCK *const x,
-                                       const BLOCK_SIZE bsize, const int mi_row,
-                                       const int mi_col, int *const rdmult) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  const int bsize_base = BLOCK_16X16;
-  const int num_mi_w = mi_size_wide[bsize_base];
-  const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
-  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
-  int row, col;
-  double num_of_mi = 0.0;
-  double geom_mean_of_scale = 0.0;
-
-  assert(cpi->oxcf.tuning == AOM_TUNE_SSIM);
-
-  aom_clear_system_state();
-  for (row = mi_row / num_mi_w;
-       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
-      const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
-      num_of_mi += 1.0;
-    }
-  }
-  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
-
-  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
-  *rdmult = AOMMAX(*rdmult, 0);
-  set_error_per_bit(x, *rdmult);
-  aom_clear_system_state();
-}
-
-static int get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               const BLOCK_SIZE bsize, const int mi_row,
-                               const int mi_col, int orig_rdmult) {
-  const AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int deltaq_rdmult = set_deltaq_rdmult(cpi, xd);
-  if (tpl_frame->is_valid == 0) return deltaq_rdmult;
-  if (!is_frame_tpl_eligible((AV1_COMP *)cpi)) return deltaq_rdmult;
-  if (tpl_idx >= MAX_LAG_BUFFERS) return deltaq_rdmult;
-  if (cpi->superres_mode != SUPERRES_NONE) return deltaq_rdmult;
-  if (cpi->oxcf.aq_mode != NO_AQ) return deltaq_rdmult;
-
-  const int bsize_base = BLOCK_16X16;
-  const int num_mi_w = mi_size_wide[bsize_base];
-  const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
-  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
-  int row, col;
-  double base_block_count = 0.0;
-  double geom_mean_of_scale = 0.0;
-  aom_clear_system_state();
-  for (row = mi_row / num_mi_w;
-       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
-      const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]);
-      base_block_count += 1.0;
-    }
-  }
-  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
-  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
-  rdmult = AOMMAX(rdmult, 0);
-  set_error_per_bit(x, rdmult);
-  aom_clear_system_state();
-  if (bsize == cm->seq_params.sb_size) {
-    const int rdmult_sb = set_deltaq_rdmult(cpi, xd);
-    assert(rdmult_sb == rdmult);
-    (void)rdmult_sb;
-  }
-  return rdmult;
-}
-
-static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              int8_t segment_id) {
-  const AV1_COMMON *const cm = &cpi->common;
-  av1_init_plane_quantizers(cpi, x, segment_id);
-  aom_clear_system_state();
-  const int segment_qindex =
-      av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
-  return av1_compute_rd_mult(cpi,
-                             segment_qindex + cm->quant_params.y_dc_delta_q);
-}
-
-static AOM_INLINE void setup_block_rdmult(const AV1_COMP *const cpi,
-                                          MACROBLOCK *const x, int mi_row,
-                                          int mi_col, BLOCK_SIZE bsize,
-                                          AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
-  x->rdmult = cpi->rd.RDMULT;
-
-  if (aq_mode != NO_AQ) {
-    assert(mbmi != NULL);
-    if (aq_mode == VARIANCE_AQ) {
-      if (cpi->vaq_refresh) {
-        const int energy = bsize <= BLOCK_16X16
-                               ? x->mb_energy
-                               : av1_log_block_var(cpi, x, bsize);
-        mbmi->segment_id = energy;
-      }
-      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-    } else if (aq_mode == COMPLEXITY_AQ) {
-      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
-    } else if (aq_mode == CYCLIC_REFRESH_AQ) {
-      // If segment is boosted, use rdmult for that segment.
-      if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
-        x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
-    }
-  }
-
-  const AV1_COMMON *const cm = &cpi->common;
-  if (cm->delta_q_info.delta_q_present_flag &&
-      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
-    x->rdmult = get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult);
-  }
-
-  if (cpi->oxcf.tuning == AOM_TUNE_SSIM) {
-    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
-  }
-#if CONFIG_TUNE_VMAF
-  if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-    av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
-  }
-#endif
-}
-
-static AOM_INLINE void set_offsets_without_segment_id(
-    const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x,
-    int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(bsize < BLOCK_SIZES_ALL);
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-
-  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
-                        mi_row, mi_col);
-
-  set_entropy_context(xd, mi_row, mi_col, num_planes);
-  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-
-  // Set up destination pointers.
-  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
-                       num_planes);
-
-  // Set up limit values for MV components.
-  // Mv beyond the range do not produce new/different prediction block.
-  av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
-                    mi_width, cpi->oxcf.border_in_pixels);
-
-  set_plane_n4(xd, mi_width, mi_height, num_planes);
-
-  // Set up distance of MB to edge of frame in 1/8th pel units.
-  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
-
-  // Set up source buffers.
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-
-  // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
-  xd->tile = *tile;
-}
-
-static AOM_INLINE void set_offsets(const AV1_COMP *const cpi,
-                                   const TileInfo *const tile,
-                                   MACROBLOCK *const x, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const struct segmentation *const seg = &cm->seg;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-
-  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-
-  // Setup segment ID.
-  mbmi = xd->mi[0];
-  mbmi->segment_id = 0;
-  if (seg->enabled) {
-    if (seg->enabled && !cpi->vaq_refresh) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
-      mbmi->segment_id =
-          map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
-    }
-    av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
-  }
-}
-
-static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
-                                                const MACROBLOCKD *xd,
-                                                const MB_MODE_INFO *mbmi) {
-  int dir;
-  for (dir = 0; dir < 2; ++dir) {
-    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
-    ++counts->switchable_interp[ctx][filter];
-  }
-}
-
-static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
-                                              const MB_MODE_INFO *mbmi) {
-  int dir;
-  for (dir = 0; dir < 2; ++dir) {
-    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
-    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
-               SWITCHABLE_FILTERS);
-  }
-}
-
-static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode,
-                                                 BLOCK_SIZE bsize,
-                                                 const MB_MODE_INFO *mbmi,
-                                                 RD_COUNTS *rdc) {
-  if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
-    const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
-    int ref;
-    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-      rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
-    }
-  }
-}
-
-static AOM_INLINE void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
-                                     const TX_MODE tx_mode) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  if (xd->lossless[mbmi->segment_id]) {
-    mbmi->tx_size = TX_4X4;
-  } else if (tx_mode != TX_MODE_SELECT) {
-    mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode);
-  } else {
-    BLOCK_SIZE bsize = mbmi->sb_type;
-    TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
-    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
-  }
-  if (is_inter_block(mbmi)) {
-    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
-  }
-  const int stride = xd->tx_type_map_stride;
-  const int bw = mi_size_wide[mbmi->sb_type];
-  for (int row = 0; row < mi_size_high[mbmi->sb_type]; ++row) {
-    memset(xd->tx_type_map + row * stride, DCT_DCT,
-           bw * sizeof(xd->tx_type_map[0]));
-  }
-  av1_zero(x->blk_skip);
-  x->force_skip = 0;
-}
-
-// This function will copy the best reference mode information from
-// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
-static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
-    MB_MODE_INFO_EXT *mbmi_ext,
-    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
-  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
-         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
-  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
-         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
-  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
-  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
-  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
-         sizeof(mbmi_ext->global_mvs));
-}
-
-static AOM_INLINE void update_state(const AV1_COMP *const cpi, ThreadData *td,
-                                    const PICK_MODE_CONTEXT *const ctx,
-                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                    RUN_TYPE dry_run) {
-  int i, x_idx, y;
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  RD_COUNTS *const rdc = &td->rd_counts;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  const MB_MODE_INFO *const mi = &ctx->mic;
-  MB_MODE_INFO *const mi_addr = xd->mi[0];
-  const struct segmentation *const seg = &cm->seg;
-  const int bw = mi_size_wide[mi->sb_type];
-  const int bh = mi_size_high[mi->sb_type];
-  const int mis = mi_params->mi_stride;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-
-  assert(mi->sb_type == bsize);
-
-  *mi_addr = *mi;
-  copy_mbmi_ext_frame_to_mbmi_ext(x->mbmi_ext, &ctx->mbmi_ext_best,
-                                  av1_ref_frame_type(ctx->mic.ref_frame));
-
-  memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-  x->force_skip = ctx->rd_stats.skip;
-
-  xd->tx_type_map = ctx->tx_type_map;
-  xd->tx_type_map_stride = mi_size_wide[bsize];
-  // If not dry_run, copy the transform type data into the frame level buffer.
-  // Encoder will fetch tx types when writing bitstream.
-  if (!dry_run) {
-    const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
-    uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
-    const int mi_stride = mi_params->mi_stride;
-    for (int blk_row = 0; blk_row < bh; ++blk_row) {
-      av1_copy_array(tx_type_map + blk_row * mi_stride,
-                     xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
-    }
-    xd->tx_type_map = tx_type_map;
-    xd->tx_type_map_stride = mi_stride;
-  }
-
-  // If segmentation in use
-  if (seg->enabled) {
-    // For in frame complexity AQ copy the segment id from the segment map.
-    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
-      mi_addr->segment_id =
-          map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
-      reset_tx_size(x, mi_addr, x->tx_mode_search_type);
-    }
-    // Else for cyclic refresh mode update the segment map, set the segment id
-    // and then update the quantizer.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
-                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
-                                        x->force_skip);
-    }
-    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
-      mi_addr->uv_mode = UV_DC_PRED;
-  }
-
-  for (i = 0; i < num_planes; ++i) {
-    p[i].coeff = ctx->coeff[i];
-    p[i].qcoeff = ctx->qcoeff[i];
-    pd[i].dqcoeff = ctx->dqcoeff[i];
-    p[i].eobs = ctx->eobs[i];
-    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-  }
-  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-  // Restore the coding context of the MB to that that was in place
-  // when the mode was picked for it
-  for (y = 0; y < mi_height; y++) {
-    for (x_idx = 0; x_idx < mi_width; x_idx++) {
-      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
-          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
-        xd->mi[x_idx + y * mis] = mi_addr;
-      }
-    }
-  }
-
-  if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
-
-  if (dry_run) return;
-
-#if CONFIG_INTERNAL_STATS
-  {
-    unsigned int *const mode_chosen_counts =
-        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
-    if (frame_is_intra_only(cm)) {
-      static const int kf_mode_index[] = {
-        THR_DC /*DC_PRED*/,
-        THR_V_PRED /*V_PRED*/,
-        THR_H_PRED /*H_PRED*/,
-        THR_D45_PRED /*D45_PRED*/,
-        THR_D135_PRED /*D135_PRED*/,
-        THR_D113_PRED /*D113_PRED*/,
-        THR_D157_PRED /*D157_PRED*/,
-        THR_D203_PRED /*D203_PRED*/,
-        THR_D67_PRED /*D67_PRED*/,
-        THR_SMOOTH,   /*SMOOTH_PRED*/
-        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
-        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
-        THR_PAETH /*PAETH_PRED*/,
-      };
-      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
-    } else {
-      // Note how often each mode chosen as best
-      ++mode_chosen_counts[ctx->best_mode_index];
-    }
-  }
-#endif
-  if (!frame_is_intra_only(cm)) {
-    if (is_inter_block(mi_addr)) {
-      // TODO(sarahparker): global motion stats need to be handled per-tile
-      // to be compatible with tile-based threading.
-      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
-    }
-
-    if (cm->features.interp_filter == SWITCHABLE &&
-        mi_addr->motion_mode != WARPED_CAUSAL &&
-        !is_nontrans_global_motion(xd, xd->mi[0])) {
-      update_filter_type_count(td->counts, xd, mi_addr);
-    }
-
-    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
-    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
-    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-  }
-
-  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
-  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
-    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
-}
-
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col, const int num_planes,
                           BLOCK_SIZE bsize) {
@@ -685,3433 +188,23 @@ void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
   }
 }
 
-static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize,
-                          const bool high_bd, const int bd) {
-  const int width = block_size_wide[bsize];
-  const int height = block_size_high[bsize];
-  // Implementation requires width to be a multiple of 8. It also requires
-  // height to be a multiple of 4, but this is always the case.
-  assert(height % 4 == 0);
-  if (width % 8 != 0) {
-    EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 };
-    return ei;
-  }
-  return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd);
-}
-
-static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) {
-  // TODO(debargha, yuec): Not in use, need to implement a speed feature
-  // utilizing this data point, and replace '0' by the corresponding speed
-  // feature flag.
-  return 0 && !frame_is_intra_only(&cpi->common);
-}
-
-static void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x,
-                                     RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                                     PICK_MODE_CONTEXT *ctx) {
-  // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode,
-  // which sets base_qindex to 0 on keyframe.
-  if (cpi->oxcf.rc_mode != AOM_CBR || !cpi->sf.rt_sf.hybrid_intra_pickmode ||
-      bsize < BLOCK_16X16)
-    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
-  else
-    av1_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
-}
-
-static AOM_INLINE void pick_sb_modes(AV1_COMP *const cpi,
-                                     TileDataEnc *tile_data,
-                                     MACROBLOCK *const x, int mi_row,
-                                     int mi_col, RD_STATS *rd_cost,
-                                     PARTITION_TYPE partition, BLOCK_SIZE bsize,
-                                     PICK_MODE_CONTEXT *ctx, RD_STATS best_rd,
-                                     int pick_mode_type) {
-  if (best_rd.rdcost < 0) {
-    ctx->rd_stats.rdcost = INT64_MAX;
-    ctx->rd_stats.skip = 0;
-    av1_invalid_rd_stats(rd_cost);
-    return;
-  }
-
-  set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
-
-  if (ctx->rd_mode_is_ready) {
-    assert(ctx->mic.sb_type == bsize);
-    assert(ctx->mic.partition == partition);
-    rd_cost->rate = ctx->rd_stats.rate;
-    rd_cost->dist = ctx->rd_stats.dist;
-    rd_cost->rdcost = ctx->rd_stats.rdcost;
-    return;
-  }
-
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
-  int i;
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, rd_pick_sb_modes_time);
-#endif
-
-  aom_clear_system_state();
-
-  mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
-  mbmi->partition = partition;
-
-#if CONFIG_RD_DEBUG
-  mbmi->mi_row = mi_row;
-  mbmi->mi_col = mi_col;
-#endif
-
-  xd->tx_type_map = x->tx_type_map;
-  xd->tx_type_map_stride = mi_size_wide[bsize];
-
-  for (i = 0; i < num_planes; ++i) {
-    p[i].coeff = ctx->coeff[i];
-    p[i].qcoeff = ctx->qcoeff[i];
-    pd[i].dqcoeff = ctx->dqcoeff[i];
-    p[i].eobs = ctx->eobs[i];
-    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-  }
-
-  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-
-  ctx->skippable = 0;
-  // Set to zero to make sure we do not use the previous encoded frame stats
-  mbmi->skip = 0;
-  // Reset skip mode flag.
-  mbmi->skip_mode = 0;
-
-  if (is_cur_buf_hbd(xd)) {
-    x->source_variance = av1_high_get_sby_perpixel_variance(
-        cpi, &x->plane[0].src, bsize, xd->bd);
-  } else {
-    x->source_variance =
-        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-  }
-  if (use_pb_simple_motion_pred_sse(cpi)) {
-    const FULLPEL_MV start_mv = kZeroFullMv;
-    unsigned int var = 0;
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
-                              &x->simple_motion_pred_sse, &var);
-  }
-
-  // If the threshold for disabling wedge search is zero, it means the feature
-  // should not be used. Use a value that will always succeed in the check.
-  if (cpi->sf.inter_sf.disable_wedge_search_edge_thresh == 0) {
-    x->edge_strength = UINT16_MAX;
-    x->edge_strength_x = UINT16_MAX;
-    x->edge_strength_y = UINT16_MAX;
-  } else {
-    EdgeInfo ei =
-        edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd);
-    x->edge_strength = ei.magnitude;
-    x->edge_strength_x = ei.x;
-    x->edge_strength_y = ei.y;
-  }
-
-  // Initialize default mode evaluation params
-  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
-  // Set error per bit for current rdmult
-  set_error_per_bit(x, x->rdmult);
-  av1_rd_cost_update(x->rdmult, &best_rd);
-
-  // Find best coding mode & reconstruct the MB so it is available
-  // as a predictor for MBs that follow in the SB
-  if (frame_is_intra_only(cm)) {
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
-#endif
-    switch (pick_mode_type) {
-      case PICK_MODE_RD:
-        av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
-        break;
-      case PICK_MODE_NONRD:
-        hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
-        break;
-      default: assert(0 && "Unknown pick mode type.");
-    }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
-#endif
-  } else {
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
-#endif
-    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
-                                         rd_cost, bsize, ctx, best_rd.rdcost);
-    } else {
-      // TODO(kyslov): do the same for pick_inter_mode_sb_seg_skip
-      switch (pick_mode_type) {
-        case PICK_MODE_RD:
-          av1_rd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
-                                    best_rd.rdcost);
-          break;
-        case PICK_MODE_NONRD:
-          av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx,
-                                       best_rd.rdcost);
-          break;
-        default: assert(0 && "Unknown pick mode type.");
-      }
-    }
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
-#endif
-  }
-
-  // Examine the resulting rate and for AQ mode 2 make a segment choice.
-  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
-      bsize >= BLOCK_16X16) {
-    av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
-  }
-
-  x->rdmult = orig_rdmult;
-
-  // TODO(jingning) The rate-distortion optimization flow needs to be
-  // refactored to provide proper exit/return handle.
-  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
-
-  ctx->rd_stats.rate = rd_cost->rate;
-  ctx->rd_stats.dist = rd_cost->dist;
-  ctx->rd_stats.rdcost = rd_cost->rdcost;
-
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, rd_pick_sb_modes_time);
-#endif
-}
-
-static AOM_INLINE void update_inter_mode_stats(FRAME_CONTEXT *fc,
-                                               FRAME_COUNTS *counts,
-                                               PREDICTION_MODE mode,
-                                               int16_t mode_context) {
-  (void)counts;
-
-  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
-  if (mode == NEWMV) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->newmv_mode[mode_ctx][0];
-#endif
-    update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
-    return;
-  }
-
-#if CONFIG_ENTROPY_STATS
-  ++counts->newmv_mode[mode_ctx][1];
-#endif
-  update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
-
-  mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
-  if (mode == GLOBALMV) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->zeromv_mode[mode_ctx][0];
-#endif
-    update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
-    return;
-  }
-
-#if CONFIG_ENTROPY_STATS
-  ++counts->zeromv_mode[mode_ctx][1];
-#endif
-  update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
-
-  mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-#if CONFIG_ENTROPY_STATS
-  ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
-#endif
-  update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
-}
-
-static AOM_INLINE void update_palette_cdf(MACROBLOCKD *xd,
-                                          const MB_MODE_INFO *const mbmi,
-                                          FRAME_COUNTS *counts) {
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-
-  (void)counts;
-
-  if (mbmi->mode == DC_PRED) {
-    const int n = pmi->palette_size[0];
-    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
-#endif
-    update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
-               n > 0, 2);
-    if (n > 0) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
-#endif
-      update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
-                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-    }
-  }
-
-  if (mbmi->uv_mode == UV_DC_PRED) {
-    const int n = pmi->palette_size[1];
-    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
-#endif
-    update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
-
-    if (n > 0) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
-#endif
-      update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
-                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
-    }
-  }
-}
-
-static AOM_INLINE void sum_intra_stats(const AV1_COMMON *const cm,
-                                       FRAME_COUNTS *counts, MACROBLOCKD *xd,
-                                       const MB_MODE_INFO *const mbmi,
-                                       const MB_MODE_INFO *above_mi,
-                                       const MB_MODE_INFO *left_mi,
-                                       const int intraonly) {
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const PREDICTION_MODE y_mode = mbmi->mode;
-  (void)counts;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-
-  if (intraonly) {
-#if CONFIG_ENTROPY_STATS
-    const PREDICTION_MODE above = av1_above_block_mode(above_mi);
-    const PREDICTION_MODE left = av1_left_block_mode(left_mi);
-    const int above_ctx = intra_mode_context[above];
-    const int left_ctx = intra_mode_context[left];
-    ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
-  } else {
-#if CONFIG_ENTROPY_STATS
-    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
-  }
-
-  if (av1_filter_intra_allowed(cm, mbmi)) {
-    const int use_filter_intra_mode =
-        mbmi->filter_intra_mode_info.use_filter_intra;
-#if CONFIG_ENTROPY_STATS
-    ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode];
-    if (use_filter_intra_mode) {
-      ++counts
-            ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
-    }
-#endif  // CONFIG_ENTROPY_STATS
-    update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, 2);
-    if (use_filter_intra_mode) {
-      update_cdf(fc->filter_intra_mode_cdf,
-                 mbmi->filter_intra_mode_info.filter_intra_mode,
-                 FILTER_INTRA_MODES);
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->angle_delta[mbmi->mode - V_PRED]
-                         [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
-#endif
-    update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
-               mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
-               2 * MAX_ANGLE_DELTA + 1);
-  }
-
-  if (!xd->is_chroma_ref) return;
-
-  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-  const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
-#if CONFIG_ENTROPY_STATS
-  ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
-#endif  // CONFIG_ENTROPY_STATS
-  update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
-             UV_INTRA_MODES - !cfl_allowed);
-  if (uv_mode == UV_CFL_PRED) {
-    const int8_t joint_sign = mbmi->cfl_alpha_signs;
-    const uint8_t idx = mbmi->cfl_alpha_idx;
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->cfl_sign[joint_sign];
-#endif
-    update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
-    if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
-      aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
-
-#if CONFIG_ENTROPY_STATS
-      ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
-#endif
-      update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
-    }
-    if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
-      aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
-
-#if CONFIG_ENTROPY_STATS
-      ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
-#endif
-      update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
-      av1_use_angle_delta(bsize)) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->angle_delta[uv_mode - UV_V_PRED]
-                         [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
-#endif
-    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
-               mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
-               2 * MAX_ANGLE_DELTA + 1);
-  }
-  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
-    update_palette_cdf(xd, mbmi, counts);
-  }
-}
-
-static AOM_INLINE void update_stats(const AV1_COMMON *const cm,
-                                    ThreadData *td) {
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const int seg_ref_active =
-      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-
-  if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
-      is_comp_ref_allowed(bsize)) {
-    const int skip_mode_ctx = av1_get_skip_mode_context(xd);
-#if CONFIG_ENTROPY_STATS
-    td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
-#endif
-    update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
-  }
-
-  if (!mbmi->skip_mode && !seg_ref_active) {
-    const int skip_ctx = av1_get_skip_context(xd);
-#if CONFIG_ENTROPY_STATS
-    td->counts->skip[skip_ctx][mbmi->skip]++;
-#endif
-    update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
-  }
-
-#if CONFIG_ENTROPY_STATS
-  // delta quant applies to both intra and inter
-  const int super_block_upper_left =
-      ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-      ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
-  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  if (delta_q_info->delta_q_present_flag &&
-      (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
-      super_block_upper_left) {
-    const int dq =
-        (mbmi->current_qindex - xd->current_qindex) / delta_q_info->delta_q_res;
-    const int absdq = abs(dq);
-    for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
-      td->counts->delta_q[i][1]++;
-    }
-    if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
-    if (delta_q_info->delta_lf_present_flag) {
-      if (delta_q_info->delta_lf_multi) {
-        const int frame_lf_count =
-            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
-                               delta_q_info->delta_lf_res;
-          const int abs_delta_lf = abs(delta_lf);
-          for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
-            td->counts->delta_lf_multi[lf_id][i][1]++;
-          }
-          if (abs_delta_lf < DELTA_LF_SMALL)
-            td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
-        }
-      } else {
-        const int delta_lf =
-            (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
-            delta_q_info->delta_lf_res;
-        const int abs_delta_lf = abs(delta_lf);
-        for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
-          td->counts->delta_lf[i][1]++;
-        }
-        if (abs_delta_lf < DELTA_LF_SMALL)
-          td->counts->delta_lf[abs_delta_lf][0]++;
-      }
-    }
-  }
-#endif
-
-  if (!is_inter_block(mbmi)) {
-    sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
-                    frame_is_intra_only(cm));
-  }
-
-  if (av1_allow_intrabc(cm)) {
-    update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
-#if CONFIG_ENTROPY_STATS
-    ++td->counts->intrabc[is_intrabc_block(mbmi)];
-#endif  // CONFIG_ENTROPY_STATS
-  }
-
-  if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
-
-  FRAME_COUNTS *const counts = td->counts;
-  const int inter_block = is_inter_block(mbmi);
-
-  if (!seg_ref_active) {
-#if CONFIG_ENTROPY_STATS
-    counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
-#endif
-    update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
-               inter_block, 2);
-    // If the segment reference feature is enabled we have only a single
-    // reference frame allowed for the segment so exclude it from
-    // the reference frame counts used to work out probabilities.
-    if (inter_block) {
-      const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
-      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
-        if (is_comp_ref_allowed(bsize)) {
-#if CONFIG_ENTROPY_STATS
-          counts->comp_inter[av1_get_reference_mode_context(xd)]
-                            [has_second_ref(mbmi)]++;
-#endif  // CONFIG_ENTROPY_STATS
-          update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
-        }
-      }
-
-      if (has_second_ref(mbmi)) {
-        const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
-                                                      ? UNIDIR_COMP_REFERENCE
-                                                      : BIDIR_COMP_REFERENCE;
-        update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
-                   COMP_REFERENCE_TYPES);
-#if CONFIG_ENTROPY_STATS
-        counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
-                             [comp_ref_type]++;
-#endif  // CONFIG_ENTROPY_STATS
-
-        if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
-          const int bit = (ref0 == BWDREF_FRAME);
-          update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-          counts
-              ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit) {
-            const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
-            update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
-                                [bit1]++;
-#endif  // CONFIG_ENTROPY_STATS
-            if (bit1) {
-              update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
-                         ref1 == GOLDEN_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
-                                  [ref1 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-            }
-          }
-        } else {
-          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
-          update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit) {
-            update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
-                            [ref0 == LAST2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
-                            [ref0 == GOLDEN_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-          update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
-                     2);
-#if CONFIG_ENTROPY_STATS
-          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
-                             [ref1 == ALTREF_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (ref1 != ALTREF_FRAME) {
-            update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
-                       ref1 == ALTREF2_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
-                               [ref1 == ALTREF2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        }
-      } else {
-        const int bit = (ref0 >= BWDREF_FRAME);
-        update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
-#if CONFIG_ENTROPY_STATS
-        counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
-#endif  // CONFIG_ENTROPY_STATS
-        if (bit) {
-          assert(ref0 <= ALTREF_FRAME);
-          update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
-                     2);
-#if CONFIG_ENTROPY_STATS
-          counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
-                            [ref0 == ALTREF_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (ref0 != ALTREF_FRAME) {
-            update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
-                       ref0 == ALTREF2_FRAME, 2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
-                              [ref0 == ALTREF2_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        } else {
-          const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
-          update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
-#endif  // CONFIG_ENTROPY_STATS
-          if (!bit1) {
-            update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
-                              [ref0 != LAST_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
-                       2);
-#if CONFIG_ENTROPY_STATS
-            counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
-                              [ref0 != LAST3_FRAME]++;
-#endif  // CONFIG_ENTROPY_STATS
-          }
-        }
-      }
-
-      if (cm->seq_params.enable_interintra_compound &&
-          is_interintra_allowed(mbmi)) {
-        const int bsize_group = size_group_lookup[bsize];
-        if (mbmi->ref_frame[1] == INTRA_FRAME) {
-#if CONFIG_ENTROPY_STATS
-          counts->interintra[bsize_group][1]++;
-#endif
-          update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
-#if CONFIG_ENTROPY_STATS
-          counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
-#endif
-          update_cdf(fc->interintra_mode_cdf[bsize_group],
-                     mbmi->interintra_mode, INTERINTRA_MODES);
-          if (av1_is_wedge_used(bsize)) {
-#if CONFIG_ENTROPY_STATS
-            counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
-#endif
-            update_cdf(fc->wedge_interintra_cdf[bsize],
-                       mbmi->use_wedge_interintra, 2);
-            if (mbmi->use_wedge_interintra) {
-#if CONFIG_ENTROPY_STATS
-              counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
-#endif
-              update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
-                         16);
-            }
-          }
-        } else {
-#if CONFIG_ENTROPY_STATS
-          counts->interintra[bsize_group][0]++;
-#endif
-          update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
-        }
-      }
-
-      const MOTION_MODE motion_allowed =
-          cm->features.switchable_motion_mode
-              ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                    cm->features.allow_warped_motion)
-              : SIMPLE_TRANSLATION;
-      if (mbmi->ref_frame[1] != INTRA_FRAME) {
-        if (motion_allowed == WARPED_CAUSAL) {
-#if CONFIG_ENTROPY_STATS
-          counts->motion_mode[bsize][mbmi->motion_mode]++;
-#endif
-          update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
-                     MOTION_MODES);
-        } else if (motion_allowed == OBMC_CAUSAL) {
-#if CONFIG_ENTROPY_STATS
-          counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
-#endif
-          update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
-        }
-      }
-
-      if (has_second_ref(mbmi)) {
-        assert(current_frame->reference_mode != SINGLE_REFERENCE &&
-               is_inter_compound_mode(mbmi->mode) &&
-               mbmi->motion_mode == SIMPLE_TRANSLATION);
-
-        const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                         cm->seq_params.enable_masked_compound;
-        if (masked_compound_used) {
-          const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
-#if CONFIG_ENTROPY_STATS
-          ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
-#endif
-          update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
-                     mbmi->comp_group_idx, 2);
-        }
-
-        if (mbmi->comp_group_idx == 0) {
-          const int comp_index_ctx = get_comp_index_context(cm, xd);
-#if CONFIG_ENTROPY_STATS
-          ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
-#endif
-          update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
-                     2);
-        } else {
-          assert(masked_compound_used);
-          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
-#if CONFIG_ENTROPY_STATS
-            ++counts->compound_type[bsize][mbmi->interinter_comp.type -
-                                           COMPOUND_WEDGE];
-#endif
-            update_cdf(fc->compound_type_cdf[bsize],
-                       mbmi->interinter_comp.type - COMPOUND_WEDGE,
-                       MASKED_COMPOUND_TYPES);
-          }
-        }
-      }
-      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
-        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
-#if CONFIG_ENTROPY_STATS
-          counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
-#endif
-          update_cdf(fc->wedge_idx_cdf[bsize],
-                     mbmi->interinter_comp.wedge_index, 16);
-        }
-      }
-    }
-  }
-
-  if (inter_block && cm->features.interp_filter == SWITCHABLE &&
-      mbmi->motion_mode != WARPED_CAUSAL &&
-      !is_nontrans_global_motion(xd, mbmi)) {
-    update_filter_type_cdf(xd, mbmi);
-  }
-  if (inter_block &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const PREDICTION_MODE mode = mbmi->mode;
-    const int16_t mode_ctx =
-        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-    if (has_second_ref(mbmi)) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
-#endif
-      update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
-                 INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
-    } else {
-      update_inter_mode_stats(fc, counts, mode, mode_ctx);
-    }
-
-    const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
-    if (new_mv) {
-      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-      for (int idx = 0; idx < 2; ++idx) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-          const uint8_t drl_ctx =
-              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
-          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
-#if CONFIG_ENTROPY_STATS
-          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
-#endif
-          if (mbmi->ref_mv_idx == idx) break;
-        }
-      }
-    }
-
-    if (have_nearmv_in_inter_mode(mbmi->mode)) {
-      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-      for (int idx = 1; idx < 3; ++idx) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
-          const uint8_t drl_ctx =
-              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
-          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
-#if CONFIG_ENTROPY_STATS
-          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
-#endif
-          if (mbmi->ref_mv_idx == idx - 1) break;
-        }
-      }
-    }
-    if (have_newmv_in_inter_mode(mbmi->mode)) {
-      const int allow_hp = cm->features.cur_frame_force_integer_mv
-                               ? MV_SUBPEL_NONE
-                               : cm->features.allow_high_precision_mv;
-      if (new_mv) {
-        for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-          const int_mv ref_mv = av1_get_ref_mv(x, ref);
-          av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                              allow_hp);
-        }
-      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
-        const int ref = 1;
-        const int_mv ref_mv = av1_get_ref_mv(x, ref);
-        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                            allow_hp);
-      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
-        const int ref = 0;
-        const int_mv ref_mv = av1_get_ref_mv(x, ref);
-        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
-                            allow_hp);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void restore_context(MACROBLOCK *x,
-                                       const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       const int num_planes) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  int p;
-  const int num_4x4_blocks_wide = mi_size_wide[bsize];
-  const int num_4x4_blocks_high = mi_size_high[bsize];
-  int mi_width = mi_size_wide[bsize];
-  int mi_height = mi_size_high[bsize];
-  for (p = 0; p < num_planes; p++) {
-    int tx_col = mi_col;
-    int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(
-        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-        ctx->a + num_4x4_blocks_wide * p,
-        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
-            xd->plane[p].subsampling_x);
-    memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
-           ctx->l + num_4x4_blocks_high * p,
-           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
-               xd->plane[p].subsampling_y);
-  }
-  memcpy(xd->above_partition_context + mi_col, ctx->sa,
-         sizeof(*xd->above_partition_context) * mi_width);
-  memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
-         sizeof(xd->left_partition_context[0]) * mi_height);
-  xd->above_txfm_context = ctx->p_ta;
-  xd->left_txfm_context = ctx->p_tl;
-  memcpy(xd->above_txfm_context, ctx->ta,
-         sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(xd->left_txfm_context, ctx->tl,
-         sizeof(*xd->left_txfm_context) * mi_height);
-}
-
-static AOM_INLINE void save_context(const MACROBLOCK *x,
-                                    RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                    const int num_planes) {
-  const MACROBLOCKD *xd = &x->e_mbd;
-  int p;
-  int mi_width = mi_size_wide[bsize];
-  int mi_height = mi_size_high[bsize];
-
-  // buffer the above/left context information of the block in search.
-  for (p = 0; p < num_planes; ++p) {
-    int tx_col = mi_col;
-    int tx_row = mi_row & MAX_MIB_MASK;
-    memcpy(
-        ctx->a + mi_width * p,
-        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
-        (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
-    memcpy(ctx->l + mi_height * p,
-           xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
-           (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
-  }
-  memcpy(ctx->sa, xd->above_partition_context + mi_col,
-         sizeof(*xd->above_partition_context) * mi_width);
-  memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
-         sizeof(xd->left_partition_context[0]) * mi_height);
-  memcpy(ctx->ta, xd->above_txfm_context,
-         sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(ctx->tl, xd->left_txfm_context,
-         sizeof(*xd->left_txfm_context) * mi_height);
-  ctx->p_ta = xd->above_txfm_context;
-  ctx->p_tl = xd->left_txfm_context;
-}
-
-static AOM_INLINE void encode_b(const AV1_COMP *const cpi,
-                                TileDataEnc *tile_data, ThreadData *td,
-                                TOKENEXTRA **tp, int mi_row, int mi_col,
-                                RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                                PARTITION_TYPE partition,
-                                PICK_MODE_CONTEXT *const ctx, int *rate) {
-  TileInfo *const tile = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-  const int origin_mult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->partition = partition;
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
-
-  if (!dry_run) {
-    x->mbmi_ext_frame->cb_offset = x->cb_offset;
-    assert(x->cb_offset <
-           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
-  }
-
-  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
-
-  if (!dry_run) {
-    const AV1_COMMON *const cm = &cpi->common;
-    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-    if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
-        cm->delta_q_info.delta_lf_present_flag) {
-      const int frame_lf_count =
-          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
-        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
-      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
-    }
-    if (has_second_ref(mbmi)) {
-      if (mbmi->compound_idx == 0 ||
-          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
-        mbmi->comp_group_idx = 0;
-      else
-        mbmi->comp_group_idx = 1;
-    }
-
-    // delta quant applies to both intra and inter
-    const int super_block_upper_left =
-        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
-    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-    if (delta_q_info->delta_q_present_flag &&
-        (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
-        super_block_upper_left) {
-      xd->current_qindex = mbmi->current_qindex;
-      if (delta_q_info->delta_lf_present_flag) {
-        if (delta_q_info->delta_lf_multi) {
-          const int frame_lf_count =
-              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
-          }
-        } else {
-          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
-        }
-      }
-    }
-
-    RD_COUNTS *rdc = &td->rd_counts;
-    if (mbmi->skip_mode) {
-      assert(!frame_is_intra_only(cm));
-      rdc->skip_mode_used_flag = 1;
-      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-        assert(has_second_ref(mbmi));
-        rdc->compound_ref_used_flag = 1;
-      }
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    } else {
-      const int seg_ref_active =
-          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-      if (!seg_ref_active) {
-        // If the segment reference feature is enabled we have only a single
-        // reference frame allowed for the segment so exclude it from
-        // the reference frame counts used to work out probabilities.
-        if (is_inter_block(mbmi)) {
-          av1_collect_neighbors_ref_counts(xd);
-          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
-            if (has_second_ref(mbmi)) {
-              // This flag is also updated for 4x4 blocks
-              rdc->compound_ref_used_flag = 1;
-            }
-          }
-          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-        }
-      }
-    }
-
-    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
-
-    // Gather obmc and warped motion count to update the probability.
-    if ((!cpi->sf.inter_sf.disable_obmc &&
-         cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) ||
-        (cm->features.allow_warped_motion &&
-         cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
-      const int inter_block = is_inter_block(mbmi);
-      const int seg_ref_active =
-          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-      if (!seg_ref_active && inter_block) {
-        const MOTION_MODE motion_allowed =
-            cm->features.switchable_motion_mode
-                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
-                                      cm->features.allow_warped_motion)
-                : SIMPLE_TRANSLATION;
-
-        if (mbmi->ref_frame[1] != INTRA_FRAME) {
-          if (motion_allowed >= OBMC_CAUSAL) {
-            td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
-          }
-          if (motion_allowed == WARPED_CAUSAL) {
-            td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
-          }
-        }
-      }
-    }
-  }
-  // TODO(Ravi/Remya): Move this copy function to a better logical place
-  // This function will copy the best mode information from block
-  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
-  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
-  // bitstream preparation.
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, x->mbmi_ext,
-                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
-  x->rdmult = origin_mult;
-}
-
-static AOM_INLINE void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
-                                 TileDataEnc *tile_data, TOKENEXTRA **tp,
-                                 int mi_row, int mi_col, RUN_TYPE dry_run,
-                                 BLOCK_SIZE bsize, PC_TREE *pc_tree,
-                                 int *rate) {
-  assert(bsize < BLOCK_SIZES_ALL);
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(bsize < BLOCK_SIZES_ALL);
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int is_partition_root = bsize >= BLOCK_8X8;
-  const int ctx = is_partition_root
-                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                      : -1;
-  const PARTITION_TYPE partition = pc_tree->partitioning;
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  int quarter_step = mi_size_wide[bsize] / 4;
-  int i;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  if (!dry_run && ctx >= 0) {
-    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
-    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
-
-    if (has_rows && has_cols) {
-#if CONFIG_ENTROPY_STATS
-      td->counts->partition[ctx][partition]++;
-#endif
-
-      if (tile_data->allow_update_cdf) {
-        FRAME_CONTEXT *fc = xd->tile_ctx;
-        update_cdf(fc->partition_cdf[ctx], partition,
-                   partition_cdf_length(bsize));
-      }
-    }
-  }
-
-  switch (partition) {
-    case PARTITION_NONE:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->none, rate);
-      break;
-    case PARTITION_VERT:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->vertical[0], rate);
-      if (mi_col + hbs < mi_params->mi_cols) {
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                 partition, &pc_tree->vertical[1], rate);
-      }
-      break;
-    case PARTITION_HORZ:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontal[0], rate);
-      if (mi_row + hbs < mi_params->mi_rows) {
-        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                 partition, &pc_tree->horizontal[1], rate);
-      }
-      break;
-    case PARTITION_SPLIT:
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
-                pc_tree->split[0], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                pc_tree->split[1], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                pc_tree->split[2], rate);
-      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
-                subsize, pc_tree->split[3], rate);
-      break;
-
-    case PARTITION_HORZ_A:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
-               partition, &pc_tree->horizontala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->horizontala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontala[2], rate);
-      break;
-    case PARTITION_HORZ_B:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
-               partition, &pc_tree->horizontalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
-               bsize2, partition, &pc_tree->horizontalb[2], rate);
-      break;
-    case PARTITION_VERT_A:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
-               partition, &pc_tree->verticala[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
-               partition, &pc_tree->verticala[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-               partition, &pc_tree->verticala[2], rate);
-
-      break;
-    case PARTITION_VERT_B:
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
-               partition, &pc_tree->verticalb[0], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->verticalb[1], rate);
-      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
-               bsize2, partition, &pc_tree->verticalb[2], rate);
-      break;
-    case PARTITION_HORZ_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
-
-        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
-                 partition, &pc_tree->horizontal4[i], rate);
-      }
-      break;
-    case PARTITION_VERT_4:
-      for (i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
-        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
-                 partition, &pc_tree->vertical4[i], rate);
-      }
-      break;
-    default: assert(0 && "Invalid partition type."); break;
-  }
-
-  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-}
-
-static AOM_INLINE void set_partial_sb_partition(
-    const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in,
-    int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize,
-    MB_MODE_INFO **mib) {
-  int bh = bh_in;
-  int r, c;
-  for (r = 0; r < cm->seq_params.mib_size; r += bh) {
-    int bw = bw_in;
-    for (c = 0; c < cm->seq_params.mib_size; c += bw) {
-      const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
-      const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
-      mib[grid_index] = mi + mi_index;
-      mib[grid_index]->sb_type = find_partition_size(
-          bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
-    }
-  }
-}
-
-// This function attempts to set all mode info entries in a given superblock
-// to the same block partition size.
-// However, at the bottom and right borders of the image the requested size
-// may not be allowed in which case this code attempts to choose the largest
-// allowable partition.
-static AOM_INLINE void set_fixed_partitioning(AV1_COMP *cpi,
-                                              const TileInfo *const tile,
-                                              MB_MODE_INFO **mib, int mi_row,
-                                              int mi_col, BLOCK_SIZE bsize) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int mi_rows_remaining = tile->mi_row_end - mi_row;
-  const int mi_cols_remaining = tile->mi_col_end - mi_col;
-  MB_MODE_INFO *const mi_upper_left =
-      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
-  int bh = mi_size_high[bsize];
-  int bw = mi_size_wide[bsize];
-
-  assert(bsize >= mi_params->mi_alloc_bsize &&
-         "Attempted to use bsize < mi_params->mi_alloc_bsize");
-  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
-
-  // Apply the requested partition size to the SB if it is all "in image"
-  if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
-      (mi_rows_remaining >= cm->seq_params.mib_size)) {
-    for (int block_row = 0; block_row < cm->seq_params.mib_size;
-         block_row += bh) {
-      for (int block_col = 0; block_col < cm->seq_params.mib_size;
-           block_col += bw) {
-        const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
-        const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
-        mib[grid_index] = mi_upper_left + mi_index;
-        mib[grid_index]->sb_type = bsize;
-      }
-    }
-  } else {
-    // Else this is a partial SB.
-    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
-                             mi_cols_remaining, bsize, mib);
-  }
-}
-
-static AOM_INLINE void rd_use_partition(
-    AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib,
-    TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate,
-    int64_t *dist, int do_recon, PC_TREE *pc_tree) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  int i;
-  const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-  const PARTITION_TYPE partition =
-      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
-                           : PARTITION_NONE;
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
-  BLOCK_SIZE sub_subsize = BLOCK_4X4;
-  int splits_below = 0;
-  BLOCK_SIZE bs_type = mib[0]->sb_type;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  av1_invalid_rd_stats(&last_part_rdc);
-  av1_invalid_rd_stats(&none_rdc);
-  av1_invalid_rd_stats(&chosen_rdc);
-  av1_invalid_rd_stats(&invalid_rdc);
-
-  pc_tree->partitioning = partition;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-  }
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-      (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 ||
-       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
-        !frame_is_intra_only(cm)))) {
-    // Check if any of the sub blocks are further split.
-    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
-      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
-      splits_below = 1;
-      for (i = 0; i < 4; i++) {
-        int jj = i >> 1, ii = i & 0x01;
-        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
-        if (this_mi && this_mi->sb_type >= sub_subsize) {
-          splits_below = 0;
-        }
-      }
-    }
-
-    // If partition is not none try none unless each of the 4 splits are split
-    // even further..
-    if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + hbs < mi_params->mi_rows &&
-        mi_col + hbs < mi_params->mi_cols) {
-      pc_tree->partitioning = PARTITION_NONE;
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
-
-      if (none_rdc.rate < INT_MAX) {
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-      }
-
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      mib[0]->sb_type = bs_type;
-      pc_tree->partitioning = partition;
-    }
-  }
-
-  switch (partition) {
-    case PARTITION_NONE:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD);
-      break;
-    case PARTITION_HORZ:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                    invalid_rdc, PICK_MODE_RD);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_row + hbs < mi_params->mi_rows) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
-                          NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      invalid_rdc, PICK_MODE_RD);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
-      }
-      break;
-    case PARTITION_VERT:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rdc,
-                    PICK_MODE_RD);
-      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
-          mi_col + hbs < mi_params->mi_cols) {
-        RD_STATS tmp_rdc;
-        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
-        av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
-                          NULL);
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-                      PARTITION_VERT, subsize,
-                      &pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc,
-                      PICK_MODE_RD);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-        last_part_rdc.rdcost += tmp_rdc.rdcost;
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-          none_rdc.rate < INT_MAX && none_rdc.skip == 1) {
-        av1_invalid_rd_stats(&last_part_rdc);
-        break;
-      }
-      last_part_rdc.rate = 0;
-      last_part_rdc.dist = 0;
-      last_part_rdc.rdcost = 0;
-      for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * hbs;
-        int y_idx = (i >> 1) * hbs;
-        int jj = i >> 1, ii = i & 0x01;
-        RD_STATS tmp_rdc;
-        if ((mi_row + y_idx >= mi_params->mi_rows) ||
-            (mi_col + x_idx >= mi_params->mi_cols))
-          continue;
-
-        av1_init_rd_stats(&tmp_rdc);
-        rd_use_partition(cpi, td, tile_data,
-                         mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
-                         mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
-                         &tmp_rdc.dist, i != 3, pc_tree->split[i]);
-        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-          av1_invalid_rd_stats(&last_part_rdc);
-          break;
-        }
-        last_part_rdc.rate += tmp_rdc.rate;
-        last_part_rdc.dist += tmp_rdc.dist;
-      }
-      break;
-    case PARTITION_VERT_A:
-    case PARTITION_VERT_B:
-    case PARTITION_HORZ_A:
-    case PARTITION_HORZ_B:
-    case PARTITION_HORZ_4:
-    case PARTITION_VERT_4:
-      assert(0 && "Cannot handle extended partition types");
-    default: assert(0); break;
-  }
-
-  if (last_part_rdc.rate < INT_MAX) {
-    last_part_rdc.rate += x->partition_cost[pl][partition];
-    last_part_rdc.rdcost =
-        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
-  }
-
-  if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
-      partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
-      (mi_row + bs < mi_params->mi_rows ||
-       mi_row + hbs == mi_params->mi_rows) &&
-      (mi_col + bs < mi_params->mi_cols ||
-       mi_col + hbs == mi_params->mi_cols)) {
-    BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    chosen_rdc.rate = 0;
-    chosen_rdc.dist = 0;
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-    pc_tree->partitioning = PARTITION_SPLIT;
-
-    // Split partition.
-    for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * hbs;
-      int y_idx = (i >> 1) * hbs;
-      RD_STATS tmp_rdc;
-
-      if ((mi_row + y_idx >= mi_params->mi_rows) ||
-          (mi_col + x_idx >= mi_params->mi_cols))
-        continue;
-
-      save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      pc_tree->split[i]->partitioning = PARTITION_NONE;
-      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
-                    PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
-                    invalid_rdc, PICK_MODE_RD);
-
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
-        av1_invalid_rd_stats(&chosen_rdc);
-        break;
-      }
-
-      chosen_rdc.rate += tmp_rdc.rate;
-      chosen_rdc.dist += tmp_rdc.dist;
-
-      if (i != 3)
-        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
-                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
-
-      chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-    }
-    if (chosen_rdc.rate < INT_MAX) {
-      chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
-    }
-  }
-
-  // If last_part is better set the partitioning to that.
-  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mib[0]->sb_type = bsize;
-    if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
-    chosen_rdc = last_part_rdc;
-  }
-  // If none was better set the partitioning to that.
-  if (none_rdc.rdcost < chosen_rdc.rdcost) {
-    if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
-    chosen_rdc = none_rdc;
-  }
-
-  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  // We must have chosen a partitioning and encoding or we'll fail later on.
-  // No other opportunities for success.
-  if (bsize == cm->seq_params.sb_size)
-    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
-
-  if (do_recon) {
-    if (bsize == cm->seq_params.sb_size) {
-      // NOTE: To get estimate for rate due to the tokens, use:
-      // int rate_coeffs = 0;
-      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
-      //           bsize, pc_tree, &rate_coeffs);
-      x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
-                pc_tree, NULL);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  *rate = chosen_rdc.rate;
-  *dist = chosen_rdc.dist;
-  x->rdmult = orig_rdmult;
-}
-
-static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize) {
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  assert(bsize >= BLOCK_8X8);
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-
-  for (int i = 0; i < 4; i++) {
-    int x_idx = (i & 1) * hbs;
-    int y_idx = (i >> 1) * hbs;
-    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
-        (mi_col + x_idx >= cm->mi_params.mi_cols))
-      return 0;
-    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
-            PARTITION_NONE &&
-        subsize != BLOCK_8X8)
-      return 0;
-  }
-  return 1;
-}
-
-static AOM_INLINE int do_slipt_check(BLOCK_SIZE bsize) {
-  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
-}
-
-static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
-                                           TileDataEnc *tile_data,
-                                           MB_MODE_INFO **mib, TOKENEXTRA **tp,
-                                           int mi_row, int mi_col,
-                                           BLOCK_SIZE bsize, PC_TREE *pc_tree) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  // Only square blocks from 8x8 to 128x128 are supported
-  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
-  const int bs = mi_size_wide[bsize];
-  const int hbs = bs / 2;
-  const PARTITION_TYPE partition =
-      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
-                           : PARTITION_NONE;
-  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
-  assert(subsize <= BLOCK_LARGEST);
-  const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-
-  RD_STATS dummy_cost;
-  av1_invalid_rd_stats(&dummy_cost);
-  RD_STATS invalid_rd;
-  av1_invalid_rd_stats(&invalid_rd);
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  pc_tree->partitioning = partition;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_slipt_check(bsize) &&
-          !frame_is_intra_only(cm)) {
-        RD_STATS split_rdc, none_rdc, block_rdc;
-        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-
-        av1_init_rd_stats(&split_rdc);
-        av1_invalid_rd_stats(&none_rdc);
-
-        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
-        for (int i = 0; i < 4; i++) {
-          av1_invalid_rd_stats(&block_rdc);
-          const int x_idx = (i & 1) * hbs;
-          const int y_idx = (i >> 1) * hbs;
-          if (mi_row + y_idx >= mi_params->mi_rows ||
-              mi_col + x_idx >= mi_params->mi_cols)
-            continue;
-          xd->above_txfm_context =
-              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
-          xd->left_txfm_context =
-              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
-          pc_tree->split[i]->partitioning = PARTITION_NONE;
-          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                        &block_rdc, PARTITION_NONE, subsize,
-                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
-          split_rdc.rate += block_rdc.rate;
-          split_rdc.dist += block_rdc.dist;
-
-          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
-                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-        }
-        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-
-        if (none_rdc.rdcost < split_rdc.rdcost) {
-          mib[0]->sb_type = bsize;
-          pc_tree->partitioning = PARTITION_NONE;
-          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                   &pc_tree->none, NULL);
-        } else {
-          mib[0]->sb_type = subsize;
-          pc_tree->partitioning = PARTITION_SPLIT;
-          for (int i = 0; i < 4; i++) {
-            const int x_idx = (i & 1) * hbs;
-            const int y_idx = (i >> 1) * hbs;
-            if (mi_row + y_idx >= mi_params->mi_rows ||
-                mi_col + x_idx >= mi_params->mi_cols)
-              continue;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-        }
-
-      } else {
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                 &pc_tree->none, NULL);
-      }
-      break;
-    case PARTITION_VERT:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rd,
-                    PICK_MODE_NONRD);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
-               PARTITION_VERT, &pc_tree->vertical[0], NULL);
-      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost,
-                      PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                      invalid_rd, PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
-                 PARTITION_VERT, &pc_tree->vertical[1], NULL);
-      }
-      break;
-    case PARTITION_HORZ:
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                    invalid_rd, PICK_MODE_NONRD);
-      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
-               PARTITION_HORZ, &pc_tree->horizontal[0], NULL);
-
-      if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
-        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost,
-                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                      invalid_rd, PICK_MODE_NONRD);
-        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
-                 PARTITION_HORZ, &pc_tree->horizontal[1], NULL);
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
-          is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
-          !frame_is_intra_only(cm) && bsize <= BLOCK_32X32) {
-        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-        RD_STATS split_rdc, none_rdc;
-        av1_invalid_rd_stats(&split_rdc);
-        av1_invalid_rd_stats(&none_rdc);
-        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        xd->above_txfm_context =
-            cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-        xd->left_txfm_context =
-            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-        pc_tree->partitioning = PARTITION_NONE;
-        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
-                      PICK_MODE_NONRD);
-        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 ||
-            none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) {
-          av1_init_rd_stats(&split_rdc);
-          for (int i = 0; i < 4; i++) {
-            RD_STATS block_rdc;
-            av1_invalid_rd_stats(&block_rdc);
-            int x_idx = (i & 1) * hbs;
-            int y_idx = (i >> 1) * hbs;
-            if ((mi_row + y_idx >= mi_params->mi_rows) ||
-                (mi_col + x_idx >= mi_params->mi_cols))
-              continue;
-            xd->above_txfm_context =
-                cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
-            xd->left_txfm_context = xd->left_txfm_context_buffer +
-                                    ((mi_row + y_idx) & MAX_MIB_MASK);
-            pc_tree->split[i]->partitioning = PARTITION_NONE;
-            pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                          &block_rdc, PARTITION_NONE, subsize,
-                          &pc_tree->split[i]->none, invalid_rd,
-                          PICK_MODE_NONRD);
-            split_rdc.rate += block_rdc.rate;
-            split_rdc.dist += block_rdc.dist;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-          restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-          split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-          split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
-        }
-        if (none_rdc.rdcost < split_rdc.rdcost) {
-          mib[0]->sb_type = bsize;
-          pc_tree->partitioning = PARTITION_NONE;
-          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
-                   &pc_tree->none, NULL);
-        } else {
-          mib[0]->sb_type = subsize;
-          pc_tree->partitioning = PARTITION_SPLIT;
-          for (int i = 0; i < 4; i++) {
-            int x_idx = (i & 1) * hbs;
-            int y_idx = (i >> 1) * hbs;
-            if ((mi_row + y_idx >= mi_params->mi_rows) ||
-                (mi_col + x_idx >= mi_params->mi_cols))
-              continue;
-
-            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
-          }
-        }
-      } else {
-        for (int i = 0; i < 4; i++) {
-          int x_idx = (i & 1) * hbs;
-          int y_idx = (i >> 1) * hbs;
-          int jj = i >> 1, ii = i & 0x01;
-          if ((mi_row + y_idx >= mi_params->mi_rows) ||
-              (mi_col + x_idx >= mi_params->mi_cols))
-            continue;
-          nonrd_use_partition(cpi, td, tile_data,
-                              mib + jj * hbs * mi_params->mi_stride + ii * hbs,
-                              tp, mi_row + y_idx, mi_col + x_idx, subsize,
-                              pc_tree->split[i]);
-        }
-      }
-      break;
-    case PARTITION_VERT_A:
-    case PARTITION_VERT_B:
-    case PARTITION_HORZ_A:
-    case PARTITION_HORZ_B:
-    case PARTITION_HORZ_4:
-    case PARTITION_VERT_4:
-      assert(0 && "Cannot handle extended partition types");
-    default: assert(0); break;
-  }
-}
-
-#if !CONFIG_REALTIME_ONLY
-static const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) {
-  assert(frm >= 0);
-  if (frm < 0 ||
-      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
-    return NULL;
-  }
-
-  return &p->stats_buf_ctx->stats_in_start[frm];
-}
-// Checks to see if a super block is on a horizontal image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_params.mi_rows;
-  int is_active_h_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (is_stat_consumption_stage_twopass(cpi)) {
-    const AV1_COMMON *const cm = &cpi->common;
-    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
-    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
-
-    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
-    bottom_edge = AOMMAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
-    is_active_h_edge = 1;
-  }
-  return is_active_h_edge;
-}
-
-// Checks to see if a super block is on a vertical image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_params.mi_cols;
-  int is_active_v_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (is_stat_consumption_stage_twopass(cpi)) {
-    const AV1_COMMON *const cm = &cpi->common;
-    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
-    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
-
-    right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
-    right_edge = AOMMAX(left_edge, right_edge);
-  }
-
-  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
-    is_active_v_edge = 1;
-  }
-  return is_active_v_edge;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
-static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
-}
-
-static INLINE void load_pred_mv(MACROBLOCK *x,
-                                const PICK_MODE_CONTEXT *const ctx) {
-  memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
-}
-
 #if !CONFIG_REALTIME_ONLY
-// Try searching for an encoding for the given subblock. Returns zero if the
-// rdcost is already too high (to tell the caller not to bother searching for
-// encodings of further subblocks)
-static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
-                           TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
-                           int mi_row, int mi_col, BLOCK_SIZE subsize,
-                           RD_STATS best_rdcost, RD_STATS *sum_rdc,
-                           PARTITION_TYPE partition,
-                           PICK_MODE_CONTEXT *prev_ctx,
-                           PICK_MODE_CONTEXT *this_ctx) {
-  MACROBLOCK *const x = &td->mb;
-  const int orig_mult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
-
-  av1_rd_cost_update(x->rdmult, &best_rdcost);
-  if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
-
-  RD_STATS rdcost_remaining;
-  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
-  RD_STATS this_rdc;
-  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
-                subsize, this_ctx, rdcost_remaining, PICK_MODE_RD);
-
-  if (this_rdc.rate == INT_MAX) {
-    sum_rdc->rdcost = INT64_MAX;
-  } else {
-    sum_rdc->rate += this_rdc.rate;
-    sum_rdc->dist += this_rdc.dist;
-    av1_rd_cost_update(x->rdmult, sum_rdc);
-  }
-
-  if (sum_rdc->rdcost >= best_rdcost.rdcost) {
-    x->rdmult = orig_mult;
-    return 0;
-  }
-
-  if (!is_last) {
-    update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
-    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-  }
-
-  x->rdmult = orig_mult;
-  return 1;
-}
-
-static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
-                               TileDataEnc *tile_data, TOKENEXTRA **tp,
-                               PC_TREE *pc_tree, RD_STATS *best_rdc,
-                               PICK_MODE_CONTEXT ctxs[3],
-                               PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                               BLOCK_SIZE bsize, PARTITION_TYPE partition,
-                               int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
-                               int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
-                               int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
-  const MACROBLOCK *const x = &td->mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-  RD_STATS sum_rdc;
-  av1_init_rd_stats(&sum_rdc);
-  sum_rdc.rate = x->partition_cost[pl][partition];
-  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
-                       *best_rdc, &sum_rdc, partition, ctx, &ctxs[0]))
-    return false;
-
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
-                       *best_rdc, &sum_rdc, partition, &ctxs[0], &ctxs[1]))
-    return false;
-
-  if (!rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
-                       *best_rdc, &sum_rdc, partition, &ctxs[1], &ctxs[2]))
-    return false;
-
-  av1_rd_cost_update(x->rdmult, &sum_rdc);
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
-  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
-
-  *best_rdc = sum_rdc;
-  pc_tree->partitioning = partition;
-  return true;
-}
-
-static AOM_INLINE void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
-  pc_tree->partitioning = PARTITION_NONE;
-  pc_tree->none.rd_stats.skip = 0;
-
-  if (bsize >= BLOCK_8X8) {
-    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    for (int idx = 0; idx < 4; ++idx)
-      reset_partition(pc_tree->split[idx], subsize);
-  }
-}
-
-// Record the ref frames that have been selected by square partition blocks.
-static AOM_INLINE void update_picked_ref_frames_mask(MACROBLOCK *const x,
-                                                     int ref_type,
-                                                     BLOCK_SIZE bsize,
-                                                     int mib_size, int mi_row,
-                                                     int mi_col) {
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-  const int sb_size_mask = mib_size - 1;
-  const int mi_row_in_sb = mi_row & sb_size_mask;
-  const int mi_col_in_sb = mi_col & sb_size_mask;
-  const int mi_size = mi_size_wide[bsize];
-  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
-    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
-      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
-    }
-  }
-}
-
-// Structure to keep win flags for HORZ and VERT partition evaluations
-typedef struct {
-  bool horz_win;
-  bool vert_win;
-} RD_RECT_PART_WIN_INFO;
-
-// Decide whether to evaluate the AB partition specified by part_type based on
-// split and HORZ/VERT info
-int evaluate_ab_partition_based_on_split(
-    PC_TREE *pc_tree, PARTITION_TYPE rect_part,
-    RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
-    int split_idx2) {
-  int num_win = 0;
-  // Threshold for number of winners
-  // Conservative pruning for high quantizers
-  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
-  bool sub_part_win = (rect_part_win_info == NULL)
-                          ? (pc_tree->partitioning == rect_part)
-                          : (rect_part == PARTITION_HORZ)
-                                ? rect_part_win_info->horz_win
-                                : rect_part_win_info->vert_win;
-  num_win += (sub_part_win) ? 1 : 0;
-  num_win +=
-      (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
-  num_win +=
-      (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
-  if (num_win < num_win_thresh) {
-    return 0;
-  }
-  return 1;
-}
-
-// Searches for the best partition pattern for a block based on the
-// rate-distortion cost, and returns a bool value to indicate whether a valid
-// partition pattern is found. The partition can recursively go down to
-// the smallest block size.
-//
-// Inputs:
-//     cpi: the global compressor setting
-//     td: thread data
-//     tile_data: tile data
-//     tp: the pointer to the start token
-//     mi_row: row coordinate of the block in a step size of MI_SIZE
-//     mi_col: column coordinate of the block in a step size of MI_SIZE
-//     bsize: block size
-//     max_sq_part: the largest square block size for prediction blocks
-//     min_sq_part: the smallest square block size for prediction blocks
-//     rd_cost: the pointer to the final rd cost of the current block
-//     best_rdc: the upper bound of rd cost for a valid partition
-//     pc_tree: the pointer to the PC_TREE node storing the picked partitions
-//              and mode info for the current block
-//     none_rd: the pointer to the rd cost in the case of not splitting the
-//              current block
-//     multi_pass_mode: SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
-//     rect_part_win_info: the pointer to a struct storing whether horz/vert
-//                         partition outperforms previously tested partitions
-//
-// Output:
-//     a bool value indicating whether a valid partition is found
-static bool rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
-                              TileDataEnc *tile_data, TOKENEXTRA **tp,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
-                              RD_STATS *rd_cost, RD_STATS best_rdc,
-                              PC_TREE *pc_tree, int64_t *none_rd,
-                              SB_MULTI_PASS_MODE multi_pass_mode,
-                              RD_RECT_PART_WIN_INFO *rect_part_win_info) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int num_planes = av1_num_planes(cm);
-  TileInfo *const tile_info = &tile_data->tile_info;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int mi_step = mi_size_wide[bsize] / 2;
-  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
-  const TOKENEXTRA *const tp_orig = *tp;
-  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-  int tmp_partition_cost[PARTITION_TYPES];
-  BLOCK_SIZE subsize;
-  RD_STATS this_rdc, sum_rdc;
-  const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
-  int do_square_split = bsize_at_least_8x8;
-  const int pl = bsize_at_least_8x8
-                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
-                     : 0;
-  const int *partition_cost = x->partition_cost[pl];
-
-  int do_rectangular_split = cpi->oxcf.enable_rect_partitions;
-  int64_t cur_none_rd = 0;
-  int64_t split_rd[4] = { 0, 0, 0, 0 };
-  int64_t horz_rd[2] = { 0, 0 };
-  int64_t vert_rd[2] = { 0, 0 };
-  int prune_horz = 0;
-  int prune_vert = 0;
-  int terminate_partition_search = 0;
-
-  int split_ctx_is_ready[2] = { 0, 0 };
-  int horz_ctx_is_ready = 0;
-  int vert_ctx_is_ready = 0;
-  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
-  // Initialise HORZ and VERT win flags as true for all split partitions
-  RD_RECT_PART_WIN_INFO split_part_rect_win[4] = {
-    { true, true }, { true, true }, { true, true }, { true, true }
-  };
-
-  bool found_best_partition = false;
-  if (best_rdc.rdcost < 0) {
-    av1_invalid_rd_stats(rd_cost);
-    return found_best_partition;
-  }
-
-  if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
-    x->quad_tree_idx = 0;
-    x->cnn_output_valid = 0;
-  }
-
-  if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
-
-  // Override skipping rectangular partition operations for edge blocks
-  const int has_rows = (mi_row + mi_step < mi_params->mi_rows);
-  const int has_cols = (mi_col + mi_step < mi_params->mi_cols);
-  const int xss = x->e_mbd.plane[1].subsampling_x;
-  const int yss = x->e_mbd.plane[1].subsampling_y;
-
-  if (none_rd) *none_rd = 0;
-  int partition_none_allowed = has_rows && has_cols;
-  int partition_horz_allowed =
-      has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
-                           yss) != BLOCK_INVALID;
-  int partition_vert_allowed =
-      has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
-                           yss) != BLOCK_INVALID;
-
-  (void)*tp_orig;
-
-#if CONFIG_COLLECT_PARTITION_STATS
-  int partition_decisions[EXT_PARTITION_TYPES] = { 0 };
-  int partition_attempts[EXT_PARTITION_TYPES] = { 0 };
-  int64_t partition_times[EXT_PARTITION_TYPES] = { 0 };
-  struct aom_usec_timer partition_timer = { 0 };
-  int partition_timer_on = 0;
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  PartitionStats *part_stats = &cpi->partition_stats;
-#endif
-#endif
-
-  // Override partition costs at the edges of the frame in the same
-  // way as in read_partition (see decodeframe.c)
-  if (!(has_rows && has_cols)) {
-    assert(bsize_at_least_8x8 && pl >= 0);
-    const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
-    const int max_cost = av1_cost_symbol(0);
-    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = max_cost;
-    if (has_cols) {
-      // At the bottom, the two possibilities are HORZ and SPLIT
-      aom_cdf_prob bot_cdf[2];
-      partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
-      static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
-    } else if (has_rows) {
-      // At the right, the two possibilities are VERT and SPLIT
-      aom_cdf_prob rhs_cdf[2];
-      partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
-      static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
-      av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
-    } else {
-      // At the bottom right, we always split
-      tmp_partition_cost[PARTITION_SPLIT] = 0;
-    }
-
-    partition_cost = tmp_partition_cost;
-  }
-
-#ifndef NDEBUG
-  // Nothing should rely on the default value of this array (which is just
-  // leftover from encoding the previous block. Setting it to fixed pattern
-  // when debugging.
-  // bit 0, 1, 2 are blk_skip of each plane
-  // bit 4, 5, 6 are initialization checking of each plane
-  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
-#endif  // NDEBUG
-
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  av1_init_rd_stats(&this_rdc);
-
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-
-  // Save rdmult before it might be changed, so it can be restored later.
-  const int orig_rdmult = x->rdmult;
-  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-
-  av1_rd_cost_update(x->rdmult, &best_rdc);
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_log_block_var(cpi, x, bsize);
-
-  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
-    partition_horz_allowed &= !has_rows;
-    partition_vert_allowed &= !has_cols;
-  }
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-
-  const int try_intra_cnn_split =
-      !cpi->is_screen_content_type && frame_is_intra_only(cm) &&
-      cpi->sf.part_sf.intra_cnn_split &&
-      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
-
-  if (try_intra_cnn_split) {
-    av1_intra_mode_cnn_partition(
-        &cpi->common, x, bsize, x->quad_tree_idx, &partition_none_allowed,
-        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
-        &do_square_split);
-  }
-
-  // Use simple_motion_search to prune partitions. This must be done prior to
-  // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
-  const int try_split_only =
-      !cpi->is_screen_content_type &&
-      cpi->sf.part_sf.simple_motion_search_split && do_square_split &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
-      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
-
-  if (try_split_only) {
-    av1_simple_motion_search_based_split(
-        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
-        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
-        &do_square_split);
-  }
-
-  const int try_prune_rect =
-      !cpi->is_screen_content_type &&
-      cpi->sf.part_sf.simple_motion_search_prune_rect &&
-      !frame_is_intra_only(cm) && do_rectangular_split &&
-      (do_square_split || partition_none_allowed ||
-       (prune_horz && prune_vert)) &&
-      (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
-
-  if (try_prune_rect) {
-    av1_simple_motion_search_prune_rect(
-        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_horz_allowed,
-        &partition_vert_allowed, &prune_horz, &prune_vert);
-  }
-
-  // Max and min square partition levels are defined as the partition nodes that
-  // the recursive function rd_pick_partition() can reach. To implement this:
-  // only PARTITION_NONE is allowed if the current node equals min_sq_part,
-  // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part.
-  assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
-  assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
-  assert(min_sq_part <= max_sq_part);
-  assert(block_size_wide[bsize] == block_size_high[bsize]);
-  const int max_partition_size = block_size_wide[max_sq_part];
-  const int min_partition_size = block_size_wide[min_sq_part];
-  const int blksize = block_size_wide[bsize];
-  assert(min_partition_size <= max_partition_size);
-  const int is_le_min_sq_part = blksize <= min_partition_size;
-  const int is_gt_max_sq_part = blksize > max_partition_size;
-  if (is_gt_max_sq_part) {
-    // If current block size is larger than max, only allow split.
-    partition_none_allowed = 0;
-    partition_horz_allowed = 0;
-    partition_vert_allowed = 0;
-    do_square_split = 1;
-  } else if (is_le_min_sq_part) {
-    // If current block size is less or equal to min, only allow none if valid
-    // block large enough; only allow split otherwise.
-    partition_horz_allowed = 0;
-    partition_vert_allowed = 0;
-    // only disable square split when current block is not at the picture
-    // boundary. otherwise, inherit the square split flag from previous logic
-    if (has_rows && has_cols) do_square_split = 0;
-    partition_none_allowed = !do_square_split;
-  }
-
-BEGIN_PARTITION_SEARCH:
-  if (x->must_find_valid_partition) {
-    do_square_split = bsize_at_least_8x8 && (blksize > min_partition_size);
-    partition_none_allowed =
-        has_rows && has_cols && (blksize >= min_partition_size);
-    partition_horz_allowed =
-        has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-        (blksize > min_partition_size) &&
-        get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
-                             yss) != BLOCK_INVALID;
-    partition_vert_allowed =
-        has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
-        (blksize > min_partition_size) &&
-        get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
-                             yss) != BLOCK_INVALID;
-    terminate_partition_search = 0;
-  }
-
-  // Partition block source pixel variance.
-  unsigned int pb_source_variance = UINT_MAX;
-
-  // Partition block sse after simple motion compensation, not in use now,
-  // but will be used for upcoming speed features
-  unsigned int pb_simple_motion_pred_sse = UINT_MAX;
-  (void)pb_simple_motion_pred_sse;
-
-  // PARTITION_NONE
-  if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
-  assert(terminate_partition_search == 0);
-  int64_t part_none_rd = INT64_MAX;
-  if (cpi->is_screen_content_type)
-    partition_none_allowed = has_rows && has_cols;
-  if (partition_none_allowed && !is_gt_max_sq_part) {
-    int pt_cost = 0;
-    if (bsize_at_least_8x8) {
-      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
-                    ? partition_cost[PARTITION_NONE]
-                    : 0;
-    }
-    RD_STATS partition_rdcost;
-    av1_init_rd_stats(&partition_rdcost);
-    partition_rdcost.rate = pt_cost;
-    av1_rd_cost_update(x->rdmult, &partition_rdcost);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &partition_rdcost,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_NONE] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
-                  bsize, ctx_none, best_remain_rdcost, PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_NONE] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    pb_source_variance = x->source_variance;
-    pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
-    if (none_rd) *none_rd = this_rdc.rdcost;
-    cur_none_rd = this_rdc.rdcost;
-    if (this_rdc.rate != INT_MAX) {
-      if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
-        const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
-        update_picked_ref_frames_mask(x, ref_type, bsize,
-                                      cm->seq_params.mib_size, mi_row, mi_col);
-      }
-      if (bsize_at_least_8x8) {
-        this_rdc.rate += pt_cost;
-        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-      }
-
-      part_none_rd = this_rdc.rdcost;
-      if (this_rdc.rdcost < best_rdc.rdcost) {
-        // Adjust dist breakout threshold according to the partition size.
-        const int64_t dist_breakout_thr =
-            cpi->sf.part_sf.partition_search_breakout_dist_thr >>
-            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
-             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
-        const int rate_breakout_thr =
-            cpi->sf.part_sf.partition_search_breakout_rate_thr *
-            num_pels_log2_lookup[bsize];
-
-        best_rdc = this_rdc;
-        found_best_partition = true;
-        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
-        if (!frame_is_intra_only(cm) &&
-            (do_square_split || do_rectangular_split) &&
-            !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
-          const int use_ml_based_breakout =
-              bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
-              bsize > BLOCK_4X4 && xd->bd == 8;
-          if (use_ml_based_breakout) {
-            if (av1_ml_predict_breakout(cpi, bsize, x, &this_rdc,
-                                        pb_source_variance)) {
-              do_square_split = 0;
-              do_rectangular_split = 0;
-            }
-          }
-
-          // If all y, u, v transform blocks in this partition are skippable,
-          // and the dist & rate are within the thresholds, the partition
-          // search is terminated for current branch of the partition search
-          // tree. The dist & rate thresholds are set to 0 at speed 0 to
-          // disable the early termination at that speed.
-          if (best_rdc.dist < dist_breakout_thr &&
-              best_rdc.rate < rate_breakout_thr) {
-            do_square_split = 0;
-            do_rectangular_split = 0;
-          }
-        }
-
-        if (cpi->sf.part_sf.simple_motion_search_early_term_none &&
-            cm->show_frame && !frame_is_intra_only(cm) &&
-            bsize >= BLOCK_16X16 && mi_row + mi_step < mi_params->mi_rows &&
-            mi_col + mi_step < mi_params->mi_cols &&
-            this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 &&
-            this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
-            (do_square_split || do_rectangular_split)) {
-          av1_simple_motion_search_early_term_none(cpi, x, pc_tree, mi_row,
-                                                   mi_col, bsize, &this_rdc,
-                                                   &terminate_partition_search);
-        }
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // store estimated motion vector
-  if (cpi->sf.mv_sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
-
-  // PARTITION_SPLIT
-  int64_t part_split_rd = INT64_MAX;
-  if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    sum_rdc.rate = partition_cost[PARTITION_SPLIT];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-    int idx;
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_SPLIT] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
-      const int x_idx = (idx & 1) * mi_step;
-      const int y_idx = (idx >> 1) * mi_step;
-
-      if (mi_row + y_idx >= mi_params->mi_rows ||
-          mi_col + x_idx >= mi_params->mi_cols)
-        continue;
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-      pc_tree->split[idx]->index = idx;
-      int64_t *p_split_rd = &split_rd[idx];
-
-      RD_STATS best_remain_rdcost;
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-
-      int curr_quad_tree_idx = 0;
-      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
-        curr_quad_tree_idx = x->quad_tree_idx;
-        x->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1;
-      }
-      if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                             mi_col + x_idx, subsize, max_sq_part, min_sq_part,
-                             &this_rdc, best_remain_rdcost, pc_tree->split[idx],
-                             p_split_rd, multi_pass_mode,
-                             &split_part_rect_win[idx])) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-      if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
-        x->quad_tree_idx = curr_quad_tree_idx;
-      }
-
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-      if (idx <= 1 && (bsize <= BLOCK_8X8 ||
-                       pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
-        const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
-        const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-        // Neither palette mode nor cfl predicted
-        if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-          if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
-        }
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_SPLIT] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    const int reached_last_index = (idx == 4);
-
-    part_split_rd = sum_rdc.rdcost;
-    if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        found_best_partition = true;
-        pc_tree->partitioning = PARTITION_SPLIT;
-      }
-    } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
-      // Skip rectangular partition test when partition type none gives better
-      // rd than partition type split.
-      if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
-        const int partition_none_valid = cur_none_rd > 0;
-        const int partition_none_better = cur_none_rd < sum_rdc.rdcost;
-        do_rectangular_split &=
-            !(partition_none_valid && partition_none_better);
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }  // if (do_split)
-
-  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
-      !frame_is_intra_only(cm) && !terminate_partition_search &&
-      do_rectangular_split &&
-      (partition_horz_allowed || partition_vert_allowed)) {
-    av1_ml_early_term_after_split(cpi, x, pc_tree, bsize, best_rdc.rdcost,
-                                  part_none_rd, part_split_rd, split_rd, mi_row,
-                                  mi_col, &terminate_partition_search);
-  }
-
-  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
-      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
-      (partition_horz_allowed || partition_vert_allowed) &&
-      !(prune_horz || prune_vert) && !terminate_partition_search) {
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    av1_ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
-                                split_rd, &prune_horz, &prune_vert);
-  }
-
-  // PARTITION_HORZ
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
-  if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_HORZ] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
-                  subsize, &pc_tree->horizontal[0], best_remain_rdcost,
-                  PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-
-    if (this_rdc.rate == INT_MAX) {
-      sum_rdc.rdcost = INT64_MAX;
-    } else {
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-    }
-    horz_rd[0] = this_rdc.rdcost;
-
-    if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
-      const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
-      const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
-      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-      // Neither palette mode nor cfl predicted
-      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-        if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
-      }
-      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
-
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-
-      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                    best_remain_rdcost, PICK_MODE_RD);
-      av1_rd_cost_update(x->rdmult, &this_rdc);
-      horz_rd[1] = this_rdc.rdcost;
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        av1_rd_cost_update(x->rdmult, &sum_rdc);
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ] += time;
-      partition_timer_on = 0;
-    }
-#endif
-
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        found_best_partition = true;
-        pc_tree->partitioning = PARTITION_HORZ;
-      }
-    } else {
-      // Update HORZ win flag
-      if (rect_part_win_info != NULL) {
-        rect_part_win_info->horz_win = false;
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // PARTITION_VERT
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
-  if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
-      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    subsize = get_partition_subsize(bsize, PARTITION_VERT);
-
-    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-    sum_rdc.rate = partition_cost[PARTITION_VERT];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-    RD_STATS best_remain_rdcost;
-    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                             &best_remain_rdcost);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_remain_rdcost >= 0) {
-      partition_attempts[PARTITION_VERT] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
-                  subsize, &pc_tree->vertical[0], best_remain_rdcost,
-                  PICK_MODE_RD);
-    av1_rd_cost_update(x->rdmult, &this_rdc);
-
-    if (this_rdc.rate == INT_MAX) {
-      sum_rdc.rdcost = INT64_MAX;
-    } else {
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      av1_rd_cost_update(x->rdmult, &sum_rdc);
-    }
-    vert_rd[0] = this_rdc.rdcost;
-    if (sum_rdc.rdcost < best_rdc.rdcost && has_cols) {
-      const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
-      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-      // Neither palette mode nor cfl predicted
-      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
-        if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
-      }
-      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
-
-      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
-                               &best_remain_rdcost);
-      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                    best_remain_rdcost, PICK_MODE_RD);
-      av1_rd_cost_update(x->rdmult, &this_rdc);
-      vert_rd[1] = this_rdc.rdcost;
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        av1_rd_cost_update(x->rdmult, &sum_rdc);
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT] += time;
-      partition_timer_on = 0;
-    }
-#endif
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_VERT;
-    } else {
-      // Update VERT win flag
-      if (rect_part_win_info != NULL) {
-        rect_part_win_info->vert_win = false;
-      }
-    }
-
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (pb_source_variance == UINT_MAX) {
-    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    if (is_cur_buf_hbd(xd)) {
-      pb_source_variance = av1_high_get_sby_perpixel_variance(
-          cpi, &x->plane[0].src, bsize, xd->bd);
-    } else {
-      pb_source_variance =
-          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-    }
-  }
-
-  if (use_pb_simple_motion_pred_sse(cpi) &&
-      pb_simple_motion_pred_sse == UINT_MAX) {
-    const FULLPEL_MV start_mv = kZeroFullMv;
-    unsigned int var = 0;
-
-    av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0,
-                              &pb_simple_motion_pred_sse, &var);
-  }
-
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split));
-
-  const int ext_partition_allowed =
-      do_rectangular_split &&
-      bsize > cpi->sf.part_sf.ext_partition_eval_thresh && has_rows && has_cols;
-
-  // The standard AB partitions are allowed whenever ext-partition-types are
-  // allowed
-  int horzab_partition_allowed =
-      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
-  int vertab_partition_allowed =
-      ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
-
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
-      // TODO(debargha,huisu@google.com): may need to tune the threshold for
-      // pb_source_variance.
-      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                   (pc_tree->partitioning == PARTITION_NONE &&
-                                    pb_source_variance < 32) ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                   (pc_tree->partitioning == PARTITION_NONE &&
-                                    pb_source_variance < 32) ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-    } else {
-      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                   pc_tree->partitioning == PARTITION_SPLIT);
-    }
-    horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
-    horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
-    vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
-    vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
-    split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
-    split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
-    split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
-    split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
-  }
-  int horza_partition_allowed = horzab_partition_allowed;
-  int horzb_partition_allowed = horzab_partition_allowed;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
-    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
-    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-      case 1:
-        horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
-        horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
-        break;
-      case 2:
-      default:
-        horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
-        horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
-        break;
-    }
-  }
-
-  int verta_partition_allowed = vertab_partition_allowed;
-  int vertb_partition_allowed = vertab_partition_allowed;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
-    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
-    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
-      case 1:
-        verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
-        vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
-        break;
-      case 2:
-      default:
-        verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
-        vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
-        break;
-    }
-  }
-
-  if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed &&
-      partition_horz_allowed && partition_vert_allowed) {
-    // TODO(huisu@google.com): x->source_variance may not be the current
-    // block's variance. The correct one to use is pb_source_variance. Need to
-    // re-train the model to fix it.
-    av1_ml_prune_ab_partition(
-        bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance),
-        best_rdc.rdcost, horz_rd, vert_rd, split_rd, &horza_partition_allowed,
-        &horzb_partition_allowed, &verta_partition_allowed,
-        &vertb_partition_allowed);
-  }
-
-  horza_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  verta_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-  vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      horza_partition_allowed) {
-    horza_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
-  }
-
-  // PARTITION_HORZ_A
-  if (!terminate_partition_search && partition_horz_allowed &&
-      horza_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
-    pc_tree->horizontala[0].rd_mode_is_ready = 0;
-    pc_tree->horizontala[1].rd_mode_is_ready = 0;
-    pc_tree->horizontala[2].rd_mode_is_ready = 0;
-    if (split_ctx_is_ready[0]) {
-      av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none);
-      pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A;
-      pc_tree->horizontala[0].rd_mode_is_ready = 1;
-      if (split_ctx_is_ready[1]) {
-        av1_copy_tree_context(&pc_tree->horizontala[1],
-                              &pc_tree->split[1]->none);
-        pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A;
-        pc_tree->horizontala[1].rd_mode_is_ready = 1;
-      }
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_HORZ_A] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col,
-        bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
-        subsize);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_A] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      horzb_partition_allowed) {
-    horzb_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
-  }
-
-  // PARTITION_HORZ_B
-  if (!terminate_partition_search && partition_horz_allowed &&
-      horzb_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
-    pc_tree->horizontalb[0].rd_mode_is_ready = 0;
-    pc_tree->horizontalb[1].rd_mode_is_ready = 0;
-    pc_tree->horizontalb[2].rd_mode_is_ready = 0;
-    if (horz_ctx_is_ready) {
-      av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]);
-      pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
-      pc_tree->horizontalb[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_HORZ_B] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col,
-        subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
-        mi_col + mi_step, bsize2);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_B] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      verta_partition_allowed) {
-    verta_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
-  }
-
-  // PARTITION_VERT_A
-  if (!terminate_partition_search && partition_vert_allowed &&
-      verta_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
-    pc_tree->verticala[0].rd_mode_is_ready = 0;
-    pc_tree->verticala[1].rd_mode_is_ready = 0;
-    pc_tree->verticala[2].rd_mode_is_ready = 0;
-    if (split_ctx_is_ready[0]) {
-      av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none);
-      pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
-      pc_tree->verticala[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_VERT_A] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col,
-        bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step,
-        subsize);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_A] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (cpi->sf.part_sf.prune_ab_partition_using_split_info &&
-      vertb_partition_allowed) {
-    vertb_partition_allowed &= evaluate_ab_partition_based_on_split(
-        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
-  }
-
-  // PARTITION_VERT_B
-  if (!terminate_partition_search && partition_vert_allowed &&
-      vertb_partition_allowed && !is_gt_max_sq_part) {
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
-    pc_tree->verticalb[0].rd_mode_is_ready = 0;
-    pc_tree->verticalb[1].rd_mode_is_ready = 0;
-    pc_tree->verticalb[2].rd_mode_is_ready = 0;
-    if (vert_ctx_is_ready) {
-      av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]);
-      pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
-      pc_tree->verticalb[0].rd_mode_is_ready = 1;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    {
-      RD_STATS tmp_sum_rdc;
-      av1_init_rd_stats(&tmp_sum_rdc);
-      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B];
-      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
-      if (!frame_is_intra_only(cm) &&
-          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
-        partition_attempts[PARTITION_VERT_B] += 1;
-        aom_usec_timer_start(&partition_timer);
-        partition_timer_on = 1;
-      }
-    }
-#endif
-    found_best_partition |= rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col,
-        subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step,
-        mi_col + mi_step, bsize2);
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_B] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
-  // PARTITION_VERT_4 for this block. This is almost the same as
-  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
-  // blocks, so we require that bsize is not BLOCK_128X128.
-  const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
-                                 ext_partition_allowed &&
-                                 bsize != BLOCK_128X128;
-
-  int partition_horz4_allowed =
-      partition4_allowed && partition_horz_allowed &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), xss,
-                           yss) != BLOCK_INVALID;
-  int partition_vert4_allowed =
-      partition4_allowed && partition_vert_allowed &&
-      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), xss,
-                           yss) != BLOCK_INVALID;
-  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
-    partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
-                                pc_tree->partitioning == PARTITION_HORZ_A ||
-                                pc_tree->partitioning == PARTITION_HORZ_B ||
-                                pc_tree->partitioning == PARTITION_SPLIT ||
-                                pc_tree->partitioning == PARTITION_NONE);
-    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
-                                pc_tree->partitioning == PARTITION_VERT_A ||
-                                pc_tree->partitioning == PARTITION_VERT_B ||
-                                pc_tree->partitioning == PARTITION_SPLIT ||
-                                pc_tree->partitioning == PARTITION_NONE);
-  }
-  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
-      partition_horz_allowed && partition_vert_allowed) {
-    av1_ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning,
-                             best_rdc.rdcost, horz_rd, vert_rd, split_rd,
-                             &partition_horz4_allowed, &partition_vert4_allowed,
-                             pb_source_variance, mi_row, mi_col);
-  }
-
-  if (blksize < (min_partition_size << 2)) {
-    partition_horz4_allowed = 0;
-    partition_vert4_allowed = 0;
-  }
-
-  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
-      (partition_horz4_allowed || partition_vert4_allowed)) {
-    // Count of child blocks in which HORZ or VERT partition has won
-    int num_child_horz_win = 0, num_child_vert_win = 0;
-    for (int idx = 0; idx < 4; idx++) {
-      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
-      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
-    }
-
-    // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
-    // split partiitons.
-    // Conservative pruning for high quantizers
-    const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
-    if (num_child_horz_win < num_win_thresh) {
-      partition_horz4_allowed = 0;
-    }
-    if (num_child_vert_win < num_win_thresh) {
-      partition_vert4_allowed = 0;
-    }
-  }
-
-  // PARTITION_HORZ_4
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
-  if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
-      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    const int quarter_step = mi_size_high[bsize] / 4;
-    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
-
-    subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_HORZ_4] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (int i = 0; i < 4; ++i) {
-      const int this_mi_row = mi_row + i * quarter_step;
-
-      if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
-
-      PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
-
-      ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
-                           mi_col, subsize, best_rdc, &sum_rdc,
-                           PARTITION_HORZ_4, ctx_prev, ctx_this)) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-
-      ctx_prev = ctx_this;
-    }
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_HORZ_4;
-    }
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_HORZ_4] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  // PARTITION_VERT_4
-  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
-  if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
-      (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step)) &&
-      !is_gt_max_sq_part) {
-    av1_init_rd_stats(&sum_rdc);
-    const int quarter_step = mi_size_wide[bsize] / 4;
-    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
-
-    subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
-    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
-
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
-      partition_attempts[PARTITION_VERT_4] += 1;
-      aom_usec_timer_start(&partition_timer);
-      partition_timer_on = 1;
-    }
-#endif
-    for (int i = 0; i < 4; ++i) {
-      const int this_mi_col = mi_col + i * quarter_step;
-
-      if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
-
-      PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
-
-      ctx_this->rd_mode_is_ready = 0;
-      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
-                           this_mi_col, subsize, best_rdc, &sum_rdc,
-                           PARTITION_VERT_4, ctx_prev, ctx_this)) {
-        av1_invalid_rd_stats(&sum_rdc);
-        break;
-      }
-
-      ctx_prev = ctx_this;
-    }
-
-    av1_rd_cost_update(x->rdmult, &sum_rdc);
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      best_rdc = sum_rdc;
-      found_best_partition = true;
-      pc_tree->partitioning = PARTITION_VERT_4;
-    }
-#if CONFIG_COLLECT_PARTITION_STATS
-    if (partition_timer_on) {
-      aom_usec_timer_mark(&partition_timer);
-      int64_t time = aom_usec_timer_elapsed(&partition_timer);
-      partition_times[PARTITION_VERT_4] += time;
-      partition_timer_on = 0;
-    }
-#endif
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-  }
-
-  if (bsize == cm->seq_params.sb_size && !found_best_partition) {
-    // Did not find a valid partition, go back and search again, with less
-    // constraint on which partition types to search.
-    x->must_find_valid_partition = 1;
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-    part_stats->partition_redo += 1;
-#endif
-    goto BEGIN_PARTITION_SEARCH;
-  }
-
-  *rd_cost = best_rdc;
-
-#if CONFIG_COLLECT_PARTITION_STATS
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
-    partition_decisions[pc_tree->partitioning] += 1;
-  }
-#endif
-
-#if CONFIG_COLLECT_PARTITION_STATS == 1
-  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
-  // prediction block
-  FILE *f = fopen("data.csv", "a");
-  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%d,", partition_decisions[idx]);
-  }
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%d,", partition_attempts[idx]);
-  }
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%ld,", partition_times[idx]);
-  }
-  fprintf(f, "\n");
-  fclose(f);
-#endif
-
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
-  // the whole clip. So we need to pass the information upstream to the encoder
-  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
-  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
-  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
-  int64_t *agg_times = part_stats->partition_times[bsize_idx];
-  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    agg_attempts[idx] += partition_attempts[idx];
-    agg_decisions[idx] += partition_decisions[idx];
-    agg_times[idx] += partition_times[idx];
-  }
-#endif
-
-  if (found_best_partition && pc_tree->index != 3) {
-    if (bsize == cm->seq_params.sb_size) {
-      const int emit_output = multi_pass_mode != SB_DRY_PASS;
-      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
-
-      x->cb_offset = 0;
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
-                pc_tree, NULL);
-    } else {
-      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-                pc_tree, NULL);
-    }
-  }
-
-  if (bsize == cm->seq_params.sb_size) {
-    assert(best_rdc.rate < INT_MAX);
-    assert(best_rdc.dist < INT64_MAX);
-  } else {
-    assert(tp_orig == *tp);
-  }
-
-  x->rdmult = orig_rdmult;
-  return found_best_partition;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-#undef NUM_SIMPLE_MOTION_FEATURES
-
-#if !CONFIG_REALTIME_ONLY
-
-static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int analysis_type,
-                            int mi_row, int mi_col, int orig_rdmult) {
-  AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-  int tpl_stride = tpl_frame->stride;
-  int64_t intra_cost = 0;
-  int64_t mc_dep_cost = 0;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-
-  if (tpl_frame->is_valid == 0) return orig_rdmult;
-
-  if (!is_frame_tpl_eligible(cpi)) return orig_rdmult;
-
-  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult;
-
-  int64_t mc_count = 0, mc_saved = 0;
-  int mi_count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  const int step = 1 << block_mis_log2;
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats =
-          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
-      int64_t mc_dep_delta =
-          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                 this_stats->mc_dep_dist);
-      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
-      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-      mc_count += this_stats->mc_count;
-      mc_saved += this_stats->mc_saved;
-      mi_count++;
-    }
-  }
-
-  aom_clear_system_state();
-
-  double beta = 1.0;
-  if (analysis_type == 0) {
-    if (mc_dep_cost > 0 && intra_cost > 0) {
-      const double r0 = cpi->rd.r0;
-      const double rk = (double)intra_cost / mc_dep_cost;
-      beta = (r0 / rk);
-    }
-  } else if (analysis_type == 1) {
-    const double mc_count_base = (mi_count * cpi->rd.mc_count_base);
-    beta = (mc_count + 1.0) / (mc_count_base + 1.0);
-    beta = pow(beta, 0.5);
-  } else if (analysis_type == 2) {
-    const double mc_saved_base = (mi_count * cpi->rd.mc_saved_base);
-    beta = (mc_saved + 1.0) / (mc_saved_base + 1.0);
-    beta = pow(beta, 0.5);
-  }
-
-  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
-
-  aom_clear_system_state();
-
-  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
-  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
-
-  rdmult = AOMMAX(1, rdmult);
-
-  return rdmult;
-}
-
-static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
-                           int mi_col, int64_t *intra_cost_b,
-                           int64_t *inter_cost_b,
-                           int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) {
-  if (!cpi->oxcf.enable_tpl_model) return 0;
-  if (cpi->superres_mode != SUPERRES_NONE) return 0;
-  if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0;
-  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-  if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
-    return 0;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-
-  AV1_COMMON *const cm = &cpi->common;
-  const int gf_group_index = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  int tpl_stride = tpl_frame->stride;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-
-  if (tpl_frame->is_valid == 0) return 0;
-  if (gf_group_index >= MAX_LAG_BUFFERS) return 0;
-
-  int mi_count = 0;
-  int count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  // mi_cols_sr is mi_cols at superres case.
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-
-  // TPL store unit size is not the same as the motion estimation unit size.
-  // Here always use motion estimation size to avoid getting repetitive inter/
-  // intra cost.
-  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
-  const int step = mi_size_wide[tpl_bsize];
-  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
-
-  // Stride is only based on SB size, and we fill in values for every 16x16
-  // block in a SB.
-  *stride = (mi_col_end_sr - mi_col_sr) / step;
-
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      // Handle partial SB, so that no invalid values are used later.
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
-        inter_cost_b[count] = INT64_MAX;
-        intra_cost_b[count] = INT64_MAX;
-        for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-          mv_b[count][i].as_int = INVALID_MV;
-        }
-        count++;
-        continue;
-      }
-
-      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
-          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-      inter_cost_b[count] = this_stats->inter_cost;
-      intra_cost_b[count] = this_stats->intra_cost;
-      memcpy(mv_b[count], this_stats->mv, sizeof(this_stats->mv));
-      mi_count++;
-      count++;
-    }
-  }
-
-  return mi_count;
-}
-
-// analysis_type 0: Use mc_dep_cost and intra_cost
-// analysis_type 1: Use count of best inter predictor chosen
-// analysis_type 2: Use cost reduction from intra to inter for best inter
-//                  predictor chosen
-static int get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                                      int mi_row, int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-  int tpl_stride = tpl_frame->stride;
-  int64_t intra_cost = 0;
-  int64_t mc_dep_cost = 0;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-  const int base_qindex = cm->quant_params.base_qindex;
-
-  if (tpl_frame->is_valid == 0) return base_qindex;
-
-  if (!is_frame_tpl_eligible(cpi)) return base_qindex;
-
-  if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return base_qindex;
-
-  int64_t mc_count = 0, mc_saved = 0;
-  int mi_count = 0;
-  const int mi_col_sr =
-      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
-  const int mi_col_end_sr =
-      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
-  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  const int step = 1 << block_mis_log2;
-  for (int row = mi_row; row < mi_row + mi_high; row += step) {
-    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
-      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats =
-          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
-      int64_t mc_dep_delta =
-          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                 this_stats->mc_dep_dist);
-      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
-      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-      mc_count += this_stats->mc_count;
-      mc_saved += this_stats->mc_saved;
-      mi_count++;
-    }
-  }
-
-  aom_clear_system_state();
-
-  int offset = 0;
-  double beta = 1.0;
-  if (mc_dep_cost > 0 && intra_cost > 0) {
-    const double r0 = cpi->rd.r0;
-    const double rk = (double)intra_cost / mc_dep_cost;
-    beta = (r0 / rk);
-    assert(beta > 0.0);
-  }
-  offset = av1_get_deltaq_offset(cpi, base_qindex, beta);
-  aom_clear_system_state();
-
-  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
-  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
-  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
-  int qindex = cm->quant_params.base_qindex + offset;
-  qindex = AOMMIN(qindex, MAXQ);
-  qindex = AOMMAX(qindex, MINQ);
-
-  return qindex;
-}
-
+/*!\brief Assigns different quantization parameters to each super
+ * block based on its TPL weight.
+ *
+ * \ingroup tpl_modelling
+ *
+ * \param[in]     cpi         Top level encoder instance structure
+ * \param[in,out] td          Thread data structure
+ * \param[in,out] x           Macro block level data for this block.
+ * \param[in]     tile_info   Tile infromation / identification
+ * \param[in]     mi_row      Block row (in "MI_SIZE" units) index
+ * \param[in]     mi_col      Block column (in "MI_SIZE" units) index
+ * \param[out]    num_planes  Number of image planes (e.g. Y,U,V)
+ *
+ * \return No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
 static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
                                      MACROBLOCK *const x,
                                      const TileInfo *const tile_info,
@@ -4121,12 +214,13 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   assert(delta_q_info->delta_q_present_flag);
 
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
   // Delta-q modulation based on variance
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
 
+  const int delta_q_res = delta_q_info->delta_q_res;
   int current_qindex = cm->quant_params.base_qindex;
-  if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
     if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
       const int block_wavelet_energy_level =
           av1_block_wavelet_energy_level(cpi, x, sb_size);
@@ -4139,473 +233,100 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
       current_qindex =
           av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
     }
-  } else if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
-             cpi->oxcf.enable_tpl_model) {
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+             cpi->oxcf.algo_cfg.enable_tpl_model) {
     // Setup deltaq based on tpl stats
-    current_qindex = get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col);
-  }
-
-  const int delta_q_res = delta_q_info->delta_q_res;
-  // Right now aq only works with tpl model. So if tpl is disabled, we set the
-  // current_qindex to base_qindex.
-  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.deltaq_mode != NO_DELTA_Q) {
     current_qindex =
-        clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res);
-  } else {
-    current_qindex = cm->quant_params.base_qindex;
+        av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+    current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+    current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
   }
 
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int sign_deltaq_index =
-      current_qindex - xd->current_qindex >= 0 ? 1 : -1;
-  const int deltaq_deadzone = delta_q_res / 4;
-  const int qmask = ~(delta_q_res - 1);
-  int abs_deltaq_index = abs(current_qindex - xd->current_qindex);
-  abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
-  current_qindex = xd->current_qindex + sign_deltaq_index * abs_deltaq_index;
-  current_qindex = AOMMAX(current_qindex, MINQ + 1);
-  assert(current_qindex > 0);
-
-  xd->delta_qindex = current_qindex - cm->quant_params.base_qindex;
-  set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+  current_qindex = av1_adjust_q_from_delta_q_res(
+      delta_q_res, xd->current_base_qindex, current_qindex);
+
+  x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
   xd->mi[0]->current_qindex = current_qindex;
-  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
+  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
 
   // keep track of any non-zero delta-q used
-  td->deltaq_used |= (xd->delta_qindex != 0);
+  td->deltaq_used |= (x->delta_qindex != 0);
 
-  if (cpi->oxcf.deltalf_mode) {
+  if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
     const int delta_lf_res = delta_q_info->delta_lf_res;
     const int lfmask = ~(delta_lf_res - 1);
     const int delta_lf_from_base =
-        ((xd->delta_qindex / 2 + delta_lf_res / 2) & lfmask);
+        ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
     const int8_t delta_lf =
         (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
     const int frame_lf_count =
         av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-    const int mib_size = cm->seq_params.mib_size;
+    const int mib_size = cm->seq_params->mib_size;
 
     // pre-set the delta lf for loop filter. Note that this value is set
     // before mi is assigned for each block in current superblock
     for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
       for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
         const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
-        mi_params->mi_grid_base[grid_idx]->delta_lf_from_base = delta_lf;
+        mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
         for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          mi_params->mi_grid_base[grid_idx]->delta_lf[lf_id] = delta_lf;
+          mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
         }
       }
     }
   }
 }
-#endif  // !CONFIG_REALTIME_ONLY
 
-#define AVG_CDF_WEIGHT_LEFT 3
-#define AVG_CDF_WEIGHT_TOP_RIGHT 1
-
-static AOM_INLINE void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left,
-                                      aom_cdf_prob *cdf_ptr_tr, int num_cdfs,
-                                      int cdf_stride, int nsymbs, int wt_left,
-                                      int wt_tr) {
-  for (int i = 0; i < num_cdfs; i++) {
-    for (int j = 0; j <= nsymbs; j++) {
-      cdf_ptr_left[i * cdf_stride + j] =
-          (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
-                          (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
-                          ((wt_left + wt_tr) / 2)) /
-                         (wt_left + wt_tr));
-      assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
-             cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
-    }
-  }
-}
-
-#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
-  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
-
-#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)           \
-  do {                                                                     \
-    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left;               \
-    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr;                   \
-    int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob);       \
-    int num_cdfs = array_size / cdf_stride;                                \
-    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
-                   wt_left, wt_tr);                                        \
-  } while (0)
-
-static AOM_INLINE void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr,
-                               int wt_left, int wt_tr) {
-  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
-  for (int i = 0; i < 2; i++) {
-    AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
-                MV_CLASSES);
-    AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
-                nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
-                nmv_tr->comps[i].class0_hp_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
-    AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
-                CLASS0_SIZE);
-    AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
-  }
-}
-
-// In case of row-based multi-threading of encoder, since we always
-// keep a top - right sync, we can average the top - right SB's CDFs and
-// the left SB's CDFs and use the same for current SB's encoding to
-// improve the performance. This function facilitates the averaging
-// of CDF and used only when row-mt is enabled in encoder.
-static AOM_INLINE void avg_cdf_symbols(FRAME_CONTEXT *ctx_left,
-                                       FRAME_CONTEXT *ctx_tr, int wt_left,
-                                       int wt_tr) {
-  AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
-  AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
-  AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
-  AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
-  AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
-  AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
-  AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
-  AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
-  AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
-  AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
-  AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
-  AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
-              ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
-  AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
-              MASKED_COMPOUND_TYPES);
-  AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
-  AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
-  AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
-  AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
-              INTERINTRA_MODES);
-  AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
-  AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
-  AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
-              PALETTE_SIZES);
-  AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
-              PALETTE_SIZES);
-  for (int j = 0; j < PALETTE_SIZES; j++) {
-    int nsymbs = j + PALETTE_MIN_SIZE;
-    AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
-                   ctx_tr->palette_y_color_index_cdf[j], nsymbs,
-                   CDF_SIZE(PALETTE_COLORS));
-    AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
-                   ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
-                   CDF_SIZE(PALETTE_COLORS));
-  }
-  AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
-  AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
-  AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
-  AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
-  AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
-  AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
-  AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
-  AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
-  AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2);
-  AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
-  avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
-  avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
-  AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
-  AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS);
-  AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
-  AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
-              ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
-  AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
-  AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
-              FILTER_INTRA_MODES);
-  AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
-              RESTORE_SWITCHABLE_TYPES);
-  AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
-  AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
-  AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
-  AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
-                 UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
-  AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
-  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
-    if (i < 4) {
-      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
-                     CDF_SIZE(10));
-    } else if (i < 16) {
-      AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
-    } else {
-      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
-                     CDF_SIZE(10));
-    }
-  }
-  AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
-              SWITCHABLE_FILTERS);
-  AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
-  AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
-              2 * MAX_ANGLE_DELTA + 1);
-  AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
-                 CDF_SIZE(MAX_TX_DEPTH + 1));
-  AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
-              MAX_TX_DEPTH + 1);
-  AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
-  AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
-  for (int i = 0; i < FRAME_LF_COUNT; i++) {
-    AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
-                DELTA_LF_PROBS + 1);
-  }
-  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
-                 CDF_SIZE(TX_TYPES));
-  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
-                 CDF_SIZE(TX_TYPES));
-  AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
-  AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
-              CFL_ALPHABET_SIZE);
-}
-
-#if !CONFIG_REALTIME_ONLY
-static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
-                                               int mi_row, int mi_col) {
-  const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
-  const int orig_rdmult = cpi->rd.RDMULT;
-
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int gf_group_index = cpi->gf_group.index;
-  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ &&
-      cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
-      cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
-    const int dr =
-        get_rdmult_delta(cpi, sb_size, 0, mi_row, mi_col, orig_rdmult);
-    x->rdmult = dr;
-  }
-}
-#endif
-
-static void source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int shift) {
-  unsigned int tmp_sse;
-  unsigned int tmp_variance;
-  const BLOCK_SIZE bsize = BLOCK_64X64;
-  uint8_t *src_y = cpi->source->y_buffer;
-  int src_ystride = cpi->source->y_stride;
-  uint8_t *last_src_y = cpi->last_source->y_buffer;
-  int last_src_ystride = cpi->last_source->y_stride;
-  uint64_t avg_source_sse_threshold = 100000;        // ~5*5*(64*64)
-  uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
-  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
-#if CONFIG_AV1_HIGHBITDEPTH
-  MACROBLOCKD *xd = &x->e_mbd;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
-#endif
-  src_y += shift;
-  last_src_y += shift;
-  tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
-                                       last_src_ystride, &tmp_sse);
-  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
-  // Detect large lighting change.
-  if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
-    x->content_state_sb = kLowVarHighSumdiff;
-  else if (tmp_sse < avg_source_sse_threshold)
-    x->content_state_sb = kLowSad;
-  else if (tmp_sse > avg_source_sse_threshold_high)
-    x->content_state_sb = kHighSad;
-}
-
-static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
-                                       TileDataEnc *tile_data,
-                                       PC_TREE *const pc_root, TOKENEXTRA **tp,
-                                       const int mi_row, const int mi_col,
-                                       const int seg_skip) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const TileInfo *const tile_info = &tile_data->tile_info;
-  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
-                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  if (sf->rt_sf.source_metrics_sb_nonrd && sb_size == BLOCK_64X64 &&
-      cpi->svc.number_spatial_layers <= 1 &&
-      cm->current_frame.frame_type != KEY_FRAME) {
-    int shift = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
-    source_content_sb(cpi, x, shift);
-  }
-  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-  } else if (cpi->partition_search_skippable_frame) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-  } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
-  }
-  assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
-         cpi->partition_search_skippable_frame ||
-         sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
-  td->mb.cb_offset = 0;
-  nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                      pc_root);
-}
-
-// Memset the mbmis at the current superblock to 0
-static INLINE void reset_mbmi(CommonModeInfoParams *const mi_params,
-                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
-  // size of sb in unit of mi (BLOCK_4X4)
-  const int sb_size_mi = mi_size_wide[sb_size];
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  // size of sb in unit of allocated mi size
-  const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
-  assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
-         "mi is not allocated as a multiple of sb!");
-  assert(mi_params->mi_stride % sb_size_mi == 0 &&
-         "mi_grid_base is not allocated as a multiple of sb!");
-
-  const int mi_rows = mi_size_high[sb_size];
-  for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
-    assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
-           mi_params->mi_stride);
-    const int mi_grid_idx =
-        get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
-    const int alloc_mi_idx =
-        get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
-    memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
-           sb_size_mi * sizeof(*mi_params->mi_grid_base));
-    memset(&mi_params->tx_type_map[mi_grid_idx], 0,
-           sb_size_mi * sizeof(*mi_params->tx_type_map));
-    if (cur_mi_row % mi_alloc_size_1d == 0) {
-      memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
-             sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
-    }
-  }
-}
-
-static INLINE void backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats,
-                                   const AV1_COMP *cpi, ThreadData *td,
-                                   const TileDataEnc *tile_data, int mi_row,
-                                   int mi_col) {
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const TileInfo *tile_info = &tile_data->tile_info;
-
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
-
-  sb_fp_stats->rd_count = cpi->td.rd_counts;
-  sb_fp_stats->split_count = cpi->td.mb.txb_split_count;
-
-  sb_fp_stats->fc = *td->counts;
-
-  memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
-         sizeof(sb_fp_stats->inter_mode_rd_models));
-
-  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
-         sizeof(sb_fp_stats->thresh_freq_fact));
-
-  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
-  sb_fp_stats->current_qindex =
-      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
-         sizeof(sb_fp_stats->mode_chosen_counts));
-#endif  // CONFIG_INTERNAL_STATS
-}
-
-static INLINE void restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats,
-                                    AV1_COMP *cpi, ThreadData *td,
-                                    TileDataEnc *tile_data, int mi_row,
-                                    int mi_col) {
-  MACROBLOCK *x = &td->mb;
-
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-
-  restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
-
-  cpi->td.rd_counts = sb_fp_stats->rd_count;
-  cpi->td.mb.txb_split_count = sb_fp_stats->split_count;
-
-  *td->counts = sb_fp_stats->fc;
-
-  memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
-         sizeof(sb_fp_stats->inter_mode_rd_models));
-  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
-         sizeof(sb_fp_stats->thresh_freq_fact));
-
-  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
-  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
-      sb_fp_stats->current_qindex;
-
-#if CONFIG_INTERNAL_STATS
-  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
-         sizeof(sb_fp_stats->mode_chosen_counts));
-#endif  // CONFIG_INTERNAL_STATS
-}
-
-#if !CONFIG_REALTIME_ONLY
 static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
                                  int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCK *x = &td->mb;
-  const int frame_idx = cpi->gf_group.index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const int frame_idx = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
-  av1_zero(x->search_ref_frame);
+  av1_zero(x->tpl_keep_ref_frame);
 
-  if (tpl_frame->is_valid == 0) return;
-  if (!is_frame_tpl_eligible(cpi)) return;
-  if (frame_idx >= MAX_LAG_BUFFERS) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
-  if (cpi->oxcf.aq_mode != NO_AQ) return;
+  if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
 
-  const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+  const int is_overlay =
+      cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
   if (is_overlay) {
-    memset(x->search_ref_frame, 1, sizeof(x->search_ref_frame));
+    memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
     return;
   }
 
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   const int tpl_stride = tpl_frame->stride;
   int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
   const int step = 1 << block_mis_log2;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
   const int mi_row_end =
       AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
-  const int mi_col_end =
-      AOMMIN(mi_size_wide[sb_size] + mi_col, mi_params->mi_cols);
-
-  for (int row = mi_row; row < mi_row_end; row += step) {
-    for (int col = mi_col; col < mi_col_end; col += step) {
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size],
+                                  cm->superres_scale_denominator),
+             mi_cols_sr);
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row_end; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
       const TplDepStats *this_stats =
           &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
       int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
@@ -4641,12 +362,12 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
     }
   }
 
-  x->search_ref_frame[INTRA_FRAME] = 1;
-  x->search_ref_frame[LAST_FRAME] = 1;
+  x->tpl_keep_ref_frame[INTRA_FRAME] = 1;
+  x->tpl_keep_ref_frame[LAST_FRAME] = 1;
 
   int cutoff_ref = 0;
   for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
-    x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+    x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
     if (idx > 2) {
       if (!cutoff_ref) {
         // If the predictive coding gains are smaller than the previous more
@@ -4658,17 +379,169 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
           cutoff_ref = 1;
       }
 
-      if (cutoff_ref) x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+      if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
     }
   }
 }
+
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+                                               int mi_row, int mi_col) {
+  const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+  const int orig_rdmult = cpi->rd.RDMULT;
+
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int gf_group_index = cpi->gf_frame_index;
+  if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+      cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+      cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+    const int dr =
+        av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
+    x->rdmult = dr;
+  }
+}
 #endif  // !CONFIG_REALTIME_ONLY
 
+#if CONFIG_RT_ML_PARTITIONING
+// Get a prediction(stored in x->est_pred) for the whole superblock.
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+                               MACROBLOCK *x, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  // TODO(kyslov) Extend to 128x128
+  assert(cm->seq_params->sb_size == BLOCK_64X64);
+
+  av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (!is_key_frame) {
+    MB_MODE_INFO *mi = xd->mi[0];
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+    assert(yv12 != NULL);
+
+    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         get_ref_scale_factors(cm, LAST_FRAME), 1);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->bsize = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+    xd->plane[0].dst.buf = x->est_pred;
+    xd->plane[0].dst.stride = 64;
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+  } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+    switch (xd->bd) {
+      case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+      case 10:
+        memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+      case 12:
+        memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+    }
+#else
+    memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+#endif  // CONFIG_RT_ML_PARTITIONING
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock by a pre-determined partition pattern, only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only used
+ * by realtime encoding.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+                                       TileDataEnc *tile_data, TokenExtra **tp,
+                                       const int mi_row, const int mi_col,
+                                       const int seg_skip, PC_TREE *pc_root) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+#if CONFIG_RT_ML_PARTITIONING
+  if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+    RD_STATS dummy_rdc;
+    get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+    av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                             BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+    return;
+  }
+#endif
+  // Set the partition
+  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+    // set a fixed-size partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    const BLOCK_SIZE bsize =
+        seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+  } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+    // set a variance-based partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+  }
+  assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+         sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+  set_cb_offsets(td->mb.cb_offset, 0, 0);
+
+  // Initialize the flag to skip cdef to 1.
+  if (sf->rt_sf.skip_cdef_sb) {
+    // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
+    // "blocks".
+    const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    for (int r = 0; r < block64_in_sb; ++r) {
+      for (int c = 0; c < block64_in_sb; ++c) {
+        const int idx_in_sb =
+            r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+        if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = 1;
+      }
+    }
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, nonrd_use_partition_time);
+#endif
+  av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                          pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, nonrd_use_partition_time);
+#endif
+
+  if (sf->rt_sf.skip_cdef_sb) {
+    // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
+    // "blocks".
+    const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    const int skip = mi[0]->skip_cdef_curr_sb;
+    for (int r = 0; r < block64_in_sb; ++r) {
+      for (int c = 0; c < block64_in_sb; ++c) {
+        const int idx_in_sb =
+            r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+        if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = skip;
+      }
+    }
+  }
+}
+
 // This function initializes the stats for encode_rd_sb.
 static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
                                      const TileDataEnc *tile_data,
-                                     PC_TREE *pc_root, RD_STATS *rd_cost,
-                                     int mi_row, int mi_col,
+                                     SIMPLE_MOTION_DATA_TREE *sms_root,
+                                     RD_STATS *rd_cost, int mi_row, int mi_col,
                                      int gather_tpl_data) {
   const AV1_COMMON *cm = &cpi->common;
   const TileInfo *tile_info = &tile_data->tile_info;
@@ -4682,22 +555,28 @@ static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
        sf->part_sf.ml_early_term_after_part_split_level) &&
       !frame_is_intra_only(cm);
   if (use_simple_motion_search) {
-    init_simple_motion_search_mvs(pc_root);
+    av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root,
+                                             mi_row, mi_col);
   }
 
 #if !CONFIG_REALTIME_ONLY
-  init_ref_frame_space(cpi, td, mi_row, mi_col);
-  x->sb_energy_level = 0;
-  x->cnn_output_valid = 0;
-  if (gather_tpl_data) {
-    if (cm->delta_q_info.delta_q_present_flag) {
-      const int num_planes = av1_num_planes(cm);
-      const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-      setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
-      av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
-    }
-    if (cpi->oxcf.enable_tpl_model) {
-      adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+  if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+        cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
+    init_ref_frame_space(cpi, td, mi_row, mi_col);
+    x->sb_energy_level = 0;
+    x->part_search_info.cnn_output_valid = 0;
+    if (gather_tpl_data) {
+      if (cm->delta_q_info.delta_q_present_flag) {
+        const int num_planes = av1_num_planes(cm);
+        const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+        setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+        av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+      }
+
+      // TODO(jingning): revisit this function.
+      if (cpi->oxcf.algo_cfg.enable_tpl_model && 0) {
+        adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+      }
     }
   }
 #else
@@ -4707,16 +586,20 @@ static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
   (void)gather_tpl_data;
 #endif
 
-  // Reset hash state for transform/mode rd hash information
-  reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+  x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+  reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
   av1_zero(x->picked_ref_frames_mask);
-  av1_zero(x->pred_mv);
   av1_invalid_rd_stats(rd_cost);
 }
 
+/*!\brief Encode a superblock (RD-search-based)
+ *
+ * \ingroup partition_search
+ * Conducts partition search for a superblock, based on rate-distortion costs,
+ * from scratch or adjusting from a pre-calculated partition pattern.
+ */
 static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
-                                    TileDataEnc *tile_data,
-                                    PC_TREE *const pc_root, TOKENEXTRA **tp,
+                                    TileDataEnc *tile_data, TokenExtra **tp,
                                     const int mi_row, const int mi_col,
                                     const int seg_skip) {
   AV1_COMMON *const cm = &cpi->common;
@@ -4725,95 +608,122 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
   const TileInfo *const tile_info = &tile_data->tile_info;
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
                       get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int num_planes = av1_num_planes(cm);
   int dummy_rate;
   int64_t dummy_dist;
   RD_STATS dummy_rdc;
+  SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
 
 #if CONFIG_REALTIME_ONLY
   (void)seg_skip;
 #endif  // CONFIG_REALTIME_ONLY
 
-  init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, 1);
+  init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
+                    1);
 
+  // Encode the superblock
   if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    // partition search starting from a variance-based partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
     av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, rd_use_partition_time);
+#endif
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                         &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, rd_use_partition_time);
+#endif
   }
 #if !CONFIG_REALTIME_ONLY
   else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        seg_skip ? sb_size : sf->part_sf.always_this_block_size;
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
-  } else if (cpi->partition_search_skippable_frame) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    // partition search by adjusting a fixed-size partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
     const BLOCK_SIZE bsize =
-        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                     &dummy_rate, &dummy_dist, 1, pc_root);
+        seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                         &dummy_rate, &dummy_dist, 1, pc_root);
+    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
   } else {
+    // The most exhaustive recursive partition search
+    SuperBlockEnc *sb_enc = &x->sb_enc;
     // No stats for overlay frames. Exclude key frame.
-    x->valid_cost_b =
-        get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b,
-                        x->inter_cost_b, x->mv_b, &x->cost_stride);
+    av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc);
 
-    reset_partition(pc_root, sb_size);
+    // Reset the tree for simple motion search data
+    av1_reset_simple_motion_tree_partition(sms_root, sb_size);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, rd_pick_partition_time);
 #endif
-    BLOCK_SIZE max_sq_size = x->max_partition_size;
-    BLOCK_SIZE min_sq_size = x->min_partition_size;
 
-    if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
-      float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+    // Estimate the maximum square partition block size, which will be used
+    // as the starting block size for partitioning the sb
+    set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col);
 
-      av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
-      max_sq_size = AOMMAX(
-          AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size),
-          min_sq_size);
-    }
-
-    const int num_passes = cpi->oxcf.sb_multipass_unit_test ? 2 : 1;
+    // The superblock can be searched only once, or twice consecutively for
+    // better quality. Note that the meaning of passes here is different from
+    // the general concept of 1-pass/2-pass encoders.
+    const int num_passes =
+        cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
 
     if (num_passes == 1) {
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_SINGLE_PASS, NULL);
+#if CONFIG_PARTITION_SEARCH_ORDER
+      if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
+        av1_reset_part_sf(&cpi->sf.part_sf);
+        av1_reset_sf_for_ext_part(cpi);
+        RD_STATS this_rdc;
+        av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, sb_size, &this_rdc);
+      } else {
+        PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+        av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                              &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+                              SB_SINGLE_PASS, NULL);
+      }
+#else
+      PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+                            SB_SINGLE_PASS, NULL);
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
     } else {
       // First pass
       SB_FIRST_PASS_STATS sb_fp_stats;
-      backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_DRY_PASS, NULL);
+      av1_backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      PC_TREE *const pc_root_p0 = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root_p0, sms_root, NULL,
+                            SB_DRY_PASS, NULL);
 
       // Second pass
-      init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col,
-                        0);
-      reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
-      reset_partition(pc_root, sb_size);
+      init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
+                        mi_col, 0);
+      av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+      av1_reset_simple_motion_tree_partition(sms_root, sb_size);
 
-      restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
+      av1_restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
 
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
-                        max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_WET_PASS, NULL);
+      PC_TREE *const pc_root_p1 = av1_alloc_pc_tree_node(sb_size);
+      av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                            &dummy_rdc, dummy_rdc, pc_root_p1, sms_root, NULL,
+                            SB_WET_PASS, NULL);
     }
     // Reset to 0 so that it wouldn't be used elsewhere mistakenly.
-    x->valid_cost_b = 0;
+    sb_enc->tpl_data_count = 0;
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, rd_pick_partition_time);
 #endif
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
+  // Update the inter rd model
   // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
   if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
       cm->tiles.cols == 1 && cm->tiles.rows == 1) {
@@ -4821,128 +731,161 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
   }
 }
 
-static AOM_INLINE void set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
-                                         const TileInfo *const tile_info,
-                                         const int mi_row, const int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+// Check if the cost update of symbols mode, coeff and dv are tile or off.
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+    const AV1_COMP *const cpi) {
+  const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
 
-  switch (cpi->oxcf.coeff_cost_upd_freq) {
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd &&
-          mi_col != tile_info->mi_col_start)
-        break;
-      av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
-      break;
-    default: assert(0);
-  }
+  return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+          inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+          cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
 
-  switch (cpi->oxcf.mode_cost_upd_freq) {
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      av1_fill_mode_rates(cm, x, xd->tile_ctx);
-      break;
-    default: assert(0);
-  }
-  switch (cpi->oxcf.mv_cost_upd_freq) {
-    case COST_UPD_OFF: break;
-    case COST_UPD_TILE:  // Tile level
-      if (mi_row != tile_info->mi_row_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SBROW:  // SB row level in tile
-      if (mi_col != tile_info->mi_col_start) break;
-      AOM_FALLTHROUGH_INTENDED;
-    case COST_UPD_SB:  // SB level
-      if (cpi->sf.inter_sf.disable_sb_level_mv_cost_upd &&
-          mi_col != tile_info->mi_col_start)
-        break;
-      av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv,
-                        cm->features.allow_high_precision_mv, x);
-      break;
-    default: assert(0);
+// When row-mt is enabled and cost update frequencies are set to off/tile,
+// processing of current SB can start even before processing of top-right SB
+// is finished. This function checks if it is sufficient to wait for top SB
+// to finish processing before current SB starts processing.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+  const MODE mode = cpi->oxcf.mode;
+  if (mode == GOOD) return 0;
+
+  if (mode == ALLINTRA)
+    return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+  else if (mode == REALTIME)
+    return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
+            cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+  else
+    return 0;
+}
+
+/*!\brief Determine whether grading content is needed based on sf and frame stat
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+// TODO(any): consolidate sfs to make interface cleaner
+static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
+                                               MACROBLOCK *const x, int mi_row,
+                                               int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  bool calc_src_content = false;
+
+  if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
+      cpi->svc.number_spatial_layers <= 1 &&
+      cm->current_frame.frame_type != KEY_FRAME) {
+    if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0)
+      calc_src_content = true;
+    else
+      x->content_state_sb.source_sad_nonrd = kZeroSad;
+  } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
+             (cm->width * cm->height <= 352 * 288)) {
+    if (cpi->rc.frame_source_sad > 0)
+      calc_src_content = true;
+    else
+      x->content_state_sb.source_sad_rd = kZeroSad;
   }
+  if (calc_src_content) av1_source_content_sb(cpi, x, mi_row, mi_col);
 }
 
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ */
 static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
                                      TileDataEnc *tile_data, int mi_row,
-                                     TOKENEXTRA **tp) {
+                                     TokenExtra **tp) {
   AV1_COMMON *const cm = &cpi->common;
   const TileInfo *const tile_info = &tile_data->tile_info;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+  bool row_mt_enabled = mt_info->row_mt_enabled;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  const int mib_size = cm->seq_params.mib_size;
-  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int mib_size = cm->seq_params->mib_size;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
   const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
   const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, encode_sb_time);
+  start_timing(cpi, encode_sb_row_time);
 #endif
 
   // Initialize the left context for the new SB row
   av1_zero_left_context(xd);
 
-  // Reset delta for every tile
-  if (mi_row == tile_info->mi_row_start || cpi->row_mt) {
+  // Reset delta for quantizer and loof filters at the beginning of every tile
+  if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
     if (cm->delta_q_info.delta_q_present_flag)
-      xd->current_qindex = cm->quant_params.base_qindex;
+      xd->current_base_qindex = cm->quant_params.base_qindex;
     if (cm->delta_q_info.delta_lf_present_flag) {
       av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
     }
   }
+
   reset_thresh_freq_fact(x);
 
+  // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+  // allocation
+  PC_TREE *const rt_pc_root =
+      use_nonrd_mode ? av1_alloc_pc_tree_node(sb_size) : NULL;
+
   // Code each SB in the row
   for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
        mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
-    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
-                                   sb_col_in_tile);
-    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
-        (tile_info->mi_row_start != mi_row)) {
+    // In realtime/allintra mode and when frequency of cost updates is off/tile,
+    // wait for the top superblock to finish encoding. Otherwise, wait for the
+    // top-right superblock to finish encoding.
+    enc_row_mt->sync_read_ptr(
+        row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+    const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
+    if (update_cdf && (tile_info->mi_row_start != mi_row)) {
       if ((tile_info->mi_col_start == mi_col)) {
-        // restore frame context of 1st column sb
+        // restore frame context at the 1st column sb
         memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
       } else {
+        // update context
         int wt_left = AVG_CDF_WEIGHT_LEFT;
         int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
         if (tile_info->mi_col_end > (mi_col + mib_size))
-          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left,
-                          wt_tr);
+          av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile,
+                              wt_left, wt_tr);
         else
-          avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
-                          wt_left, wt_tr);
+          av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+                              wt_left, wt_tr);
       }
     }
 
-    set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
+    // Update the rate cost tables for some symbols
+    av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
 
+    // Reset color coding related parameters
+    x->color_sensitivity_sb[0] = 0;
+    x->color_sensitivity_sb[1] = 0;
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
-    x->content_state_sb = 0;
+    x->content_state_sb.source_sad_nonrd = kMedSad;
+    x->content_state_sb.source_sad_rd = kMedSad;
+    x->content_state_sb.lighting_change = 0;
+    x->content_state_sb.low_sumdiff = 0;
+    x->force_zeromv_skip = 0;
 
-    PC_TREE *const pc_root = td->pc_root;
-    pc_root->index = 0;
+    if (cpi->oxcf.mode == ALLINTRA) {
+      x->intra_sb_rdmult_modifier = 128;
+    }
 
     xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
-    td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
     x->source_variance = UINT_MAX;
-    x->simple_motion_pred_sse = UINT_MAX;
+    td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
 
+    // Get segment id and skip flag
     const struct segmentation *const seg = &cm->seg;
     int seg_skip = 0;
     if (seg->enabled) {
@@ -4954,26 +897,38 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
+    produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col);
+
+    init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks,
+                                        sb_size);
+
+    // Grade the temporal variation of the sb, the grade will be used to decide
+    // fast mode search strategy for coding blocks
+    grade_source_content_sb(cpi, x, mi_row, mi_col);
+
+    // encode the superblock
     if (use_nonrd_mode) {
-      encode_nonrd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col,
-                      seg_skip);
+      encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip,
+                      rt_pc_root);
     } else {
-      encode_rd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, seg_skip);
+      encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
     }
 
-    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
-        (tile_info->mi_row_end > (mi_row + mib_size))) {
+    // Update the top-right context in row_mt coding
+    if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) {
       if (sb_cols_in_tile == 1)
         memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
       else if (sb_col_in_tile >= 1)
         memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
                sizeof(*xd->tile_ctx));
     }
-    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
-                                    sb_col_in_tile, sb_cols_in_tile);
+    enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile,
+                               sb_cols_in_tile);
   }
+
+  av1_free_pc_tree_recursive(rt_pc_root, av1_num_planes(cm), 0, 0);
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_sb_time);
+  end_timing(cpi, encode_sb_row_time);
 #endif
 }
 
@@ -4985,10 +940,10 @@ static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) {
 
   // Copy data over into macro block data structures.
   av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
-                       cm->seq_params.sb_size);
+                       cm->seq_params->sb_size);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
 }
 
 void av1_alloc_tile_data(AV1_COMP *cpi) {
@@ -5010,69 +965,132 @@ void av1_init_tile_data(AV1_COMP *cpi) {
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
-  TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
-  TOKENLIST *tplist = cpi->tplist[0][0];
+  TokenInfo *const token_info = &cpi->token_info;
+  TokenExtra *pre_tok = token_info->tile_tok[0][0];
+  TokenList *tplist = token_info->tplist[0][0];
   unsigned int tile_tok = 0;
   int tplist_count = 0;
 
+  if (!is_stat_generation_stage(cpi) &&
+      cm->features.allow_screen_content_tools) {
+    // Number of tokens for which token info needs to be allocated.
+    unsigned int tokens_required =
+        get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+                        MAX_SB_SIZE_LOG2, num_planes);
+    // Allocate/reallocate memory for token related info if the number of tokens
+    // required is more than the number of tokens already allocated. This could
+    // occur in case of the following:
+    // 1) If the memory is not yet allocated
+    // 2) If the frame dimensions have changed
+    const bool realloc_tokens = tokens_required > token_info->tokens_allocated;
+    if (realloc_tokens) {
+      free_token_info(token_info);
+      alloc_token_info(cm, token_info, tokens_required);
+      pre_tok = token_info->tile_tok[0][0];
+      tplist = token_info->tplist[0][0];
+    }
+  }
+
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
       TileDataEnc *const tile_data =
           &cpi->tile_data[tile_row * tile_cols + tile_col];
       TileInfo *const tile_info = &tile_data->tile_info;
       av1_tile_init(tile_info, cm, tile_row, tile_col);
-
-      cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
-      pre_tok = cpi->tile_tok[tile_row][tile_col];
-      tile_tok = allocated_tokens(
-          *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
-      cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
-      tplist = cpi->tplist[tile_row][tile_col];
-      tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
+      tile_data->firstpass_top_mv = kZeroMv;
+      tile_data->abs_sum_level = 0;
+
+      if (is_token_info_allocated(token_info)) {
+        token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+        pre_tok = token_info->tile_tok[tile_row][tile_col];
+        tile_tok = allocated_tokens(
+            tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+            num_planes);
+        token_info->tplist[tile_row][tile_col] = tplist + tplist_count;
+        tplist = token_info->tplist[tile_row][tile_col];
+        tplist_count = av1_get_sb_rows_in_tile(cm, tile_info);
+      }
       tile_data->allow_update_cdf = !cm->tiles.large_scale;
-      tile_data->allow_update_cdf =
-          tile_data->allow_update_cdf && !cm->features.disable_cdf_update;
+      tile_data->allow_update_cdf = tile_data->allow_update_cdf &&
+                                    !cm->features.disable_cdf_update &&
+                                    !delay_wait_for_top_right_sb(cpi);
       tile_data->tctx = *cm->fc;
     }
   }
 }
 
-void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
-                       int tile_col, int mi_row) {
-  AV1_COMMON *const cm = &cpi->common;
+// Populate the start palette token info prior to encoding an SB row.
+static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info,
+                                       int tile_row, int tile_col, int mi_row,
+                                       TokenExtra **tp) {
+  const TokenInfo *token_info = &cpi->token_info;
+  if (!is_token_info_allocated(token_info)) return;
+
+  const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const int tile_cols = cm->tiles.cols;
-  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
-  const TileInfo *const tile_info = &this_tile->tile_info;
-  TOKENEXTRA *tok = NULL;
+  TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col];
   const int sb_row_in_tile =
-      (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+
+  get_start_tok(cpi, tile_row, tile_col, mi_row, tp,
+                cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes);
+  assert(tplist != NULL);
+  tplist[sb_row_in_tile].start = *tp;
+}
+
+// Populate the token count after encoding an SB row.
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi,
+                                            const TileInfo *tile_info,
+                                            int tile_row, int tile_col,
+                                            int mi_row, TokenExtra *tok) {
+  const TokenInfo *token_info = &cpi->token_info;
+  if (!is_token_info_allocated(token_info)) return;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TokenList *const tplist = token_info->tplist[tile_row][tile_col];
+  const int sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
   const int tile_mb_cols =
       (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
   const int num_mb_rows_in_sb =
-      ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+      ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+  tplist[sb_row_in_tile].count =
+      (unsigned int)(tok - tplist[sb_row_in_tile].start);
 
-  get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
-                cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+  assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <=
+         get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+                         cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+                         num_planes));
 
-  encode_sb_row(cpi, td, this_tile, mi_row, &tok);
+  (void)num_planes;
+  (void)tile_mb_cols;
+  (void)num_mb_rows_in_sb;
+}
 
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
-  cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
-      (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
-                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+/*!\brief Encode a superblock row
+ *
+ * \ingroup partition_search
+ */
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                       int tile_col, int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+  TokenExtra *tok = NULL;
 
-  assert(
-      (unsigned int)(tok -
-                     cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
-      get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
-                      cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+  get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok);
 
-  (void)tile_mb_cols;
-  (void)num_mb_rows_in_sb;
+  encode_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+  populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok);
 }
 
+/*!\brief Encode a tile
+ *
+ * \ingroup partition_search
+ */
 void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
                      int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5087,26 +1105,40 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                          &td->mb.e_mbd);
 
-  if (cpi->oxcf.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+  if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
 
-  av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
+  if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+    av1_crc32c_calculator_init(
+        &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+  }
 
   for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
   }
+  this_tile->abs_sum_level = td->abs_sum_level;
 }
 
+/*!\brief Break one frame into tiles and encode the tiles
+ *
+ * \ingroup partition_search
+ *
+ * \param[in]    cpi    Top-level encoder structure
+ */
 static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
   int tile_col, tile_row;
 
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
-    av1_alloc_tile_data(cpi);
+  MACROBLOCK *const mb = &cpi->td.mb;
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
+  av1_alloc_mb_data(cpi, mb);
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -5114,104 +1146,51 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
           &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
       cpi->td.intrabc_used = 0;
       cpi->td.deltaq_used = 0;
+      cpi->td.abs_sum_level = 0;
+      cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0;
+      cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
+      // Reset cyclic refresh counters.
+      av1_init_cyclic_refresh_counters(&cpi->td.mb);
+
       av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+      // Accumulate cyclic refresh params.
+      if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+          !frame_is_intra_only(&cpi->common))
+        av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh,
+                                               &cpi->td.mb);
       cpi->intrabc_used |= cpi->td.intrabc_used;
       cpi->deltaq_used |= cpi->td.deltaq_used;
     }
   }
-}
-
-#define GLOBAL_TRANS_TYPES_ENC 3  // highest motion model to search
-static int gm_get_params_cost(const WarpedMotionParams *gm,
-                              const WarpedMotionParams *ref_gm, int allow_hp) {
-  int params_cost = 0;
-  int trans_bits, trans_prec_diff;
-  switch (gm->wmtype) {
-    case AFFINE:
-    case ROTZOOM:
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
-          (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
-          (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
-      if (gm->wmtype >= AFFINE) {
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
-            (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS),
-            (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      }
-      AOM_FALLTHROUGH_INTENDED;
-    case TRANSLATION:
-      trans_bits = (gm->wmtype == TRANSLATION)
-                       ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
-                       : GM_ABS_TRANS_BITS;
-      trans_prec_diff = (gm->wmtype == TRANSLATION)
-                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
-                            : GM_TRANS_PREC_DIFF;
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[0] >> trans_prec_diff),
-          (gm->wmmat[0] >> trans_prec_diff));
-      params_cost += aom_count_signed_primitive_refsubexpfin(
-          (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_gm->wmmat[1] >> trans_prec_diff),
-          (gm->wmmat[1] >> trans_prec_diff));
-      AOM_FALLTHROUGH_INTENDED;
-    case IDENTITY: break;
-    default: assert(0);
-  }
-  return (params_cost << AV1_PROB_COST_SHIFT);
-}
 
-static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
-  (void)frame;
-  switch (sf->gm_sf.gm_search_type) {
-    case GM_FULL_SEARCH: return 1;
-    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
-      return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
-    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
-      return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
-               (frame == ALTREF2_FRAME));
-    case GM_DISABLE_SEARCH: return 0;
-    default: assert(0);
-  }
-  return 1;
+  av1_dealloc_mb_data(cm, mb);
 }
 
 // Set the relative distance of a reference frame w.r.t. current frame
-static AOM_INLINE void set_rel_frame_dist(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+static AOM_INLINE void set_rel_frame_dist(
+    const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info,
+    const int ref_frame_flags) {
   MV_REFERENCE_FRAME ref_frame;
   int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX;
-  cpi->nearest_past_ref = NONE_FRAME;
-  cpi->nearest_future_ref = NONE_FRAME;
+  ref_frame_dist_info->nearest_past_ref = NONE_FRAME;
+  ref_frame_dist_info->nearest_future_ref = NONE_FRAME;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    cpi->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+    ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
+    if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
       int dist = av1_encoder_get_relative_dist(
-          order_hint_info,
           cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME],
           cm->current_frame.display_order_hint);
-      cpi->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
+      ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
       // Get the nearest ref_frame in the past
       if (abs(dist) < min_past_dist && dist < 0) {
-        cpi->nearest_past_ref = ref_frame;
+        ref_frame_dist_info->nearest_past_ref = ref_frame;
         min_past_dist = abs(dist);
       }
       // Get the nearest ref_frame in the future
       if (dist < min_future_dist && dist > 0) {
-        cpi->nearest_future_ref = ref_frame;
+        ref_frame_dist_info->nearest_future_ref = ref_frame;
         min_future_dist = dist;
       }
     }
@@ -5222,14 +1201,12 @@ static INLINE int refs_are_one_sided(const AV1_COMMON *cm) {
   assert(!frame_is_intra_only(cm));
 
   int one_sided_refs = 1;
+  const int cur_display_order_hint = cm->current_frame.display_order_hint;
   for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
     if (buf == NULL) continue;
-
-    const int ref_display_order_hint = buf->display_order_hint;
-    if (av1_encoder_get_relative_dist(
-            &cm->seq_params.order_hint_info, ref_display_order_hint,
-            (int)cm->current_frame.display_order_hint) > 0) {
+    if (av1_encoder_get_relative_dist(buf->display_order_hint,
+                                      cur_display_order_hint) > 0) {
       one_sided_refs = 0;  // bwd reference
       break;
     }
@@ -5264,14 +1241,14 @@ static int check_skip_mode_enabled(AV1_COMP *const cpi) {
   const int cur_offset = (int)cm->current_frame.order_hint;
   int ref_offset[2];
   get_skip_mode_ref_offsets(cm, ref_offset);
-  const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info,
+  const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info,
                                             cur_offset, ref_offset[0]);
-  const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info,
-                                                cur_offset, ref_offset[1]));
+  const int cur_to_ref1 = abs(get_relative_dist(
+      &cm->seq_params->order_hint_info, cur_offset, ref_offset[1]));
   if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
 
   // High Latency: Turn off skip mode if all refs are fwd.
-  if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0;
+  if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0;
 
   static const int flag_list[REF_FRAMES] = { 0,
                                              AOM_LAST_FLAG,
@@ -5292,19 +1269,6 @@ static int check_skip_mode_enabled(AV1_COMP *const cpi) {
   return 1;
 }
 
-// Function to decide if we can skip the global motion parameter computation
-// for a particular ref frame
-static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
-  if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) &&
-      cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) {
-    return get_relative_dist(
-               &cm->seq_params.order_hint_info,
-               cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME],
-               cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0;
-  }
-  return 0;
-}
-
 static AOM_INLINE void set_default_interp_skip_flags(
     const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
   const int num_planes = av1_num_planes(cm);
@@ -5313,271 +1277,24 @@ static AOM_INLINE void set_default_interp_skip_flags(
                         : INTERP_SKIP_LUMA_SKIP_CHROMA;
 }
 
-// TODO(Remya): Can include erroradv_prod_tr[] for threshold calculation
-static INLINE int64_t calc_erroradv_threshold(AV1_COMP *cpi,
-                                              int64_t ref_frame_error) {
-  if (!cpi->sf.gm_sf.disable_adaptive_warp_error_thresh)
-    return (int64_t)(
-        ref_frame_error * erroradv_tr[cpi->sf.gm_sf.gm_erroradv_type] + 0.5);
-  else
-    return INT64_MAX;
-}
-
-static void compute_global_motion_for_ref_frame(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h,
-    const WarpedMotionParams *ref_params) {
-  ThreadData *const td = &cpi->td;
-  MACROBLOCK *const x = &td->mb;
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int i;
-  // clang-format off
-  static const double kIdentityParams[MAX_PARAMDIM - 1] = {
-     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
-  };
-  // clang-format on
-  WarpedMotionParams tmp_wm_params;
-  const double *params_this_motion;
-  int inliers_by_motion[RANSAC_NUM_MOTIONS];
-  assert(ref_buf[frame] != NULL);
-  if (*num_frm_corners < 0) {
-    // compute interest points using FAST features
-    *num_frm_corners = av1_fast_corner_detect(
-        frm_buffer, cpi->source->y_width, cpi->source->y_height,
-        cpi->source->y_stride, frm_corners, MAX_CORNERS);
-  }
-  TransformationType model;
-
-  aom_clear_system_state();
-
-  // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
-  const int do_adaptive_gm_estimation = 0;
-
-  const int ref_frame_dist = get_relative_dist(
-      &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
-      cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
-  const GlobalMotionEstimationType gm_estimation_type =
-      cm->seq_params.order_hint_info.enable_order_hint &&
-              abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
-          ? GLOBAL_MOTION_DISFLOW_BASED
-          : GLOBAL_MOTION_FEATURE_BASED;
-  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
-    int64_t best_warp_error = INT64_MAX;
-    // Initially set all params to identity.
-    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-      memcpy(params_by_motion[i].params, kIdentityParams,
-             (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params)));
-      params_by_motion[i].num_inliers = 0;
-    }
-
-    av1_compute_global_motion(
-        model, frm_buffer, cpi->source->y_width, cpi->source->y_height,
-        cpi->source->y_stride, frm_corners, *num_frm_corners, ref_buf[frame],
-        cpi->common.seq_params.bit_depth, gm_estimation_type, inliers_by_motion,
-        params_by_motion, RANSAC_NUM_MOTIONS);
-    int64_t ref_frame_error = 0;
-    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
-      if (inliers_by_motion[i] == 0) continue;
-
-      params_this_motion = params_by_motion[i].params;
-      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
-
-      if (tmp_wm_params.wmtype != IDENTITY) {
-        av1_compute_feature_segmentation_map(
-            segment_map, segment_map_w, segment_map_h,
-            params_by_motion[i].inliers, params_by_motion[i].num_inliers);
-
-        ref_frame_error = av1_segmented_frame_error(
-            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
-            ref_buf[frame]->y_stride, cpi->source->y_buffer,
-            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride,
-            segment_map, segment_map_w);
-
-        int64_t erroradv_threshold =
-            calc_erroradv_threshold(cpi, ref_frame_error);
-
-        const int64_t warp_error = av1_refine_integerized_param(
-            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
-            ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
-            ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
-            cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height,
-            cpi->source->y_stride, GM_REFINEMENT_COUNT, best_warp_error,
-            segment_map, segment_map_w, erroradv_threshold);
-
-        if (warp_error < best_warp_error) {
-          best_warp_error = warp_error;
-          // Save the wm_params modified by
-          // av1_refine_integerized_param() rather than motion index to
-          // avoid rerunning refine() below.
-          memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
-                 sizeof(WarpedMotionParams));
-        }
-      }
-    }
-    if (cm->global_motion[frame].wmtype <= AFFINE)
-      if (!av1_get_shear_params(&cm->global_motion[frame]))
-        cm->global_motion[frame] = default_warp_params;
-
-    if (cm->global_motion[frame].wmtype == TRANSLATION) {
-      cm->global_motion[frame].wmmat[0] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[0]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-      cm->global_motion[frame].wmmat[1] =
-          convert_to_trans_prec(cm->features.allow_high_precision_mv,
-                                cm->global_motion[frame].wmmat[1]) *
-          GM_TRANS_ONLY_DECODE_FACTOR;
-    }
-
-    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
-
-    if (ref_frame_error == 0) continue;
-
-    // If the best error advantage found doesn't meet the threshold for
-    // this motion type, revert to IDENTITY.
-    if (!av1_is_enough_erroradvantage(
-            (double)best_warp_error / ref_frame_error,
-            gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                               cm->features.allow_high_precision_mv),
-            cpi->sf.gm_sf.gm_erroradv_type)) {
-      cm->global_motion[frame] = default_warp_params;
-    }
-
-    if (cm->global_motion[frame].wmtype != IDENTITY) break;
-  }
-
-  aom_clear_system_state();
-}
-
-typedef struct {
-  int distance;
-  MV_REFERENCE_FRAME frame;
-} FrameDistPair;
-
-static INLINE void update_valid_ref_frames_for_gm(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
-    FrameDistPair *past_ref_frame, FrameDistPair *future_ref_frame,
-    int *num_past_ref_frames, int *num_future_ref_frames) {
-  AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
-  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
-    const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
-    RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
-    const int ref_disabled =
-        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
-    ref_buf[frame] = NULL;
-    cm->global_motion[frame] = default_warp_params;
-    // Skip global motion estimation for invalid ref frames
-    if (buf == NULL ||
-        (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
-      cpi->gm_info.params_cost[frame] = 0;
-      continue;
-    } else {
-      ref_buf[frame] = &buf->buf;
-    }
-
-    if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
-        ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
-        do_gm_search_logic(&cpi->sf, frame) &&
-        !prune_ref_by_selective_ref_frame(
-            cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint) &&
-        !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
-      assert(ref_buf[frame] != NULL);
-      int relative_frame_dist = av1_encoder_get_relative_dist(
-          order_hint_info, buf->display_order_hint,
-          cm->cur_frame->display_order_hint);
-      // Populate past and future ref frames
-      if (relative_frame_dist <= 0) {
-        past_ref_frame[*num_past_ref_frames].distance =
-            abs(relative_frame_dist);
-        past_ref_frame[*num_past_ref_frames].frame = frame;
-        (*num_past_ref_frames)++;
-      } else {
-        future_ref_frame[*num_future_ref_frames].distance =
-            abs(relative_frame_dist);
-        future_ref_frame[*num_future_ref_frames].frame = frame;
-        (*num_future_ref_frames)++;
-      }
-    }
-  }
-}
-
-static INLINE void compute_gm_for_valid_ref_frames(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h) {
-  AV1_COMMON *const cm = &cpi->common;
-  GlobalMotionInfo *const gm_info = &cpi->gm_info;
-  const WarpedMotionParams *ref_params =
-      cm->prev_frame ? &cm->prev_frame->global_motion[frame]
-                     : &default_warp_params;
-
-  compute_global_motion_for_ref_frame(
-      cpi, ref_buf, frame, num_frm_corners, frm_corners, frm_buffer,
-      params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params);
-
-  gm_info->params_cost[frame] =
-      gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                         cm->features.allow_high_precision_mv) +
-      gm_info->type_cost[cm->global_motion[frame].wmtype] -
-      gm_info->type_cost[IDENTITY];
-}
-
-static int compare_distance(const void *a, const void *b) {
-  const int diff =
-      ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
-  if (diff > 0)
-    return 1;
-  else if (diff < 0)
-    return -1;
-  return 0;
-}
-
-static INLINE void compute_global_motion_for_references(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
-    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
-    int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer,
-    MotionModel *params_by_motion, uint8_t *segment_map,
-    const int segment_map_w, const int segment_map_h) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Compute global motion w.r.t. reference frames starting from the nearest ref
-  // frame in a given direction
-  for (int frame = 0; frame < num_ref_frames; frame++) {
-    int ref_frame = reference_frame[frame].frame;
-    compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, num_frm_corners,
-                                    frm_corners, frm_buffer, params_by_motion,
-                                    segment_map, segment_map_w, segment_map_h);
-    // If global motion w.r.t. current ref frame is
-    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
-    // the remaining ref frames in that direction. The below exit is disabled
-    // when ref frame distance w.r.t. current frame is zero. E.g.:
-    // source_alt_ref_frame w.r.t. ARF frames
-    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
-        reference_frame[frame].distance != 0 &&
-        cm->global_motion[ref_frame].wmtype != ROTZOOM)
-      break;
-  }
-}
-
 static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
-  if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
-      cpi->sf.inter_sf.selective_ref_frame >= 2) {
+  if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      cpi->all_one_sided_refs) {
+    // Disable all compound references
+    cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES);
+  } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+             cpi->sf.inter_sf.selective_ref_frame >= 2) {
     AV1_COMMON *const cm = &cpi->common;
-    const OrderHintInfo *const order_hint_info =
-        &cm->seq_params.order_hint_info;
     const int cur_frame_display_order_hint =
         cm->current_frame.display_order_hint;
     unsigned int *ref_display_order_hint =
         cm->cur_frame->ref_display_order_hint;
     const int arf2_dist = av1_encoder_get_relative_dist(
-        order_hint_info, ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
+        ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
         cur_frame_display_order_hint);
     const int bwd_dist = av1_encoder_get_relative_dist(
-        order_hint_info, ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
+        ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
         cur_frame_display_order_hint);
 
     for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
@@ -5592,7 +1309,7 @@ static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
         int ref_dist[2];
         for (int i = 0; i < 2; ++i) {
           ref_dist[i] = av1_encoder_get_relative_dist(
-              order_hint_info, ref_display_order_hint[rf[i] - LAST_FRAME],
+              ref_display_order_hint[rf[i] - LAST_FRAME],
               cur_frame_display_order_hint);
         }
 
@@ -5617,8 +1334,34 @@ static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
   }
 }
 
-#define CHECK_PRECOMPUTED_REF_FRAME_MAP 0
+static int allow_deltaq_mode(AV1_COMP *cpi) {
+#if !CONFIG_REALTIME_ONLY
+  AV1_COMMON *const cm = &cpi->common;
+  BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  int sbs_wide = mi_size_wide[sb_size];
+  int sbs_high = mi_size_high[sb_size];
 
+  int64_t delta_rdcost = 0;
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) {
+      int64_t this_delta_rdcost = 0;
+      av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size,
+                                     mi_row, mi_col);
+      delta_rdcost += this_delta_rdcost;
+    }
+  }
+  return delta_rdcost < 0;
+#else
+  (void)cpi;
+  return 1;
+#endif  // !CONFIG_REALTIME_ONLY
+}
+
+/*!\brief Encoder setup(only for the current frame), encoding, and recontruction
+ * for a single frame
+ *
+ * \ingroup high_level_algo
+ */
 static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
@@ -5627,9 +1370,17 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
   FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
-  GlobalMotionInfo *const gm_info = &cpi->gm_info;
-  FrameProbInfo *const frame_probs = &cpi->frame_probs;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+  FrameProbInfo *const temp_frame_probs_simulation =
+      &cpi->ppi->temp_frame_probs_simulation;
+#endif
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
   IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
   int i;
 
   if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
@@ -5638,17 +1389,11 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
 
   set_mi_offsets(mi_params, xd, 0, 0);
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  x->fwd_txfm4x4 = aom_fdct4x4;
-#else
-  x->fwd_txfm4x4 = aom_fdct4x4_lp;
-#endif
-
   av1_zero(*td->counts);
-  av1_zero(rdc->comp_pred_diff);
   av1_zero(rdc->tx_type_used);
   av1_zero(rdc->obmc_used);
   av1_zero(rdc->warped_used);
+  av1_zero(rdc->seg_tmp_pred_cost);
 
   // Reset the flag.
   cpi->intrabc_used = 0;
@@ -5657,13 +1402,20 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
     features->allow_intrabc = 0;
   }
 
-  features->allow_intrabc &= (cpi->oxcf.enable_intrabc);
+  features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
 
   if (features->allow_warped_motion &&
       cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-    if (frame_probs->warped_probs[update_type] <
-        cpi->sf.inter_sf.prune_warped_prob_thresh)
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    int warped_probability =
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+            ? temp_frame_probs->warped_probs[update_type]
+            :
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+            frame_probs->warped_probs[update_type];
+    if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh)
       features->allow_warped_motion = 0;
   }
 
@@ -5692,15 +1444,19 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
     }
 
     av1_hash_table_init(intrabc_hash_info);
-    av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table);
+    if (!av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating intrabc_hash_table");
+    }
     hash_table_created = 1;
     av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
                                       block_hash_values[0], is_block_same[0]);
     // Hash data generated for screen contents is used for intraBC ME
     const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
     const int max_sb_size =
-        (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2));
+        (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
     int src_idx = 0;
+    bool error = false;
     for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
       const int dst_idx = !src_idx;
       av1_generate_block_hash_value(
@@ -5708,9 +1464,13 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
           block_hash_values[dst_idx], is_block_same[src_idx],
           is_block_same[dst_idx]);
       if (size >= min_alloc_size) {
-        av1_add_to_hash_map_by_row_with_precal_data(
-            &intrabc_hash_info->intrabc_hash_table, block_hash_values[dst_idx],
-            is_block_same[dst_idx][2], pic_width, pic_height, size);
+        if (!av1_add_to_hash_map_by_row_with_precal_data(
+                &intrabc_hash_info->intrabc_hash_table,
+                block_hash_values[dst_idx], is_block_same[dst_idx][2],
+                pic_width, pic_height, size)) {
+          error = true;
+          break;
+        }
       }
     }
 
@@ -5723,6 +1483,11 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
         aom_free(is_block_same[k][j]);
       }
     }
+
+    if (error) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error adding data to intrabc_hash_table");
+    }
   }
 
   const CommonQuantParams *quant_params = &cm->quant_params;
@@ -5747,39 +1512,56 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
 
   // Fix delta q resolution for the moment
   cm->delta_q_info.delta_q_res = 0;
-  if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE)
-    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
-  else if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL)
-    cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
-  // Set delta_q_present_flag before it is used for the first time
-  cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
-  cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
-
-  // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q is used
-  // for ineligible frames. That effectively will turn off row_mt usage.
-  // Note objective delta_q and tpl eligible frames are only altref frames
-  // currently.
-  if (cm->delta_q_info.delta_q_present_flag) {
-    if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
-        !is_frame_tpl_eligible(cpi))
-      cm->delta_q_info.delta_q_present_flag = 0;
-  }
+  if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+    if (deltaq_mode == DELTA_Q_OBJECTIVE)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+    else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_HDR)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    // Set delta_q_present_flag before it is used for the first time
+    cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
+    cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
+
+    // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q
+    // is used for ineligible frames. That effectively will turn off row_mt
+    // usage. Note objective delta_q and tpl eligible frames are only altref
+    // frames currently.
+    const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    if (cm->delta_q_info.delta_q_present_flag) {
+      if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+          gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE)
+        cm->delta_q_info.delta_q_present_flag = 0;
+
+      if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+          cm->delta_q_info.delta_q_present_flag) {
+        cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi);
+      }
+    }
 
-  // Reset delta_q_used flag
-  cpi->deltaq_used = 0;
+    // Reset delta_q_used flag
+    cpi->deltaq_used = 0;
 
-  cm->delta_q_info.delta_lf_present_flag =
-      cm->delta_q_info.delta_q_present_flag && cpi->oxcf.deltalf_mode;
-  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+    cm->delta_q_info.delta_lf_present_flag =
+        cm->delta_q_info.delta_q_present_flag &&
+        oxcf->tool_cfg.enable_deltalf_mode;
+    cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
 
-  // update delta_q_present_flag and delta_lf_present_flag based on
-  // base_qindex
-  cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
-  cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+    // update delta_q_present_flag and delta_lf_present_flag based on
+    // base_qindex
+    cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+    cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+  } else {
+    cpi->cyclic_refresh->actual_num_seg1_blocks = 0;
+    cpi->cyclic_refresh->actual_num_seg2_blocks = 0;
+    cpi->cyclic_refresh->cnt_zeromv = 0;
+  }
 
   av1_frame_init_quantizer(cpi);
-  av1_initialize_rd_consts(cpi);
-  av1_initialize_me_consts(cpi, x, quant_params->base_qindex);
 
   init_encode_frame_mb_context(cpi);
   set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
@@ -5804,99 +1586,25 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
   // Figure out which ref frames can be skipped at frame level.
   setup_prune_ref_frame_mask(cpi);
 
-  x->txb_split_count = 0;
+  x->txfm_search_info.txb_split_count = 0;
 #if CONFIG_SPEED_STATS
-  x->tx_search_count = 0;
+  x->txfm_search_info.tx_search_count = 0;
 #endif  // CONFIG_SPEED_STATS
 
+#if !CONFIG_REALTIME_ONLY
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_compute_global_motion_time);
 #endif
-  av1_zero(rdc->global_motion_used);
-  av1_zero(gm_info->params_cost);
-  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
-      cpi->oxcf.enable_global_motion && !gm_info->search_done) {
-    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
-    MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
-    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-      memset(&params_by_motion[m], 0, sizeof(params_by_motion[m]));
-      params_by_motion[m].inliers =
-          aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
-    }
-
-    int num_frm_corners = -1;
-    int frm_corners[2 * MAX_CORNERS];
-    unsigned char *frm_buffer = cpi->source->y_buffer;
-    if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      // The frame buffer is 16-bit, so we need to convert to 8 bits for the
-      // following code. We cache the result until the frame is released.
-      frm_buffer =
-          av1_downconvert_frame(cpi->source, cpi->common.seq_params.bit_depth);
-    }
-    const int segment_map_w =
-        (cpi->source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
-    const int segment_map_h =
-        (cpi->source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
-
-    uint8_t *segment_map =
-        aom_malloc(sizeof(*segment_map) * segment_map_w * segment_map_h);
-    memset(segment_map, 0,
-           sizeof(*segment_map) * segment_map_w * segment_map_h);
-
-    FrameDistPair future_ref_frame[REF_FRAMES - 1] = {
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }
-    };
-    FrameDistPair past_ref_frame[REF_FRAMES - 1] = {
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME },
-      { -1, NONE_FRAME }
-    };
-    int num_past_ref_frames = 0;
-    int num_future_ref_frames = 0;
-    // Populate ref_buf for valid ref frames in global motion
-    update_valid_ref_frames_for_gm(cpi, ref_buf, past_ref_frame,
-                                   future_ref_frame, &num_past_ref_frames,
-                                   &num_future_ref_frames);
-
-    // Sort the ref frames in the ascending order of their distance from the
-    // current frame
-    qsort(past_ref_frame, num_past_ref_frames, sizeof(past_ref_frame[0]),
-          compare_distance);
-    qsort(future_ref_frame, num_future_ref_frames, sizeof(future_ref_frame[0]),
-          compare_distance);
-
-    // Compute global motion w.r.t. past reference frames
-    if (num_past_ref_frames > 0)
-      compute_global_motion_for_references(
-          cpi, ref_buf, past_ref_frame, num_past_ref_frames, &num_frm_corners,
-          frm_corners, frm_buffer, params_by_motion, segment_map, segment_map_w,
-          segment_map_h);
-
-    // Compute global motion w.r.t. future reference frames
-    if (num_future_ref_frames > 0)
-      compute_global_motion_for_references(
-          cpi, ref_buf, future_ref_frame, num_future_ref_frames,
-          &num_frm_corners, frm_corners, frm_buffer, params_by_motion,
-          segment_map, segment_map_w, segment_map_h);
-
-    aom_free(segment_map);
-
-    gm_info->search_done = 1;
-    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
-      aom_free(params_by_motion[m].inliers);
-    }
-  }
-  memcpy(cm->cur_frame->global_motion, cm->global_motion,
-         REF_FRAMES * sizeof(WarpedMotionParams));
+  av1_compute_global_motion_facade(cpi);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_compute_global_motion_time);
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_setup_motion_field_time);
 #endif
+  av1_calculate_ref_frame_side(cm);
   if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_setup_motion_field_time);
@@ -5905,17 +1613,27 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
   cm->current_frame.skip_mode_info.skip_mode_flag =
       check_skip_mode_enabled(cpi);
 
-  cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
-  cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
-  cpi->row_mt = 0;
-
-  if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
-    cpi->row_mt = 1;
-    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
-    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
+  // Initialization of skip mode cost depends on the value of
+  // 'skip_mode_flag'. This initialization happens in the function
+  // av1_fill_mode_rates(), which is in turn called in
+  // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts()
+  // has to be called after 'skip_mode_flag' is initialized.
+  av1_initialize_rd_consts(cpi);
+  av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+
+  enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+  enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+  mt_info->row_mt_enabled = 0;
+  mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS],
+                                       cm->tiles.cols * cm->tiles.rows) > 1;
+
+  if (oxcf->row_mt && (mt_info->num_workers > 1)) {
+    mt_info->row_mt_enabled = 1;
+    enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+    enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
     av1_encode_tiles_row_mt(cpi);
   } else {
-    if (AOMMIN(cpi->oxcf.max_threads, cm->tiles.cols * cm->tiles.rows) > 1)
+    if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1)
       av1_encode_tiles_mt(cpi);
     else
       encode_tiles(cpi);
@@ -5940,62 +1658,179 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
           : DEFAULT_EVAL;
   const TX_SIZE_SEARCH_METHOD tx_search_type =
       cpi->winner_mode_params.tx_size_search_methods[eval_type];
-  assert(cpi->oxcf.enable_tx64 || tx_search_type != USE_LARGESTALL);
+  assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
   features->tx_mode = select_tx_mode(cm, tx_search_type);
 
-  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-
+  // Retain the frame level probability update conditions for parallel frames.
+  // These conditions will be consumed during postencode stage to update the
+  // probability.
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+    cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] =
+        cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats;
+    cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] =
+        (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+         cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX);
+    cpi->do_update_frame_probs_warp[cpi->num_frame_recode] =
+        (features->allow_warped_motion &&
+         cpi->sf.inter_sf.prune_warped_prob_thresh > 0);
+    cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] =
+        (cm->current_frame.frame_type != KEY_FRAME &&
+         cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+         features->interp_filter == SWITCHABLE);
+  }
+
+  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats ||
+      ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh !=
+        INT_MAX) &&
+       (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) {
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
     for (i = 0; i < TX_SIZES_ALL; i++) {
       int sum = 0;
       int j;
-      int left = 1024;
+      int left = MAX_TX_TYPE_PROB;
 
       for (j = 0; j < TX_TYPES; j++)
         sum += cpi->td.rd_counts.tx_type_used[i][j];
 
       for (j = TX_TYPES - 1; j >= 0; j--) {
+        int update_txtype_frameprobs = 1;
         const int new_prob =
-            sum ? 1024 * cpi->td.rd_counts.tx_type_used[i][j] / sum
-                : (j ? 0 : 1024);
-        int prob =
-            (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
-        left -= prob;
-        if (j == 0) prob += left;
-        frame_probs->tx_type_probs[update_type][i][j] = prob;
+            sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum
+                : (j ? 0 : MAX_TX_TYPE_PROB);
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+          if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+              0) {
+            int prob =
+                (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+                 new_prob) >>
+                1;
+            left -= prob;
+            if (j == 0) prob += left;
+            temp_frame_probs_simulation->tx_type_probs[update_type][i][j] =
+                prob;
+            // Copy temp_frame_probs_simulation to temp_frame_probs
+            for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+                 update_type_idx++) {
+              temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+                  temp_frame_probs_simulation
+                      ->tx_type_probs[update_type_idx][i][j];
+            }
+          }
+          update_txtype_frameprobs = 0;
+        }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        // Track the frame probabilities of parallel encode frames to update
+        // during postencode stage.
+        if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+          update_txtype_frameprobs = 0;
+          cpi->frame_new_probs[cpi->num_frame_recode]
+              .tx_type_probs[update_type][i][j] = new_prob;
+        }
+        if (update_txtype_frameprobs) {
+          int prob =
+              (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->tx_type_probs[update_type][i][j] = prob;
+        }
       }
     }
   }
 
-  if (!cpi->sf.inter_sf.disable_obmc &&
-      cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+  if (cm->seg.enabled) {
+    cm->seg.temporal_update = 1;
+    if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1])
+      cm->seg.temporal_update = 0;
+  }
+
+  if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+      cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
 
     for (i = 0; i < BLOCK_SIZES_ALL; i++) {
       int sum = 0;
+      int update_obmc_frameprobs = 1;
       for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
 
       const int new_prob =
           sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
-      frame_probs->obmc_probs[update_type][i] =
-          (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+        if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+          temp_frame_probs_simulation->obmc_probs[update_type][i] =
+              (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+               new_prob) >>
+              1;
+          // Copy temp_frame_probs_simulation to temp_frame_probs
+          for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+               update_type_idx++) {
+            temp_frame_probs->obmc_probs[update_type_idx][i] =
+                temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+          }
+        }
+        update_obmc_frameprobs = 0;
+      }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      // Track the frame probabilities of parallel encode frames to update
+      // during postencode stage.
+      if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+        update_obmc_frameprobs = 0;
+        cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] =
+            new_prob;
+      }
+      if (update_obmc_frameprobs) {
+        frame_probs->obmc_probs[update_type][i] =
+            (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+      }
     }
   }
 
   if (features->allow_warped_motion &&
       cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    int update_warp_frameprobs = 1;
     int sum = 0;
     for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
     const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
-    frame_probs->warped_probs[update_type] =
-        (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+      if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+        temp_frame_probs_simulation->warped_probs[update_type] =
+            (temp_frame_probs_simulation->warped_probs[update_type] +
+             new_prob) >>
+            1;
+        // Copy temp_frame_probs_simulation to temp_frame_probs
+        for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+             update_type_idx++) {
+          temp_frame_probs->warped_probs[update_type_idx] =
+              temp_frame_probs_simulation->warped_probs[update_type_idx];
+        }
+      }
+      update_warp_frameprobs = 0;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    // Track the frame probabilities of parallel encode frames to update
+    // during postencode stage.
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      update_warp_frameprobs = 0;
+      cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] =
+          new_prob;
+    }
+    if (update_warp_frameprobs) {
+      frame_probs->warped_probs[update_type] =
+          (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+    }
   }
 
   if (cm->current_frame.frame_type != KEY_FRAME &&
       cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
       features->interp_filter == SWITCHABLE) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
 
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       int sum = 0;
@@ -6007,34 +1842,73 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
       }
 
       for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+        int update_interpfilter_frameprobs = 1;
         const int new_prob =
             sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
                 : (j ? 0 : 1536);
-        int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
-                    new_prob) >>
-                   1;
-        left -= prob;
-        if (j == 0) prob += left;
-        frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+          if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+              0) {
+            int prob = (temp_frame_probs_simulation
+                            ->switchable_interp_probs[update_type][i][j] +
+                        new_prob) >>
+                       1;
+            left -= prob;
+            if (j == 0) prob += left;
+            temp_frame_probs_simulation
+                ->switchable_interp_probs[update_type][i][j] = prob;
+            // Copy temp_frame_probs_simulation to temp_frame_probs
+            for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+                 update_type_idx++) {
+              temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+                  temp_frame_probs_simulation
+                      ->switchable_interp_probs[update_type_idx][i][j];
+            }
+          }
+          update_interpfilter_frameprobs = 0;
+        }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        // Track the frame probabilities of parallel encode frames to update
+        // during postencode stage.
+        if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+          update_interpfilter_frameprobs = 0;
+          cpi->frame_new_probs[cpi->num_frame_recode]
+              .switchable_interp_probs[update_type][i][j] = new_prob;
+        }
+        if (update_interpfilter_frameprobs) {
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+        }
       }
     }
   }
-
-  if ((!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
-       !cpi->sf.rt_sf.use_nonrd_pick_mode) ||
-      hash_table_created) {
+  if (hash_table_created) {
     av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
   }
 }
 
+/*!\brief Setup reference frame buffers and encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi    Top-level encoder structure
+ */
 void av1_encode_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
   // Indicates whether or not to use a default reduced set for ext-tx
   // rather than the potential full set of 16 transforms
-  features->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
+  features->reduced_tx_set_used = cpi->oxcf.txfm_cfg.reduced_tx_type_set;
 
   // Make sure segment_id is no larger than last_active_segid.
   if (cm->seg.enabled && cm->seg.update_map) {
@@ -6051,56 +1925,23 @@ void av1_encode_frame(AV1_COMP *cpi) {
   }
 
   av1_setup_frame_buf_refs(cm);
-  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags);
-  set_rel_frame_dist(cpi);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags,
+                         cm->cur_frame->ref_display_order_hint,
+                         cm->current_frame.display_order_hint);
+  set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info,
+                     cpi->ref_frame_flags);
   av1_setup_frame_sign_bias(cm);
 
-#if CHECK_PRECOMPUTED_REF_FRAME_MAP
-  GF_GROUP *gf_group = &cpi->gf_group;
-  // TODO(yuec): The check is disabled on OVERLAY frames for now, because info
-  // in cpi->gf_group has been refreshed for the next GOP when the check is
-  // performed for OVERLAY frames. Since we have not support inter-GOP ref
-  // frame map computation, the precomputed ref map for an OVERLAY frame is all
-  // -1 at this point (although it is meaning before gf_group is refreshed).
-  if (!frame_is_intra_only(cm) && gf_group->index != 0) {
-    const RefCntBuffer *const golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
-
-    if (golden_buf) {
-      const int golden_order_hint = golden_buf->order_hint;
-
-      for (int ref = LAST_FRAME; ref < EXTREF_FRAME; ++ref) {
-        const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
-        const int ref_disp_idx_precomputed =
-            gf_group->ref_frame_disp_idx[gf_group->index][ref - LAST_FRAME];
-
-        (void)ref_disp_idx_precomputed;
-
-        if (buf != NULL) {
-          const int ref_disp_idx =
-              get_relative_dist(&cm->seq_params.order_hint_info,
-                                buf->order_hint, golden_order_hint);
-
-          if (ref_disp_idx >= 0)
-            assert(ref_disp_idx == ref_disp_idx_precomputed);
-          else
-            assert(ref_disp_idx_precomputed == -1);
-        } else {
-          assert(ref_disp_idx_precomputed == -1);
-        }
-      }
-    }
-  }
-#endif
-
 #if CONFIG_MISMATCH_DEBUG
   mismatch_reset_frame(num_planes);
 #else
   (void)num_planes;
 #endif
 
-  if (cpi->sf.hl_sf.frame_parameter_update) {
-    RD_COUNTS *const rdc = &cpi->td.rd_counts;
+  rdc->newmv_or_intra_blocks = 0;
 
+  if (cpi->sf.hl_sf.frame_parameter_update ||
+      cpi->sf.rt_sf.use_comp_ref_nonrd) {
     if (frame_is_intra_only(cm))
       current_frame->reference_mode = SINGLE_REFERENCE;
     else
@@ -6138,338 +1979,14 @@ void av1_encode_frame(AV1_COMP *cpi) {
 
     if (!cm->tiles.large_scale) {
       if (features->tx_mode == TX_MODE_SELECT &&
-          cpi->td.mb.txb_split_count == 0)
+          cpi->td.mb.txfm_search_info.txb_split_count == 0)
         features->tx_mode = TX_MODE_LARGEST;
     }
   } else {
+    // This is needed if real-time speed setting is changed on the fly
+    // from one using compound prediction to one using single reference.
+    if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+      current_frame->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
 }
-
-static AOM_INLINE void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
-                                         FRAME_COUNTS *counts, TX_SIZE tx_size,
-                                         int depth, int blk_row, int blk_col,
-                                         uint8_t allow_update_cdf) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
-                                   xd->left_txfm_context + blk_row,
-                                   mbmi->sb_type, tx_size);
-  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-  assert(tx_size > TX_4X4);
-
-  if (depth == MAX_VARTX_DEPTH) {
-    // Don't add to counts in this case
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-    return;
-  }
-
-  if (tx_size == plane_tx_size) {
-#if CONFIG_ENTROPY_STATS
-    ++counts->txfm_partition[ctx][0];
-#endif
-    if (allow_update_cdf)
-      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-
-#if CONFIG_ENTROPY_STATS
-    ++counts->txfm_partition[ctx][1];
-#endif
-    if (allow_update_cdf)
-      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
-    ++x->txb_split_count;
-
-    if (sub_txs == TX_4X4) {
-      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
-      mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + blk_col,
-                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
-      return;
-    }
-
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        int offsetr = row;
-        int offsetc = col;
-
-        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
-                          blk_col + offsetc, allow_update_cdf);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void tx_partition_count_update(const AV1_COMMON *const cm,
-                                                 MACROBLOCK *x,
-                                                 BLOCK_SIZE plane_bsize,
-                                                 FRAME_COUNTS *td_counts,
-                                                 uint8_t allow_update_cdf) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int mi_width = mi_size_wide[plane_bsize];
-  const int mi_height = mi_size_high[plane_bsize];
-  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
-  const int bh = tx_size_high_unit[max_tx_size];
-  const int bw = tx_size_wide_unit[max_tx_size];
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
-
-  for (int idy = 0; idy < mi_height; idy += bh) {
-    for (int idx = 0; idx < mi_width; idx += bw) {
-      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
-                        allow_update_cdf);
-    }
-  }
-}
-
-static AOM_INLINE void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
-                                        int blk_row, int blk_col) {
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  if (tx_size == plane_tx_size) {
-    mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-
-  } else {
-    if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
-      mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + blk_col,
-                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
-      return;
-    }
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsw = tx_size_wide_unit[sub_txs];
-    const int bsh = tx_size_high_unit[sub_txs];
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
-        const int offsetc = blk_col + col;
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-        set_txfm_context(xd, sub_txs, offsetr, offsetc);
-      }
-    }
-  }
-}
-
-static AOM_INLINE void tx_partition_set_contexts(const AV1_COMMON *const cm,
-                                                 MACROBLOCKD *xd,
-                                                 BLOCK_SIZE plane_bsize) {
-  const int mi_width = mi_size_wide[plane_bsize];
-  const int mi_height = mi_size_high[plane_bsize];
-  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
-  const int bh = tx_size_high_unit[max_tx_size];
-  const int bw = tx_size_wide_unit[max_tx_size];
-
-  xd->above_txfm_context =
-      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
-
-  for (int idy = 0; idy < mi_height; idy += bh) {
-    for (int idx = 0; idx < mi_width; idx += bw) {
-      set_txfm_context(xd, max_tx_size, idy, idx);
-    }
-  }
-}
-
-static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
-                                         TileDataEnc *tile_data, ThreadData *td,
-                                         TOKENEXTRA **t, RUN_TYPE dry_run,
-                                         BLOCK_SIZE bsize, int *rate) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO **mi_4x4 = xd->mi;
-  MB_MODE_INFO *mbmi = mi_4x4[0];
-  const int seg_skip =
-      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  const int mis = cm->mi_params.mi_stride;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int is_inter = is_inter_block(mbmi);
-
-  // Initialize tx_mode and tx_size_search_method
-  set_tx_size_search_method(
-      cm, &cpi->winner_mode_params, x,
-      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
-
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  if (!is_inter) {
-    xd->cfl.store_y = store_cfl_required(cm, xd);
-    mbmi->skip = 1;
-    for (int plane = 0; plane < num_planes; ++plane) {
-      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
-                                   cpi->optimize_seg_arr[mbmi->segment_id]);
-    }
-
-    // If there is at least one lossless segment, force the skip for intra
-    // block to be 0, in order to avoid the segment_id to be changed by in
-    // write_segment_id().
-    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
-        cpi->enc_seg.has_lossless_segment)
-      mbmi->skip = 0;
-
-    xd->cfl.store_y = 0;
-    if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
-      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
-        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
-          if (!dry_run) {
-            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
-                                   PALETTE_MAP, tile_data->allow_update_cdf,
-                                   td->counts);
-          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
-            rate +=
-                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
-          }
-        }
-      }
-    }
-
-    av1_update_txb_context(cpi, td, dry_run, bsize,
-                           tile_data->allow_update_cdf);
-  } else {
-    int ref;
-    const int is_compound = has_second_ref(mbmi);
-
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const YV12_BUFFER_CONFIG *cfg =
-          get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
-      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
-      av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                           xd->block_ref_scale_factors[ref], num_planes);
-    }
-    int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0;
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                  start_plane, av1_num_planes(cm) - 1);
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
-      assert(cpi->oxcf.enable_obmc == 1);
-      av1_build_obmc_inter_predictors_sb(cm, xd);
-    }
-
-#if CONFIG_MISMATCH_DEBUG
-    if (dry_run == OUTPUT_ENABLED) {
-      for (int plane = 0; plane < num_planes; ++plane) {
-        const struct macroblockd_plane *pd = &xd->plane[plane];
-        int pixel_c, pixel_r;
-        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
-                        pd->subsampling_x, pd->subsampling_y);
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
-        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
-                                  cm->current_frame.order_hint, plane, pixel_c,
-                                  pixel_r, pd->width, pd->height,
-                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
-      }
-    }
-#else
-    (void)num_planes;
-#endif
-
-    av1_encode_sb(cpi, x, bsize, dry_run);
-    av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
-                          tile_data->allow_update_cdf);
-  }
-
-  if (!dry_run) {
-    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
-    if (x->tx_mode_search_type == TX_MODE_SELECT &&
-        !xd->lossless[mbmi->segment_id] && mbmi->sb_type > BLOCK_4X4 &&
-        !(is_inter && (mbmi->skip || seg_skip))) {
-      if (is_inter) {
-        tx_partition_count_update(cm, x, bsize, td->counts,
-                                  tile_data->allow_update_cdf);
-      } else {
-        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
-          ++x->txb_split_count;
-        if (block_signals_txsize(bsize)) {
-          const int tx_size_ctx = get_tx_size_context(xd);
-          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
-          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
-          const int max_depths = bsize_to_max_depth(bsize);
-
-          if (tile_data->allow_update_cdf)
-            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
-                       depth, max_depths + 1);
-#if CONFIG_ENTROPY_STATS
-          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
-#endif
-        }
-      }
-      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
-    } else {
-      int i, j;
-      TX_SIZE intra_tx_size;
-      // The new intra coding scheme requires no change of transform size
-      if (is_inter) {
-        if (xd->lossless[mbmi->segment_id]) {
-          intra_tx_size = TX_4X4;
-        } else {
-          intra_tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-        }
-      } else {
-        intra_tx_size = mbmi->tx_size;
-      }
-
-      for (j = 0; j < mi_height; j++)
-        for (i = 0; i < mi_width; i++)
-          if (mi_col + i < cm->mi_params.mi_cols &&
-              mi_row + j < cm->mi_params.mi_rows)
-            mi_4x4[mis * j + i]->tx_size = intra_tx_size;
-
-      if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
-    }
-  }
-
-  if (x->tx_mode_search_type == TX_MODE_SELECT &&
-      block_signals_txsize(mbmi->sb_type) && is_inter &&
-      !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) {
-    if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
-  } else {
-    TX_SIZE tx_size = mbmi->tx_size;
-    // The new intra coding scheme requires no change of transform size
-    if (is_inter) {
-      if (xd->lossless[mbmi->segment_id]) {
-        tx_size = TX_4X4;
-      } else {
-        tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-      }
-    } else {
-      tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
-    }
-    mbmi->tx_size = tx_size;
-    set_txfm_ctxs(tx_size, xd->width, xd->height,
-                  (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
-  }
-
-  if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
-    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
-  }
-}
diff --git a/media/libaom/src/av1/encoder/encodeframe.h b/media/libaom/src/av1/encoder/encodeframe.h
index e4c4841058..36b38d59f7 100644
--- a/media/libaom/src/av1/encoder/encodeframe.h
+++ b/media/libaom/src/av1/encoder/encodeframe.h
@@ -16,6 +16,8 @@
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 
+#include "av1/encoder/global_motion.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -41,7 +43,6 @@ void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
                      int tile_col);
 void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
                        int tile_row, int tile_col, int mi_row);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encodeframe_utils.c b/media/libaom/src/av1/encoder/encodeframe_utils.c
new file mode 100644
index 0000000000..ebe2640776
--- /dev/null
+++ b/media/libaom/src/av1/encoder/encodeframe_utils.c
@@ -0,0 +1,1640 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/aq_variance.h"
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(errorperbit, *rdmult);
+}
+
+// TODO(angiebird): Move these function to tpl_model.c
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int set_deltaq_rdmult(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *const x) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+  return av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
+                                      quant_params->y_dc_delta_q);
+}
+
+// Return the end column for the current superblock, in unit of TPL blocks.
+static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
+                                         int num_mi_w) {
+  // Find the start column of this superblock.
+  const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2)
+                              << cm->seq_params->mib_size_log2;
+  // Same but in superres upscaled dimension.
+  const int sb_mi_col_start_sr =
+      coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator);
+  // Width of this superblock in mi units.
+  const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size];
+  // Same but in superres upscaled dimension.
+  const int sb_mi_width_sr =
+      coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator);
+  // Superblock end in mi units.
+  const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr;
+  // Superblock end in TPL units.
+  return (sb_mi_end + num_mi_w - 1) / num_mi_w;
+}
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                      const BLOCK_SIZE bsize, const int mi_row,
+                      const int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+  if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+  if (x->rb == 0) return deltaq_rdmult;
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  int tpl_stride = tpl_frame->stride;
+  double intra_cost_base = 0;
+  double mc_dep_cost_base = 0;
+  double cbcmp_base = 0;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+      double cbcmp = (double)this_stats->srcrf_dist;
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+      intra_cost_base += log(dist_scaled) * cbcmp;
+      mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+      cbcmp_base += cbcmp;
+    }
+  }
+
+  if (cbcmp_base == 0) return deltaq_rdmult;
+
+  double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+  deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb));
+
+  return AOMMAX(deltaq_rdmult, 1);
+}
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  const int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
+    return deltaq_rdmult;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int block_mi_width_sr =
+      coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // This is required because the end col of superblock may be off by 1 in case
+  // of superres.
+  const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w);
+  int row, col;
+  double base_block_count = 0.0;
+  double geom_mean_of_scale = 0.0;
+  for (row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols &&
+         col < sb_bcol_end;
+         ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]);
+      base_block_count += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+  rdmult = AOMMAX(rdmult, 0);
+  av1_set_error_per_bit(&x->errorperbit, rdmult);
+#if !CONFIG_RD_COMMAND
+  if (bsize == cm->seq_params->sb_size) {
+    const int rdmult_sb = set_deltaq_rdmult(cpi, x);
+    assert(rdmult_sb == rdmult);
+    (void)rdmult_sb;
+  }
+#endif  // !CONFIG_RD_COMMAND
+  return rdmult;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+                                                const MACROBLOCKD *xd,
+                                                const MB_MODE_INFO *mbmi) {
+  int dir;
+  for (dir = 0; dir < 2; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    ++counts->switchable_interp[ctx][filter];
+  }
+}
+
+static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
+                          const TX_MODE tx_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  if (xd->lossless[mbmi->segment_id]) {
+    mbmi->tx_size = TX_4X4;
+  } else if (tx_mode != TX_MODE_SELECT) {
+    mbmi->tx_size = tx_size_from_tx_mode(mbmi->bsize, tx_mode);
+  } else {
+    const BLOCK_SIZE bsize = mbmi->bsize;
+    const TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+    if (tx_size_wide[min_tx_size] > tx_size_wide[mbmi->tx_size] ||
+        tx_size_high[min_tx_size] > tx_size_high[mbmi->tx_size])
+      mbmi->tx_size = min_tx_size;
+
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+    if (tx_size_wide[max_tx_size] < tx_size_wide[mbmi->tx_size] ||
+        tx_size_high[max_tx_size] < tx_size_high[mbmi->tx_size])
+      mbmi->tx_size = max_tx_size;
+  }
+  if (is_inter_block(mbmi)) {
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+  }
+  const int stride = xd->tx_type_map_stride;
+  const int bw = mi_size_wide[mbmi->bsize];
+  for (int row = 0; row < mi_size_high[mbmi->bsize]; ++row) {
+    memset(xd->tx_type_map + row * stride, DCT_DCT,
+           bw * sizeof(xd->tx_type_map[0]));
+  }
+  av1_zero(txfm_info->blk_skip);
+  txfm_info->skip_txfm = 0;
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+                      const PICK_MODE_CONTEXT *const ctx, int mi_row,
+                      int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+  int i, x_idx, y;
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const MB_MODE_INFO *const mi = &ctx->mic;
+  MB_MODE_INFO *const mi_addr = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int bw = mi_size_wide[mi->bsize];
+  const int bh = mi_size_high[mi->bsize];
+  const int mis = mi_params->mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  assert(mi->bsize == bsize);
+
+  *mi_addr = *mi;
+  copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best,
+                                  av1_ref_frame_type(ctx->mic.ref_frame));
+
+  memcpy(txfm_info->blk_skip, ctx->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+
+  txfm_info->skip_txfm = ctx->rd_stats.skip_txfm;
+
+  xd->tx_type_map = ctx->tx_type_map;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  // If not dry_run, copy the transform type data into the frame level buffer.
+  // Encoder will fetch tx types when writing bitstream.
+  if (!dry_run) {
+    const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+    uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+    const int mi_stride = mi_params->mi_stride;
+    for (int blk_row = 0; blk_row < bh; ++blk_row) {
+      av1_copy_array(tx_type_map + blk_row * mi_stride,
+                     xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+    }
+    xd->tx_type_map = tx_type_map;
+    xd->tx_type_map_stride = mi_stride;
+  }
+
+  // If segmentation in use
+  if (seg->enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+      mi_addr->segment_id =
+          map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+      reset_tx_size(x, mi_addr, x->txfm_search_params.tx_mode_search_type);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+        !cpi->rc.rtc_external_ratectrl) {
+      av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize,
+                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
+                                        txfm_info->skip_txfm, dry_run);
+    }
+    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+      mi_addr->uv_mode = UV_DC_PRED;
+
+    if (!dry_run && !mi_addr->skip_txfm) {
+      int cdf_num;
+      const int spatial_pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
+      const int coded_id = av1_neg_interleave(mi_addr->segment_id, spatial_pred,
+                                              seg->last_active_segid + 1);
+      int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id];
+      td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost;
+
+      const int pred_segment_id =
+          cm->last_frame_seg_map
+              ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+                               mi_col)
+              : 0;
+      const int use_tmp_pred = pred_segment_id == mi_addr->segment_id;
+      const int tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
+      td->rd_counts.seg_tmp_pred_cost[1] +=
+          x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred];
+      if (!use_tmp_pred) {
+        td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost;
+      }
+    }
+  }
+
+  // Count zero motion vector.
+  if (!dry_run && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+      !frame_is_intra_only(cm)) {
+    const MV mv = mi->mv[0].as_mv;
+    if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME &&
+        abs(mv.row) < 8 && abs(mv.col) < 8) {
+      const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+      // Accumulate low_content_frame.
+      for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1;
+    }
+  }
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to that that was in place
+  // when the mode was picked for it
+
+  const int cols =
+      AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width);
+  const int rows = AOMMIN(
+      (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height);
+  for (y = 0; y < rows; y++) {
+    for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr;
+  }
+
+  if (cpi->oxcf.q_cfg.aq_mode)
+    av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0);
+
+  if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+  {
+    unsigned int *const mode_chosen_counts =
+        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
+    if (frame_is_intra_only(cm)) {
+      static const int kf_mode_index[] = {
+        THR_DC /*DC_PRED*/,
+        THR_V_PRED /*V_PRED*/,
+        THR_H_PRED /*H_PRED*/,
+        THR_D45_PRED /*D45_PRED*/,
+        THR_D135_PRED /*D135_PRED*/,
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH,   /*SMOOTH_PRED*/
+        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+        THR_PAETH /*PAETH_PRED*/,
+      };
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+    } else {
+      // Note how often each mode chosen as best
+      ++mode_chosen_counts[ctx->best_mode_index];
+    }
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (cm->features.interp_filter == SWITCHABLE &&
+        mi_addr->motion_mode != WARPED_CAUSAL &&
+        !is_nontrans_global_motion(xd, xd->mi[0])) {
+      update_filter_type_count(td->counts, xd, mi_addr);
+    }
+  }
+
+  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+  if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                 PREDICTION_MODE mode, int16_t mode_context) {
+  (void)counts;
+
+  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->newmv_mode[mode_ctx][0];
+#endif
+    update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+    return;
+  }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->newmv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->zeromv_mode[mode_ctx][0];
+#endif
+    update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+    return;
+  }
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->zeromv_mode[mode_ctx][1];
+#endif
+  update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+
+  mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+  ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+  update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+}
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                               FRAME_COUNTS *counts) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+  (void)counts;
+
+  if (mbmi->mode == DC_PRED) {
+    const int n = pmi->palette_size[0];
+    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+    update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+               n > 0, 2);
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+
+  if (mbmi->uv_mode == UV_DC_PRED) {
+    const int n = pmi->palette_size[1];
+    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+    update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+                 n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+}
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+                         MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                         const MB_MODE_INFO *above_mi,
+                         const MB_MODE_INFO *left_mi, const int intraonly) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const PREDICTION_MODE y_mode = mbmi->mode;
+  (void)counts;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+
+  if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+    const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+    const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+    const int above_ctx = intra_mode_context[above];
+    const int left_ctx = intra_mode_context[left];
+    ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+  } else {
+#if CONFIG_ENTROPY_STATS
+    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+  }
+
+  if (av1_filter_intra_allowed(cm, mbmi)) {
+    const int use_filter_intra_mode =
+        mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+    ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode];
+    if (use_filter_intra_mode) {
+      ++counts
+            ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+    }
+#endif  // CONFIG_ENTROPY_STATS
+    update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2);
+    if (use_filter_intra_mode) {
+      update_cdf(fc->filter_intra_mode_cdf,
+                 mbmi->filter_intra_mode_info.filter_intra_mode,
+                 FILTER_INTRA_MODES);
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[mbmi->mode - V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+    update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
+  }
+
+  if (!xd->is_chroma_ref) return;
+
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+#if CONFIG_ENTROPY_STATS
+  ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
+#endif  // CONFIG_ENTROPY_STATS
+  update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+             UV_INTRA_MODES - !cfl_allowed);
+  if (uv_mode == UV_CFL_PRED) {
+    const int8_t joint_sign = mbmi->cfl_alpha_signs;
+    const uint8_t idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->cfl_sign[joint_sign];
+#endif
+    update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+    if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+      update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+    }
+    if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+      update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
+      av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[uv_mode - UV_V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+    update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+               mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+               2 * MAX_ANGLE_DELTA + 1);
+  }
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    update_palette_cdf(xd, mbmi, counts);
+  }
+}
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int p;
+  const int num_4x4_blocks_wide = mi_size_wide[bsize];
+  const int num_4x4_blocks_high = mi_size_high[bsize];
+  int mi_width = mi_size_wide[bsize];
+  int mi_height = mi_size_high[bsize];
+  for (p = 0; p < num_planes; p++) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
+    memcpy(
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        ctx->a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+            xd->plane[p].subsampling_x);
+    memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+           ctx->l + num_4x4_blocks_high * p,
+           (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+               xd->plane[p].subsampling_y);
+  }
+  memcpy(xd->above_partition_context + mi_col, ctx->sa,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+         sizeof(xd->left_partition_context[0]) * mi_height);
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * mi_height);
+}
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                      int mi_row, int mi_col, BLOCK_SIZE bsize,
+                      const int num_planes) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  int p;
+  int mi_width = mi_size_wide[bsize];
+  int mi_height = mi_size_high[bsize];
+
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < num_planes; ++p) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
+    memcpy(
+        ctx->a + mi_width * p,
+        xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+        (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
+    memcpy(ctx->l + mi_height * p,
+           xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+           (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
+  }
+  memcpy(ctx->sa, xd->above_partition_context + mi_col,
+         sizeof(*xd->above_partition_context) * mi_width);
+  memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+         sizeof(xd->left_partition_context[0]) * mi_height);
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * mi_height);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+}
+
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+                                     MB_MODE_INFO *mi, int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining, BLOCK_SIZE bsize,
+                                     MB_MODE_INFO **mib) {
+  int bh = bh_in;
+  int r, c;
+  for (r = 0; r < cm->seq_params->mib_size; r += bh) {
+    int bw = bw_in;
+    for (c = 0; c < cm->seq_params->mib_size; c += bw) {
+      const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+      const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+      mib[grid_index] = mi + mi_index;
+      mib[grid_index]->bsize = find_partition_size(
+          bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+    }
+  }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
+  MB_MODE_INFO *const mi_upper_left =
+      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  int bh = mi_size_high[bsize];
+  int bw = mi_size_wide[bsize];
+
+  assert(bsize >= mi_params->mi_alloc_bsize &&
+         "Attempted to use bsize < mi_params->mi_alloc_bsize");
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+  // Apply the requested partition size to the SB if it is all "in image"
+  if ((mi_cols_remaining >= cm->seq_params->mib_size) &&
+      (mi_rows_remaining >= cm->seq_params->mib_size)) {
+    for (int block_row = 0; block_row < cm->seq_params->mib_size;
+         block_row += bh) {
+      for (int block_col = 0; block_col < cm->seq_params->mib_size;
+           block_col += bw) {
+        const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
+        const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
+        mib[grid_index] = mi_upper_left + mi_index;
+        mib[grid_index]->bsize = bsize;
+      }
+    }
+  } else {
+    // Else this is a partial SB.
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+                             mi_cols_remaining, bsize, mib);
+  }
+}
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+        (mi_col + x_idx >= cm->mi_params.mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+            PARTITION_NONE &&
+        subsize != BLOCK_8X8)
+      return 0;
+  }
+  return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int orig_rdmult) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  int64_t intra_cost = 0;
+  int64_t mc_dep_cost = 0;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  int tpl_stride = tpl_frame->stride;
+
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+    return orig_rdmult;
+  }
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+    return orig_rdmult;
+  }
+
+#ifndef NDEBUG
+  int mi_count = 0;
+#endif
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+#ifndef NDEBUG
+      mi_count++;
+#endif
+    }
+  }
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+  double beta = 1.0;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    const double rk = (double)intra_cost / mc_dep_cost;
+    beta = (r0 / rk);
+  }
+
+  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+  rdmult = AOMMAX(1, rdmult);
+
+  return rdmult;
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
+
+    right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
+    right_edge = AOMMAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col, SuperBlockEnc *sb_enc) {
+  sb_enc->tpl_data_count = 0;
+
+  if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+    return;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int gf_group_index = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  int tpl_stride = tpl_frame->stride;
+
+  int mi_count = 0;
+  int count = 0;
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols at superres case.
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  // TPL store unit size is not the same as the motion estimation unit size.
+  // Here always use motion estimation size to avoid getting repetitive inter/
+  // intra cost.
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+  const int row_step = mi_size_high[tpl_bsize];
+  const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize],
+                                               cm->superres_scale_denominator);
+
+  // Stride is only based on SB size, and we fill in values for every 16x16
+  // block in a SB.
+  sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+      // Handle partial SB, so that no invalid values are used later.
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
+        sb_enc->tpl_inter_cost[count] = INT64_MAX;
+        sb_enc->tpl_intra_cost[count] = INT64_MAX;
+        for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+          sb_enc->tpl_mv[count][i].as_int = INVALID_MV;
+        }
+        count++;
+        continue;
+      }
+
+      TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      sb_enc->tpl_inter_cost[count] = this_stats->inter_cost;
+      sb_enc->tpl_intra_cost[count] = this_stats->intra_cost;
+      memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
+      mi_count++;
+      count++;
+    }
+  }
+
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+  sb_enc->tpl_data_count = mi_count;
+}
+
+// analysis_type 0: Use mc_dep_cost and intra_cost
+// analysis_type 1: Use count of best inter predictor chosen
+// analysis_type 2: Use cost reduction from intra to inter for best inter
+//                  predictor chosen
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+                                   int64_t *delta_dist, BLOCK_SIZE bsize,
+                                   int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  double intra_cost = 0;
+  double mc_dep_reg = 0;
+  double mc_dep_cost = 0;
+  double cbcmp_base = 1;
+  double srcrf_dist = 0;
+  double srcrf_sse = 0;
+  double srcrf_rate = 0;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int base_qindex = cm->quant_params.base_qindex;
+
+  if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex;
+
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  int tpl_stride = tpl_frame->stride;
+  if (!tpl_frame->is_valid) return base_qindex;
+
+#ifndef NDEBUG
+  int mi_count = 0;
+#endif
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_col_end_sr =
+      coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int step = 1 << block_mis_log2;
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
+  for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      double cbcmp = (double)this_stats->srcrf_dist;
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+      intra_cost += log(dist_scaled) * cbcmp;
+      mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp;
+      mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+      srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
+      srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
+      srcrf_rate += (double)this_stats->srcrf_rate;
+#ifndef NDEBUG
+      mi_count++;
+#endif
+      cbcmp_base += cbcmp;
+    }
+  }
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+  int offset = 0;
+  double beta = 1.0;
+  double rk;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    rk = exp((intra_cost - mc_dep_cost) / cbcmp_base);
+    td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base);
+    beta = (r0 / rk);
+    assert(beta > 0.0);
+  } else {
+    return base_qindex;
+  }
+  offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth);
+  int sbs_qstep =
+      av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth);
+
+  if (delta_dist) {
+    double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0);
+    double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep);
+    sbs_dist = AOMMIN(sbs_dist, srcrf_sse);
+    *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk);
+    *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0);
+    *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0);
+  }
+  return qindex;
+}
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// offset table defined in Table3 of T-REC-H.Sup15 document.
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0,   301, 367, 434, 501, 567,
+                                                  634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3,  2,  1,  0,  -1,
+                                                    -2, -3, -4, -5, -6 };
+#endif
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+  (void)x;
+  (void)bsize;
+  (void)mi_row;
+  (void)mi_col;
+  return cm->quant_params.base_qindex;
+#else
+  // calculate pixel average
+  const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+  // adjust offset based on average of the pixel block
+  int offset = 0;
+  for (int i = 0; i < HDR_QP_LEVELS; i++) {
+    if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) {
+      offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR);
+      break;
+    }
+  }
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  return qindex;
+#endif
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                            BLOCK_SIZE bsize) {
+  if (sms_tree == NULL) return;
+  sms_tree->partitioning = PARTITION_NONE;
+
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize);
+  }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                       BLOCK_SIZE bsize, int mib_size,
+                                       int mi_row, int mi_col) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_size = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+    }
+  }
+}
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+                           int num_cdfs, int cdf_stride, int nsymbs,
+                           int wt_left, int wt_tr) {
+  for (int i = 0; i < num_cdfs; i++) {
+    for (int j = 0; j <= nsymbs; j++) {
+      cdf_ptr_left[i * cdf_stride + j] =
+          (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+                          (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+                          ((wt_left + wt_tr) / 2)) /
+                         (wt_left + wt_tr));
+      assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+             cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+    }
+  }
+}
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)           \
+  do {                                                                     \
+    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left;               \
+    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr;                   \
+    int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob);       \
+    int num_cdfs = array_size / cdf_stride;                                \
+    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+                   wt_left, wt_tr);                                        \
+  } while (0)
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+                    int wt_tr) {
+  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+  for (int i = 0; i < 2; i++) {
+    AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+                MV_CLASSES);
+    AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+                nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+                nmv_tr->comps[i].class0_hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+                CLASS0_SIZE);
+    AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+  }
+}
+
+// In case of row-based multi-threading of encoder, since we always
+// keep a top - right sync, we can average the top - right SB's CDFs and
+// the left SB's CDFs and use the same for current SB's encoding to
+// improve the performance. This function facilitates the averaging
+// of CDF and used only when row-mt is enabled in encoder.
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+                         int wt_left, int wt_tr) {
+  AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+  AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+  AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+  AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+  AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+  AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+  AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+  AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+  AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+  AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+              ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+  AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+              MASKED_COMPOUND_TYPES);
+  AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+  AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+  AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+              INTERINTRA_MODES);
+  AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+  AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+              PALETTE_SIZES);
+  AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+              PALETTE_SIZES);
+  for (int j = 0; j < PALETTE_SIZES; j++) {
+    int nsymbs = j + PALETTE_MIN_SIZE;
+    AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+                   ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+    AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+                   ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+                   CDF_SIZE(PALETTE_COLORS));
+  }
+  AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+  AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+  AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+  AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+  AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+  AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+  AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+  AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2);
+  AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+  avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+  avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+  AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+  AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+              ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+  AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+  AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+              FILTER_INTRA_MODES);
+  AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+              RESTORE_SWITCHABLE_TYPES);
+  AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+  AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+  AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+                 UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+  AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+    if (i < 4) {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+                     CDF_SIZE(10));
+    } else if (i < 16) {
+      AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+    } else {
+      AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+                     CDF_SIZE(10));
+    }
+  }
+  AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+              SWITCHABLE_FILTERS);
+  AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+  AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+              2 * MAX_ANGLE_DELTA + 1);
+  AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+                 CDF_SIZE(MAX_TX_DEPTH + 1));
+  AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+              MAX_TX_DEPTH + 1);
+  AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+  AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+  for (int i = 0; i < FRAME_LF_COUNT; i++) {
+    AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+                DELTA_LF_PROBS + 1);
+  }
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+                 CDF_SIZE(TX_TYPES));
+  AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+                 CDF_SIZE(TX_TYPES));
+  AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+  AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+              CFL_ALPHABET_SIZE);
+}
+
+// Grade the temporal variation of the source by comparing the current sb and
+// its collocated block in the last frame.
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                           int mi_col) {
+  unsigned int tmp_sse;
+  unsigned int tmp_variance;
+  const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+  uint8_t *src_y = cpi->source->y_buffer;
+  int src_ystride = cpi->source->y_stride;
+  uint8_t *last_src_y = cpi->last_source->y_buffer;
+  int last_src_ystride = cpi->last_source->y_stride;
+  const int offset = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
+  uint64_t avg_source_sse_threshold[2] = { 100000,   // ~5*5*(64*64)
+                                           36000 };  // ~3*3*(64*64)
+  uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
+#if CONFIG_AV1_HIGHBITDEPTH
+  MACROBLOCKD *xd = &x->e_mbd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+  src_y += offset;
+  last_src_y += offset;
+  tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                            last_src_ystride, &tmp_sse);
+  // rd thresholds
+  if (tmp_sse < avg_source_sse_threshold[1])
+    x->content_state_sb.source_sad_rd = kLowSad;
+
+  // nonrd thresholds
+  if (tmp_sse == 0)
+    x->content_state_sb.source_sad_nonrd = kZeroSad;
+  else if (tmp_sse < avg_source_sse_threshold[0])
+    x->content_state_sb.source_sad_nonrd = kLowSad;
+  else if (tmp_sse > avg_source_sse_threshold_high)
+    x->content_state_sb.source_sad_nonrd = kHighSad;
+
+  // Detect large lighting change.
+  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+  if (tmp_sse > 0) {
+    if (tmp_variance < (tmp_sse >> 1) &&
+        (tmp_sse - tmp_variance) > sum_sq_thresh)
+      x->content_state_sb.lighting_change = 1;
+    if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+      x->content_state_sb.low_sumdiff = 1;
+  }
+
+  if (cpi->last_source->y_width != cpi->source->y_width ||
+      cpi->last_source->y_height != cpi->source->y_height)
+    return;
+  if (!cpi->sf.rt_sf.use_rtc_tf) return;
+
+  // In-place temporal filter. If psnr calculation is enabled, we store the
+  // source for that.
+  AV1_COMMON *const cm = &cpi->common;
+  // Calculate n*mean^2
+  const unsigned int nmean2 = tmp_sse - tmp_variance;
+  const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+                                         cm->seq_params->bit_depth);
+  const unsigned int threshold = 3 * ac_q_step * ac_q_step / 2;
+
+  // TODO(yunqing): use a weighted sum instead of averaging in filtering.
+  if (tmp_variance <= threshold && nmean2 <= 15) {
+    const int shift_x[2] = { 0, cpi->source->subsampling_x };
+    const int shift_y[2] = { 0, cpi->source->subsampling_y };
+    const uint8_t h = block_size_high[bsize];
+    const uint8_t w = block_size_wide[bsize];
+
+    for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+      uint8_t *src = cpi->source->buffers[plane];
+      const int src_stride = cpi->source->strides[plane != 0];
+      uint8_t *last_src = cpi->last_source->buffers[plane];
+      const int last_src_stride = cpi->last_source->strides[plane != 0];
+      src += src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+             (mi_col << (2 - shift_x[plane != 0]));
+      last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+                  (mi_col << (2 - shift_x[plane != 0]));
+
+      for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) {
+        for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) {
+          src[j] = (last_src[j] + src[j]) >> 1;
+        }
+        src += src_stride;
+        last_src += last_src_stride;
+      }
+    }
+  }
+}
+
+// Memset the mbmis at the current superblock to 0
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+                    int mi_row, int mi_col) {
+  // size of sb in unit of mi (BLOCK_4X4)
+  const int sb_size_mi = mi_size_wide[sb_size];
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  // size of sb in unit of allocated mi size
+  const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
+  assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
+         "mi is not allocated as a multiple of sb!");
+  assert(mi_params->mi_stride % sb_size_mi == 0 &&
+         "mi_grid_base is not allocated as a multiple of sb!");
+
+  const int mi_rows = mi_size_high[sb_size];
+  for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
+    assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
+           mi_params->mi_stride);
+    const int mi_grid_idx =
+        get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    const int alloc_mi_idx =
+        get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
+    memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->mi_grid_base));
+    memset(&mi_params->tx_type_map[mi_grid_idx], 0,
+           sb_size_mi * sizeof(*mi_params->tx_type_map));
+    if (cur_mi_row % mi_alloc_size_1d == 0) {
+      memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
+             sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
+    }
+  }
+}
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+                         ThreadData *td, const TileDataEnc *tile_data,
+                         int mi_row, int mi_col) {
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const TileInfo *tile_info = &tile_data->tile_info;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+  sb_fp_stats->rd_count = cpi->td.rd_counts;
+  sb_fp_stats->split_count = x->txfm_search_info.txb_split_count;
+
+  sb_fp_stats->fc = *td->counts;
+
+  memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+
+  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  sb_fp_stats->current_qindex =
+      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col) {
+  MACROBLOCK *x = &td->mb;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+                      num_planes);
+
+  cpi->td.rd_counts = sb_fp_stats->rd_count;
+  x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+
+  *td->counts = sb_fp_stats->fc;
+
+  memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+         sizeof(sb_fp_stats->inter_mode_rd_models));
+  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+      sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+/*! Checks whether to skip updating the entropy cost based on tile info.
+ *
+ * This function contains the common code used to skip the cost update of coeff,
+ * mode, mv and dv symbols.
+ */
+static int skip_cost_update(const SequenceHeader *seq_params,
+                            const TileInfo *const tile_info, const int mi_row,
+                            const int mi_col,
+                            INTERNAL_COST_UPDATE_TYPE upd_level) {
+  if (upd_level == INTERNAL_COST_UPD_SB) return 0;
+  if (upd_level == INTERNAL_COST_UPD_OFF) return 1;
+
+  // upd_level is at most as frequent as each sb_row in a tile.
+  if (mi_col != tile_info->mi_col_start) return 1;
+
+  if (upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+    const int mib_size_log2 = seq_params->mib_size_log2;
+    const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+    const int sb_size = seq_params->mib_size * MI_SIZE;
+    const int tile_height =
+        (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+    // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens
+    // once for 2, 4 sb rows for sb size 128, sb size 64 respectively. However,
+    // as the update will not be equally spaced in smaller resolutions making
+    // it equally spaced by calculating (mv_num_rows_cost_update) the number of
+    // rows after which the cost update should happen.
+    const int sb_size_update_freq_map[2] = { 2, 4 };
+    const int update_freq_sb_rows =
+        sb_size_update_freq_map[sb_size != MAX_SB_SIZE];
+    const int update_freq_num_rows = sb_size * update_freq_sb_rows;
+    // Round-up the division result to next integer.
+    const int num_updates_per_tile =
+        (tile_height + update_freq_num_rows - 1) / update_freq_num_rows;
+    const int num_rows_update_per_tile = num_updates_per_tile * sb_size;
+    // Round-up the division result to next integer.
+    const int num_sb_rows_per_update =
+        (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile;
+    if ((sb_row % num_sb_rows_per_update) != 0) return 1;
+  }
+  return 0;
+}
+
+// Checks for skip status of mv cost update.
+static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+                               const int mi_row, const int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  // For intra frames, mv cdfs are not updated during the encode. Hence, the mv
+  // cost calculation is skipped in this case.
+  if (frame_is_intra_only(cm)) return 1;
+
+  return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                          cpi->sf.inter_sf.mv_cost_upd_level);
+}
+
+// Checks for skip status of dv cost update.
+static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+                               const int mi_row, const int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  // Intrabc is only applicable to intra frames. So skip if intrabc is not
+  // allowed.
+  if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) {
+    return 1;
+  }
+
+  return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                          cpi->sf.intra_sf.dv_cost_upd_level);
+}
+
+// Update the rate costs of some symbols according to the frequency directed
+// by speed features
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                           const TileInfo *const tile_info, const int mi_row,
+                           const int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  switch (cpi->sf.inter_sf.coeff_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                           cpi->sf.inter_sf.coeff_cost_upd_level))
+        break;
+      av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes);
+      break;
+    default: assert(0);
+  }
+
+  switch (cpi->sf.inter_sf.mode_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                           cpi->sf.inter_sf.mode_cost_upd_level))
+        break;
+      av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx);
+      break;
+    default: assert(0);
+  }
+
+  switch (cpi->sf.inter_sf.mv_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      // Checks for skip status of mv cost update.
+      if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+      av1_fill_mv_costs(&xd->tile_ctx->nmvc,
+                        cm->features.cur_frame_force_integer_mv,
+                        cm->features.allow_high_precision_mv, x->mv_costs);
+      break;
+    default: assert(0);
+  }
+
+  switch (cpi->sf.intra_sf.dv_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      // Checks for skip status of dv cost update.
+      if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+      av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs);
+      break;
+    default: assert(0);
+  }
+}
diff --git a/media/libaom/src/av1/encoder/encodeframe_utils.h b/media/libaom/src/av1/encoder/encodeframe_utils.h
new file mode 100644
index 0000000000..3a0df601cd
--- /dev/null
+++ b/media/libaom/src/av1/encoder/encodeframe_utils.h
@@ -0,0 +1,572 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WRITE_FEATURE_TO_FILE 0
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4part partition types.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB partition types.
+enum {
+  HORZ_A = 0,
+  HORZ_B,
+  VERT_A,
+  VERT_B,
+  NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular partition types.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+  int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+enum {
+  SB_SINGLE_PASS,  // Single pass encoding: all ctxs get updated normally
+  SB_DRY_PASS,     // First pass of multi-pass: does not update the ctxs
+  SB_WET_PASS      // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+typedef struct {
+  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+  TXFM_CONTEXT *p_ta;
+  TXFM_CONTEXT *p_tl;
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass
+typedef struct SB_FIRST_PASS_STATS {
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_COUNTS rd_count;
+
+  int split_count;
+  FRAME_COUNTS fc;
+  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+#endif  // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
+// This structure contains block size related
+// variables for use in rd_pick_partition().
+typedef struct {
+  // Half of block width to determine block edge.
+  int mi_step;
+
+  // Block row and column indices.
+  int mi_row;
+  int mi_col;
+
+  // Block edge row and column indices.
+  int mi_row_edge;
+  int mi_col_edge;
+
+  // Block width of current partition block.
+  int width;
+
+  // Block width of minimum partition size allowed.
+  int min_partition_size_1d;
+
+  // Flag to indicate if partition is 8x8 or higher size.
+  int bsize_at_least_8x8;
+
+  // Indicates edge blocks in frame.
+  int has_rows;
+  int has_cols;
+
+  // Block size of current partition.
+  BLOCK_SIZE bsize;
+
+  // Size of current sub-partition.
+  BLOCK_SIZE subsize;
+
+  // Size of split partition.
+  BLOCK_SIZE split_bsize2;
+} PartitionBlkParams;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct PartitionTimingStats {
+  // Tracks the number of partition decision used in the current call to \ref
+  // av1_rd_pick_partition
+  int partition_decisions[EXT_PARTITION_TYPES];
+  // Tracks the number of partition_block searched in the current call to \ref
+  // av1_rd_pick_partition
+  int partition_attempts[EXT_PARTITION_TYPES];
+  // Tracks the time spent on each partition search in the current call to \ref
+  // av1_rd_pick_partition
+  int64_t partition_times[EXT_PARTITION_TYPES];
+  // Tracks the rdcost spent on each partition search in the current call to
+  // \ref av1_rd_pick_partition
+  int64_t partition_rdcost[EXT_PARTITION_TYPES];
+  // Timer used to time the partitions.
+  struct aom_usec_timer timer;
+  // Whether the timer is on
+  int timer_is_on;
+} PartitionTimingStats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+// Structure holding state variables for partition search.
+typedef struct {
+  // Intra partitioning related info.
+  PartitionSearchInfo *intra_part_info;
+
+  // Parameters related to partition block size.
+  PartitionBlkParams part_blk_params;
+
+  // Win flags for HORZ and VERT partition evaluations.
+  RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+  // RD cost for the current block of given partition type.
+  RD_STATS this_rdc;
+
+  // RD cost summed across all blocks of partition type.
+  RD_STATS sum_rdc;
+
+  // Array holding partition type cost.
+  int tmp_partition_cost[PARTITION_TYPES];
+
+  // Pointer to partition cost buffer
+  int *partition_cost;
+
+  // RD costs for different partition types.
+  int64_t none_rd;
+  int64_t split_rd[SUB_PARTITIONS_SPLIT];
+  // RD costs for rectangular partitions.
+  // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+  // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+  int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+  // Flags indicating if the corresponding partition was winner or not.
+  // Used to bypass similar blocks during AB partition evaluation.
+  int is_split_ctx_is_ready[2];
+  int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+  // If true, skips the rest of partition evaluation at the current bsize level.
+  int terminate_partition_search;
+
+  // If false, skips rdopt on PARTITION_NONE.
+  int partition_none_allowed;
+
+  // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+  // PARTITION_HORZ_A, PARTITIO_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
+  int partition_rect_allowed[NUM_RECT_PARTS];
+
+  // If false, skips searching rectangular partition unless some logic related
+  // to edge detection holds.
+  int do_rectangular_split;
+
+  // If false, skips searching PARTITION_SPLIT.
+  int do_square_split;
+
+  // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+  // this does not directly affect the extended partitions, so this can be used
+  // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+  // PARTITION_HORZ_AB4, etc.
+  int prune_rect_part[NUM_RECT_PARTS];
+
+  // Chroma subsampling in x and y directions.
+  int ss_x;
+  int ss_y;
+
+  // Partition plane context index.
+  int pl_ctx_idx;
+
+  // This flag will be set if best partition is found from the search.
+  bool found_best_partition;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats part_timing_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+} PartitionSearchState;
+
+static AOM_INLINE void av1_disable_square_split_partition(
+    PartitionSearchState *part_state) {
+  part_state->do_square_split = 0;
+}
+
+// Disables all possible rectangular splits. This includes PARTITION_AB4 as they
+// depend on the corresponding partition_rect_allowed.
+static AOM_INLINE void av1_disable_rect_partitions(
+    PartitionSearchState *part_state) {
+  part_state->do_rectangular_split = 0;
+  part_state->partition_rect_allowed[HORZ] = 0;
+  part_state->partition_rect_allowed[VERT] = 0;
+}
+
+// Disables all possible splits so that only PARTITION_NONE *might* be allowed.
+static AOM_INLINE void av1_disable_all_splits(
+    PartitionSearchState *part_state) {
+  av1_disable_square_split_partition(part_state);
+  av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE void av1_set_square_split_only(
+    PartitionSearchState *part_state) {
+  part_state->partition_none_allowed = 0;
+  part_state->do_square_split = 1;
+  av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE bool av1_blk_has_rows_and_cols(
+    const PartitionBlkParams *blk_params) {
+  return blk_params->has_rows && blk_params->has_cols;
+}
+
+static AOM_INLINE bool av1_is_whole_blk_in_frame(
+    const PartitionBlkParams *blk_params,
+    const CommonModeInfoParams *mi_params) {
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  return mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+         mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+}
+
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+                                              const MB_MODE_INFO *mbmi,
+                                              int dual_filter) {
+  for (int dir = 0; dir < 2; ++dir) {
+    if (dir && !dual_filter) break;
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+               SWITCHABLE_FILTERS);
+  }
+}
+
+static AOM_INLINE int set_segment_rdmult(const AV1_COMP *const cpi,
+                                         MACROBLOCK *const x,
+                                         int8_t segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  av1_init_plane_quantizers(cpi, x, segment_id, 0);
+  const int segment_qindex =
+      av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+  return av1_compute_rd_mult(cpi,
+                             segment_qindex + cm->quant_params.y_dc_delta_q);
+}
+
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p,
+                                                              int frm) {
+  assert(frm >= 0);
+  if (frm < 0 ||
+      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+    return NULL;
+  }
+
+  return &p->stats_buf_ctx->stats_in_start[frm];
+}
+
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int orig_rdmult);
+
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step);
+
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col, SuperBlockEnc *sb_enc);
+
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+                                   int64_t *delta_dist, BLOCK_SIZE bsize,
+                                   int mi_row, int mi_col);
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col);
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                      const BLOCK_SIZE bsize, const int mi_row,
+                      const int mi_col);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult);
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult);
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+                      const PICK_MODE_CONTEXT *const ctx, int mi_row,
+                      int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                 PREDICTION_MODE mode, int16_t mode_context);
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+                         MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                         const MB_MODE_INFO *above_mi,
+                         const MB_MODE_INFO *left_mi, const int intraonly);
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes);
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                      int mi_row, int mi_col, BLOCK_SIZE bsize,
+                      const int num_planes);
+
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize);
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize);
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                            BLOCK_SIZE bsize);
+
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                       BLOCK_SIZE bsize, int mib_size,
+                                       int mi_row, int mi_col);
+
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+                         int wt_left, int wt_tr);
+
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                           int mi_col);
+
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+                    int mi_row, int mi_col);
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+                         ThreadData *td, const TileDataEnc *tile_data,
+                         int mi_row, int mi_col);
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col);
+
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                           const TileInfo *const tile_info, const int mi_row,
+                           const int mi_col);
+
+static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm,
+                                           struct macroblock *mb) {
+  aom_free(mb->txfm_search_info.mb_rd_record);
+  mb->txfm_search_info.mb_rd_record = NULL;
+
+  aom_free(mb->inter_modes_info);
+  mb->inter_modes_info = NULL;
+
+  const int num_planes = av1_num_planes(cm);
+  for (int plane = 0; plane < num_planes; plane++) {
+    aom_free(mb->plane[plane].src_diff);
+    mb->plane[plane].src_diff = NULL;
+  }
+
+  aom_free(mb->e_mbd.seg_mask);
+  mb->e_mbd.seg_mask = NULL;
+
+  aom_free(mb->winner_mode_stats);
+  mb->winner_mode_stats = NULL;
+}
+
+static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
+                                                  struct macroblock *mb) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+  // The winner_mode_stats buffer is not required in these cases.
+  if (is_stat_generation_stage(cpi) ||
+      (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) ||
+      (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF))
+    return;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int winner_mode_count =
+      winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+  CHECK_MEM_ERROR(cm, mb->winner_mode_stats,
+                  (WinnerModeStats *)aom_malloc(
+                      winner_mode_count * sizeof(mb->winner_mode_stats[0])));
+}
+
+static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
+                                         struct macroblock *mb) {
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *sf = &cpi->sf;
+  if (!sf->rt_sf.use_nonrd_pick_mode) {
+    // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is
+    // enabled.
+    if (sf->rd_sf.use_mb_rd_hash)
+      CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record,
+                      (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD)));
+    if (!frame_is_intra_only(cm))
+      CHECK_MEM_ERROR(
+          cm, mb->inter_modes_info,
+          (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
+  }
+  const int num_planes = av1_num_planes(cm);
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int subsampling_xy =
+        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+              : 0;
+    const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+    CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
+                    (int16_t *)aom_memalign(
+                        32, sizeof(*mb->plane[plane].src_diff) * sb_size));
+  }
+  CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
+                  (uint8_t *)aom_memalign(
+                      16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
+
+  allocate_winner_mode_stats(cpi, mb);
+}
+
+// This function will compute the number of reference frames to be disabled
+// based on selective_ref_frame speed feature.
+static AOM_INLINE unsigned int get_num_refs_to_disable(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  unsigned int num_refs_to_disable = 0;
+  if (cpi->sf.inter_sf.selective_ref_frame >= 3) {
+    num_refs_to_disable++;
+    if (cpi->sf.inter_sf.selective_ref_frame >= 6) {
+      // Disable LAST2_FRAME  and ALTREF2_FRAME
+      num_refs_to_disable += 2;
+    } else if (cpi->sf.inter_sf.selective_ref_frame == 5 &&
+               *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) {
+      const int last2_frame_dist = av1_encoder_get_relative_dist(
+          ref_display_order_hint[LAST2_FRAME - LAST_FRAME],
+          cur_frame_display_index);
+      // Disable LAST2_FRAME if it is a temporally distant frame
+      if (abs(last2_frame_dist) > 2) {
+        num_refs_to_disable++;
+      }
+#if !CONFIG_REALTIME_ONLY
+      else if (is_stat_consumption_stage_twopass(cpi)) {
+        const FIRSTPASS_STATS *const this_frame_stats =
+            read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index);
+        const double coded_error_per_mb = this_frame_stats->coded_error;
+        // Disable LAST2_FRAME if the coded error of the current frame based on
+        // first pass stats is very low.
+        if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+      }
+#endif  // CONFIG_REALTIME_ONLY
+    }
+  }
+  return num_refs_to_disable;
+}
+
+static INLINE int get_max_allowed_ref_frames(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  const unsigned int max_reference_frames =
+      cpi->oxcf.ref_frm_cfg.max_reference_frames;
+  const unsigned int num_refs_to_disable = get_num_refs_to_disable(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+  const unsigned int max_allowed_refs_for_given_speed =
+      INTER_REFS_PER_FRAME - num_refs_to_disable;
+  return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(
+    AV1_COMP *cpi, int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      total_valid_refs++;
+    }
+  }
+
+  const int max_allowed_refs = get_max_allowed_ref_frames(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+      continue;
+    }
+
+    switch (ref_frame_to_disable) {
+      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case BWDREF_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+      default: assert(0);
+    }
+    --total_valid_refs;
+  }
+  assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/media/libaom/src/av1/encoder/encodemb.c b/media/libaom/src/av1/encoder/encodemb.c
index ec33362290..8dee801af1 100644
--- a/media/libaom/src/av1/encoder/encodemb.c
+++ b/media/libaom/src/av1/encoder/encodemb.c
@@ -30,24 +30,24 @@
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encodemb.h"
-#include "av1/encoder/encodetxb.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/txb_rdopt.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 
-void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
-                        int16_t *diff, ptrdiff_t diff_stride,
-                        const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride) {
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+                        ptrdiff_t diff_stride, const uint8_t *src8,
+                        ptrdiff_t src_stride, const uint8_t *pred8,
+                        ptrdiff_t pred_stride) {
   assert(rows >= 4 && cols >= 4);
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
+  if (bd_info.use_highbitdepth_buf) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
-                              pred8, pred_stride, xd->bd);
+                              pred8, pred_stride);
     return;
   }
 #endif
-  (void)xd;
+  (void)bd_info;
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
 }
@@ -55,6 +55,7 @@ void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   const int diff_stride = block_size_wide[plane_bsize];
@@ -66,8 +67,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
   uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
   int16_t *src_diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
-  av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
-                     src_stride, dst, dst_stride);
+  av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride,
+                     src, src_stride, dst, dst_stride);
 }
 
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
@@ -77,15 +78,15 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
 
-  av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
-                     pd->dst.buf, pd->dst.stride);
+  av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf,
+                     p->src.stride, pd->dst.buf, pd->dst.stride);
 }
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                   const TXB_CTX *const txb_ctx, int fast_mode,
-                   int *rate_cost) {
+                   const TXB_CTX *const txb_ctx, int *rate_cost) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   const int eob = p->eobs[block];
@@ -93,12 +94,12 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
       xd->lossless[segment_id]) {
-    *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
+    *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size);
     return eob;
   }
 
-  return av1_optimize_txb_new(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
-                              rate_cost, cpi->oxcf.sharpness, fast_mode);
+  return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                          rate_cost, cpi->oxcf.algo_cfg.sharpness);
 }
 
 // Hyper-parameters for dropout optimization, based on following logics.
@@ -133,15 +134,8 @@ const int DROPOUT_MULTIPLIER_Q_BASE = 32;  // Base Q to compute multiplier.
 
 void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                         TX_TYPE tx_type, int qindex) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct macroblock_plane *const p = &mb->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
-  const int max_eob = av1_get_max_eob(tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
 
   // Early return if `qindex` is out of range.
   if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
@@ -159,8 +153,22 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
       multiplier *
       CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
 
+  av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before,
+                         dropout_num_after);
+}
+
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+                            TX_SIZE tx_size, TX_TYPE tx_type,
+                            int dropout_num_before, int dropout_num_after) {
+  const struct macroblock_plane *const p = &mb->plane[plane];
+  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
   // Early return if there are not enough non-zero coefficients.
-  if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) {
+  if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before ||
+      max_eob <= dropout_num_before + dropout_num_after) {
     return;
   }
 
@@ -175,7 +183,8 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
 
   for (int i = 0; i < p->eobs[block]; ++i) {
     const int scan_idx = scan_order->scan[i];
-    if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) {  // Keep large coefficients.
+    if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) {
+      // Keep large coefficients.
       count_zeros_before = 0;
       count_zeros_after = 0;
       idx = -1;
@@ -200,6 +209,7 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
     if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
       count_zeros_before = 0;
       count_zeros_after = 0;
+      count_nonzeros = 0;
       idx = -1;
       eob = i + 1;
     }
@@ -226,7 +236,7 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   if (eob != p->eobs[block]) {
     p->eobs[block] = eob;
     p->txb_entropy_ctx[block] =
-        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+        av1_get_txb_entropy_context(qcoeff, scan_order, eob);
   }
 }
 
@@ -262,29 +272,53 @@ static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
 };
 #endif
 
+// Computes the transform for DC only blocks
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean) {
+  assert(per_px_mean != INT64_MAX);
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+  memset(coeff, 0, sizeof(*coeff) * n_coeffs);
+  coeff[0] =
+      (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);
+}
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
-                     QUANT_PARAM *qparam) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+                     const QUANT_PARAM *qparam) {
+  av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+  av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const SCAN_ORDER *const scan_order =
-      get_scan(txfm_param->tx_size, txfm_param->tx_type);
   const int block_offset = BLOCK_OFFSET(block);
   tran_low_t *const coeff = p->coeff + block_offset;
-  tran_low_t *const qcoeff = p->qcoeff + block_offset;
-  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
-  uint16_t *const eob = &p->eobs[block];
   const int diff_stride = block_size_wide[plane_bsize];
 
   const int src_offset = (blk_row * diff_stride + blk_col);
   const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
 
   av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               const QUANT_PARAM *qparam) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order =
+      get_scan(txfm_param->tx_size, txfm_param->tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const qcoeff = p->qcoeff + block_offset;
+  tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+  uint16_t *const eob = &p->eobs[block];
 
   if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
-    if (LIKELY(!x->skip_block)) {
+    if (LIKELY(!x->seg_skip_block)) {
 #if CONFIG_AV1_HIGHBITDEPTH
       quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
           coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
@@ -302,9 +336,8 @@ void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
     p->txb_entropy_ctx[block] = 0;
   } else {
     p->txb_entropy_ctx[block] =
-        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+        av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
   }
-  return;
 }
 
 void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
@@ -358,7 +391,7 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
   int dummy_rate_cost = 0;
@@ -370,7 +403,12 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   l = &args->tl[blk_row];
 
   TX_TYPE tx_type = DCT_DCT;
-  if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
+  const int blk_skip_idx =
+      (cpi->sf.rt_sf.use_nonrd_pick_mode && is_inter_block(mbmi))
+          ? blk_row * bw / 4 + blk_col / 2
+          : blk_row * bw + blk_col;
+  if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) &&
+      !mbmi->skip_mode) {
     tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
     TxfmParam txfm_param;
@@ -383,8 +421,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
       quant_idx =
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
     av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
-    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
-                    &quant_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx,
+                    cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
     av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
                       &quant_param);
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
@@ -400,7 +438,7 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+                     &dummy_rate_cost);
     }
     if (!quant_param.use_optimize_b && do_dropout) {
       av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
@@ -427,8 +465,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   // again.
   if (p->eobs[block] == 0 && plane == 0) {
 #if 0
-    if (args->cpi->oxcf.aq_mode == NO_AQ &&
-        args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+    if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+        args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
       // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
       // enable_optimize_b is true to detect potential RD bug.
       const uint8_t disable_txk_check = args->enable_optimize_b;
@@ -470,7 +508,7 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
@@ -491,15 +529,17 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
     const int bsw = tx_size_wide_unit[sub_txs];
     const int bsh = tx_size_high_unit[sub_txs];
     const int step = bsh * bsw;
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
     assert(bsw > 0 && bsh > 0);
 
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
+    for (int row = 0; row < row_end; row += bsh) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += bsw) {
         const int offsetc = blk_col + col;
 
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
         encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
                            arg, dry_run);
         block += step;
@@ -565,7 +605,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
 
   uint8_t *dst;
   dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
@@ -574,7 +614,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
   QUANT_PARAM quant_param;
 
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
   av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
                     &quant_param);
@@ -604,12 +644,12 @@ void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   assert(bsize < BLOCK_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->skip = 1;
-  if (x->force_skip) return;
+  mbmi->skip_txfm = 1;
+  if (x->txfm_search_info.skip_txfm) return;
 
   struct optimize_ctx ctx;
   struct encode_b_args arg = {
-    cpi,  x,    &ctx,    &mbmi->skip,
+    cpi,  x,    &ctx,    &mbmi->skip_txfm,
     NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
   };
   const AV1_COMMON *const cm = &cpi->common;
@@ -683,7 +723,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   PLANE_TYPE plane_type = get_plane_type(plane);
   uint16_t *eob = &p->eobs[block];
   const int dst_stride = pd->dst.stride;
@@ -694,7 +734,8 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
 
   TX_TYPE tx_type = DCT_DCT;
   const int bw = mi_size_wide[plane_bsize];
-  if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
+  if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane,
+                                blk_row * bw + blk_col)) {
     *eob = 0;
     p->txb_entropy_ctx[block] = 0;
   } else {
@@ -716,8 +757,8 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
 
     av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
-    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
-                    &quant_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx,
+                    cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
     av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
                       &quant_param);
 
@@ -743,7 +784,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
       TXB_CTX txb_ctx;
       get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
       av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
-                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+                     &dummy_rate_cost);
     }
     if (do_dropout) {
       av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
@@ -764,8 +805,8 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   // again.
   if (*eob == 0 && plane == 0) {
 #if 0
-    if (args->cpi->oxcf.aq_mode == NO_AQ
-        && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+    if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ
+        && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
       assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
           DCT_DCT);
     }
@@ -794,7 +835,7 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
   const int ss_y = pd->subsampling_y;
   ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
   ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
-  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip),
+  struct encode_b_args arg = { cpi, x,  NULL,    &(xd->mi[0]->skip_txfm),
                                ta,  tl, dry_run, enable_optimize_b };
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
   if (enable_optimize_b) {
diff --git a/media/libaom/src/av1/encoder/encodemb.h b/media/libaom/src/av1/encoder/encodemb.h
index a337c83dbd..b58d13d5de 100644
--- a/media/libaom/src/av1/encoder/encodemb.h
+++ b/media/libaom/src/av1/encoder/encodemb.h
@@ -16,28 +16,13 @@
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/tokenize.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
-};
-
-struct encode_b_args {
-  const struct AV1_COMP *cpi;
-  MACROBLOCK *x;
-  struct optimize_ctx *ctx;
-  int8_t *skip;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-  RUN_TYPE dry_run;
-  TRELLIS_OPT_TYPE enable_optimize_b;
-};
-
 enum {
   AV1_XFORM_QUANT_FP = 0,
   AV1_XFORM_QUANT_B = 1,
@@ -46,6 +31,7 @@ enum {
   AV1_XFORM_QUANT_TYPES,
 } UENUM1BYTE(AV1_XFORM_QUANT);
 
+// TODO(any): Merge OPT_TYPe and TRELLLIS_OPT_TYPE
 // Available optimization types to optimize the quantized coefficients.
 enum {
   NONE_OPT = 0,            // No optimization.
@@ -54,6 +40,29 @@ enum {
   TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
 } UENUM1BYTE(OPT_TYPE);
 
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+  const struct AV1_COMP *cpi;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  int8_t *skip;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  RUN_TYPE dry_run;
+  TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
 void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                    RUN_TYPE dry_run);
 
@@ -72,13 +81,22 @@ void av1_setup_qmatrix(const CommonQuantParams *quant_params,
                        const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
                        TX_TYPE tx_type, QUANT_PARAM *qparam);
 
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean);
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
-                     QUANT_PARAM *qparam);
+                     const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               const QUANT_PARAM *qparam);
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+                   const TXB_CTX *const txb_ctx, int *rate_cost);
 
 // This function can be used as (i) a further optimization to reduce the
 // redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
@@ -105,11 +123,16 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
 //   `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
 void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                         TX_TYPE tx_type, int qindex);
+// Same as above, with the number of zeroes needed before/after a coeff to drop
+// it explicitly passed in, instead of being derived from qindex.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+                            TX_SIZE tx_size, TX_TYPE tx_type,
+                            int dropout_num_before, int dropout_num_after);
 
-void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
-                        int16_t *diff, ptrdiff_t diff_stride,
-                        const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride);
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+                        ptrdiff_t diff_stride, const uint8_t *src8,
+                        ptrdiff_t src_stride, const uint8_t *pred8,
+                        ptrdiff_t pred_stride);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
@@ -138,6 +161,19 @@ static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b,
     return false;
   return true;
 }
+
+// Scaling terms (precision of 12 bits) to perform tx-size specific
+// normalization that is used in DCT_DCT forward transform.
+// For transform blocks of 1:2 and 2:1       - sqrt(2) normalization is used
+// For transform blocks of 1:4 and 4:1       - factor of 2 is used
+// For transform blocks TX_8x8 and below     - an additional factor of 2 is used
+// For transform blocks max(width,height)=64 - currently not supported
+
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+  1024, 2048, 4096, 4096, 0,    1448, 1448, 2896, 2896, 2896,
+  2896, 0,    0,    2048, 2048, 4096, 4096, 0,    0
+};
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encodemv.c b/media/libaom/src/av1/encoder/encodemv.c
index 167e9c0a37..4a7d87408c 100644
--- a/media/libaom/src/av1/encoder/encodemv.c
+++ b/media/libaom/src/av1/encoder/encodemv.c
@@ -173,8 +173,8 @@ static void build_nmv_component_cost_table(int *mvcost,
   }
 }
 
-void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
-                   nmv_context *mvctx, int usehp) {
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+                   const MV *ref, nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
   // If the mv_diff is zero, then we should have used near or nearest instead.
@@ -193,8 +193,7 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
   // motion vector component used.
   if (cpi->sf.mv_sf.auto_mv_step_size) {
     int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
-    cpi->mv_search_params.max_mv_magnitude =
-        AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude);
+    td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude);
   }
 }
 
@@ -253,7 +252,7 @@ int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
     ref_mv_idx += 1;
   }
   return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
-                                   x->mbmi_ext);
+                                   &x->mbmi_ext);
 }
 
 void av1_find_best_ref_mvs_from_stack(int allow_hp,
diff --git a/media/libaom/src/av1/encoder/encodemv.h b/media/libaom/src/av1/encoder/encodemv.h
index 0d130143ec..650fc1bdaf 100644
--- a/media/libaom/src/av1/encoder/encodemv.h
+++ b/media/libaom/src/av1/encoder/encodemv.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
-                   nmv_context *mvctx, int usehp);
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+                   const MV *ref, nmv_context *mvctx, int usehp);
 
 void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
                          MvSubpelPrecision precision);
@@ -62,13 +62,44 @@ static INLINE uint8_t av1_log_in_base_2(unsigned int n) {
 }
 
 static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
-  const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
-                              ? MV_CLASS_10
-                              : (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+  assert(z >= 0);
+  const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+  assert(c <= MV_CLASS_10);
   if (offset) *offset = z - av1_mv_class_base(c);
   return c;
 }
 
+static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm,
+                                                MACROBLOCK *const x) {
+  (void)cm;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  if (this_mode == NEW_NEWMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
+        mbmi->mv[1].as_int == ref_mv_1.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+    const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+    if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+      return 0;
+    }
+  } else if (this_mode == NEWMV) {
+    const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+    if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+      return 0;
+    }
+  }
+  return 1;
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encoder.c b/media/libaom/src/av1/encoder/encoder.c
index 6406afd4a5..65d3a10a3d 100644
--- a/media/libaom/src/av1/encoder/encoder.c
+++ b/media/libaom/src/av1/encoder/encoder.c
@@ -13,14 +13,14 @@
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/aom_scale_rtcd.h"
-#include "config/av1_rtcd.h"
 
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
+#include "aom/aomcx.h"
+
 #if CONFIG_DENOISE
 #include "aom_dsp/grain_table.h"
 #include "aom_dsp/noise_util.h"
@@ -32,14 +32,12 @@
 #endif
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG
 
 #include "av1/common/alloccommon.h"
-#include "av1/common/cdef.h"
 #include "av1/common/filter.h"
 #include "av1/common/idct.h"
 #include "av1/common/reconinter.h"
@@ -47,328 +45,54 @@
 #include "av1/common/resize.h"
 #include "av1/common/tile_common.h"
 
-#include "av1/encoder/av1_multi_thread.h"
+#include "av1/encoder/allintra_vis.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
-#include "av1/encoder/grain_test_vectors.h"
 #include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/intra_mode_search.h"
 #include "av1/encoder/mv_prec.h"
 #include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/var_based_part.h"
 
-#if CONFIG_TUNE_VMAF
-#include "av1/encoder/tune_vmaf.h"
-#endif
-
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
-#if CONFIG_ENTROPY_STATS
-FRAME_COUNTS aggregate_fc;
-#endif  // CONFIG_ENTROPY_STATS
-
-#define AM_SEGMENT_ID_INACTIVE 7
-#define AM_SEGMENT_ID_ACTIVE 0
-
 // #define OUTPUT_YUV_REC
-#ifdef OUTPUT_YUV_SKINMAP
-FILE *yuv_skinmap_file = NULL;
-#endif
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
 #define FILE_NAME_LEN 100
 #endif
 
-const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
-  { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
-    { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
-    { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
-    { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
-    { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
-    { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
-    { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
-    { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
-    { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
-    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
-    { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
-    { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
-    { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
-    { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
-    { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
-    { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
-    { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
-    { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
-  { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
-    { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
-    { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
-    { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
-    { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
-    { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
-    { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
-    { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
-    { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
-    { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
-    { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
-    { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
-    { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
-    { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
-    { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
-    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
-    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
-    { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
-    { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
-    { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
-  { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
-    { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
-    { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
-    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
-    { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
-    { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
-    { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
-    { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
-    { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
-    { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
-    { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
-    { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
-};
-
-const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
-    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
-    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
-  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
-    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
-    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
-};
-
-const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
-                                                       64, 64, 64 };
-
-// TODO(yunqing): the default probs can be trained later from better
-// performance.
-const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
-                                         [SWITCHABLE_FILTER_CONTEXTS]
-                                         [SWITCHABLE_FILTERS] = {
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } },
-                                           { { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 },
-                                             { 512, 512, 512 } }
-                                         };
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
 
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
@@ -384,6 +108,18 @@ static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
       *hr = 3;
       *hs = 5;
       break;
+    case THREEFOUR:
+      *hr = 3;
+      *hs = 4;
+      break;
+    case ONEFOUR:
+      *hr = 1;
+      *hs = 4;
+      break;
+    case ONEEIGHT:
+      *hr = 1;
+      *hs = 8;
+      break;
     case ONETWO:
       *hr = 1;
       *hs = 2;
@@ -396,67 +132,6 @@ static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   }
 }
 
-// Mark all inactive blocks as active. Other segmentation features may be set
-// so memset cannot be used, instead only inactive blocks should be reset.
-static void suppress_active_map(AV1_COMP *cpi) {
-  unsigned char *const seg_map = cpi->enc_seg.map;
-  int i;
-  if (cpi->active_map.enabled || cpi->active_map.update)
-    for (i = 0;
-         i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
-      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
-        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
-}
-
-static void apply_active_map(AV1_COMP *cpi) {
-  struct segmentation *const seg = &cpi->common.seg;
-  unsigned char *const seg_map = cpi->enc_seg.map;
-  const unsigned char *const active_map = cpi->active_map.map;
-  int i;
-
-  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
-
-  if (frame_is_intra_only(&cpi->common)) {
-    cpi->active_map.enabled = 0;
-    cpi->active_map.update = 1;
-  }
-
-  if (cpi->active_map.update) {
-    if (cpi->active_map.enabled) {
-      for (i = 0;
-           i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
-           ++i)
-        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
-      av1_enable_segmentation(seg);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
-
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
-                      -MAX_LOOP_FILTER);
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
-                      -MAX_LOOP_FILTER);
-    } else {
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
-      if (seg->enabled) {
-        seg->update_data = 1;
-        seg->update_map = 1;
-      }
-    }
-    cpi->active_map.update = 0;
-  }
-}
-
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols) {
   const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
@@ -516,528 +191,16 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
   }
 }
 
-// Compute the horizontal frequency components' energy in a frame
-// by calculuating the 16x4 Horizontal DCT. This is to be used to
-// decide the superresolution parameters.
-static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
-  uint64_t freq_energy[16] = { 0 };
-  const YV12_BUFFER_CONFIG *buf = cpi->source;
-  const int bd = cpi->td.mb.e_mbd.bd;
-  const int width = buf->y_crop_width;
-  const int height = buf->y_crop_height;
-  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
-  int n = 0;
-  memset(freq_energy, 0, sizeof(freq_energy));
-  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
-                            H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
-        }
-        n++;
-      }
-    }
-  } else {
-    assert(bd == 8);
-    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        for (int ii = 0; ii < 4; ++ii)
-          for (int jj = 0; jj < 16; ++jj)
-            src16[ii * 16 + jj] =
-                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
-        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
-        }
-        n++;
-      }
-    }
-  }
-  if (n) {
-    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
-    // Convert to cumulative energy
-    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
-  } else {
-    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
-  }
-}
-
-static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-
-  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
-    return BLOCK_64X64;
-  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
-    return BLOCK_128X128;
-
-  assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
-
-  if (cpi->svc.number_spatial_layers > 1) {
-    // Use the configured size (top resolution) for spatial layers.
-    return AOMMIN(cpi->oxcf.width, cpi->oxcf.height) > 480 ? BLOCK_128X128
-                                                           : BLOCK_64X64;
-  }
-
-  // TODO(any): Possibly could improve this with a heuristic.
-  // When superres / resize is on, 'cm->width / height' can change between
-  // calls, so we don't apply this heuristic there.
-  // Things break if superblock size changes between the first pass and second
-  // pass encoding, which is why this heuristic is not configured as a
-  // speed-feature.
-  if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
-      cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 1) {
-    return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
-  }
-
-  return BLOCK_128X128;
-}
-
-static void setup_frame(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Set up entropy context depending on frame type. The decoder mandates
-  // the use of the default context, index 0, for keyframes and inter
-  // frames where the error_resilient_mode or intra_only flag is set. For
-  // other inter-frames the encoder currently uses only two contexts;
-  // context 1 for ALTREF frames and context 0 for the others.
-
-  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->ext_flags.use_primary_ref_none) {
-    av1_setup_past_independence(cm);
-  }
-
-  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
-      frame_is_sframe(cm)) {
-    if (!cpi->seq_params_locked) {
-      set_sb_size(&cm->seq_params, select_sb_size(cpi));
-    }
-  } else {
-    const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
-    if (primary_ref_buf == NULL) {
-      av1_setup_past_independence(cm);
-      cm->seg.update_map = 1;
-      cm->seg.update_data = 1;
-    } else {
-      *cm->fc = primary_ref_buf->frame_context;
-    }
-  }
-
-  av1_zero(cm->cur_frame->interp_filter_selected);
-  cm->prev_frame = get_primary_ref_frame_buf(cm);
-  cpi->vaq_refresh = 0;
-}
-
-static void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) {
-  // Ensure that the decoded width and height are both multiples of
-  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
-  // subsampling is used).
-  // This simplifies the implementation of various experiments,
-  // eg. cdef, which operates on units of 8x8 luma pixels.
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
-
-  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
-  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
-
-  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
-  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
-  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
-
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  mi_params->mi_alloc_stride =
-      (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-
-  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
-         mi_size_high[mi_params->mi_alloc_bsize]);
-
-#if CONFIG_LPF_MASK
-  av1_alloc_loop_filter_mask(mi_params);
-#endif
-}
-
-static void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
-                          int height) {
-  const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
-  mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4;
-
-  set_mb_mi(mi_params, width, height);
-}
-
-static void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width,
-                                 int height) {
-  mi_params->mi_alloc_bsize = BLOCK_16X16;
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
+  bool is_allintra = usage == ALLINTRA;
 
-  set_mb_mi(mi_params, width, height);
-}
-
-static void enc_setup_mi(CommonModeInfoParams *mi_params) {
-  const int mi_grid_size =
-      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
-  memset(mi_params->mi_alloc, 0,
-         mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
-  memset(mi_params->mi_grid_base, 0,
-         mi_grid_size * sizeof(*mi_params->mi_grid_base));
-  memset(mi_params->tx_type_map, 0,
-         mi_grid_size * sizeof(*mi_params->tx_type_map));
-}
-
-static void enc_free_mi(CommonModeInfoParams *mi_params) {
-  aom_free(mi_params->mi_alloc);
-  mi_params->mi_alloc = NULL;
-  aom_free(mi_params->mi_grid_base);
-  mi_params->mi_grid_base = NULL;
-  mi_params->mi_alloc_size = 0;
-  aom_free(mi_params->tx_type_map);
-  mi_params->tx_type_map = NULL;
-}
-
-void av1_initialize_enc(void) {
   av1_rtcd();
   aom_dsp_rtcd();
   aom_scale_rtcd();
   av1_init_intra_predictors();
   av1_init_me_luts();
-  av1_rc_init_minq_luts();
-  av1_init_wedge_masks();
-}
-
-static void dealloc_context_buffers_ext(MBMIExtFrameBufferInfo *mbmi_ext_info) {
-  if (mbmi_ext_info->frame_base) {
-    aom_free(mbmi_ext_info->frame_base);
-    mbmi_ext_info->frame_base = NULL;
-    mbmi_ext_info->alloc_size = 0;
-  }
-}
-
-static void alloc_context_buffers_ext(AV1_COMMON *cm,
-                                      MBMIExtFrameBufferInfo *mbmi_ext_info) {
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-
-  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
-  const int mi_alloc_rows =
-      (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-  const int mi_alloc_cols =
-      (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
-  const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
-
-  if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
-    dealloc_context_buffers_ext(mbmi_ext_info);
-    CHECK_MEM_ERROR(
-        cm, mbmi_ext_info->frame_base,
-        aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base)));
-    mbmi_ext_info->alloc_size = new_ext_mi_size;
-  }
-  // The stride needs to be updated regardless of whether new allocation
-  // happened or not.
-  mbmi_ext_info->stride = mi_alloc_cols;
-}
-
-static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
-  pars->num_cr_points = 0;
-  pars->cr_mult = 0;
-  pars->cr_luma_mult = 0;
-  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
-  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
-  pars->num_cb_points = 0;
-  pars->cb_mult = 0;
-  pars->cb_luma_mult = 0;
-  pars->chroma_scaling_from_luma = 0;
-  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
-  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
-}
-
-static void update_film_grain_parameters(struct AV1_COMP *cpi,
-                                         const AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  cpi->oxcf = *oxcf;
-
-  if (cpi->film_grain_table) {
-    aom_film_grain_table_free(cpi->film_grain_table);
-    aom_free(cpi->film_grain_table);
-    cpi->film_grain_table = NULL;
-  }
-
-  if (oxcf->film_grain_test_vector) {
-    cm->seq_params.film_grain_params_present = 1;
-    if (cm->current_frame.frame_type == KEY_FRAME) {
-      memcpy(&cm->film_grain_params,
-             film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
-             sizeof(cm->film_grain_params));
-      if (oxcf->monochrome)
-        reset_film_grain_chroma_params(&cm->film_grain_params);
-      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
-      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
-        cm->film_grain_params.clip_to_restricted_range = 0;
-      }
-    }
-  } else if (oxcf->film_grain_table_filename) {
-    cm->seq_params.film_grain_params_present = 1;
-
-    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
-    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
-
-    aom_film_grain_table_read(cpi->film_grain_table,
-                              oxcf->film_grain_table_filename, &cm->error);
-  } else {
-#if CONFIG_DENOISE
-    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
-#else
-    cm->seq_params.film_grain_params_present = 0;
-#endif
-    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
-  }
-}
-
-static void dealloc_compressor_data(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
-
-  aom_free(cpi->tile_data);
-  cpi->tile_data = NULL;
-
-  // Delete sementation map
-  aom_free(cpi->enc_seg.map);
-  cpi->enc_seg.map = NULL;
-
-  av1_cyclic_refresh_free(cpi->cyclic_refresh);
-  cpi->cyclic_refresh = NULL;
-
-  aom_free(cpi->active_map.map);
-  cpi->active_map.map = NULL;
-
-  aom_free(cpi->ssim_rdmult_scaling_factors);
-  cpi->ssim_rdmult_scaling_factors = NULL;
-
-  aom_free(cpi->tpl_rdmult_scaling_factors);
-  cpi->tpl_rdmult_scaling_factors = NULL;
-
-  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
-  cpi->tpl_sb_rdmult_scaling_factors = NULL;
-
-#if CONFIG_TUNE_VMAF
-  aom_free(cpi->vmaf_rdmult_scaling_factors);
-  cpi->vmaf_rdmult_scaling_factors = NULL;
-#endif
-
-  aom_free(cpi->td.mb.above_pred_buf);
-  cpi->td.mb.above_pred_buf = NULL;
-
-  aom_free(cpi->td.mb.left_pred_buf);
-  cpi->td.mb.left_pred_buf = NULL;
-
-  aom_free(cpi->td.mb.wsrc_buf);
-  cpi->td.mb.wsrc_buf = NULL;
-
-  aom_free(cpi->td.mb.inter_modes_info);
-  cpi->td.mb.inter_modes_info = NULL;
-
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++) {
-      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
-      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
-    }
-  aom_free(cpi->td.mb.mask_buf);
-  cpi->td.mb.mask_buf = NULL;
-
-  aom_free(cm->tpl_mvs);
-  cm->tpl_mvs = NULL;
-
-  aom_free(cpi->td.mb.mbmi_ext);
-  cpi->td.mb.mbmi_ext = NULL;
-
-  if (cpi->td.vt64x64) {
-    aom_free(cpi->td.vt64x64);
-    cpi->td.vt64x64 = NULL;
-  }
-
-  av1_free_ref_frame_buffers(cm->buffer_pool);
-  av1_free_txb_buf(cpi);
-  av1_free_context_buffers(cm);
-
-  aom_free_frame_buffer(&cpi->last_frame_uf);
-  av1_free_restoration_buffers(cm);
-  aom_free_frame_buffer(&cpi->trial_frame_rst);
-  aom_free_frame_buffer(&cpi->scaled_source);
-  aom_free_frame_buffer(&cpi->scaled_last_source);
-  aom_free_frame_buffer(&cpi->alt_ref_buffer);
-  av1_lookahead_destroy(cpi->lookahead);
-
-  aom_free(cpi->tile_tok[0][0]);
-  cpi->tile_tok[0][0] = 0;
-
-  aom_free(cpi->tplist[0][0]);
-  cpi->tplist[0][0] = NULL;
-
-  av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
-
-  aom_free(cpi->td.mb.palette_buffer);
-  av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
-  aom_free(cpi->td.mb.tmp_conv_dst);
-  for (int j = 0; j < 2; ++j) {
-    aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
-  }
-
-#if CONFIG_DENOISE
-  if (cpi->denoise_and_model) {
-    aom_denoise_and_model_free(cpi->denoise_and_model);
-    cpi->denoise_and_model = NULL;
-  }
-#endif
-  if (cpi->film_grain_table) {
-    aom_film_grain_table_free(cpi->film_grain_table);
-    cpi->film_grain_table = NULL;
-  }
-
-  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    aom_free(cpi->level_params.level_info[i]);
-  }
-
-  if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi);
-}
-
-static void configure_static_seg_features(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  struct segmentation *const seg = &cm->seg;
-
-  int high_q = (int)(rc->avg_q > 48.0);
-  int qi_delta;
-
-  // Disable and clear down for KF
-  if (cm->current_frame.frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-
-    // Disable segmentation
-    av1_disable_segmentation(seg);
-
-    // Clear down the segment features.
-    av1_clearall_segfeatures(seg);
-  } else if (cpi->refresh_alt_ref_frame) {
-    // If this is an alt ref frame
-    // Clear down the global segmentation map
-    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-
-    // Disable segmentation and individual segment features by default
-    av1_disable_segmentation(seg);
-    av1_clearall_segfeatures(seg);
-
-    // If segmentation was enabled set those features needed for the
-    // arf itself.
-    if (seg->enabled) {
-      seg->update_map = 1;
-      seg->update_data = 1;
-
-      qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
-                                    cm->seq_params.bit_depth);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
-
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-    }
-  } else if (seg->enabled) {
-    // All other frames if segmentation has been enabled
-
-    // First normal frame in a valid gf or alt ref group
-    if (rc->frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
-      if (rc->source_alt_ref_active) {
-        seg->update_map = 0;
-        seg->update_data = 1;
-
-        qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
-                                      cm->seq_params.bit_depth);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
-
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-
-        // Segment coding disabled for compred testing
-        if (high_q) {
-          av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-          av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-          av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-        }
-      } else {
-        // Disable segmentation and clear down features if alt ref
-        // is not active for this group
-
-        av1_disable_segmentation(seg);
-
-        memset(cpi->enc_seg.map, 0,
-               cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-
-        seg->update_map = 0;
-        seg->update_data = 0;
-
-        av1_clearall_segfeatures(seg);
-      }
-    } else if (rc->is_src_frame_alt_ref) {
-      // Special case where we are coding over the top of a previous
-      // alt ref frame.
-      // Segment coding disabled for compred testing
-
-      // Enable ref frame features for segment 0 as well
-      av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
-      av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-
-      // All mbs should use ALTREF_FRAME
-      av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
-      av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-      av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
-      av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-
-      // Skip all MBs if high Q (0,0 mv and skip coeffs)
-      if (high_q) {
-        av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
-        av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-      }
-      // Enable data update
-      seg->update_data = 1;
-    } else {
-      // All other frames.
-
-      // No updates.. leave things as they are.
-      seg->update_map = 0;
-      seg->update_data = 0;
-    }
-  }
+  if (!is_allintra) av1_init_wedge_masks();
+  if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts();
 }
 
 static void update_reference_segmentation_map(AV1_COMP *cpi) {
@@ -1056,93 +219,6 @@ static void update_reference_segmentation_map(AV1_COMP *cpi) {
   }
 }
 
-static void alloc_altref_frame_buffer(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-
-  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
-  if (aom_realloc_frame_buffer(
-          &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
-          seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          cm->features.byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
-}
-
-static void alloc_util_frame_buffers(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  const int byte_alignment = cm->features.byte_alignment;
-  if (aom_realloc_frame_buffer(
-          &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->trial_frame_rst, cm->superres_upscaled_width,
-          cm->superres_upscaled_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate trial restored frame buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-  if (aom_realloc_frame_buffer(
-          &cpi->scaled_last_source, cm->width, cm->height,
-          seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          byte_alignment, NULL, NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled last source buffer");
-}
-
-static void alloc_compressor_data(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate context buffers");
-  }
-
-  int mi_rows_aligned_to_sb =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
-  if (!is_stat_generation_stage(cpi)) {
-    av1_alloc_txb_buf(cpi);
-
-    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
-  }
-
-  aom_free(cpi->tile_tok[0][0]);
-  aom_free(cpi->tplist[0][0]);
-
-  if (!is_stat_generation_stage(cpi)) {
-    unsigned int tokens =
-        get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
-                        MAX_SB_SIZE_LOG2, num_planes);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
-                    aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
-
-    CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
-                    aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
-                               sizeof(*cpi->tplist[0][0])));
-  }
-
-  av1_setup_pc_tree(cpi, &cpi->td);
-}
-
 void av1_new_framerate(AV1_COMP *cpi, double framerate) {
   cpi->framerate = framerate < 0.1 ? 30 : framerate;
   av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
@@ -1153,7 +229,7 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm,
   const int upscaled_width = cm->superres_upscaled_width;
   const int height = cm->height;
   const int luma_pic_size = upscaled_width * height;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const BITSTREAM_PROFILE profile = seq_params->profile;
   const int pic_size_profile_factor =
       profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
@@ -1164,30 +240,29 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm,
   return uncompressed_frame_size / (double)encoded_frame_size;
 }
 
-static void set_tile_info(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_tile_info(AV1_COMMON *const cm,
+                          const TileConfig *const tile_cfg) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
   int i, start_sb;
 
   av1_get_tile_limits(cm);
 
   // configure tile columns
-  if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
+  if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
     tiles->uniform_spacing = 1;
-    tiles->log2_cols = AOMMAX(cpi->oxcf.tile_columns, tiles->min_log2_cols);
+    tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
     tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
   } else {
-    int mi_cols =
-        ALIGN_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
-    int sb_cols = mi_cols >> seq_params->mib_size_log2;
+    int sb_cols =
+        CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
     int size_sb, j = 0;
     tiles->uniform_spacing = 0;
     for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
       tiles->col_start_sb[i] = start_sb;
-      size_sb = cpi->oxcf.tile_widths[j++];
-      if (j >= cpi->oxcf.tile_width_count) j = 0;
+      size_sb = tile_cfg->tile_widths[j++];
+      if (j >= tile_cfg->tile_width_count) j = 0;
       start_sb += AOMMIN(size_sb, tiles->max_width_sb);
     }
     tiles->cols = i;
@@ -1198,17 +273,16 @@ static void set_tile_info(AV1_COMP *cpi) {
 
   // configure tile rows
   if (tiles->uniform_spacing) {
-    tiles->log2_rows = AOMMAX(cpi->oxcf.tile_rows, tiles->min_log2_rows);
+    tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
     tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
   } else {
-    int mi_rows =
-        ALIGN_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
-    int sb_rows = mi_rows >> seq_params->mib_size_log2;
+    int sb_rows =
+        CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
     int size_sb, j = 0;
     for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
       tiles->row_start_sb[i] = start_sb;
-      size_sb = cpi->oxcf.tile_heights[j++];
-      if (j >= cpi->oxcf.tile_height_count) j = 0;
+      size_sb = tile_cfg->tile_heights[j++];
+      if (j >= tile_cfg->tile_height_count) j = 0;
       start_sb += AOMMIN(size_sb, tiles->max_height_sb);
     }
     tiles->rows = i;
@@ -1217,31 +291,23 @@ static void set_tile_info(AV1_COMP *cpi) {
   av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles);
 }
 
-static void update_frame_size(AV1_COMP *cpi) {
+void av1_update_frame_size(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  // We need to reallocate the context buffers here in case we need more mis.
-  if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate context buffers");
-  }
-  av1_init_mi_buffers(&cm->mi_params);
+  // Setup mi_params here in case we need more mi's.
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+                       cpi->sf.part_sf.default_min_partition_size);
 
-  av1_init_macroblockd(cm, xd, NULL);
+  av1_init_macroblockd(cm, xd);
 
-  if (!is_stat_generation_stage(cpi))
-    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
-  set_tile_info(cpi);
-}
+  if (!cpi->ppi->seq_params_locked)
+    set_sb_size(cm->seq_params,
+                av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+                                   cpi->svc.number_spatial_layers));
 
-static void init_buffer_indices(ForceIntegerMVInfo *const force_intpel_info,
-                                int *const remapped_ref_idx) {
-  int fb_idx;
-  for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
-    remapped_ref_idx[fb_idx] = fb_idx;
-  force_intpel_info->rate_index = 0;
-  force_intpel_info->rate_size = 0;
+  set_tile_info(cm, &cpi->oxcf.tile_cfg);
 }
 
 static INLINE int does_level_match(int width, int height, double fps,
@@ -1257,81 +323,86 @@ static INLINE int does_level_match(int width, int height, double fps,
          height <= lvl_height * lvl_dim_mult;
 }
 
-static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
-                                     const AV1EncoderConfig *oxcf) {
+static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
+                                     int height, double init_framerate) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
   // TODO(any): This is a placeholder function that only addresses dimensions
   // and max display sample rates.
   // Need to add checks for max bit rate, max decoded luma sample rate, header
   // rate, etc. that are not covered by this function.
   AV1_LEVEL level = SEQ_LEVEL_MAX;
-  if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
-                       288, 30.0, 4)) {
+  if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) {
     level = SEQ_LEVEL_2_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              704, 396, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0,
+                              4)) {
     level = SEQ_LEVEL_2_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              1088, 612, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0,
+                              4)) {
     level = SEQ_LEVEL_3_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              1376, 774, 30.0, 4)) {
+  } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0,
+                              4)) {
     level = SEQ_LEVEL_3_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              2048, 1152, 30.0, 3)) {
+  } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0,
+                              3)) {
     level = SEQ_LEVEL_4_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              2048, 1152, 60.0, 3)) {
+  } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0,
+                              3)) {
     level = SEQ_LEVEL_4_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 30.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0,
+                              2)) {
     level = SEQ_LEVEL_5_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 60.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0,
+                              2)) {
     level = SEQ_LEVEL_5_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              4096, 2176, 120.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0,
+                              2)) {
     level = SEQ_LEVEL_5_2;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 30.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0,
+                              2)) {
     level = SEQ_LEVEL_6_0;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 60.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0,
+                              2)) {
     level = SEQ_LEVEL_6_1;
-  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
-                              8192, 4352, 120.0, 2)) {
+  } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0,
+                              2)) {
     level = SEQ_LEVEL_6_2;
   }
 
-  SequenceHeader *const seq_params = &cm->seq_params;
   for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    seq->seq_level_idx[i] = level;
+    seq_params->seq_level_idx[i] = level;
     // Set the maximum parameters for bitrate and buffer size for this profile,
     // level, and tier
     seq_params->op_params[i].bitrate = av1_max_level_bitrate(
-        cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]);
+        seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]);
     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
     // check
     if (seq_params->op_params[i].bitrate == 0)
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          &ppi->error, AOM_CODEC_UNSUP_BITSTREAM,
           "AV1 does not support this combination of profile, level, and tier.");
     // Buffer size in bits/s is bitrate in bits/s * 1 s
     seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
   }
 }
 
-static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
-                                  const AV1EncoderConfig *oxcf, int use_svc) {
-  seq->still_picture = (oxcf->force_video_mode == 0) && (oxcf->limit == 1);
-  seq->reduced_still_picture_hdr = seq->still_picture;
-  seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
-  seq->force_screen_content_tools = (oxcf->mode == REALTIME) ? 0 : 2;
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+                               const AV1EncoderConfig *oxcf, int use_svc) {
+  SequenceHeader *const seq = &ppi->seq_params;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+  seq->still_picture =
+      !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1);
+  seq->reduced_still_picture_hdr =
+      seq->still_picture && !tool_cfg->full_still_picture_hdr;
+  seq->force_screen_content_tools = 2;
   seq->force_integer_mv = 2;
-  seq->order_hint_info.enable_order_hint = oxcf->enable_order_hint;
+  seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint;
   seq->frame_id_numbers_present_flag =
-      !(seq->still_picture && seq->reduced_still_picture_hdr) &&
-      !oxcf->large_scale_tile && oxcf->error_resilient_mode && !use_svc;
-  if (seq->still_picture && seq->reduced_still_picture_hdr) {
+      !seq->reduced_still_picture_hdr &&
+      !oxcf->tile_cfg.enable_large_scale_tile &&
+      tool_cfg->error_resilient_mode && !use_svc;
+  if (seq->reduced_still_picture_hdr) {
     seq->order_hint_info.enable_order_hint = 0;
     seq->force_screen_content_tools = 2;
     seq->force_integer_mv = 2;
@@ -1341,11 +412,12 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
           ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
           : -1;
 
-  seq->max_frame_width =
-      oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width;
-  seq->max_frame_height = oxcf->forced_max_frame_height
-                              ? oxcf->forced_max_frame_height
-                              : oxcf->height;
+  seq->max_frame_width = frm_dim_cfg->forced_max_frame_width
+                             ? frm_dim_cfg->forced_max_frame_width
+                             : frm_dim_cfg->width;
+  seq->max_frame_height = frm_dim_cfg->forced_max_frame_height
+                              ? frm_dim_cfg->forced_max_frame_height
+                              : frm_dim_cfg->height;
   seq->num_bits_width =
       (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
   seq->num_bits_height =
@@ -1356,23 +428,25 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
   seq->frame_id_length = FRAME_ID_LENGTH;
   seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
 
-  seq->enable_dual_filter = oxcf->enable_dual_filter;
-  seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp;
+  seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+  seq->order_hint_info.enable_dist_wtd_comp =
+      oxcf->comp_type_cfg.enable_dist_wtd_comp;
   seq->order_hint_info.enable_dist_wtd_comp &=
       seq->order_hint_info.enable_order_hint;
-  seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
+  seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
   seq->order_hint_info.enable_ref_frame_mvs &=
       seq->order_hint_info.enable_order_hint;
-  seq->enable_superres = oxcf->enable_superres;
-  seq->enable_cdef = oxcf->enable_cdef;
-  seq->enable_restoration = oxcf->enable_restoration;
-  seq->enable_warped_motion = oxcf->enable_warped_motion;
-  seq->enable_interintra_compound = oxcf->enable_interintra_comp;
-  seq->enable_masked_compound = oxcf->enable_masked_comp;
-  seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter;
-  seq->enable_filter_intra = oxcf->enable_filter_intra;
-
-  set_bitstream_level_tier(seq, cm, oxcf);
+  seq->enable_superres = oxcf->superres_cfg.enable_superres;
+  seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
+  seq->enable_restoration = tool_cfg->enable_restoration;
+  seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+  seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+  seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+  seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+  seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
+
+  set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
+                           oxcf->input_cfg.init_framerate);
 
   if (seq->operating_points_cnt_minus_1 == 0) {
     seq->operating_point_idc[0] = 0;
@@ -1383,53 +457,55 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
     // skip decoding enhancement  layers (temporal first).
     int i = 0;
     assert(seq->operating_points_cnt_minus_1 ==
-           (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1));
-    for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) {
-      for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) {
+           (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1));
+    for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+      for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
         seq->operating_point_idc[i] =
-            (~(~0u << (cm->number_spatial_layers - sl)) << 8) |
-            ~(~0u << (cm->number_temporal_layers - tl));
+            (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+            ~(~0u << (ppi->number_temporal_layers - tl));
         i++;
       }
     }
   }
 }
 
-static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+                                 const AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const color_cfg = &oxcf->color_cfg;
 
-  cpi->oxcf = *oxcf;
-  cpi->framerate = oxcf->init_framerate;
+  ppi->use_svc = 0;
+  ppi->number_spatial_layers = 1;
+  ppi->number_temporal_layers = 1;
 
   seq_params->profile = oxcf->profile;
-  seq_params->bit_depth = oxcf->bit_depth;
+  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
   seq_params->use_highbitdepth = oxcf->use_highbitdepth;
-  seq_params->color_primaries = oxcf->color_primaries;
-  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
-  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
-  seq_params->monochrome = oxcf->monochrome;
-  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
-  seq_params->color_range = oxcf->color_range;
-  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->color_primaries = color_cfg->color_primaries;
+  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+  seq_params->color_range = color_cfg->color_range;
+  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
   seq_params->timing_info.num_units_in_display_tick =
-      oxcf->timing_info.num_units_in_display_tick;
-  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+      dec_model_cfg->timing_info.num_units_in_display_tick;
+  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
   seq_params->timing_info.equal_picture_interval =
-      oxcf->timing_info.equal_picture_interval;
+      dec_model_cfg->timing_info.equal_picture_interval;
   seq_params->timing_info.num_ticks_per_picture =
-      oxcf->timing_info.num_ticks_per_picture;
+      dec_model_cfg->timing_info.num_ticks_per_picture;
 
   seq_params->display_model_info_present_flag =
-      oxcf->display_model_info_present_flag;
+      dec_model_cfg->display_model_info_present_flag;
   seq_params->decoder_model_info_present_flag =
-      oxcf->decoder_model_info_present_flag;
-  if (oxcf->decoder_model_info_present_flag) {
+      dec_model_cfg->decoder_model_info_present_flag;
+  if (dec_model_cfg->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
-        oxcf->buffer_model.num_units_in_decoding_tick;
-    cm->buffer_removal_time_present = 1;
+        dec_model_cfg->num_units_in_decoding_tick;
+    ppi->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
   } else if (seq_params->timing_info_present &&
@@ -1459,39 +535,45 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
       seq_params->subsampling_y = 0;
     } else {
       if (seq_params->bit_depth == AOM_BITS_12) {
-        seq_params->subsampling_x = oxcf->chroma_subsampling_x;
-        seq_params->subsampling_y = oxcf->chroma_subsampling_y;
+        seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x;
+        seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y;
       } else {
         seq_params->subsampling_x = 1;
         seq_params->subsampling_y = 0;
       }
     }
   }
+  av1_change_config_seq(ppi, oxcf, NULL);
+}
+
+static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+
+  cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->input_cfg.init_framerate;
+
+  cm->width = oxcf->frm_dim_cfg.width;
+  cm->height = oxcf->frm_dim_cfg.height;
+  cpi->is_dropped_frame = false;
 
-  cm->width = oxcf->width;
-  cm->height = oxcf->height;
-  set_sb_size(seq_params,
-              select_sb_size(cpi));  // set sb size before allocations
   alloc_compressor_data(cpi);
 
-  update_film_grain_parameters(cpi, oxcf);
+  av1_update_film_grain_parameters(cpi, oxcf);
 
   // Single thread case: use counts in common.
   cpi->td.counts = &cpi->counts;
 
   // Set init SVC parameters.
-  cpi->use_svc = 0;
-  cpi->svc.external_ref_frame_config = 0;
+  cpi->svc.set_ref_frame_config = 0;
   cpi->svc.non_reference_frame = 0;
   cpi->svc.number_spatial_layers = 1;
   cpi->svc.number_temporal_layers = 1;
-  cm->number_spatial_layers = 1;
-  cm->number_temporal_layers = 1;
   cm->spatial_layer_id = 0;
   cm->temporal_layer_id = 0;
 
   // change includes all joint functionality
-  av1_change_config(cpi, oxcf);
+  av1_change_config(cpi, oxcf, false);
 
   cpi->ref_frame_flags = 0;
 
@@ -1500,1322 +582,48 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
   resize_pending_params->height = 0;
 
   init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
-}
-
-static void set_rc_buffer_sizes(RATE_CONTROL *rc,
-                                const AV1EncoderConfig *oxcf) {
-  const int64_t bandwidth = oxcf->target_bandwidth;
-  const int64_t starting = oxcf->starting_buffer_level_ms;
-  const int64_t optimal = oxcf->optimal_buffer_level_ms;
-  const int64_t maximum = oxcf->maximum_buffer_size_ms;
-
-  rc->starting_buffer_level = starting * bandwidth / 1000;
-  rc->optimal_buffer_level =
-      (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
-  rc->maximum_buffer_size =
-      (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
-}
-
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                           \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                         \
-  cpi->fn_ptr[BT].vf = VF;                                             \
-  cpi->fn_ptr[BT].svf = SVF;                                           \
-  cpi->fn_ptr[BT].svaf = SVAF;                                         \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                       \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
-
-#define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
-  static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
-                                     int source_stride,                        \
-                                     const uint8_t *ref_ptr, int ref_stride) { \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride);                \
-  }                                                                            \
-  static unsigned int fnname##_bits10(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride) {                                                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2;           \
-  }                                                                            \
-  static unsigned int fnname##_bits12(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride) {                                                        \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4;           \
-  }
-
-#define MAKE_BFP_SADAVG_WRAPPER(fnname)                                        \
-  static unsigned int fnname##_bits8(                                          \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred);   \
-  }                                                                            \
-  static unsigned int fnname##_bits10(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
-           2;                                                                  \
-  }                                                                            \
-  static unsigned int fnname##_bits12(                                         \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
-      int ref_stride, const uint8_t *second_pred) {                            \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
-           4;                                                                  \
-  }
-
-#define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
-                             const uint8_t *const ref_ptr[], int ref_stride,  \
-                             unsigned int *sad_array) {                       \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-  }                                                                           \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
-                              const uint8_t *const ref_ptr[], int ref_stride, \
-                              unsigned int *sad_array) {                      \
-    int i;                                                                    \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
-  }                                                                           \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
-                              const uint8_t *const ref_ptr[], int ref_stride, \
-                              unsigned int *sad_array) {                      \
-    int i;                                                                    \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
-    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
-  }
-
-#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                    \
-  static unsigned int fnname##_bits8(                                       \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param);                                               \
-  }                                                                         \
-  static unsigned int fnname##_bits10(                                      \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param) >>                                             \
-           2;                                                               \
-  }                                                                         \
-  static unsigned int fnname##_bits12(                                      \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
-      int ref_stride, const uint8_t *second_pred,                           \
-      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
-                  jcp_param) >>                                             \
-           4;                                                               \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
-
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
-
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
-  cpi->fn_ptr[BT].msdf = MCSDF;       \
-  cpi->fn_ptr[BT].msvf = MCSVF;
-
-#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
-  static unsigned int fnname##_bits8(                                    \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask);            \
-  }                                                                      \
-  static unsigned int fnname##_bits10(                                   \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask) >>          \
-           2;                                                            \
-  }                                                                      \
-  static unsigned int fnname##_bits12(                                   \
-      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
-      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
-      int m_stride, int invert_mask) {                                   \
-    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
-                  second_pred_ptr, m, m_stride, invert_mask) >>          \
-           4;                                                            \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
-#endif
-
-#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;           \
-  cpi->fn_ptr[BT].ovf = OVF;             \
-  cpi->fn_ptr[BT].osvf = OSVF;
-
-#define MAKE_OBFP_SAD_WRAPPER(fnname)                                     \
-  static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride,  \
-                                     const int32_t *wsrc,                 \
-                                     const int32_t *msk) {                \
-    return fnname(ref, ref_stride, wsrc, msk);                            \
-  }                                                                       \
-  static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
-                                      const int32_t *wsrc,                \
-                                      const int32_t *msk) {               \
-    return fnname(ref, ref_stride, wsrc, msk) >> 2;                       \
-  }                                                                       \
-  static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
-                                      const int32_t *wsrc,                \
-                                      const int32_t *msk) {               \
-    return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
-
-static void highbd_set_var_fns(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (cm->seq_params.use_highbitdepth) {
-    switch (cm->seq_params.bit_depth) {
-      case AOM_BITS_8:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
-                   aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
-                   aom_highbd_8_sub_pixel_variance64x16,
-                   aom_highbd_8_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16)
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
-                   aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
-                   aom_highbd_8_sub_pixel_variance16x64,
-                   aom_highbd_8_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64)
-
-        HIGHBD_BFP(
-            BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
-            aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
-            aom_highbd_8_sub_pixel_avg_variance32x8,
-            aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
-            aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
-            aom_highbd_8_sub_pixel_avg_variance8x32,
-            aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32)
-
-        HIGHBD_BFP(
-            BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
-            aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
-            aom_highbd_8_sub_pixel_avg_variance16x4,
-            aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4)
-
-        HIGHBD_BFP(
-            BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
-            aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
-            aom_highbd_8_sub_pixel_avg_variance4x16,
-            aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16)
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
-                   aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
-                   aom_highbd_8_sub_pixel_variance32x16,
-                   aom_highbd_8_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16)
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
-                   aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
-                   aom_highbd_8_sub_pixel_variance16x32,
-                   aom_highbd_8_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32)
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
-                   aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
-                   aom_highbd_8_sub_pixel_variance64x32,
-                   aom_highbd_8_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32)
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
-                   aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
-                   aom_highbd_8_sub_pixel_variance32x64,
-                   aom_highbd_8_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64)
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
-                   aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
-                   aom_highbd_8_sub_pixel_variance32x32,
-                   aom_highbd_8_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits8,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32)
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
-                   aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
-                   aom_highbd_8_sub_pixel_variance64x64,
-                   aom_highbd_8_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64)
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
-                   aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
-                   aom_highbd_8_sub_pixel_variance16x16,
-                   aom_highbd_8_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits8,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16)
-
-        HIGHBD_BFP(
-            BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
-            aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
-            aom_highbd_8_sub_pixel_avg_variance16x8,
-            aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
-            aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
-            aom_highbd_8_sub_pixel_avg_variance8x16,
-            aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16)
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
-            aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
-            aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8,
-            aom_highbd_dist_wtd_sad8x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8)
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8,
-            aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4,
-            aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8,
-            aom_highbd_dist_wtd_sad8x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4)
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8,
-            aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8,
-            aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8,
-            aom_highbd_dist_wtd_sad4x8_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8)
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
-            aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
-            aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8,
-            aom_highbd_dist_wtd_sad4x4_avg_bits8,
-            aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4)
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
-                   aom_highbd_sad128x128_avg_bits8,
-                   aom_highbd_8_variance128x128,
-                   aom_highbd_8_sub_pixel_variance128x128,
-                   aom_highbd_8_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits8,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128)
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
-                   aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
-                   aom_highbd_8_sub_pixel_variance128x64,
-                   aom_highbd_8_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits8,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64)
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
-                   aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
-                   aom_highbd_8_sub_pixel_variance64x128,
-                   aom_highbd_8_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits8,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits8,
-                   aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128)
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
-                    aom_highbd_obmc_variance128x128,
-                    aom_highbd_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
-                    aom_highbd_obmc_variance128x64,
-                    aom_highbd_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
-                    aom_highbd_obmc_variance64x128,
-                    aom_highbd_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
-                    aom_highbd_obmc_variance64x64,
-                    aom_highbd_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
-                    aom_highbd_obmc_variance64x32,
-                    aom_highbd_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
-                    aom_highbd_obmc_variance32x64,
-                    aom_highbd_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
-                    aom_highbd_obmc_variance32x32,
-                    aom_highbd_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
-                    aom_highbd_obmc_variance32x16,
-                    aom_highbd_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
-                    aom_highbd_obmc_variance16x32,
-                    aom_highbd_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
-                    aom_highbd_obmc_variance16x16,
-                    aom_highbd_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
-                    aom_highbd_obmc_variance8x16,
-                    aom_highbd_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
-                    aom_highbd_obmc_variance16x8,
-                    aom_highbd_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
-                    aom_highbd_obmc_variance8x8,
-                    aom_highbd_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
-                    aom_highbd_obmc_variance4x8,
-                    aom_highbd_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
-                    aom_highbd_obmc_variance8x4,
-                    aom_highbd_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
-                    aom_highbd_obmc_variance4x4,
-                    aom_highbd_obmc_sub_pixel_variance4x4)
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
-                    aom_highbd_obmc_variance64x16,
-                    aom_highbd_obmc_sub_pixel_variance64x16)
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
-                    aom_highbd_obmc_variance16x64,
-                    aom_highbd_obmc_sub_pixel_variance16x64)
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
-                    aom_highbd_obmc_variance32x8,
-                    aom_highbd_obmc_sub_pixel_variance32x8)
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
-                    aom_highbd_obmc_variance8x32,
-                    aom_highbd_obmc_sub_pixel_variance8x32)
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
-                    aom_highbd_obmc_variance16x4,
-                    aom_highbd_obmc_sub_pixel_variance16x4)
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
-                    aom_highbd_obmc_variance4x16,
-                    aom_highbd_obmc_sub_pixel_variance4x16)
-        break;
-
-      case AOM_BITS_10:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
-                   aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
-                   aom_highbd_10_sub_pixel_variance64x16,
-                   aom_highbd_10_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
-                   aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
-                   aom_highbd_10_sub_pixel_variance16x64,
-                   aom_highbd_10_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
-
-        HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
-                   aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
-                   aom_highbd_10_sub_pixel_variance32x8,
-                   aom_highbd_10_sub_pixel_avg_variance32x8,
-                   aom_highbd_sad32x8x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x8_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
-
-        HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
-                   aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
-                   aom_highbd_10_sub_pixel_variance8x32,
-                   aom_highbd_10_sub_pixel_avg_variance8x32,
-                   aom_highbd_sad8x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad8x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
-
-        HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
-                   aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
-                   aom_highbd_10_sub_pixel_variance16x4,
-                   aom_highbd_10_sub_pixel_avg_variance16x4,
-                   aom_highbd_sad16x4x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x4_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
-
-        HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
-                   aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
-                   aom_highbd_10_sub_pixel_variance4x16,
-                   aom_highbd_10_sub_pixel_avg_variance4x16,
-                   aom_highbd_sad4x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad4x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
-                   aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
-                   aom_highbd_10_sub_pixel_variance32x16,
-                   aom_highbd_10_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
-                   aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
-                   aom_highbd_10_sub_pixel_variance16x32,
-                   aom_highbd_10_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
-                   aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
-                   aom_highbd_10_sub_pixel_variance64x32,
-                   aom_highbd_10_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
-                   aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
-                   aom_highbd_10_sub_pixel_variance32x64,
-                   aom_highbd_10_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
-                   aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
-                   aom_highbd_10_sub_pixel_variance32x32,
-                   aom_highbd_10_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits10,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
-                   aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
-                   aom_highbd_10_sub_pixel_variance64x64,
-                   aom_highbd_10_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
-                   aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
-                   aom_highbd_10_sub_pixel_variance16x16,
-                   aom_highbd_10_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
-
-        HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
-                   aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
-                   aom_highbd_10_sub_pixel_variance16x8,
-                   aom_highbd_10_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x4d_bits10,
-                   aom_highbd_dist_wtd_sad16x8_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
-
-        HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
-                   aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
-                   aom_highbd_10_sub_pixel_variance8x16,
-                   aom_highbd_10_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x4d_bits10,
-                   aom_highbd_dist_wtd_sad8x16_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
-            aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
-            aom_highbd_10_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
-            aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
-            aom_highbd_10_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
-            aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
-            aom_highbd_10_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
-            aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
-            aom_highbd_10_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
-            aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
-                   aom_highbd_sad128x128_avg_bits10,
-                   aom_highbd_10_variance128x128,
-                   aom_highbd_10_sub_pixel_variance128x128,
-                   aom_highbd_10_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits10,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
-                   aom_highbd_sad128x64_avg_bits10,
-                   aom_highbd_10_variance128x64,
-                   aom_highbd_10_sub_pixel_variance128x64,
-                   aom_highbd_10_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits10,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
-                   aom_highbd_sad64x128_avg_bits10,
-                   aom_highbd_10_variance64x128,
-                   aom_highbd_10_sub_pixel_variance64x128,
-                   aom_highbd_10_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits10,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits10,
-                   aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
-                    aom_highbd_10_obmc_variance128x128,
-                    aom_highbd_10_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
-                    aom_highbd_10_obmc_variance128x64,
-                    aom_highbd_10_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
-                    aom_highbd_10_obmc_variance64x128,
-                    aom_highbd_10_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
-                    aom_highbd_10_obmc_variance64x64,
-                    aom_highbd_10_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
-                    aom_highbd_10_obmc_variance64x32,
-                    aom_highbd_10_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
-                    aom_highbd_10_obmc_variance32x64,
-                    aom_highbd_10_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
-                    aom_highbd_10_obmc_variance32x32,
-                    aom_highbd_10_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
-                    aom_highbd_10_obmc_variance32x16,
-                    aom_highbd_10_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
-                    aom_highbd_10_obmc_variance16x32,
-                    aom_highbd_10_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
-                    aom_highbd_10_obmc_variance16x16,
-                    aom_highbd_10_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
-                    aom_highbd_10_obmc_variance8x16,
-                    aom_highbd_10_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
-                    aom_highbd_10_obmc_variance16x8,
-                    aom_highbd_10_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
-                    aom_highbd_10_obmc_variance8x8,
-                    aom_highbd_10_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
-                    aom_highbd_10_obmc_variance4x8,
-                    aom_highbd_10_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
-                    aom_highbd_10_obmc_variance8x4,
-                    aom_highbd_10_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
-                    aom_highbd_10_obmc_variance4x4,
-                    aom_highbd_10_obmc_sub_pixel_variance4x4)
-
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
-                    aom_highbd_10_obmc_variance64x16,
-                    aom_highbd_10_obmc_sub_pixel_variance64x16)
-
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
-                    aom_highbd_10_obmc_variance16x64,
-                    aom_highbd_10_obmc_sub_pixel_variance16x64)
-
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
-                    aom_highbd_10_obmc_variance32x8,
-                    aom_highbd_10_obmc_sub_pixel_variance32x8)
-
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
-                    aom_highbd_10_obmc_variance8x32,
-                    aom_highbd_10_obmc_sub_pixel_variance8x32)
-
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
-                    aom_highbd_10_obmc_variance16x4,
-                    aom_highbd_10_obmc_sub_pixel_variance16x4)
-
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
-                    aom_highbd_10_obmc_variance4x16,
-                    aom_highbd_10_obmc_sub_pixel_variance4x16)
-        break;
-
-      case AOM_BITS_12:
-        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
-                   aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
-                   aom_highbd_12_sub_pixel_variance64x16,
-                   aom_highbd_12_sub_pixel_avg_variance64x16,
-                   aom_highbd_sad64x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
-
-        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
-                   aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
-                   aom_highbd_12_sub_pixel_variance16x64,
-                   aom_highbd_12_sub_pixel_avg_variance16x64,
-                   aom_highbd_sad16x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
-
-        HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
-                   aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
-                   aom_highbd_12_sub_pixel_variance32x8,
-                   aom_highbd_12_sub_pixel_avg_variance32x8,
-                   aom_highbd_sad32x8x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x8_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
-
-        HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
-                   aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
-                   aom_highbd_12_sub_pixel_variance8x32,
-                   aom_highbd_12_sub_pixel_avg_variance8x32,
-                   aom_highbd_sad8x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad8x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
-
-        HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
-                   aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
-                   aom_highbd_12_sub_pixel_variance16x4,
-                   aom_highbd_12_sub_pixel_avg_variance16x4,
-                   aom_highbd_sad16x4x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x4_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
-
-        HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
-                   aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
-                   aom_highbd_12_sub_pixel_variance4x16,
-                   aom_highbd_12_sub_pixel_avg_variance4x16,
-                   aom_highbd_sad4x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad4x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
-
-        HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
-                   aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
-                   aom_highbd_12_sub_pixel_variance32x16,
-                   aom_highbd_12_sub_pixel_avg_variance32x16,
-                   aom_highbd_sad32x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
-
-        HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
-                   aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
-                   aom_highbd_12_sub_pixel_variance16x32,
-                   aom_highbd_12_sub_pixel_avg_variance16x32,
-                   aom_highbd_sad16x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
-
-        HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
-                   aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
-                   aom_highbd_12_sub_pixel_variance64x32,
-                   aom_highbd_12_sub_pixel_avg_variance64x32,
-                   aom_highbd_sad64x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
-
-        HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
-                   aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
-                   aom_highbd_12_sub_pixel_variance32x64,
-                   aom_highbd_12_sub_pixel_avg_variance32x64,
-                   aom_highbd_sad32x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
-
-        HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
-                   aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
-                   aom_highbd_12_sub_pixel_variance32x32,
-                   aom_highbd_12_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x4d_bits12,
-                   aom_highbd_dist_wtd_sad32x32_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
-
-        HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
-                   aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
-                   aom_highbd_12_sub_pixel_variance64x64,
-                   aom_highbd_12_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
-
-        HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
-                   aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
-                   aom_highbd_12_sub_pixel_variance16x16,
-                   aom_highbd_12_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
-
-        HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
-                   aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
-                   aom_highbd_12_sub_pixel_variance16x8,
-                   aom_highbd_12_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x4d_bits12,
-                   aom_highbd_dist_wtd_sad16x8_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
-
-        HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
-                   aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
-                   aom_highbd_12_sub_pixel_variance8x16,
-                   aom_highbd_12_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x4d_bits12,
-                   aom_highbd_dist_wtd_sad8x16_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
-
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
-            aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
-            aom_highbd_12_sub_pixel_avg_variance8x8,
-            aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
-
-        HIGHBD_BFP(
-            BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
-            aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
-            aom_highbd_12_sub_pixel_avg_variance8x4,
-            aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
-
-        HIGHBD_BFP(
-            BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
-            aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
-            aom_highbd_12_sub_pixel_avg_variance4x8,
-            aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
-
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
-            aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
-            aom_highbd_12_sub_pixel_avg_variance4x4,
-            aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
-            aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
-
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
-                   aom_highbd_sad128x128_avg_bits12,
-                   aom_highbd_12_variance128x128,
-                   aom_highbd_12_sub_pixel_variance128x128,
-                   aom_highbd_12_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x4d_bits12,
-                   aom_highbd_dist_wtd_sad128x128_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
-                   aom_highbd_sad128x64_avg_bits12,
-                   aom_highbd_12_variance128x64,
-                   aom_highbd_12_sub_pixel_variance128x64,
-                   aom_highbd_12_sub_pixel_avg_variance128x64,
-                   aom_highbd_sad128x64x4d_bits12,
-                   aom_highbd_dist_wtd_sad128x64_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
-                   aom_highbd_sad64x128_avg_bits12,
-                   aom_highbd_12_variance64x128,
-                   aom_highbd_12_sub_pixel_variance64x128,
-                   aom_highbd_12_sub_pixel_avg_variance64x128,
-                   aom_highbd_sad64x128x4d_bits12,
-                   aom_highbd_dist_wtd_sad64x128_avg_bits12,
-                   aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
-
-        HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance128x128)
-        HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance128x64)
-        HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x128)
-        HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x64)
-        HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x32)
-        HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x64)
-        HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x32)
-        HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x16)
-        HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x32)
-        HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x16)
-        HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x16)
-        HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x8)
-        HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x8)
-        HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x8)
-        HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x4)
-        HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x4)
-        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance64x16)
-        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x64)
-        HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x8)
-        HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance8x32)
-        HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance16x4)
-        HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance4x16)
-        HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
-                    aom_highbd_12_obmc_variance128x128,
-                    aom_highbd_12_obmc_sub_pixel_variance128x128)
-        HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
-                    aom_highbd_12_obmc_variance128x64,
-                    aom_highbd_12_obmc_sub_pixel_variance128x64)
-        HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
-                    aom_highbd_12_obmc_variance64x128,
-                    aom_highbd_12_obmc_sub_pixel_variance64x128)
-        HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
-                    aom_highbd_12_obmc_variance64x64,
-                    aom_highbd_12_obmc_sub_pixel_variance64x64)
-        HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
-                    aom_highbd_12_obmc_variance64x32,
-                    aom_highbd_12_obmc_sub_pixel_variance64x32)
-        HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
-                    aom_highbd_12_obmc_variance32x64,
-                    aom_highbd_12_obmc_sub_pixel_variance32x64)
-        HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
-                    aom_highbd_12_obmc_variance32x32,
-                    aom_highbd_12_obmc_sub_pixel_variance32x32)
-        HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
-                    aom_highbd_12_obmc_variance32x16,
-                    aom_highbd_12_obmc_sub_pixel_variance32x16)
-        HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
-                    aom_highbd_12_obmc_variance16x32,
-                    aom_highbd_12_obmc_sub_pixel_variance16x32)
-        HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
-                    aom_highbd_12_obmc_variance16x16,
-                    aom_highbd_12_obmc_sub_pixel_variance16x16)
-        HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
-                    aom_highbd_12_obmc_variance8x16,
-                    aom_highbd_12_obmc_sub_pixel_variance8x16)
-        HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
-                    aom_highbd_12_obmc_variance16x8,
-                    aom_highbd_12_obmc_sub_pixel_variance16x8)
-        HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
-                    aom_highbd_12_obmc_variance8x8,
-                    aom_highbd_12_obmc_sub_pixel_variance8x8)
-        HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
-                    aom_highbd_12_obmc_variance4x8,
-                    aom_highbd_12_obmc_sub_pixel_variance4x8)
-        HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
-                    aom_highbd_12_obmc_variance8x4,
-                    aom_highbd_12_obmc_sub_pixel_variance8x4)
-        HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
-                    aom_highbd_12_obmc_variance4x4,
-                    aom_highbd_12_obmc_sub_pixel_variance4x4)
-        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
-                    aom_highbd_12_obmc_variance64x16,
-                    aom_highbd_12_obmc_sub_pixel_variance64x16)
-        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
-                    aom_highbd_12_obmc_variance16x64,
-                    aom_highbd_12_obmc_sub_pixel_variance16x64)
-        HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
-                    aom_highbd_12_obmc_variance32x8,
-                    aom_highbd_12_obmc_sub_pixel_variance32x8)
-        HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
-                    aom_highbd_12_obmc_variance8x32,
-                    aom_highbd_12_obmc_sub_pixel_variance8x32)
-        HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
-                    aom_highbd_12_obmc_variance16x4,
-                    aom_highbd_12_obmc_sub_pixel_variance16x4)
-        HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
-                    aom_highbd_12_obmc_variance4x16,
-                    aom_highbd_12_obmc_sub_pixel_variance4x16)
-        break;
-
-      default:
-        assert(0 &&
-               "cm->seq_params.bit_depth should be AOM_BITS_8, "
-               "AOM_BITS_10 or AOM_BITS_12");
-    }
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-static void realloc_segmentation_maps(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-
-  // Create the encoder segmentation map and set all entries to 0
-  aom_free(cpi->enc_seg.map);
-  CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
-                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
-
-  // Create a map used for cyclic background refresh.
-  if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(
-      cm, cpi->cyclic_refresh,
-      av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
-
-  // Create a map used to mark inactive areas.
-  aom_free(cpi->active_map.map);
-  CHECK_MEM_ERROR(cm, cpi->active_map.map,
-                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
-}
-
-static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
-                                                uint8_t *block_mis_log2) {
-  const int is_720p_or_larger = AOMMIN(width, height) >= 720;
-
-  // 0: 4x4, 1: 8x8, 2: 16x16
-  *block_mis_log2 = is_720p_or_larger ? 2 : 1;
+  av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
 }
 
-void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
-                                        CompoundTypeRdBuffers *const bufs) {
-  CHECK_MEM_ERROR(
-      cm, bufs->pred0,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
-  CHECK_MEM_ERROR(
-      cm, bufs->pred1,
-      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->residual1,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->diff10,
-      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
-  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
-                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
-                                        sizeof(*bufs->tmp_best_mask_buf)));
-}
-
-void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs) {
-  aom_free(bufs->pred0);
-  aom_free(bufs->pred1);
-  aom_free(bufs->residual1);
-  aom_free(bufs->diff10);
-  aom_free(bufs->tmp_best_mask_buf);
-  av1_zero(*bufs);  // Set all pointers to NULL for safety.
-}
-
-static void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level,
-                                int tier) {
-  aom_clear_system_state();
-
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  SequenceHeader *const seq_params = &cpi->common.seq_params;
-
-  // Adjust target bitrate to be no larger than 70% of level limit.
-  const BITSTREAM_PROFILE profile = seq_params->profile;
-  const double level_bitrate_limit =
-      av1_get_max_bitrate_for_level(target_level, tier, profile);
-  const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
-  oxcf->target_bandwidth = AOMMIN(oxcf->target_bandwidth, max_bitrate);
-  // Also need to update cpi->twopass.bits_left.
-  TWO_PASS *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
-  if (stats != NULL)
-    cpi->twopass.bits_left =
-        (int64_t)(stats->duration * cpi->oxcf.target_bandwidth / 10000000.0);
-
-  // Adjust max over-shoot percentage.
-  oxcf->over_shoot_pct = 0;
-
-  // Adjust max quantizer.
-  oxcf->worst_allowed_q = 255;
-
-  // Adjust number of tiles and tile columns to be under level limit.
-  int max_tiles, max_tile_cols;
-  av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
-  while (oxcf->tile_columns > 0 && (1 << oxcf->tile_columns) > max_tile_cols) {
-    --oxcf->tile_columns;
-  }
-  const int tile_cols = (1 << oxcf->tile_columns);
-  while (oxcf->tile_rows > 0 &&
-         tile_cols * (1 << oxcf->tile_rows) > max_tiles) {
-    --oxcf->tile_rows;
-  }
-
-  // Adjust min compression ratio.
-  const int still_picture = seq_params->still_picture;
-  const double min_cr =
-      av1_get_min_cr_for_level(target_level, tier, still_picture);
-  oxcf->min_cr = AOMMAX(oxcf->min_cr, (unsigned int)(min_cr * 100));
-}
-
-void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  const int num_planes = av1_num_planes(cm);
-  RATE_CONTROL *const rc = &cpi->rc;
-  MACROBLOCK *const x = &cpi->td.mb;
-  AV1LevelParams *const level_params = &cpi->level_params;
+void av1_change_config_seq(struct AV1_PRIMARY *ppi,
+                           const AV1EncoderConfig *oxcf,
+                           bool *is_sb_size_changed) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const color_cfg = &oxcf->color_cfg;
 
   if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
-  seq_params->bit_depth = oxcf->bit_depth;
-  seq_params->color_primaries = oxcf->color_primaries;
-  seq_params->transfer_characteristics = oxcf->transfer_characteristics;
-  seq_params->matrix_coefficients = oxcf->matrix_coefficients;
-  seq_params->monochrome = oxcf->monochrome;
-  seq_params->chroma_sample_position = oxcf->chroma_sample_position;
-  seq_params->color_range = oxcf->color_range;
+  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+  seq_params->color_primaries = color_cfg->color_primaries;
+  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+  seq_params->color_range = color_cfg->color_range;
 
   assert(IMPLIES(seq_params->profile <= PROFILE_1,
                  seq_params->bit_depth <= AOM_BITS_10));
 
-  seq_params->timing_info_present = oxcf->timing_info_present;
+  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
   seq_params->timing_info.num_units_in_display_tick =
-      oxcf->timing_info.num_units_in_display_tick;
-  seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+      dec_model_cfg->timing_info.num_units_in_display_tick;
+  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
   seq_params->timing_info.equal_picture_interval =
-      oxcf->timing_info.equal_picture_interval;
+      dec_model_cfg->timing_info.equal_picture_interval;
   seq_params->timing_info.num_ticks_per_picture =
-      oxcf->timing_info.num_ticks_per_picture;
+      dec_model_cfg->timing_info.num_ticks_per_picture;
 
   seq_params->display_model_info_present_flag =
-      oxcf->display_model_info_present_flag;
+      dec_model_cfg->display_model_info_present_flag;
   seq_params->decoder_model_info_present_flag =
-      oxcf->decoder_model_info_present_flag;
-  if (oxcf->decoder_model_info_present_flag) {
+      dec_model_cfg->decoder_model_info_present_flag;
+  if (dec_model_cfg->decoder_model_info_present_flag) {
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
-        oxcf->buffer_model.num_units_in_decoding_tick;
-    cm->buffer_removal_time_present = 1;
+        dec_model_cfg->num_units_in_decoding_tick;
+    ppi->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
   } else if (seq_params->timing_info_present &&
@@ -2828,10 +636,70 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
         10;  // Default value (not signaled)
   }
 
-  update_film_grain_parameters(cpi, oxcf);
+  av1_update_film_grain_parameters_seq(ppi, oxcf);
+
+  int sb_size = seq_params->sb_size;
+  // Superblock size should not be updated after the first key frame.
+  if (!ppi->seq_params_locked) {
+    set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width,
+                                               frm_dim_cfg->height,
+                                               ppi->number_spatial_layers));
+    for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+      seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+  }
+  if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size)
+    *is_sb_size_changed = true;
+
+  // Init sequence level coding tools
+  // This should not be called after the first key frame.
+  if (!ppi->seq_params_locked) {
+    seq_params->operating_points_cnt_minus_1 =
+        (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
+            ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
+            : 0;
+    av1_init_seq_coding_tools(ppi, oxcf, ppi->use_svc);
+  }
+  seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  highbd_set_var_fns(ppi);
+#endif
+
+  set_primary_rc_buffer_sizes(oxcf, ppi);
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+                       bool is_sb_size_changed) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  MACROBLOCK *const x = &cpi->td.mb;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+  RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  // in case of LAP, lag in frames is set according to number of lap buffers
+  // calculated at init time. This stores and restores LAP's lag in frames to
+  // prevent override by new cfg.
+  int lap_lag_in_frames = -1;
+  if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+    lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+  }
+
+  av1_update_film_grain_parameters(cpi, oxcf);
 
   cpi->oxcf = *oxcf;
-  cpi->superres_mode = oxcf->superres_mode;  // default
+  // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
+  // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
+  // that any analysis (e.g. TPL) happening outside the main encoding loop still
+  // happens at full resolution.
+  // This value will later be set appropriately just before main encoding loop.
+  cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO
+                           ? AOM_SUPERRES_NONE
+                           : oxcf->superres_cfg.superres_mode;  // default
   x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
 
@@ -2856,19 +724,20 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
                         seq_params->tier[0]);
   }
 
-  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
-    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) {
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
   } else {
-    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+    p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_bwd_ref_frame = 0;
+  refresh_frame->golden_frame = false;
+  refresh_frame->bwd_ref_frame = false;
 
-  cm->features.refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
-                                           ? REFRESH_FRAME_CONTEXT_DISABLED
-                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
-  if (oxcf->large_scale_tile)
+  cm->features.refresh_frame_context =
+      (oxcf->tool_cfg.frame_parallel_decoding_mode)
+          ? REFRESH_FRAME_CONTEXT_DISABLED
+          : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->tile_cfg.enable_large_scale_tile)
     cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   if (x->palette_buffer == NULL) {
@@ -2876,22 +745,27 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
                     aom_memalign(16, sizeof(*x->palette_buffer)));
   }
 
-  if (x->comp_rd_buffer.pred0 == NULL) {
-    av1_alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
-  }
-
   if (x->tmp_conv_dst == NULL) {
     CHECK_MEM_ERROR(
         cm, x->tmp_conv_dst,
         aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
     x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
   }
-  for (int i = 0; i < 2; ++i) {
-    if (x->tmp_obmc_bufs[i] == NULL) {
-      CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
-                      aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                           sizeof(*x->tmp_obmc_bufs[i])));
-      x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+  // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames
+  // to store intermediate inter mode prediction results and are not required
+  // for allintra encoding mode. Hence, the memory allocations for these buffers
+  // are avoided for allintra encoding mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+    if (x->comp_rd_buffer.pred0 == NULL)
+      alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer);
+
+    for (int i = 0; i < 2; ++i) {
+      if (x->tmp_pred_bufs[i] == NULL) {
+        CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
+                        aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                             sizeof(*x->tmp_pred_bufs[i])));
+        x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+      }
     }
   }
 
@@ -2899,124 +773,71 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
 
   av1_set_high_precision_mv(cpi, 1, 0);
 
-  set_rc_buffer_sizes(rc, &cpi->oxcf);
-
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   av1_new_framerate(cpi, cpi->framerate);
 
   // Set absolute upper and lower quality limits
-  rc->worst_quality = cpi->oxcf.worst_allowed_q;
-  rc->best_quality = cpi->oxcf.best_allowed_q;
+  rc->worst_quality = rc_cfg->worst_allowed_q;
+  rc->best_quality = rc_cfg->best_allowed_q;
 
   cm->features.interp_filter =
-      oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+      oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
   cm->features.switchable_motion_mode = 1;
 
-  if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
-    cm->render_width = cpi->oxcf.render_width;
-    cm->render_height = cpi->oxcf.render_height;
+  if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
+    cm->render_width = frm_dim_cfg->render_width;
+    cm->render_height = frm_dim_cfg->render_height;
   } else {
-    cm->render_width = cpi->oxcf.width;
-    cm->render_height = cpi->oxcf.height;
-  }
-  cm->width = cpi->oxcf.width;
-  cm->height = cpi->oxcf.height;
-
-  int sb_size = seq_params->sb_size;
-  // Superblock size should not be updated after the first key frame.
-  if (!cpi->seq_params_locked) {
-    set_sb_size(&cm->seq_params, select_sb_size(cpi));
-    for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
-      seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+    cm->render_width = frm_dim_cfg->width;
+    cm->render_height = frm_dim_cfg->height;
   }
+  cm->width = frm_dim_cfg->width;
+  cm->height = frm_dim_cfg->height;
 
-  if (cpi->initial_width || sb_size != seq_params->sb_size) {
-    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
-        seq_params->sb_size != sb_size) {
+  if (initial_dimensions->width || is_sb_size_changed) {
+    if (cm->width > initial_dimensions->width ||
+        cm->height > initial_dimensions->height || is_sb_size_changed) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
+      av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+      av1_free_sms_tree(&cpi->td);
+      av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+      cpi->td.firstpass_ctx = NULL;
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
-      cpi->initial_width = cpi->initial_height = 0;
+      initial_dimensions->width = initial_dimensions->height = 0;
     }
   }
-  update_frame_size(cpi);
+  av1_update_frame_size(cpi);
 
   rc->is_src_frame_alt_ref = 0;
 
-  set_tile_info(cpi);
+  set_tile_info(cm, &cpi->oxcf.tile_cfg);
 
-  if (!cpi->svc.external_ref_frame_config)
-    cpi->ext_flags.refresh_frame_flags_pending = 0;
+  if (!cpi->svc.set_ref_frame_config)
+    cpi->ext_flags.refresh_frame.update_pending = 0;
   cpi->ext_flags.refresh_frame_context_pending = 0;
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  highbd_set_var_fns(cpi);
-#endif
-
-  // Init sequence level coding tools
-  // This should not be called after the first key frame.
-  if (!cpi->seq_params_locked) {
-    seq_params->operating_points_cnt_minus_1 =
-        (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
-            ? cm->number_spatial_layers * cm->number_temporal_layers - 1
-            : 0;
-    init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
-  }
-
-  if (cpi->use_svc)
-    av1_update_layer_context_change_config(cpi, oxcf->target_bandwidth);
-}
-
-static INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
-                                     TplParams *const tpl_data) {
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-  set_tpl_stats_block_size(cm->width, cm->height,
-                           &tpl_data->tpl_stats_block_mis_log2);
-  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  if (cpi->ppi->use_svc)
+    av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
 
-  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
-    const int mi_cols =
-        ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
-    const int mi_rows =
-        ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+  check_reset_rc_flag(cpi);
 
-    tpl_data->tpl_stats_buffer[frame].is_valid = 0;
-    tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
-    tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
-    tpl_data->tpl_stats_buffer[frame].stride =
-        tpl_data->tpl_stats_buffer[frame].width;
-    tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
-    tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
+  // restore the value of lag_in_frame for LAP stage.
+  if (lap_lag_in_frames != -1) {
+    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
   }
-
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    CHECK_MEM_ERROR(
-        cm, tpl_data->tpl_stats_pool[frame],
-        aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
-                       tpl_data->tpl_stats_buffer[frame].height,
-                   sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
-    if (aom_alloc_frame_buffer(
-            &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
-            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
-            cm->features.byte_alignment))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate frame buffer");
-  }
-
-  tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
 }
 
 static INLINE void init_frame_info(FRAME_INFO *frame_info,
                                    const AV1_COMMON *const cm) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   frame_info->frame_width = cm->width;
   frame_info->frame_height = cm->height;
   frame_info->mi_cols = mi_params->mi_cols;
@@ -3029,248 +850,78 @@ static INLINE void init_frame_info(FRAME_INFO *frame_info,
   frame_info->subsampling_y = seq_params->subsampling_y;
 }
 
-AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
-                                FIRSTPASS_STATS *frame_stats_buf,
-                                COMPRESSOR_STAGE stage, int num_lap_buffers,
-                                int lap_lag_in_frames,
-                                STATS_BUFFER_CTX *stats_buf_context) {
-  AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
-  AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) {
+  frame_index_set->show_frame_count = 0;
+}
 
-  if (!cm) return NULL;
+static INLINE void update_frame_index_set(FRAME_INDEX_SET *frame_index_set,
+                                          int is_show_frame) {
+  if (is_show_frame) {
+    frame_index_set->show_frame_count++;
+  }
+}
 
-  av1_zero(*cpi);
+AV1_PRIMARY *av1_create_primary_compressor(
+    struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+    const AV1EncoderConfig *oxcf) {
+  AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY));
+  if (!ppi) return NULL;
+  av1_zero(*ppi);
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
-    cm->error.setjmp = 0;
-    av1_remove_compressor(cpi);
+  if (setjmp(ppi->error.jmp)) {
+    ppi->error.setjmp = 0;
+    av1_remove_primary_compressor(ppi);
     return 0;
   }
+  ppi->error.setjmp = 1;
 
-  cm->error.setjmp = 1;
-  cpi->lap_enabled = num_lap_buffers > 0;
-  cpi->compressor_stage = stage;
+  ppi->seq_params_locked = 0;
+  ppi->lap_enabled = num_lap_buffers > 0;
+  ppi->output_pkt_list = pkt_list_head;
+  ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+  ppi->frames_left = oxcf->input_cfg.limit;
+  ppi->num_fp_contexts = 1;
 
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-  mi_params->free_mi = enc_free_mi;
-  mi_params->setup_mi = enc_setup_mi;
-  mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE)
-                             ? stat_stage_set_mb_mi
-                             : enc_set_mb_mi;
-
-  mi_params->mi_alloc_bsize = BLOCK_4X4;
-
-  CHECK_MEM_ERROR(cm, cm->fc,
-                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
-  CHECK_MEM_ERROR(
-      cm, cm->default_frame_context,
-      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
-  memset(cm->fc, 0, sizeof(*cm->fc));
-  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+  init_config_sequence(ppi, oxcf);
 
-  cpi->common.buffer_pool = pool;
-
-  init_config(cpi, oxcf);
-  if (cpi->compressor_stage == LAP_STAGE) {
-    cpi->oxcf.lag_in_frames = lap_lag_in_frames;
-  }
-
-  av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
-
-  cpi->rc.enable_scenecut_detection = 1;
-  if (cpi->lap_enabled &&
-      (num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)))
-    cpi->rc.enable_scenecut_detection = 0;
-  init_frame_info(&cpi->frame_info, cm);
-
-  cm->current_frame.frame_number = 0;
-  cm->current_frame_id = -1;
-  cpi->seq_params_locked = 0;
-  cpi->partition_search_skippable_frame = 0;
-  cpi->tile_data = NULL;
-  cpi->last_show_frame_buf = NULL;
-  realloc_segmentation_maps(cpi);
-
-  cpi->refresh_alt_ref_frame = 0;
-
-  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_blockiness = 1;
-  cpi->b_calculate_consistency = 1;
-  cpi->total_inconsistency = 0;
-  cpi->psnr.worst = 100.0;
-  cpi->worst_ssim = 100.0;
-
-  cpi->count = 0;
-  cpi->bytes = 0;
-#if CONFIG_SPEED_STATS
-  cpi->tx_search_count = 0;
-#endif  // CONFIG_SPEED_STATS
-
-  if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0;
-    cpi->total_samples = 0;
-    cpi->tot_recode_hits = 0;
-    cpi->summed_quality = 0;
-    cpi->summed_weights = 0;
-  }
-
-  cpi->fastssim.worst = 100.0;
-  cpi->psnrhvs.worst = 100.0;
-
-  if (cpi->b_calculate_blockiness) {
-    cpi->total_blockiness = 0;
-    cpi->worst_blockiness = 0.0;
-  }
-
-  if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(
-        cm, cpi->ssim_vars,
-        aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows *
-                   cpi->common.mi_params.mi_cols));
-    cpi->worst_consistency = 100.0;
-  }
-#endif
 #if CONFIG_ENTROPY_STATS
-  av1_zero(aggregate_fc);
+  av1_zero(ppi->aggregate_fc);
 #endif  // CONFIG_ENTROPY_STATS
 
-  cpi->time_stamps.first_ever = INT64_MAX;
-
-#ifdef OUTPUT_YUV_SKINMAP
-  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
-#endif
-#ifdef OUTPUT_YUV_REC
-  yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-
-  assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
-  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
-  for (int i = 0; i < size; i++)
-    cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i];
-
-  cpi->twopass.stats_buf_ctx = stats_buf_context;
-  cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
-
-#if !CONFIG_REALTIME_ONLY
-  if (is_stat_consumption_stage(cpi)) {
-    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
-
-    if (!cpi->lap_enabled) {
-      /*Re-initialize to stats buffer, populated by application in the case of
-       * two pass*/
-      cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->two_pass_stats_in.buf;
-      cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
-      cpi->twopass.stats_buf_ctx->stats_in_end =
-          &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
-
-      av1_init_second_pass(cpi);
-    } else {
-      av1_init_single_pass_lap(cpi);
+  av1_primary_rc_init(oxcf, &ppi->p_rc);
+
+  // For two pass and lag_in_frames > 33 in LAP.
+  ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+  if (ppi->lap_enabled) {
+    if ((num_lap_buffers <
+         (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
+        num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
+      /*
+       * For lag in frames >= 19 and <33, enable scenecut
+       * with limited future frame prediction.
+       */
+      ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+    } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
+      // Disable scenecut when lag_in_frames < 19.
+      ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT;
     }
   }
-#endif
-
-  int sb_mi_size = av1_get_sb_mi_size(cm);
-
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.above_pred_buf,
-      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.above_pred_buf)));
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.left_pred_buf,
-      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.left_pred_buf)));
-
-  CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
-                  (int32_t *)aom_memalign(
-                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
-
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.inter_modes_info,
-      (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
-
-  for (int x = 0; x < 2; x++)
-    for (int y = 0; y < 2; y++)
-      CHECK_MEM_ERROR(
-          cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
-          (uint32_t *)aom_malloc(
-              AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
-              sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
-
-  cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
-
-  CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
-                  (int32_t *)aom_memalign(
-                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
-
-  CHECK_MEM_ERROR(cm, cpi->td.mb.mbmi_ext,
-                  aom_calloc(sb_mi_size, sizeof(*cpi->td.mb.mbmi_ext)));
-
-  av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
-  av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
-
-  {
-    const int bsize = BLOCK_16X16;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->tpl_rdmult_scaling_factors)));
-    CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->tpl_sb_rdmult_scaling_factors)));
-  }
-
-  {
-    const int bsize = BLOCK_16X16;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->ssim_rdmult_scaling_factors)));
-  }
-
-#if CONFIG_TUNE_VMAF
-  {
-    const int bsize = BLOCK_64X64;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->vmaf_rdmult_scaling_factors)));
-    cpi->last_frame_unsharp_amount = 0.0;
-  }
-#endif
-
-  if (!is_stat_generation_stage(cpi)) {
-    setup_tpl_buffers(cm, &cpi->tpl_data);
-  }
-
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  av1_zero(cpi->partition_stats);
-#endif
 
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                    \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                  \
-  cpi->fn_ptr[BT].vf = VF;                                      \
-  cpi->fn_ptr[BT].svf = SVF;                                    \
-  cpi->fn_ptr[BT].svaf = SVAF;                                  \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                              \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
-
+  ppi->fn_ptr[BT].sdf = SDF;                                    \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                  \
+  ppi->fn_ptr[BT].vf = VF;                                      \
+  ppi->fn_ptr[BT].svf = SVF;                                    \
+  ppi->fn_ptr[BT].svaf = SVAF;                                  \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                              \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                \
+  ppi->fn_ptr[BT].jsvaf = JSVAF;
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
   BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
       aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
       aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
@@ -3300,6 +951,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
       aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
       aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x16)
+#endif  // !CONFIG_REALTIME_ONLY
 
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
       aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
@@ -3377,10 +1029,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
       aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
       aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
 
+#if !CONFIG_REALTIME_ONLY
 #define OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;    \
-  cpi->fn_ptr[BT].ovf = OVF;      \
-  cpi->fn_ptr[BT].osvf = OSVF;
+  ppi->fn_ptr[BT].osdf = OSDF;    \
+  ppi->fn_ptr[BT].ovf = OVF;      \
+  ppi->fn_ptr[BT].osvf = OSVF;
 
   OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
        aom_obmc_sub_pixel_variance128x128)
@@ -3426,10 +1079,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
        aom_obmc_sub_pixel_variance16x64)
   OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
        aom_obmc_sub_pixel_variance64x16)
+#endif  // !CONFIG_REALTIME_ONLY
 
 #define MBFP(BT, MCSDF, MCSVF)  \
-  cpi->fn_ptr[BT].msdf = MCSDF; \
-  cpi->fn_ptr[BT].msvf = MCSVF;
+  ppi->fn_ptr[BT].msdf = MCSDF; \
+  ppi->fn_ptr[BT].msvf = MCSVF;
 
   MBFP(BLOCK_128X128, aom_masked_sad128x128,
        aom_masked_sub_pixel_variance128x128)
@@ -3449,39 +1103,346 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
   MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
   MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
 
+#if !CONFIG_REALTIME_ONLY
   MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
-
   MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
-
   MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
-
   MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
-
   MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
-
   MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+#endif
+
+#define SDSFP(BT, SDSF, SDSX4DF) \
+  ppi->fn_ptr[BT].sdsf = SDSF;   \
+  ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+  SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d)
+  SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d)
+  SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d)
+  SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d)
+  SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d)
+
+  SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d)
+  SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d)
+  SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d)
+
+  SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d)
+  SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d)
+  SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d)
+  SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d)
+  SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d)
+
+  SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d)
+  SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d)
+  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d)
+  SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d)
+  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d)
+#endif
+#undef SDSFP
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  highbd_set_var_fns(cpi);
+  highbd_set_var_fns(ppi);
+#endif
+
+  {
+    // As cm->mi_params is a part of the frame level context (cpi), it is
+    // unavailable at this point. mi_params is created as a local temporary
+    // variable, to be passed into the functions used for allocating tpl
+    // buffers. The values in this variable are populated according to initial
+    // width and height of the frame.
+    CommonModeInfoParams mi_params;
+    enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+                  BLOCK_4X4);
+
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params.mi_cols + w - 1) / w;
+    const int num_rows = (mi_params.mi_rows + h - 1) / h;
+    AOM_CHECK_MEM_ERROR(
+        &ppi->error, ppi->tpl_sb_rdmult_scaling_factors,
+        aom_calloc(num_rows * num_cols,
+                   sizeof(*ppi->tpl_sb_rdmult_scaling_factors)));
+
+#if CONFIG_INTERNAL_STATS
+    ppi->b_calculate_blockiness = 1;
+    ppi->b_calculate_consistency = 1;
+
+    for (int i = 0; i <= STAT_ALL; i++) {
+      ppi->psnr[0].stat[i] = 0;
+      ppi->psnr[1].stat[i] = 0;
+
+      ppi->fastssim.stat[i] = 0;
+      ppi->psnrhvs.stat[i] = 0;
+    }
+
+    ppi->psnr[0].worst = 100.0;
+    ppi->psnr[1].worst = 100.0;
+    ppi->worst_ssim = 100.0;
+    ppi->worst_ssim_hbd = 100.0;
+
+    ppi->count[0] = 0;
+    ppi->count[1] = 0;
+    ppi->total_bytes = 0;
+
+    if (ppi->b_calculate_psnr) {
+      ppi->total_sq_error[0] = 0;
+      ppi->total_samples[0] = 0;
+      ppi->total_sq_error[1] = 0;
+      ppi->total_samples[1] = 0;
+      ppi->total_recode_hits = 0;
+      ppi->summed_quality = 0;
+      ppi->summed_weights = 0;
+      ppi->summed_quality_hbd = 0;
+      ppi->summed_weights_hbd = 0;
+    }
+
+    ppi->fastssim.worst = 100.0;
+    ppi->psnrhvs.worst = 100.0;
+
+    if (ppi->b_calculate_blockiness) {
+      ppi->total_blockiness = 0;
+      ppi->worst_blockiness = 0.0;
+    }
+
+    ppi->total_inconsistency = 0;
+    ppi->worst_consistency = 100.0;
+    if (ppi->b_calculate_consistency) {
+      AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars,
+                          aom_malloc(sizeof(*ppi->ssim_vars) * 4 *
+                                     mi_params.mi_rows * mi_params.mi_cols));
+    }
+#endif
+  }
+
+  ppi->error.setjmp = 0;
+  return ppi;
+}
+
+AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+                                BufferPool *const pool, COMPRESSOR_STAGE stage,
+                                int lap_lag_in_frames) {
+  AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+  AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm) return NULL;
+
+  av1_zero(*cpi);
+
+  cpi->ppi = ppi;
+  cm->seq_params = &ppi->seq_params;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cm->error =
+      (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
+  if (!cm->error) {
+    aom_free(cpi);
+    return NULL;
+  }
+#else
+  cm->error = &ppi->error;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error->jmp)) {
+    cm->error->setjmp = 0;
+    av1_remove_compressor(cpi);
+    return NULL;
+  }
+
+  cm->error->setjmp = 1;
+  cpi->compressor_stage = stage;
+
+  cpi->do_frame_data_update = true;
+
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  mi_params->free_mi = enc_free_mi;
+  mi_params->setup_mi = enc_setup_mi;
+  mi_params->set_mb_mi =
+      (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE)
+          ? stat_stage_set_mb_mi
+          : enc_set_mb_mi;
+
+  mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(
+      cm, cm->default_frame_context,
+      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+  cpi->common.buffer_pool = pool;
+
+  init_config(cpi, oxcf);
+  if (cpi->compressor_stage == LAP_STAGE) {
+    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+  }
+
+  av1_rc_init(&cpi->oxcf, &cpi->rc);
+
+  init_frame_info(&cpi->frame_info, cm);
+  init_frame_index_set(&cpi->frame_index_set);
+
+  cm->current_frame.frame_number = 0;
+  cm->current_frame_id = -1;
+  cpi->tile_data = NULL;
+  cpi->last_show_frame_buf = NULL;
+  realloc_segmentation_maps(cpi);
+
+  cpi->refresh_frame.alt_ref_frame = false;
+
+#if CONFIG_SPEED_STATS
+  cpi->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+  cpi->time_stamps.first_ts_start = INT64_MAX;
+
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+  yuv_denoised_file = fopen("denoised.yuv", "wb");
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+  if (is_stat_consumption_stage(cpi)) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
+
+    if (!cpi->ppi->lap_enabled) {
+      /*Re-initialize to stats buffer, populated by application in the case of
+       * two pass*/
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_start =
+          oxcf->twopass_stats_in.buf;
+      cpi->twopass_frame.stats_in =
+          cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_end =
+          &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+      // The buffer size is packets - 1 because the last packet is total_stats.
+      av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info,
+                              oxcf->twopass_stats_in.buf, packets - 1);
+      av1_init_second_pass(cpi);
+    } else {
+      av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0);
+      av1_init_single_pass_lap(cpi);
+    }
+  }
 #endif
 
+  // The buffer "obmc_buffer" is used in inter frames for fast obmc search.
+  // Hence, the memory allocation for the same is avoided for allintra encoding
+  // mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0)
+    alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error);
+
+  for (int x = 0; x < 2; x++)
+    for (int y = 0; y < 2; y++)
+      CHECK_MEM_ERROR(
+          cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+          (uint32_t *)aom_malloc(
+              AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+              sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+  cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+  av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                  aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2,
+                             sizeof(*cpi->consec_zero_mv)));
+
+  cpi->mb_weber_stats = NULL;
+  cpi->mb_delta_q = NULL;
+
+  {
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->ssim_rdmult_scaling_factors)));
+    CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->tpl_rdmult_scaling_factors)));
+  }
+
+#if CONFIG_TUNE_VMAF
+  {
+    const int bsize = BLOCK_64X64;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+    for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+      cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+      cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+      cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+    }
+    cpi->vmaf_info.original_qindex = -1;
+    cpi->vmaf_info.vmaf_model = NULL;
+  }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  {
+    const int w = mi_size_wide[butteraugli_rdo_bsize];
+    const int h = mi_size_high[butteraugli_rdo_bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(
+        cm, cpi->butteraugli_info.rdmult_scaling_factors,
+        aom_malloc(num_rows * num_cols *
+                   sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
+    memset(&cpi->butteraugli_info.source, 0,
+           sizeof(cpi->butteraugli_info.source));
+    memset(&cpi->butteraugli_info.resized_source, 0,
+           sizeof(cpi->butteraugli_info.resized_source));
+    cpi->butteraugli_info.recon_set = false;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  av1_zero(cpi->partition_stats);
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
   /* av1_init_quantizer() is first called here. Add check in
    * av1_frame_init_quantizer() so that av1_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
    * av1_init_quantizer() for every frame.
    */
   av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                     cm->seq_params.bit_depth);
+                     cm->seq_params->bit_depth);
   av1_qm_init(&cm->quant_params, av1_num_planes(cm));
 
   av1_loop_filter_init(cm);
   cm->superres_scale_denominator = SCALE_NUMERATOR;
-  cm->superres_upscaled_width = oxcf->width;
-  cm->superres_upscaled_height = oxcf->height;
+  cm->superres_upscaled_width = oxcf->frm_dim_cfg.width;
+  cm->superres_upscaled_height = oxcf->frm_dim_cfg.height;
+#if !CONFIG_REALTIME_ONLY
   av1_loop_restoration_precal();
+#endif
+
+  cpi->third_pass_ctx = NULL;
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+    av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL);
+  }
 
-  cm->error.setjmp = 0;
+  cpi->second_pass_log_stream = NULL;
 
+  cm->error->setjmp = 0;
   return cpi;
 }
 
@@ -3492,98 +1453,99 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
 #endif  // CONFIG_INTERNAL_STATS
 
-void av1_remove_compressor(AV1_COMP *cpi) {
-  AV1_COMMON *cm;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  int t;
+// This function will change the state and free the mutex of corresponding
+// workers and terminate the object. The object can not be re-used unless a call
+// to reset() is made.
+static AOM_INLINE void terminate_worker_data(AV1_PRIMARY *ppi) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  for (int t = p_mt_info->num_workers - 1; t >= 0; --t) {
+    AVxWorker *const worker = &p_mt_info->workers[t];
+    aom_get_worker_interface()->end(worker);
+  }
+}
 
-  if (!cpi) return;
+// Deallocate allocated thread_data.
+static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  for (int t = 1; t < p_mt_info->num_workers; ++t) {
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    thread_data->td = thread_data->original_td;
+#endif
+    aom_free(thread_data->td->tctx);
+    aom_free(thread_data->td->palette_buffer);
+    aom_free(thread_data->td->tmp_conv_dst);
+    release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
+    for (int j = 0; j < 2; ++j) {
+      aom_free(thread_data->td->tmp_pred_bufs[j]);
+    }
+    aom_free(thread_data->td->pixel_gradient_info);
+    aom_free(thread_data->td->src_var_info_of_4x4_sub_blocks);
+    release_obmc_buffers(&thread_data->td->obmc_buffer);
+    aom_free(thread_data->td->vt64x64);
+
+    for (int x = 0; x < 2; x++) {
+      for (int y = 0; y < 2; y++) {
+        aom_free(thread_data->td->hash_value_buffer[x][y]);
+        thread_data->td->hash_value_buffer[x][y] = NULL;
+      }
+    }
+    aom_free(thread_data->td->counts);
+    av1_free_pmc(thread_data->td->firstpass_ctx,
+                 ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE);
+    thread_data->td->firstpass_ctx = NULL;
+    av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
+    av1_free_sms_tree(thread_data->td);
+    aom_free(thread_data->td);
+  }
+}
 
-  cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
+  if (!ppi) return;
+#if !CONFIG_REALTIME_ONLY
+  av1_tf_info_free(&ppi->tf_info);
+#endif  // !CONFIG_REALTIME_ONLY
 
-  if (cm->current_frame.frame_number > 0) {
-#if CONFIG_ENTROPY_STATS
-    if (!is_stat_generation_stage(cpi)) {
-      fprintf(stderr, "Writing counts.stt\n");
-      FILE *f = fopen("counts.stt", "wb");
-      fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
-      fclose(f);
-    }
-#endif  // CONFIG_ENTROPY_STATS
-#if CONFIG_INTERNAL_STATS
-    aom_clear_system_state();
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    aom_free(ppi->level_params.level_info[i]);
+  }
+  av1_lookahead_destroy(ppi->lookahead);
 
-    if (!is_stat_generation_stage(cpi)) {
-      char headings[512] = { 0 };
-      char results[512] = { 0 };
-      FILE *f = fopen("opsnr.stt", "a");
-      double time_encoded =
-          (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
-          10000000.000;
-      double total_encode_time =
-          (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
-      const double dr =
-          (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
-      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
-      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
-      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
-
-      if (cpi->b_calculate_psnr) {
-        const double total_psnr = aom_sse_to_psnr(
-            (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
-        const double total_ssim =
-            100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-        snprintf(headings, sizeof(headings),
-                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
-                 "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
-                 "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
-                 "AVPsrnY\tAPsnrCb\tAPsnrCr");
-        snprintf(results, sizeof(results),
-                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f",
-                 dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
-                 cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
-                 total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
-                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
-                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
-                 cpi->psnr.stat[STAT_Y] / cpi->count,
-                 cpi->psnr.stat[STAT_U] / cpi->count,
-                 cpi->psnr.stat[STAT_V] / cpi->count);
-
-        if (cpi->b_calculate_blockiness) {
-          SNPRINT(headings, "\t  Block\tWstBlck");
-          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
-          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
-        }
+  aom_free(ppi->tpl_sb_rdmult_scaling_factors);
+  ppi->tpl_sb_rdmult_scaling_factors = NULL;
 
-        if (cpi->b_calculate_consistency) {
-          double consistency =
-              aom_sse_to_psnr((double)cpi->total_samples, peak,
-                              (double)cpi->total_inconsistency);
+  TplParams *const tpl_data = &ppi->tpl_data;
+  aom_free(tpl_data->txfm_stats_list);
 
-          SNPRINT(headings, "\tConsist\tWstCons");
-          SNPRINT2(results, "\t%7.3f", consistency);
-          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
-        }
+  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+    aom_free(tpl_data->tpl_stats_pool[frame]);
+    aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+    tpl_data->tpl_stats_pool[frame] = NULL;
+  }
 
-        SNPRINT(headings, "\t    Time\tRcErr\tAbsErr");
-        SNPRINT2(results, "\t%8.0f", total_encode_time);
-        SNPRINT2(results, "\t%7.2f", rate_err);
-        SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+#if !CONFIG_REALTIME_ONLY
+  av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
 
-        fprintf(f, "%s\tAPsnr611\n", headings);
-        fprintf(f, "%s\t%7.3f\n", results,
-                (6 * cpi->psnr.stat[STAT_Y] + cpi->psnr.stat[STAT_U] +
-                 cpi->psnr.stat[STAT_V]) /
-                    (cpi->count * 8));
-      }
+  terminate_worker_data(ppi);
+  free_thread_data(ppi);
 
-      fclose(f);
-    }
-#endif  // CONFIG_INTERNAL_STATS
+  aom_free(ppi->p_mt_info.tile_thr_data);
+  aom_free(ppi->p_mt_info.workers);
+
+  aom_free(ppi);
+}
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+  if (!cpi) return;
+#if CONFIG_RATECTRL_LOG
+  if (cpi->oxcf.pass == 3) {
+    rc_log_show(&cpi->rc_log);
+  }
+#endif  // CONFIG_RATECTRL_LOG
+
+  AV1_COMMON *cm = &cpi->common;
+  if (cm->current_frame.frame_number > 0) {
 #if CONFIG_SPEED_STATS
     if (!is_stat_generation_stage(cpi)) {
       fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
@@ -3592,88 +1554,71 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 
 #if CONFIG_COLLECT_PARTITION_STATS == 2
     if (!is_stat_generation_stage(cpi)) {
-      av1_print_partition_stats(&cpi->partition_stats);
+      av1_print_fr_partition_timing_stats(&cpi->partition_stats,
+                                          "fr_part_timing_data.csv");
     }
 #endif
   }
 
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    aom_free(tpl_data->tpl_stats_pool[frame]);
-    aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
-  }
-
-  for (t = cpi->num_workers - 1; t >= 0; --t) {
-    AVxWorker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
-
-    // Deallocate allocated threads.
-    aom_get_worker_interface()->end(worker);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  av1_denoiser_free(&(cpi->denoiser));
+#endif
 
-    // Deallocate allocated thread data.
-    aom_free(thread_data->td->tctx);
-    if (t > 0) {
-      aom_free(thread_data->td->palette_buffer);
-      aom_free(thread_data->td->tmp_conv_dst);
-      av1_release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
-      for (int j = 0; j < 2; ++j) {
-        aom_free(thread_data->td->tmp_obmc_bufs[j]);
-      }
-      aom_free(thread_data->td->above_pred_buf);
-      aom_free(thread_data->td->left_pred_buf);
-      aom_free(thread_data->td->wsrc_buf);
-      aom_free(thread_data->td->vt64x64);
-
-      aom_free(thread_data->td->inter_modes_info);
-      for (int x = 0; x < 2; x++) {
-        for (int y = 0; y < 2; y++) {
-          aom_free(thread_data->td->hash_value_buffer[x][y]);
-          thread_data->td->hash_value_buffer[x][y] = NULL;
-        }
-      }
-      aom_free(thread_data->td->mask_buf);
-      aom_free(thread_data->td->counts);
-      av1_free_pc_tree(cpi, thread_data->td, num_planes,
-                       cm->seq_params.sb_size);
-      aom_free(thread_data->td->mbmi_ext);
-      aom_free(thread_data->td);
-    }
-  }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  aom_free(cm->error);
+#endif
+  aom_free(cpi->td.tctx);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
 #if CONFIG_MULTITHREAD
-  if (cpi->row_mt_mutex_ != NULL) {
-    pthread_mutex_destroy(cpi->row_mt_mutex_);
-    aom_free(cpi->row_mt_mutex_);
+  pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+  pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
+  if (enc_row_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(enc_row_mt_mutex_);
+    aom_free(enc_row_mt_mutex_);
+  }
+  if (gm_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(gm_mt_mutex_);
+    aom_free(gm_mt_mutex_);
+  }
+  if (pack_bs_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(pack_bs_mt_mutex_);
+    aom_free(pack_bs_mt_mutex_);
   }
 #endif
   av1_row_mt_mem_dealloc(cpi);
-  aom_free(cpi->tile_thr_data);
-  aom_free(cpi->workers);
 
-  if (cpi->num_workers > 1) {
-    av1_loop_filter_dealloc(&cpi->lf_row_sync);
-    av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+  if (mt_info->num_workers > 1) {
+    av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+    av1_cdef_mt_dealloc(&mt_info->cdef_sync);
+#if !CONFIG_REALTIME_ONLY
+    int num_lr_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+    av1_loop_restoration_dealloc(&mt_info->lr_row_sync, num_lr_workers);
+    av1_gm_dealloc(&mt_info->gm_sync);
+    av1_tf_mt_dealloc(&mt_info->tf_sync);
+#endif
   }
 
+  av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+
+  av1_close_second_pass_log(cpi);
+
   dealloc_compressor_data(cpi);
 
-#if CONFIG_INTERNAL_STATS
-  aom_free(cpi->ssim_vars);
-  cpi->ssim_vars = NULL;
-#endif  // CONFIG_INTERNAL_STATS
+  av1_ext_part_delete(&cpi->ext_part_controller);
 
   av1_remove_common(cm);
-#if CONFIG_HTB_TRELLIS
-  if (cpi->sf.use_hash_based_trellis) hbt_destroy();
-#endif  // CONFIG_HTB_TRELLIS
-  av1_free_ref_frame_buffers(cm->buffer_pool);
 
   aom_free(cpi);
 
-#ifdef OUTPUT_YUV_SKINMAP
-  fclose(yuv_skinmap_file);
-#endif
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
+
+#ifdef OUTPUT_YUV_DENOISED
+  fclose(yuv_denoised_file);
+#endif
 }
 
 static void generate_psnr_packet(AV1_COMP *cpi) {
@@ -3681,7 +1626,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) {
   int i;
   PSNR_STATS psnr;
 #if CONFIG_AV1_HIGHBITDEPTH
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
   aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
                        bit_depth, in_bit_depth);
@@ -3694,8 +1639,20 @@ static void generate_psnr_packet(AV1_COMP *cpi) {
     pkt.data.psnr.sse[i] = psnr.sse[i];
     pkt.data.psnr.psnr[i] = psnr.psnr[i];
   }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+      (in_bit_depth < bit_depth)) {
+    for (i = 0; i < 4; ++i) {
+      pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i];
+      pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i];
+      pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i];
+    }
+  }
+#endif
+
   pkt.kind = AOM_CODEC_PSNR_PKT;
-  aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+  aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt);
 }
 
 int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
@@ -3729,45 +1686,6 @@ int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
   }
 }
 
-int av1_update_entropy(bool *ext_refresh_frame_context,
-                       bool *ext_refresh_frame_context_pending, bool update) {
-  *ext_refresh_frame_context = update;
-  *ext_refresh_frame_context_pending = 1;
-  return 0;
-}
-
-#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
-// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
-// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
-// not denoise the UV channels at this time. If ever we implement UV channel
-// denoising we will have to modify this.
-void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
-  uint8_t *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1, f);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
 #ifdef OUTPUT_YUV_REC
 void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
   uint8_t *src = s->y_buffer;
@@ -3826,145 +1744,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
 }
 #endif  // OUTPUT_YUV_REC
 
-#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
-static int recode_loop_test_global_motion(
-    WarpedMotionParams *const global_motion,
-    const int *const global_motion_used, int *const gm_params_cost) {
-  int i;
-  int recode = 0;
-  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    if (global_motion[i].wmtype != IDENTITY &&
-        global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
-            gm_params_cost[i]) {
-      global_motion[i] = default_warp_params;
-      assert(global_motion[i].wmtype == IDENTITY);
-      gm_params_cost[i] = 0;
-      recode = 1;
-      // TODO(sarahparker): The earlier condition for recoding here was:
-      // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
-      // similar to that back to speed up global motion?
-    }
-  }
-  return recode;
-}
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
-                            int maxq, int minq) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
-  int force_recode = 0;
-
-  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
-      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
-      (frame_is_kfgfarf &&
-       (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
-    // TODO(agrange) high_limit could be greater than the scale-down threshold.
-    if ((rc->projected_frame_size > high_limit && q < maxq) ||
-        (rc->projected_frame_size < low_limit && q > minq)) {
-      force_recode = 1;
-    } else if (cpi->oxcf.rc_mode == AOM_CQ) {
-      // Deal with frame undershoot and whether or not we are
-      // below the automatically set cq level.
-      if (q > oxcf->cq_level &&
-          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
-        force_recode = 1;
-      }
-    }
-  }
-  return force_recode;
-}
-
-static void scale_references(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MV_REFERENCE_FRAME ref_frame;
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
-    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      BufferPool *const pool = cm->buffer_pool;
-      const YV12_BUFFER_CONFIG *const ref =
-          get_ref_frame_yv12_buf(cm, ref_frame);
-
-      if (ref == NULL) {
-        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
-        continue;
-      }
-
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        // Replace the reference buffer with a copy having a thicker border,
-        // if the reference buffer is higher resolution than the current
-        // frame, and the border is thin.
-        if ((ref->y_crop_width > cm->width ||
-             ref->y_crop_height > cm->height) &&
-            ref->border < AOM_BORDER_IN_PIXELS) {
-          RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
-          if (aom_yv12_realloc_with_new_border(
-                  &ref_fb->buf, AOM_BORDER_IN_PIXELS,
-                  cm->features.byte_alignment, num_planes) != 0) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffer");
-          }
-        }
-        int force_scaling = 0;
-        RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
-        if (new_fb == NULL) {
-          const int new_fb_idx = get_free_fb(cm);
-          if (new_fb_idx == INVALID_IDX) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Unable to find free frame buffer");
-          }
-          force_scaling = 1;
-          new_fb = &pool->frame_bufs[new_fb_idx];
-        }
-
-        if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
-            new_fb->buf.y_crop_height != cm->height) {
-          if (aom_realloc_frame_buffer(
-                  &new_fb->buf, cm->width, cm->height,
-                  cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                  cm->features.byte_alignment, NULL, NULL, NULL)) {
-            if (force_scaling) {
-              // Release the reference acquired in the get_free_fb() call above.
-              --new_fb->ref_count;
-            }
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffer");
-          }
-          av1_resize_and_extend_frame(
-              ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
-          cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
-        }
-      } else {
-        RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
-        buf->buf.y_crop_width = ref->y_crop_width;
-        buf->buf.y_crop_height = ref->y_crop_height;
-        cpi->scaled_ref_buf[ref_frame - 1] = buf;
-        ++buf->ref_count;
-      }
-    } else {
-      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
-    }
-  }
-}
-
-static void release_scaled_references(AV1_COMP *cpi) {
-  // TODO(isbs): only refresh the necessary frames, rather than all of them
-  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
-    if (buf != NULL) {
-      --buf->ref_count;
-      cpi->scaled_ref_buf[i] = NULL;
-    }
-  }
-}
-
-static void set_mv_search_params(AV1_COMP *cpi) {
+void av1_set_mv_search_params(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
   const int max_mv_def = AOMMAX(cm->width, cm->height);
@@ -3978,37 +1758,57 @@ static void set_mv_search_params(AV1_COMP *cpi) {
       // after a key/intra-only frame.
       mv_search_params->max_mv_magnitude = max_mv_def;
     } else {
-      // Use cpi->max_mv_magnitude == -1 to exclude first pass case.
-      if (cm->show_frame && mv_search_params->max_mv_magnitude != -1) {
+      // Use adaptive mv steps based on previous frame stats for show frames and
+      // internal arfs.
+      FRAME_UPDATE_TYPE cur_update_type =
+          cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+      int use_auto_mv_step =
+          (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) &&
+          mv_search_params->max_mv_magnitude != -1 &&
+          cpi->sf.mv_sf.auto_mv_step_size >= 2;
+      if (use_auto_mv_step) {
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
         mv_search_params->mv_step_param = av1_init_search_range(
             AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude));
       }
-      mv_search_params->max_mv_magnitude = -1;
+      // Reset max_mv_magnitude based on update flag.
+      if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1;
     }
   }
 }
 
-void av1_set_screen_content_options(const AV1_COMP *cpi,
-                                    FeatureFlags *features) {
+void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
   const AV1_COMMON *const cm = &cpi->common;
 
-  if (cm->seq_params.force_screen_content_tools != 2) {
+  if (cm->seq_params->force_screen_content_tools != 2) {
     features->allow_screen_content_tools = features->allow_intrabc =
-        cm->seq_params.force_screen_content_tools;
+        cm->seq_params->force_screen_content_tools;
+    return;
+  }
+
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 0 : 1;
+    cpi->is_screen_content_type = 1;
+    cpi->use_screen_content_tools = 1;
     return;
   }
 
   if (cpi->oxcf.mode == REALTIME) {
-    assert(cm->seq_params.reduced_still_picture_hdr);
     features->allow_screen_content_tools = features->allow_intrabc = 0;
     return;
   }
 
-  if (cpi->oxcf.content == AOM_CONTENT_SCREEN) {
-    features->allow_screen_content_tools = features->allow_intrabc = 1;
+  // Screen content tools are not evaluated in non-RD encoding mode unless
+  // content type is not set explicitly, i.e., when
+  // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1
+  // and hybrid_intra_pickmode = 0. Hence, screen content detection is
+  // disabled.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      !cpi->sf.rt_sf.hybrid_intra_pickmode) {
+    features->allow_screen_content_tools = features->allow_intrabc = 0;
     return;
   }
 
@@ -4020,7 +1820,7 @@ void av1_set_screen_content_options(const AV1_COMP *cpi,
   const int stride = cpi->unfiltered_source->y_stride;
   const int width = cpi->unfiltered_source->y_width;
   const int height = cpi->unfiltered_source->y_height;
-  const int bd = cm->seq_params.bit_depth;
+  const int bd = cm->seq_params->bit_depth;
   const int blk_w = 16;
   const int blk_h = 16;
   // These threshold values are selected experimentally.
@@ -4034,12 +1834,14 @@ void av1_set_screen_content_options(const AV1_COMP *cpi,
 
   for (int r = 0; r + blk_h <= height; r += blk_h) {
     for (int c = 0; c + blk_w <= width; c += blk_w) {
-      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+      int count_buf[1 << 8];  // Maximum (1 << 8) bins for hbd path.
       const uint8_t *const this_src = src + r * stride + c;
-      const int n_colors =
-          use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
-                                            count_buf)
-                  : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+      int n_colors;
+      if (use_hbd)
+        av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+                                count_buf, &n_colors, NULL);
+      else
+        av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
       if (n_colors > 1 && n_colors <= color_thresh) {
         ++counts_1;
         struct buf_2d buf;
@@ -4061,221 +1863,73 @@ void av1_set_screen_content_options(const AV1_COMP *cpi,
   // requires that the block has high variance.
   features->allow_intrabc = features->allow_screen_content_tools &&
                             counts_2 * blk_h * blk_w * 12 > width * height;
-}
-
-static void set_size_independent_vars(AV1_COMP *cpi) {
-  int i;
-  AV1_COMMON *const cm = &cpi->common;
-  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    cm->global_motion[i] = default_warp_params;
-  }
-  cpi->gm_info.search_done = 0;
-
-  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
-  av1_set_rd_speed_thresholds(cpi);
-  cm->features.interp_filter = SWITCHABLE;
-  cm->features.switchable_motion_mode = 1;
-}
-
-#if !CONFIG_REALTIME_ONLY
-double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
-                                           int frame_count) {
-  double factor = sqrt((double)frame_count);
-  factor = AOMMIN(factor, max_factor);
-  factor = AOMMAX(factor, min_factor);
-  factor = (200.0 + 10.0 * factor);
-  return factor;
-}
-
-static int get_gfu_boost_from_r0_lap(double min_factor, double max_factor,
-                                     double r0, int frames_to_key) {
-  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
-                                                      frames_to_key);
-  const int boost = (int)rint(factor / r0);
-  return boost;
-}
-
-double av1_get_kf_boost_projection_factor(int frame_count) {
-  double factor = sqrt((double)frame_count);
-  factor = AOMMIN(factor, 10.0);
-  factor = AOMMAX(factor, 4.0);
-  factor = (75.0 + 14.0 * factor);
-  return factor;
-}
-
-static int get_kf_boost_from_r0(double r0, int frames_to_key) {
-  double factor = av1_get_kf_boost_projection_factor(frames_to_key);
-  const int boost = (int)rint(factor / r0);
-  return boost;
-}
-#endif
-
-#define MIN_BOOST_COMBINE_FACTOR 4.0
-#define MAX_BOOST_COMBINE_FACTOR 12.0
-int combine_prior_with_tpl_boost(double min_factor, double max_factor,
-                                 int prior_boost, int tpl_boost,
-                                 int frames_to_key) {
-  double factor = sqrt((double)frames_to_key);
-  double range = max_factor - min_factor;
-  factor = AOMMIN(factor, max_factor);
-  factor = AOMMAX(factor, min_factor);
-  factor -= min_factor;
-  int boost =
-      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
-  return boost;
-}
-
-#if !CONFIG_REALTIME_ONLY
-static void process_tpl_stats_frame(AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  AV1_COMMON *const cm = &cpi->common;
-
-  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
-
-  const int tpl_idx = gf_group->index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-
-  if (tpl_frame->is_valid) {
-    int tpl_stride = tpl_frame->stride;
-    int64_t intra_cost_base = 0;
-    int64_t mc_dep_cost_base = 0;
-    int64_t mc_saved_base = 0;
-    int64_t mc_count_base = 0;
-    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-
-    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
-      for (int col = 0; col < mi_cols_sr; col += step) {
-        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
-            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-        int64_t mc_dep_delta =
-            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                   this_stats->mc_dep_dist);
-        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
-        mc_dep_cost_base +=
-            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-        mc_count_base += this_stats->mc_count;
-        mc_saved_base += this_stats->mc_saved;
-      }
-    }
-
-    if (mc_dep_cost_base == 0) {
-      tpl_frame->is_valid = 0;
-    } else {
-      aom_clear_system_state();
-      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
-      if (is_frame_arf_and_tpl_eligible(gf_group)) {
-        cpi->rd.arf_r0 = cpi->rd.r0;
-        if (cpi->lap_enabled) {
-          double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval);
-          const int gfu_boost = get_gfu_boost_from_r0_lap(
-              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.arf_r0,
-              cpi->rc.num_stats_required_for_gfu_boost);
-          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
-          //        gfu_boost);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
-              min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost,
-              gfu_boost, cpi->rc.num_stats_used_for_gfu_boost);
-        } else {
-          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
-              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-              cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
-        }
-      } else if (frame_is_intra_only(cm)) {
-        // TODO(debargha): Turn off q adjustment for kf temporarily to
-        // reduce impact on speed of encoding. Need to investigate how
-        // to mitigate the issue.
-        if (cpi->oxcf.rc_mode == AOM_Q) {
-          const int kf_boost =
-              get_kf_boost_from_r0(cpi->rd.r0, cpi->rc.frames_to_key);
-          if (cpi->lap_enabled) {
-            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
-                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-                cpi->rc.kf_boost, kf_boost,
-                cpi->rc.num_stats_used_for_kf_boost);
-          } else {
-            cpi->rc.kf_boost = combine_prior_with_tpl_boost(
-                MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-                cpi->rc.kf_boost, kf_boost, cpi->rc.frames_to_key);
-          }
-        }
-      }
-      cpi->rd.mc_count_base = (double)mc_count_base /
-                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-      cpi->rd.mc_saved_base = (double)mc_saved_base /
-                              (cm->mi_params.mi_rows * cm->mi_params.mi_cols);
-      aom_clear_system_state();
-    }
-  }
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
-static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
-                                    int *top_index) {
-  AV1_COMMON *const cm = &cpi->common;
-
-  // Setup variables that depend on the dimensions of the frame.
-  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
-
-#if !CONFIG_REALTIME_ONLY
-  if (cpi->oxcf.enable_tpl_model && is_frame_tpl_eligible(cpi)) {
-    process_tpl_stats_frame(cpi);
-    av1_tpl_rdmult_setup(cpi);
-  }
-#endif
-
-  // Decide q and q bounds.
-  *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height,
-                                cpi->gf_group.index, bottom_index, top_index);
-
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed in the second pass of a two pass encode, as it requires
-  // lagged coding, and if the relevant speed feature flag is set.
-  if (is_stat_consumption_stage_twopass(cpi) &&
-      cpi->sf.hl_sf.static_segmentation)
-    configure_static_seg_features(cpi);
-}
+  cpi->use_screen_content_tools = features->allow_screen_content_tools;
+  cpi->is_screen_content_type =
+      features->allow_intrabc ||
+      (counts_1 * blk_h * blk_w * 10 > width * height * 4 &&
+       counts_2 * blk_h * blk_w * 30 > width * height);
+}
+
+// Function pointer to search site config initialization
+// of different search method functions.
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+                                            int level);
+
+av1_init_search_site_config
+    av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
+      av1_init_dsmotion_compensation,     av1_init_motion_compensation_nstep,
+      av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation,
+      av1_init_motion_compensation_hex,   av1_init_motion_compensation_bigdia,
+      av1_init_motion_compensation_square
+    };
 
 static void init_motion_estimation(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
-  const int y_stride = cpi->scaled_source.y_stride;
-  const int y_stride_src =
-      ((cpi->oxcf.width != cm->width || cpi->oxcf.height != cm->height) ||
-       av1_superres_scaled(cm))
-          ? y_stride
-          : cpi->lookahead->buf->img.y_stride;
-  int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride
-                                           : cpi->scaled_source.y_stride;
-
-  // Update if ss_cfg is uninitialized or the current frame has a new stride
+  const int aligned_width = (cm->width + 7) & ~7;
+  const int y_stride =
+      aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels);
+  const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||
+                             cpi->oxcf.frm_dim_cfg.height != cm->height) ||
+                            av1_superres_scaled(cm))
+                               ? y_stride
+                               : cpi->ppi->lookahead->buf->img.y_stride;
+  int fpf_y_stride =
+      cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride;
+
+  // Update if search_site_cfg is uninitialized or the current frame has a new
+  // stride
   const int should_update =
-      !mv_search_params->ss_cfg[SS_CFG_SRC].stride ||
-      !mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD].stride ||
-      (y_stride != mv_search_params->ss_cfg[SS_CFG_SRC].stride);
+      !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride ||
+      !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride ||
+      (y_stride !=
+       mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride);
 
   if (!should_update) {
     return;
   }
 
-  if (cpi->sf.mv_sf.search_method == DIAMOND) {
-    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
-                                   y_stride);
-    av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
-                                   y_stride_src);
-  } else {
-    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC],
-                                  y_stride);
-    av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD],
-                                  y_stride_src);
+  // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS.
+  for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+    const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0;
+    av1_init_motion_compensation[i](
+        &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level);
+    av1_init_motion_compensation[i](
+        &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src,
+        level);
+  }
+
+  // First pass search site config initialization.
+  av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+                      fpf_y_stride);
+  for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+    memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i],
+           &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+           sizeof(search_site_config));
   }
-  av1_init_motion_fpf(&mv_search_params->ss_cfg[SS_CFG_FPF], fpf_y_stride);
 }
 
+#if !CONFIG_REALTIME_ONLY
 #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
 static void set_restoration_unit_size(int width, int height, int sx, int sy,
                                       RestorationInfo *rst) {
@@ -4296,6 +1950,7 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy,
   rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s;
   rst[2].restoration_unit_size = rst[1].restoration_unit_size;
 }
+#endif
 
 static void init_ref_frame_bufs(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -4313,9 +1968,11 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) {
 void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                              int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
 
-  if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth ||
+  if (!initial_dimensions->width ||
+      seq_params->use_highbitdepth != use_highbitdepth ||
       seq_params->subsampling_x != subsampling_x ||
       seq_params->subsampling_y != subsampling_y) {
     seq_params->subsampling_x = subsampling_x;
@@ -4326,48 +1983,75 @@ void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
     av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
 
     if (!is_stat_generation_stage(cpi)) {
-      alloc_altref_frame_buffer(cpi);
-      alloc_util_frame_buffers(cpi);
+#if !CONFIG_REALTIME_ONLY
+      av1_tf_info_alloc(&cpi->ppi->tf_info, cpi);
+#endif  // !CONFIG_REALTIME_ONLY
     }
     init_ref_frame_bufs(cpi);
 
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+    initial_dimensions->width = cm->width;
+    initial_dimensions->height = cm->height;
     cpi->initial_mbs = cm->mi_params.MBs;
   }
 }
 
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      !cpi->denoiser.frame_buffer_initialized) {
+    if (av1_denoiser_alloc(
+            cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc,
+            cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+            cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+            cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate denoiser");
+  }
+}
+#endif
+
 // Returns 1 if the assigned width or height was <= 0.
 int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth,
-                          cm->seq_params.subsampling_x,
-                          cm->seq_params.subsampling_y);
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+  av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth,
+                          cm->seq_params->subsampling_x,
+                          cm->seq_params->subsampling_y);
 
   if (width <= 0 || height <= 0) return 1;
 
   cm->width = width;
   cm->height = height;
 
-  if (cpi->initial_width && cpi->initial_height &&
-      (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+  if (initial_dimensions->width && initial_dimensions->height &&
+      (cm->width > initial_dimensions->width ||
+       cm->height > initial_dimensions->height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
+    av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+    av1_free_sms_tree(&cpi->td);
+    av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+    cpi->td.firstpass_ctx = NULL;
+    alloc_mb_mode_info_buffers(cpi);
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
-    cpi->initial_width = cpi->initial_height = 0;
+    initial_dimensions->width = initial_dimensions->height = 0;
   }
-  update_frame_size(cpi);
+  alloc_mb_mode_info_buffers(cpi);
+  av1_update_frame_size(cpi);
 
   return 0;
 }
 
 void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int ref_frame;
@@ -4378,8 +2062,16 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
     // Recalculate 'all_lossless' in case super-resolution was (un)selected.
     cm->features.all_lossless =
         cm->features.coded_lossless && !av1_superres_scaled(cm);
+
+    av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    // Reset the denoiser on the resized frame.
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      av1_denoiser_free(&(cpi->denoiser));
+      setup_denoiser_buffer(cpi);
+    }
+#endif
   }
-  set_mv_search_params(cpi);
 
   if (is_stat_consumption_stage(cpi)) {
     av1_set_target_rate(cpi, cm->width, cm->height);
@@ -4396,7 +2088,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
     if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
                                         cm->mi_params.mi_cols,
                                         av1_num_planes(cm)))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
   }
 
@@ -4405,20 +2097,30 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
-          NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+          NULL, cpi->oxcf.tool_cfg.enable_global_motion, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
-  const int frame_width = cm->superres_upscaled_width;
-  const int frame_height = cm->superres_upscaled_height;
-  set_restoration_unit_size(frame_width, frame_height,
-                            seq_params->subsampling_x,
-                            seq_params->subsampling_y, cm->rst_info);
-  for (int i = 0; i < num_planes; ++i)
-    cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+  if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+  if (is_restoration_used(cm)) {
+    const int frame_width = cm->superres_upscaled_width;
+    const int frame_height = cm->superres_upscaled_height;
+    set_restoration_unit_size(frame_width, frame_height,
+                              seq_params->subsampling_x,
+                              seq_params->subsampling_y, cm->rst_info);
+    for (int i = 0; i < num_planes; ++i)
+      cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+    av1_alloc_restoration_buffers(cm);
+    // Store the allocated restoration buffers in MT object.
+    if (cpi->ppi->p_mt_info.num_workers > 1) {
+      av1_init_lr_mt_buffers(cpi);
+    }
+  }
+#endif
 
-  av1_alloc_restoration_buffers(cm);
-  if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -4438,382 +2140,42 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
-static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
-  // Choose an arbitrary random number
-  static unsigned int seed = 56789;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
-  uint8_t new_denom = SCALE_NUMERATOR;
-
-  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
-  switch (oxcf->resize_mode) {
-    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
-    case RESIZE_FIXED:
-      if (cpi->common.current_frame.frame_type == KEY_FRAME)
-        new_denom = oxcf->resize_kf_scale_denominator;
-      else
-        new_denom = oxcf->resize_scale_denominator;
-      break;
-    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
-    default: assert(0);
-  }
-  return new_denom;
-}
-
-#if CONFIG_SUPERRES_IN_RECODE
-static int superres_in_recode_allowed(const AV1_COMP *const cpi) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  // Empirically found to not be beneficial for AOM_Q mode and images coding.
-  return oxcf->superres_mode == SUPERRES_AUTO &&
-         (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) &&
-         cpi->rc.frames_to_key > 1;
-}
-#endif  // CONFIG_SUPERRES_IN_RECODE
-
-#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
-#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
-#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
-#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
-
-static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
-                                      const RATE_CONTROL *rc) {
-  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
-  // level.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
-    return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
-  } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
-    if (rc->frames_to_key <= 1)
-      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
-    else
-      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
-  } else {
-    assert(0);
-  }
-  return 0;
-}
-
-static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
-                                                     double threshq,
-                                                     double threshp) {
-  const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
-  const double tq = threshq * q * q;
-  const double tp = threshp * energy[1];
-  const double thresh = AOMMIN(tq, tp);
-  int k;
-  for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
-    if (energy[k - 1] > thresh) break;
-  }
-  return 3 * SCALE_NUMERATOR - k;
-}
-
-static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
-                                             int sr_kf, int sr_arf) {
-  // Use superres for Key-frames and Alt-ref frames only.
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (gf_group->update_type[gf_group->index] != KF_UPDATE &&
-      gf_group->update_type[gf_group->index] != ARF_UPDATE) {
-    return SCALE_NUMERATOR;
-  }
-  if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) {
-    return SCALE_NUMERATOR;
-  }
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) {
-    return SCALE_NUMERATOR;
-  }
-
-  double energy[16];
-  analyze_hor_freq(cpi, energy);
-
-  const double energy_by_q2_thresh =
-      get_energy_by_q2_thresh(gf_group, &cpi->rc);
-  int denom = get_superres_denom_from_qindex_energy(
-      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
-  /*
-  printf("\nenergy = [");
-  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
-  printf("]\n");
-  printf("boost = %d\n",
-         (gf_group->update_type[gf_group->index] == KF_UPDATE)
-             ? cpi->rc.kf_boost
-             : cpi->rc.gfu_boost);
-  printf("denom = %d\n", denom);
-  */
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi)) {
-    assert(cpi->superres_mode != SUPERRES_NONE);
-    // Force superres to be tried in the recode loop, as full-res is also going
-    // to be tried anyway.
-    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
-  }
-#endif  // CONFIG_SUPERRES_IN_RECODE
-  return denom;
-}
-
-// If true, SUPERRES_AUTO mode will exhaustively search over all superres
-// denominators for all frames (except overlay and internal overlay frames).
-#define SUPERRES_RECODE_ALL_RATIOS 0
-
-static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
-  // Choose an arbitrary random number
-  static unsigned int seed = 34567;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
-  uint8_t new_denom = SCALE_NUMERATOR;
-
-  // Make sure that superres mode of the frame is consistent with the
-  // sequence-level flag.
-  assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE,
-                 cpi->common.seq_params.enable_superres));
-  assert(IMPLIES(!cpi->common.seq_params.enable_superres,
-                 oxcf->superres_mode == SUPERRES_NONE));
-  // Make sure that superres mode for current encoding is consistent with user
-  // provided superres mode.
-  assert(IMPLIES(oxcf->superres_mode != SUPERRES_AUTO,
-                 cpi->superres_mode == oxcf->superres_mode));
-
-  // Note: we must look at the current superres_mode to be tried in 'cpi' here,
-  // not the user given mode in 'oxcf'.
-  switch (cpi->superres_mode) {
-    case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
-    case SUPERRES_FIXED:
-      if (cpi->common.current_frame.frame_type == KEY_FRAME)
-        new_denom = oxcf->superres_kf_scale_denominator;
-      else
-        new_denom = oxcf->superres_scale_denominator;
-      break;
-    case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
-    case SUPERRES_QTHRESH: {
-      // Do not use superres when screen content tools are used.
-      if (cpi->common.features.allow_screen_content_tools) break;
-      if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
-        av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
-
-      // Now decide the use of superres based on 'q'.
-      int bottom_index, top_index;
-      const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
-          &bottom_index, &top_index);
-
-      const int qthresh = (frame_is_intra_only(&cpi->common))
-                              ? oxcf->superres_kf_qthresh
-                              : oxcf->superres_qthresh;
-      if (q <= qthresh) {
-        new_denom = SCALE_NUMERATOR;
-      } else {
-        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
-      }
-      break;
-    }
-    case SUPERRES_AUTO: {
-      // Do not use superres when screen content tools are used.
-      if (cpi->common.features.allow_screen_content_tools) break;
-      if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ)
-        av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
-
-      // Now decide the use of superres based on 'q'.
-      int bottom_index, top_index;
-      const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index,
-          &bottom_index, &top_index);
-
-      const int qthresh = 128;
-      if (q <= qthresh) {
-        new_denom = SCALE_NUMERATOR;
-      } else {
-#if SUPERRES_RECODE_ALL_RATIOS
-        if (cpi->common.current_frame.frame_type == KEY_FRAME)
-          new_denom = oxcf->superres_kf_scale_denominator;
-        else
-          new_denom = oxcf->superres_scale_denominator;
-#else
-        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-      }
-      break;
-    }
-    default: assert(0);
-  }
-  return new_denom;
-}
-
-static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
-  return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
-}
-
-static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
-  // Only need to check the width, as scaling is horizontal only.
-  (void)oheight;
-  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
-}
-
-static int validate_size_scales(RESIZE_MODE resize_mode,
-                                SUPERRES_MODE superres_mode, int owidth,
-                                int oheight, size_params_type *rsz) {
-  if (dimensions_are_ok(owidth, oheight, rsz)) {  // Nothing to do.
-    return 1;
-  }
-
-  // Calculate current resize scale.
-  int resize_denom =
-      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
-             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
-
-  if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
-    // Alter superres scale as needed to enforce conformity.
-    rsz->superres_denom =
-        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
-    if (!dimensions_are_ok(owidth, oheight, rsz)) {
-      if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
-    }
-  } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) {
-    // Alter resize scale as needed to enforce conformity.
-    resize_denom =
-        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
-    rsz->resize_width = owidth;
-    rsz->resize_height = oheight;
-    av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                              resize_denom);
-    if (!dimensions_are_ok(owidth, oheight, rsz)) {
-      if (resize_denom > SCALE_NUMERATOR) {
-        --resize_denom;
-        rsz->resize_width = owidth;
-        rsz->resize_height = oheight;
-        av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                                  resize_denom);
-      }
-    }
-  } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
-    // Alter both resize and superres scales as needed to enforce conformity.
-    do {
-      if (resize_denom > rsz->superres_denom)
-        --resize_denom;
-      else
-        --rsz->superres_denom;
-      rsz->resize_width = owidth;
-      rsz->resize_height = oheight;
-      av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
-                                resize_denom);
-    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
-             (resize_denom > SCALE_NUMERATOR ||
-              rsz->superres_denom > SCALE_NUMERATOR));
-  } else {  // We are allowed to alter neither resize scale nor superres
-            // scale.
-    return 0;
-  }
-  return dimensions_are_ok(owidth, oheight, rsz);
-}
-
-// Calculates resize and superres params for next frame
-static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
-  size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
-  int resize_denom = SCALE_NUMERATOR;
-  if (has_no_stats_stage(cpi) && cpi->use_svc &&
-      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
-    rsz.resize_width = cpi->common.width;
-    rsz.resize_height = cpi->common.height;
-    return rsz;
-  }
-  if (is_stat_generation_stage(cpi)) return rsz;
-  if (resize_pending_params->width && resize_pending_params->height) {
-    rsz.resize_width = resize_pending_params->width;
-    rsz.resize_height = resize_pending_params->height;
-    resize_pending_params->width = resize_pending_params->height = 0;
-  } else {
-    resize_denom = calculate_next_resize_scale(cpi);
-    rsz.resize_width = oxcf->width;
-    rsz.resize_height = oxcf->height;
-    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
-                              resize_denom);
-  }
-  rsz.superres_denom = calculate_next_superres_scale(cpi);
-  if (!validate_size_scales(oxcf->resize_mode, cpi->superres_mode, oxcf->width,
-                            oxcf->height, &rsz))
-    assert(0 && "Invalid scale parameters");
-  return rsz;
-}
-
-static void setup_frame_size_from_params(AV1_COMP *cpi,
-                                         const size_params_type *rsz) {
-  int encode_width = rsz->resize_width;
-  int encode_height = rsz->resize_height;
-
-  AV1_COMMON *cm = &cpi->common;
-  cm->superres_upscaled_width = encode_width;
-  cm->superres_upscaled_height = encode_height;
-  cm->superres_scale_denominator = rsz->superres_denom;
-  av1_calculate_scaled_superres_size(&encode_width, &encode_height,
-                                     rsz->superres_denom);
-  av1_set_frame_size(cpi, encode_width, encode_height);
-}
-
-void av1_setup_frame_size(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  // Reset superres params from previous frame.
-  cm->superres_scale_denominator = SCALE_NUMERATOR;
-  const size_params_type rsz = calculate_next_size_params(cpi);
-  setup_frame_size_from_params(cpi, &rsz);
-
-  assert(av1_is_min_tile_width_satisfied(cm));
-}
-
-static void superres_post_encode(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  if (!av1_superres_scaled(cm)) return;
-
-  assert(cpi->oxcf.enable_superres);
-  assert(!is_lossless_requested(&cpi->oxcf));
-  assert(!cm->features.all_lossless);
-
-  av1_superres_upscale(cm, NULL);
-
-  // If regular resizing is occurring the source will need to be downscaled to
-  // match the upscaled superres resolution. Otherwise the original source is
-  // used.
-  if (!av1_resize_scaled(cm)) {
-    cpi->source = cpi->unscaled_source;
-    if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
-  } else {
-    assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
-    assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
-    // Do downscale. cm->(width|height) has been updated by
-    // av1_superres_upscale
-    if (aom_realloc_frame_buffer(
-            &cpi->scaled_source, cm->superres_upscaled_width,
-            cm->superres_upscaled_height, cm->seq_params.subsampling_x,
-            cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL,
-            NULL))
-      aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
-          "Failed to reallocate scaled source buffer for superres");
-    assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
-    assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
-    av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
-                                (int)cm->seq_params.bit_depth, num_planes);
-    cpi->source = &cpi->scaled_source;
-  }
-}
-
+/*!\brief Select and apply cdef filters and switchable restoration filters
+ *
+ * \ingroup high_level_algo
+ */
 static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
                                    MACROBLOCKD *xd, int use_restoration,
                                    int use_cdef) {
+#if !CONFIG_REALTIME_ONLY
   if (use_restoration)
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+#else
+  (void)use_restoration;
+#endif
 
   if (use_cdef) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, cdef_time);
 #endif
+    const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
     // Find CDEF parameters
-    av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd,
-                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult);
+    av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
+                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult,
+                    cpi->sf.rt_sf.skip_cdef_sb, cpi->rc.frames_since_key,
+                    cpi->oxcf.tool_cfg.cdef_control,
+                    cpi->svc.non_reference_frame);
 
     // Apply the filter
-    av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
+    if (!cpi->svc.non_reference_frame) {
+      if (num_workers > 1) {
+        av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
+                          cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
+                          num_workers, av1_cdef_init_fb_row_mt);
+      } else {
+        av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+      }
+    }
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, cdef_time);
 #endif
@@ -4824,21 +2186,24 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
     cm->cdef_info.cdef_uv_strengths[0] = 0;
   }
 
-  superres_post_encode(cpi);
+  av1_superres_post_encode(cpi);
 
+#if !CONFIG_REALTIME_ONLY
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, loop_restoration_time);
 #endif
   if (use_restoration) {
+    MultiThreadInfo *const mt_info = &cpi->mt_info;
+    const int num_workers = mt_info->num_mod_workers[MOD_LR];
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
     av1_pick_filter_restoration(cpi->source, cpi);
     if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-      if (cpi->num_workers > 1)
-        av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0,
-                                             cpi->workers, cpi->num_workers,
-                                             &cpi->lr_row_sync, &cpi->lr_ctxt);
+      if (num_workers > 1)
+        av1_loop_restoration_filter_frame_mt(
+            &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
+            &mt_info->lr_row_sync, &cpi->lr_ctxt);
       else
         av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
                                           &cpi->lr_ctxt);
@@ -4851,22 +2216,39 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, loop_restoration_time);
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 }
 
+/*!\brief Select and apply in-loop deblocking filters, cdef filters, and
+ * restoration filters
+ *
+ * \ingroup high_level_algo
+ */
 static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int num_workers = mt_info->num_mod_workers[MOD_LPF];
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
 
-  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
                  cm->features.coded_lossless && cm->features.all_lossless));
 
   const int use_loopfilter =
       !cm->features.coded_lossless && !cm->tiles.large_scale;
-  const int use_cdef = cm->seq_params.enable_cdef &&
+  const int use_cdef = cm->seq_params->enable_cdef &&
                        !cm->features.coded_lossless && !cm->tiles.large_scale;
-  const int use_restoration = cm->seq_params.enable_restoration &&
-                              !cm->features.all_lossless &&
-                              !cm->tiles.large_scale;
+  const int use_restoration = is_restoration_used(cm);
+  // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+  // lpf_opt_level is set to 1 if transform size search depth in inter blocks
+  // is limited to one as quad loop filtering assumes that all the transform
+  // blocks within a 16x8/8x16/16x16 prediction block are of the same size.
+  // lpf_opt_level = 2 : Filters both chroma planes together, in addition to
+  // enabling dual/quad loop-filtering. This is enabled when lpf pick method
+  // is LPF_PICK_FROM_Q as u and v plane filter levels are equal.
+  int lpf_opt_level = 0;
+  if (is_inter_tx_size_search_level_one(&cpi->sf.tx_sf)) {
+    lpf_opt_level = (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
+  }
 
   struct loopfilter *lf = &cm->lf;
 
@@ -4874,27 +2256,17 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   start_timing(cpi, loop_filter_time);
 #endif
   if (use_loopfilter) {
-    aom_clear_system_state();
     av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
   } else {
     lf->filter_level[0] = 0;
     lf->filter_level[1] = 0;
   }
 
-  if (lf->filter_level[0] || lf->filter_level[1]) {
-    if (cpi->num_workers > 1)
-      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
-#if CONFIG_LPF_MASK
-                               0,
-#endif
-                               cpi->workers, cpi->num_workers,
-                               &cpi->lf_row_sync);
-    else
-      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
-#if CONFIG_LPF_MASK
-                            0,
-#endif
-                            0, num_planes, 0);
+  if ((lf->filter_level[0] || lf->filter_level[1]) &&
+      !cpi->svc.non_reference_frame) {
+    av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+                             mt_info->workers, num_workers,
+                             &mt_info->lf_row_sync, lpf_opt_level);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, loop_filter_time);
@@ -4903,446 +2275,177 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
 }
 
-static void fix_interp_filter(InterpFilter *const interp_filter,
-                              const FRAME_COUNTS *const counts) {
-  if (*interp_filter == SWITCHABLE) {
-    // Check to see if only one of the filters is actually used
-    int count[SWITCHABLE_FILTERS] = { 0 };
-    int num_filters_used = 0;
-    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
-      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
-        count[i] += counts->switchable_interp[j][i];
-      num_filters_used += (count[i] > 0);
-    }
-    if (num_filters_used == 1) {
-      // Only one filter is used. So set the filter at frame level
-      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
-        if (count[i]) {
-          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
-          break;
-        }
-      }
-    }
-  }
-}
-
-static void finalize_encoded_frame(AV1_COMP *const cpi) {
+/*!\brief Encode a frame without the recode loop, usually used in one-pass
+ * encoding and realtime coding.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_without_recode(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  CurrentFrame *const current_frame = &cm->current_frame;
+  const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+  SVC *const svc = &cpi->svc;
+  const int resize_pending = is_frame_resize_pending(cpi);
+
+  int top_index = 0, bottom_index = 0, q = 0;
+  YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+  InterpFilter filter_scaler =
+      cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+                        : EIGHTTAP_SMOOTH;
+  int phase_scaler = cpi->ppi->use_svc
+                         ? svc->downsample_filter_phase[svc->spatial_layer_id]
+                         : 0;
 
-  if (!cm->seq_params.reduced_still_picture_hdr &&
-      encode_show_existing_frame(cm)) {
-    RefCntBuffer *const frame_to_show =
-        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
-
-    if (frame_to_show == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Buffer does not contain a reconstructed frame");
-    }
-    assert(frame_to_show->ref_count > 0);
-    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
-  }
-
-  if (!encode_show_existing_frame(cm) &&
-      cm->seq_params.film_grain_params_present &&
-      (cm->show_frame || cm->showable_frame)) {
-    // Copy the current frame's film grain params to the its corresponding
-    // RefCntBuffer slot.
-    cm->cur_frame->film_grain_params = cm->film_grain_params;
-
-    // We must update the parameters if this is not an INTER_FRAME
-    if (current_frame->frame_type != INTER_FRAME)
-      cm->cur_frame->film_grain_params.update_parameters = 1;
-
-    // Iterate the random seed for the next frame.
-    cm->film_grain_params.random_seed += 3381;
-    if (cm->film_grain_params.random_seed == 0)
-      cm->film_grain_params.random_seed = 7391;
-  }
-
-  // Initialise all tiles' contexts from the global frame context
-  for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
-    for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
-      const int tile_idx = tile_row * cm->tiles.cols + tile_col;
-      cpi->tile_data[tile_idx].tctx = *cm->fc;
+  set_size_independent_vars(cpi);
+  av1_setup_frame_size(cpi);
+  av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+  av1_set_mv_search_params(cpi);
+
+  if (!cpi->ppi->use_svc) {
+    phase_scaler = 8;
+    // 2:1 scaling.
+    if ((cm->width << 1) == unscaled->y_crop_width &&
+        (cm->height << 1) == unscaled->y_crop_height) {
+      filter_scaler = BILINEAR;
+      // For lower resolutions use eighttap_smooth.
+      if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == unscaled->y_crop_width &&
+               (cm->height << 2) == unscaled->y_crop_height) {
+      // 4:1 scaling.
+      filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+               (cm->height << 2) == 3 * unscaled->y_crop_height) {
+      // 4:3 scaling.
+      filter_scaler = EIGHTTAP_REGULAR;
     }
   }
 
-  fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
-}
-
-static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high,
-                                     int top_index, int bottom_index) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-  int q_regulated =
-      av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                        AOMMAX(q_high, top_index), cm->width, cm->height);
+  allocate_gradient_info_for_hog(cpi);
 
-  int retries = 0;
-  while (q_regulated < q_low && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-    q_regulated =
-        av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                          AOMMAX(q_high, top_index), cm->width, cm->height);
-    retries++;
-  }
-  return q_regulated;
-}
+  allocate_src_var_of_4x4_sub_block_buf(cpi);
 
-static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high,
-                                      int top_index, int bottom_index) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
+  const SPEED_FEATURES *sf = &cpi->sf;
+  if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
 
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                      top_index, cm->width, cm->height);
+  if (cm->current_frame.frame_type == KEY_FRAME ||
+      ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+    copy_frame_prob_info(cpi);
 
-  int retries = 0;
-  while (q_regulated > q_high && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    top_index, cm->width, cm->height);
-    retries++;
-  }
-  return q_regulated;
-}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame: \n");
+#endif
 
-// Called after encode_with_recode_loop() has just encoded a frame and packed
-// its bitstream.  This function works out whether we under- or over-shot
-// our bitrate target and adjusts q as appropriate.  Also decides whether
-// or not we should do another recode loop, indicated by *loop
-static void recode_loop_update_q(
-    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
-    int *const q_high, const int top_index, const int bottom_index,
-    int *const undershoot_seen, int *const overshoot_seen,
-    int *const low_cr_seen, const int loop_at_this_size) {
-  AV1_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
-  *loop = 0;
-
-  const int min_cr = cpi->oxcf.min_cr;
-  if (min_cr > 0) {
-    aom_clear_system_state();
-    const double compression_ratio =
-        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
-    const double target_cr = min_cr / 100.0;
-    if (compression_ratio < target_cr) {
-      *low_cr_seen = 1;
-      if (*q < rc->worst_quality) {
-        const double cr_ratio = target_cr / compression_ratio;
-        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
-        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
-        *q_low = AOMMAX(*q, *q_low);
-        *q_high = AOMMAX(*q, *q_high);
-        *loop = 1;
-      }
-    }
-    if (*low_cr_seen) return;
+#if CONFIG_TUNE_BUTTERAUGLI
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+    av1_setup_butteraugli_rdmult(cpi);
   }
-
-  if (cpi->oxcf.rc_mode == AOM_Q) return;
-
-  const int last_q = *q;
-  int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
-  av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
-                                   &frame_under_shoot_limit,
-                                   &frame_over_shoot_limit);
-  if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
-
-  if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced &&
-      rc->projected_frame_size < rc->max_frame_bandwidth) {
-    int64_t kf_err;
-    const int64_t high_err_target = cpi->ambient_err;
-    const int64_t low_err_target = cpi->ambient_err >> 1;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth) {
-      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    } else {
-      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    }
-#else
-    kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
 #endif
-    // Prevent possible divide by zero error below for perfect KF
-    kf_err += !kf_err;
-
-    // The key frame is not good enough or we can afford
-    // to make it better without undue risk of popping.
-    if ((kf_err > high_err_target &&
-         rc->projected_frame_size <= frame_over_shoot_limit) ||
-        (kf_err > low_err_target &&
-         rc->projected_frame_size <= frame_under_shoot_limit)) {
-      // Lower q_high
-      *q_high = AOMMAX(*q - 1, *q_low);
-
-      // Adjust Q
-      *q = (int)((*q * high_err_target) / kf_err);
-      *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
-    } else if (kf_err < low_err_target &&
-               rc->projected_frame_size >= frame_under_shoot_limit) {
-      // The key frame is much better than the previous frame
-      // Raise q_low
-      *q_low = AOMMIN(*q + 1, *q_high);
-
-      // Adjust Q
-      *q = (int)((*q * low_err_target) / kf_err);
-      *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
-    }
 
-    // Clamp Q to upper and lower limits:
-    *q = clamp(*q, *q_low, *q_high);
-    *loop = (*q != last_q);
-    return;
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+      false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
+  if (frame_is_intra_only(cm) || resize_pending != 0) {
+    memset(cpi->consec_zero_mv, 0,
+           ((cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2) *
+               sizeof(*cpi->consec_zero_mv));
   }
 
-  if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
-                       AOMMAX(*q_high, top_index), bottom_index)) {
-    // Is the projected frame size out of range and are we allowed
-    // to attempt to recode.
-
-    // Frame size out of permitted range:
-    // Update correction factor & compute new Q to try...
-    // Frame is too large
-    if (rc->projected_frame_size > rc->this_frame_target) {
-      // Special case if the projected size is > the max allowed.
-      if (*q == *q_high &&
-          rc->projected_frame_size >= rc->max_frame_bandwidth) {
-        const double q_val_high_current =
-            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
-        const double q_val_high_new =
-            q_val_high_current *
-            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
-        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
-                                  rc->best_quality, rc->worst_quality);
-      }
-
-      // Raise Qlow as to at least the current value
-      *q_low = AOMMIN(*q + 1, *q_high);
-
-      if (*undershoot_seen || loop_at_this_size > 2 ||
-          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
-        *q = (*q_high + *q_low + 1) / 2;
-      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
-        const int q_mid = (*q_high + *q_low + 1) / 2;
-        const int q_regulated = get_regulated_q_overshoot(
-            cpi, *q_low, *q_high, top_index, bottom_index);
-        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
-        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
-        *q = (q_mid + q_regulated + 1) / 2;
-      } else {
-        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
-                                       bottom_index);
-      }
-
-      *overshoot_seen = 1;
-    } else {
-      // Frame is too small
-      *q_high = AOMMAX(*q - 1, *q_low);
-
-      if (*overshoot_seen || loop_at_this_size > 2 ||
-          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-        *q = (*q_high + *q_low) / 2;
-      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
-        const int q_mid = (*q_high + *q_low) / 2;
-        const int q_regulated =
-            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
-        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
-        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
-        *q = (q_mid + q_regulated) / 2;
-
-        // Special case reset for qlow for constrained quality.
-        // This should only trigger where there is very substantial
-        // undershoot on a frame and the auto cq level is above
-        // the user passsed in value.
-        if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
-          *q_low = *q;
-        }
-      } else {
-        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
-
-        // Special case reset for qlow for constrained quality.
-        // This should only trigger where there is very substantial
-        // undershoot on a frame and the auto cq level is above
-        // the user passsed in value.
-        if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) {
-          *q_low = *q;
-        }
-      }
-
-      *undershoot_seen = 1;
-    }
-
-    // Clamp Q to upper and lower limits:
-    *q = clamp(*q, *q_low, *q_high);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+        phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
   }
 
-  *loop = (*q != last_q);
-}
-
-static int get_interp_filter_selected(const AV1_COMMON *const cm,
-                                      MV_REFERENCE_FRAME ref,
-                                      InterpFilter ifilter) {
-  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
-  if (buf == NULL) return 0;
-  return buf->interp_filter_selected[ifilter];
-}
-
-static uint16_t setup_interp_filter_search_mask(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  int ref_total[REF_FRAMES] = { 0 };
-  uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
-
-  if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
-    return mask;
-
-  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
-    for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-         ++ifilter) {
-      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
-    }
-  }
-  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
-                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
-                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
-
-  for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
-       ++ifilter) {
-    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
-    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
-      int filter_score =
-          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
-          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
-          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
-      if (filter_score < ref_total_total) {
-        DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
-        reset_interp_filter_allowed_mask(&mask, filt_type);
-      }
-    }
+  if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+    av1_update_noise_estimate(cpi);
   }
-  return mask;
-}
 
-#if !CONFIG_REALTIME_ONLY
-#define STRICT_PSNR_DIFF_THRESH 0.9
-// Encode key frame with/without screen content tools to determine whether
-// screen content tools should be enabled for this key frame group or not.
-// The first encoding is without screen content tools.
-// The second encoding is with screen content tools.
-// We compare the psnr and frame size to make the decision.
-static void screen_content_tools_determination(
-    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
-    const int allow_intrabc_orig_decision,
-    const int is_screen_content_type_orig_decision, const int pass,
-    int *projected_size_pass, PSNR_STATS *psnr) {
-  AV1_COMMON *const cm = &cpi->common;
-  FeatureFlags *const features = &cm->features;
-  projected_size_pass[pass] = cpi->rc.projected_frame_size;
-#if CONFIG_AV1_HIGHBITDEPTH
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
-  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
-  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
-                       bit_depth, in_bit_depth);
-#else
-  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
+    av1_denoiser_reset_on_first_frame(cpi);
 #endif
-  if (pass != 1) return;
 
-  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
-  const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH;
-  if (is_sc_encoding_much_better) {
-    // Use screen content tools, if we get coding gain.
-    features->allow_screen_content_tools = 1;
-    features->allow_intrabc = cpi->intrabc_used;
-    cpi->is_screen_content_type = 1;
-  } else {
-    // Use original screen content decision.
-    features->allow_screen_content_tools =
-        allow_screen_content_tools_orig_decision;
-    features->allow_intrabc = allow_intrabc_orig_decision;
-    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
-  }
-}
-
-// Set some encoding parameters to make the encoding process fast.
-// A fixed block partition size, and a large q is used.
-static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
-                                                   const int pass) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (pass == 0) {
-    // In the first pass, encode without screen content tools.
-    // Use a high q, and a fixed block size for fast encoding.
-    cm->features.allow_screen_content_tools = 0;
-    cm->features.allow_intrabc = 0;
-    cpi->is_screen_content_type = 0;
-    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
-    cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
-    return;
+  // For 1 spatial layer encoding: if the (non-LAST) reference has different
+  // resolution from the source then disable that reference. This is to avoid
+  // significant increase in encode time from scaling the references in
+  // av1_scale_references. Note GOLDEN is forced to update on the (first/tigger)
+  // resized frame and ALTREF will be refreshed ~4 frames later, so both
+  // references become available again after few frames.
+  if (svc->number_spatial_layers == 1) {
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+        cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+    }
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+        cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+    }
   }
-  assert(pass == 1);
-  // In the second pass, encode with screen content tools.
-  // Use a high q, and a fixed block size for fast encoding.
-  cm->features.allow_screen_content_tools = 1;
-  // TODO(chengchen): turn intrabc on could lead to data race issue.
-  // cm->allow_intrabc = 1;
-  cpi->is_screen_content_type = 1;
-  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
-  cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
-}
 
-// Determines whether to use screen content tools for the key frame group.
-// This function modifies "cm->features.allow_screen_content_tools",
-// "cm->features.allow_intrabc" and "cpi->is_screen_content_type".
-static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
-  AV1_COMMON *const cm = &cpi->common;
-  // Variables to help determine if we should allow screen content tools.
-  int projected_size_pass[3] = { 0 };
-  PSNR_STATS psnr[3];
-  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
-  const int allow_screen_content_tools_orig_decision =
-      cm->features.allow_screen_content_tools;
-  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
-  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
-  // Turn off the encoding trial for forward key frame and superres.
-  if (cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.fwd_kf_enabled ||
-      cpi->superres_mode != SUPERRES_NONE || cpi->oxcf.mode == REALTIME ||
-      is_screen_content_type_orig_decision || !is_key_frame) {
-    return;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  int scale_references = 0;
+#if CONFIG_FPMT_TEST
+  scale_references =
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif  // CONFIG_FPMT_TEST
+  if (scale_references ||
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0)
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  {
+    // For SVC the inter-layer/spatial prediction is not done for newmv
+    // (zero_mode is forced), and since the scaled references are only
+    // use for newmv search, we can avoid scaling here.
+    if (!frame_is_intra_only(cm) &&
+        !(cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref))
+      av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+  }
+
+  av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                    q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+  av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+  if ((q_cfg->deltaq_mode != NO_DELTA_Q) || q_cfg->enable_chroma_deltaq)
+    av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                       cm->seq_params->bit_depth);
+  av1_set_variance_partition_thresholds(cpi, q, 0);
+  av1_setup_frame(cpi);
+
+  // Check if this high_source_sad (scene/slide change) frame should be
+  // encoded at high/max QP, and if so, set the q and adjust some rate
+  // control parameters.
+  if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+      (cpi->rc.high_source_sad ||
+       (cpi->ppi->use_svc && cpi->svc.high_source_sad_superframe))) {
+    if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
+      av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                        q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+      av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+      if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
+        av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                           cm->seq_params->bit_depth);
+      av1_set_variance_partition_thresholds(cpi, q, 0);
+      if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+          cm->features.primary_ref_frame == PRIMARY_REF_NONE)
+        av1_setup_frame(cpi);
+    }
   }
 
-  // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
-  // Find a better way to determine whether screen content tools should be used
-  // for lossless coding.
-  // Use a high q and a fixed partition to do quick encoding.
-  const int q_for_screen_content_quick_run =
-      is_lossless_requested(&cpi->oxcf) ? q_orig : AOMMAX(q_orig, 244);
-  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
-  const BLOCK_SIZE fixed_partition_block_size_orig =
-      cpi->sf.part_sf.always_this_block_size;
-
-  // Setup necessary params for encoding, including frame source, etc.
-  aom_clear_system_state();
-
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
-  if (cpi->unscaled_last_source != NULL) {
-    cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+  if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+    suppress_active_map(cpi);
+    av1_cyclic_refresh_setup(cpi);
+    av1_apply_active_map(cpi);
   }
-
-  setup_frame(cpi);
-
   if (cm->seg.enabled) {
     if (!cm->seg.update_data && cm->prev_frame) {
       segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
@@ -5356,128 +2459,151 @@ static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
   segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
   cm->cur_frame->seg.enabled = cm->seg.enabled;
 
-  // The two encoding passes aim to help determine whether to use screen
-  // content tools, with a high q and fixed partition.
-  for (int pass = 0; pass < 2; ++pass) {
-    set_encoding_params_for_screen_content(cpi, pass);
-#if CONFIG_TUNE_VMAF
-    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-      av1_set_quantizer(
-          cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-          av1_get_vmaf_base_qindex(cpi, q_for_screen_content_quick_run));
-    } else {
-#endif
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-                        q_for_screen_content_quick_run);
-#if CONFIG_TUNE_VMAF
+  // This is for rtc temporal filtering case.
+  if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+      cm->current_frame.frame_type != KEY_FRAME) {
+    const SequenceHeader *seq_params = cm->seq_params;
+
+    if (cpi->orig_source.buffer_alloc_sz == 0 ||
+        cpi->last_source->y_width != cpi->source->y_width ||
+        cpi->last_source->y_height != cpi->source->y_height) {
+      // Allocate a source buffer to store the true source for psnr calculation.
+      if (aom_alloc_frame_buffer(
+              &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width,
+              cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+              seq_params->subsampling_y, seq_params->use_highbitdepth,
+              cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0))
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate scaled buffer");
     }
-#endif
-    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
-    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
-      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                         cm->seq_params.bit_depth);
 
-    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
-                                          0);
-    // transform / motion compensation build reconstruction frame
-    av1_encode_frame(cpi);
-    // Screen content decision
-    screen_content_tools_determination(
-        cpi, allow_screen_content_tools_orig_decision,
-        allow_intrabc_orig_decision, is_screen_content_type_orig_decision, pass,
-        projected_size_pass, psnr);
+    aom_yv12_copy_y(cpi->source, &cpi->orig_source);
+    aom_yv12_copy_u(cpi->source, &cpi->orig_source);
+    aom_yv12_copy_v(cpi->source, &cpi->orig_source);
   }
 
-  // Set partition speed feature back.
-  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
-  cpi->sf.part_sf.always_this_block_size = fixed_partition_block_size_orig;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_encode_frame_time);
+#endif
+
+  // Set the motion vector precision based on mv stats from the last coded
+  // frame.
+  if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+  // transform / motion compensation build reconstruction frame
+  av1_encode_frame(cpi);
+
+  // Update some stats from cyclic refresh.
+  if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ && !cpi->rc.rtc_external_ratectrl &&
+      !frame_is_intra_only(cm))
+    av1_cyclic_refresh_postencode(cpi);
+
+  // Adjust the refresh of the golden (longer-term) reference based on QP
+  // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+  if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+      svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+      sf->rt_sf.gf_refresh_based_on_qp)
+    av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_encode_frame_time);
+#endif
+#if CONFIG_INTERNAL_STATS
+  ++cpi->frame_recode_hits;
+#endif
+
+  return AOM_CODEC_OK;
 }
-#endif  // CONFIG_REALTIME_ONLY
 
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. the purpose of encoding one frame
+ * for multiple times can be approaching a target bitrate or adjusting the usage
+ * of global motions.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
   const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
   // Must allow recode if minimum compression ratio is set.
-  assert(IMPLIES(cpi->oxcf.min_cr > 0, allow_recode));
+  assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
 
   set_size_independent_vars(cpi);
   if (is_stat_consumption_stage_twopass(cpi) &&
       cpi->sf.interp_sf.adaptive_interp_filter_search)
     cpi->interp_search_flags.interp_filter_search_mask =
-        setup_interp_filter_search_mask(cpi);
+        av1_setup_interp_filter_search_mask(cpi);
   cpi->source->buf_8bit_valid = 0;
 
   av1_setup_frame_size(cpi);
 
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi) && cpi->superres_mode != SUPERRES_NONE &&
+  if (av1_superres_in_recode_allowed(cpi) &&
+      cpi->superres_mode != AOM_SUPERRES_NONE &&
       cm->superres_scale_denominator == SCALE_NUMERATOR) {
     // Superres mode is currently enabled, but the denominator selected will
     // disable superres. So no need to continue, as we will go through another
     // recode loop for full-resolution after this anyway.
     return -1;
   }
-#endif  // CONFIG_SUPERRES_IN_RECODE
 
   int top_index = 0, bottom_index = 0;
   int q = 0, q_low = 0, q_high = 0;
-  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+  av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
   q_low = bottom_index;
   q_high = top_index;
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
-    const int num_64x64_blocks =
-        (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
-    if (cpi->td.vt64x64) {
-      if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
-        aom_free(cpi->td.vt64x64);
-        cpi->td.vt64x64 = NULL;
-      }
-    }
-    if (!cpi->td.vt64x64) {
-      CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
-                      aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
-      cpi->td.num_64x64_blocks = num_64x64_blocks;
-    }
-  }
 
-  if (cm->current_frame.frame_type == KEY_FRAME) {
-    FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  av1_set_mv_search_params(cpi);
 
-    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
-      av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
-    }
+  allocate_gradient_info_for_hog(cpi);
 
-    if (!cpi->sf.inter_sf.disable_obmc &&
-        cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) {
-      av1_copy(frame_probs->obmc_probs, default_obmc_probs);
-    }
+  allocate_src_var_of_4x4_sub_block_buf(cpi);
 
-    if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
-      av1_copy(frame_probs->warped_probs, default_warped_probs);
-    }
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
 
-    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
-      av1_copy(frame_probs->switchable_interp_probs,
-               default_switchable_interp_probs);
-    }
-  }
-#if !CONFIG_REALTIME_ONLY
-  // Determine whether to use screen content tools using two fast encoding.
-  determine_sc_tools_with_encoding(cpi, q);
-#endif  // CONFIG_REALTIME_ONLY
+  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  printf("\n Encoding a frame:");
+  printf("\n Encoding a frame: \n");
+#endif
+
+#if !CONFIG_RD_COMMAND
+  // Determine whether to use screen content tools using two fast encoding.
+  if (!cpi->sf.hl_sf.disable_extra_sc_testing)
+    av1_determine_sc_tools_with_encoding(cpi, q);
+#endif  // !CONFIG_RD_COMMAND
+
+#if CONFIG_TUNE_VMAF
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+  }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  cpi->butteraugli_info.recon_set = false;
+  int original_q = 0;
 #endif
 
+  cpi->num_frame_recode = 0;
+
   // Loop variables
   int loop = 0;
   int loop_count = 0;
-  int loop_at_this_size = 0;
   int overshoot_seen = 0;
   int undershoot_seen = 0;
   int low_cr_seen = 0;
@@ -5485,7 +2611,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
 
   do {
     loop = 0;
-    aom_clear_system_state();
+    int do_mv_stats_collection = 1;
 
     // if frame was scaled calculate global_motion_search again if already
     // done
@@ -5495,36 +2621,107 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
         gm_info->search_done = 0;
       }
     }
-    cpi->source =
-        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+    cpi->source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+        false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
+
+#if CONFIG_TUNE_BUTTERAUGLI
+    if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+      if (loop_count == 0) {
+        original_q = q;
+        // TODO(sdeng): different q here does not make big difference. Use a
+        // faster pass instead.
+        q = 96;
+        av1_setup_butteraugli_source(cpi);
+      } else {
+        q = original_q;
+      }
+    }
+#endif
+
     if (cpi->unscaled_last_source != NULL) {
-      cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                               &cpi->scaled_last_source);
+      cpi->last_source = av1_realloc_and_scale_if_required(
+          cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+          EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+          cpi->oxcf.tool_cfg.enable_global_motion);
     }
 
-    if (!frame_is_intra_only(cm)) {
-      if (loop_count > 0) {
-        release_scaled_references(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int scale_references = 0;
+#if CONFIG_FPMT_TEST
+    scale_references =
+        cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif  // CONFIG_FPMT_TEST
+    if (scale_references ||
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+    {
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      if (!frame_is_intra_only(cm)) {
+        if (loop_count > 0) {
+          release_scaled_references(cpi);
+        }
+        av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
       }
-      scale_references(cpi);
     }
+
 #if CONFIG_TUNE_VMAF
-    if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-        cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel,
-                        av1_get_vmaf_base_qindex(cpi, q));
-    } else {
-#endif
-      av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, q);
-#if CONFIG_TUNE_VMAF
+    if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+        oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+      cpi->vmaf_info.original_qindex = q;
+      q = av1_get_vmaf_base_qindex(cpi, q);
     }
 #endif
-    av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
 
-    if (cpi->oxcf.deltaq_mode != NO_DELTA_Q)
+#if CONFIG_RD_COMMAND
+    RD_COMMAND *rd_command = &cpi->rd_command;
+    RD_OPTION option = rd_command->option_ls[rd_command->frame_index];
+    if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+      q = rd_command->q_index_ls[rd_command->frame_index];
+    }
+#endif  // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+#if CONFIG_THREE_PASS
+    if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) {
+      int frame_coding_idx =
+          av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+      if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) {
+        q = cpi->vbr_rc_info.q_index_list[frame_coding_idx];
+      } else {
+        // TODO(angiebird): Investigate why sometimes there is an extra frame
+        // after the last GOP.
+        q = cpi->vbr_rc_info.base_q_index;
+      }
+    }
+#else
+    if (cpi->vbr_rc_info.q_index_list_ready) {
+      q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index];
+    }
+#endif  // CONFIG_THREE_PASS
+#endif  // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+    // TODO(angiebird): Move this into a function.
+    if (oxcf->pass == AOM_RC_THIRD_PASS) {
+      int frame_coding_idx =
+          av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+      double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx];
+      FRAME_UPDATE_TYPE update_type =
+          cpi->vbr_rc_info.update_type_list[frame_coding_idx];
+      rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q,
+                                update_type);
+    }
+#endif  // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+
+    av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+    av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+
+    if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
       av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                         cm->seq_params.bit_depth);
+                         cm->seq_params->bit_depth);
 
     av1_set_variance_partition_thresholds(cpi, q, 0);
 
@@ -5533,7 +2730,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
     //        cm->current_frame.frame_type, cm->superres_scale_denominator);
 
     if (loop_count == 0) {
-      setup_frame(cpi);
+      av1_setup_frame(cpi);
     } else if (get_primary_ref_frame_buf(cm) == NULL) {
       // Base q-index may have changed, so we need to assign proper default coef
       // probs before every iteration.
@@ -5541,14 +2738,10 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
       av1_setup_frame_contexts(cm);
     }
 
-    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    if (q_cfg->aq_mode == VARIANCE_AQ) {
       av1_vaq_frame_setup(cpi);
-    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
       av1_setup_in_frame_q_adj(cpi);
-    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) {
-      suppress_active_map(cpi);
-      av1_cyclic_refresh_setup(cpi);
-      apply_active_map(cpi);
     }
 
     if (cm->seg.enabled) {
@@ -5583,69 +2776,99 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
 
     // transform / motion compensation build reconstruction frame
     av1_encode_frame(cpi);
-#if !CONFIG_REALTIME_ONLY
+
+    // Disable mv_stats collection for parallel frames based on update flag.
+    if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+
     // Reset the mv_stats in case we are interrupted by an intraframe or an
     // overlay frame.
-    if (cpi->mv_stats.valid) {
-      av1_zero(cpi->mv_stats);
-    }
+    if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats);
+
     // Gather the mv_stats for the next frame
     if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
-        av1_frame_allows_smart_mv(cpi)) {
+        av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
       av1_collect_mv_stats(cpi, q);
     }
-#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, av1_encode_frame_time);
 #endif
 
-    aom_clear_system_state();
-
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    const int do_dummy_pack = 1;
+#else   // CONFIG_BITRATE_ACCURACY
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
     const int do_dummy_pack =
         (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
-         cpi->oxcf.rc_mode != AOM_Q) ||
-        cpi->oxcf.min_cr > 0;
+         oxcf->rc_cfg.mode != AOM_Q) ||
+        oxcf->rc_cfg.min_cr > 0;
+#endif  // CONFIG_BITRATE_ACCURACY
     if (do_dummy_pack) {
-      finalize_encoded_frame(cpi);
+      av1_finalize_encoded_frame(cpi);
       int largest_tile_id = 0;  // Output from bitstream: unused here
+      rc->coefficient_size = 0;
       if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
           AOM_CODEC_OK) {
         return AOM_CODEC_ERROR;
       }
 
+      // bits used for this frame
       rc->projected_frame_size = (int)(*size) << 3;
+#if CONFIG_RD_COMMAND
+      PSNR_STATS psnr;
+      aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+      printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT,
+             rc->projected_frame_size, psnr.sse[0]);
+      ++rd_command->frame_index;
+      if (rd_command->frame_index == rd_command->frame_count) {
+        exit(0);
+      }
+#endif  // CONFIG_RD_COMMAND
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+      if (oxcf->pass == AOM_RC_THIRD_PASS) {
+        int frame_coding_idx =
+            av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+        rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx,
+                             rc->projected_frame_size, rc->coefficient_size);
+      }
+#endif  // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
     }
 
+#if CONFIG_TUNE_VMAF
+    if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+        oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+      q = cpi->vmaf_info.original_qindex;
+    }
+#endif
     if (allow_recode) {
       // Update q and decide whether to do a recode loop
       recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
                            bottom_index, &undershoot_seen, &overshoot_seen,
-                           &low_cr_seen, loop_at_this_size);
+                           &low_cr_seen, loop_count);
     }
 
-    // Special case for overlay frame.
-    if (loop && rc->is_src_frame_alt_ref &&
-        rc->projected_frame_size < rc->max_frame_bandwidth) {
-      loop = 0;
-    }
-
-    if (allow_recode && !cpi->sf.gm_sf.gm_disable_recode &&
-        recode_loop_test_global_motion(cm->global_motion,
-                                       cpi->td.rd_counts.global_motion_used,
-                                       gm_info->params_cost)) {
+#if CONFIG_TUNE_BUTTERAUGLI
+    if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
       loop = 1;
+      av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4);
     }
+#endif
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    loop = 0;  // turn off recode loop when CONFIG_BITRATE_ACCURACY is on
+#endif         // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
 
     if (loop) {
       ++loop_count;
-      ++loop_at_this_size;
-
+      cpi->num_frame_recode =
+          (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1))
+              ? (cpi->num_frame_recode + 1)
+              : (NUM_RECODES_PER_FRAME - 1);
 #if CONFIG_INTERNAL_STATS
-      ++cpi->tot_recode_hits;
+      ++cpi->frame_recode_hits;
 #endif
     }
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -5653,23 +2876,84 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
 #endif
   } while (loop);
 
-  // Update some stats from cyclic refresh.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
-    av1_cyclic_refresh_postencode(cpi);
-
   return AOM_CODEC_OK;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
+// TODO(jingning, paulwilkins): Set up high grain level to test
+// hardware decoders. Need to adapt the actual noise variance
+// according to the difference between reconstructed frame and the
+// source signal.
+static void set_grain_syn_params(AV1_COMMON *cm) {
+  aom_film_grain_t *film_grain_params = &cm->film_grain_params;
+  film_grain_params->apply_grain = 1;
+  film_grain_params->update_parameters = 1;
+  film_grain_params->random_seed = rand() & 0xffff;
+
+  film_grain_params->num_y_points = 1;
+  film_grain_params->scaling_points_y[0][0] = 128;
+  film_grain_params->scaling_points_y[0][1] = 100;
+
+  film_grain_params->num_cb_points = 1;
+  film_grain_params->scaling_points_cb[0][0] = 128;
+  film_grain_params->scaling_points_cb[0][1] = 100;
+
+  film_grain_params->num_cr_points = 1;
+  film_grain_params->scaling_points_cr[0][0] = 128;
+  film_grain_params->scaling_points_cr[0][1] = 100;
+
+  film_grain_params->chroma_scaling_from_luma = 0;
+  film_grain_params->scaling_shift = 1;
+  film_grain_params->ar_coeff_lag = 0;
+  film_grain_params->ar_coeff_shift = 1;
+  film_grain_params->overlap_flag = 1;
+  film_grain_params->grain_scale_shift = 0;
+}
+
+/*!\brief Recode loop or a single loop for encoding one frame, followed by
+ * in-loop deblocking filters, CDEF filters, and restoration filters.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ * \param[in]    sse             Total distortion of the frame
+ * \param[in]    rate            Total rate of the frame
+ * \param[in]    largest_tile_id Tile id of the last tile
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
                                               uint8_t *dest, int64_t *sse,
                                               int64_t *rate,
                                               int *largest_tile_id) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, encode_with_recode_loop_time);
+  start_timing(cpi, encode_with_or_without_recode_time);
+#endif
+  for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) {
+    cpi->do_update_frame_probs_txtype[i] = 0;
+    cpi->do_update_frame_probs_obmc[i] = 0;
+    cpi->do_update_frame_probs_warp[i] = 0;
+    cpi->do_update_frame_probs_interpfilter[i] = 0;
+  }
+
+  cpi->do_update_vbr_bits_off_target_fast = 0;
+  int err;
+#if CONFIG_REALTIME_ONLY
+  err = encode_without_recode(cpi);
+#else
+  if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE)
+    err = encode_without_recode(cpi);
+  else
+    err = encode_with_recode_loop(cpi, size, dest);
 #endif
-  int err = encode_with_recode_loop(cpi, size, dest);
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_with_recode_loop_time);
+  end_timing(cpi, encode_with_or_without_recode_time);
 #endif
   if (err != AOM_CODEC_OK) {
     if (err == -1) {
@@ -5683,19 +2967,21 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
     return err;
   }
 
-#ifdef OUTPUT_YUV_SKINMAP
-  if (cpi->common.current_frame.frame_number > 1) {
-    av1_compute_skin_map(cpi, yuv_skinmap_file);
+#ifdef OUTPUT_YUV_DENOISED
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+    aom_write_yuv_frame(yuv_denoised_file,
+                        &cpi->denoiser.running_avg_y[INTRA_FRAME]);
   }
-#endif  // OUTPUT_YUV_SKINMAP
+#endif
 
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
-  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+  if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_AV1_HIGHBITDEPTH
     if (seq_params->use_highbitdepth) {
       cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
@@ -5718,9 +3004,6 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
   cm->cur_frame->buf.render_width = cm->render_width;
   cm->cur_frame->buf.render_height = cm->render_height;
 
-  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
-  // off.
-
   // Pick the loop filter level for the frame.
   if (!cm->features.allow_intrabc) {
     loopfilter_frame(cpi, cm);
@@ -5744,11 +3027,16 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
   aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
 #endif
 
-  finalize_encoded_frame(cpi);
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) {
+    set_grain_syn_params(cm);
+  }
+
+  av1_finalize_encoded_frame(cpi);
   // Build the bitstream
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_pack_bitstream_final_time);
 #endif
+  cpi->rc.coefficient_size = 0;
   if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
     return AOM_CODEC_ERROR;
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -5772,503 +3060,294 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
   return AOM_CODEC_OK;
 }
 
-#if CONFIG_SUPERRES_IN_RECODE
-
-static void save_cur_buf(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
-  memset(&cc->copy_buffer, 0, sizeof(cc->copy_buffer));
-  if (aom_alloc_frame_buffer(&cc->copy_buffer, ybf->y_crop_width,
-                             ybf->y_crop_height, ybf->subsampling_x,
-                             ybf->subsampling_y,
-                             ybf->flags & YV12_FLAG_HIGHBITDEPTH, ybf->border,
-                             cm->features.byte_alignment) != AOM_CODEC_OK) {
-    aom_internal_error(
-        &cm->error, AOM_CODEC_MEM_ERROR,
-        "Failed to allocate copy buffer for saving coding context");
-  }
-  aom_yv12_copy_frame(ybf, &cc->copy_buffer, av1_num_planes(cm));
-}
-
-// Coding context that only needs to be saved when recode loop includes
-// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
-// restoraton).
-static void save_extra_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-
-  cc->lf = cm->lf;
-  cc->cdef_info = cm->cdef_info;
-  cc->rc = cpi->rc;
-}
-
-static void save_all_coding_context(AV1_COMP *cpi) {
-  save_cur_buf(cpi);
-  save_extra_coding_context(cpi);
-  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
-}
-
-static void restore_cur_buf(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  aom_yv12_copy_frame(&cc->copy_buffer, &cm->cur_frame->buf,
-                      av1_num_planes(cm));
-}
-
-// Coding context that only needs to be restored when recode loop includes
-// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
-// restoraton).
-static void restore_extra_coding_context(AV1_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  AV1_COMMON *cm = &cpi->common;
-  cm->lf = cc->lf;
-  cm->cdef_info = cc->cdef_info;
-  cpi->rc = cc->rc;
-}
-
-static void restore_all_coding_context(AV1_COMP *cpi) {
-  restore_cur_buf(cpi);
-  restore_extra_coding_context(cpi);
-  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
-}
-
-static void release_copy_buffer(CODING_CONTEXT *cc) {
-  aom_free_frame_buffer(&cc->copy_buffer);
-}
-
 static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
                                             uint8_t *dest,
                                             int *largest_tile_id) {
   const AV1_COMMON *const cm = &cpi->common;
-  assert(cm->seq_params.enable_superres);
-  assert(superres_in_recode_allowed(cpi));
+  assert(cm->seq_params->enable_superres);
+  assert(av1_superres_in_recode_allowed(cpi));
   aom_codec_err_t err = AOM_CODEC_OK;
-  save_all_coding_context(cpi);
+  av1_save_all_coding_context(cpi);
 
-  // Encode with superres.
-#if SUPERRES_RECODE_ALL_RATIOS
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  int64_t superres_sses[SCALE_NUMERATOR];
-  int64_t superres_rates[SCALE_NUMERATOR];
-  int superres_largest_tile_ids[SCALE_NUMERATOR];
-  // Use superres for Key-frames and Alt-ref frames only.
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE &&
-      gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) {
-    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
-         ++denom) {
-      oxcf->superres_scale_denominator = denom;
-      oxcf->superres_kf_scale_denominator = denom;
-      const int this_index = denom - (SCALE_NUMERATOR + 1);
-      err = encode_with_recode_loop_and_filter(
-          cpi, size, dest, &superres_sses[this_index],
-          &superres_rates[this_index], &superres_largest_tile_ids[this_index]);
-      if (err != AOM_CODEC_OK) return err;
-      restore_all_coding_context(cpi);
-    }
-    // Reset.
-    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-  } else {
-    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
-         ++denom) {
-      const int this_index = denom - (SCALE_NUMERATOR + 1);
-      superres_sses[this_index] = INT64_MAX;
-      superres_rates[this_index] = INT64_MAX;
-    }
-  }
-#else
   int64_t sse1 = INT64_MAX;
   int64_t rate1 = INT64_MAX;
-  int largest_tile_id1;
-  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
-                                           &largest_tile_id1);
-  if (err != AOM_CODEC_OK) return err;
-  restore_all_coding_context(cpi);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-
-  // Encode without superres.
+  int largest_tile_id1 = 0;
   int64_t sse2 = INT64_MAX;
   int64_t rate2 = INT64_MAX;
   int largest_tile_id2;
-  cpi->superres_mode = SUPERRES_NONE;  // To force full-res.
-  err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
-                                           &largest_tile_id2);
-  cpi->superres_mode = cpi->oxcf.superres_mode;  // Reset.
-  assert(cpi->oxcf.superres_mode == SUPERRES_AUTO);
-  if (err != AOM_CODEC_OK) return err;
-
-  // Note: Both use common rdmult based on base qindex of fullres.
-  const int64_t rdmult =
-      av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
-
-#if SUPERRES_RECODE_ALL_RATIOS
-  // Find the best rdcost among all superres denoms.
   double proj_rdcost1 = DBL_MAX;
-  int64_t sse1 = INT64_MAX;
-  int64_t rate1 = INT64_MAX;
-  int largest_tile_id1 = 0;
-  (void)sse1;
-  (void)rate1;
-  (void)largest_tile_id1;
-  int best_denom = -1;
-  for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) {
-    const int this_index = denom - (SCALE_NUMERATOR + 1);
-    const int64_t this_sse = superres_sses[this_index];
-    const int64_t this_rate = superres_rates[this_index];
-    const int this_largest_tile_id = superres_largest_tile_ids[this_index];
-    const double this_rdcost = RDCOST_DBL(rdmult, this_rate, this_sse);
-    if (this_rdcost < proj_rdcost1) {
-      sse1 = this_sse;
-      rate1 = this_rate;
-      largest_tile_id1 = this_largest_tile_id;
-      proj_rdcost1 = this_rdcost;
-      best_denom = denom;
-    }
-  }
-#else
-  const double proj_rdcost1 = RDCOST_DBL(rdmult, rate1, sse1);
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-  const double proj_rdcost2 = RDCOST_DBL(rdmult, rate2, sse2);
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+  const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
 
-  // Re-encode with superres if it's better.
-  if (proj_rdcost1 < proj_rdcost2) {
-    restore_all_coding_context(cpi);
-    // TODO(urvang): We should avoid rerunning the recode loop by saving
-    // previous output+state, or running encode only for the selected 'q' in
-    // previous step.
-#if SUPERRES_RECODE_ALL_RATIOS
-    // Again, temporarily force the best denom.
-    oxcf->superres_scale_denominator = best_denom;
-    oxcf->superres_kf_scale_denominator = best_denom;
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-    int64_t sse3 = INT64_MAX;
-    int64_t rate3 = INT64_MAX;
-    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
-                                             largest_tile_id);
-    assert(sse1 == sse3);
-    assert(rate1 == rate3);
-    assert(largest_tile_id1 == *largest_tile_id);
-#if SUPERRES_RECODE_ALL_RATIOS
-    // Reset.
-    oxcf->superres_scale_denominator = SCALE_NUMERATOR;
-    oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
-#endif  // SUPERRES_RECODE_ALL_RATIOS
-  } else {
-    *largest_tile_id = largest_tile_id2;
-  }
-
-  release_copy_buffer(&cpi->coding_context);
-
-  return err;
-}
-#endif  // CONFIG_SUPERRES_IN_RECODE
-
-#define DUMP_RECON_FRAMES 0
-
-#if DUMP_RECON_FRAMES == 1
-// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-static void dump_filtered_recon_frames(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  const CurrentFrame *const current_frame = &cm->current_frame;
-  const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
-
-  if (recon_buf == NULL) {
-    printf("Frame %d is not ready.\n", current_frame->frame_number);
-    return;
-  }
-
-  static const int flag_list[REF_FRAMES] = { 0,
-                                             AOM_LAST_FLAG,
-                                             AOM_LAST2_FLAG,
-                                             AOM_LAST3_FLAG,
-                                             AOM_GOLD_FLAG,
-                                             AOM_BWD_FLAG,
-                                             AOM_ALT2_FLAG,
-                                             AOM_ALT_FLAG };
-  printf(
-      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
-      "show_existing_frame=%d) "
-      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
-      current_frame->frame_number, current_frame->order_hint, cm->show_frame,
-      cm->show_existing_frame);
-  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
-    printf(" %d(%c)", ref_offset,
-           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
-  }
-  printf(" ]\n");
-
-  if (!cm->show_frame) {
-    printf("Frame %d is a no show frame, so no image dump.\n",
-           current_frame->frame_number);
-    return;
-  }
-
-  int h;
-  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
-  FILE *f_recon = NULL;
-
-  if (current_frame->frame_number == 0) {
-    if ((f_recon = fopen(file_name, "wb")) == NULL) {
-      printf("Unable to open file %s to write.\n", file_name);
-      return;
-    }
-  } else {
-    if ((f_recon = fopen(file_name, "ab")) == NULL) {
-      printf("Unable to open file %s to append.\n", file_name);
-      return;
-    }
-  }
-  printf(
-      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
-      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
-      "refresh_alt_ref_frame=%d, "
-      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
-      current_frame->frame_number, cpi->gf_group.index,
-      cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint,
-      cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active,
-      cpi->refresh_alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride,
-      cm->width, cm->height);
-#if 0
-  int ref_frame;
-  printf("get_ref_frame_map_idx: [");
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
-    printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
-  printf(" ]\n");
-#endif  // 0
-
-  // --- Y ---
-  for (h = 0; h < cm->height; ++h) {
-    fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
-           f_recon);
-  }
-  // --- U ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
-           f_recon);
-  }
-  // --- V ---
-  for (h = 0; h < (cm->height >> 1); ++h) {
-    fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
-           f_recon);
-  }
-
-  fclose(f_recon);
-}
-#endif  // DUMP_RECON_FRAMES
-
-static int is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
-                         const YV12_BUFFER_CONFIG *last_picture,
-                         ForceIntegerMVInfo *const force_intpel_info) {
-  aom_clear_system_state();
-  // check use hash ME
-  int k;
-
-  const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
-  const double threshold_current = 0.8;
-  const double threshold_average = 0.95;
-  const int max_history_size = 32;
-  int T = 0;  // total block
-  int C = 0;  // match with collocated block
-  int S = 0;  // smooth region but not match with collocated block
-
-  const int pic_width = cur_picture->y_width;
-  const int pic_height = cur_picture->y_height;
-  for (int i = 0; i + block_size <= pic_height; i += block_size) {
-    for (int j = 0; j + block_size <= pic_width; j += block_size) {
-      const int x_pos = j;
-      const int y_pos = i;
-      int match = 1;
-      T++;
-
-      // check whether collocated block match with current
-      uint8_t *p_cur = cur_picture->y_buffer;
-      uint8_t *p_ref = last_picture->y_buffer;
-      int stride_cur = cur_picture->y_stride;
-      int stride_ref = last_picture->y_stride;
-      p_cur += (y_pos * stride_cur + x_pos);
-      p_ref += (y_pos * stride_ref + x_pos);
-
-      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
-        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p16_cur[tmpX] != p16_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p16_cur += stride_cur;
-          p16_ref += stride_ref;
-        }
-      } else {
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p_cur[tmpX] != p_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p_cur += stride_cur;
-          p_ref += stride_ref;
-        }
+  // Encode with superres.
+  if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
+    SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg;
+    int64_t superres_sses[SCALE_NUMERATOR];
+    int64_t superres_rates[SCALE_NUMERATOR];
+    int superres_largest_tile_ids[SCALE_NUMERATOR];
+    // Use superres for Key-frames and Alt-ref frames only.
+    if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) {
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        superres_cfg->superres_scale_denominator = denom;
+        superres_cfg->superres_kf_scale_denominator = denom;
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+
+        cpi->superres_mode = AOM_SUPERRES_AUTO;  // Super-res on for this loop.
+        err = encode_with_recode_loop_and_filter(
+            cpi, size, dest, &superres_sses[this_index],
+            &superres_rates[this_index],
+            &superres_largest_tile_ids[this_index]);
+        cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+        if (err != AOM_CODEC_OK) return err;
+        restore_all_coding_context(cpi);
       }
-
-      if (match) {
-        C++;
-        continue;
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+        superres_sses[this_index] = INT64_MAX;
+        superres_rates[this_index] = INT64_MAX;
       }
-
-      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
-                                         y_pos) ||
-          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
-        S++;
-        continue;
+    }
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
+
+    // Find the best rdcost among all superres denoms.
+    int best_denom = -1;
+    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+         ++denom) {
+      const int this_index = denom - (SCALE_NUMERATOR + 1);
+      const int64_t this_sse = superres_sses[this_index];
+      const int64_t this_rate = superres_rates[this_index];
+      const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+      const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+          rdmult, this_rate, this_sse, bit_depth);
+      if (this_rdcost < proj_rdcost1) {
+        sse1 = this_sse;
+        rate1 = this_rate;
+        largest_tile_id1 = this_largest_tile_id;
+        proj_rdcost1 = this_rdcost;
+        best_denom = denom;
       }
     }
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      // Again, temporarily force the best denom.
+      superres_cfg->superres_scale_denominator = best_denom;
+      superres_cfg->superres_kf_scale_denominator = best_denom;
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
+  } else {
+    assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL);
+    cpi->superres_mode =
+        AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+                                             &largest_tile_id1);
+    cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+    if (err != AOM_CODEC_OK) return err;
+    restore_all_coding_context(cpi);
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
+    proj_rdcost1 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
   }
 
-  assert(T > 0);
-  double cs_rate = ((double)(C + S)) / ((double)(T));
-
-  force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
-
-  force_intpel_info->rate_index =
-      (force_intpel_info->rate_index + 1) % max_history_size;
-  force_intpel_info->rate_size++;
-  force_intpel_info->rate_size =
-      AOMMIN(force_intpel_info->rate_size, max_history_size);
-
-  if (cs_rate < threshold_current) {
-    return 0;
-  }
-
-  if (C == T) {
-    return 1;
-  }
-
-  double cs_average = 0.0;
-
-  for (k = 0; k < force_intpel_info->rate_size; k++) {
-    cs_average += force_intpel_info->cs_rate_array[k];
-  }
-  cs_average /= force_intpel_info->rate_size;
-
-  if (cs_average < threshold_average) {
-    return 0;
-  }
-
-  if ((T - C - S) < 0) {
-    return 1;
-  }
-
-  if (cs_average > 1.01) {
-    return 1;
-  }
-
-  return 0;
+  return err;
 }
 
-// Refresh reference frame buffers according to refresh_frame_flags.
-static void refresh_reference_frames(AV1_COMP *cpi) {
+// Conditions to disable cdf_update mode in selective mode for real-time.
+// Handle case for layers, scene change, and resizing.
+static int selective_disable_cdf_rtc(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  // All buffers are refreshed for shown keyframes and S-frames.
-
-  for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
-    if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
-      assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
-    }
+  RATE_CONTROL *const rc = &cpi->rc;
+  // For single layer.
+  if (cpi->svc.number_spatial_layers == 1 &&
+      cpi->svc.number_temporal_layers == 1) {
+    // Don't disable on intra_only, scene change (high_source_sad = 1),
+    // or resized frame. To avoid quality loss for now, force enable at
+    // every 8 frames.
+    if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+        rc->high_source_sad || rc->frames_since_key < 10 ||
+        cm->current_frame.frame_number % 8 == 0)
+      return 0;
+    else
+      return 1;
+  } else if (cpi->svc.number_temporal_layers > 1) {
+    // Disable only on top temporal enhancement layer for now.
+    return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
   }
+  return 1;
 }
 
-static void set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
-  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  uint8_t *y_buffer = cpi->source->y_buffer;
-  const int y_stride = cpi->source->y_stride;
-  const int block_size = BLOCK_16X16;
-
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
-  double log_sum = 0.0;
-  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
-
-  // Loop through each 16x16 block.
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      double var = 0.0, num_of_var = 0.0;
-      const int index = row * num_cols + col;
-
-      // Loop through each 8x8 block.
-      for (int mi_row = row * num_mi_h;
-           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
-           mi_row += 2) {
-        for (int mi_col = col * num_mi_w;
-             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
-             mi_col += 2) {
-          struct buf_2d buf;
-          const int row_offset_y = mi_row << 2;
-          const int col_offset_y = mi_col << 2;
-
-          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
-          buf.stride = y_stride;
-
-          if (use_hbd) {
-            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
-                                                      xd->bd);
-          } else {
-            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
-          }
-
-          num_of_var += 1.0;
-        }
-      }
-      var = var / num_of_var;
-
-      // Curve fitting with an exponential model on all 16x16 blocks from the
-      // midres dataset.
-      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
-      cpi->ssim_rdmult_scaling_factors[index] = var;
-      log_sum += log(var);
-    }
-  }
-  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+#if !CONFIG_REALTIME_ONLY
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+  section->intra_error -= frame->intra_error;
+  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+}
+
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const FIRSTPASS_STATS *const total_stats =
+      twopass->stats_buf_ctx->total_stats;
+
+  if (is_one_pass_rt_params(cpi) ||
+      (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) ||
+      (is_fp_wavelet_energy_invalid(total_stats) == 0))
+    return;
 
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      const int index = row * num_cols + col;
-      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
-    }
-  }
+  const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cpi->common.mi_params.MBs;
+  const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source;
+  const uint8_t *const src = unfiltered_source->y_buffer;
+  const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int stride = unfiltered_source->y_stride;
+  const BLOCK_SIZE fp_block_size =
+      get_fp_block_size(cpi->is_screen_content_type);
+  const int fp_block_size_width = block_size_wide[fp_block_size];
+  const int fp_block_size_height = block_size_high[fp_block_size];
+  const int num_unit_cols =
+      get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width);
+  const int num_unit_rows =
+      get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height);
+  const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8);
+  const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8);
+  int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input(
+      src, stride, hbd, num_8x8_rows, num_8x8_cols);
+
+  cpi->twopass_frame.frame_avg_haar_energy =
+      log(((double)frame_avg_wavelet_energy / num_mbs) + 1.0);
 }
+#endif
 
 extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
                                      const char *filename);
 
+/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack
+ * the bitstream
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                                      uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
   FeatureFlags *const features = &cm->features;
+  const TileConfig *const tile_cfg = &oxcf->tile_cfg;
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_frame_to_data_rate_time);
 #endif
 
+  if (frame_is_intra_only(cm)) {
+    av1_set_screen_content_options(cpi, features);
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  calculate_frame_avg_haar_energy(cpi);
+#endif
+
   // frame type has been decided outside of this function call
   cm->cur_frame->frame_type = current_frame->frame_type;
 
-  cm->tiles.large_scale = cpi->oxcf.large_scale_tile;
-  cm->tiles.single_tile_decoding = cpi->oxcf.single_tile_decoding;
+  cm->tiles.large_scale = tile_cfg->enable_large_scale_tile;
+  cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding;
 
   features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
   // features->allow_ref_frame_mvs needs to be written into the frame header
@@ -6276,15 +3355,33 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   // is separated from frame_might_allow_ref_frame_mvs().
   features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
 
-  features->allow_warped_motion =
-      cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
+  features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion &&
+                                  frame_might_allow_warped_motion(cm);
 
   cpi->last_frame_type = current_frame->frame_type;
 
+  if (frame_is_sframe(cm)) {
+    GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    // S frame will wipe out any previously encoded altref so we cannot place
+    // an overlay frame
+    gf_group->update_type[gf_group->size] = GF_UPDATE;
+  }
+
   if (encode_show_existing_frame(cm)) {
-    finalize_encoded_frame(cpi);
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+    // TODO(angiebird): Move this into a function.
+    if (oxcf->pass == AOM_RC_THIRD_PASS) {
+      int frame_coding_idx =
+          av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+      rc_log_frame_encode_param(
+          &cpi->rc_log, frame_coding_idx, 1, 255,
+          cpi->ppi->gf_group.update_type[cpi->gf_frame_index]);
+    }
+#endif
+    av1_finalize_encoded_frame(cpi);
     // Build the bitstream
     int largest_tile_id = 0;  // Output from bitstream: unused here
+    cpi->rc.coefficient_size = 0;
     if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
       return AOM_CODEC_ERROR;
 
@@ -6296,46 +3393,49 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
         cm->ref_frame_id[i] = display_frame_id;
     }
 
-    cpi->seq_params_locked = 1;
-
 #if DUMP_RECON_FRAMES == 1
     // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-    dump_filtered_recon_frames(cpi);
+    av1_dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
     // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
     //       for the purpose to verify no mismatch between encoder and decoder.
     if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
 
-    refresh_reference_frames(cpi);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    av1_denoiser_update_ref_frame(cpi);
+#endif
 
     // Since we allocate a spot for the OVERLAY frame in the gf group, we need
     // to do post-encoding update accordingly.
-    if (cpi->rc.is_src_frame_alt_ref) {
-      av1_set_target_rate(cpi, cm->width, cm->height);
-      av1_rc_postencode_update(cpi, *size);
+    av1_set_target_rate(cpi, cm->width, cm->height);
+
+    if (is_psnr_calc_enabled(cpi)) {
+      cpi->source =
+          realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
+                                   cm->cur_frame->buf.y_crop_height);
     }
 
     ++current_frame->frame_number;
-
+    update_frame_index_set(&cpi->frame_index_set, cm->show_frame);
     return AOM_CODEC_OK;
   }
 
   // Work out whether to force_integer_mv this frame
   if (!is_stat_generation_stage(cpi) &&
       cpi->common.features.allow_screen_content_tools &&
-      !frame_is_intra_only(cm)) {
-    if (cpi->common.seq_params.force_integer_mv == 2) {
+      !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    if (cpi->common.seq_params->force_integer_mv == 2) {
       // Adaptive mode: see what previous frame encoded did
       if (cpi->unscaled_last_source != NULL) {
-        features->cur_frame_force_integer_mv = is_integer_mv(
+        features->cur_frame_force_integer_mv = av1_is_integer_mv(
             cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
       } else {
         cpi->common.features.cur_frame_force_integer_mv = 0;
       }
     } else {
       cpi->common.features.cur_frame_force_integer_mv =
-          cpi->common.seq_params.force_integer_mv;
+          cpi->common.seq_params->force_integer_mv;
     }
   } else {
     cpi->common.features.cur_frame_force_integer_mv = 0;
@@ -6354,12 +3454,9 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
       seg->update_map = 1;
       seg->update_data = 1;
     }
-
-    // The alternate reference frame cannot be active for a key frame.
-    cpi->rc.source_alt_ref_active = 0;
   }
-  if (cpi->oxcf.mtu == 0) {
-    cpi->num_tg = cpi->oxcf.num_tile_groups;
+  if (tile_cfg->mtu == 0) {
+    cpi->num_tg = tile_cfg->num_tile_groups;
   } else {
     // Use a default value for the purposes of weighting costs in probability
     // updates
@@ -6368,26 +3465,40 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (has_no_stats_stage(cpi) && oxcf->rc_mode == AOM_CBR &&
+  if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
       current_frame->frame_type != KEY_FRAME) {
-    if (av1_rc_drop_frame(cpi)) {
+    if (cpi->oxcf.rc_cfg.target_bandwidth == 0 || av1_rc_drop_frame(cpi)) {
       av1_setup_frame_size(cpi);
+      av1_set_mv_search_params(cpi);
       av1_rc_postencode_update_drop_frame(cpi);
       release_scaled_references(cpi);
+      cpi->is_dropped_frame = true;
       return AOM_CODEC_OK;
     }
   }
 
-  if (oxcf->tuning == AOM_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi);
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+    av1_set_mb_ssim_rdmult_scaling(cpi);
+  }
 
 #if CONFIG_TUNE_VMAF
-  if (oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-      oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+      oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
     av1_set_mb_vmaf_rdmult_scaling(cpi);
   }
 #endif
 
-  aom_clear_system_state();
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode == 0) {
+    av1_init_mb_wiener_var_buffer(cpi);
+    av1_set_mb_wiener_variance(cpi);
+  }
+
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    av1_init_mb_ur_var_buffer(cpi);
+    av1_set_mb_ur_variance(cpi);
+  }
 
 #if CONFIG_INTERNAL_STATS
   memset(cpi->mode_chosen_counts, 0,
@@ -6415,7 +3526,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
       // same across different streams of the same content current_frame_id
       // should be the same and not random. 0x37 is a chosen number as start
       // point
-      if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
+      if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
     } else {
       cm->current_frame_id =
           (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
@@ -6423,7 +3534,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     }
   }
 
-  switch (cpi->oxcf.cdf_update_mode) {
+  switch (oxcf->algo_cfg.cdf_update_mode) {
     case 0:  // No CDF update for any frames(4~6% compression loss).
       features->disable_cdf_update = 1;
       break;
@@ -6433,33 +3544,43 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     case 2:
       // Strategically determine at which frames to do CDF update.
       // Currently only enable CDF update for all-intra and no-show frames(1.5%
-      // compression loss).
-      // TODO(huisu@google.com): design schemes for various trade-offs between
-      // compression quality and decoding speed.
-      features->disable_cdf_update =
-          (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+      // compression loss) for good qualiy or allintra mode.
+      if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+        features->disable_cdf_update =
+            (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+      } else {
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+      }
       break;
   }
-  seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Disable cdf update for the INTNL_ARF_UPDATE frame with
+  // frame_parallel_level 1.
+  if (!cpi->do_frame_data_update &&
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+    features->disable_cdf_update = 1;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
   int largest_tile_id = 0;
-#if CONFIG_SUPERRES_IN_RECODE
-  if (superres_in_recode_allowed(cpi)) {
+  if (av1_superres_in_recode_allowed(cpi)) {
     if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
   } else {
-#endif  // CONFIG_SUPERRES_IN_RECODE
+    const aom_superres_mode orig_superres_mode = cpi->superres_mode;  // save
+    cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
     if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
                                            &largest_tile_id) != AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
     }
-#if CONFIG_SUPERRES_IN_RECODE
+    cpi->superres_mode = orig_superres_mode;  // restore
   }
-#endif  // CONFIG_SUPERRES_IN_RECODE
-
-  cpi->seq_params_locked = 1;
 
   // Update reference frame ids for reference frames this frame will overwrite
   if (seq_params->frame_id_numbers_present_flag) {
@@ -6470,9 +3591,12 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     }
   }
 
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->svc.num_encoded_top_layer++;
+
 #if DUMP_RECON_FRAMES == 1
   // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-  dump_filtered_recon_frames(cpi);
+  av1_dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
   if (cm->seg.enabled) {
@@ -6480,24 +3604,34 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
       update_reference_segmentation_map(cpi);
     } else if (cm->last_frame_seg_map) {
       memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
-             cm->mi_params.mi_cols * cm->mi_params.mi_rows * sizeof(uint8_t));
+             cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+                 sizeof(*cm->cur_frame->seg_map));
     }
   }
 
-  if (frame_is_intra_only(cm) == 0) {
-    release_scaled_references(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+  release_scaled_refs =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif  // CONFIG_FPMT_TEST
+  if (release_scaled_refs ||
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+  {
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (frame_is_intra_only(cm) == 0) {
+      release_scaled_references(cpi);
+    }
   }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  av1_denoiser_update_ref_frame(cpi);
+#endif
 
   // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
 
-  refresh_reference_frames(cpi);
-
-#if CONFIG_ENTROPY_STATS
-  av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
-#endif  // CONFIG_ENTROPY_STATS
-
   if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
     *cm->fc = cpi->tile_data[largest_tile_id].tctx;
     av1_reset_cdf_symbol_counters(cm->fc);
@@ -6506,7 +3640,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     cm->cur_frame->frame_context = *cm->fc;
   }
 
-  if (cpi->oxcf.ext_tile_debug) {
+  if (tile_cfg->enable_ext_tile_debug) {
     // (yunqing) This test ensures the correctness of large scale tile coding.
     if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
       char fn[20] = "./fc";
@@ -6518,27 +3652,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     }
   }
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_frame_to_data_rate_time);
-
-  // Print out timing information.
-  int i;
-  fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
-          cm->current_frame.frame_number,
-          get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
-  for (i = 0; i < kTimingComponents; i++) {
-    cpi->component_time[i] += cpi->frame_component_time[i];
-    fprintf(stderr, " %s:  %" PRId64 " us (total: %" PRId64 " us)\n",
-            get_component_name(i), cpi->frame_component_time[i],
-            cpi->component_time[i]);
-    cpi->frame_component_time[i] = 0;
-  }
-#endif
-
   cpi->last_frame_type = current_frame->frame_type;
 
-  av1_rc_postencode_update(cpi, *size);
-
   // Clear the one shot update flags for segmentation map and mode/ref loop
   // filter deltas.
   cm->seg.update_map = 0;
@@ -6548,13 +3663,15 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   // A droppable frame might not be shown but it always
   // takes a space in the gf group. Therefore, even when
   // it is not shown, we still need update the count down.
-
   if (cm->show_frame) {
-    // Don't increment frame counters if this was an altref buffer
-    // update not a real frame
+    update_frame_index_set(&cpi->frame_index_set, cm->show_frame);
     ++current_frame->frame_number;
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
   return AOM_CODEC_OK;
 }
 
@@ -6582,24 +3699,34 @@ int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
   memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));
 
-  cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
-  cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
-  cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
+  memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
+         sizeof(cpi->refresh_frame));
 
-  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+  if (current_frame->frame_type == KEY_FRAME &&
+      cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
     current_frame->frame_number = 0;
+  }
 
   current_frame->order_hint =
       current_frame->frame_number + frame_params->order_offset;
+
   current_frame->display_order_hint = current_frame->order_hint;
   current_frame->order_hint %=
-      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+      (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1));
+
+  current_frame->pyramid_level = get_true_pyr_level(
+      cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+      current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
 
   if (is_stat_generation_stage(cpi)) {
 #if !CONFIG_REALTIME_ONLY
-    av1_first_pass(cpi, frame_input->ts_duration);
+    if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
+      av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+    else
+      av1_first_pass(cpi, frame_input->ts_duration);
 #endif
-  } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+  } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+             cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
     if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
@@ -6618,9 +3745,9 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
   AV1_COMMON *const cm = &cpi->common;
   if (!cpi->denoise_and_model) {
     cpi->denoise_and_model = aom_denoise_and_model_alloc(
-        cm->seq_params.bit_depth, block_size, noise_level);
+        cm->seq_params->bit_depth, block_size, noise_level);
     if (!cpi->denoise_and_model) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Error allocating denoise and model");
       return -1;
     }
@@ -6628,14 +3755,15 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
   if (!cpi->film_grain_table) {
     cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
     if (!cpi->film_grain_table) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Error allocating grain table");
       return -1;
     }
     memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
   }
   if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
-                                &cm->film_grain_params)) {
+                                &cm->film_grain_params,
+                                cpi->oxcf.enable_dnl_denoising)) {
     if (cm->film_grain_params.apply_grain) {
       aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
                                   &cm->film_grain_params);
@@ -6649,7 +3777,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
@@ -6657,11 +3785,11 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
 
 #if CONFIG_TUNE_VMAF
   if (!is_stat_generation_stage(cpi) &&
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
     av1_vmaf_frame_preprocessing(cpi, sd);
   }
   if (!is_stat_generation_stage(cpi) &&
-      cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
     av1_vmaf_blk_preprocessing(cpi, sd);
   }
 #endif
@@ -6670,43 +3798,95 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
   struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 #endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
 #if CONFIG_DENOISE
-  if (cpi->oxcf.noise_level > 0)
+  // even if denoise_noise_level is > 0, we don't need need to denoise on pass
+  // 1 of 2 if enable_dnl_denoising is disabled since the 2nd pass will be
+  // encoding the original (non-denoised) frame
+  if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+                                     !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+    // Choose a synthetic noise level for still images for enhanced perceptual
+    // quality based on an estimated noise level in the source, but only if
+    // the noise level is set on the command line to > 0.
+    if (cpi->oxcf.mode == ALLINTRA) {
+      // No noise synthesis if source is very clean.
+      // Uses a low edge threshold to focus on smooth areas.
+      // Increase output noise setting a little compared to measured value.
+      cpi->oxcf.noise_level =
+          (float)(av1_estimate_noise_from_single_plane(
+                      sd, 0, cm->seq_params->bit_depth, 16) -
+                  0.1);
+      cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
+      if (cpi->oxcf.noise_level > 0.0) {
+        cpi->oxcf.noise_level += (float)0.5;
+      }
+      cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level);
+    }
+#endif
+
     if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
                          cpi->oxcf.noise_level, time_stamp, end_time) < 0)
       res = -1;
+  }
 #endif  //  CONFIG_DENOISE
 
-  if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+  if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
                          use_highbitdepth, frame_flags))
     res = -1;
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&timer);
-  cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
+  cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
 #endif
+
+  // Note: Regarding profile setting, the following checks are added to help
+  // choose a proper profile for the input video. The criterion is that all
+  // bitstreams must be designated as the lowest profile that match its content.
+  // E.G. A bitstream that contains 4:4:4 video must be designated as High
+  // Profile in the seq header, and likewise a bitstream that contains 4:2:2
+  // bitstream must be designated as Professional Profile in the sequence
+  // header.
   if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
                        "Non-4:2:0 color format requires profile 1 or 2");
     res = -1;
   }
   if ((seq_params->profile == PROFILE_1) &&
       !(subsampling_x == 0 && subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
                        "Profile 1 requires 4:4:4 color format");
     res = -1;
   }
   if ((seq_params->profile == PROFILE_2) &&
       (seq_params->bit_depth <= AOM_BITS_10) &&
       !(subsampling_x == 1 && subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
-                       "Profile 2 bit-depth < 10 requires 4:2:2 color format");
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
+                       "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
     res = -1;
   }
 
   return res;
 }
 
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+  if (!ppi->cpi) return;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    fprintf(stderr, "Writing counts.stt\n");
+    FILE *f = fopen("counts.stt", "wb");
+    fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f);
+    fclose(f);
+  }
+}
+#endif  // CONFIG_ENTROPY_STATS
+
 #if CONFIG_INTERNAL_STATS
 extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
                                  const unsigned char *img2, int img2_pitch,
@@ -6722,11 +3902,16 @@ static void adjust_image_stat(double y, double u, double v, double all,
 }
 
 static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
   AV1_COMMON *const cm = &cpi->common;
   double samples = 0.0;
-  const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
 
+  if (cpi->ppi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+    return;
+
 #if CONFIG_INTER_STATS_ONLY
   if (cm->current_frame.frame_type == KEY_FRAME) return;  // skip key frame
 #endif
@@ -6736,31 +3921,44 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
     const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
     double y, u, v, frame_all;
 
-    cpi->count++;
-    if (cpi->b_calculate_psnr) {
+    ppi->count[0]++;
+    ppi->count[1]++;
+    if (cpi->ppi->b_calculate_psnr) {
       PSNR_STATS psnr;
-      double frame_ssim2 = 0.0, weight = 0.0;
-      aom_clear_system_state();
+      double weight[2] = { 0.0, 0.0 };
+      double frame_ssim2[2] = { 0.0, 0.0 };
 #if CONFIG_AV1_HIGHBITDEPTH
       aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
 #else
       aom_calc_psnr(orig, recon, &psnr);
 #endif
       adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
-                        &cpi->psnr);
-      cpi->total_sq_error += psnr.sse[0];
-      cpi->total_samples += psnr.samples[0];
+                        &(ppi->psnr[0]));
+      ppi->total_sq_error[0] += psnr.sse[0];
+      ppi->total_samples[0] += psnr.samples[0];
       samples = psnr.samples[0];
-      // TODO(yaowu): unify these two versions into one.
-      if (cm->seq_params.use_highbitdepth)
-        frame_ssim2 =
-            aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
-      else
-        frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
 
-      cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
-      cpi->summed_quality += frame_ssim2 * weight;
-      cpi->summed_weights += weight;
+      aom_calc_ssim(orig, recon, bit_depth, in_bit_depth,
+                    cm->seq_params->use_highbitdepth, weight, frame_ssim2);
+
+      ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]);
+      ppi->summed_quality += frame_ssim2[0] * weight[0];
+      ppi->summed_weights += weight[0];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      // Compute PSNR based on stream bit depth
+      if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+          (in_bit_depth < bit_depth)) {
+        adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
+                          psnr.psnr_hbd[0], &ppi->psnr[1]);
+        ppi->total_sq_error[1] += psnr.sse_hbd[0];
+        ppi->total_samples[1] += psnr.samples_hbd[0];
+
+        ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]);
+        ppi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+        ppi->summed_weights_hbd += weight[1];
+      }
+#endif
 
 #if 0
       {
@@ -6776,60 +3974,446 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
       }
 #endif
     }
-    if (cpi->b_calculate_blockiness) {
-      if (!cm->seq_params.use_highbitdepth) {
+    if (ppi->b_calculate_blockiness) {
+      if (!cm->seq_params->use_highbitdepth) {
         const double frame_blockiness =
             av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
                                recon->y_stride, orig->y_width, orig->y_height);
-        cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
-        cpi->total_blockiness += frame_blockiness;
+        ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness);
+        ppi->total_blockiness += frame_blockiness;
       }
 
-      if (cpi->b_calculate_consistency) {
-        if (!cm->seq_params.use_highbitdepth) {
+      if (ppi->b_calculate_consistency) {
+        if (!cm->seq_params->use_highbitdepth) {
           const double this_inconsistency = aom_get_ssim_metrics(
               orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
-              orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+              orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1);
 
           const double peak = (double)((1 << in_bit_depth) - 1);
           const double consistency =
-              aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);
+              aom_sse_to_psnr(samples, peak, ppi->total_inconsistency);
           if (consistency > 0.0)
-            cpi->worst_consistency =
-                AOMMIN(cpi->worst_consistency, consistency);
-          cpi->total_inconsistency += this_inconsistency;
+            ppi->worst_consistency =
+                AOMMIN(ppi->worst_consistency, consistency);
+          ppi->total_inconsistency += this_inconsistency;
         }
       }
     }
 
     frame_all =
         aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
-    adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+    adjust_image_stat(y, u, v, frame_all, &ppi->fastssim);
     frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
-    adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+    adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs);
+  }
+}
+
+void print_internal_stats(AV1_PRIMARY *ppi) {
+  if (!ppi->cpi) return;
+  AV1_COMP *const cpi = ppi->cpi;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    char headings[512] = { 0 };
+    char results[512] = { 0 };
+    FILE *f = fopen("opsnr.stt", "a");
+    double time_encoded =
+        (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
+        10000000.000;
+    double total_encode_time =
+        (ppi->total_time_receive_data + ppi->total_time_compress_data) /
+        1000.000;
+    const double dr =
+        (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded;
+    const double peak =
+        (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1);
+    const double target_rate =
+        (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000;
+    const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+    if (ppi->b_calculate_psnr) {
+      const double total_psnr = aom_sse_to_psnr(
+          (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]);
+      const double total_ssim =
+          100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0);
+      snprintf(headings, sizeof(headings),
+               "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+               "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+               "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+               "AVPsrnY\tAPsnrCb\tAPsnrCr");
+      snprintf(results, sizeof(results),
+               "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f",
+               dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               total_ssim, total_ssim,
+               ppi->fastssim.stat[STAT_ALL] / ppi->count[0],
+               ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst,
+               ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst,
+               ppi->psnr[0].stat[STAT_Y] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_U] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_V] / ppi->count[0]);
+
+      if (ppi->b_calculate_blockiness) {
+        SNPRINT(headings, "\t  Block\tWstBlck");
+        SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness);
+      }
+
+      if (ppi->b_calculate_consistency) {
+        double consistency =
+            aom_sse_to_psnr((double)ppi->total_samples[0], peak,
+                            (double)ppi->total_inconsistency);
+
+        SNPRINT(headings, "\tConsist\tWstCons");
+        SNPRINT2(results, "\t%7.3f", consistency);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_consistency);
+      }
+
+      SNPRINT(headings, "\t   Time\tRcErr\tAbsErr");
+      SNPRINT2(results, "\t%8.0f", total_encode_time);
+      SNPRINT2(results, " %7.2f", rate_err);
+      SNPRINT2(results, " %7.2f", fabs(rate_err));
+
+      SNPRINT(headings, "\tAPsnr611");
+      SNPRINT2(results, " %7.3f",
+               (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] +
+                ppi->psnr[0].stat[STAT_V]) /
+                   (ppi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth;
+      const uint32_t bit_depth = ppi->seq_params.bit_depth;
+      // Since cpi->source->flags is not available here, but total_samples[1]
+      // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was
+      // true in compute_internal_stats
+      if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) {
+        const double peak_hbd = (double)((1 << bit_depth) - 1);
+        const double total_psnr_hbd =
+            aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd,
+                            (double)ppi->total_sq_error[1]);
+        const double total_ssim_hbd =
+            100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0);
+        SNPRINT(headings,
+                "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+                " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+                " AOMSSIMH VPSSIMPH WstSsimH");
+        SNPRINT2(results, "\t%7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+        SNPRINT2(results, "  %7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].worst);
+        SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+        SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+        SNPRINT2(results, "  %7.3f", ppi->worst_ssim_hbd);
+      }
+#endif
+      fprintf(f, "%s\n", headings);
+      fprintf(f, "%s\n", results);
+    }
+
+    fclose(f);
+
+    if (ppi->ssim_vars != NULL) {
+      aom_free(ppi->ssim_vars);
+      ppi->ssim_vars = NULL;
+    }
+  }
+}
+#endif  // CONFIG_INTERNAL_STATS
+
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+    FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+    if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+      av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+    } else {
+      // When there is not enough past stats, we move the current
+      // index without popping the past stats
+      av1_firstpass_info_move_cur_index(firstpass_info);
+    }
+#endif
+    cpi->rc.frames_since_key++;
+    cpi->rc.frames_to_key--;
+    cpi->rc.frames_to_fwd_kf--;
+  }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame ||
+      is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame.
+  ++cpi->gf_frame_index;
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+                                      int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[current_frame_ref_type] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for explanation
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed, pick one that will live long enough
+      fb_of_context_type[current_frame_ref_type] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
+
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+          fb_of_context_type[current_frame_ref_type] = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  update_gf_group_index(cpi);
+}
+
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+  if (cpi->do_frame_data_update) {
+    // Store current frame loopfilter levels in ppi, if update flag is set.
+    if (!cpi->common.show_existing_frame) {
+      AV1_COMMON *const cm = &cpi->common;
+      struct loopfilter *const lf = &cm->lf;
+      cpi->ppi->filter_level[0] = lf->filter_level[0];
+      cpi->ppi->filter_level[1] = lf->filter_level[1];
+      cpi->ppi->filter_level_u = lf->filter_level_u;
+      cpi->ppi->filter_level_v = lf->filter_level_v;
+    }
+  }
+  // Store frame level mv_stats from cpi to ppi.
+  cpi->ppi->mv_stats = cpi->mv_stats;
+}
+
+// Updates frame level stats related to global motion
+static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) {
+  FRAME_UPDATE_TYPE update_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  int i, is_gm_present = 0;
+
+  // Check if the current frame has any valid global motion model across its
+  // reference frames
+  for (i = 0; i < REF_FRAMES; i++) {
+    if (cpi->common.global_motion[i].wmtype != IDENTITY) {
+      is_gm_present = 1;
+      break;
+    }
+  }
+  int update_actual_stats = 1;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  update_actual_stats =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+  if (!update_actual_stats) {
+    if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) {
+      cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present;
+    }
+    int show_existing_between_parallel_frames =
+        (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+             INTNL_OVERLAY_UPDATE &&
+         cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+    if (cpi->do_frame_data_update == 1 &&
+        !show_existing_between_parallel_frames) {
+      for (i = 0; i < FRAME_UPDATE_TYPES; i++) {
+        cpi->ppi->valid_gm_model_found[i] =
+            cpi->ppi->temp_valid_gm_model_found[i];
+      }
+    }
+  }
+#endif
+  if (update_actual_stats) {
+    if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) {
+      cpi->ppi->valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present;
+    }
   }
 }
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  AV1_COMMON *const cm = &cpi->common;
+
+  update_gm_stats(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+  // Update the total stats remaining structure.
+  if (cpi->twopass_frame.this_frame != NULL &&
+      ppi->twopass.stats_buf_ctx->total_left_stats) {
+    subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+                   cpi->twopass_frame.this_frame);
+  }
+#endif
+
+  if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+    // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel
+    // encode set of lower layer frames.
+    // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+    // copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+        ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+            INTNL_ARF_UPDATE) {
+      memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+             sizeof(cm->ref_frame_map));
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    refresh_reference_frames(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+    // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+             sizeof(cm->ref_frame_map));
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    av1_rc_postencode_update(cpi, cpi_data->frame_size);
+  }
+
+  if (cpi_data->pop_lookahead == 1) {
+    av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+                      cpi->compressor_stage);
+  }
+  if (cpi->common.show_frame) {
+    cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+    cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+  }
+  if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info. at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+                          cpi_data->ts_frame_end);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+    update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+    update_rc_counts(cpi);
+    update_end_of_frame_stats(cpi);
+  }
+
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+    av1_pop_third_pass_info(cpi->third_pass_ctx);
+  }
+
+  if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note *size = 0 indicates a dropped frame for which psnr is not calculated
+  if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  if (!is_stat_generation_stage(cpi)) {
+    compute_internal_stats(cpi, (int)cpi_data->frame_size);
+  }
 #endif  // CONFIG_INTERNAL_STATS
-int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush,
-                            const aom_rational64_t *timestamp_ratio) {
+
+  // Write frame info. Subtract 1 from frame index since if was incremented in
+  // update_rc_counts.
+  av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error->jmp)) {
+    cm->error->setjmp = 0;
+    return cm->error->error_code;
+  }
+  cm->error->setjmp = 1;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_INTERNAL_STATS
+  cpi->frame_recode_hits = 0;
+  cpi->time_compress_data = 0;
+  cpi->bytes = 0;
+#endif
+#if CONFIG_ENTROPY_STATS
+  if (cpi->compressor_stage == ENCODE_STAGE) {
+    av1_zero(cpi->counts);
+  }
+#endif
+
 #if CONFIG_BITSTREAM_DEBUG
-  assert(cpi->oxcf.max_threads == 0 &&
+  assert(cpi->oxcf.max_threads <= 1 &&
          "bitstream debug tool does not support multithreading");
   bitstream_queue_record_write();
-  aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
-                                      cm->show_frame);
+
+  if (cm->seq_params->order_hint_info.enable_order_hint) {
+    aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
+                                        cm->show_frame);
+  } else {
+    // This is currently used in RTC encoding. cm->show_frame is always 1.
+    aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number);
+  }
 #endif
-  if (cpi->use_svc && cm->number_spatial_layers > 1) {
+  if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) {
     av1_one_pass_cbr_svc_start_layer(cpi);
   }
 
+  cpi->is_dropped_frame = false;
   cm->showable_frame = 0;
-  *size = 0;
+  cpi_data->frame_size = 0;
+  cpi->available_bs_size = cpi_data->cx_data_sz;
 #if CONFIG_INTERNAL_STATS
   struct aom_usec_timer cmptimer;
   aom_usec_timer_start(&cmptimer);
@@ -6837,70 +4421,414 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   av1_set_high_precision_mv(cpi, 1, 0);
 
   // Normal defaults
-  cm->features.refresh_frame_context = oxcf->frame_parallel_decoding_mode
-                                           ? REFRESH_FRAME_CONTEXT_DISABLED
-                                           : REFRESH_FRAME_CONTEXT_BACKWARD;
-  if (oxcf->large_scale_tile)
+  cm->features.refresh_frame_context =
+      oxcf->tool_cfg.frame_parallel_decoding_mode
+          ? REFRESH_FRAME_CONTEXT_DISABLED
+          : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->tile_cfg.enable_large_scale_tile)
     cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
-  // Initialize fields related to forward keyframes
-  cpi->no_show_kf = 0;
+  if (assign_cur_frame_new_fb(cm) == NULL) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to allocate new cur_frame");
+#else
+    return AOM_CODEC_ERROR;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  }
 
-  if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case.
+  if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+    start_timing(cpi, av1_encode_strategy_time);
+#endif
 
-  const int result =
-      av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end,
-                          timestamp_ratio, flush);
-  if (result != AOM_CODEC_OK && result != -1) {
-    return AOM_CODEC_ERROR;
-  } else if (result == -1) {
+  const int result = av1_encode_strategy(
+      cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags,
+      &cpi_data->ts_frame_start, &cpi_data->ts_frame_end,
+      cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+    end_timing(cpi, av1_encode_strategy_time);
+
+  // Print out timing information.
+  // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+  // show_existing_frame and lag-in-frames.
+  if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) &&
+      cpi->frame_component_time[0] > 100) {
+    int i;
+    uint64_t frame_total = 0, total = 0;
+    const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    FRAME_UPDATE_TYPE frame_update_type =
+        get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+    fprintf(stderr,
+            "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update "
+            "Type: %d, Q: %d\n",
+            cm->current_frame.frame_number,
+            get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame,
+            frame_update_type, cm->quant_params.base_qindex);
+    for (i = 0; i < kTimingComponents; i++) {
+      cpi->component_time[i] += cpi->frame_component_time[i];
+      // Use av1_encode_strategy_time (i = 0) as the total time.
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s:  %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
+  if (result == -1) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    cm->error->setjmp = 0;
+#endif
     // Returning -1 indicates no frame encoded; more input is required
     return -1;
   }
+  if (result != AOM_CODEC_OK) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to encode frame");
+#else
+    return AOM_CODEC_ERROR;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  }
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
 #endif  // CONFIG_INTERNAL_STATS
-  if (cpi->b_calculate_psnr) {
-    if (cm->show_existing_frame ||
-        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
-      generate_psnr_packet(cpi);
+
+#if CONFIG_SPEED_STATS
+  if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+    cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+    cpi->td.mb.txfm_search_info.tx_search_count = 0;
+  }
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cm->error->setjmp = 0;
+#endif
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+  AV1_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      // FPMT does not support scaling yet.
+      assert(ref->y_crop_width == cm->width &&
+             ref->y_crop_height == cm->height);
+
+      RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+      cpi->scaled_ref_buf[ref_frame - 1] = buf;
+      for (int i = 0; i < FRAME_BUFFERS; ++i) {
+        if (&cm->buffer_pool->frame_bufs[i] == buf) {
+          *ref_buffers_used_map |= (1 << i);
+        }
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
     }
   }
+}
 
-#if CONFIG_TUNE_VMAF
-  if (!is_stat_generation_stage(cpi) &&
-      (oxcf->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING ||
-       oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
-       oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN)) {
-    av1_update_vmaf_curve(cpi, cpi->source, &cpi->common.cur_frame->buf);
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map) {
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      ++buffer_pool->frame_bufs[i].ref_count;
+    }
   }
-#endif
+}
 
-  if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
-    // Initialize level info. at the beginning of each sequence.
-    if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
-      av1_init_level_info(cpi);
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      cpi->scaled_ref_buf[i] = NULL;
     }
-    av1_update_level_info(cpi, *size, *time_stamp, *time_end);
   }
+}
 
-#if CONFIG_INTERNAL_STATS
-  if (!is_stat_generation_stage(cpi)) {
-    compute_internal_stats(cpi, (int)(*size));
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map) {
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      --buffer_pool->frame_bufs[i].ref_count;
+    }
   }
-#endif  // CONFIG_INTERNAL_STATS
-#if CONFIG_SPEED_STATS
-  if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
-    cpi->tx_search_count += cpi->td.mb.tx_search_count;
-    cpi->td.mb.tx_search_count = 0;
+}
+
+// Initialize parallel frame contexts with screen content decisions.
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi) {
+  AV1_COMP *const first_cpi = ppi->cpi;
+  for (int i = 1; i < ppi->num_fp_contexts; ++i) {
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[i];
+    cur_cpi->common.features.allow_screen_content_tools =
+        first_cpi->common.features.allow_screen_content_tools;
+    cur_cpi->common.features.allow_intrabc =
+        first_cpi->common.features.allow_intrabc;
+    cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools;
+    cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type;
   }
-#endif  // CONFIG_SPEED_STATS
+}
 
-  aom_clear_system_state();
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data) {
+  int cpi_idx = 0;
 
-  return AOM_CODEC_OK;
+  // Loop over parallel_cpi to find the cpi that processed the current
+  // gf_frame_index ahead of time.
+  for (int i = 1; i < ppi->num_fp_contexts; i++) {
+    if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) {
+      cpi_idx = i;
+      break;
+    }
+  }
+
+  assert(cpi_idx > 0);
+  assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame);
+
+  // Release the previously-used frame-buffer.
+  if (ppi->cpi->common.cur_frame != NULL) {
+    --ppi->cpi->common.cur_frame->ref_count;
+    ppi->cpi->common.cur_frame = NULL;
+  }
+
+  // Swap the appropriate parallel_cpi with the parallel_cpi[0].
+  ppi->cpi = ppi->parallel_cpi[cpi_idx];
+  ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0];
+  ppi->parallel_cpi[0] = ppi->cpi;
+
+  // Copy appropriate parallel_frames_data to local data.
+  {
+    AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1];
+    assert(data->frame_size > 0);
+    assert(first_cpi_data->cx_data_sz > data->frame_size);
+
+    first_cpi_data->lib_flags = data->lib_flags;
+    first_cpi_data->ts_frame_start = data->ts_frame_start;
+    first_cpi_data->ts_frame_end = data->ts_frame_end;
+    memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size);
+    first_cpi_data->frame_size = data->frame_size;
+    if (ppi->cpi->common.show_frame) {
+      first_cpi_data->pop_lookahead = 1;
+    }
+  }
+
+  return ppi->cpi;
+}
+
+// Initialises frames belonging to a parallel encode set.
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+                                    AV1_PRIMARY *const ppi,
+                                    int *ref_buffers_used_map) {
+  AV1_COMP *const first_cpi = ppi->cpi;
+  GF_GROUP *const gf_group = &ppi->gf_group;
+  int gf_index_start = first_cpi->gf_frame_index;
+  assert(gf_group->frame_parallel_level[gf_index_start] == 1);
+  int parallel_frame_count = 0;
+  int cur_frame_num = first_cpi->common.current_frame.frame_number;
+  int show_frame_count = first_cpi->frame_index_set.show_frame_count;
+  int frames_since_key = first_cpi->rc.frames_since_key;
+  int frames_to_key = first_cpi->rc.frames_to_key;
+  int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf;
+  int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start];
+  const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in;
+
+  assert(*ref_buffers_used_map == 0);
+
+  // Release the previously used frame-buffer by a frame_parallel_level 1 frame.
+  if (first_cpi->common.cur_frame != NULL) {
+    --first_cpi->common.cur_frame->ref_count;
+    first_cpi->common.cur_frame = NULL;
+  }
+
+  RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+  RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES];
+  init_ref_map_pair(first_cpi, first_ref_frame_map_pairs);
+  memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs,
+         sizeof(RefFrameMapPair) * REF_FRAMES);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Store the reference refresh index of frame_parallel_level 1 frame in a
+  // parallel encode set of lower layer frames.
+  if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+    first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(
+        first_cpi, ref_frame_map_pairs, gf_index_start);
+    assert(first_cpi->ref_refresh_index != INVALID_IDX &&
+           first_cpi->ref_refresh_index < REF_FRAMES);
+    first_cpi->refresh_idx_available = true;
+    // Update ref_frame_map_pairs.
+    ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order =
+        gf_group->display_idx[gf_index_start];
+    ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level =
+        gf_group->layer_depth[gf_index_start];
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+  // Set do_frame_data_update flag as false for frame_parallel_level 1 frame.
+  first_cpi->do_frame_data_update = false;
+  if (gf_group->arf_src_offset[gf_index_start] == 0) {
+    first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame;
+    first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame;
+  }
+
+  av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                     first_cpi, gf_index_start, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                     first_cpi->common.remapped_ref_idx);
+
+  av1_scale_references_fpmt(first_cpi, ref_buffers_used_map);
+  parallel_frame_count++;
+
+  // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2
+  // frames which are part of the current parallel encode set and initialize the
+  // required cpi elements.
+  for (int i = gf_index_start + 1; i < gf_group->size; i++) {
+    // Update frame counters if previous frame was show frame or show existing
+    // frame.
+    if (gf_group->arf_src_offset[i - 1] == 0) {
+      cur_frame_num++;
+      show_frame_count++;
+      if (frames_to_fwd_kf <= 0)
+        frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist;
+      if (frames_to_key) {
+        frames_since_key++;
+        frames_to_key--;
+        frames_to_fwd_kf--;
+      }
+      stats_in++;
+    }
+    cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i];
+    if (gf_group->frame_parallel_level[i] == 2) {
+      AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count];
+      AV1_COMP_DATA *cur_cpi_data =
+          &ppi->parallel_frames_data[parallel_frame_count - 1];
+      cur_cpi->gf_frame_index = i;
+      cur_cpi->framerate = first_cpi->framerate;
+      cur_cpi->common.current_frame.frame_number = cur_frame_num;
+      cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i];
+      cur_cpi->frame_index_set.show_frame_count = show_frame_count;
+      cur_cpi->rc.frames_since_key = frames_since_key;
+      cur_cpi->rc.frames_to_key = frames_to_key;
+      cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
+      cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+      cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+      cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+      cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+      cur_cpi->rc.intervals_till_gf_calculate_due =
+          first_cpi->rc.intervals_till_gf_calculate_due;
+      cur_cpi->mv_search_params.max_mv_magnitude =
+          first_cpi->mv_search_params.max_mv_magnitude;
+      if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+        cur_cpi->common.lf.mode_ref_delta_enabled = 1;
+      }
+      cur_cpi->do_frame_data_update = false;
+      // Initialize prev_ts_start and prev_ts_end for show frame(s) and show
+      // existing frame(s).
+      if (gf_group->arf_src_offset[i] == 0) {
+        // Choose source of prev frame.
+        int src_index = gf_group->src_offset[i];
+        struct lookahead_entry *prev_source = av1_lookahead_peek(
+            ppi->lookahead, src_index - 1, cur_cpi->compressor_stage);
+        // Save timestamps of prev frame.
+        cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start;
+        cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end;
+      }
+      cur_cpi->time_stamps.first_ts_start =
+          first_cpi->time_stamps.first_ts_start;
+
+      memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map,
+             sizeof(first_cpi->common.ref_frame_map));
+      cur_cpi_data->lib_flags = 0;
+      cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio;
+      cur_cpi_data->flush = first_cpi_data->flush;
+      cur_cpi_data->frame_size = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+        // If the first frame in a parallel encode set is INTNL_ARF_UPDATE
+        // frame, initialize lib_flags of frame_parallel_level 2 frame in the
+        // set with that of frame_parallel_level 1 frame.
+        cur_cpi_data->lib_flags = first_cpi_data->lib_flags;
+        // Store the reference refresh index of frame_parallel_level 2 frame in
+        // a parallel encode set of lower layer frames.
+        cur_cpi->ref_refresh_index =
+            av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i);
+        cur_cpi->refresh_idx_available = true;
+        // Skip the reference frame which will be refreshed by
+        // frame_parallel_level 1 frame in a parallel encode set of lower layer
+        // frames.
+        cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index;
+      } else {
+        cur_cpi->ref_idx_to_skip = INVALID_IDX;
+        cur_cpi->ref_refresh_index = INVALID_IDX;
+        cur_cpi->refresh_idx_available = false;
+      }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+      cur_cpi->twopass_frame.stats_in = stats_in;
+
+      av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                         cur_cpi, i, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                         cur_cpi->common.remapped_ref_idx);
+      av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map);
+      parallel_frame_count++;
+    }
+
+    // Set do_frame_data_update to true for the last frame_parallel_level 2
+    // frame in the current parallel encode set.
+    if (i == (gf_group->size - 1) ||
+        (gf_group->frame_parallel_level[i + 1] == 0 &&
+         (gf_group->update_type[i + 1] == ARF_UPDATE ||
+          gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) ||
+        gf_group->frame_parallel_level[i + 1] == 1) {
+      ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true;
+      break;
+    }
+  }
+
+  av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool,
+                                       *ref_buffers_used_map);
+
+  // Return the number of frames in the parallel encode set.
+  return parallel_frame_count;
 }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
   AV1_COMMON *cm = &cpi->common;
@@ -6912,13 +4840,12 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
       *dest = cm->cur_frame->buf;
       dest->y_width = cm->width;
       dest->y_height = cm->height;
-      dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
-      dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
+      dest->uv_width = cm->width >> cm->seq_params->subsampling_x;
+      dest->uv_height = cm->height >> cm->seq_params->subsampling_y;
       ret = 0;
     } else {
       ret = -1;
     }
-    aom_clear_system_state();
     return ret;
   }
 }
@@ -6930,27 +4857,17 @@ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
   return 0;
 }
 
-static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
-                                       const YV12_BUFFER_CONFIG *b) {
-  return a->y_height == b->y_height && a->y_width == b->y_width &&
-         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
-         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
-         a->border == b->border &&
-         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
-             (b->flags & YV12_FLAG_HIGHBITDEPTH);
-}
-
 aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
                                        YV12_BUFFER_CONFIG *new_frame,
                                        YV12_BUFFER_CONFIG *sd) {
   const int num_planes = av1_num_planes(cm);
   if (!equal_dimensions_and_border(new_frame, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(new_frame, sd, num_planes);
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 int av1_set_internal_size(AV1EncoderConfig *const oxcf,
@@ -6964,9 +4881,13 @@ int av1_set_internal_size(AV1EncoderConfig *const oxcf,
   Scale2Ratio(vert_mode, &vr, &vs);
 
   // always go to the next whole number
-  resize_pending_params->width = (hs - 1 + oxcf->width * hr) / hs;
-  resize_pending_params->height = (vs - 1 + oxcf->height * vr) / vs;
+  resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs;
+  resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs;
 
+  if (horiz_mode != NORMAL || vert_mode != NORMAL) {
+    oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
+    oxcf->algo_cfg.enable_tpl_model = 0;
+  }
   return 0;
 }
 
@@ -7031,14 +4952,14 @@ int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
   return AOM_CODEC_OK;
 }
 
-static void svc_set_updates_external_ref_frame_config(
-    ExternalFlags *const ext_flags, SVC *const svc) {
-  ext_flags->refresh_frame_flags_pending = 1;
-  ext_flags->refresh_last_frame = svc->refresh[svc->ref_idx[0]];
-  ext_flags->refresh_golden_frame = svc->refresh[svc->ref_idx[3]];
-  ext_flags->refresh_bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
-  ext_flags->refresh_alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
-  ext_flags->refresh_alt_ref_frame = svc->refresh[svc->ref_idx[6]];
+static void svc_set_updates_ref_frame_config(
+    ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, SVC *const svc) {
+  ext_refresh_frame_flags->update_pending = 1;
+  ext_refresh_frame_flags->last_frame = svc->refresh[svc->ref_idx[0]];
+  ext_refresh_frame_flags->golden_frame = svc->refresh[svc->ref_idx[3]];
+  ext_refresh_frame_flags->bwd_ref_frame = svc->refresh[svc->ref_idx[4]];
+  ext_refresh_frame_flags->alt2_ref_frame = svc->refresh[svc->ref_idx[5]];
+  ext_refresh_frame_flags->alt_ref_frame = svc->refresh[svc->ref_idx[6]];
   svc->non_reference_frame = 1;
   for (int i = 0; i < REF_FRAMES; i++) {
     if (svc->refresh[i] == 1) {
@@ -7066,6 +4987,8 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
   // GOLDEN, BWDREF, ALTREF2.
 
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &ext_flags->refresh_frame;
   ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
   if (flags &
       (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
@@ -7090,7 +5013,7 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
 
     av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
   } else {
-    if (cpi->svc.external_ref_frame_config) {
+    if (cpi->svc.set_ref_frame_config) {
       int ref = svc_set_references_external_ref_frame_config(cpi);
       av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
     }
@@ -7111,40 +5034,40 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
       upd ^= AOM_ALT2_FLAG;
     }
 
-    ext_flags->refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
-    ext_flags->refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
-    ext_flags->refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
-    ext_flags->refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
-    ext_flags->refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
-    ext_flags->refresh_frame_flags_pending = 1;
+    ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0;
+    ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+    ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+    ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+    ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+    ext_refresh_frame_flags->update_pending = 1;
   } else {
-    if (cpi->svc.external_ref_frame_config)
-      svc_set_updates_external_ref_frame_config(ext_flags, &cpi->svc);
+    if (cpi->svc.set_ref_frame_config)
+      svc_set_updates_ref_frame_config(ext_refresh_frame_flags, &cpi->svc);
     else
-      ext_flags->refresh_frame_flags_pending = 0;
+      ext_refresh_frame_flags->update_pending = 0;
   }
 
-  ext_flags->use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
+  ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs &
                                  ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
-  ext_flags->use_error_resilient = cpi->oxcf.error_resilient_mode |
+  ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode |
                                    ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
   ext_flags->use_s_frame =
-      cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+      cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
   ext_flags->use_primary_ref_none =
       (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
 
   if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
-    av1_update_entropy(&ext_flags->refresh_frame_context,
-                       &ext_flags->refresh_frame_context_pending, 0);
+    update_entropy(&ext_flags->refresh_frame_context,
+                   &ext_flags->refresh_frame_context_pending, 0);
   }
 }
 
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
-  if (!cpi) return NULL;
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) {
+  if (!ppi) return NULL;
 
   uint8_t header_buf[512] = { 0 };
   const uint32_t sequence_header_size =
-      av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]);
+      av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]);
   assert(sequence_header_size <= sizeof(header_buf));
   if (sequence_header_size == 0) return NULL;
 
@@ -7155,7 +5078,8 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
   if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
   memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
 
-  if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0,
+  if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count,
+                           OBU_SEQUENCE_HEADER, 0,
                            &header_buf[0]) != obu_header_size) {
     return NULL;
   }
diff --git a/media/libaom/src/av1/encoder/encoder.h b/media/libaom/src/av1/encoder/encoder.h
index 82d00cb76a..b9a71002f6 100644
--- a/media/libaom/src/av1/encoder/encoder.h
+++ b/media/libaom/src/av1/encoder/encoder.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
 #ifndef AOM_AV1_ENCODER_ENCODER_H_
 #define AOM_AV1_ENCODER_ENCODER_H_
 
@@ -24,23 +27,33 @@
 #include "av1/common/blockd.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
 #include "av1/common/resize.h"
 #include "av1/common/thread_common.h"
 #include "av1/common/timing.h"
+
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
 #include "av1/encoder/level.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -49,6 +62,16 @@
 #if CONFIG_DENOISE
 #include "aom_dsp/noise_model.h"
 #endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+#include "av1/encoder/tune_butteraugli.h"
+#endif
+
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_util/aom_thread.h"
 
@@ -56,9 +79,23 @@
 extern "C" {
 #endif
 
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
 // Number of frames required to test for scene cut detection
 #define SCENE_CUT_KEY_TEST_INTERVAL 16
 
+// Lookahead index threshold to enable temporal filtering for second arf.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
 // Rational number with an int64 numerator
 // This structure holds a fractional value
 typedef struct aom_rational64 {
@@ -66,20 +103,14 @@ typedef struct aom_rational64 {
   int den;           // fraction denominator
 } aom_rational64_t;  // alias for struct aom_rational
 
-typedef struct {
-#if CONFIG_SUPERRES_IN_RECODE
-  struct loopfilter lf;
-  CdefInfo cdef_info;
-  YV12_BUFFER_CONFIG copy_buffer;
-  RATE_CONTROL rc;
-#endif  // CONFIG_SUPERRES_IN_RECODE
-} CODING_CONTEXT;
-
 enum {
   NORMAL = 0,
   FOURFIVE = 1,
   THREEFIVE = 2,
-  ONETWO = 3
+  THREEFOUR = 3,
+  ONEFOUR = 4,
+  ONEEIGHT = 5,
+  ONETWO = 6
 } UENUM1BYTE(AOM_SCALING);
 
 enum {
@@ -88,7 +119,9 @@ enum {
   GOOD,
   // Realtime Fast Encoding. Will force some restrictions on bitrate
   // constraints.
-  REALTIME
+  REALTIME,
+  // All intra mode. All the frames are coded as intra frames.
+  ALLINTRA
 } UENUM1BYTE(MODE);
 
 enum {
@@ -102,6 +135,31 @@ enum {
   FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
 } UENUM1BYTE(FRAMETYPE_FLAGS);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+enum {
+  PARALLEL_ENCODE = 0,
+  PARALLEL_SIMULATION_ENCODE,
+  NUM_FPMT_TEST_ENCODES
+} UENUM1BYTE(FPMT_TEST_ENC_CFG);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+// 0 level frames are sometimes used for rate control purposes, but for
+// reference mapping purposes, the minimum level should be 1.
+#define MIN_PYR_LEVEL 1
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+                                     int max_layer_depth) {
+  if (frame_order == 0) {
+    // Keyframe case
+    return MIN_PYR_LEVEL;
+  } else if (frame_level == MAX_ARF_LAYERS) {
+    // Leaves
+    return max_layer_depth;
+  } else if (frame_level == (MAX_ARF_LAYERS + 1)) {
+    // Altrefs
+    return MIN_PYR_LEVEL;
+  }
+  return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
+
 enum {
   NO_AQ = 0,
   VARIANCE_AQ = 1,
@@ -111,374 +169,951 @@ enum {
 } UENUM1BYTE(AQ_MODE);
 enum {
   NO_DELTA_Q = 0,
-  DELTA_Q_OBJECTIVE = 1,   // Modulation to improve objective quality
-  DELTA_Q_PERCEPTUAL = 2,  // Modulation to improve perceptual quality
-  DELTA_Q_MODE_COUNT       // This should always be the last member of the enum
+  DELTA_Q_OBJECTIVE = 1,      // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,     // Modulation to improve video perceptual quality
+  DELTA_Q_PERCEPTUAL_AI = 3,  // Perceptual quality opt for all intra mode
+  DELTA_Q_USER_RATING_BASED = 4,  // User rating based delta q mode
+  DELTA_Q_HDR = 5,    // QP adjustment based on HDR block pixel average
+  DELTA_Q_MODE_COUNT  // This should always be the last member of the enum
 } UENUM1BYTE(DELTAQ_MODE);
 
 enum {
-  RESIZE_NONE = 0,    // No frame resizing allowed.
-  RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
-  RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
+  RESIZE_NONE = 0,     // No frame resizing allowed.
+  RESIZE_FIXED = 1,    // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,   // All frames are coded at a random scale.
+  RESIZE_DYNAMIC = 3,  // Frames coded at lower scale based on rate control.
   RESIZE_MODES
 } UENUM1BYTE(RESIZE_MODE);
 
 enum {
-  SUPERRES_NONE,     // No frame superres allowed.
-  SUPERRES_FIXED,    // All frames are coded at the specified scale,
-                     // and super-resolved.
-  SUPERRES_RANDOM,   // All frames are coded at a random scale,
-                     // and super-resolved.
-  SUPERRES_QTHRESH,  // Superres scale for a frame is determined based on
-                     // q_index.
-  SUPERRES_AUTO,     // Automatically select superres for appropriate frames.
-  SUPERRES_MODES
-} UENUM1BYTE(SUPERRES_MODE);
-
-typedef enum {
-  kInvalid = 0,
-  kLowSad = 1,
-  kHighSad = 2,
-  kLowVarHighSumdiff = 3,
-} CONTENT_STATE_SB;
-
-enum {
   SS_CFG_SRC = 0,
   SS_CFG_LOOKAHEAD = 1,
   SS_CFG_FPF = 2,
   SS_CFG_TOTAL = 3
 } UENUM1BYTE(SS_CFG_OFFSET);
 
-// TODO(jingning): This needs to be cleaned up next.
-#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TOTAL_BUFFERS + REF_FRAMES + 1)
-
-typedef struct TplDepStats {
-  int64_t intra_cost;
-  int64_t inter_cost;
-  int64_t srcrf_dist;
-  int64_t recrf_dist;
-  int64_t srcrf_rate;
-  int64_t recrf_rate;
-  int64_t mc_dep_rate;
-  int64_t mc_dep_dist;
-  int_mv mv[INTER_REFS_PER_FRAME];
-  int ref_frame_index;
-  int64_t pred_error[INTER_REFS_PER_FRAME];
-  int64_t mc_count;
-  int64_t mc_saved;
-} TplDepStats;
-
-typedef struct TplDepFrame {
-  uint8_t is_valid;
-  TplDepStats *tpl_stats_ptr;
-  const YV12_BUFFER_CONFIG *gf_picture;
-  YV12_BUFFER_CONFIG *rec_picture;
-  int ref_map_index[REF_FRAMES];
-  int stride;
-  int width;
-  int height;
-  int mi_rows;
-  int mi_cols;
-  unsigned int frame_display_index;
-  int base_rdmult;
-} TplDepFrame;
-
-typedef struct TplParams {
-  // Block granularity of tpl score storage.
-  uint8_t tpl_stats_block_mis_log2;
-
-  // Buffer to store the frame level tpl information for each frame in a gf
-  // group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
-  // group
-  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
-
-  // Buffer to store tpl stats at block granularity.
-  // tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
-  // group.
-  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+enum {
+  DISABLE_SCENECUT,        // For LAP, lag_in_frames < 19
+  ENABLE_SCENECUT_MODE_1,  // For LAP, lag_in_frames >=19 and < 33
+  ENABLE_SCENECUT_MODE_2   // For twopass and LAP - lag_in_frames >=33
+} UENUM1BYTE(SCENECUT_MODE);
 
-  // Buffer to store tpl reconstructed frame.
-  // tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
-  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+#define MAX_VBR_CORPUS_COMPLEXITY 10000
 
-  // Pointer to tpl_stats_buffer.
-  TplDepFrame *tpl_frame;
-} TplParams;
+/*!\cond */
 
 typedef enum {
-  COST_UPD_SB,
-  COST_UPD_SBROW,
-  COST_UPD_TILE,
-  COST_UPD_OFF,
+  MOD_FP,           // First pass
+  MOD_TF,           // Temporal filtering
+  MOD_TPL,          // TPL
+  MOD_GME,          // Global motion estimation
+  MOD_ENC,          // Encode stage
+  MOD_LPF,          // Deblocking loop filter
+  MOD_CDEF_SEARCH,  // CDEF search
+  MOD_CDEF,         // CDEF frame
+  MOD_LR,           // Loop restoration filtering
+  MOD_PACK_BS,      // Pack bitstream
+  MOD_FRAME_ENC,    // Frame Parallel encode
+  NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
+/*!\endcond */
+
+/*!\enum COST_UPDATE_TYPE
+ * \brief    This enum controls how often the entropy costs should be updated.
+ * \warning  In case of any modifications/additions done to the enum
+ * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as
+ * well.
+ */
+typedef enum {
+  COST_UPD_SB,           /*!< Update every sb. */
+  COST_UPD_SBROW,        /*!< Update every sb rows inside a tile. */
+  COST_UPD_TILE,         /*!< Update every tile. */
+  COST_UPD_OFF,          /*!< Turn off cost updates. */
+  NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. */
 } COST_UPDATE_TYPE;
 
-#define TPL_DEP_COST_SCALE_LOG2 4
-
-typedef struct AV1EncoderConfig {
-  BITSTREAM_PROFILE profile;
-  aom_bit_depth_t bit_depth;     // Codec bit-depth.
-  int width;                     // width of data passed to the compressor
-  int height;                    // height of data passed to the compressor
-  int forced_max_frame_width;    // forced maximum width of frame (if != 0)
-  int forced_max_frame_height;   // forced maximum height of frame (if != 0)
-  unsigned int input_bit_depth;  // Input bit depth.
-  double init_framerate;         // set to passed in framerate
-  int64_t target_bandwidth;      // bandwidth to be used in bits per second
-
-  int noise_sensitivity;  // pre processing blur: recommendation 0
-  int sharpness;          // sharpening output: recommendation 0:
-  int speed;
-  // maximum allowed bitrate for any intra frame in % of bitrate target.
-  unsigned int rc_max_intra_bitrate_pct;
-  // maximum allowed bitrate for any inter frame in % of bitrate target.
-  unsigned int rc_max_inter_bitrate_pct;
-  // percent of rate boost for golden frame in CBR mode.
-  unsigned int gf_cbr_boost_pct;
+/*!\enum LOOPFILTER_CONTROL
+ * \brief This enum controls to which frames loopfilter is applied.
+ */
+typedef enum {
+  LOOPFILTER_NONE = 0,      /*!< Disable loopfilter on all frames. */
+  LOOPFILTER_ALL = 1,       /*!< Enable loopfilter for all frames. */
+  LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non reference frames. */
+  LOOPFILTER_SELECTIVELY =
+      3, /*!< Disable loopfilter on frames with low motion. */
+} LOOPFILTER_CONTROL;
+
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+  /*!
+   * Indicates the frame resize mode to be used by the encoder.
+   */
+  RESIZE_MODE resize_mode;
+  /*!
+   * Indicates the denominator for resize of inter frames, assuming 8 as the
+   *  numerator. Its value ranges between 8-16.
+   */
+  uint8_t resize_scale_denominator;
+  /*!
+   * Indicates the denominator for resize of key frames, assuming 8 as the
+   * numerator. Its value ranges between 8-16.
+   */
+  uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
 
-  MODE mode;
-  int pass;
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if rectanguar partitions should be enabled.
+   */
+  bool enable_rect_partitions;
+  /*!
+   * Flag to indicate if AB partitions should be enabled.
+   */
+  bool enable_ab_partitions;
+  /*!
+   * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+   */
+  bool enable_1to4_partitions;
+  /*!
+   * Indicates the minimum partition size that should be allowed. Both width and
+   * height of a partition cannot be smaller than the min_partition_size.
+   */
+  BLOCK_SIZE min_partition_size;
+  /*!
+   * Indicates the maximum partition size that should be allowed. Both width and
+   * height of a partition cannot be larger than the max_partition_size.
+   */
+  BLOCK_SIZE max_partition_size;
+} PartitionCfg;
 
-  // Key Framing Operations
-  int auto_key;  // autodetect cut scenes and set the keyframes
-  int key_freq;  // maximum distance to key frame.
-  int sframe_dist;
-  int sframe_mode;
-  int sframe_enabled;
-  int lag_in_frames;  // how many frames lag before we start encoding
-  int fwd_kf_enabled;
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if intra edge filtering process should be enabled.
+   */
+  bool enable_intra_edge_filter;
+  /*!
+   * Flag to indicate if recursive filtering based intra prediction should be
+   * enabled.
+   */
+  bool enable_filter_intra;
+  /*!
+   * Flag to indicate if smooth intra prediction modes should be enabled.
+   */
+  bool enable_smooth_intra;
+  /*!
+   * Flag to indicate if PAETH intra prediction mode should be enabled.
+   */
+  bool enable_paeth_intra;
+  /*!
+   * Flag to indicate if CFL uv intra mode should be enabled.
+   */
+  bool enable_cfl_intra;
+  /*!
+   * Flag to indicate if directional modes should be enabled.
+   */
+  bool enable_directional_intra;
+  /*!
+   * Flag to indicate if the subset of directional modes from D45 to D203 intra
+   * should be enabled. Has no effect if directional modes are disabled.
+   */
+  bool enable_diagonal_intra;
+  /*!
+   * Flag to indicate if delta angles for directional intra prediction should be
+   * enabled.
+   */
+  bool enable_angle_delta;
+  /*!
+   * Flag to indicate whether to automatically turn off several intral coding
+   * tools.
+   * This flag is only used when "--deltaq-mode=3" is true.
+   * When set to 1, the encoder will analyze the reconstruction quality
+   * as compared to the source image in the preprocessing pass.
+   * If the recontruction quality is considered high enough, we disable
+   * the following intra coding tools, for better encoding speed:
+   * "--enable_smooth_intra",
+   * "--enable_paeth_intra",
+   * "--enable_cfl_intra",
+   * "--enable_diagonal_intra".
+   */
+  bool auto_intra_tools_off;
+} IntraModeCfg;
 
-  // ----------------------------------------------------------------
-  // DATARATE CONTROL OPTIONS
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if 64-pt transform should be enabled.
+   */
+  bool enable_tx64;
+  /*!
+   * Flag to indicate if flip and identity transform types should be enabled.
+   */
+  bool enable_flip_idtx;
+  /*!
+   * Flag to indicate if rectangular transform should be enabled.
+   */
+  bool enable_rect_tx;
+  /*!
+   * Flag to indicate whether or not to use a default reduced set for ext-tx
+   * rather than the potential full set of 16 transforms.
+   */
+  bool reduced_tx_type_set;
+  /*!
+   * Flag to indicate if transform type for intra blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_intra_dct_only;
+  /*!
+   * Flag to indicate if transform type for inter blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_inter_dct_only;
+  /*!
+   * Flag to indicate if intra blocks should use default transform type
+   * (mode-dependent) only.
+   */
+  bool use_intra_default_tx_only;
+  /*!
+   * Flag to indicate if transform size search should be enabled.
+   */
+  bool enable_tx_size_search;
+} TxfmSizeTypeCfg;
 
-  // vbr, cbr, constrained quality or constant quality
-  enum aom_rc_mode rc_mode;
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if distance-weighted compound type should be enabled.
+   */
+  bool enable_dist_wtd_comp;
+  /*!
+   * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+   * enabled.
+   */
+  bool enable_masked_comp;
+  /*!
+   * Flag to indicate if smooth interintra mode should be enabled.
+   */
+  bool enable_smooth_interintra;
+  /*!
+   * Flag to indicate if difference-weighted compound type should be enabled.
+   */
+  bool enable_diff_wtd_comp;
+  /*!
+   * Flag to indicate if inter-inter wedge compound type should be enabled.
+   */
+  bool enable_interinter_wedge;
+  /*!
+   * Flag to indicate if inter-intra wedge compound type should be enabled.
+   */
+  bool enable_interintra_wedge;
+} CompoundTypeCfg;
 
-  // buffer targeting aggressiveness
-  int under_shoot_pct;
-  int over_shoot_pct;
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for inter frames.
+   */
+  int superres_qthresh;
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for key frames.
+   */
+  int superres_kf_qthresh;
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for inter frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+   */
+  uint8_t superres_scale_denominator;
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for key frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+   */
+  uint8_t superres_kf_scale_denominator;
+  /*!
+   * Indicates the Super-resolution mode to be used by the encoder.
+   */
+  aom_superres_mode superres_mode;
+  /*!
+   * Flag to indicate if super-resolution should be enabled for the sequence.
+   */
+  bool enable_superres;
+} SuperResCfg;
 
-  // buffering parameters
-  int64_t starting_buffer_level_ms;
-  int64_t optimal_buffer_level_ms;
-  int64_t maximum_buffer_size_ms;
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+  /*!
+   * Indicates the minimum distance to a key frame.
+   */
+  int key_freq_min;
 
-  // Frame drop threshold.
-  int drop_frames_water_mark;
+  /*!
+   * Indicates the maximum distance to a key frame.
+   */
+  int key_freq_max;
 
-  // controlling quality
-  int fixed_q;
-  int worst_allowed_q;
-  int best_allowed_q;
-  int cq_level;
-  int enable_chroma_deltaq;
-  AQ_MODE aq_mode;  // Adaptive Quantization mode
-  DELTAQ_MODE deltaq_mode;
-  int deltalf_mode;
-  int enable_cdef;
-  int enable_restoration;
-  int force_video_mode;
-  int enable_obmc;
-  int disable_trellis_quant;
-  int using_qm;
-  int qm_y;
-  int qm_u;
-  int qm_v;
-  int qm_minlevel;
-  int qm_maxlevel;
-  unsigned int num_tile_groups;
-  unsigned int mtu;
+  /*!
+   * Indicates if temporal filtering should be applied on keyframe.
+   */
+  int enable_keyframe_filtering;
 
-  // Internal frame size scaling.
-  RESIZE_MODE resize_mode;
-  uint8_t resize_scale_denominator;
-  uint8_t resize_kf_scale_denominator;
+  /*!
+   * Indicates the number of frames after which a frame may be coded as an
+   * S-Frame.
+   */
+  int sframe_dist;
 
-  // Frame Super-Resolution size scaling.
-  SUPERRES_MODE superres_mode;
-  uint8_t superres_scale_denominator;
-  uint8_t superres_kf_scale_denominator;
-  int superres_qthresh;
-  int superres_kf_qthresh;
+  /*!
+   * Indicates how an S-Frame should be inserted.
+   * 1: the considered frame will be made into an S-Frame only if it is an
+   * altref frame. 2: the next altref frame will be made into an S-Frame.
+   */
+  int sframe_mode;
 
-  // Enable feature to reduce the frame quantization every x frames.
-  int frame_periodic_boost;
+  /*!
+   * Indicates if encoder should autodetect cut scenes and set the keyframes.
+   */
+  bool auto_key;
 
-  // two pass datarate control
-  int two_pass_vbrbias;  // two pass datarate control tweaks
-  int two_pass_vbrmin_section;
-  int two_pass_vbrmax_section;
-  // END DATARATE CONTROL OPTIONS
-  // ----------------------------------------------------------------
+  /*!
+   * Indicates the forward key frame distance.
+   */
+  int fwd_kf_dist;
 
-  int enable_auto_arf;
-  int enable_auto_brf;  // (b)ackward (r)ef (f)rame
+  /*!
+   * Indicates if forward keyframe reference should be enabled.
+   */
+  bool fwd_kf_enabled;
 
-  /* Bitfield defining the error resiliency features to enable.
-   * Can provide decodable frames after losses in previous
-   * frames and decodable partitions after losses in the same frame.
+  /*!
+   * Indicates if S-Frames should be enabled for the sequence.
    */
-  unsigned int error_resilient_mode;
+  bool enable_sframe;
 
-  unsigned int s_frame_mode;
+  /*!
+   * Indicates if intra block copy prediction mode should be enabled or not.
+   */
+  bool enable_intrabc;
+} KeyFrameCfg;
 
-  /* Bitfield defining the parallel decoding mode where the
-   * decoding in successive frames may be conducted in parallel
-   * just by decoding the frame headers.
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+  /*!\cond */
+  // BUFFERING PARAMETERS
+  /*!\endcond */
+  /*!
+   * Indicates the amount of data that will be buffered by the decoding
+   * application prior to beginning playback, and is expressed in units of
+   * time(milliseconds).
    */
-  unsigned int frame_parallel_decoding_mode;
+  int64_t starting_buffer_level_ms;
+  /*!
+   * Indicates the amount of data that the encoder should try to maintain in the
+   * decoder's buffer, and is expressed in units of time(milliseconds).
+   */
+  int64_t optimal_buffer_level_ms;
+  /*!
+   * Indicates the maximum amount of data that may be buffered by the decoding
+   * application, and is expressed in units of time(milliseconds).
+   */
+  int64_t maximum_buffer_size_ms;
 
-  unsigned int limit;
+  /*!
+   * Indicates the bandwidth to be used in bits per second.
+   */
+  int64_t target_bandwidth;
 
-  int arnr_max_frames;
-  int arnr_strength;
+  /*!
+   * Indicates average complexity of the corpus in single pass vbr based on
+   * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+   */
+  unsigned int vbr_corpus_complexity_lap;
+  /*!
+   * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+   * target.
+   */
+  unsigned int max_intra_bitrate_pct;
+  /*!
+   * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+   * target.
+   */
+  unsigned int max_inter_bitrate_pct;
+  /*!
+   * Indicates the percentage of rate boost for golden frame in CBR mode.
+   */
+  unsigned int gf_cbr_boost_pct;
+  /*!
+   * min_cr / 100 indicates the target minimum compression ratio for each
+   * frame.
+   */
+  unsigned int min_cr;
+  /*!
+   * Indicates the frame drop threshold.
+   */
+  int drop_frames_water_mark;
+  /*!
+   * under_shoot_pct indicates the tolerance of the VBR algorithm to
+   * undershoot and is used as a trigger threshold for more agressive
+   * adaptation of Q. It's value can range from 0-100.
+   */
+  int under_shoot_pct;
+  /*!
+   * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+   * and is used as a trigger threshold for more agressive adaptation of Q.
+   * It's value can range from 0-1000.
+   */
+  int over_shoot_pct;
+  /*!
+   * Indicates the maximum qindex that can be used by the quantizer i.e. the
+   * worst quality qindex.
+   */
+  int worst_allowed_q;
+  /*!
+   * Indicates the minimum qindex that can be used by the quantizer i.e. the
+   * best quality qindex.
+   */
+  int best_allowed_q;
+  /*!
+   * Indicates the Constant/Constrained Quality level.
+   */
+  int cq_level;
+  /*!
+   * Indicates if the encoding mode is vbr, cbr, constrained quality or
+   * constant quality.
+   */
+  enum aom_rc_mode mode;
+  /*!
+   * Indicates the bias (expressed on a scale of 0 to 100) for determining
+   * target size for the current frame. The value 0 indicates the optimal CBR
+   * mode value should be used, and 100 indicates the optimal VBR mode value
+   * should be used.
+   */
+  int vbrbias;
+  /*!
+   * Indicates the minimum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmin_section;
+  /*!
+   * Indicates the maximum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmax_section;
+} RateControlCfg;
 
+/*!\cond */
+typedef struct {
+  // Indicates the number of frames lag before encoding is started.
+  int lag_in_frames;
+  // Indicates the minimum gf/arf interval to be used.
   int min_gf_interval;
+  // Indicates the maximum gf/arf interval to be used.
   int max_gf_interval;
+  // Indicates the minimum height for GF group pyramid structure to be used.
   int gf_min_pyr_height;
+  // Indicates the maximum height for GF group pyramid structure to be used.
   int gf_max_pyr_height;
+  // Indicates if automatic set and use of altref frames should be enabled.
+  bool enable_auto_arf;
+  // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+  // enabled.
+  bool enable_auto_brf;
+} GFConfig;
 
-  int row_mt;
+typedef struct {
+  // Indicates the number of tile groups.
+  unsigned int num_tile_groups;
+  // Indicates the MTU size for a tile group. If mtu is non-zero,
+  // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+  unsigned int mtu;
+  // Indicates the number of tile columns in log2.
   int tile_columns;
+  // Indicates the number of tile rows in log2.
   int tile_rows;
+  // Indicates the number of widths in the tile_widths[] array.
   int tile_width_count;
+  // Indicates the number of heights in the tile_heights[] array.
   int tile_height_count;
+  // Indicates the tile widths, and may be empty.
   int tile_widths[MAX_TILE_COLS];
+  // Indicates the tile heights, and may be empty.
   int tile_heights[MAX_TILE_ROWS];
+  // Indicates if large scale tile coding should be used.
+  bool enable_large_scale_tile;
+  // Indicates if single tile decoding mode should be enabled.
+  bool enable_single_tile_decoding;
+  // Indicates if EXT_TILE_DEBUG should be enabled.
+  bool enable_ext_tile_debug;
+} TileConfig;
 
-  int enable_tpl_model;
-  int enable_keyframe_filtering;
+typedef struct {
+  // Indicates the width of the input frame.
+  int width;
+  // Indicates the height of the input frame.
+  int height;
+  // If forced_max_frame_width is non-zero then it is used to force the maximum
+  // frame width written in write_sequence_header().
+  int forced_max_frame_width;
+  // If forced_max_frame_width is non-zero then it is used to force the maximum
+  // frame height written in write_sequence_header().
+  int forced_max_frame_height;
+  // Indicates the frame width after applying both super-resolution and resize
+  // to the coded frame.
+  int render_width;
+  // Indicates the frame height after applying both super-resolution and resize
+  // to the coded frame.
+  int render_height;
+} FrameDimensionCfg;
 
-  int max_threads;
+typedef struct {
+  // Indicates if warped motion should be enabled.
+  bool enable_warped_motion;
+  // Indicates if warped motion should be evaluated or not.
+  bool allow_warped_motion;
+  // Indicates if OBMC motion should be enabled.
+  bool enable_obmc;
+} MotionModeCfg;
 
-  aom_fixed_buf_t two_pass_stats_in;
+typedef struct {
+  // Timing info for each frame.
+  aom_timing_info_t timing_info;
+  // Indicates the number of time units of a decoding clock.
+  uint32_t num_units_in_decoding_tick;
+  // Indicates if decoder model information is present in the coded sequence
+  // header.
+  bool decoder_model_info_present_flag;
+  // Indicates if display model information is present in the coded sequence
+  // header.
+  bool display_model_info_present_flag;
+  // Indicates if timing info for each frame is present.
+  bool timing_info_present;
+} DecoderModelCfg;
 
-  aom_tune_metric tuning;
-  const char *vmaf_model_path;
-  aom_tune_content content;
-  int use_highbitdepth;
+typedef struct {
+  // Indicates the update frequency for coeff costs.
+  COST_UPDATE_TYPE coeff;
+  // Indicates the update frequency for mode costs.
+  COST_UPDATE_TYPE mode;
+  // Indicates the update frequency for mv costs.
+  COST_UPDATE_TYPE mv;
+  // Indicates the update frequency for dv costs.
+  COST_UPDATE_TYPE dv;
+} CostUpdateFreq;
+
+typedef struct {
+  // Indicates the maximum number of reference frames allowed per frame.
+  unsigned int max_reference_frames;
+  // Indicates if the reduced set of references should be enabled.
+  bool enable_reduced_reference_set;
+  // Indicates if one-sided compound should be enabled.
+  bool enable_onesided_comp;
+} RefFrameCfg;
+
+typedef struct {
+  // Indicates the color space that should be used.
   aom_color_primaries_t color_primaries;
+  // Indicates the characteristics of transfer function to be used.
   aom_transfer_characteristics_t transfer_characteristics;
+  // Indicates the matrix coefficients to be used for the transfer function.
   aom_matrix_coefficients_t matrix_coefficients;
+  // Indicates the chroma 4:2:0 sample position info.
   aom_chroma_sample_position_t chroma_sample_position;
-  int color_range;
-  int render_width;
-  int render_height;
-  int timing_info_present;
-  aom_timing_info_t timing_info;
-  int decoder_model_info_present_flag;
-  int display_model_info_present_flag;
-  int buffer_removal_time_present;
-  aom_dec_model_info_t buffer_model;
-  int film_grain_test_vector;
+  // Indicates if a limited color range or full color range should be used.
+  aom_color_range_t color_range;
+} ColorCfg;
+
+typedef struct {
+  // Indicates if extreme motion vector unit test should be enabled or not.
+  unsigned int motion_vector_unit_test;
+  // Indicates if superblock multipass unit test should be enabled or not.
+  unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+typedef struct {
+  // Indicates the file path to the VMAF model.
+  const char *vmaf_model_path;
+  // Indicates the path to the film grain parameters.
   const char *film_grain_table_filename;
+  // Indicates the visual tuning metric.
+  aom_tune_metric tuning;
+  // Indicates if the current content is screen or default type.
+  aom_tune_content content;
+  // Indicates the film grain parameters.
+  int film_grain_test_vector;
+  // Indicates the in-block distortion metric to use.
+  aom_dist_metric dist_metric;
+} TuneCfg;
+
+typedef struct {
+  // Indicates the framerate of the input video.
+  double init_framerate;
+  // Indicates the bit-depth of the input video.
+  unsigned int input_bit_depth;
+  // Indicates the maximum number of frames to be encoded.
+  unsigned int limit;
+  // Indicates the chrome subsampling x value.
+  unsigned int chroma_subsampling_x;
+  // Indicates the chrome subsampling y value.
+  unsigned int chroma_subsampling_y;
+} InputCfg;
+
+typedef struct {
+  // If true, encoder will use fixed QP offsets, that are either:
+  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+  // - Picked automatically from cq_level.
+  int use_fixed_qp_offsets;
+  // Indicates the minimum flatness of the quantization matrix.
+  int qm_minlevel;
+  // Indicates the maximum flatness of the quantization matrix.
+  int qm_maxlevel;
+  // Indicates if adaptive quantize_b should be enabled.
+  int quant_b_adapt;
+  // Indicates the Adaptive Quantization mode to be used.
+  AQ_MODE aq_mode;
+  // Indicates the delta q mode to be used.
+  DELTAQ_MODE deltaq_mode;
+  // Indicates the delta q mode strength.
+  DELTAQ_MODE deltaq_strength;
+  // Indicates if delta quantization should be enabled in chroma planes.
+  bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for hdr video
+  bool enable_hdr_deltaq;
+  // Indicates if encoding with quantization matrices should be enabled.
+  bool using_qm;
+} QuantizationCfg;
+
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+  /*!
+   * Controls the level at which rate-distortion optimization of transform
+   * coefficients favours sharpness in the block. Has no impact on RD when set
+   * to zero (default). For values 1-7, eob and skip block optimization are
+   * avoided and rdmult is adjusted in favour of block sharpness.
+   */
+  int sharpness;
+
+  /*!
+   * Indicates the trellis optimization mode of quantized coefficients.
+   * 0: disabled
+   * 1: enabled
+   * 2: enabled for rd search
+   * 3: true for estimate yrd search
+   */
+  int disable_trellis_quant;
 
+  /*!
+   * The maximum number of frames used to create an arf.
+   */
+  int arnr_max_frames;
+
+  /*!
+   * The temporal filter strength for arf used when creating ARFs.
+   */
+  int arnr_strength;
+
+  /*!
+   * Indicates the CDF update mode
+   * 0: no update
+   * 1: update on every frame(default)
+   * 2: selectively update
+   */
   uint8_t cdf_update_mode;
+
+  /*!
+   * Indicates if RDO based on frame temporal dependency should be enabled.
+   */
+  bool enable_tpl_model;
+
+  /*!
+   * Indicates if coding of overlay frames for filtered ALTREF frames is
+   * enabled.
+   */
+  bool enable_overlay;
+
+  /*!
+   * Controls loop filtering
+   * 0: Loop filter is disabled for all frames
+   * 1: Loop filter is enabled for all frames
+   * 2: Loop filter is disabled for non-reference frames
+   * 3: Loop filter is disables for the frames with low motion
+   */
+  LOOPFILTER_CONTROL loopfilter_control;
+} AlgoCfg;
+/*!\cond */
+
+typedef struct {
+  // Indicates the codec bit-depth.
+  aom_bit_depth_t bit_depth;
+  // Indicates the superblock size that should be used by the encoder.
   aom_superblock_size_t superblock_size;
-  unsigned int large_scale_tile;
-  unsigned int single_tile_decoding;
-  uint8_t monochrome;
-  unsigned int full_still_picture_hdr;
-  int enable_dual_filter;
-  unsigned int motion_vector_unit_test;
-  unsigned int sb_multipass_unit_test;
-  unsigned int ext_tile_debug;
-  int enable_rect_partitions;
-  int enable_ab_partitions;
-  int enable_1to4_partitions;
-  int min_partition_size;
-  int max_partition_size;
-  int enable_intra_edge_filter;
-  int enable_tx64;
-  int enable_flip_idtx;
-  int enable_order_hint;
-  int enable_dist_wtd_comp;
-  int enable_ref_frame_mvs;
-  unsigned int max_reference_frames;
-  int enable_reduced_reference_set;
-  unsigned int allow_ref_frame_mvs;
-  int enable_masked_comp;
-  int enable_onesided_comp;
-  int enable_interintra_comp;
-  int enable_smooth_interintra;
-  int enable_diff_wtd_comp;
-  int enable_interinter_wedge;
-  int enable_interintra_wedge;
-  int enable_global_motion;
-  int enable_warped_motion;
-  int allow_warped_motion;
-  int enable_filter_intra;
-  int enable_smooth_intra;
-  int enable_paeth_intra;
-  int enable_cfl_intra;
-  int enable_superres;
-  int enable_overlay;
-  int enable_palette;
-  int enable_intrabc;
-  int enable_angle_delta;
-  unsigned int save_as_annexb;
+  // Indicates if loopfilter modulation should be enabled.
+  bool enable_deltalf_mode;
+  // Indicates how CDEF should be applied.
+  CDEF_CONTROL cdef_control;
+  // Indicates if loop restoration filter should be enabled.
+  bool enable_restoration;
+  // When enabled, video mode should be used even for single frame input.
+  bool force_video_mode;
+  // Indicates if the error resiliency features should be enabled.
+  bool error_resilient_mode;
+  // Indicates if frame parallel decoding feature should be enabled.
+  bool frame_parallel_decoding_mode;
+  // Indicates if the input should be encoded as monochrome.
+  bool enable_monochrome;
+  // When enabled, the encoder will use a full header even for still pictures.
+  // When disabled, a reduced header is used for still pictures.
+  bool full_still_picture_hdr;
+  // Indicates if dual interpolation filters should be enabled.
+  bool enable_dual_filter;
+  // Indicates if frame order hint should be enabled or not.
+  bool enable_order_hint;
+  // Indicates if ref_frame_mvs should be enabled at the sequence level.
+  bool ref_frame_mvs_present;
+  // Indicates if ref_frame_mvs should be enabled at the frame level.
+  bool enable_ref_frame_mvs;
+  // Indicates if interintra compound mode is enabled.
+  bool enable_interintra_comp;
+  // Indicates if global motion should be enabled.
+  bool enable_global_motion;
+  // Indicates if palette should be enabled.
+  bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+  /*!\cond */
+  // Configuration related to the input video.
+  InputCfg input_cfg;
+
+  // Configuration related to frame-dimensions.
+  FrameDimensionCfg frm_dim_cfg;
+
+  /*!\endcond */
+  /*!
+   * Encoder algorithm configuration.
+   */
+  AlgoCfg algo_cfg;
+
+  /*!
+   * Configuration related to key-frames.
+   */
+  KeyFrameCfg kf_cfg;
+
+  /*!
+   * Rate control configuration
+   */
+  RateControlCfg rc_cfg;
+  /*!\cond */
+
+  // Configuration related to Quantization.
+  QuantizationCfg q_cfg;
+
+  // Internal frame size scaling.
+  ResizeCfg resize_cfg;
+
+  // Frame Super-Resolution size scaling.
+  SuperResCfg superres_cfg;
+
+  /*!\endcond */
+  /*!
+   * stats_in buffer contains all of the stats packets produced in the first
+   * pass, concatenated.
+   */
+  aom_fixed_buf_t twopass_stats_in;
+  /*!\cond */
+
+  // Configuration related to encoder toolsets.
+  ToolCfg tool_cfg;
+
+  // Configuration related to Group of frames.
+  GFConfig gf_cfg;
+
+  // Tile related configuration parameters.
+  TileConfig tile_cfg;
+
+  // Configuration related to Tune.
+  TuneCfg tune_cfg;
+
+  // Configuration related to color.
+  ColorCfg color_cfg;
+
+  // Configuration related to decoder model.
+  DecoderModelCfg dec_model_cfg;
+
+  // Configuration related to reference frames.
+  RefFrameCfg ref_frm_cfg;
+
+  // Configuration related to unit tests.
+  UnitTestCfg unit_test_cfg;
+
+  // Flags related to motion mode.
+  MotionModeCfg motion_mode_cfg;
+
+  // Flags related to intra mode search.
+  IntraModeCfg intra_mode_cfg;
+
+  // Flags related to transform size/type.
+  TxfmSizeTypeCfg txfm_cfg;
+
+  // Flags related to compound type.
+  CompoundTypeCfg comp_type_cfg;
+
+  // Partition related information.
+  PartitionCfg part_cfg;
+
+  // Configuration related to frequency of cost update.
+  CostUpdateFreq cost_upd_freq;
 
 #if CONFIG_DENOISE
+  // Indicates the noise level.
   float noise_level;
+  // Indicates the the denoisers block size.
   int noise_block_size;
+  // Indicates whether to apply denoising to the frame to be encoded
+  int enable_dnl_denoising;
 #endif
 
-  unsigned int chroma_subsampling_x;
-  unsigned int chroma_subsampling_y;
-  int reduced_tx_type_set;
-  int use_intra_dct_only;
-  int use_inter_dct_only;
-  int use_intra_default_tx_only;
-  int quant_b_adapt;
-  COST_UPDATE_TYPE coeff_cost_upd_freq;
-  COST_UPDATE_TYPE mode_cost_upd_freq;
-  COST_UPDATE_TYPE mv_cost_upd_freq;
-  int border_in_pixels;
-  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  // Noise sensitivity.
+  int noise_sensitivity;
+#endif
   // Bit mask to specify which tier each of the 32 possible operating points
   // conforms to.
   unsigned int tier_mask;
-  // If true, encoder will use fixed QP offsets, that are either:
-  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
-  // - Picked automatically from cq_level.
-  int use_fixed_qp_offsets;
-  // List of QP offsets for: keyframe, ALTREF, and 3 levels of internal ARFs.
-  // If any of these values are negative, fixed offsets are disabled.
-  // Uses internal q range.
-  double fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
-  // min_cr / 100 is the target minimum compression ratio for each frame.
-  unsigned int min_cr;
-  const cfg_options_t *encoder_cfg;
+
+  // Indicates the number of pixels off the edge of a reference frame we're
+  // allowed to go when forming an inter prediction.
+  int border_in_pixels;
+
+  // Indicates the maximum number of threads that may be used by the encoder.
+  int max_threads;
+
+  // Indicates the speed preset to be used.
+  int speed;
+
+  // Indicates the target sequence level index for each operating point(OP).
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+  // Indicates the bitstream profile to be used.
+  BITSTREAM_PROFILE profile;
+
+  /*!\endcond */
+  /*!
+   * Indicates the current encoder pass :
+   * AOM_RC_ONE_PASS = One pass encode,
+   * AOM_RC_FIRST_PASS = First pass of multiple-pass
+   * AOM_RC_SECOND_PASS = Second pass of multiple-pass
+   * AOM_RC_THIRD_PASS = Third pass of multiple-pass
+   */
+  enum aom_enc_pass pass;
+  /*!\cond */
+
+  // Total number of encoding passes.
+  int passes;
+
+  // the name of the second pass output file when passes > 2
+  const char *two_pass_output;
+
+  // the name of the second pass log file when passes > 2
+  const char *second_pass_log;
+
+  // Indicates if the encoding is GOOD or REALTIME.
+  MODE mode;
+
+  // Indicates if row-based multi-threading should be enabled or not.
+  bool row_mt;
+
+  // Indicates if frame parallel multi-threading should be enabled or not.
+  bool fp_mt;
+
+  // Indicates if 16bit frame buffers are to be used i.e., the content is >
+  // 8-bit.
+  bool use_highbitdepth;
+
+  // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
+  // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B
+  // format.
+  bool save_as_annexb;
+
+  // The path for partition stats reading and writing, used in the experiment
+  // CONFIG_PARTITION_SEARCH_ORDER.
+  const char *partition_info_path;
+
+  // Exit the encoder when it fails to encode to a given level.
+  int strict_level_conformance;
+  /*!\endcond */
 } AV1EncoderConfig;
 
-static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
-  return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+/*!\cond */
+static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) {
+  return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0;
 }
+/*!\endcond */
 
+/*!
+ * \brief Encoder-side probabilities for pruning of various AV1 tools
+ */
 typedef struct {
-  // obmc_probs[i][j] is the probability of OBMC being the best motion mode for
-  // jth block size and ith frame update type, averaged over past frames. If
-  // obmc_probs[i][j] < thresh, then OBMC search is pruned.
+  /*!
+   * obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+   * jth block size and ith frame update type, averaged over past frames. If
+   * obmc_probs[i][j] < thresh, then OBMC search is pruned.
+   */
   int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
 
-  // warped_probs[i] is the probability of warped motion being the best motion
-  // mode for ith frame update type, averaged over past frames. If
-  // warped_probs[i] < thresh, then warped motion search is pruned.
+  /*!
+   * warped_probs[i] is the probability of warped motion being the best motion
+   * mode for ith frame update type, averaged over past frames. If
+   * warped_probs[i] < thresh, then warped motion search is pruned.
+   */
   int warped_probs[FRAME_UPDATE_TYPES];
 
-  // tx_type_probs[i][j][k] is the probability of kth tx_type being the best
-  // for jth transform size and ith frame update type, averaged over past
-  // frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
-  // type is pruned.
+  /*!
+   * tx_type_probs[i][j][k] is the probability of kth tx_type being the best
+   * for jth transform size and ith frame update type, averaged over past
+   * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
+   * type is pruned.
+   */
   int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
 
-  // switchable_interp_probs[i][j][k] is the probability of kth interpolation
-  // filter being the best for jth filter context and ith frame update type,
-  // averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
-  // then interpolation filter search is pruned for that case.
+  /*!
+   * switchable_interp_probs[i][j][k] is the probability of kth interpolation
+   * filter being the best for jth filter context and ith frame update type,
+   * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
+   * then interpolation filter search is pruned for that case.
+   */
   int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
                              [SWITCHABLE_FILTERS];
 } FrameProbInfo;
 
+/*!\cond */
+
 typedef struct FRAME_COUNTS {
 // Note: This structure should only contain 'unsigned int' fields, or
 // aggregates built solely from 'unsigned int' fields/elements
@@ -544,7 +1179,7 @@ typedef struct FRAME_COUNTS {
   unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
   unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
   unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
-  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int skip_txfm[SKIP_CONTEXTS][2];
   unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
   unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
   unsigned int delta_q[DELTA_Q_PROBS][2];
@@ -591,35 +1226,56 @@ typedef struct {
 // TODO(angiebird): This is an estimated size. We still need to figure what is
 // the maximum number of modes.
 #define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
 typedef struct inter_modes_info {
+  /*!
+   * The number of inter modes for which data was stored in each of the
+   * following arrays.
+   */
   int num;
+  /*!
+   * Mode info struct for each of the candidate modes.
+   */
   MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+  /*!
+   * The rate for each of the candidate modes.
+   */
   int mode_rate_arr[MAX_INTER_MODES];
+  /*!
+   * The sse of the predictor for each of the candidate modes.
+   */
   int64_t sse_arr[MAX_INTER_MODES];
+  /*!
+   * The estimated rd of the predictor for each of the candidate modes.
+   */
   int64_t est_rd_arr[MAX_INTER_MODES];
+  /*!
+   * The rate and mode index for each of the candidate modes.
+   */
   RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats for each of the candidate modes.
+   */
   RD_STATS rd_cost_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of luma only for each of the candidate modes.
+   */
   RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of chroma only for each of the candidate modes.
+   */
   RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
 } InterModesInfo;
 
-// Encoder row synchronization
-typedef struct AV1RowMTSyncData {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t *mutex_;
-  pthread_cond_t *cond_;
-#endif
-  // Allocate memory to store the sb/mb block index in each row.
-  int *cur_col;
-  int sync_range;
-  int rows;
-} AV1RowMTSync;
-
-typedef struct AV1RowMTInfo {
-  int current_mi_row;
-  int num_threads_working;
-} AV1RowMTInfo;
-
+/*!\cond */
 typedef struct {
   // TODO(kyslov): consider changing to 64bit
 
@@ -669,102 +1325,388 @@ typedef struct {
   VP64x64 *split;
 } VP128x128;
 
+/*!\endcond */
+
+/*!
+ * \brief Thresholds for variance based partitioning.
+ */
 typedef struct {
-  // Thresholds for variance based partitioning. If block variance > threshold,
-  // then that block is forced to split.
-  // thresholds[0] - threshold for 128x128;
-  // thresholds[1] - threshold for 64x64;
-  // thresholds[2] - threshold for 32x32;
-  // thresholds[3] - threshold for 16x16;
-  // thresholds[4] - threshold for 8x8;
+  /*!
+   * If block variance > threshold, then that block is forced to split.
+   * thresholds[0] - threshold for 128x128;
+   * thresholds[1] - threshold for 64x64;
+   * thresholds[2] - threshold for 32x32;
+   * thresholds[3] - threshold for 16x16;
+   * thresholds[4] - threshold for 8x8;
+   */
   int64_t thresholds[5];
 
-  // MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
-  // minmax > threshold_minmax, the 16x16 is forced to split.
+  /*!
+   * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+   * minmax > threshold_minmax, the 16x16 is forced to split.
+   */
   int64_t threshold_minmax;
 } VarBasedPartitionInfo;
 
+/*!
+ * \brief Encoder parameters for synchronization of row based multi-threading
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+  /**
+   * \name Synchronization objects for top-right dependency.
+   */
+  /**@{*/
+  pthread_mutex_t *mutex_; /*!< Mutex lock object */
+  pthread_cond_t *cond_;   /*!< Condition variable */
+  /**@}*/
+#endif  // CONFIG_MULTITHREAD
+  /*!
+   * Buffer to store the superblock whose encoding is complete.
+   * cur_col[i] stores the number of superblocks which finished encoding in the
+   * ith superblock row.
+   */
+  int *num_finished_cols;
+  /*!
+   * Number of extra superblocks of the top row to be complete for encoding
+   * of the current superblock to start. A value of 1 indicates top-right
+   * dependency.
+   */
+  int sync_range;
+  /*!
+   * Number of superblock rows.
+   */
+  int rows;
+  /*!
+   * The superblock row (in units of MI blocks) to be processed next.
+   */
+  int next_mi_row;
+  /*!
+   * Number of threads processing the current tile.
+   */
+  int num_threads_working;
+} AV1EncRowMultiThreadSync;
+
+/*!\cond */
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
-  CFL_CTX cfl;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   FRAME_CONTEXT *row_ctx;
+  uint64_t abs_sum_level;
   uint8_t allow_update_cdf;
   InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-  AV1RowMTSync row_mt_sync;
-  AV1RowMTInfo row_mt_info;
+  AV1EncRowMultiThreadSync row_mt_sync;
+  MV firstpass_top_mv;
 } TileDataEnc;
 
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-  unsigned int count;
-} TOKENLIST;
-
-typedef struct MultiThreadHandle {
-  int allocated_tile_rows;
-  int allocated_tile_cols;
-  int allocated_sb_rows;
-  int thread_id_to_tile_id[MAX_NUM_THREADS];  // Mapping of threads to tiles
-} MultiThreadHandle;
-
 typedef struct RD_COUNTS {
-  int64_t comp_pred_diff[REFERENCE_MODES];
-  // Stores number of 4x4 blocks using global motion per reference frame.
-  int global_motion_used[REF_FRAMES];
   int compound_ref_used_flag;
   int skip_mode_used_flag;
   int tx_type_used[TX_SIZES_ALL][TX_TYPES];
   int obmc_used[BLOCK_SIZES_ALL][2];
   int warped_used[2];
+  int newmv_or_intra_blocks;
+  uint64_t seg_tmp_pred_cost[2];
 } RD_COUNTS;
 
 typedef struct ThreadData {
   MACROBLOCK mb;
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
-  PC_TREE *pc_tree;
-  PC_TREE *pc_root;
-  tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
-  tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
-  tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
-  InterModesInfo *inter_modes_info;
+  PC_TREE_SHARED_BUFFERS shared_coeff_buf;
+  SIMPLE_MOTION_DATA_TREE *sms_tree;
+  SIMPLE_MOTION_DATA_TREE *sms_root;
   uint32_t *hash_value_buffer[2][2];
-  int32_t *wsrc_buf;
-  int32_t *mask_buf;
-  uint8_t *above_pred_buf;
-  uint8_t *left_pred_buf;
+  OBMCBuffer obmc_buffer;
   PALETTE_BUFFER *palette_buffer;
   CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
-  uint8_t *tmp_obmc_bufs[2];
+  uint64_t abs_sum_level;
+  uint8_t *tmp_pred_bufs[2];
   int intrabc_used;
   int deltaq_used;
+  int coefficient_size;
+  int max_mv_magnitude;
+  int interp_filter_selected[SWITCHABLE];
   FRAME_CONTEXT *tctx;
-  MB_MODE_INFO_EXT *mbmi_ext;
   VP64x64 *vt64x64;
   int32_t num_64x64_blocks;
+  PICK_MODE_CONTEXT *firstpass_ctx;
+  TemporalFilterData tf_data;
+  TplTxfmStats tpl_txfm_stats;
+  // Pointer to the array of structures to store gradient information of each
+  // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level
+  // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+  PixelLevelGradientInfo *pixel_gradient_info;
+  // Pointer to the array of structures to store source variance information of
+  // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+  // store source variance and log of source variance of each 4x4 sub-block
+  // for subsequent retrieval.
+  Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
 } ThreadData;
 
 struct EncWorkerData;
 
+/*!\endcond */
+
+/*!
+ * \brief Encoder data related to row-based multi-threading
+ */
+typedef struct {
+  /*!
+   * Number of tile rows for which row synchronization memory is allocated.
+   */
+  int allocated_tile_rows;
+  /*!
+   * Number of tile cols for which row synchronization memory is allocated.
+   */
+  int allocated_tile_cols;
+  /*!
+   * Number of rows for which row synchronization memory is allocated
+   * per tile. During first-pass/look-ahead stage this equals the
+   * maximum number of macroblock rows in a tile. During encode stage,
+   * this equals the maximum number of superblock rows in a tile.
+   */
+  int allocated_rows;
+  /*!
+   * Number of columns for which entropy context memory is allocated
+   * per tile. During encode stage, this equals the maximum number of
+   * superblock columns in a tile minus 1. The entropy context memory
+   * is not allocated during first-pass/look-ahead stage.
+   */
+  int allocated_cols;
+
+  /*!
+   * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread.
+   */
+  int thread_id_to_tile_id[MAX_NUM_THREADS];
+
+#if CONFIG_MULTITHREAD
+  /*!
+   * Mutex lock used while dispatching jobs.
+   */
+  pthread_mutex_t *mutex_;
+#endif
+
+  /**
+   * \name Row synchronization related function pointers.
+   */
+  /**@{*/
+  /*!
+   * Reader.
+   */
+  void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+  /*!
+   * Writer.
+   */
+  void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+  /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+  /*!
+   * Backup of original CDEF srcbuf.
+   */
+  uint16_t *cdef_srcbuf;
+
+  /*!
+   * Backup of original CDEF colbuf.
+   */
+  uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+  /*!
+   * Backup of original LR rst_tmpbuf.
+   */
+  int32_t *rst_tmpbuf;
+
+  /*!
+   * Backup of original LR rlbs.
+   */
+  RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+  /*!
+   * Number of workers created for multi-threading.
+   */
+  int num_workers;
+
+  /*!
+   * Number of workers used for different MT modules.
+   */
+  int num_mod_workers[NUM_MT_MODULES];
+
+  /*!
+   * Synchronization object used to launch job in the worker thread.
+   */
+  AVxWorker *workers;
+
+  /*!
+   * Data specific to each worker in encoder multi-threading.
+   * tile_thr_data[i] stores the worker data of the ith thread.
+   */
+  struct EncWorkerData *tile_thr_data;
+
+  /*!
+   * CDEF row multi-threading data.
+   */
+  AV1CdefWorkerData *cdef_worker;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Primary(Level 1) Synchronization object used to launch job in the worker
+   * thread.
+   */
+  AVxWorker *p_workers[MAX_PARALLEL_FRAMES];
+
+  /*!
+   * Number of primary workers created for multi-threading.
+   */
+  int p_num_workers;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+} PrimaryMultiThreadInfo;
+
+/*!
+ * \brief Encoder parameters related to multi-threading.
+ */
+typedef struct MultiThreadInfo {
+  /*!
+   * Number of workers created for multi-threading.
+   */
+  int num_workers;
+
+  /*!
+   * Number of workers used for different MT modules.
+   */
+  int num_mod_workers[NUM_MT_MODULES];
+
+  /*!
+   * Synchronization object used to launch job in the worker thread.
+   */
+  AVxWorker *workers;
+
+  /*!
+   * Data specific to each worker in encoder multi-threading.
+   * tile_thr_data[i] stores the worker data of the ith thread.
+   */
+  struct EncWorkerData *tile_thr_data;
+
+  /*!
+   * When set, indicates that row based multi-threading of the encoder is
+   * enabled.
+   */
+  bool row_mt_enabled;
+
+  /*!
+   * When set, indicates that multi-threading for bitstream packing is enabled.
+   */
+  bool pack_bs_mt_enabled;
+
+  /*!
+   * Encoder row multi-threading data.
+   */
+  AV1EncRowMultiThreadInfo enc_row_mt;
+
+  /*!
+   * Tpl row multi-threading data.
+   */
+  AV1TplRowMultiThreadInfo tpl_row_mt;
+
+  /*!
+   * Loop Filter multi-threading object.
+   */
+  AV1LfSync lf_row_sync;
+
+  /*!
+   * Loop Restoration multi-threading object.
+   */
+  AV1LrSync lr_row_sync;
+
+  /*!
+   * Pack bitstream multi-threading object.
+   */
+  AV1EncPackBSSync pack_bs_sync;
+
+  /*!
+   * Global Motion multi-threading object.
+   */
+  AV1GlobalMotionSync gm_sync;
+
+  /*!
+   * Temporal Filter multi-threading object.
+   */
+  AV1TemporalFilterSync tf_sync;
+
+  /*!
+   * CDEF search multi-threading object.
+   */
+  AV1CdefSync cdef_sync;
+
+  /*!
+   * Pointer to CDEF row multi-threading data for the frame.
+   */
+  AV1CdefWorkerData *cdef_worker;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Buffers to be stored/restored before/after parallel encode.
+   */
+  RestoreStateBuffers restore_state_buf;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+} MultiThreadInfo;
+
+/*!\cond */
+
 typedef struct ActiveMap {
   int enabled;
   int update;
   unsigned char *map;
 } ActiveMap;
 
+/*!\endcond */
+
+/*!
+ * \brief Encoder info used for decision on forcing integer motion vectors.
+ */
 typedef struct {
-  // cs_rate_array[i] is the fraction of blocks in a frame which either match
-  // with the collocated block or are smooth, where i is the rate_index.
+  /*!
+   * cs_rate_array[i] is the fraction of blocks in a frame which either match
+   * with the collocated block or are smooth, where i is the rate_index.
+   */
   double cs_rate_array[32];
-  // rate_index is used to index cs_rate_array.
+  /*!
+   * rate_index is used to index cs_rate_array.
+   */
   int rate_index;
-  // rate_size is the total number of entries populated in cs_rate_array.
+  /*!
+   * rate_size is the total number of entries populated in cs_rate_array.
+   */
   int rate_size;
 } ForceIntegerMVInfo;
 
+/*!\cond */
+
 #if CONFIG_INTERNAL_STATS
 // types of stats
 enum {
@@ -786,33 +1728,53 @@ typedef struct {
   YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;
 
+/*!\endcond */
+
+/*!
+ * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level
+ *
+ * This is used for bitstream preparation.
+ */
 typedef struct {
-  // Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level for
-  // use in bitstream preparation. frame_base[mi_row * stride + mi_col] stores
-  // the mode information of block (mi_row,mi_col).
+  /*!
+   * frame_base[mi_row * stride + mi_col] stores the mode information of
+   * block (mi_row,mi_col).
+   */
   MB_MODE_INFO_EXT_FRAME *frame_base;
-  // Size of frame_base buffer.
+  /*!
+   * Size of frame_base buffer.
+   */
   int alloc_size;
-  // Stride of frame_base buffer.
+  /*!
+   * Stride of frame_base buffer.
+   */
   int stride;
 } MBMIExtFrameBufferInfo;
 
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-typedef struct PartitionStats {
+/*!\cond */
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct FramePartitionTimingStats {
   int partition_decisions[6][EXT_PARTITION_TYPES];
   int partition_attempts[6][EXT_PARTITION_TYPES];
   int64_t partition_times[6][EXT_PARTITION_TYPES];
 
   int partition_redo;
-} PartitionStats;
-#endif
+} FramePartitionTimingStats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
 #include "aom_ports/aom_timer.h"
 // Adjust the following to add new components.
 enum {
+  av1_encode_strategy_time,
+  av1_get_one_pass_rt_params_time,
+  av1_get_second_pass_params_time,
+  denoise_and_encode_time,
+  apply_filtering_time,
+  av1_tpl_setup_stats_time,
   encode_frame_to_data_rate_time,
-  encode_with_recode_loop_time,
+  encode_with_or_without_recode_time,
   loop_filter_time,
   cdef_time,
   loop_restoration_time,
@@ -820,25 +1782,58 @@ enum {
   av1_encode_frame_time,
   av1_compute_global_motion_time,
   av1_setup_motion_field_time,
-  encode_sb_time,
+  encode_sb_row_time,
+
   rd_pick_partition_time,
+  rd_use_partition_time,
+  choose_var_based_partitioning_time,
+  av1_prune_partitions_time,
+  none_partition_search_time,
+  split_partition_search_time,
+  rectangular_partition_search_time,
+  ab_partitions_search_time,
+  rd_pick_4partition_time,
+  encode_sb_time,
+
   rd_pick_sb_modes_time,
   av1_rd_pick_intra_mode_sb_time,
   av1_rd_pick_inter_mode_sb_time,
-  handle_intra_mode_time,
+  set_params_rd_pick_inter_mode_time,
+  skip_inter_mode_time,
+  handle_inter_mode_time,
+  evaluate_motion_mode_for_winner_candidates_time,
   do_tx_search_time,
+  handle_intra_mode_time,
+  refine_winner_mode_tx_time,
+  av1_search_palette_mode_time,
   handle_newmv_time,
   compound_type_rd_time,
   interpolation_filter_search_time,
   motion_mode_rd_time,
+
+  nonrd_use_partition_time,
+  pick_sb_modes_nonrd_time,
+  hybrid_intra_mode_search_time,
+  nonrd_pick_inter_mode_sb_time,
+  encode_b_nonrd_time,
+
   kTimingComponents,
 } UENUM1BYTE(TIMING_COMPONENT);
 
 static INLINE char const *get_component_name(int index) {
   switch (index) {
+    case av1_encode_strategy_time: return "av1_encode_strategy_time";
+    case av1_get_one_pass_rt_params_time:
+      return "av1_get_one_pass_rt_params_time";
+    case av1_get_second_pass_params_time:
+      return "av1_get_second_pass_params_time";
+    case denoise_and_encode_time: return "denoise_and_encode_time";
+    case apply_filtering_time: return "apply_filtering_time";
+    case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time";
     case encode_frame_to_data_rate_time:
       return "encode_frame_to_data_rate_time";
-    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case encode_with_or_without_recode_time:
+      return "encode_with_or_without_recode_time";
     case loop_filter_time: return "loop_filter_time";
     case cdef_time: return "cdef_time";
     case loop_restoration_time: return "loop_restoration_time";
@@ -847,20 +1842,48 @@ static INLINE char const *get_component_name(int index) {
     case av1_compute_global_motion_time:
       return "av1_compute_global_motion_time";
     case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
-    case encode_sb_time: return "encode_sb_time";
+    case encode_sb_row_time: return "encode_sb_row_time";
+
     case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_use_partition_time: return "rd_use_partition_time";
+    case choose_var_based_partitioning_time:
+      return "choose_var_based_partitioning_time";
+    case av1_prune_partitions_time: return "av1_prune_partitions_time";
+    case none_partition_search_time: return "none_partition_search_time";
+    case split_partition_search_time: return "split_partition_search_time";
+    case rectangular_partition_search_time:
+      return "rectangular_partition_search_time";
+    case ab_partitions_search_time: return "ab_partitions_search_time";
+    case rd_pick_4partition_time: return "rd_pick_4partition_time";
+    case encode_sb_time: return "encode_sb_time";
+
     case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
     case av1_rd_pick_intra_mode_sb_time:
       return "av1_rd_pick_intra_mode_sb_time";
     case av1_rd_pick_inter_mode_sb_time:
       return "av1_rd_pick_inter_mode_sb_time";
-    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case set_params_rd_pick_inter_mode_time:
+      return "set_params_rd_pick_inter_mode_time";
+    case skip_inter_mode_time: return "skip_inter_mode_time";
+    case handle_inter_mode_time: return "handle_inter_mode_time";
+    case evaluate_motion_mode_for_winner_candidates_time:
+      return "evaluate_motion_mode_for_winner_candidates_time";
     case do_tx_search_time: return "do_tx_search_time";
+    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time";
+    case av1_search_palette_mode_time: return "av1_search_palette_mode_time";
     case handle_newmv_time: return "handle_newmv_time";
     case compound_type_rd_time: return "compound_type_rd_time";
     case interpolation_filter_search_time:
       return "interpolation_filter_search_time";
     case motion_mode_rd_time: return "motion_mode_rd_time";
+
+    case nonrd_use_partition_time: return "nonrd_use_partition_time";
+    case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time";
+    case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time";
+    case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time";
+    case encode_b_nonrd_time: return "encode_b_nonrd_time";
+
     default: assert(0);
   }
   return "error";
@@ -870,146 +1893,299 @@ static INLINE char const *get_component_name(int index) {
 // The maximum number of internal ARFs except ALTREF_FRAME
 #define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
 
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to global motion search
+ */
 typedef struct {
-  // Array to store the cost for signalling each global motion model.
-  // gmtype_cost[i] stores the cost of signalling the ith Global Motion model.
-  int type_cost[TRANS_TYPES];
+  /*!
+   * Flag to indicate if global motion search needs to be rerun.
+   */
+  bool search_done;
 
-  // Array to store the cost for signalling a particular global motion model for
-  // each reference frame. gmparams_cost[i] stores the cost of signalling global
-  // motion for the ith reference frame.
-  int params_cost[REF_FRAMES];
+  /*!
+   * Array of pointers to the frame buffers holding the reference frames.
+   * ref_buf[i] stores the pointer to the reference frame of the ith
+   * reference frame type.
+   */
+  YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
 
-  // Flag to indicate if global motion search needs to be rerun.
-  bool search_done;
+  /*!
+   * Pointer to the source frame buffer.
+   */
+  unsigned char *src_buffer;
+
+  /*!
+   * Holds the number of valid reference frames in past and future directions
+   * w.r.t. the current frame. num_ref_frames[i] stores the total number of
+   * valid reference frames in 'i' direction.
+   */
+  int num_ref_frames[MAX_DIRECTIONS];
+
+  /*!
+   * Array of structure which stores the valid reference frames in past and
+   * future directions and their corresponding distance from the source frame.
+   * reference_frames[i][j] holds the jth valid reference frame type in the
+   * direction 'i' and its temporal distance from the source frame .
+   */
+  FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+  /**
+   * \name Dimensions for which segment map is allocated.
+   */
+  /**@{*/
+  int segment_map_w; /*!< segment map width */
+  int segment_map_h; /*!< segment map height */
+  /**@}*/
+
+  /*!
+   * Holds the total number of corner points detected in the source frame.
+   */
+  int num_src_corners;
+
+  /*!
+   * Holds the x and y co-ordinates of the corner points detected in the source
+   * frame. src_corners[i] holds the x co-ordinate and src_corners[i+1] holds
+   * the y co-ordinate of the ith corner point detected.
+   */
+  int src_corners[2 * MAX_CORNERS];
 } GlobalMotionInfo;
 
+/*!
+ * \brief Initial frame dimensions
+ *
+ * Tracks the frame dimensions using which:
+ *  - Frame buffers (like altref and util frame buffers) were allocated
+ *  - Motion estimation related initializations were done
+ * This structure is helpful to reallocate / reinitialize the above when there
+ * is a change in frame dimensions.
+ */
 typedef struct {
-  // Stores the default value of skip flag depending on chroma format
-  // Set as 1 for monochrome and 3 for other color formats
+  int width;  /*!< initial width */
+  int height; /*!< initial height */
+} InitialDimensions;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+  /*!
+   * Stores the default value of skip flag depending on chroma format
+   * Set as 1 for monochrome and 3 for other color formats
+   */
   int default_interp_skip_flags;
-  // Filter mask to allow certain interp_filter type.
+  /*!
+   * Filter mask to allow certain interp_filter type.
+   */
   uint16_t interp_filter_search_mask;
 } InterpSearchFlags;
 
+/*!
+ * \brief Parameters for motion vector search process
+ */
 typedef struct {
-  // Largest MV component used in a frame.
-  // The value from the previous frame is used to set the full pixel search
-  // range for the current frame.
+  /*!
+   * Largest MV component used in a frame.
+   * The value from the previous frame is used to set the full pixel search
+   * range for the current frame.
+   */
   int max_mv_magnitude;
-  // Parameter indicating initial search window to be used in full-pixel search.
-  // Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+  /*!
+   * Parameter indicating initial search window to be used in full-pixel search.
+   * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+   */
   int mv_step_param;
-  // Pointer to sub-pixel search function.
-  // In encoder: av1_find_best_sub_pixel_tree
-  //             av1_find_best_sub_pixel_tree_pruned
-  //             av1_find_best_sub_pixel_tree_pruned_more
-  //             av1_find_best_sub_pixel_tree_pruned_evenmore
-  // In MV unit test: av1_return_max_sub_pixel_mv
-  //                  av1_return_min_sub_pixel_mv
+  /*!
+   * Pointer to sub-pixel search function.
+   * In encoder: av1_find_best_sub_pixel_tree
+   *             av1_find_best_sub_pixel_tree_pruned
+   *             av1_find_best_sub_pixel_tree_pruned_more
+   * In MV unit test: av1_return_max_sub_pixel_mv
+   *                  av1_return_min_sub_pixel_mv
+   */
   fractional_mv_step_fp *find_fractional_mv_step;
-  // Search site configuration for full-pel MV search.
-  // ss_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple motion
-  // search.
-  // ss_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter
-  // ss_cfg[SS_CFG_FPF]: Used during first pass and lookahead
-  search_site_config ss_cfg[SS_CFG_TOTAL];
+  /*!
+   * Search site configuration for full-pel MV search.
+   * search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple
+   * motion search. search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal
+   * filter search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead
+   */
+  search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
 } MotionVectorSearchParams;
 
+/*!
+ * \brief Refresh frame flags for different type of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+  bool golden_frame;  /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+  bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
 typedef struct {
-  // When resize is triggered externally, the desired dimensions are stored in
-  // this struct until used in the next frame to be coded. These values are
-  // effective only for one frame and are reset after they are used.
-  int width;
-  int height;
+  int width;  /*!< Desired resized width */
+  int height; /*!< Desired resized height */
 } ResizePendingParams;
 
+/*!
+ * \brief Refrence frame distance related variables.
+ */
 typedef struct {
-  // Threshold of transform domain distortion
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation.
-  // Index 2: Winner mode evaluation.
-  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
-  // speed feature is ON
-  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+  /*!
+   * True relative distance of reference frames w.r.t. the current frame.
+   */
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+  /*!
+   * The nearest reference w.r.t. current frame in the past.
+   */
+  int8_t nearest_past_ref;
+  /*!
+   * The nearest reference w.r.t. current frame in the future.
+   */
+  int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the  "winner mode". In the second pass, we perform a more through tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable
+ * (Eg : IntraBc).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation
+ * Index 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+  /*!
+   * Threshold to determine if trellis optimization is to be enabled
+   * based on :
+   * 0 : dist threshold
+   * 1 : satd threshold
+   * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+   */
+  unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
 
-  // Factor to control R-D optimization of coeffs based on block
-  // mse.
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc). Index 1: Mode evaluation.
-  // Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
-  // feature is ON
-  unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
-
-  // Transform size to be used in transform search
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
-  // feature is ON
+  /*!
+   * Determines the tx size search method during rdopt.
+   * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+   */
   TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
 
-  // Transform domain distortion levels
-  // Index 0: Default mode evaluation, Winner mode processing is not applicable
-  // (Eg : IntraBc).
-  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
-  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
-  // speed feature is ON
+  /*!
+   * Controls how often we should approximate prediction error with tx
+   * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+   * search only. If 2, then always.
+   * Corresponds to tx_domain_dist_level speed feature.
+   */
   unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
 
-  // Predict transform skip levels to be used for default, mode and winner mode
-  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
-  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
-  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+  /*!
+   * Threshold to approximate pixel domain distortion with transform domain
+   * distortion. This is only used if use_transform_domain_distortion is on.
+   * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+   */
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  /*!
+   * Controls how often we should try to skip the transform process based on
+   * result from dct.
+   * Corresponds to use_skip_flag_prediction speed feature.
+   */
+  unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+  /*!
+   * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+   * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+   * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+   */
+  unsigned int predict_dc_level[MODE_EVAL_TYPES];
 } WinnerModeParams;
 
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
 typedef struct {
-  // Bit mask to disable certain reference frame types.
-  int ref_frame_flags;
+  bool last_frame;     /*!< Refresh flag for last frame */
+  bool golden_frame;   /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame;  /*!< Refresh flag for bwd-ref frame */
+  bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+  bool alt_ref_frame;  /*!< Refresh flag for alt-ref frame */
+  /*!
+   * Flag indicating if the update of refresh frame flags is pending.
+   */
+  bool update_pending;
+} ExtRefreshFrameFlagsInfo;
 
-  // Flags to determine which reference buffers are refreshed by this frame.
-  // When set, the encoder will update the particular reference frame buffer
-  // with the contents of the current frame.
-  bool refresh_last_frame;
-  bool refresh_golden_frame;
-  bool refresh_bwd_ref_frame;
-  bool refresh_alt2_ref_frame;
-  bool refresh_alt_ref_frame;
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+  /*!
+   * Bit mask to disable certain reference frame types.
+   */
+  int ref_frame_flags;
 
-  // Flag to indicate that updation of refresh frame flags from external
-  // interface is pending.
-  bool refresh_frame_flags_pending;
+  /*!
+   * Frame refresh flags set by the external interface.
+   */
+  ExtRefreshFrameFlagsInfo refresh_frame;
 
-  // Flag to enable the updation of frame contexts at the end of a frame decode.
+  /*!
+   * Flag to enable the update of frame contexts at the end of a frame decode.
+   */
   bool refresh_frame_context;
 
-  // Flag to indicate that updation of refresh_frame_context from external
-  // interface is pending.
+  /*!
+   * Flag to indicate that update of refresh_frame_context from external
+   * interface is pending.
+   */
   bool refresh_frame_context_pending;
 
-  // Flag to enable temporal MV prediction.
+  /*!
+   * Flag to enable temporal MV prediction.
+   */
   bool use_ref_frame_mvs;
 
-  // Flag to code the frame as error-resilient.
+  /*!
+   * Indicates whether the current frame is to be coded as error resilient.
+   */
   bool use_error_resilient;
 
-  // Flag to code the frame as s-frame.
+  /*!
+   * Indicates whether the current frame is to be coded as s-frame.
+   */
   bool use_s_frame;
 
-  // Flag to set the frame's primary_ref_frame to PRIMARY_REF_NONE.
+  /*!
+   * Indicates whether the current frame's primary_ref_frame is set to
+   * PRIMARY_REF_NONE.
+   */
   bool use_primary_ref_none;
 } ExternalFlags;
 
-typedef struct {
-  int arf_stack[FRAME_BUFFERS];
-  int arf_stack_size;
-  int lst_stack[FRAME_BUFFERS];
-  int lst_stack_size;
-  int gld_stack[FRAME_BUFFERS];
-  int gld_stack_size;
-} RefBufferStack;
+/*!\cond */
 
 typedef struct {
   // Some misc info
@@ -1039,6 +2215,25 @@ typedef struct {
   int valid;
 } MV_STATS;
 
+typedef struct WeberStats {
+  int64_t mb_wiener_variance;
+  int64_t src_variance;
+  int64_t rec_variance;
+  int16_t src_pix_max;
+  int16_t rec_pix_max;
+  int64_t distortion;
+  int64_t satd;
+  double max_scale;
+} WeberStats;
+
+typedef struct {
+  struct loopfilter lf;
+  CdefInfo cdef_info;
+  YV12_BUFFER_CONFIG copy_buffer;
+  RATE_CONTROL rc;
+  MV_STATS mv_stats;
+} CODING_CONTEXT;
+
 typedef struct {
   int frame_width;
   int frame_height;
@@ -1052,341 +2247,1162 @@ typedef struct {
   int subsampling_y;
 } FRAME_INFO;
 
+/*!
+ * \brief This structure stores different types of frame indices.
+ */
+typedef struct {
+  int show_frame_count;
+} FRAME_INDEX_SET;
+
+/*!\endcond */
+
+/*!
+ * \brief Segmentation related information for the current frame.
+ */
 typedef struct {
-  // 3-bit number containing the segment affiliation for each 4x4 block in the
-  // frame. map[y * stride + x] contains the segment id of the 4x4 block at
-  // (x,y) position.
+  /*!
+   * 3-bit number containing the segment affiliation for each 4x4 block in the
+   * frame. map[y * stride + x] contains the segment id of the 4x4 block at
+   * (x,y) position.
+   */
   uint8_t *map;
-  // Flag to indicate if current frame has lossless segments or not.
-  // 1: frame has at least one lossless segment.
-  // 0: frame has no lossless segments.
+  /*!
+   * Flag to indicate if current frame has lossless segments or not.
+   * 1: frame has at least one lossless segment.
+   * 0: frame has no lossless segments.
+   */
   bool has_lossless_segment;
 } EncSegmentationInfo;
 
+/*!
+ * \brief Frame time stamps.
+ */
 typedef struct {
-  // Start time stamp of the previous frame
-  int64_t prev_start_seen;
-  // End time stamp of the previous frame
-  int64_t prev_end_seen;
-  // Start time stamp of the first frame
-  int64_t first_ever;
+  /*!
+   * Start time stamp of the previous frame
+   */
+  int64_t prev_ts_start;
+  /*!
+   * End time stamp of the previous frame
+   */
+  int64_t prev_ts_end;
+  /*!
+   * Start time stamp of the first frame
+   */
+  int64_t first_ts_start;
 } TimeStamps;
 
+/*!
+ * Pointers to the memory allocated for frame level transform coeff related
+ * info.
+ */
+typedef struct {
+  /*!
+   * Pointer to the transformed coefficients buffer.
+   */
+  tran_low_t *tcoeff;
+  /*!
+   * Pointer to the eobs buffer.
+   */
+  uint16_t *eobs;
+  /*!
+   * Pointer to the entropy_ctx buffer.
+   */
+  uint8_t *entropy_ctx;
+} CoeffBufferPool;
+
+/*!
+ * \brief Structure to hold data corresponding to an encoded frame.
+ */
+typedef struct AV1_COMP_DATA {
+  /*!
+   * Buffer to store packed bitstream data of a frame.
+   */
+  unsigned char *cx_data;
+
+  /*!
+   * Allocated size of the cx_data buffer.
+   */
+  size_t cx_data_sz;
+
+  /*!
+   * Size of data written in the cx_data buffer.
+   */
+  size_t frame_size;
+
+  /*!
+   * Flags for the frame.
+   */
+  unsigned int lib_flags;
+
+  /*!
+   * Time stamp for start of frame.
+   */
+  int64_t ts_frame_start;
+
+  /*!
+   * Time stamp for end of frame.
+   */
+  int64_t ts_frame_end;
+
+  /*!
+   * Flag to indicate flush call.
+   */
+  int flush;
+
+  /*!
+   * Time base for sequence.
+   */
+  const aom_rational64_t *timestamp_ratio;
+
+  /*!
+   * Decide to pop the source for this frame from input buffer queue.
+   */
+  int pop_lookahead;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Display order hint of frame whose packed data is in cx_data buffer.
+   */
+  int frame_display_order_hint;
+#endif
+} AV1_COMP_DATA;
+
+/*!
+ * \brief Top level primary encoder structure
+ */
+typedef struct AV1_PRIMARY {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Array of frame level encoder stage top level structures
+   */
+  struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES];
+
+  /*!
+   * Array of structures to hold data of frames encoded in a given parallel
+   * encode set.
+   */
+  struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1];
+
+#if CONFIG_FPMT_TEST
+  /*!
+   * Flag which enables/disables simulation path for fpmt unit test.
+   * 0 - FPMT integration
+   * 1 - FPMT simulation
+   */
+  FPMT_TEST_ENC_CFG fpmt_unit_test_cfg;
+
+  /*!
+   * Temporary variable simulating the delayed frame_probability update.
+   */
+  FrameProbInfo temp_frame_probs;
+
+  /*!
+   * Temporary variable holding the updated frame probability across
+   * frames. Copy its value to temp_frame_probs for frame_parallel_level 0
+   * frames or last frame in parallel encode set.
+   */
+  FrameProbInfo temp_frame_probs_simulation;
+
+  /*!
+   * Temporary variable simulating the delayed update of valid global motion
+   * model across frames.
+   */
+  int temp_valid_gm_model_found[FRAME_UPDATE_TYPES];
+#endif
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  /*!
+   * Copy of cm->ref_frame_map maintained to facilitate sequential update of
+   * ref_frame_map by lower layer depth frames encoded ahead of time in a
+   * parallel encode set.
+   */
+  RefCntBuffer *ref_frame_map_copy[REF_FRAMES];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  /*!
+   * Start time stamp of the last encoded show frame
+   */
+  int64_t ts_start_last_show_frame;
+
+  /*!
+   * End time stamp of the last encoded show frame
+   */
+  int64_t ts_end_last_show_frame;
+
+  /*!
+   * Number of frame level contexts(cpis)
+   */
+  int num_fp_contexts;
+
+  /*!
+   * Loopfilter levels of the previous encoded frame.
+   */
+  int filter_level[2];
+
+  /*!
+   * Chrominance component loopfilter level of the previous encoded frame.
+   */
+  int filter_level_u;
+
+  /*!
+   * Chrominance component loopfilter level of the previous encoded frame.
+   */
+  int filter_level_v;
+
+  /*!
+   * Encode stage top level structure
+   * When CONFIG_FRAME_PARALLEL_ENCODE is enabled this is the same as
+   * parallel_cpi[0]
+   */
+  struct AV1_COMP *cpi;
+
+  /*!
+   * Lookahead processing stage top level structure
+   */
+  struct AV1_COMP *cpi_lap;
+
+  /*!
+   * Look-ahead context.
+   */
+  struct lookahead_ctx *lookahead;
+
+  /*!
+   * Sequence parameters have been transmitted already and locked
+   * or not. Once locked av1_change_config cannot change the seq
+   * parameters.
+   */
+  int seq_params_locked;
+
+  /*!
+   * Pointer to internal utility functions that manipulate aom_codec_* data
+   * structures.
+   */
+  struct aom_codec_pkt_list *output_pkt_list;
+
+  /*!
+   * When set, indicates that internal ARFs are enabled.
+   */
+  int internal_altref_allowed;
+
+  /*!
+   * Tell if OVERLAY frame shows existing alt_ref frame.
+   */
+  int show_existing_alt_ref;
+
+  /*!
+   * Information related to a gf group.
+   */
+  GF_GROUP gf_group;
+
+  /*!
+   * Track prior gf group state.
+   */
+  GF_STATE gf_state;
+
+  /*!
+   * Flag indicating whether look ahead processing (LAP) is enabled.
+   */
+  int lap_enabled;
+
+  /*!
+   * Parameters for AV1 bitstream levels.
+   */
+  AV1LevelParams level_params;
+
+  /*!
+   * Calculates PSNR on each frame when set to 1.
+   */
+  int b_calculate_psnr;
+
+  /*!
+   * Number of frames left to be encoded, is 0 if limit is not set.
+   */
+  int frames_left;
+
+  /*!
+   * Information related to two pass encoding.
+   */
+  TWO_PASS twopass;
+
+  /*!
+   * Rate control related parameters.
+   */
+  PRIMARY_RATE_CONTROL p_rc;
+
+  /*!
+   * Info and resources used by temporal filtering.
+   */
+  TEMPORAL_FILTER_INFO tf_info;
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
+  SequenceHeader seq_params;
+
+  /*!
+   * Indicates whether to use SVC.
+   */
+  int use_svc;
+
+  /*!
+   * If true, buffer removal times are present.
+   */
+  bool buffer_removal_time_present;
+
+  /*!
+   * Number of temporal layers: may be > 1 for SVC (scalable vector coding).
+   */
+  unsigned int number_temporal_layers;
+
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+   */
+  unsigned int number_spatial_layers;
+
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info error;
+
+  /*!
+   * Function pointers to variants of sse/sad/variance computation functions.
+   * fn_ptr[i] indicates the list of function pointers corresponding to block
+   * size i.
+   */
+  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+  /*!
+   * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order.
+   */
+  double *tpl_sb_rdmult_scaling_factors;
+
+  /*!
+   * Parameters related to tpl.
+   */
+  TplParams tpl_data;
+
+  /*!
+   * Motion vector stats of the previous encoded frame.
+   */
+  MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+  /*!\cond */
+  uint64_t total_time_receive_data;
+  uint64_t total_time_compress_data;
+
+  unsigned int total_mode_chosen_counts[MAX_MODES];
+
+  int count[2];
+  uint64_t total_sq_error[2];
+  uint64_t total_samples[2];
+  ImageStat psnr[2];
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int total_bytes;
+  double summed_quality;
+  double summed_weights;
+  double summed_quality_hbd;
+  double summed_weights_hbd;
+  unsigned int total_recode_hits;
+  double worst_ssim;
+  double worst_ssim_hbd;
+
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_blockiness;
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+  /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+  /*!
+   * Aggregates frame counts for the sequence.
+   */
+  FRAME_COUNTS aggregate_fc;
+#endif  // CONFIG_ENTROPY_STATS
+
+  /*!
+   * For each type of reference frame, this contains the index of a reference
+   * frame buffer for a reference frame of the same type.  We use this to
+   * choose our primary reference frame (which is the most recent reference
+   * frame of the same type as the current frame).
+   */
+  int fb_of_context_type[REF_FRAMES];
+
+  /*!
+   * Primary Multi-threading parameters.
+   */
+  PrimaryMultiThreadInfo p_mt_info;
+
+  /*!
+   * Probabilities for pruning of various AV1 tools.
+   */
+  FrameProbInfo frame_probs;
+
+  /*!
+   * Indicates if a valid global motion model has been found in the different
+   * frame update types of a GF group.
+   * valid_gm_model_found[i] indicates if valid global motion model has been
+   * found in the frame update type with enum value equal to i
+   */
+  int valid_gm_model_found[FRAME_UPDATE_TYPES];
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
 typedef struct AV1_COMP {
-  // Quantization and dequantization parameters for internal quantizer setup
-  // in the encoder.
+  /*!
+   * Pointer to top level primary encoder structure
+   */
+  AV1_PRIMARY *ppi;
+
+  /*!
+   * Quantization and dequantization parameters for internal quantizer setup
+   * in the encoder.
+   */
   EncQuantDequantParams enc_quant_dequant_params;
+
+  /*!
+   * Structure holding thread specific variables.
+   */
   ThreadData td;
+
+  /*!
+   * Statistics collected at frame level.
+   */
   FRAME_COUNTS counts;
 
-  // Holds buffer storing mode information at 4x4/8x8 level.
+  /*!
+   * Holds buffer storing mode information at 4x4/8x8 level.
+   */
   MBMIExtFrameBufferInfo mbmi_ext_info;
 
+  /*!
+   * Buffer holding the transform block related information.
+   * coeff_buffer_base[i] stores the transform block related information of the
+   * ith superblock in raster scan order.
+   */
   CB_COEFF_BUFFER *coeff_buffer_base;
+
+  /*!
+   * Structure holding pointers to frame level memory allocated for transform
+   * block related information.
+   */
+  CoeffBufferPool coeff_buffer_pool;
+
+  /*!
+   * Structure holding variables common to encoder and decoder.
+   */
   AV1_COMMON common;
+
+  /*!
+   * Encoder configuration related parameters.
+   */
   AV1EncoderConfig oxcf;
-  struct lookahead_ctx *lookahead;
-  int no_show_kf;
 
+  /*!
+   * Stores the trellis optimization type at segment level.
+   * optimize_seg_arr[i] stores the trellis opt type for ith segment.
+   */
   TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
 
+  /*!
+   * Pointer to the frame buffer holding the source frame to be used during the
+   * current stage of encoding. It can be the raw input, temporally filtered
+   * input or scaled input.
+   */
   YV12_BUFFER_CONFIG *source;
-  YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
+
+  /*!
+   * Pointer to the frame buffer holding the last raw source frame.
+   * last_source is NULL for the following cases:
+   * 1) First frame
+   * 2) Alt-ref frames
+   * 3) All frames for all-intra frame encoding.
+   */
+  YV12_BUFFER_CONFIG *last_source;
+
+  /*!
+   * Pointer to the frame buffer holding the unscaled source frame.
+   * It can be either the raw input or temporally filtered input.
+   */
   YV12_BUFFER_CONFIG *unscaled_source;
+
+  /*!
+   * Frame buffer holding the resized source frame (cropping / superres).
+   */
   YV12_BUFFER_CONFIG scaled_source;
+
+  /*!
+   * Pointer to the frame buffer holding the unscaled last source frame.
+   */
   YV12_BUFFER_CONFIG *unscaled_last_source;
+
+  /*!
+   * Frame buffer holding the resized last source frame.
+   */
   YV12_BUFFER_CONFIG scaled_last_source;
+
+  /*!
+   * Pointer to the original source frame. This is used to determine if the
+   * content is screen.
+   */
   YV12_BUFFER_CONFIG *unfiltered_source;
 
-  TplParams tpl_data;
+  /*!
+   * Frame buffer holding the orig source frame for PSNR calculation in rtc tf
+   * case.
+   */
+  YV12_BUFFER_CONFIG orig_source;
+
+  /*!
+   * Skip tpl setup when tpl data from gop length decision can be reused.
+   */
+  int skip_tpl_setup_stats;
+
+  /*!
+   * Scaling factors used in the RD multiplier modulation.
+   * TODO(sdeng): consider merge the following arrays.
+   * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+   * intermediate scaling factors which are used in the calculation of
+   * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+   * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+   */
+  double *tpl_rdmult_scaling_factors;
 
-  // For a still frame, this flag is set to 1 to skip partition search.
-  int partition_search_skippable_frame;
+  /*!
+   * Temporal filter context.
+   */
+  TemporalFilterCtx tf_ctx;
 
-  // Variables related to forcing integer mv decisions for the current frame.
+  /*!
+   * Variables related to forcing integer mv decisions for the current frame.
+   */
   ForceIntegerMVInfo force_intpel_info;
 
-  unsigned int row_mt;
+  /*!
+   * Pointer to the buffer holding the scaled reference frames.
+   * scaled_ref_buf[i] holds the scaled reference frame of type i.
+   */
   RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
 
-  RefCntBuffer *last_show_frame_buf;  // last show frame buffer
-
-  // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
-  // after the current frame is encoded, the XYZ reference frame gets refreshed
-  // (updated) to be the current frame.
-  //
-  // Note: Usually at most one of these refresh flags is true at a time.
-  // But a key-frame is special, for which all the flags are true at once.
-  int refresh_golden_frame;
-  int refresh_bwd_ref_frame;
-  int refresh_alt_ref_frame;
-
-  // For each type of reference frame, this contains the index of a reference
-  // frame buffer for a reference frame of the same type.  We use this to
-  // choose our primary reference frame (which is the most recent reference
-  // frame of the same type as the current frame).
-  int fb_of_context_type[REF_FRAMES];
+  /*!
+   * Pointer to the buffer holding the last show frame.
+   */
+  RefCntBuffer *last_show_frame_buf;
+
+  /*!
+   * Refresh frame flags for golden, bwd-ref and alt-ref frames.
+   */
+  RefreshFrameInfo refresh_frame;
 
-  // Flags signalled by the external interface at frame level.
+  /*!
+   * Flags signalled by the external interface at frame level.
+   */
   ExternalFlags ext_flags;
 
+  /*!
+   * Temporary frame buffer used to store the non-loop filtered reconstructed
+   * frame during the search of loop filter level.
+   */
   YV12_BUFFER_CONFIG last_frame_uf;
+
+  /*!
+   * Temporary frame buffer used to store the loop restored frame during loop
+   * restoration search.
+   */
   YV12_BUFFER_CONFIG trial_frame_rst;
 
-  // Ambient reconstruction err target for force key frames
+  /*!
+   * Ambient reconstruction err target for force key frames.
+   */
   int64_t ambient_err;
 
+  /*!
+   * Parameters related to rate distortion optimization.
+   */
   RD_OPT rd;
 
+  /*!
+   * Temporary coding context used to save and restore when encoding with and
+   * without super-resolution.
+   */
   CODING_CONTEXT coding_context;
 
-  // Parameters related to global motion search.
+  /*!
+   * Parameters related to global motion search.
+   */
   GlobalMotionInfo gm_info;
 
-  // Parameters related to winner mode processing.
+  /*!
+   * Parameters related to winner mode processing.
+   */
   WinnerModeParams winner_mode_params;
 
-  // Frame time stamps
+  /*!
+   * Frame time stamps.
+   */
   TimeStamps time_stamps;
 
+  /*!
+   * Rate control related parameters.
+   */
   RATE_CONTROL rc;
-  double framerate;
 
-  struct aom_codec_pkt_list *output_pkt_list;
+  /*!
+   * Frame rate of the video.
+   */
+  double framerate;
 
+  /*!
+   * Bitmask indicating which reference buffers may be referenced by this frame.
+   */
   int ref_frame_flags;
 
-  // speed is passed as a per-frame parameter into the encoder
+  /*!
+   * speed is passed as a per-frame parameter into the encoder.
+   */
   int speed;
-  // sf contains fine-grained config set internally based on speed
+
+  /*!
+   * sf contains fine-grained config set internally based on speed.
+   */
   SPEED_FEATURES sf;
 
-  // Parameters for motion vector search process.
+  /*!
+   * Parameters for motion vector search process.
+   */
   MotionVectorSearchParams mv_search_params;
 
+  /*!
+   * When set, indicates that all reference frames are forward references,
+   * i.e., all the reference frames are output before the current frame.
+   */
   int all_one_sided_refs;
 
-  // Segmentation related information for current frame.
+  /*!
+   * Segmentation related information for current frame.
+   */
   EncSegmentationInfo enc_seg;
 
+  /*!
+   * Parameters related to cyclic refresh aq-mode.
+   */
   CYCLIC_REFRESH *cyclic_refresh;
+  /*!
+   * Parameters related to active map. Active maps indicate
+   * if there is any activity on a 4x4 block basis.
+   */
   ActiveMap active_map;
 
-  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+  /*!
+   * The frame processing order within a GOP.
+   */
+  unsigned char gf_frame_index;
 
 #if CONFIG_INTERNAL_STATS
-  uint64_t time_receive_data;
+  /*!\cond */
   uint64_t time_compress_data;
-#endif
-
-  // number of show frames encoded in current gf_group
-  int num_gf_group_show_frames;
 
-  TWO_PASS twopass;
-
-  GF_GROUP gf_group;
-
-  // To control the reference frame buffer and selection.
-  RefBufferStack ref_buffer_stack;
-
-  YV12_BUFFER_CONFIG alt_ref_buffer;
-
-  // Tell if OVERLAY frame shows existing alt_ref frame.
-  int show_existing_alt_ref;
-
-#if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
-
-  int count;
-  uint64_t total_sq_error;
-  uint64_t total_samples;
-  ImageStat psnr;
-
-  double total_blockiness;
-  double worst_blockiness;
-
   int bytes;
-  double summed_quality;
-  double summed_weights;
-  unsigned int tot_recode_hits;
-  double worst_ssim;
-
-  ImageStat fastssim;
-  ImageStat psnrhvs;
-
-  int b_calculate_blockiness;
-  int b_calculate_consistency;
-
-  double total_inconsistency;
-  double worst_consistency;
-  Ssimv *ssim_vars;
-  Metrics metrics;
+  unsigned int frame_recode_hits;
+  /*!\endcond */
 #endif
-  int b_calculate_psnr;
+
 #if CONFIG_SPEED_STATS
+  /*!
+   * For debugging: number of transform searches we have performed.
+   */
   unsigned int tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
+  /*!
+   * When set, indicates that the frame is droppable, i.e., this frame
+   * does not update any reference buffers.
+   */
   int droppable;
 
+  /*!
+   * Stores the frame parameters during encoder initialization.
+   */
   FRAME_INFO frame_info;
 
-  int initial_width;
-  int initial_height;
-  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
-                    // normalize the firstpass stats. This will differ from the
-                    // number of MBs in the current frame when the frame is
-                    // scaled.
-  // Resize related parameters
+  /*!
+   * Stores different types of frame indices.
+   */
+  FRAME_INDEX_SET frame_index_set;
+
+  /*!
+   * Structure to store the dimensions of current frame.
+   */
+  InitialDimensions initial_dimensions;
+
+  /*!
+   * Number of MBs in the full-size frame; to be used to
+   * normalize the firstpass stats. This will differ from the
+   * number of MBs in the current frame when the frame is
+   * scaled.
+   */
+  int initial_mbs;
+
+  /*!
+   * Resize related parameters.
+   */
   ResizePendingParams resize_pending_params;
 
+  /*!
+   * Pointer to struct holding adaptive data/contexts/models for the tile during
+   * encoding.
+   */
   TileDataEnc *tile_data;
-  int allocated_tiles;  // Keep track of memory allocated for tiles.
-
-  TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
-  TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+  /*!
+   * Number of tiles for which memory has been allocated for tile_data.
+   */
+  int allocated_tiles;
 
-  // Sequence parameters have been transmitted already and locked
-  // or not. Once locked av1_change_config cannot change the seq
-  // parameters.
-  int seq_params_locked;
+  /*!
+   * Structure to store the palette token related information.
+   */
+  TokenInfo token_info;
 
-  // VARIANCE_AQ segment map refresh
+  /*!
+   * VARIANCE_AQ segment map refresh.
+   */
   int vaq_refresh;
 
-  // Thresholds for variance based partitioning.
+  /*!
+   * Thresholds for variance based partitioning.
+   */
   VarBasedPartitionInfo vbp_info;
 
-  // Probabilities for pruning of various AV1 tools.
-  FrameProbInfo frame_probs;
+  /*!
+   * Number of recodes in the frame.
+   */
+  int num_frame_recode;
 
-  // Multi-threading
-  int num_workers;
-  AVxWorker *workers;
-  struct EncWorkerData *tile_thr_data;
+  /*!
+   * Current frame probability of parallel frames, across recodes.
+   */
+  FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for transform type frame_probability calculation
+   */
+  int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for obmc frame_probability calculation
+   */
+  int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for warped motion frame_probability calculation
+   */
+  int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for interpolation filter frame_probability calculation
+   */
+  int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /*!
+   * Temporary variable for simulation.
+   * Previous frame's framerate.
+   */
+  double temp_framerate;
+#endif
+  /*!
+   * Updated framerate for the current parallel frame.
+   * cpi->framerate is updated with new_framerate during
+   * post encode updates for parallel frames.
+   */
+  double new_framerate;
+
+  /*!
+   * Retain condition for fast_extra_bits calculation.
+   */
+  int do_update_vbr_bits_off_target_fast;
+
+  /*!
+   * Multi-threading parameters.
+   */
+  MultiThreadInfo mt_info;
+
+  /*!
+   * Specifies the frame to be output. It is valid only if show_existing_frame
+   * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+   * INVALID_IDX.
+   */
   int existing_fb_idx_to_show;
-  int internal_altref_allowed;
-  // A flag to indicate if intrabc is ever used in current frame.
-  int intrabc_used;
 
-  // Tables to calculate IntraBC MV cost.
-  IntraBCMVCosts dv_costs;
+  /*!
+   * A flag to indicate if intrabc is ever used in current frame.
+   */
+  int intrabc_used;
 
-  // Mark which ref frames can be skipped for encoding current frame druing RDO.
+  /*!
+   * Mark which ref frames can be skipped for encoding current frame during RDO.
+   */
   int prune_ref_frame_mask;
 
-  AV1LfSync lf_row_sync;
-  AV1LrSync lr_row_sync;
+  /*!
+   * Loop Restoration context.
+   */
   AV1LrStruct lr_ctxt;
 
+  /*!
+   * Pointer to list of tables with film grain parameters.
+   */
   aom_film_grain_table_t *film_grain_table;
+
 #if CONFIG_DENOISE
+  /*!
+   * Pointer to structure holding the denoised image buffers and the helper
+   * noise models.
+   */
   struct aom_denoise_and_model_t *denoise_and_model;
 #endif
 
-  // Flags related to interpolation filter search.
+  /*!
+   * Flags related to interpolation filter search.
+   */
   InterpSearchFlags interp_search_flags;
 
-  MultiThreadHandle multi_thread_ctxt;
-  void (*row_mt_sync_read_ptr)(AV1RowMTSync *const, int, int);
-  void (*row_mt_sync_write_ptr)(AV1RowMTSync *const, int, int, const int);
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t *row_mt_mutex_;
-#endif
-  // Set if screen content is set or relevant tools are enabled
+  /*!
+   * Turn on screen content tools flag.
+   * Note that some videos are not screen content videos, but
+   * screen content tools could also improve coding efficiency.
+   * For example, videos with large flat regions, gaming videos that look
+   * like natural videos.
+   */
+  int use_screen_content_tools;
+
+  /*!
+   * A flag to indicate "real" screen content videos.
+   * For example, screen shares, screen editing.
+   * This type is true indicates |use_screen_content_tools| must be true.
+   * In addition, rate control strategy is adjusted when this flag is true.
+   */
   int is_screen_content_type;
-#if CONFIG_COLLECT_PARTITION_STATS == 2
-  PartitionStats partition_stats;
-#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  /*!
+   * Accumulates the partition timing stat over the whole frame.
+   */
+  FramePartitionTimingStats partition_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  // component_time[] are initialized to zero while encoder starts.
+  /*!
+   * component_time[] are initialized to zero while encoder starts.
+   */
   uint64_t component_time[kTimingComponents];
+  /*!
+   * Stores timing for individual components between calls of start_timing()
+   * and end_timing().
+   */
   struct aom_usec_timer component_timer[kTimingComponents];
-  // frame_component_time[] are initialized to zero at beginning of each frame.
+  /*!
+   * frame_component_time[] are initialized to zero at beginning of each frame.
+   */
   uint64_t frame_component_time[kTimingComponents];
 #endif
 
-  // Parameters for AV1 bitstream levels.
-  AV1LevelParams level_params;
+  /*!
+   * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+   */
+  int frame_header_count;
 
-  // whether any no-zero delta_q was actually used
+  /*!
+   * Whether any no-zero delta_q was actually used.
+   */
   int deltaq_used;
 
-  // Indicates the true relative distance of ref frame w.r.t. current frame
-  int ref_relative_dist[INTER_REFS_PER_FRAME];
-
-  // Indicate nearest references w.r.t. current frame in past and future
-  int8_t nearest_past_ref;
-  int8_t nearest_future_ref;
+  /*!
+   * Refrence frame distance related variables.
+   */
+  RefFrameDistanceInfo ref_frame_dist_info;
 
-  // TODO(sdeng): consider merge the following arrays.
-  double *tpl_rdmult_scaling_factors;
-  double *tpl_sb_rdmult_scaling_factors;
+  /*!
+   * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+   * RD multiplier modulation when SSIM tuning is enabled.
+   */
   double *ssim_rdmult_scaling_factors;
 
 #if CONFIG_TUNE_VMAF
-  double *vmaf_rdmult_scaling_factors;
-  double last_frame_ysse;
-  double last_frame_vmaf;
-  double last_frame_unsharp_amount;
+  /*!
+   * Parameters for VMAF tuning.
+   */
+  TuneVMAFInfo vmaf_info;
 #endif
 
-  int use_svc;
+#if CONFIG_TUNE_BUTTERAUGLI
+  /*!
+   * Parameters for Butteraugli tuning.
+   */
+  TuneButteraugliInfo butteraugli_info;
+#endif
+
+  /*!
+   * Parameters for scalable video coding.
+   */
   SVC svc;
 
-  int lap_enabled;
+  /*!
+   * Indicates whether current processing stage is encode stage or LAP stage.
+   */
   COMPRESSOR_STAGE compressor_stage;
 
-  // Some motion vector stats from the last encoded frame to help us decide what
-  // precision to use to encode the current frame.
-  MV_STATS mv_stats;
-
-  // Frame type of the last frame. May be used in some heuristics for speeding
-  // up the encoding.
+  /*!
+   * Frame type of the last frame. May be used in some heuristics for speeding
+   * up the encoding.
+   */
   FRAME_TYPE last_frame_type;
+
+  /*!
+   * Number of tile-groups.
+   */
   int num_tg;
 
-  // Super-resolution mode currently being used by the encoder.
-  // This may / may not be same as user-supplied mode in oxcf->superres_mode
-  // (when we are recoding to try multiple options for example).
-  SUPERRES_MODE superres_mode;
+  /*!
+   * Super-resolution mode currently being used by the encoder.
+   * This may / may not be same as user-supplied mode in oxcf->superres_mode
+   * (when we are recoding to try multiple options for example).
+   */
+  aom_superres_mode superres_mode;
+
+  /*!
+   * First pass related data.
+   */
+  FirstPassData firstpass_data;
+
+  /*!
+   * Temporal Noise Estimate
+   */
+  NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  /*!
+   * Temporal Denoiser
+   */
+  AV1_DENOISER denoiser;
+#endif
+
+  /*!
+   * Count on how many consecutive times a block uses small/zeromv for encoding
+   * in a scale of 8x8 block.
+   */
+  uint8_t *consec_zero_mv;
+
+  /*!
+   * Block size of first pass encoding
+   */
+  BLOCK_SIZE fp_block_size;
+
+  /*!
+   * The counter of encoded super block, used to differentiate block names.
+   * This number starts from 0 and increases whenever a super block is encoded.
+   */
+  int sb_counter;
+
+  /*!
+   * Available bitstream buffer size in bytes
+   */
+  size_t available_bs_size;
+
+  /*!
+   * The controller of the external partition model.
+   * It is used to do partition type selection based on external models.
+   */
+  ExtPartController ext_part_controller;
+
+  /*!
+   * Motion vector stats of the current encoded frame, used to update the
+   * ppi->mv_stats during postencode.
+   */
+  MV_STATS mv_stats;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2
+  /*!
+   * Stores the reference refresh index for the current frame.
+   */
+  int ref_refresh_index;
+
+  /*!
+   * A flag to indicate if the reference refresh index is available for the
+   * current frame.
+   */
+  bool refresh_idx_available;
+
+  /*!
+   * Reference frame index corresponding to the frame to be excluded from being
+   * used as a reference by frame_parallel_level 2 frame in a parallel
+   * encode set of lower layer frames.
+   */
+  int ref_idx_to_skip;
+#if CONFIG_FPMT_TEST
+  /*!
+   * Stores the wanted frame buffer index for choosing primary ref frame by a
+   * frame_parallel_level 2 frame in a parallel encode set of lower layer
+   * frames.
+   */
+
+  int wanted_fb;
+#endif
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FRAME_PARALLEL_ENCODE_2
+
+  /*!
+   * A flag to indicate frames that will update their data to the primary
+   * context at the end of the encode. It is set for non-parallel frames and the
+   * last frame in encode order in a given parallel encode set.
+   */
+  bool do_frame_data_update;
+
+#if CONFIG_RD_COMMAND
+  /*!
+   *  A structure for assigning external q_index / rdmult for experiments
+   */
+  RD_COMMAND rd_command;
+#endif  // CONFIG_RD_COMMAND
+
+  /*!
+   * Buffer to store MB variance after Wiener filter.
+   */
+  WeberStats *mb_weber_stats;
+
+  /*!
+   * Buffer to store MB variance after Wiener filter.
+   */
+  BLOCK_SIZE weber_bsize;
+
+  /*!
+   * Frame level Wiener filter normalization.
+   */
+  int64_t norm_wiener_variance;
+
+  /*!
+   * Buffer to store delta-q values for delta-q mode 4.
+   */
+  int *mb_delta_q;
+
+  /*!
+   * Flag to indicate that current frame is dropped.
+   */
+  bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+  /*!
+   * Structure stores information needed for bitrate accuracy experiment.
+   */
+  VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+#if CONFIG_RATECTRL_LOG
+  /*!
+   * Structure stores information of rate control decisions.
+   */
+  RATECTRL_LOG rc_log;
+#endif  // CONFIG_RATECTRL_LOG
+
+  /*!
+   * Frame level twopass status and control data
+   */
+  TWO_PASS_FRAME twopass_frame;
+
+  /*!
+   * Context needed for third pass encoding.
+   */
+  THIRD_PASS_DEC_CTX *third_pass_ctx;
+
+  /*!
+   * File pointer to second pass log
+   */
+  FILE *second_pass_log_stream;
+
+  /*!
+   * Buffer to store 64x64 SAD
+   */
+  uint64_t *src_sad_blk_64x64;
 } AV1_COMP;
 
-typedef struct {
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+  /*!\cond */
   YV12_BUFFER_CONFIG *source;
   YV12_BUFFER_CONFIG *last_source;
   int64_t ts_duration;
+  /*!\endcond */
 } EncodeFrameInput;
 
-// EncodeFrameParams contains per-frame encoding parameters decided upon by
-// av1_encode_strategy() and passed down to av1_encode()
-struct EncodeFrameParams {
+/*!
+ * \brief contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+  /*!
+   * Is error resilient mode enabled
+   */
   int error_resilient_mode;
+  /*!
+   * Frame type (eg KF vs inter frame etc)
+   */
   FRAME_TYPE frame_type;
+
+  /*!\cond */
   int primary_ref_frame;
   int order_offset;
+
+  /*!\endcond */
+  /*!
+   * Should the current frame be displayed after being decoded
+   */
   int show_frame;
+
+  /*!\cond */
   int refresh_frame_flags;
 
   int show_existing_frame;
   int existing_fb_idx_to_show;
 
-  // Bitmask of which reference buffers may be referenced by this frame
+  /*!\endcond */
+  /*!
+   *  Bitmask of which reference buffers may be referenced by this frame.
+   */
   int ref_frame_flags;
 
-  // Reference buffer assignment for this frame.
+  /*!
+   *  Reference buffer assignment for this frame.
+   */
   int remapped_ref_idx[REF_FRAMES];
 
-  // Flags which determine which reference buffers are refreshed by this frame
-  int refresh_golden_frame;
-  int refresh_bwd_ref_frame;
-  int refresh_alt_ref_frame;
+  /*!
+   *  Flags which determine which reference buffers are refreshed by this
+   *  frame.
+   */
+  RefreshFrameInfo refresh_frame;
 
-  // Speed level to use for this frame: Bigger number means faster.
+  /*!
+   *  Speed level to use for this frame: Bigger number means faster.
+   */
   int speed;
-};
-typedef struct EncodeFrameParams EncodeFrameParams;
+} EncodeFrameParams;
+
+/*!\cond */
 
 // EncodeFrameResults contains information about the result of encoding a
 // single frame
@@ -1394,39 +3410,120 @@ typedef struct {
   size_t size;  // Size of resulting bitstream
 } EncodeFrameResults;
 
-// Must not be called more than once.
-void av1_initialize_enc(void);
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
 
-struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+                                       const AV1EncoderConfig *oxcf,
                                        BufferPool *const pool,
-                                       FIRSTPASS_STATS *frame_stats_buf,
                                        COMPRESSOR_STAGE stage,
-                                       int num_lap_buffers,
-                                       int lap_lag_in_frames,
-                                       STATS_BUFFER_CTX *stats_buf_context);
+                                       int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+    struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+    const AV1EncoderConfig *oxcf);
+
 void av1_remove_compressor(AV1_COMP *cpi);
 
-void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+                           bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+                       bool sb_size_changed);
 
 void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                              int subsampling_x, int subsampling_y);
 
-// receive a frames worth of data. caller can assume that a copy of this
-// frame is made and not just a copy of the pointer..
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+                               const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+                                    AV1_PRIMARY *const ppi,
+                                    int *ref_buffers_used_map);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    frame_flags    Flags to decide how to encoding the frame
+ * \param[in]    sd             Contain raw frame data
+ * \param[in]    time_stamp     Time stamp of the frame
+ * \param[in]    end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time_stamp);
 
-int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush,
-                            const aom_rational64_t *timebase);
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in]     cpi         Top-level encoder structure
+ * \param[in,out] cpi_data    Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ *     No frame encoded; more input is required.
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
 
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
 int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
                const EncodeFrameInput *const frame_input,
                const EncodeFrameParams *const frame_params,
                EncodeFrameResults *const frame_results);
 
+/*!\cond */
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
 
 int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
@@ -1445,8 +3542,7 @@ int av1_set_size_literal(AV1_COMP *cpi, int width, int height);
 
 void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
 
-int av1_update_entropy(bool *ext_refresh_frame_context,
-                       bool *ext_refresh_frame_context_pending, bool update);
+void av1_set_mv_search_params(AV1_COMP *cpi);
 
 int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
 
@@ -1460,52 +3556,86 @@ int av1_get_quantizer(struct AV1_COMP *cpi);
 
 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
 
-void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
-                                        CompoundTypeRdBuffers *const bufs);
-void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs);
-
 // Set screen content options.
 // This function estimates whether to use screen content tools, by counting
 // the portion of blocks that have few luma colors.
 // Modifies:
-//   cpi->commom.allow_screen_content_tools
-//   cpi->common.allow_intrabc
+//   cpi->commom.features.allow_screen_content_tools
+//   cpi->common.features.allow_intrabc
+//   cpi->use_screen_content_tools
+//   cpi->is_screen_content_type
 // However, the estimation is not accurate and may misclassify videos.
 // A slower but more accurate approach that determines whether to use screen
-// content tools is employed later. See determine_sc_tools_with_encoding().
-void av1_set_screen_content_options(const struct AV1_COMP *cpi,
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
                                     FeatureFlags *features);
 
-// TODO(jingning): Move these functions as primitive members for the new cpi
-// class.
-static INLINE void stack_push(int *stack, int *stack_size, int item) {
-  for (int i = *stack_size - 1; i >= 0; --i) stack[i + 1] = stack[i];
-  stack[0] = item;
-  ++*stack_size;
-}
+void av1_update_frame_size(AV1_COMP *cpi);
 
-static INLINE int stack_pop(int *stack, int *stack_size) {
-  if (*stack_size <= 0) return -1;
-
-  int item = stack[0];
-  for (int i = 0; i < *stack_size; ++i) stack[i] = stack[i + 1];
-  --*stack_size;
-
-  return item;
-}
-
-static INLINE int stack_pop_end(int *stack, int *stack_size) {
-  int item = stack[*stack_size - 1];
-  stack[*stack_size - 1] = -1;
-  --*stack_size;
-
-  return item;
+typedef struct {
+  int pyr_level;
+  int disp_order;
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+  if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+    memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+    return;
+  }
+  memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    // Get reference frame buffer.
+    const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx];
+    if (ref_frame_map_pairs[map_idx].disp_order == -1) continue;
+    if (buf == NULL) {
+      ref_frame_map_pairs[map_idx].disp_order = -1;
+      ref_frame_map_pairs[map_idx].pyr_level = -1;
+      continue;
+    } else if (buf->ref_count > 1) {
+      // Once the keyframe is coded, the slots in ref_frame_map will all
+      // point to the same frame. In that case, all subsequent pointers
+      // matching the current are considered "free" slots. This will find
+      // the next occurrence of the current pointer if ref_count indicates
+      // there are multiple instances of it and mark it as free.
+      for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
+        const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
+        if (buf2 == buf) {
+          ref_frame_map_pairs[idx2].disp_order = -1;
+          ref_frame_map_pairs[idx2].pyr_level = -1;
+        }
+      }
+    }
+    ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint;
+    ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
+  }
 }
 
-static INLINE void stack_reset(int *stack, int *stack_size) {
-  for (int i = 0; i < *stack_size; ++i) stack[i] = INVALID_IDX;
-  *stack_size = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+static AOM_INLINE void calc_frame_data_update_flag(
+    GF_GROUP *const gf_group, int gf_frame_index,
+    bool *const do_frame_data_update) {
+  *do_frame_data_update = true;
+  // Set the flag to false for all frames in a given parallel encode set except
+  // the last frame in the set with frame_parallel_level = 2.
+  if (gf_group->frame_parallel_level[gf_frame_index] == 1) {
+    *do_frame_data_update = false;
+  } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) {
+    // Check if this is the last frame in the set with frame_parallel_level = 2.
+    for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
+      if ((gf_group->frame_parallel_level[i] == 0 &&
+           (gf_group->update_type[i] == ARF_UPDATE ||
+            gf_group->update_type[i] == INTNL_ARF_UPDATE)) ||
+          gf_group->frame_parallel_level[i] == 1) {
+        break;
+      } else if (gf_group->frame_parallel_level[i] == 2) {
+        *do_frame_data_update = false;
+        break;
+      }
+    }
+  }
 }
+#endif
 
 // av1 uses 10,000,000 ticks/second as time stamp
 #define TICKS_PER_SEC 10000000LL
@@ -1523,8 +3653,9 @@ ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
 }
 
 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
 
   return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
          update_type == GF_UPDATE;
@@ -1543,17 +3674,6 @@ static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
   return buf != NULL ? &buf->buf : NULL;
 }
 
-static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
-                                       const RefCntBuffer *const frame_buf) {
-  MV_REFERENCE_FRAME ref_frame;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    if (buf == NULL) continue;
-    if (frame_buf == buf) break;
-  }
-  return (ref_frame <= ALTREF_FRAME);
-}
-
 static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
   assert(buf != NULL);
   ensure_mv_buffer(buf, cm);
@@ -1561,35 +3681,18 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
   buf->height = cm->height;
 }
 
-// Token buffer is only used for palette tokens.
-static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
-                                           int sb_size_log2,
-                                           const int num_planes) {
-  // Calculate the maximum number of max superblocks in the image.
-  const int shift = sb_size_log2 - 4;
-  const int sb_size = 1 << sb_size_log2;
-  const int sb_size_square = sb_size * sb_size;
-  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
-  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
-
-  // One palette token for each pixel. There can be palettes on two planes.
-  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
-
-  return sb_rows * sb_cols * sb_palette_toks;
-}
-
 // Get the allocated token size for a tile. It does the same calculation as in
 // the frame token allocation.
-static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
-                                            int num_planes) {
-  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
-  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+static INLINE unsigned int allocated_tokens(const TileInfo *tile,
+                                            int sb_size_log2, int num_planes) {
+  int tile_mb_rows = (tile->mi_row_end - tile->mi_row_start + 2) >> 2;
+  int tile_mb_cols = (tile->mi_col_end - tile->mi_col_start + 2) >> 2;
 
   return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
 }
 
 static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
-                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int mi_row, TokenExtra **tok, int sb_size_log2,
                                  int num_planes) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
@@ -1600,39 +3703,72 @@ static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
       (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
   const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
 
-  *tok = cpi->tile_tok[tile_row][tile_col] +
+  *tok = cpi->token_info.tile_tok[tile_row][tile_col] +
          get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
 }
 
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
 
 #define ALT_MIN_LAG 3
-static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
-  return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+  return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
+}
+
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+  return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) &&
+         (gf_cfg->gf_min_pyr_height == 0);
+}
+
+// Helper function to compute number of blocks on either side of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
 }
 
 // Check if statistics generation stage
 static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
   assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
-                 cpi->oxcf.pass == 0 && cpi->lap_enabled));
-  return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE));
+                 cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled));
+  return (cpi->oxcf.pass == AOM_RC_FIRST_PASS ||
+          (cpi->compressor_stage == LAP_STAGE));
 }
 // Check if statistics consumption stage
 static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
-  return (cpi->oxcf.pass == 2);
+  return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS);
 }
 
 // Check if statistics consumption stage
 static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
   return (is_stat_consumption_stage_twopass(cpi) ||
-          (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) &&
-           cpi->lap_enabled));
+          (cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+           (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled));
+}
+
+// Decide whether 'dv_costs' need to be allocated/stored during the encoding.
+static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) {
+  return !cpi->sf.rt_sf.use_nonrd_pick_mode &&
+         av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi);
 }
 
-// Check if the current stage has statistics
+/*!\endcond */
+/*!\brief Check if the current stage has statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ *
+ * \return 0 if no stats for current stage else 1
+ */
 static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
-  assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
-  return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
+  assert(
+      IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
+}
+
+/*!\cond */
+
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+  return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+         cpi->oxcf.gf_cfg.lag_in_frames == 0;
 }
 
 // Function return size of frame stats buffer
@@ -1641,7 +3777,7 @@ static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
   return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
 }
 
-// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
 
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
@@ -1755,17 +3891,9 @@ static const MV_REFERENCE_FRAME disable_order[] = {
   LAST3_FRAME,
   LAST2_FRAME,
   ALTREF2_FRAME,
-  GOLDEN_FRAME,
+  BWDREF_FRAME,
 };
 
-static INLINE int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
-  const unsigned int max_allowed_refs_for_given_speed =
-      (cpi->sf.inter_sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
-                                                  : INTER_REFS_PER_FRAME;
-  return AOMMIN(max_allowed_refs_for_given_speed,
-                cpi->oxcf.max_reference_frames);
-}
-
 static const MV_REFERENCE_FRAME
     ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
       LAST_FRAME,    ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
@@ -1773,6 +3901,7 @@ static const MV_REFERENCE_FRAME
     };
 
 static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+                                      const int use_one_pass_rt_params,
                                       const YV12_BUFFER_CONFIG **ref_frames,
                                       const int ext_ref_frame_flags) {
   // cpi->ext_flags.ref_frame_flags allows certain reference types to be
@@ -1784,14 +3913,18 @@ static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
   for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
     const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
     // If this_ref has appeared before, mark the corresponding ref frame as
-    // invalid. For nonrd mode, only disable GOLDEN_FRAME if it's the same
-    // as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
-    int index = (sf->rt_sf.use_nonrd_pick_mode &&
-                 ref_frame_priority_order[i] == GOLDEN_FRAME)
-                    ? (1 + sf->rt_sf.use_nonrd_altref_frame)
-                    : i;
+    // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the
+    // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+    int index =
+        (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+            ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+            : i;
     for (int j = 0; j < index; ++j) {
-      if (this_ref == ref_frames[j]) {
+      // If this_ref has appeared before (same as the reference corresponding
+      // to lower index j), remove it as a reference only if that reference
+      // (for index j) is actually used as a reference.
+      if (this_ref == ref_frames[j] &&
+          (flags & (1 << (ref_frame_priority_order[j] - 1)))) {
         flags &= ~(1 << (ref_frame_priority_order[i] - 1));
         break;
       }
@@ -1800,40 +3933,6 @@ static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
   return flags;
 }
 
-// Enforce the number of references for each arbitrary frame based on user
-// options and speed.
-static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi,
-                                              int *ref_frame_flags) {
-  MV_REFERENCE_FRAME ref_frame;
-  int total_valid_refs = 0;
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      total_valid_refs++;
-    }
-  }
-
-  const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
-
-  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
-    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
-
-    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
-      continue;
-    }
-
-    switch (ref_frame_to_disable) {
-      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
-      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
-      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
-      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
-      default: assert(0);
-    }
-    --total_valid_refs;
-  }
-  assert(total_valid_refs <= max_allowed_refs);
-}
-
 // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
 // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
 // function, the memory must be freed by the caller. Both the buf member of the
@@ -1843,50 +3942,75 @@ static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi,
 // Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
 // the obu_has_size_field bit is set, and the buffer contains the obu_size
 // field.
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi);
 
 #define MAX_GFUBOOST_FACTOR 10.0
 #define MIN_GFUBOOST_FACTOR 4.0
-double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
-                                           int frame_count);
-double av1_get_kf_boost_projection_factor(int frame_count);
 
-#define ENABLE_KF_TPL 1
-#define MAX_PYR_LEVEL_FROMTOP_DELTAQ 0
-
-static INLINE int is_frame_kf_and_tpl_eligible(AV1_COMP *const cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  return (cm->current_frame.frame_type == KEY_FRAME) && cm->show_frame &&
-         (cpi->rc.frames_to_key > 1);
-}
-
-static INLINE int is_frame_arf_and_tpl_eligible(const GF_GROUP *gf_group) {
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
-  return update_type == ARF_UPDATE || update_type == GF_UPDATE;
+static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group,
+                                        uint8_t index) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index];
+  return update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+         update_type == KF_UPDATE;
 }
 
-static INLINE int is_frame_tpl_eligible(AV1_COMP *const cpi) {
-#if ENABLE_KF_TPL
-  return is_frame_kf_and_tpl_eligible(cpi) ||
-         is_frame_arf_and_tpl_eligible(&cpi->gf_group);
-#else
-  return is_frame_arf_and_tpl_eligible(&cpi->gf_group);
-#endif  // ENABLE_KF_TPL
+static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group,
+                                                    int selective_ref_frame,
+                                                    int prune_ref_frames,
+                                                    int gf_index) {
+  return (selective_ref_frame > 0) && (prune_ref_frames > 0) &&
+         !is_frame_tpl_eligible(gf_group, gf_index);
 }
 
 // Get update type of the current frame.
-static INLINE FRAME_UPDATE_TYPE
-get_frame_update_type(const GF_GROUP *gf_group) {
-  return gf_group->update_type[gf_group->index];
+static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group,
+                                                      int gf_frame_index) {
+  return gf_group->update_type[gf_frame_index];
 }
 
 static INLINE int av1_pixels_to_mi(int pixels) {
   return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2;
 }
 
+static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
+         cm->show_frame;
+}
+
+static INLINE int is_frame_resize_pending(AV1_COMP *const cpi) {
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  return (resize_pending_params->width && resize_pending_params->height &&
+          (cpi->common.width != resize_pending_params->width ||
+           cpi->common.height != resize_pending_params->height));
+}
+
+// Check if loop restoration filter is used.
+static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
+  return cm->seq_params->enable_restoration && !cm->features.all_lossless &&
+         !cm->tiles.large_scale;
+}
+
+static INLINE int is_inter_tx_size_search_level_one(
+    const TX_SPEED_FEATURES *tx_sf) {
+  return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 &&
+          tx_sf->inter_tx_size_search_init_depth_sqr >= 1);
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
+  return (!cpi->ppi->use_svc ||
+          (cpi->ppi->use_svc &&
+           cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
+}
+#endif
+
 #if CONFIG_COLLECT_PARTITION_STATS == 2
-static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
-  FILE *f = fopen("partition_stats.csv", "w");
+static INLINE void av1_print_fr_partition_timing_stats(
+    const FramePartitionTimingStats *part_stats, const char *filename) {
+  FILE *f = fopen(filename, "w");
   if (!f) {
     return;
   }
@@ -1903,7 +4027,7 @@ static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
   }
   fprintf(f, "\n");
 
-  const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+  static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
 
   for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
     fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
@@ -1920,7 +4044,9 @@ static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
   }
   fclose(f);
 }
+#endif  // CONFIG_COLLECT_PARTITION_STATS == 2
 
+#if CONFIG_COLLECT_PARTITION_STATS
 static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
   assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
          bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
@@ -1935,7 +4061,7 @@ static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
     default: assert(0 && "Invalid bsize for partition_stats."); return -1;
   }
 }
-#endif
+#endif  // CONFIG_COLLECT_PARTITION_STATS
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
 static INLINE void start_timing(AV1_COMP *cpi, int component) {
@@ -1958,6 +4084,8 @@ static INLINE char const *get_frame_type_enum(int type) {
 }
 #endif
 
+/*!\endcond */
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/encoder_alloc.h b/media/libaom/src/av1/encoder/encoder_alloc.h
new file mode 100644
index 0000000000..eec0903900
--- /dev/null
+++ b/media/libaom/src/av1/encoder/encoder_alloc.h
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void dealloc_context_buffers_ext(
+    MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  if (mbmi_ext_info->frame_base) {
+    aom_free(mbmi_ext_info->frame_base);
+    mbmi_ext_info->frame_base = NULL;
+    mbmi_ext_info->alloc_size = 0;
+  }
+}
+
+static AOM_INLINE void alloc_context_buffers_ext(
+    AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_rows =
+      (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_cols =
+      (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
+
+  if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
+    dealloc_context_buffers_ext(mbmi_ext_info);
+    CHECK_MEM_ERROR(
+        cm, mbmi_ext_info->frame_base,
+        aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base)));
+    mbmi_ext_info->alloc_size = new_ext_mi_size;
+  }
+  // The stride needs to be updated regardless of whether new allocation
+  // happened or not.
+  mbmi_ext_info->stride = mi_alloc_cols;
+}
+
+static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Setup mi_params
+  mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+                       cpi->sf.part_sf.default_min_partition_size);
+
+  if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
+
+  if (cpi->td.mb.mv_costs) {
+    aom_free(cpi->td.mb.mv_costs);
+    cpi->td.mb.mv_costs = NULL;
+  }
+  // Avoid the memory allocation of 'mv_costs' for allintra encoding mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+    CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs,
+                    (MvCosts *)aom_calloc(1, sizeof(MvCosts)));
+  }
+
+  av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
+                                cm->error);
+  av1_setup_sms_tree(cpi, &cpi->td);
+  cpi->td.firstpass_ctx =
+      av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+}
+
+// Allocate mbmi buffers which are used to store mode information at block
+// level.
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (av1_alloc_context_buffers(cm, cm->width, cm->height,
+                                cpi->sf.part_sf.default_min_partition_size)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
+
+  if (!is_stat_generation_stage(cpi))
+    alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+}
+
+static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Create the encoder segmentation map and set all entries to 0
+  aom_free(cpi->enc_seg.map);
+  CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+  // Create a map used for cyclic background refresh.
+  if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  CHECK_MEM_ERROR(
+      cm, cpi->cyclic_refresh,
+      av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+  // Create a map used to mark inactive areas.
+  aom_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+                  aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void alloc_obmc_buffers(
+    OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) {
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->wsrc,
+      (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc)));
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->mask,
+      (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask)));
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->above_pred,
+      (uint8_t *)aom_memalign(
+          16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred)));
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->left_pred,
+      (uint8_t *)aom_memalign(
+          16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred)));
+}
+
+static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) {
+  aom_free(obmc_buffer->mask);
+  aom_free(obmc_buffer->above_pred);
+  aom_free(obmc_buffer->left_pred);
+  aom_free(obmc_buffer->wsrc);
+
+  obmc_buffer->mask = NULL;
+  obmc_buffer->above_pred = NULL;
+  obmc_buffer->left_pred = NULL;
+  obmc_buffer->wsrc = NULL;
+}
+
+static AOM_INLINE void alloc_compound_type_rd_buffers(
+    struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) {
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->pred0,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->pred1,
+      (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->residual1,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->diff10,
+      (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+  AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf,
+                      (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+                                            sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static AOM_INLINE void release_compound_type_rd_buffers(
+    CompoundTypeRdBuffers *const bufs) {
+  aom_free(bufs->pred0);
+  aom_free(bufs->pred1);
+  aom_free(bufs->residual1);
+  aom_free(bufs->diff10);
+  aom_free(bufs->tmp_best_mask_buf);
+  av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  TokenInfo *token_info = &cpi->token_info;
+
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete sementation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+  cpi->vmaf_info.rdmult_scaling_factors = NULL;
+  aom_close_vmaf_model(cpi->vmaf_info.vmaf_model);
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  aom_free(cpi->butteraugli_info.rdmult_scaling_factors);
+  cpi->butteraugli_info.rdmult_scaling_factors = NULL;
+  aom_free_frame_buffer(&cpi->butteraugli_info.source);
+  aom_free_frame_buffer(&cpi->butteraugli_info.resized_source);
+#endif
+
+  release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+  if (cpi->td.mb.mv_costs) {
+    aom_free(cpi->td.mb.mv_costs);
+    cpi->td.mb.mv_costs = NULL;
+  }
+
+  if (cpi->td.mb.dv_costs) {
+    aom_free(cpi->td.mb.dv_costs);
+    cpi->td.mb.dv_costs = NULL;
+  }
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  if (cpi->td.pixel_gradient_info) {
+    aom_free(cpi->td.pixel_gradient_info);
+    cpi->td.pixel_gradient_info = NULL;
+  }
+
+  if (cpi->td.src_var_info_of_4x4_sub_blocks) {
+    aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+    cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
+  }
+
+  if (cpi->td.vt64x64) {
+    aom_free(cpi->td.vt64x64);
+    cpi->td.vt64x64 = NULL;
+  }
+
+  av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+  cpi->td.firstpass_ctx = NULL;
+
+  av1_free_txb_buf(cpi);
+  av1_free_context_buffers(cm);
+
+  aom_free_frame_buffer(&cpi->last_frame_uf);
+#if !CONFIG_REALTIME_ONLY
+  av1_free_restoration_buffers(cm);
+#endif
+
+  if (!is_stat_generation_stage(cpi)) {
+    av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker,
+                          &cpi->mt_info.cdef_sync);
+  }
+
+  aom_free_frame_buffer(&cpi->trial_frame_rst);
+  aom_free_frame_buffer(&cpi->scaled_source);
+  aom_free_frame_buffer(&cpi->scaled_last_source);
+  aom_free_frame_buffer(&cpi->orig_source);
+
+  free_token_info(token_info);
+
+  av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+  av1_free_sms_tree(&cpi->td);
+
+  aom_free(cpi->td.mb.palette_buffer);
+  release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
+  aom_free(cpi->td.mb.tmp_conv_dst);
+  for (int j = 0; j < 2; ++j) {
+    aom_free(cpi->td.mb.tmp_pred_bufs[j]);
+  }
+
+#if CONFIG_DENOISE
+  if (cpi->denoise_and_model) {
+    aom_denoise_and_model_free(cpi->denoise_and_model);
+    cpi->denoise_and_model = NULL;
+  }
+#endif
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+
+  if (cpi->consec_zero_mv) {
+    aom_free(cpi->consec_zero_mv);
+    cpi->consec_zero_mv = NULL;
+  }
+
+  if (cpi->src_sad_blk_64x64) {
+    aom_free(cpi->src_sad_blk_64x64);
+    cpi->src_sad_blk_64x64 = NULL;
+  }
+
+  aom_free(cpi->mb_weber_stats);
+  cpi->mb_weber_stats = NULL;
+
+  aom_free(cpi->mb_delta_q);
+  cpi->mb_delta_q = NULL;
+}
+
+static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) {
+  if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+  PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info;
+  if (!pixel_gradient_info) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+    CHECK_MEM_ERROR(
+        cm, pixel_gradient_info,
+        aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE));
+    cpi->td.pixel_gradient_info = pixel_gradient_info;
+  }
+
+  cpi->td.mb.pixel_gradient_info = pixel_gradient_info;
+}
+
+static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) {
+  if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+  Block4x4VarInfo *source_variance_info =
+      cpi->td.src_var_info_of_4x4_sub_blocks;
+  if (!source_variance_info) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+    const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+    CHECK_MEM_ERROR(cm, source_variance_info,
+                    aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb));
+    cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info;
+  }
+
+  cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info;
+}
+
+static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4;
+  if (cpi->td.vt64x64) {
+    if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+      aom_free(cpi->td.vt64x64);
+      cpi->td.vt64x64 = NULL;
+    }
+  }
+  if (!cpi->td.vt64x64) {
+    CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+                    aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+    cpi->td.num_64x64_blocks = num_64x64_blocks;
+  }
+}
+
+static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
+    AV1_COMP *cpi, int scaled_width, int scaled_height) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  if (scaled_width == cpi->unscaled_source->y_crop_width &&
+      scaled_height == cpi->unscaled_source->y_crop_height) {
+    return cpi->unscaled_source;
+  }
+
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_source, scaled_width, scaled_height,
+          cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+          cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->features.byte_alignment, NULL, NULL, NULL,
+          cpi->oxcf.tool_cfg.enable_global_motion, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled source buffer");
+  assert(cpi->scaled_source.y_crop_width == scaled_width);
+  assert(cpi->scaled_source.y_crop_height == scaled_height);
+  av1_resize_and_extend_frame_nonnormative(
+      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth,
+      num_planes);
+  return &cpi->scaled_source;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODER_ALLOC_H_
diff --git a/media/libaom/src/av1/encoder/encoder_utils.c b/media/libaom/src/av1/encoder/encoder_utils.c
new file mode 100644
index 0000000000..c8f608551b
--- /dev/null
+++ b/media/libaom/src/av1/encoder/encoder_utils.c
@@ -0,0 +1,1430 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aomcx.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+  { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+    { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+    { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+    { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+    { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+    { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+    { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+    { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+    { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+    { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+    { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+    { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+    { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+    { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+    { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+    { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+    { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+    { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+    { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+    { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+    { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+    { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+    { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+    { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+    { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+    { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+    { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+    { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+    { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+    { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+    { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+    { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+    { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+    { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+    { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+  { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+  { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+    { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+    { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+    { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+    { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+    { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+    { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+    { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+    { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+    { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+    { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+    { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
+    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
+    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
+  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
+    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
+    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+                                                       64, 64, 64 };
+
+// TODO(yunqing): the default probs can be trained later from better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                         [SWITCHABLE_FILTER_CONTEXTS]
+                                         [SWITCHABLE_FILTERS] = {
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } },
+                                           { { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 },
+                                             { 512, 512, 512 } }
+                                         };
+
+static void configure_static_seg_features(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  struct segmentation *const seg = &cm->seg;
+
+  double avg_q;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+           (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+              ? cpi->ppi->p_rc.temp_avg_q
+              : cpi->ppi->p_rc.avg_q;
+#else
+  avg_q = cpi->ppi->p_rc.avg_q;
+#endif
+
+  int high_q = (int)(avg_q > 48.0);
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+
+    // Disable segmentation
+    av1_disable_segmentation(seg);
+
+    // Clear down the segment features.
+    av1_clearall_segfeatures(seg);
+  } else if (cpi->refresh_frame.alt_ref_frame) {
+    // If this is an alt ref frame
+    // Clear down the global segmentation map
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+
+    // Disable segmentation and individual segment features by default
+    av1_disable_segmentation(seg);
+    av1_clearall_segfeatures(seg);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+
+      qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875,
+                                    cm->seq_params->bit_depth);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+    }
+  } else if (seg->enabled) {
+    // All other frames if segmentation has been enabled
+
+    // First normal frame in a valid gf or alt ref group
+    if (rc->frames_since_golden == 0) {
+      // Set up segment features for normal frames in an arf group
+      // Disable segmentation and clear down features if alt ref
+      // is not active for this group
+
+      av1_disable_segmentation(seg);
+
+      memset(cpi->enc_seg.map, 0,
+             cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+      seg->update_map = 0;
+      seg->update_data = 0;
+
+      av1_clearall_segfeatures(seg);
+    } else if (rc->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
+
+      // Enable ref frame features for segment 0 as well
+      av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+      av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+      // All mbs should use ALTREF_FRAME
+      av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+      av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+      av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
+      if (high_q) {
+        av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+        av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+      }
+      // Enable data update
+      seg->update_data = 1;
+    } else {
+      // All other frames.
+
+      // No updates.. leave things as they are.
+      seg->update_map = 0;
+      seg->update_data = 0;
+    }
+  }
+}
+
+void av1_apply_active_map(AV1_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  const unsigned char *const active_map = cpi->active_map.map;
+  int i;
+
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  if (cpi->active_map.update) {
+    if (cpi->active_map.enabled) {
+      for (i = 0;
+           i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+           ++i)
+        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+      av1_enable_segmentation(seg);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+                      -MAX_LOOP_FILTER);
+    } else {
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+      if (seg->enabled) {
+        seg->update_data = 1;
+        seg->update_map = 1;
+      }
+    }
+    cpi->active_map.update = 0;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size));
+
+  const int tpl_idx = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  if (tpl_frame->is_valid) {
+    int tpl_stride = tpl_frame->stride;
+    double intra_cost_base = 0;
+    double mc_dep_cost_base = 0;
+    double cbcmp_base = 1;
+    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+    const int row_step = step;
+    const int col_step_sr =
+        coded_to_superres_mi(step, cm->superres_scale_denominator);
+    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+    for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+      for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
+        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+        double cbcmp = (double)(this_stats->srcrf_dist);
+        int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+        intra_cost_base += log(dist_scaled) * cbcmp;
+        mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+        cbcmp_base += cbcmp;
+      }
+    }
+
+    if (mc_dep_cost_base == 0) {
+      tpl_frame->is_valid = 0;
+    } else {
+      cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+      if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+        if (cpi->ppi->lap_enabled) {
+          double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
+          const int gfu_boost = get_gfu_boost_from_r0_lap(
+              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+              cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
+          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+          //        gfu_boost);
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+              min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+              cpi->ppi->p_rc.gfu_boost, gfu_boost,
+              cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
+        } else {
+          const int gfu_boost = (int)(200.0 / cpi->rd.r0);
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+              cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+        }
+      }
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                 int *top_index) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Setup variables that depend on the dimensions of the frame.
+  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+      av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+    process_tpl_stats_frame(cpi);
+    av1_tpl_rdmult_setup(cpi);
+  }
+#endif
+
+  // Decide q and q bounds.
+  *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+                                bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+      cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+      !is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+    const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+    const int tpl_q = av1_tpl_get_q_index(
+        &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality,
+        cm->seq_params->bit_depth);
+    *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q);
+    *top_index = *bottom_index = *q;
+    if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+      cpi->ppi->p_rc.arf_q = *q;
+  }
+
+  if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+    if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+      const double qratio_grad =
+          cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3;
+      const double qstep_ratio =
+          0.2 +
+          (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad;
+      *q = av1_get_q_index_from_qstep_ratio(
+          cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth);
+      *top_index = *bottom_index = *q;
+      if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+          gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE ||
+          gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE)
+        cpi->ppi->p_rc.arf_q = *q;
+    } else if (gf_group->layer_depth[cpi->gf_frame_index] <
+               gf_group->max_layer_depth) {
+      int this_height = gf_group->layer_depth[cpi->gf_frame_index];
+      int arf_q = cpi->ppi->p_rc.arf_q;
+      while (this_height > 1) {
+        arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2;
+        --this_height;
+      }
+      *top_index = *bottom_index = *q = arf_q;
+    }
+  }
+#endif
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (is_stat_consumption_stage_twopass(cpi) &&
+      cpi->sf.hl_sf.static_segmentation)
+    configure_static_seg_features(cpi);
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  pars->chroma_scaling_from_luma = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+                                          const AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename ||
+      tune_cfg->content == AOM_CONTENT_FILM) {
+    seq_params->film_grain_params_present = 1;
+  } else {
+#if CONFIG_DENOISE
+    seq_params->film_grain_params_present = (oxcf->noise_level > 0);
+#else
+    seq_params->film_grain_params_present = 0;
+#endif
+  }
+}
+
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+                                      const AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  cpi->oxcf = *oxcf;
+  const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (tune_cfg->film_grain_test_vector) {
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      memcpy(&cm->film_grain_params,
+             film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
+             sizeof(cm->film_grain_params));
+      if (oxcf->tool_cfg.enable_monochrome)
+        reset_film_grain_chroma_params(&cm->film_grain_params);
+      cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+      if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) {
+        cm->film_grain_params.clip_to_restricted_range = 0;
+      }
+    }
+  } else if (tune_cfg->film_grain_table_filename) {
+    CHECK_MEM_ERROR(cm, cpi->film_grain_table,
+                    aom_calloc(1, sizeof(*cpi->film_grain_table)));
+
+    aom_film_grain_table_read(cpi->film_grain_table,
+                              tune_cfg->film_grain_table_filename, cm->error);
+  } else if (tune_cfg->content == AOM_CONTENT_FILM) {
+    cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+    if (oxcf->tool_cfg.enable_monochrome)
+      reset_film_grain_chroma_params(&cm->film_grain_params);
+    if (cm->seq_params->color_range == AOM_CR_FULL_RANGE)
+      cm->film_grain_params.clip_to_restricted_range = 0;
+  } else {
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+  }
+}
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+                          const int phase, const int use_optimized_scaler) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        // Replace the reference buffer with a copy having a thicker border,
+        // if the reference buffer is higher resolution than the current
+        // frame, and the border is thin.
+        if ((ref->y_crop_width > cm->width ||
+             ref->y_crop_height > cm->height) &&
+            ref->border < AOM_BORDER_IN_PIXELS) {
+          RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+          if (aom_yv12_realloc_with_new_border(
+                  &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+                  cm->features.byte_alignment, num_planes) != 0) {
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+        }
+        int force_scaling = 0;
+        RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+        if (new_fb == NULL) {
+          const int new_fb_idx = get_free_fb(cm);
+          if (new_fb_idx == INVALID_IDX) {
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                               "Unable to find free frame buffer");
+          }
+          force_scaling = 1;
+          new_fb = &pool->frame_bufs[new_fb_idx];
+        }
+
+        if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+            new_fb->buf.y_crop_height != cm->height) {
+          if (aom_realloc_frame_buffer(
+                  &new_fb->buf, cm->width, cm->height,
+                  cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+                  cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                  cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) {
+            if (force_scaling) {
+              // Release the reference acquired in the get_free_fb() call above.
+              --new_fb->ref_count;
+            }
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8)
+            av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+                                        num_planes);
+          else
+            av1_resize_and_extend_frame_nonnormative(
+                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+#else
+          if (use_optimized_scaler)
+            av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+                                        num_planes);
+          else
+            av1_resize_and_extend_frame_nonnormative(
+                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
+#endif
+          cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+      } else {
+        RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+        buf->buf.y_crop_width = ref->y_crop_width;
+        buf->buf.y_crop_height = ref->y_crop_height;
+        cpi->scaled_ref_buf[ref_frame - 1] = buf;
+        ++buf->ref_count;
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+    }
+  }
+}
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+                              int height, int number_spatial_layers) {
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) {
+    return BLOCK_64X64;
+  }
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) {
+    return BLOCK_128X128;
+  }
+#if CONFIG_TFLITE
+  if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64;
+#endif
+  // Force 64x64 superblock size to increase resolution in perceptual
+  // AQ mode.
+  if (oxcf->mode == ALLINTRA &&
+      (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI ||
+       oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) {
+    return BLOCK_64X64;
+  }
+  assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+  if (number_spatial_layers > 1 ||
+      oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
+    // Use the configured size (top resolution) for spatial layers or
+    // on resize.
+    return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720
+               ? BLOCK_128X128
+               : BLOCK_64X64;
+  } else if (oxcf->mode == REALTIME) {
+    return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+  }
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  // When superres / resize is on, 'cm->width / height' can change between
+  // calls, so we don't apply this heuristic there.
+  // Things break if superblock size changes between the first pass and second
+  // pass encoding, which is why this heuristic is not configured as a
+  // speed-feature.
+  if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
+      oxcf->resize_cfg.resize_mode == RESIZE_NONE) {
+    int is_480p_or_lesser = AOMMIN(width, height) <= 480;
+    if ((oxcf->speed >= 1 || oxcf->mode == REALTIME) && is_480p_or_lesser)
+      return BLOCK_64X64;
+
+    // For 1080p and lower resolutions, choose SB size adaptively based on
+    // resolution and speed level for multi-thread encode.
+    int is_1080p_or_lesser = AOMMIN(width, height) <= 1080;
+    if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD &&
+        oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5)
+      return BLOCK_64X64;
+  }
+  return BLOCK_128X128;
+}
+
+void av1_setup_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Set up entropy context depending on frame type. The decoder mandates
+  // the use of the default context, index 0, for keyframes and inter
+  // frames where the error_resilient_mode or intra_only flag is set. For
+  // other inter-frames the encoder currently uses only two contexts;
+  // context 1 for ALTREF frames and context 0 for the others.
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    av1_setup_past_independence(cm);
+  }
+
+  if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+      frame_is_sframe(cm)) {
+    if (!cpi->ppi->seq_params_locked) {
+      set_sb_size(cm->seq_params,
+                  av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+                                     cpi->svc.number_spatial_layers));
+    }
+  } else {
+    const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+    if (primary_ref_buf == NULL) {
+      av1_setup_past_independence(cm);
+      cm->seg.update_map = 1;
+      cm->seg.update_data = 1;
+    } else {
+      *cm->fc = primary_ref_buf->frame_context;
+    }
+  }
+
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  cm->prev_frame = get_primary_ref_frame_buf(cm);
+  cpi->vaq_refresh = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+                                      MV_REFERENCE_FRAME ref,
+                                      InterpFilter ifilter) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+  if (buf == NULL) return 0;
+  return buf->interp_filter_selected[ifilter];
+}
+
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int ref_total[REF_FRAMES] = { 0 };
+  uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+  if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame)
+    return mask;
+
+  for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+         ++ifilter) {
+      ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+    }
+  }
+  int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+                         ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+                         ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+  for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+       ++ifilter) {
+    int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+    if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+      int filter_score =
+          get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+          get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+          get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+      if (filter_score < ref_total_total) {
+        DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+        reset_interp_filter_allowed_mask(&mask, filt_type);
+      }
+    }
+  }
+  return mask;
+}
+
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+static void screen_content_tools_determination(
+    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+    const int allow_intrabc_orig_decision,
+    const int use_screen_content_tools_orig_decision,
+    const int is_screen_content_type_orig_decision, const int pass,
+    int *projected_size_pass, PSNR_STATS *psnr) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  projected_size_pass[pass] =
+      ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+       (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+          ? cpi->ppi->p_rc.temp_projected_frame_size
+          : cpi->rc.projected_frame_size;
+#else
+  projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+  if (pass != 1) return;
+
+  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH;
+  if (is_sc_encoding_much_better) {
+    // Use screen content tools, if we get coding gain.
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->use_screen_content_tools = 1;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size, and a large q is used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->use_screen_content_tools = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turn intrabc on could lead to data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->use_screen_content_tools = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+  // Variables to help determine if we should allow screen content tools.
+  int projected_size_pass[3] = { 0 };
+  PSNR_STATS psnr[3];
+  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+  const int allow_screen_content_tools_orig_decision =
+      cm->features.allow_screen_content_tools;
+  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+  const int use_screen_content_tools_orig_decision =
+      cpi->use_screen_content_tools;
+  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+  // Turn off the encoding trial for forward key frame and superres.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled ||
+      cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME ||
+      use_screen_content_tools_orig_decision || !is_key_frame) {
+    return;
+  }
+
+  // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+  // Find a better way to determine whether screen content tools should be used
+  // for lossless coding.
+  // Use a high q and a fixed partition to do quick encoding.
+  const int q_for_screen_content_quick_run =
+      is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244);
+  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_block_size_orig =
+      cpi->sf.part_sf.fixed_partition_size;
+
+  // Setup necessary params for encoding, including frame source, etc.
+
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+      0, false, false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+        cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
+  }
+
+  av1_setup_frame(cpi);
+
+  if (cm->seg.enabled) {
+    if (!cm->seg.update_data && cm->prev_frame) {
+      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+      cm->seg.enabled = cm->prev_frame->seg.enabled;
+    } else {
+      av1_calculate_segdata(&cm->seg);
+    }
+  } else {
+    memset(&cm->seg, 0, sizeof(cm->seg));
+  }
+  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+  cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  // The two encoding passes aim to help determine whether to use screen
+  // content tools, with a high q and fixed partition.
+  for (int pass = 0; pass < 2; ++pass) {
+    set_encoding_params_for_screen_content(cpi, pass);
+    av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+                      q_for_screen_content_quick_run,
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+    av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+    if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
+      av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                         cm->seq_params->bit_depth);
+
+    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+                                          0);
+    // transform / motion compensation build reconstruction frame
+    av1_encode_frame(cpi);
+    // Screen content decision
+    screen_content_tools_determination(
+        cpi, allow_screen_content_tools_orig_decision,
+        allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+        is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+  }
+
+  // Set partition speed feature back.
+  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+  cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+
+  // Free token related info if screen content coding tools are not enabled.
+  if (!cm->features.allow_screen_content_tools)
+    free_token_info(&cpi->token_info);
+}
+#endif  // CONFIG_REALTIME_ONLY
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  if (!cm->seq_params->reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show == NULL) {
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
+
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params->film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to the its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
+
+    // Iterate the random seed for the next frame.
+    cm->film_grain_params.random_seed += 3381;
+    if (cm->film_grain_params.random_seed == 0)
+      cm->film_grain_params.random_seed = 7391;
+  }
+
+  // Initialise all tiles' contexts from the global frame context
+  for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+    for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+      const int tile_idx = tile_row * cm->tiles.cols + tile_col;
+      cpi->tile_data[tile_idx].tctx = *cm->fc;
+    }
+  }
+
+  fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
+}
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+                      const YV12_BUFFER_CONFIG *last_picture,
+                      ForceIntegerMVInfo *const force_intpel_info) {
+  // check use hash ME
+  int k;
+
+  const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
+  const double threshold_current = 0.8;
+  const double threshold_average = 0.95;
+  const int max_history_size = 32;
+  int T = 0;  // total block
+  int C = 0;  // match with collocated block
+  int S = 0;  // smooth region but not match with collocated block
+
+  const int pic_width = cur_picture->y_width;
+  const int pic_height = cur_picture->y_height;
+  for (int i = 0; i + block_size <= pic_height; i += block_size) {
+    for (int j = 0; j + block_size <= pic_width; j += block_size) {
+      const int x_pos = j;
+      const int y_pos = i;
+      int match = 1;
+      T++;
+
+      // check whether collocated block match with current
+      uint8_t *p_cur = cur_picture->y_buffer;
+      uint8_t *p_ref = last_picture->y_buffer;
+      int stride_cur = cur_picture->y_stride;
+      int stride_ref = last_picture->y_stride;
+      p_cur += (y_pos * stride_cur + x_pos);
+      p_ref += (y_pos * stride_ref + x_pos);
+
+      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p16_cur[tmpX] != p16_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p16_cur += stride_cur;
+          p16_ref += stride_ref;
+        }
+      } else {
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p_cur[tmpX] != p_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p_cur += stride_cur;
+          p_ref += stride_ref;
+        }
+      }
+
+      if (match) {
+        C++;
+        continue;
+      }
+
+      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+                                         y_pos) ||
+          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+        S++;
+        continue;
+      }
+    }
+  }
+
+  assert(T > 0);
+  double cs_rate = ((double)(C + S)) / ((double)(T));
+
+  force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
+
+  force_intpel_info->rate_index =
+      (force_intpel_info->rate_index + 1) % max_history_size;
+  force_intpel_info->rate_size++;
+  force_intpel_info->rate_size =
+      AOMMIN(force_intpel_info->rate_size, max_history_size);
+
+  if (cs_rate < threshold_current) {
+    return 0;
+  }
+
+  if (C == T) {
+    return 1;
+  }
+
+  double cs_average = 0.0;
+
+  for (k = 0; k < force_intpel_info->rate_size; k++) {
+    cs_average += force_intpel_info->cs_rate_array[k];
+  }
+  cs_average /= force_intpel_info->rate_size;
+
+  if (cs_average < threshold_average) {
+    return 0;
+  }
+
+  if ((T - C - S) < 0) {
+    return 1;
+  }
+
+  if (cs_average > 1.01) {
+    return 1;
+  }
+
+  return 0;
+}
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = BLOCK_16X16;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  double log_sum = 0.0;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  // Loop through each 16x16 block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          if (use_hbd) {
+            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
+                                                      xd->bd);
+          } else {
+            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+          }
+
+          num_of_var += 1.0;
+        }
+      }
+      var = var / num_of_var;
+
+      // Curve fitting with an exponential model on all 16x16 blocks from the
+      // midres dataset.
+      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      cpi->ssim_rdmult_scaling_factors[index] = var;
+      log_sum += log(var);
+    }
+  }
+  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+    }
+  }
+}
+
+// Coding context that only needs to be saved when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+  cc->mv_stats = cpi->ppi->mv_stats;
+}
+
+void av1_save_all_coding_context(AV1_COMP *cpi) {
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+#if DUMP_RECON_FRAMES == 1
+
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
+
+  if (recon_buf == NULL) {
+    printf("Frame %d is not ready.\n", current_frame->frame_number);
+    return;
+  }
+
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+  printf(
+      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+      "show_existing_frame=%d) "
+      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+      current_frame->frame_number, current_frame->order_hint, cm->show_frame,
+      cm->show_existing_frame);
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+    printf(" %d(%c)", ref_offset,
+           (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
+  }
+  printf(" ]\n");
+
+  if (!cm->show_frame) {
+    printf("Frame %d is a no show frame, so no image dump.\n",
+           current_frame->frame_number);
+    return;
+  }
+
+  int h;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+
+  if (current_frame->frame_number == 0) {
+    if ((f_recon = fopen(file_name, "wb")) == NULL) {
+      printf("Unable to open file %s to write.\n", file_name);
+      return;
+    }
+  } else {
+    if ((f_recon = fopen(file_name, "ab")) == NULL) {
+      printf("Unable to open file %s to append.\n", file_name);
+      return;
+    }
+  }
+  printf(
+      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+      "refresh_alt_ref_frame=%d, "
+      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+      current_frame->frame_number, cpi->gf_frame_index,
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+      current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
+      cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame,
+      recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+  int ref_frame;
+  printf("get_ref_frame_map_idx: [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
+  printf(" ]\n");
+#endif  // 0
+
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+           f_recon);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+           f_recon);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+           f_recon);
+  }
+
+  fclose(f_recon);
+}
+#endif  // DUMP_RECON_FRAMES
diff --git a/media/libaom/src/av1/encoder/encoder_utils.h b/media/libaom/src/av1/encoder/encoder_utils.h
new file mode 100644
index 0000000000..5ff9ca3106
--- /dev/null
+++ b/media/libaom/src/av1/encoder/encoder_utils.h
@@ -0,0 +1,1054 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+                                      [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                                [SWITCHABLE_FILTER_CONTEXTS]
+                                                [SWITCHABLE_FILTERS];
+
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used, instead only inactive blocks should be reset.
+static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  int i;
+  if (cpi->active_map.enabled || cpi->active_map.update)
+    for (i = 0;
+         i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i)
+      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                                 int height) {
+  // Ensure that the decoded width and height are both multiples of
+  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+  // subsampling is used).
+  // This simplifies the implementation of various experiments,
+  // eg. cdef, which operates on units of 8x8 luma pixels.
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+
+  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
+  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
+  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2;
+  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
+  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  mi_params->mi_alloc_stride =
+      (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+         mi_size_high[mi_params->mi_alloc_bsize]);
+}
+
+static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
+  aom_free(mi_params->mi_alloc);
+  mi_params->mi_alloc = NULL;
+  aom_free(mi_params->mi_grid_base);
+  mi_params->mi_grid_base = NULL;
+  mi_params->mi_alloc_size = 0;
+  aom_free(mi_params->tx_type_map);
+  mi_params->tx_type_map = NULL;
+}
+
+static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                                     int height,
+                                     BLOCK_SIZE min_partition_size) {
+  mi_params->mi_alloc_bsize = min_partition_size;
+
+  set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params,
+                                            int width, int height,
+                                            BLOCK_SIZE min_partition_size) {
+  (void)min_partition_size;
+  mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+  set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) {
+  const int mi_grid_size =
+      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+  memset(mi_params->mi_alloc, 0,
+         mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
+  memset(mi_params->mi_grid_base, 0,
+         mi_grid_size * sizeof(*mi_params->mi_grid_base));
+  memset(mi_params->tx_type_map, 0,
+         mi_grid_size * sizeof(*mi_params->tx_type_map));
+}
+
+static AOM_INLINE void init_buffer_indices(
+    ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) {
+  int fb_idx;
+  for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+    remapped_ref_idx[fb_idx] = fb_idx;
+  force_intpel_info->rate_index = 0;
+  force_intpel_info->rate_size = 0;
+}
+
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+  ppi->fn_ptr[BT].sdf = SDF;                                           \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                         \
+  ppi->fn_ptr[BT].vf = VF;                                             \
+  ppi->fn_ptr[BT].svf = SVF;                                           \
+  ppi->fn_ptr[BT].svaf = SVAF;                                         \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                       \
+  ppi->fn_ptr[BT].jsvaf = JSVAF;
+
+#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD)                                \
+  HIGHBD_BFP(                                                                \
+      BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \
+      aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD,                       \
+      aom_highbd_##BD##_variance##WIDTH##x##HEIGHT,                          \
+      aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT,                \
+      aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT,            \
+      aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD,                        \
+      aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD,              \
+      aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
+  static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
+                                     int source_stride,                        \
+                                     const uint8_t *ref_ptr, int ref_stride) { \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride);                \
+  }                                                                            \
+  static unsigned int fnname##_bits10(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride) {                                                        \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2;           \
+  }                                                                            \
+  static unsigned int fnname##_bits12(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride) {                                                        \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4;           \
+  }
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname)                                        \
+  static unsigned int fnname##_bits8(                                          \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred);   \
+  }                                                                            \
+  static unsigned int fnname##_bits10(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+           2;                                                                  \
+  }                                                                            \
+  static unsigned int fnname##_bits12(                                         \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+           4;                                                                  \
+  }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
+  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
+                             const uint8_t *const ref_ptr[], int ref_stride,  \
+                             unsigned int *sad_array) {                       \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+  }                                                                           \
+  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
+  }                                                                           \
+  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
+  }
+
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                    \
+  static unsigned int fnname##_bits8(                                       \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param);                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits10(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           2;                                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits12(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const DIST_WTD_COMP_PARAMS *jcp_param) {                              \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           4;                                                               \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+#endif
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+  ppi->fn_ptr[BT].msdf = MCSDF;       \
+  ppi->fn_ptr[BT].msvf = MCSVF;
+
+#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD)                    \
+  HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT,                           \
+              aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
+              aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
+  static unsigned int fnname##_bits8(                                    \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask);            \
+  }                                                                      \
+  static unsigned int fnname##_bits10(                                   \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask) >>          \
+           2;                                                            \
+  }                                                                      \
+  static unsigned int fnname##_bits12(                                   \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+      int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m,  \
+      int m_stride, int invert_mask) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride,           \
+                  second_pred_ptr, m, m_stride, invert_mask) >>          \
+           4;                                                            \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#if !CONFIG_REALTIME_ONLY
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
+#endif
+
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+  ppi->fn_ptr[BT].sdsf = SDSF;          \
+  ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
+  HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,                          \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
+#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname)                                  \
+  static unsigned int fnname##_bits8(const uint8_t *src, int src_stride,    \
+                                     const uint8_t *ref, int ref_stride) {  \
+    return fnname(src, src_stride, ref, ref_stride);                        \
+  }                                                                         \
+  static unsigned int fnname##_bits10(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 2;                   \
+  }                                                                         \
+  static unsigned int fnname##_bits12(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride) { \
+    return fnname(src, src_stride, ref, ref_stride) >> 4;                   \
+  }
+
+#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname)                                 \
+  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
+                             const uint8_t *const ref_ptr[], int ref_stride,  \
+                             unsigned int *sad_array) {                       \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+  }                                                                           \
+  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 2;                               \
+  }                                                                           \
+  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride,      \
+                              const uint8_t *const ref_ptr[], int ref_stride, \
+                              unsigned int *sad_array) {                      \
+    int i;                                                                    \
+    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);           \
+    for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+#endif
+
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT)                 \
+  HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT,                      \
+              aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
+              aom_highbd_obmc_variance##WIDTH##x##HEIGHT,    \
+              aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+  ppi->fn_ptr[BT].osdf = OSDF;           \
+  ppi->fn_ptr[BT].ovf = OVF;             \
+  ppi->fn_ptr[BT].osvf = OSVF;
+
+#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
+  HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT,                          \
+              aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD,  \
+              aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \
+              aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname)                                     \
+  static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride,  \
+                                     const int32_t *wsrc,                 \
+                                     const int32_t *msk) {                \
+    return fnname(ref, ref_stride, wsrc, msk);                            \
+  }                                                                       \
+  static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+                                      const int32_t *wsrc,                \
+                                      const int32_t *msk) {               \
+    return fnname(ref, ref_stride, wsrc, msk) >> 2;                       \
+  }                                                                       \
+  static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+                                      const int32_t *wsrc,                \
+                                      const int32_t *msk) {               \
+    return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#endif
+
+static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  if (seq_params->use_highbitdepth) {
+    switch (seq_params->bit_depth) {
+      case AOM_BITS_8:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 64, 8)
+        HIGHBD_BFP_WRAPPER(32, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 32, 8)
+        HIGHBD_BFP_WRAPPER(16, 4, 8)
+        HIGHBD_BFP_WRAPPER(4, 16, 8)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 32, 8)
+        HIGHBD_BFP_WRAPPER(64, 32, 8)
+        HIGHBD_BFP_WRAPPER(32, 64, 8)
+        HIGHBD_BFP_WRAPPER(32, 32, 8)
+        HIGHBD_BFP_WRAPPER(64, 64, 8)
+        HIGHBD_BFP_WRAPPER(16, 16, 8)
+        HIGHBD_BFP_WRAPPER(16, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 16, 8)
+        HIGHBD_BFP_WRAPPER(8, 8, 8)
+        HIGHBD_BFP_WRAPPER(8, 4, 8)
+        HIGHBD_BFP_WRAPPER(4, 8, 8)
+        HIGHBD_BFP_WRAPPER(4, 4, 8)
+        HIGHBD_BFP_WRAPPER(128, 128, 8)
+        HIGHBD_BFP_WRAPPER(128, 64, 8)
+        HIGHBD_BFP_WRAPPER(64, 128, 8)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 8)
+        HIGHBD_MBFP_WRAPPER(128, 64, 8)
+        HIGHBD_MBFP_WRAPPER(64, 128, 8)
+        HIGHBD_MBFP_WRAPPER(64, 64, 8)
+        HIGHBD_MBFP_WRAPPER(64, 32, 8)
+        HIGHBD_MBFP_WRAPPER(32, 64, 8)
+        HIGHBD_MBFP_WRAPPER(32, 32, 8)
+        HIGHBD_MBFP_WRAPPER(32, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 32, 8)
+        HIGHBD_MBFP_WRAPPER(16, 16, 8)
+        HIGHBD_MBFP_WRAPPER(8, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 8, 8)
+        HIGHBD_MBFP_WRAPPER(4, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 4, 8)
+        HIGHBD_MBFP_WRAPPER(4, 4, 8)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 8)
+        HIGHBD_MBFP_WRAPPER(16, 64, 8)
+        HIGHBD_MBFP_WRAPPER(32, 8, 8)
+        HIGHBD_MBFP_WRAPPER(8, 32, 8)
+        HIGHBD_MBFP_WRAPPER(16, 4, 8)
+        HIGHBD_MBFP_WRAPPER(4, 16, 8)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_OBFP_WRAPPER_8(128, 128)
+        HIGHBD_OBFP_WRAPPER_8(128, 64)
+        HIGHBD_OBFP_WRAPPER_8(64, 128)
+        HIGHBD_OBFP_WRAPPER_8(64, 64)
+        HIGHBD_OBFP_WRAPPER_8(64, 32)
+        HIGHBD_OBFP_WRAPPER_8(32, 64)
+        HIGHBD_OBFP_WRAPPER_8(32, 32)
+        HIGHBD_OBFP_WRAPPER_8(32, 16)
+        HIGHBD_OBFP_WRAPPER_8(16, 32)
+        HIGHBD_OBFP_WRAPPER_8(16, 16)
+        HIGHBD_OBFP_WRAPPER_8(8, 16)
+        HIGHBD_OBFP_WRAPPER_8(16, 8)
+        HIGHBD_OBFP_WRAPPER_8(8, 8)
+        HIGHBD_OBFP_WRAPPER_8(4, 8)
+        HIGHBD_OBFP_WRAPPER_8(8, 4)
+        HIGHBD_OBFP_WRAPPER_8(4, 4)
+        HIGHBD_OBFP_WRAPPER_8(64, 16)
+        HIGHBD_OBFP_WRAPPER_8(16, 64)
+        HIGHBD_OBFP_WRAPPER_8(32, 8)
+        HIGHBD_OBFP_WRAPPER_8(8, 32)
+        HIGHBD_OBFP_WRAPPER_8(16, 4)
+        HIGHBD_OBFP_WRAPPER_8(4, 16)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 8)
+        HIGHBD_SDSFP_WRAPPER(128, 64, 8)
+        HIGHBD_SDSFP_WRAPPER(64, 128, 8)
+        HIGHBD_SDSFP_WRAPPER(64, 64, 8)
+        HIGHBD_SDSFP_WRAPPER(64, 32, 8)
+        HIGHBD_SDSFP_WRAPPER(32, 64, 8)
+        HIGHBD_SDSFP_WRAPPER(32, 32, 8)
+        HIGHBD_SDSFP_WRAPPER(32, 16, 8)
+        HIGHBD_SDSFP_WRAPPER(16, 32, 8)
+        HIGHBD_SDSFP_WRAPPER(16, 16, 8)
+        HIGHBD_SDSFP_WRAPPER(16, 8, 8)
+        HIGHBD_SDSFP_WRAPPER(8, 16, 8)
+        HIGHBD_SDSFP_WRAPPER(8, 8, 8)
+        HIGHBD_SDSFP_WRAPPER(4, 8, 8)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 8)
+        HIGHBD_SDSFP_WRAPPER(32, 8, 8)
+        HIGHBD_SDSFP_WRAPPER(16, 64, 8)
+        HIGHBD_SDSFP_WRAPPER(8, 32, 8)
+        HIGHBD_SDSFP_WRAPPER(4, 16, 8)
+#endif
+        break;
+
+      case AOM_BITS_10:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 64, 10)
+        HIGHBD_BFP_WRAPPER(32, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 32, 10)
+        HIGHBD_BFP_WRAPPER(16, 4, 10)
+        HIGHBD_BFP_WRAPPER(4, 16, 10)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 32, 10)
+        HIGHBD_BFP_WRAPPER(64, 32, 10)
+        HIGHBD_BFP_WRAPPER(32, 64, 10)
+        HIGHBD_BFP_WRAPPER(32, 32, 10)
+        HIGHBD_BFP_WRAPPER(64, 64, 10)
+        HIGHBD_BFP_WRAPPER(16, 16, 10)
+        HIGHBD_BFP_WRAPPER(16, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 16, 10)
+        HIGHBD_BFP_WRAPPER(8, 8, 10)
+        HIGHBD_BFP_WRAPPER(8, 4, 10)
+        HIGHBD_BFP_WRAPPER(4, 8, 10)
+        HIGHBD_BFP_WRAPPER(4, 4, 10)
+        HIGHBD_BFP_WRAPPER(128, 128, 10)
+        HIGHBD_BFP_WRAPPER(128, 64, 10)
+        HIGHBD_BFP_WRAPPER(64, 128, 10)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 10)
+        HIGHBD_MBFP_WRAPPER(128, 64, 10)
+        HIGHBD_MBFP_WRAPPER(64, 128, 10)
+        HIGHBD_MBFP_WRAPPER(64, 64, 10)
+        HIGHBD_MBFP_WRAPPER(64, 32, 10)
+        HIGHBD_MBFP_WRAPPER(32, 64, 10)
+        HIGHBD_MBFP_WRAPPER(32, 32, 10)
+        HIGHBD_MBFP_WRAPPER(32, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 32, 10)
+        HIGHBD_MBFP_WRAPPER(16, 16, 10)
+        HIGHBD_MBFP_WRAPPER(8, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 8, 10)
+        HIGHBD_MBFP_WRAPPER(4, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 4, 10)
+        HIGHBD_MBFP_WRAPPER(4, 4, 10)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 10)
+        HIGHBD_MBFP_WRAPPER(16, 64, 10)
+        HIGHBD_MBFP_WRAPPER(32, 8, 10)
+        HIGHBD_MBFP_WRAPPER(8, 32, 10)
+        HIGHBD_MBFP_WRAPPER(16, 4, 10)
+        HIGHBD_MBFP_WRAPPER(4, 16, 10)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_OBFP_WRAPPER(128, 128, 10)
+        HIGHBD_OBFP_WRAPPER(128, 64, 10)
+        HIGHBD_OBFP_WRAPPER(64, 128, 10)
+        HIGHBD_OBFP_WRAPPER(64, 64, 10)
+        HIGHBD_OBFP_WRAPPER(64, 32, 10)
+        HIGHBD_OBFP_WRAPPER(32, 64, 10)
+        HIGHBD_OBFP_WRAPPER(32, 32, 10)
+        HIGHBD_OBFP_WRAPPER(32, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 32, 10)
+        HIGHBD_OBFP_WRAPPER(16, 16, 10)
+        HIGHBD_OBFP_WRAPPER(8, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 8, 10)
+        HIGHBD_OBFP_WRAPPER(4, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 4, 10)
+        HIGHBD_OBFP_WRAPPER(4, 4, 10)
+        HIGHBD_OBFP_WRAPPER(64, 16, 10)
+        HIGHBD_OBFP_WRAPPER(16, 64, 10)
+        HIGHBD_OBFP_WRAPPER(32, 8, 10)
+        HIGHBD_OBFP_WRAPPER(8, 32, 10)
+        HIGHBD_OBFP_WRAPPER(16, 4, 10)
+        HIGHBD_OBFP_WRAPPER(4, 16, 10)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 10)
+        HIGHBD_SDSFP_WRAPPER(128, 64, 10)
+        HIGHBD_SDSFP_WRAPPER(64, 128, 10)
+        HIGHBD_SDSFP_WRAPPER(64, 64, 10)
+        HIGHBD_SDSFP_WRAPPER(64, 32, 10)
+        HIGHBD_SDSFP_WRAPPER(32, 64, 10)
+        HIGHBD_SDSFP_WRAPPER(32, 32, 10)
+        HIGHBD_SDSFP_WRAPPER(32, 16, 10)
+        HIGHBD_SDSFP_WRAPPER(16, 32, 10)
+        HIGHBD_SDSFP_WRAPPER(16, 16, 10)
+        HIGHBD_SDSFP_WRAPPER(16, 8, 10)
+        HIGHBD_SDSFP_WRAPPER(8, 16, 10)
+        HIGHBD_SDSFP_WRAPPER(8, 8, 10)
+        HIGHBD_SDSFP_WRAPPER(4, 8, 10)
+
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 10)
+        HIGHBD_SDSFP_WRAPPER(32, 8, 10)
+        HIGHBD_SDSFP_WRAPPER(16, 64, 10)
+        HIGHBD_SDSFP_WRAPPER(8, 32, 10)
+        HIGHBD_SDSFP_WRAPPER(4, 16, 10)
+#endif
+        break;
+
+      case AOM_BITS_12:
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_BFP_WRAPPER(64, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 64, 12)
+        HIGHBD_BFP_WRAPPER(32, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 32, 12)
+        HIGHBD_BFP_WRAPPER(16, 4, 12)
+        HIGHBD_BFP_WRAPPER(4, 16, 12)
+#endif
+        HIGHBD_BFP_WRAPPER(32, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 32, 12)
+        HIGHBD_BFP_WRAPPER(64, 32, 12)
+        HIGHBD_BFP_WRAPPER(32, 64, 12)
+        HIGHBD_BFP_WRAPPER(32, 32, 12)
+        HIGHBD_BFP_WRAPPER(64, 64, 12)
+        HIGHBD_BFP_WRAPPER(16, 16, 12)
+        HIGHBD_BFP_WRAPPER(16, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 16, 12)
+        HIGHBD_BFP_WRAPPER(8, 8, 12)
+        HIGHBD_BFP_WRAPPER(8, 4, 12)
+        HIGHBD_BFP_WRAPPER(4, 8, 12)
+        HIGHBD_BFP_WRAPPER(4, 4, 12)
+        HIGHBD_BFP_WRAPPER(128, 128, 12)
+        HIGHBD_BFP_WRAPPER(128, 64, 12)
+        HIGHBD_BFP_WRAPPER(64, 128, 12)
+
+        HIGHBD_MBFP_WRAPPER(128, 128, 12)
+        HIGHBD_MBFP_WRAPPER(128, 64, 12)
+        HIGHBD_MBFP_WRAPPER(64, 128, 12)
+        HIGHBD_MBFP_WRAPPER(64, 64, 12)
+        HIGHBD_MBFP_WRAPPER(64, 32, 12)
+        HIGHBD_MBFP_WRAPPER(32, 64, 12)
+        HIGHBD_MBFP_WRAPPER(32, 32, 12)
+        HIGHBD_MBFP_WRAPPER(32, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 32, 12)
+        HIGHBD_MBFP_WRAPPER(16, 16, 12)
+        HIGHBD_MBFP_WRAPPER(8, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 8, 12)
+        HIGHBD_MBFP_WRAPPER(4, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 4, 12)
+        HIGHBD_MBFP_WRAPPER(4, 4, 12)
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_MBFP_WRAPPER(64, 16, 12)
+        HIGHBD_MBFP_WRAPPER(16, 64, 12)
+        HIGHBD_MBFP_WRAPPER(32, 8, 12)
+        HIGHBD_MBFP_WRAPPER(8, 32, 12)
+        HIGHBD_MBFP_WRAPPER(16, 4, 12)
+        HIGHBD_MBFP_WRAPPER(4, 16, 12)
+#endif
+
+// OBMC excluded from realtime only build.
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_OBFP_WRAPPER(128, 128, 12)
+        HIGHBD_OBFP_WRAPPER(128, 64, 12)
+        HIGHBD_OBFP_WRAPPER(64, 128, 12)
+        HIGHBD_OBFP_WRAPPER(64, 64, 12)
+        HIGHBD_OBFP_WRAPPER(64, 32, 12)
+        HIGHBD_OBFP_WRAPPER(32, 64, 12)
+        HIGHBD_OBFP_WRAPPER(32, 32, 12)
+        HIGHBD_OBFP_WRAPPER(32, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 32, 12)
+        HIGHBD_OBFP_WRAPPER(16, 16, 12)
+        HIGHBD_OBFP_WRAPPER(8, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 8, 12)
+        HIGHBD_OBFP_WRAPPER(4, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 4, 12)
+        HIGHBD_OBFP_WRAPPER(4, 4, 12)
+        HIGHBD_OBFP_WRAPPER(64, 16, 12)
+        HIGHBD_OBFP_WRAPPER(16, 64, 12)
+        HIGHBD_OBFP_WRAPPER(32, 8, 12)
+        HIGHBD_OBFP_WRAPPER(8, 32, 12)
+        HIGHBD_OBFP_WRAPPER(16, 4, 12)
+        HIGHBD_OBFP_WRAPPER(4, 16, 12)
+#endif
+
+        HIGHBD_SDSFP_WRAPPER(128, 128, 12)
+        HIGHBD_SDSFP_WRAPPER(128, 64, 12)
+        HIGHBD_SDSFP_WRAPPER(64, 128, 12)
+        HIGHBD_SDSFP_WRAPPER(64, 64, 12)
+        HIGHBD_SDSFP_WRAPPER(64, 32, 12)
+        HIGHBD_SDSFP_WRAPPER(32, 64, 12)
+        HIGHBD_SDSFP_WRAPPER(32, 32, 12)
+        HIGHBD_SDSFP_WRAPPER(32, 16, 12)
+        HIGHBD_SDSFP_WRAPPER(16, 32, 12)
+        HIGHBD_SDSFP_WRAPPER(16, 16, 12)
+        HIGHBD_SDSFP_WRAPPER(16, 8, 12)
+        HIGHBD_SDSFP_WRAPPER(8, 16, 12)
+        HIGHBD_SDSFP_WRAPPER(8, 8, 12)
+        HIGHBD_SDSFP_WRAPPER(4, 8, 12)
+
+#if !CONFIG_REALTIME_ONLY
+        HIGHBD_SDSFP_WRAPPER(64, 16, 12)
+        HIGHBD_SDSFP_WRAPPER(32, 8, 12)
+        HIGHBD_SDSFP_WRAPPER(16, 64, 12)
+        HIGHBD_SDSFP_WRAPPER(8, 32, 12)
+        HIGHBD_SDSFP_WRAPPER(4, 16, 12)
+#endif
+        break;
+
+      default:
+        assert(0 &&
+               "cm->seq_params->bit_depth should be AOM_BITS_8, "
+               "AOM_BITS_10 or AOM_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+    av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
+  }
+  if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+      cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+    av1_copy(frame_probs->obmc_probs, default_obmc_probs);
+  }
+  if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+    av1_copy(frame_probs->warped_probs, default_warped_probs);
+  }
+  if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+    av1_copy(frame_probs->switchable_interp_probs,
+             default_switchable_interp_probs);
+  }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+      av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs);
+    }
+    if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+        cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+      av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs);
+    }
+    if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+      av1_copy(temp_frame_probs->warped_probs, default_warped_probs);
+    }
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+      av1_copy(temp_frame_probs->switchable_interp_probs,
+               default_switchable_interp_probs);
+    }
+
+    FrameProbInfo *const temp_frame_probs_simulation =
+        &cpi->ppi->temp_frame_probs_simulation;
+    if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+      av1_copy(temp_frame_probs_simulation->tx_type_probs,
+               default_tx_type_probs);
+    }
+    if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+        cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+      av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs);
+    }
+    if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+      av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs);
+    }
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+      av1_copy(temp_frame_probs_simulation->switchable_interp_probs,
+               default_switchable_interp_probs);
+    }
+  }
+#endif
+}
+
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+                                                   const CdefInfo *const src) {
+  dst->cdef_bits = src->cdef_bits;
+  dst->cdef_damping = src->cdef_damping;
+  av1_copy(dst->cdef_strengths, src->cdef_strengths);
+  av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+  dst->nb_cdef_strengths = src->nb_cdef_strengths;
+}
+
+// Coding context that only needs to be restored when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoraton).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info);
+  cpi->rc = cc->rc;
+  cpi->ppi->mv_stats = cc->mv_stats;
+}
+
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                                  const YV12_BUFFER_CONFIG *b) {
+  return a->y_height == b->y_height && a->y_width == b->y_width &&
+         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+         a->border == b->border &&
+         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+                                     bool *ext_refresh_frame_context_pending,
+                                     bool update) {
+  *ext_refresh_frame_context = update;
+  *ext_refresh_frame_context_pending = 1;
+  return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+                                                   double max_factor,
+                                                   int prior_boost,
+                                                   int tpl_boost,
+                                                   int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+#endif
+
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+  int i;
+  AV1_COMMON *const cm = &cpi->common;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    cm->global_motion[i] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+  cm->features.interp_filter = SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
+}
+
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      --buf->ref_count;
+      cpi->scaled_ref_buf[i] = NULL;
+    }
+  }
+}
+
+static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+// Refresh reference frame buffers according to refresh_frame_flags.
+static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  // All buffers are refreshed for shown keyframes and S-frames.
+
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
+    if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+      assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+    }
+  }
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+                                          const AV1EncoderConfig *oxcf);
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+                                      const AV1EncoderConfig *oxcf);
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+                          const int phase, const int use_optimized_scaler);
+
+void av1_setup_frame(AV1_COMP *cpi);
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+                              int height, int number_spatial_layers);
+
+void av1_apply_active_map(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi);
+
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig);
+#endif
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                 int *top_index);
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi);
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+                      const YV12_BUFFER_CONFIG *last_picture,
+                      ForceIntegerMVInfo *const force_intpel_info);
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_save_all_coding_context(AV1_COMP *cpi);
+
+#if DUMP_RECON_FRAMES == 1
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODER_UTILS_H_
diff --git a/media/libaom/src/av1/encoder/encodetxb.c b/media/libaom/src/av1/encoder/encodetxb.c
index 825d52a7ae..4ea4f4c80d 100644
--- a/media/libaom/src/av1/encoder/encodetxb.c
+++ b/media/libaom/src/av1/encoder/encodetxb.c
@@ -23,67 +23,64 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
-#if CONFIG_HTB_TRELLIS
-static int hbt_needs_init = 1;
-static CRC32C crc_calculator;
-static const int HBT_EOB = 16;            // also the length in opt_qcoeff
-static const int HBT_TABLE_SIZE = 65536;  // 16 bit: holds 65536 'arrays'
-static const int HBT_ARRAY_LENGTH = 256;  // 8 bit: 256 entries
-// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
-static const int HBT_KICKOUT = 3;
-
-typedef struct OptTxbQcoeff {
-  // Use larger type if larger/no kickout value is used in hbt_create_hashes
-  int8_t deltas[16];
-  uint32_t hbt_qc_hash;
-  uint32_t hbt_ctx_hash;
-  int init;
-  int rate_cost;
-} OptTxbQcoeff;
-
-OptTxbQcoeff *hbt_hash_table;
-#endif  // CONFIG_HTB_TRELLIS
-
-typedef struct LevelDownStats {
-  int update;
-  tran_low_t low_qc;
-  tran_low_t low_dqc;
-  int64_t dist0;
-  int rate;
-  int rate_low;
-  int64_t dist;
-  int64_t dist_low;
-  int64_t rd;
-  int64_t rd_low;
-  int64_t nz_rd;
-  int64_t rd_diff;
-  int cost_diff;
-  int64_t dist_diff;
-  int new_eob;
-} LevelDownStats;
-
-static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
-                          const qm_val_t *iqmatrix) {
-  int dqv = dequant[!!coeff_idx];
-  if (iqmatrix != NULL)
-    dqv =
-        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-  return dqv;
-}
-
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+  const int num_sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+  const int num_sb_cols =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+  const int size = num_sb_rows * num_sb_cols;
+  const int num_planes = av1_num_planes(cm);
+  const int subsampling_x = cm->seq_params->subsampling_x;
+  const int subsampling_y = cm->seq_params->subsampling_y;
+  const int luma_max_sb_square =
+      1 << num_pels_log2_lookup[cm->seq_params->sb_size];
+  const int chroma_max_sb_square =
+      luma_max_sb_square >> (subsampling_x + subsampling_y);
+  const int num_tcoeffs =
+      size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square);
+  const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
 
   av1_free_txb_buf(cpi);
   // TODO(jingning): This should be further reduced.
   CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
-                  aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
+                  aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+  CHECK_MEM_ERROR(
+      cm, coeff_buf_pool->tcoeff,
+      aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+  CHECK_MEM_ERROR(
+      cm, coeff_buf_pool->eobs,
+      aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size));
+  CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx,
+                  aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) *
+                             num_tcoeffs / txb_unit_size));
+
+  tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+  uint16_t *eob_ptr = coeff_buf_pool->eobs;
+  uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+  for (int i = 0; i < size; i++) {
+    for (int plane = 0; plane < num_planes; plane++) {
+      const int max_sb_square =
+          (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
+      cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+      cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+      cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+      tcoeff_ptr += max_sb_square;
+      eob_ptr += max_sb_square / txb_unit_size;
+      entropy_ctx_ptr += max_sb_square / txb_unit_size;
+    }
+  }
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+  CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+  aom_free(cpi->coeff_buffer_base);
+  aom_free(coeff_buf_pool->tcoeff);
+  aom_free(coeff_buf_pool->eobs);
+  aom_free(coeff_buf_pool->entropy_ctx);
 }
 
-void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
-
 static void write_golomb(aom_writer *w, int level) {
   int x = level + 1;
   int i = x;
@@ -100,30 +97,6 @@ static void write_golomb(aom_writer *w, int level) {
   for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
 }
 
-static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
-  if (qc == 0) {
-    return 0;
-  }
-  return qc > 0 ? qc - 1 : qc + 1;
-}
-
-static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
-                                           int dqv, int shift,
-                                           const qm_val_t *iqmatrix) {
-  int sign = qc < 0 ? -1 : 1;
-  if (iqmatrix != NULL)
-    dqv =
-        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-  return sign * ((abs(qc) * dqv) >> shift);
-}
-
-static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
-                                     int shift) {
-  const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
-  const int64_t error = diff * diff;
-  return error;
-}
-
 static const int8_t eob_to_pos_small[33] = {
   0, 1, 2,                                        // 0-2
   3, 3,                                           // 3-4
@@ -141,7 +114,7 @@ static const int8_t eob_to_pos_large[17] = {
   11                               // 513-
 };
 
-static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+int av1_get_eob_pos_token(const int eob, int *const extra) {
   int t;
 
   if (eob < 33) {
@@ -167,7 +140,7 @@ void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
                             uint8_t allow_update_cdf) {
 #endif
   int eob_extra;
-  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
   TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
 
   const int eob_multi_size = txsize_log2_minus4[tx_size];
@@ -246,110 +219,6 @@ void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
   }
 }
 
-static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
-                        const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
-  int eob_extra;
-  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
-  int eob_cost = 0;
-  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
-  eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
-
-  if (av1_eob_offset_bits[eob_pt] > 0) {
-    const int eob_ctx = eob_pt - 3;
-    const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
-    const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
-    eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
-    const int offset_bits = av1_eob_offset_bits[eob_pt];
-    if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
-  }
-  return eob_cost;
-}
-
-static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
-                                    const int (*dc_sign_cost)[2],
-                                    int dc_sign_ctx) {
-  if (coeff_idx == 0) {
-    const int sign = (qc < 0) ? 1 : 0;
-    return dc_sign_cost[dc_sign_ctx][sign];
-  }
-  return av1_cost_literal(1);
-}
-
-static const int golomb_bits_cost[32] = {
-  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
-  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
-  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
-  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
-};
-static const int golomb_cost_diff[32] = {
-  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
-  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
-};
-
-static INLINE int get_golomb_cost(int abs_qc) {
-  if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
-    const int length = get_msb(r) + 1;
-    return av1_cost_literal(2 * length - 1);
-  }
-  return 0;
-}
-
-static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
-                                        int *diff) {
-  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-  int golomb_bits = 0;
-  if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
-    *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
-
-  if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
-    int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
-    if (r < 32) {
-      golomb_bits = golomb_bits_cost[r];
-      *diff += golomb_cost_diff[r];
-    } else {
-      golomb_bits = get_golomb_cost(level);
-      *diff += (r & (r - 1)) == 0 ? 1024 : 0;
-    }
-  }
-
-  return coeff_lps[base_range] + golomb_bits;
-}
-
-static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
-  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-  return coeff_lps[base_range] + get_golomb_cost(level);
-}
-
-static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
-                          const int is_eob, const TxbInfo *const txb_info,
-                          const LV_MAP_COEFF_COST *const txb_costs,
-                          const int coeff_ctx, const TX_CLASS tx_class) {
-  const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
-  const int is_nz = (qc != 0);
-  const tran_low_t abs_qc = abs(qc);
-  int cost = 0;
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int pos = scan[scan_idx];
-
-  if (is_eob) {
-    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
-  } else {
-    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
-  }
-  if (is_nz) {
-    cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
-                              txb_ctx->dc_sign_ctx);
-
-    if (abs_qc > NUM_BASE_LEVELS) {
-      const int ctx =
-          get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
-      cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
-    }
-  }
-  return cost;
-}
-
 static INLINE int get_nz_map_ctx(const uint8_t *const levels,
                                  const int coeff_idx, const int bwl,
                                  const int height, const int scan_idx,
@@ -366,111 +235,6 @@ static INLINE int get_nz_map_ctx(const uint8_t *const levels,
   return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
 }
 
-static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
-                                const int is_eob,
-                                const LV_MAP_COEFF_COST *const txb_costs,
-                                const TxbInfo *const txb_info,
-                                const TX_CLASS tx_class) {
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int coeff_idx = scan[scan_idx];
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const uint8_t *const levels = txb_info->levels;
-  stats->new_eob = -1;
-  stats->update = 0;
-  stats->rd_low = 0;
-  stats->rd = 0;
-  stats->nz_rd = 0;
-  stats->dist_low = 0;
-  stats->rate_low = 0;
-  stats->low_qc = 0;
-
-  const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-  const int coeff_ctx =
-      get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
-                     scan_idx, is_eob, txb_info->tx_size, tx_class);
-  const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
-                                     coeff_ctx, tx_class);
-  assert(qc != 0);
-  const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
-                                           txb_info->iqmatrix);
-  const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
-
-  // distortion difference when coefficient is quantized to 0
-  const tran_low_t dqc0 =
-      qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
-
-  stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
-  stats->dist = dqc_dist - stats->dist0;
-  stats->rate = qc_cost;
-
-  stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
-
-  stats->low_qc = get_lower_coeff(qc);
-
-  if (is_eob && stats->low_qc == 0) {
-    stats->rd_low = stats->rd;  // disable selection of low_qc in this case.
-  } else {
-    if (stats->low_qc == 0) {
-      stats->dist_low = 0;
-    } else {
-      stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
-                                         txb_info->shift, txb_info->iqmatrix);
-      const int64_t low_dqc_dist =
-          get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
-      stats->dist_low = low_dqc_dist - stats->dist0;
-    }
-    const int low_qc_cost =
-        get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
-                       coeff_ctx, tx_class);
-    stats->rate_low = low_qc_cost;
-    stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
-  }
-}
-
-static void get_dist_cost_stats_with_eob(
-    LevelDownStats *const stats, const int scan_idx,
-    const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
-    const TX_CLASS tx_class) {
-  const int is_eob = 0;
-  get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class);
-
-  const int16_t *const scan = txb_info->scan_order->scan;
-  const int coeff_idx = scan[scan_idx];
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const int coeff_ctx_temp = get_nz_map_ctx(
-      txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1,
-      txb_info->tx_size, tx_class);
-  const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
-                                         coeff_ctx_temp, tx_class);
-  int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
-  if (stats->low_qc != 0) {
-    const int low_qc_eob_cost =
-        get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
-                       coeff_ctx_temp, tx_class);
-    int64_t rd_eob_low =
-        RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
-    rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob;
-  }
-
-  stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
-}
-
-static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
-                                 const TxbInfo *const txb_info) {
-  txb_info->qcoeff[coeff_idx] = qc;
-  txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] =
-      (uint8_t)clamp(abs(qc), 0, INT8_MAX);
-}
-
-static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
-                                const TxbInfo *const txb_info) {
-  update_qcoeff(coeff_idx, qc, txb_info);
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-  txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff(
-      qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
-}
-
 void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
                            const int height, uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
@@ -507,8 +271,9 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           int block, TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-  const int txb_offset =
-      x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+                         (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
   const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
   const uint16_t eob = eob_txb[block];
   const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
@@ -518,7 +283,6 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
   aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
   if (eob == 0) return;
 
-  const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_TYPE tx_type =
       av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
                       cm->features.reduced_tx_set_used);
@@ -528,7 +292,7 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
   }
 
   int eob_extra;
-  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
@@ -582,7 +346,7 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
   const tran_low_t *tcoeff_txb =
-      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
   const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
   av1_txb_init_levels(tcoeff, width, height, levels);
   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
@@ -641,14 +405,8 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
   }
 }
 
-typedef struct encode_txb_args {
-  const AV1_COMMON *cm;
-  MACROBLOCK *x;
-  aom_writer *w;
-} ENCODE_TXB_ARGS;
-
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, BLOCK_SIZE bsize) {
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
   int block[MAX_MB_PLANE] = { 0 };
@@ -690,1297 +448,8 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
   }
 }
 
-// TODO(angiebird): use this function whenever it's possible
-static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
-                            int plane, TX_SIZE tx_size, TX_TYPE tx_type,
-                            int reduced_tx_set_used) {
-  if (plane > 0) return 0;
-
-  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
-
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const int is_inter = is_inter_block(mbmi);
-  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
-      !xd->lossless[xd->mi[0]->segment_id]) {
-    const int ext_tx_set =
-        get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
-    if (is_inter) {
-      if (ext_tx_set > 0)
-        return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
-    } else {
-      if (ext_tx_set > 0) {
-        PREDICTION_MODE intra_dir;
-        if (mbmi->filter_intra_mode_info.use_filter_intra)
-          intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
-                                             .filter_intra_mode];
-        else
-          intra_dir = mbmi->mode;
-        return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
-                                     [tx_type];
-      }
-    }
-  }
-  return 0;
-}
-
-static INLINE void update_coeff_eob_fast(int *eob, int shift,
-                                         const int16_t *dequant_ptr,
-                                         const int16_t *scan,
-                                         const tran_low_t *coeff_ptr,
-                                         tran_low_t *qcoeff_ptr,
-                                         tran_low_t *dqcoeff_ptr) {
-  // TODO(sarahparker) make this work for aomqm
-  int eob_out = *eob;
-  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
-                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
-
-  for (int i = *eob - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const int qcoeff = qcoeff_ptr[rc];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = AOMSIGN(coeff);
-    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
-      eob_out--;
-      qcoeff_ptr[rc] = 0;
-      dqcoeff_ptr[rc] = 0;
-    } else {
-      break;
-    }
-  }
-
-  *eob = eob_out;
-}
-
-static AOM_FORCE_INLINE int warehouse_efficients_txb(
-    const MACROBLOCK *x, const int plane, const int block,
-    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
-    const struct macroblock_plane *p, const int eob,
-    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
-    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
-    int reduced_tx_set_used) {
-  const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int16_t *const scan = scan_order->scan;
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-  const int eob_multi_size = txsize_log2_minus4[tx_size];
-  const LV_MAP_EOB_COST *const eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
-  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
-
-  av1_txb_init_levels(qcoeff, width, height, levels);
-
-  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
-
-  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
-
-  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
-
-  const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
-      coeff_costs->lps_cost;
-  int c = eob - 1;
-  {
-    const int pos = scan[c];
-    const tran_low_t v = qcoeff[pos];
-    const int sign = AOMSIGN(v);
-    const int level = (v ^ sign) - sign;
-    const int coeff_ctx = coeff_contexts[pos];
-    cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
-
-    if (v) {
-      // sign bit cost
-      if (level > NUM_BASE_LEVELS) {
-        const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
-        cost += get_br_cost(level, lps_cost[ctx]);
-      }
-      if (c) {
-        cost += av1_cost_literal(1);
-      } else {
-        const int sign01 = (sign ^ sign) - sign;
-        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-        cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
-        return cost;
-      }
-    }
-  }
-  const int(*base_cost)[8] = coeff_costs->base_cost;
-  for (c = eob - 2; c >= 1; --c) {
-    const int pos = scan[c];
-    const int coeff_ctx = coeff_contexts[pos];
-    const tran_low_t v = qcoeff[pos];
-    const int level = abs(v);
-    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
-    if (v) {
-      // sign bit cost
-      cost += av1_cost_literal(1);
-      if (level > NUM_BASE_LEVELS) {
-        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        cost += get_br_cost(level, lps_cost[ctx]);
-      }
-    }
-  }
-  // c == 0 after previous loop
-  {
-    const int pos = scan[c];
-    const tran_low_t v = qcoeff[pos];
-    const int coeff_ctx = coeff_contexts[pos];
-    const int sign = AOMSIGN(v);
-    const int level = (v ^ sign) - sign;
-    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
-
-    if (v) {
-      // sign bit cost
-      const int sign01 = (sign ^ sign) - sign;
-      const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-      cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
-      if (level > NUM_BASE_LEVELS) {
-        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        cost += get_br_cost(level, lps_cost[ctx]);
-      }
-    }
-  }
-  return cost;
-}
-
-static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
-    const MACROBLOCK *x, const int plane, const int block,
-    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
-    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
-    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
-    int reduced_tx_set_used) {
-  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
-
-  const int eob_multi_size = txsize_log2_minus4[tx_size];
-  const LV_MAP_EOB_COST *const eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
-  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
-
-  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
-
-  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
-
-  cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
-  return cost;
-}
-
-// Look up table of individual cost of coefficient by its quantization level.
-// determined based on Laplacian distribution conditioned on estimated context
-static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
-                                 1209,  1393, 1577, 1763, 1947,
-                                 2132,  2317, 2501, 2686, 2871 };
-static const int const_term = (1 << AV1_PROB_COST_SHIFT);
-static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
-int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
-                                 const int block, const TX_SIZE tx_size,
-                                 const TX_TYPE tx_type) {
-  assert(plane == 0);
-
-  int cost = 0;
-  const struct macroblock_plane *p = &x->plane[plane];
-  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
-  const int16_t *scan = scan_order->scan;
-  tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-
-  int eob = p->eobs[block];
-
-  // coeffs
-  int c = eob - 1;
-  // eob
-  {
-    const int pos = scan[c];
-    const tran_low_t v = abs(qcoeff[pos]) - 1;
-    cost += (v << (AV1_PROB_COST_SHIFT + 2));
-  }
-  // other coeffs
-  for (c = eob - 2; c >= 0; c--) {
-    const int pos = scan[c];
-    const tran_low_t v = abs(qcoeff[pos]);
-    const int idx = AOMMIN(v, 14);
-
-    cost += costLUT[idx];
-  }
-
-  // const_term does not contain DC, and log(e) does not contain eob, so both
-  // (eob-1)
-  cost += (const_term + loge_par) * (eob - 1);
-
-  return cost;
-}
-
-int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
-                        const TX_SIZE tx_size, const TX_TYPE tx_type,
-                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
-  const struct macroblock_plane *p = &x->plane[plane];
-  const int eob = p->eobs[block];
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
-  if (eob == 0) {
-    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-  }
-
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-
-  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
-                                  plane_type, coeff_costs, xd, tx_type,
-                                  tx_class, reduced_tx_set_used);
-}
-
-int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
-                                  const int block, const TX_SIZE tx_size,
-                                  const TX_TYPE tx_type,
-                                  const TXB_CTX *const txb_ctx,
-                                  const int reduced_tx_set_used,
-                                  const int adjust_eob) {
-  const struct macroblock_plane *p = &x->plane[plane];
-  int eob = p->eobs[block];
-
-  if (adjust_eob) {
-    const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
-    const int16_t *scan = scan_order->scan;
-    tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
-    tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
-    update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
-                          tcoeff, qcoeff, dqcoeff);
-    p->eobs[block] = eob;
-  }
-
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
-  if (eob == 0) {
-    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-  }
-
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-
-  return warehouse_efficients_txb_laplacian(
-      x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
-      tx_type, tx_class, reduced_tx_set_used);
-}
-
-static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                        const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) {
-  int update = 0;
-  if (txb_info->eob == 0) return update;
-  const int16_t *const scan = txb_info->scan_order->scan;
-  // forward optimize the nz_map`
-  const int init_eob = txb_info->eob;
-  const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
-  const int eob_cost =
-      get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);
-
-  // backward optimize the level-k map
-  int accu_rate = eob_cost;
-  int64_t accu_dist = 0;
-  int64_t prev_eob_rd_cost = INT64_MAX;
-  int64_t cur_eob_rd_cost = 0;
-
-  {
-    const int si = init_eob - 1;
-    const int coeff_idx = scan[si];
-    LevelDownStats stats;
-    get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
-                        tx_class);
-    if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
-      update = 1;
-      update_coeff(coeff_idx, stats.low_qc, txb_info);
-      accu_rate += stats.rate_low;
-      accu_dist += stats.dist_low;
-    } else {
-      accu_rate += stats.rate;
-      accu_dist += stats.dist;
-    }
-  }
-
-  int si = init_eob - 2;
-  int8_t has_nz_tail = 0;
-  // eob is not fixed
-  for (; si >= 0 && has_nz_tail < 2; --si) {
-    assert(si != init_eob - 1);
-    const int coeff_idx = scan[si];
-    tran_low_t qc = txb_info->qcoeff[coeff_idx];
-
-    if (qc == 0) {
-      const int coeff_ctx =
-          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
-                               txb_info->tx_size, tx_class);
-      accu_rate += txb_costs->base_cost[coeff_ctx][0];
-    } else {
-      LevelDownStats stats;
-      get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
-      // check if it is better to make this the last significant coefficient
-      int cur_eob_rate =
-          get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
-      cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
-      prev_eob_rd_cost =
-          RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
-      if (cur_eob_rd_cost <= prev_eob_rd_cost) {
-        update = 1;
-        for (int j = si + 1; j < txb_info->eob; j++) {
-          const int coeff_pos_j = scan[j];
-          update_coeff(coeff_pos_j, 0, txb_info);
-        }
-        txb_info->eob = si + 1;
-
-        // rerun cost calculation due to change of eob
-        accu_rate = cur_eob_rate;
-        accu_dist = 0;
-        get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
-        if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
-          update = 1;
-          update_coeff(coeff_idx, stats.low_qc, txb_info);
-          accu_rate += stats.rate_low;
-          accu_dist += stats.dist_low;
-        } else {
-          accu_rate += stats.rate;
-          accu_dist += stats.dist;
-        }
-
-        // reset non zero tail when new eob is found
-        has_nz_tail = 0;
-      } else {
-        int bUpdCoeff = 0;
-        if (stats.rd_low < stats.rd) {
-          if ((si < txb_info->eob - 1)) {
-            bUpdCoeff = 1;
-            update = 1;
-          }
-        } else {
-          ++has_nz_tail;
-        }
-
-        if (bUpdCoeff) {
-          update_coeff(coeff_idx, stats.low_qc, txb_info);
-          accu_rate += stats.rate_low;
-          accu_dist += stats.dist_low;
-        } else {
-          accu_rate += stats.rate;
-          accu_dist += stats.dist;
-        }
-      }
-    }
-  }  // for (si)
-
-  // eob is fixed
-  for (; si >= 0; --si) {
-    assert(si != init_eob - 1);
-    const int coeff_idx = scan[si];
-    tran_low_t qc = txb_info->qcoeff[coeff_idx];
-
-    if (qc == 0) {
-      const int coeff_ctx =
-          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
-                               txb_info->tx_size, tx_class);
-      accu_rate += txb_costs->base_cost[coeff_ctx][0];
-    } else {
-      LevelDownStats stats;
-      get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);
-
-      int bUpdCoeff = 0;
-      if (stats.rd_low < stats.rd) {
-        if ((si < txb_info->eob - 1)) {
-          bUpdCoeff = 1;
-          update = 1;
-        }
-      }
-      if (bUpdCoeff) {
-        update_coeff(coeff_idx, stats.low_qc, txb_info);
-        accu_rate += stats.rate_low;
-        accu_dist += stats.dist_low;
-      } else {
-        accu_rate += stats.rate;
-        accu_dist += stats.dist;
-      }
-    }
-  }  // for (si)
-
-  int non_zero_blk_rate =
-      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
-  prev_eob_rd_cost =
-      RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);
-
-  int zero_blk_rate =
-      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
-  int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
-  if (zero_blk_rd_cost <= prev_eob_rd_cost) {
-    update = 1;
-    for (int j = 0; j < txb_info->eob; j++) {
-      const int coeff_pos_j = scan[j];
-      update_coeff(coeff_pos_j, 0, txb_info);
-    }
-    txb_info->eob = 0;
-  }
-
-  // record total rate cost
-  *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
-                   ? zero_blk_rate
-                   : accu_rate + non_zero_blk_rate;
-
-  if (txb_info->eob > 0) {
-    *rate_cost += txb_info->tx_type_cost;
-  }
-
-  return update;
-}
-
-#if CONFIG_HTB_TRELLIS
-static void hbt_init() {
-  hbt_hash_table =
-      aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
-  memset(hbt_hash_table, 0,
-         sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
-  av1_crc32c_calculator_init(&crc_calculator);  // 31 bit: qc & ctx
-
-  hbt_needs_init = 0;
-}
-
-void hbt_destroy() { aom_free(hbt_hash_table); }
-
-static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                         TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                         const LV_MAP_EOB_COST *txb_eob_costs,
-                         const struct macroblock_plane *p, int block,
-                         int fast_mode, int *rate_cost) {
-  (void)fast_mode;
-  const int16_t *scan = txb_info->scan_order->scan;
-  int prev_eob = txb_info->eob;
-  assert(HBT_EOB <= 16);  // Lengthen array if allowing longer eob.
-  int32_t prev_coeff[16];
-  for (int i = 0; i < prev_eob; i++) {
-    prev_coeff[i] = txb_info->qcoeff[scan[i]];
-  }
-  for (int i = prev_eob; i < HBT_EOB; i++) {
-    prev_coeff[i] = 0;  // For compiler piece of mind.
-  }
-
-  av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
-                      txb_info->levels);
-
-  const int update =
-      optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
-
-  // Overwrite old entry
-  uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
-  uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .rate_cost = *rate_cost;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .hbt_qc_hash = hbt_qc_hash;
-  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-      .hbt_ctx_hash = hbt_ctx_hash;
-  assert(prev_eob >= txb_info->eob);  // eob can't get longer
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Record how coeff changed. Convention: towards zero is negative.
-    if (txb_info->qcoeff[scan[i]] > 0)
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i];
-    else
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]];
-  }
-  for (int i = txb_info->eob; i < prev_eob; i++) {
-    // If eob got shorter, record that all after it changed to zero.
-    if (prev_coeff[i] > 0)
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = -prev_coeff[i];
-    else
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .deltas[i] = prev_coeff[i];
-  }
-  for (int i = prev_eob; i < HBT_EOB; i++) {
-    // Record 'no change' after optimized coefficients run out.
-    hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-        .deltas[i] = 0;
-  }
-
-  if (update) {
-    p->eobs[block] = txb_info->eob;
-    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-  }
-  return txb_info->eob;
-}
-
-static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
-                        TxbInfo *txb_info, const struct macroblock_plane *p,
-                        int block, int *rate_cost) {
-  const int16_t *scan = txb_info->scan_order->scan;
-  int new_eob = 0;
-  int update = 0;
-
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Delta convention is negatives go towards zero, so only apply those ones.
-    if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-            .deltas[i] < 0) {
-      if (txb_info->qcoeff[scan[i]] > 0)
-        txb_info->qcoeff[scan[i]] +=
-            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-                .deltas[i];
-      else
-        txb_info->qcoeff[scan[i]] -=
-            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-                .deltas[i];
-
-      update = 1;
-      update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
-    }
-    if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
-  }
-
-  // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but
-  // it is expensive and gives little benefit as long as qc_hash is high bit
-  *rate_cost =
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .rate_cost;
-
-  if (update) {
-    txb_info->eob = new_eob;
-    p->eobs[block] = txb_info->eob;
-    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-  }
-
-  return txb_info->eob;
-}
-
-static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                            TxbInfo *txb_info,
-                            const LV_MAP_COEFF_COST *txb_costs,
-                            const LV_MAP_EOB_COST *txb_eob_costs,
-                            const struct macroblock_plane *p, int block,
-                            int fast_mode, int *rate_cost) {
-  // Check for qcoeff match
-  int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
-  int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
-
-  if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-              .hbt_qc_hash == hbt_qc_hash &&
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-              .hbt_ctx_hash == hbt_ctx_hash &&
-      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
-          .init) {
-    return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
-                        rate_cost);
-  } else {
-    return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
-                         txb_eob_costs, p, block, fast_mode, rate_cost);
-  }
-}
-
-static int hbt_create_hashes(TxbInfo *txb_info,
-                             const LV_MAP_COEFF_COST *txb_costs,
-                             const LV_MAP_EOB_COST *txb_eob_costs,
-                             const struct macroblock_plane *p, int block,
-                             int fast_mode, int *rate_cost) {
-  // Initialize hash table if needed.
-  if (hbt_needs_init) {
-    hbt_init();
-  }
-
-  //// Hash creation
-  uint8_t txb_hash_data[256];  // Asserts below to ensure enough space.
-  const int16_t *scan = txb_info->scan_order->scan;
-  uint8_t chunk = 0;
-  int hash_data_index = 0;
-
-  // Make qc_hash.
-  int packing_index = 0;  // needed for packing.
-  for (int i = 0; i < txb_info->eob; i++) {
-    tran_low_t prechunk = txb_info->qcoeff[scan[i]];
-
-    // Softening: Improves speed. Aligns with signed deltas.
-    if (prechunk < 0) prechunk *= -1;
-
-    // Early kick out: Don't apply feature if there are large coeffs:
-    // If this kickout value is removed or raised beyond int8_t,
-    // widen deltas type in OptTxbQcoeff struct.
-    assert((int8_t)HBT_KICKOUT == HBT_KICKOUT);  // If not, widen types.
-    if (prechunk > HBT_KICKOUT) {
-      av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
-                          txb_info->levels);
-
-      const int update =
-          optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
-
-      if (update) {
-        p->eobs[block] = txb_info->eob;
-        p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
-            txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
-      }
-      return txb_info->eob;
-    }
-
-    // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
-    if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
-    chunk = prechunk << packing_index;
-    packing_index += 2;
-    txb_hash_data[hash_data_index] |= chunk;
-
-    // Full byte:
-    if (packing_index == 8) {
-      packing_index = 0;
-      hash_data_index++;
-    }
-  }
-  // Needed when packing_index != 0, to include final byte.
-  hash_data_index++;
-  assert(hash_data_index <= 64);
-  // 31 bit qc_hash: index to array
-  uint32_t hbt_qc_hash =
-      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
-
-  // Make ctx_hash.
-  hash_data_index = 0;
-  tran_low_t prechunk;
-
-  for (int i = 0; i < txb_info->eob; i++) {
-    // Save as magnitudes towards or away from zero.
-    if (txb_info->tcoeff[scan[i]] >= 0)
-      prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
-    else
-      prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
-
-    chunk = prechunk & 0xff;
-    txb_hash_data[hash_data_index++] = chunk;
-  }
-
-  // Extra ctx data:
-  // Include dequants.
-  txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
-  txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
-  chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // eob
-  chunk = txb_info->eob & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // rdmult (int64)
-  chunk = txb_info->rdmult & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // tx_type
-  chunk = txb_info->tx_type & 0xff;
-  txb_hash_data[hash_data_index++] = chunk;
-  // base_eob_cost
-  for (int i = 1; i < 3; i++) {  // i = 0 are softened away
-    for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
-      chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-  // eob_cost
-  for (int i = 0; i < 11; i++) {
-    for (int j = 0; j < 2; j++) {
-      chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-  // dc_sign_cost
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
-      chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
-      txb_hash_data[hash_data_index++] = chunk;
-    }
-  }
-
-  assert(hash_data_index <= 256);
-  // 31 bit ctx_hash: used to index table
-  uint32_t hbt_ctx_hash =
-      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
-  //// End hash creation
-
-  return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
-                          txb_eob_costs, p, block, fast_mode, rate_cost);
-}
-#endif  // CONFIG_HTB_TRELLIS
-
-static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
-    int ci, tran_low_t abs_qc, int coeff_ctx,
-    const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
-    const uint8_t *levels, int *cost_low) {
-  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
-  // and not the last (scan_idx != eob - 1)
-  assert(ci > 0);
-  int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
-  int diff = 0;
-  if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
-  if (abs_qc) {
-    cost += av1_cost_literal(1);
-    if (abs_qc > NUM_BASE_LEVELS) {
-      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      int brcost_diff = 0;
-      cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
-                                    &brcost_diff);
-      diff += brcost_diff;
-    }
-  }
-  *cost_low = cost - diff;
-
-  return cost;
-}
-
-static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
-                                     int coeff_ctx, int dc_sign_ctx,
-                                     const LV_MAP_COEFF_COST *txb_costs,
-                                     int bwl, TX_CLASS tx_class) {
-  int cost = 0;
-  cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
-  if (abs_qc != 0) {
-    if (ci == 0) {
-      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
-    } else {
-      cost += av1_cost_literal(1);
-    }
-    if (abs_qc > NUM_BASE_LEVELS) {
-      int br_ctx;
-      br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
-    }
-  }
-  return cost;
-}
-
-static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
-                                         int sign, int coeff_ctx,
-                                         int dc_sign_ctx,
-                                         const LV_MAP_COEFF_COST *txb_costs,
-                                         int bwl, TX_CLASS tx_class,
-                                         const uint8_t *levels) {
-  int cost = 0;
-  if (is_last) {
-    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
-  } else {
-    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
-  }
-  if (abs_qc != 0) {
-    if (ci == 0) {
-      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
-    } else {
-      cost += av1_cost_literal(1);
-    }
-    if (abs_qc > NUM_BASE_LEVELS) {
-      int br_ctx;
-      if (is_last)
-        br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
-      else
-        br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
-    }
-  }
-  return cost;
-}
-
-static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
-                                  int shift, tran_low_t *qc_low,
-                                  tran_low_t *dqc_low) {
-  tran_low_t abs_qc_low = abs_qc - 1;
-  *qc_low = (-sign ^ abs_qc_low) + sign;
-  assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
-  tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
-  *dqc_low = (-sign ^ abs_dqc_low) + sign;
-  assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
-}
-
-static INLINE void update_coeff_general(
-    int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
-    TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
-    int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
-    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
-    const qm_val_t *iqmatrix) {
-  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
-  const int ci = scan[si];
-  const tran_low_t qc = qcoeff[ci];
-  const int is_last = si == (eob - 1);
-  const int coeff_ctx = get_lower_levels_ctx_general(
-      is_last, si, bwl, height, levels, ci, tx_size, tx_class);
-  if (qc == 0) {
-    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
-  } else {
-    const int sign = (qc < 0) ? 1 : 0;
-    const tran_low_t abs_qc = abs(qc);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
-    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
-    const int rate =
-        get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
-                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-    const int64_t rd = RDCOST(rdmult, rate, dist);
-
-    tran_low_t qc_low, dqc_low;
-    tran_low_t abs_qc_low;
-    int64_t dist_low, rd_low;
-    int rate_low;
-    if (abs_qc == 1) {
-      abs_qc_low = qc_low = dqc_low = 0;
-      dist_low = dist0;
-      rate_low = txb_costs->base_cost[coeff_ctx][0];
-    } else {
-      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-      abs_qc_low = abs_qc - 1;
-      dist_low = get_coeff_dist(tqc, dqc_low, shift);
-      rate_low =
-          get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
-                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-    }
-
-    rd_low = RDCOST(rdmult, rate_low, dist_low);
-    if (rd_low < rd) {
-      qcoeff[ci] = qc_low;
-      dqcoeff[ci] = dqc_low;
-      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
-      *accu_rate += rate_low;
-      *accu_dist += dist_low - dist0;
-    } else {
-      *accu_rate += rate;
-      *accu_dist += dist - dist0;
-    }
-  }
-}
-
-static AOM_FORCE_INLINE void update_coeff_simple(
-    int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
-    int bwl, int64_t rdmult, int shift, const int16_t *dequant,
-    const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
-    const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
-    uint8_t *levels, const qm_val_t *iqmatrix) {
-  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
-  (void)eob;
-  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
-  // and not the last (scan_idx != eob - 1)
-  assert(si != eob - 1);
-  assert(si > 0);
-  const int ci = scan[si];
-  const tran_low_t qc = qcoeff[ci];
-  const int coeff_ctx =
-      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
-  if (qc == 0) {
-    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
-  } else {
-    const tran_low_t abs_qc = abs(qc);
-    const tran_low_t abs_tqc = abs(tcoeff[ci]);
-    const tran_low_t abs_dqc = abs(dqcoeff[ci]);
-    int rate_low = 0;
-    const int rate = get_two_coeff_cost_simple(
-        ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
-    if (abs_dqc < abs_tqc) {
-      *accu_rate += rate;
-      return;
-    }
-
-    const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
-    const int64_t rd = RDCOST(rdmult, rate, dist);
-
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
-    const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
-    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
-
-    if (rd_low < rd) {
-      const int sign = (qc < 0) ? 1 : 0;
-      qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
-      dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
-      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
-      *accu_rate += rate_low;
-    } else {
-      *accu_rate += rate;
-    }
-  }
-}
-
-static AOM_FORCE_INLINE void update_coeff_eob(
-    int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
-    int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
-    int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
-    const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
-    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
-    const qm_val_t *iqmatrix) {
-  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
-  assert(si != *eob - 1);
-  const int ci = scan[si];
-  const tran_low_t qc = qcoeff[ci];
-  const int coeff_ctx =
-      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
-  if (qc == 0) {
-    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
-  } else {
-    int lower_level = 0;
-    const tran_low_t abs_qc = abs(qc);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int sign = (qc < 0) ? 1 : 0;
-    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
-    int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0;
-    int rate =
-        get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
-                               txb_costs, bwl, tx_class, levels);
-    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
-
-    tran_low_t qc_low, dqc_low;
-    tran_low_t abs_qc_low;
-    int64_t dist_low, rd_low;
-    int rate_low;
-    if (abs_qc == 1) {
-      abs_qc_low = 0;
-      dqc_low = qc_low = 0;
-      dist_low = 0;
-      rate_low = txb_costs->base_cost[coeff_ctx][0];
-      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
-    } else {
-      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-      abs_qc_low = abs_qc - 1;
-      dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
-      rate_low =
-          get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
-                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
-    }
-
-    int lower_level_new_eob = 0;
-    const int new_eob = si + 1;
-    const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
-    const int new_eob_cost =
-        get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
-    int rate_coeff_eob =
-        new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
-                                          dc_sign_ctx, txb_costs, bwl,
-                                          tx_class);
-    int64_t dist_new_eob = dist;
-    int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
-
-    if (abs_qc_low > 0) {
-      const int rate_coeff_eob_low =
-          new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
-                                            coeff_ctx_new_eob, dc_sign_ctx,
-                                            txb_costs, bwl, tx_class);
-      const int64_t dist_new_eob_low = dist_low;
-      const int64_t rd_new_eob_low =
-          RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
-      if (rd_new_eob_low < rd_new_eob) {
-        lower_level_new_eob = 1;
-        rd_new_eob = rd_new_eob_low;
-        rate_coeff_eob = rate_coeff_eob_low;
-        dist_new_eob = dist_new_eob_low;
-      }
-    }
-
-    if (rd_low < rd) {
-      lower_level = 1;
-      rd = rd_low;
-      rate = rate_low;
-      dist = dist_low;
-    }
-
-    if (sharpness == 0 && rd_new_eob < rd) {
-      for (int ni = 0; ni < *nz_num; ++ni) {
-        int last_ci = nz_ci[ni];
-        levels[get_padded_idx(last_ci, bwl)] = 0;
-        qcoeff[last_ci] = 0;
-        dqcoeff[last_ci] = 0;
-      }
-      *eob = new_eob;
-      *nz_num = 0;
-      *accu_rate = rate_coeff_eob;
-      *accu_dist = dist_new_eob;
-      lower_level = lower_level_new_eob;
-    } else {
-      *accu_rate += rate;
-      *accu_dist += dist;
-    }
-
-    if (lower_level) {
-      qcoeff[ci] = qc_low;
-      dqcoeff[ci] = dqc_low;
-      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
-    }
-    if (qcoeff[ci]) {
-      nz_ci[*nz_num] = ci;
-      ++*nz_num;
-    }
-  }
-}
-
-static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
-                               int nz_num, int *nz_ci, int64_t rdmult,
-                               int skip_cost, int non_skip_cost,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               int sharpness) {
-  const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
-  const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
-  if (sharpness == 0 && rd_new_eob < rd) {
-    for (int i = 0; i < nz_num; ++i) {
-      const int ci = nz_ci[i];
-      qcoeff[ci] = 0;
-      dqcoeff[ci] = 0;
-      // no need to set up levels because this is the last step
-      // levels[get_padded_idx(ci, bwl)] = 0;
-    }
-    *accu_rate = 0;
-    *eob = 0;
-  }
-}
-
-int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                         const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness, int fast_mode) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const struct macroblock_plane *p = &x->plane[plane];
-  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
-  const int16_t *scan = scan_order->scan;
-  const int shift = av1_get_tx_scale(tx_size);
-  int eob = p->eobs[block];
-  const int16_t *dequant = p->dequant_QTX;
-  const qm_val_t *iqmatrix =
-      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
-  const int block_offset = BLOCK_OFFSET(block);
-  tran_low_t *qcoeff = p->qcoeff + block_offset;
-  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
-  const tran_low_t *tcoeff = p->coeff + block_offset;
-
-  // This function is not called if eob = 0.
-  assert(eob > 0);
-
-  if (fast_mode) {
-    update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
-    p->eobs[block] = eob;
-    if (eob == 0) {
-      *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
-      return eob;
-    }
-  }
-
-  const AV1_COMMON *cm = &cpi->common;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  assert(width == (1 << bwl));
-  const int is_inter = is_inter_block(mbmi);
-  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
-  const int eob_multi_size = txsize_log2_minus4[tx_size];
-  const LV_MAP_EOB_COST *txb_eob_costs =
-      &x->eob_costs[eob_multi_size][plane_type];
-
-  const int rshift =
-      (sharpness +
-       (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
-            ? 7 - mbmi->segment_id
-            : 2) +
-       (cpi->oxcf.aq_mode != VARIANCE_AQ &&
-                cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL &&
-                cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
-            ? (3 - x->sb_energy_level)
-            : 0));
-  const int64_t rdmult =
-      (((int64_t)x->rdmult *
-        (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
-       2) >>
-      rshift;
-
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-
-  if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
-
-  // TODO(angirbird): check iqmatrix
-
-  const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
-  const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-  const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
-  int accu_rate = eob_cost;
-  int64_t accu_dist = 0;
-  int si = eob - 1;
-  const int ci = scan[si];
-  const tran_low_t qc = qcoeff[ci];
-  const tran_low_t abs_qc = abs(qc);
-  const int sign = qc < 0;
-  const int max_nz_num = 2;
-  int nz_num = 1;
-  int nz_ci[3] = { ci, 0, 0 };
-  if (abs_qc >= 2) {
-    update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
-                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
-                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels, iqmatrix);
-    --si;
-  } else {
-    assert(abs_qc == 1);
-    const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
-    accu_rate +=
-        get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
-                           txb_costs, bwl, tx_class);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
-    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
-    accu_dist += dist - dist0;
-    --si;
-  }
-
-#define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
-  case tx_class_literal:                                                   \
-    for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) {          \
-      update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
-                       tx_size, tx_class_literal, bwl, height,             \
-                       txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
-                       txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
-                       levels, sharpness, iqmatrix);                       \
-    }                                                                      \
-    break;
-  switch (tx_class) {
-    UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
-    UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
-    UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
-#undef UPDATE_COEFF_EOB_CASE
-    default: assert(false);
-  }
-
-  if (si == -1 && nz_num <= max_nz_num) {
-    update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
-                non_skip_cost, qcoeff, dqcoeff, sharpness);
-  }
-
-#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
-  case tx_class_literal:                                                       \
-    for (; si >= 1; --si) {                                                    \
-      update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
-                          rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
-                          qcoeff, dqcoeff, levels, iqmatrix);                  \
-    }                                                                          \
-    break;
-  switch (tx_class) {
-    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
-    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
-    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
-#undef UPDATE_COEFF_SIMPLE_CASE
-    default: assert(false);
-  }
-
-  // DC position
-  if (si == 0) {
-    // no need to update accu_dist because it's not used after this point
-    int64_t dummy_dist = 0;
-    update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
-                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
-                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels, iqmatrix);
-  }
-
-  const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
-                                            cm->features.reduced_tx_set_used);
-  if (eob == 0)
-    accu_rate += skip_cost;
-  else
-    accu_rate += non_skip_cost + tx_type_cost;
-
-  p->eobs[block] = eob;
-  p->txb_entropy_ctx[block] =
-      av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
-
-  *rate_cost = accu_rate;
-  return eob;
-}
-
-// This function is deprecated, but we keep it here because hash trellis
-// is not integrated with av1_optimize_txb_new yet
-int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
-                                          tx_size, reduced_tx_set_used);
-  const MB_MODE_INFO *mbmi = xd->mi[0];
-  const struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const int eob = p->eobs[block];
-  const int block_offset = BLOCK_OFFSET(block);
-  tran_low_t *qcoeff = p->qcoeff + block_offset;
-  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
-  const tran_low_t *tcoeff = p->coeff + block_offset;
-  const int16_t *dequant = p->dequant_QTX;
-  const int seg_eob = av1_get_max_eob(tx_size);
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  const int is_inter = is_inter_block(mbmi);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
-  const int eob_multi_size = txsize_log2_minus4[tx_size];
-  const LV_MAP_EOB_COST txb_eob_costs =
-      x->eob_costs[eob_multi_size][plane_type];
-
-  const int shift = av1_get_tx_scale(tx_size);
-  const int64_t rdmult =
-      (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
-        << (2 * (xd->bd - 8))) +
-       2) >>
-      2;
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  const qm_val_t *iqmatrix =
-      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
-  assert(width == (1 << bwl));
-  const int tx_type_cost =
-      get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
-  TxbInfo txb_info = {
-    qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
-    txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
-    scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
-  };
-
-#if CONFIG_HTB_TRELLIS
-  // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
-  // by storing the coefficient deltas in a hash table.
-  // Currently disabled in speedfeatures.c
-  if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
-    return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
-                             fast_mode, rate_cost);
-  }
-#else
-  (void)fast_mode;
-#endif  // CONFIG_HTB_TRELLIS
-  av1_txb_init_levels(qcoeff, width, height, levels);
-
-  const int update =
-      optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost);
-
-  if (update) {
-    p->eobs[block] = txb_info.eob;
-    p->txb_entropy_ctx[block] =
-        av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
-  }
-  return txb_info.eob;
-}
-
-int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
-                                const SCAN_ORDER *scan_order, int eob) {
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob) {
   const int16_t *const scan = scan_order->scan;
   int cul_level = 0;
   int c;
@@ -1994,7 +463,7 @@ int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
   cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
   set_dc_sign(&cul_level, qcoeff[0]);
 
-  return cul_level;
+  return (uint8_t)cul_level;
 }
 
 static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
@@ -2015,22 +484,29 @@ static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
   const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
                                           tx_size, reduced_tx_set_used);
   if (is_inter) {
-    if (cpi->oxcf.use_inter_dct_only) {
+    if (cpi->oxcf.txfm_cfg.use_inter_dct_only) {
       assert(tx_type == DCT_DCT);
     }
   } else {
-    if (cpi->oxcf.use_intra_dct_only) {
+    if (cpi->oxcf.txfm_cfg.use_intra_dct_only) {
       assert(tx_type == DCT_DCT);
-    } else if (cpi->oxcf.use_intra_default_tx_only) {
+    } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) {
       const TX_TYPE default_type = get_default_tx_type(
-          PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type);
+          PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
       (void)default_type;
-      assert(tx_type == default_type);
+      // TODO(kyslov): We don't always respect use_intra_default_tx_only flag in
+      // NonRD and REALTIME case. Specifically we ignore it in hybrid inta mode
+      // search, when picking up intra mode in nonRD inter mode search and in RD
+      // REALTIME mode when we limit TX type usage.
+      // We need to fix txfm cfg for these cases. Meanwhile relieving the
+      // assert.
+      assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+             cpi->oxcf.mode == REALTIME);
     }
   }
 
   if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
-      cm->quant_params.base_qindex > 0 && !mbmi->skip &&
+      cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (eset > 0) {
@@ -2111,8 +587,8 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
     }
 
     CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
-    const int txb_offset =
-        x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+                           (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
     uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
     uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
     entropy_ctx[block] = txb_ctx.txb_skip_ctx;
@@ -2126,7 +602,7 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
     const int segment_id = mbmi->segment_id;
     const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
     tran_low_t *tcoeff_txb =
-        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
     tcoeff = tcoeff_txb + block_offset;
     memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
 
@@ -2159,6 +635,10 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
       const int coeff_ctx = coeff_contexts[pos];
       const tran_low_t v = qcoeff[pos];
       const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order of
+       * pack bitstream multi-threading. This data is not needed if
+       * multi-threading is disabled. */
+      if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
 
       if (allow_update_cdf) {
         if (c == eob - 1) {
@@ -2220,24 +700,165 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
   } else {
     tcoeff = qcoeff;
   }
-  const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+  const uint8_t cul_level =
+      av1_get_txb_entropy_context(tcoeff, scan_order, eob);
   av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
                            blk_col, blk_row);
 }
 
-void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                            uint8_t allow_update_cdf) {
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                            void *arg) {
+  struct tokenize_b_args *const args = arg;
+  const AV1_COMP *cpi = args->cpi;
+  const AV1_COMMON *cm = &cpi->common;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const int eob = p->eobs[block];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *tcoeff;
+  assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+  if (args->dry_run == OUTPUT_ENABLED) {
+    MB_MODE_INFO *mbmi = xd->mi[0];
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, plane,
+                pd->above_entropy_context + blk_col,
+                pd->left_entropy_context + blk_row, &txb_ctx);
+#if CONFIG_ENTROPY_STATS
+    const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+    const int bwl = get_txb_bwl(tx_size);
+    const int width = get_txb_wide(tx_size);
+    const int height = get_txb_high(tx_size);
+    int cdf_idx = cm->coef_cdf_category;
+    ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif  // CONFIG_ENTROPY_STATS
+
+    CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+    const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+                           (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+    uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+    entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+    eob_txb[block] = eob;
+
+    if (eob == 0) {
+      av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+                               blk_row);
+      return;
+    }
+    const int segment_id = mbmi->segment_id;
+    const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+    tran_low_t *tcoeff_txb =
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+    tcoeff = tcoeff_txb + block_offset;
+    memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+#if CONFIG_ENTROPY_STATS
+    uint8_t levels_buf[TX_PAD_2D];
+    uint8_t *const levels = set_levels(levels_buf, width);
+    av1_txb_init_levels(tcoeff, width, height, levels);
+    update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+                         td->counts, 0 /*allow_update_cdf*/);
+
+    const TX_CLASS tx_class = tx_type_to_class[tx_type];
+    const bool do_coeff_scan = true;
+#else
+    const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled;
+#endif
+    const int16_t *const scan = scan_order->scan;
+
+    // record tx type usage
+    td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+    av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+                           td->counts, 0 /*allow_update_cdf*/);
+
+    DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+    av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+                            coeff_contexts);
+#endif
+
+    for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) {
+      const int pos = scan[c];
+      const tran_low_t v = qcoeff[pos];
+      const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order of
+       * pack bitstream multi-threading. This data is not needed if
+       * multi-threading is disabled. */
+      if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+#if CONFIG_ENTROPY_STATS
+      const int coeff_ctx = coeff_contexts[pos];
+      if (c == eob - 1) {
+        assert(coeff_ctx < 4);
+        ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+                                          [coeff_ctx][AOMMIN(level, 3) - 1];
+      } else {
+        ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+                                      [coeff_ctx][AOMMIN(level, 3)];
+      }
+      if (level > NUM_BASE_LEVELS) {
+        const int base_range = level - 1 - NUM_BASE_LEVELS;
+        const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+          const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+          for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+            ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+                                   [lps][br_ctx][lps == k];
+            if (lps == k) break;
+          }
+          ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+                                       [plane_type][br_ctx][k];
+          if (k < BR_CDF_SIZE - 1) break;
+        }
+      }
+#endif
+    }
+    // Update the context needed to code the DC sign (if applicable)
+    if (tcoeff[0] != 0) {
+      const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+      const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+      ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif  // CONFIG_ENTROPY_STATS
+      entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+    }
+  } else {
+    tcoeff = qcoeff;
+  }
+  const uint8_t cul_level =
+      av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+                           blk_col, blk_row);
+}
+
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
-  if (mbmi->skip) {
+  if (mbmi->skip_txfm) {
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
+  const foreach_transformed_block_visitor visit =
+      allow_update_cdf ? av1_update_and_record_txb_context
+                       : av1_record_txb_context;
 
   for (int plane = 0; plane < num_planes; ++plane) {
     if (plane && !xd->is_chroma_ref) break;
@@ -2245,16 +866,16 @@ void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
     const int ss_x = pd->subsampling_x;
     const int ss_y = pd->subsampling_y;
     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
-    av1_foreach_transformed_block_in_plane(
-        xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg);
+    av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg);
   }
 }
 
 CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
                                          int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int mib_size_log2 = cm->seq_params.mib_size_log2;
-  const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  const int stride =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
   const int offset =
       (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
   return cpi->coeff_buffer_base + offset;
diff --git a/media/libaom/src/av1/encoder/encodetxb.h b/media/libaom/src/av1/encoder/encodetxb.h
index 7122895d14..67b94046b4 100644
--- a/media/libaom/src/av1/encoder/encodetxb.h
+++ b/media/libaom/src/av1/encoder/encodetxb.h
@@ -24,75 +24,250 @@
 extern "C" {
 #endif
 
+/*!\cond */
 #define TXB_SKIP_CTX_MASK 15
 #define DC_SIGN_CTX_SHIFT 4
 #define DC_SIGN_CTX_MASK 3
 
-typedef struct TxbInfo {
-  tran_low_t *qcoeff;
-  uint8_t *levels;  // absolute values and clamped to 255.
-  tran_low_t *dqcoeff;
-  const tran_low_t *tcoeff;
-  const int16_t *dequant;
-  int shift;
-  TX_SIZE tx_size;
-  TX_SIZE txs_ctx;
-  TX_TYPE tx_type;
-  int bwl;
-  int width;
-  int height;
-  int eob;
-  int seg_eob;
-  const SCAN_ORDER *scan_order;
-  TXB_CTX *txb_ctx;
-  int64_t rdmult;
-  const qm_val_t *iqmatrix;
-  int tx_type_cost;
-} TxbInfo;
+int av1_get_eob_pos_token(const int eob, int *const extra);
 
+/*!\endcond */
+/*!\brief Allocate the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * Each macro block will need a \ref CB_COEFF_BUFFER to store information for
+ * rate-distortion optimization and entropy coding of transform coefficients.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
 void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
 void av1_free_txb_buf(AV1_COMP *cpi);
-int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
-                        const TX_SIZE tx_size, const TX_TYPE tx_type,
-                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
-int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
-                                  const int block, const TX_SIZE tx_size,
-                                  const TX_TYPE tx_type,
-                                  const TXB_CTX *const txb_ctx,
-                                  const int reduced_tx_set_used,
-                                  const int adjust_eob);
-int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
-                                 const int block, const TX_SIZE tx_size,
-                                 const TX_TYPE tx_type);
+
+/*!\brief Write quantized coefficients in a transform block into bitstream using
+ * entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The given transform size
+ */
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           aom_writer *w, int blk_row, int blk_col, int plane,
                           int block, TX_SIZE tx_size);
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, BLOCK_SIZE bsize);
-int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
-                                const SCAN_ORDER *scan_order, int eob);
-void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                            uint8_t allow_update_cdf);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function use \ref av1_write_coeffs_txb() to code each transform block in
+ * raster order.
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    bsize          Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into an uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in]    qcoeff         Buffer of quantized coefficients
+ * \param[in]    scan_order     Coding order of coefficients in the transform
+ * block
+ * \param[in]    eob            The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macorblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in]    cpi               Top-level encoder structure
+ * \param[in]    td                Top-level multithreading structure
+ * \param[in]    dry_run           Whether this is a dry run.
+ * \param[in]    bsize             Block size of the current macroblock
+ * \param[in]    allow_update_cdf  Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * There are regular mode and dry run for this funtion.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in]    plane        The index of the current plane.
+ * \param[in]    block        The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block.
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    plane_bsize  Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in]    tx_size      The given transform size.
+ * \param[in]    arg          This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, void *arg);
-#if CONFIG_HTB_TRELLIS
-void hbt_destroy();
-#endif  // CONFIG_HTB_TRELLIS
-int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
-                         const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness, int fast_mode);
 
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * There are regular mode and dry run for this function.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in]    plane        The index of the current plane.
+ * \param[in]    block        The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block.
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in]    plane_bsize  Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in]    tx_size      The given transform size.
+ * \param[in]    arg          This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * The macroblock's location is described by mi_row and mi_col, row and column
+ * mi indexes in the coding frame.
+ *
+ * Each mi unit is a 4x4 pixel block.
+ *
+ * \param[in]    cpi               Top-level encoder structure.
+ * \param[in]    mi_row            Row mi index of the current transform block
+ * in the frame.
+ * \param[in]    mi_col           Column mi index of the current transform
+ * block in the frame.
+ * \return       CB_COEFF_BUFFER*  Pointer of \ref CB_COEFF_BUFFER associated
+ * to this macroblock.
+ */
 CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
                                          int mi_col);
 
+/*!\brief Returns the entropy cost associated with skipping the current
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in]    coeff_costs    Table of entropy cost for coefficient coding.
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    tx_size        The transform size
+ */
+static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs,
+                                    const TXB_CTX *const txb_ctx, int plane,
+                                    TX_SIZE tx_size) {
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs_ =
+      &coeff_costs->coeff_costs[txs_ctx][plane_type];
+  return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+/*!\cond */
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 17, 13 },
   { 16, 10 },
 };
+/*!\endcond */
 
 #ifdef __cplusplus
 }
diff --git a/media/libaom/src/av1/encoder/ethread.c b/media/libaom/src/av1/encoder/ethread.c
index 693270b873..27af5532ed 100644
--- a/media/libaom/src/av1/encoder/ethread.c
+++ b/media/libaom/src/av1/encoder/ethread.c
@@ -9,21 +9,27 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/encoder/av1_multi_thread.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/thread_common.h"
+
+#include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/ethread.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/firstpass.h"
+#endif
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/rdopt.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
 
 static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
-  for (int i = 0; i < REFERENCE_MODES; i++)
-    td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
-
-  for (int i = 0; i < REF_FRAMES; i++)
-    td->rd_counts.global_motion_used[i] +=
-        td_t->rd_counts.global_motion_used[i];
-
   td->rd_counts.compound_ref_used_flag |=
       td_t->rd_counts.compound_ref_used_flag;
   td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
@@ -42,12 +48,17 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   for (int i = 0; i < 2; i++) {
     td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i];
   }
+
+  td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0];
+  td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1];
+
+  td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks;
 }
 
 static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-  const int mib_size = cm->seq_params.mib_size;
+  const int mib_size = cm->seq_params->mib_size;
   const int frame_lf_count =
       av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
   for (int row = 0; row < cm->tiles.rows; row++) {
@@ -63,7 +74,8 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
           const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
           MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
           MB_MODE_INFO *mbmi = mi[0];
-          if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) {
+          if (mbmi->skip_txfm == 1 &&
+              (mbmi->bsize == cm->seq_params->sb_size)) {
             for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
               mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
             mbmi->delta_lf_from_base = xd->delta_lf_from_base;
@@ -81,16 +93,16 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
   }
 }
 
-void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                int r, int c) {
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                int c) {
   (void)row_mt_sync;
   (void)r;
   (void)c;
   return;
 }
 
-void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                 int r, int c, const int cols) {
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                 int c, int cols) {
   (void)row_mt_sync;
   (void)r;
   (void)c;
@@ -98,7 +110,7 @@ void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
   return;
 }
 
-void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) {
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
 #if CONFIG_MULTITHREAD
   const int nsync = row_mt_sync->sync_range;
 
@@ -106,7 +118,7 @@ void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) {
     pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
     pthread_mutex_lock(mutex);
 
-    while (c > row_mt_sync->cur_col[r - 1] - nsync) {
+    while (c > row_mt_sync->num_finished_cols[r - 1] - nsync) {
       pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
     }
     pthread_mutex_unlock(mutex);
@@ -118,8 +130,8 @@ void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) {
 #endif  // CONFIG_MULTITHREAD
 }
 
-void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
-                           const int cols) {
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+                           int cols) {
 #if CONFIG_MULTITHREAD
   const int nsync = row_mt_sync->sync_range;
   int cur;
@@ -136,7 +148,7 @@ void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
   if (sig) {
     pthread_mutex_lock(&row_mt_sync->mutex_[r]);
 
-    row_mt_sync->cur_col[r] = cur;
+    row_mt_sync->num_finished_cols[r] = cur;
 
     pthread_cond_signal(&row_mt_sync->cond_[r]);
     pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
@@ -150,40 +162,38 @@ void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
 }
 
 // Allocate memory for row synchronization
-void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm,
-                               int rows) {
-  row_mt_sync->rows = rows;
+static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
+                                  AV1_COMMON *cm, int rows) {
 #if CONFIG_MULTITHREAD
-  {
-    int i;
+  int i;
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
-                    aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
-    if (row_mt_sync->mutex_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
-      }
+  CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+                  aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+  if (row_mt_sync->mutex_) {
+    for (i = 0; i < rows; ++i) {
+      pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
     }
+  }
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
-                    aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
-    if (row_mt_sync->cond_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_cond_init(&row_mt_sync->cond_[i], NULL);
-      }
+  CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+                  aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+  if (row_mt_sync->cond_) {
+    for (i = 0; i < rows; ++i) {
+      pthread_cond_init(&row_mt_sync->cond_[i], NULL);
     }
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
-                  aom_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+  CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+                  aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
 
+  row_mt_sync->rows = rows;
   // Set up nsync.
   row_mt_sync->sync_range = 1;
 }
 
 // Deallocate row based multi-threading synchronization related mutex and data
-void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) {
+static void row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
   if (row_mt_sync != NULL) {
 #if CONFIG_MULTITHREAD
     int i;
@@ -201,7 +211,8 @@ void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) {
       aom_free(row_mt_sync->cond_);
     }
 #endif  // CONFIG_MULTITHREAD
-    aom_free(row_mt_sync->cur_col);
+    aom_free(row_mt_sync->num_finished_cols);
+
     // clear the structure as the source of this call may be dynamic change
     // in tiles in which case this call will be followed by an _alloc()
     // which may fail.
@@ -209,37 +220,90 @@ void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) {
   }
 }
 
-static AOM_INLINE void assign_tile_to_thread(
-    MultiThreadHandle *multi_thread_ctxt, int num_tiles, int num_workers) {
+static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
+                             int alloc_row_ctx) {
+  struct AV1Common *cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int tile_col, tile_row;
+
+  // Allocate memory for row based multi-threading
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+      row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);
+
+      this_tile->row_ctx = NULL;
+      if (alloc_row_ctx) {
+        assert(max_cols > 0);
+        const int num_row_ctx = AOMMAX(1, (max_cols - 1));
+        CHECK_MEM_ERROR(cm, this_tile->row_ctx,
+                        (FRAME_CONTEXT *)aom_memalign(
+                            16, num_row_ctx * sizeof(*this_tile->row_ctx)));
+      }
+    }
+  }
+  enc_row_mt->allocated_tile_cols = tile_cols;
+  enc_row_mt->allocated_tile_rows = tile_rows;
+  enc_row_mt->allocated_rows = max_rows;
+  enc_row_mt->allocated_cols = max_cols - 1;
+}
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = enc_row_mt->allocated_tile_cols;
+  const int tile_rows = enc_row_mt->allocated_tile_rows;
+  int tile_col, tile_row;
+
+  // Free row based multi-threading sync memory
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+      row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+
+      if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
+    }
+  }
+  enc_row_mt->allocated_rows = 0;
+  enc_row_mt->allocated_cols = 0;
+  enc_row_mt->allocated_tile_cols = 0;
+  enc_row_mt->allocated_tile_rows = 0;
+}
+
+static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
+                                             int num_tiles, int num_workers) {
   int tile_id = 0;
   int i;
 
   for (i = 0; i < num_workers; i++) {
-    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+    thread_id_to_tile_id[i] = tile_id++;
     if (tile_id == num_tiles) tile_id = 0;
   }
 }
 
-static int get_next_job(AV1_COMP *const cpi, int *current_mi_row,
-                        int cur_tile_id) {
-  AV1_COMMON *const cm = &cpi->common;
-  TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
-  AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
+static AOM_INLINE int get_next_job(TileDataEnc *const tile_data,
+                                   int *current_mi_row, int mib_size) {
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+  const int mi_row_end = tile_data->tile_info.mi_row_end;
 
-  if (row_mt_info->current_mi_row < this_tile->tile_info.mi_row_end) {
-    *current_mi_row = row_mt_info->current_mi_row;
-    row_mt_info->num_threads_working++;
-    row_mt_info->current_mi_row += cm->seq_params.mib_size;
+  if (row_mt_sync->next_mi_row < mi_row_end) {
+    *current_mi_row = row_mt_sync->next_mi_row;
+    row_mt_sync->num_threads_working++;
+    row_mt_sync->next_mi_row += mib_size;
     return 1;
   }
   return 0;
 }
 
-static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi,
-                                                    int *cur_tile_id,
-                                                    int *current_mi_row,
-                                                    int *end_of_frame) {
-  AV1_COMMON *const cm = &cpi->common;
+static AOM_INLINE void switch_tile_and_get_next_job(
+    AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
+    int *current_mi_row, int *end_of_frame, int is_firstpass,
+    const BLOCK_SIZE fp_block_size) {
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
 
@@ -250,18 +314,31 @@ static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi,
   for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_index = tile_row * tile_cols + tile_col;
-      TileDataEnc *this_tile = &cpi->tile_data[tile_index];
-      AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
-      int num_sb_rows_in_tile =
-          av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
-      int num_sb_cols_in_tile =
-          av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
+      TileDataEnc *const this_tile = &tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+#if CONFIG_REALTIME_ONLY
+      int num_b_rows_in_tile =
+          av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+      int num_b_cols_in_tile =
+          av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#else
+      int num_b_rows_in_tile =
+          is_firstpass
+              ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size)
+              : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+      int num_b_cols_in_tile =
+          is_firstpass
+              ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size)
+              : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#endif
       int theoretical_limit_on_threads =
-          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
-      int num_threads_working = row_mt_info->num_threads_working;
+          AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
+      int num_threads_working = row_mt_sync->num_threads_working;
+
       if (num_threads_working < theoretical_limit_on_threads) {
         int num_mis_to_encode =
-            this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
+            this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;
 
         // Tile to be processed by this thread is selected on the basis of
         // availability of jobs:
@@ -287,56 +364,131 @@ static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi,
   if (tile_id == -1) {
     *end_of_frame = 1;
   } else {
-    // Update the cur ID to the next tile ID that will be processed,
-    // which will be the least processed tile
+    // Update the current tile id to the tile id that will be processed next,
+    // which will be the least processed tile.
     *cur_tile_id = tile_id;
-    get_next_job(cpi, current_mi_row, *cur_tile_id);
+    const int unit_height = mi_size_high[fp_block_size];
+    get_next_job(&tile_data[tile_id], current_mi_row,
+                 is_firstpass ? unit_height : cm->seq_params->mib_size);
   }
 }
 
-static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+#if !CONFIG_REALTIME_ONLY
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
   AV1_COMMON *const cm = &cpi->common;
-
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
   int thread_id = thread_data->thread_id;
-  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
   (void)unused;
 
   assert(cur_tile_id != -1);
 
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  const int unit_height = mi_size_high[fp_block_size];
   int end_of_frame = 0;
   while (1) {
     int current_mi_row = -1;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(cpi->row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    if (!get_next_job(cpi, &current_mi_row, cur_tile_id)) {
+    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+                      unit_height)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
-      switch_tile_and_get_next_job(cpi, &cur_tile_id, &current_mi_row,
-                                   &end_of_frame);
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 1,
+                                   fp_block_size);
     }
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(cpi->row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
     if (end_of_frame == 1) break;
 
     TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
-    int tile_row = this_tile->tile_info.tile_row;
-    int tile_col = this_tile->tile_info.tile_col;
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    ThreadData *td = thread_data->td;
 
     assert(current_mi_row != -1 &&
-           current_mi_row <= this_tile->tile_info.mi_row_end);
+           current_mi_row < this_tile->tile_info.mi_row_end);
 
+    const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+    av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
+                       fp_block_size);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+  }
+
+  return 1;
+}
+#endif
+
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *const cpi = thread_data->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  int thread_id = thread_data->thread_id;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+  (void)unused;
+
+  assert(cur_tile_id != -1);
+
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  int end_of_frame = 0;
+
+  // When master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and it contains NULL pointer. This can result in NULL pointer
+  // access violation if accessed beyond the encode stage. Hence, updating
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame
+  // context to avoid NULL pointer access in subsequent stages.
+  thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
+  while (1) {
+    int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+                      cm->seq_params->mib_size)) {
+      // No jobs are available for the current tile. Query for the status of
+      // other tiles and get the next job if available
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 0,
+                                   fp_block_size);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    if (end_of_frame == 1) break;
+
+    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    const TileInfo *const tile_info = &this_tile->tile_info;
+    const int tile_row = tile_info->tile_row;
+    const int tile_col = tile_info->tile_col;
     ThreadData *td = thread_data->td;
 
+    assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
+    td->abs_sum_level = 0;
+
     if (this_tile->allow_update_cdf) {
       td->mb.row_ctx = this_tile->row_ctx;
-      if (current_mi_row == this_tile->tile_info.mi_row_start)
+      if (current_mi_row == tile_info->mi_row_start)
         memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
     } else {
       memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
@@ -345,16 +497,20 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
     av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                            &td->mb.e_mbd);
 
-    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
-    av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+    if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+      av1_crc32c_calculator_init(
+          &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+    }
 
     av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
 #if CONFIG_MULTITHREAD
-    pthread_mutex_lock(cpi->row_mt_mutex_);
+    pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
-    this_tile->row_mt_info.num_threads_working--;
+    this_tile->abs_sum_level += td->abs_sum_level;
+    row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
-    pthread_mutex_unlock(cpi->row_mt_mutex_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
 #endif
   }
 
@@ -372,7 +528,7 @@ static int enc_worker_hook(void *arg1, void *unused) {
   (void)unused;
 
   for (t = thread_data->start; t < tile_rows * tile_cols;
-       t += cpi->num_workers) {
+       t += cpi->mt_info.num_workers) {
     int tile_row = t / tile_cols;
     int tile_col = t % tile_cols;
 
@@ -386,138 +542,644 @@ static int enc_worker_hook(void *arg1, void *unused) {
   return 1;
 }
 
-static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *const cm = &cpi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  int sb_mi_size = av1_get_sb_mi_size(cm);
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
+  cpi->mt_info.workers = ppi->p_mt_info.workers;
+  cpi->mt_info.num_workers = ppi->p_mt_info.num_workers;
+  cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data;
+  int i;
+  for (i = MOD_FP; i < NUM_MT_MODULES; i++) {
+    cpi->mt_info.num_mod_workers[i] =
+        AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]);
+  }
+}
 
-  CHECK_MEM_ERROR(cm, cpi->workers,
-                  aom_malloc(num_workers * sizeof(*cpi->workers)));
+void av1_init_cdef_worker(AV1_COMP *cpi) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // The allocation is done only for level 0 parallel frames. No change
+  // in config is supported in the middle of a parallel encode set, since the
+  // rest of the MT modules also do not support dynamic change of config.
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+  int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);
+
+  av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
+                         &cpi->mt_info.cdef_sync, num_cdef_workers, 1);
+  cpi->mt_info.cdef_worker = p_mt_info->cdef_worker;
+}
 
-  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                  aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
+  if (lr_sync->sync_range) {
+    int num_lr_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+      return;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
+    lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
+  }
+}
+#endif
 
 #if CONFIG_MULTITHREAD
-  if (cpi->oxcf.row_mt == 1) {
-    if (cpi->row_mt_mutex_ == NULL) {
-      CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_,
-                      aom_malloc(sizeof(*(cpi->row_mt_mutex_))));
-      if (cpi->row_mt_mutex_) pthread_mutex_init(cpi->row_mt_mutex_, NULL);
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  // Initialize enc row MT object.
+  if (is_first_pass || cpi->oxcf.row_mt == 1) {
+    AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+    if (enc_row_mt->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+                      aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+      if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
     }
   }
+
+  if (!is_first_pass) {
+    // Initialize global motion MT object.
+    AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+    if (gm_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+                      aom_malloc(sizeof(*(gm_sync->mutex_))));
+      if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+    }
+#if !CONFIG_REALTIME_ONLY
+    // Initialize temporal filtering MT object.
+    AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+    if (tf_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+                      aom_malloc(sizeof(*tf_sync->mutex_)));
+      if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+    }
+#endif  // !CONFIG_REALTIME_ONLY
+        // Initialize CDEF MT object.
+    AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+    if (cdef_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                      aom_malloc(sizeof(*(cdef_sync->mutex_))));
+      if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+    }
+
+    // Initialize loop filter MT object.
+    AV1LfSync *lf_sync = &mt_info->lf_row_sync;
+    // Number of superblock rows
+    const int sb_rows =
+        CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2);
+    PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+    int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);
+
+    if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+        num_lf_workers > lf_sync->num_workers) {
+      av1_loop_filter_dealloc(lf_sync);
+      av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
+    }
+
+#if !CONFIG_REALTIME_ONLY
+    if (is_restoration_used(cm)) {
+      // Initialize loop restoration MT object.
+      AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+      int rst_unit_size;
+      if (cm->width * cm->height > 352 * 288)
+        rst_unit_size = RESTORATION_UNITSIZE_MAX;
+      else
+        rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
+      int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
+      int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+      if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+          num_lr_workers > lr_sync->num_workers ||
+          MAX_MB_PLANE > lr_sync->num_planes) {
+        av1_loop_restoration_dealloc(lr_sync, num_lr_workers);
+        av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+                                   MAX_MB_PLANE, cm->width);
+      }
+    }
 #endif
 
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+    // Initialization of pack bitstream MT object.
+    AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
+    if (pack_bs_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+                      aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+      if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+    }
+  }
+}
+#endif  // CONFIG_MULTITHREAD
 
-    ++cpi->num_workers;
-    winterface->init(worker);
-    worker->thread_name = "aom enc worker";
+// Computes the number of workers to be considered while allocating memory for a
+// multi-threaded module under FPMT.
+int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+                                      MULTI_THREADED_MODULES mod_name) {
+  int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
+  if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
+    // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
+    // As frame parallel jobs will only perform multi-threading for the encode
+    // stage, we can limit the allocations according to num_enc_workers per
+    // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]).
+    num_mod_workers = p_mt_info->num_workers;
+  }
+  return num_mod_workers;
+}
 
-    thread_data->cpi = cpi;
-    thread_data->thread_id = i;
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+  assert(p_mt_info->workers != NULL);
+  assert(p_mt_info->tile_thr_data != NULL);
+
+  int num_workers = p_mt_info->num_workers;
+  int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
 
     if (i > 0) {
       // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
-                      aom_memalign(32, sizeof(*thread_data->td)));
+      AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td,
+                          aom_memalign(32, sizeof(*thread_data->td)));
       av1_zero(*thread_data->td);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      thread_data->original_td = thread_data->td;
+#endif
+
+      // Set up shared coeff buffers.
+      av1_setup_shared_coeff_buffer(
+          &ppi->seq_params, &thread_data->td->shared_coeff_buf, &ppi->error);
+      AOM_CHECK_MEM_ERROR(
+          &ppi->error, thread_data->td->tmp_conv_dst,
+          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                               sizeof(*thread_data->td->tmp_conv_dst)));
 
-      // Set up pc_tree.
-      thread_data->td->pc_tree = NULL;
-      av1_setup_pc_tree(cpi, thread_data->td);
+      if (i < p_mt_info->num_mod_workers[MOD_FP]) {
+        // Set up firstpass PICK_MODE_CONTEXT.
+        thread_data->td->firstpass_ctx = av1_alloc_pmc(
+            ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+      }
 
-      CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
-                      (uint8_t *)aom_memalign(
-                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*thread_data->td->above_pred_buf)));
-      CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
-                      (uint8_t *)aom_memalign(
-                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*thread_data->td->left_pred_buf)));
+      if (!is_first_pass && i < num_enc_workers) {
+        // Set up sms_tree.
+        av1_setup_sms_tree(ppi->cpi, thread_data->td);
+
+        for (int x = 0; x < 2; x++)
+          for (int y = 0; y < 2; y++)
+            AOM_CHECK_MEM_ERROR(
+                &ppi->error, thread_data->td->hash_value_buffer[x][y],
+                (uint32_t *)aom_malloc(
+                    AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+                    sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+        // Allocate frame counters in thread data.
+        AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td->counts,
+                            aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Allocate buffers used by palette coding mode.
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, thread_data->td->palette_buffer,
+            aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+        // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are
+        // used in inter frames to store intermediate inter mode prediction
+        // results and are not required for allintra encoding mode. Hence, the
+        // memory allocations for these buffers are avoided for allintra
+        // encoding mode.
+        if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
+          alloc_obmc_buffers(&thread_data->td->obmc_buffer, &ppi->error);
+
+          alloc_compound_type_rd_buffers(&ppi->error,
+                                         &thread_data->td->comp_rd_buffer);
+
+          for (int j = 0; j < 2; ++j) {
+            AOM_CHECK_MEM_ERROR(
+                &ppi->error, thread_data->td->tmp_pred_bufs[j],
+                aom_memalign(32,
+                             2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                 sizeof(*thread_data->td->tmp_pred_bufs[j])));
+          }
+        }
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->wsrc_buf,
-          (int32_t *)aom_memalign(
-              16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
-
-      CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
-                      (InterModesInfo *)aom_malloc(
-                          sizeof(*thread_data->td->inter_modes_info)));
-
-      for (int x = 0; x < 2; x++)
-        for (int y = 0; y < 2; y++)
-          CHECK_MEM_ERROR(
-              cm, thread_data->td->hash_value_buffer[x][y],
-              (uint32_t *)aom_malloc(
-                  AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
-                  sizeof(*thread_data->td->hash_value_buffer[0][0])));
+        if (is_gradient_caching_for_hog_enabled(ppi->cpi)) {
+          const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->pixel_gradient_info,
+              aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) *
+                         plane_types * MAX_SB_SQUARE));
+        }
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->mask_buf,
-          (int32_t *)aom_memalign(
-              16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
-      // Allocate frame counters in thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                      aom_calloc(1, sizeof(*thread_data->td->counts)));
-
-      // Allocate buffers used by palette coding mode.
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->palette_buffer,
-          aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+        if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) {
+          const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size;
+          const int mi_count_in_sb =
+              mi_size_wide[sb_size] * mi_size_high[sb_size];
 
-      av1_alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->src_var_info_of_4x4_sub_blocks,
+              aom_malloc(
+                  sizeof(*thread_data->td->src_var_info_of_4x4_sub_blocks) *
+                  mi_count_in_sb));
+        }
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->tmp_conv_dst,
-          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
-                               sizeof(*thread_data->td->tmp_conv_dst)));
-      for (int j = 0; j < 2; ++j) {
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->tmp_obmc_bufs[j],
-            aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                 sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+        if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+          const int num_64x64_blocks =
+              (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->vt64x64,
+              aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
+        }
       }
+    }
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->mbmi_ext,
-          aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext)));
-
-      if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
-        const int num_64x64_blocks =
-            (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->vt64x64,
-            aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
+    if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {
+      if (i == 0) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        for (int j = 0; j < ppi->num_fp_contexts; j++) {
+          AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
+                              (FRAME_CONTEXT *)aom_memalign(
+                                  16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
+        }
+#else
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, ppi->cpi->td.tctx,
+            (FRAME_CONTEXT *)aom_memalign(16, sizeof(*ppi->cpi->td.tctx)));
+#endif
+      } else {
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, thread_data->td->tctx,
+            (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
       }
+    }
+  }
+}
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
+                      aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
 
+  AOM_CHECK_MEM_ERROR(
+      &ppi->error, p_mt_info->tile_thr_data,
+      aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));
+
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &p_mt_info->workers[i];
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+    winterface->init(worker);
+    worker->thread_name = "aom enc worker";
+
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i > 0) {
       // Create threads
       if (!winterface->reset(worker))
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
                            "Tile encoder thread creation failed");
-    } else {
-      // Main thread acts as a worker and uses the thread data in cpi.
-      thread_data->td = &cpi->td;
     }
-    if (cpi->oxcf.row_mt == 1)
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->tctx,
-          (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
     winterface->sync(worker);
+
+    ++p_mt_info->num_workers;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// This function returns 1 if frame parallel encode is supported for
+// the current configuration. Returns 0 otherwise.
+static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+  // FPMT is enabled for AOM_Q and AOM_VBR.
+  // TODO(Tarun): Test and enable resize config.
+  if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
+    return 0;
+  }
+  if (ppi->use_svc) {
+    return 0;
+  }
+  if (oxcf->tile_cfg.enable_large_scale_tile) {
+    return 0;
+  }
+  if (oxcf->dec_model_cfg.timing_info_present) {
+    return 0;
+  }
+  if (oxcf->mode != GOOD) {
+    return 0;
+  }
+  if (oxcf->tool_cfg.error_resilient_mode) {
+    return 0;
+  }
+  if (oxcf->resize_cfg.resize_mode) {
+    return 0;
+  }
+  if (oxcf->pass != AOM_RC_SECOND_PASS) {
+    return 0;
+  }
+  if (oxcf->max_threads < 2) {
+    return 0;
+  }
+  if (!oxcf->fp_mt) {
+    return 0;
+  }
+
+  return 1;
+}
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
+                          AV1EncoderConfig *const oxcf) {
+  if (is_fpmt_config(ppi, oxcf)) return 1;
+  // Reset frame parallel configuration for unsupported config
+  if (ppi->num_fp_contexts > 1) {
+    for (int i = 1; i < ppi->num_fp_contexts; i++) {
+      // Release the previously-used frame-buffer
+      if (ppi->parallel_cpi[i]->common.cur_frame != NULL) {
+        --ppi->parallel_cpi[i]->common.cur_frame->ref_count;
+        ppi->parallel_cpi[i]->common.cur_frame = NULL;
+      }
+    }
+
+    int cur_gf_index = ppi->cpi->gf_frame_index;
+    int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index);
+    av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index],
+                   reset_size);
+    av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size);
+    av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size);
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX,
+           sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) *
+               reset_size * REF_FRAMES);
+    memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX,
+           sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size);
+#endif
+    ppi->num_fp_contexts = 1;
+  }
+  return 0;
+}
+
+// A large value for threads used to compute the max num_enc_workers
+// possible for each resolution.
+#define MAX_THREADS 100
+
+// Computes the max number of enc workers possible for each resolution.
+static AOM_INLINE int compute_max_num_enc_workers(
+    CommonModeInfoParams *const mi_params, int mib_size_log2) {
+  int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
+  int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);
+
+  return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows);
+}
+
+// Computes the number of frame parallel(fp) contexts to be created
+// based on the number of max_enc_workers.
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+  ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
+  if (!av1_check_fpmt_config(ppi, oxcf)) {
+    return 1;
+  }
+  int max_num_enc_workers = compute_max_num_enc_workers(
+      &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2);
+  // Scaling factors and rounding factors used to tune worker_per_frame
+  // computation.
+  int rounding_factor[2] = { 2, 4 };
+  int scaling_factor[2] = { 4, 8 };
+  int is_480p_or_lesser =
+      AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480;
+  int is_sb_64 = 0;
+  if (ppi->cpi != NULL)
+    is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64;
+  // A parallel frame encode has at least 1/4th the
+  // theoretical limit of max enc workers in default case. For resolutions
+  // larger than 480p, if SB size is 64x64, optimal performance is obtained with
+  // limit of 1/8.
+  int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0;
+  int workers_per_frame =
+      AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) /
+                    scaling_factor[index]);
+  int max_threads = oxcf->max_threads;
+  int num_fp_contexts = max_threads / workers_per_frame;
+  // Based on empirical results, FPMT gains with multi-tile are significant when
+  // more parallel frames are available. Use FPMT with multi-tile encode only
+  // when sufficient threads are available for parallel encode of
+  // MAX_PARALLEL_FRAMES frames.
+  if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
+    if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1;
+  }
+
+  num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES));
+  // Limit recalculated num_fp_contexts to ppi->num_fp_contexts.
+  num_fp_contexts = (ppi->num_fp_contexts == 1)
+                        ? num_fp_contexts
+                        : AOMMIN(num_fp_contexts, ppi->num_fp_contexts);
+  if (num_fp_contexts > 1) {
+    ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
+        AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads);
   }
+  return num_fp_contexts;
 }
 
-static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+// Computes the number of workers to process each of the parallel frames.
+static AOM_INLINE int compute_num_workers_per_frame(
+    const int num_workers, const int parallel_frame_count) {
+  // Number of level 2 workers per frame context (floor division).
+  int workers_per_frame = (num_workers / parallel_frame_count);
+  return workers_per_frame;
+}
+
+// Prepare level 1 workers. This function is only called for
+// parallel_frame_count > 1. This function populates the mt_info structure of
+// frame level contexts appropriately by dividing the total number of available
+// workers amongst the frames as level 2 workers. It also populates the hook and
+// data members of level 1 workers.
+static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
+                                            AV1_COMP_DATA *first_cpi_data,
+                                            AVxWorkerHook hook,
+                                            int parallel_frame_count) {
+  assert(parallel_frame_count <= ppi->num_fp_contexts &&
+         parallel_frame_count > 1);
+
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  int num_workers = p_mt_info->num_workers;
+
+  int frame_idx = 0;
+  int i = 0;
+  while (i < num_workers) {
+    // Assign level 1 worker
+    AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
+        &p_mt_info->workers[i];
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+    AV1_COMMON *const cm = &cur_cpi->common;
+    const int num_planes = av1_num_planes(cm);
+
+    // Assign start of level 2 worker pool
+    mt_info->workers = &p_mt_info->workers[i];
+    mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
+    // Assign number of workers for each frame in the parallel encode set.
+    mt_info->num_workers = compute_num_workers_per_frame(
+        num_workers - i, parallel_frame_count - frame_idx);
+    for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
+      mt_info->num_mod_workers[j] =
+          AOMMIN(mt_info->num_workers, ppi->p_mt_info.num_mod_workers[j]);
+    }
+    if (ppi->p_mt_info.cdef_worker != NULL) {
+      mt_info->cdef_worker = &ppi->p_mt_info.cdef_worker[i];
+
+      // Back up the original cdef_worker pointers.
+      mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
+      for (int plane = 0; plane < num_planes; plane++)
+        mt_info->restore_state_buf.cdef_colbuf[plane] =
+            mt_info->cdef_worker->colbuf[plane];
+    }
+#if !CONFIG_REALTIME_ONLY
+    if (is_restoration_used(cm)) {
+      // Back up the original LR buffers before update.
+      int idx = i + mt_info->num_workers - 1;
+      mt_info->restore_state_buf.rst_tmpbuf =
+          mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+      mt_info->restore_state_buf.rlbs =
+          mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+
+      // Update LR buffers.
+      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
+      mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
+    }
+#endif
+
+    // At this stage, the thread specific CDEF buffers for the current frame's
+    // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has
+    // already been allocated across parallel frames.
+    av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
+                           p_mt_info->num_workers, 0);
+
+    frame_worker->hook = hook;
+    frame_worker->data1 = cur_cpi;
+    frame_worker->data2 = (frame_idx == 0)
+                              ? first_cpi_data
+                              : &ppi->parallel_frames_data[frame_idx - 1];
+    frame_idx++;
+    i += mt_info->num_workers;
+  }
+  p_mt_info->p_num_workers = parallel_frame_count;
+}
+
+// Launch level 1 workers to perform frame parallel encode.
+static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  // Encode a frame
+  int num_workers = ppi->p_mt_info.p_num_workers;
+
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+    if (i == 0)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+}
 
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_workers = ppi->p_mt_info.p_num_workers;
+  int had_error = 0;
+  // Points to error in the earliest display order frame in the parallel set.
+  const struct aom_internal_error_info *error;
 
+  // Encoding ends.
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error = ((AV1_COMP *)worker->data1)->common.error;
+    }
+  }
+
+  if (had_error)
+    aom_internal_error(&ppi->error, error->error_code, "%s", error->detail);
+}
+
+// Restore worker states after parallel encode.
+static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
+                                                  int parallel_frame_count) {
+  assert(parallel_frame_count <= ppi->num_fp_contexts &&
+         parallel_frame_count > 1);
+  (void)parallel_frame_count;
+
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  int num_workers = p_mt_info->num_workers;
+
+  int frame_idx = 0;
+  int i = 0;
+  while (i < num_workers) {
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+    const AV1_COMMON *const cm = &cur_cpi->common;
+    const int num_planes = av1_num_planes(cm);
+
+    // Restore the original cdef_worker pointers.
+    if (ppi->p_mt_info.cdef_worker != NULL) {
+      mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
+      for (int plane = 0; plane < num_planes; plane++)
+        mt_info->cdef_worker->colbuf[plane] =
+            mt_info->restore_state_buf.cdef_colbuf[plane];
+    }
+#if !CONFIG_REALTIME_ONLY
+    if (is_restoration_used(cm)) {
+      // Restore the original LR buffers.
+      int idx = i + mt_info->num_workers - 1;
+      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+          mt_info->restore_state_buf.rst_tmpbuf;
+      mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+          mt_info->restore_state_buf.rlbs;
+    }
+#endif
+
+    frame_idx++;
+    i += mt_info->num_workers;
+  }
+}
+
+static int get_compressed_data_hook(void *arg1, void *arg2) {
+  AV1_COMP *cpi = (AV1_COMP *)arg1;
+  AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
+  int status = av1_get_compressed_data(cpi, cpi_data);
+
+  // AOM_CODEC_OK(0) means no error.
+  return !status;
+}
+
+// This function encodes the raw frame data for each frame in parallel encode
+// set, and outputs the frame bit stream to the designated buffers.
+int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+                                 AV1_COMP_DATA *const first_cpi_data) {
+  // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
+  // corresponding to frames in the current parallel encode set.
+  int ref_buffers_used_map = 0;
+  int frames_in_parallel_set = av1_init_parallel_frame_context(
+      first_cpi_data, ppi, &ref_buffers_used_map);
+  prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
+                       frames_in_parallel_set);
+  launch_fpmt_workers(ppi);
+  sync_fpmt_workers(ppi);
+  restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+
+  // Release cpi->scaled_ref_buf corresponding to frames in the current parallel
+  // encode set.
+  for (int i = 0; i < frames_in_parallel_set; ++i) {
+    av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]);
+  }
+  av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
+                                ref_buffers_used_map);
+  return AOM_CODEC_OK;
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
+                                      int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
     if (i == 0)
       winterface->execute(worker);
     else
@@ -525,36 +1187,55 @@ static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
   }
 }
 
-static AOM_INLINE void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
+                                        AV1_COMMON *const cm, int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   int had_error = 0;
 
   // Encoding ends.
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
+  for (int i = num_workers - 1; i > 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
     had_error |= !winterface->sync(worker);
   }
 
   if (had_error)
-    aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Failed to encode tile data");
 }
 
 static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
                                                        int num_workers) {
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
+    AVxWorker *const worker = &cpi->mt_info.workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     cpi->intrabc_used |= thread_data->td->intrabc_used;
     cpi->deltaq_used |= thread_data->td->deltaq_used;
+    // Accumulate cyclic refresh params.
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+        !frame_is_intra_only(&cpi->common))
+      av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh,
+                                             &thread_data->td->mb);
+    if (thread_data->td != &cpi->td) {
+      // Keep these conditional expressions in sync with the corresponding ones
+      // in prepare_enc_workers().
+      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.mv_costs);
+      }
+      if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.dv_costs);
+      }
+    }
+    av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb);
 
     // Accumulate counters.
     if (i > 0) {
       av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
       accumulate_rd_opt(&cpi->td, thread_data->td);
-      cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
+      cpi->td.mb.txfm_search_info.txb_split_count +=
+          thread_data->td->mb.txfm_search_info.txb_split_count;
 #if CONFIG_SPEED_STATS
-      cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count;
+      cpi->td.mb.txfm_search_info.tx_search_count +=
+          thread_data->td->mb.txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
     }
   }
@@ -562,26 +1243,43 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
 
 static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                            int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1_COMMON *const cm = &cpi->common;
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
 
     worker->hook = hook;
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
+    thread_data->td->abs_sum_level = 0;
+    thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0;
+    thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
       thread_data->td->rd_counts = cpi->td.rd_counts;
-      thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
-      thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
-      thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
 
-      thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
       for (int x = 0; x < 2; x++) {
         for (int y = 0; y < 2; y++) {
           memcpy(thread_data->td->hash_value_buffer[x][y],
@@ -592,9 +1290,32 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
               thread_data->td->hash_value_buffer[x][y];
         }
       }
-      thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
-      thread_data->td->mb.mbmi_ext = thread_data->td->mbmi_ext;
+      // Keep these conditional expressions in sync with the corresponding ones
+      // in accumulate_counters_enc_workers().
+      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
+                        (MvCosts *)aom_malloc(sizeof(MvCosts)));
+        memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
+               sizeof(MvCosts));
+      }
+      if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        // Reset dv_costs to NULL for worker threads when dv cost update is
+        // enabled so that only dv_cost_upd_level needs to be checked before the
+        // aom_free() call for the same.
+        thread_data->td->mb.dv_costs = NULL;
+        if (av1_need_dv_costs(cpi)) {
+          CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs,
+                          (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts)));
+          memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+                 sizeof(IntraBCMVCosts));
+        }
+      }
     }
+    av1_alloc_mb_data(cpi, &thread_data->td->mb);
+
+    // Reset cyclic refresh counters.
+    av1_init_cyclic_refresh_counters(&thread_data->td->mb);
+
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
@@ -604,38 +1325,133 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
-        thread_data->td->mb.tmp_obmc_bufs[j] =
-            thread_data->td->tmp_obmc_bufs[j];
+        thread_data->td->mb.tmp_pred_bufs[j] =
+            thread_data->td->tmp_pred_bufs[j];
       }
+      thread_data->td->mb.pixel_gradient_info =
+          thread_data->td->pixel_gradient_info;
+
+      thread_data->td->mb.src_var_info_of_4x4_sub_blocks =
+          thread_data->td->src_var_info_of_4x4_sub_blocks;
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
-            thread_data->td->mb.tmp_obmc_bufs[j];
+            thread_data->td->mb.tmp_pred_bufs[j];
+      }
+    }
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                              int num_workers) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      // Keep this conditional expression in sync with the corresponding one
+      // in av1_fp_encode_tiles_row_mt().
+      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        CHECK_MEM_ERROR(cm, thread_data->td->mb.mv_costs,
+                        (MvCosts *)aom_malloc(sizeof(MvCosts)));
+        memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
+               sizeof(MvCosts));
       }
     }
+
+    av1_alloc_mb_data(cpi, &thread_data->td->mb);
   }
 }
+#endif
+
+// Computes the number of workers for row multi-threading of encoding stage
+static AOM_INLINE int compute_num_enc_row_mt_workers(AV1_COMMON *const cm,
+                                                     int max_threads) {
+  TileInfo tile_info;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int total_num_threads_row_mt = 0;
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      av1_tile_init(&tile_info, cm, row, col);
+      const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info);
+      const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info);
+      total_num_threads_row_mt +=
+          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+    }
+  }
+  return AOMMIN(max_threads, total_num_threads_row_mt);
+}
+
+// Computes the number of workers for tile multi-threading of encoding stage
+static AOM_INLINE int compute_num_enc_tile_mt_workers(AV1_COMMON *const cm,
+                                                      int max_threads) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  return AOMMIN(max_threads, tile_cols * tile_rows);
+}
+
+// Find max worker of all MT stages
+int av1_get_max_num_workers(const AV1_COMP *cpi) {
+  int max_num_workers = 0;
+  for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+    max_num_workers =
+        AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers);
+  assert(max_num_workers >= 1);
+  return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
+}
+
+// Computes the number of workers for encoding stage (row/tile multi-threading)
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+  if (max_workers <= 1) return 1;
+  if (cpi->oxcf.row_mt)
+    return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
+  else
+    return compute_num_enc_tile_mt_workers(&cpi->common, max_workers);
+}
 
 void av1_encode_tiles_mt(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
+  int num_workers = mt_info->num_mod_workers[MOD_ENC];
 
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
-    av1_alloc_tile_data(cpi);
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    create_enc_workers(cpi, num_workers);
-  } else {
-    num_workers = AOMMIN(num_workers, cpi->num_workers);
-  }
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
   prepare_enc_workers(cpi, enc_worker_hook, num_workers);
-  launch_enc_workers(cpi, num_workers);
-  sync_enc_workers(cpi, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
   accumulate_counters_enc_workers(cpi, num_workers);
 }
 
@@ -651,61 +1467,115 @@ void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
   for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
 }
 
-void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+// Computes the maximum number of sb_rows for row multi-threading of encoding
+// stage
+static AOM_INLINE void compute_max_sb_rows_cols(AV1_COMP *cpi, int *max_sb_rows,
+                                                int *max_sb_cols) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
-  int num_workers = 0;
-  int total_num_threads_row_mt = 0;
-  int max_sb_rows = 0;
-
-  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
-    av1_row_mt_mem_dealloc(cpi);
-    av1_alloc_tile_data(cpi);
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      const int tile_index = row * cm->tiles.cols + col;
+      const TileInfo *const tile_info = &cpi->tile_data[tile_index].tile_info;
+      const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, tile_info);
+      const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+      *max_sb_rows = AOMMAX(*max_sb_rows, num_sb_rows_in_tile);
+      *max_sb_cols = AOMMAX(*max_sb_cols, num_sb_cols_in_tile);
+    }
   }
+}
 
-  av1_init_tile_data(cpi);
+#if !CONFIG_REALTIME_ONLY
+// Computes the number of workers for firstpass stage (row/tile multi-threading)
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int total_num_threads_row_mt = 0;
+  TileInfo tile_info;
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
 
   for (int row = 0; row < tile_rows; row++) {
     for (int col = 0; col < tile_cols; col++) {
-      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
-      int num_sb_rows_in_tile =
-          av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
-      int num_sb_cols_in_tile =
-          av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
+      av1_tile_init(&tile_info, cm, row, col);
+      const int num_mb_rows_in_tile =
+          av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size);
+      const int num_mb_cols_in_tile =
+          av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size);
       total_num_threads_row_mt +=
-          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
-      max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile);
+          AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
+    }
+  }
+  return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+}
+
+// Computes the maximum number of mb_rows for row multi-threading of firstpass
+// stage
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *const cm,
+                                             const TileDataEnc *const tile_data,
+                                             const BLOCK_SIZE fp_block_size) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int max_mb_rows = 0;
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      const int tile_index = row * cm->tiles.cols + col;
+      const TileInfo *const tile_info = &tile_data[tile_index].tile_info;
+      const int num_mb_rows_in_tile =
+          av1_get_unit_rows_in_tile(tile_info, fp_block_size);
+      max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
     }
   }
-  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
-  // post-processing stages in encoder is quiet low, so limiting the number of
-  // threads to the theoretical limit in row-mt does not have much impact on
-  // post-processing multi-threading stage. Need to revisit this when
-  // post-processing time starts shooting up.
-  num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+  return max_mb_rows;
+}
+#endif
+
+void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+  int max_sb_rows = 0, max_sb_cols = 0;
+  int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
+
+  compute_max_sb_rows_cols(cpi, &max_sb_rows, &max_sb_cols);
 
-  if (multi_thread_ctxt->allocated_tile_cols != tile_cols ||
-      multi_thread_ctxt->allocated_tile_rows != tile_rows ||
-      multi_thread_ctxt->allocated_sb_rows != max_sb_rows) {
+  if (enc_row_mt->allocated_tile_cols != tile_cols ||
+      enc_row_mt->allocated_tile_rows != tile_rows ||
+      enc_row_mt->allocated_rows != max_sb_rows ||
+      enc_row_mt->allocated_cols != (max_sb_cols - 1)) {
     av1_row_mt_mem_dealloc(cpi);
-    av1_row_mt_mem_alloc(cpi, max_sb_rows);
+    row_mt_mem_alloc(cpi, max_sb_rows, max_sb_cols,
+                     cpi->oxcf.algo_cfg.cdf_update_mode);
   }
 
-  memset(multi_thread_ctxt->thread_id_to_tile_id, -1,
-         sizeof(*multi_thread_ctxt->thread_id_to_tile_id) * MAX_NUM_THREADS);
+  memset(thread_id_to_tile_id, -1,
+         sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
 
   for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
-      int tile_id = tile_row * tile_cols + tile_col;
-      TileDataEnc *this_tile = &cpi->tile_data[tile_id];
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
 
-      // Initialize cur_col to -1 for all rows.
-      memset(this_tile->row_mt_sync.cur_col, -1,
-             sizeof(*this_tile->row_mt_sync.cur_col) * max_sb_rows);
-      this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
-      this_tile->row_mt_info.num_threads_working = 0;
+      // Initialize num_finished_cols to -1 for all rows.
+      memset(row_mt_sync->num_finished_cols, -1,
+             sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows);
+      row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+      row_mt_sync->num_threads_working = 0;
 
       av1_inter_mode_data_init(this_tile);
       av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
@@ -714,16 +1584,1212 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
     }
   }
 
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    create_enc_workers(cpi, num_workers);
-  } else {
-    num_workers = AOMMIN(num_workers, cpi->num_workers);
-  }
-  assign_tile_to_thread(multi_thread_ctxt, tile_cols * tile_rows, num_workers);
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
+  assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+                        num_workers);
   prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
-  launch_enc_workers(cpi, num_workers);
-  sync_enc_workers(cpi, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
   if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
   accumulate_counters_enc_workers(cpi, num_workers);
 }
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+  int num_workers = 0;
+  int max_mb_rows = 0;
+
+  assert(IMPLIES(cpi->tile_data == NULL,
+                 cpi->allocated_tiles < tile_cols * tile_rows));
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
+
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  max_mb_rows = fp_compute_max_mb_rows(cm, cpi->tile_data, fp_block_size);
+
+  // For pass = 1, compute the no. of workers needed. For single-pass encode
+  // (pass = 0), no. of workers are already computed.
+  if (mt_info->num_mod_workers[MOD_FP] == 0)
+    num_workers = av1_fp_compute_num_enc_workers(cpi);
+  else
+    num_workers = mt_info->num_mod_workers[MOD_FP];
+
+  if (enc_row_mt->allocated_tile_cols != tile_cols ||
+      enc_row_mt->allocated_tile_rows != tile_rows ||
+      enc_row_mt->allocated_rows != max_mb_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+  }
+
+  memset(thread_id_to_tile_id, -1,
+         sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_index = tile_row * tile_cols + tile_col;
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+      // Initialize num_finished_cols to -1 for all rows.
+      memset(row_mt_sync->num_finished_cols, -1,
+             sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
+      row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+      row_mt_sync->num_threads_working = 0;
+    }
+  }
+
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+  assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+                        num_workers);
+  fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+    if (thread_data->td != &cpi->td) {
+      // Keep this conditional expression in sync with the corresponding one
+      // in fp_prepare_enc_workers().
+      if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.mv_costs);
+      }
+      assert(!thread_data->td->mb.dv_costs);
+    }
+    av1_dealloc_mb_data(cm, &thread_data->td->mb);
+  }
+}
+
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                    int r, int c) {
+  (void)tpl_mt_sync;
+  (void)r;
+  (void)c;
+  return;
+}
+
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                     int r, int c, int cols) {
+  (void)tpl_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+  return;
+}
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+                              int c) {
+#if CONFIG_MULTITHREAD
+  int nsync = tpl_row_mt_sync->sync_range;
+
+  if (r) {
+    pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
+      pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)tpl_row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+                               int c, int cols) {
+#if CONFIG_MULTITHREAD
+  int nsync = tpl_row_mt_sync->sync_range;
+  int cur;
+  // Only signal when there are enough encoded blocks for next row to run.
+  int sig = 1;
+
+  if (c < cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;
+  } else {
+    cur = cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);
+
+    tpl_row_mt_sync->num_finished_cols[r] = cur;
+
+    pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+    pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+  }
+#else
+  (void)tpl_row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+static int tpl_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &thread_data->td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+  TX_SIZE tx_size = max_txsize_lookup[bsize];
+  int mi_height = mi_size_high[bsize];
+  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+  av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+  for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
+       mi_row += num_active_workers * mi_height) {
+    // Motion estimation row boundary
+    av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+                          cpi->oxcf.border_in_pixels);
+    xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+    xd->mb_to_bottom_edge =
+        GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+    av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, x, mi_row, bsize, tx_size);
+  }
+  return 1;
+}
+
+// Deallocate tpl synchronization related mutex and data.
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
+  assert(tpl_sync != NULL);
+
+#if CONFIG_MULTITHREAD
+  if (tpl_sync->mutex_ != NULL) {
+    for (int i = 0; i < tpl_sync->rows; ++i)
+      pthread_mutex_destroy(&tpl_sync->mutex_[i]);
+    aom_free(tpl_sync->mutex_);
+  }
+  if (tpl_sync->cond_ != NULL) {
+    for (int i = 0; i < tpl_sync->rows; ++i)
+      pthread_cond_destroy(&tpl_sync->cond_[i]);
+    aom_free(tpl_sync->cond_);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  aom_free(tpl_sync->num_finished_cols);
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*tpl_sync);
+}
+
+// Allocate memory for tpl row synchronization.
+void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
+                   int mb_rows) {
+  tpl_sync->rows = mb_rows;
+#if CONFIG_MULTITHREAD
+  {
+    CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
+                    aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
+    if (tpl_sync->mutex_) {
+      for (int i = 0; i < mb_rows; ++i)
+        pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
+    }
+
+    CHECK_MEM_ERROR(cm, tpl_sync->cond_,
+                    aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
+    if (tpl_sync->cond_) {
+      for (int i = 0; i < mb_rows; ++i)
+        pthread_cond_init(&tpl_sync->cond_[i], NULL);
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
+                  aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));
+
+  // Set up nsync.
+  tpl_sync->sync_range = 1;
+}
+
+// Each worker is prepared by assigning the hook function and individual thread
+// data.
+static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                           int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      // OBMC buffers are used only to init MS params and remain unused when
+      // called from tpl, hence set the buffers to defaults.
+      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+      thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+    }
+  }
+}
+
+// Accumulate transform stats after tpl.
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+                                      const MultiThreadInfo *mt_info,
+                                      int num_workers) {
+  TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    ThreadData *td = thread_data->td;
+    if (td != main_td) {
+      const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+      av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
+    }
+  }
+}
+
+// Implements multi-threading for tpl.
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
+  AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
+  int mb_rows = mi_params->mb_rows;
+  int num_workers =
+      AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
+
+  if (mb_rows != tpl_sync->rows) {
+    av1_tpl_dealloc(tpl_sync);
+    av1_tpl_alloc(tpl_sync, cm, mb_rows);
+  }
+  tpl_sync->num_threads_working = num_workers;
+
+  // Initialize cur_mb_col to -1 for all MB rows.
+  memset(tpl_sync->num_finished_cols, -1,
+         sizeof(*tpl_sync->num_finished_cols) * mb_rows);
+
+  prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
+  tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
+}
+
+// Deallocate memory for temporal filter multi-thread synchronization.
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
+  assert(tf_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (tf_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(tf_sync->mutex_);
+    aom_free(tf_sync->mutex_);
+  }
+#endif  // CONFIG_MULTITHREAD
+  tf_sync->next_tf_row = 0;
+}
+
+// Checks if a job is available. If job is available,
+// populates next_tf_row and returns 1, else returns 0.
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+                                      int *current_mb_row, int mb_rows) {
+  int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+  pthread_mutex_lock(tf_mutex_);
+#endif
+  if (tf_mt_sync->next_tf_row < mb_rows) {
+    *current_mb_row = tf_mt_sync->next_tf_row;
+    tf_mt_sync->next_tf_row++;
+    do_next_row = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(tf_mutex_);
+#endif
+  return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+static int tf_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  ThreadData *td = thread_data->td;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+  const struct scale_factors *scale = &cpi->tf_ctx.sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  MB_MODE_INFO **input_mb_mode_info;
+  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+  tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+  int current_mb_row = -1;
+
+  while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+    av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+  return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                               int num_workers, int is_highbitdepth) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  mt_info->tf_sync.next_tf_row = 0;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      // OBMC buffers are used only to init MS params and remain unused when
+      // called from tf, hence set the buffers to defaults.
+      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+      if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
+                                   cpi->tf_ctx.num_pels, is_highbitdepth)) {
+        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Error allocating temporal filter data");
+      }
+    }
+  }
+}
+
+// Deallocate thread specific data for temporal filter.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+                                   int is_highbitdepth) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+    ThreadData *td = thread_data->td;
+    if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+  }
+}
+
+// Accumulate sse and sum after temporal filtering.
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+  FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &cpi->mt_info.workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    ThreadData *td = thread_data->td;
+    FRAME_DIFF *diff = &td->tf_data.diff;
+    if (td != &cpi->td) {
+      total_diff->sse += diff->sse;
+      total_diff->sum += diff->sum;
+    }
+  }
+}
+
+// Implements multi-threading for temporal filter.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+
+  int num_workers =
+      AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
+
+  prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, cm, num_workers);
+  tf_accumulate_frame_diff(cpi, num_workers);
+  tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
+}
+
+// Checks if a job is available in the current direction. If a job is available,
+// frame_idx will be populated and returns 1, else returns 0.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+                                      int cur_dir) {
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+  JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+  int total_refs = gm_info->num_ref_frames[cur_dir];
+  int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
+
+  if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
+    *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
+    job_info->next_frame_to_process[cur_dir] += 1;
+    return 1;
+  }
+  return 0;
+}
+
+// Switches the current direction and calls the function get_next_gm_job() if
+// the speed feature 'prune_ref_frame_for_gm_search' is not set.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+                                        int *cur_dir) {
+  if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
+  // Switch the direction and get next job
+  *cur_dir = !(*cur_dir);
+  get_next_gm_job(cpi, frame_idx, *(cur_dir));
+}
+
+// Initializes inliers, num_inliers and segment_map.
+static AOM_INLINE void init_gm_thread_data(
+    const GlobalMotionInfo *gm_info, GlobalMotionThreadData *thread_data) {
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    MotionModel motion_params = thread_data->params_by_motion[m];
+    av1_zero(motion_params.params);
+    motion_params.num_inliers = 0;
+  }
+
+  av1_zero_array(thread_data->segment_map,
+                 gm_info->segment_map_w * gm_info->segment_map_h);
+}
+
+// Hook function for each thread in global motion multi-threading.
+static int gm_mt_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+
+  EncWorkerData *thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *cpi = thread_data->cpi;
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  JobInfo *job_info = &mt_info->gm_sync.job_info;
+  int thread_id = thread_data->thread_id;
+  GlobalMotionThreadData *gm_thread_data =
+      &mt_info->gm_sync.thread_data[thread_id];
+  int cur_dir = job_info->thread_id_to_dir[thread_id];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+#endif
+
+  while (1) {
+    int ref_buf_idx = -1;
+    int ref_frame_idx = -1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(gm_mt_mutex_);
+#endif
+
+    // Populates ref_buf_idx(the reference frame type) for which global motion
+    // estimation will be done.
+    if (!get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+      // No jobs are available for the current direction. Switch
+      // to other direction and get the next job, if available.
+      switch_direction(cpi, &ref_buf_idx, &cur_dir);
+    }
+
+    // 'ref_frame_idx' holds the index of the current reference frame type in
+    // gm_info->reference_frames. job_info->next_frame_to_process will be
+    // incremented in get_next_gm_job() and hence subtracting by 1.
+    ref_frame_idx = job_info->next_frame_to_process[cur_dir] - 1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+
+    if (ref_buf_idx == -1) break;
+
+    init_gm_thread_data(gm_info, gm_thread_data);
+
+    // Compute global motion for the given ref_buf_idx.
+    av1_compute_gm_for_valid_ref_frames(
+        cpi, gm_info->ref_buf, ref_buf_idx, gm_info->num_src_corners,
+        gm_info->src_corners, gm_info->src_buffer,
+        gm_thread_data->params_by_motion, gm_thread_data->segment_map,
+        gm_info->segment_map_w, gm_info->segment_map_h);
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(gm_mt_mutex_);
+#endif
+    assert(ref_frame_idx != -1);
+    // If global motion w.r.t. current ref frame is
+    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+    // the remaining ref frames in that direction. The below exit is disabled
+    // when ref frame distance w.r.t. current frame is zero. E.g.:
+    // source_alt_ref_frame w.r.t. ARF frames.
+    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+        gm_info->reference_frames[cur_dir][ref_frame_idx].distance != 0 &&
+        cpi->common.global_motion[ref_buf_idx].wmtype != ROTZOOM)
+      job_info->early_exit[cur_dir] = 1;
+
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+  }
+  return 1;
+}
+
+// Assigns global motion hook function and thread data to each worker.
+static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                          int num_workers) {
+  MultiThreadInfo *mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = NULL;
+
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  }
+}
+
+// Assigns available threads to past/future direction.
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+                                            int num_workers) {
+  int8_t frame_dir_idx = 0;
+
+  for (int i = 0; i < num_workers; i++) {
+    thread_id_to_dir[i] = frame_dir_idx++;
+    if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
+  }
+}
+
+// Computes number of workers for global motion multi-threading.
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+  int total_refs =
+      cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+  int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+                           ? AOMMIN(MAX_DIRECTIONS, total_refs)
+                           : total_refs;
+  num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers);
+  return (num_gm_workers);
+}
+
+// Frees the memory allocated for each worker in global motion multi-threading.
+void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data) {
+  if (gm_sync_data->thread_data != NULL) {
+    for (int j = 0; j < gm_sync_data->allocated_workers; j++) {
+      GlobalMotionThreadData *thread_data = &gm_sync_data->thread_data[j];
+      aom_free(thread_data->segment_map);
+
+      for (int m = 0; m < RANSAC_NUM_MOTIONS; m++)
+        aom_free(thread_data->params_by_motion[m].inliers);
+    }
+    aom_free(gm_sync_data->thread_data);
+  }
+}
+
+// Allocates memory for inliers and segment_map for each worker in global motion
+// multi-threading.
+static AOM_INLINE void gm_alloc(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *cm = &cpi->common;
+  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+  GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+  gm_sync->allocated_workers = num_workers;
+  gm_sync->allocated_width = cpi->source->y_width;
+  gm_sync->allocated_height = cpi->source->y_height;
+
+  CHECK_MEM_ERROR(cm, gm_sync->thread_data,
+                  aom_malloc(sizeof(*gm_sync->thread_data) * num_workers));
+
+  for (int i = 0; i < num_workers; i++) {
+    GlobalMotionThreadData *thread_data = &gm_sync->thread_data[i];
+    CHECK_MEM_ERROR(
+        cm, thread_data->segment_map,
+        aom_malloc(sizeof(*thread_data->segment_map) * gm_info->segment_map_w *
+                   gm_info->segment_map_h));
+
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      CHECK_MEM_ERROR(
+          cm, thread_data->params_by_motion[m].inliers,
+          aom_malloc(sizeof(*thread_data->params_by_motion[m].inliers) * 2 *
+                     MAX_CORNERS));
+    }
+  }
+}
+
+// Implements multi-threading for global motion.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+  JobInfo *job_info = &gm_sync->job_info;
+
+  av1_zero(*job_info);
+
+  int num_workers = compute_gm_workers(cpi);
+
+  if (num_workers > gm_sync->allocated_workers ||
+      cpi->source->y_width != gm_sync->allocated_width ||
+      cpi->source->y_height != gm_sync->allocated_height) {
+    av1_gm_dealloc(gm_sync);
+    gm_alloc(cpi, num_workers);
+  }
+
+  assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
+  prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Compare and order tiles based on absolute sum of tx coeffs.
+static int compare_tile_order(const void *a, const void *b) {
+  const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+  const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+  if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+    return -1;
+  else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+    return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+  else
+    return 1;
+}
+
+// Get next tile index to be processed for pack bitstream
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+    AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+  assert(pack_bs_sync->next_job_idx <= num_tiles);
+  if (pack_bs_sync->next_job_idx == num_tiles) return -1;
+
+  return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+      .tile_idx;
+}
+
+// Calculates bitstream chunk size based on total buffer size and tile or tile
+// group size.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+                                           const int frame_or_tg_size,
+                                           size_t *remain_buf_size,
+                                           size_t max_buf_size,
+                                           int is_last_chunk) {
+  size_t this_chunk_size;
+  assert(*remain_buf_size > 0);
+  if (is_last_chunk) {
+    this_chunk_size = *remain_buf_size;
+    *remain_buf_size = 0;
+  } else {
+    const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+    this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+    *remain_buf_size -= this_chunk_size;
+    assert(*remain_buf_size > 0);
+  }
+  assert(this_chunk_size > 0);
+  return this_chunk_size;
+}
+
+// Initializes params required for pack bitstream tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+                                     struct aom_write_bit_buffer *saved_wb,
+                                     PackBSParams *const pack_bs_params_arr,
+                                     uint8_t obu_extn_header) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cpi->num_tg;
+  // Tile group size in terms of number of tiles.
+  const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+  uint8_t *tile_dst = dst;
+  uint8_t *tile_data_curr = dst;
+  // Max tile group count can not be more than MAX_TILES.
+  int tg_size_mi[MAX_TILES] = { 0 };  // Size of tile group in mi units
+  int tile_idx;
+  int tg_idx = 0;
+  int tile_count_in_tg = 0;
+  int new_tg = 1;
+
+  // Populate pack bitstream params of all tiles.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    // Calculate tile size in mi units.
+    const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+                             (tile_info->mi_row_end - tile_info->mi_row_start);
+    int is_last_tile_in_tg = 0;
+    tile_count_in_tg++;
+    if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+      is_last_tile_in_tg = 1;
+
+    // Populate pack bitstream params of this tile.
+    pack_bs_params->curr_tg_hdr_size = 0;
+    pack_bs_params->obu_extn_header = obu_extn_header;
+    pack_bs_params->saved_wb = saved_wb;
+    pack_bs_params->obu_header_size = 0;
+    pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+    pack_bs_params->new_tg = new_tg;
+    pack_bs_params->tile_col = tile_info->tile_col;
+    pack_bs_params->tile_row = tile_info->tile_row;
+    pack_bs_params->tile_size_mi = tile_size_mi;
+    tg_size_mi[tg_idx] += tile_size_mi;
+
+    if (new_tg) new_tg = 0;
+    if (is_last_tile_in_tg) {
+      tile_count_in_tg = 0;
+      new_tg = 1;
+      tg_idx++;
+    }
+  }
+
+  assert(cpi->available_bs_size > 0);
+  size_t tg_buf_size[MAX_TILES] = { 0 };
+  size_t max_buf_size = cpi->available_bs_size;
+  size_t remain_buf_size = max_buf_size;
+  const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+  tile_idx = 0;
+  // Prepare obu, tile group and frame header of each tile group.
+  for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    int is_last_tg = tg_idx == cpi->num_tg - 1;
+    // Prorate bitstream buffer size based on tile group size and available
+    // buffer size. This buffer will be used to store headers and tile data.
+    tg_buf_size[tg_idx] =
+        get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+                          max_buf_size, is_last_tg);
+
+    pack_bs_params->dst = tile_dst;
+    pack_bs_params->tile_data_curr = tile_dst;
+
+    // Write obu, tile group and frame header at first tile in the tile
+    // group.
+    av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+    tile_dst += tg_buf_size[tg_idx];
+
+    // Exclude headers from tile group buffer size.
+    tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+    tile_idx += tg_size_in_tiles;
+  }
+
+  tg_idx = 0;
+  // Calculate bitstream buffer size of each tile in the tile group.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+    if (pack_bs_params->new_tg) {
+      max_buf_size = tg_buf_size[tg_idx];
+      remain_buf_size = max_buf_size;
+    }
+
+    // Prorate bitstream buffer size of this tile based on tile size and
+    // available buffer size. For this proration, header size is not accounted.
+    const size_t tile_buf_size = get_bs_chunk_size(
+        pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+        max_buf_size, pack_bs_params->is_last_tile_in_tg);
+    pack_bs_params->tile_buf_size = tile_buf_size;
+
+    // Update base address of bitstream buffer for tile and tile group.
+    if (pack_bs_params->new_tg) {
+      tile_dst = pack_bs_params->dst;
+      tile_data_curr = pack_bs_params->tile_data_curr;
+      // Account header size in first tile of a tile group.
+      pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+    } else {
+      pack_bs_params->dst = tile_dst;
+      pack_bs_params->tile_data_curr = tile_data_curr;
+    }
+
+    if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+    tile_dst += pack_bs_params->tile_buf_size;
+  }
+}
+
+// Worker hook function of pack bitsteam multithreading.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+  AV1_COMP *const cpi = thread_data->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+
+  while (1) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pack_bs_sync->mutex_);
+#endif
+    const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pack_bs_sync->mutex_);
+#endif
+    if (tile_idx == -1) break;
+    TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+    av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+  }
+
+  return 1;
+}
+
+// Prepares thread data and workers of pack bitsteam multithreading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+                                    PackBSParams *const pack_bs_params,
+                                    AVxWorkerHook hook, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+    thread_data->cpi = cpi;
+    thread_data->start = i;
+    thread_data->thread_id = i;
+    av1_reset_pack_bs_thread_data(thread_data->td);
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = pack_bs_params;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+  const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+  pack_bs_sync->next_job_idx = 0;
+
+  PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+  // Reset tile order data of pack bitstream
+  av1_zero_array(pack_bs_tile_order, num_tiles);
+
+  // Populate pack bitstream tile order structure
+  for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    pack_bs_tile_order[tile_idx].abs_sum_level =
+        cpi->tile_data[tile_idx].abs_sum_level;
+    pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+  }
+
+  // Sort tiles in descending order based on tile area.
+  qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+        compare_tile_order);
+}
+
+// Accumulates data after pack bitsteam processing.
+static void accumulate_pack_bs_data(
+    AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+    uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+    int *const largest_tile_id, unsigned int *max_tile_size,
+    uint32_t *const obu_header_size, uint8_t **tile_data_start,
+    const int num_workers) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int tile_count = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  size_t curr_tg_data_size = 0;
+  int is_first_tg = 1;
+  uint8_t *curr_tg_start = dst;
+  size_t src_offset = 0;
+  size_t dst_offset = 0;
+
+  for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+    // PackBSParams stores all parameters required to pack tile and header
+    // info.
+    const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    uint32_t tile_size = 0;
+
+    if (pack_bs_params->new_tg) {
+      curr_tg_start = dst + *total_size;
+      curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+      *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+      *obu_header_size = pack_bs_params->obu_header_size;
+    }
+    curr_tg_data_size +=
+        pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+    if (pack_bs_params->buf.size > *max_tile_size) {
+      *largest_tile_id = tile_idx;
+      *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+    }
+    tile_size +=
+        (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+    // Pack all the chunks of tile bitstreams together
+    if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+    if (pack_bs_params->is_last_tile_in_tg)
+      av1_write_last_tile_info(
+          cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+          curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+          &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+    src_offset += pack_bs_params->tile_buf_size;
+    dst_offset += tile_size;
+    *total_size += tile_size;
+  }
+
+  // Accumulate thread data
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int idx = num_workers - 1; idx >= 0; idx--) {
+    ThreadData const *td = mt_info->tile_thr_data[idx].td;
+    av1_accumulate_pack_bs_thread_data(cpi, td);
+  }
+}
+
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  PackBSParams pack_bs_params[MAX_TILES];
+  uint32_t tile_size[MAX_TILES] = { 0 };
+
+  for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
+    pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];
+
+  init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+  prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+                          num_workers);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, &cpi->common, num_workers);
+  accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+                          largest_tile_id, max_tile_size, obu_header_size,
+                          tile_data_start, num_workers);
+}
+
+// Deallocate memory for CDEF search multi-thread synchronization.
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
+  (void)cdef_sync;
+  assert(cdef_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(cdef_sync->mutex_);
+    aom_free(cdef_sync->mutex_);
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Updates the row and column indices of the next job to be processed.
+// Also updates end_of_frame flag when the processing of all blocks is complete.
+static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
+  cdef_sync->fbc++;
+  if (cdef_sync->fbc == nhfb) {
+    cdef_sync->fbr++;
+    if (cdef_sync->fbr == nvfb) {
+      cdef_sync->end_of_frame = 1;
+    } else {
+      cdef_sync->fbc = 0;
+    }
+  }
+}
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif  // CONFIG_MULTITHREAD
+  cdef_sync->end_of_frame = 0;
+  cdef_sync->fbr = 0;
+  cdef_sync->fbc = 0;
+}
+
+// Checks if a job is available. If job is available,
+// populates next job information and returns 1, else returns 0.
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+                                        CdefSearchCtx *cdef_search_ctx,
+                                        int *cur_fbr, int *cur_fbc,
+                                        int *sb_count) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  int do_next_block = 0;
+  const int nvfb = cdef_search_ctx->nvfb;
+  const int nhfb = cdef_search_ctx->nhfb;
+
+  // If a block is skip, do not process the block and
+  // check the skip condition for the next block.
+  while ((!cdef_sync->end_of_frame) &&
+         (cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+                       cdef_sync->fbc))) {
+    update_next_job_info(cdef_sync, nvfb, nhfb);
+  }
+
+  // Populates information needed for current job and update the row,
+  // column indices of the next block to be processed.
+  if (cdef_sync->end_of_frame == 0) {
+    do_next_block = 1;
+    *cur_fbr = cdef_sync->fbr;
+    *cur_fbc = cdef_sync->fbc;
+    *sb_count = cdef_search_ctx->sb_count;
+    cdef_search_ctx->sb_count++;
+    update_next_job_info(cdef_sync, nvfb, nhfb);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  return do_next_block;
+}
+
+// Hook function for each thread in CDEF search multi-threading.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+  CdefSearchCtx *cdef_search_ctx = (CdefSearchCtx *)arg2;
+  int cur_fbr, cur_fbc, sb_count;
+  while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+                           &sb_count)) {
+    av1_cdef_mse_calc_block(cdef_search_ctx, cur_fbr, cur_fbc, sb_count);
+  }
+  return 1;
+}
+
+// Assigns CDEF search hook function and thread data to each worker.
+static void prepare_cdef_workers(MultiThreadInfo *mt_info,
+                                 CdefSearchCtx *cdef_search_ctx,
+                                 AVxWorkerHook hook, int num_workers) {
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    worker->hook = hook;
+    worker->data1 = &mt_info->cdef_sync;
+    worker->data2 = cdef_search_ctx;
+  }
+}
+
+// Implements multi-threading for CDEF search.
+void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
+                                CdefSearchCtx *cdef_search_ctx) {
+  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+  const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
+
+  cdef_reset_job_info(cdef_sync);
+  prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
+                       num_workers);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, cm, num_workers);
+}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
+  // For single-pass encode, using no. of workers as per tf block size was not
+  // found to improve speed. Hence the thread assignment for single-pass encode
+  // is kept based on compute_num_enc_workers().
+  if (cpi->oxcf.pass < AOM_RC_SECOND_PASS)
+    return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
+
+  const int frame_height = cpi->common.height;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int mb_height = block_size_high[block_size];
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  return AOMMIN(cpi->oxcf.max_threads, mb_rows);
+}
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop filter multi-threading.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for cdef multi-threading.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for pack bitstream multi-threading.
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+  if (cpi->oxcf.max_threads <= 1) return 1;
+  return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
+}
+
+int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
+  int num_mod_workers = 0;
+  switch (mod_name) {
+    case MOD_FP:
+      if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
+        num_mod_workers = 0;
+      else
+        num_mod_workers =
+            av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      break;
+    case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
+    case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
+    case MOD_GME: num_mod_workers = 1; break;
+    case MOD_ENC:
+      num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      break;
+    case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
+    case MOD_CDEF_SEARCH:
+      num_mod_workers = compute_num_cdef_workers(cpi);
+      break;
+    case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
+    case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+    case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
+    case MOD_FRAME_ENC:
+      num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
+      break;
+    default: assert(0); break;
+  }
+  return (num_mod_workers);
+}
+// Computes the number of workers for each MT modules in the encoder
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+  for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+    cpi->ppi->p_mt_info.num_mod_workers[i] =
+        compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
+}
diff --git a/media/libaom/src/av1/encoder/ethread.h b/media/libaom/src/av1/encoder/ethread.h
index 1830759504..b8497724d2 100644
--- a/media/libaom/src/av1/encoder/ethread.h
+++ b/media/libaom/src/av1/encoder/ethread.h
@@ -18,35 +18,111 @@ extern "C" {
 
 struct AV1_COMP;
 struct ThreadData;
-struct AV1RowMTSyncData;
 
 typedef struct EncWorkerData {
   struct AV1_COMP *cpi;
   struct ThreadData *td;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  struct ThreadData *original_td;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   int start;
   int thread_id;
 } EncWorkerData;
 
-void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c);
-void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
-                           const int cols);
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c);
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+                           int cols);
 
-void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                int r, int c);
-void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
-                                 int r, int c, const int cols);
-
-void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync);
-// Allocate memory for row based multi-threading synchronization.
-void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, struct AV1Common *cm,
-                               int rows);
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                int c);
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                 int c, int cols);
 
 void av1_encode_tiles_mt(struct AV1_COMP *cpi);
 void av1_encode_tiles_row_mt(struct AV1_COMP *cpi);
 
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi);
+
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi);
+#endif
+
 void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
                                  const struct FRAME_COUNTS *counts);
 
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+
+void av1_gm_dealloc(AV1GlobalMotionSync *gm_sync_data);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                    int r, int c);
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                     int r, int c, int cols);
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+                              int c);
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+                               int c, int cols);
+
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi);
+
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
+
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_tf_do_filtering_mt(AV1_COMP *cpi);
+
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
+
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(const AV1_COMP *cpi);
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
+
+void av1_init_cdef_worker(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi);
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
+#endif  // CONFIG_MULTITHREAD
+
+int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+                                      MULTI_THREADED_MODULES mod_name);
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
+
+void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
+                                CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
+
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start, const int num_workers);
+
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf);
+
+int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+                                 AV1_COMP_DATA *const first_cpi_data);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/extend.c b/media/libaom/src/av1/encoder/extend.c
index 934cf5644b..e1b1e69ca7 100644
--- a/media/libaom/src/av1/encoder/extend.c
+++ b/media/libaom/src/av1/encoder/extend.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <assert.h>
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
@@ -19,18 +21,24 @@
 static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
                                   uint8_t *dst, int dst_pitch, int w, int h,
                                   int extend_top, int extend_left,
-                                  int extend_bottom, int extend_right) {
+                                  int extend_bottom, int extend_right,
+                                  int chroma_step) {
   int i, linesize;
-
   // copy the left and right most columns out
   const uint8_t *src_ptr1 = src;
-  const uint8_t *src_ptr2 = src + w - 1;
+  const uint8_t *src_ptr2 = src + (w - 1) * chroma_step;
   uint8_t *dst_ptr1 = dst - extend_left;
   uint8_t *dst_ptr2 = dst + w;
 
   for (i = 0; i < h; i++) {
     memset(dst_ptr1, src_ptr1[0], extend_left);
-    memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    if (chroma_step == 1) {
+      memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    } else {
+      for (int j = 0; j < w; j++) {
+        dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j];
+      }
+    }
     memset(dst_ptr2, src_ptr2[0], extend_right);
     src_ptr1 += src_pitch;
     src_ptr2 += src_pitch;
@@ -45,6 +53,7 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
   dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
   dst_ptr2 = dst + dst_pitch * (h)-extend_left;
   linesize = extend_left + extend_right + w;
+  assert(linesize <= dst_pitch);
 
   for (i = 0; i < extend_top; i++) {
     memcpy(dst_ptr1, src_ptr1, linesize);
@@ -88,6 +97,7 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
   dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
   dst_ptr2 = dst + dst_pitch * (h)-extend_left;
   linesize = extend_left + extend_right + w;
+  assert(linesize <= dst_pitch);
 
   for (i = 0; i < extend_top; i++) {
     memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
@@ -111,8 +121,8 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const int eb_y = AOMMAX(src->y_height + dst->border,
                           ALIGN_POWER_OF_TWO(src->y_height, 6)) -
                    src->y_crop_height;
-  const int uv_width_subsampling = (src->uv_width != src->y_width);
-  const int uv_height_subsampling = (src->uv_height != src->y_height);
+  const int uv_width_subsampling = src->subsampling_x;
+  const int uv_height_subsampling = src->subsampling_y;
   const int et_uv = et_y >> uv_height_subsampling;
   const int el_uv = el_y >> uv_width_subsampling;
   const int eb_uv = eb_y >> uv_height_subsampling;
@@ -122,12 +132,10 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
     highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, src->y_crop_width,
                                  src->y_crop_height, et_y, el_y, eb_y, er_y);
-    if (src->u_buffer) {
+    if (!src->monochrome) {
       highbd_copy_and_extend_plane(
           src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
           src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-    }
-    if (src->v_buffer) {
       highbd_copy_and_extend_plane(
           src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
           src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
@@ -137,15 +145,19 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
 
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_crop_width, src->y_crop_height,
-                        et_y, el_y, eb_y, er_y);
-  if (src->u_buffer) {
+                        et_y, el_y, eb_y, er_y, 1);
+  if (!src->monochrome) {
+    // detect nv12 format
+    const int chroma_step = src->v_buffer ? 1 : 2;
+    const uint8_t *src_v_buffer =
+        src->v_buffer ? src->v_buffer : src->u_buffer + 1;
     copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
                           dst->uv_stride, src->uv_crop_width,
-                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-  }
-  if (src->v_buffer) {
-    copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+                          chroma_step);
+    copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer,
                           dst->uv_stride, src->uv_crop_width,
-                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+                          chroma_step);
   }
 }
diff --git a/media/libaom/src/av1/encoder/external_partition.c b/media/libaom/src/av1/encoder/external_partition.c
new file mode 100644
index 0000000000..79f8b4c8a4
--- /dev/null
+++ b/media/libaom/src/av1/encoder/external_partition.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/encoder/external_partition.h"
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  ext_part_controller->funcs = funcs;
+  ext_part_controller->config = config;
+  const aom_ext_part_status_t status = ext_part_controller->funcs.create_model(
+      ext_part_controller->funcs.priv, &ext_part_controller->config,
+      &ext_part_controller->model);
+  if (status == AOM_EXT_PART_ERROR) {
+    return AOM_CODEC_ERROR;
+  } else if (status == AOM_EXT_PART_TEST) {
+    ext_part_controller->test_mode = 1;
+    ext_part_controller->ready = 0;
+    return AOM_CODEC_OK;
+  }
+  assert(status == AOM_EXT_PART_OK);
+  ext_part_controller->ready = 1;
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  av1_zero(ext_part_controller);
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (ext_part_controller->ready) {
+    const aom_ext_part_status_t status =
+        ext_part_controller->funcs.delete_model(ext_part_controller->model);
+    if (status != AOM_EXT_PART_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return av1_ext_part_init(ext_part_controller);
+}
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(decision != NULL);
+  const aom_ext_part_status_t status =
+      ext_part_controller->funcs.get_partition_decision(
+          ext_part_controller->model, decision);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(stats != NULL);
+  const aom_ext_part_status_t status =
+      ext_part_controller->funcs.send_partition_stats(
+          ext_part_controller->model, stats);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(features != NULL);
+  const aom_ext_part_status_t status = ext_part_controller->funcs.send_features(
+      ext_part_controller->model, features);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller) {
+  return ext_part_controller->funcs.decision_mode;
+}
diff --git a/media/libaom/src/av1/encoder/external_partition.h b/media/libaom/src/av1/encoder/external_partition.h
new file mode 100644
index 0000000000..f74973e9eb
--- /dev/null
+++ b/media/libaom/src/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController {
+  int ready;
+  int test_mode;
+  aom_ext_part_config_t config;
+  aom_ext_part_model_t model;
+  aom_ext_part_funcs_t funcs;
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller);
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision);
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats);
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features);
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller);
+
+/*!\endcond */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/media/libaom/src/av1/encoder/firstpass.c b/media/libaom/src/av1/encoder/firstpass.c
index 0955510ca9..ff8ed9698d 100644
--- a/media/libaom/src/av1/encoder/firstpass.c
+++ b/media/libaom/src/av1/encoder/firstpass.c
@@ -20,13 +20,13 @@
 #include "aom_dsp/variance.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_scale/yv12config.h"
 
 #include "av1/common/entropymv.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"  // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
 #include "av1/common/txb_common.h"
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
@@ -37,6 +37,7 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
@@ -53,6 +54,8 @@
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
 static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
                                     struct aom_codec_pkt_list *pktlist) {
   struct aom_codec_cx_pkt pkt;
@@ -107,10 +110,13 @@ void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
   section->new_mv_count = 0.0;
   section->count = 0.0;
   section->duration = 1.0;
+  section->is_flash = 0;
+  section->noise_var = 0;
+  section->cor_coeff = 1.0;
 }
 
-static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section,
-                                        const FIRSTPASS_STATS *frame) {
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame) {
   section->frame += frame->frame;
   section->weight += frame->weight;
   section->intra_error += frame->intra_error;
@@ -136,9 +142,49 @@ static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section,
   section->duration += frame->duration;
 }
 
+static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) {
+  const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+  const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+  if (height_mi_log2 > mb_height_mi_log2) {
+    return mb_rows >> (height_mi_log2 - mb_height_mi_log2);
+  }
+
+  return mb_rows << (mb_height_mi_log2 - height_mi_log2);
+}
+
+static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) {
+  const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+  const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+  if (width_mi_log2 > mb_width_mi_log2) {
+    return mb_cols >> (width_mi_log2 - mb_width_mi_log2);
+  }
+
+  return mb_cols << (mb_width_mi_log2 - width_mi_log2);
+}
+
+// TODO(chengchen): can we simplify it even if resize has to be considered?
+static int get_num_mbs(const BLOCK_SIZE fp_block_size,
+                       const int num_mbs_16X16) {
+  const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+  const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+  const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+  const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+  // TODO(chengchen): Now this function assumes a square block is used.
+  // It does not support rectangular block sizes.
+  assert(width_mi_log2 == height_mi_log2);
+  if (width_mi_log2 > mb_width_mi_log2) {
+    return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) +
+                             (height_mi_log2 - mb_height_mi_log2));
+  }
+
+  return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) +
+                           (mb_height_mi_log2 - height_mi_log2));
+}
+
 void av1_end_first_pass(AV1_COMP *cpi) {
-  if (cpi->twopass.stats_buf_ctx->total_stats)
-    output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list);
+  if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled)
+    output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats,
+                 cpi->ppi->output_pkt_list);
 }
 
 static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@@ -203,9 +249,9 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
 
 // Refine the motion search range according to the frame dimension
 // for first pass test.
-static int get_search_range(const AV1_COMP *cpi) {
+static int get_search_range(const InitialDimensions *initial_dimensions) {
   int sr = 0;
-  const int dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+  const int dim = AOMMIN(initial_dimensions->width, initial_dimensions->height);
 
   while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
   return sr;
@@ -218,25 +264,30 @@ static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
   int tmp_err;
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
-  const int sr = get_search_range(cpi);
-  const int step_param = 3 + sr;
+  const int sr = get_search_range(&cpi->initial_dimensions);
+  const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
 
   const search_site_config *first_pass_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_FPF];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+  const int fine_search_interval =
+      cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
   FULLPEL_MOTION_SEARCH_PARAMS ms_params;
   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
-                                     first_pass_search_sites);
-  ms_params.search_method = NSTEP;
+                                     first_pass_search_sites,
+                                     fine_search_interval);
+  av1_set_mv_search_method(&ms_params, first_pass_search_sites, NSTEP);
 
   FULLPEL_MV this_best_mv;
   tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
                                   &this_best_mv, NULL);
 
   if (tmp_err < INT_MAX) {
-    tmp_err = av1_get_mvpred_sse(x, &this_best_mv, ref_mv, &v_fn_ptr) +
+    aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
+    const MSBuffers *ms_buffers = &ms_params.ms_buffers;
+    tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv,
+                                 &v_fn_ptr, ms_buffers->src, ms_buffers->ref) +
               new_mv_mode_penalty;
   }
 
@@ -247,18 +298,35 @@ static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
 }
 
 static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params,
-                            int mb_row, int mb_col) {
-  if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] <
-      mi_params->mi_cols) {
-    return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
-                   mi_params->mi_rows
-               ? BLOCK_16X16
-               : BLOCK_16X8;
+                            const BLOCK_SIZE fp_block_size, const int unit_row,
+                            const int unit_col) {
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_height = mi_size_high[fp_block_size];
+  const int is_half_width =
+      unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols;
+  const int is_half_height =
+      unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows;
+  const int max_dimension =
+      AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]);
+  int square_block_size = 0;
+  // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+  switch (max_dimension) {
+    case 4: square_block_size = 0; break;
+    case 8: square_block_size = 1; break;
+    case 16: square_block_size = 2; break;
+    case 32: square_block_size = 3; break;
+    case 64: square_block_size = 4; break;
+    case 128: square_block_size = 5; break;
+    default: assert(0 && "First pass block size is not supported!"); break;
+  }
+  if (is_half_width && is_half_height) {
+    return subsize_lookup[PARTITION_SPLIT][square_block_size];
+  } else if (is_half_width) {
+    return subsize_lookup[PARTITION_VERT][square_block_size];
+  } else if (is_half_height) {
+    return subsize_lookup[PARTITION_HORZ][square_block_size];
   } else {
-    return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
-                   mi_params->mi_rows
-               ? BLOCK_8X16
-               : BLOCK_8X8;
+    return fp_block_size;
   }
 }
 
@@ -289,55 +357,78 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
   return raw_err_stdev;
 }
 
-// This structure contains several key parameters to be accumulate for this
-// frame.
-typedef struct {
-  // Intra prediction error.
-  int64_t intra_error;
-  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
-  int64_t frame_avg_wavelet_energy;
-  // Best of intra pred error and inter pred error using last frame as ref.
-  int64_t coded_error;
-  // Best of intra pred error and inter pred error using golden frame as ref.
-  int64_t sr_coded_error;
-  // Best of intra pred error and inter pred error using altref frame as ref.
-  int64_t tr_coded_error;
-  // Count of motion vector.
-  int mv_count;
-  // Count of blocks that pick inter prediction (inter pred error is smaller
-  // than intra pred error).
-  int inter_count;
-  // Count of blocks that pick second ref (golden frame).
-  int second_ref_count;
-  // Count of blocks that pick third ref (altref frame).
-  int third_ref_count;
-  // Count of blocks where the inter and intra are very close and very low.
-  double neutral_count;
-  // Count of blocks where intra error is very small.
-  int intra_skip_count;
-  // Start row.
-  int image_data_start_row;
-  // Count of unique non-zero motion vectors.
-  int new_mv_count;
-  // Sum of inward motion vectors.
-  int sum_in_vectors;
-  // Sum of motion vector row.
-  int sum_mvr;
-  // Sum of motion vector column.
-  int sum_mvc;
-  // Sum of absolute value of motion vector row.
-  int sum_mvr_abs;
-  // Sum of absolute value of motion vector column.
-  int sum_mvc_abs;
-  // Sum of the square of motion vector row.
-  int64_t sum_mvrs;
-  // Sum of the square of motion vector column.
-  int64_t sum_mvcs;
-  // A factor calculated using intra pred error.
-  double intra_factor;
-  // A factor that measures brightness.
-  double brightness_factor;
-} FRAME_STATS;
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+  return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL;
+}
+typedef struct intra_pred_block_pass1_args {
+  const SequenceHeader *seq_params;
+  MACROBLOCK *x;
+} intra_pred_block_pass1_args;
+
+static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
+                             int sstride, int width, int height, int use_hbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride,
+                             CONVERT_TO_SHORTPTR(dst), dstride, width, height);
+  } else {
+    aom_convolve_copy(src, sstride, dst, dstride, width, height);
+  }
+#else
+  (void)use_hbd;
+  aom_convolve_copy(src, sstride, dst, dstride, width, height);
+#endif
+}
+
+static void first_pass_intra_pred_and_calc_diff(int plane, int block,
+                                                int blk_row, int blk_col,
+                                                BLOCK_SIZE plane_bsize,
+                                                TX_SIZE tx_size, void *arg) {
+  (void)block;
+  struct intra_pred_block_pass1_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const int dst_stride = pd->dst.stride;
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const SequenceHeader *seq_params = args->seq_params;
+  const int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+
+  av1_predict_intra_block(
+      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+      pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
+      src_stride, dst, dst_stride, blk_col, blk_row, plane);
+
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+}
+
+static void first_pass_predict_intra_block_for_luma_plane(
+    const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = AOM_PLANE_Y;
+  const MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  const int dst_stride = pd->dst.stride;
+  uint8_t *dst = pd->dst.buf;
+  const MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const int src_stride = p->src.stride;
+  const uint8_t *src = p->src.buf;
+
+  intra_pred_block_pass1_args args = { seq_params, x };
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);
+
+  // copy source data to recon buffer, as the recon buffer will be used as a
+  // reference frame subsequently.
+  copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize],
+            block_size_high[bsize], seq_params->use_highbitdepth);
+}
 
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
@@ -347,8 +438,8 @@ typedef struct {
 //   cpi: the encoder setting. Only a few params in it will be used.
 //   this_frame: the current frame buffer.
 //   tile: tile information (not used in first pass, already init to zero)
-//   mb_row: row index in the unit of first pass block size.
-//   mb_col: column index in the unit of first pass block size.
+//   unit_row: row index in the unit of first pass block size.
+//   unit_col: column index in the unit of first pass block size.
 //   y_offset: the offset of y frame buffer, indicating the starting point of
 //             the current block.
 //   uv_offset: the offset of u and v frame buffer, indicating the starting
@@ -366,47 +457,41 @@ typedef struct {
 // Returns:
 //   this_intra_error.
 static int firstpass_intra_prediction(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG *const this_frame,
-    const TileInfo *const tile, const int mb_row, const int mb_col,
+    AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame,
+    const TileInfo *const tile, const int unit_row, const int unit_col,
     const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
     const int qindex, FRAME_STATS *const stats) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
-  MACROBLOCK *const x = &cpi->td.mb;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int mb_scale = mi_size_wide[fp_block_size];
-  const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+  const int unit_scale = mi_size_wide[fp_block_size];
   const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col);
+  const BLOCK_SIZE bsize =
+      get_bsize(mi_params, fp_block_size, unit_row, unit_col);
 
-  aom_clear_system_state();
-  set_mi_offsets(mi_params, xd, mb_row * mb_scale, mb_col * mb_scale);
+  set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
   xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
   xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
   xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
-  xd->left_available = (mb_col != 0);
-  xd->mi[0]->sb_type = bsize;
+  xd->left_available = (unit_col != 0);
+  xd->mi[0]->bsize = bsize;
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-  set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize],
-                 mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows,
+  set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize],
+                 unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows,
                  mi_params->mi_cols);
   set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
   xd->mi[0]->segment_id = 0;
   xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
   xd->mi[0]->mode = DC_PRED;
-  xd->mi[0]->tx_size =
-      use_dc_pred ? (bsize >= fp_block_size ? TX_16X16 : TX_8X8) : TX_4X4;
+  xd->mi[0]->tx_size = TX_4X4;
 
-  av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+  if (cpi->sf.fp_sf.disable_recon)
+    first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+  else
+    av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
   int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
-
-  if (this_intra_error < UL_INTRA_THRESH) {
-    ++stats->intra_skip_count;
-  } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
-    stats->image_data_start_row = mb_row;
-  }
-
   if (seq_params->use_highbitdepth) {
     switch (seq_params->bit_depth) {
       case AOM_BITS_8: break;
@@ -420,7 +505,12 @@ static int firstpass_intra_prediction(
     }
   }
 
-  aom_clear_system_state();
+  if (this_intra_error < UL_INTRA_THRESH) {
+    ++stats->intra_skip_count;
+  } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+    stats->image_data_start_row = unit_row;
+  }
+
   double log_intra = log(this_intra_error + 1.0);
   if (log_intra < 10.0) {
     stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
@@ -434,6 +524,19 @@ static int firstpass_intra_prediction(
   } else {
     level_sample = x->plane[0].src.buf[0];
   }
+
+  if (seq_params->use_highbitdepth) {
+    switch (seq_params->bit_depth) {
+      case AOM_BITS_8: break;
+      case AOM_BITS_10: level_sample >>= 2; break;
+      case AOM_BITS_12: level_sample >>= 4; break;
+      default:
+        assert(0 &&
+               "seq_params->bit_depth should be AOM_BITS_8, "
+               "AOM_BITS_10 or AOM_BITS_12");
+        return -1;
+    }
+  }
   if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
     stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
   } else {
@@ -452,14 +555,22 @@ static int firstpass_intra_prediction(
   // Accumulate the intra error.
   stats->intra_error += (int64_t)this_intra_error;
 
-  const int hbd = is_cur_buf_hbd(xd);
-  const int stride = x->plane[0].src.stride;
-  uint8_t *buf = x->plane[0].src.buf;
-  for (int r8 = 0; r8 < 2; ++r8) {
-    for (int c8 = 0; c8 < 2; ++c8) {
-      stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
-          buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
-    }
+  // Stats based on wavelet energy is used in the following cases :
+  // 1. ML model which predicts if a flat structure (golden-frame only structure
+  // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+  // constant quality mode under certain conditions.
+  // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+  // Thus, wavelet energy calculation is enabled for the above cases.
+  if (calc_wavelet_energy(&cpi->oxcf)) {
+    const int hbd = is_cur_buf_hbd(xd);
+    const int stride = x->plane[0].src.stride;
+    const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+    const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+    const uint8_t *buf = x->plane[0].src.buf;
+    stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input(
+        buf, stride, hbd, num_8x8_rows, num_8x8_cols);
+  } else {
+    stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
   }
 
   return this_intra_error;
@@ -486,13 +597,13 @@ static int get_prediction_error_bitdepth(const int is_high_bitdepth,
 static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
                                 const int mb_row, const int mb_col,
                                 const int mb_rows, const int mb_cols,
-                                MV *last_mv, FRAME_STATS *stats) {
+                                MV *last_non_zero_mv, FRAME_STATS *stats) {
   if (is_zero_mv(&best_mv)) return;
 
   ++stats->mv_count;
   // Non-zero vector, was it different from the last non zero vector?
-  if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count;
-  *last_mv = best_mv;
+  if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count;
+  *last_non_zero_mv = best_mv;
 
   // Does the row vector point inwards or outwards?
   if (mb_row < mb_rows / 2) {
@@ -525,7 +636,6 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
   }
 }
 
-#define LOW_MOTION_ERROR_THRESH 25
 // Computes and returns the inter prediction error from the last frame.
 // Computes inter prediction errors from the golden and alt ref frams and
 // Updates stats accordingly.
@@ -533,21 +643,20 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
 //   cpi: the encoder setting. Only a few params in it will be used.
 //   last_frame: the frame buffer of the last frame.
 //   golden_frame: the frame buffer of the golden frame.
-//   alt_ref_frame: the frame buffer of the alt ref frame.
-//   mb_row: row index in the unit of first pass block size.
-//   mb_col: column index in the unit of first pass block size.
+//   unit_row: row index in the unit of first pass block size.
+//   unit_col: column index in the unit of first pass block size.
 //   recon_yoffset: the y offset of the reconstructed  frame buffer,
 //                  indicating the starting point of the current block.
 //   recont_uvoffset: the u/v offset of the reconstructed frame buffer,
 //                    indicating the starting point of the current block.
 //   src_yoffset: the y offset of the source frame buffer.
-//   alt_ref_frame_offset: the y offset of the alt ref frame buffer.
 //   fp_block_size: first pass block size.
 //   this_intra_error: the intra prediction error of this block.
 //   raw_motion_err_counts: the count of raw motion vectors.
 //   raw_motion_err_list: the array that records the raw motion error.
-//   best_ref_mv: best reference mv found so far.
-//   last_mv: last mv.
+//   ref_mv: the reference used to start the motion search
+//   best_mv: the best mv found
+//   last_non_zero_mv: the last non zero mv found in this tile row.
 //   stats: frame encoding stats.
 //  Modifies:
 //    raw_motion_err_list
@@ -557,33 +666,35 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
 //  Returns:
 //    this_inter_error
 static int firstpass_inter_prediction(
-    AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const last_frame,
-    const YV12_BUFFER_CONFIG *const golden_frame,
-    const YV12_BUFFER_CONFIG *const alt_ref_frame, const int mb_row,
-    const int mb_col, const int recon_yoffset, const int recon_uvoffset,
-    const int src_yoffset, const int alt_ref_frame_yoffset,
-    const BLOCK_SIZE fp_block_size, const int this_intra_error,
-    const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv,
-    MV *last_mv, FRAME_STATS *stats) {
+    AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
+    const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
+    const int unit_col, const int recon_yoffset, const int recon_uvoffset,
+    const int src_yoffset, const BLOCK_SIZE fp_block_size,
+    const int this_intra_error, const int raw_motion_err_counts,
+    int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+    MV *last_non_zero_mv, FRAME_STATS *stats) {
   int this_inter_error = this_intra_error;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
-  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int is_high_bitdepth = is_cur_buf_hbd(xd);
   const int bitdepth = xd->bd;
-  const int mb_scale = mi_size_wide[fp_block_size];
-  const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col);
+  const int unit_scale = mi_size_wide[fp_block_size];
+  const BLOCK_SIZE bsize =
+      get_bsize(mi_params, fp_block_size, unit_row, unit_col);
   const int fp_block_size_height = block_size_wide[fp_block_size];
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+  const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
   // Assume 0,0 motion with no mv overhead.
   FULLPEL_MV mv = kZeroFullMv;
-  FULLPEL_MV tmp_mv = kZeroFullMv;
   xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
   // Set up limit values for motion vectors to prevent them extending
   // outside the UMV borders.
-  av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << 2),
-                        (fp_block_size_height >> MI_SIZE_LOG2),
+  av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width,
+                        fp_block_size_height >> MI_SIZE_LOG2,
                         cpi->oxcf.border_in_pixels);
 
   int motion_error =
@@ -601,16 +712,17 @@ static int firstpass_inter_prediction(
       is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
       &unscaled_last_source_buf_2d);
   raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+  const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf;
 
-  // TODO(pengchong): Replace the hard-coded threshold
-  if (raw_motion_error > LOW_MOTION_ERROR_THRESH) {
+  if (raw_motion_error > fp_sf->skip_motion_search_threshold) {
     // Test last reference frame using the previous best mv as the
     // starting point (best reference) for the search.
-    first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+    first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error);
 
     // If the current best reference mv is not centered on 0,0 then do a
     // 0,0 based search as well.
-    if (!is_zero_mv(best_ref_mv)) {
+    if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) {
+      FULLPEL_MV tmp_mv = kZeroFullMv;
       int tmp_err = INT_MAX;
       first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
 
@@ -623,6 +735,7 @@ static int firstpass_inter_prediction(
     // Motion search in 2nd reference frame.
     int gf_motion_error = motion_error;
     if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+      FULLPEL_MV tmp_mv = kZeroFullMv;
       // Assume 0,0 motion with no mv overhead.
       xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
       xd->plane[0].pre[0].stride = golden_frame->y_stride;
@@ -646,48 +759,18 @@ static int firstpass_inter_prediction(
       stats->sr_coded_error += motion_error;
     }
 
-    // Motion search in 3rd reference frame.
-    int alt_motion_error = motion_error;
-    if (alt_ref_frame != NULL) {
-      xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset;
-      xd->plane[0].pre[0].stride = alt_ref_frame->y_stride;
-      alt_motion_error =
-          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
-                                        &x->plane[0].src, &xd->plane[0].pre[0]);
-      first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error);
-    }
-    if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error &&
-        alt_motion_error < this_intra_error) {
-      ++stats->third_ref_count;
-    }
-    // In accumulating a score for the 3rd reference frame take the
-    // best of the motion predicted score and the intra coded error
-    // (just as will be done for) accumulation of "coded_error" for
-    // the last frame.
-    if (alt_ref_frame != NULL) {
-      stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error);
-    } else {
-      // TODO(chengchen): I believe logically this should also be changed to
-      // stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error).
-      stats->tr_coded_error += motion_error;
-    }
-
     // Reset to last frame as reference buffer.
     xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
     xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
     xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
   } else {
     stats->sr_coded_error += motion_error;
-    stats->tr_coded_error += motion_error;
   }
 
   // Start by assuming that intra mode is best.
-  best_ref_mv->row = 0;
-  best_ref_mv->col = 0;
+  *best_mv = kZeroMv;
 
   if (motion_error <= this_intra_error) {
-    aom_clear_system_state();
-
     // Keep a count of cases where the inter and intra were very close
     // and very low. This helps with scene cut detection for example in
     // cropped clips with black bars at the sides or top and bottom.
@@ -702,32 +785,55 @@ static int firstpass_inter_prediction(
           (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
     }
 
-    const MV best_mv = get_mv_from_fullmv(&mv);
+    *best_mv = get_mv_from_fullmv(&mv);
     this_inter_error = motion_error;
     xd->mi[0]->mode = NEWMV;
-    xd->mi[0]->mv[0].as_mv = best_mv;
+    xd->mi[0]->mv[0].as_mv = *best_mv;
     xd->mi[0]->tx_size = TX_4X4;
     xd->mi[0]->ref_frame[0] = LAST_FRAME;
     xd->mi[0]->ref_frame[1] = NONE_FRAME;
-    av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale, mb_col * mb_scale,
-                                  NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y);
-    av1_encode_sby_pass1(cpi, x, bsize);
-    stats->sum_mvr += best_mv.row;
-    stats->sum_mvr_abs += abs(best_mv.row);
-    stats->sum_mvc += best_mv.col;
-    stats->sum_mvc_abs += abs(best_mv.col);
-    stats->sum_mvrs += best_mv.row * best_mv.row;
-    stats->sum_mvcs += best_mv.col * best_mv.col;
+
+    if (fp_sf->disable_recon == 0) {
+      av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+                                    unit_col * unit_scale, NULL, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      av1_encode_sby_pass1(cpi, x, bsize);
+    }
+    stats->sum_mvr += best_mv->row;
+    stats->sum_mvr_abs += abs(best_mv->row);
+    stats->sum_mvc += best_mv->col;
+    stats->sum_mvc_abs += abs(best_mv->col);
+    stats->sum_mvrs += best_mv->row * best_mv->row;
+    stats->sum_mvcs += best_mv->col * best_mv->col;
     ++stats->inter_count;
 
-    *best_ref_mv = best_mv;
-    accumulate_mv_stats(best_mv, mv, mb_row, mb_col, mi_params->mb_rows,
-                        mi_params->mb_cols, last_mv, stats);
+    accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+                        last_non_zero_mv, stats);
   }
 
   return this_inter_error;
 }
 
+// Normalize the first pass stats.
+// Error / counters are normalized to each MB.
+// MVs are normalized to the width/height of the frame.
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+                                      double num_mbs_16x16, double f_w,
+                                      double f_h) {
+  fps->coded_error /= num_mbs_16x16;
+  fps->sr_coded_error /= num_mbs_16x16;
+  fps->intra_error /= num_mbs_16x16;
+  fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+
+  fps->MVr /= f_h;
+  fps->mvr_abs /= f_h;
+  fps->MVc /= f_w;
+  fps->mvc_abs /= f_w;
+  fps->MVrv /= (f_h * f_h);
+  fps->MVcv /= (f_w * f_w);
+  fps->new_mv_count /= num_mbs_16x16;
+}
+
 // Updates the first pass stats of this frame.
 // Input:
 //   cpi: the encoder setting. Only a few params in it will be used.
@@ -746,8 +852,9 @@ static void update_firstpass_stats(AV1_COMP *cpi,
                                    const FRAME_STATS *const stats,
                                    const double raw_err_stdev,
                                    const int frame_number,
-                                   const int64_t ts_duration) {
-  TWO_PASS *twopass = &cpi->twopass;
+                                   const int64_t ts_duration,
+                                   const BLOCK_SIZE fp_block_size) {
+  TWO_PASS *twopass = &cpi->ppi->twopass;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
@@ -757,26 +864,31 @@ static void update_firstpass_stats(AV1_COMP *cpi,
   // where the typical "real" energy per MB also falls.
   // Initial estimate here uses sqrt(mbs) to define the min_err, where the
   // number of mbs is proportional to the image area.
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : mi_params->MBs;
+  const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                                ? cpi->initial_mbs
+                                : mi_params->MBs;
+  // Number of actual units used in the first pass, it can be other square
+  // block sizes than 16X16.
+  const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
   const double min_err = 200 * sqrt(num_mbs);
 
   fps.weight = stats->intra_factor * stats->brightness_factor;
   fps.frame = frame_number;
   fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
   fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
-  fps.tr_coded_error = (double)(stats->tr_coded_error >> 8) + min_err;
   fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
   fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
   fps.count = 1.0;
   fps.pcnt_inter = (double)stats->inter_count / num_mbs;
   fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
-  fps.pcnt_third_ref = (double)stats->third_ref_count / num_mbs;
   fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
   fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
   fps.inactive_zone_rows = (double)stats->image_data_start_row;
-  fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+  fps.inactive_zone_cols = (double)0;  // Placeholder: not currently supported.
   fps.raw_error_stdev = raw_err_stdev;
+  fps.is_flash = 0;
+  fps.noise_var = (double)0;
+  fps.cor_coeff = (double)1.0;
 
   if (stats->mv_count > 0) {
     fps.MVr = (double)stats->sum_mvr / stats->mv_count;
@@ -809,18 +921,25 @@ static void update_firstpass_stats(AV1_COMP *cpi,
   // cpi->source_time_stamp.
   fps.duration = (double)ts_duration;
 
+  normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height);
+
   // We will store the stats inside the persistent twopass struct (and NOT the
   // local variable 'fps'), and then cpi->output_pkt_list will point to it.
   *this_frame_stats = fps;
-  output_stats(this_frame_stats, cpi->output_pkt_list);
-  if (cpi->twopass.stats_buf_ctx->total_stats != NULL) {
-    accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps);
+  if (!cpi->ppi->lap_enabled) {
+    output_stats(this_frame_stats, cpi->ppi->output_pkt_list);
+  } else {
+    av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats);
+  }
+  if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) {
+    av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps);
   }
   /*In the case of two pass, first pass uses it as a circular buffer,
    * when LAP is enabled it is used as a linear buffer*/
   twopass->stats_buf_ctx->stats_in_end++;
-  if ((cpi->oxcf.pass == 1) && (twopass->stats_buf_ctx->stats_in_end >=
-                                twopass->stats_buf_ctx->stats_in_buf_end)) {
+  if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+      (twopass->stats_buf_ctx->stats_in_end >=
+       twopass->stats_buf_ctx->stats_in_buf_end)) {
     twopass->stats_buf_ctx->stats_in_end =
         twopass->stats_buf_ctx->stats_in_start;
   }
@@ -845,70 +964,330 @@ static void print_reconstruction_frame(
   fclose(recon_file);
 }
 
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+                                          int mb_cols) {
+  FRAME_STATS stats = { 0 };
+  int i, j;
+
+  stats.image_data_start_row = INVALID_ROW;
+  for (j = 0; j < mb_rows; j++) {
+    for (i = 0; i < mb_cols; i++) {
+      FRAME_STATS mb_stat = mb_stats[j * mb_cols + i];
+      stats.brightness_factor += mb_stat.brightness_factor;
+      stats.coded_error += mb_stat.coded_error;
+      stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy;
+      if (stats.image_data_start_row == INVALID_ROW &&
+          mb_stat.image_data_start_row != INVALID_ROW) {
+        stats.image_data_start_row = mb_stat.image_data_start_row;
+      }
+      stats.inter_count += mb_stat.inter_count;
+      stats.intra_error += mb_stat.intra_error;
+      stats.intra_factor += mb_stat.intra_factor;
+      stats.intra_skip_count += mb_stat.intra_skip_count;
+      stats.mv_count += mb_stat.mv_count;
+      stats.neutral_count += mb_stat.neutral_count;
+      stats.new_mv_count += mb_stat.new_mv_count;
+      stats.second_ref_count += mb_stat.second_ref_count;
+      stats.sr_coded_error += mb_stat.sr_coded_error;
+      stats.sum_in_vectors += mb_stat.sum_in_vectors;
+      stats.sum_mvc += mb_stat.sum_mvc;
+      stats.sum_mvc_abs += mb_stat.sum_mvc_abs;
+      stats.sum_mvcs += mb_stat.sum_mvcs;
+      stats.sum_mvr += mb_stat.sum_mvr;
+      stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
+      stats.sum_mvrs += mb_stat.sum_mvrs;
+    }
+  }
+  return stats;
+}
+
+static void setup_firstpass_data(AV1_COMMON *const cm,
+                                 FirstPassData *firstpass_data,
+                                 const int unit_rows, const int unit_cols) {
+  CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list,
+                  aom_calloc(unit_rows * unit_cols,
+                             sizeof(*firstpass_data->raw_motion_err_list)));
+  CHECK_MEM_ERROR(
+      cm, firstpass_data->mb_stats,
+      aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats)));
+  for (int j = 0; j < unit_rows; j++) {
+    for (int i = 0; i < unit_cols; i++) {
+      firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row =
+          INVALID_ROW;
+    }
+  }
+}
+
+static void free_firstpass_data(FirstPassData *firstpass_data) {
+  aom_free(firstpass_data->raw_motion_err_list);
+  aom_free(firstpass_data->mb_stats);
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  const int mi_rows = tile->mi_row_end - tile->mi_row_start;
+  const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2);
+
+  return unit_rows;
+}
+
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+  const int mi_cols = tile->mi_col_end - tile->mi_col_start;
+  const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2);
+
+  return unit_cols;
+}
+
 #define FIRST_PASS_ALT_REF_DISTANCE 16
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+                            TileDataEnc *tile_data,
+                            const BLOCK_SIZE fp_block_size) {
+  TileInfo *tile = &tile_data->tile_info;
+  const int unit_height = mi_size_high[fp_block_size];
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += unit_height) {
+    av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2,
+                       fp_block_size);
+  }
+}
+
+static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int num_planes = av1_num_planes(&cpi->common);
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int subsampling_xy =
+        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+              : 0;
+    const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+    CHECK_MEM_ERROR(
+        cm, cpi->td.mb.plane[plane].src_diff,
+        (int16_t *)aom_memalign(
+            32, sizeof(*cpi->td.mb.plane[plane].src_diff) * sb_size));
+  }
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
+    }
+  }
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cpi->td.mb.plane[plane].src_diff) {
+      aom_free(cpi->td.mb.plane[plane].src_diff);
+      cpi->td.mb.plane[plane].src_diff = NULL;
+    }
+  }
+}
+
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                        const int unit_row, const BLOCK_SIZE fp_block_size) {
+  MACROBLOCK *const x = &td->mb;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo *tile = &tile_data->tile_info;
+  const int qindex = find_fp_qindex(seq_params->bit_depth);
+  const int fp_block_size_width = block_size_high[fp_block_size];
+  const int fp_block_size_height = block_size_wide[fp_block_size];
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  const int unit_cols = mi_params->mb_cols * 4 / unit_width;
+  int raw_motion_err_counts = 0;
+  int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2);
+  int unit_col_start = tile->mi_col_start >> unit_width_log2;
+  int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+
+  const YV12_BUFFER_CONFIG *const last_frame =
+      get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const YV12_BUFFER_CONFIG *golden_frame =
+      get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+  PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+  FRAME_STATS *mb_stats =
+      cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start;
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+                             unit_row * unit_cols + unit_col_start;
+  MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+  for (int i = 0; i < num_planes; ++i) {
+    x->plane[i].coeff = ctx->coeff[i];
+    x->plane[i].qcoeff = ctx->qcoeff[i];
+    x->plane[i].eobs = ctx->eobs[i];
+    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+    x->plane[i].dqcoeff = ctx->dqcoeff[i];
+  }
+
+  const int src_y_stride = cpi->source->y_stride;
+  const int recon_y_stride = this_frame->y_stride;
+  const int recon_uv_stride = this_frame->uv_stride;
+  const int uv_mb_height =
+      fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+
+  MV best_ref_mv = kZeroMv;
+  MV last_mv;
+
+  // Reset above block coeffs.
+  xd->up_available = (unit_row_in_tile != 0);
+  int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) +
+                      (unit_col_start * fp_block_size_width);
+  int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) +
+                    (unit_col_start * fp_block_size_width);
+  int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) +
+                       (unit_col_start * uv_mb_height);
+
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  av1_set_mv_row_limits(
+      mi_params, &x->mv_limits, (unit_row << unit_height_log2),
+      (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels);
+
+  av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2,
+                       tile->mi_col_start, num_planes, fp_block_size);
+
+  // Fix - zero the 16x16 block first. This ensures correct this_intra_error for
+  // block sizes smaller than 16x16.
+  av1_zero_array(x->plane[0].src_diff, 256);
+
+  for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile;
+       unit_col_in_tile++) {
+    const int unit_col = unit_col_start + unit_col_in_tile;
+
+    enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
+
+    if (unit_col_in_tile == 0) {
+      last_mv = *first_top_mv;
+    }
+    int this_intra_error = firstpass_intra_prediction(
+        cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset,
+        recon_uvoffset, fp_block_size, qindex, mb_stats);
+
+    if (!frame_is_intra_only(cm)) {
+      const int this_inter_error = firstpass_inter_prediction(
+          cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset,
+          recon_uvoffset, src_yoffset, fp_block_size, this_intra_error,
+          raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv,
+          &last_mv, mb_stats);
+      if (unit_col_in_tile == 0) {
+        *first_top_mv = last_mv;
+      }
+      mb_stats->coded_error += this_inter_error;
+      ++raw_motion_err_counts;
+    } else {
+      mb_stats->sr_coded_error += this_intra_error;
+      mb_stats->coded_error += this_intra_error;
+    }
+
+    // Adjust to the next column of MBs.
+    x->plane[0].src.buf += fp_block_size_width;
+    x->plane[1].src.buf += uv_mb_height;
+    x->plane[2].src.buf += uv_mb_height;
+
+    recon_yoffset += fp_block_size_width;
+    src_yoffset += fp_block_size_width;
+    recon_uvoffset += uv_mb_height;
+    mb_stats++;
+
+    enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile,
+                               unit_cols_in_tile);
+  }
+}
+
+void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int unit_rows = get_unit_rows(BLOCK_16X16, mi_params->mb_rows);
+  const int unit_cols = get_unit_cols(BLOCK_16X16, mi_params->mb_cols);
+  setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+  FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+  FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+  free_firstpass_data(&cpi->firstpass_data);
+  update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number,
+                         ts_duration, BLOCK_16X16);
+}
+
 void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
   MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
-  MV last_mv = kZeroMv;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
+
   // Detect if the key frame is screen content type.
   if (frame_is_intra_only(cm)) {
     FeatureFlags *const features = &cm->features;
     av1_set_screen_content_options(cpi, features);
-    cpi->is_screen_content_type = features->allow_screen_content_tools;
   }
-  // First pass coding proceeds in raster scan order with unit size of 16x16.
-  const BLOCK_SIZE fp_block_size = BLOCK_16X16;
-  const int fp_block_size_width = block_size_high[fp_block_size];
-  const int fp_block_size_height = block_size_wide[fp_block_size];
-  int *raw_motion_err_list;
-  int raw_motion_err_counts = 0;
-  CHECK_MEM_ERROR(cm, raw_motion_err_list,
-                  aom_calloc(mi_params->mb_rows * mi_params->mb_cols,
-                             sizeof(*raw_motion_err_list)));
-  // Tiling is ignored in the first pass.
-  TileInfo tile;
-  av1_tile_init(&tile, cm, 0, 0);
-  FRAME_STATS stats = { 0 };
-  stats.image_data_start_row = INVALID_ROW;
+
+  // Prepare the speed features
+  av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+
+  // Unit size for the first pass encoding.
+  const BLOCK_SIZE fp_block_size =
+      get_fp_block_size(cpi->is_screen_content_type);
+
+  // Number of rows in the unit size.
+  // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16.
+  const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+  const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+
+  // Set fp_block_size, for the convenience of multi-thread usage.
+  cpi->fp_block_size = fp_block_size;
+
+  setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+  FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+  // multi threading info
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_row_mt_mem_dealloc(cpi);
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
 
   const YV12_BUFFER_CONFIG *const last_frame =
       get_ref_frame_yv12_buf(cm, LAST_FRAME);
   const YV12_BUFFER_CONFIG *golden_frame =
       get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-  const YV12_BUFFER_CONFIG *alt_ref_frame = NULL;
-  const int alt_ref_offset =
-      FIRST_PASS_ALT_REF_DISTANCE -
-      (current_frame->frame_number % FIRST_PASS_ALT_REF_DISTANCE);
-  if (alt_ref_offset < FIRST_PASS_ALT_REF_DISTANCE) {
-    const struct lookahead_entry *const alt_ref_frame_buffer =
-        av1_lookahead_peek(cpi->lookahead, alt_ref_offset,
-                           cpi->compressor_stage);
-    if (alt_ref_frame_buffer != NULL) {
-      alt_ref_frame = &alt_ref_frame_buffer->img;
-    }
-  }
   YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
   // First pass code requires valid last and new frame buffers.
   assert(this_frame != NULL);
   assert(frame_is_intra_only(cm) || (last_frame != NULL));
 
   av1_setup_frame_size(cpi);
-  aom_clear_system_state();
+  av1_set_mv_search_params(cpi);
 
   set_mi_offsets(mi_params, xd, 0, 0);
-  xd->mi[0]->sb_type = fp_block_size;
+  xd->mi[0]->bsize = fp_block_size;
 
   // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 
-  av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, qindex);
+  av1_set_quantizer(
+      cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+      cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
 
   av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);
@@ -927,104 +1306,55 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
   xd->cfl.store_y = 0;
   av1_frame_init_quantizer(cpi);
 
-  for (int i = 0; i < num_planes; ++i) {
-    x->plane[i].coeff = ctx->coeff[i];
-    x->plane[i].qcoeff = ctx->qcoeff[i];
-    x->plane[i].eobs = ctx->eobs[i];
-    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-    xd->plane[i].dqcoeff = ctx->dqcoeff[i];
-  }
-
+  av1_default_coef_probs(cm);
+  av1_init_mode_probs(cm->fc);
   av1_init_mv_probs(cm);
   av1_initialize_rd_consts(cpi);
 
-  const int src_y_stride = cpi->source->y_stride;
-  const int recon_y_stride = this_frame->y_stride;
-  const int recon_uv_stride = this_frame->uv_stride;
-  const int uv_mb_height =
-      fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+  enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+  enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
 
-  for (int mb_row = 0; mb_row < mi_params->mb_rows; ++mb_row) {
-    MV best_ref_mv = kZeroMv;
-
-    // Reset above block coeffs.
-    xd->up_available = (mb_row != 0);
-    int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height);
-    int src_yoffset = (mb_row * src_y_stride * fp_block_size_height);
-    int recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
-    int alt_ref_frame_yoffset =
-        (alt_ref_frame != NULL)
-            ? mb_row * alt_ref_frame->y_stride * fp_block_size_height
-            : -1;
-
-    // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders.
-    av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << 2),
-                          (fp_block_size_height >> MI_SIZE_LOG2),
-                          cpi->oxcf.border_in_pixels);
-
-    for (int mb_col = 0; mb_col < mi_params->mb_cols; ++mb_col) {
-      int this_intra_error = firstpass_intra_prediction(
-          cpi, this_frame, &tile, mb_row, mb_col, recon_yoffset, recon_uvoffset,
-          fp_block_size, qindex, &stats);
-
-      if (!frame_is_intra_only(cm)) {
-        const int this_inter_error = firstpass_inter_prediction(
-            cpi, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col,
-            recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset,
-            fp_block_size, this_intra_error, raw_motion_err_counts,
-            raw_motion_err_list, &best_ref_mv, &last_mv, &stats);
-        stats.coded_error += this_inter_error;
-        ++raw_motion_err_counts;
-      } else {
-        stats.sr_coded_error += this_intra_error;
-        stats.tr_coded_error += this_intra_error;
-        stats.coded_error += this_intra_error;
-      }
-
-      // Adjust to the next column of MBs.
-      x->plane[0].src.buf += fp_block_size_width;
-      x->plane[1].src.buf += uv_mb_height;
-      x->plane[2].src.buf += uv_mb_height;
-
-      recon_yoffset += fp_block_size_width;
-      src_yoffset += fp_block_size_width;
-      recon_uvoffset += uv_mb_height;
-      alt_ref_frame_yoffset += fp_block_size_width;
-    }
-    // Adjust to the next row of MBs.
-    x->plane[0].src.buf += fp_block_size_height * x->plane[0].src.stride -
-                           fp_block_size_width * mi_params->mb_cols;
-    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
-                           uv_mb_height * mi_params->mb_cols;
-    x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
-                           uv_mb_height * mi_params->mb_cols;
+  if (mt_info->num_workers > 1) {
+    enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+    enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+    av1_fp_encode_tiles_row_mt(cpi);
+  } else {
+    first_pass_tiles(cpi, fp_block_size);
   }
+
+  FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+  int total_raw_motion_err_count =
+      frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
   const double raw_err_stdev =
-      raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
-  aom_free(raw_motion_err_list);
+      raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+  free_firstpass_data(&cpi->firstpass_data);
 
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
-  if ((stats.image_data_start_row > mi_params->mb_rows / 2) ||
+  if ((stats.image_data_start_row > unit_rows / 2) ||
       (stats.image_data_start_row == INVALID_ROW)) {
-    stats.image_data_start_row = mi_params->mb_rows / 2;
+    stats.image_data_start_row = unit_rows / 2;
   }
   // Exclude any image dead zone
   if (stats.image_data_start_row > 0) {
     stats.intra_skip_count =
         AOMMAX(0, stats.intra_skip_count -
-                      (stats.image_data_start_row * mi_params->mb_cols * 2));
+                      (stats.image_data_start_row * unit_cols * 2));
   }
 
-  TWO_PASS *twopass = &cpi->twopass;
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : mi_params->MBs;
+  TWO_PASS *twopass = &cpi->ppi->twopass;
+  const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                                ? cpi->initial_mbs
+                                : mi_params->MBs;
+  // Number of actual units used in the first pass, it can be other square
+  // block sizes than 16X16.
+  const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
   stats.intra_factor = stats.intra_factor / (double)num_mbs;
   stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
   FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
   update_firstpass_stats(cpi, &stats, raw_err_stdev,
-                         current_frame->frame_number, ts_duration);
+                         current_frame->frame_number, ts_duration,
+                         fp_block_size);
 
   // Copy the previous Last Frame back into gf buffer if the prediction is good
   // enough... but also don't allow it to lag too far.
@@ -1063,3 +1393,121 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
 
   ++current_frame->frame_number;
 }
+
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size) {
+  assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0));
+  if (ext_stats_buf == NULL) {
+    firstpass_info->stats_buf = firstpass_info->static_stats_buf;
+    firstpass_info->stats_buf_size =
+        sizeof(firstpass_info->static_stats_buf) /
+        sizeof(firstpass_info->static_stats_buf[0]);
+    firstpass_info->start_index = 0;
+    firstpass_info->cur_index = 0;
+    firstpass_info->stats_count = 0;
+    firstpass_info->future_stats_count = 0;
+    firstpass_info->past_stats_count = 0;
+    av1_zero(firstpass_info->total_stats);
+    if (ext_stats_buf_size == 0) {
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    firstpass_info->stats_buf = ext_stats_buf;
+    firstpass_info->stats_buf_size = ext_stats_buf_size;
+    firstpass_info->start_index = 0;
+    firstpass_info->cur_index = 0;
+    firstpass_info->stats_count = firstpass_info->stats_buf_size;
+    firstpass_info->future_stats_count = firstpass_info->stats_count;
+    firstpass_info->past_stats_count = 0;
+    av1_zero(firstpass_info->total_stats);
+    for (int i = 0; i < firstpass_info->stats_count; ++i) {
+      av1_accumulate_stats(&firstpass_info->total_stats,
+                           &firstpass_info->stats_buf[i]);
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info) {
+  assert(firstpass_info->future_stats_count +
+             firstpass_info->past_stats_count ==
+         firstpass_info->stats_count);
+  if (firstpass_info->future_stats_count > 1) {
+    firstpass_info->cur_index =
+        (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size;
+    --firstpass_info->future_stats_count;
+    ++firstpass_info->past_stats_count;
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) {
+  if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) {
+    const int next_start =
+        (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size;
+    firstpass_info->start_index = next_start;
+    --firstpass_info->stats_count;
+    --firstpass_info->past_stats_count;
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info) {
+  aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info);
+  if (ret != AOM_CODEC_OK) return ret;
+  ret = av1_firstpass_info_pop(firstpass_info);
+  return ret;
+}
+
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats) {
+  if (firstpass_info->stats_count < firstpass_info->stats_buf_size) {
+    const int next_index =
+        (firstpass_info->start_index + firstpass_info->stats_count) %
+        firstpass_info->stats_buf_size;
+    firstpass_info->stats_buf[next_index] = *input_stats;
+    ++firstpass_info->stats_count;
+    ++firstpass_info->future_stats_count;
+    av1_accumulate_stats(&firstpass_info->total_stats, input_stats);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count &&
+      offset_from_cur < firstpass_info->future_stats_count) {
+    const int index = (firstpass_info->cur_index + offset_from_cur) %
+                      firstpass_info->stats_buf_size;
+    return &firstpass_info->stats_buf[index];
+  } else {
+    return NULL;
+  }
+}
+
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur) {
+  if (offset_from_cur < firstpass_info->future_stats_count) {
+    return firstpass_info->future_stats_count - offset_from_cur;
+  }
+  return 0;
+}
+
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count) {
+    return offset_from_cur + firstpass_info->past_stats_count;
+  }
+  return 0;
+}
diff --git a/media/libaom/src/av1/encoder/firstpass.h b/media/libaom/src/av1/encoder/firstpass.h
index 99d4445395..e9afdf507e 100644
--- a/media/libaom/src/av1/encoder/firstpass.h
+++ b/media/libaom/src/av1/encoder/firstpass.h
@@ -29,105 +29,372 @@ extern "C" {
 #define MIN_MV_IN_OUT 0.4
 
 #define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
 
+/*!
+ * \brief The stucture of acummulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
+ */
 typedef struct {
-  // Frame number in display order, if stats are for a single frame.
-  // No real meaning for a collection of frames.
+  /*!
+   * Frame number in display order, if stats are for a single frame.
+   * No real meaning for a collection of frames.
+   */
   double frame;
-  // Weight assigned to this frame (or total weight for the collection of
-  // frames) currently based on intra factor and brightness factor. This is used
-  // to distribute bits betweeen easier and harder frames.
+  /*!
+   * Weight assigned to this frame (or total weight for the collection of
+   * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits betweeen easier and harder frames.
+   */
   double weight;
-  // Intra prediction error.
+  /*!
+   * Intra prediction error.
+   */
   double intra_error;
-  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+  /*!
+   * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+   */
   double frame_avg_wavelet_energy;
-  // Best of intra pred error and inter pred error using last frame as ref.
+  /*!
+   * Best of intra pred error and inter pred error using last frame as ref.
+   */
   double coded_error;
-  // Best of intra pred error and inter pred error using golden frame as ref.
+  /*!
+   * Best of intra pred error and inter pred error using golden frame as ref.
+   */
   double sr_coded_error;
-  // Best of intra pred error and inter pred error using altref frame as ref.
-  double tr_coded_error;
-  // Percentage of blocks with inter pred error < intra pred error.
+  /*!
+   * Percentage of blocks with inter pred error < intra pred error.
+   */
   double pcnt_inter;
-  // Percentage of blocks using (inter prediction and) non-zero motion vectors.
+  /*!
+   * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+   */
   double pcnt_motion;
-  // Percentage of blocks where golden frame was better than last or intra:
-  // inter pred error using golden frame < inter pred error using last frame and
-  // inter pred error using golden frame < intra pred error
+  /*!
+   * Percentage of blocks where golden frame was better than last or intra:
+   * inter pred error using golden frame < inter pred error using last frame and
+   * inter pred error using golden frame < intra pred error
+   */
   double pcnt_second_ref;
-  // Percentage of blocks where altref frame was better than intra, last, golden
-  double pcnt_third_ref;
-  // Percentage of blocks where intra and inter prediction errors were very
-  // close. Note that this is a 'weighted count', that is, the so blocks may be
-  // weighted by how close the two errors were.
+  /*!
+   * Percentage of blocks where intra and inter prediction errors were very
+   * close. Note that this is a 'weighted count', that is, the so blocks may be
+   * weighted by how close the two errors were.
+   */
   double pcnt_neutral;
-  // Percentage of blocks that have almost no intra error residual
-  // (i.e. are in effect completely flat and untextured in the intra
-  // domain). In natural videos this is uncommon, but it is much more
-  // common in animations, graphics and screen content, so may be used
-  // as a signal to detect these types of content.
+  /*!
+   * Percentage of blocks that have almost no intra error residual
+   * (i.e. are in effect completely flat and untextured in the intra
+   * domain). In natural videos this is uncommon, but it is much more
+   * common in animations, graphics and screen content, so may be used
+   * as a signal to detect these types of content.
+   */
   double intra_skip_pct;
-  // Image mask rows top and bottom.
+  /*!
+   * Image mask rows top and bottom.
+   */
   double inactive_zone_rows;
-  // Image mask columns at left and right edges.
+  /*!
+   * Image mask columns at left and right edges.
+   */
   double inactive_zone_cols;
-  // Average of row motion vectors.
+  /*!
+   * Average of row motion vectors.
+   */
   double MVr;
-  // Mean of absolute value of row motion vectors.
+  /*!
+   * Mean of absolute value of row motion vectors.
+   */
   double mvr_abs;
-  // Mean of column motion vectors.
+  /*!
+   * Mean of column motion vectors.
+   */
   double MVc;
-  // Mean of absolute value of column motion vectors.
+  /*!
+   * Mean of absolute value of column motion vectors.
+   */
   double mvc_abs;
-  // Variance of row motion vectors.
+  /*!
+   * Variance of row motion vectors.
+   */
   double MVrv;
-  // Variance of column motion vectors.
+  /*!
+   * Variance of column motion vectors.
+   */
   double MVcv;
-  // Value in range [-1,1] indicating fraction of row and column motion vectors
-  // that point inwards (negative MV value) or outwards (positive MV value).
-  // For example, value of 1 indicates, all row/column MVs are inwards.
+  /*!
+   * Value in range [-1,1] indicating fraction of row and column motion vectors
+   * that point inwards (negative MV value) or outwards (positive MV value).
+   * For example, value of 1 indicates, all row/column MVs are inwards.
+   */
   double mv_in_out_count;
-  // Count of unique non-zero motion vectors.
+  /*!
+   * Count of unique non-zero motion vectors.
+   */
   double new_mv_count;
-  // Duration of the frame / collection of frames.
+  /*!
+   * Duration of the frame / collection of frames.
+   */
   double duration;
-  // 1.0 if stats are for a single frame, OR
-  // Number of frames in this collection for which the stats are accumulated.
+  /*!
+   * 1.0 if stats are for a single frame, OR
+   * Number of frames in this collection for which the stats are accumulated.
+   */
   double count;
-  // standard deviation for (0, 0) motion prediction error
+  /*!
+   * standard deviation for (0, 0) motion prediction error
+   */
   double raw_error_stdev;
+  /*!
+   * Whether the frame contains a flash
+   */
+  int64_t is_flash;
+  /*!
+   * Estimated noise variance
+   */
+  double noise_var;
+  /*!
+   * Correlation coefficient with the previous frame
+   */
+  double cor_coeff;
 } FIRSTPASS_STATS;
 
+// We want to keep one past stats for key frame detection
+// in test_candidate_kf()
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
+
+// The size of static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+  (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief  Data structure used for managing first pass stats
+ */
+typedef struct {
+  /*!
+   * A static buffer that will be used when no ext_stats_buf is assigned. The
+   * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+   * already has a pre-existing firstpass stats that is stored in an external
+   * buffer. The ext_stats_buf is usually used in two pass mode. When using one
+   * pass mode, we generate "firstpass" stats and encode the video in the same
+   * pass. In this scenario, the stats will be pushed and popped from
+   * static_stats_buf.
+   */
+  FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+  /*!
+   * A pointer to first pass stats.
+   * Note that this buffer will be used as ring buffer.
+   */
+  FIRSTPASS_STATS *stats_buf;
+  /*!
+   * size of stats_buf
+   */
+  int stats_buf_size;
+  /*!
+   * start index of the available frame stats
+   * Note that start_index doesn't always point to
+   * current frame's stats because we need to
+   * keep past stats as well. To access current
+   * frame's stats, please use cur_index.
+   */
+  int start_index;
+
+  /*!
+   * count available stats stored in stats_buf
+   * the following condition should stay true
+   * stats_count = future_stats_count + past_stats_count
+   */
+  int stats_count;
+
+  /*!
+   *  index of the current frame's stats
+   */
+  int cur_index;
+
+  /*!
+   * count available future stats including current stats
+   */
+  int future_stats_count;
+
+  /*!
+   * count available past stats EXCLUDING current stats
+   */
+  int past_stats_count;
+
+  /*!
+   * Accumulation of the stats being pushed into firstpass_info
+   */
+  FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during encoding
+ * process.
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \param[in]    ext_stats_buf       external stats buffer. Pass in NULL if
+ *                                   choose to use internal static_stats_buf.
+ * \param[in]    ext_stats_buf_size  external stats buffer size. Pass in 0 if
+ * choose to use internal static_stats_buf. \return status
+ */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out]  firstpass_info      struct of firstpass_info.
+ * \param[in]   input_stats         input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats from firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info      struct of firstpass_info.
+ * \param[in]  offset_from_cur  index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ *         stats_index_offset is invalid.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info    struct of firstpass_info.
+ * \param[in]  offset_from_cur  target stats's inffset
+ *                               from cur_index.
+ * \return Number of stats in the future after the target stats
+ *         including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info    struct of firstpass_info.
+ * \param[in]  offset_from_cur  target stats's index offset
+ *                               from cur_index.
+ * \return Number of stats in the past before the target stats
+ *         excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur);
+
+/*!\cond */
 #define FC_ANIMATION_THRESH 0.15
 enum {
   FC_NORMAL = 0,
   FC_GRAPHICS_ANIMATION = 1,
   FRAME_CONTENT_TYPES = 2
 } UENUM1BYTE(FRAME_CONTENT_TYPE);
+/*!\endcond */
 
-typedef struct {
-  unsigned char index;
+/*!
+ * \brief  Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
+typedef struct GF_GROUP {
+  /*!\cond */
+  // Frame update type, e.g. ARF/GF/LF/Overlay
   FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
   unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
   // The number of frames displayed so far within the GOP at a given coding
   // frame.
   unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
-  unsigned char frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH];
-  int ref_frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
-  int ref_frame_gop_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
-
-  // TODO(jingning): Unify the data structure used here after the new control
-  // mechanism is in place.
   int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
   int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
   int max_layer_depth;
   int max_layer_depth_allowed;
   // This is currently only populated for AOM_Q mode
-  unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH];
+  int q_val[MAX_STATIC_GF_GROUP_LENGTH];
   int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
-  int size;
+  // The frame coding type - inter/intra frame
+  FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+  // The reference frame buffer control - update or reset
+  REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_index;  // the index in the gf group of ARF, if no arf, then -1
+  int size;       // The total length of a GOP
+  // Indicates the level of parallelism in frame parallel encodes.
+  // 0 : frame is independently encoded (not part of parallel encodes).
+  // 1 : frame is the first in encode order in a given parallel encode set.
+  // 2 : frame occurs later in encode order in a given parallel encode set.
+  int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+  // Indicates whether a frame should act as non-reference frame.
+  // 0 : frame is a reference frame.
+  // 1 : frame is a non-reference frame.
+  int is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+
+  // The offset into lookahead_ctx for choosing
+  // source of frame parallel encodes.
+  int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Stores the display order hint of each frame in the current GF_GROUP.
+  int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  // Stores the display order hint of the frames not to be
+  // refreshed by the current frame.
+  int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+  // Stores the display order hint of the frame to be excluded during reference
+  // assignment.
+  int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  /*!\endcond */
 } GF_GROUP;
+/*!\cond */
+
+typedef struct {
+  // Track if the last frame in a GOP has higher quality.
+  int arf_gf_boost_lst;
+} GF_STATE;
 
 typedef struct {
   FIRSTPASS_STATS *stats_in_start;
@@ -137,31 +404,33 @@ typedef struct {
   FIRSTPASS_STATS *total_left_stats;
 } STATS_BUFFER_CTX;
 
+/*!\endcond */
+
+/*!
+ * \brief Two pass status and control data.
+ */
 typedef struct {
+  /*!\cond */
   unsigned int section_intra_rating;
   // Circular queue of first pass stats stored for most recent frames.
   // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
   // here.
   FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
   int frame_stats_next_idx;  // Index to next unused element in frame_stats_arr.
-  const FIRSTPASS_STATS *stats_in;
   STATS_BUFFER_CTX *stats_buf_ctx;
+  FIRSTPASS_INFO firstpass_info;  // This is the first pass data structure
+                                  // intended to replace stats_in
   int first_pass_done;
   int64_t bits_left;
   double modified_error_min;
   double modified_error_max;
   double modified_error_left;
-  double mb_av_energy;
-  double frame_avg_haar_energy;
-
-  // An indication of the content type of the current frame
-  FRAME_CONTENT_TYPE fr_content_type;
 
   // Projected total bits available for a key frame group of frames
   int64_t kf_group_bits;
 
   // Error score of frames still to be coded in kf group
-  int64_t kf_group_error_left;
+  double kf_group_error_left;
 
   // Over time correction for bits per macro block estimation
   double bpm_factor;
@@ -177,18 +446,136 @@ typedef struct {
   int extend_minq;
   int extend_maxq;
   int extend_minq_fast;
+  /*!\endcond */
 } TWO_PASS;
 
+/*!
+ * \brief Frame level Two pass status and control data.
+ */
+typedef struct {
+  /*!\cond */
+  const FIRSTPASS_STATS *stats_in;
+  // Pointer to the stats of the current frame.
+  const FIRSTPASS_STATS *this_frame;
+  double mb_av_energy;
+  // An indication of the content type of the current frame
+  FRAME_CONTENT_TYPE fr_content_type;
+  double frame_avg_haar_energy;
+  /*!\endcond */
+} TWO_PASS_FRAME;
+
+/*!\cond */
+
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+  // Intra prediction error.
+  int64_t intra_error;
+  // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+  int64_t frame_avg_wavelet_energy;
+  // Best of intra pred error and inter pred error using last frame as ref.
+  int64_t coded_error;
+  // Best of intra pred error and inter pred error using golden frame as ref.
+  int64_t sr_coded_error;
+  // Count of motion vector.
+  int mv_count;
+  // Count of blocks that pick inter prediction (inter pred error is smaller
+  // than intra pred error).
+  int inter_count;
+  // Count of blocks that pick second ref (golden frame).
+  int second_ref_count;
+  // Count of blocks where the inter and intra are very close and very low.
+  double neutral_count;
+  // Count of blocks where intra error is very small.
+  int intra_skip_count;
+  // Start row.
+  int image_data_start_row;
+  // Count of unique non-zero motion vectors.
+  int new_mv_count;
+  // Sum of inward motion vectors.
+  int sum_in_vectors;
+  // Sum of motion vector row.
+  int sum_mvr;
+  // Sum of motion vector column.
+  int sum_mvc;
+  // Sum of absolute value of motion vector row.
+  int sum_mvr_abs;
+  // Sum of absolute value of motion vector column.
+  int sum_mvc_abs;
+  // Sum of the square of motion vector row.
+  int64_t sum_mvrs;
+  // Sum of the square of motion vector column.
+  int64_t sum_mvcs;
+  // A factor calculated using intra pred error.
+  double intra_factor;
+  // A factor that measures brightness.
+  double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+  // Buffer holding frame stats for all MACROBLOCKs.
+  // mb_stats[i] stores the FRAME_STATS of the ith
+  // MB in raster scan order.
+  FRAME_STATS *mb_stats;
+  // Buffer to store the prediction error of the (0,0) motion
+  // vector using the last source frame as the reference.
+  // raw_motion_err_list[i] stores the raw_motion_err of
+  // the ith MB in raster scan order.
+  int *raw_motion_err_list;
+} FirstPassData;
+
 struct AV1_COMP;
 struct EncodeFrameParams;
 struct AV1EncoderConfig;
+struct TileDataEnc;
 
-void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
-void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+static INLINE int is_fp_wavelet_energy_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  assert(fp_stats != NULL);
+  return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+  return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size);
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size);
+
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                        struct TileDataEnc *tile_data, const int mb_row,
+                        const BLOCK_SIZE fp_block_size);
 void av1_end_first_pass(struct AV1_COMP *cpi);
 
 void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function is the first encoding pass for the two pass encoding mode.
+ * It encodes the whole video and collect essential information.
+ * Two pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high performance encoding. The first pass is a fast encoding
+ * process to collect essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats is used
+ * in rate control, for example, to determine frame cut, the position of
+ * alternative reference frame (ARF), etc.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    ts_duration    Duration of the frame / collection of frames
+ *
+ * \return Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
 
+void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/global_motion.c b/media/libaom/src/av1/encoder/global_motion.c
index 9623ec3018..7fa006087d 100644
--- a/media/libaom/src/av1/encoder/global_motion.c
+++ b/media/libaom/src/av1/encoder/global_motion.c
@@ -11,6 +11,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <memory.h>
 #include <math.h>
 #include <assert.h>
@@ -64,11 +65,9 @@ typedef struct {
   double *level_dy_buffer;
 } ImagePyramid;
 
-int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
-                                 int erroradv_type) {
-  assert(erroradv_type < GM_ERRORADV_TR_TYPES);
-  return best_erroradvantage < erroradv_tr[erroradv_type] &&
-         best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type];
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+  return best_erroradvantage < erroradv_tr &&
+         best_erroradvantage * params_cost < erroradv_prod_tr;
 }
 
 static void convert_to_params(const double *params, int32_t *model) {
@@ -155,7 +154,7 @@ static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
       wm->wmmat[4] = -wm->wmmat[3];
       wm->wmmat[5] = wm->wmmat[2];
       AOM_FALLTHROUGH_INTENDED;
-    case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break;
+    case AFFINE: break;
     default: assert(0);
   }
   wm->wmtype = wmtype;
@@ -376,9 +375,10 @@ unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) {
   return buf_8bit;
 }
 
-static void get_inliers_from_indices(MotionModel *params,
+static bool get_inliers_from_indices(MotionModel *params,
                                      int *correspondences) {
   int *inliers_tmp = (int *)aom_malloc(2 * MAX_CORNERS * sizeof(*inliers_tmp));
+  if (!inliers_tmp) return false;
   memset(inliers_tmp, 0, 2 * MAX_CORNERS * sizeof(*inliers_tmp));
 
   for (int i = 0; i < params->num_inliers; i++) {
@@ -388,6 +388,7 @@ static void get_inliers_from_indices(MotionModel *params,
   }
   memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
   aom_free(inliers_tmp);
+  return true;
 }
 
 #define FEAT_COUNT_TR 3
@@ -421,8 +422,8 @@ void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
 }
 
 static int compute_global_motion_feature_based(
-    TransformationType type, unsigned char *frm_buffer, int frm_width,
-    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    TransformationType type, unsigned char *src_buffer, int src_width,
+    int src_height, int src_stride, int *src_corners, int num_src_corners,
     YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
     MotionModel *params_by_motion, int num_motions) {
   int i;
@@ -443,10 +444,11 @@ static int compute_global_motion_feature_based(
 
   // find correspondences between the two images
   correspondences =
-      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+      (int *)malloc(num_src_corners * 4 * sizeof(*correspondences));
+  if (!correspondences) return 0;
   num_correspondences = av1_determine_correspondence(
-      frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
-      (int *)ref_corners, num_ref_corners, frm_width, frm_height, frm_stride,
+      src_buffer, (int *)src_corners, num_src_corners, ref_buffer,
+      (int *)ref_corners, num_ref_corners, src_width, src_height, src_stride,
       ref->y_stride, correspondences);
 
   ransac(correspondences, num_correspondences, num_inliers_by_motion,
@@ -457,8 +459,10 @@ static int compute_global_motion_feature_based(
     if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
         num_correspondences == 0) {
       num_inliers_by_motion[i] = 0;
-    } else {
-      get_inliers_from_indices(&params_by_motion[i], correspondences);
+    } else if (!get_inliers_from_indices(&params_by_motion[i],
+                                         correspondences)) {
+      free(correspondences);
+      return 0;
     }
   }
 
@@ -712,12 +716,17 @@ static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
 static ImagePyramid *alloc_pyramid(int width, int height, int pad_size,
                                    int compute_gradient) {
   ImagePyramid *pyr = aom_malloc(sizeof(*pyr));
+  if (!pyr) return NULL;
   pyr->has_gradient = compute_gradient;
   // 2 * width * height is the upper bound for a buffer that fits
   // all pyramid levels + padding for each level
   const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height +
                           (width + 2 * pad_size) * 2 * pad_size * N_LEVELS;
   pyr->level_buffer = aom_malloc(buffer_size);
+  if (!pyr->level_buffer) {
+    aom_free(pyr);
+    return NULL;
+  }
   memset(pyr->level_buffer, 0, buffer_size);
 
   if (compute_gradient) {
@@ -855,13 +864,18 @@ static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
 }
 
 // make sure flow_u and flow_v start at 0
-static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+static bool compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
                                double *flow_u, double *flow_v) {
   int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
   double *u_upscale =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
   double *v_upscale =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+  if (!(u_upscale && v_upscale)) {
+    aom_free(u_upscale);
+    aom_free(v_upscale);
+    return false;
+  }
 
   assert(frm_pyr->n_levels == ref_pyr->n_levels);
 
@@ -905,6 +919,7 @@ static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
   }
   aom_free(u_upscale);
   aom_free(v_upscale);
+  return true;
 }
 
 static int compute_global_motion_disflow_based(
@@ -941,40 +956,43 @@ static int compute_global_motion_disflow_based(
   int compute_gradient = 1;
   ImagePyramid *frm_pyr =
       alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient);
+  if (!frm_pyr) return 0;
   compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels,
                         pad_size, compute_gradient, frm_pyr);
   // Allocate ref image pyramids
   compute_gradient = 0;
   ImagePyramid *ref_pyr =
       alloc_pyramid(ref_width, ref_height, pad_size, compute_gradient);
+  if (!ref_pyr) {
+    free_pyramid(frm_pyr);
+    return 0;
+  }
   compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride,
                         n_levels, pad_size, compute_gradient, ref_pyr);
 
+  int ret = 0;
   double *flow_u =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
   double *flow_v =
       aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+  if (!(flow_u && flow_v)) goto Error;
 
   memset(flow_u, 0,
          frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
   memset(flow_v, 0,
          frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
 
-  compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v);
+  if (!compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v)) goto Error;
 
   // find correspondences between the two images using the flow field
   correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  if (!correspondences) goto Error;
   num_correspondences = determine_disflow_correspondence(
       frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height,
       frm_pyr->strides[0], correspondences);
   ransac(correspondences, num_correspondences, num_inliers_by_motion,
          params_by_motion, num_motions);
 
-  free_pyramid(frm_pyr);
-  free_pyramid(ref_pyr);
-  aom_free(correspondences);
-  aom_free(flow_u);
-  aom_free(flow_v);
   // Set num_inliers = 0 for motions with too few inliers so they are ignored.
   for (int i = 0; i < num_motions; ++i) {
     if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
@@ -984,15 +1002,25 @@ static int compute_global_motion_disflow_based(
 
   // Return true if any one of the motions has inliers.
   for (int i = 0; i < num_motions; ++i) {
-    if (num_inliers_by_motion[i] > 0) return 1;
+    if (num_inliers_by_motion[i] > 0) {
+      ret = 1;
+      break;
+    }
   }
-  return 0;
+
+  aom_free(correspondences);
+Error:
+  free_pyramid(frm_pyr);
+  free_pyramid(ref_pyr);
+  aom_free(flow_u);
+  aom_free(flow_v);
+  return ret;
 }
 
 int av1_compute_global_motion(TransformationType type,
-                              unsigned char *frm_buffer, int frm_width,
-                              int frm_height, int frm_stride, int *frm_corners,
-                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              unsigned char *src_buffer, int src_width,
+                              int src_height, int src_stride, int *src_corners,
+                              int num_src_corners, YV12_BUFFER_CONFIG *ref,
                               int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
@@ -1000,13 +1028,13 @@ int av1_compute_global_motion(TransformationType type,
   switch (gm_estimation_type) {
     case GLOBAL_MOTION_FEATURE_BASED:
       return compute_global_motion_feature_based(
-          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
-          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          type, src_buffer, src_width, src_height, src_stride, src_corners,
+          num_src_corners, ref, bit_depth, num_inliers_by_motion,
           params_by_motion, num_motions);
     case GLOBAL_MOTION_DISFLOW_BASED:
       return compute_global_motion_disflow_based(
-          type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners,
-          num_frm_corners, ref, bit_depth, num_inliers_by_motion,
+          type, src_buffer, src_width, src_height, src_stride, src_corners,
+          num_src_corners, ref, bit_depth, num_inliers_by_motion,
           params_by_motion, num_motions);
     default: assert(0 && "Unknown global motion estimation type");
   }
diff --git a/media/libaom/src/av1/encoder/global_motion.h b/media/libaom/src/av1/encoder/global_motion.h
index 0a6d0ecac2..a70bfa8eba 100644
--- a/media/libaom/src/av1/encoder/global_motion.h
+++ b/media/libaom/src/av1/encoder/global_motion.h
@@ -14,6 +14,8 @@
 
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
 #include "av1/common/mv.h"
 #include "av1/common/warped_motion.h"
 
@@ -24,6 +26,7 @@ extern "C" {
 #define MAX_CORNERS 4096
 #define RANSAC_NUM_MOTIONS 1
 #define GM_REFINEMENT_COUNT 5
+#define MAX_DIRECTIONS 2
 
 typedef enum {
   GLOBAL_MOTION_FEATURE_BASED,
@@ -38,16 +41,70 @@ typedef struct {
   int num_inliers;
 } MotionModel;
 
+// The structure holds a valid reference frame type and its temporal distance
+// from the source frame.
+typedef struct {
+  int distance;
+  MV_REFERENCE_FRAME frame;
+} FrameDistPair;
+
+typedef struct {
+  // Array of structure which holds the global motion parameters for a given
+  // motion model. params_by_motion[i] holds the parameters for a given motion
+  // model for the ith ransac motion.
+  MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+
+  // Pointer to hold inliers from motion model.
+  uint8_t *segment_map;
+} GlobalMotionThreadData;
+
+typedef struct {
+  // Holds the mapping of each thread to past/future direction.
+  // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1)
+  // assigned to the ith thread.
+  int8_t thread_id_to_dir[MAX_NUM_THREADS];
+
+  // A flag which holds the early exit status based on the speed feature
+  // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed
+  // feature based early exit happens in the direction 'i'.
+  int8_t early_exit[MAX_DIRECTIONS];
+
+  // Counter for the next reference frame to be processed.
+  // next_frame_to_process[i] will hold the count of next reference frame to be
+  // processed in the direction 'i'.
+  int8_t next_frame_to_process[MAX_DIRECTIONS];
+} JobInfo;
+
+typedef struct {
+  // Data related to assigning jobs for global motion multi-threading.
+  JobInfo job_info;
+
+  // Data specific to each worker in global motion multi-threading.
+  // thread_data[i] stores the thread specific data for worker 'i'.
+  GlobalMotionThreadData *thread_data;
+
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif
+
+  // Width and height for which segment_map is allocated for each thread.
+  int allocated_width;
+  int allocated_height;
+
+  // Number of workers for which thread_data is allocated.
+  int8_t allocated_workers;
+} AV1GlobalMotionSync;
+
 void av1_convert_model_to_params(const double *params,
                                  WarpedMotionParams *model);
 
 // TODO(sarahparker) These need to be retuned for speed 0 and 1 to
 // maximize gains from segmented error metric
-static const double erroradv_tr[] = { 0.65, 0.60, 0.65 };
-static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+static const double erroradv_tr = 0.65;
+static const double erroradv_prod_tr = 20000;
 
-int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
-                                 int erroradv_type);
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
 
 void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
                                           int height, int *inliers,
@@ -88,9 +145,9 @@ int64_t av1_refine_integerized_param(
   num_inliers entry is 0 should be ignored by the caller.
 */
 int av1_compute_global_motion(TransformationType type,
-                              unsigned char *frm_buffer, int frm_width,
-                              int frm_height, int frm_stride, int *frm_corners,
-                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              unsigned char *src_buffer, int src_width,
+                              int src_height, int src_stride, int *src_corners,
+                              int num_src_corners, YV12_BUFFER_CONFIG *ref,
                               int bit_depth,
                               GlobalMotionEstimationType gm_estimation_type,
                               int *num_inliers_by_motion,
diff --git a/media/libaom/src/av1/encoder/global_motion_facade.c b/media/libaom/src/av1/encoder/global_motion_facade.c
new file mode 100644
index 0000000000..4fe4411463
--- /dev/null
+++ b/media/libaom/src/av1/encoder/global_motion_facade.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_writer.h"
+
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/rdopt.h"
+
+// Highest motion model to search.
+#define GLOBAL_TRANS_TYPES_ENC 3
+
+// Computes the cost for the warp parameters.
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+                              const WarpedMotionParams *ref_gm, int allow_hp) {
+  int params_cost = 0;
+  int trans_bits, trans_prec_diff;
+  switch (gm->wmtype) {
+    case AFFINE:
+    case ROTZOOM:
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+          (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+          (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+      if (gm->wmtype >= AFFINE) {
+        params_cost += aom_count_signed_primitive_refsubexpfin(
+            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+            (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+        params_cost += aom_count_signed_primitive_refsubexpfin(
+            GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                (1 << GM_ALPHA_PREC_BITS),
+            (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+      }
+      AOM_FALLTHROUGH_INTENDED;
+    case TRANSLATION:
+      trans_bits = (gm->wmtype == TRANSLATION)
+                       ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+                       : GM_ABS_TRANS_BITS;
+      trans_prec_diff = (gm->wmtype == TRANSLATION)
+                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+                            : GM_TRANS_PREC_DIFF;
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[0] >> trans_prec_diff),
+          (gm->wmmat[0] >> trans_prec_diff));
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[1] >> trans_prec_diff),
+          (gm->wmmat[1] >> trans_prec_diff));
+      AOM_FALLTHROUGH_INTENDED;
+    case IDENTITY: break;
+    default: assert(0);
+  }
+  return (params_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Calculates the threshold to be used for warp error computation.
+static AOM_INLINE int64_t calc_erroradv_threshold(int64_t ref_frame_error) {
+  return (int64_t)(ref_frame_error * erroradv_tr + 0.5);
+}
+
+// For the given reference frame, computes the global motion parameters for
+// different motion models and finds the best.
+static AOM_INLINE void compute_global_motion_for_ref_frame(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h,
+    const WarpedMotionParams *ref_params) {
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const x = &td->mb;
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int i;
+  int src_width = cpi->source->y_width;
+  int src_height = cpi->source->y_height;
+  int src_stride = cpi->source->y_stride;
+  // clang-format off
+  static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+  };
+  // clang-format on
+  WarpedMotionParams tmp_wm_params;
+  const double *params_this_motion;
+  int inliers_by_motion[RANSAC_NUM_MOTIONS];
+  assert(ref_buf[frame] != NULL);
+  TransformationType model;
+
+  // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
+  const int do_adaptive_gm_estimation = 0;
+
+  const int ref_frame_dist = get_relative_dist(
+      &cm->seq_params->order_hint_info, cm->current_frame.order_hint,
+      cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
+  const GlobalMotionEstimationType gm_estimation_type =
+      cm->seq_params->order_hint_info.enable_order_hint &&
+              abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
+          ? GLOBAL_MOTION_DISFLOW_BASED
+          : GLOBAL_MOTION_FEATURE_BASED;
+  for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+    int64_t best_warp_error = INT64_MAX;
+    // Initially set all params to identity.
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      memcpy(params_by_motion[i].params, kIdentityParams,
+             (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params)));
+      params_by_motion[i].num_inliers = 0;
+    }
+
+    av1_compute_global_motion(model, src_buffer, src_width, src_height,
+                              src_stride, src_corners, num_src_corners,
+                              ref_buf[frame], cpi->common.seq_params->bit_depth,
+                              gm_estimation_type, inliers_by_motion,
+                              params_by_motion, RANSAC_NUM_MOTIONS);
+    int64_t ref_frame_error = 0;
+    for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+      if (inliers_by_motion[i] == 0) continue;
+
+      params_this_motion = params_by_motion[i].params;
+      av1_convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+      if (tmp_wm_params.wmtype != IDENTITY) {
+        av1_compute_feature_segmentation_map(
+            segment_map, segment_map_w, segment_map_h,
+            params_by_motion[i].inliers, params_by_motion[i].num_inliers);
+
+        ref_frame_error = av1_segmented_frame_error(
+            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+            ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width,
+            src_height, src_stride, segment_map, segment_map_w);
+
+        const int64_t erroradv_threshold =
+            calc_erroradv_threshold(ref_frame_error);
+
+        const int64_t warp_error = av1_refine_integerized_param(
+            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+            ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+            ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
+            cpi->source->y_buffer, src_width, src_height, src_stride,
+            GM_REFINEMENT_COUNT, best_warp_error, segment_map, segment_map_w,
+            erroradv_threshold);
+
+        if (warp_error < best_warp_error) {
+          best_warp_error = warp_error;
+          // Save the wm_params modified by
+          // av1_refine_integerized_param() rather than motion index to
+          // avoid rerunning refine() below.
+          memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+                 sizeof(WarpedMotionParams));
+        }
+      }
+    }
+    if (cm->global_motion[frame].wmtype <= AFFINE)
+      if (!av1_get_shear_params(&cm->global_motion[frame]))
+        cm->global_motion[frame] = default_warp_params;
+
+    if (cm->global_motion[frame].wmtype == TRANSLATION) {
+      cm->global_motion[frame].wmmat[0] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[0]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+      cm->global_motion[frame].wmmat[1] =
+          convert_to_trans_prec(cm->features.allow_high_precision_mv,
+                                cm->global_motion[frame].wmmat[1]) *
+          GM_TRANS_ONLY_DECODE_FACTOR;
+    }
+
+    if (cm->global_motion[frame].wmtype == IDENTITY) continue;
+
+    if (ref_frame_error == 0) continue;
+
+    // If the best error advantage found doesn't meet the threshold for
+    // this motion type, revert to IDENTITY.
+    if (!av1_is_enough_erroradvantage(
+            (double)best_warp_error / ref_frame_error,
+            gm_get_params_cost(&cm->global_motion[frame], ref_params,
+                               cm->features.allow_high_precision_mv))) {
+      cm->global_motion[frame] = default_warp_params;
+    }
+
+    if (cm->global_motion[frame].wmtype != IDENTITY) break;
+  }
+}
+
+// Computes global motion for the given reference frame.
+void av1_compute_gm_for_valid_ref_frames(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map, int segment_map_w,
+    int segment_map_h) {
+  AV1_COMMON *const cm = &cpi->common;
+  const WarpedMotionParams *ref_params =
+      cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                     : &default_warp_params;
+
+  compute_global_motion_for_ref_frame(
+      cpi, ref_buf, frame, num_src_corners, src_corners, src_buffer,
+      params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params);
+}
+
+// Loops over valid reference frames and computes global motion estimation.
+static AOM_INLINE void compute_global_motion_for_references(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map,
+    const int segment_map_w, const int segment_map_h) {
+  // Computation of frame corners for the source frame will be done already.
+  assert(num_src_corners != -1);
+  AV1_COMMON *const cm = &cpi->common;
+  // Compute global motion w.r.t. reference frames starting from the nearest ref
+  // frame in a given direction.
+  for (int frame = 0; frame < num_ref_frames; frame++) {
+    int ref_frame = reference_frame[frame].frame;
+    av1_compute_gm_for_valid_ref_frames(
+        cpi, ref_buf, ref_frame, num_src_corners, src_corners, src_buffer,
+        params_by_motion, segment_map, segment_map_w, segment_map_h);
+    // If global motion w.r.t. current ref frame is
+    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+    // the remaining ref frames in that direction. The below exit is disabled
+    // when ref frame distance w.r.t. current frame is zero. E.g.:
+    // source_alt_ref_frame w.r.t. ARF frames.
+    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+        reference_frame[frame].distance != 0 &&
+        cm->global_motion[ref_frame].wmtype != ROTZOOM)
+      break;
+  }
+}
+
+// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to
+// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise.
+static int compare_distance(const void *a, const void *b) {
+  const int diff =
+      ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
+  if (diff > 0)
+    return 1;
+  else if (diff < 0)
+    return -1;
+  return 0;
+}
+
+static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) {
+  int is_gm_present = 1;
+
+  // Check number of GM models only in GF groups with ARF frames. GM param
+  // estimation is always done in the case of GF groups with no ARF frames (flat
+  // gops)
+  if (cpi->ppi->gf_group.arf_index > -1) {
+    // valid_gm_model_found is initialized to INT32_MAX in the beginning of
+    // every GF group.
+    // Therefore, GM param estimation is always done for all frames until
+    // at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and LF_UPDATE are
+    // encoded in a GF group For subsequent frames, GM param estimation is
+    // disabled, if no valid models have been found in all the three update
+    // types.
+    is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) ||
+                    (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) ||
+                    (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0);
+  }
+  return !is_gm_present;
+}
+
+// Prunes reference frames for global motion estimation based on the speed
+// feature 'gm_search_type'.
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
+  (void)frame;
+  switch (sf->gm_sf.gm_search_type) {
+    case GM_FULL_SEARCH: return 1;
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
+      return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+      return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+               (frame == ALTREF2_FRAME));
+    case GM_DISABLE_SEARCH: return 0;
+    default: assert(0);
+  }
+  return 1;
+}
+
+// Populates valid reference frames in past/future directions in
+// 'reference_frames' and their count in 'num_ref_frames'.
+static AOM_INLINE void update_valid_ref_frames_for_gm(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1],
+    int *num_ref_frames) {
+  AV1_COMMON *const cm = &cpi->common;
+  int *num_past_ref_frames = &num_ref_frames[0];
+  int *num_future_ref_frames = &num_ref_frames[1];
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+      gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index);
+  int cur_frame_gm_disabled = 0;
+
+  if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) {
+    cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi);
+  }
+
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+    const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
+    RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+    const int ref_disabled =
+        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+    ref_buf[frame] = NULL;
+    cm->global_motion[frame] = default_warp_params;
+    // Skip global motion estimation for invalid ref frames
+    if (buf == NULL ||
+        (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
+      continue;
+    } else {
+      ref_buf[frame] = &buf->buf;
+    }
+
+    int prune_ref_frames =
+        ref_pruning_enabled &&
+        prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame,
+                                         cm->cur_frame->ref_display_order_hint);
+
+    if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+        ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+        do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames &&
+        !cur_frame_gm_disabled) {
+      assert(ref_buf[frame] != NULL);
+      const int relative_frame_dist = av1_encoder_get_relative_dist(
+          buf->display_order_hint, cm->cur_frame->display_order_hint);
+      // Populate past and future ref frames.
+      // reference_frames[0][] indicates past direction and
+      // reference_frames[1][] indicates future direction.
+      if (relative_frame_dist <= 0) {
+        reference_frames[0][*num_past_ref_frames].distance =
+            abs(relative_frame_dist);
+        reference_frames[0][*num_past_ref_frames].frame = frame;
+        (*num_past_ref_frames)++;
+      } else {
+        reference_frames[1][*num_future_ref_frames].distance =
+            abs(relative_frame_dist);
+        reference_frames[1][*num_future_ref_frames].frame = frame;
+        (*num_future_ref_frames)++;
+      }
+    }
+  }
+}
+
+// Deallocates segment_map and inliers.
+static AOM_INLINE void dealloc_global_motion_data(MotionModel *params_by_motion,
+                                                  uint8_t *segment_map) {
+  aom_free(segment_map);
+
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    aom_free(params_by_motion[m].inliers);
+  }
+}
+
+// Allocates and initializes memory for segment_map and MotionModel.
+static AOM_INLINE bool alloc_global_motion_data(MotionModel *params_by_motion,
+                                                uint8_t **segment_map,
+                                                const int segment_map_w,
+                                                const int segment_map_h) {
+  av1_zero_array(params_by_motion, RANSAC_NUM_MOTIONS);
+  for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+    params_by_motion[m].inliers =
+        aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
+    if (!params_by_motion[m].inliers) {
+      dealloc_global_motion_data(params_by_motion, NULL);
+      return false;
+    }
+  }
+
+  *segment_map = (uint8_t *)aom_calloc(segment_map_w * segment_map_h,
+                                       sizeof(*segment_map));
+  if (!*segment_map) {
+    dealloc_global_motion_data(params_by_motion, NULL);
+    return false;
+  }
+  return true;
+}
+
+// Initializes parameters used for computing global motion.
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  YV12_BUFFER_CONFIG *source = cpi->source;
+
+  gm_info->src_buffer = source->y_buffer;
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The source buffer is 16-bit, so we need to convert to 8 bits for the
+    // following code. We cache the result until the source frame is released.
+    gm_info->src_buffer =
+        av1_downconvert_frame(source, cpi->common.seq_params->bit_depth);
+  }
+
+  gm_info->segment_map_w =
+      (source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+  gm_info->segment_map_h =
+      (source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
+
+  memset(gm_info->reference_frames, -1,
+         sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS *
+             (REF_FRAMES - 1));
+  av1_zero(gm_info->num_ref_frames);
+
+  // Populate ref_buf for valid ref frames in global motion
+  update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf,
+                                 gm_info->reference_frames,
+                                 gm_info->num_ref_frames);
+
+  // Sort the past and future ref frames in the ascending order of their
+  // distance from the current frame. reference_frames[0] => past direction
+  // and reference_frames[1] => future direction.
+  qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0],
+        sizeof(gm_info->reference_frames[0][0]), compare_distance);
+  qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1],
+        sizeof(gm_info->reference_frames[1][0]), compare_distance);
+
+  gm_info->num_src_corners = -1;
+  // If at least one valid reference frame exists in past/future directions,
+  // compute interest points of source frame using FAST features.
+  if (gm_info->num_ref_frames[0] > 0 || gm_info->num_ref_frames[1] > 0) {
+    gm_info->num_src_corners = av1_fast_corner_detect(
+        gm_info->src_buffer, source->y_width, source->y_height,
+        source->y_stride, gm_info->src_corners, MAX_CORNERS);
+  }
+}
+
+// Computes global motion w.r.t. valid reference frames.
+static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+  uint8_t *segment_map = NULL;
+
+  alloc_global_motion_data(params_by_motion, &segment_map,
+                           gm_info->segment_map_w, gm_info->segment_map_h);
+
+  // Compute global motion w.r.t. past reference frames and future reference
+  // frames
+  for (int dir = 0; dir < MAX_DIRECTIONS; dir++) {
+    if (gm_info->num_ref_frames[dir] > 0)
+      compute_global_motion_for_references(
+          cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
+          gm_info->num_ref_frames[dir], gm_info->num_src_corners,
+          gm_info->src_corners, gm_info->src_buffer, params_by_motion,
+          segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
+  }
+
+  dealloc_global_motion_data(params_by_motion, segment_map);
+}
+
+// Global motion estimation for the current frame is computed.This computation
+// happens once per frame and the winner motion model parameters are stored in
+// cm->cur_frame->global_motion.
+void av1_compute_global_motion_facade(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+
+  if (cpi->oxcf.tool_cfg.enable_global_motion) {
+    if (cpi->gf_frame_index == 0) {
+      for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+        cpi->ppi->valid_gm_model_found[i] = INT32_MAX;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+          cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX;
+#endif
+      }
+    }
+  }
+
+  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+      cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done) {
+    setup_global_motion_info_params(cpi);
+    if (cpi->mt_info.num_workers > 1)
+      av1_global_motion_estimation_mt(cpi);
+    else
+      global_motion_estimation(cpi);
+    gm_info->search_done = 1;
+  }
+  memcpy(cm->cur_frame->global_motion, cm->global_motion,
+         sizeof(cm->cur_frame->global_motion));
+}
diff --git a/media/libaom/src/av1/encoder/global_motion_facade.h b/media/libaom/src/av1/encoder/global_motion_facade.h
new file mode 100644
index 0000000000..52df19d42f
--- /dev/null
+++ b/media/libaom/src/av1/encoder/global_motion_facade.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+void av1_compute_gm_for_valid_ref_frames(
+    struct AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    int num_src_corners, int *src_corners, unsigned char *src_buffer,
+    MotionModel *params_by_motion, uint8_t *segment_map, int segment_map_w,
+    int segment_map_h);
+void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
diff --git a/media/libaom/src/av1/encoder/gop_structure.c b/media/libaom/src/av1/encoder/gop_structure.c
index 1ed71a0f99..7b1380d8ad 100644
--- a/media/libaom/src/av1/encoder/gop_structure.c
+++ b/media/libaom/src/av1/encoder/gop_structure.c
@@ -11,301 +11,885 @@
 
 #include <stdint.h>
 
+#include "av1/common/blockd.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+
+// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based
+// on the value of parallel_frame_count.
+static void set_frame_parallel_level(int *frame_parallel_level,
+                                     int *parallel_frame_count,
+                                     int max_parallel_frames) {
+  assert(*parallel_frame_count > 0);
+  // parallel_frame_count > 1 indicates subsequent frame(s) in the current
+  // parallel encode set.
+  *frame_parallel_level = 1 + (*parallel_frame_count > 1);
+  // Update the count of no. of parallel frames.
+  (*parallel_frame_count)++;
+  if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1;
+}
+
+// This function sets gf_group->src_offset based on frame_parallel_level.
+// Outputs are gf_group->src_offset and first_frame_index
+static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index,
+                           int cur_frame_idx, int frame_ind) {
+  if (gf_group->frame_parallel_level[frame_ind] > 0) {
+    if (gf_group->frame_parallel_level[frame_ind] == 1) {
+      *first_frame_index = cur_frame_idx;
+    }
+
+    // Obtain the offset of the frame at frame_ind in the lookahead queue by
+    // subtracting the display order hints of the current frame from the display
+    // order hint of the first frame in parallel encoding set (at
+    // first_frame_index).
+    gf_group->src_offset[frame_ind] =
+        (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) -
+        *first_frame_index;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+// Sets the GF_GROUP params for LF_UPDATE frames.
+static AOM_INLINE void set_params_for_leaf_frames(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
+    int layer_depth, int start, int end) {
+  gf_group->update_type[*frame_ind] = LF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = 0;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
+  gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                         end - start, 0, NULL, NULL, 0);
+  ++(*cur_disp_index);
+
+  // Set the level of parallelism for the LF_UPDATE frame.
+  if (do_frame_parallel_encode) {
+    set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+                             parallel_frame_count, max_parallel_frames);
+    // Set LF_UPDATE frames as non-reference frames.
+    gf_group->is_frame_non_ref[*frame_ind] = 1;
+  }
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
+static AOM_INLINE void set_params_for_intnl_overlay_frames(
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *first_frame_index, int *cur_disp_index, int layer_depth) {
+  gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = 0;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+  ++(*cur_disp_index);
+
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
+static AOM_INLINE void set_params_for_internal_arfs(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
+    int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+    int f_frames, int b_frames) {
+  gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] =
+      (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+                         f_frames, b_frames, NULL, NULL, 0);
+
+  if (do_frame_parallel_encode) {
+    if (depth_thr != INT_MAX) {
+      assert(depth_thr == 3 || depth_thr == 4);
+      assert(IMPLIES(depth_thr == 3, layer_depth == 4));
+      assert(IMPLIES(depth_thr == 4, layer_depth == 5));
+      // Set frame_parallel_level of the first frame in the given layer to 1.
+      if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+        gf_group->frame_parallel_level[*frame_ind] = 1;
+      } else {
+        // Set frame_parallel_level of the consecutive frame in the same given
+        // layer to 2.
+        assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+        gf_group->frame_parallel_level[*frame_ind] = 2;
+        // Store the display order hints of the past 2 INTNL_ARF_UPDATE
+        // frames which would not have been displayed at the time of the encode
+        // of current frame.
+        gf_group->skip_frame_refresh[*frame_ind][0] =
+            gf_group->display_idx[(*frame_ind) - 1];
+        gf_group->skip_frame_refresh[*frame_ind][1] =
+            gf_group->display_idx[(*frame_ind) - 2];
+        // Set the display_idx of frame_parallel_level 1 frame in
+        // gf_group->skip_frame_as_ref.
+        gf_group->skip_frame_as_ref[*frame_ind] =
+            gf_group->display_idx[(*frame_ind) - 1];
+      }
+    }
+    // If max_parallel_frames is not exceeded and if the frame will not be
+    // temporally filtered, encode the next internal ARF frame in parallel.
+    if (*parallel_frame_count > 1 &&
+        *parallel_frame_count <= max_parallel_frames) {
+      if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+        gf_group->frame_parallel_level[*frame_ind] = 2;
+      *parallel_frame_count = 1;
+    }
+  }
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+  ++(*frame_ind);
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params_for_fp(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+    int max_parallel_frames, int do_frame_parallel_encode,
+    int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
+  const int num_frames_to_process = end - start;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'l' and 'r' to create one more level.
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (start < end) {
+      set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+                                 gf_group, cur_frame_idx, frame_ind,
+                                 parallel_frame_count, max_parallel_frames,
+                                 do_frame_parallel_encode, first_frame_index,
+                                 cur_disp_idx, layer_depth, start, end);
+      ++start;
+    }
+  } else {
+    const int m = (start + end - 1) / 2;
+
+    // Internal ARF.
+    int arf_src_offset = m - start;
+    set_params_for_internal_arfs(
+        twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+        frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+        layer_depth, arf_src_offset, m, end - m, m - start);
+
+    // If encode reordering is enabled, configure the multi-layers accordingly
+    // and return. For e.g., the encode order for gf-interval 16 after
+    // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10->
+    // 14-> 9-> 11-> 13-> 15.
+    if (layer_depth >= depth_thr) {
+      int m1 = (m + start - 1) / 2;
+      int m2 = (m + 1 + end) / 2;
+      int arf_src_offsets[2] = { m1 - start, m2 - start };
+      // Parameters to compute arf_boost.
+      int offset[2] = { m1, m2 };
+      int f_frames[2] = { m - m1, end - m2 };
+      int b_frames[2] = { m1 - start, m2 - (m + 1) };
+
+      // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
+      for (int i = 0; i < 2; i++) {
+        set_params_for_internal_arfs(
+            twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+            frame_ind, parallel_frame_count, max_parallel_frames,
+            do_frame_parallel_encode, first_frame_index, depth_thr,
+            cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+            f_frames[i], b_frames[i]);
+      }
+
+      // Initialize the start and end indices to configure LF_UPDATE frames.
+      int start_idx[4] = { start, m1 + 1, m + 1, end - 1 };
+      int end_idx[4] = { m1, m, m2, end };
+      int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth,
+                                               layer_depth + 1, INVALID_IDX };
+
+      // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE
+      // frames after reordering.
+      for (int i = 0; i < 4; i++) {
+        set_multi_layer_params_for_fp(
+            twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+            start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+            parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+            first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
+        if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
+          set_params_for_intnl_overlay_frames(
+              gf_group, cur_frame_idx, frame_ind, first_frame_index,
+              cur_disp_idx, layer_depth_for_intnl_overlay[i]);
+      }
+      return;
+    }
+
+    // Frames displayed before this internal ARF.
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
+
+    // Overlay for internal ARF.
+    set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+                                        first_frame_index, cur_disp_idx,
+                                        layer_depth);
+
+    // Frames displayed after this internal ARF.
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
+  }
+}
+
+// Structure for bookkeeping start, end and display indices to configure
+// INTNL_ARF_UPDATE frames.
+typedef struct {
+  int start;
+  int end;
+  int display_index;
+} FRAME_REORDER_INFO;
+
+// Updates the stats required to configure the GF_GROUP.
+static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats,
+                                            int arf_frame_index,
+                                            int display_idx, int start,
+                                            int end) {
+  arf_frame_stats[arf_frame_index].start = start;
+  arf_frame_stats[arf_frame_index].end = end;
+  arf_frame_stats[arf_frame_index].display_index = display_idx;
+}
+
+// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates
+// doh_gf_index_map and arf_frame_stats.
+static AOM_INLINE void set_params_for_internal_arfs_in_gf14(
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+    int *count_arf_frames, int *doh_gf_index_map, int start, int end,
+    int layer_depth, int layer_with_parallel_encodes) {
+  int index = (start + end - 1) / 2;
+  gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = index - 1;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] =
+      (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+
+  // Update the display index of the current frame with its gf index.
+  doh_gf_index_map[index] = *frame_ind;
+  if (layer_with_parallel_encodes) {
+    assert(layer_depth == 4);
+    // Set frame_parallel_level of the first frame in the given layer depth
+    // to 1.
+    if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+      gf_group->frame_parallel_level[*frame_ind] = 1;
+    } else {
+      // Set frame_parallel_level of the consecutive frame in the same given
+      // layer depth to 2.
+      assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+      gf_group->frame_parallel_level[*frame_ind] = 2;
+      // Set the display_idx of frame_parallel_level 1 frame in
+      // gf_group->skip_frame_as_ref.
+      gf_group->skip_frame_as_ref[*frame_ind] =
+          gf_group->display_idx[(*frame_ind) - 1];
+    }
+  }
+  ++(*frame_ind);
+
+  // Update arf_frame_stats.
+  fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end);
+  ++(*count_arf_frames);
+}
+
+// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer
+// dpeth.
+static AOM_INLINE void set_params_for_cur_layer_frames(
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+    int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start,
+    int node_end, int layer_depth) {
+  assert(num_dir < 3);
+  int start, end;
+  // Iterate through the nodes in the previous layer depth.
+  for (int i = node_start; i < node_end; i++) {
+    // For each node, check if a frame can be coded as INTNL_ARF_UPDATE frame on
+    // either direction.
+    for (int dir = 0; dir < num_dir; dir++) {
+      // Checks for a frame to the left of current node.
+      if (dir == 0) {
+        start = arf_frame_stats[i].start;
+        end = arf_frame_stats[i].display_index;
+      } else {
+        // Checks for a frame to the right of current node.
+        start = arf_frame_stats[i].display_index + 1;
+        end = arf_frame_stats[i].end;
+      }
+      const int num_frames_to_process = end - start;
+      // Checks if a frame can be coded as INTNL_ARF_UPDATE frame. If
+      // num_frames_to_process is less than 3, then there are not enough frames
+      // between 'start' and 'end' to create another level.
+      if (num_frames_to_process >= 3) {
+        // Flag to indicate the lower layer depths for which parallel encoding
+        // is enabled. Currently enabled for layer 4 frames.
+        int layer_with_parallel_encodes = layer_depth == 4;
+        set_params_for_internal_arfs_in_gf14(
+            gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind,
+            count_arf_frames, doh_gf_index_map, start, end, layer_depth,
+            layer_with_parallel_encodes);
+      }
+    }
+  }
+}
+
+// Configures multi-layers of the GF_GROUP when consecutive encode of frames in
+// the same layer depth is enbaled.
+static AOM_INLINE void set_multi_layer_params_for_gf14(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
+    int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
+    int *cur_disp_index, int gf_interval, int layer_depth,
+    int max_parallel_frames) {
+  assert(layer_depth == 2);
+  assert(gf_group->max_layer_depth_allowed >= 4);
+  int layer, node_start, node_end = 0;
+  // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only
+  // for gf-interval 14.
+  const int max_layer_depth = 4;
+  // Iterate through each layer depth starting from 2 till 'max_layer_depth'.
+  for (layer = layer_depth; layer <= max_layer_depth; layer++) {
+    // 'node_start' and 'node_end' indicate the number of nodes from the
+    // previous layer depth to be considered. It also corresponds to the indices
+    // of arf_frame_stats.
+    node_start = node_end;
+    node_end = (*count_arf_frames);
+    // 'num_dir' indicates the number of directions to traverse w.r.t. a given
+    // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would
+    // have only one frame and hence needs to traverse only in the left
+    // direction w.r.t the node in the previous layer.
+    int num_dir = layer == 2 ? 1 : 2;
+    set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx,
+                                    cur_disp_index, frame_ind, count_arf_frames,
+                                    doh_gf_index_map, num_dir, node_start,
+                                    node_end, layer);
+  }
+
+  for (int i = 1; i < gf_interval; i++) {
+    // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE
+    // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an
+    // LF_UPDATE frame.
+    if (doh_gf_index_map[i] == INVALID_IDX) {
+      // LF_UPDATE frames.
+      // TODO(Remya): Correct start and end parameters passed to
+      // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+      // is enbaled for parallel encode of lower layer frames.
+      set_params_for_leaf_frames(
+          twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+          frame_ind, parallel_frame_count, max_parallel_frames, 1,
+          first_frame_index, cur_disp_index, layer, 0, 0);
+    } else {
+      // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+      // the gf index of corresponding INTNL_ARF_UPDATE frames.
+      int intnl_arf_index = doh_gf_index_map[i];
+      int ld = gf_group->layer_depth[intnl_arf_index];
+      set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+                                          first_frame_index, cur_disp_index,
+                                          ld);
+    }
+  }
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
 // Set parameters for frames between 'start' and 'end' (excluding both).
-static void set_multi_layer_params(const TWO_PASS *twopass,
-                                   GF_GROUP *const gf_group, RATE_CONTROL *rc,
-                                   FRAME_INFO *frame_info, int start, int end,
-                                   int *cur_frame_idx, int *frame_ind,
-                                   int arf_ind, int layer_depth) {
-  const int num_frames_to_process = end - start - 1;
-  assert(num_frames_to_process >= 0);
-  if (num_frames_to_process == 0) return;
+static void set_multi_layer_params(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+    int max_parallel_frames, int do_frame_parallel_encode,
+    int *first_frame_index, int layer_depth) {
+  const int num_frames_to_process = end - start;
 
   // Either we are at the last level of the pyramid, or we don't have enough
   // frames between 'l' and 'r' to create one more level.
   if (layer_depth > gf_group->max_layer_depth_allowed ||
       num_frames_to_process < 3) {
     // Leaf nodes.
-    while (++start < end) {
+    while (start < end) {
       gf_group->update_type[*frame_ind] = LF_UPDATE;
       gf_group->arf_src_offset[*frame_ind] = 0;
-      ++*cur_frame_idx;
       gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-      gf_group->frame_disp_idx[*frame_ind] = start;
       gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
-      gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
-          twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->arf_boost[*frame_ind] =
+          av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                             end - start, 0, NULL, NULL, 0);
+      gf_group->frame_type[*frame_ind] = INTER_FRAME;
+      gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
       gf_group->max_layer_depth =
           AOMMAX(gf_group->max_layer_depth, layer_depth);
+      // Set the level of parallelism for the LF_UPDATE frame.
+      if (do_frame_parallel_encode) {
+        set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+                                 parallel_frame_count, max_parallel_frames);
+        // Set LF_UPDATE frames as non-reference frames.
+        gf_group->is_frame_non_ref[*frame_ind] = 1;
+      }
+      set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
       ++(*frame_ind);
+      ++(*cur_frame_idx);
+      ++start;
     }
   } else {
-    const int m = (start + end) / 2;
+    const int m = (start + end - 1) / 2;
 
     // Internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
-    gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+    gf_group->arf_src_offset[*frame_ind] = m - start;
     gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-    gf_group->frame_disp_idx[*frame_ind] = m;
     gf_group->layer_depth[*frame_ind] = layer_depth;
+    gf_group->frame_type[*frame_ind] = INTER_FRAME;
+    gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+    if (do_frame_parallel_encode) {
+      // If max_parallel_frames is not exceeded and if the frame will not be
+      // temporally filtered, encode the next internal ARF frame in parallel.
+      if (*parallel_frame_count > 1 &&
+          *parallel_frame_count <= max_parallel_frames) {
+        if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+          gf_group->frame_parallel_level[*frame_ind] = 2;
+        *parallel_frame_count = 1;
+      }
+    }
+    set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
 
     // Get the boost factor for intermediate ARF frames.
-    gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
-        twopass, rc, frame_info, m, end - m, m - start, NULL, NULL);
+    gf_group->arf_boost[*frame_ind] =
+        av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m,
+                           m - start, NULL, NULL, 0);
     ++(*frame_ind);
 
     // Frames displayed before this internal ARF.
-    set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
-                           cur_frame_idx, frame_ind, 1, layer_depth + 1);
+    set_multi_layer_params(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, layer_depth + 1);
 
     // Overlay for internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
     gf_group->arf_src_offset[*frame_ind] = 0;
     gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
-    gf_group->frame_disp_idx[*frame_ind] = m;
     gf_group->arf_boost[*frame_ind] = 0;
     gf_group->layer_depth[*frame_ind] = layer_depth;
+    gf_group->frame_type[*frame_ind] = INTER_FRAME;
+    gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+    set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
     ++(*frame_ind);
+    ++(*cur_frame_idx);
 
     // Frames displayed after this internal ARF.
-    set_multi_layer_params(twopass, gf_group, rc, frame_info, m, end,
-                           cur_frame_idx, frame_ind, arf_ind, layer_depth + 1);
+    set_multi_layer_params(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, layer_depth + 1);
   }
 }
 
 static int construct_multi_layer_gf_structure(
     AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
-    RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval,
+    RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval,
     FRAME_UPDATE_TYPE first_frame_update_type) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  // TODO(angiebird): Why do we need "-1" here?
+  const int gf_interval = baseline_gf_interval - 1;
   int frame_index = 0;
+  int cur_frame_index = 0;
 
-  // Keyframe / Overlay frame / Golden frame.
-  assert(gf_interval >= 1);
-  assert(first_frame_update_type == KF_UPDATE ||
-         first_frame_update_type == OVERLAY_UPDATE ||
-         first_frame_update_type == GF_UPDATE);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Set the display order hint for the first frame in the GF_GROUP.
+  int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+                           ? 0
+                           : cpi->common.current_frame.frame_number;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  // Initialize gf_group->frame_parallel_level and gf_group->is_frame_non_ref to
+  // 0.
+  memset(
+      gf_group->frame_parallel_level, 0,
+      sizeof(gf_group->frame_parallel_level[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+  memset(gf_group->is_frame_non_ref, 0,
+         sizeof(gf_group->is_frame_non_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+  memset(gf_group->src_offset, 0,
+         sizeof(gf_group->src_offset[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref
+  // with INVALID_IDX.
+  memset(gf_group->skip_frame_refresh, INVALID_IDX,
+         sizeof(gf_group->skip_frame_refresh[0][0]) *
+             MAX_STATIC_GF_GROUP_LENGTH * REF_FRAMES);
+  memset(gf_group->skip_frame_as_ref, INVALID_IDX,
+         sizeof(gf_group->skip_frame_as_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+  // This is a patch that fixes https://crbug.com/aomedia/3163
+  // enable_keyframe_filtering > 1 will introduce an extra overlay frame at
+  // key frame location. However when
+  // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't
+  // afford to have an extra overlay frame. Otherwise, the gf_group->size will
+  // become MAX_STATIC_GF_GROUP_LENGTH + 1, which causes memory error.
+  // A cheap solution is to turn of kf_decomp here.
+  // TODO(angiebird): Find a systematic way to solve this issue.
+  if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+    kf_decomp = 0;
+  }
+  if (first_frame_update_type == KF_UPDATE) {
+    gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+    gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    if (!kf_decomp) cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    ++frame_index;
 
-  gf_group->update_type[frame_index] = first_frame_update_type;
-  gf_group->arf_src_offset[frame_index] = 0;
-  gf_group->cur_frame_idx[frame_index] = 0;
-  gf_group->layer_depth[frame_index] =
-      first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0;
-  gf_group->max_layer_depth = 0;
-  ++frame_index;
+    if (kf_decomp) {
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = 0;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      ++frame_index;
+    }
+    cur_frame_index++;
+  }
+
+  if (first_frame_update_type == GF_UPDATE) {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    ++frame_index;
+    ++cur_frame_index;
+  }
 
   // ALTREF.
   const int use_altref = gf_group->max_layer_depth_allowed > 0;
+  int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
   if (use_altref) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
-    gf_group->arf_src_offset[frame_index] = gf_interval - 1;
-    gf_group->cur_frame_idx[frame_index] = 0;
-    gf_group->frame_disp_idx[frame_index] = gf_interval;
+    gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = 1;
-    gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost;
+    gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
     gf_group->max_layer_depth = 1;
+    gf_group->arf_index = frame_index;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] =
+        cur_disp_index + gf_group->arf_src_offset[frame_index];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++frame_index;
+  } else {
+    gf_group->arf_index = -1;
   }
 
-  int cur_frame_index = 0;
-  // Rest of the frames.
-  set_multi_layer_params(twopass, gf_group, rc, frame_info, 0, gf_interval,
-                         &cur_frame_index, &frame_index, 0, use_altref + 1);
+  // Flag to indicate if multi-layer configuration is complete.
+  int is_multi_layer_configured = 0;
+
+  // Running count of no. of frames that is part of a given parallel
+  // encode set in a gf_group. Value of 1 indicates no parallel encode.
+  int parallel_frame_count = 1;
+  // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+  // structure with minimum 4 layers.
+  int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+                                  gf_group->max_layer_depth_allowed >= 4);
+
+  int first_frame_index = cur_frame_index;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (do_frame_parallel_encode) {
+    // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1 . Below code computes the
+    // actual GF_GROUP length by compensating for this offset.
+    int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+                            (first_frame_update_type == GF_UPDATE))
+                               ? gf_interval
+                               : gf_interval + 1;
+
+    // In order to facilitate parallel encoding of frames in lower layer depths,
+    // encode reordering is done. Currently encode reordering is enabled only
+    // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+    // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+    // limitation on the number of hidden frames possible at any given point and
+    // hence the reordering is enabled only for gf-intervals 16 and 32.
+    // Disabling encode reordering for gf-interval 14 since some cross-frame
+    // dependencies related to temporal filtering for FPMT is currently not
+    // handled.
+    int disable_gf14_reorder = 1;
+    if (actual_gf_length == 14 && !disable_gf14_reorder) {
+      // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+      // corresponding to their display order hint. This is used while
+      // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+      int doh_gf_index_map[FIXED_GF_INTERVAL];
+      // Initialize doh_gf_index_map with INVALID_IDX.
+      memset(&doh_gf_index_map[0], INVALID_IDX,
+             (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+      FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+      // Store the stats corresponding to layer 1 frame.
+      fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+                           actual_gf_length);
+      int count_arf_frames = 1;
+
+      // Sets multi-layer params for gf-interval 14 to consecutively encode
+      // frames in the same layer depth, i.e., encode order would be 0-> 14->
+      // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+      // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+      set_multi_layer_params_for_gf14(
+          twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+          arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+          doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+          &cur_disp_index, actual_gf_length, use_altref + 1,
+          cpi->ppi->num_fp_contexts);
+
+      // Set gf_group->skip_frame_refresh.
+      for (int i = 0; i < actual_gf_length; i++) {
+        int count = 0;
+        if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+          for (int j = 0; j < i; j++) {
+            // Store the display order hint of the frames which would not
+            // have been displayed at the encode call of frame 'i'.
+            if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+                gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+              gf_group->skip_frame_refresh[i][count++] =
+                  gf_group->display_idx[j];
+            }
+          }
+        }
+      }
+    } else {
+      // Set layer depth threshold for reordering as per the gf length.
+      int depth_thr =
+          (actual_gf_length == 16) ? 3 : (actual_gf_length == 32) ? 4 : INT_MAX;
+
+      set_multi_layer_params_for_fp(
+          twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+          cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+          &parallel_frame_count, cpi->ppi->num_fp_contexts,
+          do_frame_parallel_encode, &first_frame_index, depth_thr,
+          &cur_disp_index, use_altref + 1);
+    }
+    is_multi_layer_configured = 1;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
-  // The end frame will be Overlay frame for an ARF GOP; otherwise set it to
-  // be GF, for consistency, which will be updated in the next GOP.
-  gf_group->update_type[frame_index] = use_altref ? OVERLAY_UPDATE : GF_UPDATE;
-  gf_group->arf_src_offset[frame_index] = 0;
-  return frame_index;
-}
+  // Rest of the frames.
+  if (!is_multi_layer_configured)
+    set_multi_layer_params(
+        twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+        cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+        &parallel_frame_count, cpi->ppi->num_fp_contexts,
+        do_frame_parallel_encode, &first_frame_index, use_altref + 1);
 
-#define CHECK_GF_PARAMETER 0
-#if CHECK_GF_PARAMETER
-void check_frame_params(GF_GROUP *const gf_group, int gf_interval) {
-  static const char *update_type_strings[FRAME_UPDATE_TYPES] = {
-    "KF_UPDATE",       "LF_UPDATE",      "GF_UPDATE",
-    "ARF_UPDATE",      "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE",
-    "INTNL_ARF_UPDATE"
-  };
-  FILE *fid = fopen("GF_PARAMS.txt", "a");
-
-  fprintf(fid, "\ngf_interval = {%d}\n", gf_interval);
-  for (int i = 0; i < gf_group->size; ++i) {
-    fprintf(fid, "#%2d : %s %d %d %d %d\n", i,
-            update_type_strings[gf_group->update_type[i]],
-            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
-            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+  if (use_altref) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] =
+        is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    ++frame_index;
+  } else {
+    for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+      gf_group->update_type[frame_index] = LF_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+      set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+                     frame_index);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      ++frame_index;
+    }
   }
-
-  fprintf(fid, "number of nodes in each level: \n");
-  for (int i = 0; i < gf_group->pyramid_height; ++i) {
-    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+  if (do_frame_parallel_encode) {
+    // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+    // a frame is marked as frame_parallel_level 1 with no subsequent
+    // frame_parallel_level 2 frame(s).
+    int level1_frame_idx = INT_MAX;
+    int level2_frame_count = 0;
+    for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+      if (gf_group->frame_parallel_level[frame_idx] == 1) {
+        // Set frame_parallel_level to 0 if only one frame is present in a
+        // parallel encode set.
+        if (level1_frame_idx != INT_MAX && !level2_frame_count)
+          gf_group->frame_parallel_level[level1_frame_idx] = 0;
+        // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+        // count of frame_parallel_level 2 frames in the corresponding parallel
+        // encode set.
+        level1_frame_idx = frame_idx;
+        level2_frame_count = 0;
+      }
+      if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+    }
+    // If frame_parallel_level is set to 1 for the last LF_UPDATE
+    // frame in the gf_group, reset it to zero since there are no subsequent
+    // frames in the gf_group.
+    if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+      assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+      gf_group->frame_parallel_level[frame_index - 2] = 0;
+    }
   }
-  fprintf(fid, "\n");
-  fclose(fid);
-}
-#endif  // CHECK_GF_PARAMETER
 
-#define REF_IDX(ref) ((ref)-LAST_FRAME)
+  for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+       ++gf_idx) {
+    gf_group->update_type[gf_idx] = LF_UPDATE;
+    gf_group->arf_src_offset[gf_idx] = 0;
+    gf_group->cur_frame_idx[gf_idx] = gf_idx;
+    gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+    gf_group->frame_type[gf_idx] = INTER_FRAME;
+    gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+  }
 
-static INLINE void reset_ref_frame_idx(int *ref_idx, int reset_value) {
-  for (int i = 0; i < REF_FRAMES; ++i) ref_idx[i] = reset_value;
+  return frame_index;
 }
 
-static INLINE void set_ref_frame_disp_idx(GF_GROUP *const gf_group) {
-  for (int i = 0; i < gf_group->size; ++i) {
-    for (int ref = 0; ref < INTER_REFS_PER_FRAME + 1; ++ref) {
-      int ref_gop_idx = gf_group->ref_frame_gop_idx[i][ref];
-      if (ref_gop_idx == -1) {
-        gf_group->ref_frame_disp_idx[i][ref] = -1;
-      } else {
-        gf_group->ref_frame_disp_idx[i][ref] =
-            gf_group->frame_disp_idx[ref_gop_idx];
-      }
-    }
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+  int log_gop_length = 0;
+  while ((1 << log_gop_length) < gop_length) {
+    ++log_gop_length;
   }
-}
 
-static void set_gop_ref_frame_map(GF_GROUP *const gf_group) {
-  // Initialize the reference slots as all -1.
-  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx)
-    reset_ref_frame_idx(gf_group->ref_frame_gop_idx[frame_idx], -1);
-
-  // Set the map for frames in the current gop
-  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) {
-    const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx];
-    // TODO(yuec): need to figure out how to determine
-    // (1) whether a KEY_FRAME has show_frame on
-    // (2) whether a frame with INTNL_OVERLAY_UPDATE type has
-    //     show_existing_frame on
-    const int show_frame =
-        update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE;
-    const int show_existing_frame =
-        update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE;
-
-    int this_ref_map[INTER_REFS_PER_FRAME + 1];
-    memcpy(this_ref_map, gf_group->ref_frame_gop_idx[frame_idx],
-           sizeof(this_ref_map));
-    int *next_ref_map = &gf_group->ref_frame_gop_idx[frame_idx + 1][0];
-
-    switch (update_type) {
-      case KF_UPDATE:
-        if (show_frame) {
-          reset_ref_frame_idx(this_ref_map, frame_idx);
-        } else {
-          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(ALTREF2_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
-          this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
-        }
-        break;
-      case LF_UPDATE: this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; break;
-      case GF_UPDATE:
-        this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-        this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx;
-        break;
-      case OVERLAY_UPDATE:
-        this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx;
-        break;
-      case ARF_UPDATE: this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; break;
-      case INTNL_OVERLAY_UPDATE:
-        if (!show_existing_frame)
-          this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx;
-        break;
-      case INTNL_ARF_UPDATE:
-        this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx;
-        break;
-      default: assert(0); break;
-    }
-
-    memcpy(next_ref_map, this_ref_map, sizeof(this_ref_map));
-
-    switch (update_type) {
-      case LF_UPDATE:
-      case GF_UPDATE:
-        next_ref_map[REF_IDX(LAST3_FRAME)] = this_ref_map[REF_IDX(LAST2_FRAME)];
-        next_ref_map[REF_IDX(LAST2_FRAME)] = this_ref_map[REF_IDX(LAST_FRAME)];
-        next_ref_map[REF_IDX(LAST_FRAME)] = this_ref_map[REF_IDX(LAST3_FRAME)];
-        break;
-      case INTNL_OVERLAY_UPDATE:
-        if (!show_existing_frame) {
-          next_ref_map[REF_IDX(LAST3_FRAME)] =
-              this_ref_map[REF_IDX(LAST2_FRAME)];
-          next_ref_map[REF_IDX(LAST2_FRAME)] =
-              this_ref_map[REF_IDX(LAST_FRAME)];
-          next_ref_map[REF_IDX(LAST_FRAME)] =
-              this_ref_map[REF_IDX(LAST3_FRAME)];
-        } else {
-          next_ref_map[REF_IDX(LAST_FRAME)] =
-              this_ref_map[REF_IDX(BWDREF_FRAME)];
-          next_ref_map[REF_IDX(LAST2_FRAME)] =
-              this_ref_map[REF_IDX(LAST_FRAME)];
-          next_ref_map[REF_IDX(LAST3_FRAME)] =
-              this_ref_map[REF_IDX(LAST2_FRAME)];
-          next_ref_map[REF_IDX(BWDREF_FRAME)] =
-              this_ref_map[REF_IDX(ALTREF2_FRAME)];
-          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
-              this_ref_map[REF_IDX(EXTREF_FRAME)];
-          next_ref_map[REF_IDX(EXTREF_FRAME)] =
-              this_ref_map[REF_IDX(LAST3_FRAME)];
-        }
-        break;
-      case INTNL_ARF_UPDATE:
-        if (!show_existing_frame) {
-          next_ref_map[REF_IDX(BWDREF_FRAME)] =
-              this_ref_map[REF_IDX(EXTREF_FRAME)];
-          next_ref_map[REF_IDX(ALTREF2_FRAME)] =
-              this_ref_map[REF_IDX(BWDREF_FRAME)];
-          next_ref_map[REF_IDX(EXTREF_FRAME)] =
-              this_ref_map[REF_IDX(ALTREF2_FRAME)];
-        }
-        break;
-      case OVERLAY_UPDATE:
-        next_ref_map[REF_IDX(ALTREF_FRAME)] =
-            this_ref_map[REF_IDX(GOLDEN_FRAME)];
-        next_ref_map[REF_IDX(GOLDEN_FRAME)] =
-            this_ref_map[REF_IDX(ALTREF_FRAME)];
-        break;
-      default: break;
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    int count = 0;
+    // Find the trailing zeros
+    for (; count < MAX_ARF_LAYERS; ++count) {
+      if ((gf_index >> count) & 0x01) break;
     }
+    gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
   }
-
-  // Set the map in display order index by converting from gop indices in the
-  // above map
-  set_ref_frame_disp_idx(gf_group);
+  gf_group->max_layer_depth = log_gop_length;
 }
 
-void av1_gop_setup_structure(AV1_COMP *cpi,
-                             const EncodeFrameParams *const frame_params) {
+void av1_gop_setup_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
-  TWO_PASS *const twopass = &cpi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
-  const int key_frame = (frame_params->frame_type == KEY_FRAME);
-  const FRAME_UPDATE_TYPE first_frame_update_type =
-      key_frame ? KF_UPDATE
-                : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
+  const int key_frame = rc->frames_since_key == 0;
+  FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+  if (key_frame)
+    first_frame_update_type = KF_UPDATE;
+  else if (!cpi->ppi->gf_state.arf_gf_boost_lst)
+    first_frame_update_type = GF_UPDATE;
+
   gf_group->size = construct_multi_layer_gf_structure(
-      cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval,
+      cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
       first_frame_update_type);
 
-  set_gop_ref_frame_map(gf_group);
+  if (gf_group->max_layer_depth_allowed == 0)
+    set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
 
-#if CHECK_GF_PARAMETER
-  check_frame_params(gf_group, rc->baseline_gf_interval);
-#endif
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index) {
+  return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&
+         gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+  const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+  // TODO(angiebird): when gf_group->size == 32, it's possble to
+  // have "two" second arf. Check if this is acceptable.
+  if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+      arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+    return 1;
+  }
+  return 0;
 }
diff --git a/media/libaom/src/av1/encoder/gop_structure.h b/media/libaom/src/av1/encoder/gop_structure.h
index 0c775c7b49..eb20c84616 100644
--- a/media/libaom/src/av1/encoder/gop_structure.h
+++ b/media/libaom/src/av1/encoder/gop_structure.h
@@ -18,24 +18,76 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
+/*!\cond */
 struct AV1_COMP;
 struct EncodeFrameParams;
 
 #define MIN_ARF_GF_BOOST 240
 #define NORMAL_BOOST 100
 
-// Set up the Group-Of-Pictures structure for this GF_GROUP.  This involves
-// deciding where to place the various FRAME_UPDATE_TYPEs in the group.  It does
-// this primarily by setting the contents of
-// cpi->twopass.gf_group.update_type[].
-void av1_gop_setup_structure(
-    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
-
-int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
-                       FRAME_INFO *frame_info, int offset, int f_frames,
-                       int b_frames, int *num_fpstats_used,
-                       int *num_fpstats_required);
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updateing entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi          Top - level encoder instance structure
+ *
+ * \return No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top - level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled or is this a GF only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \return No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group       GF/ARF group data structure
+ * \param[in]   gf_frame_index GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second arf
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group       GF/ARF group data structure
+ * \param[in]   gf_frame_index GOP index
+ *
+ * \return Return 1 if it is the second arf
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/hash_motion.c b/media/libaom/src/av1/encoder/hash_motion.c
index 310cde886b..164aa09783 100644
--- a/media/libaom/src/av1/encoder/hash_motion.c
+++ b/media/libaom/src/av1/encoder/hash_motion.c
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <stdbool.h>
 
 #include "config/av1_rtcd.h"
 
@@ -120,23 +121,26 @@ void av1_hash_table_destroy(hash_table *p_hash_table) {
   p_hash_table->p_lookup_table = NULL;
 }
 
-void av1_hash_table_create(hash_table *p_hash_table) {
+bool av1_hash_table_create(hash_table *p_hash_table) {
   if (p_hash_table->p_lookup_table != NULL) {
     av1_hash_table_clear_all(p_hash_table);
-    return;
+    return true;
   }
   p_hash_table->p_lookup_table =
-      (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr);
-  memset(p_hash_table->p_lookup_table, 0,
-         sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr);
+      (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+  if (!p_hash_table) return false;
+  return true;
 }
 
-static void hash_table_add_to_table(hash_table *p_hash_table,
+static bool hash_table_add_to_table(hash_table *p_hash_table,
                                     uint32_t hash_value,
                                     block_hash *curr_block_hash) {
   if (p_hash_table->p_lookup_table[hash_value] == NULL) {
     p_hash_table->p_lookup_table[hash_value] =
         aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
+    if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+      return false;
+    }
     aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
                      sizeof(curr_block_hash[0]));
     aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
@@ -145,6 +149,7 @@ static void hash_table_add_to_table(hash_table *p_hash_table,
     aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
                          curr_block_hash);
   }
+  return true;
 }
 
 int32_t av1_hash_table_count(const hash_table *p_hash_table,
@@ -307,7 +312,7 @@ void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
   }
 }
 
-void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
                                                  uint32_t *pic_hash[2],
                                                  int8_t *pic_is_same,
                                                  int pic_width, int pic_height,
@@ -335,10 +340,14 @@ void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
         const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value;
         curr_block_hash.hash_value2 = src_hash[1][pos];
 
-        hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash);
+        if (!hash_table_add_to_table(p_hash_table, hash_value1,
+                                     &curr_block_hash)) {
+          return false;
+        }
       }
     }
   }
+  return true;
 }
 
 int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
diff --git a/media/libaom/src/av1/encoder/hash_motion.h b/media/libaom/src/av1/encoder/hash_motion.h
index e4ea1f3948..8974ba27cb 100644
--- a/media/libaom/src/av1/encoder/hash_motion.h
+++ b/media/libaom/src/av1/encoder/hash_motion.h
@@ -12,6 +12,8 @@
 #ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
 #define AOM_AV1_ENCODER_HASH_MOTION_H_
 
+#include <stdbool.h>
+
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
@@ -56,7 +58,7 @@ typedef struct intrabc_hash_info {
 void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
 void av1_hash_table_clear_all(hash_table *p_hash_table);
 void av1_hash_table_destroy(hash_table *p_hash_table);
-void av1_hash_table_create(hash_table *p_hash_table);
+bool av1_hash_table_create(hash_table *p_hash_table);
 int32_t av1_hash_table_count(const hash_table *p_hash_table,
                              uint32_t hash_value);
 Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
@@ -74,7 +76,7 @@ void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
                                    uint32_t *dst_pic_block_hash[2],
                                    int8_t *src_pic_block_same_info[3],
                                    int8_t *dst_pic_block_same_info[3]);
-void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
                                                  uint32_t *pic_hash[2],
                                                  int8_t *pic_is_same,
                                                  int pic_width, int pic_height,
diff --git a/media/libaom/src/av1/encoder/hybrid_fwd_txfm.c b/media/libaom/src/av1/encoder/hybrid_fwd_txfm.c
index 06990857a0..eda5ddf78c 100644
--- a/media/libaom/src/av1/encoder/hybrid_fwd_txfm.c
+++ b/media/libaom/src/av1/encoder/hybrid_fwd_txfm.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "av1/common/idct.h"
+#include "av1/common/blockd.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
 
 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
@@ -134,6 +135,7 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                        txfm_param->bd);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
@@ -161,6 +163,7 @@ static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
   av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                       txfm_param->bd);
 }
+#endif
 
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
@@ -204,6 +207,7 @@ static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
                        bd);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   assert(txfm_param->tx_type == DCT_DCT);
@@ -219,6 +223,7 @@ static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
   const int bd = txfm_param->bd;
   av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
 }
+#endif
 
 static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
@@ -255,12 +260,7 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_64X32:
       highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
       break;
-    case TX_16X64:
-      highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_64X16:
-      highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
-      break;
+
     case TX_32X32:
       highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
       break;
@@ -291,6 +291,7 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_4X4:
       highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
       break;
+#if !CONFIG_REALTIME_ONLY
     case TX_4X16:
       highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
       break;
@@ -303,6 +304,36 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_32X8:
       highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
       break;
+    case TX_16X64:
+      highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_64X16:
+      highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+      break;
+#endif
     default: assert(0); break;
   }
 }
+
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+                    const int16_t *src_diff, int src_stride,
+                    tran_low_t *coeff) {
+  if (use_hadamard) {
+    switch (tx_size) {
+      case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+      case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+      case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+      case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+      default: assert(0);
+    }
+  } else {
+    TxfmParam txfm_param;
+    txfm_param.tx_type = DCT_DCT;
+    txfm_param.tx_size = tx_size;
+    txfm_param.lossless = 0;
+    txfm_param.bd = bd_info.bit_depth;
+    txfm_param.is_hbd = bd_info.use_highbitdepth_buf;
+    txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+    av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param);
+  }
+}
diff --git a/media/libaom/src/av1/encoder/hybrid_fwd_txfm.h b/media/libaom/src/av1/encoder/hybrid_fwd_txfm.h
index daabc7119a..30f8a2258b 100644
--- a/media/libaom/src/av1/encoder/hybrid_fwd_txfm.h
+++ b/media/libaom/src/av1/encoder/hybrid_fwd_txfm.h
@@ -24,6 +24,15 @@ void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param);
 
+/*!\brief Apply Hadamard or DCT transform
+ *
+ * \callergraph
+ * DCT and Hadamard transforms are commonly used for quick RD score estimation.
+ * The coeff buffer's size should be equal to the number of pixels
+ * corresponding to tx_size.
+ */
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+                    const int16_t *src_diff, int src_stride, tran_low_t *coeff);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/interp_search.c b/media/libaom/src/av1/encoder/interp_search.c
index 6b7317be77..c313372305 100644
--- a/media/libaom/src/av1/encoder/interp_search.c
+++ b/media/libaom/src/av1/encoder/interp_search.c
@@ -106,22 +106,16 @@ int av1_find_interp_filter_match(
   return match_found_idx;
 }
 
-static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
-                                int num_planes) {
-  const BUFFER_SET *buf0 = dst_bufs[0];
-  dst_bufs[0] = dst_bufs[1];
-  dst_bufs[1] = buf0;
-  restore_dst_buf(xd, *dst_bufs[0], num_planes);
-}
-
 static INLINE int get_switchable_rate(MACROBLOCK *const x,
                                       const int_interpfilters filters,
-                                      const int ctx[2]) {
-  int inter_filter_cost;
+                                      const int ctx[2], int dual_filter) {
   const InterpFilter filter0 = filters.as_filters.y_filter;
-  const InterpFilter filter1 = filters.as_filters.x_filter;
-  inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
-  inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+  int inter_filter_cost =
+      x->mode_costs.switchable_interp_costs[ctx[0]][filter0];
+  if (dual_filter) {
+    const InterpFilter filter1 = filters.as_filters.x_filter;
+    inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1];
+  }
   return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
 }
 
@@ -136,7 +130,7 @@ static INLINE void interp_model_rd_eval(
   RD_STATS tmp_rd_stats;
   av1_init_rd_stats(&tmp_rd_stats);
 
-  // Skip inter predictor if the predictor is already avilable.
+  // Skip inter predictor if the predictor is already available.
   if (!is_skip_build_pred) {
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
@@ -148,8 +142,8 @@ static INLINE void interp_model_rd_eval(
                      ? MODELRD_LEGACY
                      : MODELRD_TYPE_INTERP_FILTER](
       cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
-      &tmp_rd_stats.dist, &tmp_rd_stats.skip, &tmp_rd_stats.sse, NULL, NULL,
-      NULL);
+      &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL,
+      NULL, NULL);
 
   av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
 }
@@ -175,7 +169,8 @@ static INLINE int64_t interpolation_filter_rd(
   const int_interpfilters last_best = mbmi->interp_filters;
   mbmi->interp_filters = filter_sets[filter_idx];
   const int tmp_rs =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+                          cm->seq_params->enable_dual_filter);
 
   int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
   if (min_rd > *rd) {
@@ -189,12 +184,12 @@ static INLINE int64_t interpolation_filter_rd(
   assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
   assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
   assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
-  assert((rd_stats_luma->skip == 0) || (rd_stats_luma->skip == 1));
-  assert((rd_stats->skip == 0) || (rd_stats->skip == 1));
+  assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1));
+  assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1));
   assert((skip_pred >= 0) &&
          (skip_pred <= interp_search_flags->default_interp_skip_flags));
 
-  // When skip pred is equal to default_interp_skip_flags,
+  // When skip_txfm pred is equal to default_interp_skip_flags,
   // skip both luma and chroma MC.
   // For mono-chrome images:
   // num_planes = 1 and cpi->default_interp_skip_flags = 1,
@@ -446,14 +441,29 @@ static INLINE void find_best_non_dual_interp_filter(
       interp_search_flags->interp_filter_search_mask;
 
   if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
     const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
     const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
-    const int *switchable_interp_p0 =
-        cpi->frame_probs.switchable_interp_probs[update_type][ctx0];
-    const int *switchable_interp_p1 =
-        cpi->frame_probs.switchable_interp_probs[update_type][ctx1];
-
+    int use_actual_frame_probs = 1;
+    const int *switchable_interp_p0;
+    const int *switchable_interp_p1;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    use_actual_frame_probs =
+        (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+    if (!use_actual_frame_probs) {
+      switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs
+                                 .switchable_interp_probs[update_type][ctx0];
+      switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs
+                                 .switchable_interp_probs[update_type][ctx1];
+    }
+#endif
+    if (use_actual_frame_probs) {
+      switchable_interp_p0 =
+          cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0];
+      switchable_interp_p1 =
+          cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1];
+    }
     static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
     const int thresh = thr[update_type];
     for (i = 0; i < SWITCHABLE_FILTERS; i++) {
@@ -604,6 +614,43 @@ static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
   }
 }
 
+/*!\brief AV1 interpolation filter search
+ *
+ * \ingroup inter_mode_search
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to struc holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in]     tmp_dst           A temporary prediction buffer to hold a
+ *                                  computed prediction.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in,out] rd                The RD cost associated with the selected
+ *                                  interpolation filter parameters.
+ * \param[in,out] switchable_rate   The rate associated with using a SWITCHABLE
+ *                                  filter mode.
+ * \param[in,out] skip_build_pred   Indicates whether or not to build the inter
+ *                                  predictor. If this is 0, the inter predictor
+ *                                  has already been built and thus we can avoid
+ *                                  repeating computation.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ *
+ * \return Returns INT64_MAX if the filter parameters are invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * parameter search is a success.
+ */
 int64_t av1_interpolation_filter_search(
     MACROBLOCK *const x, const AV1_COMP *const cpi,
     const TileDataEnc *tile_data, BLOCK_SIZE bsize,
@@ -642,7 +689,8 @@ int64_t av1_interpolation_filter_search(
   switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
   switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
   *switchable_rate =
-      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+                          cm->seq_params->enable_dual_filter);
 
   // Do MC evaluation for default filter_type.
   // Luma MC
@@ -706,7 +754,7 @@ int64_t av1_interpolation_filter_search(
   restore_dst_buf(xd, *tmp_dst, num_planes);
   const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
   // Evaluate dual interp filters
-  if (cm->seq_params.enable_dual_filter) {
+  if (cm->seq_params->enable_dual_filter) {
     if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
       fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
                                  &rd_stats_luma, &rd_stats, switchable_rate,
diff --git a/media/libaom/src/av1/encoder/interp_search.h b/media/libaom/src/av1/encoder/interp_search.h
index 401e14f5bc..8eba483d03 100644
--- a/media/libaom/src/av1/encoder/interp_search.h
+++ b/media/libaom/src/av1/encoder/interp_search.h
@@ -20,6 +20,7 @@
 extern "C" {
 #endif
 
+/*!\cond */
 #define MAX_INTERP_FILTER_STATS 128
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 
@@ -32,33 +33,141 @@ typedef struct {
   int64_t rd;
   unsigned int pred_sse;
 } INTERPOLATION_FILTER_STATS;
+/*!\endcond */
 
-typedef struct {
-  // OBMC secondary prediction buffers and respective strides
+/*!\brief Miscellaneous arguments for inter mode search.
+ */
+typedef struct HandleInterModeArgs {
+  /*!
+   * Buffer for the above predictor in OBMC
+   */
   uint8_t *above_pred_buf[MAX_MB_PLANE];
+  /*!
+   * Stride for the above predictor in OBMC
+   */
   int above_pred_stride[MAX_MB_PLANE];
+  /*!
+   * Buffer for the left predictor in OBMC
+   */
   uint8_t *left_pred_buf[MAX_MB_PLANE];
+  /*!
+   * Stride for the left predictor in OBMC
+   */
   int left_pred_stride[MAX_MB_PLANE];
+  /*!
+   * Pointer to the first member in a 2D array which holds
+   * single reference mode motion vectors to be used as a starting
+   * point in the mv search for compound modes. Each array is length REF_FRAMES,
+   * meaning there is a slot for a single reference motion vector for
+   * each possible reference frame. The 2D array consists of N of these arrays,
+   * where N is the length of the reference mv stack computed for the single
+   * reference case for that particular reference frame.
+   */
   int_mv (*single_newmv)[REF_FRAMES];
-  // Pointer to array of motion vectors to use for each ref and their rates
-  // Should point to first of 2 arrays in 2D array
+  /*!
+   * Pointer to the first array of a 2D array with the same setup as
+   * single_newmv array above. This is a 2D array to hold the rate
+   * corresponding to each of the single reference mode motion vectors
+   * held in single_newmv.
+   */
   int (*single_newmv_rate)[REF_FRAMES];
+  /*!
+   * Pointer to the first array of a 2D array with the same setup as
+   * single_newmv array above. This is a 2D array to hold a 0 or 1
+   * validity value corresponding to each of the single reference mode motion
+   * vectors held in single_newmv.
+   */
   int (*single_newmv_valid)[REF_FRAMES];
-  // Pointer to array of predicted rate-distortion
-  // Should point to first of 2 arrays in 2D array
+  /*!
+   * Pointer to the first array in a 3D array of predicted rate-distortion.
+   * The dimensions of this structure are:
+   * (number of possible inter modes) X
+   * (number of reference MVs) X
+   * (number of reference frames).
+   */
   int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  /*!
+   * Holds an estimated entropy cost for picking the current reference frame.
+   * This is used to compute an rd estimate.
+   */
   int ref_frame_cost;
+  /*!
+   * Holds an estimated entropy cost for picking single or compound
+   * reference. This is used to compute an rd estimate.
+   */
   int single_comp_cost;
+  /*!
+   * Pointer to the first element in a 3D array holding rd's of
+   * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref
+   * modes used to determine compound ref modes. The full structure is:
+   * (number of inter modes) X (length of refmv list) X (number of ref frames)
+   */
   int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+  /*!
+   * An integer value 0 or 1 which indicates whether or not to skip the motion
+   * mode search and default to SIMPLE_TRANSLATION as a speed feature.
+   */
   int skip_motion_mode;
+  /*!
+   * A pointer to the first element in an array of INTERINTRA_MODE types. This
+   * contains the best inter_intra mode for each reference frame.
+   */
   INTERINTRA_MODE *inter_intra_mode;
-  int single_ref_first_pass;
-  SimpleRDState *simple_rd_state;
-  // [comp_idx][saved stat_idx]
+  /*!
+   * Array of saved interpolation filter stats collected to avoid repeating
+   * an interpolation filter search when the mv and ref_frame are the same
+   * as a previous search.
+   */
   INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+
+  /*!
+   * Stack to store full pixel search start mv of NEWMV mode.
+   */
+  FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+  /*!
+   * Count of mvs in start mv stack.
+   */
+  int start_mv_cnt;
+
+  /*!
+   * Index of the last set of saved stats in the interp_filter_stats array.
+   */
   int interp_filter_stats_idx;
+  /*!
+   * Estimated wedge index.
+   */
+  int wedge_index;
+  /*!
+   * Estimated wedge sign.
+   */
+  int wedge_sign;
+  /*!
+   * Estimated diff wtd index.
+   */
+  int diffwtd_index;
+  /*!
+   * Estimated cmp mode.
+   */
+  int cmp_mode[MODE_CTX_REF_FRAMES];
+  /*!
+   * The best sse during single new_mv search. Note that the sse here comes from
+   * single_motion_search, and not from interpolation_filter_search. This has
+   * two implications:
+   * 1. The mv used to calculate the sse here does not have to be the best sse
+   *    found in handle_inter_mode.
+   * 2. Even if the mvs agree, the sse here can differ from the sse in \ref
+   *    MACROBLOCK::pred_sse due to different interpolation filter used.
+   */
+  unsigned int best_single_sse_in_refs[REF_FRAMES];
+  /*!
+   * Holds the sse of best mode so far in the mode evaluation process. This is
+   * used in intermediate termination of NEWMV mode evaluation.
+   */
+  unsigned int best_pred_sse;
 } HandleInterModeArgs;
 
+/*!\cond */
 static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
   { 0x00000000 }, { 0x00010000 }, { 0x00020000 },  // y = 0
   { 0x00000001 }, { 0x00010001 }, { 0x00020001 },  // y = 1
@@ -78,6 +187,7 @@ int64_t av1_interpolation_filter_search(
     int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
     HandleInterModeArgs *args, int64_t ref_best_rd);
 
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/intra_mode_search.c b/media/libaom/src/av1/encoder/intra_mode_search.c
index 43192a9452..c81edccc4b 100644
--- a/media/libaom/src/av1/encoder/intra_mode_search.c
+++ b/media/libaom/src/av1/encoder/intra_mode_search.c
@@ -9,13 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconintra.h"
+
 #include "av1/encoder/intra_mode_search.h"
-#include "av1/encoder/model_rd.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/palette.h"
-#include "av1/common/pred_common.h"
-#include "av1/common/reconintra.h"
+#include "av1/encoder/speed_features.h"
 #include "av1/encoder/tx_search.h"
 
+/*!\cond */
 static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
   DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
   SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
@@ -29,390 +32,197 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
   UV_D113_PRED,   UV_D45_PRED,
 };
 
-#define BINS 32
-static const float intra_hog_model_bias[DIRECTIONAL_MODES] = {
-  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
-  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
+// The bitmask corresponds to the filter intra modes as defined in enums.h
+// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding filter intra mode. The table
+// av1_derived_filter_intra_mode_used_flag is used when speed feature
+// prune_filter_intra_level is 1. The evaluated filter intra modes are union
+// of the following:
+// 1) FILTER_DC_PRED
+// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED,
+// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED).
+static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = {
+  0x01,  // DC_PRED:           0000 0001
+  0x03,  // V_PRED:            0000 0011
+  0x05,  // H_PRED:            0000 0101
+  0x01,  // D45_PRED:          0000 0001
+  0x01,  // D135_PRED:         0000 0001
+  0x01,  // D113_PRED:         0000 0001
+  0x09,  // D157_PRED:         0000 1001
+  0x01,  // D203_PRED:         0000 0001
+  0x01,  // D67_PRED:          0000 0001
+  0x01,  // SMOOTH_PRED:       0000 0001
+  0x01,  // SMOOTH_V_PRED:     0000 0001
+  0x01,  // SMOOTH_H_PRED:     0000 0001
+  0x11   // PAETH_PRED:        0001 0001
 };
 
-static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
-  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
-  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
-  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
-  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
-  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
-  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
-  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
-  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
-  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
-  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
-  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
-  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
-  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
-  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
-  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
-  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
-  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
-  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
-  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
-  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
-  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
-  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
-  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
-  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
-  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
-  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
-  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
-  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
-  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
-  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
-  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
-  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
-  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
-  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
-  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
-  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
-  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
-  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
-  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
-  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
-  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
-  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
-  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+// The bitmask corresponds to the chroma intra modes as defined in enums.h
+// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding chroma intra mode. The table
+// av1_derived_chroma_intra_mode_used_flag is used when speed feature
+// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma
+// intra modes are union of the following:
+// 1) UV_DC_PRED
+// 2) UV_SMOOTH_PRED
+// 3) UV_CFL_PRED
+// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma
+// intra mode winner is V_PRED).
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = {
+  0x2201,  // DC_PRED:           0010 0010 0000 0001
+  0x2203,  // V_PRED:            0010 0010 0000 0011
+  0x2205,  // H_PRED:            0010 0010 0000 0101
+  0x2209,  // D45_PRED:          0010 0010 0000 1001
+  0x2211,  // D135_PRED:         0010 0010 0001 0001
+  0x2221,  // D113_PRED:         0010 0010 0010 0001
+  0x2241,  // D157_PRED:         0010 0010 0100 0001
+  0x2281,  // D203_PRED:         0010 0010 1000 0001
+  0x2301,  // D67_PRED:          0010 0011 0000 0001
+  0x2201,  // SMOOTH_PRED:       0010 0010 0000 0001
+  0x2601,  // SMOOTH_V_PRED:     0010 0110 0000 0001
+  0x2a01,  // SMOOTH_H_PRED:     0010 1010 0000 0001
+  0x3201   // PAETH_PRED:        0011 0010 0000 0001
 };
 
-static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
-                         float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint8_t *above = &src[c - stride];
-      const uint8_t *below = &src[c + stride];
-      const uint8_t *left = &src[c - 1];
-      const uint8_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+                highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+                                 const int stride, const int is_hbd) {
+  unsigned int sse;
 
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+  if (is_hbd)
+    return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse);
+  else
+    return vf(buf, stride, all_zeros, 0, &sse);
 }
 
-static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
-                             int cols, float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint16_t *above = &src[c - stride];
-      const uint16_t *below = &src[c + stride];
-      const uint16_t *left = &src[c - 1];
-      const uint16_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
+// Computes average of log(1 + variance) across 4x4 sub-blocks for source and
+// reconstructed blocks.
+static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     const BLOCK_SIZE bs,
+                                     double *avg_log_src_variance,
+                                     double *avg_log_recon_variance) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+  const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+  const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+  const int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+  const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow);
+  const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow);
+  const int is_hbd = is_cur_buf_hbd(xd);
+
+  for (int i = 0; i < bh; i += MI_SIZE) {
+    const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2);
+    for (int j = 0; j < bw; j += MI_SIZE) {
+      const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2);
+      const int mi_offset = r * mi_size_wide[sb_size] + c;
+      Block4x4VarInfo *block_4x4_var_info =
+          &x->src_var_info_of_4x4_sub_blocks[mi_offset];
+      int src_var = block_4x4_var_info->var;
+      double log_src_var = block_4x4_var_info->log_var;
+      // Compute average of log(1 + variance) for the source block from 4x4
+      // sub-block variance values. Calculate and store 4x4 sub-block variance
+      // and log(1 + variance), if the values present in
+      // src_var_of_4x4_sub_blocks are invalid. Reuse the same if it is readily
+      // available with valid values.
+      if (src_var < 0) {
+        src_var = av1_calc_normalized_variance(
+            cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+            x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+            x->plane[0].src.stride, is_hbd);
+        block_4x4_var_info->var = src_var;
+        log_src_var = log(1.0 + src_var / 16.0);
+        block_4x4_var_info->log_var = log_src_var;
       } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
+        // When source variance is already calculated and available for
+        // retrieval, check if log(1 + variance) is also available. If it is
+        // available, then retrieve from buffer. Else, calculate the same and
+        // store to the buffer.
+        if (log_src_var < 0) {
+          log_src_var = log(1.0 + src_var / 16.0);
+          block_4x4_var_info->log_var = log_src_var;
+        }
       }
-    }
-    src += stride;
-  }
+      *avg_log_src_variance += log_src_var;
 
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
-static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                      float th,
-                                      uint8_t *directional_mode_skip_mask) {
-  aom_clear_system_state();
-
-  const int bh = block_size_high[bsize];
-  const int bw = block_size_wide[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const int rows =
-      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
-  const int cols =
-      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *src = x->plane[0].src.buf;
-  float hist[BINS] = { 0.0f };
-  if (is_cur_buf_hbd(xd)) {
-    generate_hog_hbd(src, src_stride, rows, cols, hist);
-  } else {
-    generate_hog(src, src_stride, rows, cols, hist);
-  }
-
-  for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
-    float this_score = intra_hog_model_bias[i];
-    const float *weights = &intra_hog_model_weights[i * BINS];
-    for (int j = 0; j < BINS; ++j) {
-      this_score += weights[j] * hist[j];
+      const int recon_var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+          xd->plane[0].dst.stride, is_hbd);
+      *avg_log_recon_variance += log(1.0 + recon_var / 16.0);
     }
-    if (this_score < th) directional_mode_skip_mask[i + 1] = 1;
   }
 
-  aom_clear_system_state();
+  const int blocks = (bw * bh) / 16;
+  *avg_log_src_variance /= (double)blocks;
+  *avg_log_recon_variance /= (double)blocks;
 }
 
-#undef BINS
-
-// Model based RD estimation for luma intra blocks.
-static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               BLOCK_SIZE bsize, int mode_cost) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  RD_STATS this_rd_stats;
-  int row, col;
-  int64_t temp_sse, this_rd;
-  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type);
-  const int stepr = tx_size_high_unit[tx_size];
-  const int stepc = tx_size_wide_unit[tx_size];
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  mbmi->tx_size = tx_size;
-  // Prediction.
-  for (row = 0; row < max_blocks_high; row += stepr) {
-    for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bs) {
+  double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed);
+  // For non-positive threshold values, the comparison of source and
+  // reconstructed variances with threshold evaluates to false
+  // (src_var < threshold/rec_var < threshold) as these metrics are greater than
+  // than 0. Hence further calculations are skipped.
+  if (threshold <= 0) return 1.0;
+
+  double variance_rd_factor = 1.0;
+  double avg_log_src_variance = 0.0;
+  double avg_log_recon_variance = 0.0;
+  double var_diff = 0.0;
+
+  compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance,
+                           &avg_log_recon_variance);
+
+  // Dont allow 0 to prevent / 0 below.
+  avg_log_src_variance += 0.000001;
+  avg_log_recon_variance += 0.000001;
+
+  if (avg_log_src_variance >= avg_log_recon_variance) {
+    var_diff = (avg_log_src_variance - avg_log_recon_variance);
+    if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) {
+      variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance);
     }
-  }
-  // RD estimation.
-  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
-                                                   : MODELRD_TYPE_INTRA](
-      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
-      &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-    mode_cost +=
-        x->angle_delta_cost[mbmi->mode - V_PRED]
-                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
-  }
-  if (mbmi->mode == DC_PRED &&
-      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
-    if (mbmi->filter_intra_mode_info.use_filter_intra) {
-      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
-                   x->filter_intra_mode_cost[mode];
-    } else {
-      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
-    }
-  }
-  this_rd =
-      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
-  return this_rd;
-}
-
-// Update the intra model yrd and prune the current mode if the new estimate
-// y_rd > 1.5 * best_model_rd.
-static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
-                                                MACROBLOCK *x, BLOCK_SIZE bsize,
-                                                int mode_info_cost,
-                                                int64_t *best_model_rd) {
-  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
-  if (*best_model_rd != INT64_MAX &&
-      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
-    return 1;
-  } else if (this_model_rd < *best_model_rd) {
-    *best_model_rd = this_model_rd;
-  }
-  return 0;
-}
-
-// Run RD calculation with given luma intra prediction angle., and return
-// the RD cost. Update the best mode info. if the RD cost is the best so far.
-static int64_t calc_rd_given_intra_angle(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
-    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
-    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
-    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
-    uint8_t *best_blk_skip, int skip_model_rd) {
-  RD_STATS tokenonly_rd_stats;
-  int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  assert(!is_inter_block(mbmi));
-  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
-  if (!skip_model_rd) {
-    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
-      return INT64_MAX;
+  } else {
+    var_diff = (avg_log_recon_variance - avg_log_src_variance);
+    if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) {
+      variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance));
     }
   }
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    best_rd_in);
-  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
 
-  int this_rate =
-      mode_cost + tokenonly_rd_stats.rate +
-      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
-  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  // Limit adjustment;
+  variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
 
-  if (this_rd < *best_rd) {
-    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
-    *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
-    *best_tx_size = mbmi->tx_size;
-    *rate = this_rate;
-    rd_stats->rate = tokenonly_rd_stats.rate;
-    rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
-  }
-  return this_rd;
-}
-
-static INLINE int write_uniform_cost(int n, int v) {
-  const int l = get_unsigned_bits(n);
-  const int m = (1 << l) - n;
-  if (l == 0) return 0;
-  if (v < m)
-    return av1_cost_literal(l - 1);
-  else
-    return av1_cost_literal(l);
+  return variance_rd_factor;
 }
+/*!\endcond */
 
-// Return the rate cost for luma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                  int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
-  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
-  const int use_intrabc = mbmi->use_intrabc;
-  // Can only activate one mode.
-  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
-          use_filter_intra) <= 1);
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mbmi->mode == DC_PRED) {
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-    const int mode_ctx = av1_get_palette_mode_ctx(xd);
-    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
-    if (use_palette) {
-      const uint8_t *const color_map = xd->plane[0].color_index_map;
-      int block_width, block_height, rows, cols;
-      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                               &cols);
-      const int plt_size = mbmi->palette_mode_info.palette_size[0];
-      int palette_mode_cost =
-          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-      palette_mode_cost +=
-          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
-    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
-    if (use_filter_intra) {
-      total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
-                                                  .filter_intra_mode];
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode)) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
-                                       [MAX_ANGLE_DELTA +
-                                        mbmi->angle_delta[PLANE_TYPE_Y]];
-    }
-  }
-  if (av1_allow_intrabc(&cpi->common))
-    total_rate += x->intrabc_cost[use_intrabc];
-  return total_rate;
-}
-
-// Return the rate cost for chroma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                   int mode_cost) {
-  int total_rate = mode_cost;
-  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
-  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
-  // Can only activate one mode.
-  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
-
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mode == UV_DC_PRED) {
-    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-    total_rate +=
-        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
-    if (use_palette) {
-      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-      const int plt_size = pmi->palette_size[1];
-      const MACROBLOCKD *xd = &x->e_mbd;
-      const uint8_t *const color_map = xd->plane[1].color_index_map;
-      int palette_mode_cost =
-          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-      palette_mode_cost += av1_palette_color_cost_uv(
-          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(mode))) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate +=
-          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
-                                             MAX_ANGLE_DELTA];
-    }
-  }
-  return total_rate;
-}
-
-// Return 1 if an filter intra mode is selected; return 0 otherwise.
+/*!\brief Search for the best filter_intra mode when coding intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
                                     BLOCK_SIZE bsize, int mode_cost,
+                                    PREDICTION_MODE best_mode_so_far,
                                     int64_t *best_rd, int64_t *best_model_rd,
                                     PICK_MODE_CONTEXT *ctx) {
+  // Skip the evaluation of filter intra modes.
+  if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   int filter_intra_selected_flag = 0;
@@ -420,18 +230,34 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   TX_SIZE best_tx_size = TX_8X8;
   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  (void)ctx;
   av1_zero(filter_intra_mode_info);
   mbmi->filter_intra_mode_info.use_filter_intra = 1;
   mbmi->mode = DC_PRED;
   mbmi->palette_mode_info.palette_size[0] = 0;
 
+  // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have
+  // filter-intra as winner.
+  if (x->use_mb_mode_cache &&
+      !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra)
+    return 0;
+
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
     int64_t this_rd;
     RD_STATS tokenonly_rd_stats;
     mbmi->filter_intra_mode_info.filter_intra_mode = mode;
 
-    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+    if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) &&
+        !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] &
+          (1 << mode)))
+      continue;
+
+    // Skip the evaluation of modes that do not match with the winner mode in
+    // x->mb_mode_cache.
+    if (x->use_mb_mode_cache &&
+        mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode)
+      continue;
+
+    if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
       continue;
     }
     av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
@@ -439,26 +265,30 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (tokenonly_rd_stats.rate == INT_MAX) continue;
     const int this_rate =
         tokenonly_rd_stats.rate +
-        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
+    // Visual quality adjustment based on recon vs source variance.
+    if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+      this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+    }
+
     // Collect mode stats for multiwinner mode processing
     const int txfm_search_done = 1;
     store_winner_mode_stats(
         &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
     if (this_rd < *best_rd) {
       *best_rd = this_rd;
       best_tx_size = mbmi->tx_size;
       filter_intra_mode_info = mbmi->filter_intra_mode_info;
       av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
+      *skippable = tokenonly_rd_stats.skip_txfm;
       filter_intra_selected_flag = 1;
     }
   }
@@ -474,8 +304,8 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 }
 
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                     int *val_count) {
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors) {
   const int max_pix_val = 1 << 8;
   memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
@@ -489,743 +319,121 @@ int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
   for (int i = 0; i < max_pix_val; ++i) {
     if (val_count[i]) ++n;
   }
-  return n;
+  *num_colors = n;
 }
 
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth, int *val_count) {
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth, int *val_count,
+                             int *bin_val_count, int *num_color_bins,
+                             int *num_colors) {
   assert(bit_depth <= 12);
+  const int max_bin_val = 1 << 8;
   const int max_pix_val = 1 << bit_depth;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+  memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0]));
+  if (val_count != NULL)
+    memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
     for (int c = 0; c < cols; ++c) {
-      const int this_val = src[r * stride + c];
-      assert(this_val < max_pix_val);
-      if (this_val >= max_pix_val) return 0;
-      ++val_count[this_val];
+      /*
+       * Down-convert the pixels to 8-bit domain before counting.
+       * This provides consistency of behavior for palette search
+       * between lbd and hbd encodes. This down-converted pixels
+       * are only used for calculating the threshold (n).
+       */
+      const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+      assert(this_val < max_bin_val);
+      if (this_val >= max_bin_val) continue;
+      ++bin_val_count[this_val];
+      if (val_count != NULL) ++val_count[(src[r * stride + c])];
     }
   }
   int n = 0;
-  for (int i = 0; i < max_pix_val; ++i) {
-    if (val_count[i]) ++n;
-  }
-  return n;
-}
-
-// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
-// new_height'. Extra rows and columns are filled in by copying last valid
-// row/column.
-static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
-                                                int orig_width, int orig_height,
-                                                int new_width, int new_height) {
-  int j;
-  assert(new_width >= orig_width);
-  assert(new_height >= orig_height);
-  if (new_width == orig_width && new_height == orig_height) return;
-
-  for (j = orig_height - 1; j >= 0; --j) {
-    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
-    // Copy last column to extra columns.
-    memset(color_map + j * new_width + orig_width,
-           color_map[j * new_width + orig_width - 1], new_width - orig_width);
-  }
-  // Copy last row to extra rows.
-  for (j = orig_height; j < new_height; ++j) {
-    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
-           new_width);
-  }
-}
-
-// Bias toward using colors in the cache.
-// TODO(huisu): Try other schemes to improve compression.
-static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
-                                               int n_cache, int n_colors,
-                                               int stride, int *centroids) {
-  if (n_cache <= 0) return;
-  for (int i = 0; i < n_colors * stride; i += stride) {
-    int min_diff = abs(centroids[i] - (int)color_cache[0]);
-    int idx = 0;
-    for (int j = 1; j < n_cache; ++j) {
-      const int this_diff = abs(centroids[i] - color_cache[j]);
-      if (this_diff < min_diff) {
-        min_diff = this_diff;
-        idx = j;
-      }
-    }
-    if (min_diff <= 1) centroids[i] = color_cache[idx];
-  }
-}
-
-// Given the base colors as specified in centroids[], calculate the RD cost
-// of palette mode.
-static AOM_INLINE void palette_rd_y(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
-    uint8_t *tx_type_map, int *beat_best_pallette_rd) {
-  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
-  const int num_unique_colors = av1_remove_duplicates(centroids, n);
-  if (num_unique_colors < PALETTE_MIN_SIZE) {
-    // Too few unique colors to create a palette. And DC_PRED will work
-    // well for that case anyway. So skip.
-    return;
-  }
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth) {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
-    }
-  } else {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel(centroids[i]);
-    }
-  }
-  pmi->palette_size[0] = num_unique_colors;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
-                   1);
-  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
-
-  const int palette_mode_cost =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
-                                best_model_rd)) {
-    return;
-  }
-
-  RD_STATS tokenonly_rd_stats;
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    *best_rd);
-  if (tokenonly_rd_stats.rate == INT_MAX) return;
-  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
-  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
-    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
-  }
-  // Collect mode stats for multiwinner mode processing
-  const int txfm_search_done = 1;
-  store_winner_mode_stats(
-      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
-      this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-      txfm_search_done);
-  if (this_rd < *best_rd) {
-    *best_rd = this_rd;
-    // Setting beat_best_rd flag because current mode rd is better than best_rd.
-    // This flag need to be updated only for palette evaluation in key frames
-    if (beat_best_rd) *beat_best_rd = 1;
-    memcpy(best_palette_color_map, color_map,
-           block_width * block_height * sizeof(color_map[0]));
-    *best_mbmi = *mbmi;
-    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-    if (rate) *rate = this_rate;
-    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
-    if (distortion) *distortion = tokenonly_rd_stats.dist;
-    if (skippable) *skippable = tokenonly_rd_stats.skip;
-    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
+  // Count the colors based on 8-bit domain used to gate the palette path
+  for (int i = 0; i < max_bin_val; ++i) {
+    if (bin_val_count[i]) ++n;
   }
-}
-
-static AOM_INLINE int perform_top_color_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
-    const int *const top_colors, int start_n, int end_n, int step_size,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
-    uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  int top_color_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) top_color_winner = n;
-    n += step_size;
-    if (n > end_n) break;
-  }
-  return top_color_winner;
-}
+  *num_color_bins = n;
 
-static AOM_INLINE int perform_k_means_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  int k_means_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+  // Count the actual hbd colors used to create top_colors
+  n = 0;
+  if (val_count != NULL) {
+    for (int i = 0; i < max_pix_val; ++i) {
+      if (val_count[i]) ++n;
     }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) k_means_winner = n;
-    n += step_size;
-    if (n > end_n) break;
+    *num_colors = n;
   }
-  return k_means_winner;
 }
 
-// Perform palette search for top colors from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_top_color_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-// Perform k-means based palette search from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_k_means_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-    }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-
-#define START_N_STAGE2(x)                         \
-  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
-                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
-#define END_N_STAGE2(x, end_n) \
-  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
-
-static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
-                                                int *end_n_stage2,
-                                                int *step_size_stage2,
-                                                int winner, int end_n) {
-  *start_n_stage2 = START_N_STAGE2(winner);
-  *end_n_stage2 = END_N_STAGE2(winner, end_n);
-  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
-}
-
-// Start index and step size below are chosen to evaluate unique
-// candidates in neighbor search, in case a winner candidate is found in
-// coarse search. Example,
-// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
-// size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
-// If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
-// (3) and 8 (7).
-// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
-// as for 8 colors) then step size should also be 2, to cover all
-// candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
-// 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
-// coarse search will evaluate 3 and 6. For the winner, unique neighbors
-// (3: 2,4 or 6: 5,7) would be evaluated.
-
-// start index for coarse palette search for dominant colors and k-means
-static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                    3, 3, 2,
-                                                                    3, 3, 2 };
-// step size for coarse palette search for dominant colors and k-means
-static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                      3, 3, 3,
-                                                                      3, 3, 3 };
-
-static void rd_pick_palette_intra_sby(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
-    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-    int64_t *distortion, int *skippable, int *beat_best_rd,
-    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           bsize));
-
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *const src = x->plane[0].src.buf;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  const int is_hbd = seq_params->use_highbitdepth;
-  const int bit_depth = seq_params->bit_depth;
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  int colors;
-  if (is_hbd) {
-    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
-                                     count_buf);
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi) {
+  if (mode_idx < INTRA_MODE_END) {
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
   } else {
-    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
-  }
-
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  if (colors > 1 && colors <= 64) {
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[PALETTE_MAX_SIZE];
-    int lb, ub;
-    if (is_hbd) {
-      int *data_pt = data;
-      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
-      lb = ub = src_pt[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    } else {
-      int *data_pt = data;
-      const uint8_t *src_pt = src;
-      lb = ub = src[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    }
-
-    mbmi->mode = DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra = 0;
-
-    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-
-    // Find the dominant colors, stored in top_colors[].
-    int top_colors[PALETTE_MAX_SIZE] = { 0 };
-    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
-      int max_count = 0;
-      for (int j = 0; j < (1 << bit_depth); ++j) {
-        if (count_buf[j] > max_count) {
-          max_count = count_buf[j];
-          top_colors[i] = j;
-        }
-      }
-      assert(max_count > 0);
-      count_buf[top_colors[i]] = 0;
-    }
-
-    // Try the dominant colors directly.
-    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
-    // where the dominant colors and the k-means results are similar.
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
-        (colors > PALETTE_MIN_SIZE)) {
-      const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE);
-      assert(PALETTE_MAX_SIZE == 8);
-      assert(PALETTE_MIN_SIZE == 2);
-      // Choose the start index and step size for coarse search based on number
-      // of colors
-      const int start_n = start_n_lookup_table[end_n];
-      const int step_size = step_size_lookup_table[end_n];
-      // Perform top color coarse palette search to find the winner candidate
-      const int top_color_winner = perform_top_color_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for dominant colors
-      if (top_color_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, top_color_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      // Perform k-means coarse palette search to find the winner candidate
-      const int k_means_winner = perform_k_means_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-          rows * cols);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for k-means
-      if (k_means_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, k_means_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map, color_map, rows * cols);
-      }
-    } else {
-      const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE),
-                end_n = PALETTE_MIN_SIZE;
-      // Perform top color palette search from start_n
-      const int top_color_winner = perform_top_color_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n,
-          end_n - 1, -1, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
-
-      if (top_color_winner > end_n) {
-        // Perform top color palette search in reverse order for the remaining
-        // colors
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n,
-            top_color_winner, 1, color_cache, n_cache, best_mbmi,
-            best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      if (colors == PALETTE_MIN_SIZE) {
-        // Special case: These colors automatically become the centroids.
-        assert(colors == 2);
-        centroids[0] = lb;
-        centroids[1] = ub;
-        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
-                     color_cache, n_cache, best_mbmi, best_palette_color_map,
-                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                     NULL);
-      } else {
-        // Perform k-means palette search from start_n
-        const int k_means_winner = perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1,
-            -1, color_cache, n_cache, best_mbmi, best_palette_color_map,
-            best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-            rows * cols);
-        if (k_means_winner > end_n) {
-          // Perform k-means palette search in reverse order for the remaining
-          // colors
-          perform_k_means_palette_search(
-              cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n,
-              k_means_winner, 1, color_cache, n_cache, best_mbmi,
-              best_palette_color_map, best_rd, best_model_rd, rate,
-              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-              best_blk_skip, tx_type_map, color_map, rows * cols);
-        }
-      }
-    }
+    mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+    int angle_delta = (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
+    mbmi->angle_delta[PLANE_TYPE_Y] =
+        (angle_delta < 3 ? (angle_delta - 3) : (angle_delta - 2));
   }
-
-  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           block_width * block_height * sizeof(best_palette_color_map[0]));
-  }
-  *mbmi = *best_mbmi;
 }
 
-static AOM_INLINE void rd_pick_palette_intra_sbuv(
-    const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost,
-    uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi,
-    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           mbmi->sb_type));
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  int this_rate;
-  int64_t this_rd;
-  int colors_u, colors_v, colors;
-  const int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  RD_STATS tokenonly_rd_stats;
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  mbmi->uv_mode = UV_DC_PRED;
-
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (seq_params->use_highbitdepth) {
-    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
+static AOM_INLINE int get_model_rd_index_for_pruning(
+    const MACROBLOCK *const x,
+    const INTRA_MODE_SPEED_FEATURES *const intra_sf) {
+  const int top_intra_model_count_allowed =
+      intra_sf->top_intra_model_count_allowed;
+  if (!intra_sf->adapt_top_model_rd_count_using_neighbors)
+    return top_intra_model_count_allowed - 1;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const PREDICTION_MODE mode = xd->mi[0]->mode;
+  int model_rd_index_for_pruning = top_intra_model_count_allowed - 1;
+  int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0;
+  if (xd->left_available)
+    is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode;
+  if (xd->up_available)
+    is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode;
+  // The pruning of luma intra modes is made more aggressive at lower quantizers
+  // and vice versa. The value for model_rd_index_for_pruning is derived as
+  // follows.
+  // qidx 0 to 127: Reduce the index of a candidate used for comparison only if
+  // the current mode does not match either of the available neighboring modes.
+  // qidx 128 to 255: Reduce the index of a candidate used for comparison only
+  // if the current mode does not match both the available neighboring modes.
+  if (x->qindex <= 127) {
+    if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode)
+      model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
   } else {
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
-  }
-
-  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-
-  colors = colors_u > colors_v ? colors_u : colors_v;
-  if (colors > 1 && colors <= 64) {
-    int r, c, n, i, j;
-    const int max_itr = 50;
-    int lb_u, ub_u, val_u;
-    int lb_v, ub_v, val_v;
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[2 * PALETTE_MAX_SIZE];
-
-    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
-    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (seq_params->use_highbitdepth) {
-      lb_u = src_u16[0];
-      ub_u = src_u16[0];
-      lb_v = src_v16[0];
-      ub_v = src_v16[0];
-    } else {
-      lb_u = src_u[0];
-      ub_u = src_u[0];
-      lb_v = src_v[0];
-      ub_v = src_v[0];
-    }
-
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) {
-        if (seq_params->use_highbitdepth) {
-          val_u = src_u16[r * src_stride + c];
-          val_v = src_v16[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        } else {
-          val_u = src_u[r * src_stride + c];
-          val_v = src_v[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        }
-        if (val_u < lb_u)
-          lb_u = val_u;
-        else if (val_u > ub_u)
-          ub_u = val_u;
-        if (val_v < lb_v)
-          lb_v = val_v;
-        else if (val_v > ub_v)
-          ub_v = val_v;
-      }
-    }
-
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
-      for (i = 0; i < n; ++i) {
-        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
-        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
-      }
-      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
-      // Sort the U channel colors in ascending order.
-      for (i = 0; i < 2 * (n - 1); i += 2) {
-        int min_idx = i;
-        int min_val = centroids[i];
-        for (j = i + 2; j < 2 * n; j += 2)
-          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
-        if (min_idx != i) {
-          int temp_u = centroids[i], temp_v = centroids[i + 1];
-          centroids[i] = centroids[min_idx];
-          centroids[i + 1] = centroids[min_idx + 1];
-          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
-        }
-      }
-      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-      extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                               plane_block_height);
-      pmi->palette_size[1] = n;
-      for (i = 1; i < 3; ++i) {
-        for (j = 0; j < n; ++j) {
-          if (seq_params->use_highbitdepth)
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
-          else
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel((int)centroids[j * 2 + i - 1]);
-        }
-      }
-
-      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate +
-                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
-      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        *best_mbmi = *mbmi;
-        memcpy(best_palette_color_map, color_map,
-               plane_block_width * plane_block_height *
-                   sizeof(best_palette_color_map[0]));
-        *rate = this_rate;
-        *distortion = tokenonly_rd_stats.dist;
-        *rate_tokenonly = tokenonly_rd_stats.rate;
-        *skippable = tokenonly_rd_stats.skip;
-      }
-    }
-  }
-  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           plane_block_width * plane_block_height *
-               sizeof(best_palette_color_map[0]));
+    if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode)
+      model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
   }
+  return model_rd_index_for_pruning;
 }
 
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  int *const data = x->palette_buffer->kmeans_data_buf;
-  int centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  int r, c;
-  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
-  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
-        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
-      } else {
-        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+                       int model_rd_index_for_pruning) {
+  const double thresh_best = 1.50;
+  const double thresh_top = 1.00;
+  for (int i = 0; i < max_model_cnt_allowed; i++) {
+    if (this_model_rd < top_intra_model_rd[i]) {
+      for (int j = max_model_cnt_allowed - 1; j > i; j--) {
+        top_intra_model_rd[j] = top_intra_model_rd[j - 1];
       }
+      top_intra_model_rd[i] = this_model_rd;
+      break;
     }
   }
+  if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX &&
+      this_model_rd >
+          thresh_top * top_intra_model_rd[model_rd_index_for_pruning])
+    return 1;
 
-  for (r = 1; r < 3; ++r) {
-    for (c = 0; c < pmi->palette_size[1]; ++c) {
-      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
-    }
-  }
-
-  av1_calc_indices(data, centroids, color_map, rows * cols,
-                   pmi->palette_size[1], 2);
-  extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                           plane_block_height);
-}
-
-static AOM_INLINE void choose_intra_uv_mode(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
-    TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv,
-    int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  // Use an estimated rd for uv_intra based on DC_PRED if the
-  // appropriate speed flag is set.
-  init_sbuv_mode(mbmi);
-  if (!xd->is_chroma_ref) {
-    *rate_uv = 0;
-    *rate_uv_tokenonly = 0;
-    *dist_uv = 0;
-    *skip_uv = 1;
-    *mode_uv = UV_DC_PRED;
-    return;
-  }
-
-  // Only store reconstructed luma when there's chroma RDO. When there's no
-  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
-  if (xd->cfl.store_y) {
-    // Restore reconstructed luma values.
-    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
-                                 DRY_RUN_NORMAL,
-                                 cpi->optimize_seg_arr[mbmi->segment_id]);
-    xd->cfl.store_y = 0;
-  }
-  av1_rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
-                              skip_uv, bsize, max_tx_size);
-  *mode_uv = mbmi->uv_mode;
+  if (this_model_rd != INT64_MAX &&
+      this_model_rd > thresh_best * (*best_model_rd))
+    return 1;
+  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+  return 0;
 }
 
 // Run RD calculation with given chroma intra prediction angle., and return
@@ -1251,13 +459,20 @@ static int64_t pick_intra_angle_routine_sbuv(
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip = tokenonly_rd_stats.skip;
+    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
   }
   return this_rd;
 }
 
-// With given chroma directional intra prediction mode, pick the best angle
-// delta. Return true if a RD cost that is smaller than the input one is found.
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns Return if there is a new mode with smaller rdcost than best_rd.
+ */
 static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, int rate_overhead,
                                     int64_t best_rd, int *rate,
@@ -1269,7 +484,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
 
   rd_stats->rate = INT_MAX;
-  rd_stats->skip = 0;
+  rd_stats->skip_txfm = 0;
   rd_stats->dist = INT64_MAX;
   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
 
@@ -1315,186 +530,321 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 
 #define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
   (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
-static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
-                             TX_SIZE tx_size, int64_t best_rd) {
+
+static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign,
+                                      int *cfl_alpha) {
+  int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO;
+  if (cfl_linear_idx == 0) {
+    *cfl_sign = CFL_SIGN_ZERO;
+    *cfl_alpha = 0;
+  } else {
+    *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG;
+    *cfl_alpha = abs(cfl_linear_idx) - 1;
+  }
+}
+
+static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              int plane, TX_SIZE tx_size,
+                              BLOCK_SIZE plane_bsize, int cfl_idx,
+                              int fast_mode, RD_STATS *rd_stats) {
+  assert(IMPLIES(fast_mode, rd_stats == NULL));
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U];
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+  int cfl_plane = get_cfl_pred_type(plane);
+  CFL_SIGN_TYPE cfl_sign;
+  int cfl_alpha;
+  cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
+  // We conly build CFL for a given plane, the other plane's sign is dummy
+  int dummy_sign = CFL_SIGN_NEG;
+  const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
+  const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
+  mbmi->cfl_alpha_signs =
+      PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
+  mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
+  int64_t cfl_cost;
+  if (fast_mode) {
+    cfl_cost =
+        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+  } else {
+    av1_init_rd_stats(rd_stats);
+    av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
+                         tx_size, FTXS_NONE, 0);
+    av1_rd_cost_update(x->rdmult, rd_stats);
+    cfl_cost = rd_stats->rdcost;
+  }
+  mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;
+  mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;
+  return cfl_cost;
+}
 
-  assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
-  assert(plane_bsize < BLOCK_SIZES_ALL);
-  if (!xd->lossless[mbmi->segment_id]) {
-    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
-    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
-  }
+static void cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     int plane, TX_SIZE tx_size,
+                                     int cfl_search_range,
+                                     RD_STATS cfl_rd_arr[CFL_MAGS_SIZE]) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+  MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->cfl.use_dc_pred_cache = 1;
-  const int64_t mode_rd =
-      RDCOST(x->rdmult,
-             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
-  int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-  int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-#if CONFIG_DEBUG
-  int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
-#endif  // CONFIG_DEBUG
-
-  const int skip_trellis = 0;
-  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
-    RD_STATS rd_stats;
-    av1_init_rd_stats(&rd_stats);
-    for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-      best_rd_uv[joint_sign][plane] = INT64_MAX;
-      best_c[joint_sign][plane] = 0;
-    }
-    // Collect RD stats for an alpha value of zero in this plane.
-    // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
-    for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
-      const int8_t joint_sign =
-          PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
-      if (i == CFL_SIGN_NEG) {
-        mbmi->cfl_alpha_idx = 0;
-        mbmi->cfl_alpha_signs = joint_sign;
-        av1_txfm_rd_in_plane(
-            x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
-            cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
-        if (rd_stats.rate == INT_MAX) break;
-      }
-      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
-      best_rd_uv[joint_sign][plane] =
-          RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
-#if CONFIG_DEBUG
-      best_rate_uv[joint_sign][plane] = rd_stats.rate;
-#endif  // CONFIG_DEBUG
-    }
-  }
 
-  int8_t best_joint_sign = -1;
-
-  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
-    for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
-      int progress = 0;
-      for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
-        int flag = 0;
-        RD_STATS rd_stats;
-        if (c > 2 && progress < c) break;
-        av1_init_rd_stats(&rd_stats);
-        for (int i = 0; i < CFL_SIGNS; i++) {
-          const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
-          if (i == 0) {
-            mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
-            mbmi->cfl_alpha_signs = joint_sign;
-            av1_txfm_rd_in_plane(
-                x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size,
-                cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis);
-            if (rd_stats.rate == INT_MAX) break;
-          }
-          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
-          int64_t this_rd =
-              RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
-          if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
-          best_rd_uv[joint_sign][plane] = this_rd;
-          best_c[joint_sign][plane] = c;
-#if CONFIG_DEBUG
-          best_rate_uv[joint_sign][plane] = rd_stats.rate;
-#endif  // CONFIG_DEBUG
-          flag = 2;
-          if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
-          this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
-          if (this_rd >= best_rd) continue;
-          best_rd = this_rd;
-          best_joint_sign = joint_sign;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->uv_mode == UV_CFL_PRED);
+  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+  const int dir_ls[2] = { 1, -1 };
+
+  int est_best_cfl_idx = CFL_INDEX_ZERO;
+  if (cfl_search_range < CFL_MAGS_SIZE) {
+    int fast_mode = 1;
+    int start_cfl_idx = CFL_INDEX_ZERO;
+    int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+                                           start_cfl_idx, fast_mode, NULL);
+    for (int si = 0; si < 2; ++si) {
+      const int dir = dir_ls[si];
+      for (int i = 1; i < CFL_MAGS_SIZE; ++i) {
+        int cfl_idx = start_cfl_idx + dir * i;
+        if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+        int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+                                          cfl_idx, fast_mode, NULL);
+        if (cfl_cost < best_cfl_cost) {
+          best_cfl_cost = cfl_cost;
+          est_best_cfl_idx = cfl_idx;
+        } else {
+          break;
         }
-        progress += flag;
       }
     }
   }
 
-  int best_rate_overhead = INT_MAX;
-  uint8_t ind = 0;
-  if (best_joint_sign >= 0) {
-    const int u = best_c[best_joint_sign][CFL_PRED_U];
-    const int v = best_c[best_joint_sign][CFL_PRED_V];
-    ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
-    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
-                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
-#if CONFIG_DEBUG
-    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
-                   best_rate_overhead +
-                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
-                   best_rate_uv[best_joint_sign][CFL_PRED_V];
-#endif  // CONFIG_DEBUG
-  } else {
-    best_joint_sign = 0;
+  for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
+    av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
   }
 
-  mbmi->cfl_alpha_idx = ind;
-  mbmi->cfl_alpha_signs = best_joint_sign;
+  int fast_mode = 0;
+  int start_cfl_idx = est_best_cfl_idx;
+  cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
+                 &cfl_rd_arr[start_cfl_idx]);
+  for (int si = 0; si < 2; ++si) {
+    const int dir = dir_ls[si];
+    for (int i = 1; i < cfl_search_range; ++i) {
+      int cfl_idx = start_cfl_idx + dir * i;
+      if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+      cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
+                     &cfl_rd_arr[cfl_idx]);
+    }
+  }
   xd->cfl.use_dc_pred_cache = 0;
   xd->cfl.dc_pred_is_cached[0] = 0;
   xd->cfl.dc_pred_is_cached[1] = 0;
-  return best_rate_overhead;
+}
+
+/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
+ * Side effect:
+ * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD
+ * search.
+ *
+ * \param[in] x                Encoder prediction block structure.
+ * \param[in] cpi              Top-level encoder instance structure.
+ * \param[in] tx_size          Transform size.
+ * \param[in] ref_best_rd      Reference best RD.
+ * \param[in] cfl_search_range The search range of full RD search near the
+ *                             estimated best CFL parameter.
+ *
+ * \param[out]   best_rd_stats          RD stats of the best CFL parameter
+ * \param[out]   best_cfl_alpha_idx     Best CFL alpha index
+ * \param[out]   best_cfl_alpha_signs   Best CFL joint signs
+ *
+ */
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+                             TX_SIZE tx_size, int64_t ref_best_rd,
+                             int cfl_search_range, RD_STATS *best_rd_stats,
+                             uint8_t *best_cfl_alpha_idx,
+                             int8_t *best_cfl_alpha_signs) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+  const ModeCosts *mode_costs = &x->mode_costs;
+  RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+  RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+
+  av1_invalid_rd_stats(best_rd_stats);
+
+  cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u);
+  cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v);
+
+  for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
+    if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
+    int cfl_alpha_u;
+    CFL_SIGN_TYPE cfl_sign_u;
+    cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
+    for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
+      if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
+      int cfl_alpha_v;
+      CFL_SIGN_TYPE cfl_sign_v;
+      cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
+      // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
+      // valid parameter for CFL
+      if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
+      int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+      RD_STATS rd_stats = cfl_rd_arr_u[ui];
+      av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+      if (rd_stats.rate != INT_MAX) {
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+      }
+      av1_rd_cost_update(x->rdmult, &rd_stats);
+      if (rd_stats.rdcost < best_rd_stats->rdcost) {
+        *best_rd_stats = rd_stats;
+        *best_cfl_alpha_idx =
+            (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+        *best_cfl_alpha_signs = joint_sign;
+      }
+    }
+  }
+  if (best_rd_stats->rdcost >= ref_best_rd) {
+    av1_invalid_rd_stats(best_rd_stats);
+    // Set invalid CFL parameters here since the rdcost is not better than
+    // ref_best_rd.
+    *best_cfl_alpha_idx = 0;
+    *best_cfl_alpha_signs = 0;
+    return 0;
+  }
+  return 1;
 }
 
 int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
                                     BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   MB_MODE_INFO best_mbmi = *mbmi;
   int64_t best_rd = INT64_MAX, this_rd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+
+  init_sbuv_mode(mbmi);
 
+  // Return if the current block does not correspond to a chroma block.
+  if (!xd->is_chroma_ref) {
+    *rate = 0;
+    *rate_tokenonly = 0;
+    *distortion = 0;
+    *skippable = 1;
+    return INT64_MAX;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+    // this function everytime we search through uv modes. There is some
+    // potential speed up here if we cache the result to avoid redundant
+    // computation.
+    av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y,
+                                 DRY_RUN_NORMAL,
+                                 cpi->optimize_seg_arr[mbmi->segment_id]);
+    xd->cfl.store_y = 0;
+  }
+  IntraModeSearchState intra_search_state;
+  init_intra_mode_search_state(&intra_search_state);
+
+  // Search through all non-palette modes.
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
     RD_STATS tokenonly_rd_stats;
     UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+    const int is_diagonal_mode = av1_is_diagonal_mode(get_uv_mode(mode));
     const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+
+    if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
+      continue;
+    if (is_directional_mode &&
+        !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+      continue;
+
     if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
       continue;
-    if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
+    if (!intra_mode_cfg->enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
         mode <= UV_SMOOTH_H_PRED)
       continue;
 
-    if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+    if (!intra_mode_cfg->enable_paeth_intra && mode == UV_PAETH_PRED) continue;
+
+    assert(mbmi->mode < INTRA_MODES);
+    if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
+        !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << mode)))
+      continue;
 
     mbmi->uv_mode = mode;
-    int cfl_alpha_rate = 0;
+
+    // Init variables for cfl and angle delta
+    const SPEED_FEATURES *sf = &cpi->sf;
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
     if (mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
+      if (!is_cfl_allowed(xd) || !intra_mode_cfg->enable_cfl_intra) continue;
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
-      if (cfl_alpha_rate == INT_MAX) continue;
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
-        cpi->oxcf.enable_angle_delta) {
+      if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd,
+                             sf->intra_sf.cfl_search_range, &tokenonly_rd_stats,
+                             &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) {
+        continue;
+      }
+    } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+               intra_mode_cfg->enable_angle_delta) {
+      if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+          !intra_search_state.dir_mode_skip_mask_ready) {
+        static const float thresh[2][4] = {
+          { -1.2f, 0.0f, 0.0f, 1.2f },    // Interframe
+          { -1.2f, -1.2f, -0.6f, 0.4f },  // Intraframe
+        };
+        const int is_chroma = 1;
+        const int is_intra_frame = frame_is_intra_only(cm);
+        prune_intra_mode_with_hog(
+            x, bsize, cm->seq_params->sb_size,
+            thresh[is_intra_frame]
+                  [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+            intra_search_state.directional_mode_skip_mask, is_chroma);
+        intra_search_state.dir_mode_skip_mask_ready = 1;
+      }
+      if (intra_search_state.directional_mode_skip_mask[mode]) {
+        continue;
+      }
+
+      // Search through angle delta
       const int rate_overhead =
-          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
+          mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
         continue;
     } else {
+      // Predict directly if we don't need to search for angle delta.
       if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
         continue;
       }
     }
     const int mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
-        cfl_alpha_rate;
+        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
     this_rate = tokenonly_rd_stats.rate +
                 intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
-    if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
-#if CONFIG_DEBUG
-      if (!xd->lossless[mbmi->segment_id])
-        assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
-#endif  // CONFIG_DEBUG
-    }
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
     if (this_rd < best_rd) {
@@ -1503,19 +853,21 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
+      *skippable = tokenonly_rd_stats.skip_txfm;
     }
   }
 
+  // Search palette mode
   const int try_palette =
-      cpi->oxcf.enable_palette &&
+      cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                        mbmi->sb_type);
+                        mbmi->bsize);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(
+    av1_rd_pick_palette_intra_sbuv(
         cpi, x,
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+        mode_costs
+            ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
         best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
         distortion, skippable);
   }
@@ -1526,28 +878,29 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
-                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
-                            PALETTE_MODE_INFO *const pmi,
-                            unsigned int *ref_costs_single,
-                            IntraModeSearchState *intra_search_state,
+// Searches palette mode for luma channel in inter frame.
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
                             int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   int rate2 = 0;
-  int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
-          best_model_rd_palette = INT64_MAX;
+  int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd;
   int skippable = 0;
-  TX_SIZE uv_tx = TX_4X4;
   uint8_t *const best_palette_color_map =
       x->palette_buffer->best_palette_color_map;
   uint8_t *const color_map = xd->plane[0].color_index_map;
   MB_MODE_INFO best_mbmi_palette = *mbmi;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const intra_mode_cost =
+      mode_costs->mbmode_cost[size_group_lookup[bsize]];
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
 
@@ -1555,37 +908,44 @@ int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
+  av1_zero(pmi->palette_size);
+
   RD_STATS rd_stats_y;
   av1_invalid_rd_stats(&rd_stats_y);
-  rd_pick_palette_intra_sby(
-      cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
-      best_palette_color_map, &best_rd_palette, &best_model_rd_palette,
-      &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip, NULL, ctx,
-      best_blk_skip, best_tx_type_map);
+  av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+                                &best_mbmi_palette, best_palette_color_map,
+                                &best_rd_palette, &rd_stats_y.rate, NULL,
+                                &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+                                ctx, best_blk_skip, best_tx_type_map);
   if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
     this_rd_cost->rdcost = INT64_MAX;
     return skippable;
   }
 
-  memcpy(x->blk_skip, best_blk_skip,
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
          sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
   av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
   memcpy(color_map, best_palette_color_map,
          rows * cols * sizeof(best_palette_color_map[0]));
 
-  skippable = rd_stats_y.skip;
+  skippable = rd_stats_y.skip_txfm;
   distortion2 = rd_stats_y.dist;
-  rate2 = rd_stats_y.rate + ref_costs_single[INTRA_FRAME];
+  rate2 = rd_stats_y.rate + ref_frame_cost;
   if (num_planes > 1) {
-    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
     if (intra_search_state->rate_uv_intra == INT_MAX) {
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
-          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
-          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
+      // We have not found any good uv mode yet, so we need to search for it.
+      TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+      av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+                                  &intra_search_state->rate_uv_tokenonly,
+                                  &intra_search_state->dist_uvs,
+                                  &intra_search_state->skip_uvs, bsize, uv_tx);
+      intra_search_state->mode_uv = mbmi->uv_mode;
       intra_search_state->pmi_uv = *pmi;
       intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
     }
+
+    // We have found at least one good uv mode before, so copy and paste it
+    // over.
     mbmi->uv_mode = intra_search_state->mode_uv;
     pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
     if (pmi->palette_size[1] > 0) {
@@ -1602,9 +962,9 @@ int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
   if (skippable) {
     rate2 -= rd_stats_y.rate;
     if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+    rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
   } else {
-    rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+    rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
   }
   this_rd = RDCOST(x->rdmult, rate2, distortion2);
   this_rd_cost->rate = rate2;
@@ -1613,7 +973,76 @@ int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
   return skippable;
 }
 
-// Given selected prediction mode, search for the best tx type and size.
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  RD_STATS *this_rd_cost, int64_t best_rd) {
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t best_rd_palette = best_rd, this_rd;
+  uint8_t *const best_palette_color_map =
+      x->palette_buffer->best_palette_color_map;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  MB_MODE_INFO best_mbmi_palette = *mbmi;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *const intra_mode_cost =
+      mode_costs->mbmode_cost[size_group_lookup[bsize]];
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  av1_zero(pmi->palette_size);
+
+  RD_STATS rd_stats_y;
+  av1_invalid_rd_stats(&rd_stats_y);
+  av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+                                &best_mbmi_palette, best_palette_color_map,
+                                &best_rd_palette, &rd_stats_y.rate, NULL,
+                                &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+                                ctx, best_blk_skip, best_tx_type_map);
+  if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+    this_rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+  memcpy(color_map, best_palette_color_map,
+         rows * cols * sizeof(best_palette_color_map[0]));
+
+  rd_stats_y.rate += ref_frame_cost;
+
+  if (rd_stats_y.skip_txfm) {
+    rd_stats_y.rate =
+        ref_frame_cost +
+        mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+  } else {
+    rd_stats_y.rate +=
+        mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+  }
+  this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist);
+  this_rd_cost->rate = rd_stats_y.rate;
+  this_rd_cost->dist = rd_stats_y.dist;
+  this_rd_cost->rdcost = this_rd;
+  this_rd_cost->skip_txfm = rd_stats_y.skip_txfm;
+}
+
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
 static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                       BLOCK_SIZE bsize, const int *bmode_costs,
                                       int64_t *best_rd, int *rate,
@@ -1628,7 +1057,7 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
   if (rd_stats.rate == INT_MAX) return 0;
   int this_rate_tokenonly = rd_stats.rate;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
     // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
     // in the tokenonly rate, but for intra blocks, tx_size is always coded
     // (prediction granularity), so we account for it in the full rate,
@@ -1637,7 +1066,7 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
   const int this_rate =
       rd_stats.rate +
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
   const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
   if (this_rd < *best_rd) {
     *best_mbmi = *mbmi;
@@ -1645,300 +1074,245 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
     *rate = this_rate;
     *rate_tokenonly = this_rate_tokenonly;
     *distortion = rd_stats.dist;
-    *skippable = rd_stats.skip;
-    av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+    *skippable = rd_stats.skip_txfm;
+    av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+                   ctx->num_4x4_blk);
     av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     return 1;
   }
   return 0;
 }
 
-// With given luma directional intra prediction mode, pick the best angle delta
-// Return the RD cost corresponding to the best angle delta.
-static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int *rate, RD_STATS *rd_stats,
-                                       BLOCK_SIZE bsize, int mode_cost,
-                                       int64_t best_rd, int64_t *best_model_rd,
-                                       int skip_model_rd_for_zero_deg) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
+/*!\brief Search for the best filter_intra mode when coding inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns nothing, but updates the mbmi and rd_stats.
+ */
+static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                                            BLOCK_SIZE bsize,
+                                            const PICK_MODE_CONTEXT *ctx,
+                                            RD_STATS *rd_stats_y, int mode_cost,
+                                            int64_t best_rd,
+                                            int64_t best_rd_so_far) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->mode == DC_PRED &&
+         av1_filter_intra_allowed_bsize(&cpi->common, bsize));
 
-  int best_angle_delta = 0;
-  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+  RD_STATS rd_stats_y_fi;
+  int filter_intra_selected_flag = 0;
   TX_SIZE best_tx_size = mbmi->tx_size;
+  FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+         sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
-
-  int first_try = 1;
-  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      const int64_t best_rd_in =
-          (best_rd == INT64_MAX) ? INT64_MAX
-                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-      const int64_t this_rd = calc_rd_given_intra_angle(
-          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
-          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-          &best_rd, best_model_rd, best_tx_type_map, best_blk_skip,
-          (skip_model_rd_for_zero_deg & !angle_delta));
-      rd_cost[2 * angle_delta + i] = this_rd;
-      if (first_try && this_rd == INT64_MAX) return best_rd;
-      first_try = 0;
-      if (angle_delta == 0) {
-        rd_cost[1] = this_rd;
-        break;
-      }
+  av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
+       ++fi_mode) {
+    mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
+    if (rd_stats_y_fi.rate == INT_MAX) continue;
+    const int this_rate_tmp =
+        rd_stats_y_fi.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+    const int64_t this_rd_tmp =
+        RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+    if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+      break;
     }
-  }
-
-  assert(best_rd != INT64_MAX);
-  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      int skip_search = 0;
-      const int64_t rd_thresh = best_rd + (best_rd >> 5);
-      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-        skip_search = 1;
-      if (!skip_search) {
-        calc_rd_given_intra_angle(
-            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
-            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-            &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0);
-      }
+    if (this_rd_tmp < best_rd_so_far) {
+      best_tx_size = mbmi->tx_size;
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+      best_fi_mode = fi_mode;
+      *rd_stats_y = rd_stats_y_fi;
+      filter_intra_selected_flag = 1;
+      best_rd_so_far = this_rd_tmp;
     }
   }
 
-  if (rd_stats->rate != INT_MAX) {
-    mbmi->tx_size = best_tx_size;
-    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
-    const int n4 = bsize_to_num_blk(bsize);
-    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(xd->tx_type_map, best_tx_type_map, n4);
+  mbmi->tx_size = best_tx_size;
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+
+  if (filter_intra_selected_flag) {
+    mbmi->filter_intra_mode_info.use_filter_intra = 1;
+    mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+  } else {
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
   }
-  return best_rd;
 }
 
-int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
-                              const AV1_COMP *cpi, MACROBLOCK *x,
-                              BLOCK_SIZE bsize, int ref_frame_cost,
-                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                              RD_STATS *rd_stats_uv, int64_t best_rd,
-                              int64_t *best_intra_rd, int8_t best_mbmode_skip) {
+// Evaluate a given luma intra-mode in inter frames.
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+                            int64_t *best_model_rd,
+                            int64_t top_intra_model_rd[]) {
   const AV1_COMMON *cm = &cpi->common;
-  const SPEED_FEATURES *const sf = &cpi->sf;
+  const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(mbmi->ref_frame[0] == INTRA_FRAME);
   const PREDICTION_MODE mode = mbmi->mode;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int mode_cost =
-      x->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+      mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+  int known_rate = mode_cost;
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
       cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
-      cm->seq_params.bit_depth);
-  const int skip_ctx = av1_get_skip_context(xd);
+      cm->seq_params->bit_depth);
 
-  int known_rate = mode_cost;
-  known_rate += ref_frame_cost;
   if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
-  known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
+                       mode_costs->skip_txfm_cost[skip_ctx][1]);
   const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
   if (known_rd > best_rd) {
     intra_search_state->skip_intra_modes = 1;
-    return INT64_MAX;
+    return 0;
   }
 
   const int is_directional_mode = av1_is_directional_mode(mode);
   if (is_directional_mode && av1_use_angle_delta(bsize) &&
-      cpi->oxcf.enable_angle_delta) {
-    if (sf->intra_sf.intra_pruning_with_hog &&
-        !intra_search_state->angle_stats_ready) {
-      prune_intra_mode_with_hog(x, bsize,
-                                cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
-                                intra_search_state->directional_mode_skip_mask);
-      intra_search_state->angle_stats_ready = 1;
+      cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
+    if (intra_sf->intra_pruning_with_hog &&
+        !intra_search_state->dir_mode_skip_mask_ready) {
+      const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
+      const int is_chroma = 0;
+      prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size,
+                                thresh[intra_sf->intra_pruning_with_hog - 1],
+                                intra_search_state->directional_mode_skip_mask,
+                                is_chroma);
+      intra_search_state->dir_mode_skip_mask_ready = 1;
     }
-    if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX;
-    av1_init_rd_stats(rd_stats_y);
-    rd_stats_y->rate = INT_MAX;
-    int64_t model_rd = INT64_MAX;
-    int rate_dummy;
-    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost,
-                            best_rd, &model_rd, 0);
-
-  } else {
-    av1_init_rd_stats(rd_stats_y);
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
+    if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
   }
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int64_t this_model_rd =
+      intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+  const int model_rd_index_for_pruning =
+      get_model_rd_index_for_pruning(x, intra_sf);
+
+  if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd,
+                         intra_sf->top_intra_model_count_allowed,
+                         model_rd_index_for_pruning))
+    return 0;
+  av1_init_rd_stats(rd_stats_y);
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
 
   // Pick filter intra modes.
   if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
-    int try_filter_intra = 0;
+    int try_filter_intra = 1;
     int64_t best_rd_so_far = INT64_MAX;
     if (rd_stats_y->rate != INT_MAX) {
+      // best_rd_so_far is the rdcost of DC_PRED without using filter_intra.
+      // Later, in filter intra search, best_rd_so_far is used for comparison.
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
       const int tmp_rate =
-          rd_stats_y->rate + x->filter_intra_cost[bsize][0] + mode_cost;
+          rd_stats_y->rate +
+          intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
       best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
       try_filter_intra = (best_rd_so_far / 2) <= best_rd;
-    } else {
-      try_filter_intra = !best_mbmode_skip;
+    } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) {
+      // As rd cost of luma intra dc mode is more than best_rd (i.e.,
+      // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes.
+      try_filter_intra = 0;
     }
 
     if (try_filter_intra) {
-      RD_STATS rd_stats_y_fi;
-      int filter_intra_selected_flag = 0;
-      TX_SIZE best_tx_size = mbmi->tx_size;
-      FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
-      uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-      uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      mbmi->filter_intra_mode_info.use_filter_intra = 1;
-      for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
-           fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
-        mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-        av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize,
-                                          best_rd);
-        if (rd_stats_y_fi.rate == INT_MAX) continue;
-        const int this_rate_tmp =
-            rd_stats_y_fi.rate +
-            intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
-        const int64_t this_rd_tmp =
-            RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
-
-        if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
-          break;
-        }
-        if (this_rd_tmp < best_rd_so_far) {
-          best_tx_size = mbmi->tx_size;
-          av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-          memcpy(best_blk_skip, x->blk_skip,
-                 sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-          best_fi_mode = fi_mode;
-          *rd_stats_y = rd_stats_y_fi;
-          filter_intra_selected_flag = 1;
-          best_rd_so_far = this_rd_tmp;
-        }
-      }
-
-      mbmi->tx_size = best_tx_size;
-      av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
-      memcpy(x->blk_skip, best_blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-
-      if (filter_intra_selected_flag) {
-        mbmi->filter_intra_mode_info.use_filter_intra = 1;
-        mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
-      } else {
-        mbmi->filter_intra_mode_info.use_filter_intra = 0;
-      }
+      handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
+                               best_rd, best_rd_so_far);
     }
   }
 
-  if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
-
-  const int mode_cost_y =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
-  av1_init_rd_stats(rd_stats);
-  av1_init_rd_stats(rd_stats_uv);
-  const int num_planes = av1_num_planes(cm);
-  if (num_planes > 1) {
-    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-    const int try_palette =
-        cpi->oxcf.enable_palette &&
-        av1_allow_palette(cm->features.allow_screen_content_tools,
-                          mbmi->sb_type);
-    const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
-    if (intra_search_state->rate_uv_intra == INT_MAX) {
-      const int rate_y =
-          rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
-      const int64_t rdy =
-          RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
-      if (best_rd < (INT64_MAX / 2) && rdy > (best_rd + (best_rd >> 2))) {
-        intra_search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-      choose_intra_uv_mode(
-          cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra,
-          &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs,
-          &intra_search_state->skip_uvs, &intra_search_state->mode_uv);
-      if (try_palette) intra_search_state->pmi_uv = *pmi;
-      intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
-
-      const int uv_rate = intra_search_state->rate_uv_tokenonly;
-      const int64_t uv_dist = intra_search_state->dist_uvs;
-      const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
-      if (uv_rd > best_rd) {
-        intra_search_state->skip_intra_modes = 1;
-        return INT64_MAX;
-      }
-    }
+  if (rd_stats_y->rate == INT_MAX) return 0;
 
-    rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
-    rd_stats_uv->dist = intra_search_state->dist_uvs;
-    rd_stats_uv->skip = intra_search_state->skip_uvs;
-    rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
-    mbmi->uv_mode = intra_search_state->mode_uv;
-    if (try_palette) {
-      pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
-      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-             intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
-             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+  *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+  const int rate_y = rd_stats_y->skip_txfm
+                         ? mode_costs->skip_txfm_cost[skip_ctx][1]
+                         : rd_stats_y->rate;
+  *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist);
+  if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) {
+    intra_search_state->skip_intra_modes = 1;
+    return 0;
   }
 
-  rd_stats->rate = rd_stats_y->rate + mode_cost_y;
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
-    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
-    // in the tokenonly rate, but for intra blocks, tx_size is always coded
-    // (prediction granularity), so we account for it in the full rate,
-    // not the tokenonly rate.
-    rd_stats_y->rate -= tx_size_cost(x, bsize, mbmi->tx_size);
-  }
-  if (num_planes > 1 && xd->is_chroma_ref) {
-    const int uv_mode_cost =
-        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
-    rd_stats->rate +=
-        rd_stats_uv->rate +
-        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
-  }
-  if (mode != DC_PRED && mode != PAETH_PRED) {
-    rd_stats->rate += intra_cost_penalty;
-  }
+  return 1;
+}
 
-  // Intra block is always coded as non-skip
-  rd_stats->skip = 0;
-  rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
-  // Add in the cost of the no skip flag.
-  rd_stats->rate += x->skip_cost[skip_ctx][0];
-  // Calculate the final RD estimate for this mode.
-  const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-  // Keep record of best intra rd
-  if (this_rd < *best_intra_rd) {
-    *best_intra_rd = this_rd;
-    intra_search_state->best_intra_mode = mode;
-  }
+int av1_search_intra_uv_modes_in_interframe(
+    IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+    const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->ref_frame[0] == INTRA_FRAME);
+
+  // TODO(chiyotsai@google.com): Consolidate the chroma search code here with
+  // the one in av1_search_palette_mode.
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int try_palette =
+      cpi->oxcf.tool_cfg.enable_palette &&
+      av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize);
 
-  if (sf->intra_sf.skip_intra_in_interframe) {
-    if (best_rd < (INT64_MAX / 2) && this_rd > (best_rd + (best_rd >> 1)))
+  assert(intra_search_state->rate_uv_intra == INT_MAX);
+  if (intra_search_state->rate_uv_intra == INT_MAX) {
+    // If no good uv-predictor had been found, search for it.
+    const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+                                &intra_search_state->rate_uv_tokenonly,
+                                &intra_search_state->dist_uvs,
+                                &intra_search_state->skip_uvs, bsize, uv_tx);
+    intra_search_state->mode_uv = mbmi->uv_mode;
+    if (try_palette) intra_search_state->pmi_uv = *pmi;
+    intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+    const int uv_rate = intra_search_state->rate_uv_tokenonly;
+    const int64_t uv_dist = intra_search_state->dist_uvs;
+    const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+    if (uv_rd > best_rd) {
+      // If there is no good intra uv-mode available, we can skip all intra
+      // modes.
       intra_search_state->skip_intra_modes = 1;
+      return 0;
+    }
   }
 
-  if (!disable_skip) {
-    for (int i = 0; i < REFERENCE_MODES; ++i) {
-      intra_search_state->best_pred_rd[i] =
-          AOMMIN(intra_search_state->best_pred_rd[i], this_rd);
-    }
+  // If we are here, then the encoder has found at least one good intra uv
+  // predictor, so we can directly copy its statistics over.
+  // TODO(any): the stats here is not right if the best uv mode is CFL but the
+  // best y mode is palette.
+  rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+  rd_stats_uv->dist = intra_search_state->dist_uvs;
+  rd_stats_uv->skip_txfm = intra_search_state->skip_uvs;
+  rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm;
+  mbmi->uv_mode = intra_search_state->mode_uv;
+  if (try_palette) {
+    pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
   }
-  return this_rd;
+  mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+
+  return 1;
 }
 
-// This function is used only for intra_only frames
+// Finds the best non-intrabc mode on an intra frame.
 int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    int *rate, int *rate_tokenonly,
                                    int64_t *distortion, int *skippable,
@@ -1954,11 +1328,12 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   // function
   int beat_best_rd = 0;
   const int *bmode_costs;
+  const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const int try_palette =
-      cpi->oxcf.enable_palette &&
+      cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                        mbmi->sb_type);
+                        mbmi->bsize);
   uint8_t *best_palette_color_map =
       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
@@ -1967,13 +1342,19 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   const PREDICTION_MODE L = av1_left_block_mode(left_mi);
   const int above_ctx = intra_mode_context[A];
   const int left_ctx = intra_mode_context[L];
-  bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
+  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
 
   mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-  if (cpi->sf.intra_sf.intra_pruning_with_hog) {
-    prune_intra_mode_with_hog(x, bsize,
-                              cpi->sf.intra_sf.intra_pruning_with_hog_thresh,
-                              directional_mode_skip_mask);
+  const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+  if (intra_sf->intra_pruning_with_hog) {
+    // Less aggressive thresholds are used here than those used in inter frame
+    // encoding in av1_handle_intra_y_mode() because we want key frames/intra
+    // frames to have higher quality.
+    const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+    const int is_chroma = 0;
+    prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size,
+                              thresh[intra_sf->intra_pruning_with_hog - 1],
+                              directional_mode_skip_mask, is_chroma);
   }
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
   pmi->palette_size[0] = 0;
@@ -1982,47 +1363,89 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   set_mode_eval_params(cpi, x, MODE_EVAL);
 
   MB_MODE_INFO best_mbmi = *mbmi;
-  av1_zero(x->winner_mode_stats);
+  const int max_winner_mode_count =
+      winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type];
+  zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
   x->winner_mode_count = 0;
 
-  /* Y Search for intra prediction mode */
-  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+  // Searches the intra-modes except for intrabc, palette, and filter_intra.
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+       ++mode_idx) {
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
+    int is_diagonal_mode;
     int64_t this_distortion, this_rd;
-    mbmi->mode = intra_rd_search_mode_order[mode_idx];
-    if ((!cpi->oxcf.enable_smooth_intra ||
-         cpi->sf.intra_sf.disable_smooth_intra) &&
-        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-         mbmi->mode == SMOOTH_V_PRED))
+
+    is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode);
+    if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue;
+    if (av1_is_directional_mode(mbmi->mode) &&
+        !intra_mode_cfg->enable_directional_intra)
       continue;
-    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
 
-    if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode],
-                                  &best_model_rd)) {
+    // The smooth prediction mode appears to be more frequently picked
+    // than horizontal / vertical smooth prediction modes. Hence treat
+    // them differently in speed features.
+    if ((!intra_mode_cfg->enable_smooth_intra ||
+         intra_sf->disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED))
       continue;
-    }
+    if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED)
+      continue;
+
+    // The functionality of filter intra modes and smooth prediction
+    // overlap. Hence smooth prediction is pruned only if all the
+    // filter intra modes are enabled.
+    if (intra_sf->disable_smooth_intra &&
+        intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED)
+      continue;
+    if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+      continue;
+
+    // Skip the evaluation of modes that do not match with the winner mode in
+    // x->mb_mode_cache.
+    if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue;
 
     is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
-    if (is_directional_mode && av1_use_angle_delta(bsize) &&
-        cpi->oxcf.enable_angle_delta) {
-      this_rd_stats.rate = INT_MAX;
-      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
-                              bmode_costs[mbmi->mode], best_rd, &best_model_rd,
-                              1);
-    } else {
-      av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
-    }
+    if (is_directional_mode &&
+        !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+        mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
+
+    // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+    if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] &
+          (1 << mbmi->mode)))
+      continue;
+
+    const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+    const int64_t this_model_rd =
+        intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+    const int model_rd_index_for_pruning =
+        get_model_rd_index_for_pruning(x, intra_sf);
+
+    if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd,
+                           intra_sf->top_intra_model_count_allowed,
+                           model_rd_index_for_pruning))
+      continue;
+
+    // Builds the actual prediction. The prediction from
+    // model_intra_yrd_and_prune was just an estimation that did not take into
+    // account the effect of txfm pipeline, so we need to redo it for real
+    // here.
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     this_rate_tokenonly = this_rd_stats.rate;
     this_distortion = this_rd_stats.dist;
-    s = this_rd_stats.skip;
+    s = this_rd_stats.skip_txfm;
 
     if (this_rate_tokenonly == INT_MAX) continue;
 
-    if (!xd->lossless[mbmi->segment_id] &&
-        block_signals_txsize(mbmi->sb_type)) {
+    if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
       // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
       // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
       // coded (prediction granularity), so we account for it in the full rate,
@@ -2031,14 +1454,19 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
     this_rate =
         this_rd_stats.rate +
-        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
     this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+    // Visual quality adjustment based on recon vs source variance.
+    if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+      this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+    }
+
     // Collect mode stats for multiwinner mode processing
     const int txfm_search_done = 1;
     store_winner_mode_stats(
         &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
     if (this_rd < best_rd) {
       best_mbmi = *mbmi;
       best_rd = this_rd;
@@ -2049,26 +1477,30 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       *rate_tokenonly = this_rate_tokenonly;
       *distortion = this_distortion;
       *skippable = s;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
       av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     }
   }
 
+  // Searches palette
   if (try_palette) {
-    rd_pick_palette_intra_sby(
+    av1_rd_pick_palette_intra_sby(
         cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
-        &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
-        &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
+        &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+        ctx, ctx->blk_skip, ctx->tx_type_map);
   }
 
+  // Searches filter_intra
   if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
     if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                                  skippable, bsize, bmode_costs[DC_PRED],
-                                 &best_rd, &best_model_rd, ctx)) {
+                                 best_mbmi.mode, &best_rd, &best_model_rd,
+                                 ctx)) {
       best_mbmi = *mbmi;
     }
   }
+
   // No mode is identified with less rd value than best_rd passed to this
   // function. In such cases winner mode processing is not necessary and return
   // best_rd as INT64_MAX to indicate best mode is not identified
@@ -2077,7 +1509,7 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   // In multi-winner mode processing, perform tx search for few best modes
   // identified during mode evaluation. Winner mode processing uses best tx
   // configuration for tx search.
-  if (cpi->sf.winner_mode_sf.enable_multiwinner_mode_process) {
+  if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
     int best_mode_idx = 0;
     int block_width, block_height;
     uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
@@ -2086,7 +1518,7 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
       *mbmi = x->winner_mode_stats[mode_idx].mbmi;
-      if (is_winner_mode_processing_enabled(cpi, mbmi, mbmi->mode)) {
+      if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
         // Restore color_map of palette mode before winner mode processing
         if (mbmi->palette_mode_info.palette_size[0] > 0) {
           uint8_t *color_map_src =
@@ -2118,7 +1550,7 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     // If previous searches use only the default tx type/no R-D optimization of
     // quantized coeffs, do an extra search for the best tx type/better R-D
     // optimization of quantized coeffs
-    if (is_winner_mode_processing_enabled(cpi, mbmi, best_mbmi.mode)) {
+    if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
       // Set params for winner mode evaluation
       set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
       *mbmi = best_mbmi;
diff --git a/media/libaom/src/av1/encoder/intra_mode_search.h b/media/libaom/src/av1/encoder/intra_mode_search.h
index 4b5d31c3ec..0968558e7d 100644
--- a/media/libaom/src/av1/encoder/intra_mode_search.h
+++ b/media/libaom/src/av1/encoder/intra_mode_search.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
 #ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 #define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 
@@ -18,46 +21,308 @@
 extern "C" {
 #endif
 
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes an histogram of gradient speed features and a cache of uv
+ * prediction to avoid repeated search of chroma prediction.
+ */
 typedef struct IntraModeSearchState {
-  int skip_intra_modes;
+  /*!
+   * \brief The best luma intra-mode found so far
+   */
   PREDICTION_MODE best_intra_mode;
-  int angle_stats_ready;
+
+  /** \name Speed feature variables
+   * Variables to help with pruning some luma intra-modes during inter frame
+   * coding process.
+   */
+  /**@{*/
+  /*!
+   * \brief Whether to terminate all intra mode search.
+   */
+  int skip_intra_modes;
+  /*!
+   * \brief Whether a directional mode is pruned.
+   */
   uint8_t directional_mode_skip_mask[INTRA_MODES];
-  int rate_uv_intra;
-  int rate_uv_tokenonly;
-  int64_t dist_uvs;
-  int skip_uvs;
-  UV_PREDICTION_MODE mode_uv;
-  PALETTE_MODE_INFO pmi_uv;
-  int8_t uv_angle_delta;
-  int64_t best_pred_rd[REFERENCE_MODES];
+  /*!
+   * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+   */
+  int dir_mode_skip_mask_ready;
+  /**@}*/
+
+  /** \name Chroma mode search cache
+   * A cache of the best chroma prediction mode to avoid having to search for
+   * chroma predictions repeatedly in \ref
+   * av1_search_intra_uv_modes_in_interframe()
+   */
+  /**@{*/
+  int rate_uv_intra;          /*!< \brief Total rate to transmit uv_mode */
+  int rate_uv_tokenonly;      /*!< \brief Rate transmit txfm tokens */
+  int64_t dist_uvs;           /*!< \brief Distortion of the uv_mode's recon */
+  int skip_uvs;               /*!< \brief Whether the uv txfm is skippable */
+  UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+  PALETTE_MODE_INFO pmi_uv;   /*!< \brief Color map if mode_uv is palette */
+  int8_t uv_angle_delta;      /*!< \brief Angle delta if mode_uv directional */
+  /**@}*/
 } IntraModeSearchState;
 
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
-int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
-                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
-                            PALETTE_MODE_INFO *const pmi,
-                            unsigned int *ref_costs_single,
-                            IntraModeSearchState *intra_search_state,
-                            int64_t best_rd);
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out]    intra_search_state Structure to intra search state.
+ * \param[in]        cpi                Top-level encoder structure.
+ * \param[in,out]    x                  Pointer to structure holding all the
+ *                                      data for the current macroblock.
+ * \param[in]        bsize              Current partition block size.
+ * \param[in]        ref_frame_cost     The entropy cost for signaling that the
+ *                                      current ref frame is an intra frame.
+ * \param[in]        ctx                Structure to hold the number of 4x4 blks
+ *                                      to copy tx_type and txfm_skip arrays.
+ * \param[out]       rd_stats_y         Struct to keep track of the current
+ *                                      intra-mode's rd_stats (luma only).
+ * \param[in]        best_rd            Best RD seen for this block so far.
+ * \param[out]       mode_cost_y        The cost needed to signal the current
+ *                                      intra mode.
+ * \param[out]       rd_y               The rdcost of the chosen mode.
+ * \param[in]        best_model_rd      Best model RD seen for this block so far
+ * \param[in]        top_intra_model_rd Top intra model RD seen for this
+ *                                      block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with directional
+ * mode, a prune_mask computed with histogram of gradient is also stored in
+ * intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+                            int64_t *best_model_rd,
+                            int64_t top_intra_model_rd[]);
 
-int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    int *rate, int *rate_tokenonly,
-                                    int64_t *distortion, int *skippable,
-                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out]    intra_search_state Structure to intra search state.
+ * \param[in]        cpi                Top-level encoder structure.
+ * \param[in,out]    x                  Pointer to structure holding all the
+ *                                      data for the current macroblock.
+ * \param[in]        bsize              Current partition block size.
+ * \param[out]       rd_stats           Struct to keep track of the current
+ *                                      intra-mode's rd_stats (all planes).
+ * \param[out]       rd_stats_y         Struct to keep track of the current
+ *                                      intra-mode's rd_stats (luma only).
+ * \param[out]       rd_stats_uv        Struct to keep track of the current
+ *                                      intra-mode's rd_stats (chroma only).
+ * \param[in]        best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv)  are also
+ * updated. Moreover, in the first evocation of the function, the chroma intra
+ * mode result is cached in intra_search_state to be used in subsequent calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+    IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+    const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
 
-int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
-                              const AV1_COMP *cpi, MACROBLOCK *x,
-                              BLOCK_SIZE bsize, int ref_frame_cost,
-                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
-                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-                              RD_STATS *rd_stats_uv, int64_t best_rd,
-                              int64_t *best_intra_rd, int8_t best_mbmode_skip);
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in]    intra_search_state Structure to hold the best luma intra mode
+ *                                  and cache chroma prediction for speed up.
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    ref_frame_cost     The entropy cost for signaling that the
+ *                                  current ref frame is an intra frame.
+ * \param[in]    ctx                Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ * \param[in]    this_rd_cost       Struct to keep track of palette mode's
+ *                                  rd_stats.
+ * \param[in]    best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_costs, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+                            int64_t best_rd);
 
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    ref_frame_cost     The entropy cost for signaling that the
+ *                                  current ref frame is an intra frame.
+ * \param[in]    ctx                Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ * \param[in]    this_rd_cost       Struct to keep track of palette mode's
+ *                                  rd_stats.
+ * \param[in]    best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns nothing.
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[in]    rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ *                                  chroma block.
+ *                                  after the reconstruction.
+ * \param[in]    distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[in]    skippable          Whether we can skip txfm process.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    best_rd            Best RD seen for this block so far.
+ * \param[in]    ctx                Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
 int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    int *rate, int *rate_tokenonly,
                                    int64_t *distortion, int *skippable,
                                    BLOCK_SIZE bsize, int64_t best_rd,
                                    PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in]    cpi                Top-level encoder structure.
+ * \param[in]    x                  Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]    rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[in]    rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ *                                  chroma block.
+ *                                  after the reconstruction.
+ * \param[in]    distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[in]    skippable          Whether we can skip txfm process.
+ * \param[in]    bsize              Current partition block size.
+ * \param[in]    max_tx_size        The maximum tx_size available
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate and distortion, distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth, int *val_count,
+                             int *val_count_8bit, int *num_color_bins,
+                             int *num_colors);
+
+/*! \brief Initializes the \ref IntraModeSearchState struct.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  memset(intra_search_state, 0, sizeof(*intra_search_state));
+  intra_search_state->rate_uv_intra = INT_MAX;
+}
+
+/*! \brief set the luma intra mode and delta angles for a given mode index.
+ * The total number of luma intra mode is LUMA_MODE_COUNT = 61.
+ * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional
+ * modes. Each of the main 8 directional modes have 6 = MAX_ANGLE_DELTA * 2
+ * delta angles.
+ * \param[in]    mode_idx           mode index in intra mode decision
+ *                                  process.
+ * \param[in]    mbmi               Pointer to structure holding
+ *                                  the mode info for the current macroblock.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi);
+
+/*! \brief prune luma intra mode based on the model rd.
+ * \param[in]    this_model_rd              model rd for current mode.
+ * \param[in]    best_model_rd              Best model RD seen for this block so
+ *                                          far.
+ * \param[in]    top_intra_model_rd         Top intra model RD seen for this
+ *                                          block so far.
+ * \param[in]    max_model_cnt_allowed      The maximum number of top intra
+ *                                          model RD allowed.
+ * \param[in]    model_rd_index_for_pruning Index of the candidate used for
+ *                                          pruning based on model rd.
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+                       int model_rd_index_for_pruning);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/media/libaom/src/av1/encoder/intra_mode_search_utils.h b/media/libaom/src/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000000..4519e4629d
--- /dev/null
+++ b/media/libaom/src/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,689 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+// Macro for computing the speed-preset dependent threshold which is used for
+// deciding whether to enable/disable variance calculations in
+// intra_rd_variance_factor().
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X)))
+
+#define BINS 32
+static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = {
+  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
+  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
+};
+
+static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
+  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
+  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
+  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
+  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
+  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
+  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
+  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
+  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static const NN_CONFIG av1_intra_hog_model_nnconfig = {
+  BINS,               // num_inputs
+  DIRECTIONAL_MODES,  // num_outputs
+  0,                  // num_hidden_layers
+  { 0 },
+  {
+      av1_intra_hog_model_weights,
+  },
+  {
+      av1_intra_hog_model_bias,
+  },
+};
+
+#define FIX_PREC_BITS (16)
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+  const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+  // Find index by bisection
+  static const int thresholds[BINS] = {
+    -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+    -59392,   -48579,  -39272,  -30982,  -23445,  -16400,  -9715,  -3194,
+    3227,     9748,    16433,   23478,   31015,   39305,   48611,  59425,
+    72336,    88392,   109364,  138593,  183191,  261638,  441831, INT32_MAX
+  };
+
+  int lo_idx = 0, hi_idx = BINS - 1;
+  // Divide into segments of size 8 gives better performance than binary search
+  // here.
+  if (ratio <= thresholds[7]) {
+    lo_idx = 0;
+    hi_idx = 7;
+  } else if (ratio <= thresholds[15]) {
+    lo_idx = 8;
+    hi_idx = 15;
+  } else if (ratio <= thresholds[23]) {
+    lo_idx = 16;
+    hi_idx = 23;
+  } else {
+    lo_idx = 24;
+    hi_idx = 31;
+  }
+
+  for (int idx = lo_idx; idx <= hi_idx; idx++) {
+    if (ratio <= thresholds[idx]) {
+      return idx;
+    }
+  }
+  assert(0 && "No valid histogram bin found!");
+  return BINS - 1;
+}
+#undef FIX_PREC_BITS
+
+// Normalizes the hog data.
+static AOM_INLINE void normalize_hog(float total, float *hist) {
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride,
+                                          int rows, int cols, float *hist) {
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for LBD encode.
+static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                      BLOCK_SIZE sb_size,
+                                                      PLANE_TYPE plane) {
+  PixelLevelGradientInfo *const grad_info_sb =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+  const uint8_t *src = x->plane[plane].src.buf;
+  const int stride = x->plane[plane].src.stride;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_height = block_size_high[sb_size] >> ss_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+  src += stride;
+  for (int r = 1; r < sb_height - 1; ++r) {
+    for (int c = 1; c < sb_width - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+      grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+          (uint16_t)(abs(dx) + abs(dy));
+      grad_info_sb[r * sb_width + c].hist_bin_idx =
+          (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+    }
+    src += stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride,
+                                           int rows, int cols, float *hist) {
+  float total = 0.1f;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for HBD encode.
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                       BLOCK_SIZE sb_size,
+                                                       PLANE_TYPE plane) {
+  PixelLevelGradientInfo *const grad_info_sb =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf);
+  const int stride = x->plane[plane].src.stride;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_height = block_size_high[sb_size] >> ss_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+  src += stride;
+  for (int r = 1; r < sb_height - 1; ++r) {
+    for (int c = 1; c < sb_width - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+      grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+          (uint16_t)(abs(dx) + abs(dy));
+      grad_info_sb[r * sb_width + c].hist_bin_idx =
+          (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+    }
+    src += stride;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows,
+                                    int cols, float *hist, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
+    highbd_generate_hog(src8, stride, rows, cols, hist);
+    return;
+  }
+#else
+  (void)highbd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  lowbd_generate_hog(src8, stride, rows, cols, hist);
+}
+
+static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x,
+                                                BLOCK_SIZE sb_size,
+                                                PLANE_TYPE plane) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    highbd_compute_gradient_info_sb(x, sb_size, plane);
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  lowbd_compute_gradient_info_sb(x, sb_size, plane);
+}
+
+// Gradient caching at superblock level is allowed only if all of the following
+// conditions are satisfied:
+// (1) The current frame is an intra only frame
+// (2) Non-RD mode decisions are not enabled
+// (3) The sf partition_search_type is set to SEARCH_PARTITION
+// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled
+//
+// SB level caching of gradient data may not help in speedup for the following
+// cases:
+// (1) Inter frames (due to early intra gating)
+// (2) When partition_search_type is not SEARCH_PARTITION
+// Hence, gradient data is computed at block level in such cases.
+static AOM_INLINE bool is_gradient_caching_for_hog_enabled(
+    const AV1_COMP *const cpi) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode &&
+         (sf->part_sf.partition_search_type == SEARCH_PARTITION) &&
+         (sf->intra_sf.intra_pruning_with_hog ||
+          sf->intra_sf.chroma_intra_pruning_with_hog);
+}
+
+// Function to generate pixel level gradient information for a given superblock.
+// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if
+// gradient info is generated for the same.
+static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x,
+                                                BLOCK_SIZE sb_size, int mi_row,
+                                                int mi_col) {
+  // Initialise flags related to hog data caching.
+  x->is_sb_gradient_cached[PLANE_TYPE_Y] = false;
+  x->is_sb_gradient_cached[PLANE_TYPE_UV] = false;
+  if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+  const SPEED_FEATURES *sf = &cpi->sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+  if (sf->intra_sf.intra_pruning_with_hog) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y);
+    x->is_sb_gradient_cached[PLANE_TYPE_Y] = true;
+  }
+  if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV);
+    x->is_sb_gradient_cached[PLANE_TYPE_UV] = true;
+  }
+}
+
+// Reuses the pixel level gradient data generated at superblock level for block
+// level histogram computation.
+static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x,
+                                                         int rows, int cols,
+                                                         BLOCK_SIZE sb_size,
+                                                         PLANE_TYPE plane,
+                                                         float *hist) {
+  float total = 0.1f;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+
+  // Derive the offset from the starting of the superblock in order to locate
+  // the block level gradient data in the cache.
+  const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+  const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+  const int block_offset_in_grad_cache =
+      sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) +
+      (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x));
+  const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info +
+                                                plane * MAX_SB_SQUARE +
+                                                block_offset_in_grad_cache;
+
+  // Retrieve the cached gradient information and generate the histogram.
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t abs_dx_abs_dy_sum =
+          grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum;
+      if (!abs_dx_abs_dy_sum) continue;
+      total += abs_dx_abs_dy_sum;
+      const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero;
+      if (is_dx_zero) {
+        hist[0] += abs_dx_abs_dy_sum >> 1;
+        hist[BINS - 1] += abs_dx_abs_dy_sum >> 1;
+      } else {
+        const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx;
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += abs_dx_abs_dy_sum;
+      }
+    }
+  }
+  normalize_hog(total, hist);
+}
+
+static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    BLOCK_SIZE sb_size, int plane, float *hog) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int rows =
+      ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >>
+      ss_y;
+  const int cols =
+      ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
+      ss_x;
+
+  // If gradient data is already generated at SB level, reuse the cached data.
+  // Otherwise, compute the data.
+  if (x->is_sb_gradient_cached[plane]) {
+    generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog);
+  } else {
+    const uint8_t *src = x->plane[plane].src.buf;
+    const int src_stride = x->plane[plane].src.stride;
+    generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd));
+  }
+
+  // Scale the hog so the luma and chroma are on the same scale
+  for (int b = 0; b < BINS; ++b) {
+    hog[b] *= (1 + ss_x) * (1 + ss_y);
+  }
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+    const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th,
+    uint8_t *directional_mode_skip_mask, int is_chroma) {
+  const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
+  float hist[BINS] = { 0.0f };
+  collect_hog_data(x, bsize, sb_size, plane, hist);
+
+  // Make prediction for each of the mode
+  float scores[DIRECTIONAL_MODES] = { 0.0f };
+  av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
+  for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED;
+       uv_mode++) {
+    if (scores[uv_mode - UV_V_PRED] <= th) {
+      directional_mode_skip_mask[uv_mode] = 1;
+    }
+  }
+}
+#undef BINS
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+                                 const int stride, const int is_hbd);
+
+// Returns whether caching of source variance for 4x4 sub-blocks is allowed.
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled(
+    const AV1_COMP *const cpi) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (cpi->oxcf.mode != ALLINTRA) return false;
+
+  if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true;
+
+  if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 ||
+      (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode))
+    return false;
+
+  return true;
+}
+
+// Initialize the members of Block4x4VarInfo structure to -1 at the start
+// of every superblock.
+static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks(
+    const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks,
+    const BLOCK_SIZE sb_size) {
+  if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+  const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+  for (int i = 0; i < mi_count_in_sb; i++) {
+    src_var_info_of_4x4_sub_blocks[i].var = -1;
+    src_var_info_of_4x4_sub_blocks[i].log_var = -1.0;
+  }
+}
+
+// Returns the cost needed to send a uniformly distributed r.v.
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+/*!\endcond */
+
+/*!\brief Returns the rate cost for luma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+                                             const MACROBLOCK *x,
+                                             const MB_MODE_INFO *mbmi,
+                                             BLOCK_SIZE bsize, int mode_cost,
+                                             int discount_color_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate +=
+        mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = mbmi->palette_mode_info.palette_size[0];
+      int palette_mode_cost =
+          mode_costs
+              ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_mode_cost +=
+          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+                                   n_cache, cpi->common.seq_params->bit_depth);
+      if (!discount_color_cost)
+        palette_mode_cost +=
+            av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra];
+    if (use_filter_intra) {
+      total_rate +=
+          mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                 .filter_intra_mode];
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode)) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                      [MAX_ANGLE_DELTA +
+                                       mbmi->angle_delta[PLANE_TYPE_Y]];
+    }
+  }
+  if (av1_allow_intrabc(&cpi->common))
+    total_rate += mode_costs->intrabc_cost[use_intrabc];
+  return total_rate;
+}
+
+/*!\brief Return the rate cost for chroma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              const MB_MODE_INFO *mbmi,
+                                              BLOCK_SIZE bsize, int mode_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          mode_costs
+              ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mode - V_PRED]
+                                      [mbmi->angle_delta[PLANE_TYPE_UV] +
+                                       MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+/*!\cond */
+// Makes a quick intra prediction and estimate the rdcost with a model without
+// going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
+                              int plane, BLOCK_SIZE plane_bsize,
+                              TX_SIZE tx_size, int use_hadamard) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  int row, col;
+  assert(!is_inter_block(xd->mi[0]));
+  const int stepr = tx_size_high_unit[tx_size];
+  const int stepc = tx_size_wide_unit[tx_size];
+  const int txbw = tx_size_wide[tx_size];
+  const int txbh = tx_size_high[tx_size];
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  int64_t satd_cost = 0;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  // Prediction.
+  for (row = 0; row < max_blocks_high; row += stepr) {
+    for (col = 0; col < max_blocks_wide; col += stepc) {
+      av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+      // Here we use p->src_diff and p->coeff as temporary buffers for
+      // prediction residue and transform coefficients. The buffers are only
+      // used in this for loop, therefore we don't need to properly add offset
+      // to the buffers.
+      av1_subtract_block(
+          bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
+          p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
+          pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
+      av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
+                     block_size_wide[plane_bsize], p->coeff);
+      satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
+    }
+  }
+  return satd_cost;
+}
+/*!\endcond */
+
+/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function first makes a quick luma prediction and estimates the rdcost
+ * with a model without going through the txfm, then try to prune the current
+ * mode if the new estimate y_rd > 1.25 * best_model_rd.
+ *
+ * \return Returns 1 if the given mode is prune; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int64_t *best_model_rd) {
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int plane = 0;
+  const AV1_COMMON *cm = &cpi->common;
+  const int64_t this_model_rd =
+      intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/media/libaom/src/av1/encoder/k_means_template.h b/media/libaom/src/av1/encoder/k_means_template.h
index 9e526b88b7..e794caf293 100644
--- a/media/libaom/src/av1/encoder/k_means_template.h
+++ b/media/libaom/src/av1/encoder/k_means_template.h
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "av1/common/blockd.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/random.h"
 
@@ -93,9 +94,15 @@ static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
 void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
                          int n, int k, int max_itr) {
   int pre_centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t pre_indices[MAX_SB_SQUARE];
+  uint8_t pre_indices[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
 
-  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
+
+#if AV1_K_MEANS_DIM - 2
+  av1_calc_indices_dim1(data, centroids, indices, n, k);
+#else
+  av1_calc_indices_dim2(data, centroids, indices, n, k);
+#endif
   int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
   for (int i = 0; i < max_itr; ++i) {
@@ -105,7 +112,11 @@ void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
     memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
 
     RENAME(calc_centroids)(data, centroids, indices, n, k);
-    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+#if AV1_K_MEANS_DIM - 2
+    av1_calc_indices_dim1(data, centroids, indices, n, k);
+#else
+    av1_calc_indices_dim2(data, centroids, indices, n, k);
+#endif
     this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
     if (this_dist > pre_dist) {
diff --git a/media/libaom/src/av1/encoder/level.c b/media/libaom/src/av1/encoder/level.c
index 3403a3a84b..e3abe35dd1 100644
--- a/media/libaom/src/av1/encoder/level.c
+++ b/media/libaom/src/av1/encoder/level.c
@@ -9,8 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/level.h"
 
@@ -353,7 +351,7 @@ static double time_to_decode_frame(const AV1_COMMON *const cm,
     if (spatial_layer_dimensions_present_flag) {
       assert(0 && "Spatial layer dimensions not supported yet.");
     } else {
-      const SequenceHeader *const seq_params = &cm->seq_params;
+      const SequenceHeader *const seq_params = cm->seq_params;
       const int max_frame_width = seq_params->max_frame_width;
       const int max_frame_height = seq_params->max_frame_height;
       luma_samples = max_frame_width * max_frame_height;
@@ -413,18 +411,19 @@ static double get_presentation_time(const DECODER_MODEL *const decoder_model,
 }
 
 #define MAX_TIME 1e16
-double time_next_buffer_is_free(const DECODER_MODEL *const decoder_model) {
-  if (decoder_model->num_decoded_frame == 0) {
-    return (double)decoder_model->decoder_buffer_delay / 90000.0;
+double time_next_buffer_is_free(int num_decoded_frame, int decoder_buffer_delay,
+                                const FRAME_BUFFER *frame_buffer_pool,
+                                double current_time) {
+  if (num_decoded_frame == 0) {
+    return (double)decoder_buffer_delay / 90000.0;
   }
 
   double buf_free_time = MAX_TIME;
   for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
-    const FRAME_BUFFER *const this_buffer =
-        &decoder_model->frame_buffer_pool[i];
+    const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i];
     if (this_buffer->decoder_ref_count == 0) {
       if (this_buffer->player_ref_count == 0) {
-        return decoder_model->current_time;
+        return current_time;
       }
       const double presentation_time = this_buffer->presentation_time;
       if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
@@ -436,12 +435,16 @@ double time_next_buffer_is_free(const DECODER_MODEL *const decoder_model) {
 }
 #undef MAX_TIME
 
-static double get_removal_time(const DECODER_MODEL *const decoder_model) {
-  if (decoder_model->mode == SCHEDULE_MODE) {
+static double get_removal_time(int mode, int num_decoded_frame,
+                               int decoder_buffer_delay,
+                               const FRAME_BUFFER *frame_buffer_pool,
+                               double current_time) {
+  if (mode == SCHEDULE_MODE) {
     assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
     return INVALID_TIME;
   } else {
-    return time_next_buffer_is_free(decoder_model);
+    return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay,
+                                    frame_buffer_pool, current_time);
   }
 }
 
@@ -467,13 +470,11 @@ void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
 // op_index is the operating point index.
 void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
                             int op_index, DECODER_MODEL *const decoder_model) {
-  aom_clear_system_state();
-
   decoder_model->status = DECODER_MODEL_OK;
   decoder_model->level = level;
 
   const AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   decoder_model->bit_rate = get_max_bitrate(
       av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
 
@@ -524,13 +525,93 @@ void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
   decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
 }
 
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+    const AV1_COMP *const cpi, size_t coded_bits,
+    const DECODER_MODEL *const decoder_model) {
+  DECODER_MODEL_STATUS status = DECODER_MODEL_OK;
+
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) {
+    return status;
+  }
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  size_t cur_coded_bits = decoder_model->coded_bits + coded_bits;
+  int num_decoded_frame = decoder_model->num_decoded_frame;
+  if (!show_existing_frame) ++num_decoded_frame;
+
+  if (show_existing_frame) {
+    return status;
+  } else {
+    const double removal_time = get_removal_time(
+        decoder_model->mode, num_decoded_frame,
+        decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+        decoder_model->current_time);
+    if (removal_time < 0.0) {
+      status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return status;
+    }
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    const double first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    const double last_bit_arrival_time =
+        first_bit_arrival_time +
+        (double)cur_coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      status = SMOOTHING_BUFFER_UNDERFLOW;
+      return status;
+    }
+
+    // Check if the smoothing buffer overflows.
+    const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+
+    double total_interval = queue->total_interval;
+    int qhead = queue->head;
+    int qsize = queue->size;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    while (queue->buf[qhead].removal_time <= last_bit_arrival_time &&
+           qsize > 0) {
+      if (queue->buf[qhead].removal_time - first_bit_arrival_time +
+              total_interval >
+          1.0) {
+        status = SMOOTHING_BUFFER_OVERFLOW;
+        return status;
+      }
+      total_interval -= queue->buf[qhead].last_bit_arrival_time -
+                        queue->buf[qhead].first_bit_arrival_time;
+      qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --qsize;
+    }
+    total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (total_interval > 1.0) {
+      status = SMOOTHING_BUFFER_OVERFLOW;
+      return status;
+    }
+
+    return status;
+  }
+}
+
 void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
                                      size_t coded_bits,
                                      DECODER_MODEL *const decoder_model) {
   if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
 
-  aom_clear_system_state();
-
   const AV1_COMMON *const cm = &cpi->common;
   const int luma_pic_size = cm->superres_upscaled_width * cm->height;
   const int show_existing_frame = cm->show_existing_frame;
@@ -551,7 +632,10 @@ void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
       update_ref_buffers(decoder_model, display_idx, 0xFF);
     }
   } else {
-    const double removal_time = get_removal_time(decoder_model);
+    const double removal_time = get_removal_time(
+        decoder_model->mode, decoder_model->num_decoded_frame,
+        decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+        decoder_model->current_time);
     if (removal_time < 0.0) {
       decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
       return;
@@ -641,7 +725,7 @@ void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
     if (decoder_model->initial_presentation_delay < 0.0) {
       // Display can begin after required number of frames have been buffered.
       if (frames_in_buffer_pool(decoder_model) >=
-          decoder_model->initial_display_delay) {
+          decoder_model->initial_display_delay - 1) {
         decoder_model->initial_presentation_delay = decoder_model->current_time;
         // Update presentation time for each shown frame in the frame buffer.
         for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
@@ -690,7 +774,7 @@ void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
 void av1_init_level_info(AV1_COMP *cpi) {
   for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
     AV1LevelInfo *const this_level_info =
-        cpi->level_params.level_info[op_index];
+        cpi->ppi->level_params.level_info[op_index];
     if (!this_level_info) continue;
     memset(this_level_info, 0, sizeof(*this_level_info));
     AV1LevelSpec *const level_spec = &this_level_info->level_spec;
@@ -1048,7 +1132,7 @@ static void scan_past_frames(const FrameWindowBuffer *const buffer,
 void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
                            int64_t ts_end) {
   AV1_COMMON *const cm = &cpi->common;
-  const AV1LevelParams *const level_params = &cpi->level_params;
+  const AV1LevelParams *const level_params = &cpi->ppi->level_params;
 
   const int upscaled_width = cm->superres_upscaled_width;
   const int width = cm->width;
@@ -1057,7 +1141,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
   const int tile_rows = cm->tiles.rows;
   const int tiles = tile_cols * tile_rows;
   const int luma_pic_size = upscaled_width * height;
-  const int frame_header_count = level_params->frame_header_count;
+  const int frame_header_count = cpi->frame_header_count;
   const int show_frame = cm->show_frame;
   const int show_existing_frame = cm->show_existing_frame;
 
@@ -1070,15 +1154,11 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
                  &min_cropped_tile_width, &min_cropped_tile_height,
                  &tile_width_is_valid);
 
-  aom_clear_system_state();
   const double compression_ratio = av1_get_compression_ratio(cm, size);
-  const double total_time_encoded =
-      (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
-      (double)TICKS_PER_SEC;
 
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const BITSTREAM_PROFILE profile = seq_params->profile;
   const int is_still_picture = seq_params->still_picture;
   // update level_stats
@@ -1131,7 +1211,9 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
           show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
       scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
                        level_stats);
-      level_stats->total_time_encoded = total_time_encoded;
+      level_stats->total_time_encoded +=
+          (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) /
+          (double)TICKS_PER_SEC;
     }
 
     DECODER_MODEL *const decoder_models = level_info->decoder_models;
@@ -1141,7 +1223,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
 
     // Check whether target level is met.
     const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
-    if (target_level < SEQ_LEVELS) {
+    if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance == 1) {
       assert(is_valid_seq_level_idx(target_level));
       const int tier = seq_params->tier[i];
       const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
@@ -1149,7 +1231,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
       if (fail_id != TARGET_LEVEL_OK) {
         const int target_level_major = 2 + (target_level >> 2);
         const int target_level_minor = target_level & 3;
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_ERROR,
                            "Failed to encode to the target level %d_%d. %s",
                            target_level_major, target_level_minor,
                            level_fail_messages[fail_id]);
@@ -1182,3 +1264,15 @@ aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
 
   return AOM_CODEC_OK;
 }
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+                                             const AV1LevelParams *level_params,
+                                             int *target_seq_level_idx) {
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    if (!((level_params->keep_level_stats >> op) & 1)) continue;
+    target_seq_level_idx[op] = level_params->target_seq_level_idx[op];
+  }
+
+  return AOM_CODEC_OK;
+}
diff --git a/media/libaom/src/av1/encoder/level.h b/media/libaom/src/av1/encoder/level.h
index 5e0cce2007..ebf2a1c19d 100644
--- a/media/libaom/src/av1/encoder/level.h
+++ b/media/libaom/src/av1/encoder/level.h
@@ -164,8 +164,6 @@ typedef struct AV1LevelParams {
   uint32_t keep_level_stats;
   // Level information for each operating point.
   AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
-  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
-  int frame_header_count;
 } AV1LevelParams;
 
 static INLINE int is_in_operating_point(int operating_point,
@@ -187,6 +185,10 @@ aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
                                       const AV1LevelParams *level_params,
                                       int *seq_level_idx);
 
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+                                             const AV1LevelParams *level_params,
+                                             int *target_seq_level_idx);
+
 // Print the status of the decoder model(for debugging).
 void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
 
@@ -197,6 +199,14 @@ void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
                                      size_t coded_bits,
                                      DECODER_MODEL *const decoder_model);
 
+// This function uses the decoder model to check whether there could be
+// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not
+// update the content of decoder_model, and can be used to target certain
+// encoding level in the recode loop.
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+    const struct AV1_COMP *const cpi, size_t coded_bits,
+    const DECODER_MODEL *const decoder_model);
+
 // Return max bitrate(bps) for given level.
 double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
                                      BITSTREAM_PROFILE profile);
diff --git a/media/libaom/src/av1/encoder/lookahead.c b/media/libaom/src/av1/encoder/lookahead.c
index 0f7c819893..fe513ce10d 100644
--- a/media/libaom/src/av1/encoder/lookahead.c
+++ b/media/libaom/src/av1/encoder/lookahead.c
@@ -45,23 +45,32 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
     unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int byte_alignment, int num_lap_buffers) {
-  struct lookahead_ctx *ctx = NULL;
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+    bool is_all_intra, int enable_global_motion) {
   int lag_in_frames = AOMMAX(1, depth);
 
+  // For all-intra frame encoding, previous source frames are not required.
+  // Hence max_pre_frames is set to 0 in this case. As previous source frames
+  // are accessed using a negative index to av1_lookahead_peek(), setting
+  // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a
+  // negative index.
+  const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES;
+
   // Add the lags to depth and clamp
   depth += num_lap_buffers;
   depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
 
   // Allocate memory to keep previous source frames available.
-  depth += MAX_PRE_FRAMES;
+  depth += max_pre_frames;
 
   // Allocate the lookahead structures
-  ctx = calloc(1, sizeof(*ctx));
+  struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
     unsigned int i;
     ctx->max_sz = depth;
-    ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - MAX_PRE_FRAMES;
+    ctx->push_frame_count = 0;
+    ctx->max_pre_frames = max_pre_frames;
+    ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames;
     ctx->read_ctxs[ENCODE_STAGE].valid = 1;
     if (num_lap_buffers) {
       ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
@@ -70,11 +79,10 @@ struct lookahead_ctx *av1_lookahead_init(
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
     if (!ctx->buf) goto fail;
     for (i = 0; i < depth; i++) {
-      aom_free_frame_buffer(&ctx->buf[i].img);
-      if (aom_realloc_frame_buffer(&ctx->buf[i].img, width, height,
-                                   subsampling_x, subsampling_y,
-                                   use_highbitdepth, border_in_pixels,
-                                   byte_alignment, NULL, NULL, NULL))
+      if (aom_realloc_frame_buffer(
+              &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+              use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL,
+              NULL, enable_global_motion, 0))
         goto fail;
     }
   }
@@ -84,10 +92,9 @@ fail:
   return NULL;
 }
 
-int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags) {
-  struct lookahead_entry *buf;
   int width = src->y_crop_width;
   int height = src->y_crop_height;
   int uv_width = src->uv_crop_width;
@@ -97,13 +104,13 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
   int larger_dimensions, new_dimensions;
 
   assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
-  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
+  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + ctx->max_pre_frames > ctx->max_sz)
     return 1;
   ctx->read_ctxs[ENCODE_STAGE].sz++;
   if (ctx->read_ctxs[LAP_STAGE].valid) {
     ctx->read_ctxs[LAP_STAGE].sz++;
   }
-  buf = pop(ctx, &ctx->write_idx);
+  struct lookahead_entry *buf = pop(ctx, &ctx->write_idx);
 
   new_dimensions = width != buf->img.y_crop_width ||
                    height != buf->img.y_crop_height ||
@@ -119,7 +126,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
     memset(&new_img, 0, sizeof(new_img));
     if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
                                subsampling_y, use_highbitdepth,
-                               AOM_BORDER_IN_PIXELS, 0))
+                               AOM_BORDER_IN_PIXELS, 0, 0))
       return 1;
     aom_free_frame_buffer(&buf->img);
     buf->img = new_img;
@@ -136,7 +143,9 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
 
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
+  buf->display_idx = ctx->push_frame_count;
   buf->flags = flags;
+  ++ctx->push_frame_count;
   aom_remove_metadata_from_frame_buffer(&buf->img);
   aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata);
   return 0;
@@ -159,12 +168,11 @@ struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
 struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
                                            COMPRESSOR_STAGE stage) {
   struct lookahead_entry *buf = NULL;
-  struct read_ctx *read_ctx = NULL;
   if (ctx == NULL) {
     return buf;
   }
 
-  read_ctx = &ctx->read_ctxs[stage];
+  struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
   assert(read_ctx->valid == 1);
   if (index >= 0) {
     // Forward peek
@@ -175,7 +183,7 @@ struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
     }
   } else if (index < 0) {
     // Backward peek
-    if (-index <= MAX_PRE_FRAMES) {
+    if (-index <= ctx->max_pre_frames) {
       index += (int)(read_ctx->read_idx);
       if (index < 0) index += (int)(ctx->max_sz);
       buf = ctx->buf + index;
@@ -187,19 +195,17 @@ struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
 
 unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
                                  COMPRESSOR_STAGE stage) {
-  struct read_ctx *read_ctx = NULL;
   assert(ctx != NULL);
 
-  read_ctx = &ctx->read_ctxs[stage];
+  struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
   assert(read_ctx->valid == 1);
   return read_ctx->sz;
 }
 
 int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
-  struct read_ctx *read_ctx = NULL;
   assert(ctx != NULL);
 
-  read_ctx = &ctx->read_ctxs[stage];
+  struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
   assert(read_ctx->valid == 1);
   return read_ctx->pop_sz;
 }
diff --git a/media/libaom/src/av1/encoder/lookahead.h b/media/libaom/src/av1/encoder/lookahead.h
index 03693d383f..c9e1c9a52b 100644
--- a/media/libaom/src/av1/encoder/lookahead.h
+++ b/media/libaom/src/av1/encoder/lookahead.h
@@ -9,9 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Describes look ahead buffer operations.
+ */
 #ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
 #define AOM_AV1_ENCODER_LOOKAHEAD_H_
 
+#include <stdbool.h>
+
 #include "aom_scale/yv12config.h"
 #include "aom/aom_integer.h"
 
@@ -19,8 +24,9 @@
 extern "C" {
 #endif
 
-#define MAX_LAG_BUFFERS 35
-#define MAX_LAP_BUFFERS 35
+/*!\cond */
+#define MAX_LAG_BUFFERS 48
+#define MAX_LAP_BUFFERS 48
 #define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
 #define LAP_LAG_IN_FRAMES 17
 
@@ -28,6 +34,7 @@ struct lookahead_entry {
   YV12_BUFFER_CONFIG img;
   int64_t ts_start;
   int64_t ts_end;
+  int display_idx;
   aom_enc_frame_flags_t flags;
 };
 
@@ -48,7 +55,11 @@ struct lookahead_ctx {
   int write_idx;                         /* Write index */
   struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
   struct lookahead_entry *buf;           /* Buffer list */
+  int push_frame_count; /* Number of frames that have been pushed in the queue*/
+  uint8_t
+      max_pre_frames; /* Maximum number of past frames allowed in the queue */
 };
+/*!\endcond */
 
 /**\brief Initializes the lookahead stage
  *
@@ -58,7 +69,8 @@ struct lookahead_ctx {
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
     unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int byte_alignment, int num_lap_buffers);
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+    bool is_all_intra, int enable_global_motion);
 
 /**\brief Destroys the lookahead stage
  */
@@ -69,29 +81,26 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx);
  * This function will copy the source image into a new framebuffer with
  * the expected stride/border.
  *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
  * \param[in] ctx         Pointer to the lookahead context
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Tell if HBD is used
  * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
  */
-int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags);
 
 /**\brief Get the next source buffer to encode
  *
- *
  * \param[in] ctx       Pointer to the lookahead context
  * \param[in] drain     Flag indicating the buffer should be drained
  *                      (return a buffer regardless of the current queue depth)
+ * \param[in] stage     Encoder stage
  *
- * \retval NULL, if drain set and queue is empty
- * \retval NULL, if drain not set and queue not of the configured depth
+ * \retval Return NULL, if drain set and queue is empty, or if drain not set and
+ * queue not of the configured depth.
  */
 struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
                                           COMPRESSOR_STAGE stage);
@@ -100,19 +109,20 @@ struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
  *
  * \param[in] ctx       Pointer to the lookahead context
  * \param[in] index     Index of the frame to be returned, 0 == next frame
+ * \param[in] stage     Encoder stage
  *
- * \retval NULL, if no buffer exists at the specified index
+ * \retval Return NULL, if no buffer exists at the specified index
  */
 struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
                                            COMPRESSOR_STAGE stage);
 
 /**\brief Get the number of frames currently in the lookahead queue
- *
- * \param[in] ctx       Pointer to the lookahead context
  */
 unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
                                  COMPRESSOR_STAGE stage);
 
+/**\brief Get pop_sz value
+ */
 int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/encoder/mcomp.c b/media/libaom/src/av1/encoder/mcomp.c
index 43f7f5c6c9..753f04fba6 100644
--- a/media/libaom/src/av1/encoder/mcomp.c
+++ b/media/libaom/src/av1/encoder/mcomp.c
@@ -33,15 +33,22 @@
 #include "av1/encoder/reconinter_enc.h"
 
 static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
-                                       const MACROBLOCK *x, const MV *ref_mv) {
+                                       const MvCosts *mv_costs,
+                                       const MV *ref_mv, int errorperbit,
+                                       int sadperbit) {
   mv_cost_params->ref_mv = ref_mv;
   mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
-  mv_cost_params->error_per_bit = x->errorperbit;
-  mv_cost_params->sad_per_bit = x->sadperbit;
-  mv_cost_params->mvjcost = x->nmv_vec_cost;
-  mv_cost_params->mvcost[0] = x->mv_cost_stack[0];
-  mv_cost_params->mvcost[1] = x->mv_cost_stack[1];
-  mv_cost_params->mv_cost_type = x->mv_cost_type;
+  mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+  mv_cost_params->error_per_bit = errorperbit;
+  mv_cost_params->sad_per_bit = sadperbit;
+  // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
+  // population of mvjcost and mvcost are avoided. In case of IntraBC, these
+  // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+  if (mv_costs != NULL) {
+    mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+    mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+    mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+  }
 }
 
 static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
@@ -50,38 +57,103 @@ static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
 
   av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
 
-  ms_buffers->wsrc = x->wsrc_buf;
-  ms_buffers->obmc_mask = x->mask_buf;
+  ms_buffers->wsrc = x->obmc_buffer.wsrc;
+  ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+static AOM_INLINE SEARCH_METHODS
+get_faster_search_method(SEARCH_METHODS search_method) {
+  // Note on search method's accuracy:
+  //  1. NSTEP
+  //  2. DIAMOND
+  //  3. BIGDIA \approx SQUARE
+  //  4. HEX.
+  //  5. FAST_HEX \approx FAST_DIAMOND
+  switch (search_method) {
+    case NSTEP: return DIAMOND;
+    case NSTEP_8PT: return DIAMOND;
+    case DIAMOND: return BIGDIA;
+    case CLAMPED_DIAMOND: return BIGDIA;
+    case BIGDIA: return HEX;
+    case SQUARE: return HEX;
+    case HEX: return FAST_HEX;
+    case FAST_HEX: return FAST_HEX;
+    case FAST_DIAMOND: return FAST_DIAMOND;
+    case FAST_BIGDIA: return FAST_BIGDIA;
+    default: assert(0 && "Invalid search method!"); return DIAMOND;
+  }
+}
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+  obmc_buffer->wsrc = NULL;
+  obmc_buffer->mask = NULL;
+  obmc_buffer->above_pred = NULL;
+  obmc_buffer->left_pred = NULL;
 }
 
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
-    const search_site_config *search_sites) {
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    int fine_search_interval) {
+  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+
   // High level params
   ms_params->bsize = bsize;
-  ms_params->vfp = &cpi->fn_ptr[bsize];
+  ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
 
   init_ms_buffers(&ms_params->ms_buffers, x);
 
-  ms_params->search_method = cpi->sf.mv_sf.search_method;
-  ms_params->search_sites = search_sites;
+  SEARCH_METHODS search_method = mv_sf->search_method;
+  if (mv_sf->use_bsize_dependent_search_method) {
+    const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+    if (min_dim >= 32) {
+      search_method = get_faster_search_method(search_method);
+    }
+  }
 
-  ms_params->mesh_patterns[0] = cpi->sf.mv_sf.mesh_patterns;
-  ms_params->mesh_patterns[1] = cpi->sf.mv_sf.intrabc_mesh_patterns;
-  ms_params->force_mesh_thresh = cpi->sf.mv_sf.exhaustive_searches_thresh;
-  ms_params->prune_mesh_search = cpi->sf.mv_sf.prune_mesh_search;
+  av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+  const int use_downsampled_sad =
+      mv_sf->use_downsampled_sad && block_size_high[bsize] >= 16;
+  if (use_downsampled_sad) {
+    ms_params->sdf = ms_params->vfp->sdsf;
+    ms_params->sdx4df = ms_params->vfp->sdsx4df;
+  } else {
+    ms_params->sdf = ms_params->vfp->sdf;
+    ms_params->sdx4df = ms_params->vfp->sdx4df;
+  }
+
+  ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+  ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+  ms_params->prune_mesh_search =
+      (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0;
+  ms_params->mesh_search_mv_diff_threshold = 4;
   ms_params->run_mesh_search = 0;
+  ms_params->fine_search_interval = fine_search_interval;
 
   ms_params->is_intra_mode = 0;
 
-  ms_params->fast_obmc_search = cpi->sf.mv_sf.obmc_full_pixel_search_level;
+  ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
 
   ms_params->mv_limits = x->mv_limits;
   av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
 
   // Mvcost params
-  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+  init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+                      x->errorperbit, x->sadperbit);
+}
+
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const IntraBCMVCosts *dv_costs) {
+  ms_params->is_intra_mode = 1;
+
+  MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+  mv_cost_params->mvjcost = dv_costs->joint_mv;
+  mv_cost_params->mvcost[0] = dv_costs->dv_costs[0];
+  mv_cost_params->mvcost[1] = dv_costs->dv_costs[1];
 }
 
 void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -98,10 +170,11 @@ void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
   av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
 
   // Mvcost params
-  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+  init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+                      x->errorperbit, x->sadperbit);
 
   // Subpel variance params
-  ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
+  ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
   ms_params->var_params.subpel_search_type =
       cpi->sf.mv_sf.use_accurate_subpel_search;
   ms_params->var_params.w = block_size_wide[bsize];
@@ -187,7 +260,7 @@ static INLINE int mv_cost(const MV *mv, const int *joint_cost,
 // nearest 2 ** 7.
 // This is NOT used during motion compensation.
 int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
-                    int *mvcost[2], int weight) {
+                    int *const mvcost[2], int weight) {
   const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
   return ROUND_POWER_OF_TWO(
       mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
@@ -224,6 +297,9 @@ static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv,
 
 static INLINE int mv_err_cost_(const MV *mv,
                                const MV_COST_PARAMS *mv_cost_params) {
+  if (mv_cost_params->mv_cost_type == MV_COST_NONE) {
+    return 0;
+  }
   return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
                      mv_cost_params->mvcost, mv_cost_params->error_per_bit,
                      mv_cost_params->mv_cost_type);
@@ -270,18 +346,24 @@ static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv,
 #define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
 
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
+// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level) {
+  int num_search_steps = 0;
   int stage_index = MAX_MVSEARCH_STEPS - 1;
 
-  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
-  cfg->ss[stage_index][0].offset = 0;
+  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+  cfg->site[stage_index][0].offset = 0;
   cfg->stride = stride;
 
-  for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+  // Choose the initial step size depending on level.
+  const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
+
+  for (int radius = first_step; radius > 0;) {
     int num_search_pts = 8;
 
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },           { -radius, 0 },      { radius, 0 },
       { 0, -radius },     { 0, radius },       { -radius, -radius },
       { radius, radius }, { -radius, radius }, { radius, -radius },
@@ -289,24 +371,26 @@ void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
 
     int i;
     for (i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
+    // Update the search radius based on level.
+    if (!level || ((stage_index < 9) && level)) radius /= 2;
     --stage_index;
-    ++ss_count;
+    ++num_search_steps;
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
 }
 
 void av1_init_motion_fpf(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+  int num_search_steps = 0;
   int stage_index = MAX_MVSEARCH_STEPS - 1;
 
-  cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0;
-  cfg->ss[stage_index][0].offset = 0;
+  cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+  cfg->site[stage_index][0].offset = 0;
   cfg->stride = stride;
 
   for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
@@ -315,7 +399,7 @@ void av1_init_motion_fpf(search_site_config *cfg, int stride) {
     int num_search_pts = 12;
     if (radius == 1) num_search_pts = 8;
 
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },
       { -radius, 0 },
       { radius, 0 },
@@ -333,31 +417,35 @@ void av1_init_motion_fpf(search_site_config *cfg, int stride) {
 
     int i;
     for (i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
     --stage_index;
-    ++ss_count;
+    ++num_search_steps;
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
 }
 
-void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int ss_count = 0;
+// Search site initialization for NSTEP / NSTEP_8PT search methods.
+// level = 0: NSTEP, level = 1: NSTEP_8PT.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level) {
+  int num_search_steps = 0;
   int stage_index = 0;
   cfg->stride = stride;
   int radius = 1;
-  for (stage_index = 0; stage_index < 15; ++stage_index) {
+  const int num_stages = (level > 0) ? 16 : 15;
+  for (stage_index = 0; stage_index < num_stages; ++stage_index) {
     int tan_radius = AOMMAX((int)(0.41 * radius), 1);
     int num_search_pts = 12;
-    if (radius <= 5) {
+    if ((radius <= 5) || (level > 0)) {
       tan_radius = radius;
       num_search_pts = 8;
     }
-    const FULLPEL_MV ss_mvs[13] = {
+    const FULLPEL_MV search_site_mvs[13] = {
       { 0, 0 },
       { -radius, 0 },
       { radius, 0 },
@@ -374,17 +462,176 @@ void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
     };
 
     for (int i = 0; i <= num_search_pts; ++i) {
-      search_site *const ss = &cfg->ss[stage_index][i];
-      ss->mv = ss_mvs[i];
-      ss->offset = get_offset_from_fullmv(&ss->mv, stride);
+      search_site *const site = &cfg->site[stage_index][i];
+      site->mv = search_site_mvs[i];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
     }
     cfg->searches_per_step[stage_index] = num_search_pts;
     cfg->radius[stage_index] = radius;
-    ++ss_count;
+    ++num_search_steps;
     if (stage_index < 12)
       radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
   }
-  cfg->ss_count = ss_count;
+  cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
+// search methods.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level) {
+  (void)level;
+  cfg->stride = stride;
+  // First scale has 4-closest points, the rest have 8 points in diamond
+  // shape at increasing scales
+  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  };
+
+  // BIGDIA search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale
+  /* clang-format off */
+  static const FULLPEL_MV
+      site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+          { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
+            { 0, 0 }, { 0, 0 } },
+          { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+            { -1, 1 }, { -2, 0 } },
+          { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+            { -2, 2 }, { -4, 0 } },
+          { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+            { -4, 4 }, { -8, 0 } },
+          { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+            { -8, 8 }, { -16, 0 } },
+          { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+            { 0, 32 }, { -16, 16 }, { -32, 0 } },
+          { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+            { 0, 64 }, { -32, 32 }, { -64, 0 } },
+          { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+            { 0, 128 }, { -64, 64 }, { -128, 0 } },
+          { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
+            { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
+          { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
+            { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
+          { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+            { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+        };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = bigdia_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = site_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for SQUARE search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level) {
+  (void)level;
+  cfg->stride = stride;
+  // All scales have 8 closest points in square shape.
+  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  };
+
+  // Square search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale.
+  /* clang-format off */
+    static const FULLPEL_MV
+        square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+             { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+               { -1, 1 }, { -1, 0 } },
+             { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+               { -2, 2 }, { -2, 0 } },
+             { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+               { -4, 4 }, { -4, 0 } },
+             { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+               { -8, 8 }, { -8, 0 } },
+             { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+               { 0, 16 }, { -16, 16 }, { -16, 0 } },
+             { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+               { 0, 32 }, { -32, 32 }, { -32, 0 } },
+             { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+               { 0, 64 }, { -64, 64 }, { -64, 0 } },
+             { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
+               { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
+             { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
+               { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
+             { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
+               { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
+             { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+               { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+    };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = square_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = square_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for HEX / FAST_HEX search methods.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level) {
+  (void)level;
+  cfg->stride = stride;
+  // First scale has 8-closest points, the rest have 6 points in hex shape
+  // at increasing scales.
+  static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+                                                              6, 6, 6, 6, 6 };
+  // Note that the largest candidate step at each scale is 2^scale.
+  /* clang-format off */
+    static const FULLPEL_MV
+        hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+        { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+          { -1, 1 }, { -1, 0 } },
+        { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+        { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+        { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+        { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
+          { -8, 16 }, { -16, 0 } },
+        { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+          { -32, 0 } },
+        { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+          { -64, 0 } },
+        { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
+          { -64, 128 }, { -128, 0 } },
+        { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
+          { -128, 256 }, { -256, 0 } },
+        { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
+          { -256, 512 }, { -512, 0 } },
+        { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+          { -512, 1024 }, { -1024, 0 } },
+    };
+
+  /* clang-format on */
+  int radius = 1;
+  for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+    cfg->searches_per_step[i] = hex_num_candidates[i];
+    cfg->radius[i] = radius;
+    for (int j = 0; j < hex_num_candidates[i]; ++j) {
+      search_site *const site = &cfg->site[i][j];
+      site->mv = hex_candidates[i][j];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    radius *= 2;
+  }
+  cfg->num_search_steps = MAX_PATTERN_SCALES;
 }
 
 // Checks whether the mv is within range of the mv_limits
@@ -421,11 +668,10 @@ static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                                  const struct buf_2d *const src,
                                  const uint8_t *const ref_address,
                                  const int ref_stride) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *src_buf = src->buf;
   const int src_stride = src->stride;
 
-  return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+  return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
 }
 
 static INLINE int get_mvpred_compound_var_cost(
@@ -445,9 +691,9 @@ static INLINE int get_mvpred_compound_var_cost(
   int bestsme;
 
   if (mask) {
-    bestsme = vfp->msvf(src_buf, src_stride, 0, 0,
-                        get_buf_from_fullmv(ref, this_mv), ref_stride,
-                        second_pred, mask, mask_stride, invert_mask, &unused);
+    bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+                        src_buf, src_stride, second_pred, mask, mask_stride,
+                        invert_mask, &unused);
   } else if (second_pred) {
     bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
                         src_buf, src_stride, &unused, second_pred);
@@ -481,7 +727,7 @@ static INLINE int get_mvpred_compound_sad(
   } else if (second_pred) {
     return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
   } else {
-    return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+    return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
   }
 }
 
@@ -576,44 +822,127 @@ static AOM_FORCE_INLINE void calc_int_sad_list(
   }
 }
 
-#define CHECK_BETTER                                                      \
-  if (thissad < bestsad) {                                                \
-    int tmp_thissad = thissad;                                            \
-    if (use_mvcost) thissad += mvsad_err_cost_(&this_mv, mv_cost_params); \
-    if (thissad < bestsad) {                                              \
-      raw_bestsad = tmp_thissad;                                          \
-      bestsad = thissad;                                                  \
-      best_site = i;                                                      \
-    }                                                                     \
+// Computes motion vector cost and adds to the sad cost.
+// Then updates the best sad and motion vectors.
+// Inputs:
+//   this_sad: the sad to be evaluated.
+//   mv: the current motion vector.
+//   mv_cost_params: a structure containing information to compute mv cost.
+//   best_sad: the current best sad.
+//   raw_best_sad (optional): the current best sad without calculating mv cost.
+//   best_mv: the current best motion vector.
+//   second_best_mv (optional): the second best motion vector up to now.
+// Modifies:
+//   best_sad, raw_best_sad, best_mv, second_best_mv
+//   If the current sad is lower than the current best sad.
+// Returns:
+//   Whether the input sad (mv) is better than the current best.
+static int update_mvs_and_sad(const unsigned int this_sad, const FULLPEL_MV *mv,
+                              const MV_COST_PARAMS *mv_cost_params,
+                              unsigned int *best_sad,
+                              unsigned int *raw_best_sad, FULLPEL_MV *best_mv,
+                              FULLPEL_MV *second_best_mv) {
+  if (this_sad >= *best_sad) return 0;
+
+  // Add the motion vector cost.
+  const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
+  if (sad < *best_sad) {
+    if (raw_best_sad) *raw_best_sad = this_sad;
+    *best_sad = sad;
+    if (second_best_mv) *second_best_mv = *best_mv;
+    *best_mv = *mv;
+    return 1;
   }
+  return 0;
+}
+
+// Calculate sad4 and update the bestmv information
+// in FAST_DIAMOND search method.
+static void calc_sad4_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
+    int search_step, int *best_site, int cand_start) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+
+  unsigned char const *block_offset[4];
+  unsigned int sads[4];
+  const uint8_t *best_address;
+  const uint8_t *src_buf = src->buf;
+  const int src_stride = src->stride;
+  best_address = get_buf_from_fullmv(ref, temp_best_mv);
+  // Loop over number of candidates.
+  for (int j = 0; j < 4; j++)
+    block_offset[j] = site[cand_start + j].offset + best_address;
+
+  // 4-point sad calculation.
+  ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
+
+  for (int j = 0; j < 4; j++) {
+    const FULLPEL_MV this_mv = {
+      temp_best_mv->row + site[cand_start + j].mv.row,
+      temp_best_mv->col + site[cand_start + j].mv.col
+    };
+    const int found_better_mv = update_mvs_and_sad(
+        sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+        /*second_best_mv=*/NULL);
+    if (found_better_mv) *best_site = cand_start + j;
+  }
+}
+
+// Calculate sad and update the bestmv information
+// in FAST_DIAMOND search method.
+static void calc_sad_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    FULLPEL_MV *temp_best_mv, unsigned int *bestsad, unsigned int *raw_bestsad,
+    int search_step, int *best_site, const int num_candidates, int cand_start) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+  // Loop over number of candidates.
+  for (int i = cand_start; i < num_candidates; i++) {
+    const FULLPEL_MV this_mv = { temp_best_mv->row + site[i].mv.row,
+                                 temp_best_mv->col + site[i].mv.col };
+    if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
+    int thissad = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref->stride);
+    const int found_better_mv = update_mvs_and_sad(
+        thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+        /*second_best_mv=*/NULL);
+    if (found_better_mv) *best_site = i;
+  }
+}
 
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
-static int pattern_search(
-    FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-    const int search_param, const int do_init_search,
-    const int num_candidates[MAX_PATTERN_SCALES],
-    const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES],
-    int *cost_list, FULLPEL_MV *best_mv) {
-  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+static int pattern_search(FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          int search_step, const int do_init_search,
+                          int *cost_list, FULLPEL_MV *best_mv) {
+  static const int search_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
   int i, s, t;
 
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site_config *search_sites = ms_params->search_sites;
+  const int *num_candidates = search_sites->searches_per_step;
   const int ref_stride = ref->stride;
   const int last_is_4 = num_candidates[0] == 4;
   int br, bc;
-  int bestsad = INT_MAX, raw_bestsad = INT_MAX;
+  unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
   int thissad;
   int k = -1;
-  const int use_mvcost = ms_params->mv_cost_params.mv_cost_type != MV_COST_NONE;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  assert(search_param < MAX_MVSEARCH_STEPS);
-  int best_init_s = search_param_to_steps[search_param];
+  search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
+  assert(search_step >= 0);
+  int best_init_s = search_steps[search_step];
   // adjust ref_mv to make sure it is within MV range
   clamp_fullmv(&start_mv, &ms_params->mv_limits);
   br = start_mv.row;
@@ -637,23 +966,27 @@ static int pattern_search(
     best_init_s = -1;
     for (t = 0; t <= s; ++t) {
       int best_site = -1;
+      FULLPEL_MV temp_best_mv;
+      temp_best_mv.row = br;
+      temp_best_mv.col = bc;
       if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
-                                       bc + candidates[t][i].col };
-          thissad = get_mvpred_sad(
-              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-          CHECK_BETTER
+        // Call 4-point sad for multiples of 4 candidates.
+        const int no_of_4_cand_loops = num_candidates[t] >> 2;
+        for (i = 0; i < no_of_4_cand_loops; i++) {
+          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                  &temp_best_mv, &bestsad, &raw_bestsad, t,
+                                  &best_site, i * 4);
         }
+        // Rest of the candidates
+        const int remaining_cand = num_candidates[t] % 4;
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                               &temp_best_mv, &bestsad, &raw_bestsad, t,
+                               &best_site, remaining_cand,
+                               no_of_4_cand_loops * 4);
       } else {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const FULLPEL_MV this_mv = { br + candidates[t][i].row,
-                                       bc + candidates[t][i].col };
-          if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
-          thissad = get_mvpred_sad(
-              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-          CHECK_BETTER
-        }
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                               &temp_best_mv, &bestsad, &raw_bestsad, t,
+                               &best_site, num_candidates[t], 0);
       }
       if (best_site == -1) {
         continue;
@@ -663,8 +996,8 @@ static int pattern_search(
       }
     }
     if (best_init_s != -1) {
-      br += candidates[best_init_s][k].row;
-      bc += candidates[best_init_s][k].col;
+      br += search_sites->site[best_init_s][k].mv.row;
+      bc += search_sites->site[best_init_s][k].mv.col;
     }
   }
 
@@ -678,31 +1011,34 @@ static int pattern_search(
     for (; s >= last_s; s--) {
       // No need to search all points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
+        FULLPEL_MV temp_best_mv;
+        temp_best_mv.row = br;
+        temp_best_mv.col = bc;
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
-            thissad = get_mvpred_sad(
-                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+          // Call 4-point sad for multiples of 4 candidates.
+          const int no_of_4_cand_loops = num_candidates[s] >> 2;
+          for (i = 0; i < no_of_4_cand_loops; i++) {
+            calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                    &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                    &best_site, i * 4);
           }
+          // Rest of the candidates
+          const int remaining_cand = num_candidates[s] % 4;
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                 &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                 &best_site, remaining_cand,
+                                 no_of_4_cand_loops * 4);
         } else {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
-            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
-              continue;
-            thissad = get_mvpred_sad(
-                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
-          }
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                 &temp_best_mv, &bestsad, &raw_bestsad, s,
+                                 &best_site, num_candidates[s], 0);
         }
 
         if (best_site == -1) {
           continue;
         } else {
-          br += candidates[s][best_site].row;
-          bc += candidates[s][best_site].col;
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
           k = best_site;
         }
       }
@@ -717,31 +1053,39 @@ static int pattern_search(
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
               continue;
             thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
           k = next_chkpts_indices[best_site];
-          br += candidates[s][k].row;
-          bc += candidates[s][k].col;
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
         }
       } while (best_site != -1);
     }
@@ -753,27 +1097,35 @@ static int pattern_search(
       if (!do_init_search || s != best_init_s) {
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
+            const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
+                                         bc + search_sites->site[s][i].mv.col };
             cost_list[i + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            const FULLPEL_MV this_mv = { br + candidates[s][i].row,
-                                         bc + candidates[s][i].col };
+            const FULLPEL_MV this_mv = { br + search_sites->site[s][i].mv.row,
+                                         bc + search_sites->site[s][i].mv.col };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv))
               continue;
             cost_list[i + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
-          br += candidates[s][best_site].row;
-          bc += candidates[s][best_site].col;
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
           k = best_site;
         }
       }
@@ -790,18 +1142,22 @@ static int pattern_search(
         if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const FULLPEL_MV this_mv = {
-              br + candidates[s][next_chkpts_indices[i]].row,
-              bc + candidates[s][next_chkpts_indices[i]].col
+              br + search_sites->site[s][next_chkpts_indices[i]].mv.row,
+              bc + search_sites->site[s][next_chkpts_indices[i]].mv.col
             };
             if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
               cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
@@ -809,14 +1165,18 @@ static int pattern_search(
             }
             cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
-            CHECK_BETTER
+            const int found_better_mv =
+                update_mvs_and_sad(thissad, &this_mv, mv_cost_params, &bestsad,
+                                   &raw_bestsad, best_mv,
+                                   /*second_best_mv=*/NULL);
+            if (found_better_mv) best_site = i;
           }
         }
 
         if (best_site != -1) {
           k = next_chkpts_indices[best_site];
-          br += candidates[s][k].row;
-          bc += candidates[s][k].col;
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
         }
       }
     }
@@ -844,160 +1204,72 @@ static int pattern_search(
   const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
   return var_cost;
 }
-#undef CHECK_BETTER
 
 // For the following foo_search, the input arguments are:
-// x: The struct used to hold a bunch of random configs.
 // start_mv: where we are starting our motion search
-// search_param: how many steps to skip in our motion search. For example,
+// ms_params: a collection of motion search parameters
+// search_step: how many steps to skip in our motion search. For example,
 //   a value 3 suggests that 3 search steps have already taken place prior to
 //   this function call, so we jump directly to step 4 of the search process
-// sad_per_bit: a multiplier used to convert rate to sad cost
 // do_init_search: if on, do an initial search of all possible scales around the
 //   start_mv, and then pick the best scale.
 // cond_list: used to hold the cost around the best full mv so we can use it to
 //   speed up subpel search later.
-// vfp: a function pointer to the simd function so we can compute the cost
-//   efficiently
-// ref_mv: the reference mv used to compute the mv cost
+// best_mv: the best mv found in the motion search
 static int hex_search(const FULLPEL_MV start_mv,
                       const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                      const int search_param, const int do_init_search,
+                      const int search_step, const int do_init_search,
                       int *cost_list, FULLPEL_MV *best_mv) {
-  // First scale has 8-closest points, the rest have 6 points in hex shape
-  // at increasing scales
-  static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
-                                                              6, 6, 6, 6, 6 };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-    { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
-      { -1, 0 } },
-    { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
-    { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
-    { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
-    { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
-    { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
-      { -32, 0 } },
-    { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
-      { -64, 0 } },
-    { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
-      { -128, 0 } },
-    { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
-      { -256, 0 } },
-    { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
-      { -512, 0 } },
-    { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
-      { -512, 1024 }, { -1024, 0 } },
-  };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        hex_num_candidates, hex_candidates, cost_list, best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int bigdia_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                         const int search_param, const int do_init_search,
+                         const int search_step, const int do_init_search,
                          int *cost_list, FULLPEL_MV *best_mv) {
-  // First scale has 4-closest points, the rest have 8 points in diamond
-  // shape at increasing scales
-  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
-    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV
-      bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-        { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
-        { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
-          { -1, 1 }, { -2, 0 } },
-        { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
-          { -2, 2 }, { -4, 0 } },
-        { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
-          { -4, 4 }, { -8, 0 } },
-        { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
-          { -8, 8 }, { -16, 0 } },
-        { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
-          { 0, 32 }, { -16, 16 }, { -32, 0 } },
-        { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
-          { 0, 64 }, { -32, 32 }, { -64, 0 } },
-        { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
-          { 0, 128 }, { -64, 64 }, { -128, 0 } },
-        { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
-          { 0, 256 }, { -128, 128 }, { -256, 0 } },
-        { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
-          { 0, 512 }, { -256, 256 }, { -512, 0 } },
-        { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
-          { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
-      };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        bigdia_num_candidates, bigdia_candidates, cost_list,
-                        best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int square_search(const FULLPEL_MV start_mv,
                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                         const int search_param, const int do_init_search,
+                         const int search_step, const int do_init_search,
                          int *cost_list, FULLPEL_MV *best_mv) {
-  // All scales have 8 closest points in square shape
-  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  };
-  // Note that the largest candidate step at each scale is 2^scale
-  /* clang-format off */
-  static const MV
-      square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
-        { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
-          { -1, 1 }, { -1, 0 } },
-        { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
-          { -2, 2 }, { -2, 0 } },
-        { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
-          { -4, 4 }, { -4, 0 } },
-        { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
-          { -8, 8 }, { -8, 0 } },
-        { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
-          { 0, 16 }, { -16, 16 }, { -16, 0 } },
-        { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
-          { 0, 32 }, { -32, 32 }, { -32, 0 } },
-        { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
-          { 0, 64 }, { -64, 64 }, { -64, 0 } },
-        { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
-          { 0, 128 }, { -128, 128 }, { -128, 0 } },
-        { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
-          { 0, 256 }, { -256, 256 }, { -256, 0 } },
-        { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
-          { 0, 512 }, { -512, 512 }, { -512, 0 } },
-        { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
-          { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
-      };
-  /* clang-format on */
-  return pattern_search(start_mv, ms_params, search_param, do_init_search,
-                        square_num_candidates, square_candidates, cost_list,
-                        best_mv);
+  return pattern_search(start_mv, ms_params, search_step, do_init_search,
+                        cost_list, best_mv);
 }
 
 static int fast_hex_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                           const int search_param, const int do_init_search,
+                           const int search_step, const int do_init_search,
                            int *cost_list, FULLPEL_MV *best_mv) {
   return hex_search(start_mv, ms_params,
-                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                    do_init_search, cost_list, best_mv);
+                    AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
+                    cost_list, best_mv);
 }
 
 static int fast_dia_search(const FULLPEL_MV start_mv,
                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                           const int search_param, const int do_init_search,
+                           const int search_step, const int do_init_search,
                            int *cost_list, FULLPEL_MV *best_mv) {
   return bigdia_search(start_mv, ms_params,
-                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                       AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
+                       do_init_search, cost_list, best_mv);
+}
+
+static int fast_bigdia_search(const FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int search_step, const int do_init_search,
+                              int *cost_list, FULLPEL_MV *best_mv) {
+  return bigdia_search(start_mv, ms_params,
+                       AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
                        do_init_search, cost_list, best_mv);
 }
 
 static int diamond_search_sad(FULLPEL_MV start_mv,
                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                              const int search_param, int *num00,
+                              const int search_step, int *num00,
                               FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -1005,7 +1277,6 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
   const int ref_stride = ref->stride;
   const uint8_t *best_address;
 
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *mask = ms_params->ms_buffers.mask;
   const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
@@ -1018,9 +1289,9 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
 
   clamp_fullmv(&start_mv, &ms_params->mv_limits);
 
-  // search_param determines the length of the initial step and hence the number
+  // search_step determines the length of the initial step and hence the number
   // of iterations.
-  const int tot_steps = cfg->ss_count - search_param;
+  const int tot_steps = cfg->num_search_steps - search_step;
 
   *num00 = 0;
   *best_mv = start_mv;
@@ -1032,16 +1303,16 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
 
   int next_step_size = tot_steps > 2 ? cfg->radius[tot_steps - 2] : 1;
   for (int step = tot_steps - 1; step >= 0; --step) {
-    const search_site *ss = cfg->ss[step];
+    const search_site *site = cfg->site[step];
     best_site = 0;
     if (step > 0) next_step_size = cfg->radius[step - 1];
 
     int all_in = 1, j;
     // Trap illegal vectors
-    all_in &= best_mv->row + ss[1].mv.row >= ms_params->mv_limits.row_min;
-    all_in &= best_mv->row + ss[2].mv.row <= ms_params->mv_limits.row_max;
-    all_in &= best_mv->col + ss[3].mv.col >= ms_params->mv_limits.col_min;
-    all_in &= best_mv->col + ss[4].mv.col <= ms_params->mv_limits.col_max;
+    all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
+    all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
+    all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
+    all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
 
     // TODO(anyone): Implement 4 points search for msdf&sdaf
     if (all_in && !mask && !second_pred) {
@@ -1052,13 +1323,13 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
         unsigned int sads[4];
 
         for (j = 0; j < 4; j++)
-          block_offset[j] = ss[idx + j].offset + best_address;
+          block_offset[j] = site[idx + j].offset + best_address;
 
-        vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
+        ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
         for (j = 0; j < 4; j++) {
           if (sads[j] < bestsad) {
-            const FULLPEL_MV this_mv = { best_mv->row + ss[idx + j].mv.row,
-                                         best_mv->col + ss[idx + j].mv.col };
+            const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
+                                         best_mv->col + site[idx + j].mv.col };
             unsigned int thissad =
                 sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
             if (thissad < bestsad) {
@@ -1070,11 +1341,11 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
       }
     } else {
       for (int idx = 1; idx <= cfg->searches_per_step[step]; idx++) {
-        const FULLPEL_MV this_mv = { best_mv->row + ss[idx].mv.row,
-                                     best_mv->col + ss[idx].mv.col };
+        const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+                                     best_mv->col + site[idx].mv.col };
 
         if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
-          const uint8_t *const check_here = ss[idx].offset + best_address;
+          const uint8_t *const check_here = site[idx].offset + best_address;
           unsigned int thissad;
 
           thissad =
@@ -1095,9 +1366,9 @@ static int diamond_search_sad(FULLPEL_MV start_mv,
       if (second_best_mv) {
         *second_best_mv = *best_mv;
       }
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
+      best_mv->row += site[best_site].mv.row;
+      best_mv->col += site[best_site].mv.col;
+      best_address += site[best_site].offset;
       is_off_center = 1;
     }
 
@@ -1133,7 +1404,7 @@ static int full_pixel_diamond(const FULLPEL_MV start_mv,
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
-  const int further_steps = cfg->ss_count - 1 - step_param;
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
   while (n < further_steps) {
     ++n;
 
@@ -1176,7 +1447,6 @@ static int exhaustive_mesh_search(FULLPEL_MV start_mv,
                                   const int range, const int step,
                                   FULLPEL_MV *best_mv,
                                   FULLPEL_MV *second_best_mv) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -1184,7 +1454,7 @@ static int exhaustive_mesh_search(FULLPEL_MV start_mv,
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
-  int col_step = (step > 1) ? step : 4;
+  const int col_step = (step > 1) ? step : 4;
 
   assert(step >= 1);
 
@@ -1205,16 +1475,8 @@ static int exhaustive_mesh_search(FULLPEL_MV start_mv,
         const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
         unsigned int sad = get_mvpred_sad(
             ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost_(&mv, mv_cost_params);
-          if (sad < best_sad) {
-            best_sad = sad;
-            if (second_best_mv) {
-              *second_best_mv = *best_mv;
-            }
-            *best_mv = mv;
-          }
-        }
+        update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                           /*raw_best_sad=*/NULL, best_mv, second_best_mv);
       } else {
         // 4 sads in a single call if we are checking every location
         if (c + 3 <= end_col) {
@@ -1224,20 +1486,15 @@ static int exhaustive_mesh_search(FULLPEL_MV start_mv,
             const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
             addrs[i] = get_buf_from_fullmv(ref, &mv);
           }
-          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
 
           for (i = 0; i < 4; ++i) {
             if (sads[i] < best_sad) {
               const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
-              const unsigned int sad =
-                  sads[i] + mvsad_err_cost_(&mv, mv_cost_params);
-              if (sad < best_sad) {
-                best_sad = sad;
-                if (second_best_mv) {
-                  *second_best_mv = *best_mv;
-                }
-                *best_mv = mv;
-              }
+              update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+                                 /*raw_best_sad=*/NULL, best_mv,
+                                 second_best_mv);
             }
           }
         } else {
@@ -1245,16 +1502,8 @@ static int exhaustive_mesh_search(FULLPEL_MV start_mv,
             const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
             unsigned int sad = get_mvpred_sad(
                 ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
-            if (sad < best_sad) {
-              sad += mvsad_err_cost_(&mv, mv_cost_params);
-              if (sad < best_sad) {
-                best_sad = sad;
-                if (second_best_mv) {
-                  *second_best_mv = *best_mv;
-                }
-                *best_mv = mv;
-              }
-            }
+            update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                               /*raw_best_sad=*/NULL, best_mv, second_best_mv);
           }
         }
       }
@@ -1295,6 +1544,15 @@ static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
   range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
   range = AOMMIN(range, kMaxRange);
   interval = AOMMAX(interval, range / baseline_interval_divisor);
+  // Use a small search step/interval for certain kind of clips.
+  // For example, screen content clips with a lot of texts.
+  // Large interval could lead to a false matching position, and it can't find
+  // the best global candidate in following iterations due to reduced search
+  // range. The solution here is to use a small search iterval in the beginning
+  // and thus reduces the chance of missing the best candidate.
+  if (ms_params->fine_search_interval) {
+    interval = AOMMIN(interval, 4);
+  }
 
   // initial search
   bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
@@ -1419,10 +1677,6 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
     MARK_MV_INVALID(second_best_mv);
   }
 
-  assert(ms_params->ms_buffers.second_pred == NULL &&
-         ms_params->ms_buffers.mask == NULL &&
-         "av1_full_pixel_search does not support compound pred");
-
   if (cost_list) {
     cost_list[0] = INT_MAX;
     cost_list[1] = INT_MAX;
@@ -1432,6 +1686,10 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
   }
 
   switch (search_method) {
+    case FAST_BIGDIA:
+      var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
+                               best_mv);
+      break;
     case FAST_DIAMOND:
       var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
                             best_mv);
@@ -1452,7 +1710,9 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
           bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
       break;
     case NSTEP:
+    case NSTEP_8PT:
     case DIAMOND:
+    case CLAMPED_DIAMOND:
       var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
                                best_mv, second_best_mv);
       break;
@@ -1460,12 +1720,13 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
   }
 
   // Should we allow a follow on exhaustive search?
-  if (!run_mesh_search && search_method == NSTEP) {
-    int exhuastive_thr = ms_params->force_mesh_thresh;
-    exhuastive_thr >>=
+  if (!run_mesh_search &&
+      ((search_method == NSTEP) || (search_method == NSTEP_8PT))) {
+    int exhaustive_thr = ms_params->force_mesh_thresh;
+    exhaustive_thr >>=
         10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
     // Threshold variance for an exhaustive full search.
-    if (var > exhuastive_thr) run_mesh_search = 1;
+    if (var > exhaustive_thr) run_mesh_search = 1;
   }
 
   // TODO(yunqing): the following is used to reduce mesh search in temporal
@@ -1473,11 +1734,45 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
   if (!is_intra_mode && ms_params->prune_mesh_search) {
     const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
                                         abs(start_mv.col - best_mv->col));
-    if (full_pel_mv_diff <= 4) {
+    if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) {
       run_mesh_search = 0;
     }
   }
 
+  if (ms_params->sdf != ms_params->vfp->sdf) {
+    // If we are skipping rows when we perform the motion search, we need to
+    // check the quality of skipping. If it's bad, then we run mesh search with
+    // skip row features off.
+    // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+    // offset of 1 before we hit this statement to avoid having to redo
+    // motion search.
+    const struct buf_2d *src = ms_params->ms_buffers.src;
+    const struct buf_2d *ref = ms_params->ms_buffers.ref;
+    const int src_stride = src->stride;
+    const int ref_stride = ref->stride;
+
+    const uint8_t *src_address = src->buf;
+    const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+    const int sad =
+        ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+    const int skip_sad =
+        ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+    // We will keep the result of skipping rows if it's good enough. Here, good
+    // enough means the error is less than 1 per pixel.
+    const int kSADThresh =
+        1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+      // There is a large discrepancy between skipping and not skipping, so we
+      // need to redo the motion search.
+      FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+      new_ms_params.sdf = new_ms_params.vfp->sdf;
+      new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+
+      return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+                                   cost_list, best_mv, second_best_mv);
+    }
+  }
+
   if (run_mesh_search) {
     int var_ex;
     FULLPEL_MV tmp_mv_ex;
@@ -1545,7 +1840,7 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
       const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
                       GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
       if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
-                           cpi->common.seq_params.mib_size_log2))
+                           cpi->common.seq_params->mib_size_log2))
         continue;
 
       FULLPEL_MV hash_mv;
@@ -1672,8 +1967,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
   if (xd->bd != 8) {
     unsigned int sad;
     best_int_mv->as_fullmv = kZeroFullMv;
-    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                 xd->plane[0].pre[0].buf, ref_stride);
+    sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
 
     if (scaled_ref_frame) {
       int i;
@@ -1716,7 +2011,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
   FULLPEL_MV this_mv = best_int_mv->as_fullmv;
   src_buf = x->plane[0].src.buf;
   ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
-  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  best_sad =
+      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
 
   {
     const uint8_t *const pos[4] = {
@@ -1726,7 +2022,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
       ref_buf + ref_stride,
     };
 
-    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+    cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
+                                   this_sad);
   }
 
   for (idx = 0; idx < 4; ++idx) {
@@ -1749,7 +2046,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
 
   ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
 
-  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  tmp_sad =
+      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
   if (best_sad > tmp_sad) {
     best_int_mv->as_fullmv = this_mv;
     best_sad = tmp_sad;
@@ -1836,7 +2134,7 @@ static int obmc_refining_search_sad(
 
 static int obmc_diamond_search_sad(
     const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
-    FULLPEL_MV *best_mv, int search_param, int *num00) {
+    FULLPEL_MV *best_mv, int search_step, int *num00) {
   const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
   const search_site_config *cfg = ms_params->search_sites;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
@@ -1844,16 +2142,13 @@ static int obmc_diamond_search_sad(
   const int32_t *wsrc = ms_buffers->wsrc;
   const int32_t *mask = ms_buffers->obmc_mask;
   const struct buf_2d *const ref_buf = ms_buffers->ref;
-  // search_param determines the length of the initial step and hence the number
-  // of iterations
-  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
-  // (MAX_FIRST_STEP/4) pel... etc.
 
-  const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_param;
+  // search_step determines the length of the initial step and hence the number
+  // of iterations.
+  const int tot_steps = cfg->num_search_steps - search_step;
   const uint8_t *best_address, *init_ref;
   int best_sad = INT_MAX;
   int best_site = 0;
-  int step;
 
   clamp_fullmv(&start_mv, &ms_params->mv_limits);
   best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv);
@@ -1864,14 +2159,14 @@ static int obmc_diamond_search_sad(
   best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) +
              mvsad_err_cost_(best_mv, mv_cost_params);
 
-  for (step = tot_steps; step >= 0; --step) {
-    const search_site *const ss = cfg->ss[step];
+  for (int step = tot_steps - 1; step >= 0; --step) {
+    const search_site *const site = cfg->site[step];
     best_site = 0;
     for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
-      const FULLPEL_MV mv = { best_mv->row + ss[idx].mv.row,
-                              best_mv->col + ss[idx].mv.col };
+      const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row,
+                              best_mv->col + site[idx].mv.col };
       if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
-        int sad = fn_ptr->osdf(best_address + ss[idx].offset, ref_buf->stride,
+        int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride,
                                wsrc, mask);
         if (sad < best_sad) {
           sad += mvsad_err_cost_(&mv, mv_cost_params);
@@ -1885,9 +2180,9 @@ static int obmc_diamond_search_sad(
     }
 
     if (best_site != 0) {
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
+      best_mv->row += site[best_site].mv.row;
+      best_mv->col += site[best_site].mv.col;
+      best_address += site[best_site].offset;
     } else if (best_address == init_ref) {
       (*num00)++;
     }
@@ -1897,7 +2192,7 @@ static int obmc_diamond_search_sad(
 
 static int obmc_full_pixel_diamond(
     const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
-    int step_param, int do_refine, FULLPEL_MV *best_mv) {
+    int step_param, FULLPEL_MV *best_mv) {
   const search_site_config *cfg = ms_params->search_sites;
   FULLPEL_MV tmp_mv;
   int thissme, n, num00 = 0;
@@ -1908,8 +2203,7 @@ static int obmc_full_pixel_diamond(
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
-  const int further_steps = cfg->ss_count - 1 - step_param;
-  if (n > further_steps) do_refine = 0;
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
 
   while (n < further_steps) {
     ++n;
@@ -1921,9 +2215,6 @@ static int obmc_full_pixel_diamond(
                                         step_param + n, &num00);
       if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
 
-      // check to see if refining search is needed.
-      if (num00 > further_steps - n) do_refine = 0;
-
       if (thissme < bestsme) {
         bestsme = thissme;
         *best_mv = tmp_mv;
@@ -1931,16 +2222,6 @@ static int obmc_full_pixel_diamond(
     }
   }
 
-  // final 1-away diamond refining search
-  if (do_refine) {
-    tmp_mv = *best_mv;
-    thissme = obmc_refining_search_sad(ms_params, &tmp_mv);
-    if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
-    if (thissme < bestsme) {
-      bestsme = thissme;
-      *best_mv = tmp_mv;
-    }
-  }
   return bestsme;
 }
 
@@ -1948,9 +2229,8 @@ int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
                                const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
                                const int step_param, FULLPEL_MV *best_mv) {
   if (!ms_params->fast_obmc_search) {
-    const int do_refine = 1;
-    const int bestsme = obmc_full_pixel_diamond(ms_params, start_mv, step_param,
-                                                do_refine, best_mv);
+    const int bestsme =
+        obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
     return bestsme;
   } else {
     *best_mv = start_mv;
@@ -1980,7 +2260,6 @@ static INLINE int get_subpel_part(int x) { return x & 7; }
 
 // Gets the address of the ref buffer at subpel location (r, c), rounded to the
 // nearest fullpel precision toward - \infty
-
 static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
                                              const MV mv) {
   const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
@@ -2115,14 +2394,19 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
 // both prediction error and residue into account. It is suffixed "fast" because
 // it uses bilinear filter to estimate the prediction.
 static INLINE unsigned int check_better_fast(
-    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int *has_better_mv) {
+    unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
   unsigned int cost;
   if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
     unsigned int sse;
-    int thismse = estimated_pref_error(this_mv, var_params, &sse);
+    int thismse;
+    if (is_scaled) {
+      thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+    } else {
+      thismse = estimated_pref_error(this_mv, var_params, &sse);
+    }
     cost = mv_err_cost_(this_mv, mv_cost_params);
     cost += thismse;
 
@@ -2180,39 +2464,41 @@ static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
 // search in the best quadrant. This uses bilinear filter to speed up the
 // calculation.
 static AOM_FORCE_INLINE MV first_level_check_fast(
-    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   // Check the four cardinal directions
   const MV left_mv = { this_mv.row, this_mv.col - hstep };
   int dummy = 0;
-  const unsigned int left =
-      check_better_fast(&left_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int left = check_better_fast(
+      xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
 
   const MV right_mv = { this_mv.row, this_mv.col + hstep };
-  const unsigned int right =
-      check_better_fast(&right_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int right = check_better_fast(
+      xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
 
   const MV top_mv = { this_mv.row - hstep, this_mv.col };
-  const unsigned int up =
-      check_better_fast(&top_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                        besterr, sse1, distortion, &dummy);
+  const unsigned int up = check_better_fast(
+      xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
 
   const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
-  const unsigned int down =
-      check_better_fast(&bottom_mv, best_mv, mv_limits, var_params,
-                        mv_cost_params, besterr, sse1, distortion, &dummy);
+  const unsigned int down = check_better_fast(
+      xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
 
   const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
   const MV diag_mv = { this_mv.row + diag_step.row,
                        this_mv.col + diag_step.col };
 
   // Check the diagonal direction with the best mv
-  check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                    besterr, sse1, distortion, &dummy);
+  check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                    mv_cost_params, besterr, sse1, distortion, &dummy,
+                    is_scaled);
 
   return diag_step;
 }
@@ -2220,10 +2506,11 @@ static AOM_FORCE_INLINE MV first_level_check_fast(
 // Performs a following up search after first_level_check_fast is called. This
 // performs two extra chess pattern searches in the best quadrant.
 static AOM_FORCE_INLINE void second_level_check_fast(
-    const MV this_mv, const MV diag_step, MV *best_mv, int hstep,
-    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+    MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   assert(diag_step.row == hstep || diag_step.row == -hstep);
   assert(diag_step.col == hstep || diag_step.col == -hstep);
   const int tr = this_mv.row;
@@ -2236,39 +2523,47 @@ static AOM_FORCE_INLINE void second_level_check_fast(
     assert(diag_step.row == br - tr);
     const MV chess_mv_1 = { br, bc + diag_step.col };
     const MV chess_mv_2 = { br + diag_step.row, bc };
-    check_better_fast(&chess_mv_1, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
-    check_better_fast(&chess_mv_2, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   } else if (tr == br && tc != bc) {
     assert(diag_step.col == bc - tc);
     // Continue searching in the best direction
     const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
     const MV top_long_mv = { br - hstep, bc + diag_step.col };
-    check_better_fast(&bottom_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
-    check_better_fast(&top_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
     // Search in the direction opposite of the best quadrant
     const MV rev_mv = { br - diag_step.row, bc };
-    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                      besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   } else if (tr != br && tc == bc) {
     assert(diag_step.row == br - tr);
     // Continue searching in the best direction
     const MV right_long_mv = { br + diag_step.row, bc + hstep };
     const MV left_long_mv = { br + diag_step.row, bc - hstep };
-    check_better_fast(&right_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
-    check_better_fast(&left_long_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
 
     // Search in the direction opposite of the best quadrant
     const MV rev_mv = { br, bc - diag_step.col };
-    check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params,
-                      besterr, sse1, distortion, &dummy);
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
   }
 }
 
@@ -2276,17 +2571,18 @@ static AOM_FORCE_INLINE void second_level_check_fast(
 // searches the four cardinal directions, and perform several
 // diagonal/chess-pattern searches in the best quadrant.
 static AOM_FORCE_INLINE void two_level_checks_fast(
-    const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int iters) {
-  const MV diag_step =
-      first_level_check_fast(this_mv, best_mv, hstep, mv_limits, var_params,
-                             mv_cost_params, besterr, sse1, distortion);
+    unsigned int *sse1, int *distortion, int iters, int is_scaled) {
+  const MV diag_step = first_level_check_fast(
+      xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, is_scaled);
   if (iters > 1) {
-    second_level_check_fast(this_mv, diag_step, best_mv, hstep, mv_limits,
-                            var_params, mv_cost_params, besterr, sse1,
-                            distortion);
+    second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep,
+                            mv_limits, var_params, mv_cost_params, besterr,
+                            sse1, distortion, is_scaled);
   }
 }
 
@@ -2334,7 +2630,7 @@ static AOM_FORCE_INLINE void second_level_check_v2(
     MV *best_mv, const SubpelMvLimits *mv_limits,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
+    unsigned int *sse1, int *distortion, int is_scaled) {
   assert(best_mv->row == this_mv.row + diag_step.row ||
          best_mv->col == this_mv.col + diag_step.col);
   if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
@@ -2365,18 +2661,18 @@ static AOM_FORCE_INLINE void second_level_check_v2(
                    mv_cost_params, besterr, sse1, distortion, &has_better_mv);
     }
   } else {
-    check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion,
-                      &has_better_mv);
-    check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
-                      mv_cost_params, besterr, sse1, distortion,
-                      &has_better_mv);
+    check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+                      is_scaled);
+    check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+                      is_scaled);
 
     // Do an additional search if the second iteration gives a better mv
     if (has_better_mv) {
-      check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+      check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
                         mv_cost_params, besterr, sse1, distortion,
-                        &has_better_mv);
+                        &has_better_mv, is_scaled);
     }
   }
 }
@@ -2494,80 +2790,18 @@ static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
   return 0;
 }
 
-int av1_find_best_sub_pixel_tree_pruned_evenmore(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
-    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
-  (void)cm;
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int *cost_list = ms_params->cost_list;
-  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-
-  // The iteration we are current searching for. Iter 0 corresponds to fullpel
-  // mv, iter 1 to half pel, and so on
-  int iter = 0;
-  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
-  unsigned int besterr = INT_MAX;
-  *bestmv = start_mv;
-
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
-
-  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-    return INT_MAX;
-  }
-  iter++;
-
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    int ir, ic;
-    int dummy = 0;
-    get_cost_surf_min(cost_list, &ir, &ic, 2);
-    if (ir != 0 || ic != 0) {
-      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
-      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
-                        &besterr, sse1, distortion, &dummy);
-    }
+static AOM_INLINE int setup_center_error_facade(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+    int is_scaled) {
+  if (is_scaled) {
+    return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                        mv_cost_params, sse1, distortion);
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
-
-    // Each subsequent iteration checks at least one point in common with
-    // the last iteration could be 2 ( if diag selected) 1/4 pel
-    if (forced_stop != HALF_PEL) {
-      if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-        return INT_MAX;
-      }
-      iter++;
-
-      hstep >>= 1;
-      start_mv = *bestmv;
-      two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                            mv_cost_params, &besterr, sse1, distortion,
-                            iters_per_step);
-    }
+    return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                              distortion);
   }
-
-  if (allow_hp && forced_stop == EIGHTH_PEL) {
-    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
-      return INT_MAX;
-    }
-    iter++;
-
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
-  }
-
-  return besterr;
 }
 
 int av1_find_best_sub_pixel_tree_pruned_more(
@@ -2590,8 +2824,15 @@ int av1_find_best_sub_pixel_tree_pruned_more(
   unsigned int besterr = INT_MAX;
   *bestmv = start_mv;
 
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+  besterr = setup_center_error_facade(
+      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
 
   if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
     return INT_MAX;
@@ -2607,18 +2848,19 @@ int av1_find_best_sub_pixel_tree_pruned_more(
       const MV this_mv = { start_mv.row + ir * hstep,
                            start_mv.col + ic * hstep };
       int dummy = 0;
-      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
-                        &besterr, sse1, distortion, &dummy);
+      check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+                        mv_cost_params, &besterr, sse1, distortion, &dummy,
+                        is_scaled);
     }
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   // Each subsequent iteration checks at least one point in common with
   // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
+  if (forced_stop < HALF_PEL) {
     if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
       return INT_MAX;
     }
@@ -2626,9 +2868,9 @@ int av1_find_best_sub_pixel_tree_pruned_more(
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   if (allow_hp && forced_stop == EIGHTH_PEL) {
@@ -2639,9 +2881,9 @@ int av1_find_best_sub_pixel_tree_pruned_more(
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   return besterr;
@@ -2667,8 +2909,16 @@ int av1_find_best_sub_pixel_tree_pruned(
   unsigned int besterr = INT_MAX;
   *bestmv = start_mv;
 
-  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
-                               distortion);
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+  besterr = setup_center_error_facade(
+      xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled);
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
   if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
     return INT_MAX;
   }
@@ -2694,47 +2944,59 @@ int av1_find_best_sub_pixel_tree_pruned(
 
     switch (whichdir) {
       case 0:  // bottom left quadrant
-        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
         break;
       case 1:  // bottom right quadrant
-        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&bottom_right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
         break;
       case 2:  // top left quadrant
-        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_left_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
         break;
       case 3:  // top right quadrant
-        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
-        check_better_fast(&top_right_mv, bestmv, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
         break;
     }
   } else {
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   // Each subsequent iteration checks at least one point in common with
   // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
+  if (forced_stop < HALF_PEL) {
     if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
       return INT_MAX;
     }
@@ -2742,9 +3004,9 @@ int av1_find_best_sub_pixel_tree_pruned(
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   if (allow_hp && forced_stop == EIGHTH_PEL) {
@@ -2755,9 +3017,9 @@ int av1_find_best_sub_pixel_tree_pruned(
 
     hstep >>= 1;
     start_mv = *bestmv;
-    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
-                          mv_cost_params, &besterr, sse1, distortion,
-                          iters_per_step);
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
   }
 
   return besterr;
@@ -2786,6 +3048,11 @@ int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
 
   *bestmv = start_mv;
 
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
   if (subpel_search_type != USE_2_TAPS_ORIG) {
     besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
                                            mv_cost_params, sse1, distortion);
@@ -2794,6 +3061,9 @@ int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                  distortion);
   }
 
+  // If forced_stop is FULL_PEL, return.
+  if (!round) return besterr;
+
   for (int iter = 0; iter < round; ++iter) {
     MV iter_center_mv = *bestmv;
     if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
@@ -2807,16 +3077,16 @@ int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                     mv_limits, var_params, mv_cost_params,
                                     &besterr, sse1, distortion);
     } else {
-      diag_step = first_level_check_fast(iter_center_mv, bestmv, hstep,
+      diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
                                          mv_limits, var_params, mv_cost_params,
-                                         &besterr, sse1, distortion);
+                                         &besterr, sse1, distortion, is_scaled);
     }
 
     // Check diagonal sub-pixel position
     if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
       second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
                             mv_limits, var_params, mv_cost_params, &besterr,
-                            sse1, distortion);
+                            sse1, distortion, is_scaled);
     }
 
     hstep >>= 1;
@@ -2881,6 +3151,7 @@ int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   return besterr;
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Computes the cost of the current predictor by going through the whole
 // av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
 // during motion_mode_rd. We are going through the whole
@@ -2948,9 +3219,10 @@ unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
       if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
         memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
         memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
-        if (total_samples > 1)
+        if (total_samples > 1) {
           mbmi->num_proj_ref =
               av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+        }
 
         if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                                  this_mv.row, this_mv.col, &mbmi->wm_params,
@@ -2979,6 +3251,7 @@ unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
   mbmi->num_proj_ref = best_num_proj_ref;
   return bestmse;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 // =============================================================================
 //  Subpixel Motion Search: OBMC
 // =============================================================================
@@ -3324,22 +3597,18 @@ int av1_find_best_obmc_sub_pixel_tree_up(
 // =============================================================================
 //  Public cost function: mv_cost + pred error
 // =============================================================================
-int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre) {
+  const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int sse, var;
 
-  var = vfp->vf(what->buf, what->stride, get_buf_from_fullmv(in_what, best_mv),
-                in_what->stride, &sse);
+  var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv),
+                pre->stride, &sse);
   (void)var;
 
-  return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
-                           x->errorperbit, mv_cost_type);
+  return sse + mv_err_cost_(&mv, mv_cost_params);
 }
 
 static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
@@ -3348,13 +3617,11 @@ static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
                                     const aom_variance_fn_ptr_t *vfp,
                                     const struct buf_2d *src,
                                     const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
   const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int unused;
 
-  return vfp->svaf(get_buf_from_fullmv(in_what, &best_mv), in_what->stride, 0,
-                   0, what->buf, what->stride, &unused, second_pred) +
+  return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+                   src->buf, src->stride, &unused, second_pred) +
          mv_err_cost_(&mv, mv_cost_params);
 }
 
@@ -3363,14 +3630,12 @@ static INLINE int get_mvpred_mask_var(
     const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
     int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
     const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
   const MV mv = get_mv_from_fullmv(&best_mv);
   unsigned int unused;
 
-  return vfp->msvf(what->buf, what->stride, 0, 0,
-                   get_buf_from_fullmv(in_what, &best_mv), in_what->stride,
-                   second_pred, mask, mask_stride, invert_mask, &unused) +
+  return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+                   src->buf, src->stride, second_pred, mask, mask_stride,
+                   invert_mask, &unused) +
          mv_err_cost_(&mv, mv_cost_params);
 }
 
diff --git a/media/libaom/src/av1/encoder/mcomp.h b/media/libaom/src/av1/encoder/mcomp.h
index 73135d8591..67f2328b10 100644
--- a/media/libaom/src/av1/encoder/mcomp.h
+++ b/media/libaom/src/av1/encoder/mcomp.h
@@ -14,6 +14,7 @@
 
 #include "av1/common/mv.h"
 #include "av1/encoder/block.h"
+#include "av1/encoder/rd.h"
 
 #include "aom_dsp/variance.h"
 
@@ -42,8 +43,9 @@ typedef struct search_site {
 } search_site;
 
 typedef struct search_site_config {
-  search_site ss[MAX_MVSEARCH_STEPS * 2][16 + 1];
-  int ss_count;
+  search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1];
+  // Number of search steps.
+  int num_search_steps;
   int searches_per_step[MAX_MVSEARCH_STEPS * 2];
   int radius[MAX_MVSEARCH_STEPS * 2];
   int stride;
@@ -60,21 +62,34 @@ struct SPEED_FEATURES;
 // =============================================================================
 //  Cost functions
 // =============================================================================
+
+enum {
+  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
+  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
+  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
+  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as as cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
 typedef struct {
+  // The reference mv used to compute the mv cost
   const MV *ref_mv;
   FULLPEL_MV full_ref_mv;
+  MV_COST_TYPE mv_cost_type;
   const int *mvjcost;
   const int *mvcost[2];
   int error_per_bit;
+  // A multiplier used to convert rate to sad cost
   int sad_per_bit;
-  MV_COST_TYPE mv_cost_type;
 } MV_COST_PARAMS;
 
 int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
-                    int *mvcost[2], int weight);
+                    int *const mvcost[2], int weight);
 
-int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp);
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre);
 int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params,
                                 const FULLPEL_MV best_mv,
                                 const uint8_t *second_pred, const uint8_t *mask,
@@ -116,23 +131,50 @@ static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
 //  Fullpixel Motion Search
 // =============================================================================
 enum {
+  // Search 8-points in the radius grid around center, up to 11 search stages.
   DIAMOND = 0,
+  // Search 12-points in the radius/tan_radius grid around center,
+  // up to 15 search stages.
   NSTEP = 1,
-  HEX = 2,
-  BIGDIA = 3,
-  SQUARE = 4,
-  FAST_HEX = 5,
-  FAST_DIAMOND = 6
+  // Search 8-points in the radius grid around center, up to 16 search stages.
+  NSTEP_8PT = 2,
+  // Search 8-points in the radius grid around center, upto 11 search stages
+  // with clamping of search radius.
+  CLAMPED_DIAMOND = 3,
+  // Search maximum 8-points in the radius grid around center,
+  // up to 11 search stages. First stage consists of 8 search points
+  // and the rest with 6 search points each in hex shape.
+  HEX = 4,
+  // Search maximum 8-points in the radius grid around center,
+  // up to 11 search stages. First stage consists of 4 search
+  // points and the rest with 8 search points each.
+  BIGDIA = 5,
+  // Search 8-points in the square grid around center, up to 11 search stages.
+  SQUARE = 6,
+  // HEX search with up to 2 stages.
+  FAST_HEX = 7,
+  // BIGDIA search with up to 2 stages.
+  FAST_DIAMOND = 8,
+  // BIGDIA search with up to 3 stages.
+  FAST_BIGDIA = 9,
+  // Total number of search methods.
+  NUM_SEARCH_METHODS,
+  // Number of distinct search methods.
+  NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1,
 } UENUM1BYTE(SEARCH_METHODS);
 
 // This struct holds fullpixel motion search parameters that should be constant
 // during the search
 typedef struct {
   BLOCK_SIZE bsize;
+  // A function pointer to the simd function for fast computation
   const aom_variance_fn_ptr_t *vfp;
 
   MSBuffers ms_buffers;
 
+  // WARNING: search_method should be regarded as a private variable and should
+  // not be modified directly so it is in sync with search_sites. To modify it,
+  // use av1_set_mv_search_method.
   SEARCH_METHODS search_method;
   const search_site_config *search_sites;
   FullMvLimits mv_limits;
@@ -141,30 +183,84 @@ typedef struct {
                           // prune_mesh_search.
   int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
                           // search if close to the start_mv.
+  int mesh_search_mv_diff_threshold;  // mv diff threshold to enable
+                                      // prune_mesh_search
   int force_mesh_thresh;  // Forces mesh search if the residue variance is
                           // higher than the threshold.
   const struct MESH_PATTERN *mesh_patterns[2];
 
+  // Use maximum search interval of 4 if true. This helps motion search to find
+  // the best motion vector for screen content types.
+  int fine_search_interval;
+
   int is_intra_mode;
 
   int fast_obmc_search;
 
   // For calculating mv cost
   MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from the
+  // sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
+  aom_sad_fn_t sdf;
+  aom_sad_multi_d_fn_t sdx4df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
-void av1_make_default_fullpel_ms_params(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                        const struct AV1_COMP *cpi,
-                                        const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                        const MV *ref_mv,
-                                        const search_site_config *search_sites);
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const IntraBCMVCosts *dv_costs);
 
-// Sets up configs for fullpixel diamond search
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
-// Sets up configs for firstpass motion search
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level);
+// Sets up configs for firstpass motion search.
 void av1_init_motion_fpf(search_site_config *cfg, int stride);
-// Sets up configs for all other types of motion search
-void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level);
+
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_search_method(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method) {
+  // Array to inform which all search methods are having
+  // same candidates and different in number of search steps.
+  static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+    DIAMOND,          // DIAMOND
+    NSTEP,            // NSTEP
+    NSTEP_8PT,        // NSTEP_8PT
+    CLAMPED_DIAMOND,  // CLAMPED_DIAMOND
+    HEX,              // HEX
+    BIGDIA,           // BIGDIA
+    SQUARE,           // SQUARE
+    HEX,              // FAST_HEX
+    BIGDIA,           // FAST_DIAMOND
+    BIGDIA            // FAST_BIGDIA
+  };
+
+  ms_params->search_method = search_method;
+  ms_params->search_sites =
+      &search_sites[search_method_lookup[ms_params->search_method]];
+}
 
 // Set up limit values for MV components.
 // Mv beyond the range do not produce new/different prediction block.
@@ -281,7 +377,6 @@ typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
-extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
 extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
 extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
diff --git a/media/libaom/src/av1/encoder/ml.c b/media/libaom/src/av1/encoder/ml.c
index 57228ec918..5078fb1743 100644
--- a/media/libaom/src/av1/encoder/ml.c
+++ b/media/libaom/src/av1/encoder/ml.c
@@ -16,7 +16,7 @@
 #include "av1/encoder/ml.h"
 
 void av1_nn_output_prec_reduce(float *const output, int num_output) {
-  const int prec_bits = 11;
+  const int prec_bits = 9;
   const int prec = 1 << prec_bits;
   const float inv_prec = (float)(1.0 / prec);
   for (int i = 0; i < num_output; i++) {
@@ -143,14 +143,44 @@ void av1_nn_softmax(const float *input, float *output, int n) {
   // Softmax function is invariant to adding the same constant
   // to all input values, so we subtract the maximum input to avoid
   // possible overflow.
-  float max_inp = input[0];
-  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+  float max_input = input[0];
+  for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]);
   float sum_out = 0.0f;
   for (int i = 0; i < n; i++) {
     // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
-    const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
-    output[i] = (float)exp(normalized_input);
+    const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+    output[i] = expf(normalized_input);
     sum_out += output[i];
   }
   for (int i = 0; i < n; i++) output[i] /= sum_out;
 }
+
+static AOM_INLINE float approx_exp(float y) {
+#define A ((1 << 23) / 0.69314718056f)  // (1 << 23) / ln(2)
+#define B \
+  127  // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number controls the accuracy of approximation
+  union {
+    float as_float;
+    int32_t as_int32;
+  } container;
+  container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C);
+  return container.as_float;
+#undef A
+#undef B
+#undef C
+}
+
+void av1_nn_fast_softmax_16_c(const float *input, float *output) {
+  const int kNumClasses = 16;
+  float max_input = input[0];
+  for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]);
+  float sum_out = 0.0f;
+  for (int i = 0; i < kNumClasses; i++) {
+    // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+    const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+    output[i] = approx_exp(normalized_input);
+    sum_out += output[i];
+  }
+  for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out;
+}
diff --git a/media/libaom/src/av1/encoder/ml.h b/media/libaom/src/av1/encoder/ml.h
index 62d543d6b9..566f9271dd 100644
--- a/media/libaom/src/av1/encoder/ml.h
+++ b/media/libaom/src/av1/encoder/ml.h
@@ -71,6 +71,9 @@ void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
 // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
 void av1_nn_softmax(const float *input, float *output, int n);
 
+// A faster but less accurate version of av1_nn_softmax(input, output, 16)
+void av1_nn_fast_softmax_16_c(const float *input, float *output);
+
 // Applies a precision reduction to output of av1_nn_predict to prevent
 // mismatches between C and SIMD implementations.
 void av1_nn_output_prec_reduce(float *const output, int num_output);
diff --git a/media/libaom/src/av1/encoder/model_rd.h b/media/libaom/src/av1/encoder/model_rd.h
index c353c8f85b..db5ede4948 100644
--- a/media/libaom/src/av1/encoder/model_rd.h
+++ b/media/libaom/src/av1/encoder/model_rd.h
@@ -17,7 +17,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/pustats.h"
 #include "av1/encoder/rdopt_utils.h"
-#include "aom_ports/system_state.h"
 #include "config/aom_dsp_rtcd.h"
 
 #ifdef __cplusplus
@@ -134,7 +133,6 @@ static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
     if (dist) *dist = 0;
     return;
   }
-  aom_clear_system_state();
   const double sse_norm = (double)sse / num_samples;
   const double qstepsqr = (double)qstep * qstep;
   const double xqr = log2(sse_norm / qstepsqr);
@@ -145,7 +143,6 @@ static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
   const double dist_f = dist_by_sse_norm_f * sse_norm;
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
   int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-  aom_clear_system_state();
 
   // Check if skip is better
   if (rate_i == 0) {
diff --git a/media/libaom/src/av1/encoder/motion_search_facade.c b/media/libaom/src/av1/encoder/motion_search_facade.c
index 8db1423e7a..cad4e6225a 100644
--- a/media/libaom/src/av1/encoder/motion_search_facade.c
+++ b/media/libaom/src/av1/encoder/motion_search_facade.c
@@ -9,22 +9,22 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/reconinter.h"
 
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
 
 #define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
 
 typedef struct {
-  FULLPEL_MV fmv;
+  int_mv fmv;
   int weight;
 } cand_mv_t;
 
@@ -37,10 +37,91 @@ static int compare_weight(const void *a, const void *b) {
   return 0;
 }
 
+// Allow more mesh searches for screen content type on the ARF.
+static int use_fine_search_interval(const AV1_COMP *const cpi) {
+  return cpi->is_screen_content_type &&
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+         cpi->oxcf.speed <= 2;
+}
+
+// Iterate through the tpl and collect the mvs to be used as candidates
+static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *x,
+                                             BLOCK_SIZE bsize, int ref,
+                                             cand_mv_t *cand, int *cand_count,
+                                             int *total_cand_weight) {
+  const SuperBlockEnc *sb_enc = &x->sb_enc;
+  if (!sb_enc->tpl_data_count) {
+    return;
+  }
+
+  const AV1_COMMON *cm = &cpi->common;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  const BLOCK_SIZE tpl_bsize =
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+  const int tplw = mi_size_wide[tpl_bsize];
+  const int tplh = mi_size_high[tpl_bsize];
+  const int nw = mi_size_wide[bsize] / tplw;
+  const int nh = mi_size_high[bsize] / tplh;
+
+  if (nw >= 1 && nh >= 1) {
+    const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size];
+    const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size];
+    const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw;
+    int valid = 1;
+
+    // Assign large weight to start_mv, so it is always tested.
+    cand[0].weight = nw * nh;
+
+    for (int k = 0; k < nh; k++) {
+      for (int l = 0; l < nw; l++) {
+        const int_mv mv =
+            sb_enc
+                ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME];
+        if (mv.as_int == INVALID_MV) {
+          valid = 0;
+          break;
+        }
+
+        const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                                 GET_MV_RAWPEL(mv.as_mv.col) };
+        int unique = 1;
+        for (int m = 0; m < *cand_count; m++) {
+          if (RIGHT_SHIFT_MV(fmv.row) ==
+                  RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) &&
+              RIGHT_SHIFT_MV(fmv.col) ==
+                  RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) {
+            unique = 0;
+            cand[m].weight++;
+            break;
+          }
+        }
+
+        if (unique) {
+          cand[*cand_count].fmv.as_fullmv = fmv;
+          cand[*cand_count].weight = 1;
+          (*cand_count)++;
+        }
+      }
+      if (!valid) break;
+    }
+
+    if (valid) {
+      *total_cand_weight = 2 * nh * nw;
+      if (*cand_count > 2)
+        qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight);
+    }
+  }
+}
+
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                               int search_range, inter_mode_info *mode_info,
-                              int_mv *best_mv) {
+                              int_mv *best_mv,
+                              struct HandleInterModeArgs *const args) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
   const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
@@ -53,6 +134,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
       av1_get_scaled_ref_frame(cpi, ref);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
+  const MvCosts *mv_costs = x->mv_costs;
 
   if (scaled_ref_frame) {
     // Swap out the reference frame for a version that's been scaled to
@@ -80,43 +162,6 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     step_param = mv_search_params->mv_step_param;
   }
 
-  if (cpi->sf.mv_sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
-    int boffset =
-        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
-             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
-    step_param = AOMMAX(step_param, boffset);
-  }
-
-  if (cpi->sf.mv_sf.adaptive_motion_search) {
-    int bwl = mi_size_wide_log2[bsize];
-    int bhl = mi_size_high_log2[bsize];
-    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
-
-    if (tlevel < 5) {
-      step_param += 2;
-      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
-    }
-
-    // prev_mv_sad is not setup for dynamically scaled frames.
-    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
-      int i;
-      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
-        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-          x->pred_mv[ref].row = 0;
-          x->pred_mv[ref].col = 0;
-          best_mv->as_int = INVALID_MV;
-
-          if (scaled_ref_frame) {
-            // Swap back the original buffers before returning.
-            for (int j = 0; j < num_planes; ++j)
-              xd->plane[j].pre[ref_idx] = backup_yv12[j];
-          }
-          return;
-        }
-      }
-    }
-  }
-
   const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
   FULLPEL_MV start_mv;
   if (mbmi->motion_mode != SIMPLE_TRANSLATION)
@@ -125,64 +170,47 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     start_mv = get_fullmv_from_mv(&ref_mv);
 
   // cand stores start_mv and all possible MVs in a SB.
-  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
-    { { 0, 0 }, 0 }
-  };
-  cand[0].fmv = start_mv;
+  cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1];
+  av1_zero(cand);
+  cand[0].fmv.as_fullmv = start_mv;
   int cnt = 1;
   int total_weight = 0;
 
   if (!cpi->sf.mv_sf.full_pixel_search_level &&
       mbmi->motion_mode == SIMPLE_TRANSLATION) {
-    if (x->valid_cost_b) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
-      const int tplw = mi_size_wide[tpl_bsize];
-      const int tplh = mi_size_high[tpl_bsize];
-      const int nw = mi_size_wide[bsize] / tplw;
-      const int nh = mi_size_high[bsize] / tplh;
-
-      if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
-        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
-        int valid = 1;
-
-        // Assign large weight to start_mv, so it is always tested.
-        cand[0].weight = nw * nh;
-
-        for (int k = 0; k < nh; k++) {
-          for (int l = 0; l < nw; l++) {
-            const int_mv mv =
-                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
-            if (mv.as_int == INVALID_MV) {
-              valid = 0;
-              break;
-            }
-
-            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
-                                     GET_MV_RAWPEL(mv.as_mv.col) };
-            int unique = 1;
-            for (int m = 0; m < cnt; m++) {
-              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
-                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
-                unique = 0;
-                cand[m].weight++;
-                break;
-              }
-            }
+    get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight);
+  }
 
-            if (unique) {
-              cand[cnt].fmv = fmv;
-              cand[cnt].weight = 1;
-              cnt++;
-            }
-          }
-          if (!valid) break;
+  const int cand_cnt = AOMMIN(2, cnt);
+  // TODO(any): Test the speed feature for OBMC_CAUSAL mode.
+  if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION) {
+    const int stack_size = args->start_mv_cnt;
+    for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) {
+      int_mv *fmv_cand = &cand[cand_idx].fmv;
+      int skip_cand_mv = 0;
+
+      // Check difference between mvs in the stack and candidate mv.
+      for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) {
+        FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx];
+        const int row = abs(fmv_stack->row - fmv_cand->as_fullmv.row);
+        const int col = abs(fmv_stack->col - fmv_cand->as_fullmv.col);
+
+        if (row <= 1 && col <= 1) {
+          skip_cand_mv = 1;
+          break;
         }
-
-        if (valid) {
-          total_weight = 2 * nh * nw;
-          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
+      }
+      if (skip_cand_mv) {
+        // Mark the candidate mv as invalid so that motion search gets skipped.
+        cand[cand_idx].fmv.as_int = INVALID_MV;
+      } else {
+        // Store start mv candidate of full-pel search in the mv stack (except
+        // last ref_mv_idx).
+        if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) {
+          args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv;
+          args->start_mv_cnt++;
+          assert(args->start_mv_cnt <= (MAX_REF_MV_SEARCH - 1) * 2);
         }
       }
     }
@@ -190,14 +218,16 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   // Further reduce the search range.
   if (search_range < INT_MAX) {
-    const search_site_config *ss_cfg = &mv_search_params->ss_cfg[SS_CFG_SRC];
-    // MAx step_param is ss_cfg->ss_count.
+    const search_site_config *search_site_cfg =
+        &mv_search_params
+             ->search_site_cfg[SS_CFG_SRC][cpi->sf.mv_sf.search_method];
+    // Max step_param is search_site_cfg->num_search_steps.
     if (search_range < 1) {
-      step_param = ss_cfg->ss_count;
+      step_param = search_site_cfg->num_search_steps;
     } else {
-      while (ss_cfg->radius[ss_cfg->ss_count - step_param - 1] >
-                 (search_range << 1) &&
-             ss_cfg->ss_count - step_param - 1 > 0)
+      while (search_site_cfg->radius[search_site_cfg->num_search_steps -
+                                     step_param - 1] > (search_range << 1) &&
+             search_site_cfg->num_search_steps - step_param - 1 > 0)
         step_param++;
     }
   }
@@ -206,23 +236,28 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
   int_mv second_best_mv;
   best_mv->as_int = second_best_mv.as_int = INVALID_MV;
 
+  // Allow more mesh searches for screen content type on the ARF.
+  const int fine_search_interval = use_fine_search_interval(cpi);
   const search_site_config *src_search_sites =
-      &mv_search_params->ss_cfg[SS_CFG_SRC];
+      mv_search_params->search_site_cfg[SS_CFG_SRC];
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_sites);
+                                     src_search_sites, fine_search_interval);
 
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION: {
+      // Perform a search with the top 2 candidates
       int sum_weight = 0;
-
-      for (int m = 0; m < cnt; m++) {
-        FULLPEL_MV smv = cand[m].fmv;
+      for (int m = 0; m < cand_cnt; m++) {
+        int_mv smv = cand[m].fmv;
         FULLPEL_MV this_best_mv, this_second_best_mv;
 
-        int thissme = av1_full_pixel_search(
-            smv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
-            &this_best_mv, &this_second_best_mv);
+        if (smv.as_int == INVALID_MV) continue;
+
+        int thissme =
+            av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
+                                  cond_cost_list(cpi, cost_list), &this_best_mv,
+                                  &this_second_best_mv);
 
         if (thissme < bestsme) {
           bestsme = thissme;
@@ -231,7 +266,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
         }
 
         sum_weight += cand[m].weight;
-        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
+        if (4 * sum_weight > 3 * total_weight) break;
       }
     } break;
     case OBMC_CAUSAL:
@@ -240,6 +275,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
       break;
     default: assert(0 && "Invalid motion mode!\n");
   }
+  if (best_mv->as_int == INVALID_MV) return;
 
   if (scaled_ref_frame) {
     // Swap back the original buffers for subpel motion search.
@@ -248,23 +284,20 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-  // Terminate search with the current ref_idx if we have already encountered
-  // another ref_mv in the drl such that:
-  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
-  //     search process as the current fullpel_mv.
-  //  2. The rate needed to encode the current fullpel_mv is larger than that
-  //     for the other ref_mv.
-  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
+  // Terminate search with the current ref_idx based on fullpel mv, rate cost,
+  // and other know cost.
+  if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 &&
       mbmi->motion_mode == SIMPLE_TRANSLATION &&
       best_mv->as_int != INVALID_MV) {
     int_mv this_mv;
     this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
     const int ref_mv_idx = mbmi->ref_mv_idx;
     const int this_mv_rate =
-        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, MV_COST_WEIGHT);
+        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                        mv_costs->mv_cost_stack, MV_COST_WEIGHT);
     mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
     mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+    mode_info[ref_mv_idx].full_mv_bestsme = bestsme;
 
     for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
       // Check if the motion search result same as previous results
@@ -285,6 +318,19 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
           return;
         }
       }
+
+      // Terminate the evaluation of current ref_mv_idx based on bestsme and
+      // drl_cost.
+      const int psme = mode_info[prev_ref_idx].full_mv_bestsme;
+      if (psme == INT_MAX) continue;
+      const int thr =
+          cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme;
+      if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 &&
+          mode_info[ref_mv_idx].full_mv_bestsme > thr &&
+          mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) {
+        best_mv->as_int = INVALID_MV;
+        return;
+      }
     }
   }
 
@@ -294,6 +340,8 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   const int use_fractional_mv =
       bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+  int best_mv_rate = 0;
+  int mv_rate_calculated = 0;
   if (use_fractional_mv) {
     int_mv fractional_ms_list[3];
     av1_set_fractional_mv(fractional_ms_list);
@@ -308,20 +356,74 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
       case SIMPLE_TRANSLATION:
         if (cpi->sf.mv_sf.use_accurate_subpel_search) {
           const int try_second = second_best_mv.as_int != INVALID_MV &&
-                                 second_best_mv.as_int != best_mv->as_int;
+                                 second_best_mv.as_int != best_mv->as_int &&
+                                 (cpi->sf.mv_sf.disable_second_mv <= 1);
           const int best_mv_var = mv_search_params->find_fractional_mv_step(
               xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
               &x->pred_sse[ref], fractional_ms_list);
 
           if (try_second) {
+            struct macroblockd_plane *p = xd->plane;
+            const BUFFER_SET orig_dst = {
+              { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+              { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+            };
+            int64_t rd = INT64_MAX;
+            if (!cpi->sf.mv_sf.disable_second_mv) {
+              // Calculate actual rd cost.
+              mbmi->mv[0].as_mv = best_mv->as_mv;
+              av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                            bsize, 0, 0);
+              av1_subtract_plane(x, bsize, 0);
+              RD_STATS this_rd_stats;
+              av1_init_rd_stats(&this_rd_stats);
+              av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+                                    max_txsize_rect_lookup[bsize]);
+              int this_mv_rate = av1_mv_bit_cost(
+                  &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                  mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+              rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+                          this_rd_stats.dist);
+            }
+
             MV this_best_mv;
             subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
             if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
                                          subpel_start_mv)) {
+              unsigned int sse;
               const int this_var = mv_search_params->find_fractional_mv_step(
                   xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
-                  &x->pred_sse[ref], fractional_ms_list);
-              if (this_var < best_mv_var) best_mv->as_mv = this_best_mv;
+                  &sse, fractional_ms_list);
+
+              if (!cpi->sf.mv_sf.disable_second_mv) {
+                // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
+                // to choose the better MV.
+                mbmi->mv[0].as_mv = this_best_mv;
+                av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                              bsize, 0, 0);
+                av1_subtract_plane(x, bsize, 0);
+                RD_STATS tmp_rd_stats;
+                av1_init_rd_stats(&tmp_rd_stats);
+                av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+                                      max_txsize_rect_lookup[bsize]);
+                int tmp_mv_rate = av1_mv_bit_cost(
+                    &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                    mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+                int64_t tmp_rd =
+                    RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+                           tmp_rd_stats.dist);
+                if (tmp_rd < rd) {
+                  best_mv->as_mv = this_best_mv;
+                  x->pred_sse[ref] = sse;
+                }
+              } else {
+                // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the
+                // best MV.
+                if (this_var < best_mv_var) {
+                  best_mv->as_mv = this_best_mv;
+                  x->pred_sse[ref] = sse;
+                }
+              }
             }
           }
         } else {
@@ -337,19 +439,58 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
         break;
       default: assert(0 && "Invalid motion mode!\n");
     }
+
+    // Terminate search with the current ref_idx based on subpel mv and rate
+    // cost.
+    if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL &&
+        mbmi->motion_mode == SIMPLE_TRANSLATION &&
+        best_mv->as_int != INVALID_MV) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      best_mv_rate =
+          av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                          mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+      mv_rate_calculated = 1;
+
+      for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+        if (!args->single_newmv_valid[prev_ref_idx][ref]) continue;
+        // Check if the motion vectors are the same.
+        if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) {
+          // Skip this evaluation if the previous one is skipped.
+          if (mode_info[prev_ref_idx].skip) {
+            mode_info[ref_mv_idx].skip = 1;
+            break;
+          }
+          // Compare the rate cost that we current know.
+          const int prev_rate_cost =
+              args->single_newmv_rate[prev_ref_idx][ref] +
+              mode_info[prev_ref_idx].drl_cost;
+          const int this_rate_cost =
+              best_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+          if (prev_rate_cost <= this_rate_cost) {
+            // If the current rate_cost is worse than the previous rate_cost,
+            // then we terminate the search for this ref_mv_idx.
+            mode_info[ref_mv_idx].skip = 1;
+            break;
+          }
+        }
+      }
+    }
   }
-  *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, x->nmv_vec_cost,
-                             x->mv_cost_stack, MV_COST_WEIGHT);
 
-  if (cpi->sf.mv_sf.adaptive_motion_search &&
-      mbmi->motion_mode == SIMPLE_TRANSLATION)
-    x->pred_mv[ref] = best_mv->as_mv;
+  if (mv_rate_calculated) {
+    *rate_mv = best_mv_rate;
+  } else {
+    *rate_mv =
+        av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                        mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+  }
 }
 
-void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                             BLOCK_SIZE bsize, int_mv *cur_mv,
-                             const uint8_t *mask, int mask_stride,
-                             int *rate_mv) {
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, int_mv *cur_mv,
+                            const uint8_t *mask, int mask_stride, int *rate_mv,
+                            int allow_second_mv) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
@@ -361,6 +502,7 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   assert(has_second_ref(mbmi));
   const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+  const MvCosts *mv_costs = x->mv_costs;
   int_mv ref_mv[2];
   int ite, ref;
 
@@ -383,7 +525,8 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
   uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
-  int_mv best_mv;
+
+  int_mv best_mv, second_best_mv;
 
   // Allow joint search multiple times iteratively for each reference frame
   // and break out of the search loop if it couldn't find a better mv.
@@ -442,18 +585,17 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
                                       &inter_pred_params);
 
-    const int order_idx = id != 0;
-    av1_dist_wtd_comp_weight_assign(
-        cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
-        &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);
-
     // Do full-pixel compound motion search on the current reference frame.
     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
 
     // Make motion search params
     FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+    const search_site_config *src_search_sites =
+        cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
     av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                       &ref_mv[id].as_mv, NULL);
+                                       &ref_mv[id].as_mv, src_search_sites,
+                                       /*fine_search_interval=*/0);
+
     av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                              mask_stride, id);
 
@@ -461,16 +603,21 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
 
     // Small-range full-pixel motion search.
-    bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
-                                       &best_mv.as_fullmv);
-
-    if (bestsme < INT_MAX) {
-      bestsme = av1_get_mvpred_compound_var(
-          &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
-          mask_stride, id, &cpi->fn_ptr[bsize], &x->plane[0].src,
-          &ref_yv12[id]);
+    if (!cpi->sf.mv_sf.disable_extensive_joint_motion_search &&
+        mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+      bestsme =
+          av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                &best_mv.as_fullmv, &second_best_mv.as_fullmv);
+    } else {
+      bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+                                         &best_mv.as_fullmv);
+      second_best_mv = best_mv;
     }
 
+    const int try_second = second_best_mv.as_int != INVALID_MV &&
+                           second_best_mv.as_int != best_mv.as_int &&
+                           allow_second_mv;
+
     // Restore the pointer to the first (possibly scaled) prediction buffer.
     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
 
@@ -504,6 +651,20 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
       MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
       bestsme = cpi->mv_search_params.find_fractional_mv_step(
           xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+
+      if (try_second) {
+        MV this_best_mv;
+        MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+        if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
+          const int thissme = cpi->mv_search_params.find_fractional_mv_step(
+              xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, &sse,
+              NULL);
+          if (thissme < bestsme) {
+            best_mv.as_mv = this_best_mv;
+            bestsme = thissme;
+          }
+        }
+      }
     }
 
     // Restore the pointer to the first prediction buffer.
@@ -520,19 +681,21 @@ void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   for (ref = 0; ref < 2; ++ref) {
     const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
-    *rate_mv +=
-        av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost,
-                        x->mv_cost_stack, MV_COST_WEIGHT);
+    *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+                                mv_costs->nmv_joint_cost,
+                                mv_costs->mv_cost_stack, MV_COST_WEIGHT);
   }
+
+  return AOMMIN(last_besterr[0], last_besterr[1]);
 }
 
 // Search for the best mv for one component of a compound,
 // given that the other component is fixed.
-void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       BLOCK_SIZE bsize, MV *this_mv,
-                                       const uint8_t *second_pred,
-                                       const uint8_t *mask, int mask_stride,
-                                       int *rate_mv, int ref_idx) {
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, MV *this_mv,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *mask, int mask_stride,
+                                      int *rate_mv, int ref_idx) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
@@ -540,6 +703,7 @@ void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   const int ref = mbmi->ref_frame[ref_idx];
   const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
   struct macroblockd_plane *const pd = &xd->plane[0];
+  const MvCosts *mv_costs = x->mv_costs;
 
   struct buf_2d backup_yv12[MAX_MB_PLANE];
   const YV12_BUFFER_CONFIG *const scaled_ref_frame =
@@ -550,7 +714,6 @@ void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   // Store the first prediction buffer.
   struct buf_2d orig_yv12;
-  struct buf_2d ref_yv12 = pd->pre[ref_idx];
   if (ref_idx) {
     orig_yv12 = pd->pre[0];
     pd->pre[0] = pd->pre[ref_idx];
@@ -575,8 +738,12 @@ void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   // Make motion search params
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  const search_site_config *src_search_sites =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                     &ref_mv.as_mv, NULL);
+                                     &ref_mv.as_mv, src_search_sites,
+                                     /*fine_search_interval=*/0);
+
   av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                            mask_stride, ref_idx);
 
@@ -584,14 +751,8 @@ void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
 
   // Small-range full-pixel motion search.
-  bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
-                                     &best_mv.as_fullmv);
-
-  if (bestsme < INT_MAX) {
-    bestsme = av1_get_mvpred_compound_var(
-        &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask,
-        mask_stride, ref_idx, &cpi->fn_ptr[bsize], &x->plane[0].src, &ref_yv12);
-  }
+  bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                  &best_mv.as_fullmv, NULL);
 
   if (scaled_ref_frame) {
     // Swap back the original buffers for subpel motion search.
@@ -626,8 +787,9 @@ void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   *rate_mv = 0;
 
-  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                              x->mv_cost_stack, MV_COST_WEIGHT);
+  *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost,
+                              mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+  return bestsme;
 }
 
 static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
@@ -666,15 +828,11 @@ static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
   // Get the prediction block from the 'other' reference frame.
   av1_enc_build_one_inter_predictor(second_pred, pw, other_mv,
                                     &inter_pred_params);
-
-  av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
-                                  &xd->jcp_param.bck_offset,
-                                  &xd->jcp_param.use_dist_wtd_comp_avg, 1);
 }
 
 // Wrapper for av1_compound_single_motion_search, for the common case
 // where the second prediction is also an inter mode.
-void av1_compound_single_motion_search_interinter(
+int av1_compound_single_motion_search_interinter(
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -692,8 +850,8 @@ void av1_compound_single_motion_search_interinter(
   MV *this_mv = &cur_mv[ref_idx].as_mv;
   const MV *other_mv = &cur_mv[!ref_idx].as_mv;
   build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred);
-  av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask,
-                                    mask_stride, rate_mv, ref_idx);
+  return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred,
+                                           mask, mask_stride, rate_mv, ref_idx);
 }
 
 static AOM_INLINE void do_masked_motion_search_indexed(
@@ -703,7 +861,7 @@ static AOM_INLINE void do_masked_motion_search_indexed(
   // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  BLOCK_SIZE sb_type = mbmi->sb_type;
+  BLOCK_SIZE sb_type = mbmi->bsize;
   const uint8_t *mask;
   const int mask_stride = block_size_wide[bsize];
 
@@ -715,7 +873,8 @@ static AOM_INLINE void do_masked_motion_search_indexed(
     av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask,
                                                  mask_stride, rate_mv, which);
   } else if (which == 2) {
-    av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv);
+    av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv,
+                            !cpi->sf.mv_sf.disable_second_mv);
   }
 }
 
@@ -728,7 +887,11 @@ int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
   MB_MODE_INFO *const mbmi = xd->mi[0];
   int_mv tmp_mv[2];
   int tmp_rate_mv = 0;
-  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  // TODO(jingning): The average compound mode has proper SAD and variance
+  // functions implemented, and is triggerd by setting the mask pointer as
+  // Null. Need to further implement those for frame distance weighted mode.
+  mbmi->interinter_comp.seg_mask =
+      mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask;
   const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
 
   if (this_mode == NEW_NEWMV) {
@@ -761,7 +924,7 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
   set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
 
   MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
+  mbmi->bsize = bsize;
   mbmi->ref_frame[0] = ref;
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
@@ -773,9 +936,12 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
   struct buf_2d backup_yv12;
   // ref_mv is used to calculate the cost of the motion vector
   const MV ref_mv = kZeroMv;
-  const int step_param = cpi->mv_search_params.mv_step_param;
+  const int step_param =
+      AOMMIN(cpi->mv_search_params.mv_step_param +
+                 cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+             MAX_MVSEARCH_STEPS - 2);
   const search_site_config *src_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   int cost_list[5];
   const int ref_idx = 0;
   int var;
@@ -790,9 +956,11 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
                          num_planes);
   }
 
+  // Allow more mesh searches for screen content type on the ARF.
+  const int fine_search_interval = use_fine_search_interval(cpi);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_sites);
+                                     src_search_sites, fine_search_interval);
 
   var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                               cond_cost_list(cpi, cost_list),
@@ -830,8 +998,6 @@ int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
   av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
                                 AOM_PLANE_Y, AOM_PLANE_Y);
 
-  aom_clear_system_state();
-
   if (scaled_ref_frame) {
     xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
   }
@@ -855,7 +1021,7 @@ int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
   const uint8_t *dst = xd->plane[0].dst.buf;
   const int dst_stride = xd->plane[0].dst.stride;
 
-  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+  *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
 
   return best_mv;
 }
diff --git a/media/libaom/src/av1/encoder/motion_search_facade.h b/media/libaom/src/av1/encoder/motion_search_facade.h
index 3b86e93766..bf81fe243a 100644
--- a/media/libaom/src/av1/encoder/motion_search_facade.h
+++ b/media/libaom/src/av1/encoder/motion_search_facade.h
@@ -18,26 +18,27 @@
 extern "C" {
 #endif
 
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_modes_info, which makes this terribly confusing.
 typedef struct {
-  int64_t rd;
   int drl_cost;
-
-  int rate_mv;
-  int_mv mv;
-
   int_mv full_search_mv;
   int full_mv_rate;
+  int full_mv_bestsme;
+  int skip;
 } inter_mode_info;
 
+struct HandleInterModeArgs;
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                               int search_range, inter_mode_info *mode_info,
-                              int_mv *best_mv);
+                              int_mv *best_mv,
+                              struct HandleInterModeArgs *const args);
 
-void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                             BLOCK_SIZE bsize, int_mv *cur_mv,
-                             const uint8_t *mask, int mask_stride,
-                             int *rate_mv);
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, int_mv *cur_mv,
+                            const uint8_t *mask, int mask_stride, int *rate_mv,
+                            int allow_second_mv);
 
 int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
                                           MACROBLOCK *x,
@@ -45,15 +46,15 @@ int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
                                           const BLOCK_SIZE bsize,
                                           const PREDICTION_MODE this_mode);
 
-void av1_compound_single_motion_search_interinter(
+int av1_compound_single_motion_search_interinter(
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
 
-void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       BLOCK_SIZE bsize, MV *this_mv,
-                                       const uint8_t *second_pred,
-                                       const uint8_t *mask, int mask_stride,
-                                       int *rate_mv, int ref_idx);
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, MV *this_mv,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *mask, int mask_stride,
+                                      int *rate_mv, int ref_idx);
 
 // Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
 // ref. Note that this sets the offset of mbmi, so we will need to reset it
diff --git a/media/libaom/src/av1/encoder/mv_prec.c b/media/libaom/src/av1/encoder/mv_prec.c
index 8fcbde98e0..b64f4dcd0e 100644
--- a/media/libaom/src/av1/encoder/mv_prec.c
+++ b/media/libaom/src/av1/encoder/mv_prec.c
@@ -11,8 +11,6 @@
 
 #include "config/aom_config.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/encodemv.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/misc_model_weights.h"
@@ -139,7 +137,6 @@ static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv,
   const MV lp_diff = use_hp ? truncated_diff : diff;
   const int lp_mv_joint = av1_get_mv_joint(&lp_diff);
 
-  aom_clear_system_state();
   const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint);
   const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint);
   const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint);
@@ -224,13 +221,13 @@ static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats,
   }
 
   // Add texture information
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mbmi->bsize;
   const int num_rows = block_size_high[bsize];
   const int num_cols = block_size_wide[bsize];
   const int y_stride = cpi->source->y_stride;
   const int px_row = 4 * mi_row, px_col = 4 * mi_col;
   const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
-  const int bd = cm->seq_params.bit_depth;
+  const int bd = cm->seq_params->bit_depth;
   if (buf_is_hbd) {
     uint16_t *source_buf =
         CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
@@ -339,8 +336,8 @@ static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats,
   const int mi_row_end = tile_info->mi_row_end;
   const int mi_col_start = tile_info->mi_col_start;
   const int mi_col_end = tile_info->mi_col_end;
-  const int sb_size_mi = cm->seq_params.mib_size;
-  BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const int sb_size_mi = cm->seq_params->mib_size;
+  BLOCK_SIZE sb_size = cm->seq_params->sb_size;
   for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
     for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
       collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
@@ -376,7 +373,6 @@ static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
   const AV1_COMMON *cm = &cpi->common;
   const int order_hint = cpi->common.current_frame.order_hint;
   const int order_diff = order_hint - mv_stats->order;
-  aom_clear_system_state();
   const float area = (float)(cm->width * cm->height);
   float features[MV_PREC_FEATURE_SIZE] = {
     (float)current_q,
@@ -414,14 +410,17 @@ static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
 
 void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) {
   int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH;
+#if !CONFIG_REALTIME_ONLY
+  MV_STATS *mv_stats = &cpi->mv_stats;
+#endif  // !CONFIG_REALTIME_ONLY
 
   if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) {
     use_hp = 0;
   }
 #if !CONFIG_REALTIME_ONLY
   else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
-           av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) {
-    use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex);
+           av1_frame_allows_smart_mv(cpi) && mv_stats->valid) {
+    use_hp = get_smart_mv_prec(cpi, mv_stats, qindex);
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
diff --git a/media/libaom/src/av1/encoder/mv_prec.h b/media/libaom/src/av1/encoder/mv_prec.h
index 8df8b96dc6..55108b6cdb 100644
--- a/media/libaom/src/av1/encoder/mv_prec.h
+++ b/media/libaom/src/av1/encoder/mv_prec.h
@@ -21,8 +21,8 @@
 void av1_collect_mv_stats(AV1_COMP *cpi, int current_q);
 
 static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
-  const int gf_group_index = cpi->gf_group.index;
-  const int gf_update_type = cpi->gf_group.update_type[gf_group_index];
+  const int gf_group_index = cpi->gf_frame_index;
+  const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index];
   return !frame_is_intra_only(&cpi->common) &&
          !(gf_update_type == INTNL_OVERLAY_UPDATE ||
            gf_update_type == OVERLAY_UPDATE);
@@ -32,15 +32,19 @@ static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
 static AOM_INLINE void av1_set_high_precision_mv(
     AV1_COMP *cpi, int allow_high_precision_mv,
     int cur_frame_force_integer_mv) {
-  MACROBLOCK *const x = &cpi->td.mb;
+  MvCosts *const mv_costs = cpi->td.mb.mv_costs;
+  // Avoid accessing 'mv_costs' when it is not allocated.
+  if (mv_costs == NULL) return;
+
   const int copy_hp = cpi->common.features.allow_high_precision_mv =
       allow_high_precision_mv && !cur_frame_force_integer_mv;
-  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
-  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
-  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
-  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
-  int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost;
-  x->mv_cost_stack = *src;
+
+  mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+  mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+  mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+  mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+  mv_costs->mv_cost_stack =
+      copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
 }
 
 void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
diff --git a/media/libaom/src/av1/encoder/nonrd_opt.h b/media/libaom/src/av1/encoder/nonrd_opt.h
new file mode 100644
index 0000000000..39049e5929
--- /dev/null
+++ b/media/libaom/src/av1/encoder/nonrd_opt.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_
+#define AOM_AV1_ENCODER_NONRD_OPT_H_
+
+#include "av1/encoder/rdopt_utils.h"
+
+/*!\brief Finds predicted motion vectors for a block.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds predicted motion vectors for a block from a certain reference frame.
+ * First, it fills reference MV stack, then picks the test from the stack and
+ * predicts the final MV for a block for each mode.
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    ref_frame                Reference frame for which to find
+ *                                        ref MVs
+ * \param[in]    frame_mv                 Predicted MVs for a block
+ * \param[in]    tile_data                Pointer to struct holding adaptive
+ *                                        data/contexts/models for the tile
+ *                                        during encoding
+ * \param[in]    yv12_mb                  Buffer to hold predicted block
+ * \param[in]    bsize                    Current block size
+ * \param[in]    force_skip_low_temp_var  Flag indicating possible mode search
+ *                                        prune for low temporal variance block
+ * \param[in]    skip_pred_mv             Flag indicating to skip av1_mv_pred
+ *
+ * \return Nothing is returned. Instead, predicted MVs are placed into
+ * \c frame_mv array
+ */
+static INLINE void find_predictors(
+    AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], TileDataEnc *tile_data,
+    struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+    int force_skip_low_temp_var, int skip_pred_mv) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+  const int num_planes = av1_num_planes(cm);
+  (void)tile_data;
+
+  x->pred_mv_sad[ref_frame] = INT_MAX;
+  x->pred_mv0_sad[ref_frame] = INT_MAX;
+  x->pred_mv1_sad[ref_frame] = INT_MAX;
+  frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  // TODO(kyslov) this needs various further optimizations. to be continued..
+  assert(yv12 != NULL);
+  if (yv12 != NULL) {
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, ref_frame);
+    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                     xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                     mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+    av1_find_best_ref_mvs_from_stack(
+        cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
+        &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+    frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
+    // Early exit for non-LAST frame if force_skip_low_temp_var is set.
+    if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+        !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+      av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+                  bsize);
+    }
+  }
+  av1_count_overlappable_neighbors(cm, xd);
+  mbmi->num_proj_ref = 1;
+}
+
+#endif  // AOM_AV1_ENCODER_NONRD_OPT_H_
diff --git a/media/libaom/src/av1/encoder/nonrd_pickmode.c b/media/libaom/src/av1/encoder/nonrd_pickmode.c
index a1180015c7..b7729e9c9c 100644
--- a/media/libaom/src/av1/encoder/nonrd_pickmode.c
+++ b/media/libaom/src/av1/encoder/nonrd_pickmode.c
@@ -15,6 +15,9 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
@@ -23,7 +26,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/encoder/model_rd.h"
 #include "av1/common/mvref_common.h"
@@ -32,10 +34,15 @@
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/nonrd_opt.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/intra_mode_search.h"
 
 extern int g_pick_inter_mode_cnt;
+/*!\cond */
 typedef struct {
   uint8_t *data;
   int stride;
@@ -46,11 +53,18 @@ typedef struct {
   PRED_BUFFER *best_pred;
   PREDICTION_MODE best_mode;
   TX_SIZE best_tx_size;
-  TX_SIZE best_intra_tx_size;
+  TX_TYPE tx_type;
   MV_REFERENCE_FRAME best_ref_frame;
   MV_REFERENCE_FRAME best_second_ref_frame;
   uint8_t best_mode_skip_txfm;
+  uint8_t best_mode_initial_skip_flag;
   int_interpfilters best_pred_filter;
+  MOTION_MODE best_motion_mode;
+  WarpedMotionParams wm_params;
+  int num_proj_ref;
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE / 4];
+  PALETTE_MODE_INFO pmi;
+  int64_t best_sse;
 } BEST_PICKMODE;
 
 typedef struct {
@@ -58,12 +72,16 @@ typedef struct {
   PREDICTION_MODE pred_mode;
 } REF_MODE;
 
-static const int pos_shift_16x16[4][4] = {
-  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
-};
+typedef struct {
+  InterpFilter filter_x;
+  InterpFilter filter_y;
+} INTER_FILTER;
+/*!\endcond */
+
+#define NUM_INTER_MODES_RT 9
+#define NUM_INTER_MODES_REDUCED 8
 
-#define RT_INTER_MODES 9
-static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+static const REF_MODE ref_mode_set_rt[NUM_INTER_MODES_RT] = {
   { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
   { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEARESTMV },
   { GOLDEN_FRAME, NEARMV },    { GOLDEN_FRAME, NEWMV },
@@ -71,6 +89,15 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
   { ALTREF_FRAME, NEWMV }
 };
 
+// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
+// mode
+static const REF_MODE ref_mode_set_reduced[NUM_INTER_MODES_REDUCED] = {
+  { LAST_FRAME, GLOBALMV },   { LAST_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, GLOBALMV }, { LAST_FRAME, NEARMV },
+  { LAST_FRAME, NEWMV },      { GOLDEN_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, NEARMV },   { GOLDEN_FRAME, NEWMV }
+};
+
 static const THR_MODES mode_idx[REF_FRAMES][4] = {
   { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
   { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
@@ -82,6 +109,14 @@ static const THR_MODES mode_idx[REF_FRAMES][4] = {
 static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
                                                    SMOOTH_PRED };
 
+static const INTER_FILTER filters_ref_set[9] = {
+  { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+  { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },  { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
+  { MULTITAP_SHARP, MULTITAP_SHARP },     { EIGHTTAP_REGULAR, MULTITAP_SHARP },
+  { MULTITAP_SHARP, EIGHTTAP_REGULAR },   { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
+  { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
+};
+
 static INLINE int mode_offset(const PREDICTION_MODE mode) {
   if (mode >= NEARESTMV) {
     return INTER_OFFSET(mode);
@@ -104,17 +139,93 @@ enum {
   INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
 };
 
+static INLINE int early_term_inter_search_with_sse(int early_term_idx,
+                                                   BLOCK_SIZE bsize,
+                                                   int64_t this_sse,
+                                                   int64_t best_sse,
+                                                   PREDICTION_MODE this_mode) {
+  // Aggressiveness to terminate inter mode search early is adjusted based on
+  // speed and block size.
+  static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
+                                                  { 0.6, 0.65, 0.85, 0.9 },
+                                                  { 0.5, 0.5, 0.55, 0.6 },
+                                                  { 0.6, 0.75, 0.85, 0.85 } };
+  static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
+                                                               0.3 };
+
+  const int size_group = size_group_lookup[bsize];
+  assert(size_group < 4);
+  assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
+  const double threshold =
+      ((early_term_idx == EARLY_TERM_IDX_4) &&
+       (this_mode == NEWMV || this_mode == NEARESTMV))
+          ? early_term_thresh_newmv_nearestmv[size_group]
+          : early_term_thresh[early_term_idx - 1][size_group];
+
+  // Terminate inter mode search early based on best sse so far.
+  if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) {
+    return 1;
+  }
+  return 0;
+}
+
 static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_sse = INT64_MAX;
   bp->best_mode = NEARESTMV;
   bp->best_ref_frame = LAST_FRAME;
+  bp->best_second_ref_frame = NONE_FRAME;
   bp->best_tx_size = TX_8X8;
-  bp->best_intra_tx_size = TX_8X8;
+  bp->tx_type = DCT_DCT;
   bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   bp->best_mode_skip_txfm = 0;
-  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_mode_initial_skip_flag = 0;
   bp->best_pred = NULL;
+  bp->best_motion_mode = SIMPLE_TRANSLATION;
+  bp->num_proj_ref = 0;
+  memset(&bp->wm_params, 0, sizeof(bp->wm_params));
+  memset(&bp->blk_skip, 0, sizeof(bp->blk_skip));
+  memset(&bp->pmi, 0, sizeof(bp->pmi));
 }
 
+static INLINE int subpel_select(AV1_COMP *cpi, BLOCK_SIZE bsize, int_mv *mv) {
+  int mv_thresh = 4;
+  const int is_low_resoln =
+      (cpi->common.width * cpi->common.height <= 320 * 240);
+  mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
+  if (cpi->rc.avg_frame_low_motion > 0 && cpi->rc.avg_frame_low_motion < 40)
+    mv_thresh = 12;
+  mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
+  if (abs(mv->as_fullmv.row) >= mv_thresh ||
+      abs(mv->as_fullmv.col) >= mv_thresh)
+    return HALF_PEL;
+  else
+    return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. Exits early if RDCost of Full Pel part
+ * exceeds best RD Cost fund so far
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    mi_row                   Row index in 4x4 units
+ * \param[in]    mi_col                   Column index in 4x4 units
+ * \param[in]    tmp_mv                   Pointer to best found New MV
+ * \param[in]    rate_mv                  Pointer to Rate of the best new MV
+ * \param[in]    best_rd_sofar            RD Cost of the best mode found so far
+ * \param[in]    use_base_mv              Flag, indicating that tmp_mv holds
+ *                                        specific MV to start the search with
+ *
+ * \return Returns 0 if ME was terminated after Full Pel Search because too
+ * high RD Cost. Otherwise returns 1. Best New MV is placed into \c tmp_mv.
+ * Rate estimation for this vector is placed to \c rate_mv
+ */
 static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
                                   int_mv *tmp_mv, int *rate_mv,
@@ -124,7 +235,9 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
   const int num_planes = av1_num_planes(cm);
   MB_MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  int step_param = cpi->mv_search_params.mv_step_param;
+  int step_param = (cpi->sf.rt_sf.fullpel_search_step_param)
+                       ? cpi->sf.rt_sf.fullpel_search_step_param
+                       : cpi->mv_search_params.mv_step_param;
   FULLPEL_MV start_mv;
   const int ref = mi->ref_frame[0];
   const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
@@ -152,22 +265,22 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
     center_mv = ref_mv;
   else
     center_mv = tmp_mv->as_mv;
-
   const search_site_config *src_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     src_search_sites);
+                                     src_search_sites,
+                                     /*fine_search_interval=*/0);
 
-  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
-                        cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv,
-                        NULL);
+  const unsigned int full_var_rd = av1_full_pixel_search(
+      start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+      &tmp_mv->as_fullmv, NULL);
 
   // calculate the bit cost on motion vector
   MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
 
-  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->nmv_vec_cost,
-                             x->mv_cost_stack, MV_COST_WEIGHT);
+  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost,
+                             x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
 
   // TODO(kyslov) Account for Rate Mode!
   rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
@@ -176,65 +289,114 @@ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
     SUBPEL_MOTION_SEARCH_PARAMS ms_params;
     av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                       cost_list);
+    if (cpi->sf.rt_sf.force_half_pel_block &&
+        cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
+      ms_params.forced_stop = subpel_select(cpi, bsize, tmp_mv);
+    if (cpi->sf.rt_sf.reduce_zeromv_mvres && ref_mv.row == 0 &&
+        ref_mv.col == 0 && start_mv.row == 0 && start_mv.col == 0) {
+      // If both the refmv and the fullpel results show zero mv, then there is
+      // high likelihood that the current block is static. So we can try to
+      // reduce the mv resolution here.
+      // These thresholds are the mean var rd collected from multiple encoding
+      // runs.
+      if ((bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
+          (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
+          (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127)) {
+        ms_params.forced_stop = HALF_PEL;
+      }
+    }
+
     MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
     cpi->mv_search_params.find_fractional_mv_step(
         xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
         &x->pred_sse[ref], NULL);
 
-    *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmv_vec_cost,
-                               x->mv_cost_stack, MV_COST_WEIGHT);
+    *rate_mv =
+        av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
+                        x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
   }
 
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
   }
+  // Final MV can not be equal to referance MV as this will trigger assert
+  // later. This can happen if both NEAREST and NEAR modes were skipped
+  rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
   return rv;
 }
 
+/*!\brief Searches for the best New Motion Vector.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
+ * complexity ME for non-LAST frames or calls \c combined_motion_search
+ * for LAST reference frame
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    frame_mv                 Array that holds MVs for all modes
+ *                                        and ref frames
+ * \param[in]    ref_frame                Reference frame for which to find
+ *                                        the best New MVs
+ * \param[in]    gf_temporal_ref          Flag, indicating temporal reference
+ *                                        for GOLDEN frame
+ * \param[in]    bsize                    Current block size
+ * \param[in]    mi_row                   Row index in 4x4 units
+ * \param[in]    mi_col                   Column index in 4x4 units
+ * \param[in]    rate_mv                  Pointer to Rate of the best new MV
+ * \param[in]    best_rdc                 Pointer to the RD Cost for the best
+ *                                        mode found so far
+ *
+ * \return Returns -1 if the search was not done, otherwise returns 0.
+ * Best New MV is placed into \c frame_mv array, Rate estimation for this
+ * vector is placed to \c rate_mv
+ */
 static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
                          int_mv frame_mv[][REF_FRAMES],
                          MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
-                         BLOCK_SIZE bsize, int mi_row, int mi_col,
-                         int best_pred_sad, int *rate_mv, RD_STATS *best_rdc) {
+                         BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
+                         RD_STATS *best_rdc) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   AV1_COMMON *cm = &cpi->common;
-  if (ref_frame > LAST_FRAME && gf_temporal_ref &&
-      cpi->oxcf.rc_mode == AOM_CBR) {
+  if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      gf_temporal_ref) {
     int tmp_sad;
     int dis;
-    int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
 
     if (bsize < BLOCK_16X16) return -1;
 
     tmp_sad = av1_int_pro_motion_estimation(
         cpi, x, bsize, mi_row, mi_col,
-        &x->mbmi_ext->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+        &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv);
 
     if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
-    if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
 
     frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
     int_mv best_mv = mi->mv[0];
     best_mv.as_mv.row >>= 3;
     best_mv.as_mv.col >>= 3;
     MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
-
-    *rate_mv =
-        av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
-                        x->nmv_vec_cost, x->mv_cost_stack, MV_COST_WEIGHT);
     frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
     frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
 
     SUBPEL_MOTION_SEARCH_PARAMS ms_params;
-    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
-                                      cost_list);
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
+    if (cpi->sf.rt_sf.force_half_pel_block &&
+        cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
+      ms_params.forced_stop = subpel_select(cpi, bsize, &best_mv);
     MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     cpi->mv_search_params.find_fractional_mv_step(
         xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
         &x->pred_sse[ref_frame], NULL);
     frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int;
+
+    *rate_mv = av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
+                               x->mv_costs->nmv_joint_cost,
+                               x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
   } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
                                      &frame_mv[NEWMV][ref_frame], rate_mv,
                                      best_rdc->rdcost, 0)) {
@@ -244,52 +406,10 @@ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
   return 0;
 }
 
-static INLINE void find_predictors(
-    AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
-    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int *ref_frame_skip_mask,
-    const int flag_list[4], TileDataEnc *tile_data,
-    struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
-    int force_skip_low_temp_var) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
-  const int num_planes = av1_num_planes(cm);
-  (void)tile_data;
-
-  x->pred_mv_sad[ref_frame] = INT_MAX;
-  frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-  // TODO(kyslov) this needs various further optimizations. to be continued..
-  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
-    const struct scale_factors *const sf =
-        get_ref_scale_factors_const(cm, ref_frame);
-    av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
-    av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
-                     xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
-                     mbmi_ext->mode_context);
-    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
-    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
-    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
-    av1_find_best_ref_mvs_from_stack(
-        cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
-        &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
-    // Early exit for non-LAST frame if force_skip_low_temp_var is set.
-    if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 &&
-        !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
-      av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
-                  bsize);
-    }
-  } else {
-    *ref_frame_skip_mask |= (1 << ref_frame);
-  }
-  av1_count_overlappable_neighbors(cm, xd);
-  mbmi->num_proj_ref = 1;
-}
-
 static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
                                             const MACROBLOCKD *xd,
-                                            const MACROBLOCK *x, int segment_id,
+                                            const ModeCosts *mode_costs,
+                                            int segment_id,
                                             unsigned int *ref_costs_single) {
   int seg_ref_active =
       segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
@@ -297,176 +417,75 @@ static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
     memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
   } else {
     int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
-
-    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
-      ref_costs_single[i] = base_cost;
-
-    const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
-    const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
-    const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
-    const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
-    const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
-    const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
-
-    // Determine cost of a single ref frame, where frame types are represented
-    // by a tree:
-    // Level 0: add cost whether this ref is a forward or backward ref
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-
-    // Level 1: if this ref is forward ref,
-    // add cost whether it is last/last2 or last3/golden
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-
-    // Level 1: if this ref is backward ref
-    // then add cost whether this ref is altref or backward ref
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
-
-    // Level 2: further add cost whether this ref is last or last2
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
-
-    // Level 2: last3 or golden
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
-
-    // Level 2: bwdref or altref2
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
-  }
-}
-
-static void estimate_comp_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
-    int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) {
-  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
-      memset(ref_costs_comp[ref_frame], 0,
-             REF_FRAMES * sizeof((*ref_costs_comp)[0]));
-  } else {
-    int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
-
-    if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
-      // Similar to single ref, determine cost of compound ref frames.
-      // cost_compound_refs = cost_first_ref + cost_second_ref
-      const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
-      const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
-      const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
-      const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
-      const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
-
-      const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
-      unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
-
-      ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
-          ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
-      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
-      ref_bicomp_costs[ALTREF_FRAME] = 0;
-
-      // cost of first ref frame
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
-
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
-
-      // cost of second ref frame
-      ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
-      ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
-      ref_bicomp_costs[ALTREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
-
-      ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
-
-      // cost: if one ref frame is forward ref, the other ref is backward ref
-      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
-          ref_costs_comp[ref0][ref1] =
-              ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
-        }
-      }
-
-      // cost: if both ref frames are the same side.
-      const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
-      const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
-      const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
-      ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
-      ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
-      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
-      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
-    } else {
-      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
-          ref_costs_comp[ref0][ref1] = 512;
-      }
-      ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
-      ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
-      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
-      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
-    }
+    ref_costs_single[INTRA_FRAME] =
+        mode_costs->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+    ref_costs_single[LAST_FRAME] = base_cost;
+    ref_costs_single[GOLDEN_FRAME] = base_cost;
+    ref_costs_single[ALTREF_FRAME] = base_cost;
+    // add cost for last, golden, altref
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0];
   }
 }
 
 static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                  MACROBLOCK *const x, unsigned int var,
-                                 unsigned int sse) {
+                                 unsigned int sse, int *force_skip) {
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size;
-  if (x->tx_mode_search_type == TX_MODE_SELECT) {
-    if (sse > (var << 2))
-      tx_size = AOMMIN(max_txsize_lookup[bsize],
-                       tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
+    int multiplier = 8;
+    unsigned int var_thresh = 0;
+    unsigned int is_high_var = 1;
+    // Use quantizer based thresholds to determine transform size.
+    if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
+      const int qband = x->qindex >> (QINDEX_BITS - 2);
+      const int mult[4] = { 8, 7, 6, 5 };
+      assert(qband < 4);
+      multiplier = mult[qband];
+      const int qstep = x->plane[0].dequant_QTX[1] >> (xd->bd - 5);
+      const unsigned int qstep_sq = qstep * qstep;
+      var_thresh = qstep_sq * 2;
+      if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+        // If the sse is low for low source variance blocks, mark those as
+        // transform skip.
+        // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+        // low so that reliable early estimate of tx skip can be obtained
+        // through its comparison with sse.
+        if (sse < qstep_sq && x->source_variance < qstep_sq &&
+            x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
+          *force_skip = 1;
+        // Further lower transform size based on aq mode only if residual
+        // variance is high.
+        is_high_var = (var >= var_thresh);
+      }
+    }
+    // Choose larger transform size for blocks where dc component is dominant or
+    // the ac component is low.
+    if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
+      tx_size =
+          AOMMIN(max_txsize_lookup[bsize],
+                 tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
     else
       tx_size = TX_8X8;
 
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
       tx_size = TX_8X8;
     else if (tx_size > TX_16X16)
       tx_size = TX_16X16;
   } else {
-    tx_size = AOMMIN(max_txsize_lookup[bsize],
-                     tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+    tx_size =
+        AOMMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
   }
 
-  if (x->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
+  if (txfm_params->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32)
     tx_size = TX_16X16;
 
   return AOMMIN(tx_size, TX_16X16);
@@ -483,20 +502,32 @@ static void block_variance(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride, int w, int h,
                            unsigned int *sse, int *sum, int block_size,
                            uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
-  int i, j, k = 0;
-
+  int k = 0;
   *sse = 0;
   *sum = 0;
 
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      aom_get8x8var(src + src_stride * i + j, src_stride,
-                    ref + ref_stride * i + j, ref_stride, &sse8x8[k],
-                    &sum8x8[k]);
-      *sse += sse8x8[k];
-      *sum += sum8x8[k];
+  // This function is called for block sizes >= BLOCK_32x32. As per the design
+  // the aom_get_sse_sum_8x8_quad() processes four 8x8 blocks (in a 8x32) per
+  // call. Hence the width and height of the block need to be at least 8 and 32
+  // samples respectively.
+  assert(w >= 32);
+  assert(h >= 8);
+  for (int i = 0; i < h; i += block_size) {
+    for (int j = 0; j < w; j += 32) {
+      aom_get_sse_sum_8x8_quad(src + src_stride * i + j, src_stride,
+                               ref + ref_stride * i + j, ref_stride, &sse8x8[k],
+                               &sum8x8[k]);
+
+      *sse += sse8x8[k] + sse8x8[k + 1] + sse8x8[k + 2] + sse8x8[k + 3];
+      *sum += sum8x8[k] + sum8x8[k + 1] + sum8x8[k + 2] + sum8x8[k + 3];
       var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
-      k++;
+      var8x8[k + 1] = sse8x8[k + 1] -
+                      (uint32_t)(((int64_t)sum8x8[k + 1] * sum8x8[k + 1]) >> 6);
+      var8x8[k + 2] = sse8x8[k + 2] -
+                      (uint32_t)(((int64_t)sum8x8[k + 2] * sum8x8[k + 2]) >> 6);
+      var8x8[k + 3] = sse8x8[k + 3] -
+                      (uint32_t)(((int64_t)sum8x8[k + 3] * sum8x8[k + 3]) >> 6);
+      k += 4;
     }
   }
 }
@@ -538,10 +569,9 @@ static int ac_thr_factor(const int speed, const int width, const int height,
 
 static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
                                       int mi_row, int mi_col, MACROBLOCK *x,
-                                      MACROBLOCKD *xd, int *out_rate,
-                                      int64_t *out_dist, unsigned int *var_y,
-                                      unsigned int *sse_y, int *early_term,
-                                      int calculate_rd) {
+                                      MACROBLOCKD *xd, RD_STATS *rd_stats,
+                                      int *early_term, int calculate_rd,
+                                      int64_t best_sse) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -552,6 +582,7 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
   const uint32_t ac_quant = p->dequant_QTX[1];
   const int64_t dc_thr = dc_quant * dc_quant >> 6;
   int64_t ac_thr = ac_quant * ac_quant >> 6;
+  int test_skip = 1;
   unsigned int var;
   int sum;
 
@@ -563,26 +594,56 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
   unsigned int var8x8[256] = { 0 };
   TX_SIZE tx_size;
   int k;
+
+  if (x->force_zeromv_skip) {
+    *early_term = 1;
+    rd_stats->rate = 0;
+    rd_stats->dist = 0;
+    rd_stats->sse = 0;
+    return;
+  }
+
   // Calculate variance for whole partition, and also save 8x8 blocks' variance
   // to be used in following transform skipping test.
   block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                  4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
   var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
 
-  *var_y = var;
-  *sse_y = sse;
+  rd_stats->sse = sse;
 
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->oxcf.speed > 5)
+    ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+                                     (abs(sum) >> (bw + bh)),
+                                     cpi->svc.temporal_layer_id);
+  else
+    ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                            cpi->common.height, abs(sum) >> (bw + bh));
+#else
   ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
                           cpi->common.height, abs(sum) >> (bw + bh));
 
-  tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+#endif
+  // Skipping test
+  *early_term = 0;
+  tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
   // The code below for setting skip flag assumes tranform size of at least 8x8,
   // so force this lower limit on transform.
   if (tx_size < TX_8X8) tx_size = TX_8X8;
   xd->mi[0]->tx_size = tx_size;
 
+  MB_MODE_INFO *const mi = xd->mi[0];
+  if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+      early_term_inter_search_with_sse(
+          cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+          mi->mode))
+    test_skip = 0;
+
+  if (*early_term) test_skip = 0;
+
   // Evaluate if the partition block is a skippable block in Y plane.
-  {
+  if (test_skip) {
     unsigned int sse16x16[64] = { 0 };
     int sum16x16[64] = { 0 };
     unsigned int var16x16[64] = { 0 };
@@ -613,8 +674,6 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
       calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
                          sse32x32, sum32x32);
 
-    // Skipping test
-    *early_term = 0;
     for (k = 0; k < num; k++)
       // Check if all ac coefficients can be quantized to zero.
       if (!(var_tx[k] < ac_thr || var == 0)) {
@@ -651,9 +710,9 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
               (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
           av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
                                         i);
-          var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
-                                               puvd->dst.buf, puvd->dst.stride,
-                                               &sse_uv[j]);
+          var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(
+              puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride,
+              &sse_uv[j]);
           if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
               (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
             skip_uv[j] = 1;
@@ -666,27 +725,25 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
       }
     }
   }
-  if (calculate_rd && out_dist != NULL && out_rate != NULL) {
+  if (calculate_rd) {
     if (!*early_term) {
       const int bwide = block_size_wide[bsize];
       const int bhigh = block_size_high[bsize];
 
       model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh,
-                            out_rate, out_dist);
+                            &rd_stats->rate, &rd_stats->dist);
     }
 
     if (*early_term) {
-      *out_rate = 0;
-      *out_dist = sse << 4;
+      rd_stats->rate = 0;
+      rd_stats->dist = sse << 4;
     }
   }
 }
 
 static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                              MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum,
-                              int64_t *out_dist_sum, int *skip_txfm_sb,
-                              int64_t *skip_sse_sb, unsigned int *var_y,
-                              unsigned int *sse_y, int calculate_rd) {
+                              MACROBLOCK *x, MACROBLOCKD *xd,
+                              RD_STATS *rd_stats, int calculate_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -700,11 +757,12 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   int rate;
   int64_t dist;
 
-  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
-                                           pd->dst.buf, pd->dst.stride, &sse);
-  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+  unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
+      p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
+  int force_skip = 0;
+  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
 
-  if (calculate_rd) {
+  if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
     const int bwide = block_size_wide[bsize];
     const int bhigh = block_size_high[bsize];
     model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
@@ -713,29 +771,128 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
     rate = INT_MAX;  // this will be overwritten later with block_yrd
     dist = INT_MAX;
   }
-  *var_y = var;
-  *sse_y = sse;
+  rd_stats->sse = sse;
   x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
 
+  if (force_skip && ref > INTRA_FRAME) {
+    rate = 0;
+    dist = (int64_t)sse << 4;
+  }
+
   assert(rate >= 0);
 
-  if (skip_txfm_sb) *skip_txfm_sb = rate == 0;
-  if (skip_sse_sb) *skip_sse_sb = sse << 4;
+  rd_stats->skip_txfm = (rate == 0);
   rate = AOMMIN(rate, INT_MAX);
-  *out_rate_sum = (int)rate;
-  *out_dist_sum = dist;
+  rd_stats->rate = rate;
+  rd_stats->dist = dist;
 }
 
-static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
-                      RD_STATS *this_rdc, int *skippable, int64_t *sse,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static INLINE void aom_process_hadamard_8x16(MACROBLOCK *x, int max_blocks_high,
+                                             int max_blocks_wide, int num_4x4_w,
+                                             int step, int block_step) {
+  struct macroblock_plane *const p = &x->plane[0];
+  const int bw = 4 * num_4x4_w;
+  const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
+  int block = 0;
+
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0; c < num_4x4; c += 2 * block_step) {
+      const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
+      int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
+      aom_hadamard_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
+      block += 2 * step;
+    }
+  }
+}
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD()                                      \
+  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; \
+  const int block_offset = BLOCK_OFFSET(block + s);                        \
+  int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;           \
+  int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;         \
+  int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;       \
+  uint16_t *const eob = &p->eobs[block + s];                               \
+  const int diff_stride = bw;                                              \
+  const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define DECLARE_HBD_LOOP_VARS_BLOCK_YRD()              \
+  tran_low_t *const coeff = p->coeff + block_offset;   \
+  tran_low_t *const qcoeff = p->qcoeff + block_offset; \
+  tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+
+static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
+    MACROBLOCK *x, int *skippable, const int step, const int ncoeffs,
+    tran_low_t *const coeff, tran_low_t *const qcoeff,
+    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    const int tx_blk_id) {
+  const int is_txfm_skip = (ncoeffs == 0);
+  *skippable &= is_txfm_skip;
+  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+  *eob_cost += get_msb(ncoeffs + 1);
+
+  int64_t dummy;
+  if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(qcoeff[0]);
+  else if (ncoeffs > 1)
+    this_rdc->rate += aom_satd(qcoeff, step << 4);
+
+  this_rdc->dist += av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+}
+#endif
+static AOM_FORCE_INLINE void update_yrd_loop_vars(
+    MACROBLOCK *x, int *skippable, const int step, const int ncoeffs,
+    int16_t *const low_coeff, int16_t *const low_qcoeff,
+    int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    const int tx_blk_id) {
+  const int is_txfm_skip = (ncoeffs == 0);
+  *skippable &= is_txfm_skip;
+  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+  *eob_cost += get_msb(ncoeffs + 1);
+  if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(low_qcoeff[0]);
+  else if (ncoeffs > 1)
+    this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+  this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+}
+
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost using Hadamard transform. For low bit depth this function
+ * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    mi_row         Row index in 4x4 units
+ * \param[in]    mi_col         Column index in 4x4 units
+ * \param[in]    this_rdc       Pointer to calculated RD Cost
+ * \param[in]    skippable      Pointer to a flag indicating possible tx skip
+ * \param[in]    bsize          Current block size
+ * \param[in]    tx_size        Transform size
+ * \param[in]    tx_type        Transform kernel type
+ * \param[in]    is_inter_mode  Flag to indicate inter mode
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
+ * coefficients for Hadamard transform
+ */
+void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                   int mi_col, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type,
+                   int is_inter_mode) {
   MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
+  assert(bsize < BLOCK_SIZES_ALL);
   const int num_4x4_w = mi_size_wide[bsize];
   const int num_4x4_h = mi_size_high[bsize];
   const int step = 1 << (tx_size << 1);
   const int block_step = (1 << tx_size);
+  const int row_step = step * num_4x4_w / block_step;
   int block = 0;
   const int max_blocks_wide =
       num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
@@ -744,16 +901,22 @@ static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
   int eob_cost = 0;
   const int bw = 4 * num_4x4_w;
   const int bh = 4 * num_4x4_h;
+  const int use_hbd = is_cur_buf_hbd(xd);
+  int num_blk_skip_w = num_4x4_w;
+  int sh_blk_skip = 0;
+  if (is_inter_mode) {
+    num_blk_skip_w = num_4x4_w >> 1;
+    sh_blk_skip = 1;
+  }
 
   (void)mi_row;
   (void)mi_col;
   (void)cpi;
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (use_hbd) {
     aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
-                              p->src.stride, pd->dst.buf, pd->dst.stride,
-                              x->e_mbd.bd);
+                              p->src.stride, pd->dst.buf, pd->dst.stride);
   } else {
     aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                        pd->dst.buf, pd->dst.stride);
@@ -764,125 +927,189 @@ static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
 #endif
 
   *skippable = 1;
-  // Keep track of the row and column of the blocks we use so that we know
-  // if we are in the unrestricted motion border.
-  for (int r = 0; r < max_blocks_high; r += block_step) {
-    for (int c = 0; c < num_4x4_w; c += block_step) {
-      if (c < max_blocks_wide) {
-        const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
-        const int block_offset = BLOCK_OFFSET(block);
+  int tx_wd = 0;
+  switch (tx_size) {
+    case TX_64X64:
+      assert(0);  // Not implemented
+      break;
+    case TX_32X32:
+      assert(0);  // Not used
+      break;
+    case TX_16X16: tx_wd = 16; break;
+    case TX_8X8: tx_wd = 8; break;
+    default:
+      assert(tx_size == TX_4X4);
+      tx_wd = 4;
+      break;
+  }
+
+  this_rdc->dist = 0;
+  this_rdc->rate = 0;
+#if !CONFIG_AV1_HIGHBITDEPTH
+  if (tx_type == IDTX) {
+    // Keep track of the row and column of the blocks we use so that we know
+    // if we are in the unrestricted motion border.
+    for (int r = 0; r < max_blocks_high; r += block_step) {
+      for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+        DECLARE_LOOP_VARS_BLOCK_YRD()
+
+        for (int idy = 0; idy < tx_wd; ++idy)
+          for (int idx = 0; idx < tx_wd; ++idx)
+            low_coeff[idy * tx_wd + idx] =
+                src_diff[idy * diff_stride + idx] * 8;
+
+        av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
+                        p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                        p->dequant_QTX, eob, scan_order->scan,
+                        scan_order->iscan);
+        assert(*eob <= 1024);
+        update_yrd_loop_vars(x, skippable, step, *eob, low_coeff, low_qcoeff,
+                             low_dqcoeff, this_rdc, &eob_cost,
+                             (r * num_blk_skip_w + c) >> sh_blk_skip);
+      }
+      block += row_step;
+    }
+  } else {
+#else
+  {
+    (void)tx_wd;
+#endif
+    // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
+    // can be done per function call. Hence the call of Hadamard txfm is
+    // abstracted here for the specified cases.
+    const int is_tx_8x8_dual_applicable =
+        (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
+         block_size_high[bsize] >= 8);
+    if (is_tx_8x8_dual_applicable) {
+      aom_process_hadamard_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
+                                step, block_step);
+    }
+
+    // Keep track of the row and column of the blocks we use so that we know
+    // if we are in the unrestricted motion border.
+    for (int r = 0; r < max_blocks_high; r += block_step) {
+      for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+        DECLARE_LOOP_VARS_BLOCK_YRD()
 #if CONFIG_AV1_HIGHBITDEPTH
-        tran_low_t *const coeff = p->coeff + block_offset;
-        tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+        DECLARE_HBD_LOOP_VARS_BLOCK_YRD()
 #else
-        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
-        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
+        (void)use_hbd;
 #endif
-        uint16_t *const eob = &p->eobs[block];
-        const int diff_stride = bw;
-        const int16_t *src_diff;
-        src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
         switch (tx_size) {
-          case TX_64X64:
-            assert(0);  // Not implemented
-            break;
-          case TX_32X32:
-            assert(0);  // Not used
-            break;
 #if CONFIG_AV1_HIGHBITDEPTH
           case TX_16X16:
-            aom_hadamard_16x16(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_16x16(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              if (tx_type == IDTX) {
+                aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 2, 2);
+              } else {
+                aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+              }
+              av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
           case TX_8X8:
-            aom_hadamard_8x8(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_8x8(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              if (tx_type == IDTX) {
+                aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 1, 1);
+              } else {
+                aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+              }
+              av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
+            break;
+          default:
+            assert(tx_size == TX_4X4);
+            if (use_hbd) {
+              aom_fdct4x4(src_diff, coeff, diff_stride);
+              av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              if (tx_type == IDTX) {
+                for (int idy = 0; idy < 4; ++idy)
+                  for (int idx = 0; idx < 4; ++idx)
+                    low_coeff[idy * 4 + idx] = src_diff[idy * diff_stride + idx]
+                                               << 3;
+              } else {
+                aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+              }
+              av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
 #else
           case TX_16X16:
             aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
             av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                             p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
-                            p->dequant_QTX, eob, scan_order->scan);
+                            p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
             break;
           case TX_8X8:
-            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            if (!is_tx_8x8_dual_applicable) {
+              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            } else {
+              assert(is_tx_8x8_dual_applicable);
+            }
             av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
-                            scan_order->scan);
+                            scan_order->scan, scan_order->iscan);
             break;
           default:
-            assert(tx_size == TX_4X4);
-            x->fwd_txfm4x4(src_diff, low_coeff, diff_stride);
+            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
             av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
-                            scan_order->scan);
+                            scan_order->scan, scan_order->iscan);
             break;
 #endif
         }
-        *skippable &= (*eob == 0);
-        eob_cost += 1;
+        assert(*eob <= 1024);
+#if CONFIG_AV1_HIGHBITDEPTH
+        if (use_hbd)
+          update_yrd_loop_vars_hbd(x, skippable, step, *eob, coeff, qcoeff,
+                                   dqcoeff, this_rdc, &eob_cost,
+                                   (r * num_blk_skip_w + c) >> sh_blk_skip);
+        else
+#endif
+          update_yrd_loop_vars(x, skippable, step, *eob, low_coeff, low_qcoeff,
+                               low_dqcoeff, this_rdc, &eob_cost,
+                               (r * num_blk_skip_w + c) >> sh_blk_skip);
       }
-      block += step;
+      block += row_step;
     }
   }
-  this_rdc->skip = *skippable;
-  this_rdc->rate = 0;
-  if (*sse < INT64_MAX) {
-    *sse = (*sse << 6) >> 2;
+  this_rdc->skip_txfm = *skippable;
+  if (this_rdc->sse < INT64_MAX) {
+    this_rdc->sse = (this_rdc->sse << 6) >> 2;
     if (*skippable) {
-      this_rdc->dist = *sse;
+      this_rdc->dist = 0;
+      this_rdc->dist = this_rdc->sse;
       return;
     }
   }
 
-  block = 0;
-  this_rdc->dist = 0;
-  for (int r = 0; r < max_blocks_high; r += block_step) {
-    for (int c = 0; c < num_4x4_w; c += block_step) {
-      if (c < max_blocks_wide) {
-        const int block_offset = BLOCK_OFFSET(block);
-        uint16_t *const eob = &p->eobs[block];
-#if CONFIG_AV1_HIGHBITDEPTH
-        int64_t dummy;
-        tran_low_t *const coeff = p->coeff + block_offset;
-        tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
-
-        if (*eob == 1)
-          this_rdc->rate += (int)abs(qcoeff[0]);
-        else if (*eob > 1)
-          this_rdc->rate += aom_satd(qcoeff, step << 4);
-
-        this_rdc->dist +=
-            av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
-#else
-        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
-        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset;
-
-        if (*eob == 1)
-          this_rdc->rate += (int)abs(low_qcoeff[0]);
-        else if (*eob > 1)
-          this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
-
-        this_rdc->dist +=
-            av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
-#endif
-      }
-      block += step;
-    }
-  }
-
   // If skippable is set, rate gets clobbered later.
   this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
   this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
@@ -915,24 +1142,20 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 #endif  // CONFIG_INTERNAL_STATS
   MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  ctx->rd_stats.skip = x->force_skip;
-  memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
-  memset(ctx->tx_type_map, DCT_DCT,
-         sizeof(ctx->tx_type_map[0]) * ctx->num_4x4_blk);
-  ctx->skippable = x->force_skip;
+  ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+
+  ctx->skippable = txfm_info->skip_txfm;
 #if CONFIG_INTERNAL_STATS
   ctx->best_mode_index = mode_index;
 #endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
-  ctx->skippable = x->force_skip;
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  ctx->skippable = txfm_info->skip_txfm;
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
-  ctx->comp_pred_diff = 0;
-  ctx->hybrid_pred_diff = 0;
-  ctx->single_pred_diff = 0;
 }
 
 static int get_pred_buffer(PRED_BUFFER *p, int len) {
@@ -949,10 +1172,10 @@ static void free_pred_buffer(PRED_BUFFER *p) {
   if (p != NULL) p->in_use = 0;
 }
 
-static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
                        int16_t mode_context) {
   if (is_inter_compound_mode(mode)) {
-    return x
+    return mode_costs
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
   }
 
@@ -962,19 +1185,19 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
   assert(is_inter_mode(mode));
 
   if (mode == NEWMV) {
-    mode_cost = x->newmv_mode_cost[mode_ctx][0];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
     return mode_cost;
   } else {
-    mode_cost = x->newmv_mode_cost[mode_ctx][1];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
     mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
     if (mode == GLOBALMV) {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][0];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][1];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-      mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
@@ -982,19 +1205,24 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
 
 static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
                             RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
-                            int mv_col, int speed, uint32_t spatial_variance) {
+                            int mv_col, int speed, uint32_t spatial_variance,
+                            CONTENT_STATE_SB content_state_sb) {
   // Bias against MVs associated with NEWMV mode that are very different from
   // top/left neighbors.
   if (this_mode == NEWMV) {
     int al_mv_average_row;
     int al_mv_average_col;
-    int left_row, left_col;
     int row_diff, col_diff;
     int above_mv_valid = 0;
     int left_mv_valid = 0;
-    int above_row = 0;
-    int above_col = 0;
-
+    int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
+    int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
+    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
+        spatial_variance < 300 &&
+        (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
+      this_rdc->rdcost = this_rdc->rdcost << 2;
+      return;
+    }
     if (xd->above_mbmi) {
       above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
       above_row = xd->above_mbmi->mv[0].as_mv.row;
@@ -1035,9 +1263,8 @@ static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
 
 static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
-                               RD_STATS *this_rdc, unsigned int *var_y,
-                               unsigned int *sse_y, int start_plane,
-                               int stop_plane) {
+                               RD_STATS *this_rdc, int64_t *sse_y,
+                               int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -1045,12 +1272,11 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
   int rate;
   int64_t dist;
   int i;
-  uint32_t tot_var = *var_y;
-  uint32_t tot_sse = *sse_y;
+  int64_t tot_sse = *sse_y;
 
   this_rdc->rate = 0;
   this_rdc->dist = 0;
-  this_rdc->skip = 0;
+  this_rdc->skip_txfm = 0;
 
   for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -1061,10 +1287,9 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
     unsigned int var;
     if (!x->color_sensitivity[i - 1]) continue;
 
-    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
-                             pd->dst.stride, &sse);
+    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                                  pd->dst.stride, &sse);
     assert(sse >= var);
-    tot_var += var;
     tot_sse += sse;
 
     av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
@@ -1081,20 +1306,20 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
   }
 
   if (this_rdc->rate == 0) {
-    this_rdc->skip = 1;
+    this_rdc->skip_txfm = 1;
   }
 
   if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
-      RDCOST(x->rdmult, 0, ((int64_t)tot_sse) << 4)) {
+      RDCOST(x->rdmult, 0, tot_sse << 4)) {
     this_rdc->rate = 0;
     this_rdc->dist = tot_sse << 4;
-    this_rdc->skip = 1;
+    this_rdc->skip_txfm = 1;
   }
 
-  *var_y = tot_var;
   *sse_y = tot_sse;
 }
 
+/*!\cond */
 struct estimate_block_intra_args {
   AV1_COMP *cpi;
   MACROBLOCK *x;
@@ -1102,7 +1327,27 @@ struct estimate_block_intra_args {
   int skippable;
   RD_STATS *rdc;
 };
+/*!\endcond */
 
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for an intra mode for a single TX block using Hadamard
+ * transform.
+ * \param[in]    plane          Color plane
+ * \param[in]    block          Index of a TX block in a prediction block
+ * \param[in]    row            Row of a current TX block
+ * \param[in]    col            Column of a current TX block
+ * \param[in]    plane_bsize    Block size of a current prediction block
+ * \param[in]    tx_size        Transform size
+ * \param[in]    arg            Pointer to a structure that holds paramaters
+ *                              for intra mode search
+ *
+ * \return Nothing is returned. Instead, best mode and RD Cost of the best mode
+ * are set in \c args->rdc and \c args->mode
+ */
 static void estimate_block_intra(int plane, int block, int row, int col,
                                  BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                  void *arg) {
@@ -1122,20 +1367,18 @@ static void estimate_block_intra(int plane, int block, int row, int col,
 
   (void)block;
 
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+  av1_invalid_rd_stats(&this_rdc);
+
   p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
   pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
 
-  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
-
   if (plane == 0) {
-    int64_t this_sse = INT64_MAX;
-    block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx,
-              AOMMIN(tx_size, TX_16X16));
+    av1_block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
+                  AOMMIN(tx_size, TX_16X16), DCT_DCT, 0);
   } else {
-    unsigned int var = 0;
-    unsigned int sse = 0;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
-                       plane);
+    int64_t sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &sse, plane, plane);
   }
 
   p->src.buf = src_buf_base;
@@ -1149,185 +1392,185 @@ static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
                                            MV_REFERENCE_FRAME ref_frame,
                                            THR_MODES best_mode_idx,
                                            PREDICTION_MODE mode) {
-  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
-  int *freq_fact = &x->thresh_freq_fact[bsize][thr_mode_idx];
-  if (thr_mode_idx == best_mode_idx) {
-    *freq_fact -= (*freq_fact >> 4);
-  } else {
-    *freq_fact =
-        AOMMIN(*freq_fact + RD_THRESH_INC,
-               cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+  const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4);
+  const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128);
+  for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) {
+    int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx];
+    if (thr_mode_idx == best_mode_idx) {
+      *freq_fact -= (*freq_fact >> 4);
+    } else {
+      *freq_fact =
+          AOMMIN(*freq_fact + RD_THRESH_INC,
+                 cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+    }
   }
 }
 
-static INLINE int get_force_skip_low_temp_var_small_sb(uint8_t *variance_low,
-                                                       int mi_row, int mi_col,
-                                                       BLOCK_SIZE bsize) {
-  // Relative indices of MB inside the superblock.
-  const int mi_x = mi_row & 0xF;
-  const int mi_y = mi_col & 0xF;
-  // Relative indices of 16x16 block inside the superblock.
-  const int i = mi_x >> 2;
-  const int j = mi_y >> 2;
-  int force_skip_low_temp_var = 0;
-  // Set force_skip_low_temp_var based on the block size and block offset.
-  switch (bsize) {
-    case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
-    case BLOCK_64X32:
-      if (!mi_y && !mi_x) {
-        force_skip_low_temp_var = variance_low[1];
-      } else if (!mi_y && mi_x) {
-        force_skip_low_temp_var = variance_low[2];
-      }
-      break;
-    case BLOCK_32X64:
-      if (!mi_y && !mi_x) {
-        force_skip_low_temp_var = variance_low[3];
-      } else if (mi_y && !mi_x) {
-        force_skip_low_temp_var = variance_low[4];
-      }
-      break;
-    case BLOCK_32X32:
-      if (!mi_y && !mi_x) {
-        force_skip_low_temp_var = variance_low[5];
-      } else if (mi_y && !mi_x) {
-        force_skip_low_temp_var = variance_low[6];
-      } else if (!mi_y && mi_x) {
-        force_skip_low_temp_var = variance_low[7];
-      } else if (mi_y && mi_x) {
-        force_skip_low_temp_var = variance_low[8];
-      }
-      break;
-    case BLOCK_32X16:
-    case BLOCK_16X32:
-    case BLOCK_16X16:
-      force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
-      break;
-    default: break;
-  }
-
-  return force_skip_low_temp_var;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void av1_pickmode_ctx_den_update(
+    AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
+    unsigned int ref_frame_cost[REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred,
+    BEST_PICKMODE *bp) {
+  ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+  ctx_den->ref_frame_cost = ref_frame_cost;
+  ctx_den->frame_mv = frame_mv;
+  ctx_den->reuse_inter_pred = reuse_inter_pred;
+  ctx_den->best_tx_size = bp->best_tx_size;
+  ctx_den->best_mode = bp->best_mode;
+  ctx_den->best_ref_frame = bp->best_ref_frame;
+  ctx_den->best_pred_filter = bp->best_pred_filter;
+  ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
 }
 
-static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
-                                              int mi_col, BLOCK_SIZE bsize) {
-  int force_skip_low_temp_var = 0;
-  int x, y;
-  x = (mi_col & 0x1F) >> 4;
-  // y = (mi_row & 0x1F) >> 4;
-  // const int idx64 = (y << 1) + x;
-  y = (mi_row & 0x17) >> 3;
-  const int idx64 = y + x;
-
-  x = (mi_col & 0xF) >> 3;
-  // y = (mi_row & 0xF) >> 3;
-  // const int idx32 = (y << 1) + x;
-  y = (mi_row & 0xB) >> 2;
-  const int idx32 = y + x;
-
-  x = (mi_col & 0x7) >> 2;
-  // y = (mi_row & 0x7) >> 2;
-  // const int idx16 = (y << 1) + x;
-  y = (mi_row & 0x5) >> 1;
-  const int idx16 = y + x;
-  // Set force_skip_low_temp_var based on the block size and block offset.
-  switch (bsize) {
-    case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
-    case BLOCK_128X64:
-      assert((mi_col & 0x1F) == 0);
-      force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
-      break;
-    case BLOCK_64X128:
-      assert((mi_row & 0x1F) == 0);
-      force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
-      break;
-    case BLOCK_64X64:
-      // Location of this 64x64 block inside the 128x128 superblock
-      force_skip_low_temp_var = variance_low[5 + idx64];
-      break;
-    case BLOCK_64X32:
-      x = (mi_col & 0x1F) >> 4;
-      y = (mi_row & 0x1F) >> 3;
-      /*
-      .---------------.---------------.
-      | x=0,y=0,idx=0 | x=0,y=0,idx=2 |
-      :---------------+---------------:
-      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
-      :---------------+---------------:
-      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
-      :---------------+---------------:
-      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
-      '---------------'---------------'
-      */
-      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
-      force_skip_low_temp_var = variance_low[9 + idx64x32];
-      break;
-    case BLOCK_32X64:
-      x = (mi_col & 0x1F) >> 3;
-      y = (mi_row & 0x1F) >> 4;
-      const int idx32x64 = (y << 2) + x;
-      force_skip_low_temp_var = variance_low[17 + idx32x64];
-      break;
-    case BLOCK_32X32:
-      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
-      break;
-    case BLOCK_32X16:
-    case BLOCK_16X32:
-    case BLOCK_16X16:
-      force_skip_low_temp_var =
-          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
-      break;
-    default: break;
+static void recheck_zeromv_after_denoising(
+    AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+    AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den,
+    struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc,
+    BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+  // denoised result. Only do this under noise conditions, and if rdcost of
+  // ZEROMV onoriginal source is not significantly higher than rdcost of best
+  // mode.
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
+      ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+      ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+       (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+        cpi->svc.number_spatial_layers == 1 &&
+        decision == FILTER_ZEROMV_BLOCK))) {
+    // Check if we should pick ZEROMV on denoised signal.
+    AV1_COMMON *const cm = &cpi->common;
+    RD_STATS this_rdc;
+    const ModeCosts *mode_costs = &x->mode_costs;
+    TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+    MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+
+    mi->mode = GLOBALMV;
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+    xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+
+    const int16_t mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+    this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);
+
+    this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+    txfm_info->skip_txfm = this_rdc.skip_txfm;
+    // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+    // is higher than best_ref mode (on original source).
+    if (this_rdc.rdcost > best_rdc->rdcost) {
+      this_rdc = *best_rdc;
+      mi->mode = best_pickmode->best_mode;
+      mi->ref_frame[0] = best_pickmode->best_ref_frame;
+      set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+      mi->interp_filters = best_pickmode->best_pred_filter;
+      if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+        mi->mv[0].as_int = INVALID_MV;
+      } else {
+        mi->mv[0].as_int = ctx_den
+                               ->frame_mv[best_pickmode->best_mode]
+                                         [best_pickmode->best_ref_frame]
+                               .as_int;
+        if (ctx_den->reuse_inter_pred) {
+          xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+          av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+        }
+      }
+      mi->tx_size = best_pickmode->best_tx_size;
+      txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+    } else {
+      ctx_den->best_ref_frame = LAST_FRAME;
+      *best_rdc = this_rdc;
+    }
   }
-  return force_skip_low_temp_var;
 }
+#endif  // CONFIG_AV1_TEMPORAL_DENOISING
 
 #define FILTER_SEARCH_SIZE 2
+
+/*!\brief Searches for the best intrpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR,
+ * EIGTHTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects
+ * the one that gives lowest RD cost. RD cost is calculated using curvfit model.
+ * Support for dual filters (different filters in the x & y directions) is
+ * allowed if sf.interp_sf.disable_dual_filter = 0.
+ *
+ * \param[in]    cpi                  Top-level encoder structure
+ * \param[in]    x                    Pointer to structure holding all the
+ *                                    data for the current macroblock
+ * \param[in]    this_rdc             Pointer to calculated RD Cost
+ * \param[in]    mi_row               Row index in 4x4 units
+ * \param[in]    mi_col               Column index in 4x4 units
+ * \param[in]    tmp                  Pointer to a temporary buffer for
+ *                                    prediction re-use
+ * \param[in]    bsize                Current block size
+ * \param[in]    reuse_inter_pred     Flag, indicating prediction re-use
+ * \param[out]   this_mode_pred       Pointer to store prediction buffer
+ *                                    for prediction re-use
+ * \param[out]   this_early_term      Flag, indicating that transform can be
+ *                                    skipped
+ * \param[in]    use_model_yrd_large  Flag, indicating special logic to handle
+ *                                    large blocks
+ * \param[in]    best_sse             Best sse so far.
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc and best filter is placed to \c mi->interp_filters. In case
+ * \c reuse_inter_pred flag is set, this function also ouputs
+ * \c this_mode_pred. Also \c this_early_temp is set if transform can be
+ * skipped
+ */
 static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
                               int mi_row, int mi_col, PRED_BUFFER *tmp,
                               BLOCK_SIZE bsize, int reuse_inter_pred,
-                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
-                              unsigned int *sse_y, int *this_early_term,
-                              int use_model_yrd_large, int64_t *sse_block_yrd) {
+                              PRED_BUFFER **this_mode_pred,
+                              int *this_early_term, int use_model_yrd_large,
+                              int64_t best_sse) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   MB_MODE_INFO *const mi = xd->mi[0];
   const int bw = block_size_wide[bsize];
-  int pf_rate[FILTER_SEARCH_SIZE] = { 0 };
-  int64_t pf_dist[FILTER_SEARCH_SIZE] = { 0 };
-  unsigned int pf_var[FILTER_SEARCH_SIZE] = { 0 };
-  unsigned int pf_sse[FILTER_SEARCH_SIZE] = { 0 };
-  int64_t pf_sse_block_yrd[FILTER_SEARCH_SIZE] = { 0 };
-  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
+  int dim_factor =
+      (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
   PRED_BUFFER *current_pred = *this_mode_pred;
-  int skip_txfm[FILTER_SEARCH_SIZE] = { 0 };
   int best_skip = 0;
   int best_early_term = 0;
   int64_t best_cost = INT64_MAX;
   int best_filter_index = -1;
-  InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
-                                               EIGHTTAP_SMOOTH };
-  int i;
-  for (i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+  for (int i = 0; i < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; ++i) {
     int64_t cost;
-    InterpFilter filter = filters[i];
-    mi->interp_filters = av1_broadcast_interp_filter(filter);
+    if (cpi->sf.interp_sf.disable_dual_filter &&
+        filters_ref_set[i].filter_x != filters_ref_set[i].filter_y)
+      continue;
+    mi->interp_filters.as_filters.x_filter = filters_ref_set[i].filter_x;
+    mi->interp_filters.as_filters.y_filter = filters_ref_set[i].filter_y;
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
     if (use_model_yrd_large)
-      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rate[i],
-                                &pf_dist[i], &pf_var[i], &pf_sse[i],
-                                this_early_term, 1);
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                &pf_rd_stats[i], this_early_term, 1, best_sse);
     else
-      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[i], &pf_dist[i],
-                        &skip_txfm[i], NULL, &pf_var[i], &pf_sse[i], 1);
-    pf_rate[i] += av1_get_switchable_rate(x, xd, cm->features.interp_filter);
-    cost = RDCOST(x->rdmult, pf_rate[i], pf_dist[i]);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+    pf_rd_stats[i].rate += av1_get_switchable_rate(
+        x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
+    cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
     pf_tx_size[i] = mi->tx_size;
     if (cost < best_cost) {
       best_filter_index = i;
       best_cost = cost;
-      best_skip = skip_txfm[i];
+      best_skip = pf_rd_stats[i].skip_txfm;
       best_early_term = *this_early_term;
       if (reuse_inter_pred) {
         if (*this_mode_pred != current_pred) {
@@ -1340,26 +1583,224 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
       }
     }
   }
-  assert(best_filter_index >= 0 && best_filter_index < FILTER_SEARCH_SIZE);
+  assert(best_filter_index >= 0 &&
+         best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
   if (reuse_inter_pred && *this_mode_pred != current_pred)
     free_pred_buffer(current_pred);
 
-  mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
+  mi->interp_filters.as_filters.x_filter =
+      filters_ref_set[best_filter_index].filter_x;
+  mi->interp_filters.as_filters.y_filter =
+      filters_ref_set[best_filter_index].filter_y;
   mi->tx_size = pf_tx_size[best_filter_index];
-  this_rdc->rate = pf_rate[best_filter_index];
-  this_rdc->dist = pf_dist[best_filter_index];
-  *var_y = pf_var[best_filter_index];
-  *sse_y = pf_sse[best_filter_index];
-  *sse_block_yrd = pf_sse_block_yrd[best_filter_index];
-  this_rdc->skip = (best_skip || best_early_term);
+  this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+  this_rdc->dist = pf_rd_stats[best_filter_index].dist;
+  this_rdc->sse = pf_rd_stats[best_filter_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
   *this_early_term = best_early_term;
   if (reuse_inter_pred) {
     pd->dst.buf = (*this_mode_pred)->data;
     pd->dst.stride = (*this_mode_pred)->stride;
-  } else if (best_filter_index < FILTER_SEARCH_SIZE - 1) {
+  } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
   }
 }
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+
+static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi,
+                                             MACROBLOCK *const x,
+                                             const MB_MODE_INFO *mbmi) {
+  const FeatureFlags *const features = &cpi->common.features;
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cpi->sf.inter_sf.extra_prune_warped) return 0;
+  if (has_second_ref(mbmi)) return 0;
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+  if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mbmi, features->allow_warped_motion);
+  }
+
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    return 1;
+  }
+
+  return 0;
+}
+
+static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const FeatureFlags *const features = &cm->features;
+
+  mi->num_proj_ref = 1;
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+  if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mi, features->allow_warped_motion);
+  }
+
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    if (warp_sample_info->num < 0) {
+      warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+    }
+    mi->num_proj_ref = warp_sample_info->num;
+  }
+}
+
+static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               int *this_early_term, int use_model_yrd_large,
+                               int *rate_mv, int64_t best_sse) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const FeatureFlags *const features = &cm->features;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_mode_index = -1;
+  const int interp_filter = features->interp_filter;
+
+  const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
+    SIMPLE_TRANSLATION, WARPED_CAUSAL
+  };
+  int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1;
+
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+
+  const int total_samples = mi->num_proj_ref;
+  if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
+    mode_search_size = 1;
+  }
+
+  const MB_MODE_INFO base_mbmi = *mi;
+  MB_MODE_INFO best_mbmi;
+
+  for (int i = 0; i < mode_search_size; ++i) {
+    int64_t cost = INT64_MAX;
+    MOTION_MODE motion_mode = motion_modes[i];
+    *mi = base_mbmi;
+    mi->motion_mode = motion_mode;
+    if (motion_mode == SIMPLE_TRANSLATION) {
+      mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+      if (use_model_yrd_large)
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                  &pf_rd_stats[i], this_early_term, 1,
+                                  best_sse);
+      else
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+      pf_rd_stats[i].rate +=
+          av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+                                  cm->seq_params->enable_dual_filter);
+      cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
+    } else if (motion_mode == WARPED_CAUSAL) {
+      int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+      const ModeCosts *mode_costs = &x->mode_costs;
+      mi->wm_params.wmtype = DEFAULT_WMTYPE;
+      mi->interp_filters =
+          av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+      memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+      memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+      // Select the samples according to motion vector difference
+      if (mi->num_proj_ref > 1) {
+        mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
+                                             mi->num_proj_ref, bsize);
+      }
+
+      // Compute the warped motion parameters with a least squares fit
+      //  using the collected samples
+      if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
+                               mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
+                               &mi->wm_params, mi_row, mi_col)) {
+        if (mi->mode == NEWMV) {
+          const int_mv mv0 = mi->mv[0];
+          const WarpedMotionParams wm_params0 = mi->wm_params;
+          const int num_proj_ref0 = mi->num_proj_ref;
+
+          const int_mv ref_mv = av1_get_ref_mv(x, 0);
+          SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+          av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+                                            &ref_mv.as_mv, NULL);
+
+          // Refine MV in a small range.
+          av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+                               total_samples);
+          if (mi->mv[0].as_int == ref_mv.as_int) {
+            continue;
+          }
+
+          if (mv0.as_int != mi->mv[0].as_int) {
+            // Keep the refined MV and WM parameters.
+            int tmp_rate_mv = av1_mv_bit_cost(
+                &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+                x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+            *rate_mv = tmp_rate_mv;
+          } else {
+            // Restore the old MV and WM parameters.
+            mi->mv[0] = mv0;
+            mi->wm_params = wm_params0;
+            mi->num_proj_ref = num_proj_ref0;
+          }
+        }
+        // Build the warped predictor
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      av1_num_planes(cm) - 1);
+        if (use_model_yrd_large)
+          model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                    &pf_rd_stats[i], this_early_term, 1,
+                                    best_sse);
+        else
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+
+        pf_rd_stats[i].rate +=
+            mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+        cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
+      } else {
+        cost = INT64_MAX;
+      }
+    }
+    if (cost < best_cost) {
+      best_mode_index = i;
+      best_cost = cost;
+      best_skip = pf_rd_stats[i].skip_txfm;
+      best_early_term = *this_early_term;
+      best_mbmi = *mi;
+    }
+  }
+  assert(best_mode_index >= 0 && best_mode_index < FILTER_SEARCH_SIZE);
+
+  *mi = best_mbmi;
+  this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+  this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+  this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
+  *this_early_term = best_early_term;
+  if (best_mode_index < FILTER_SEARCH_SIZE - 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
 
 #define COLLECT_PICK_MODE_STAT 0
 
@@ -1379,6 +1820,7 @@ typedef struct _mode_search_stat {
 static void compute_intra_yprediction(const AV1_COMMON *cm,
                                       PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                       MACROBLOCK *x, MACROBLOCKD *xd) {
+  const SequenceHeader *seq_params = cm->seq_params;
   struct macroblockd_plane *const pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
   uint8_t *const src_buf_base = p->src.buf;
@@ -1405,32 +1847,37 @@ static void compute_intra_yprediction(const AV1_COMMON *cm,
     for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
       p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
       pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
-      av1_predict_intra_block(cm, xd, block_size_wide[bsize],
-                              block_size_high[bsize], tx_size, mode, 0, 0,
-                              FILTER_INTRA_MODES, pd->dst.buf, dst_stride,
-                              pd->dst.buf, dst_stride, 0, 0, plane);
+      av1_predict_intra_block(
+          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+          0, 0, plane);
     }
   }
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
 }
 
-void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
-                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   RD_STATS this_rdc, best_rdc;
   struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const TX_SIZE intra_tx_size =
       AOMMIN(max_txsize_lookup[bsize],
-             tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
   int *bmode_costs;
+  PREDICTION_MODE best_mode = DC_PRED;
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
   const PREDICTION_MODE A = av1_above_block_mode(above_mi);
   const PREDICTION_MODE L = av1_left_block_mode(left_mi);
-  bmode_costs = x->y_mode_costs[A][L];
+  const int above_ctx = intra_mode_context[A];
+  const int left_ctx = intra_mode_context[L];
+  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
 
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
@@ -1447,22 +1894,31 @@ void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
     args.skippable = 1;
     args.rdc = &this_rdc;
     mi->tx_size = intra_tx_size;
+    mi->mode = this_mode;
     av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
                                            &args);
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
     if (args.skippable) {
-      this_rdc.rate = av1_cost_symbol(av1_get_skip_cdf(xd)[1]);
+      this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
     } else {
-      this_rdc.rate += av1_cost_symbol(av1_get_skip_cdf(xd)[0]);
+      this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0];
     }
     this_rdc.rate += bmode_costs[this_mode];
     this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
 
     if (this_rdc.rdcost < best_rdc.rdcost) {
       best_rdc = this_rdc;
-      mi->mode = this_mode;
+      best_mode = this_mode;
+      if (!this_rdc.skip_txfm) {
+        memset(ctx->blk_skip, 0,
+               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+      }
     }
   }
 
+  mi->mode = best_mode;
+  // Keep DC for UV since mode test is based on Y channel only.
+  mi->uv_mode = UV_DC_PRED;
   *rd_cost = best_rdc;
 
 #if CONFIG_INTERNAL_STATS
@@ -1472,95 +1928,656 @@ void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
 #endif  // CONFIG_INTERNAL_STATS
 }
 
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
+          (sf_last->y_scale_fp == sf_golden->y_scale_fp));
+}
+
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+                                              MB_MODE_INFO *mi, int mi_row,
+                                              int mi_col, int bsize,
+                                              int gf_temporal_ref,
+                                              int use_ref_frame[],
+                                              int *force_skip_low_temp_var) {
+  AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+
+  // For SVC the usage of alt_ref is determined by the ref_frame_flags.
+  int use_alt_ref_frame =
+      cpi->ppi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame;
+  int use_golden_ref_frame = 1;
+  int use_last_ref_frame = 1;
+
+  if (cpi->ppi->use_svc)
+    use_last_ref_frame =
+        cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
+
+  // Only remove golden and altref reference below if last is a reference,
+  // which may not be the case for svc.
+  if (use_last_ref_frame && cpi->rc.frames_since_golden == 0 &&
+      gf_temporal_ref) {
+    use_golden_ref_frame = 0;
+  }
+  if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    else
+      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip golden reference.
+    if (*force_skip_low_temp_var) {
+      use_golden_ref_frame = 0;
+      use_alt_ref_frame = 0;
+    }
+  }
+
+  if (use_last_ref_frame &&
+      (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip ||
+       (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
+    use_golden_ref_frame = 0;
+    use_alt_ref_frame = 0;
+    // Keep golden (longer-term) reference if sb has high source sad, for
+    // frames whose average souce_sad is below threshold. This is to try to
+    // capture case where only part of frame has high motion.
+    if (x->content_state_sb.source_sad_nonrd >= kHighSad &&
+        bsize <= BLOCK_32X32 && cpi->rc.frame_source_sad < 50000)
+      use_golden_ref_frame = 1;
+  }
+
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    use_golden_ref_frame = 1;
+    use_alt_ref_frame = 0;
+  }
+
+  use_alt_ref_frame =
+      cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+  use_golden_ref_frame =
+      cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+  use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+  use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+  use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+  // For now keep this assert on, but we should remove it for svc mode,
+  // as the user may want to generate an intra-only frame (no inter-modes).
+  // Remove this assert in subsequent CL when nonrd_pickmode is tested for the
+  // case of intra-only frame (no references enabled).
+  assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Using heuristics based on best inter mode, block size, and other decides
+ * whether to check intra modes. If so, estimates and selects best intra mode
+ * from the reduced set of intra modes (max 4 intra modes checked)
+ *
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    use_modeled_non_rd_cost  Flag, indicating usage of curvfit
+ *                                        model for RD cost
+ * \param[in]    best_early_term          Flag, indicating that TX for the
+ *                                        best inter mode was skipped
+ * \param[in]    ref_cost_intra           Cost of signalling intra mode
+ * \param[in]    reuse_prediction         Flag, indicating prediction re-use
+ * \param[in]    orig_dst                 Original destination buffer
+ * \param[in]    tmp_buffers              Pointer to a temporary buffers for
+ *                                        prediction re-use
+ * \param[out]   this_mode_pred           Pointer to store prediction buffer
+ *                                        for prediction re-use
+ * \param[in]    best_rdc                 Pointer to RD cost for the best
+ *                                        selected intra mode
+ * \param[in]    best_pickmode            Pointer to a structure containing
+ *                                        best mode picked so far
+ * \param[in]    ctx                      Pointer to structure holding coding
+ *                                        contexts and modes for the block
+ *
+ * \return Nothing is returned. Instead, calculated RD cost is placed to
+ * \c best_rdc and best selected mode is placed to \c best_pickmode
+ */
+static void estimate_intra_mode(
+    AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int use_modeled_non_rd_cost,
+    int best_early_term, unsigned int ref_cost_intra, int reuse_prediction,
+    struct buf_2d *orig_dst, PRED_BUFFER *tmp_buffers,
+    PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+    BEST_PICKMODE *best_pickmode, PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const unsigned char segment_id = mi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  const CommonQuantParams *quant_params = &cm->quant_params;
+
+  RD_STATS this_rdc;
+
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params->bit_depth);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+  int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
+  int force_intra_check = 0;
+  // For spatial enhancemanent layer: turn off intra prediction if the
+  // previous spatial layer as golden ref is not chosen as best reference.
+  // only do this for temporal enhancement layer and on non-key frames.
+  if (cpi->svc.spatial_layer_id > 0 &&
+      best_pickmode->best_ref_frame != GOLDEN_FRAME &&
+      cpi->svc.temporal_layer_id > 0 &&
+      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+    perform_intra_pred = 0;
+
+  int do_early_exit_rdthresh = 1;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds to make intra mode likely tested if the other
+  // references (golden, alt) are skipped/not checked. For now always
+  // adjust for svc mode.
+  if (cpi->ppi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+                            cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode is large motion or non-LAST ref reduce intra cost
+    // penalty, so intra mode is more likely tested.
+    if (best_rdc->rdcost != INT64_MAX &&
+        (best_pickmode->best_ref_frame != LAST_FRAME ||
+         abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+         abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    if (x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
+        x->content_state_sb.source_sad_nonrd >= kHighSad)
+      force_intra_check = 1;
+    // For big blocks worth checking intra (since only DC will be checked),
+    // even if best_early_term is set.
+    if (bsize >= BLOCK_32X32) best_early_term = 0;
+  } else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
+             x->content_state_sb.source_sad_nonrd == kLowSad) {
+    perform_intra_pred = 0;
+  }
+
+  if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
+    if (cpi->sf.rt_sf.skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
+      perform_intra_pred = 0;
+    else if (cpi->sf.rt_sf.skip_intra_pred == 2)
+      perform_intra_pred = 0;
+  }
+
+  if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
+        (perform_intra_pred && !best_early_term &&
+         best_rdc->rdcost > inter_mode_thresh &&
+         bsize <= cpi->sf.part_sf.max_intra_bsize))) {
+    return;
+  }
+
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  TX_SIZE intra_tx_size = AOMMIN(
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+      TX_16X16);
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      cpi->rc.high_source_sad && x->source_variance > spatial_var_thresh &&
+      bsize <= BLOCK_16X16)
+    intra_tx_size = TX_4X4;
+
+  PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+  if (reuse_prediction && best_pred != NULL) {
+    const int bh = block_size_high[bsize];
+    const int bw = block_size_wide[bsize];
+    if (best_pred->data == orig_dst->buf) {
+      *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
+      aom_convolve_copy(best_pred->data, best_pred->stride,
+                        (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
+                        bh);
+      best_pickmode->best_pred = *this_mode_pred;
+    }
+  }
+  pd->dst = *orig_dst;
+
+  for (int i = 0; i < 4; ++i) {
+    const PREDICTION_MODE this_mode = intra_mode_list[i];
+    const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+    const int64_t mode_rd_thresh = rd_threshes[mode_index];
+
+    if (i > 2 || !(force_intra_check == 1 &&
+                   best_pickmode->best_ref_frame != INTRA_FRAME)) {
+      if (!((1 << this_mode) &
+            cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+        continue;
+    }
+
+    if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+      // For spatially flat blocks with zero motion only check
+      // DC mode.
+      if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
+          x->content_state_sb.source_sad_nonrd == kZeroSad &&
+          x->source_variance == 0 && this_mode != DC_PRED)
+        continue;
+    }
+
+    if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]) &&
+        (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+      continue;
+    }
+    const BLOCK_SIZE uv_bsize = get_plane_block_size(
+        bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+
+    mi->mode = this_mode;
+    mi->ref_frame[0] = INTRA_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+
+    av1_invalid_rd_stats(&this_rdc);
+    args.mode = this_mode;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
+    compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+    // Look into selecting tx_size here, based on prediction residual.
+    if (use_modeled_non_rd_cost)
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+    else
+      av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
+                    mi->tx_size, DCT_DCT, 0);
+    // TODO(kyslov@) Need to account for skippable
+    if (x->color_sensitivity[0]) {
+      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
+                                             estimate_block_intra, &args);
+    }
+    if (x->color_sensitivity[1]) {
+      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
+                                             estimate_block_intra, &args);
+    }
+
+    int mode_cost = 0;
+    if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+      mode_cost +=
+          x->mode_costs.angle_delta_cost[this_mode - V_PRED]
+                                        [MAX_ANGLE_DELTA +
+                                         mi->angle_delta[PLANE_TYPE_Y]];
+    }
+    if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+      mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
+    }
+    this_rdc.rate += ref_cost_intra;
+    this_rdc.rate += intra_cost_penalty;
+    this_rdc.rate += mode_cost;
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc->rdcost) {
+      *best_rdc = this_rdc;
+      best_pickmode->best_mode = this_mode;
+      best_pickmode->best_tx_size = mi->tx_size;
+      best_pickmode->best_ref_frame = INTRA_FRAME;
+      best_pickmode->best_second_ref_frame = NONE;
+      best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
+      if (!this_rdc.skip_txfm) {
+        memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+      }
+      mi->uv_mode = this_mode;
+      mi->mv[0].as_int = INVALID_MV;
+      mi->mv[1].as_int = INVALID_MV;
+    }
+  }
+  mi->tx_size = best_pickmode->best_tx_size;
+}
+
+static AOM_INLINE int is_filter_search_enabled(const AV1_COMP *cpi, int mi_row,
+                                               int mi_col, BLOCK_SIZE bsize,
+                                               int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int enable_filter_search = 0;
+
+  if (cpi->sf.rt_sf.use_nonrd_filter_search) {
+    enable_filter_search = 1;
+    if (cpi->sf.interp_sf.cb_pred_filter_search) {
+      const int bsl = mi_size_wide_log2[bsize];
+      enable_filter_search =
+          (((mi_row + mi_col) >> bsl) +
+           get_chessboard_index(cm->current_frame.frame_number)) &
+          0x1;
+      if (cyclic_refresh_segment_id_boosted(segment_id))
+        enable_filter_search = 1;
+    }
+  }
+  return enable_filter_search;
+}
+
+static AOM_INLINE int skip_mode_by_threshold(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
+    int frames_since_golden, const int *const rd_threshes,
+    const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
+    int extra_shift) {
+  int skip_this_mode = 0;
+  const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
+  int64_t mode_rd_thresh =
+      best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1)
+                : ((int64_t)rd_threshes[mode_index]) << extra_shift;
+
+  // Increase mode_rd_thresh value for non-LAST for improved encoding
+  // speed
+  if (ref_frame != LAST_FRAME) {
+    mode_rd_thresh = mode_rd_thresh << 1;
+    if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4)
+      mode_rd_thresh = mode_rd_thresh << (extra_shift + 1);
+  }
+
+  if (rd_less_than_thresh(best_cost, mode_rd_thresh,
+                          rd_thresh_freq_fact[mode_index]))
+    if (mv.as_int != 0) skip_this_mode = 1;
+
+  return skip_this_mode;
+}
+
+static AOM_INLINE int skip_mode_by_low_temp(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
+  // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
+  // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+  // later.
+  if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
+    return 1;
+  }
+
+  if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
+      force_skip_low_temp_var && mode == NEWMV) {
+    return 1;
+  }
+  return 0;
+}
+
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+  const unsigned int thresh_skip_golden = 500;
+
+  if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
+      mode == NEWMV)
+    return 1;
+
+  if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+
+  // Skip testing non-LAST if this flag is set.
+  if (extra_prune) {
+    if (extra_prune > 1 && ref_frame != LAST_FRAME &&
+        (bsize > BLOCK_16X16 && mode == NEWMV))
+      return 1;
+
+    if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
+
+    if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+  }
+  return 0;
+}
+
+void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                           BLOCK_SIZE bsize, int y_sad,
+                           unsigned int source_variance) {
+  const int factor = (bsize >= BLOCK_32X32) ? 2 : 3;
+  NOISE_LEVEL noise_level = kLow;
+  int norm_sad =
+      y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+  // If the spatial source variance is high and the normalized y_sad
+  // is low, then y-channel is likely good for mode estimation, so keep
+  // color_sensitivity off. For low noise content for now, since there is
+  // some bdrate regression for noisy color clip.
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+  if (noise_level == kLow && source_variance > 1000 && norm_sad < 50) {
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
+    return;
+  }
+  for (int i = 1; i <= 2; ++i) {
+    if (x->color_sensitivity[i - 1] == 2 || source_variance < 50) {
+      struct macroblock_plane *const p = &x->plane[i];
+      struct macroblockd_plane *const pd = &xd->plane[i];
+      const BLOCK_SIZE bs =
+          get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+      const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                                  pd->dst.buf, pd->dst.stride);
+      const int norm_uv_sad =
+          uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+      x->color_sensitivity[i - 1] =
+          uv_sad > (factor * (y_sad >> 3)) && norm_uv_sad > 40;
+      if (source_variance < 50 && norm_uv_sad > 100)
+        x->color_sensitivity[i - 1] = 1;
+    }
+  }
+}
+
+void setup_compound_prediction(AV1_COMP *cpi, MACROBLOCK *x,
+                               struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+                               int *use_ref_frame_mask, int flag_comp,
+                               int *ref_mv_idx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  MV_REFERENCE_FRAME rf[2] = { LAST_FRAME, GOLDEN_FRAME };
+  MV_REFERENCE_FRAME ref_frame_comp;
+  if (flag_comp == 1) {
+    rf[1] = LAST2_FRAME;
+  } else if (flag_comp == 2) {
+    rf[1] = ALTREF_FRAME;
+  }
+  if (!use_ref_frame_mask[rf[1]]) {
+    // Need to setup pred_block, if it hasn't been done in find_predictors.
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
+    const int num_planes = av1_num_planes(cm);
+    if (yv12 != NULL) {
+      const struct scale_factors *const sf =
+          get_ref_scale_factors_const(cm, rf[1]);
+      av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
+    }
+  }
+  ref_frame_comp = av1_ref_frame_type(rf);
+  mbmi_ext->mode_context[ref_frame_comp] = 0;
+  mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                   mbmi_ext->mode_context);
+  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
+  *ref_mv_idx = mbmi->ref_mv_idx + 1;
+}
+
+static void set_compound_mode(MACROBLOCK *x, int comp_index, int ref_frame,
+                              int ref_frame2, int ref_mv_idx,
+                              int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+                              PREDICTION_MODE *this_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  *this_mode = GLOBAL_GLOBALMV;
+  mi->ref_frame[0] = ref_frame;
+  mi->ref_frame[1] = ref_frame2;
+  mi->compound_idx = 1;
+  mi->comp_group_idx = 0;
+  mi->interinter_comp.type = COMPOUND_AVERAGE;
+  MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
+  if (comp_index % 3 == 0) {
+    frame_mv[*this_mode][ref_frame].as_int = 0;
+    frame_mv[*this_mode][ref_frame2].as_int = 0;
+  } else if (comp_index % 3 == 1) {
+    *this_mode = NEAREST_NEARESTMV;
+    frame_mv[*this_mode][ref_frame].as_int =
+        xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
+    frame_mv[*this_mode][ref_frame2].as_int =
+        xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
+  } else if (comp_index % 3 == 2) {
+    *this_mode = NEAR_NEARMV;
+    frame_mv[*this_mode][ref_frame].as_int =
+        xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
+    frame_mv[*this_mode][ref_frame2].as_int =
+        xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
+  }
+}
+
+static int skip_comp_based_on_sad(AV1_COMP *cpi, MACROBLOCK *x,
+                                  const int mi_row, const int mi_col,
+                                  BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(!(mi_row % 16) && !(mi_col % 16));
+  const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                ? (cm->seq_params->mib_size >> 1)
+                                : cm->seq_params->mib_size;
+  const int sb_cols =
+      (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+  const uint64_t sad_skp_comp_th[2][3] = { { 2700, 3100 },    // CPU 9
+                                           { 2700, 3200 } };  // CPU 10
+  const uint64_t sad_blkwise_var_th = 5000;
+  const float qindex_th_scale[5] = { 0.75f, 0.9f, 1.0f, 1.1f, 1.25f };
+  const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+  assert(qindex_band < 5);
+  const int sp_idx = (cpi->sf.rt_sf.sad_based_comp_prune >= 2);
+  const int bsize_idx = (bsize == BLOCK_128X128);
+  const uint64_t sad_skp_comp_th_val = (uint64_t)(
+      sad_skp_comp_th[sp_idx][bsize_idx] * qindex_th_scale[qindex_band]);
+  uint64_t blk_sad = 0, sad00, sad01, sad10, sad11, min_sad, max_sad;
+  const int sbi_col = mi_col / 16;
+  const int sbi_row = mi_row / 16;
+  const uint64_t *cur_blk_sad =
+      &cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+
+  if (bsize == BLOCK_128X128) {
+    sad00 = cur_blk_sad[0];
+    sad01 = cur_blk_sad[1];
+    sad10 = cur_blk_sad[sb_cols];
+    sad11 = cur_blk_sad[1 + sb_cols];
+    min_sad = AOMMIN(AOMMIN(AOMMIN(sad00, sad01), sad10), sad11);
+    max_sad = AOMMAX(AOMMAX(AOMMAX(sad00, sad01), sad10), sad11);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    blk_sad = (sad00 + sad01 + sad10 + sad11 + 2) >> 2;
+  } else if (bsize == BLOCK_128X64) {
+    sad00 = cur_blk_sad[0];
+    sad01 = cur_blk_sad[1];
+    min_sad = AOMMIN(sad00, sad01);
+    max_sad = AOMMAX(sad00, sad01);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    blk_sad = (sad00 + sad01 + 1) >> 1;
+  } else if (bsize == BLOCK_64X128) {
+    sad00 = cur_blk_sad[0];
+    sad10 = cur_blk_sad[sb_cols];
+    min_sad = AOMMIN(sad00, sad10);
+    max_sad = AOMMAX(sad00, sad10);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    blk_sad = (sad00 + sad10 + 1) >> 1;
+  } else if (bsize <= BLOCK_64X64) {
+    blk_sad = cur_blk_sad[0];
+  } else {
+    assert(0);
+  }
+
+  if (blk_sad < sad_skp_comp_th_val) return 1;
+
+  return 0;
+}
+
 void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                   MACROBLOCK *x, RD_STATS *rd_cost,
-                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far) {
+                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-
+  const InterpFilter filter_ref = cm->features.interp_filter;
+  const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
   BEST_PICKMODE best_pickmode;
-  int inter_mode_mask[BLOCK_SIZES];
 #if COLLECT_PICK_MODE_STAT
   static mode_search_stat ms_stat;
 #endif
-  MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
+  MV_REFERENCE_FRAME ref_frame, ref_frame2;
   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
-  struct buf_2d yv12_mb[8][MAX_MB_PLANE];
-  static const int flag_list[8] = { 0, AOM_LAST_FLAG, 0, 0, AOM_GOLD_FLAG, 0,
-                                    0, AOM_ALT_FLAG };
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   RD_STATS this_rdc, best_rdc;
-  // var_y and sse_y are saved to be used in skipping checking
-  unsigned int sse_y = UINT_MAX;
-  unsigned int var_y = UINT_MAX;
-  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const unsigned char segment_id = mi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
-  InterpFilter filter_ref;
-  int ref_frame_skip_mask = 0;
-  int best_pred_sad = INT_MAX;
   int best_early_term = 0;
-  unsigned int ref_costs_single[REF_FRAMES],
-      ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  unsigned int ref_costs_single[REF_FRAMES];
   int force_skip_low_temp_var = 0;
-  int skip_ref_find_pred[8] = { 0 };
+  int use_ref_frame_mask[REF_FRAMES] = { 0 };
   unsigned int sse_zeromv_norm = UINT_MAX;
-  const unsigned int thresh_skip_golden = 500;
-  int gf_temporal_ref = 0;
-  const struct segmentation *const seg = &cm->seg;
-  int num_inter_modes = RT_INTER_MODES;
-  unsigned char segment_id = mi->segment_id;
+  // Use mode set that includes zeromv (via globalmv) for speed >= 9 for
+  // content with low motion, and always for force_zeromv_skip.
+  int use_zeromv =
+      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+      ((cpi->oxcf.speed >= 9 && cpi->rc.avg_frame_low_motion > 70) ||
+       cpi->sf.rt_sf.nonrd_agressive_skip || x->force_zeromv_skip);
+  int skip_pred_mv = 0;
+  const int num_inter_modes =
+      use_zeromv ? NUM_INTER_MODES_REDUCED : NUM_INTER_MODES_RT;
+  const REF_MODE *const ref_mode_set =
+      use_zeromv ? ref_mode_set_reduced : ref_mode_set_rt;
   PRED_BUFFER tmp[4];
   DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
   PRED_BUFFER *this_mode_pred = NULL;
-  const int reuse_inter_pred =
-      cpi->sf.rt_sf.reuse_inter_pred_nonrd && cm->seq_params.bit_depth == 8;
+  const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
+                               cm->seq_params->bit_depth == AOM_BITS_8;
+
   const int bh = block_size_high[bsize];
   const int bw = block_size_wide[bsize];
   const int pixels_in_block = bh * bw;
+  const int num_8x8_blocks = ctx->num_4x4_blk / 4;
   struct buf_2d orig_dst = pd->dst;
   const CommonQuantParams *quant_params = &cm->quant_params;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 #if COLLECT_PICK_MODE_STAT
   aom_usec_timer_start(&ms_stat.timer2);
 #endif
-  int intra_cost_penalty = av1_get_intra_cost_penalty(
-      quant_params->base_qindex, quant_params->y_dc_delta_q,
-      cm->seq_params.bit_depth);
-  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
-  const int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
-  int use_modeled_non_rd_cost = 0;
-  int enable_filter_search = 0;
-  InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
   int64_t thresh_sad_pred = INT64_MAX;
-
-  (void)best_rd_so_far;
-
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int svc_mv_col = 0;
+  int svc_mv_row = 0;
+  int force_mv_inter_layer = 0;
+  int use_modeled_non_rd_cost = 0;
+  int comp_pred = 0;
+  int num_comp_modes_ref = 0;
+  int tot_num_comp_modes = 9;
+  int ref_mv_idx = 0;
+  int skip_comp_mode = 0;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  const int denoise_recheck_zeromv = 1;
+  AV1_PICKMODE_CTX_DEN ctx_den;
+  int64_t zero_last_cost_orig = INT64_MAX;
+  int denoise_svc_pickmode = 1;
+  const int resize_pending = is_frame_resize_pending(cpi);
+#endif
+  x->color_sensitivity[0] = x->color_sensitivity_sb[0];
+  x->color_sensitivity[1] = x->color_sensitivity_sb[1];
   init_best_pickmode(&best_pickmode);
 
-  for (int i = 0; i < BLOCK_SIZES; ++i) inter_mode_mask[i] = INTER_ALL;
-
-  // TODO(kyslov) Move this to Speed Features
-  inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEAR;
-
-  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
-  struct scale_factors *const sf_golden =
-      get_ref_scale_factors(cm, GOLDEN_FRAME);
-  gf_temporal_ref = 1;
-  // For temporal long term prediction, check that the golden reference
-  // is same scale as last reference, otherwise disable.
-  if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
-      (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
-    gf_temporal_ref = 0;
-  }
+  const ModeCosts *mode_costs = &x->mode_costs;
 
-  av1_collect_neighbors_ref_counts(xd);
-
-  estimate_single_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single);
-  if (cpi->sf.rt_sf.use_comp_ref_nonrd)
-    estimate_comp_ref_frame_costs(cm, xd, x, segment_id, ref_costs_comp);
+  estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id,
+                                  ref_costs_single);
 
   memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
   if (reuse_inter_pred) {
@@ -1574,66 +2591,69 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     tmp[3].in_use = 0;
   }
 
-  x->force_skip = 0;
-
-  // Instead of using av1_get_pred_context_switchable_interp(xd) to assign
-  // filter_ref, we use a less strict condition on assigning filter_ref.
-  // This is to reduce the probabily of entering the flow of not assigning
-  // filter_ref and then skip filter search.
-  filter_ref = cm->features.interp_filter;
+  txfm_info->skip_txfm = 0;
 
   // initialize mode decisions
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
   av1_invalid_rd_stats(rd_cost);
-  mi->sb_type = bsize;
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    x->warp_sample_info[i].num = -1;
+  }
+
+  mi->bsize = bsize;
   mi->ref_frame[0] = NONE_FRAME;
   mi->ref_frame[1] = NONE_FRAME;
 
-  usable_ref_frame =
-      cpi->sf.rt_sf.use_nonrd_altref_frame ? ALTREF_FRAME : GOLDEN_FRAME;
-
-  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
-    skip_ref_find_pred[GOLDEN_FRAME] = 1;
-    if (!cpi->sf.rt_sf.use_nonrd_altref_frame) usable_ref_frame = LAST_FRAME;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+    // av1_denoise_svc_non_key(cpi);
+    if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+      av1_denoiser_reset_frame_stats(ctx);
   }
+#endif
 
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
-  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
-      x->nonrd_prune_ref_frame_search) {
-    if (is_small_sb)
-      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
-          &x->variance_low[0], mi_row, mi_col, bsize);
-    else
-      force_skip_low_temp_var = get_force_skip_low_temp_var(
-          &x->variance_low[0], mi_row, mi_col, bsize);
-    // If force_skip_low_temp_var is set, skip golden reference.
-    if (force_skip_low_temp_var) {
-      usable_ref_frame = LAST_FRAME;
-    }
+  const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
+
+  // If the lower spatial layer uses an averaging filter for downsampling
+  // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
+  // to source, so use subpel motion vector to compensate. The nonzero motion
+  // is half pixel shifted to left and top, so (-4, -4). This has more effect
+  // on higher resolutins, so condition it on that for now.
+  if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+      svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+      cm->width * cm->height > 640 * 480) {
+    svc_mv_col = -4;
+    svc_mv_row = -4;
   }
 
-  // If the segment reference frame feature is enabled and it's set to GOLDEN
-  // reference, then make sure we don't skip checking GOLDEN, this is to
-  // prevent possibility of not picking any mode.
-  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
-      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
-    usable_ref_frame = GOLDEN_FRAME;
-    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+  get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+                         use_ref_frame_mask, &force_skip_low_temp_var);
+
+  skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 &&
+                  x->color_sensitivity[0] != 2 && x->color_sensitivity[1] != 2);
+
+  // Compound modes per reference pair (GOLDEN_LAST/LAST2_LAST/ALTREF_LAST):
+  // (0_0)/(NEAREST_NEAREST)/(NEAR_NEAR).
+  // For now to reduce slowdowm, use only (0,0) for blocks above 16x16
+  // for non-svc case or on enhancement layers for svc.
+  if (cpi->sf.rt_sf.use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+    if (cpi->ppi->use_svc && cpi->svc.temporal_layer_id == 0)
+      num_comp_modes_ref = 2;
+    else if (bsize > BLOCK_16X16)
+      num_comp_modes_ref = 1;
+    else
+      tot_num_comp_modes = 0;
+  } else {
+    tot_num_comp_modes = 0;
   }
 
   for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
-       ref_frame_iter <= usable_ref_frame; ++ref_frame_iter) {
-    // Skip find_predictor if the reference frame is not in the
-    // ref_frame_flags (i.e., not used as a reference for this frame).
-    skip_ref_find_pred[ref_frame_iter] =
-        !(cpi->ref_frame_flags & flag_list[ref_frame_iter]);
-    if (!skip_ref_find_pred[ref_frame_iter]) {
-      find_predictors(cpi, x, ref_frame_iter, frame_mv, &ref_frame_skip_mask,
-                      flag_list, tile_data, yv12_mb, bsize,
-                      force_skip_low_temp_var);
+       ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+    if (use_ref_frame_mask[ref_frame_iter]) {
+      find_predictors(cpi, x, ref_frame_iter, frame_mv, tile_data, yv12_mb,
+                      bsize, force_skip_low_temp_var, skip_pred_mv);
     }
   }
 
@@ -1644,57 +2664,106 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 
   const int large_block = bsize >= BLOCK_32X32;
   const int use_model_yrd_large =
-      cpi->oxcf.rc_mode == AOM_CBR && large_block &&
+      cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
       !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
-      quant_params->base_qindex && cm->seq_params.bit_depth == 8;
+      quant_params->base_qindex && cm->seq_params->bit_depth == 8;
+
+  const int enable_filter_search =
+      is_filter_search_enabled(cpi, mi_row, mi_col, bsize, segment_id);
+
+  // TODO(marpan): Look into reducing these conditions. For now constrain
+  // it to avoid significant bdrate loss.
+  if (cpi->sf.rt_sf.use_modeled_non_rd_cost) {
+    if (cpi->svc.non_reference_frame)
+      use_modeled_non_rd_cost = 1;
+    else if (cpi->svc.number_temporal_layers > 1 &&
+             cpi->svc.temporal_layer_id == 0)
+      use_modeled_non_rd_cost = 0;
+    else
+      use_modeled_non_rd_cost =
+          (quant_params->base_qindex > 120 && x->source_variance > 100 &&
+           bsize <= BLOCK_16X16 && !x->content_state_sb.lighting_change &&
+           x->content_state_sb.source_sad_nonrd != kHighSad);
+  }
 
 #if COLLECT_PICK_MODE_STAT
   ms_stat.num_blocks[bsize]++;
 #endif
   init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
-  mi->tx_size =
-      AOMMIN(AOMMIN(max_txsize_lookup[bsize],
-                    tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
-             TX_16X16);
+  mi->tx_size = AOMMIN(
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+      TX_16X16);
 
-  // TODO(marpan): Look into reducing these conditions. For now constrain
-  // it to avoid significant bdrate loss.
-  if (cpi->sf.rt_sf.use_modeled_non_rd_cost &&
-      quant_params->base_qindex > 120 && x->source_variance > 100 &&
-      bsize <= BLOCK_16X16 && x->content_state_sb != kLowVarHighSumdiff &&
-      x->content_state_sb != kHighSad)
-    use_modeled_non_rd_cost = 1;
+  // Skip compound mode based on sad
+  if ((cpi->sf.rt_sf.sad_based_comp_prune) && (bsize >= BLOCK_64X64) &&
+      (cpi->src_sad_blk_64x64 != NULL))
+    skip_comp_mode = skip_comp_based_on_sad(cpi, x, mi_row, mi_col, bsize);
 
-  if (cpi->sf.rt_sf.use_nonrd_filter_search) {
-    enable_filter_search = 1;
-    if (cpi->sf.interp_sf.cb_pred_filter_search) {
-      const int bsl = mi_size_wide_log2[bsize];
-      enable_filter_search =
-          (((mi_row + mi_col) >> bsl) +
-           get_chessboard_index(cm->current_frame.frame_number)) &
-          0x1;
-    }
-    if (x->source_variance <=
-        cpi->sf.interp_sf.disable_filter_search_var_thresh)
-      enable_filter_search = 0;
-  }
+  for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
+    const struct segmentation *const seg = &cm->seg;
 
-  for (int idx = 0; idx < num_inter_modes; ++idx) {
     int rate_mv = 0;
-    int mode_rd_thresh;
-    int mode_index;
-    int64_t this_sse;
     int is_skippable;
     int this_early_term = 0;
     int skip_this_mv = 0;
-    int comp_pred = 0;
-    int force_mv_inter_layer = 0;
+    comp_pred = 0;
     PREDICTION_MODE this_mode;
-    MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-    second_ref_frame = NONE_FRAME;
-
-    this_mode = ref_mode_set[idx].pred_mode;
-    ref_frame = ref_mode_set[idx].ref_frame;
+    MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+    RD_STATS nonskip_rdc;
+    av1_invalid_rd_stats(&nonskip_rdc);
+    memset(txfm_info->blk_skip, 0,
+           sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
+
+    if (idx >= num_inter_modes) {
+      if (skip_comp_mode) continue;
+      int comp_index = idx - num_inter_modes;
+      if (comp_index % 3 == 0) {
+        int i = 0;
+        ref_mv_idx = 0;
+        // Only needs to be done once per reference pair.
+        if (comp_index == 3) i = 1;
+        if (comp_index == 6) i = 2;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[i])
+          setup_compound_prediction(cpi, x, yv12_mb, use_ref_frame_mask, i,
+                                    &ref_mv_idx);
+      }
+      // num_comp_modes_ref == 1 only do (0,0)
+      if (num_comp_modes_ref == 1 && comp_index % 3 != 0) continue;
+      // num_comp_modes_ref == 2 only do (0,0) and (NEAREST_NEAREST)
+      if (num_comp_modes_ref == 2 && comp_index % 3 == 2) continue;
+      ref_frame = LAST_FRAME;
+      ref_frame2 = GOLDEN_FRAME;
+      if (comp_index >= 0 && comp_index < 3) {
+        // comp_index = 0,1,2 for (0/NEAREST/NEAR) for GOLDEN_LAST.
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 ||
+            !(cpi->ref_frame_flags & AOM_GOLD_FLAG))
+          continue;
+      } else if (comp_index >= 3 && comp_index < 6) {
+        // comp_index = 3,4,5 for (0/NEAREST/NEAR) for LAST2_LAST.
+        ref_frame2 = LAST2_FRAME;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
+            !(cpi->ref_frame_flags & AOM_LAST2_FLAG))
+          continue;
+      } else if (comp_index >= 6 && comp_index < 9) {
+        // comp_index = 6,7,8 for (0/NEAREST/NEAR) for ALTREF_LAST.
+        ref_frame2 = ALTREF_FRAME;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 ||
+            !(cpi->ref_frame_flags & AOM_ALT_FLAG))
+          continue;
+      }
+      set_compound_mode(x, comp_index, ref_frame, ref_frame2, ref_mv_idx,
+                        frame_mv, &this_mode);
+      if (this_mode != GLOBAL_GLOBALMV &&
+          frame_mv[this_mode][ref_frame].as_int == 0 &&
+          frame_mv[this_mode][ref_frame2].as_int == 0)
+        continue;
+      comp_pred = 1;
+    } else {
+      this_mode = ref_mode_set[idx].pred_mode;
+      ref_frame = ref_mode_set[idx].ref_frame;
+      ref_frame2 = NONE_FRAME;
+    }
 
 #if COLLECT_PICK_MODE_STAT
     aom_usec_timer_start(&ms_stat.timer1);
@@ -1702,107 +2771,109 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif
     mi->mode = this_mode;
     mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = ref_frame2;
 
-    if (ref_frame > usable_ref_frame) continue;
-    if (skip_ref_find_pred[ref_frame]) continue;
+    if (!use_ref_frame_mask[ref_frame]) continue;
 
-    // Skip non-zero motion for SVC if skip_nonzeromv_ref is set.
-    if (cpi->use_svc && frame_mv[this_mode][ref_frame].as_int != 0) {
-      if (ref_frame == LAST_FRAME && cpi->svc.skip_nonzeromv_last)
-        continue;
-      else if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_nonzeromv_gf)
+    if (x->force_zeromv_skip &&
+        (this_mode != GLOBALMV || ref_frame != LAST_FRAME))
+      continue;
+
+    force_mv_inter_layer = 0;
+    if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+        ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+         (ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf))) {
+      // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+      // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+      // Skip newmv and filter search.
+      force_mv_inter_layer = 1;
+      if (this_mode == NEWMV) {
+        frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
+        frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
+      } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
+                 frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
         continue;
+      }
     }
 
     // If the segment reference frame feature is enabled then do nothing if the
     // current ref frame is not allowed.
-    if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
-        get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
 
-    if (ref_frame != LAST_FRAME && cpi->oxcf.rc_mode == AOM_CBR &&
-        sse_zeromv_norm < thresh_skip_golden && this_mode == NEWMV)
-      continue;
-
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
-
-    if (!(inter_mode_mask[bsize] & (1 << this_mode))) continue;
-
-    // Skip testing non-LAST if this flag is set.
-    if (x->nonrd_prune_ref_frame_search) {
-      if (x->nonrd_prune_ref_frame_search > 1 && ref_frame != LAST_FRAME &&
-          (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && this_mode == NEWMV)))
+    // For screen content:
+    if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+      // If source_sad is computed: skip non-zero motion
+      // check for stationary (super)blocks. Otherwise if superblock
+      // has motion skip the modes with zero motion for flat blocks.
+      if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+        if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
+             x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+            (frame_mv[this_mode][ref_frame].as_int == 0 &&
+             x->content_state_sb.source_sad_nonrd != kZeroSad &&
+             x->source_variance == 0))
+          continue;
+      }
+      // Skip NEWMV search on scene cuts for flat blocks.
+      if (cpi->rc.high_source_sad && this_mode == NEWMV &&
+          (x->source_variance < 100))
         continue;
-
-      if (ref_frame != LAST_FRAME && this_mode == NEARMV) continue;
     }
 
-    // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
-    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
-    // later.
-    if (!force_mv_inter_layer && force_skip_low_temp_var &&
-        ref_frame != LAST_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) {
+    if (skip_mode_by_bsize_and_ref_frame(
+            this_mode, ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+            sse_zeromv_norm, cpi->sf.rt_sf.nonrd_agressive_skip))
       continue;
-    }
 
-#if 0
-        if (x->content_state_sb != kVeryHighSad &&
-        (cpi->sf.short_circuit_low_temp_var >= 2 ||
-        (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64))
-        && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode ==
-            NEWMV)  {
-          continue;
-        }
-#endif
+    if (skip_mode_by_low_temp(this_mode, ref_frame, bsize, x->content_state_sb,
+                              frame_mv[this_mode][ref_frame],
+                              force_skip_low_temp_var))
+      continue;
 
     // Disable this drop out case if the ref frame segment level feature is
     // enabled for this segment. This is to prevent the possibility that we
     // end up unable to pick any mode.
-    if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
       if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
           x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) {
-        if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred)
-          ref_frame_skip_mask |= (1 << ref_frame);
+        if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue;
       }
-      if (ref_frame_skip_mask & (1 << ref_frame)) continue;
+    }
+    // Check for skipping NEARMV based on pred_mv_sad.
+    if (this_mode == NEARMV && x->pred_mv1_sad[ref_frame] != INT_MAX &&
+        x->pred_mv1_sad[ref_frame] > (x->pred_mv0_sad[ref_frame] << 1))
+      continue;
+
+    if (!comp_pred) {
+      if (skip_mode_by_threshold(
+              this_mode, ref_frame, frame_mv[this_mode][ref_frame],
+              cpi->rc.frames_since_golden, rd_threshes, rd_thresh_freq_fact,
+              best_rdc.rdcost, best_pickmode.best_mode_skip_txfm,
+              (cpi->sf.rt_sf.nonrd_agressive_skip ? 1 : 0)))
+        continue;
     }
 
     // Select prediction reference frames.
     for (int i = 0; i < MAX_MB_PLANE; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[ref_frame2][i];
     }
 
     mi->ref_frame[0] = ref_frame;
-    mi->ref_frame[1] = second_ref_frame;
-    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-
-    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
-    mode_rd_thresh = best_pickmode.best_mode_skip_txfm
-                         ? rd_threshes[mode_index] << 1
-                         : rd_threshes[mode_index];
-
-    // Increase mode_rd_thresh value for non-LAST for improved encoding
-    // speed
-    if (ref_frame != LAST_FRAME) {
-      mode_rd_thresh = mode_rd_thresh << 1;
-      if (ref_frame == GOLDEN_FRAME && cpi->rc.frames_since_golden > 4)
-        mode_rd_thresh = mode_rd_thresh << 1;
-    }
-
-    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                            rd_thresh_freq_fact[mode_index]))
-      if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
+    mi->ref_frame[1] = ref_frame2;
+    set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
 
     if (this_mode == NEWMV && !force_mv_inter_layer) {
       if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
-                        mi_row, mi_col, best_pred_sad, &rate_mv, &best_rdc))
+                        mi_row, mi_col, &rate_mv, &best_rdc))
         continue;
     }
 
     for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
          inter_mv_mode++) {
-      if (inter_mv_mode == this_mode || comp_pred) continue;
+      if (inter_mv_mode == this_mode) continue;
       if (mode_checked[inter_mv_mode][ref_frame] &&
           frame_mv[this_mode][ref_frame].as_int ==
               frame_mv[inter_mv_mode][ref_frame].as_int) {
@@ -1811,11 +2882,13 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       }
     }
 
-    if (skip_this_mv) continue;
+    if (skip_this_mv && !comp_pred) continue;
 
     mi->mode = this_mode;
     mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
     mi->mv[1].as_int = 0;
+    if (comp_pred) mi->mv[1].as_int = frame_mv[this_mode][ref_frame2].as_int;
+
     if (reuse_inter_pred) {
       if (!this_mode_pred) {
         this_mode_pred = &tmp[3];
@@ -1828,103 +2901,178 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 #if COLLECT_PICK_MODE_STAT
     ms_stat.num_nonskipped_searches[bsize][this_mode]++;
 #endif
-    if (enable_filter_search &&
+
+    if (idx == 0 && !skip_pred_mv) {
+      // Set color sensitivity on first tested mode only.
+      // Use y-sad already computed in find_predictors: take the sad with motion
+      // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+      // is for zeromv.
+      int y_sad = x->pred_mv0_sad[LAST_FRAME];
+      if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+          (abs(frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+           abs(frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+              (abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+               abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+        y_sad = x->pred_mv1_sad[LAST_FRAME];
+      set_color_sensitivity(cpi, x, xd, bsize, y_sad, x->source_variance);
+    }
+    mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+    if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+      calc_num_proj_ref(cpi, x, mi);
+    }
+#endif
+
+    if (enable_filter_search && !force_mv_inter_layer && !comp_pred &&
         ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) &&
         (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) {
       search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
-                        reuse_inter_pred, &this_mode_pred, &var_y, &sse_y,
-                        &this_early_term, use_model_yrd_large, &this_sse);
+                        reuse_inter_pred, &this_mode_pred, &this_early_term,
+                        use_model_yrd_large, best_pickmode.best_sse);
+#if !CONFIG_REALTIME_ONLY
+    } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
+               this_mode == NEWMV) {
+      search_motion_mode(cpi, x, &this_rdc, mi_row, mi_col, bsize,
+                         &this_early_term, use_model_yrd_large, &rate_mv,
+                         best_pickmode.best_sse);
+      if (this_mode == NEWMV) {
+        frame_mv[this_mode][ref_frame] = mi->mv[0];
+      }
+#endif
     } else {
       mi->interp_filters =
           (filter_ref == SWITCHABLE)
               ? av1_broadcast_interp_filter(default_interp_filter)
               : av1_broadcast_interp_filter(filter_ref);
-      av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      if (force_mv_inter_layer)
+        mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // If it is sub-pel motion and best filter was not selected in
+      // search_filter_ref() for all blocks, then check top and left values and
+      // force smooth if both were selected to be smooth.
+      if (cpi->sf.interp_sf.cb_pred_filter_search &&
+          (mi->mv[0].as_mv.row & 0x07 || mi->mv[0].as_mv.col & 0x07)) {
+        if (xd->left_mbmi && xd->above_mbmi) {
+          if ((xd->left_mbmi->interp_filters.as_filters.x_filter ==
+                   EIGHTTAP_SMOOTH &&
+               xd->above_mbmi->interp_filters.as_filters.x_filter ==
+                   EIGHTTAP_SMOOTH))
+            mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_SMOOTH);
+        }
+      }
+      if (!comp_pred)
+        av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      else
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      0);
+
       if (use_model_yrd_large) {
-        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, NULL, NULL,
-                                  &var_y, &sse_y, &this_early_term,
-                                  use_modeled_non_rd_cost);
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
+                                  &this_early_term, use_modeled_non_rd_cost,
+                                  best_pickmode.best_sse);
       } else {
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &this_rdc.skip, NULL, &var_y, &sse_y,
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc,
                           use_modeled_non_rd_cost);
       }
     }
 
     if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) {
       sse_zeromv_norm =
-          sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+          (unsigned int)(this_rdc.sse >> (b_width_log2_lookup[bsize] +
+                                          b_height_log2_lookup[bsize]));
     }
 
-    const int skip_ctx = av1_get_skip_context(xd);
-    const int skip_cost = x->skip_cost[skip_ctx][1];
-    const int no_skip_cost = x->skip_cost[skip_ctx][0];
-    if (!this_early_term) {
+    if (cpi->sf.rt_sf.sse_early_term_inter_search &&
+        early_term_inter_search_with_sse(
+            cpi->sf.rt_sf.sse_early_term_inter_search, bsize, this_rdc.sse,
+            best_pickmode.best_sse, this_mode)) {
+      if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+      continue;
+    }
+
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+    const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+    const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+    const int64_t sse_y = this_rdc.sse;
+    if (this_early_term) {
+      this_rdc.skip_txfm = 1;
+      this_rdc.rate = skip_txfm_cost;
+      this_rdc.dist = this_rdc.sse << 4;
+    } else {
       if (use_modeled_non_rd_cost) {
-        if (this_rdc.skip) {
-          this_rdc.rate = skip_cost;
+        if (this_rdc.skip_txfm) {
+          this_rdc.rate = skip_txfm_cost;
         } else {
-          this_rdc.rate += no_skip_cost;
+          this_rdc.rate += no_skip_txfm_cost;
         }
       } else {
-        this_sse = (int64_t)sse_y;
-        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, &this_sse,
-                  bsize, mi->tx_size);
-        if (this_rdc.skip) {
-          this_rdc.rate = skip_cost;
-        } else {
-          if (RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
-              RDCOST(x->rdmult, 0,
-                     this_sse)) {  // this_sse already multiplied by 16 in
-                                   // block_yrd
-            this_rdc.skip = 1;
-            this_rdc.rate = skip_cost;
-            this_rdc.dist = this_sse;
-          } else {
-            this_rdc.rate += no_skip_cost;
+        av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
+                      mi->tx_size, DCT_DCT, 1);
+        if (this_rdc.skip_txfm ||
+            RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
+                RDCOST(x->rdmult, 0, this_rdc.sse)) {
+          if (!this_rdc.skip_txfm) {
+            // Need to store "real" rdc for possible furure use if UV rdc
+            // disallows tx skip
+            nonskip_rdc = this_rdc;
+            nonskip_rdc.rate += no_skip_txfm_cost;
           }
+          this_rdc.rate = skip_txfm_cost;
+          this_rdc.skip_txfm = 1;
+          this_rdc.dist = this_rdc.sse;
+        } else {
+          this_rdc.rate += no_skip_txfm_cost;
         }
       }
-    } else {
-      this_rdc.skip = 1;
-      this_rdc.rate = skip_cost;
-      this_rdc.dist = sse_y << 4;
-    }
-
-    if (!this_early_term &&
-        (x->color_sensitivity[0] || x->color_sensitivity[1])) {
-      RD_STATS rdc_uv;
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(
-          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-      if (x->color_sensitivity[0]) {
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                      AOM_PLANE_U, AOM_PLANE_U);
-      }
-      if (x->color_sensitivity[1]) {
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                      AOM_PLANE_V, AOM_PLANE_V);
+      if ((x->color_sensitivity[0] || x->color_sensitivity[1])) {
+        RD_STATS rdc_uv;
+        const BLOCK_SIZE uv_bsize = get_plane_block_size(
+            bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+        if (x->color_sensitivity[0]) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                        AOM_PLANE_U, AOM_PLANE_U);
+        }
+        if (x->color_sensitivity[1]) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                        AOM_PLANE_V, AOM_PLANE_V);
+        }
+        model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &this_rdc.sse, 1, 2);
+        // Restore Y rdc if UV rdc disallows txfm skip
+        if (this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+            nonskip_rdc.rate != INT_MAX)
+          this_rdc = nonskip_rdc;
+        this_rdc.rate += rdc_uv.rate;
+        this_rdc.dist += rdc_uv.dist;
+        this_rdc.skip_txfm = this_rdc.skip_txfm && rdc_uv.skip_txfm;
       }
-      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
-      this_rdc.rate += rdc_uv.rate;
-      this_rdc.dist += rdc_uv.dist;
-      this_rdc.skip = this_rdc.skip && rdc_uv.skip;
     }
 
     // TODO(kyslov) account for UV prediction cost
     this_rdc.rate += rate_mv;
     const int16_t mode_ctx =
         av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
-    this_rdc.rate += cost_mv_ref(x, this_mode, mode_ctx);
+    this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
 
     this_rdc.rate += ref_costs_single[ref_frame];
 
     this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-    if (cpi->oxcf.rc_mode == AOM_CBR) {
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR && !comp_pred) {
       newmv_diff_bias(xd, this_mode, &this_rdc, bsize,
                       frame_mv[this_mode][ref_frame].as_mv.row,
                       frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed,
-                      x->source_variance);
+                      x->source_variance, x->content_state_sb);
     }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
+        cpi->denoiser.denoising_level > kDenLowLow) {
+      av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+      // Keep track of zero_last cost.
+      if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+        zero_last_cost_orig = this_rdc.rdcost;
+    }
+#else
+    (void)sse_y;
+#endif
 
     mode_checked[this_mode][ref_frame] = 1;
 #if COLLECT_PICK_MODE_STAT
@@ -1935,12 +3083,30 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     if (this_rdc.rdcost < best_rdc.rdcost) {
       best_rdc = this_rdc;
       best_early_term = this_early_term;
+      best_pickmode.best_sse = sse_y;
       best_pickmode.best_mode = this_mode;
+      best_pickmode.best_motion_mode = mi->motion_mode;
+      best_pickmode.wm_params = mi->wm_params;
+      best_pickmode.num_proj_ref = mi->num_proj_ref;
       best_pickmode.best_pred_filter = mi->interp_filters;
       best_pickmode.best_tx_size = mi->tx_size;
       best_pickmode.best_ref_frame = ref_frame;
-      best_pickmode.best_mode_skip_txfm = this_rdc.skip;
-      best_pickmode.best_second_ref_frame = second_ref_frame;
+      best_pickmode.best_second_ref_frame = ref_frame2;
+      best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
+      best_pickmode.best_mode_initial_skip_flag =
+          (nonskip_rdc.rate == INT_MAX && this_rdc.skip_txfm);
+      if (!best_pickmode.best_mode_skip_txfm && !use_modeled_non_rd_cost) {
+        memcpy(best_pickmode.blk_skip, txfm_info->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
+      }
+
+      // This is needed for the compound modes.
+      frame_mv_best[this_mode][ref_frame].as_int =
+          frame_mv[this_mode][ref_frame].as_int;
+      if (ref_frame2 > NONE_FRAME)
+        frame_mv_best[this_mode][ref_frame2].as_int =
+            frame_mv[this_mode][ref_frame2].as_int;
+
       if (reuse_inter_pred) {
         free_pred_buffer(best_pickmode.best_pred);
         best_pickmode.best_pred = this_mode_pred;
@@ -1948,159 +3114,121 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     } else {
       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
     }
-    if (best_early_term && idx > 0) {
-      x->force_skip = 1;
+    if (best_early_term && (idx > 0 || cpi->sf.rt_sf.nonrd_agressive_skip)) {
+      txfm_info->skip_txfm = 1;
       break;
     }
   }
 
   mi->mode = best_pickmode.best_mode;
+  mi->motion_mode = best_pickmode.best_motion_mode;
+  mi->wm_params = best_pickmode.wm_params;
+  mi->num_proj_ref = best_pickmode.num_proj_ref;
   mi->interp_filters = best_pickmode.best_pred_filter;
   mi->tx_size = best_pickmode.best_tx_size;
   memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
   mi->mv[0].as_int =
-      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
-  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
-  x->force_skip = best_rdc.skip;
-
+      frame_mv_best[best_pickmode.best_mode][best_pickmode.best_ref_frame]
+          .as_int;
+  mi->mv[1].as_int = 0;
+  if (best_pickmode.best_second_ref_frame > INTRA_FRAME) {
+    mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+    mi->mv[1].as_int = frame_mv_best[best_pickmode.best_mode]
+                                    [best_pickmode.best_second_ref_frame]
+                                        .as_int;
+  }
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
   mi->angle_delta[PLANE_TYPE_Y] = 0;
   mi->angle_delta[PLANE_TYPE_UV] = 0;
   mi->filter_intra_mode_info.use_filter_intra = 0;
 
-  uint32_t spatial_var_thresh = 50;
-  int motion_thresh = 32;
-  // Adjust thresholds to make intra mode likely tested if the other
-  // references (golden, alt) are skipped/not checked.
-  if (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
-      cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0) {
-    spatial_var_thresh = 150;
-    motion_thresh = 0;
-  }
-  int do_early_exit_rdthresh = 1;
-  // Some adjustments to checking intra mode based on source variance.
-  if (x->source_variance < spatial_var_thresh) {
-    // If the best inter mode is large motion or non-LAST ref reduce intra cost
-    // penalty, so intra mode is more likely tested.
-    if (best_pickmode.best_ref_frame != LAST_FRAME ||
-        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
-        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
-      intra_cost_penalty = intra_cost_penalty >> 2;
-      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
-      do_early_exit_rdthresh = 0;
-    }
-    // For big blocks worth checking intra (since only DC will be checked),
-    // even if best_early_term is set.
-    if (bsize >= BLOCK_32X32) best_early_term = 0;
-  }
-
-  if (best_rdc.rdcost == INT64_MAX ||
-      (perform_intra_pred && !best_early_term &&
-       best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.part_sf.max_intra_bsize)) {
-    int64_t this_sse = INT64_MAX;
-    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
-    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
-    TX_SIZE intra_tx_size =
-        AOMMIN(AOMMIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
-               TX_16X16);
-
-    if (reuse_inter_pred && best_pred != NULL) {
-      if (best_pred->data == orig_dst.buf) {
-        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
-        aom_convolve_copy(best_pred->data, best_pred->stride,
-                          this_mode_pred->data, this_mode_pred->stride, 0, 0, 0,
-                          0, bw, bh);
-        best_pickmode.best_pred = this_mode_pred;
+  if (!x->force_zeromv_skip)
+    estimate_intra_mode(cpi, x, bsize, use_modeled_non_rd_cost, best_early_term,
+                        ref_costs_single[INTRA_FRAME], reuse_inter_pred,
+                        &orig_dst, tmp, &this_mode_pred, &best_rdc,
+                        &best_pickmode, ctx);
+
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      !x->force_zeromv_skip && is_inter_mode(best_pickmode.best_mode) &&
+      (!cpi->sf.rt_sf.prune_idtx_nonrd ||
+       (cpi->sf.rt_sf.prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
+        best_pickmode.best_mode_skip_txfm != 1 && x->source_variance > 200))) {
+    RD_STATS idtx_rdc;
+    av1_init_rd_stats(&idtx_rdc);
+    int is_skippable;
+    this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+    pd->dst.buf = this_mode_pred->data;
+    pd->dst.stride = bw;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+    av1_block_yrd(cpi, x, mi_row, mi_col, &idtx_rdc, &is_skippable, bsize,
+                  mi->tx_size, IDTX, 1);
+    int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+    if (idx_rdcost < best_rdc.rdcost) {
+      best_pickmode.tx_type = IDTX;
+      best_rdc.rdcost = idx_rdcost;
+      best_pickmode.best_mode_skip_txfm = idtx_rdc.skip_txfm;
+      if (!idtx_rdc.skip_txfm) {
+        memcpy(best_pickmode.blk_skip, txfm_info->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
       }
+      xd->tx_type_map[0] = best_pickmode.tx_type;
+      memset(ctx->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
+      memset(xd->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
     }
     pd->dst = orig_dst;
+  }
 
-    for (int i = 0; i < 4; ++i) {
-      const PREDICTION_MODE this_mode = intra_mode_list[i];
-      const THR_MODES mode_index =
-          mode_idx[INTRA_FRAME][mode_offset(this_mode)];
-      const int mode_rd_thresh = rd_threshes[mode_index];
+  int try_palette =
+      cpi->oxcf.tool_cfg.enable_palette &&
+      av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                        mi->bsize);
+  try_palette = try_palette && is_mode_intra(best_pickmode.best_mode) &&
+                x->source_variance > 0 && !x->force_zeromv_skip &&
+                (cpi->rc.high_source_sad || x->source_variance > 500);
 
-      // Only check DC for blocks >= 32X32.
-      if (this_mode > 0 && bsize >= BLOCK_32X32) continue;
+  if (try_palette) {
+    const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
 
-      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                              rd_thresh_freq_fact[mode_index]) &&
-          (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
-        continue;
-      }
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(
-          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
-
-      mi->mode = this_mode;
-      mi->ref_frame[0] = INTRA_FRAME;
-      mi->ref_frame[1] = NONE_FRAME;
-
-      this_rdc.dist = this_rdc.rate = 0;
-      args.mode = this_mode;
-      args.skippable = 1;
-      args.rdc = &this_rdc;
-      mi->tx_size = intra_tx_size;
-      compute_intra_yprediction(cm, this_mode, bsize, x, xd);
-      // Look into selecting tx_size here, based on prediction residual.
-      if (use_modeled_non_rd_cost)
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &this_rdc.skip, NULL, &var_y, &sse_y, 1);
-      else
-        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, &this_sse,
-                  bsize, mi->tx_size);
-      // TODO(kyslov@) Need to account for skippable
-      if (x->color_sensitivity[0]) {
-        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
-                                               estimate_block_intra, &args);
-      }
-      if (x->color_sensitivity[1]) {
-        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
-                                               estimate_block_intra, &args);
-      }
-
-      int mode_cost = 0;
-      if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
-        mode_cost += x->angle_delta_cost[this_mode - V_PRED]
-                                        [MAX_ANGLE_DELTA +
-                                         mi->angle_delta[PLANE_TYPE_Y]];
-      }
-      if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
-        mode_cost += x->filter_intra_cost[bsize][0];
-      }
-      this_rdc.rate += ref_costs_single[INTRA_FRAME];
-      this_rdc.rate += intra_cost_penalty;
-      this_rdc.rate += mode_cost;
-      this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-
-      if (this_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = this_rdc;
-        best_pickmode.best_mode = this_mode;
-        best_pickmode.best_intra_tx_size = mi->tx_size;
-        best_pickmode.best_ref_frame = INTRA_FRAME;
-        best_pickmode.best_second_ref_frame = NONE_FRAME;
-        mi->uv_mode = this_mode;
-        mi->mv[0].as_int = INVALID_MV;
-        mi->mv[1].as_int = INVALID_MV;
+    av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
+                                 &this_rdc, best_rdc.rdcost);
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_pickmode.pmi = mi->palette_mode_info;
+      best_pickmode.best_mode = DC_PRED;
+      mi->mv[0].as_int = 0;
+      best_rdc.rate = this_rdc.rate;
+      best_rdc.dist = this_rdc.dist;
+      best_rdc.rdcost = this_rdc.rdcost;
+      best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
+      if (!this_rdc.skip_txfm) {
+        memcpy(ctx->blk_skip, txfm_info->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
       }
-    }
-
-    // Reset mb_mode_info to the best inter mode.
-    if (best_pickmode.best_ref_frame != INTRA_FRAME) {
-      mi->tx_size = best_pickmode.best_tx_size;
-    } else {
-      mi->tx_size = best_pickmode.best_intra_tx_size;
+      if (xd->tx_type_map[0] != DCT_DCT)
+        av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     }
   }
 
   pd->dst = orig_dst;
+  if (try_palette) mi->palette_mode_info = best_pickmode.pmi;
   mi->mode = best_pickmode.best_mode;
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
   mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  txfm_info->skip_txfm = best_pickmode.best_mode_skip_txfm;
+  if (!txfm_info->skip_txfm) {
+    // For inter modes: copy blk_skip from best_pickmode, which is
+    // defined for 8x8 blocks. If palette or intra mode was selected
+    // as best then blk_skip is already copied into the ctx.
+    if (best_pickmode.best_mode >= INTRA_MODE_END)
+      memcpy(ctx->blk_skip, best_pickmode.blk_skip,
+             sizeof(best_pickmode.blk_skip[0]) * num_8x8_blocks);
+  }
+  if (has_second_ref(mi)) {
+    mi->comp_group_idx = 0;
+    mi->compound_idx = 1;
+    mi->interinter_comp.type = COMPOUND_AVERAGE;
+  }
 
   if (!is_inter_block(mi)) {
     mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
@@ -2110,10 +3238,29 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
       aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                        pd->dst.stride, 0, 0, 0, 0, bw, bh);
+                        pd->dst.stride, bw, bh);
     }
   }
-  if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
+      denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+      cpi->denoiser.reset == 0) {
+    AV1_DENOISER_DECISION decision = COPY_BLOCK;
+    ctx->sb_skip_denoising = 0;
+    av1_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_costs_single,
+                                frame_mv, reuse_inter_pred, &best_pickmode);
+    av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+                         gf_temporal_ref);
+    if (denoise_recheck_zeromv)
+      recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den,
+                                     yv12_mb, &best_rdc, &best_pickmode, bsize,
+                                     mi_row, mi_col);
+    best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
+  }
+#endif
+
+  if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
     THR_MODES best_mode_idx =
         mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
     if (best_pickmode.best_ref_frame == INTRA_FRAME) {
@@ -2124,13 +3271,10 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                 intra_mode_list[i]);
       }
     } else {
-      for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
-        PREDICTION_MODE this_mode;
-        if (best_pickmode.best_ref_frame != ref_frame) continue;
-        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-          update_thresh_freq_fact(cpi, x, bsize, ref_frame, best_mode_idx,
-                                  this_mode);
-        }
+      PREDICTION_MODE this_mode;
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        update_thresh_freq_fact(cpi, x, bsize, best_pickmode.best_ref_frame,
+                                best_mode_idx, this_mode);
       }
     }
   }
diff --git a/media/libaom/src/av1/encoder/optical_flow.c b/media/libaom/src/av1/encoder/optical_flow.c
new file mode 100644
index 0000000000..dc168e7aee
--- /dev/null
+++ b/media/libaom/src/av1/encoder/optical_flow.c
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params) {
+  opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+  opfl_params->warping_steps = OPFL_WARPING_STEPS;
+  opfl_params->lk_params = NULL;
+}
+
+void av1_init_lk_params(LK_PARAMS *lk_params) {
+  lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Helper function to determine whether optical flow method is sparse.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+  return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0;
+}
+
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+                                  const YV12_BUFFER_CONFIG *ref_frame,
+                                  const double x_coord, const double y_coord,
+                                  const int window_size, const int bit_depth,
+                                  double *ix, double *iy, double *it,
+                                  LOCALMV *mv);
+
+// coefficients for bilinear interpolation on unit square
+static int pixel_interp(const double x, const double y, const double b00,
+                        const double b01, const double b10, const double b11) {
+  const int xint = (int)x;
+  const int yint = (int)y;
+  const double xdec = x - xint;
+  const double ydec = y - yint;
+  const double a = (1 - xdec) * (1 - ydec);
+  const double b = xdec * (1 - ydec);
+  const double c = (1 - xdec) * ydec;
+  const double d = xdec * ydec;
+  // if x, y are already integers, this results to b00
+  int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11);
+  return interp;
+}
+
+// Scharr filter to compute spatial gradient
+static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+                             const int y_coord, const int direction,
+                             double *derivative) {
+  double *filter;
+  // Scharr filters
+  double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+  double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+  if (direction == 0) {  // x direction
+    filter = gx;
+  } else {  // y direction
+    filter = gy;
+  }
+  int idx = 0;
+  double d = 0;
+  for (int yy = -1; yy <= 1; yy++) {
+    for (int xx = -1; xx <= 1; xx++) {
+      d += filter[idx] *
+           frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)];
+      idx++;
+    }
+  }
+  // normalization scaling factor for scharr
+  *derivative = d / 32.0;
+}
+
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in original image may be at subpixel locations.
+static void gradient_interp(double *fullpel_deriv, const double x_coord,
+                            const double y_coord, const int w, const int h,
+                            double *derivative) {
+  const int xint = (int)x_coord;
+  const int yint = (int)y_coord;
+  double interp;
+  if (xint + 1 > w - 1 || yint + 1 > h - 1) {
+    interp = fullpel_deriv[yint * w + xint];
+  } else {
+    interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint],
+                          fullpel_deriv[yint * w + (xint + 1)],
+                          fullpel_deriv[(yint + 1) * w + xint],
+                          fullpel_deriv[(yint + 1) * w + (xint + 1)]);
+  }
+
+  *derivative = interp;
+}
+
+static void temporal_gradient(const YV12_BUFFER_CONFIG *frame,
+                              const YV12_BUFFER_CONFIG *frame2,
+                              const double x_coord, const double y_coord,
+                              const int bit_depth, double *derivative,
+                              LOCALMV *mv) {
+  const int w = 2;
+  const int h = 2;
+  uint8_t pred1[4];
+  uint8_t pred2[4];
+
+  const int y = (int)y_coord;
+  const int x = (int)x_coord;
+  const double ydec = y_coord - y;
+  const double xdec = x_coord - x;
+  const int is_intrabc = 0;  // Is intra-copied?
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame2);
+  const int subsampling_x = 0, subsampling_y = 0;  // for y-buffer
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+  const int plane = 0;  // y-plane
+  const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width,
+                                   frame2->y_crop_height, frame2->y_stride };
+  struct scale_factors scale;
+  av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width,
+                                    frame->y_crop_height, frame->y_crop_width,
+                                    frame->y_crop_height);
+  InterPredParams inter_pred_params;
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf2, interp_filters);
+  inter_pred_params.interp_filter_params[0] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+  inter_pred_params.interp_filter_params[1] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8),
+               .col = (int16_t)round((mv->col + ydec) * 8) };
+  av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params);
+  const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width,
+                                   frame->y_crop_height, frame->y_stride };
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf1, interp_filters);
+  inter_pred_params.interp_filter_params[0] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+  inter_pred_params.interp_filter_params[1] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  MV zeroMV = { .row = (int16_t)round(xdec * 8),
+                .col = (int16_t)round(ydec * 8) };
+  av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params);
+
+  *derivative = pred2[0] - pred1[0];
+}
+
+// Numerical differentiate over window_size x window_size surrounding (x,y)
+// location. Alters ix, iy, it to contain numerical partial derivatives
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+                                  const YV12_BUFFER_CONFIG *ref_frame,
+                                  const double x_coord, const double y_coord,
+                                  const int window_size, const int bit_depth,
+                                  double *ix, double *iy, double *it,
+                                  LOCALMV *mv) {
+  const double left = x_coord - window_size / 2.0;
+  const double top = y_coord - window_size / 2.0;
+  // gradient operators need pixel before and after (start at 1)
+  const double x_start = AOMMAX(1, left);
+  const double y_start = AOMMAX(1, top);
+  const int frame_height = frame->y_crop_height;
+  const int frame_width = frame->y_crop_width;
+  double deriv_x;
+  double deriv_y;
+  double deriv_t;
+
+  const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2);
+  const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2);
+  const int xs = (int)AOMMAX(1, x_start - 1);
+  const int ys = (int)AOMMAX(1, y_start - 1);
+  const int xe = (int)AOMMIN(x_end + 2, frame_width - 2);
+  const int ye = (int)AOMMIN(y_end + 2, frame_height - 2);
+  // with normalization, gradients may be double values
+  double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x));
+  double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y));
+  if (!fullpel_dx || !fullpel_dy) {
+    aom_free(fullpel_dx);
+    aom_free(fullpel_dy);
+    return;
+  }
+
+  // TODO(any): This could be more efficient in the case that x_coord
+  // and y_coord are integers.. but it may look more messy.
+
+  // calculate spatial gradients at full pixel locations
+  for (int j = ys; j < ye; j++) {
+    for (int i = xs; i < xe; i++) {
+      spatial_gradient(frame, i, j, 0, &deriv_x);
+      spatial_gradient(frame, i, j, 1, &deriv_y);
+      int idx = (j - ys) * (xe - xs) + (i - xs);
+      fullpel_dx[idx] = deriv_x;
+      fullpel_dy[idx] = deriv_y;
+    }
+  }
+  // compute numerical differentiation for every pixel in window
+  // (this potentially includes subpixels)
+  for (double j = y_start; j < y_end; j++) {
+    for (double i = x_start; i < x_end; i++) {
+      temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv);
+      gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x);
+      gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y);
+      int idx = (int)(j - top) * window_size + (int)(i - left);
+      ix[idx] = deriv_x;
+      iy[idx] = deriv_y;
+      it[idx] = deriv_t;
+    }
+  }
+  // TODO(any): to avoid setting deriv arrays to zero for every iteration,
+  // could instead pass these two values back through function call
+  // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left);
+  // int width = window_size - ((int)(x_start - left) + (int)(left + window_size
+  // - x_end));
+
+  aom_free(fullpel_dx);
+  aom_free(fullpel_dy);
+}
+
+// To compute eigenvalues of 2x2 matrix: Solve for lambda where
+// Determinant(matrix - lambda*identity) == 0
+static void eigenvalues_2x2(const double *matrix, double *eig) {
+  const double a = 1;
+  const double b = -1 * matrix[0] - matrix[3];
+  const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3];
+  // quadratic formula
+  const double discriminant = b * b - 4 * a * c;
+  eig[0] = (-b - sqrt(discriminant)) / (2.0 * a);
+  eig[1] = (-b + sqrt(discriminant)) / (2.0 * a);
+  // double check that eigenvalues are ordered by magnitude
+  if (fabs(eig[0]) > fabs(eig[1])) {
+    double tmp = eig[0];
+    eig[0] = eig[1];
+    eig[1] = tmp;
+  }
+}
+
+// Shi-Tomasi corner detection criteria
+static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter,
+                           const YV12_BUFFER_CONFIG *ref_frame, const int x,
+                           const int y, double *i_x, double *i_y, double *i_t,
+                           const int n, const int bit_depth) {
+  double eig[2];
+  LOCALMV mv = { .row = 0, .col = 0 };
+  // TODO(any): technically, ref_frame and i_t are not used by corner score
+  // so these could be replaced by dummy variables,
+  // or change this to spatial gradient function over window only
+  gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+                        i_y, i_t, &mv);
+  double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+  multiply_mat(i_x, i_x, Mres1, 1, n * n, 1);
+  multiply_mat(i_x, i_y, Mres2, 1, n * n, 1);
+  multiply_mat(i_y, i_y, Mres3, 1, n * n, 1);
+  double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+  eigenvalues_2x2(M, eig);
+  return fabs(eig[0]);
+}
+
+// Finds corners in frame_to_filter
+// For less strict requirements (i.e. more corners), decrease threshold
+static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+                          const YV12_BUFFER_CONFIG *ref_frame,
+                          const int maxcorners, int *ref_corners,
+                          const int bit_depth) {
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  // TODO(any): currently if maxcorners is decreased, then it only means
+  // corners will be omited from bottom-right of image. if maxcorners
+  // is actually used, then this algorithm would need to re-iterate
+  // and choose threshold based on that
+  assert(maxcorners == frame_height * frame_width);
+  int countcorners = 0;
+  const double threshold = 0.1;
+  double score;
+  const int n = 3;
+  double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  const int fromedge = n;
+  double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+                                  fromedge, i_x, i_y, i_t, n, bit_depth);
+  // rough estimate of max corner score in image
+  for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+    for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) {
+      for (int i = 0; i < n * n; i++) {
+        i_x[i] = 0;
+        i_y[i] = 0;
+        i_t[i] = 0;
+      }
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      if (score > max_score) {
+        max_score = score;
+      }
+    }
+  }
+  // score all the points and choose corners over threshold
+  for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+    for (int y = fromedge;
+         (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) {
+      for (int i = 0; i < n * n; i++) {
+        i_x[i] = 0;
+        i_y[i] = 0;
+        i_t[i] = 0;
+      }
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      if (score > threshold * max_score) {
+        ref_corners[countcorners * 2] = x;
+        ref_corners[countcorners * 2 + 1] = y;
+        countcorners++;
+      }
+    }
+  }
+  return countcorners;
+}
+
+// weights is an nxn matrix. weights is filled with a gaussian function,
+// with independent variable: distance from the center point.
+static void gaussian(const double sigma, const int n, const int normalize,
+                     double *weights) {
+  double total_weight = 0;
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+      double weight = exp(-0.5 * pow(distance / sigma, 2));
+      weights[j * n + i] = weight;
+      total_weight += weight;
+    }
+  }
+  if (normalize == 1) {
+    for (int j = 0; j < n; j++) {
+      weights[j] = weights[j] / total_weight;
+    }
+  }
+}
+
+static double convolve(const double *filter, const int *img, const int size) {
+  double result = 0;
+  for (int i = 0; i < size; i++) {
+    result += filter[i] * img[i];
+  }
+  return result;
+}
+
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+static void reduce(uint8_t *img, int height, int width, int stride,
+                   uint8_t *reduced_img) {
+  const int new_width = width / 2;
+  const int window_size = 5;
+  const double gaussian_filter[25] = {
+    1. / 256, 1.0 / 64, 3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16,  1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64,  1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64,  1. / 256
+  };
+  // filter is 5x5 so need prev and forward 2 pixels
+  int img_section[25];
+  for (int y = 0; y < height - 1; y += 2) {
+    for (int x = 0; x < width - 1; x += 2) {
+      int i = 0;
+      for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) {
+        for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) {
+          int yvalue = yy;
+          int xvalue = xx;
+          // copied pixels outside the boundary
+          if (yvalue < 0) yvalue = 0;
+          if (xvalue < 0) xvalue = 0;
+          if (yvalue >= height) yvalue = height - 1;
+          if (xvalue >= width) xvalue = width - 1;
+          img_section[i++] = img[yvalue * stride + xvalue];
+        }
+      }
+      reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve(
+          gaussian_filter, img_section, window_size * window_size);
+    }
+  }
+}
+
+static int cmpfunc(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
+}
+static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+                       const int frame_width, LOCALMV *localmvs, MV *mvs) {
+  const int n = 5;  // window size
+  // for smoothing filter
+  const double gaussian_filter[25] = {
+    1. / 256, 1. / 64,  3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16,  1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64,  1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64,  1. / 256
+  };
+  // for median filter
+  int mvrows[25];
+  int mvcols[25];
+  if (mv_filter != MV_FILTER_NONE) {
+    for (int y = 0; y < frame_height; y++) {
+      for (int x = 0; x < frame_width; x++) {
+        int center_idx = y * frame_width + x;
+        int i = 0;
+        double filtered_row = 0;
+        double filtered_col = 0;
+        for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+          for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+            int yvalue = yy;
+            int xvalue = xx;
+            // copied pixels outside the boundary
+            if (yvalue < 0) yvalue = 0;
+            if (xvalue < 0) xvalue = 0;
+            if (yvalue >= frame_height) yvalue = frame_height - 1;
+            if (xvalue >= frame_width) xvalue = frame_width - 1;
+            int index = yvalue * frame_width + xvalue;
+            if (mv_filter == MV_FILTER_SMOOTH) {
+              filtered_row += mvs[index].row * gaussian_filter[i];
+              filtered_col += mvs[index].col * gaussian_filter[i];
+            } else if (mv_filter == MV_FILTER_MEDIAN) {
+              mvrows[i] = mvs[index].row;
+              mvcols[i] = mvs[index].col;
+            }
+            i++;
+          }
+        }
+
+        MV mv = mvs[center_idx];
+        if (mv_filter == MV_FILTER_SMOOTH) {
+          mv.row = (int16_t)filtered_row;
+          mv.col = (int16_t)filtered_col;
+        } else if (mv_filter == MV_FILTER_MEDIAN) {
+          qsort(mvrows, 25, sizeof(mv.row), cmpfunc);
+          qsort(mvcols, 25, sizeof(mv.col), cmpfunc);
+          mv.row = mvrows[25 / 2];
+          mv.col = mvcols[25 / 2];
+        }
+        LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                            .col = ((double)mv.row) / 8 };
+        localmvs[y * frame_width + x] = localmv;
+        // if mvs array is immediately updated here, then the result may
+        // propagate to other pixels.
+      }
+    }
+    for (int i = 0; i < frame_height * frame_width; i++) {
+      MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+                .col = (int16_t)round(8 * localmvs[i].col) };
+      mvs[i] = mv;
+    }
+  }
+}
+
+// Computes optical flow at a single pyramid level,
+// using Lucas-Kanade algorithm.
+// Modifies mvs array.
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame,
+                         const YV12_BUFFER_CONFIG *to_frame, const int level,
+                         const LK_PARAMS *lk_params, const int num_ref_corners,
+                         int *ref_corners, const int mv_stride,
+                         const int bit_depth, LOCALMV *mvs) {
+  assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+  const int n = lk_params->window_size;
+  // algorithm is sensitive to window size
+  double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
+  double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
+  double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+  double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+  if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
+  const int expand_multiplier = (int)pow(2, level);
+  double sigma = 0.2 * n;
+  // normalizing doesn't really affect anything since it's applied
+  // to every component of M and b
+  gaussian(sigma, n, 0, weights);
+  for (int i = 0; i < num_ref_corners; i++) {
+    const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+    const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+    int highres_x = ref_corners[i * 2];
+    int highres_y = ref_corners[i * 2 + 1];
+    int mv_idx = highres_y * (mv_stride) + highres_x;
+    LOCALMV mv_old = mvs[mv_idx];
+    mv_old.row = mv_old.row / expand_multiplier;
+    mv_old.col = mv_old.col / expand_multiplier;
+    // using this instead of memset, since it's not completely
+    // clear if zero memset works on double arrays
+    for (int j = 0; j < n * n; j++) {
+      i_x[j] = 0;
+      i_y[j] = 0;
+      i_t[j] = 0;
+    }
+    gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth,
+                          i_x, i_y, i_t, &mv_old);
+    double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+    double bres1[1] = { 0 }, bres2[1] = { 0 };
+    for (int j = 0; j < n * n; j++) {
+      Mres1[0] += weights[j] * i_x[j] * i_x[j];
+      Mres2[0] += weights[j] * i_x[j] * i_y[j];
+      Mres3[0] += weights[j] * i_y[j] * i_y[j];
+      bres1[0] += weights[j] * i_x[j] * i_t[j];
+      bres2[0] += weights[j] * i_y[j] * i_t[j];
+    }
+    double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+    double b[2] = { -1 * bres1[0], -1 * bres2[0] };
+    double eig[2] = { 1, 1 };
+    eigenvalues_2x2(M, eig);
+    double threshold = 0.1;
+    if (fabs(eig[0]) > threshold) {
+      // if M is not invertible, then displacement
+      // will default to zeros
+      double u[2] = { 0, 0 };
+      linsolve(2, M, 2, b, u);
+      int mult = 1;
+      if (level != 0)
+        mult = expand_multiplier;  // mv doubles when resolution doubles
+      LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+                     .col = (mult * (u[1] + mv_old.col)) };
+      mvs[mv_idx] = mv;
+      mvs[mv_idx] = mv;
+    }
+  }
+free_lk_buf:
+  aom_free(weights);
+  aom_free(i_t);
+  aom_free(i_x);
+  aom_free(i_y);
+}
+
+// Warp the src_frame to warper_frame according to mvs.
+// mvs point to src_frame
+static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame,
+                            const YV12_BUFFER_CONFIG *src_frame,
+                            const LOCALMV *mvs, int mv_stride) {
+  int w, h;
+  const int fw = src_frame->y_crop_width;
+  const int fh = src_frame->y_crop_height;
+  const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride;
+  const uint8_t *src_buf = src_frame->y_buffer;
+  uint8_t *warped_buf = warped_frame->y_buffer;
+  double temp;
+  for (h = 0; h < fh; h++) {
+    for (w = 0; w < fw; w++) {
+      double cord_x = (double)w + mvs[h * mv_stride + w].col;
+      double cord_y = (double)h + mvs[h * mv_stride + w].row;
+      cord_x = fclamp(cord_x, 0, (double)(fw - 1));
+      cord_y = fclamp(cord_y, 0, (double)(fh - 1));
+      const int floorx = (int)floor(cord_x);
+      const int floory = (int)floor(cord_y);
+      const double fracx = cord_x - (double)floorx;
+      const double fracy = cord_y - (double)floory;
+
+      temp = 0;
+      for (int hh = 0; hh < 2; hh++) {
+        const double weighth = hh ? (fracy) : (1 - fracy);
+        for (int ww = 0; ww < 2; ww++) {
+          const double weightw = ww ? (fracx) : (1 - fracx);
+          int y = floory + hh;
+          int x = floorx + ww;
+          y = clamp(y, 0, fh - 1);
+          x = clamp(x, 0, fw - 1);
+          temp += (double)src_buf[y * src_fs + x] * weightw * weighth;
+        }
+      }
+      warped_buf[h * warped_fs + w] = (uint8_t)round(temp);
+    }
+  }
+}
+
+// Same as warp_back_frame, but using a better interpolation filter.
+static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame,
+                                 const YV12_BUFFER_CONFIG *src_frame,
+                                 const LOCALMV *mvs, int mv_stride) {
+  int w, h;
+  const int fw = src_frame->y_crop_width;
+  const int fh = src_frame->y_crop_height;
+  const int warped_fs = warped_frame->y_stride;
+  uint8_t *warped_buf = warped_frame->y_buffer;
+  const int blk = 2;
+  uint8_t temp_blk[4];
+
+  const int is_intrabc = 0;  // Is intra-copied?
+  const int is_high_bitdepth = is_frame_high_bitdepth(src_frame);
+  const int subsampling_x = 0, subsampling_y = 0;  // for y-buffer
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP2);
+  const int plane = 0;  // y-plane
+  const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer,
+                                   src_frame->y_crop_width,
+                                   src_frame->y_crop_height,
+                                   src_frame->y_stride };
+  const int bit_depth = src_frame->bit_depth;
+  struct scale_factors scale;
+  av1_setup_scale_factors_for_frame(
+      &scale, src_frame->y_crop_width, src_frame->y_crop_height,
+      src_frame->y_crop_width, src_frame->y_crop_height);
+
+  for (h = 0; h < fh; h++) {
+    for (w = 0; w < fw; w++) {
+      InterPredParams inter_pred_params;
+      av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x,
+                            subsampling_y, bit_depth, is_high_bitdepth,
+                            is_intrabc, &scale, &ref_buf2, interp_filters);
+      inter_pred_params.interp_filter_params[0] =
+          &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+      inter_pred_params.interp_filter_params[1] =
+          &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+      inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+      MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8),
+                   .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) };
+      av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv,
+                                        &inter_pred_params);
+      warped_buf[h * warped_fs + w] = temp_blk[0];
+    }
+  }
+}
+
+#define DERIVATIVE_FILTER_LENGTH 7
+double filter[DERIVATIVE_FILTER_LENGTH] = { -1.0 / 60, 9.0 / 60,  -45.0 / 60, 0,
+                                            45.0 / 60, -9.0 / 60, 1.0 / 60 };
+
+// Get gradient of the whole frame
+static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame,
+                                const YV12_BUFFER_CONFIG *to_frame, double *ix,
+                                double *iy, double *it, int grad_stride) {
+  int w, h, k, idx;
+  const int fw = from_frame->y_crop_width;
+  const int fh = from_frame->y_crop_height;
+  const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride;
+  const uint8_t *from_buf = from_frame->y_buffer;
+  const uint8_t *to_buf = to_frame->y_buffer;
+
+  const int lh = DERIVATIVE_FILTER_LENGTH;
+  const int hleft = (lh - 1) / 2;
+
+  for (h = 0; h < fh; h++) {
+    for (w = 0; w < fw; w++) {
+      // x
+      ix[h * grad_stride + w] = 0;
+      for (k = 0; k < lh; k++) {
+        // if we want to make this block dependent, need to extend the
+        // boundaries using other initializations.
+        idx = w + k - hleft;
+        idx = clamp(idx, 0, fw - 1);
+        ix[h * grad_stride + w] += filter[k] * 0.5 *
+                                   ((double)from_buf[h * from_fs + idx] +
+                                    (double)to_buf[h * to_fs + idx]);
+      }
+      // y
+      iy[h * grad_stride + w] = 0;
+      for (k = 0; k < lh; k++) {
+        // if we want to make this block dependent, need to extend the
+        // boundaries using other initializations.
+        idx = h + k - hleft;
+        idx = clamp(idx, 0, fh - 1);
+        iy[h * grad_stride + w] += filter[k] * 0.5 *
+                                   ((double)from_buf[idx * from_fs + w] +
+                                    (double)to_buf[idx * to_fs + w]);
+      }
+      // t
+      it[h * grad_stride + w] =
+          (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w];
+    }
+  }
+}
+
+// Solve for linear equations given by the H-S method
+static void solve_horn_schunck(const double *ix, const double *iy,
+                               const double *it, int grad_stride, int width,
+                               int height, const LOCALMV *init_mvs,
+                               int init_mv_stride, LOCALMV *mvs,
+                               int mv_stride) {
+  // TODO(bohanli): May just need to allocate the buffers once per optical flow
+  // calculation
+  int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos));
+  int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos));
+  double *values = aom_calloc(width * height * 28, sizeof(*values));
+  double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec));
+  double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
+  double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
+  double *b = aom_calloc(width * height * 2, sizeof(*b));
+  if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+      !b) {
+    goto free_hs_solver_buf;
+  }
+
+  // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors
+  const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
+  const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
+
+  int h, w, checkh, checkw, k, ret;
+  const int offset = height * width;
+  SPARSE_MTX A;
+  int c = 0;
+  const double lambda = 100;
+
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col;
+      mv_init_vec[w * height + h + offset] =
+          init_mvs[h * init_mv_stride + w].row;
+    }
+  }
+
+  // get matrix A
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      int center_num_direct = 4;
+      const int center_idx = w * height + h;
+      if (w == 0 || w == width - 1) center_num_direct--;
+      if (h == 0 || h == height - 1) center_num_direct--;
+      // diagonal entry for this row from the center pixel
+      double cor_w = center_num_direct * center_num_direct + center_num_direct;
+      row_pos[c] = center_idx;
+      col_pos[c] = center_idx;
+      values[c] = lambda * cor_w;
+      c++;
+      row_pos[c] = center_idx + offset;
+      col_pos[c] = center_idx + offset;
+      values[c] = lambda * cor_w;
+      c++;
+      // other entries from direct neighbors
+      for (k = 0; k < 4; k++) {
+        checkh = h + check_locs_y[k];
+        checkw = w + check_locs_x[k];
+        if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+          continue;
+        }
+        int this_idx = checkw * height + checkh;
+        int this_num_direct = 4;
+        if (checkw == 0 || checkw == width - 1) this_num_direct--;
+        if (checkh == 0 || checkh == height - 1) this_num_direct--;
+        cor_w = -center_num_direct - this_num_direct;
+        row_pos[c] = center_idx;
+        col_pos[c] = this_idx;
+        values[c] = lambda * cor_w;
+        c++;
+        row_pos[c] = center_idx + offset;
+        col_pos[c] = this_idx + offset;
+        values[c] = lambda * cor_w;
+        c++;
+      }
+      // entries from neighbors on the diagonal corners
+      for (k = 4; k < 8; k++) {
+        checkh = h + check_locs_y[k];
+        checkw = w + check_locs_x[k];
+        if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+          continue;
+        }
+        int this_idx = checkw * height + checkh;
+        cor_w = 2;
+        row_pos[c] = center_idx;
+        col_pos[c] = this_idx;
+        values[c] = lambda * cor_w;
+        c++;
+        row_pos[c] = center_idx + offset;
+        col_pos[c] = this_idx + offset;
+        values[c] = lambda * cor_w;
+        c++;
+      }
+      // entries from neighbors with dist of 2
+      for (k = 8; k < 12; k++) {
+        checkh = h + check_locs_y[k];
+        checkw = w + check_locs_x[k];
+        if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+          continue;
+        }
+        int this_idx = checkw * height + checkh;
+        cor_w = 1;
+        row_pos[c] = center_idx;
+        col_pos[c] = this_idx;
+        values[c] = lambda * cor_w;
+        c++;
+        row_pos[c] = center_idx + offset;
+        col_pos[c] = this_idx + offset;
+        values[c] = lambda * cor_w;
+        c++;
+      }
+    }
+  }
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
+  // subtract init mv part from b
+  av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height);
+  for (int i = 0; i < 2 * width * height; i++) {
+    b[i] = -temp_b[i];
+  }
+  av1_free_sparse_mtx_elems(&A);
+
+  // add cross terms to A and modify b with ExEt / EyEt
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      int curidx = w * height + h;
+      // modify b
+      b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w];
+      b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w];
+      // add cross terms to A
+      row_pos[c] = curidx;
+      col_pos[c] = curidx + offset;
+      values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+      c++;
+      row_pos[c] = curidx + offset;
+      col_pos[c] = curidx;
+      values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+      c++;
+    }
+  }
+  // Add diagonal terms to A
+  for (int i = 0; i < c; i++) {
+    if (row_pos[i] == col_pos[i]) {
+      if (row_pos[i] < offset) {
+        w = row_pos[i] / height;
+        h = row_pos[i] % height;
+        values[i] += pow(ix[h * grad_stride + w], 2);
+      } else {
+        w = (row_pos[i] - offset) / height;
+        h = (row_pos[i] - offset) % height;
+        values[i] += pow(iy[h * grad_stride + w], 2);
+      }
+    }
+  }
+
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
+
+  // solve for the mvs
+  ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+  if (ret < 0) goto free_hs_solver_buf;
+
+  // copy mvs
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      mvs[h * mv_stride + w].col = mv_vec[w * height + h];
+      mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset];
+    }
+  }
+free_hs_solver_buf:
+  aom_free(row_pos);
+  aom_free(col_pos);
+  aom_free(values);
+  aom_free(mv_vec);
+  aom_free(mv_init_vec);
+  aom_free(b);
+  aom_free(temp_b);
+  av1_free_sparse_mtx_elems(&A);
+}
+
+// Calculate optical flow from from_frame to to_frame using the H-S method.
+static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
+                         const YV12_BUFFER_CONFIG *to_frame, const int level,
+                         const int mv_stride, const int mv_height,
+                         const int mv_width, const OPFL_PARAMS *opfl_params,
+                         LOCALMV *mvs) {
+  // mvs are always on level 0, here we define two new mv arrays that is of size
+  // of this level.
+  const int fw = from_frame->y_crop_width;
+  const int fh = from_frame->y_crop_height;
+  const int factor = (int)pow(2, level);
+  int w, h, k, init_mv_stride;
+  LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+  double *ix = NULL, *iy = NULL, *it = NULL;
+  YV12_BUFFER_CONFIG temp_frame;
+  temp_frame.y_buffer = NULL;
+  if (level == 0) {
+    init_mvs = mvs;
+    init_mv_stride = mv_stride;
+  } else {
+    init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+    if (!init_mvs) goto free_hs_buf;
+    init_mv_stride = fw;
+    for (h = 0; h < fh; h++) {
+      for (w = 0; w < fw; w++) {
+        init_mvs[h * init_mv_stride + w].row =
+            mvs[h * factor * mv_stride + w * factor].row / (double)factor;
+        init_mvs[h * init_mv_stride + w].col =
+            mvs[h * factor * mv_stride + w * factor].col / (double)factor;
+      }
+    }
+  }
+  refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+  if (!refine_mvs) goto free_hs_buf;
+  // temp frame for warping
+  temp_frame.y_buffer =
+      (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+  if (!temp_frame.y_buffer) goto free_hs_buf;
+  temp_frame.y_crop_height = fh;
+  temp_frame.y_crop_width = fw;
+  temp_frame.y_stride = fw;
+  // gradient buffers
+  ix = aom_calloc(fw * fh, sizeof(*ix));
+  iy = aom_calloc(fw * fh, sizeof(*iy));
+  it = aom_calloc(fw * fh, sizeof(*it));
+  if (!ix || !iy || !it) goto free_hs_buf;
+  // For each warping step
+  for (k = 0; k < opfl_params->warping_steps; k++) {
+    // warp from_frame with init_mv
+    if (level == 0) {
+      warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride);
+    } else {
+      warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride);
+    }
+    // calculate frame gradients
+    get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw);
+    // form linear equations and solve mvs
+    solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride,
+                       refine_mvs, fw);
+    // update init_mvs
+    for (h = 0; h < fh; h++) {
+      for (w = 0; w < fw; w++) {
+        init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col;
+        init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row;
+      }
+    }
+  }
+  // copy back the mvs if needed
+  if (level != 0) {
+    for (h = 0; h < mv_height; h++) {
+      for (w = 0; w < mv_width; w++) {
+        mvs[h * mv_stride + w].row =
+            init_mvs[h / factor * init_mv_stride + w / factor].row *
+            (double)factor;
+        mvs[h * mv_stride + w].col =
+            init_mvs[h / factor * init_mv_stride + w / factor].col *
+            (double)factor;
+      }
+    }
+  }
+free_hs_buf:
+  if (level != 0) aom_free(init_mvs);
+  aom_free(refine_mvs);
+  aom_free(temp_frame.y_buffer);
+  aom_free(ix);
+  aom_free(iy);
+  aom_free(it);
+}
+
+// Apply optical flow iteratively at each pyramid level
+static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                                 const YV12_BUFFER_CONFIG *to_frame,
+                                 const int bit_depth,
+                                 const OPFL_PARAMS *opfl_params,
+                                 const OPTFLOW_METHOD method, LOCALMV *mvs) {
+  assert(opfl_params->pyramid_levels > 0 &&
+         opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+  int levels = opfl_params->pyramid_levels;
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  if ((frame_height / pow(2.0, levels - 1) < 50 ||
+       frame_height / pow(2.0, levels - 1) < 50) &&
+      levels > 1)
+    levels = levels - 1;
+  uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+  uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+  int *ref_corners = NULL;
+
+  images1[0] = from_frame->y_buffer;
+  images2[0] = to_frame->y_buffer;
+  YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
+  YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2));
+  if (!buffers1 || !buffers2) goto free_pyramid_buf;
+  buffers1[0] = *from_frame;
+  buffers2[0] = *to_frame;
+  int fw = frame_width;
+  int fh = frame_height;
+  for (int i = 1; i < levels; i++) {
+    // TODO(bohanli): may need to extend buffers for better interpolation SIMD
+    images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i]));
+    images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i]));
+    if (!images1[i] || !images2[i]) goto free_pyramid_buf;
+    int stride;
+    if (i == 1)
+      stride = from_frame->y_stride;
+    else
+      stride = fw;
+    reduce(images1[i - 1], fh, fw, stride, images1[i]);
+    reduce(images2[i - 1], fh, fw, stride, images2[i]);
+    fh /= 2;
+    fw /= 2;
+    YV12_BUFFER_CONFIG a = { .y_buffer = images1[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    YV12_BUFFER_CONFIG b = { .y_buffer = images2[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    buffers1[i] = a;
+    buffers2[i] = b;
+  }
+  // Compute corners for specific frame
+  int num_ref_corners = 0;
+  if (is_sparse(opfl_params)) {
+    int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
+    ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners));
+    if (!ref_corners) goto free_pyramid_buf;
+    num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
+                                     ref_corners, bit_depth);
+  }
+  const int stop_level = 0;
+  for (int i = levels - 1; i >= stop_level; i--) {
+    if (method == LUCAS_KANADE) {
+      assert(is_sparse(opfl_params));
+      lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params,
+                   num_ref_corners, ref_corners, buffers1[0].y_crop_width,
+                   bit_depth, mvs);
+    } else if (method == HORN_SCHUNCK) {
+      assert(!is_sparse(opfl_params));
+      horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width,
+                   buffers1[0].y_crop_height, buffers1[0].y_crop_width,
+                   opfl_params, mvs);
+    }
+  }
+free_pyramid_buf:
+  for (int i = 1; i < levels; i++) {
+    aom_free(images1[i]);
+    aom_free(images2[i]);
+  }
+  aom_free(ref_corners);
+  aom_free(buffers1);
+  aom_free(buffers2);
+}
+// Computes optical flow by applying algorithm at
+// multiple pyramid levels of images (lower-resolution, smoothed images)
+// This accounts for larger motions.
+// Inputs:
+//   from_frame Frame buffer.
+//   to_frame: Frame buffer. MVs point from_frame -> to_frame.
+//   from_frame_idx: Index of from_frame.
+//   to_frame_idx: Index of to_frame. Return all zero MVs when idx are equal.
+//   bit_depth:
+//   opfl_params: contains algorithm-specific parameters.
+//   mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN.
+//   method: LUCAS_KANADE, HORN_SCHUNCK
+//   mvs: pointer to MVs. Contains initialization, and modified
+//   based on optical flow. Must have
+//   dimensions = from_frame->y_crop_width * from_frame->y_crop_height
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                      const YV12_BUFFER_CONFIG *to_frame,
+                      const int from_frame_idx, const int to_frame_idx,
+                      const int bit_depth, const OPFL_PARAMS *opfl_params,
+                      const MV_FILTER_TYPE mv_filter,
+                      const OPTFLOW_METHOD method, MV *mvs) {
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  // TODO(any): deal with the case where frames are not of the same dimensions
+  assert(frame_height == to_frame->y_crop_height &&
+         frame_width == to_frame->y_crop_width);
+  if (from_frame_idx == to_frame_idx) {
+    // immediately return all zero mvs when frame indices are equal
+    for (int yy = 0; yy < frame_height; yy++) {
+      for (int xx = 0; xx < frame_width; xx++) {
+        MV mv = { .row = 0, .col = 0 };
+        mvs[yy * frame_width + xx] = mv;
+      }
+    }
+    return;
+  }
+
+  // Initialize double mvs based on input parameter mvs array
+  LOCALMV *localmvs =
+      aom_malloc(frame_height * frame_width * sizeof(*localmvs));
+  if (!localmvs) return;
+
+  filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs);
+
+  for (int i = 0; i < frame_width * frame_height; i++) {
+    MV mv = mvs[i];
+    LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                        .col = ((double)mv.col) / 8 };
+    localmvs[i] = localmv;
+  }
+  // Apply optical flow algorithm
+  pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method,
+                       localmvs);
+
+  // Update original mvs array
+  for (int j = 0; j < frame_height; j++) {
+    for (int i = 0; i < frame_width; i++) {
+      int idx = j * frame_width + i;
+      if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height ||
+          i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) {
+        continue;
+      }
+      MV mv = { .row = (int16_t)round(8 * localmvs[idx].row),
+                .col = (int16_t)round(8 * localmvs[idx].col) };
+      mvs[idx] = mv;
+    }
+  }
+
+  filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs);
+
+  aom_free(localmvs);
+}
+#endif
diff --git a/media/libaom/src/av1/encoder/optical_flow.h b/media/libaom/src/av1/encoder/optical_flow.h
new file mode 100644
index 0000000000..2fbe474d77
--- /dev/null
+++ b/media/libaom/src/av1/encoder/optical_flow.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD;
+
+typedef enum {
+  MV_FILTER_NONE,
+  MV_FILTER_SMOOTH,
+  MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+typedef struct LOCALMV {
+  double row;
+  double col;
+} LOCALMV;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3  // total levels
+#define OPFL_WARPING_STEPS 3
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+  int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+  int pyramid_levels;
+  int warping_steps;
+  LK_PARAMS *lk_params;
+  int flags;
+} OPFL_PARAMS;
+
+#define OPFL_FLAG_SPARSE 1
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params);
+
+void av1_init_lk_params(LK_PARAMS *lk_params);
+
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                      const YV12_BUFFER_CONFIG *to_frame,
+                      const int from_frame_idx, const int to_frame_idx,
+                      const int bit_depth, const OPFL_PARAMS *opfl_params,
+                      const MV_FILTER_TYPE mv_filter,
+                      const OPTFLOW_METHOD method, MV *mvs);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/media/libaom/src/av1/encoder/palette.c b/media/libaom/src/av1/encoder/palette.c
index e61cd02ce4..69f4523ef1 100644
--- a/media/libaom/src/av1/encoder/palette.c
+++ b/media/libaom/src/av1/encoder/palette.c
@@ -12,9 +12,17 @@
 #include <math.h>
 #include <stdlib.h>
 
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
 
 #define AV1_K_MEANS_DIM 1
 #include "av1/encoder/k_means_template.h"
@@ -115,7 +123,7 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
 }
 
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-                             uint16_t *color_cache, int n_cache,
+                             const uint16_t *color_cache, int n_cache,
                              int bit_depth) {
   const int n = pmi->palette_size[0];
   int out_cache_colors[PALETTE_MAX_SIZE];
@@ -129,7 +137,7 @@ int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
 }
 
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-                              uint16_t *color_cache, int n_cache,
+                              const uint16_t *color_cache, int n_cache,
                               int bit_depth) {
   const int n = pmi->palette_size[1];
   int total_bits = 0;
@@ -152,3 +160,745 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
   total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
   return av1_cost_literal(total_bits);
 }
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+                                                int orig_width, int orig_height,
+                                                int new_width, int new_height) {
+  int j;
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  for (j = orig_height - 1; j >= 0; --j) {
+    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(color_map + j * new_width + orig_width,
+           color_map[j * new_width + orig_width - 1], new_width - orig_width);
+  }
+  // Copy last row to extra rows.
+  for (j = orig_height; j < new_height; ++j) {
+    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+           new_width);
+  }
+}
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+                                               int n_cache, int n_colors,
+                                               int stride, int *centroids,
+                                               int bit_depth) {
+  if (n_cache <= 0) return;
+  for (int i = 0; i < n_colors * stride; i += stride) {
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
+    int idx = 0;
+    for (int j = 1; j < n_cache; ++j) {
+      const int this_diff = abs(centroids[i] - color_cache[j]);
+      if (this_diff < min_diff) {
+        min_diff = this_diff;
+        idx = j;
+      }
+    }
+    const int min_threshold = 4 << (bit_depth - 8);
+    if (min_diff <= min_threshold) centroids[i] = color_cache[idx];
+  }
+}
+
+/*!\brief Calculate the luma palette cost from a given color palette
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * Given the base colors as specified in centroids[], calculate the RD cost
+ * of palette mode.
+ */
+static AOM_INLINE void palette_rd_y(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
+    uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+    uint8_t *tx_type_map, int *beat_best_palette_rd,
+    bool *do_header_rd_based_breakout, int discount_color_cost) {
+  if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false;
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
+                          cpi->common.seq_params->bit_depth);
+  const int num_unique_colors = av1_remove_duplicates(centroids, n);
+  if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette. And DC_PRED will work
+    // well for that case anyway. So skip.
+    return;
+  }
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.seq_params->use_highbitdepth) {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params->bit_depth);
+    }
+  } else {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+    }
+  }
+  pmi->palette_size[0] = num_unique_colors;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+                   1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+  RD_STATS tokenonly_rd_stats;
+  int this_rate;
+
+  if (do_header_rd_based_gating) {
+    assert(do_header_rd_based_breakout != NULL);
+    const int palette_mode_rate = intra_mode_info_cost_y(
+        cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost);
+    const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+    // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+    const int header_rd_shift =
+        (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
+    // Terminate further palette_size search, if the header cost corresponding
+    // to lower palette_size is more than *best_rd << header_rd_shift. This
+    // logic is implemented with a right shift in the LHS to prevent a possible
+    // overflow with the left shift in RHS.
+    if ((header_rd >> header_rd_shift) > *best_rd) {
+      *do_header_rd_based_breakout = true;
+      return;
+    }
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) return;
+    this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+  } else {
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) return;
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost,
+                                       discount_color_cost);
+  }
+
+  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+      this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    // Setting beat_best_rd flag because current mode rd is better than best_rd.
+    // This flag need to be updated only for palette evaluation in key frames
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->txfm_search_info.blk_skip,
+           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+    if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+  }
+}
+
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+  assert(step_size != 0);
+  return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last numbers searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, bool do_header_rd_based_gating,
+    int *last_n_searched, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map, int discount_color_cost) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
+    memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout, discount_color_cost);
+    *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an improvement
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Performs k-means based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last numbers searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lower_bound,
+    int upper_bound, int start_n, int end_n, int step_size,
+    bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+    int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points, int discount_color_cost) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] =
+          lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout, discount_color_cost);
+    *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an improvement
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Sets the parameters to search the current number of colors +- 1
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+                                         int winner, int end_n) {
+  // Set min to winner - 1 unless we are already at the border, then we set it
+  // to winner + 1
+  *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1)
+                                        : AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+  // Set max to winner + 1 unless we are already at the border, then we set it
+  // to winner - 1
+  *max_n =
+      (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+
+  // Set the step size to max_n - min_n so we only search those two values.
+  // If max_n == min_n, then set step_size to 1 to avoid infinite loop later.
+  *step_size = AOMMAX(1, *max_n - *min_n);
+}
+
+static AOM_INLINE void fill_data_and_get_bounds(
+    const uint8_t *src, const int src_stride, const int rows, const int cols,
+    const int is_high_bitdepth, int *data, int *lower_bound, int *upper_bound) {
+  if (is_high_bitdepth) {
+    const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+    *lower_bound = *upper_bound = src_ptr[0];
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        const int val = src_ptr[c];
+        data[c] = val;
+        *lower_bound = AOMMIN(*lower_bound, val);
+        *upper_bound = AOMMAX(*upper_bound, val);
+      }
+      src_ptr += src_stride;
+      data += cols;
+    }
+    return;
+  }
+
+  // low bit depth
+  *lower_bound = *upper_bound = src[0];
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int val = src[c];
+      data[c] = val;
+      *lower_bound = AOMMIN(*lower_bound, val);
+      *upper_bound = AOMMAX(*upper_bound, val);
+    }
+    src += src_stride;
+    data += cols;
+  }
+}
+
+void av1_rd_pick_palette_intra_sby(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+  assert(PALETTE_MAX_SIZE == 8);
+  assert(PALETTE_MIN_SIZE == 2);
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode;
+  int unused;
+
+  int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
+  int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
+  int colors, colors_threshold = 0;
+  if (is_hbd) {
+    av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+                            count_buf_8bit, &colors_threshold, &colors);
+  } else {
+    av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
+    colors_threshold = colors;
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  if (colors_threshold > 1 && colors_threshold <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lower_bound, upper_bound;
+    fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
+                             &lower_bound, &upper_bound);
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[].
+    int top_colors[PALETTE_MAX_SIZE] = { 0 };
+    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+      int max_count = 0;
+      for (int j = 0; j < (1 << bit_depth); ++j) {
+        if (count_buf[j] > max_count) {
+          max_count = count_buf[j];
+          top_colors[i] = j;
+        }
+      }
+      assert(max_count > 0);
+      count_buf[top_colors[i]] = 0;
+    }
+
+    // The following are the approaches used for header rdcost based gating
+    // for early termination for different values of prune_palette_search_level.
+    // 0: Pruning based on header rdcost for ascending order palette_size
+    // search.
+    // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+    // search and for finer search do_header_rd_based_gating parameter is
+    // explicitly passed as 'false'.
+    // 2: Enabled only for ascending order palette_size search and for
+    // descending order search do_header_rd_based_gating parameter is explicitly
+    // passed as 'false'.
+    const bool do_header_rd_based_gating =
+        cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+        (colors > PALETTE_MIN_SIZE)) {
+      // Start index and step size below are chosen to evaluate unique
+      // candidates in neighbor search, in case a winner candidate is found in
+      // coarse search. Example,
+      // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+      // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+      // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+      // (3) and 8 (7).
+      // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+      // as for 8 colors) then step size should also be 2, to cover all
+      // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+      // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+      // coarse search will evaluate 3 and 6. For the winner, unique neighbors
+      // (3: 2,4 or 6: 5,7) would be evaluated.
+
+      // Start index for coarse palette search for dominant colors and k-means
+      const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                   3, 3, 2,
+                                                                   3, 3, 2 };
+      // Step size for coarse palette search for dominant colors and k-means
+      const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                     3, 3, 3,
+                                                                     3, 3, 3 };
+
+      // Choose the start index and step size for coarse search based on number
+      // of colors
+      const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+      const int min_n = start_n_lookup_table[max_n];
+      const int step_size = step_size_lookup_table[max_n];
+      assert(min_n >= PALETTE_MIN_SIZE);
+      // Perform top color coarse palette search to find the winner candidate
+      const int top_color_winner = perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+          step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
+          best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          discount_color_cost);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for dominant colors
+      if (top_color_winner <= max_n) {
+        int stage2_min_n, stage2_max_n, stage2_step_size;
+        set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size,
+                          top_color_winner, max_n);
+        // perform finer search for the winner candidate
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
+            stage2_max_n + 1, stage2_step_size,
+            /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+            distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+            tx_type_map, discount_color_cost);
+      }
+      // K-means clustering.
+      // Perform k-means coarse palette search to find the winner candidate
+      const int k_means_winner = perform_k_means_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+          min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
+          color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+          rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+          best_blk_skip, tx_type_map, color_map, rows * cols,
+          discount_color_cost);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for k-means
+      if (k_means_winner <= max_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2,
+                          k_means_winner, max_n);
+        // perform finer search for the winner candidate
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+            start_n_stage2, end_n_stage2 + 1, step_size_stage2,
+            /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+            distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+            tx_type_map, color_map, rows * cols, discount_color_cost);
+      }
+    } else {
+      const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+                min_n = PALETTE_MIN_SIZE;
+      // Perform top color palette search in ascending order
+      int last_n_searched = min_n;
+      perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+          1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+          best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          discount_color_cost);
+      if (last_n_searched < max_n) {
+        // Search in descending order until we get to the previous best
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
+            last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, discount_color_cost);
+      }
+      // K-means clustering.
+      if (colors == PALETTE_MIN_SIZE) {
+        // Special case: These colors automatically become the centroids.
+        assert(colors == 2);
+        centroids[0] = lower_bound;
+        centroids[1] = upper_bound;
+        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+                     color_cache, n_cache, /*do_header_rd_based_gating=*/false,
+                     best_mbmi, best_palette_color_map, best_rd, rate,
+                     rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+                     best_blk_skip, tx_type_map, NULL, NULL,
+                     discount_color_cost);
+      } else {
+        // Perform k-means palette search in ascending order
+        last_n_searched = min_n;
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+            min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, color_map, rows * cols,
+            discount_color_cost);
+        if (last_n_searched < max_n) {
+          // Search in descending order until we get to the previous best
+          perform_k_means_palette_search(
+              cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+              max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false,
+              &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
+              best_rd, rate, rate_tokenonly, distortion, skippable,
+              beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+              rows * cols, discount_color_cost);
+        }
+      }
+    }
+  }
+
+  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           block_width * block_height * sizeof(best_palette_color_map[0]));
+  }
+  *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+                                    int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           mbmi->bsize));
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const SequenceHeader *const seq_params = cpi->common.seq_params;
+  int this_rate;
+  int64_t this_rd;
+  int colors_u, colors_v;
+  int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  RD_STATS tokenonly_rd_stats;
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  mbmi->uv_mode = UV_DC_PRED;
+  int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
+  int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
+  if (seq_params->use_highbitdepth) {
+    av1_count_colors_highbd(src_u, src_stride, rows, cols,
+                            seq_params->bit_depth, count_buf, count_buf_8bit,
+                            &colors_threshold_u, &colors_u);
+    av1_count_colors_highbd(src_v, src_stride, rows, cols,
+                            seq_params->bit_depth, count_buf, count_buf_8bit,
+                            &colors_threshold_v, &colors_v);
+  } else {
+    av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
+    av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
+    colors_threshold_u = colors_u;
+    colors_threshold_v = colors_v;
+  }
+
+  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+  colors_threshold = colors_threshold_u > colors_threshold_v
+                         ? colors_threshold_u
+                         : colors_threshold_v;
+  if (colors_threshold > 1 && colors_threshold <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
+
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (seq_params->use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+    }
+
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        if (seq_params->use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        }
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    const int colors = colors_u > colors_v ? colors_u : colors_v;
+    const int max_colors =
+        colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+    for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) {
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+      optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
+                              cpi->common.seq_params->bit_depth);
+      // Sort the U channel colors in ascending order.
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        int min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+          if (seq_params->use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+          else
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+        const int palette_mode_rate =
+            intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+        const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+        // Terminate further palette_size search, if header cost corresponding
+        // to lower palette_size is more than the best_rd.
+        if (header_rd >= *best_rd) break;
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+      } else {
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate +
+                    intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      }
+
+      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip_txfm;
+      }
+    }
+  }
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params->use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
diff --git a/media/libaom/src/av1/encoder/palette.h b/media/libaom/src/av1/encoder/palette.h
index 8b88c4755c..7d9a72f61d 100644
--- a/media/libaom/src/av1/encoder/palette.h
+++ b/media/libaom/src/av1/encoder/palette.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
 #ifndef AOM_AV1_ENCODER_PALETTE_H_
 #define AOM_AV1_ENCODER_PALETTE_H_
 
@@ -18,41 +21,74 @@
 extern "C" {
 #endif
 
-#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
-                                             const int *centroids,
-                                             uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
-                                             const int *centroids,
-                                             uint8_t *indices, int n, int k);
 void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
 void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
+/*!\endcond */
 
-// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
-// calculate the centroid 'indices' for the data points.
+/*!\brief Calculates the cluster to which each data point belong.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points whose cluster indices are
+ *                                  to be computed. The data layout is
+ *                                  NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to the centroids. The data layout
+ *                                  is NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ *
+ * \return Returns nothing, but saves each data's cluster index in indices.
+ */
 static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
-    AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
+    av1_calc_indices_dim1(data, centroids, indices, n, k);
   } else if (dim == 2) {
-    AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k);
+    av1_calc_indices_dim2(data, centroids, indices, n, k);
   } else {
     assert(0 && "Untemplated k means dimension");
   }
 }
 
-// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
-// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
-// updated 'centroids' and the centroid 'indices' for elements in 'data'.
-// Note: the output centroids are rounded off to nearest integers.
+/*!\brief Performs k-means cluster on the data.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points to be clustered. The data
+ *                                  layout is NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to store the computed centroids.
+ *                                  The data layout is
+ *                                  NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices. For
+ *                                  each training data.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ * \param[in]    max_itr            Maximum number of iterations to run.
+ *
+ * \return Returns nothing, but saves each cluster's centroid in centroids and
+ * each data's cluster index in indices.
+ *
+ * \attention The output centroids are rounded off to nearest integers.
+ */
 static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
   } else if (dim == 2) {
@@ -62,33 +98,115 @@ static INLINE void av1_k_means(const int *data, int *centroids,
   }
 }
 
-// Given a list of centroids, returns the unique number of centroids 'k', and
-// puts these unique centroids in first 'k' indices of 'centroids' array.
-// Ideally, the centroids should be rounded to integers before calling this
-// method.
+/*!\brief Removes duplicated centroid indices.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    centroids          A list of centroids index.
+ * \param[in]    num_centroids      Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique centroids
+ * in beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
 int av1_remove_duplicates(int *centroids, int num_centroids);
 
-// Given a color cache and a set of base colors, find if each cache color is
-// present in the base colors, record the binary results in "cache_color_found".
-// Record the colors that are not in the color cache in "out_cache_colors".
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    color_cache          A cache of colors.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    colors               New base colors.
+ * \param[in]    n_colors             Number of new colors.
+ * \param[in]    cache_color_found    Stores what cached colors are presented in
+ *                                    colors.
+ * \param[in]    out_cache_colors     Stores what colors are not in the cache.
+ *
+ * \return Returns the number of colors that are not in cache. In addition,
+ * records whether each cache color is presented in colors in cache_color_found,
+ * and stores and stores the out of cache colors in out_cache_colors.
+ */
 int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
                           const uint16_t *colors, int n_colors,
                           uint8_t *cache_color_found, int *out_cache_colors);
 
-// Return the number of bits used to transmit each v palette color delta;
-// assign zero_count with the number of deltas being 0.
+/*!\brief Gets the rate cost for each delta-encoding v palette.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ * \param[in]    zero_count           Stores the number of zero deltas.
+ * \param[in]    min_bits             Minimum bits for the deltas. Sets to
+ *                                    bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta and assigns zero_count with the number of deltas being 0.
+ */
 int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
                                  int bit_depth, int *zero_count, int *min_bits);
 
-// Return the rate cost for transmitting luma palette color values.
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    color_cache          Color cache presented at the decoder.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitted the color map.
+ */
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-                             uint16_t *color_cache, int n_cache, int bit_depth);
+                             const uint16_t *color_cache, int n_cache,
+                             int bit_depth);
 
-// Return the rate cost for transmitting chroma palette color values.
+/*!\brief Gets the rate cost for transmitting luma palette chroma values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                  Struct that stores the palette mode info.
+ * \param[in]    color_cache          Color cache presented at the decoder.
+ * \param[in]    n_cache              Number of colors in the cache.
+ * \param[in]    bit_depth            Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitted the color map.
+ */
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-                              uint16_t *color_cache, int n_cache,
+                              const uint16_t *color_cache, int n_cache,
                               int bit_depth);
 
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+    const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+                                    struct macroblock *x, int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable);
+
+/*!\brief Resets palette color map for chroma channels.
+ */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/partition_search.c b/media/libaom/src/av1/encoder/partition_search.c
new file mode 100644
index 0000000000..80aae1ba6e
--- /dev/null
+++ b/media/libaom/src/av1/encoder/partition_search.c
@@ -0,0 +1,5768 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/txfm_common.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+#define ML_PARTITION_WHOLE_TREE_DECISION 0
+
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+  part_sf->partition_search_type = SEARCH_PARTITION;
+  part_sf->less_rectangular_check_level = 0;
+  part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+  part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+  part_sf->default_max_partition_size = BLOCK_LARGEST;
+  part_sf->default_min_partition_size = BLOCK_4X4;
+  part_sf->adjust_var_based_rd_partitioning = 0;
+  part_sf->max_intra_bsize = BLOCK_LARGEST;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  part_sf->fixed_partition_size = BLOCK_16X16;
+  // Recode loop tolerance %.
+  part_sf->partition_search_breakout_dist_thr = 0;
+  part_sf->partition_search_breakout_rate_thr = 0;
+  part_sf->prune_ext_partition_types_search_level = 0;
+  part_sf->prune_part4_search = 0;
+  part_sf->ml_prune_partition = 0;
+  part_sf->ml_early_term_after_part_split_level = 0;
+  for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+    part_sf->ml_partition_search_breakout_thresh[i] =
+        -1;  // -1 means not enabled.
+  }
+  part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+  part_sf->simple_motion_search_split = 0;
+  part_sf->simple_motion_search_prune_rect = 0;
+  part_sf->simple_motion_search_early_term_none = 0;
+  part_sf->simple_motion_search_reduce_search_steps = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
+  part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->prune_ext_part_using_split_info = 0;
+  part_sf->prune_rectangular_split_based_on_qidx = 0;
+  part_sf->early_term_after_none_split = 0;
+  part_sf->ml_predict_breakout_level = 0;
+  part_sf->prune_sub_8x8_partition_level = 0;
+  part_sf->simple_motion_search_rect_split = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
+  part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
+}
+
+// Reset speed features that works for the baseline encoding, but
+// blocks the external partition search.
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) {
+  cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// If input |features| is NULL, write tpl stats to file for each super block.
+// Otherwise, store tpl stats to |features|.
+// The tpl stats is computed in the unit of tpl_bsize_1d (16x16).
+// When writing to text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The forth row contains the motion compensated dependency cost for each unit.
+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col,
+                                 aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats is not established, early return
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    if (features != NULL) features->sb_features.tpl_features.available = 0;
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+  const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+  const int num_blocks = col_steps * row_steps;
+
+  if (features == NULL) {
+    char filename[256];
+    snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+             cpi->oxcf.partition_info_path, cpi->sb_counter);
+    FILE *pfile = fopen(filename, "w");
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+            tpl_data->tpl_bsize_1d, num_blocks);
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        fprintf(pfile, "%.0f", (double)mc_dep_delta);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fclose(pfile);
+  } else {
+    features->sb_features.tpl_features.available = 1;
+    features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+    features->sb_features.tpl_features.num_units = num_blocks;
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        features->sb_features.tpl_features.intra_cost[count] =
+            this_stats->intra_cost;
+        features->sb_features.tpl_features.inter_cost[count] =
+            this_stats->inter_cost;
+        features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+        ++count;
+      }
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+                              int blk_row, int blk_col,
+                              uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                   xd->left_txfm_context + blk_row, mbmi->bsize,
+                                   tx_size);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+    ++x->txfm_search_info.txb_split_count;
+
+    if (sub_txs == TX_4X4) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
+    }
+  }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize,
+                                      FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
+    }
+  }
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+  } else {
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+    for (int row = 0; row < row_end; row += bsh) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += bsw) {
+        const int offsetc = blk_col + col;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
+    }
+  }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
+}
+
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+                              const MB_MODE_INFO *const mi, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize) {
+  if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) ||
+      mi->segment_id > CR_SEGMENT_ID_BOOST2) {
+    return;
+  }
+  const AV1_COMMON *const cm = &cpi->common;
+  const MV mv = mi->mv[0].as_mv;
+  const int bw = mi_size_wide[bsize] >> 1;
+  const int bh = mi_size_high[bsize] >> 1;
+  const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw);
+  const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh);
+  const int block_index =
+      (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1);
+  for (int y = 0; y < ymis; y++) {
+    for (int x = 0; x < xmis; x++) {
+      // consec_zero_mv is in the scale of 8x8 blocks
+      const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x;
+      if (abs(mv.row) < 10 && abs(mv.col) < 10) {
+        if (cpi->consec_zero_mv[map_offset] < 255)
+          cpi->consec_zero_mv[map_offset]++;
+      } else {
+        cpi->consec_zero_mv[map_offset] = 0;
+      }
+    }
+  }
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+                              BLOCK_SIZE bsize, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  const int mis = cm->mi_params.mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int is_inter = is_inter_block(mbmi);
+
+  // Initialize tx_mode and tx_size_search_method
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  set_tx_size_search_method(
+      cm, &cpi->winner_mode_params, txfm_params,
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (!is_inter) {
+    xd->cfl.store_y = store_cfl_required(cm, xd);
+    mbmi->skip_txfm = 1;
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
+    }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // block to be 0, in order to avoid the segment_id to be changed by in
+    // write_segment_id().
+    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+        cpi->enc_seg.has_lossless_segment)
+      mbmi->skip_txfm = 0;
+
+    xd->cfl.store_y = 0;
+    if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          if (!dry_run) {
+            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+                                   PALETTE_MAP, tile_data->allow_update_cdf,
+                                   td->counts);
+          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+            *rate +=
+                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+          }
+        }
+      }
+    }
+
+    av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize,
+                                    tile_data->allow_update_cdf);
+  } else {
+    int ref;
+    const int is_compound = has_second_ref(mbmi);
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const YV12_BUFFER_CONFIG *cfg =
+          get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
+      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+      av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           xd->block_ref_scale_factors[ref], num_planes);
+    }
+    // Predicted sample of inter mode (for Luma plane) cannot be reused if
+    // nonrd_check_partition_merge_mode or nonrd_check_partition_split speed
+    // feature is enabled, Since in such cases the buffer may not contain the
+    // predicted sample of best mode.
+    const int start_plane =
+        (cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
+         (!cpi->sf.rt_sf.nonrd_check_partition_merge_mode) &&
+         (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
+         cm->seq_params->bit_depth == AOM_BITS_8)
+            ? 1
+            : 0;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  start_plane, av1_num_planes(cm) - 1);
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      assert(cpi->oxcf.motion_mode_cfg.enable_obmc);
+      av1_build_obmc_inter_predictors_sb(cm, xd);
+    }
+
+#if CONFIG_MISMATCH_DEBUG
+    if (dry_run == OUTPUT_ENABLED) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                 pd->subsampling_y))
+          continue;
+        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
+                                  cm->current_frame.order_hint, plane, pixel_c,
+                                  pixel_r, pd->width, pd->height,
+                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+      }
+    }
+#else
+    (void)num_planes;
+#endif
+
+    av1_encode_sb(cpi, x, bsize, dry_run);
+    av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
+                          tile_data->allow_update_cdf);
+  }
+
+  if (!dry_run) {
+    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
+    if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+        !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 &&
+        !(is_inter && (mbmi->skip_txfm || seg_skip))) {
+      if (is_inter) {
+        tx_partition_count_update(cm, x, bsize, td->counts,
+                                  tile_data->allow_update_cdf);
+      } else {
+        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+          ++x->txfm_search_info.txb_split_count;
+        if (block_signals_txsize(bsize)) {
+          const int tx_size_ctx = get_tx_size_context(xd);
+          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+          const int max_depths = bsize_to_max_depth(bsize);
+
+          if (tile_data->allow_update_cdf)
+            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+                       depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+        }
+      }
+      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+    } else {
+      int i, j;
+      TX_SIZE intra_tx_size;
+      // The new intra coding scheme requires no change of transform size
+      if (is_inter) {
+        if (xd->lossless[mbmi->segment_id]) {
+          intra_tx_size = TX_4X4;
+        } else {
+          intra_tx_size =
+              tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+        }
+      } else {
+        intra_tx_size = mbmi->tx_size;
+      }
+
+      const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width);
+      const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height);
+      for (j = 0; j < rows; j++) {
+        for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+      }
+
+      if (intra_tx_size != max_txsize_rect_lookup[bsize])
+        ++x->txfm_search_info.txb_split_count;
+    }
+  }
+
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+      block_signals_txsize(mbmi->bsize) && is_inter &&
+      !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
+  } else {
+    TX_SIZE tx_size = mbmi->tx_size;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter) {
+      if (xd->lossless[mbmi->segment_id]) {
+        tx_size = TX_4X4;
+      } else {
+        tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+      }
+    } else {
+      tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+    }
+    mbmi->tx_size = tx_size;
+    set_txfm_ctxs(tx_size, xd->width, xd->height,
+                  (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd);
+  }
+
+  if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
+    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
+  }
+  if (!dry_run) {
+    if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 &&
+        cpi->sf.rt_sf.use_temporal_noise_estimate &&
+        (!cpi->ppi->use_svc ||
+         (cpi->ppi->use_svc &&
+          !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+      update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
+  }
+}
+
+static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+  x->rdmult = cpi->rd.RDMULT;
+
+  if (aq_mode != NO_AQ) {
+    assert(mbmi != NULL);
+    if (aq_mode == VARIANCE_AQ) {
+      if (cpi->vaq_refresh) {
+        const int energy = bsize <= BLOCK_16X16
+                               ? x->mb_energy
+                               : av1_log_block_var(cpi, x, bsize);
+        mbmi->segment_id = energy;
+      }
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == COMPLEXITY_AQ) {
+      x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+      // If segment is boosted, use rdmult for that segment.
+      if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+        x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+    }
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->delta_q_info.delta_q_present_flag &&
+      !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+    av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
+                        &x->rdmult);
+  }
+#if CONFIG_TUNE_VMAF
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+    av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
+#endif
+  if (cpi->oxcf.mode == ALLINTRA) {
+    x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7);
+  }
+
+  // Check to make sure that the adjustments above have not caused the
+  // rd multiplier to be truncated to 0.
+  x->rdmult = (x->rdmult > 0) ? x->rdmult : 1;
+}
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MACROBLOCK *const x, int mi_row,
+                                        int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+
+  set_entropy_context(xd, mi_row, mi_col, num_planes);
+  xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // Mv beyond the range do not produce new/different prediction block.
+  av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
+
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+  // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+  xd->tile = *tile;
+}
+
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+  // Setup segment ID.
+  mbmi = xd->mi[0];
+  mbmi->segment_id = 0;
+  if (seg->enabled) {
+    if (seg->enabled && !cpi->vaq_refresh) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+      mbmi->segment_id =
+          map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
+    }
+    av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0);
+  }
+#ifndef NDEBUG
+  x->last_set_offsets_loc.mi_row = mi_row;
+  x->last_set_offsets_loc.mi_col = mi_col;
+  x->last_set_offsets_loc.bsize = bsize;
+#endif  // NDEBUG
+}
+
+/*!\brief Hybrid intra mode search.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This is top level function for mode search for intra frames in non-RD
+ * optimized case. Depending on speed feature and block size it calls
+ * either non-RD or RD optimized intra mode search.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+
+static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi,
+                                                MACROBLOCK *const x,
+                                                RD_STATS *rd_cost,
+                                                BLOCK_SIZE bsize,
+                                                PICK_MODE_CONTEXT *ctx) {
+  int use_rdopt = 0;
+  const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode;
+  // Use rd pick for intra mode search based on block size and variance.
+  if (hybrid_intra_pickmode && bsize < BLOCK_16X16) {
+    unsigned int var_thresh[3] = { 0, 101, 201 };
+    assert(hybrid_intra_pickmode <= 3);
+    if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1])
+      use_rdopt = 1;
+  }
+
+  if (use_rdopt)
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  else
+    av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+// For real time/allintra row-mt enabled multi-threaded encoding with cost
+// update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated
+// at superblock level. Thus, it is not required for the encoding of top-right
+// superblock be complete for updating tile ctxt. However, when encoding a block
+// whose right edge is also the superblock edge, intra and inter mode evaluation
+// (ref mv list population) require the encoding of the top-right superblock to
+// be complete. So, here, we delay the waiting of threads until the need for the
+// data from the top-right superblock region.
+static AOM_INLINE void wait_for_top_right_sb(
+    AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
+    TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
+    BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int sb_size_in_mi = mi_size_wide[sb_size];
+  const int bw_in_mi = mi_size_wide[bsize];
+  const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
+  const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1);
+  const int top_right_block_in_sb =
+      (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
+
+  // Don't wait if the block is the not the top-right block in the superblock.
+  if (!top_right_block_in_sb) return;
+
+  // Wait for the top-right superblock to finish encoding.
+  const int sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+  const int sb_col_in_tile =
+      (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+
+  enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+}
+
+/*!\brief Interface for AV1 mode search for an individual coding block
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level interface that
+ * directs the encoder to the proper mode search function, among these
+ * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and distortion
+ *                              stats for the current block
+ * \param[in]    partition      Partition mode of the parent block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts and
+ *                              chosen modes for the current block
+ * \param[in]    best_rd        Upper bound of rd cost of a valid partition
+ *
+ * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, the rate-distortion stats are stored in
+ * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
+ * signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                          MACROBLOCK *const x, int mi_row, int mi_col,
+                          RD_STATS *rd_cost, PARTITION_TYPE partition,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                          RD_STATS best_rd) {
+  if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) {
+    ctx->rd_stats.rdcost = INT64_MAX;
+    ctx->rd_stats.skip_txfm = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+  if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab &&
+      ctx->rd_mode_is_ready) {
+    assert(ctx->mic.bsize == bsize);
+    assert(ctx->mic.partition == partition);
+    rd_cost->rate = ctx->rd_stats.rate;
+    rd_cost->dist = ctx->rd_stats.dist;
+    rd_cost->rdcost = ctx->rd_stats.rdcost;
+    return;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  int i;
+
+  // This is only needed for real time/allintra row-mt enabled multi-threaded
+  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+  mbmi = xd->mi[0];
+  mbmi->bsize = bsize;
+  mbmi->partition = partition;
+
+#if CONFIG_RD_DEBUG
+  mbmi->mi_row = mi_row;
+  mbmi->mi_col = mi_col;
+#endif
+
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+  ctx->skippable = 0;
+  // Set to zero to make sure we do not use the previous encoded frame stats
+  mbmi->skip_txfm = 0;
+  // Reset skip mode flag.
+  mbmi->skip_mode = 0;
+
+  if (is_cur_buf_hbd(xd)) {
+    x->source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+  av1_rd_cost_update(x->rdmult, &best_rd);
+
+  // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous
+  // rdcost information for the following mode search.
+  // Disabling the feature could get some coding gain, with encoder slowdown.
+  if (!cpi->sf.part_sf.use_best_rd_for_pruning) {
+    av1_invalid_rd_stats(&best_rd);
+  }
+
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx, best_rd.rdcost);
+    } else {
+      av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx,
+                             best_rd.rdcost);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+  }
+
+  // Examine the resulting rate and for AQ mode 2 make a segment choice.
+  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+      bsize >= BLOCK_16X16) {
+    av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+
+  x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td) {
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const int seg_ref_active =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+  if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
+      is_comp_ref_allowed(bsize)) {
+    const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+    td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+    update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+  }
+
+  if (!mbmi->skip_mode && !seg_ref_active) {
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+#if CONFIG_ENTROPY_STATS
+    td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++;
+#endif
+    update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2);
+  }
+
+#if CONFIG_ENTROPY_STATS
+  // delta quant applies to both intra and inter
+  const int super_block_upper_left =
+      ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+      ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  if (delta_q_info->delta_q_present_flag &&
+      (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+      super_block_upper_left) {
+    const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
+                   delta_q_info->delta_q_res;
+    const int absdq = abs(dq);
+    for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+      td->counts->delta_q[i][1]++;
+    }
+    if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+    if (delta_q_info->delta_lf_present_flag) {
+      if (delta_q_info->delta_lf_multi) {
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+          const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+                               delta_q_info->delta_lf_res;
+          const int abs_delta_lf = abs(delta_lf);
+          for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+            td->counts->delta_lf_multi[lf_id][i][1]++;
+          }
+          if (abs_delta_lf < DELTA_LF_SMALL)
+            td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+        }
+      } else {
+        const int delta_lf =
+            (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+            delta_q_info->delta_lf_res;
+        const int abs_delta_lf = abs(delta_lf);
+        for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+          td->counts->delta_lf[i][1]++;
+        }
+        if (abs_delta_lf < DELTA_LF_SMALL)
+          td->counts->delta_lf[abs_delta_lf][0]++;
+      }
+    }
+  }
+#endif
+
+  if (!is_inter_block(mbmi)) {
+    av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+                        frame_is_intra_only(cm));
+  }
+
+  if (av1_allow_intrabc(cm)) {
+    const int is_intrabc = is_intrabc_block(mbmi);
+    update_cdf(fc->intrabc_cdf, is_intrabc, 2);
+#if CONFIG_ENTROPY_STATS
+    ++td->counts->intrabc[is_intrabc];
+#endif  // CONFIG_ENTROPY_STATS
+    if (is_intrabc) {
+      const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+      av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc,
+                          MV_SUBPEL_NONE);
+    }
+  }
+
+  if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
+
+  FRAME_COUNTS *const counts = td->counts;
+  const int inter_block = is_inter_block(mbmi);
+
+  if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+    counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+    update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+               inter_block, 2);
+    // If the segment reference feature is enabled we have only a single
+    // reference frame allowed for the segment so exclude it from
+    // the reference frame counts used to work out probabilities.
+    if (inter_block) {
+      const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+      if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+        if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+          counts->comp_inter[av1_get_reference_mode_context(xd)]
+                            [has_second_ref(mbmi)]++;
+#endif  // CONFIG_ENTROPY_STATS
+          update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
+        }
+      }
+
+      if (has_second_ref(mbmi)) {
+        const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+                                                      ? UNIDIR_COMP_REFERENCE
+                                                      : BIDIR_COMP_REFERENCE;
+        update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+                   COMP_REFERENCE_TYPES);
+#if CONFIG_ENTROPY_STATS
+        counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+                             [comp_ref_type]++;
+#endif  // CONFIG_ENTROPY_STATS
+
+        if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+          const int bit = (ref0 == BWDREF_FRAME);
+          update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+          counts
+              ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit) {
+            const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+            update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+                                [bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
+            if (bit1) {
+              update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+                         ref1 == GOLDEN_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+                                  [ref1 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+            }
+          }
+        } else {
+          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+          update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit) {
+            update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+                            [ref0 == LAST2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          } else {
+            update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+                            [ref0 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+          update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+                     2);
+#if CONFIG_ENTROPY_STATS
+          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+                             [ref1 == ALTREF_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (ref1 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+                       ref1 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+                               [ref1 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        }
+      } else {
+        const int bit = (ref0 >= BWDREF_FRAME);
+        update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+        counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+        if (bit) {
+          assert(ref0 <= ALTREF_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+                     2);
+#if CONFIG_ENTROPY_STATS
+          counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+                            [ref0 == ALTREF_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (ref0 != ALTREF_FRAME) {
+            update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+                       ref0 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+                              [ref0 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        } else {
+          const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+          update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
+          if (!bit1) {
+            update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+                              [ref0 != LAST_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          } else {
+            update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+                       2);
+#if CONFIG_ENTROPY_STATS
+            counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+                              [ref0 != LAST3_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+          }
+        }
+      }
+
+      if (cm->seq_params->enable_interintra_compound &&
+          is_interintra_allowed(mbmi)) {
+        const int bsize_group = size_group_lookup[bsize];
+        if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+          counts->interintra[bsize_group][1]++;
+#endif
+          update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+          counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+          update_cdf(fc->interintra_mode_cdf[bsize_group],
+                     mbmi->interintra_mode, INTERINTRA_MODES);
+          if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+            counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+            update_cdf(fc->wedge_interintra_cdf[bsize],
+                       mbmi->use_wedge_interintra, 2);
+            if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+              counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+              update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+                         16);
+            }
+          }
+        } else {
+#if CONFIG_ENTROPY_STATS
+          counts->interintra[bsize_group][0]++;
+#endif
+          update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+        }
+      }
+
+      const MOTION_MODE motion_allowed =
+          cm->features.switchable_motion_mode
+              ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                    cm->features.allow_warped_motion)
+              : SIMPLE_TRANSLATION;
+      if (mbmi->ref_frame[1] != INTRA_FRAME) {
+        if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+          update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+                     MOTION_MODES);
+        } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+          counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+          update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+        }
+      }
+
+      if (has_second_ref(mbmi)) {
+        assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+               is_inter_compound_mode(mbmi->mode) &&
+               mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+        const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                         cm->seq_params->enable_masked_compound;
+        if (masked_compound_used) {
+          const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+          ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+          update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+                     mbmi->comp_group_idx, 2);
+        }
+
+        if (mbmi->comp_group_idx == 0) {
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+          ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+          update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+                     2);
+        } else {
+          assert(masked_compound_used);
+          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+            ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+                                           COMPOUND_WEDGE];
+#endif
+            update_cdf(fc->compound_type_cdf[bsize],
+                       mbmi->interinter_comp.type - COMPOUND_WEDGE,
+                       MASKED_COMPOUND_TYPES);
+          }
+        }
+      }
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+          counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+          update_cdf(fc->wedge_idx_cdf[bsize],
+                     mbmi->interinter_comp.wedge_index, 16);
+        }
+      }
+    }
+  }
+
+  if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+      mbmi->motion_mode != WARPED_CAUSAL &&
+      !is_nontrans_global_motion(xd, mbmi)) {
+    update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter);
+  }
+  if (inter_block &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const PREDICTION_MODE mode = mbmi->mode;
+    const int16_t mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+    if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+      update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+                 INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+    } else {
+      av1_update_inter_mode_stats(fc, counts, mode, mode_ctx);
+    }
+
+    const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+    if (new_mv) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 0; idx < 2; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+          if (mbmi->ref_mv_idx == idx) break;
+        }
+      }
+    }
+
+    if (have_nearmv_in_inter_mode(mbmi->mode)) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      for (int idx = 1; idx < 3; ++idx) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+          const uint8_t drl_ctx =
+              av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+          update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
+#if CONFIG_ENTROPY_STATS
+          ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+          if (mbmi->ref_mv_idx == idx - 1) break;
+        }
+      }
+    }
+    if (have_newmv_in_inter_mode(mbmi->mode)) {
+      const int allow_hp = cm->features.cur_frame_force_integer_mv
+                               ? MV_SUBPEL_NONE
+                               : cm->features.allow_high_precision_mv;
+      if (new_mv) {
+        for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+          const int_mv ref_mv = av1_get_ref_mv(x, ref);
+          av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                              allow_hp);
+        }
+      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+        const int ref = 1;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+        const int ref = 0;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      }
+    }
+  }
+}
+
+/*!\brief Reconstructs an individual coding block
+ *
+ * \ingroup partition_search
+ * Reconstructs an individual coding block by applying the chosen modes stored
+ * in ctx, also updates mode counts and entropy models.
+ *
+ * \param[in]    cpi       Top-level encoder structure
+ * \param[in]    tile_data Pointer to struct holding adaptive
+ *                         data/contexts/models for the tile during encoding
+ * \param[in]    td        Pointer to thread data
+ * \param[in]    tp        Pointer to the starting token
+ * \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]    mi_col    Column coordinate of the block in a step size of
+ *                         MI_SIZE
+ * \param[in]    dry_run   A code indicating whether it is part of the final
+ *                         pass for reconstructing the superblock
+ * \param[in]    bsize     Current block size
+ * \param[in]    partition Partition mode of the parent block
+ * \param[in]    ctx       Pointer to structure holding coding contexts and the
+ *                         chosen modes for the current block
+ * \param[in]    rate      Pointer to the total rate for the current block
+ *
+ * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes
+ * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
+ */
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                     ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
+                     int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int subsampling_x = cm->seq_params->subsampling_x;
+  const int subsampling_y = cm->seq_params->subsampling_y;
+
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+            (subsampling_x + subsampling_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 &&
+        cm->delta_q_info.delta_lf_present_flag) {
+      const int frame_lf_count =
+          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+    }
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+    }
+
+    // delta quant applies to both intra and inter
+    const int super_block_upper_left =
+        ((mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params->mib_size - 1)) == 0);
+    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+    if (delta_q_info->delta_q_present_flag &&
+        (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+        super_block_upper_left) {
+      xd->current_base_qindex = mbmi->current_qindex;
+      if (delta_q_info->delta_lf_present_flag) {
+        if (delta_q_info->delta_lf_multi) {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+          }
+        } else {
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+        }
+      }
+    }
+
+    RD_COUNTS *rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        assert(has_second_ref(mbmi));
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled we have only a single
+        // reference frame allowed for the segment so exclude it from
+        // the reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+            if (has_second_ref(mbmi)) {
+              // This flag is also updated for 4x4 blocks
+              rdc->compound_ref_used_flag = 1;
+            }
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
+
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+    // Gather obmc and warped motion count to update the probability.
+    if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+         cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) ||
+        (cm->features.allow_warped_motion &&
+         cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
+      const int inter_block = is_inter_block(mbmi);
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active && inter_block) {
+        const MOTION_MODE motion_allowed =
+            cm->features.switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->features.allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+
+        if (mbmi->ref_frame[1] != INTRA_FRAME) {
+          if (motion_allowed >= OBMC_CAUSAL) {
+            td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+          }
+          if (motion_allowed == WARPED_CAUSAL) {
+            td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+          }
+        }
+      }
+    }
+  }
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
+}
+
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree.
+ *
+ * \param[in]    cpi       Top-level encoder structure
+ * \param[in]    td        Pointer to thread data
+ * \param[in]    tile_data Pointer to struct holding adaptive
+ *                         data/contexts/models for the tile during encoding
+ * \param[in]    tp        Pointer to the starting token
+ * \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]    mi_col    Column coordinate of the block in a step size of
+ *                         MI_SIZE
+ * \param[in]    dry_run   A code indicating whether it is part of the final
+ *                         pass for reconstructing the superblock
+ * \param[in]    bsize     Current block size
+ * \param[in]    pc_tree   Pointer to the PC_TREE node storing the picked
+ *                         partitions and mode info for the current block
+ * \param[in]    rate      Pointer to the total rate for the current block
+ *
+ * \return Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+                      TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                      int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree, int *rate) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int is_partition_root = bsize >= BLOCK_8X8;
+  const int ctx = is_partition_root
+                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                      : -1;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  int quarter_step = mi_size_wide[bsize] / 4;
+  int i;
+  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+  if (subsize == BLOCK_INVALID) return;
+
+  if (!dry_run && ctx >= 0) {
+    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+
+    if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+      td->counts->partition[ctx][partition]++;
+#endif
+
+      if (tile_data->allow_update_cdf) {
+        FRAME_CONTEXT *fc = xd->tile_ctx;
+        update_cdf(fc->partition_cdf[ctx], partition,
+                   partition_cdf_length(bsize));
+      }
+    }
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->none, rate);
+      break;
+    case PARTITION_VERT:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->vertical[0], rate);
+      if (mi_col + hbs < mi_params->mi_cols) {
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                 partition, pc_tree->vertical[1], rate);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontal[0], rate);
+      if (mi_row + hbs < mi_params->mi_rows) {
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal[1], rate);
+      }
+      break;
+    case PARTITION_SPLIT:
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->split[0], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                pc_tree->split[1], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                pc_tree->split[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                subsize, pc_tree->split[3], rate);
+      break;
+
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontala[2], rate);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->horizontalb[2], rate);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, pc_tree->verticala[2], rate);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->verticalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->verticalb[2], rate);
+      break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal4[i], rate);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+                 partition, pc_tree->vertical4[i], rate);
+      }
+      break;
+    default: assert(0 && "Invalid partition type."); break;
+  }
+
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+static AOM_INLINE int is_adjust_var_based_part_enabled(
+    AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf,
+    BLOCK_SIZE bsize) {
+  if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0;
+  if (part_sf->adjust_var_based_rd_partitioning == 0 ||
+      part_sf->adjust_var_based_rd_partitioning > 2)
+    return 0;
+
+  if (bsize <= BLOCK_32X32) return 1;
+  if (part_sf->adjust_var_based_rd_partitioning == 2) {
+    const int is_larger_qindex = cm->quant_params.base_qindex > 190;
+    const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+    return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64;
+  }
+  return 0;
+}
+
+/*!\brief AV1 block partition search (partition estimation and partial search).
+*
+* \ingroup partition_search
+* Encode the block by applying pre-calculated partition patterns that are
+* represented by coding block sizes stored in the mbmi array. Minor partition
+* adjustments are tested and applied if they lead to lower rd costs. The
+* partition types are limited to a basic set: none, horz, vert, and split.
+*
+* \param[in]    cpi       Top-level encoder structure
+* \param[in]    td        Pointer to thread data
+* \param[in]    tile_data Pointer to struct holding adaptive
+data/contexts/models for the tile during encoding
+* \param[in]    mib       Array representing MB_MODE_INFO pointers for mi
+blocks starting from the first pixel of the current
+block
+* \param[in]    tp        Pointer to the starting token
+* \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+* \param[in]    mi_col    Column coordinate of the block in a step size of
+MI_SIZE
+* \param[in]    bsize     Current block size
+* \param[in]    rate      Pointer to the final rate for encoding the current
+block
+* \param[in]    dist      Pointer to the final distortion of the current block
+* \param[in]    do_recon  Whether the reconstruction function needs to be run,
+either for finalizing a superblock or providing
+reference for future sub-partitions
+* \param[in]    pc_tree   Pointer to the PC_TREE node holding the picked
+partitions and mode info for the current block
+*
+* \return Nothing is returned. The pc_tree struct is modified to store the
+* picked partition and modes. The rate and dist are also updated with those
+* corresponding to the best partition found.
+*/
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+                          int mi_col, BLOCK_SIZE bsize, int *rate,
+                          int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+  BLOCK_SIZE bs_type = mib[0]->bsize;
+  int use_partition_none = 0;
+  x->try_merge_partition = 0;
+
+  if (pc_tree->none == NULL) {
+    pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+  }
+  PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  // In rt mode, currently the min partition size is BLOCK_8X8.
+  assert(bsize >= cpi->sf.part_sf.default_min_partition_size);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+  av1_invalid_rd_stats(&none_rdc);
+  av1_invalid_rd_stats(&chosen_rdc);
+  av1_invalid_rd_stats(&invalid_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  if (partition != PARTITION_NONE &&
+      is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) &&
+      (mi_row + hbs < mi_params->mi_rows &&
+       mi_col + hbs < mi_params->mi_cols)) {
+    assert(bsize > cpi->sf.part_sf.default_min_partition_size);
+    mib[0]->bsize = bsize;
+    pc_tree->partitioning = PARTITION_NONE;
+    x->try_merge_partition = 1;
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE,
+                  bsize, ctx_none, invalid_rdc);
+
+    if (none_rdc.rate < INT_MAX) {
+      none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+      none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+    }
+
+    // Try to skip split partition evaluation based on none partition
+    // characteristics.
+    if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+      use_partition_none = 1;
+    }
+
+    av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+    mib[0]->bsize = bs_type;
+    pc_tree->partitioning = partition;
+  }
+
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    pc_tree->split[i]->index = i;
+  }
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+      break;
+    case PARTITION_HORZ:
+      if (use_partition_none) {
+        av1_invalid_rd_stats(&last_part_rdc);
+        break;
+      }
+
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->horizontal[i] =
+            av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, pc_tree->horizontal[0],
+                    invalid_rdc);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_row + hbs < mi_params->mi_rows) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
+        av1_init_rd_stats(&tmp_rdc);
+        av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, pc_tree->horizontal[1],
+                      invalid_rdc);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      if (use_partition_none) {
+        av1_invalid_rd_stats(&last_part_rdc);
+        break;
+      }
+
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        pc_tree->vertical[i] =
+            av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+      }
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_col + hbs < mi_params->mi_cols) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
+        av1_init_rd_stats(&tmp_rdc);
+        av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+                          NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (use_partition_none) {
+        av1_invalid_rd_stats(&last_part_rdc);
+        break;
+      }
+
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
+        int jj = i >> 1, ii = i & 0x01;
+        RD_STATS tmp_rdc;
+        if ((mi_row + y_idx >= mi_params->mi_rows) ||
+            (mi_col + x_idx >= mi_params->mi_cols))
+          continue;
+
+        av1_init_rd_stats(&tmp_rdc);
+        av1_rd_use_partition(
+            cpi, td, tile_data,
+            mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+            &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += mode_costs->partition_cost[pl][partition];
+    last_part_rdc.rdcost =
+        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+      partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+      (mi_row + bs < mi_params->mi_rows ||
+       mi_row + hbs == mi_params->mi_rows) &&
+      (mi_col + bs < mi_params->mi_cols ||
+       mi_col + hbs == mi_params->mi_cols)) {
+    BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
+
+    av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+    pc_tree->partitioning = PARTITION_SPLIT;
+
+    // Split partition.
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
+      RD_STATS tmp_rdc;
+
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols))
+        continue;
+
+      av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      if (pc_tree->split[i]->none == NULL)
+        pc_tree->split[i]->none =
+            av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                    PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+                    invalid_rdc);
+
+      av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        av1_invalid_rd_stats(&chosen_rdc);
+        break;
+      }
+
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
+
+      if (i != SUB_PARTITIONS_SPLIT - 1)
+        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+    }
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+    }
+  }
+
+  // If last_part is better set the partitioning to that.
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+    mib[0]->bsize = bs_type;
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+
+    chosen_rdc = last_part_rdc;
+  }
+  // If none was better set the partitioning to that.
+  if (none_rdc.rdcost < INT64_MAX &&
+      none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) {
+    mib[0]->bsize = bsize;
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+    chosen_rdc = none_rdc;
+  }
+
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == cm->seq_params->sb_size)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+  if (do_recon) {
+    if (bsize == cm->seq_params->sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
+}
+
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                           ThreadData *td, TokenExtra **tp, int mi_row,
+                           int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *const ctx, int *rate) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+  const int subsampling_x = cpi->common.seq_params->subsampling_x;
+  const int subsampling_y = cpi->common.seq_params->subsampling_y;
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+            (subsampling_x + subsampling_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+      mbmi->compound_idx = 1;
+    }
+    RD_COUNTS *const rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+          has_second_ref(mbmi)) {
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled we have only a single
+        // reference frame allowed for the segment so exclude it from
+        // the reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+              has_second_ref(mbmi)) {
+            // This flag is also updated for 4x4 blocks
+            rdc->compound_ref_used_flag = 1;
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
+    if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+        (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) {
+      int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize];
+      rdc->newmv_or_intra_blocks += blocks;
+    }
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+  }
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->rc.rtc_external_ratectrl)
+    av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize);
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+}
+
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode). Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and distortion
+ *                              stats for the current block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts and
+ *                              chosen modes for the current block
+ *
+ * \return Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, the rate-distortion stats are stored in
+ * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
+ * signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                MACROBLOCK *const x, int mi_row, int mi_col,
+                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx) {
+  // For nonrd mode, av1_set_offsets is already called at the superblock level
+  // in encode_nonrd_sb when we determine the partitioning.
+  if (bsize != cpi->common.seq_params->sb_size) {
+    av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+  }
+  assert(x->last_set_offsets_loc.mi_row == mi_row &&
+         x->last_set_offsets_loc.mi_col == mi_col &&
+         x->last_set_offsets_loc.bsize == bsize);
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int i;
+
+  // This is only needed for real time/allintra row-mt enabled multi-threaded
+  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  if (is_cur_buf_hbd(xd)) {
+    x->source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      RD_STATS invalid_rd;
+      av1_invalid_rd_stats(&invalid_rd);
+      // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx,
+                                         invalid_rd.rdcost);
+    } else {
+      av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+  }
+  if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+    // the block size.
+    const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+    const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+    MB_MODE_INFO **mi_sb =
+        cm->mi_params.mi_grid_base +
+        get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+    // Do not skip if intra or new mv is picked, or color sensitivity is set.
+    // Never skip on slide/scene change.
+    mi_sb[0]->skip_cdef_curr_sb =
+        mi_sb[0]->skip_cdef_curr_sb && !cpi->rc.high_source_sad &&
+        !(x->color_sensitivity[0] || x->color_sensitivity[1]) &&
+        !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+    // Store in the pickmode context.
+    ctx->mic.skip_cdef_curr_sb = mi_sb[0]->skip_cdef_curr_sb;
+  }
+  x->rdmult = orig_rdmult;
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+// Evaluate if the sub-partitions can be merged directly into a large partition
+// without calculating the RD cost.
+static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+                                     TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                     int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  MB_MODE_INFO **b0 = mib;
+  MB_MODE_INFO **b1 = mib + hbs;
+  MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+  MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+  // Check if the following conditions are met. This can be updated
+  // later with more support added.
+  const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+                            b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+  if (further_split) return;
+
+  const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+                      !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+  if (no_skip) return;
+
+  const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] > NONE_FRAME);
+  if (compound) return;
+
+  // Intra modes aren't considered here.
+  const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] <= INTRA_FRAME);
+  if (different_ref) return;
+
+  const int different_mode =
+      (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+       b0[0]->mode != b3[0]->mode);
+  if (different_mode) return;
+
+  const int unsupported_mode =
+      (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+  if (unsupported_mode) return;
+
+  const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+  if (different_mv) return;
+
+  const int unsupported_motion_mode =
+      (b0[0]->motion_mode != b1[0]->motion_mode ||
+       b0[0]->motion_mode != b2[0]->motion_mode ||
+       b0[0]->motion_mode != b3[0]->motion_mode ||
+       b0[0]->motion_mode != SIMPLE_TRANSLATION);
+  if (unsupported_motion_mode) return;
+
+  const int diffent_filter =
+      (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+  if (diffent_filter) return;
+
+  const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+                             b0[0]->segment_id != b2[0]->segment_id ||
+                             b0[0]->segment_id != b3[0]->segment_id);
+  if (different_seg) return;
+
+  // Evaluate the ref_mv.
+  MB_MODE_INFO **this_mi = mib;
+  BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+  const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+  this_mi[0]->bsize = bsize;
+  this_mi[0]->partition = PARTITION_NONE;
+  this_mi[0]->skip_txfm = 1;
+
+  // TODO(yunqing): functions called below can be optimized with
+  // removing unrelated operations.
+  av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                     mi_col, bsize);
+
+  const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0];
+  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  int force_skip_low_temp_var = 0;
+  int skip_pred_mv = 0;
+
+  for (int i = 0; i < MB_MODE_COUNT; ++i) {
+    for (int j = 0; j < REF_FRAMES; ++j) {
+      frame_mv[i][j].as_int = INVALID_MV;
+    }
+  }
+  x->color_sensitivity[0] = x->color_sensitivity_sb[0];
+  x->color_sensitivity[1] = x->color_sensitivity_sb[1];
+  skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 &&
+                  x->color_sensitivity[0] != 2 && x->color_sensitivity[1] != 2);
+
+  find_predictors(cpi, x, ref_frame, frame_mv, tile_data, yv12_mb, bsize,
+                  force_skip_low_temp_var, skip_pred_mv);
+
+  int continue_merging = 1;
+  if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
+      frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col)
+    continue_merging = 0;
+
+  if (!continue_merging) {
+    this_mi[0]->bsize = orig_bsize;
+    this_mi[0]->partition = orig_partition;
+
+    // TODO(yunqing): Store the results and restore here instead of
+    // calling find_predictors() again.
+    av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                       mi_col, this_mi[0]->bsize);
+    find_predictors(cpi, x, ref_frame, frame_mv, tile_data, yv12_mb,
+                    this_mi[0]->bsize, force_skip_low_temp_var, skip_pred_mv);
+  } else {
+    struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+    const int is_scaled = av1_is_scaled(sf);
+    const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) ||
+                               (abs(this_mi[0]->mv[0].as_mv.col) % 8);
+    const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) ||
+                                (abs(this_mi[0]->mv[0].as_mv.col) % 16);
+
+    if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) {
+      const int num_planes = av1_num_planes(cm);
+      set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]);
+      const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+      av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col,
+                           xd->block_ref_scale_factors[0], num_planes);
+
+      if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) {
+        assert(is_uv_subpel_mv == 1);
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1,
+                                      num_planes - 1);
+      } else {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      num_planes - 1);
+      }
+    }
+
+    // Copy out mbmi_ext information.
+    MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+    MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame;
+    av1_copy_mbmi_ext_to_mbmi_ext_frame(
+        mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame));
+
+    const BLOCK_SIZE this_subsize =
+        get_partition_subsize(bsize, this_mi[0]->partition);
+    // Update partition contexts.
+    update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize,
+                                 this_mi[0]->partition);
+
+    const int num_planes = av1_num_planes(cm);
+    av1_reset_entropy_context(xd, bsize, num_planes);
+
+    // Note: use x->txfm_search_params.tx_mode_search_type instead of
+    // cm->features.tx_mode here.
+    TX_SIZE tx_size =
+        tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type);
+    if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4;
+    this_mi[0]->tx_size = tx_size;
+    memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size,
+           sizeof(this_mi[0]->inter_tx_size));
+
+    // Update txfm contexts.
+    xd->above_txfm_context =
+        cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+    xd->left_txfm_context =
+        xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+    set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height,
+                  this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd);
+
+    // Update mi for this partition block.
+    for (int y = 0; y < bs; y++) {
+      for (int x_idx = 0; x_idx < bs; x_idx++) {
+        this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0];
+      }
+    }
+  }
+}
+
+/*!\brief AV1 block partition application (minimal RD search).
+*
+* \ingroup partition_search
+* \callgraph
+* \callergraph
+* Encode the block by applying pre-calculated partition patterns that are
+* represented by coding block sizes stored in the mbmi array. The only
+* partition adjustment allowed is merging leaf split nodes if it leads to a
+* lower rd cost. The partition types are limited to a basic set: none, horz,
+* vert, and split. This function is only used in the real-time mode.
+*
+* \param[in]    cpi       Top-level encoder structure
+* \param[in]    td        Pointer to thread data
+* \param[in]    tile_data Pointer to struct holding adaptive
+data/contexts/models for the tile during encoding
+* \param[in]    mib       Array representing MB_MODE_INFO pointers for mi
+blocks starting from the first pixel of the current
+block
+* \param[in]    tp        Pointer to the starting token
+* \param[in]    mi_row    Row coordinate of the block in a step size of MI_SIZE
+* \param[in]    mi_col    Column coordinate of the block in a step size of
+MI_SIZE
+* \param[in]    bsize     Current block size
+* \param[in]    pc_tree   Pointer to the PC_TREE node holding the picked
+partitions and mode info for the current block
+*
+* \return Nothing is returned. The pc_tree struct is modified to store the
+* picked partition and modes.
+*/
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                             TokenExtra **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  // Only square blocks from 8x8 to 128x128 are supported
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+
+  RD_STATS dummy_cost;
+  av1_invalid_rd_stats(&dummy_cost);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  switch (partition) {
+    case PARTITION_NONE:
+      if (!pc_tree->none) {
+        pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+      } else {
+        av1_reset_pmc(pc_tree->none);
+      }
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_split_check(bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_STATS split_rdc, none_rdc, block_rdc;
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+
+        av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                            pc_tree->none);
+        none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+          av1_invalid_rd_stats(&block_rdc);
+          const int x_idx = (i & 1) * hbs;
+          const int y_idx = (i >> 1) * hbs;
+          if (mi_row + y_idx >= mi_params->mi_rows ||
+              mi_col + x_idx >= mi_params->mi_cols)
+            continue;
+          xd->above_txfm_context =
+              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                              &block_rdc, subsize, pc_tree->split[i]->none);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+                         1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+                         NULL);
+        }
+        split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->bsize = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                         partition, pc_tree->none, NULL);
+        } else {
+          mib[0]->bsize = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+            const int x_idx = (i & 1) * hbs;
+            const int y_idx = (i >> 1) * hbs;
+            if (mi_row + y_idx >= mi_params->mi_rows ||
+                mi_col + x_idx >= mi_params->mi_cols)
+              continue;
+            encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                           mi_col + x_idx, 0, subsize, PARTITION_NONE,
+                           pc_tree->split[i]->none, NULL);
+          }
+        }
+
+      } else {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                            bsize, pc_tree->none);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                       partition, pc_tree->none, NULL);
+      }
+      break;
+    case PARTITION_VERT:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        if (!pc_tree->vertical[i]) {
+          pc_tree->vertical[i] =
+              av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        } else {
+          av1_reset_pmc(pc_tree->vertical[i]);
+        }
+      }
+      pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                          subsize, pc_tree->vertical[0]);
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+                     PARTITION_VERT, pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs,
+                            &dummy_cost, subsize, pc_tree->vertical[1]);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+                       PARTITION_VERT, pc_tree->vertical[1], NULL);
+      }
+      break;
+    case PARTITION_HORZ:
+      for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+        if (!pc_tree->horizontal[i]) {
+          pc_tree->horizontal[i] =
+              av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        } else {
+          av1_reset_pmc(pc_tree->horizontal[i]);
+        }
+      }
+      pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                          subsize, pc_tree->horizontal[0]);
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+                     PARTITION_HORZ, pc_tree->horizontal[0], NULL);
+
+      if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col,
+                            &dummy_cost, subsize, pc_tree->horizontal[1]);
+        encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+                       PARTITION_HORZ, pc_tree->horizontal[1], NULL);
+      }
+      break;
+    case PARTITION_SPLIT:
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        if (!pc_tree->split[i]) {
+          pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+        }
+        pc_tree->split[i]->index = i;
+      }
+      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+          av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+          !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+        RD_STATS split_rdc, none_rdc;
+        av1_invalid_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+        av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        xd->above_txfm_context =
+            cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+        xd->left_txfm_context =
+            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+        pc_tree->partitioning = PARTITION_NONE;
+        if (!pc_tree->none) {
+          pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+        } else {
+          av1_reset_pmc(pc_tree->none);
+        }
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                            pc_tree->none);
+        none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+            none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+          const int is_larger_qindex = cm->quant_params.base_qindex > 100;
+          const int do_split =
+              (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
+                  ? (bsize <= BLOCK_32X32 ||
+                     (is_larger_qindex && bsize <= BLOCK_64X64))
+                  : 1;
+          if (do_split) {
+            av1_init_rd_stats(&split_rdc);
+            split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+            for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+              RD_STATS block_rdc;
+              av1_invalid_rd_stats(&block_rdc);
+              int x_idx = (i & 1) * hbs;
+              int y_idx = (i >> 1) * hbs;
+              if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                  (mi_col + x_idx >= mi_params->mi_cols))
+                continue;
+              xd->above_txfm_context =
+                  cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+              xd->left_txfm_context = xd->left_txfm_context_buffer +
+                                      ((mi_row + y_idx) & MAX_MIB_MASK);
+              if (!pc_tree->split[i]->none) {
+                pc_tree->split[i]->none =
+                    av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+              } else {
+                av1_reset_pmc(pc_tree->split[i]->none);
+              }
+              pc_tree->split[i]->partitioning = PARTITION_NONE;
+              pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx,
+                                  mi_col + x_idx, &block_rdc, subsize,
+                                  pc_tree->split[i]->none);
+              // TODO(yunqingwang): The rate here did not include the cost of
+              // signaling PARTITION_NONE token in the sub-blocks.
+              split_rdc.rate += block_rdc.rate;
+              split_rdc.dist += block_rdc.dist;
+
+              av1_rd_cost_update(x->rdmult, &split_rdc);
+
+              if (none_rdc.rdcost < split_rdc.rdcost) {
+                break;
+              }
+
+              encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                             mi_col + x_idx, 1, subsize, PARTITION_NONE,
+                             pc_tree->split[i]->none, NULL);
+            }
+            av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+            split_rdc.rdcost =
+                RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+          }
+        }
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->bsize = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+                         partition, pc_tree->none, NULL);
+        } else {
+          mib[0]->bsize = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= mi_params->mi_rows) ||
+                (mi_col + x_idx >= mi_params->mi_cols))
+              continue;
+
+            // Note: We don't reset pc_tree->split[i]->none here because it
+            // could contain results from the additional check. Instead, it is
+            // reset before we enter the nonrd_check_partition_merge_mode
+            // condition.
+            if (!pc_tree->split[i]->none) {
+              pc_tree->split[i]->none =
+                  av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+            }
+            encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx,
+                           mi_col + x_idx, 0, subsize, PARTITION_NONE,
+                           pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= mi_params->mi_rows) ||
+              (mi_col + x_idx >= mi_params->mi_cols))
+            continue;
+          av1_nonrd_use_partition(
+              cpi, td, tile_data,
+              mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+              mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+        }
+
+        // Note: Palette, cfl are not supported.
+        if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
+            cpi->sf.rt_sf.partition_direct_merging &&
+            mode_costs->partition_cost[pl][PARTITION_NONE] <
+                mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
+            (mi_row + bs <= mi_params->mi_rows) &&
+            (mi_col + bs <= mi_params->mi_cols)) {
+          direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
+                                   bsize);
+        }
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks).
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int is_last,
+                           int mi_row, int mi_col, BLOCK_SIZE subsize,
+                           RD_STATS best_rdcost, RD_STATS *sum_rdc,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *this_ctx) {
+  MACROBLOCK *const x = &td->mb;
+  const int orig_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+  RD_STATS rdcost_remaining;
+  av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
+  RD_STATS this_rdc;
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
+                subsize, this_ctx, rdcost_remaining);
+
+  if (this_rdc.rate == INT_MAX) {
+    sum_rdc->rdcost = INT64_MAX;
+  } else {
+    sum_rdc->rate += this_rdc.rate;
+    sum_rdc->dist += this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, sum_rdc);
+  }
+
+  if (sum_rdc->rdcost >= best_rdcost.rdcost) {
+    x->rdmult = orig_mult;
+    return 0;
+  }
+
+  if (!is_last) {
+    av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+  }
+
+  x->rdmult = orig_mult;
+  return 1;
+}
+
+// Tests an AB partition, and updates the encoder status, the pick mode
+// contexts, the best rdcost, and the best partition.
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+                               TileDataEnc *tile_data, TokenExtra **tp,
+                               PC_TREE *pc_tree, RD_STATS *best_rdc,
+                               int64_t *this_rdcost,
+                               PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB],
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               PARTITION_TYPE partition,
+                               const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+                               const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+                               const MB_MODE_INFO **mode_cache) {
+  MACROBLOCK *const x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  RD_STATS sum_rdc;
+  av1_init_rd_stats(&sum_rdc);
+  sum_rdc.rate = x->mode_costs.partition_cost[pl][partition];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+  // Loop over sub-partitions in AB partition type.
+  for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+    if (mode_cache && mode_cache[i]) {
+      x->use_mb_mode_cache = 1;
+      x->mb_mode_cache = mode_cache[i];
+    }
+    const int mode_search_success =
+        rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
+                        ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
+                        *best_rdc, &sum_rdc, partition, ctxs[i]);
+    x->use_mb_mode_cache = 0;
+    x->mb_mode_cache = NULL;
+    if (!mode_search_success) {
+      return false;
+    }
+  }
+
+  av1_rd_cost_update(x->rdmult, &sum_rdc);
+  *this_rdcost = sum_rdc.rdcost;
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+  *this_rdcost = sum_rdc.rdcost;
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+  *best_rdc = sum_rdc;
+  pc_tree->partitioning = partition;
+  return true;
+}
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static void init_partition_block_timing_stats(
+    PartitionTimingStats *part_timing_stats) {
+  av1_zero(*part_timing_stats);
+}
+
+static INLINE void start_partition_block_timer(
+    PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) {
+  assert(!part_timing_stats->timer_is_on);
+  part_timing_stats->partition_attempts[partition_type] += 1;
+  aom_usec_timer_start(&part_timing_stats->timer);
+  part_timing_stats->timer_is_on = 1;
+}
+
+static INLINE void end_partition_block_timer(
+    PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type,
+    int64_t rdcost) {
+  if (part_timing_stats->timer_is_on) {
+    aom_usec_timer_mark(&part_timing_stats->timer);
+    const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer);
+    part_timing_stats->partition_times[partition_type] += time;
+    part_timing_stats->partition_rdcost[partition_type] = rdcost;
+    part_timing_stats->timer_is_on = 0;
+  }
+}
+static INLINE void print_partition_timing_stats_with_rdcost(
+    const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number,
+    const RD_STATS *best_rdc, const char *filename) {
+  FILE *f = fopen(filename, "a");
+  fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number,
+          frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist,
+          best_rdc->rdcost);
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) {
+      fprintf(f, "%d,", -1);
+    } else {
+      fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]);
+    }
+  }
+  fprintf(f, "\n");
+  fclose(f);
+}
+
+static INLINE void print_partition_timing_stats(
+    const PartitionTimingStats *part_timing_stats, int intra_only,
+    int show_frame, const BLOCK_SIZE bsize, const char *filename) {
+  FILE *f = fopen(filename, "a");
+  fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only);
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+  }
+  fprintf(f, "\n");
+  fclose(f);
+}
+
+static INLINE void accumulate_partition_timing_stats(
+    FramePartitionTimingStats *fr_part_timing_stats,
+    const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) {
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx];
+  int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx];
+  int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx];
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    agg_attempts[idx] += part_timing_stats->partition_attempts[idx];
+    agg_decisions[idx] += part_timing_stats->partition_decisions[idx];
+    agg_times[idx] += part_timing_stats->partition_times[idx];
+  }
+}
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+// Initialize state variables of partition search used in
+// av1_rd_pick_partition().
+static void init_partition_search_state_params(
+    MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+    int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+
+  // Initialization of block size related parameters.
+  blk_params->mi_step = mi_size_wide[bsize] / 2;
+  blk_params->mi_row = mi_row;
+  blk_params->mi_col = mi_col;
+  blk_params->mi_row_edge = mi_row + blk_params->mi_step;
+  blk_params->mi_col_edge = mi_col + blk_params->mi_step;
+  blk_params->width = block_size_wide[bsize];
+  blk_params->min_partition_size_1d =
+      block_size_wide[x->sb_enc.min_partition_size];
+  blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  blk_params->split_bsize2 = blk_params->subsize;
+  blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+  blk_params->bsize = bsize;
+
+  // Check if the partition corresponds to edge block.
+  blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows);
+  blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols);
+
+  // Update intra partitioning related info.
+  part_search_state->intra_part_info = &x->part_search_info;
+  // Prepare for segmentation CNN-based partitioning for intra-frame.
+  if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+    part_search_state->intra_part_info->quad_tree_idx = 0;
+    part_search_state->intra_part_info->cnn_output_valid = 0;
+  }
+
+  // Set partition plane context index.
+  part_search_state->pl_ctx_idx =
+      blk_params->bsize_at_least_8x8
+          ? partition_plane_context(xd, mi_row, mi_col, bsize)
+          : 0;
+
+  // Partition cost buffer update
+  ModeCosts *mode_costs = &x->mode_costs;
+  part_search_state->partition_cost =
+      mode_costs->partition_cost[part_search_state->pl_ctx_idx];
+
+  // Initialize HORZ and VERT win flags as true for all split partitions.
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+    part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true;
+    part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true;
+  }
+
+  // Initialize the rd cost.
+  av1_init_rd_stats(&part_search_state->this_rdc);
+
+  // Initialize RD costs for partition types to 0.
+  part_search_state->none_rd = 0;
+  av1_zero(part_search_state->split_rd);
+  av1_zero(part_search_state->rect_part_rd);
+
+  // Initialize SPLIT partition to be not ready.
+  av1_zero(part_search_state->is_split_ctx_is_ready);
+  // Initialize HORZ and VERT partitions to be not ready.
+  av1_zero(part_search_state->is_rect_ctx_is_ready);
+
+  // Chroma subsampling.
+  part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x;
+  part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y;
+
+  // Initialize partition search flags to defaults.
+  part_search_state->terminate_partition_search = 0;
+  part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
+  part_search_state->do_rectangular_split =
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      blk_params->bsize_at_least_8x8;
+  av1_zero(part_search_state->prune_rect_part);
+
+  // Initialize allowed partition types for the partition block.
+  part_search_state->partition_none_allowed =
+      av1_blk_has_rows_and_cols(blk_params);
+  part_search_state->partition_rect_allowed[HORZ] =
+      part_search_state->do_rectangular_split && blk_params->has_cols &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->partition_rect_allowed[VERT] =
+      part_search_state->do_rectangular_split && blk_params->has_rows &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+
+  // Reset the flag indicating whether a partition leading to a rdcost lower
+  // than the bound best_rdc has been found.
+  part_search_state->found_best_partition = false;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  init_partition_block_timing_stats(&part_search_state->part_timing_stats);
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+}
+
+// Override partition cost buffer for the edge blocks.
+static void set_partition_cost_for_edge_blk(
+    AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
+  const aom_cdf_prob *partition_cdf =
+      cm->fc->partition_cdf[part_search_state->pl_ctx_idx];
+  const int max_cost = av1_cost_symbol(0);
+  for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i)
+    part_search_state->tmp_partition_cost[i] = max_cost;
+  if (blk_params.has_cols) {
+    // At the bottom, the two possibilities are HORZ and SPLIT.
+    aom_cdf_prob bot_cdf[2];
+    partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize);
+    static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+    av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf,
+                             bot_inv_map);
+  } else if (blk_params.has_rows) {
+    // At the right, the two possibilities are VERT and SPLIT.
+    aom_cdf_prob rhs_cdf[2];
+    partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize);
+    static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+    av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf,
+                             rhs_inv_map);
+  } else {
+    // At the bottom right, we always split.
+    part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0;
+  }
+  // Override the partition cost buffer.
+  part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+}
+
+// Reset the partition search state flags when
+// must_find_valid_partition is equal to 1.
+static AOM_INLINE void reset_part_limitations(
+    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int is_rect_part_allowed =
+      blk_params.bsize_at_least_8x8 &&
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      (blk_params.width > blk_params.min_partition_size_1d);
+  part_search_state->do_square_split =
+      blk_params.bsize_at_least_8x8 &&
+      (blk_params.width > blk_params.min_partition_size_1d);
+  part_search_state->partition_none_allowed =
+      av1_blk_has_rows_and_cols(&blk_params) &&
+      (blk_params.width >= blk_params.min_partition_size_1d);
+  part_search_state->partition_rect_allowed[HORZ] =
+      blk_params.has_cols && is_rect_part_allowed &&
+      get_plane_block_size(
+          get_partition_subsize(blk_params.bsize, PARTITION_HORZ),
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->partition_rect_allowed[VERT] =
+      blk_params.has_rows && is_rect_part_allowed &&
+      get_plane_block_size(
+          get_partition_subsize(blk_params.bsize, PARTITION_VERT),
+          part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+  part_search_state->terminate_partition_search = 0;
+}
+
+// Rectangular partitions evaluation at sub-block level.
+static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   PICK_MODE_CONTEXT *cur_partition_ctx,
+                                   PartitionSearchState *part_search_state,
+                                   RD_STATS *best_rdc, const int idx,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                   PARTITION_TYPE partition_type) {
+  // Obtain the remainder from the best rd cost
+  // for further processing of partition.
+  RD_STATS best_remain_rdcost;
+  av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc,
+                           &best_remain_rdcost);
+
+  // Obtain the best mode for the partition sub-block.
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc,
+                partition_type, bsize, cur_partition_ctx, best_remain_rdcost);
+  av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc);
+
+  // Update the partition rd cost with the current sub-block rd.
+  if (part_search_state->this_rdc.rate == INT_MAX) {
+    part_search_state->sum_rdc.rdcost = INT64_MAX;
+  } else {
+    part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate;
+    part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  }
+  const RECT_PART_TYPE rect_part =
+      partition_type == PARTITION_HORZ ? HORZ : VERT;
+  part_search_state->rect_part_rd[rect_part][idx] =
+      part_search_state->this_rdc.rdcost;
+}
+
+typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+// Checks if HORZ / VERT partition search is allowed.
+static AOM_INLINE int is_rect_part_allowed(
+    const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+    const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+    const int mi_pos) {
+  const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+  const int is_part_allowed =
+      (!part_search_state->terminate_partition_search &&
+       part_search_state->partition_rect_allowed[rect_part] &&
+       !part_search_state->prune_rect_part[rect_part] &&
+       (part_search_state->do_rectangular_split ||
+        active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
+  return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+    const RECT_PART_TYPE end_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+  const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+                                                    PARTITION_VERT };
+
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row postion of
+  //                                           HORZ and VERT partition types.
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col postion of
+  //                                           HORZ and VERT partition types.
+  const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col } },
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row, blk_params.mi_col_edge } }
+  };
+
+  // Initialize active edge_type function pointer
+  // for HOZR and VERT partition types.
+  active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+                                                        av1_active_v_edge };
+
+  // Indicates edge blocks for HORZ and VERT partition types.
+  const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+                                                  blk_params.has_cols };
+
+  // Initialize pc tree context for HORZ and VERT partition types.
+  PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+    { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+    { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+  };
+
+  // Loop over rectangular partition types.
+  for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+    assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                   !part_search_state->partition_rect_allowed[i]));
+
+    // Check if the HORZ / VERT partition search is to be performed.
+    if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+                              mi_pos_rect[i][0][i]))
+      continue;
+
+    // Sub-partition idx.
+    int sub_part_idx = 0;
+    PARTITION_TYPE partition_type = rect_partition_type[i];
+    blk_params.subsize =
+        get_partition_subsize(blk_params.bsize, partition_type);
+    assert(blk_params.subsize <= BLOCK_LARGEST);
+    av1_init_rd_stats(sum_rdc);
+    for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+      if (cur_ctx[i][j][0] == NULL) {
+        cur_ctx[i][j][0] =
+            av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+      }
+    }
+    sum_rdc->rate = part_search_state->partition_cost[partition_type];
+    sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+    PartitionTimingStats *part_timing_stats =
+        &part_search_state->part_timing_stats;
+    if (best_rdc->rdcost - sum_rdc->rdcost >= 0) {
+      start_partition_block_timer(part_timing_stats, partition_type);
+    }
+#endif
+
+    // First sub-partition evaluation in HORZ / VERT partition type.
+    rd_pick_rect_partition(
+        cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+        best_rdc, 0, mi_pos_rect[i][sub_part_idx][0],
+        mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+
+    // Start of second sub-partition evaluation.
+    // Evaluate second sub-partition if the first sub-partition cost
+    // is less than the best cost and if it is not an edge block.
+    if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) {
+      const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[PLANE_TYPE_Y] == 0 &&
+          pmi->palette_size[PLANE_TYPE_UV] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED)
+          part_search_state->is_rect_ctx_is_ready[i] = 1;
+      }
+      av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row,
+                       blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+                        blk_params.subsize, NULL);
+
+      // Second sub-partition evaluation in HORZ / VERT partition type.
+      sub_part_idx = 1;
+      rd_pick_rect_partition(
+          cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+          best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
+          mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+    }
+    // Update HORZ / VERT best partition.
+    if (sum_rdc->rdcost < best_rdc->rdcost) {
+      sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist);
+      if (sum_rdc->rdcost < best_rdc->rdcost) {
+        *best_rdc = *sum_rdc;
+        part_search_state->found_best_partition = true;
+        pc_tree->partitioning = partition_type;
+      }
+    } else {
+      // Update HORZ / VERT win flag.
+      if (rect_part_win_info != NULL)
+        rect_part_win_info->rect_part_win[i] = false;
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (part_timing_stats->timer_is_on) {
+      end_partition_block_timer(part_timing_stats, partition_type,
+                                sum_rdc->rdcost);
+    }
+#endif
+    av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                        blk_params.bsize, av1_num_planes(cm));
+  }
+}
+
+// AB partition type evaluation.
+static void rd_pick_ab_part(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+    const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+    const MB_MODE_INFO **mode_cache) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+  int64_t this_rdcost = 0;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  {
+    RD_STATS tmp_sum_rdc;
+    av1_init_rd_stats(&tmp_sum_rdc);
+    tmp_sum_rdc.rate = part_search_state->partition_cost[part_type];
+    tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+    if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) {
+      start_partition_block_timer(part_timing_stats, part_type);
+    }
+  }
+#endif
+
+  // Test this partition and update the best partition.
+  const bool find_best_ab_part = rd_test_partition3(
+      cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row,
+      mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache);
+  part_search_state->found_best_partition |= find_best_ab_part;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    if (!find_best_ab_part) this_rdcost = INT64_MAX;
+    end_partition_block_timer(part_timing_stats, part_type, this_rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// Set mode search context.
+static AOM_INLINE void set_mode_search_ctx(
+    PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
+    PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) {
+  mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0];
+  mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0];
+
+  if (is_ctx_ready[HORZ_A][0])
+    mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none;
+
+  if (is_ctx_ready[VERT_A][0])
+    mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none;
+
+  if (is_ctx_ready[HORZ_A][1])
+    mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
+}
+
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+    const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) {
+  if (ctx && ctx->rd_stats.rate < INT_MAX) {
+    *dst_mode = &ctx->mic;
+  } else {
+    *dst_mode = NULL;
+  }
+}
+
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+    const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) {
+  if (pc_tree) {
+    copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+  } else {
+    *dst_mode = NULL;
+  }
+}
+
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+    const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree,
+    AB_PART_TYPE ab_part_type) {
+  switch (ab_part_type) {
+    case HORZ_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->horizontal[1]);
+      break;
+    case HORZ_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->horizontal[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    case VERT_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->vertical[1]);
+      break;
+    case VERT_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->vertical[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    default: assert(0 && "Invalid ab partition type!\n");
+  }
+}
+
+// AB Partitions type search.
+static void ab_partitions_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PartitionSearchState *part_search_state,
+    RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info,
+    int pb_source_variance, int ext_partition_allowed,
+    const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
+  if (part_search_state->terminate_partition_search) {
+    return;
+  }
+
+  int ab_partitions_allowed[NUM_AB_PARTS];
+  // Prune AB partitions
+  av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost,
+                          rect_part_win_info, ext_partition_allowed,
+                          part_search_state, ab_partitions_allowed);
+
+  // Flags to indicate whether the mode search is done.
+  const int is_ctx_ready[NUM_AB_PARTS][2] = {
+    { part_search_state->is_split_ctx_is_ready[0],
+      part_search_state->is_split_ctx_is_ready[1] },
+    { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+    { part_search_state->is_split_ctx_is_ready[0], 0 },
+    { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+  };
+
+  // Current partition context.
+  PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+                                                      pc_tree->horizontalb,
+                                                      pc_tree->verticala,
+                                                      pc_tree->verticalb };
+
+  // Context of already evaluted partition types.
+  PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+  // Set context of already evaluted partition types.
+  set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+  // Array of sub-partition size of AB partition types.
+  const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_HORZ_A) },
+    { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 },
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_VERT_A) },
+    { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 }
+  };
+
+  // Array of mi_row, mi_col positions corresponds to each sub-partition in AB
+  // partition types.
+  const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, mi_col } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { mi_row, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+  };
+
+  // Loop over AB partition types.
+  for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+       ab_part_type++) {
+    const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+    // Check if the AB partition search is to be performed.
+    if (!ab_partitions_allowed[ab_part_type]) {
+      continue;
+    }
+
+    blk_params.subsize = get_partition_subsize(bsize, part_type);
+    for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+      // Set AB partition context.
+      cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+          cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      // Set mode as not ready.
+      cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+    }
+
+    if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+      // We can copy directly the mode search results if we have already
+      // searched the current block and the contexts match.
+      if (is_ctx_ready[ab_part_type][0]) {
+        av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+                              mode_srch_ctx[ab_part_type][0][0]);
+        cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+        cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+        if (is_ctx_ready[ab_part_type][1]) {
+          av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+                                mode_srch_ctx[ab_part_type][1][0]);
+          cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+          cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+        }
+      }
+    }
+
+    // Even if the contexts don't match, we can still speed up by reusing the
+    // previous prediction mode.
+    const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL };
+    if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) {
+      set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+    }
+
+    // Evaluation of AB partition type.
+    rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
+                    cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
+                    ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
+                    part_type, mode_cache);
+  }
+}
+
+// Set mi positions for HORZ4 / VERT4 sub-block partitions.
+static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES],
+                                  int mi_pos[SUB_PARTITIONS_PART4][2],
+                                  const int mi_row, const int mi_col) {
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) {
+    mi_pos[i][0] = mi_row + i * inc_step[HORZ4];
+    mi_pos[i][1] = mi_col + i * inc_step[VERT4];
+  }
+}
+
+// Set context and RD cost for HORZ4 / VERT4 partition types.
+static void set_4_part_ctx_and_rdcost(
+    MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td,
+    PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
+    BLOCK_SIZE bsize) {
+  // Initialize sum_rdc RD cost structure.
+  av1_init_rd_stats(&part_search_state->sum_rdc);
+  const int subsize = get_partition_subsize(bsize, partition_type);
+  part_search_state->sum_rdc.rate =
+      part_search_state->partition_cost[partition_type];
+  part_search_state->sum_rdc.rdcost =
+      RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i)
+    cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+}
+
+// Partition search of HORZ4 / VERT4 partition types.
+static void rd_pick_4partition(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  // mi positions needed for HORZ4 and VERT4 partition types.
+  int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows,
+                                        cm->mi_params.mi_cols };
+  const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4);
+  int mi_pos[SUB_PARTITIONS_PART4][2];
+
+  blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
+  // Set partition context and RD cost.
+  set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state,
+                            partition_type, blk_params.bsize);
+  // Set mi positions for sub-block sizes.
+  set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) {
+    start_partition_block_timer(part_timing_stats, partition_type);
+  }
+#endif
+  // Loop over sub-block partitions.
+  for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break;
+
+    // Sub-block evaluation of Horz4 / Vert4 partition type.
+    cur_part_ctx[i]->rd_mode_is_ready = 0;
+    if (!rd_try_subblock(
+            cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+            mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+            &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+      av1_invalid_rd_stats(&part_search_state->sum_rdc);
+      break;
+    }
+  }
+
+  // Calculate the total cost and update the best partition.
+  av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = part_search_state->sum_rdc;
+    part_search_state->found_best_partition = true;
+    pc_tree->partitioning = partition_type;
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    end_partition_block_timer(part_timing_stats, partition_type,
+                              part_search_state->sum_rdc.rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                      blk_params.bsize, av1_num_planes(cm));
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+    AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+  // Count of child blocks in which HORZ or VERT partition has won
+  int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+  // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+  // split partiitons.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+
+  for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+    if (!(cpi->sf.part_sf.prune_ext_part_using_split_info &&
+          part4_search_allowed[cur_part[i]]))
+      continue;
+    // Loop over split partitions.
+    // Get rectangular partitions winner info of split partitions.
+    for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+      num_child_rect_win[i] +=
+          (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+                                                                         : 0;
+    if (num_child_rect_win[i] < num_win_thresh) {
+      part4_search_allowed[cur_part[i]] = 0;
+    }
+  }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int pb_source_variance, int ext_partition_allowed,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+
+  // Disable 4-way partition search flags for width less than a multiple of the
+  // minimum partition width.
+  if (blk_params.width < (blk_params.min_partition_size_1d
+                          << cpi->sf.part_sf.prune_part4_search)) {
+    part4_search_allowed[HORZ4] = 0;
+    part4_search_allowed[VERT4] = 0;
+    return;
+  }
+
+  const int bsize = blk_params.bsize;
+  PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+                                               PARTITION_VERT_4 };
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+  // PARTITION_VERT_4 for this block. This is almost the same as
+  // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+  // blocks, so we require that bsize is not BLOCK_128X128.
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+
+  for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+    part4_search_allowed[i] =
+        partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+        get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+                             part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+  // Pruning: pruning out 4-way partitions based on the current best partition.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                    pc_tree->partitioning == PARTITION_HORZ_A ||
+                                    pc_tree->partitioning == PARTITION_HORZ_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+    part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+                                    pc_tree->partitioning == PARTITION_VERT_A ||
+                                    pc_tree->partitioning == PARTITION_VERT_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+  }
+
+  // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of
+  // sub-blocks from basic partition types.
+  if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed &&
+      part_search_state->partition_rect_allowed[HORZ] &&
+      part_search_state->partition_rect_allowed[VERT]) {
+    av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost,
+                             part_search_state, part4_search_allowed,
+                             pb_source_variance);
+  }
+
+  // Pruning: pruning out 4-way partitions based on the number of horz/vert wins
+  // in the current block and sub-blocks in PARTITION_SPLIT.
+  prune_4_partition_using_split_info(cpi, x, part_search_state,
+                                     part4_search_allowed);
+}
+
+// Set params needed for PARTITION_NONE search.
+static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
+                                      MACROBLOCK *x, PC_TREE *pc_tree,
+                                      PartitionSearchState *part_search_state,
+                                      RD_STATS *best_remain_rdcost,
+                                      RD_STATS *best_rdc, int *pt_cost) {
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS partition_rdcost;
+  // Set PARTITION_NONE context.
+  if (pc_tree->none == NULL)
+    pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf);
+
+  // Set PARTITION_NONE type cost.
+  if (part_search_state->partition_none_allowed) {
+    if (blk_params.bsize_at_least_8x8) {
+      *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX
+                     ? part_search_state->partition_cost[PARTITION_NONE]
+                     : 0;
+    }
+
+    // Initialize the RD stats structure.
+    av1_init_rd_stats(&partition_rdcost);
+    partition_rdcost.rate = *pt_cost;
+    av1_rd_cost_update(x->rdmult, &partition_rdcost);
+    av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost,
+                             best_remain_rdcost);
+  }
+}
+
+// Skip other partitions based on PARTITION_NONE rd cost.
+static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                        PICK_MODE_CONTEXT *ctx_none,
+                                        PartitionSearchState *part_search_state,
+                                        RD_STATS *best_rdc,
+                                        unsigned int *pb_source_variance) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *this_rdc = &part_search_state->this_rdc;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  if (!frame_is_intra_only(cm) &&
+      (part_search_state->do_square_split ||
+       part_search_state->do_rectangular_split) &&
+      !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+    const int use_ml_based_breakout =
+        bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+        bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
+    if (use_ml_based_breakout) {
+      av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd,
+                              part_search_state);
+    }
+
+    // Adjust dist breakout threshold according to the partition size.
+    const int64_t dist_breakout_thr =
+        cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+        ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+         (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+    const int rate_breakout_thr =
+        cpi->sf.part_sf.partition_search_breakout_rate_thr *
+        num_pels_log2_lookup[bsize];
+    // If all y, u, v transform blocks in this partition are skippable,
+    // and the dist & rate are within the thresholds, the partition
+    // search is terminated for current branch of the partition search
+    // tree. The dist & rate thresholds are set to 0 at speed 0 to
+    // disable the early termination at that speed.
+    if (best_rdc->dist < dist_breakout_thr &&
+        best_rdc->rate < rate_breakout_thr) {
+      part_search_state->do_square_split = 0;
+      part_search_state->do_rectangular_split = 0;
+    }
+  }
+
+  // Early termination: using simple_motion_search features and the
+  // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a
+  // decision on early terminating at PARTITION_NONE.
+  if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
+      !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+      av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX &&
+      this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX &&
+      this_rdc->rate >= 0 &&
+      (part_search_state->do_square_split ||
+       part_search_state->do_rectangular_split)) {
+    av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc,
+                                             part_search_state);
+  }
+}
+
+// Decide early termination and rectangular partition pruning
+// based on PARTITION_NONE and PARTITION_SPLIT costs.
+static void prune_partitions_after_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int64_t part_none_rd, int64_t part_split_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  // Early termination: using the rd costs of PARTITION_NONE and subblocks
+  // from PARTITION_SPLIT to determine an early breakout.
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      !frame_is_intra_only(cm) &&
+      !part_search_state->terminate_partition_search &&
+      part_search_state->do_rectangular_split &&
+      (part_search_state->partition_rect_allowed[HORZ] ||
+       part_search_state->partition_rect_allowed[VERT])) {
+    av1_ml_early_term_after_split(
+        cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd,
+        part_search_state->split_rd, part_search_state);
+  }
+
+  // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
+  // to prune out rectangular partitions in some directions.
+  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) &&
+      (part_search_state->partition_rect_allowed[HORZ] ||
+       part_search_state->partition_rect_allowed[VERT]) &&
+      !(part_search_state->prune_rect_part[HORZ] ||
+        part_search_state->prune_rect_part[VERT]) &&
+      !part_search_state->terminate_partition_search) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
+                         bsize);
+    av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost,
+                                part_search_state->none_rd,
+                                part_search_state->split_rd, part_search_state);
+  }
+}
+
+// PARTITION_NONE search.
+static void none_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x,
+    PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *this_rdc = &part_search_state->this_rdc;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  if (part_search_state->terminate_partition_search ||
+      !part_search_state->partition_none_allowed)
+    return;
+
+  int pt_cost = 0;
+  RD_STATS best_remain_rdcost;
+  av1_invalid_rd_stats(&best_remain_rdcost);
+
+  // Set PARTITION_NONE context and cost.
+  set_none_partition_params(cpi, td, x, pc_tree, part_search_state,
+                            &best_remain_rdcost, best_rdc, &pt_cost);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Timer start for partition None.
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  if (best_remain_rdcost.rdcost >= 0) {
+    start_partition_block_timer(part_timing_stats, PARTITION_NONE);
+  }
+#endif
+  // PARTITION_NONE evaluation and cost update.
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
+                bsize, pc_tree->none, best_remain_rdcost);
+
+  av1_rd_cost_update(x->rdmult, this_rdc);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Timer end for partition None.
+  if (part_timing_stats->timer_is_on) {
+    RD_STATS tmp_rdc;
+    av1_init_rd_stats(&tmp_rdc);
+    if (this_rdc->rate != INT_MAX) {
+      tmp_rdc.rate = this_rdc->rate;
+      tmp_rdc.dist = this_rdc->dist;
+      tmp_rdc.rdcost = this_rdc->rdcost;
+      if (blk_params.bsize_at_least_8x8) {
+        tmp_rdc.rate += pt_cost;
+        tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
+      }
+    }
+    end_partition_block_timer(part_timing_stats, PARTITION_NONE,
+                              tmp_rdc.rdcost);
+  }
+#endif
+  *pb_source_variance = x->source_variance;
+  if (none_rd) *none_rd = this_rdc->rdcost;
+  part_search_state->none_rd = this_rdc->rdcost;
+  if (this_rdc->rate != INT_MAX) {
+    // Record picked ref frame to prune ref frames for other partition types.
+    if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+      const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
+      av1_update_picked_ref_frames_mask(
+          x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col);
+    }
+
+    // Calculate the total cost and update the best partition.
+    if (blk_params.bsize_at_least_8x8) {
+      this_rdc->rate += pt_cost;
+      this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
+    }
+    *part_none_rd = this_rdc->rdcost;
+    if (this_rdc->rdcost < best_rdc->rdcost) {
+      *best_rdc = *this_rdc;
+      part_search_state->found_best_partition = true;
+      if (blk_params.bsize_at_least_8x8) {
+        pc_tree->partitioning = PARTITION_NONE;
+      }
+
+      // Disable split and rectangular partition search
+      // based on PARTITION_NONE cost.
+      prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none,
+                                  part_search_state, best_rdc,
+                                  pb_source_variance);
+    }
+  }
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// PARTITION_SPLIT search.
+static void split_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+  RD_STATS sum_rdc = part_search_state->sum_rdc;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  // Check if partition split is allowed.
+  if (part_search_state->terminate_partition_search ||
+      !part_search_state->do_square_split)
+    return;
+
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (pc_tree->split[i] == NULL)
+      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+    pc_tree->split[i]->index = i;
+  }
+
+  // Initialization of this partition RD stats.
+  av1_init_rd_stats(&sum_rdc);
+  sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+  int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  if (best_rdc->rdcost - sum_rdc.rdcost >= 0) {
+    start_partition_block_timer(part_timing_stats, PARTITION_SPLIT);
+  }
+#endif
+  // Recursive partition search on 4 sub-blocks.
+  for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost;
+       ++idx) {
+    const int x_idx = (idx & 1) * blk_params.mi_step;
+    const int y_idx = (idx >> 1) * blk_params.mi_step;
+
+    if (mi_row + y_idx >= mi_params->mi_rows ||
+        mi_col + x_idx >= mi_params->mi_cols)
+      continue;
+
+    pc_tree->split[idx]->index = idx;
+    int64_t *p_split_rd = &part_search_state->split_rd[idx];
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+
+    int curr_quad_tree_idx = 0;
+    if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+      curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx;
+      part_search_state->intra_part_info->quad_tree_idx =
+          4 * curr_quad_tree_idx + idx + 1;
+    }
+    // Split partition evaluation of corresponding idx.
+    // If the RD cost exceeds the best cost then do not
+    // evaluate other split sub-partitions.
+    SIMPLE_MOTION_DATA_TREE *const sms_tree_split =
+        (sms_tree == NULL) ? NULL : sms_tree->split[idx];
+    if (!av1_rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+            &part_search_state->this_rdc, best_remain_rdcost,
+            pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode,
+            &part_search_state->split_part_rect_win[idx])) {
+      av1_invalid_rd_stats(&sum_rdc);
+      break;
+    }
+    if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+      part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx;
+    }
+
+    sum_rdc.rate += part_search_state->this_rdc.rate;
+    sum_rdc.dist += part_search_state->this_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+
+    // Set split ctx as ready for use.
+    if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+                     pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+      const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED)
+          part_search_state->is_split_ctx_is_ready[idx] = 1;
+      }
+    }
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    end_partition_block_timer(part_timing_stats, PARTITION_SPLIT,
+                              sum_rdc.rdcost);
+  }
+#endif
+  const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT);
+
+  // Calculate the total cost and update the best partition.
+  *part_split_rd = sum_rdc.rdcost;
+  if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) {
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    if (sum_rdc.rdcost < best_rdc->rdcost) {
+      *best_rdc = sum_rdc;
+      part_search_state->found_best_partition = true;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    }
+  } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+    // Skip rectangular partition test when partition type none gives better
+    // rd than partition type split.
+    if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+      const int partition_none_valid = part_search_state->none_rd > 0;
+      const int partition_none_better =
+          part_search_state->none_rd < sum_rdc.rdcost;
+      part_search_state->do_rectangular_split &=
+          !(partition_none_valid && partition_none_better);
+    }
+  }
+  av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// The max number of nodes in the partition tree.
+// The number of leaf nodes is (128x128) / (4x4) = 1024.
+// The number of All possible parent nodes is 1 + 2 + ... + 512 = 1023.
+#define NUM_NODES 2048
+
+static void write_partition_tree(AV1_COMP *const cpi,
+                                 const PC_TREE *const pc_tree,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+           cpi->sb_counter, 0);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition type with BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  const int num_leafs = last_idx;
+  fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+  // Write partitions for each node.
+  q_idx = 0;
+  last_idx = 1;
+  num_nodes = 1;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    fprintf(pfile, ",%d", node->partitioning);
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  fprintf(pfile, "\n");
+
+  fclose(pfile);
+}
+
+static void verify_write_partition_tree(const AV1_COMP *const cpi,
+                                        const PC_TREE *const pc_tree,
+                                        const BLOCK_SIZE bsize,
+                                        const int config_id, const int mi_row,
+                                        const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+           path, cpi->sb_counter, config_id);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition type with BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    if (node != NULL && node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  const int num_leafs = last_idx;
+  fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+  // Write partitions for each node.
+  q_idx = 0;
+  last_idx = 1;
+  num_nodes = 1;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    if (node != NULL) {  // suppress warning
+      fprintf(pfile, ",%d", node->partitioning);
+      if (node->partitioning == PARTITION_SPLIT) {
+        for (int i = 0; i < 4; ++i) {
+          tree_node_queue[last_idx] = node->split[i];
+          ++last_idx;
+        }
+        num_nodes += 4;
+      }
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  fprintf(pfile, "\n");
+
+  fclose(pfile);
+}
+
+static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+                               const int config_id) {
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+           cpi->sb_counter, config_id);
+  FILE *pfile = fopen(filename, "r");
+  if (pfile == NULL) {
+    printf("Can't find the file: %s\n", filename);
+    exit(0);
+  }
+
+  int read_bsize;
+  int num_nodes;
+  int num_configs;
+  fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs);
+  assert(read_bsize == cpi->common.seq_params->sb_size);
+  BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+  assert(bsize == pc_tree->block_size);
+
+  PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int last_idx = 1;
+  int q_idx = 0;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    int partitioning;
+    fscanf(pfile, ",%d", &partitioning);
+    assert(partitioning >= PARTITION_NONE &&
+           partitioning < EXT_PARTITION_TYPES);
+    PC_TREE *node = tree_node_queue[q_idx];
+    if (node != NULL) {
+      node->partitioning = partitioning;
+      bsize = node->block_size;
+    }
+    if (partitioning == PARTITION_SPLIT) {
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      for (int i = 0; i < 4; ++i) {
+        if (node != NULL) {  // Suppress warning
+          node->split[i] = av1_alloc_pc_tree_node(subsize);
+          node->split[i]->index = i;
+          tree_node_queue[last_idx] = node->split[i];
+          ++last_idx;
+        }
+      }
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  fclose(pfile);
+
+  return num_configs;
+}
+
+static RD_STATS rd_search_for_fixed_partition(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col,
+    const BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  RD_STATS best_rdc;
+  av1_invalid_rd_stats(&best_rdc);
+  int sum_subblock_rate = 0;
+  int64_t sum_subblock_dist = 0;
+  PartitionSearchState part_search_state;
+  init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+                                     bsize);
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c).
+  PartitionBlkParams blk_params = part_search_state.part_blk_params;
+  if (!av1_blk_has_rows_and_cols(&blk_params))
+    set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  (void)orig_rdmult;
+
+  // Set the context.
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  assert(bsize < BLOCK_SIZES_ALL);
+  unsigned int pb_source_variance = UINT_MAX;
+  int64_t part_none_rd = INT64_MAX;
+  int64_t none_rd = INT64_MAX;
+  int inc_step[NUM_PART4_TYPES] = { 0 };
+  if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4;
+  if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+                            &part_search_state, &best_rdc, &pb_source_variance,
+                            &none_rd, &part_none_rd);
+      break;
+    case PARTITION_HORZ:
+      rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+                                   &part_search_state, &best_rdc, NULL, HORZ,
+                                   HORZ);
+      break;
+    case PARTITION_VERT:
+      rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+                                   &part_search_state, &best_rdc, NULL, VERT,
+                                   VERT);
+      break;
+    case PARTITION_HORZ_A:
+      ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                           &part_search_state, &best_rdc, NULL,
+                           pb_source_variance, 1, HORZ_A, HORZ_A);
+      break;
+    case PARTITION_HORZ_B:
+      ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                           &part_search_state, &best_rdc, NULL,
+                           pb_source_variance, 1, HORZ_B, HORZ_B);
+      break;
+    case PARTITION_VERT_A:
+      ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                           &part_search_state, &best_rdc, NULL,
+                           pb_source_variance, 1, VERT_A, VERT_A);
+      break;
+    case PARTITION_VERT_B:
+      ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                           &part_search_state, &best_rdc, NULL,
+                           pb_source_variance, 1, VERT_B, VERT_B);
+      break;
+    case PARTITION_HORZ_4:
+      rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                         pc_tree->horizontal4, &part_search_state, &best_rdc,
+                         inc_step, PARTITION_HORZ_4);
+      break;
+    case PARTITION_VERT_4:
+      rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                         pc_tree->vertical4, &part_search_state, &best_rdc,
+                         inc_step, PARTITION_VERT_4);
+      break;
+    case PARTITION_SPLIT:
+      for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) {
+        const BLOCK_SIZE subsize =
+            get_partition_subsize(bsize, PARTITION_SPLIT);
+        assert(subsize < BLOCK_SIZES_ALL);
+        const int next_mi_row =
+            idx < 2 ? mi_row : mi_row + mi_size_high[subsize];
+        const int next_mi_col =
+            idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize];
+        if (next_mi_row >= cm->mi_params.mi_rows ||
+            next_mi_col >= cm->mi_params.mi_cols) {
+          continue;
+        }
+        const RD_STATS subblock_rdc = rd_search_for_fixed_partition(
+            cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row,
+            next_mi_col, subsize, pc_tree->split[idx]);
+        sum_subblock_rate += subblock_rdc.rate;
+        sum_subblock_dist += subblock_rdc.dist;
+      }
+      best_rdc.rate = sum_subblock_rate;
+      best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT];
+      best_rdc.dist = sum_subblock_dist;
+      best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist);
+      break;
+    default: assert(0 && "invalid partition type."); exit(0);
+  }
+  // Note: it is necessary to restore context information.
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize != cm->seq_params->sb_size) {
+    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+              pc_tree, NULL);
+  }
+  x->rdmult = orig_rdmult;
+
+  return best_rdc;
+}
+
+static void prepare_sb_features_before_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+    int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+  av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                        bsize, features);
+  collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
+}
+
+static void update_partition_stats(const RD_STATS *const this_rdcost,
+                                   aom_partition_stats_t *stats) {
+  stats->rate = this_rdcost->rate;
+  stats->dist = this_rdcost->dist;
+  stats->rdcost = this_rdcost->rdcost;
+}
+
+static void build_pc_tree_from_part_decision(
+    const aom_partition_decision_t *partition_decision,
+    const BLOCK_SIZE this_bsize, PC_TREE *pc_tree) {
+  BLOCK_SIZE bsize = this_bsize;
+  int num_nodes = partition_decision->num_nodes;
+  PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int last_idx = 1;
+  int q_idx = 0;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const int partitioning = partition_decision->partition_decision[q_idx];
+    assert(partitioning >= PARTITION_NONE &&
+           partitioning < EXT_PARTITION_TYPES);
+    PC_TREE *node = tree_node_queue[q_idx];
+    if (node != NULL) {
+      node->partitioning = partitioning;
+      bsize = node->block_size;
+    }
+    if (partitioning == PARTITION_SPLIT) {
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      for (int i = 0; i < 4; ++i) {
+        if (node != NULL) {  // Suppress warning
+          node->split[i] = av1_alloc_pc_tree_node(subsize);
+          node->split[i]->index = i;
+          tree_node_queue[last_idx] = node->split[i];
+          ++last_idx;
+        }
+      }
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+}
+
+// The ML model needs to provide the whole decision tree for the superblock.
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           TokenExtra **tp,
+                                           SIMPLE_MOTION_DATA_TREE *sms_root,
+                                           int mi_row, int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+  PC_TREE *pc_tree;
+
+  // rd mode search (dry run) for a valid partition decision from the ml model.
+  aom_partition_decision_t partition_decision;
+  do {
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+
+    // First, let's take the easy approach.
+    // We require that the ml model has to provide partition decisions for the
+    // whole superblock.
+    pc_tree = av1_alloc_pc_tree_node(bsize);
+    build_pc_tree_from_part_decision(&partition_decision, bsize, pc_tree);
+
+    const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+    aom_partition_stats_t stats;
+    update_partition_stats(&this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+    }
+  } while (!partition_decision.is_final_decision);
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            pc_tree, NULL);
+
+  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+
+  return true;
+}
+
+// Use a bitmask to represent the valid partition types for the current
+// block. "1" represents the corresponding partition type is vaild.
+// The least significant bit represents "PARTITION_NONE", the
+// largest significant bit represents "PARTITION_VERT_4", follow
+// the enum order for PARTITION_TYPE in "enums.h"
+static int get_valid_partition_types(
+    const AV1_COMP *const cpi,
+    const PartitionSearchState *const part_search_state,
+    const BLOCK_SIZE bsize) {
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  int valid_types = 0;
+  // PARTITION_NONE
+  valid_types |= (part_search_state->partition_none_allowed << 0);
+  // PARTITION_HORZ
+  valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+  // PARTITION_VERT
+  valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+  // PARTITION_SPLIT
+  valid_types |= (part_search_state->do_square_split << 3);
+  // PARTITION_HORZ_A
+  const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+                                    av1_blk_has_rows_and_cols(&blk_params);
+  const int horzab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[HORZ];
+  valid_types |= (horzab_partition_allowed << 4);
+  // PARTITION_HORZ_B
+  valid_types |= (horzab_partition_allowed << 5);
+  // PARTITION_VERT_A
+  const int vertab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[VERT];
+  valid_types |= (vertab_partition_allowed << 6);
+  // PARTITION_VERT_B
+  valid_types |= (vertab_partition_allowed << 7);
+  // PARTITION_HORZ_4
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+  const int horz4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (horz4_allowed << 8);
+  // PARTITION_VERT_4
+  const int vert4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (vert4_allowed << 9);
+
+  return valid_types;
+}
+
+static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+                                    const BLOCK_SIZE bsize, const int mi_row,
+                                    const int mi_col, int64_t *intra_cost,
+                                    int64_t *inter_cost, int64_t *mc_dep_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats is not established, early return
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+
+  int64_t sum_intra_cost = 0;
+  int64_t sum_inter_cost = 0;
+  int64_t sum_mc_dep_cost = 0;
+  for (int row = 0; row < mi_height; row += step) {
+    for (int col = 0; col < mi_width; col += step) {
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                     tpl_data->tpl_stats_block_mis_log2)];
+      sum_intra_cost += this_stats->intra_cost;
+      sum_inter_cost += this_stats->inter_cost;
+      const int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      sum_mc_dep_cost += mc_dep_delta;
+    }
+  }
+
+  *intra_cost = sum_intra_cost;
+  *inter_cost = sum_inter_cost;
+  *mc_dep_cost = sum_mc_dep_cost;
+}
+
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+                                TileDataEnc *tile_data, TokenExtra **tp,
+                                SIMPLE_MOTION_DATA_TREE *sms_root,
+                                PC_TREE *pc_tree, int mi_row, int mi_col,
+                                const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+    return false;
+  }
+  aom_partition_decision_t partition_decision;
+  do {
+    PartitionSearchState part_search_state;
+    // Initialization of state variables used in partition search.
+    // TODO(chengchen): check if there is hidden conditions that don't allow
+    // all possible partition types.
+    init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+                                       mi_col, bsize);
+    // Override partition costs at the edges of the frame in the same
+    // way as in read_partition (see decodeframe.c).
+    PartitionBlkParams blk_params = part_search_state.part_blk_params;
+    if (!av1_blk_has_rows_and_cols(&blk_params))
+      set_partition_cost_for_edge_blk(cm, &part_search_state);
+    const int orig_rdmult = x->rdmult;
+    setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+    const int valid_partition_types =
+        get_valid_partition_types(cpi, &part_search_state, bsize);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+                                      cm->quant_params.base_qindex);
+    // RD multiplier
+    const int rdmult = x->rdmult;
+    // pyramid level
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+    x->rdmult = orig_rdmult;
+    // Neighbor information
+    const int has_above = !!xd->above_mbmi;
+    const int has_left = !!xd->left_mbmi;
+    const BLOCK_SIZE above_bsize =
+        has_above ? xd->above_mbmi->bsize : BLOCK_INVALID;
+    const BLOCK_SIZE left_bsize =
+        has_left ? xd->left_mbmi->bsize : BLOCK_INVALID;
+    const int above_block_width =
+        above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize];
+    const int above_block_height =
+        above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize];
+    const int left_block_width =
+        left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize];
+    const int left_block_height =
+        left_bsize == BLOCK_INVALID ? -1 : block_size_high[left_bsize];
+    // Prepare simple motion search stats as features
+    unsigned int block_sse = -1;
+    unsigned int block_var = -1;
+    unsigned int sub_block_sse[4] = { -1, -1, -1, -1 };
+    unsigned int sub_block_var[4] = { -1, -1, -1, -1 };
+    unsigned int horz_block_sse[2] = { -1, -1 };
+    unsigned int horz_block_var[2] = { -1, -1 };
+    unsigned int vert_block_sse[2] = { -1, -1 };
+    unsigned int vert_block_var[2] = { -1, -1 };
+    av1_prepare_motion_search_features_block(
+        cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types,
+        &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse,
+        horz_block_var, vert_block_sse, vert_block_var);
+    // Prepare tpl stats for the current block as features
+    int64_t tpl_intra_cost = -1;
+    int64_t tpl_inter_cost = -1;
+    int64_t tpl_mc_dep_cost = -1;
+    prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost,
+                            &tpl_inter_cost, &tpl_mc_dep_cost);
+
+    aom_partition_features_t features;
+    features.mi_row = mi_row;
+    features.mi_col = mi_col;
+    features.frame_width = cpi->frame_info.frame_width;
+    features.frame_height = cpi->frame_info.frame_height;
+    features.block_size = bsize;
+    features.valid_partition_types = valid_partition_types;
+    features.update_type = update_type;
+    features.qindex = qindex;
+    features.rdmult = rdmult;
+    features.pyramid_level = pyramid_level;
+    features.has_above_block = has_above;
+    features.above_block_width = above_block_width;
+    features.above_block_height = above_block_height;
+    features.has_left_block = has_left;
+    features.left_block_width = left_block_width;
+    features.left_block_height = left_block_height;
+    features.block_sse = block_sse;
+    features.block_var = block_var;
+    for (int i = 0; i < 4; ++i) {
+      features.sub_block_sse[i] = sub_block_sse[i];
+      features.sub_block_var[i] = sub_block_var[i];
+    }
+    for (int i = 0; i < 2; ++i) {
+      features.horz_block_sse[i] = horz_block_sse[i];
+      features.horz_block_var[i] = horz_block_var[i];
+      features.vert_block_sse[i] = vert_block_sse[i];
+      features.vert_block_var[i] = vert_block_var[i];
+    }
+    features.tpl_intra_cost = tpl_intra_cost;
+    features.tpl_inter_cost = tpl_inter_cost;
+    features.tpl_mc_dep_cost = tpl_mc_dep_cost;
+    av1_ext_part_send_features(ext_part_controller, &features);
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+    pc_tree->partitioning = partition_decision.current_decision;
+
+    av1_init_rd_stats(this_rdcost);
+    if (partition_decision.current_decision == PARTITION_SPLIT) {
+      assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8);
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      RD_STATS split_rdc[SUB_PARTITIONS_SPLIT];
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        av1_init_rd_stats(&split_rdc[i]);
+        if (pc_tree->split[i] == NULL)
+          pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+        pc_tree->split[i]->index = i;
+      }
+      const int orig_rdmult_tmp = x->rdmult;
+      setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+      // TODO(chengchen): check boundary conditions
+      // top-left
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
+                          mi_row, mi_col, subsize, &split_rdc[0]);
+      // top-right
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1],
+                          mi_row, mi_col + mi_size_wide[subsize], subsize,
+                          &split_rdc[1]);
+      // bottom-left
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+                          mi_row + mi_size_high[subsize], mi_col, subsize,
+                          &split_rdc[2]);
+      // bottom_right
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+                          mi_row + mi_size_high[subsize],
+                          mi_col + mi_size_wide[subsize], subsize,
+                          &split_rdc[3]);
+      this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+      // problem is here, the rdmult is different from the rdmult in sub block.
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        this_rdcost->rate += split_rdc[i].rate;
+        this_rdcost->dist += split_rdc[i].dist;
+        av1_rd_cost_update(x->rdmult, this_rdcost);
+      }
+      x->rdmult = orig_rdmult_tmp;
+    } else {
+      *this_rdcost = rd_search_for_fixed_partition(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+    }
+
+    aom_partition_stats_t stats;
+    update_partition_stats(this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      if (partition_decision.current_decision == PARTITION_SPLIT) {
+        for (int i = 0; i < 4; ++i) {
+          if (pc_tree->split[i] != NULL) {
+            av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0,
+                                       0);
+            pc_tree->split[i] = NULL;
+          }
+        }
+      }
+    }
+  } while (!partition_decision.is_final_decision);
+
+  return true;
+}
+
+// The ML model only needs to make decisions for the current block each time.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+                                        TileDataEnc *tile_data, TokenExtra **tp,
+                                        SIMPLE_MOTION_DATA_TREE *sms_root,
+                                        int mi_row, int mi_col,
+                                        const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+  PC_TREE *pc_tree;
+  pc_tree = av1_alloc_pc_tree_node(bsize);
+
+  RD_STATS rdcost;
+  const bool valid_partition =
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree, mi_row,
+                          mi_col, bsize, &rdcost);
+  if (!valid_partition) {
+    return false;
+  }
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            pc_tree, NULL);
+
+  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+
+  return true;
+}
+
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+                             TileDataEnc *tile_data, TokenExtra **tp,
+                             SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+                             int mi_col, const BLOCK_SIZE bsize,
+                             RD_STATS *best_rd_cost) {
+  if (cpi->ext_part_controller.ready) {
+    bool valid_search = true;
+    const aom_ext_part_decision_mode_t decision_mode =
+        av1_get_ext_part_decision_mode(&cpi->ext_part_controller);
+    if (decision_mode == AOM_EXT_PART_WHOLE_TREE) {
+      valid_search = ml_partition_search_whole_tree(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+    } else if (decision_mode == AOM_EXT_PART_RECURSIVE) {
+      valid_search = ml_partition_search_partial(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+    } else {
+      assert(0 && "Unknown decision mode.");
+      return false;
+    }
+    if (!valid_search) {
+      assert(0 && "Invalid search from ML model, partition search failed.");
+      exit(0);
+    }
+    return true;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  int best_idx = 0;
+  int64_t min_rdcost = INT64_MAX;
+  int num_configs;
+  RD_STATS *rdcost = NULL;
+  int i = 0;
+  do {
+    PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
+    num_configs = read_partition_tree(cpi, pc_tree, i);
+    if (i == 0) {
+      CHECK_MEM_ERROR(cm, rdcost, aom_calloc(num_configs, sizeof(*rdcost)));
+    }
+    if (num_configs <= 0) {
+      av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+      if (rdcost != NULL) aom_free(rdcost);
+      exit(0);
+      return false;
+    }
+    verify_write_partition_tree(cpi, pc_tree, bsize, i, mi_row, mi_col);
+    // Encode the block with the given partition tree. Get rdcost and encoding
+    // time.
+    rdcost[i] = rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root,
+                                              mi_row, mi_col, bsize, pc_tree);
+
+    if (rdcost[i].rdcost < min_rdcost) {
+      min_rdcost = rdcost[i].rdcost;
+      best_idx = i;
+      *best_rd_cost = rdcost[i];
+    }
+    av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+    ++i;
+  } while (i < num_configs);
+
+  // Encode with the partition configuration with the smallest rdcost.
+  PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize);
+  read_partition_tree(cpi, pc_tree, best_idx);
+  rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, bsize, pc_tree);
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            pc_tree, NULL);
+
+  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+  aom_free(rdcost);
+  ++cpi->sb_counter;
+
+  return true;
+}
+
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                              double *var_min, double *var_max) {
+  // This functions returns a the minimum and maximum log variances for 4x4
+  // sub blocks in the current block.
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  // Initialize minimum variance to a large value and maximum variance to 0.
+  double min_var_4x4 = (double)INT_MAX;
+  double max_var_4x4 = 0.0;
+
+  for (int i = 0; i < bh; i += MI_SIZE) {
+    for (int j = 0; j < bw; j += MI_SIZE) {
+      int var;
+      // Calculate the 4x4 sub-block variance.
+      var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          x->plane[0].src.buf + (i * x->plane[0].src.stride) + j,
+          x->plane[0].src.stride, is_hbd);
+
+      // Record min and max for over-arching block
+      min_var_4x4 = AOMMIN(min_var_4x4, var);
+      max_var_4x4 = AOMMAX(max_var_4x4, var);
+    }
+  }
+  *var_min = log(1.0 + min_var_4x4 / 16.0);
+  *var_max = log(1.0 + max_var_4x4 / 16.0);
+}
+
+static AOM_INLINE void set_sms_tree_partitioning(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+  if (sms_tree == NULL) return;
+  sms_tree->partitioning = partition;
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in]    cpi                Top-level encoder structure
+* \param[in]    td                 Pointer to thread data
+* \param[in]    tile_data          Pointer to struct holding adaptive
+data/contexts/models for the tile during
+encoding
+* \param[in]    tp                 Pointer to the starting token
+* \param[in]    mi_row             Row coordinate of the block in a step size
+of MI_SIZE
+* \param[in]    mi_col             Column coordinate of the block in a step
+size of MI_SIZE
+* \param[in]    bsize              Current block size
+* \param[in]    rd_cost            Pointer to the final rd cost of the block
+* \param[in]    best_rdc           Upper bound of rd cost of a valid partition
+* \param[in]    pc_tree            Pointer to the PC_TREE node storing the
+picked partitions and mode info for the
+current block
+* \param[in]    sms_tree           Pointer to struct holding simple motion
+search data for the current block
+* \param[in]    none_rd            Pointer to the rd cost in the case of not
+splitting the current block
+* \param[in]    multi_pass_mode    SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in]    rect_part_win_info Pointer to struct storing whether horz/vert
+partition outperforms previously tested
+partitions
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+                           RD_STATS best_rdc, PC_TREE *pc_tree,
+                           SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+                           SB_MULTI_PASS_MODE multi_pass_mode,
+                           RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  const TokenExtra *const tp_orig = *tp;
+  PartitionSearchState part_search_state;
+
+  // Initialization of state variables used in partition search.
+  init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+                                     bsize);
+  PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+  set_sms_tree_partitioning(sms_tree, PARTITION_NONE);
+  if (best_rdc.rdcost < 0) {
+    av1_invalid_rd_stats(rd_cost);
+    return part_search_state.found_best_partition;
+  }
+  if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
+
+  // Override skipping rectangular partition operations for edge blocks.
+  if (none_rd) *none_rd = 0;
+  (void)*tp_orig;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Stats at the current quad tree
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state.part_timing_stats;
+  // Stats aggregated at frame level
+  FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c).
+  if (!av1_blk_has_rows_and_cols(&blk_params))
+    set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+  // Disable rectangular partitions for inner blocks when the current block is
+  // forced to only use square partitions.
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+    part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+    part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+  }
+
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (which is just
+  // leftover from encoding the previous block. Setting it to fixed pattern
+  // when debugging.
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->txfm_search_info.blk_skip, 0x77,
+         sizeof(x->txfm_search_info.blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  // Set buffers and offsets.
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  if (cpi->oxcf.mode == ALLINTRA) {
+    if (bsize == cm->seq_params->sb_size) {
+      double var_min, var_max;
+      log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+      x->intra_sb_rdmult_modifier = 128;
+      if ((var_min < 2.0) && (var_max > 4.0)) {
+        if ((var_max - var_min) > 8.0) {
+          x->intra_sb_rdmult_modifier -= 48;
+        } else {
+          x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6);
+        }
+      }
+    }
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  // Apply simple motion search for the entire super block with fixed block
+  // size, e.g., 16x16, to collect features and write to files for the
+  // external ML model.
+  // TODO(chengchen): reduce motion search. This function is similar to
+  // av1_get_max_min_partition_features().
+  if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+      bsize == cm->seq_params->sb_size) {
+    av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                          bsize, /*features=*/NULL);
+    collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+  }
+
+  // Update rd cost of the bound using the current multiplier.
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+  // Set the context.
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_prune_partitions_time);
+#endif
+  // Pruning: before searching any partition type, using source and simple
+  // motion search results to prune out unlikely partitions.
+  av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
+
+  // Pruning: eliminating partition types leading to coding block sizes outside
+  // the min and max bsize limitations set from the encoder.
+  av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+  // Partition search
+BEGIN_PARTITION_SEARCH:
+  // If a valid partition is required, usually when the first round cannot find
+  // a valid one under the cost limit after pruning, reset the limitations on
+  // partition types and intra cnn output.
+  if (x->must_find_valid_partition) {
+    reset_part_limitations(cpi, &part_search_state);
+    // Invalidate intra cnn output for key frames.
+    if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+      part_search_state.intra_part_info->quad_tree_idx = 0;
+      part_search_state.intra_part_info->cnn_output_valid = 0;
+    }
+  }
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, none_partition_search_time);
+#endif
+
+  // Further pruning or in some cases reverse pruning when allintra is set
+  // This code helps visual and in some cases metrics quality where the current
+  // block comprises at least one very low variance sub-block and at least one
+  // where the variance is much higher.
+  //
+  // The idea is that in such cases there is danger of ringing and other visual
+  // artifacts from a high variance feature such as an edge into a very low
+  // variance region.
+  //
+  // The approach taken is to force break down / split to a smaller block size
+  // to try and separate out the low variance and well predicted blocks from the
+  // more complex ones and to prevent propagation of ringing over a large
+  // region.
+  if ((cpi->oxcf.mode == ALLINTRA) && (bsize >= BLOCK_16X16)) {
+    double var_min, var_max;
+    log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+    if ((var_min < 0.272) && ((var_max - var_min) > 3.0)) {
+      part_search_state.partition_none_allowed = 0;
+      part_search_state.terminate_partition_search = 0;
+      part_search_state.do_square_split = 1;
+    }
+  }
+
+  // PARTITION_NONE search stage.
+  int64_t part_none_rd = INT64_MAX;
+  none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+                        &part_search_state, &best_rdc, &pb_source_variance,
+                        none_rd, &part_none_rd);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, split_partition_search_time);
+#endif
+  // PARTITION_SPLIT search stage.
+  int64_t part_split_rd = INT64_MAX;
+  split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+                         &part_search_state, &best_rdc, multi_pass_mode,
+                         &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, split_partition_search_time);
+#endif
+  // Terminate partition search for child partition,
+  // when NONE and SPLIT partition rd_costs are INT64_MAX.
+  if (cpi->sf.part_sf.early_term_after_none_split &&
+      part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+      !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
+    part_search_state.terminate_partition_search = 1;
+  }
+
+  // Do not evaluate non-square partitions if NONE partition did not choose a
+  // newmv mode and is skippable.
+  if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
+      (pc_tree->none != NULL)) {
+    if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
+        !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
+        pc_tree->none->skippable && !x->must_find_valid_partition &&
+        bsize >= BLOCK_16X16)
+      part_search_state.do_rectangular_split = 0;
+  }
+
+  // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
+  prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
+                               part_none_rd, part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rectangular_partition_search_time);
+#endif
+  // Rectangular partitions search stage.
+  rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+                               &part_search_state, &best_rdc,
+                               rect_part_win_info, HORZ, VERT);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rectangular_partition_search_time);
+#endif
+
+  if (pb_source_variance == UINT_MAX) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+    if (is_cur_buf_hbd(xd)) {
+      pb_source_variance = av1_high_get_sby_perpixel_variance(
+          cpi, &x->plane[0].src, bsize, xd->bd);
+    } else {
+      pb_source_variance =
+          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+    }
+  }
+
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part_search_state.do_rectangular_split));
+
+  int ext_partition_allowed =
+      part_search_state.do_rectangular_split &&
+      bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params);
+
+  // Do not evaluate extended partitions if NONE partition is skippable.
+  if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 1) &&
+      (pc_tree->none != NULL)) {
+    if (pc_tree->none->skippable && !x->must_find_valid_partition &&
+        bsize >= BLOCK_16X16)
+      ext_partition_allowed = 0;
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, ab_partitions_search_time);
+#endif
+  // AB partitions search stage.
+  ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       &part_search_state, &best_rdc, rect_part_win_info,
+                       pb_source_variance, ext_partition_allowed, HORZ_A,
+                       VERT_B);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, ab_partitions_search_time);
+#endif
+
+  // 4-way partitions search stage.
+  int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
+  // Prune 4-way partition search.
+  prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
+                               pb_source_variance, ext_partition_allowed,
+                               part4_search_allowed);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_4partition_time);
+#endif
+  // PARTITION_HORZ_4
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part4_search_allowed[HORZ4]));
+  if (!part_search_state.terminate_partition_search &&
+      part4_search_allowed[HORZ4]) {
+    const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
+                                            0 };
+    // Evaluation of Horz4 partition type.
+    rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       pc_tree->horizontal4, &part_search_state, &best_rdc,
+                       inc_step, PARTITION_HORZ_4);
+  }
+
+  // PARTITION_VERT_4
+  assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                 !part4_search_allowed[VERT4]));
+  if (!part_search_state.terminate_partition_search &&
+      part4_search_allowed[VERT4] && blk_params.has_cols) {
+    const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
+                                                   4 };
+    // Evaluation of Vert4 partition type.
+    rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       pc_tree->vertical4, &part_search_state, &best_rdc,
+                       inc_step, PARTITION_VERT_4);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+  if (bsize == cm->seq_params->sb_size &&
+      !part_search_state.found_best_partition) {
+    // Did not find a valid partition, go back and search again, with less
+    // constraint on which partition types to search.
+    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+    fr_part_timing_stats->partition_redo += 1;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+    goto BEGIN_PARTITION_SEARCH;
+  }
+
+  // Store the final rd cost
+  *rd_cost = best_rdc;
+
+  // Also record the best partition in simple motion data tree because it is
+  // necessary for the related speed features.
+  set_sms_tree_partitioning(sms_tree, pc_tree->partitioning);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    part_timing_stats->partition_decisions[pc_tree->partitioning] += 1;
+  }
+
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block.
+  print_partition_timing_stats_with_rdcost(
+      part_timing_stats, mi_row, mi_col, bsize,
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+      cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
+  const bool print_timing_stats = false;
+  if (print_timing_stats) {
+    print_partition_timing_stats(part_timing_stats, cm->show_frame,
+                                 frame_is_intra_only(cm), bsize,
+                                 "part_timing_data.csv");
+  }
+  // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip. So we need to pass the information upstream to the encoder.
+  accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats,
+                                    bsize);
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+  // Reset the PC_TREE deallocation flag.
+  int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+  // If a valid partition is found and reconstruction is required for future
+  // sub-blocks in the same group.
+  if (part_search_state.found_best_partition && pc_tree->index != 3) {
+    if (bsize == cm->seq_params->sb_size) {
+      // Encode the superblock.
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+      // Write partition tree to file. Not used by default.
+      if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+        write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+        ++cpi->sb_counter;
+      }
+
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+                pc_tree, NULL);
+      // Dealloc the whole PC_TREE after a superblock is done.
+      av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0);
+      pc_tree_dealloc = 1;
+    } else {
+      // Encode the smaller blocks in DRY_RUN mode.
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
+
+  // If the tree still exists (non-superblock), dealloc most nodes, only keep
+  // nodes for the best partition and PARTITION_NONE.
+  if (pc_tree_dealloc == 0)
+    av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1);
+
+  if (bsize == cm->seq_params->sb_size) {
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+
+  // Restore the rd multiplier.
+  x->rdmult = orig_rdmult;
+  return part_search_state.found_best_partition;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef COLLECT_MOTION_SEARCH_FEATURE_SB
+
+#if CONFIG_RT_ML_PARTITIONING
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, int mi_row,
+                                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const NN_CONFIG *nn_config = NULL;
+  const float *means = NULL;
+  const float *vars = NULL;
+  switch (bsize) {
+    case BLOCK_64X64:
+      nn_config = &av1_var_part_nnconfig_64;
+      means = av1_var_part_means_64;
+      vars = av1_var_part_vars_64;
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_var_part_nnconfig_32;
+      means = av1_var_part_means_32;
+      vars = av1_var_part_vars_32;
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_var_part_nnconfig_16;
+      means = av1_var_part_means_16;
+      vars = av1_var_part_vars_16;
+      break;
+    case BLOCK_8X8:
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  if (!nn_config) return -1;
+
+  {
+    const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+    float features[FEATURES] = { 0.0f };
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params->bit_depth);
+    int feature_idx = 0;
+    float score[LABELS];
+
+    features[feature_idx] =
+        (logf((float)(dc_q * dc_q) / 256.0f + 1.0f) - means[feature_idx]) /
+        sqrtf(vars[feature_idx]);
+    feature_idx++;
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx] = (logf((float)var + 1.0f) - means[feature_idx]) /
+                              sqrtf(vars[feature_idx]);
+      feature_idx++;
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                         pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx] =
+            (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+        feature_idx++;
+      }
+    }
+    //    for (int i = 0; i<FEATURES; i++)
+    //      printf("F_%d, %f; ", i, features[i]);
+    assert(feature_idx == FEATURES);
+    av1_nn_predict(features, nn_config, 1, score);
+    //    printf("Score %f, thr %f ", (float)score[0], thresh);
+    if (score[0] > thresh) return PARTITION_SPLIT;
+    if (score[0] < -thresh) return PARTITION_NONE;
+    return -1;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int mi_row, int mi_col, PARTITION_TYPE part) {
+  AV1_COMMON *const cm = &cpi->common;
+  char fname[128];
+  switch (bsize) {
+    case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+    case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+    case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+    case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  float features[6];  // DC_Q, VAR, VAR_RATIO-0..3
+
+  FILE *f = fopen(fname, "a");
+
+  {
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params->bit_depth);
+    int feature_idx = 0;
+
+    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      /*
+                if (bs == 8)
+                {
+                  int r, c;
+                  printf("%d %d\n", mi_row, mi_col);
+                  for (r = 0; r < bs; ++r) {
+                    for (c = 0; c < bs; ++c) {
+                      printf("%3d ",
+                             src[r * src_stride + c] - pred[64 * r + c]);
+                    }
+                    printf("\n");
+                  }
+                  printf("\n");
+                }
+      */
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = logf((float)var + 1.0f);
+
+      fprintf(f, "%f,%f,", features[0], features[1]);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+        fprintf(f, "%f,", var_ratio);
+      }
+
+      fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1);
+    }
+
+    fclose(f);
+    return -1;
+  }
+}
+#endif
+
+static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize) {
+  const int block_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int block_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int mi_stride = xd->mi_stride;
+  MB_MODE_INFO *const src_mi = xd->mi[0];
+  int i, j;
+
+  for (j = 0; j < block_height; ++j)
+    for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi;
+}
+
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *const mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) {
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int hbs = mi_size_wide[bsize] >> 1;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row,
+                            mi_col);
+      *(xd->mi[0]) = pc_tree->none->mic;
+      copy_mbmi_ext_frame_to_mbmi_ext(
+          &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME);
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      break;
+    case PARTITION_SPLIT: {
+      fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+      fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize,
+                        pc_tree->split[1]);
+      fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize,
+                        pc_tree->split[2]);
+      fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize,
+                        pc_tree->split[3]);
+      break;
+    }
+    default: break;
+  }
+}
+
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data, TokenExtra **tp,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+                              PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int hbs = mi_size_wide[bsize] >> 1;
+  TokenExtra *tp_orig = *tp;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  RD_STATS this_rdc, best_rdc;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  int do_split = bsize > BLOCK_8X8;
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows);
+  const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols);
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);  // Square partition only
+  assert(cm->seq_params->sb_size == BLOCK_64X64);      // Small SB so far
+
+  (void)*tp_orig;
+
+  av1_invalid_rd_stats(&best_rdc);
+  best_rdc.rdcost = best_rd;
+#ifndef _COLLECT_GROUND_TRUTH_
+  if (partition_none_allowed && do_split) {
+    const int ml_predicted_partition =
+        ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
+    if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+    if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+  }
+#endif
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    PICK_MODE_CONTEXT *ctx = pc_tree->none;
+
+// Flip for RDO based pick mode
+#if 0
+    RD_STATS dummy;
+    av1_invalid_rd_stats(&dummy);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+                  PARTITION_NONE, bsize, ctx, dummy);
+#else
+    pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+                        ctx);
+#endif
+    if (this_rdc.rate != INT_MAX) {
+      const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+      this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+      }
+    }
+  }
+
+  // PARTITION_SPLIT
+  if (do_split) {
+    RD_STATS sum_rdc;
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+    av1_init_rd_stats(&sum_rdc);
+
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+      pc_tree->split[i]->index = i;
+    }
+
+    int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    for (int i = 0;
+         i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+      const int x_idx = (i & 1) * hbs;
+      const int y_idx = (i >> 1) * hbs;
+
+      if (mi_row + y_idx >= cm->mi_params.mi_rows ||
+          mi_col + x_idx >= cm->mi_params.mi_cols)
+        continue;
+      av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+                               mi_col + x_idx, subsize, &this_rdc, i < 3,
+                               best_rdc.rdcost - sum_rdc.rdcost,
+                               pc_tree->split[i]);
+
+      if (this_rdc.rate == INT_MAX) {
+        av1_invalid_rd_stats(&sum_rdc);
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    }
+  }
+
+#ifdef _COLLECT_GROUND_TRUTH_
+  store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning);
+#endif
+
+  *rd_cost = best_rdc;
+
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+  if (best_rdc.rate == INT_MAX) {
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  // update mode info array
+  fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params->sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  if (bsize == BLOCK_64X64 && do_recon) {
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
+#endif  // CONFIG_RT_ML_PARTITIONING
diff --git a/media/libaom/src/av1/encoder/partition_search.h b/media/libaom/src/av1/encoder/partition_search.h
new file mode 100644
index 0000000000..2577e79f1a
--- /dev/null
+++ b/media/libaom/src/av1/encoder/partition_search.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MACROBLOCK *const x, int mi_row,
+                                        int mi_col, BLOCK_SIZE bsize);
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize);
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+                          int mi_col, BLOCK_SIZE bsize, int *rate,
+                          int64_t *dist, int do_recon, PC_TREE *pc_tree);
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                             TokenExtra **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, PC_TREE *pc_tree);
+#if CONFIG_RT_ML_PARTITIONING
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data, TokenExtra **tp,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+                              PC_TREE *pc_tree);
+#endif
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
+
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+                             TileDataEnc *tile_data, TokenExtra **tp,
+                             SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+                             int mi_col, BLOCK_SIZE bsize,
+                             RD_STATS *best_rd_cost);
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+                           RD_STATS best_rdc, PC_TREE *pc_tree,
+                           SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+                           SB_MULTI_PASS_MODE multi_pass_mode,
+                           RD_RECT_PART_WIN_INFO *rect_part_win_info);
+
+static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset,
+                                      const uint16_t cb_offset_y,
+                                      const uint16_t cb_offset_uv) {
+  cb_offset[PLANE_TYPE_Y] = cb_offset_y;
+  cb_offset[PLANE_TYPE_UV] = cb_offset_uv;
+}
+
+static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
+                                         const int subsampling_x,
+                                         const int subsampling_y) {
+  x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize];
+  if (x->e_mbd.is_chroma_ref) {
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, subsampling_x, subsampling_y);
+    assert(plane_bsize != BLOCK_INVALID);
+    x->cb_offset[PLANE_TYPE_UV] +=
+        block_size_wide[plane_bsize] * block_size_high[plane_bsize];
+  }
+}
+
+#endif  // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/media/libaom/src/av1/encoder/partition_strategy.c b/media/libaom/src/av1/encoder/partition_strategy.c
index cc820ba242..c4024b49a2 100644
--- a/media/libaom/src/av1/encoder/partition_strategy.c
+++ b/media/libaom/src/av1/encoder/partition_strategy.c
@@ -11,10 +11,10 @@
 
 #include <float.h>
 
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/thirdpass.h"
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/enums.h"
 #include "av1/common/reconinter.h"
 
@@ -27,13 +27,56 @@
 
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
 #include "av1/encoder/rdopt.h"
 
 #if !CONFIG_REALTIME_ONLY
 static AOM_INLINE void simple_motion_search_prune_part_features(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get);
-#endif
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get);
+
+static bool ext_ml_model_decision_before_none(
+    AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split);
+
+static bool ext_ml_model_decision_before_none_part2(
+    AV1_COMP *cpi,
+    const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+    int *prune_horz, int *prune_vert);
+
+static bool ext_ml_model_decision_after_none(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_none, int *do_square_split,
+    int *do_rectangular_split);
+
+static bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert);
+
+static bool ext_ml_model_decision_after_rect(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_rect, int *horza_partition_allowed,
+    int *horzb_partition_allowed, int *verta_partition_allowed,
+    int *vertb_partition_allowed);
+
+static bool ext_ml_model_decision_after_part_ab(
+    AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+    int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col);
 
 static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
@@ -46,7 +89,45 @@ static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
   }
 }
 
-#if !CONFIG_REALTIME_ONLY
+static char *get_feature_file_name(int id) {
+  static char *feature_file_names[] = {
+    "feature_before_partition_none",
+    "feature_before_partition_none_prune_rect",
+    "feature_after_partition_none_prune",
+    "feature_after_partition_none_terminate",
+    "feature_after_partition_split_terminate",
+    "feature_after_partition_split_prune_rect",
+    "feature_after_partition_rect",
+    "feature_after_partition_ab",
+  };
+
+  return feature_file_names[id];
+}
+
+static void write_features_to_file(const char *const path,
+                                   const bool is_test_mode,
+                                   const float *features,
+                                   const int feature_size, const int id,
+                                   const int bsize, const int mi_row,
+                                   const int mi_col) {
+  if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
+
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/%s", path,
+           get_feature_file_name(id));
+  FILE *pfile = fopen(filename, "a");
+  if (pfile == NULL) return;
+  if (!is_test_mode) {
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size);
+  }
+  for (int i = 0; i < feature_size; ++i) {
+    fprintf(pfile, "%.6f", features[i]);
+    if (i < feature_size - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
 // TODO(chiyotsai@google.com): This is very much a work in progress. We still
 // need to the following:
 //   -- add support for hdres
@@ -54,23 +135,24 @@ static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
 //   -- use reconstructed pixels instead of source pixels for padding
 //   -- use chroma pixels in addition to luma pixels
 void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int bsize, int quad_tree_idx,
-                                  int *partition_none_allowed,
-                                  int *partition_horz_allowed,
-                                  int *partition_vert_allowed,
-                                  int *do_rectangular_split,
-                                  int *do_square_split) {
-  assert(cm->seq_params.sb_size >= BLOCK_64X64 &&
+                                  int quad_tree_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state) {
+  assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
          "Invalid sb_size for intra_cnn!");
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
 
   if (bsize == BLOCK_128X128) {
     return;
   }
 
+  PartitionSearchInfo *part_info = &x->part_search_info;
+
   // Precompute the CNN part and cache the result in MACROBLOCK
-  if (bsize == BLOCK_64X64 && !x->cnn_output_valid) {
-    aom_clear_system_state();
+  if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
     const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
 
     // Prepare the output
@@ -82,7 +164,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
     float *output_buffer[CNN_TOT_OUT_CH];
 
     float **cur_output_buf = output_buffer;
-    float *curr_buf_ptr = x->cnn_buffer;
+    float *curr_buf_ptr = part_info->cnn_buffer;
     for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
       const int num_chs = out_chs[output_idx];
       const int ch_size = output_dims[output_idx] * output_dims[output_idx];
@@ -105,9 +187,10 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
     const int bit_depth = xd->bd;
     const int dc_q =
         av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
-    x->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-    x->log_q = (x->log_q - av1_intra_mode_cnn_partition_mean[0]) /
-               av1_intra_mode_cnn_partition_std[0];
+    part_info->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+    part_info->log_q =
+        (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+        av1_intra_mode_cnn_partition_std[0];
 
     const int width = 65, height = 65,
               stride = x->plane[AOM_PLANE_Y].src.stride;
@@ -117,20 +200,28 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
         CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
       };
 
-      av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
-                                           cnn_config, &thread_data, bit_depth,
-                                           &output);
+      if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                                cnn_config, &thread_data,
+                                                bit_depth, &output)) {
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Error allocating CNN data");
+        return;
+      }
     } else {
       uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
 
-      av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config,
-                                    &thread_data, &output);
+      if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
+                                         cnn_config, &thread_data, &output)) {
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Error allocating CNN data");
+        return;
+      }
     }
 
-    x->cnn_output_valid = 1;
+    part_info->cnn_output_valid = 1;
   }
 
-  if (!x->cnn_output_valid) {
+  if (!part_info->cnn_output_valid) {
     return;
   }
 
@@ -144,11 +235,10 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
 
   const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
 
-  aom_clear_system_state();
   float dnn_features[100];
   float logits[4] = { 0.0f };
 
-  const float *branch_0 = x->cnn_buffer;
+  const float *branch_0 = part_info->cnn_buffer;
   const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
   const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
   const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
@@ -165,7 +255,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
         dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
       }
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_32X32) {
     int f_idx = 0;
     for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
@@ -177,7 +267,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_16X16) {
     int f_idx = 0;
     const int prev_quad_idx = (quad_tree_idx - 1) / 4;
@@ -192,7 +282,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else if (bsize == BLOCK_8X8) {
     int f_idx = 0;
     const int prev_quad_idx = (quad_tree_idx - 1) / 4;
@@ -207,14 +297,13 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
     for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
       dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
     }
-    dnn_features[f_idx++] = x->log_q;
+    dnn_features[f_idx++] = part_info->log_q;
   } else {
     assert(0 && "Invalid bsize in intra_cnn partition");
   }
 
   // Make decision
   av1_nn_predict(dnn_features, dnn_config, 1, logits);
-  aom_clear_system_state();
 
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -237,25 +326,51 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
   }
 
   if (logits[0] > split_only_thresh) {
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_rectangular_split = 0;
+    // As screen contents tend to choose larger partitions, do not prune
+    // PARTITION_NONE when intra_cnn_based_part_prune_level=1.
+    if (intra_cnn_based_part_prune_level != 1) {
+      part_state->partition_none_allowed = 0;
+    }
+    part_state->do_square_split = 1;
+    av1_disable_rect_partitions(part_state);
   }
 
   if (logits[0] < no_split_thresh) {
-    *do_square_split = 0;
+    av1_disable_square_split_partition(part_state);
   }
 }
 
-void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_rectangular_split, int *do_square_split) {
-  aom_clear_system_state();
+static INLINE int get_simple_motion_search_prune_agg(int qindex,
+                                                     int prune_level,
+                                                     int is_rect_part) {
+  assert(prune_level < TOTAL_AGG_LVLS);
+  if (prune_level == NO_PRUNING) {
+    return -1;
+  }
 
+  // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except
+  // QIDX_BASED_AGG_LVL
+  const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 };
+  if (prune_level < TOTAL_SIMPLE_AGG_LVLS) {
+    return sms_prune_agg_levels[prune_level];
+  }
+
+  // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value.
+  // Aggressive pruning for lower quantizers in non-boosted frames to prune
+  // rectangular partitions.
+  const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0;
+  const int sms_prune_agg_qindex_based[2] = { 1, 2 };
+  return sms_prune_agg_qindex_based[qband];
+}
+
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state) {
   const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -269,7 +384,12 @@ void av1_simple_motion_search_based_split(
   const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
   const NN_CONFIG *nn_config =
       av1_simple_motion_search_split_nn_config[bsize_idx];
-  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+
+  const int agg = get_simple_motion_search_prune_agg(
+      x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0);
+  if (agg < 0) {
+    return;
+  }
 
   const float split_only_thresh =
       av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
@@ -277,9 +397,25 @@ void av1_simple_motion_search_based_split(
       av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
 
   float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_SPLIT_MODEL_FLAG);
+
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col);
+
+  // Note: it is intended to not normalize the features here, to keep it
+  // consistent for all features collected and passed to the external model.
+  if (ext_ml_model_decision_before_none(
+          cpi, features, &part_state->partition_none_allowed,
+          &part_state->partition_rect_allowed[HORZ],
+          &part_state->partition_rect_allowed[VERT],
+          &part_state->do_rectangular_split, &part_state->do_square_split)) {
+    return;
+  }
+
   for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
     features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
   }
@@ -287,18 +423,27 @@ void av1_simple_motion_search_based_split(
   float score = 0.0f;
 
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   if (score > split_only_thresh) {
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_rectangular_split = 0;
+    av1_set_square_split_only(part_state);
   }
 
   if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
       score < no_split_thresh) {
-    *do_square_split = 0;
+    av1_disable_square_split_partition(part_state);
+  }
+
+  // If the score is very low, prune rectangular split since it is unlikely to
+  // occur.
+  if (cpi->sf.part_sf.simple_motion_search_rect_split) {
+    const float scale = res_idx >= 2 ? 3.0f : 2.0f;
+    const float rect_split_thresh =
+        scale * av1_simple_motion_search_no_split_thresh
+                    [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx]
+                    [bsize_idx];
+    if (score < rect_split_thresh) {
+      part_state->do_rectangular_split = 0;
+    }
   }
 }
 
@@ -306,12 +451,12 @@ void av1_simple_motion_search_based_split(
 // the refs and returns the ref with the smallest sse. Returns -1 if none of the
 // ref in the list is available. Also stores the best sse and var in best_sse,
 // best_var, respectively. If save_mv is 0, don't update mv_ref_fulls in
-// pc_tree. If save_mv is 1, update mv_ref_fulls under pc_tree and the
+// sms_tree. If save_mv is 1, update mv_ref_fulls under sms_tree and the
 // subtrees.
 static int simple_motion_search_get_best_ref(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
-    int use_subpixel, int save_mv, unsigned int *best_sse,
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+    int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
     unsigned int *best_var) {
   const AV1_COMMON *const cm = &cpi->common;
   int best_ref = -1;
@@ -336,12 +481,12 @@ static int simple_motion_search_get_best_ref(
     const int ref = refs[ref_idx];
 
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
-      const FULLPEL_MV *start_mvs = pc_tree->start_mvs;
+      const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
       unsigned int curr_sse = 0, curr_var = 0;
       int_mv best_mv =
           av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
                                    start_mvs[ref], num_planes, use_subpixel);
-      curr_var = cpi->fn_ptr[bsize].vf(
+      curr_var = cpi->ppi->fn_ptr[bsize].vf(
           x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
           xd->plane[0].dst.stride, &curr_sse);
       if (curr_sse < *best_sse) {
@@ -351,14 +496,14 @@ static int simple_motion_search_get_best_ref(
       }
 
       if (save_mv) {
-        pc_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
-        pc_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+        sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
 
         if (bsize >= BLOCK_8X8) {
-          for (int r_idx = 0; r_idx < 4; r_idx++) {
+          for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
             // Propagate the new motion vectors to a lower level
-            PC_TREE *sub_tree = pc_tree->split[r_idx];
-            sub_tree->start_mvs[ref] = pc_tree->start_mvs[ref];
+            SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref];
           }
         }
       }
@@ -369,10 +514,10 @@ static int simple_motion_search_get_best_ref(
 }
 
 // Collects features using simple_motion_search and store them in features. The
-// features are also cached in PC_TREE. By default, the features collected are
-// the sse and var from the subblocks flagged by features_to_get. Furthermore,
-// if features is not NULL, then 7 more features are appended to the end of
-// features:
+// features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features
+// collected are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
 //  - log(1.0 + dc_q ** 2)
 //  - whether an above macroblock exists
 //  - width of above macroblock
@@ -381,11 +526,13 @@ static int simple_motion_search_get_best_ref(
 //  - width of left macroblock
 //  - height of left macroblock
 static AOM_INLINE void simple_motion_search_prune_part_features(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) {
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get) {
   const int w_mi = mi_size_wide[bsize];
   const int h_mi = mi_size_high[bsize];
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(bsize >= BLOCK_8X8);
   assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
          cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
 
@@ -396,21 +543,21 @@ static AOM_INLINE void simple_motion_search_prune_part_features(
   const int use_subpixel = 1;
 
   // Doing whole block first to update the mv
-  if (!pc_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
-    simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize,
+  if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+    simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
                                       ref_list, num_refs, use_subpixel, 1,
-                                      &pc_tree->sms_none_feat[0],
-                                      &pc_tree->sms_none_feat[1]);
-    pc_tree->sms_none_valid = 1;
+                                      &sms_tree->sms_none_feat[0],
+                                      &sms_tree->sms_none_feat[1]);
+    sms_tree->sms_none_valid = 1;
   }
 
   // Split subblocks
   if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
     const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-    for (int r_idx = 0; r_idx < 4; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
       const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
       const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
-      PC_TREE *sub_tree = pc_tree->split[r_idx];
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
 
       if (!sub_tree->sms_none_valid) {
         simple_motion_search_get_best_ref(
@@ -423,46 +570,45 @@ static AOM_INLINE void simple_motion_search_prune_part_features(
   }
 
   // Rectangular subblocks
-  if (!pc_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+  if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
     // Horz subblock
     BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
-    for (int r_idx = 0; r_idx < 2; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
       const int sub_mi_col = mi_col + 0;
       const int sub_mi_row = mi_row + r_idx * h_mi / 2;
 
       simple_motion_search_get_best_ref(
-          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-          use_subpixel, 0, &pc_tree->sms_rect_feat[2 * r_idx],
-          &pc_tree->sms_rect_feat[2 * r_idx + 1]);
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx],
+          &sms_tree->sms_rect_feat[2 * r_idx + 1]);
     }
 
     // Vert subblock
     subsize = get_partition_subsize(bsize, PARTITION_VERT);
-    for (int r_idx = 0; r_idx < 2; r_idx++) {
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
       const int sub_mi_col = mi_col + r_idx * w_mi / 2;
       const int sub_mi_row = mi_row + 0;
 
       simple_motion_search_get_best_ref(
-          cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
-          use_subpixel, 0, &pc_tree->sms_rect_feat[4 + 2 * r_idx],
-          &pc_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx],
+          &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
     }
-    pc_tree->sms_rect_valid = 1;
+    sms_tree->sms_rect_valid = 1;
   }
 
   if (!features) return;
 
-  aom_clear_system_state();
   int f_idx = 0;
   if (features_to_get & FEATURE_SMS_NONE_FLAG) {
     for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
-      features[f_idx++] = logf(1.0f + pc_tree->sms_none_feat[sub_idx]);
+      features[f_idx++] = logf(1.0f + sms_tree->sms_none_feat[sub_idx]);
     }
   }
 
   if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
-    for (int sub_idx = 0; sub_idx < 4; sub_idx++) {
-      PC_TREE *sub_tree = pc_tree->split[sub_idx];
+    for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) {
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx];
       features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[0]);
       features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[1]);
     }
@@ -470,7 +616,7 @@ static AOM_INLINE void simple_motion_search_prune_part_features(
 
   if (features_to_get & FEATURE_SMS_RECT_FLAG) {
     for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
-      features[f_idx++] = logf(1.0f + pc_tree->sms_rect_feat[sub_idx]);
+      features[f_idx++] = logf(1.0f + sms_tree->sms_rect_feat[sub_idx]);
     }
   }
 
@@ -484,8 +630,8 @@ static AOM_INLINE void simple_motion_search_prune_part_features(
   // Neighbor stuff
   const int has_above = !!xd->above_mbmi;
   const int has_left = !!xd->left_mbmi;
-  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
-  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
   features[f_idx++] = (float)has_above;
   features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
   features[f_idx++] = (float)mi_size_high_log2[above_bsize];
@@ -495,13 +641,13 @@ static AOM_INLINE void simple_motion_search_prune_part_features(
 }
 
 void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
-                                         PC_TREE *pc_tree, int mi_row,
-                                         int mi_col, BLOCK_SIZE bsize,
-                                         int *partition_horz_allowed,
-                                         int *partition_vert_allowed,
-                                         int *prune_horz, int *prune_vert) {
-  aom_clear_system_state();
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state) {
   const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -514,7 +660,12 @@ void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
   const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
               *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
 
-  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+  const int agg = get_simple_motion_search_prune_agg(
+      x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1);
+  if (agg < 0) {
+    return;
+  }
+
   const float prune_thresh =
       av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
 
@@ -525,9 +676,29 @@ void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
 
   // Get features
   float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Note: it is intended to not normalize the features here, to keep it
+  // consistent for all features collected and passed to the external model.
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) &&
+      (part_state->partition_rect_allowed[HORZ] ||
+       part_state->partition_rect_allowed[VERT]) &&
+      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+    // Write features to file
+    write_features_to_file(
+        cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode,
+        features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col);
+
+    if (ext_ml_model_decision_before_none_part2(
+            cpi, features, &part_state->prune_rect_part[HORZ],
+            &part_state->prune_rect_part[VERT])) {
+      return;
+    }
+  }
+
   for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
     features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
   }
@@ -540,17 +711,15 @@ void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
                               : EXT_PARTITION_TYPES;
 
   av1_nn_predict(features, nn_config, 1, scores);
-  aom_clear_system_state();
 
   av1_nn_softmax(scores, probs, num_classes);
 
   // Determine if we should prune rectangular partitions.
-  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
-      !frame_is_intra_only(cm) &&
-      (*partition_horz_allowed || *partition_vert_allowed) &&
-      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
-    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
-    *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
+  if (probs[PARTITION_HORZ] <= prune_thresh) {
+    part_state->prune_rect_part[HORZ] = 1;
+  }
+  if (probs[PARTITION_VERT] <= prune_thresh) {
+    part_state->prune_rect_part[VERT] = 1;
   }
 }
 
@@ -560,16 +729,15 @@ void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
-                                              MACROBLOCK *x, PC_TREE *pc_tree,
-                                              int mi_row, int mi_col,
-                                              BLOCK_SIZE bsize,
-                                              const RD_STATS *none_rdc,
-                                              int *early_terminate) {
-  // TODO(chiyotsai@google.com): There are other features we can extract from
-  // PARTITION_NONE. Play with this later.
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    const RD_STATS *none_rdc, PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
   int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
@@ -604,6 +772,16 @@ void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
     assert(0 && "Unexpected block size in simple_motion_term_none");
   }
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_none_part2(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
   if (ml_model) {
     float score = 0.0f;
     for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
@@ -613,7 +791,7 @@ void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
     score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
 
     if (score >= 0.0f) {
-      *early_terminate = 1;
+      part_state->terminate_partition_search = 1;
     }
   }
 }
@@ -623,14 +801,14 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
                                         float *features) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
+  // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size.
   assert(sb_size == BLOCK_128X128);
 
   int f_idx = 0;
 
   const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  aom_clear_system_state();
   const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
 
   // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb
@@ -665,7 +843,6 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
       int_mv best_mv = av1_simple_motion_sse_var(
           cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var);
 
-      aom_clear_system_state();
       const float mv_row = (float)(best_mv.as_mv.row / 8);
       const float mv_col = (float)(best_mv.as_mv.col / 8);
       const float log_sse = logf(1.0f + (float)sse);
@@ -687,15 +864,18 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
       if (log_sse < min_log_sse) min_log_sse = log_sse;
       if (log_sse > max_log_sse) max_log_sse = log_sse;
     }
-  aom_clear_system_state();
-  const float avg_mv_row = sum_mv_row / 64.0f;
-  const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row;
+  const int blks = mb_rows * mb_cols;
+  const float avg_mv_row = sum_mv_row / (float)blks;
+  const float var_mv_row =
+      sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row;
 
-  const float avg_mv_col = sum_mv_col / 64.0f;
-  const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col;
+  const float avg_mv_col = sum_mv_col / (float)blks;
+  const float var_mv_col =
+      sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col;
 
-  const float avg_log_sse = sum_log_sse / 64.0f;
-  const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse;
+  const float avg_log_sse = sum_log_sse / (float)blks;
+  const float var_log_sse =
+      sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse;
 
   features[f_idx++] = avg_log_sse;
   features[f_idx++] = avg_mv_col;
@@ -714,32 +894,46 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
   assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
 }
 
-BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+// Convert result index to block size.
+// result idx     block size
+//     0          BLOCK_16X16
+//     1          BLOCK_32X32
+//     2          BLOCK_64X64
+//     3          BLOCK_128X128
+static BLOCK_SIZE get_block_size(int idx) {
+  return (BLOCK_SIZE)((idx + 2) * 3);
+}
+
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
                                      const float *features) {
-  float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f },
-        probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+  float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
   const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
 
   assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
          NOT_IN_USE);
 
-  aom_clear_system_state();
   av1_nn_predict(features, nn_config, 1, scores);
-  av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
 
   int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
   if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
       DIRECT_PRED) {
     result = 0;
-    float max_prob = probs[0];
+    float max_score = scores[0];
     for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
-      if (probs[i] > max_prob) {
-        max_prob = probs[i];
+      if (scores[i] > max_score) {
+        max_score = scores[i];
         result = i;
       }
     }
-  } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
-             RELAXED_PRED) {
+    return get_block_size(result);
+  }
+
+  float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+  av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+  if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+      RELAXED_PRED) {
     for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
          --result) {
       if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
@@ -749,8 +943,8 @@ BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
     }
   } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
              ADAPT_PRED) {
-    const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
-    MACROBLOCKD *const xd = &x->e_mbd;
+    const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+    const MACROBLOCKD *const xd = &x->e_mbd;
     // TODO(debargha): x->source_variance is unavailable at this point,
     // so compute. The redundant recomputation later can be removed.
     const unsigned int source_variance =
@@ -770,28 +964,28 @@ BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
     }
   }
 
-  return (BLOCK_SIZE)((result + 2) * 3);
+  return get_block_size(result);
 }
 
 // Get the minimum partition block width and height(in log scale) under a
-// PC_TREE.
-static AOM_INLINE void get_min_bsize(const PC_TREE *pc_tree, int *min_bw,
-                                     int *min_bh) {
-  if (!pc_tree) return;
+// SIMPLE_MOTION_DATA_TREE.
+static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                     int *min_bw, int *min_bh) {
+  if (!sms_tree) return;
 
-  const BLOCK_SIZE bsize = pc_tree->block_size;
+  const BLOCK_SIZE bsize = sms_tree->block_size;
   if (bsize == BLOCK_4X4) {
     *min_bw = 0;
     *min_bh = 0;
     return;
   }
 
-  PARTITION_TYPE part_type = pc_tree->partitioning;
+  PARTITION_TYPE part_type = sms_tree->partitioning;
   if (part_type == PARTITION_INVALID) return;
 
   if (part_type == PARTITION_SPLIT) {
-    for (int i = 0; i < 4; ++i) {
-      get_min_bsize(pc_tree->split[i], min_bw, min_bh);
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+      get_min_bsize(sms_tree->split[i], min_bw, min_bh);
     }
   } else {
     if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
@@ -815,13 +1009,17 @@ static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features,
 
 #define FEATURES 31
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
-                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
                                    int64_t best_rd, int64_t part_none_rd,
                                    int64_t part_split_rd,
-                                   int64_t *split_block_rd, int mi_row,
-                                   int mi_col,
-                                   int *const terminate_partition_search) {
-  if (best_rd <= 0 || best_rd == INT64_MAX || *terminate_partition_search)
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  if (best_rd <= 0 || best_rd == INT64_MAX ||
+      part_state->terminate_partition_search)
     return;
 
   const AV1_COMMON *const cm = &cpi->common;
@@ -862,53 +1060,66 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
   int f_idx = 0;
   float features[FEATURES] = { 0.0f };
 
-  aom_clear_system_state();
-
   features[f_idx++] = logf(1.0f + (float)dc_q / 4.0f);
   features[f_idx++] = logf(1.0f + (float)best_rd / bs / bs / 1024.0f);
 
   add_rd_feature(part_none_rd, best_rd, features, &f_idx);
   add_rd_feature(part_split_rd, best_rd, features, &f_idx);
 
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     add_rd_feature(split_block_rd[i], best_rd, features, &f_idx);
     int min_bw = MAX_SB_SIZE_LOG2;
     int min_bh = MAX_SB_SIZE_LOG2;
-    get_min_bsize(pc_tree->split[i], &min_bw, &min_bh);
+    get_min_bsize(sms_tree->split[i], &min_bw, &min_bh);
     features[f_idx++] = (float)min_bw;
     features[f_idx++] = (float)min_bh;
   }
 
-  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, NULL,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_none_feat[1]);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[0]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[1]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[2]->sms_none_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->split[3]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[0]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[1]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[2]->sms_none_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->split[3]->sms_none_feat[1]);
 
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[1]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[3]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[5]);
-  features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[7]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[1]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[3]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[5]);
+  features[f_idx++] = logf(1.0f + (float)sms_tree->sms_rect_feat[7]);
 
   assert(f_idx == FEATURES);
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         4, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_split(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
   // Score is indicator of confidence that we should NOT terminate.
-  if (score < thresh) *terminate_partition_search = 1;
+  if (score < thresh) {
+    part_state->terminate_partition_search = 1;
+  }
 }
 #undef FEATURES
 
-void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
                                  int64_t best_rd, int64_t none_rd,
-                                 int64_t *split_rd, int *const dst_prune_horz,
-                                 int *const dst_prune_vert) {
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   best_rd = AOMMAX(best_rd, 1);
   const NN_CONFIG *nn_config = NULL;
@@ -938,7 +1149,6 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
     default: assert(0 && "Unexpected bsize.");
   }
   if (!nn_config) return;
-  aom_clear_system_state();
 
   // 1. Compute input features
   float features[9];
@@ -947,7 +1157,7 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
   for (int i = 0; i < 5; i++) features[i] = 1.0f;
   if (none_rd > 0 && none_rd < 1000000000)
     features[0] = (float)none_rd / (float)best_rd;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       features[1 + i] = (float)split_rd[i] / (float)best_rd;
   }
@@ -964,12 +1174,12 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
   }
   whole_block_variance = AOMMAX(whole_block_variance, 1);
 
-  int split_variance[4];
+  int split_variance[SUB_PARTITIONS_SPLIT];
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
   struct buf_2d buf;
   buf.stride = x->plane[0].src.stride;
   const int bw = block_size_wide[bsize];
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     const int x_idx = (i & 1) * bw / 2;
     const int y_idx = (i >> 1) * bw / 2;
     buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
@@ -981,31 +1191,44 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
     }
   }
 
-  for (int i = 0; i < 4; i++)
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
     features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         /*feature_size=*/9, 5, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_split_part2(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &part_state->prune_rect_part[HORZ],
+          &part_state->prune_rect_part[VERT])) {
+    return;
+  }
+
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
   float raw_scores[3] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, raw_scores);
-  aom_clear_system_state();
   float probs[3] = { 0.0f };
   av1_nn_softmax(raw_scores, probs, 3);
 
   // probs[0] is the probability of the fact that both rectangular partitions
   // are worse than current best_rd
-  if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
-  if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+  if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+  if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
 }
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
-                               int64_t best_rd, int64_t horz_rd[2],
-                               int64_t vert_rd[2], int64_t split_rd[4],
-                               int *const horza_partition_allowed,
-                               int *const horzb_partition_allowed,
-                               int *const verta_partition_allowed,
-                               int *const vertb_partition_allowed) {
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
@@ -1018,8 +1241,6 @@ void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
   }
   if (!nn_config) return;
 
-  aom_clear_system_state();
-
   // Generate features.
   float features[10];
   int feature_index = 0;
@@ -1028,17 +1249,20 @@ void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
   const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
   int sub_block_rdcost[8] = { 0 };
   int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *horz_rd = part_state->rect_part_rd[HORZ];
     if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)horz_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *vert_rd = part_state->rect_part_rd[VERT];
     if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)vert_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    const int64_t *split_rd = part_state->split_rd;
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)split_rd[i];
     ++rd_index;
@@ -1052,10 +1276,24 @@ void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
   }
   assert(feature_index == 10);
 
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+  }
+
+  if (ext_ml_model_decision_after_rect(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &ab_partitions_allowed[HORZ_A],
+          &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A],
+          &ab_partitions_allowed[VERT_B])) {
+    return;
+  }
+
   // Calculate scores using the NN model.
   float score[16] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, score);
-  aom_clear_system_state();
   int int_score[16];
   int max_score = -1000;
   for (int i = 0; i < 16; ++i) {
@@ -1070,16 +1308,13 @@ void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
     case BLOCK_32X32: thresh -= 100; break;
     default: break;
   }
-  *horza_partition_allowed = 0;
-  *horzb_partition_allowed = 0;
-  *verta_partition_allowed = 0;
-  *vertb_partition_allowed = 0;
+  av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS);
   for (int i = 0; i < 16; ++i) {
     if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *horza_partition_allowed = 1;
-      if ((i >> 1) & 1) *horzb_partition_allowed = 1;
-      if ((i >> 2) & 1) *verta_partition_allowed = 1;
-      if ((i >> 3) & 1) *vertb_partition_allowed = 1;
+      if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1;
+      if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1;
+      if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1;
+      if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1;
     }
   }
 }
@@ -1087,15 +1322,27 @@ void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
 #define FEATURES 18
 #define LABELS 4
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
-                              int64_t horz_rd[2], int64_t vert_rd[2],
-                              int64_t split_rd[4],
-                              int *const partition_horz4_allowed,
-                              int *const partition_vert4_allowed,
-                              unsigned int pb_source_variance, int mi_row,
-                              int mi_col) {
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
+  int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
+  int64_t *split_rd = part_state->split_rd;
+  if (ext_ml_model_decision_after_part_ab(
+          cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd,
+          &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance,
+          mi_row, mi_col))
+    return;
+
   if (best_rd >= 1000000000) return;
+  int64_t *horz_rd = rect_part_rd[HORZ4];
+  int64_t *vert_rd = rect_part_rd[VERT4];
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
     case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
@@ -1105,8 +1352,6 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
   }
   if (!nn_config) return;
 
-  aom_clear_system_state();
-
   // Generate features.
   float features[FEATURES];
   int feature_index = 0;
@@ -1116,17 +1361,17 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
   int sub_block_rdcost[8] = { 0 };
   int rd_index = 0;
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)horz_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
     if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)vert_rd[i];
     ++rd_index;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)split_rd[i];
     ++rd_index;
@@ -1140,8 +1385,8 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
   }
 
   // Get variance of the 1:4 and 4:1 sub-blocks.
-  unsigned int horz_4_source_var[4] = { 0 };
-  unsigned int vert_4_source_var[4] = { 0 };
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
   {
     BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
     BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
@@ -1155,7 +1400,7 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
     horz_4_src.stride = src_stride;
     vert_4_src.stride = src_stride;
 
-    for (int i = 0; i < 4; ++i) {
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
       horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
       vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
 
@@ -1176,14 +1421,14 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const float denom = (float)(pb_source_variance + 1);
   const float low_b = 0.1f;
   const float high_b = 10.0f;
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
     // Ratio between the 4:1 sub-block variance and the whole-block variance.
     float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
     if (var_ratio < low_b) var_ratio = low_b;
     if (var_ratio > high_b) var_ratio = high_b;
     features[feature_index++] = var_ratio;
   }
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
     // Ratio between the 1:4 sub-block RD and the whole-block RD.
     float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
     if (var_ratio < low_b) var_ratio = low_b;
@@ -1192,10 +1437,16 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
   }
   assert(feature_index == FEATURES);
 
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           FEATURES, 7, bsize, mi_row, mi_col);
+  }
+
   // Calculate scores using the NN model.
   float score[LABELS] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, score);
-  aom_clear_system_state();
   int int_score[LABELS];
   int max_score = -1000;
   for (int i = 0; i < LABELS; ++i) {
@@ -1211,12 +1462,11 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
     case BLOCK_64X64: thresh -= 200; break;
     default: break;
   }
-  *partition_horz4_allowed = 0;
-  *partition_vert4_allowed = 0;
+  av1_zero_array(part4_allowed, NUM_PART4_TYPES);
   for (int i = 0; i < LABELS; ++i) {
     if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *partition_horz4_allowed = 1;
-      if ((i >> 1) & 1) *partition_vert4_allowed = 1;
+      if ((i >> 0) & 1) part4_allowed[HORZ4] = 1;
+      if ((i >> 1) & 1) part4_allowed[VERT4] = 1;
     }
   }
 }
@@ -1224,10 +1474,14 @@ void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
 #undef LABELS
 
 #define FEATURES 4
-int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance) {
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const NN_CONFIG *nn_config = NULL;
   int thresh = 0;
   switch (bsize) {
@@ -1253,12 +1507,16 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
       break;
     default: assert(0 && "Unexpected bsize.");
   }
-  if (!nn_config || thresh < 0) return 0;
+  if (!nn_config || thresh < 0) return;
+
+  const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
+  thresh = (int)((float)thresh *
+                 ml_predict_breakout_thresh_scale
+                     [cpi->sf.part_sf.ml_predict_breakout_level - 1]);
 
   // Generate feature values.
   float features[FEATURES];
   int feature_index = 0;
-  aom_clear_system_state();
 
   const int num_pels_log2 = num_pels_log2_lookup[bsize];
   float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
@@ -1272,17 +1530,1063 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 
   features[feature_index++] = (float)pb_source_variance;
 
-  const int dc_q = (int)x->plane[0].dequant_QTX[0];
+  const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8);
   features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
   assert(feature_index == FEATURES);
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         2, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_none(&cpi->ext_part_controller,
+                                       frame_is_intra_only(&cpi->common),
+                                       features, &part_state->do_square_split,
+                                       &part_state->do_rectangular_split)) {
+    return;
+  }
+
   // Calculate score using the NN model.
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   // Make decision.
-  return (int)(score * 100) >= thresh;
+  if ((int)(score * 100) >= thresh) {
+    part_state->do_square_split = 0;
+    part_state->do_rectangular_split = 0;
+  }
 }
 #undef FEATURES
+
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  if (cpi->third_pass_ctx) {
+    int mi_row = blk_params->mi_row;
+    int mi_col = blk_params->mi_col;
+    double ratio_h, ratio_w;
+    av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width,
+                             &ratio_h, &ratio_w);
+    THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+        cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w);
+    BLOCK_SIZE third_pass_bsize =
+        av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w);
+    // check the actual partition of this block in the second pass
+    PARTITION_TYPE third_pass_part =
+        av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi);
+
+    int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) ||
+                  (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols);
+
+    if (!is_edge && block_size_wide[bsize] >= 16) {
+      // If in second pass we used rectangular partition, then do not search for
+      // rectangular partition in the different direction.
+      if (third_pass_part != PARTITION_NONE) {
+        if (third_pass_part == PARTITION_HORZ ||
+            third_pass_part == PARTITION_HORZ_4 ||
+            third_pass_part == PARTITION_HORZ_A ||
+            third_pass_part == PARTITION_HORZ_B) {
+          part_state->partition_rect_allowed[VERT] = 0;
+        } else if (third_pass_part == PARTITION_VERT ||
+                   third_pass_part == PARTITION_VERT_4 ||
+                   third_pass_part == PARTITION_VERT_A ||
+                   third_pass_part == PARTITION_VERT_B) {
+          part_state->partition_rect_allowed[HORZ] = 0;
+        }
+      }
+
+      int minSize = AOMMIN(block_size_wide[third_pass_bsize],
+                           block_size_high[third_pass_bsize]);
+      int maxSize = AOMMAX(block_size_wide[third_pass_bsize],
+                           block_size_high[third_pass_bsize]);
+      if (block_size_wide[bsize] < minSize / 4) {
+        // Current partition is too small, just terminate
+        part_state->terminate_partition_search = 1;
+        return;
+      } else if (block_size_wide[bsize] < minSize / 2) {
+        if (third_pass_part != PARTITION_NONE) {
+          // Current partition is very small, and in second pass we used
+          // rectangular partition. Terminate the search here then.
+          part_state->terminate_partition_search = 1;
+          return;
+        } else {
+          // Partition is small, but we still check this partition, only disable
+          // further splits.
+          // TODO(any): check why this is not covered by the termination for <
+          // minSize/4.
+          av1_disable_square_split_partition(part_state);
+          av1_disable_rect_partitions(part_state);
+          return;
+        }
+      } else if (block_size_wide[bsize] > maxSize) {
+        // Partition is larger than in the second pass. Only allow split.
+        av1_set_square_split_only(part_state);
+        return;
+      } else if (block_size_wide[bsize] >= minSize &&
+                 block_size_wide[bsize] <= maxSize) {
+        // Partition is within a range where it is very likely to find a good
+        // choice, so do not prune anything.
+        return;
+      }
+    }
+  }
+
+  // Prune rectangular partitions for larger blocks.
+  if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) {
+    part_state->do_rectangular_split = 0;
+    part_state->partition_rect_allowed[HORZ] = 0;
+    part_state->partition_rect_allowed[VERT] = 0;
+  }
+
+  // Prune rectangular, AB and 4-way partition based on q index and block size
+  if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) {
+    if (bsize == BLOCK_8X8 && x->qindex < 35)
+      av1_disable_rect_partitions(part_state);
+
+  } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) {
+    // Enumeration difference between two square partitions
+    const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
+    int max_bsize =
+        BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step;
+    max_bsize = AOMMAX(max_bsize, BLOCK_4X4);
+    const BLOCK_SIZE max_prune_bsize =
+        (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32);
+
+    // Prune partition
+    // qidx 0 to 85: prune bsize below BLOCK_32X32
+    // qidx 86 to 170: prune bsize below BLOCK_16X16
+    // qidx 171 to 255: prune bsize below BLOCK_8X8
+    if (bsize < max_prune_bsize) {
+      av1_disable_rect_partitions(part_state);
+    }
+  }
+
+  if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    int prune_sub_8x8 = 1;
+    if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 1) {
+      int num_neighbors_lt_8x8 = 0;
+      if (xd->left_available)
+        num_neighbors_lt_8x8 += (xd->left_mbmi->bsize <= BLOCK_8X8);
+      if (xd->up_available)
+        num_neighbors_lt_8x8 += (xd->above_mbmi->bsize <= BLOCK_8X8);
+      // Evaluate only if both left and above blocks are of size <= BLOCK_8X8.
+      if (num_neighbors_lt_8x8 == 2) {
+        prune_sub_8x8 = 0;
+      }
+    }
+    if (prune_sub_8x8) {
+      av1_disable_all_splits(part_state);
+    }
+  }
+
+  // A CNN-based speed feature pruning out either split or all non-split
+  // partition in INTRA frame coding.
+  const int try_intra_cnn_based_part_prune =
+      frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_based_part_prune_level &&
+      cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      blk_params->bsize_at_least_8x8 &&
+      av1_is_whole_blk_in_frame(blk_params, mi_params);
+
+  if (try_intra_cnn_based_part_prune) {
+    av1_intra_mode_cnn_partition(
+        &cpi->common, x, x->part_search_info.quad_tree_idx,
+        cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state);
+  }
+
+  // Use simple motion search to prune out split or non-split partitions. This
+  // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a
+  // smaller blocksize.
+  const int try_split_only =
+      cpi->sf.part_sf.simple_motion_search_split &&
+      part_state->do_square_split && blk_params->bsize_at_least_8x8 &&
+      av1_is_whole_blk_in_frame(blk_params, mi_params) &&
+      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+  if (try_split_only) {
+    av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state);
+  }
+
+  // Use simple motion search to prune out rectangular partition in some
+  // direction. The results are stored in prune_horz and prune_vert in order to
+  // bypass future related pruning checks if a pruning decision has been made.
+
+  // We want to search at least one partition mode, so don't prune if NONE and
+  // SPLIT are disabled.
+  const int non_rect_part_allowed =
+      part_state->do_square_split || part_state->partition_none_allowed;
+  // Only run the model if the partitions are not already pruned.
+  const int rect_part_allowed = part_state->do_rectangular_split &&
+                                ((part_state->partition_rect_allowed[HORZ] &&
+                                  !part_state->prune_rect_part[HORZ]) ||
+                                 (part_state->partition_rect_allowed[VERT] &&
+                                  !part_state->prune_rect_part[VERT]));
+
+  const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+                             !frame_is_intra_only(cm) &&
+                             non_rect_part_allowed && rect_part_allowed &&
+                             !av1_superres_scaled(cm);
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
+  }
+}
+
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+  return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif  // NDEBUG
+
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state) {
+  assert(is_bsize_square(sb_enc->max_partition_size));
+  assert(is_bsize_square(sb_enc->min_partition_size));
+  assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  assert(is_bsize_square(bsize));
+  const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
+  const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
+  const int bsize_1d = block_size_wide[bsize];
+  assert(min_partition_size_1d <= max_partition_size_1d);
+  const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+  const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+  if (is_gt_max_sq_part) {
+    // If current block size is larger than max, only allow split.
+    av1_set_square_split_only(part_state);
+  } else if (is_le_min_sq_part) {
+    // If current block size is less or equal to min, only allow none if valid
+    // block large enough; only allow split otherwise.
+    av1_disable_rect_partitions(part_state);
+
+    // only disable square split when current block is not at the picture
+    // boundary. otherwise, inherit the square split flag from previous logic
+    if (av1_blk_has_rows_and_cols(blk_params)) {
+      part_state->do_square_split = 0;
+    }
+    part_state->partition_none_allowed = !(part_state->do_square_split);
+  }
+}
+
+// Decide whether to evaluate the AB partition specified by part_type based on
+// split and HORZ/VERT info
+int evaluate_ab_partition_based_on_split(
+    const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for number of winners
+  // Conservative pruning for high quantizers
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  int sub_part_win = (rect_part_win_info == NULL)
+                         ? (pc_tree->partitioning == rect_part)
+                         : (rect_part == PARTITION_HORZ)
+                               ? rect_part_win_info->rect_part_win[HORZ]
+                               : rect_part_win_info->rect_part_win[VERT];
+  num_win += (sub_part_win) ? 1 : 0;
+  if (pc_tree->split[split_idx1]) {
+    num_win +=
+        (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  } else {
+    num_win += 1;
+  }
+  if (pc_tree->split[split_idx2]) {
+    num_win +=
+        (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+  } else {
+    num_win += 1;
+  }
+  if (num_win < num_win_thresh) {
+    return 0;
+  }
+  return 1;
+}
+
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+                             const PC_TREE *pc_tree, int pb_source_variance,
+                             int64_t best_rdcost,
+                             const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+                             bool ext_partition_allowed,
+                             PartitionSearchState *part_state,
+                             int *ab_partitions_allowed) {
+  int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+  int64_t *vert_rd = part_state->rect_part_rd[VERT];
+  int64_t *split_rd = part_state->split_rd;
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // The standard AB partitions are allowed initially if ext-partition-types are
+  // allowed.
+  int horzab_partition_allowed = ext_partition_allowed &&
+                                 part_cfg->enable_ab_partitions &&
+                                 part_state->partition_rect_allowed[HORZ];
+  int vertab_partition_allowed = ext_partition_allowed &&
+                                 part_cfg->enable_ab_partitions &&
+                                 part_state->partition_rect_allowed[VERT];
+
+  // Pruning: pruning out AB partitions on one main direction based on the
+  // current best partition and source variance.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
+      // TODO(debargha,huisu@google.com): may need to tune the threshold for
+      // pb_source_variance.
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    pb_source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    pb_source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    } else {
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    }
+    horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+    horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+    vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+    vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+    split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+    split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+    split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+    split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+  }
+
+  // Pruning: pruning out horz_a or horz_b if the combined rdcost of its
+  // subblocks estimated from previous partitions is much higher than the best
+  // rd so far.
+  ab_partitions_allowed[HORZ_A] = horzab_partition_allowed;
+  ab_partitions_allowed[HORZ_B] = horzab_partition_allowed;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+      case 1:
+        ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost);
+        break;
+      case 2:
+      default:
+        ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost);
+        break;
+    }
+  }
+
+  // Pruning: pruning out vert_a or vert_b if the combined rdcost of its
+  // subblocks estimated from previous partitions is much higher than the best
+  // rd so far.
+  ab_partitions_allowed[VERT_A] = vertab_partition_allowed;
+  ab_partitions_allowed[VERT_B] = vertab_partition_allowed;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+    switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+      case 1:
+        ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost);
+        break;
+      case 2:
+      default:
+        ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost);
+        break;
+    }
+  }
+
+  // Pruning: pruning out some ab partitions using a DNN taking rd costs of
+  // sub-blocks from previous basic partition types.
+  if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed &&
+      part_state->partition_rect_allowed[HORZ] &&
+      part_state->partition_rect_allowed[VERT]) {
+    // TODO(huisu@google.com): x->source_variance may not be the current
+    // block's variance. The correct one to use is pb_source_variance. Need to
+    // re-train the model to fix it.
+    av1_ml_prune_ab_partition(cpi, pc_tree->partitioning,
+                              get_unsigned_bits(x->source_variance),
+                              best_rdcost, part_state, ab_partitions_allowed);
+  }
+
+  // Pruning: pruning AB partitions based on the number of horz/vert wins
+  // in the current block and sub-blocks in PARTITION_SPLIT.
+  if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+      ab_partitions_allowed[HORZ_A]) {
+    ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
+  }
+  if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+      ab_partitions_allowed[HORZ_B]) {
+    ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
+  }
+  if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+      ab_partitions_allowed[VERT_A]) {
+    ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
+  }
+  if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+      ab_partitions_allowed[VERT_B]) {
+    ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split(
+        pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
+  }
+}
+
+// Prepare features for the external model. Specifically, features after
+// ab partition is searched.
+static void prepare_features_after_part_ab(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    int part_ctx, int64_t best_rd,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance,
+    int mi_row, int mi_col, aom_partition_features_t *const features) {
+  int64_t *horz_rd = rect_part_rd[HORZ];
+  int64_t *vert_rd = rect_part_rd[VERT];
+
+  // Generate features.
+  int feature_index = 0;
+  features->after_part_ab.f[feature_index++] = (float)part_ctx;
+  features->after_part_ab.f[feature_index++] =
+      (float)get_unsigned_bits(pb_source_variance);
+
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features->after_part_ab.f[feature_index++] = rd_ratio;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common), bsize);
+    const int src_stride = x->plane[0].src.stride;
+    uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    struct buf_2d horz_4_src, vert_4_src;
+    horz_4_src.stride = src_stride;
+    vert_4_src.stride = src_stride;
+
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+      horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+      vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+      if (is_cur_buf_hbd(xd)) {
+        horz_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &horz_4_src, horz_4_bs, xd->bd);
+        vert_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &vert_4_src, vert_4_bs, xd->bd);
+      } else {
+        horz_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs);
+        vert_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs);
+      }
+    }
+  }
+
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block RD and the whole-block RD.
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  assert(feature_index == 18);
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// partition_none_allowed
+// partition_horz_allowed
+// partition_vert_allowed
+// do_rectangular_split
+// do_square_split
+static bool ext_ml_model_decision_before_none(
+    AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split) {
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE;
+  for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) {
+    features.before_part_none.f[i] = features_from_motion[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *partition_none_allowed = decision.partition_none_allowed;
+  *partition_horz_allowed = decision.partition_rect_allowed[HORZ];
+  *partition_vert_allowed = decision.partition_rect_allowed[VERT];
+  *do_rectangular_split = decision.do_rectangular_split;
+  *do_square_split = decision.do_square_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// prune_horz
+// prune_vert
+static bool ext_ml_model_decision_before_none_part2(
+    AV1_COMP *cpi,
+    const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+    int *prune_horz, int *prune_vert) {
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) {
+    features.before_part_none.f_part2[i] = features_from_motion[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *prune_horz = decision.prune_rect_part[HORZ];
+  *prune_vert = decision.prune_rect_part[VERT];
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// do_square_split
+// do_rectangular_split
+bool ext_ml_model_decision_after_none(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_none, int *do_square_split,
+    int *do_rectangular_split) {
+  if (!ext_part_controller->ready || is_intra_frame) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE;
+  for (int i = 0; i < 4; ++i) {
+    features.after_part_none.f[i] = features_after_none[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *do_square_split = decision.do_square_split;
+  *do_rectangular_split = decision.do_rectangular_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+    features.after_part_none.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+                                       const float *const features_terminate,
+                                       int *terminate_partition_search) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+  for (int i = 0; i < 31; ++i) {
+    features.after_part_split.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert) {
+  if (is_intra_frame || !ext_part_controller->ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+  for (int i = 0; i < 9; ++i) {
+    features.after_part_split.f_prune_rect[i] = features_prune[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *prune_rect_part_horz = decision.prune_rect_part[0];
+  *prune_rect_part_vert = decision.prune_rect_part[1];
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after rectangular partition. Specifically, these parameters:
+// horza_partition_allowed
+// horzb_partition_allowed
+// verta_partition_allowed
+// vertb_partition_allowed
+static bool ext_ml_model_decision_after_rect(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_rect, int *horza_partition_allowed,
+    int *horzb_partition_allowed, int *verta_partition_allowed,
+    int *vertb_partition_allowed) {
+  if (is_intra_frame || !ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_RECT;
+  for (int i = 0; i < 10; ++i) {
+    features.after_part_rect.f[i] = features_after_rect[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *horza_partition_allowed = decision.horza_partition_allowed;
+  *horzb_partition_allowed = decision.horzb_partition_allowed;
+  *verta_partition_allowed = decision.verta_partition_allowed;
+  *vertb_partition_allowed = decision.vertb_partition_allowed;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after AB partition. Specifically, these parameters:
+// partition_vert4_allowed
+// partition_horz4_allowed
+static bool ext_ml_model_decision_after_part_ab(
+    AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+    int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+
+  if (!frame_is_intra_only(cm) && ext_part_controller->ready) {
+    // Setup features.
+    aom_partition_features_t features;
+    features.id = AOM_EXT_PART_FEATURE_AFTER_AB;
+    prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd,
+                                   rect_part_rd, split_rd, pb_source_variance,
+                                   mi_row, mi_col, &features);
+
+    // Send necessary features to the external model.
+    av1_ext_part_send_features(ext_part_controller, &features);
+
+    // Get partition decisions from the external model.
+    aom_partition_decision_t decision;
+    const bool valid_decision =
+        av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+    if (!valid_decision) return false;
+
+    // Populate decisions
+    *partition_horz4_allowed = decision.partition_horz4_allowed;
+    *partition_vert4_allowed = decision.partition_vert4_allowed;
+
+    return true;
+  }
+
+  return false;
+}
+
+// This function resembles "av1_setup_sms_tree()" in context_tree.c
+// with function signature change.
+static SIMPLE_MOTION_DATA_TREE *setup_sms_tree(
+    AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int sms_tree_index = 0;
+  SIMPLE_MOTION_DATA_TREE *this_sms;
+  int square_index = 1;
+  int nodes;
+  this_sms = &sms_tree[0];
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+      SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+      tree->block_size = square[0];
+    }
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from leafs to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (int i = 0; i < nodes; ++i) {
+        SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+        tree->block_size = square[square_index];
+        for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+        ++sms_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+    square_index = 2;
+    tree->block_size = square[square_index];
+  }
+
+  // Set up the root node for the largest superblock size
+  return &sms_tree[tree_nodes - 1];
+}
+
+static void write_motion_feature_to_file(
+    const char *const path, const int sb_counter, const unsigned int *block_sse,
+    const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize,
+    const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) {
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path,
+           sb_counter);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+          block_size_wide[fixed_block_size], num_blocks);
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%d", block_sse[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%d", block_var[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           const int mi_row, const int mi_col,
+                                           const BLOCK_SIZE bsize,
+                                           aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return;
+
+  MACROBLOCK *const x = &td->mb;
+  const BLOCK_SIZE fixed_block_size = BLOCK_16X16;
+  const int col_step = mi_size_wide[fixed_block_size];
+  const int row_step = mi_size_high[fixed_block_size];
+  SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+  SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+  av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
+                                           mi_col);
+  av1_reset_simple_motion_tree_partition(sms_root, bsize);
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0);
+  const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0);
+  const int num_blocks = col_steps * row_steps;
+  unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse));
+  unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var));
+  if (!(block_sse && block_var)) {
+    aom_free(sms_tree);
+    aom_free(block_sse);
+    aom_free(block_var);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating block_sse & block_var");
+  }
+  int idx = 0;
+
+  for (int row = mi_row;
+       row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+       row += row_step) {
+    for (int col = mi_col;
+         col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+         col += col_step) {
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_root, row, col, fixed_block_size, ref_list,
+          /*num_refs=*/1, /*use_subpixel=*/1,
+          /*save_mv=*/1, &block_sse[idx], &block_var[idx]);
+      ++idx;
+    }
+  }
+  if (features == NULL) {
+    write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter,
+                                 block_sse, block_var, idx, bsize,
+                                 fixed_block_size, mi_row, mi_col);
+  } else {
+    features->sb_features.motion_features.unit_length =
+        block_size_wide[fixed_block_size];
+    features->sb_features.motion_features.num_units = idx;
+    for (int i = 0; i < idx; ++i) {
+      features->sb_features.motion_features.block_sse[i] = block_sse[i];
+      features->sb_features.motion_features.block_var[i] = block_var[i];
+    }
+  }
+
+  aom_free(block_sse);
+  aom_free(block_var);
+  aom_free(sms_tree);
+}
+
+void av1_prepare_motion_search_features_block(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+    const int valid_partition_types, unsigned int *block_sse,
+    unsigned int *block_var, unsigned int sub_block_sse[4],
+    unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+    unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+    unsigned int vert_block_var[2]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return;
+  MACROBLOCK *const x = &td->mb;
+  SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+  SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+  av1_reset_simple_motion_tree_partition(sms_root, bsize);
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int sub_mi_width = mi_size_wide[bsize] / 2;
+  const int sub_mi_height = sub_mi_width;
+  simple_motion_search_get_best_ref(
+      cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1,
+      /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var);
+  // Split to 4 sub blocks.
+  if (valid_partition_types & (1 << PARTITION_SPLIT)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int i = 0; i < 4; ++i) {
+      const int row = mi_row + (i >> 1) * sub_mi_height;
+      const int col = mi_col + (i & 1) * sub_mi_width;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &sub_block_sse[i], &sub_block_var[i]);
+    }
+  }
+  // Horizontal split
+  if (valid_partition_types & (1 << PARTITION_HORZ)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    for (int i = 0; i < 2; ++i) {
+      const int row = mi_row + (i & 1) * sub_mi_height;
+      const int col = mi_col;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &horz_block_sse[i], &horz_block_var[i]);
+    }
+  }
+  // Vertical split
+  if (valid_partition_types & (1 << PARTITION_VERT)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+    for (int i = 0; i < 2; ++i) {
+      const int row = mi_row;
+      const int col = mi_col + (i & 1) * sub_mi_width;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &vert_block_sse[i], &vert_block_var[i]);
+    }
+  }
+
+  aom_free(sms_tree);
+}
 #endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void init_simple_motion_search_mvs(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) {
+  memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs));
+  av1_zero(sms_tree->sms_none_feat);
+  av1_zero(sms_tree->sms_rect_feat);
+  av1_zero(sms_tree->sms_none_valid);
+  av1_zero(sms_tree->sms_rect_valid);
+
+  if (sms_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(sms_tree->split[0], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[1], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[2], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[3], start_mvs);
+  }
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+                                              const TileInfo *tile_info,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_root,
+                                              int mi_row, int mi_col) {
+  // Use the NEARESTMV of the sb as the start mv
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  FULLPEL_MV ref_mvs[REF_FRAMES];
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  av1_zero(ref_mvs);
+  // If tile_info is NULL, assume that the offsets have already been set.
+  if (tile_info) {
+    av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+                                       sb_size);
+  }
+
+  MB_MODE_INFO_EXT mbmi_ext;
+  const int ref_frame =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+                   mbmi_ext.mode_context);
+  if (mbmi_ext.ref_mv_count[ref_frame] > 0) {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+  } else {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+  }
+
+  init_simple_motion_search_mvs(sms_root, ref_mvs);
+}
diff --git a/media/libaom/src/av1/encoder/partition_strategy.h b/media/libaom/src/av1/encoder/partition_strategy.h
index f9b4d8bfde..84683f5fd4 100644
--- a/media/libaom/src/av1/encoder/partition_strategy.h
+++ b/media/libaom/src/av1/encoder/partition_strategy.h
@@ -13,52 +13,28 @@
 #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
 
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encoder.h"
 
-#define FEATURE_SIZE_SMS_SPLIT_FAST 6
-#define FEATURE_SIZE_SMS_SPLIT 17
-#define FEATURE_SIZE_SMS_PRUNE_PART 25
-#define FEATURE_SIZE_SMS_TERM_NONE 28
-#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
-#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
-#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
-
-#define FEATURE_SMS_NONE_FLAG 1
-#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
-#define FEATURE_SMS_RECT_FLAG (1 << 2)
-
-#define FEATURE_SMS_PRUNE_PART_FLAG \
-  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
-#define FEATURE_SMS_SPLIT_MODEL_FLAG \
-  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
-
 void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int bsize, int label_idx,
-                                  int *partition_none_allowed,
-                                  int *partition_horz_allowed,
-                                  int *partition_vert_allowed,
-                                  int *do_rectangular_split,
-                                  int *do_square_split);
+                                  int label_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state);
 
 // Performs a simple_motion_search with a single reference frame and extract
 // the variance of residues. Then use the features to determine whether we want
 // to go straight to splitting without trying PARTITION_NONE
-void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
-    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_rectangular_split, int *do_square_split);
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state);
 
 // Performs a simple_motion_search with two reference frames and extract
 // the variance of residues. Then use the features to determine whether we want
 // to prune some partitions.
 void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
-                                         PC_TREE *pc_tree, int mi_row,
-                                         int mi_col, BLOCK_SIZE bsize,
-                                         int *partition_horz_allowed,
-                                         int *partition_vert_allowed,
-                                         int *prune_horz, int *prune_vert);
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state);
 
 #if !CONFIG_REALTIME_ONLY
 // Early terminates PARTITION_NONE using simple_motion_search features and the
@@ -68,11 +44,10 @@ void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
 void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
-                                              MACROBLOCK *x, PC_TREE *pc_tree,
-                                              int mi_row, int mi_col,
-                                              BLOCK_SIZE bsize,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_tree,
                                               const RD_STATS *none_rdc,
-                                              int *early_terminate);
+                                              PartitionSearchState *part_state);
 
 // Get the features for selecting the max and min partition size. Currently this
 // performs simple_motion_search on 16X16 subblocks of the current superblock,
@@ -82,17 +57,17 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
                                         float *features);
 
 // Predict the maximum BLOCK_SIZE to be used to encoder the current superblock.
-BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
                                      const float *features);
 
 // Attempts an early termination after PARTITION_SPLIT.
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
-                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
                                    int64_t best_rd, int64_t part_none_rd,
                                    int64_t part_split_rd,
-                                   int64_t *split_block_rd, int mi_row,
-                                   int mi_col,
-                                   int *const terminate_partition_search);
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state);
 
 // Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
 // PARTITION_VERT.
@@ -100,37 +75,71 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
 // no information about rectangular partitions. Preliminary experiments suggest
 // that we can get better performance by adding in q_index and rectangular
 // sse/var from SMS. We should retrain and tune this model later.
-void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
                                  int64_t best_rd, int64_t none_rd,
-                                 int64_t *split_rd, int *const dst_prune_horz,
-                                 int *const dst_prune_vert);
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state);
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
-                               int64_t best_rd, int64_t horz_rd[2],
-                               int64_t vert_rd[2], int64_t split_rd[4],
-                               int *const horza_partition_allowed,
-                               int *const horzb_partition_allowed,
-                               int *const verta_partition_allowed,
-                               int *const vertb_partition_allowed);
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed);
 
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
-                              int64_t horz_rd[2], int64_t vert_rd[2],
-                              int64_t split_rd[4],
-                              int *const partition_horz4_allowed,
-                              int *const partition_vert4_allowed,
-                              unsigned int pb_source_variance, int mi_row,
-                              int mi_col);
-
-// ML-based partition search breakout after PARTITION_NONE
-int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance);
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance);
+
+// ML-based partition search breakout after PARTITION_NONE.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state);
+
+// The first round of partition pruning determined before any partition
+// has been tested. The decisions will be updated and passed back
+// to the partition search function.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current node
+// equals max_partition_size, only PARTITION_SPLIT is allowed if the current
+// node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+                             const PC_TREE *pc_tree, int pb_source_variance,
+                             int64_t best_rdcost,
+                             const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+                             bool ext_partition_allowed,
+                             PartitionSearchState *part_state,
+                             int *ab_partitions_allowed);
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           const int mi_row, const int mi_col,
+                                           const BLOCK_SIZE bsize,
+                                           aom_partition_features_t *features);
+void av1_prepare_motion_search_features_block(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+    const int valid_partition_types, unsigned int *block_sse,
+    unsigned int *block_var, unsigned int sub_block_sse[4],
+    unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+    unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+    unsigned int vert_block_var[2]);
 #endif  // !CONFIG_REALTIME_ONLY
 
 // A simplified version of set_offsets meant to be used for
@@ -176,21 +185,11 @@ static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
 }
 
-static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
-  av1_zero(pc_tree->start_mvs);
-
-  av1_zero(pc_tree->sms_none_feat);
-  av1_zero(pc_tree->sms_rect_feat);
-  av1_zero(pc_tree->sms_none_valid);
-  av1_zero(pc_tree->sms_rect_valid);
-
-  if (pc_tree->block_size >= BLOCK_8X8) {
-    init_simple_motion_search_mvs(pc_tree->split[0]);
-    init_simple_motion_search_mvs(pc_tree->split[1]);
-    init_simple_motion_search_mvs(pc_tree->split[2]);
-    init_simple_motion_search_mvs(pc_tree->split[3]);
-  }
-}
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+                                              const TileInfo *tile_info,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_root,
+                                              int mi_row, int mi_col);
 
 static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
                              int mi_row, int mi_col, BLOCK_SIZE sb_size) {
@@ -201,22 +200,66 @@ static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
          (mi_col + sb_mi_wide) <= mi_params->mi_cols;
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Do not use this criteria for screen content videos.
 // Since screen content videos could often find good predictors and the largest
 // block size is likely to be used.
-static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
                                          BLOCK_SIZE sb_size, int mi_row,
                                          int mi_col) {
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  AV1_COMMON *const cm = &cpi->common;
-  return !frame_is_intra_only(cm) && !cpi->is_screen_content_type &&
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const AV1_COMMON *const cm = &cpi->common;
+  return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools &&
          cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
              NOT_IN_USE &&
          sb_size == BLOCK_128X128 &&
          is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
-         cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE &&
-         cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE;
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+             OVERLAY_UPDATE &&
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+             INTNL_OVERLAY_UPDATE;
+}
+
+static BLOCK_SIZE dim_to_size(int dim) {
+  switch (dim) {
+    case 4: return BLOCK_4X4;
+    case 8: return BLOCK_8X8;
+    case 16: return BLOCK_16X16;
+    case 32: return BLOCK_32X32;
+    case 64: return BLOCK_64X64;
+    case 128: return BLOCK_128X128;
+    default: assert(0); return 0;
+  }
 }
 
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+                                                  AV1_COMP *cpi, MACROBLOCK *x,
+                                                  const SPEED_FEATURES *sf,
+                                                  BLOCK_SIZE sb_size,
+                                                  int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  sb_enc->max_partition_size =
+      AOMMIN(sf->part_sf.default_max_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
+  sb_enc->min_partition_size =
+      AOMMAX(sf->part_sf.default_min_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
+  sb_enc->max_partition_size =
+      AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size);
+  sb_enc->min_partition_size =
+      AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size);
+
+  if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+    float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+    av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+    sb_enc->max_partition_size =
+        AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
+                      sb_enc->max_partition_size),
+               sb_enc->min_partition_size);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/media/libaom/src/av1/encoder/pass2_strategy.c b/media/libaom/src/av1/encoder/pass2_strategy.c
index 6adc1fbf9d..51d2200756 100644
--- a/media/libaom/src/av1/encoder/pass2_strategy.c
+++ b/media/libaom/src/av1/encoder/pass2_strategy.c
@@ -9,6 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\defgroup gf_group_algo Golden Frame Group
+ * \ingroup high_level_algo
+ * Algorithms regarding determining the length of GF groups and defining GF
+ * group structures.
+ * @{
+ */
+/*! @} - end defgroup gf_group_algo */
+
 #include <stdint.h>
 
 #include "config/aom_config.h"
@@ -17,8 +25,6 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
@@ -26,14 +32,19 @@
 #include "av1/encoder/gop_structure.h"
 #include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
 #include "av1/encoder/tpl_model.h"
-#include "av1/encoder/use_flat_gop_model_params.h"
 #include "av1/encoder/encode_strategy.h"
 
 #define DEFAULT_KF_BOOST 2300
 #define DEFAULT_GF_BOOST 2000
 #define GROUP_ADAPTIVE_MAXQ 1
+
 static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+                                 int is_final_pass);
 
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
@@ -51,20 +62,21 @@ static double calculate_active_area(const FRAME_INFO *frame_info,
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
 #define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const FRAME_INFO *frame_info,
-                                     const TWO_PASS *twopass,
-                                     const AV1EncoderConfig *oxcf,
-                                     const FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = twopass->stats_buf_ctx->total_stats;
-  if (stats == NULL) {
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+                                         const FIRSTPASS_STATS *total_stats,
+                                         const FIRSTPASS_STATS *this_stats,
+                                         int vbrbias, double modified_error_min,
+                                         double modified_error_max) {
+  if (total_stats == NULL) {
     return 0;
   }
-  const double av_weight = stats->weight / stats->count;
-  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  const double av_weight = total_stats->weight / total_stats->count;
+  const double av_err =
+      (total_stats->coded_error * av_weight) / total_stats->count;
   double modified_error =
-      av_err * pow(this_frame->coded_error * this_frame->weight /
+      av_err * pow(this_stats->coded_error * this_stats->weight /
                        DOUBLE_DIVIDE_CHECK(av_err),
-                   oxcf->two_pass_vbrbias / 100.0);
+                   vbrbias / 100.0);
 
   // Correction for active area. Frames with a reduced active area
   // (eg due to formatting bars) have a higher error per mb for the
@@ -72,80 +84,69 @@ static double calculate_modified_err(const FRAME_INFO *frame_info,
   // 0.5N blocks of complexity 2X is a little easier than coding N
   // blocks of complexity X.
   modified_error *=
-      pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION);
+      pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
 
-  return fclamp(modified_error, twopass->modified_error_min,
-                twopass->modified_error_max);
+  return fclamp(modified_error, modified_error_min, modified_error_max);
+}
+
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+  return calculate_modified_err_new(
+      frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias,
+      twopass->modified_error_min, twopass->modified_error_max);
 }
 
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
-static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
-  p->stats_in = position;
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame,
+                               const FIRSTPASS_STATS *position) {
+  p_frame->stats_in = position;
 }
 
-static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                       FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
 
-  *fps = *p->stats_in;
-  ++p->stats_in;
+  *fps = *p_frame->stats_in;
+  ++p_frame->stats_in;
   return 1;
 }
 
-static int input_stats_lap(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                           FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
 
-  *fps = *p->stats_in;
+  *fps = *p_frame->stats_in;
   /* Move old stats[0] out to accommodate for next frame stats  */
   memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
-          (p->stats_buf_ctx->stats_in_end - p->stats_in - 1) *
+          (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) *
               sizeof(FIRSTPASS_STATS));
   p->stats_buf_ctx->stats_in_end--;
   return 1;
 }
 
 // Read frame stats at an offset from the current position.
-static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
-  if ((offset >= 0 && p->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
-      (offset < 0 && p->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p,
+                                               const TWO_PASS_FRAME *p_frame,
+                                               int offset) {
+  if ((offset >= 0 &&
+       p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
+      (offset < 0 &&
+       p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
     return NULL;
   }
 
-  return &p->stats_in[offset];
-}
-
-static void subtract_stats(FIRSTPASS_STATS *section,
-                           const FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->weight -= frame->weight;
-  section->intra_error -= frame->intra_error;
-  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->pcnt_inter -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->intra_skip_pct -= frame->intra_skip_pct;
-  section->inactive_zone_rows -= frame->inactive_zone_rows;
-  section->inactive_zone_cols -= frame->inactive_zone_cols;
-  section->MVr -= frame->MVr;
-  section->mvr_abs -= frame->mvr_abs;
-  section->MVc -= frame->MVc;
-  section->mvc_abs -= frame->mvc_abs;
-  section->MVrv -= frame->MVrv;
-  section->MVcv -= frame->MVcv;
-  section->mv_in_out_count -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count -= frame->count;
-  section->duration -= frame->duration;
+  return &p_frame->stats_in[offset];
 }
 
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(const RATE_CONTROL *rc,
                           const AV1EncoderConfig *oxcf) {
   int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
-                      (int64_t)oxcf->two_pass_vbrmax_section) /
+                      (int64_t)oxcf->rc_cfg.vbrmax_section) /
                      100;
   if (max_bits < 0)
     max_bits = 0;
@@ -170,18 +171,109 @@ static double calc_correction_factor(double err_per_mb, int q) {
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-static void twopass_update_bpm_factor(TWO_PASS *twopass) {
+// Based on history adjust expectations of bits per macroblock.
+static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
+  TWO_PASS *twopass = &cpi->ppi->twopass;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
   // Based on recent history adjust expectations of bits per macroblock.
-  double last_group_rate_err =
-      (double)twopass->rolling_arf_group_actual_bits /
-      DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
-  last_group_rate_err = AOMMAX(0.25, AOMMIN(4.0, last_group_rate_err));
-  twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
-  twopass->bpm_factor = AOMMAX(0.25, AOMMIN(4.0, twopass->bpm_factor));
+  double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
+  double rate_err_factor = 1.0;
+  const double adj_limit = AOMMAX(0.20, (double)(100 - rate_err_tol) / 200.0);
+  const double min_fac = 1.0 - adj_limit;
+  const double max_fac = 1.0 + adj_limit;
+
+  if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) {
+    int64_t actual_bits = 0;
+    int64_t target_bits = 0;
+    double factor = 0.0;
+    int count = 0;
+    for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) {
+      actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits;
+      target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated;
+      factor += cpi->third_pass_ctx->frame_info[i].bpm_factor;
+      count++;
+    }
+
+    if (count == 0) {
+      factor = 1.0;
+    } else {
+      factor /= (double)count;
+    }
+
+    factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits);
+
+    if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) ||
+        (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) {
+      twopass->bpm_factor = factor;
+      twopass->bpm_factor =
+          AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+    }
+  }
+
+  int err_estimate = p_rc->rate_error_estimate;
+  int64_t bits_left = cpi->ppi->twopass.bits_left;
+  int64_t total_actual_bits = p_rc->total_actual_bits;
+  int64_t bits_off_target = p_rc->vbr_bits_off_target;
+  double rolling_arf_group_actual_bits =
+      (double)twopass->rolling_arf_group_actual_bits;
+  double rolling_arf_group_target_bits =
+      (double)twopass->rolling_arf_group_target_bits;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int is_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0;
+  const int simulate_parallel_frame =
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+          ? is_parallel_frame
+          : 0;
+  total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits
+                                              : p_rc->total_actual_bits;
+  bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target
+                                            : p_rc->vbr_bits_off_target;
+  bits_left = simulate_parallel_frame ? p_rc->temp_bits_left
+                                      : cpi->ppi->twopass.bits_left;
+  rolling_arf_group_target_bits =
+      (double)(simulate_parallel_frame
+                   ? p_rc->temp_rolling_arf_group_target_bits
+                   : twopass->rolling_arf_group_target_bits);
+  rolling_arf_group_actual_bits =
+      (double)(simulate_parallel_frame
+                   ? p_rc->temp_rolling_arf_group_actual_bits
+                   : twopass->rolling_arf_group_actual_bits);
+  err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate
+                                         : p_rc->rate_error_estimate;
+#endif
+
+  if (p_rc->bits_off_target && total_actual_bits > 0) {
+    if (cpi->ppi->lap_enabled) {
+      rate_err_factor = rolling_arf_group_actual_bits /
+                        DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits);
+    } else {
+      rate_err_factor = 1.0 - ((double)(bits_off_target) /
+                               AOMMAX(total_actual_bits, bits_left));
+    }
+    rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
+
+    // Adjustment is damped if this is 1 pass with look ahead processing
+    // (as there are only ever a few frames of data) and for all but the first
+    // GOP in normal two pass.
+    if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
+      rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
+    }
+  }
+
+  // Is the rate control trending in the right direction. Only make
+  // an adjustment if things are getting worse.
+  if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
+      (rate_err_factor > 1.0 && err_estimate <= 0)) {
+    twopass->bpm_factor *= rate_err_factor;
+    twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+  }
 }
 
 static int qbpm_enumerator(int rate_err_tol) {
-  return 1350000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+  return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
 }
 
 // Similar to find_qindex_by_rate() function in ratectrl.c, but includes
@@ -211,94 +303,121 @@ static int find_qindex_by_rate_with_correction(
   return low;
 }
 
-static int get_twopass_worst_quality(AV1_COMP *cpi, const double section_err,
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Inititally it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in]    cpi                 Top-level encoder structure
+ * \param[in]    av_frame_err        The average per frame coded error score
+ *                                   for frames making up this section/group.
+ * \param[in]    inactive_zone       Used to mask off /ignore part of the
+ *                                   frame. The most common use case is where
+ *                                   a wide format video (e.g. 16:9) is
+ *                                   letter-boxed into a more square format.
+ *                                   Here we want to ignore the bands at the
+ *                                   top and bottom.
+ * \param[in]    av_target_bandwidth The target bits per frame
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
+                                     int av_target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
 
-  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
-
-  if (section_target_bandwidth <= 0) {
+  if (av_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+    const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
                             ? cpi->initial_mbs
                             : cpi->common.mi_params.MBs;
     const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = section_err / active_mbs;
+    const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
     const int target_norm_bits_per_mb =
-        (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
-        active_mbs;
-    int rate_err_tol =
-        AOMMIN(cpi->oxcf.under_shoot_pct, cpi->oxcf.over_shoot_pct);
+        (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
+    int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+
+    // Update bpm correction factor based on previous GOP rate error.
+    twopass_update_bpm_factor(cpi, rate_err_tol);
 
-    twopass_update_bpm_factor(&cpi->twopass);
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     int q = find_qindex_by_rate_with_correction(
-        target_norm_bits_per_mb, cpi->common.seq_params.bit_depth,
-        av_err_per_mb, group_weight_factor, rate_err_tol, rc->best_quality,
-        rc->worst_quality);
+        target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+        av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+        rc->best_quality, rc->worst_quality);
 
     // Restriction on active max q for constrained quality mode.
-    if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+    if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
     return q;
   }
 }
 
-#define SR_DIFF_PART 0.0015
-#define MOTION_AMP_PART 0.003
 #define INTRA_PART 0.005
 #define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
+#define LOW_SR_DIFF_TRHESH 0.01
 #define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
 
-static double get_sr_decay_rate(const FRAME_INFO *frame_info,
-                                const FIRSTPASS_STATS *frame) {
-  const int num_mbs = frame_info->num_mbs;
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It comapres the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitframe boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+  double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
   double modified_pct_inter;
   double modified_pcnt_intra;
-  const double motion_amplitude_factor =
-      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
 
   modified_pct_inter = frame->pcnt_inter;
-  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-      (double)NCOUNT_FRAME_II_THRESH) {
+  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)) {
     modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
   }
   modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
 
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
-               (MOTION_AMP_PART * motion_amplitude_factor) -
-               (INTRA_PART * modified_pcnt_intra);
+    double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
-  return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+  return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT);
 }
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const FRAME_INFO *frame_info,
-                                     const FIRSTPASS_STATS *frame) {
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
   const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(frame_info, frame);
+  double sr_decay = get_sr_decay_rate(frame);
   return AOMMIN(sr_decay, zero_motion_pct);
 }
 
-#define ZM_POWER_FACTOR 0.75
+#define DEFAULT_ZM_FACTOR 0.5
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+  const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+  double zero_motion_factor =
+      DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
 
-static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
-                                        const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame);
-  const double zero_motion_factor =
-      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
-                  ZM_POWER_FACTOR));
+  // Clamp value to range 0.0 to 1.0
+  // This should happen anyway if input values are sensibly clamped but checked
+  // here just in case.
+  if (zero_motion_factor > 1.0)
+    zero_motion_factor = 1.0;
+  else if (zero_motion_factor < 0.0)
+    zero_motion_factor = 0.0;
 
   return AOMMAX(zero_motion_factor,
                 (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
@@ -307,7 +426,8 @@ static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *const twopass,
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+                                      int next_stats_index,
                                       const int min_gf_interval,
                                       const int frame_interval,
                                       const int still_interval,
@@ -318,16 +438,19 @@ static int detect_transition_to_still(TWO_PASS *const twopass,
   // instead of a clean scene cut.
   if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
-    int j;
-    // Look ahead a few frames to see if static condition persists...
-    for (j = 0; j < still_interval; ++j) {
-      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
-      if (stats >= twopass->stats_buf_ctx->stats_in_end) break;
-
-      if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+    int stats_left =
+        av1_firstpass_info_future_count(firstpass_info, next_stats_index);
+    if (stats_left >= still_interval) {
+      int j;
+      // Look ahead a few frames to see if static condition persists...
+      for (j = 0; j < still_interval; ++j) {
+        const FIRSTPASS_STATS *stats =
+            av1_firstpass_info_peek(firstpass_info, next_stats_index + j);
+        if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+      }
+      // Only if it does do we signal a transition to still.
+      return j == still_interval;
     }
-    // Only if it does do we signal a transition to still.
-    return j == still_interval;
   }
   return 0;
 }
@@ -335,8 +458,10 @@ static int detect_transition_to_still(TWO_PASS *const twopass,
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this.
-static int detect_flash(const TWO_PASS *twopass, const int offset) {
-  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+static int detect_flash(const TWO_PASS *twopass,
+                        const TWO_PASS_FRAME *twopass_frame, const int offset) {
+  const FIRSTPASS_STATS *const next_frame =
+      read_frame_stats(twopass, twopass_frame, offset);
 
   // What we are looking for here is a situation where there is a
   // brief break in prediction (such as a flash) but subsequent frames
@@ -350,7 +475,8 @@ static int detect_flash(const TWO_PASS *twopass, const int offset) {
 
 // Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
-                                          GF_GROUP_STATS *gf_stats) {
+                                          GF_GROUP_STATS *gf_stats, double f_w,
+                                          double f_h) {
   const double pct = stats->pcnt_motion;
 
   // Accumulate Motion In/Out of frame stats.
@@ -367,9 +493,11 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
         fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
 
     gf_stats->mv_ratio_accumulator +=
-        pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+        pct *
+        (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h);
     gf_stats->mv_ratio_accumulator +=
-        pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+        pct *
+        (mvc_ratio < stats->mvc_abs * f_w ? mvc_ratio : stats->mvc_abs * f_w);
   }
 }
 
@@ -384,17 +512,16 @@ static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
   gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
 }
 
-static void accumulate_next_frame_stats(
-    const FIRSTPASS_STATS *stats, const FRAME_INFO *frame_info,
-    TWO_PASS *const twopass, const int flash_detected,
-    const int frames_since_key, const int cur_idx, const int can_disable_arf,
-    const int min_gf_interval, GF_GROUP_STATS *gf_stats) {
-  accumulate_frame_motion_stats(stats, gf_stats);
+void av1_accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+                                     const int flash_detected,
+                                     const int frames_since_key,
+                                     const int cur_idx,
+                                     GF_GROUP_STATS *gf_stats, int f_w,
+                                     int f_h) {
+  accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
   // sum up the metric values of current gf group
   gf_stats->avg_sr_coded_error += stats->sr_coded_error;
-  gf_stats->avg_tr_coded_error += stats->tr_coded_error;
   gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
-  gf_stats->avg_pcnt_third_ref += stats->pcnt_third_ref;
   gf_stats->avg_new_mv_count += stats->new_mv_count;
   gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
   if (fabs(stats->raw_error_stdev) > 0.000001) {
@@ -405,45 +532,23 @@ static void accumulate_next_frame_stats(
   // Accumulate the effect of prediction quality decay
   if (!flash_detected) {
     gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
-    gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats);
+    gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
 
     gf_stats->decay_accumulator =
         gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
 
     // Monitor for static sections.
     if ((frames_since_key + cur_idx - 1) > 1) {
-      gf_stats->zero_motion_accumulator =
-          AOMMIN(gf_stats->zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, stats));
-    }
-
-    // Break clause to detect very still sections after motion. For example,
-    // a static image after a fade or other transition.
-    if (can_disable_arf &&
-        detect_transition_to_still(twopass, min_gf_interval, cur_idx, 5,
-                                   gf_stats->loop_decay_rate,
-                                   gf_stats->last_loop_decay_rate)) {
-      gf_stats->allow_alt_ref = 0;
+      gf_stats->zero_motion_accumulator = AOMMIN(
+          gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
     }
   }
 }
 
-static void average_gf_stats(const int total_frame,
-                             const FIRSTPASS_STATS *last_stat,
-                             GF_GROUP_STATS *gf_stats) {
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
   if (total_frame) {
     gf_stats->avg_sr_coded_error /= total_frame;
-    gf_stats->avg_tr_coded_error /= total_frame;
     gf_stats->avg_pcnt_second_ref /= total_frame;
-    if (total_frame - 1) {
-      gf_stats->avg_pcnt_third_ref_nolast =
-          (gf_stats->avg_pcnt_third_ref - last_stat->pcnt_third_ref) /
-          (total_frame - 1);
-    } else {
-      gf_stats->avg_pcnt_third_ref_nolast =
-          gf_stats->avg_pcnt_third_ref / total_frame;
-    }
-    gf_stats->avg_pcnt_third_ref /= total_frame;
     gf_stats->avg_new_mv_count /= total_frame;
     gf_stats->avg_wavelet_energy /= total_frame;
   }
@@ -452,36 +557,6 @@ static void average_gf_stats(const int total_frame,
     gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
 }
 
-static void get_features_from_gf_stats(const GF_GROUP_STATS *gf_stats,
-                                       const GF_FRAME_STATS *first_frame,
-                                       const GF_FRAME_STATS *last_frame,
-                                       const int num_mbs,
-                                       const int constrained_gf_group,
-                                       const int kf_zeromotion_pct,
-                                       const int num_frames, float *features) {
-  *features++ = (float)gf_stats->abs_mv_in_out_accumulator;
-  *features++ = (float)(gf_stats->avg_new_mv_count / num_mbs);
-  *features++ = (float)gf_stats->avg_pcnt_second_ref;
-  *features++ = (float)gf_stats->avg_pcnt_third_ref;
-  *features++ = (float)gf_stats->avg_pcnt_third_ref_nolast;
-  *features++ = (float)(gf_stats->avg_sr_coded_error / num_mbs);
-  *features++ = (float)(gf_stats->avg_tr_coded_error / num_mbs);
-  *features++ = (float)(gf_stats->avg_wavelet_energy / num_mbs);
-  *features++ = (float)(constrained_gf_group);
-  *features++ = (float)gf_stats->decay_accumulator;
-  *features++ = (float)(first_frame->frame_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_sr_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_tr_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_err / num_mbs);
-  *features++ = (float)(kf_zeromotion_pct);
-  *features++ = (float)(last_frame->frame_coded_error / num_mbs);
-  *features++ = (float)(last_frame->frame_sr_coded_error / num_mbs);
-  *features++ = (float)(last_frame->frame_tr_coded_error / num_mbs);
-  *features++ = (float)num_frames;
-  *features++ = (float)gf_stats->mv_ratio_accumulator;
-  *features++ = (float)gf_stats->non_zero_stdev_count;
-}
-
 #define BOOST_FACTOR 12.5
 static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
   unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
@@ -495,22 +570,18 @@ static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
   }
 }
 
-static double calc_frame_boost(const RATE_CONTROL *rc,
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
                                const FRAME_INFO *frame_info,
                                const FIRSTPASS_STATS *this_frame,
                                double this_frame_mv_in_out, double max_boost) {
   double frame_boost;
-  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
                                             frame_info->bit_depth);
   const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
   const double active_area = calculate_active_area(frame_info, this_frame);
-  int num_mbs = frame_info->num_mbs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
                        this_frame->intra_error * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
   frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
@@ -527,22 +598,18 @@ static double calc_frame_boost(const RATE_CONTROL *rc,
   return AOMMIN(frame_boost, max_boost * boost_q_correction);
 }
 
-static double calc_kf_frame_boost(const RATE_CONTROL *rc,
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
                                   const FRAME_INFO *frame_info,
                                   const FIRSTPASS_STATS *this_frame,
                                   double *sr_accumulator, double max_boost) {
   double frame_boost;
-  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
                                             frame_info->bit_depth);
   const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
   const double active_area = calculate_active_area(frame_info, this_frame);
-  int num_mbs = frame_info->num_mbs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
                        this_frame->intra_error * active_area) /
                 DOUBLE_DIVIDE_CHECK(
                     (this_frame->coded_error + *sr_accumulator) * active_area);
@@ -562,8 +629,8 @@ static double calc_kf_frame_boost(const RATE_CONTROL *rc,
   return AOMMIN(frame_boost, max_boost * boost_q_correction);
 }
 
-static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
-                                   int frames_to_project,
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+                                   int gfu_boost, int frames_to_project,
                                    int num_stats_used_for_gfu_boost) {
   /*
    * If frames_to_project is equal to num_stats_used_for_gfu_boost,
@@ -573,7 +640,7 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
    */
   if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
 
-  double min_boost_factor = sqrt(rc->baseline_gf_interval);
+  double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
   // Get the current tpl factor (number of frames = frames_to_project).
   double tpl_factor = av1_get_gfu_boost_projection_factor(
       min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
@@ -586,11 +653,14 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
 }
 
 #define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
 #define MIN_DECAY_FACTOR 0.01
-int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
-                       FRAME_INFO *frame_info, int offset, int f_frames,
-                       int b_frames, int *num_fpstats_used,
-                       int *num_fpstats_required) {
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+                       const TWO_PASS_FRAME *twopass_frame,
+                       const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+                       int offset, int f_frames, int b_frames,
+                       int *num_fpstats_used, int *num_fpstats_required,
+                       int project_gfu_boost) {
   int i;
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
@@ -601,21 +671,23 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
 
   // Search forward from the proposed arf/next gf position.
   for (i = 0; i < f_frames; ++i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        read_frame_stats(twopass, twopass_frame, i + offset);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(this_frame, &gf_stats);
+    accumulate_frame_motion_stats(this_frame, &gf_stats,
+                                  frame_info->frame_width,
+                                  frame_info->frame_height);
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+                     detect_flash(twopass, twopass_frame, i + offset + 1);
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      gf_stats.decay_accumulator *=
-          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
       gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
                                        ? MIN_DECAY_FACTOR
                                        : gf_stats.decay_accumulator;
@@ -623,7 +695,7 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
 
     boost_score +=
         gf_stats.decay_accumulator *
-        calc_frame_boost(rc, frame_info, this_frame,
+        calc_frame_boost(p_rc, frame_info, this_frame,
                          gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
     if (num_fpstats_used) (*num_fpstats_used)++;
   }
@@ -635,21 +707,23 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
   init_gf_stats(&gf_stats);
   // Search backward towards last gf position.
   for (i = -1; i >= -b_frames; --i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        read_frame_stats(twopass, twopass_frame, i + offset);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(this_frame, &gf_stats);
+    accumulate_frame_motion_stats(this_frame, &gf_stats,
+                                  frame_info->frame_width,
+                                  frame_info->frame_height);
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+                     detect_flash(twopass, twopass_frame, i + offset + 1);
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      gf_stats.decay_accumulator *=
-          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
       gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
                                        ? MIN_DECAY_FACTOR
                                        : gf_stats.decay_accumulator;
@@ -657,22 +731,22 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
 
     boost_score +=
         gf_stats.decay_accumulator *
-        calc_frame_boost(rc, frame_info, this_frame,
+        calc_frame_boost(p_rc, frame_info, this_frame,
                          gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
     if (num_fpstats_used) (*num_fpstats_used)++;
   }
   arf_boost += (int)boost_score;
 
-  if (num_fpstats_required) {
+  if (project_gfu_boost) {
+    assert(num_fpstats_required != NULL);
+    assert(num_fpstats_used != NULL);
     *num_fpstats_required = f_frames + b_frames;
-    if (num_fpstats_used) {
-      arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required,
-                                          *num_fpstats_used);
-    }
+    arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+                                        *num_fpstats_used);
   }
 
-  if (arf_boost < ((b_frames + f_frames) * 50))
-    arf_boost = ((b_frames + f_frames) * 50);
+  if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+    arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
 
   return arf_boost;
 }
@@ -696,11 +770,23 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
   return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
 }
 
-// Calculate the total bits to allocate in this GF/ARF group.
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    gf_group_err     Cumulative coded error score for the
+ *                                frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */
 static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
                                              double gf_group_err) {
   const RATE_CONTROL *const rc = &cpi->rc;
-  const TWO_PASS *const twopass = &cpi->twopass;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
   const int max_bits = frame_max_bits(rc, &cpi->oxcf);
   int64_t total_group_bits;
 
@@ -720,8 +806,8 @@ static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
                                : total_group_bits;
 
   // Clip based on user supplied data rate variability limit.
-  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval)
+    total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval;
 
   return total_group_bits;
 }
@@ -754,7 +840,6 @@ static int calculate_boost_bits(int frame_count, int boost,
 // inverse of calculate_boost_bits().
 static int calculate_boost_factor(int frame_count, int bits,
                                   int64_t total_group_bits) {
-  aom_clear_system_state();
   return (int)(100.0 * frame_count * bits / (total_group_bits - bits));
 }
 
@@ -767,7 +852,8 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
                                               int64_t group_bits,
                                               int frame_type) {
   const AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
   for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1;
@@ -778,7 +864,7 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
     }
 
     const AV1_LEVEL target_level =
-        cpi->level_params.target_seq_level_idx[index];
+        cpi->ppi->level_params.target_seq_level_idx[index];
     if (target_level >= SEQ_LEVELS) continue;
 
     assert(is_valid_seq_level_idx(target_level));
@@ -792,18 +878,20 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
       const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
       if (bits_assigned > level_enforced_max_kf_bits) {
         const int frames = rc->frames_to_key - 1;
-        rc->kf_boost = calculate_boost_factor(
+        p_rc->kf_boost = calculate_boost_factor(
             frames, level_enforced_max_kf_bits, group_bits);
-        bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits);
+        bits_assigned =
+            calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
       }
     } else if (frame_type == 1) {
       // Maximum bits for arf is 4 times the target_bits_per_frame.
       const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
       if (bits_assigned > level_enforced_max_arf_bits) {
-        rc->gfu_boost = calculate_boost_factor(
-            rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits);
-        bits_assigned = calculate_boost_bits(rc->baseline_gf_interval,
-                                             rc->gfu_boost, group_bits);
+        p_rc->gfu_boost =
+            calculate_boost_factor(p_rc->baseline_gf_interval,
+                                   level_enforced_max_arf_bits, group_bits);
+        bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+                                             p_rc->gfu_boost, group_bits);
       }
     } else {
       assert(0);
@@ -813,12 +901,12 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
   return bits_assigned;
 }
 
-// Compile time switch on alternate algorithm to allocate bits in ARF groups
-// #define ALT_ARF_ALLOCATION
-#ifdef ALT_ARF_ALLOCATION
+// Allocate bits to each frame in a GF / ARF group
 double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0,  0.70, 0.55, 0.60,
                                               0.60, 1.0,  1.0 };
-static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+                                   PRIMARY_RATE_CONTROL *const p_rc,
+                                   RATE_CONTROL *const rc,
                                    int64_t gf_group_bits, int gf_arf_bits,
                                    int key_frame, int use_arf) {
   int64_t total_group_bits = gf_group_bits;
@@ -826,26 +914,17 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
   const int gf_group_size = gf_group->size;
   int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
 
-  // Subtract the extra bits set aside for ARF frames from the Group Total
-  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
-
-  if (rc->baseline_gf_interval)
-    base_frame_bits = (int)(total_group_bits / rc->baseline_gf_interval);
-  else
-    base_frame_bits = (int)1;
-
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
   // === [frame_index == 0] ===
-  int frame_index = 0;
-  if (!key_frame) {
-    if (rc->source_alt_ref_active)
-      gf_group->bit_allocation[frame_index] = 0;
-    else
-      gf_group->bit_allocation[frame_index] =
-          base_frame_bits + (int)(gf_arf_bits * layer_fraction[1]);
-  }
-  frame_index++;
+  int frame_index = !!key_frame;
+
+  // Subtract the extra bits set aside for ARF frames from the Group Total
+  if (use_arf) total_group_bits -= gf_arf_bits;
+
+  int num_frames =
+      AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
+  base_frame_bits = (int)(total_group_bits / num_frames);
 
   // Check the number of frames in each layer in case we have a
   // non standard group length.
@@ -853,7 +932,6 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
   for (int idx = frame_index; idx < gf_group_size; ++idx) {
     if ((gf_group->update_type[idx] == ARF_UPDATE) ||
         (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
-      // max_arf_layer = AOMMAX(max_arf_layer, gf_group->layer_depth[idx]);
       layer_frames[gf_group->layer_depth[idx]]++;
     }
   }
@@ -888,92 +966,23 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
   // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
   // Setting this frame to use 0 bit (of out the current GOP budget) will
   // simplify logics in reference frame management.
-  gf_group->bit_allocation[gf_group_size] = 0;
-}
-#else
-static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
-                                   int64_t gf_group_bits, int gf_arf_bits,
-                                   int key_frame, int use_arf) {
-  int64_t total_group_bits = gf_group_bits;
-
-  // For key frames the frame target rate is already set and it
-  // is also the golden frame.
-  // === [frame_index == 0] ===
-  int frame_index = 0;
-  if (!key_frame) {
-    if (rc->source_alt_ref_active)
-      gf_group->bit_allocation[frame_index] = 0;
-    else
-      gf_group->bit_allocation[frame_index] = gf_arf_bits;
-  }
-
-  // Deduct the boost bits for arf (or gf if it is not a key frame)
-  // from the group total.
-  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
-
-  frame_index++;
-
-  // Store the bits to spend on the ARF if there is one.
-  // === [frame_index == 1] ===
-  if (use_arf) {
-    gf_group->bit_allocation[frame_index] = gf_arf_bits;
-    ++frame_index;
-  }
-
-  const int gf_group_size = gf_group->size;
-  int arf_depth_bits[MAX_ARF_LAYERS + 1] = { 0 };
-  int arf_depth_count[MAX_ARF_LAYERS + 1] = { 0 };
-  int arf_depth_boost[MAX_ARF_LAYERS + 1] = { 0 };
-  int total_arfs = 0;
-  int total_overlays = rc->source_alt_ref_active;
-
-  for (int idx = 0; idx < gf_group_size; ++idx) {
-    if (gf_group->update_type[idx] == ARF_UPDATE ||
-        gf_group->update_type[idx] == INTNL_ARF_UPDATE ||
-        gf_group->update_type[idx] == LF_UPDATE) {
-      arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->arf_boost[idx];
-      ++arf_depth_count[gf_group->layer_depth[idx]];
-    }
-  }
-
-  for (int idx = 2; idx <= MAX_ARF_LAYERS; ++idx) {
-    arf_depth_bits[idx] =
-        calculate_boost_bits(rc->baseline_gf_interval - total_arfs -
-                                 total_overlays - arf_depth_count[idx],
-                             arf_depth_boost[idx], total_group_bits);
-    total_group_bits -= arf_depth_bits[idx];
-    total_arfs += arf_depth_count[idx];
-  }
-
-  for (int idx = frame_index; idx < gf_group_size; ++idx) {
-    switch (gf_group->update_type[idx]) {
-      case ARF_UPDATE:
-      case INTNL_ARF_UPDATE:
-      case LF_UPDATE:
-        gf_group->bit_allocation[idx] =
-            (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
-                   gf_group->arf_boost[idx]) /
-                  arf_depth_boost[gf_group->layer_depth[idx]]);
-        break;
-      case INTNL_OVERLAY_UPDATE:
-      case OVERLAY_UPDATE:
-      default: gf_group->bit_allocation[idx] = 0; break;
-    }
-  }
-
-  // Set the frame following the current GOP to 0 bit allocation. For ARF
-  // groups, this next frame will be overlay frame, which is the first frame
-  // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
-  // Setting this frame to use 0 bit (of out the current GOP budget) will
-  // simplify logics in reference frame management.
-  gf_group->bit_allocation[gf_group_size] = 0;
+  if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH)
+    gf_group->bit_allocation[gf_group_size] = 0;
 }
-#endif
 
 // Returns true if KF group and GF group both are almost completely static.
-static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
-  return (gf_zero_motion >= 0.995) &&
-         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+                                   int is_lap_enabled) {
+  if (is_lap_enabled) {
+    /*
+     * when LAP enabled kf_zero_motion is not reliable, so use strict
+     * constraint on gf_zero_motion.
+     */
+    return (gf_zero_motion >= 0.999);
+  } else {
+    return (gf_zero_motion >= 0.995) &&
+           (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+  }
 }
 
 #define ARF_ABS_ZOOM_THRESH 4.4
@@ -982,17 +991,24 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
                                 int active_min_gf_interval,
                                 GF_GROUP_STATS *gf_stats) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
   // Motion breakout threshold for loop below depends on image size.
   const double mv_ratio_accumulator_thresh =
-      (cpi->initial_height + cpi->initial_width) / 4.0;
+      (initial_dimensions->height + initial_dimensions->width) / 4.0;
 
   if (!flash_detected) {
     // Break clause to detect very still sections after motion. For example,
     // a static image after a fade or other transition.
-    if (detect_transition_to_still(
-            twopass, rc->min_gf_interval, frame_index - cur_start, 5,
-            gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) {
+
+    // TODO(angiebird): This is a temporary change, we will avoid using
+    // twopass_frame.stats_in in the follow-up CL
+    int index = (int)(cpi->twopass_frame.stats_in -
+                      twopass->stats_buf_ctx->stats_in_start);
+    if (detect_transition_to_still(&twopass->firstpass_info, index,
+                                   rc->min_gf_interval, frame_index - cur_start,
+                                   5, gf_stats->loop_decay_rate,
+                                   gf_stats->last_loop_decay_rate)) {
       return 1;
     }
   }
@@ -1011,271 +1027,872 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
   // so we can continue for more frames.
   if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
       !is_almost_static(gf_stats->zero_motion_accumulator,
-                        twopass->kf_zeromotion_pct)) {
+                        twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
     return 1;
   }
   return 0;
 }
 
-#define MAX_PAD_GF_CHECK 6  // padding length to check for gf length
-#define AVG_SI_THRES 0.6    // thres for average silouette
-#define GF_SHRINK_OUTPUT 0  // print output for gf length decision
-int determine_high_err_gf(double *errs, int *is_high, double *si, int len,
-                          double *ratio, int gf_start, int gf_end,
-                          int before_pad) {
-  (void)gf_start;
-  (void)gf_end;
-  (void)before_pad;
-  // alpha and beta controls the threshold placement
-  // e.g. a smaller alpha makes the lower group more rigid
-  const double alpha = 0.5;
-  const double beta = 1 - alpha;
-  double mean = 0;
-  double mean_low = 0;
-  double mean_high = 0;
-  double prev_mean_low = 0;
-  double prev_mean_high = 0;
-  int count_low = 0;
-  int count_high = 0;
-  // calculate mean of errs
-  for (int i = 0; i < len; i++) {
-    mean += errs[i];
-  }
-  mean /= len;
-  // separate into two initial groups with greater / lower than mean
-  for (int i = 0; i < len; i++) {
-    if (errs[i] <= mean) {
-      is_high[i] = 0;
-      count_low++;
-      prev_mean_low += errs[i];
-    } else {
-      is_high[i] = 1;
-      count_high++;
-      prev_mean_high += errs[i];
-    }
-  }
-  prev_mean_low /= count_low;
-  prev_mean_high /= count_high;
-  // kmeans to refine
-  int count = 0;
-  while (count < 10) {
-    // re-group
-    mean_low = 0;
-    mean_high = 0;
-    count_low = 0;
-    count_high = 0;
-    double thres = prev_mean_low * alpha + prev_mean_high * beta;
-    for (int i = 0; i < len; i++) {
-      if (errs[i] <= thres) {
-        is_high[i] = 0;
-        count_low++;
-        mean_low += errs[i];
-      } else {
-        is_high[i] = 1;
-        count_high++;
-        mean_high += errs[i];
+static int is_shorter_gf_interval_better(AV1_COMP *cpi,
+                                         EncodeFrameParams *frame_params) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+  int shorten_gf_interval;
+
+  av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+  if (gop_length_decision_method == 2) {
+    // GF group length is decided based on GF boost and tpl stats of ARFs from
+    // base layer, (base+1) layer.
+    shorten_gf_interval =
+        (p_rc->gfu_boost <
+         p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+        !av1_tpl_setup_stats(cpi, 3, frame_params);
+  } else {
+    int do_complete_tpl = 1;
+    GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    int is_temporal_filter_enabled =
+        (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+    if (gop_length_decision_method == 1) {
+      // Check if tpl stats of ARFs from base layer, (base+1) layer,
+      // (base+2) layer can decide the GF group length.
+      int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params);
+
+      if (gop_length_eval != 2) {
+        do_complete_tpl = 0;
+        shorten_gf_interval = !gop_length_eval;
       }
     }
-    mean_low /= count_low;
-    mean_high /= count_high;
 
-    // break if not changed much
-    if (fabs((mean_low - prev_mean_low) / (prev_mean_low + 0.00001)) <
-            0.00001 &&
-        fabs((mean_high - prev_mean_high) / (prev_mean_high + 0.00001)) <
-            0.00001)
-      break;
+    if (do_complete_tpl) {
+      // Decide GF group length based on complete tpl stats.
+      shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params);
+      // Tpl stats is reused when the ARF is temporally filtered and GF
+      // interval is not shortened.
+      if (is_temporal_filter_enabled && !shorten_gf_interval) {
+        cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+        assert(cpi->gf_frame_index == 0);
+        av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+                                       gf_group,
+                                       cpi->common.seq_params->bit_depth);
+#endif  // CONFIG_BITRATE_ACCURACY
+      }
+    }
+  }
+  return shorten_gf_interval;
+}
 
-    // update means
-    prev_mean_high = mean_high;
-    prev_mean_low = mean_low;
-
-    count++;
-  }
-
-  // count how many jumps of group changes
-  int num_change = 0;
-  for (int i = 0; i < len - 1; i++) {
-    if (is_high[i] != is_high[i + 1]) num_change++;
-  }
-
-  // get silhouette as a measure of the classification quality
-  double avg_si = 0;
-  // ai: avg dist of its own class, bi: avg dist to the other class
-  double ai, bi;
-  if (count_low > 1 && count_high > 1) {
-    for (int i = 0; i < len; i++) {
-      ai = 0;
-      bi = 0;
-      // calculate average distance to everyone in the same group
-      // and in the other group
-      for (int j = 0; j < len; j++) {
-        if (i == j) continue;
-        if (is_high[i] == is_high[j]) {
-          ai += fabs(errs[i] - errs[j]);
-        } else {
-          bi += fabs(errs[i] - errs[j]);
-        }
+#define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap gaussian smooth filter
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+                                              0.242, 0.061, 0.006 };
+
+// Smooth filter intra_error and coded_error in firstpass stats.
+// If stats[i].is_flash==1, the ith element should not be used in the filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+                                int last_idx, double *filt_intra_err,
+                                double *filt_coded_err) {
+  int i, j;
+  for (i = start_idx; i <= last_idx; i++) {
+    double total_wt = 0;
+    for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+      if (stats[idx].is_flash) continue;
+
+      filt_intra_err[i] +=
+          smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
+      total_wt += smooth_filt[j + HALF_FILT_LEN];
+    }
+    if (total_wt > 0.01) {
+      filt_intra_err[i] /= total_wt;
+    } else {
+      filt_intra_err[i] = stats[i].intra_error;
+    }
+  }
+  for (i = start_idx; i <= last_idx; i++) {
+    double total_wt = 0;
+    for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+      // Coded error involves idx and idx - 1.
+      if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+
+      filt_coded_err[i] +=
+          smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
+      total_wt += smooth_filt[j + HALF_FILT_LEN];
+    }
+    if (total_wt > 0.01) {
+      filt_coded_err[i] /= total_wt;
+    } else {
+      filt_coded_err[i] = stats[i].coded_error;
+    }
+  }
+}
+
+// Calculate gradient
+static void get_gradient(const double *values, int start, int last,
+                         double *grad) {
+  if (start == last) {
+    grad[start] = 0;
+    return;
+  }
+  for (int i = start; i <= last; i++) {
+    int prev = AOMMAX(i - 1, start);
+    int next = AOMMIN(i + 1, last);
+    grad[i] = (values[next] - values[prev]) / (next - prev);
+  }
+}
+
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+                              int first, int last) {
+  // Identify unstable areas caused by scenecuts.
+  // Find the max and 2nd max coded error, and the average of the rest frames.
+  // If there is only one frame that yields a huge coded error, it is likely a
+  // scenecut.
+  double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded,
+      max_next_coded;
+
+  if (last - first == 0) return -1;
+
+  for (int i = first; i <= last; i++) {
+    if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+      continue;
+    double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+    this_ratio = stats_start[i].coded_error / temp_intra;
+    // find the avg ratio in the preceding neighborhood
+    max_prev_ratio = 0;
+    max_prev_coded = 0;
+    for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+      if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+        continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_prev_ratio) {
+        max_prev_ratio = temp_ratio;
       }
-      if (is_high[i] == 0) {
-        ai = ai / (count_low - 1);
-        bi = bi / count_high;
-      } else {
-        ai = ai / (count_high - 1);
-        bi = bi / count_low;
+      if (stats_start[j].coded_error > max_prev_coded) {
+        max_prev_coded = stats_start[j].coded_error;
       }
-      if (ai <= bi) {
-        si[i] = 1 - ai / (bi + 0.00001);
-      } else {
-        si[i] = bi / (ai + 0.00001) - 1;
+    }
+    // find the avg ratio in the following neighborhood
+    max_next_ratio = 0;
+    max_next_coded = 0;
+    for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+      if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+        continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_next_ratio) {
+        max_next_ratio = temp_ratio;
       }
-      avg_si += si[i];
+      if (stats_start[j].coded_error > max_next_coded) {
+        max_next_coded = stats_start[j].coded_error;
+      }
+    }
+
+    if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+      // the ratios are very small, only check a small fixed threshold
+      if (this_ratio < 0.02) continue;
+    } else {
+      // check if this frame has a larger ratio than the neighborhood
+      double max_sr = stats_start[i].sr_coded_error;
+      if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+      double max_sr_fr_ratio =
+          max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+      if (max_sr_fr_ratio > 1.2) continue;
+      if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+          stats_start[i].coded_error <
+              2 * AOMMAX(max_prev_coded, max_next_coded)) {
+        continue;
+      }
+    }
+    return i;
+  }
+  return -1;
+}
+
+// Remove the region with index next_region.
+// parameter merge: 0: merge with previous; 1: merge with next; 2:
+// merge with both, take type from previous if possible
+// After removing, next_region will be the index of the next region.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+                          int *next_region) {
+  int k = *next_region;
+  assert(k < *num_regions);
+  if (*num_regions == 1) {
+    *num_regions = 0;
+    return;
+  }
+  if (k == 0) {
+    merge = 1;
+  } else if (k == *num_regions - 1) {
+    merge = 0;
+  }
+  int num_merge = (merge == 2) ? 2 : 1;
+  switch (merge) {
+    case 0:
+      regions[k - 1].last = regions[k].last;
+      *next_region = k;
+      break;
+    case 1:
+      regions[k + 1].start = regions[k].start;
+      *next_region = k + 1;
+      break;
+    case 2:
+      regions[k - 1].last = regions[k + 1].last;
+      *next_region = k;
+      break;
+    default: assert(0);
+  }
+  *num_regions -= num_merge;
+  for (k = *next_region - (merge == 1); k < *num_regions; k++) {
+    regions[k] = regions[k + num_merge];
+  }
+}
+
+// Insert a region in the cur_region_idx. The start and last should both be in
+// the current region. After insertion, the cur_region_idx will point to the
+// last region that was splitted from the original region.
+static void insert_region(int start, int last, REGION_TYPES type,
+                          REGIONS *regions, int *num_regions,
+                          int *cur_region_idx) {
+  int k = *cur_region_idx;
+  REGION_TYPES this_region_type = regions[k].type;
+  int this_region_last = regions[k].last;
+  int num_add = (start != regions[k].start) + (last != regions[k].last);
+  // move the following regions further to the back
+  for (int r = *num_regions - 1; r > k; r--) {
+    regions[r + num_add] = regions[r];
+  }
+  *num_regions += num_add;
+  if (start > regions[k].start) {
+    regions[k].last = start - 1;
+    k++;
+    regions[k].start = start;
+  }
+  regions[k].type = type;
+  if (last < this_region_last) {
+    regions[k].last = last;
+    k++;
+    regions[k].start = last + 1;
+    regions[k].last = this_region_last;
+    regions[k].type = this_region_type;
+  } else {
+    regions[k].last = this_region_last;
+  }
+  *cur_region_idx = k;
+}
+
+// Get the average of stats inside a region.
+static void analyze_region(const FIRSTPASS_STATS *stats, int k,
+                           REGIONS *regions) {
+  int i;
+  regions[k].avg_cor_coeff = 0;
+  regions[k].avg_sr_fr_ratio = 0;
+  regions[k].avg_intra_err = 0;
+  regions[k].avg_coded_err = 0;
+
+  int check_first_sr = (k != 0);
+
+  for (i = regions[k].start; i <= regions[k].last; i++) {
+    if (i > regions[k].start || check_first_sr) {
+      double num_frames =
+          (double)(regions[k].last - regions[k].start + check_first_sr);
+      double max_coded_error =
+          AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+      double this_ratio =
+          stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+      regions[k].avg_sr_fr_ratio += this_ratio / num_frames;
     }
-    avg_si /= len;
+
+    regions[k].avg_intra_err +=
+        stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1);
+    regions[k].avg_coded_err +=
+        stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
+
+    regions[k].avg_cor_coeff +=
+        AOMMAX(stats[i].cor_coeff, 0.001) /
+        (double)(regions[k].last - regions[k].start + 1);
+    regions[k].avg_noise_var +=
+        AOMMAX(stats[i].noise_var, 0.001) /
+        (double)(regions[k].last - regions[k].start + 1);
   }
+}
 
-  int reset = 0;
-  *ratio = mean_high / (mean_low + 0.00001);
-  // if the two groups too similar, or
-  // if too many numbers of changes, or
-  // silhouette is too small, not confident
-  // reset everything to 0 later so we fallback to the original decision
-  if (*ratio < 1.3 || num_change > AOMMAX(len / 3, 6) ||
-      avg_si < AVG_SI_THRES) {
-    reset = 1;
+// Calculate the regions stats of every region.
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
+                             int num_regions) {
+  for (int k = 0; k < num_regions; k++) {
+    analyze_region(stats, k, regions);
   }
+}
+
+// Find tentative stable regions
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+                               const double *grad_coded, int this_start,
+                               int this_last, REGIONS *regions) {
+  int i, j, k = 0;
+  regions[k].start = this_start;
+  for (i = this_start; i <= this_last; i++) {
+    // Check mean and variance of stats in a window
+    double mean_intra = 0.001, var_intra = 0.001;
+    double mean_coded = 0.001, var_coded = 0.001;
+    int count = 0;
+    for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+      int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+      if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+      mean_intra += stats[idx].intra_error;
+      var_intra += stats[idx].intra_error * stats[idx].intra_error;
+      mean_coded += stats[idx].coded_error;
+      var_coded += stats[idx].coded_error * stats[idx].coded_error;
+      count++;
+    }
 
-#if GF_SHRINK_OUTPUT
-  printf("\n");
-  for (int i = 0; i < len; i++) {
-    printf("%d: err %.1f, ishigh %d, si %.2f, (i=%d)\n",
-           gf_start + i - before_pad, errs[i], is_high[i], si[i], gf_end);
+    REGION_TYPES cur_type;
+    if (count > 0) {
+      mean_intra /= (double)count;
+      var_intra /= (double)count;
+      mean_coded /= (double)count;
+      var_coded /= (double)count;
+      int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+      int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+                             fabs(grad_coded[i]) / mean_coded < 0.05) ||
+                            mean_coded / mean_intra < 0.05;
+      int is_coded_small = mean_coded < 0.5 * mean_intra;
+      cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+                     ? STABLE_REGION
+                     : HIGH_VAR_REGION;
+    } else {
+      cur_type = HIGH_VAR_REGION;
+    }
+
+    // mark a new region if type changes
+    if (i == regions[k].start) {
+      // first frame in the region
+      regions[k].type = cur_type;
+    } else if (cur_type != regions[k].type) {
+      // Append a new region
+      regions[k].last = i - 1;
+      regions[k + 1].start = i;
+      regions[k + 1].type = cur_type;
+      k++;
+    }
   }
-  printf(
-      "count: %d, mean_high: %.1f, mean_low: %.1f, avg_si: %.2f, num_change: "
-      "%d, ratio %.2f, reset: %d\n",
-      count, mean_high, mean_low, avg_si, num_change,
-      mean_high / (mean_low + 0.000001), reset);
-#endif
+  regions[k].last = this_last;
+  return k + 1;
+}
 
-  if (reset) {
-    memset(is_high, 0, sizeof(is_high[0]) * len);
-    memset(si, 0, sizeof(si[0]) * len);
+// Clean up regions that should be removed or merged.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+  int k = 0;
+  while (k < *num_regions) {
+    if ((k > 0 && regions[k - 1].type == regions[k].type &&
+         regions[k].type != SCENECUT_REGION) ||
+        regions[k].last < regions[k].start) {
+      remove_region(0, regions, num_regions, &k);
+    } else {
+      k++;
+    }
   }
-  return reset;
 }
 
-#if GROUP_ADAPTIVE_MAXQ
-#define RC_FACTOR_MIN 0.75
-#define RC_FACTOR_MAX 1.25
-#endif  // GROUP_ADAPTIVE_MAXQ
-#define MIN_FWD_KF_INTERVAL 8
-#define MIN_SHRINK_LEN 6      // the minimum length of gf if we are shrinking
-#define SI_HIGH AVG_SI_THRES  // high quality classification
-#define SI_LOW 0.3            // very unsure classification
-// this function finds an low error frame previously to the current last frame
-// in the gf group, and set the last frame to it.
-// The resulting last frame is then returned by *cur_last_ptr
-// *cur_start_ptr and cut_pos[n] could also change due to shrinking
-// previous gf groups
-void set_last_prev_low_err(int *cur_start_ptr, int *cur_last_ptr, int *cut_pos,
-                           int count_cuts, int before_pad, double ratio,
-                           int *is_high, double *si, int prev_lows) {
-  int n;
-  int cur_start = *cur_start_ptr;
-  int cur_last = *cur_last_ptr;
-  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
-    // try to find a point that is very probable to be good
-    if (is_high[n - cur_start + before_pad] == 0 &&
-        si[n - cur_start + before_pad] > SI_HIGH) {
-      *cur_last_ptr = n;
-      return;
+// Remove regions that are of type and shorter than length.
+// Merge it with its neighboring regions.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+                                 REGION_TYPES type, int length) {
+  int k = 0;
+  while (k < *num_regions && (*num_regions) > 1) {
+    if ((regions[k].last - regions[k].start + 1 < length &&
+         regions[k].type == type)) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+    } else {
+      k++;
     }
   }
-  // could not find a low-err point, then let's try find an "unsure"
-  // point at least
-  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
-    if ((is_high[n - cur_start + before_pad] == 0) ||
-        (is_high[n - cur_start + before_pad] &&
-         si[n - cur_start + before_pad] < SI_LOW)) {
-      *cur_last_ptr = n;
-      return;
+  cleanup_regions(regions, num_regions);
+}
+
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+                                          REGIONS *regions, int *num_regions) {
+  int i, j, k;
+  // Remove regions that are too short. Likely noise.
+  remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+  get_region_stats(stats, regions, *num_regions);
+
+  // Adjust region boundaries. The thresholds are empirically obtained, but
+  // overall the performance is not very sensitive to small changes to them.
+  for (k = 0; k < *num_regions; k++) {
+    if (regions[k].type == STABLE_REGION) continue;
+    if (k > 0) {
+      // Adjust previous boundary.
+      // First find the average intra/coded error in the previous
+      // neighborhood.
+      double avg_intra_err = 0;
+      const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+                                regions[k - 1].start + 1);
+      const int lasti = regions[k - 1].last;
+      int counti = 0;
+      for (i = starti; i <= lasti; i++) {
+        avg_intra_err += stats[i].intra_error;
+        counti++;
+      }
+      if (counti > 0) {
+        avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+        int count_coded = 0, count_grad = 0;
+        for (j = lasti + 1; j <= regions[k].last; j++) {
+          const int intra_close =
+              fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+          const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
+          const int coeff_close = stats[j].cor_coeff > 0.995;
+          if (!coeff_close || !coded_small) count_coded--;
+          if (intra_close && count_coded >= 0 && count_grad >= 0) {
+            // this frame probably belongs to the previous stable region
+            regions[k - 1].last = j;
+            regions[k].start = j + 1;
+          } else {
+            break;
+          }
+        }
+      }
+    }  // if k > 0
+    if (k < *num_regions - 1) {
+      // Adjust next boundary.
+      // First find the average intra/coded error in the next neighborhood.
+      double avg_intra_err = 0;
+      const int starti = regions[k + 1].start;
+      const int lasti = AOMMIN(regions[k + 1].last - 1,
+                               regions[k + 1].start + WINDOW_SIZE - 1);
+      int counti = 0;
+      for (i = starti; i <= lasti; i++) {
+        avg_intra_err += stats[i].intra_error;
+        counti++;
+      }
+      if (counti > 0) {
+        avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+        // At the boundary, coded error is large, but still the frame is stable
+        int count_coded = 1, count_grad = 1;
+        for (j = starti - 1; j >= regions[k].start; j--) {
+          const int intra_close =
+              fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+          const int coded_small =
+              stats[j + 1].coded_error / avg_intra_err < 0.1;
+          const int coeff_close = stats[j].cor_coeff > 0.995;
+          if (!coeff_close || !coded_small) count_coded--;
+          if (intra_close && count_coded >= 0 && count_grad >= 0) {
+            // this frame probably belongs to the next stable region
+            regions[k + 1].start = j;
+            regions[k].last = j - 1;
+          } else {
+            break;
+          }
+        }
+      }
+    }  // if k < *num_regions - 1
+  }    // end of loop over all regions
+
+  cleanup_regions(regions, num_regions);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+  get_region_stats(stats, regions, *num_regions);
+
+  // If a stable regions has higher error than neighboring high var regions,
+  // or if the stable region has a lower average correlation,
+  // then it should be merged with them
+  k = 0;
+  while (k < *num_regions && (*num_regions) > 1) {
+    if (regions[k].type == STABLE_REGION &&
+        (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+        ((k > 0 &&  // previous regions
+          (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 ||
+           regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) &&
+         (k < *num_regions - 1 &&  // next region
+          (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 ||
+           regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+      analyze_region(stats, k - 1, regions);
+    } else if (regions[k].type == HIGH_VAR_REGION &&
+               (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+               ((k > 0 &&  // previous regions
+                 (regions[k].avg_coded_err <
+                      regions[k - 1].avg_coded_err * 0.99 ||
+                  regions[k].avg_cor_coeff >
+                      regions[k - 1].avg_cor_coeff * 1.001)) &&
+                (k < *num_regions - 1 &&  // next region
+                 (regions[k].avg_coded_err <
+                      regions[k + 1].avg_coded_err * 0.99 ||
+                  regions[k].avg_cor_coeff >
+                      regions[k + 1].avg_cor_coeff * 1.001)))) {
+      // merge current region with the previous and next regions
+      remove_region(2, regions, num_regions, &k);
+      analyze_region(stats, k - 1, regions);
+    } else {
+      k++;
     }
   }
-  if (prev_lows) {
-    // try with shrinking previous all_zero interval
-    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
-      if (is_high[n - cur_start + before_pad] == 0 &&
-          si[n - cur_start + before_pad] > SI_HIGH) {
-        int tentative_start = n - MIN_SHRINK_LEN;
-        // check if the previous interval can shrink this much
-        int available =
-            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
-            cur_start - tentative_start < prev_lows;
-        // shrinking too agressively may worsen performance
-        // set stricter thres for shorter length
-        double ratio_thres =
-            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
-            1.0;
-
-        if (available && (ratio > ratio_thres)) {
-          cut_pos[count_cuts - 1] = tentative_start;
-          *cur_start_ptr = tentative_start;
-          *cur_last_ptr = n;
-          return;
+
+  remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+  remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+                                  REGIONS *regions, int *num_regions) {
+  int i, k = 0;
+  // Blending regions will have large content change, therefore will have a
+  // large consistent change in intra error.
+  int count_stable = 0;
+  while (k < *num_regions) {
+    if (regions[k].type == STABLE_REGION) {
+      k++;
+      count_stable++;
+      continue;
+    }
+    int dir = 0;
+    int start = 0, last;
+    for (i = regions[k].start; i <= regions[k].last; i++) {
+      // First mark the regions that has consistent large change of intra error.
+      if (k == 0 && i == regions[k].start) continue;
+      if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
+      double grad = stats[i].intra_error - stats[i - 1].intra_error;
+      int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+      int this_dir = 0;
+      if (large_change) {
+        this_dir = (grad > 0) ? 1 : -1;
+      }
+      // the current trend continues
+      if (dir == this_dir) continue;
+      if (dir != 0) {
+        // Mark the end of a new large change group and add it
+        last = i - 1;
+        insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+      }
+      dir = this_dir;
+      if (k == 0 && i == regions[k].start + 1) {
+        start = i - 1;
+      } else {
+        start = i;
+      }
+    }
+    if (dir != 0) {
+      last = regions[k].last;
+      insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+    }
+    k++;
+  }
+
+  // If the blending region has very low correlation, mark it as high variance
+  // since we probably cannot benefit from it anyways.
+  get_region_stats(stats, regions, *num_regions);
+  for (k = 0; k < *num_regions; k++) {
+    if (regions[k].type != BLENDING_REGION) continue;
+    if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+        count_stable == 0)
+      regions[k].type = HIGH_VAR_REGION;
+  }
+  get_region_stats(stats, regions, *num_regions);
+
+  // It is possible for blending to result in a "dip" in intra error (first
+  // decrease then increase). Therefore we need to find the dip and combine the
+  // two regions.
+  k = 1;
+  while (k < *num_regions) {
+    if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+      // Check if this short high variance regions is actually in the middle of
+      // a blending region.
+      if (regions[k - 1].type == BLENDING_REGION &&
+          regions[k + 1].type == BLENDING_REGION &&
+          regions[k].last - regions[k].start < 3) {
+        int prev_dir = (stats[regions[k - 1].last].intra_error -
+                        stats[regions[k - 1].last - 1].intra_error) > 0
+                           ? 1
+                           : -1;
+        int next_dir = (stats[regions[k + 1].last].intra_error -
+                        stats[regions[k + 1].last - 1].intra_error) > 0
+                           ? 1
+                           : -1;
+        if (prev_dir < 0 && next_dir > 0) {
+          // This is possibly a mid region of blending. Check the ratios
+          double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+                                      regions[k + 1].avg_sr_fr_ratio) *
+                               0.95;
+          if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+            regions[k].type = BLENDING_REGION;
+            remove_region(2, regions, num_regions, &k);
+            analyze_region(stats, k - 1, regions);
+            continue;
+          }
+        }
+      }
+    }
+    // Check if we have a pair of consecutive blending regions.
+    if (regions[k - 1].type == BLENDING_REGION &&
+        regions[k].type == BLENDING_REGION) {
+      int prev_dir = (stats[regions[k - 1].last].intra_error -
+                      stats[regions[k - 1].last - 1].intra_error) > 0
+                         ? 1
+                         : -1;
+      int next_dir = (stats[regions[k].last].intra_error -
+                      stats[regions[k].last - 1].intra_error) > 0
+                         ? 1
+                         : -1;
+
+      // if both are too short, no need to check
+      int total_length = regions[k].last - regions[k - 1].start + 1;
+      if (total_length < 4) {
+        regions[k - 1].type = HIGH_VAR_REGION;
+        k++;
+        continue;
+      }
+
+      int to_merge = 0;
+      if (prev_dir < 0 && next_dir > 0) {
+        // In this case we check the last frame in the previous region.
+        double prev_length =
+            (double)(regions[k - 1].last - regions[k - 1].start + 1);
+        double last_ratio, ratio_thres;
+        if (prev_length < 2.01) {
+          // if the previous region is very short
+          double max_coded_error =
+              AOMMAX(stats[regions[k - 1].last].coded_error,
+                     stats[regions[k - 1].last - 1].coded_error);
+          last_ratio = stats[regions[k - 1].last].sr_coded_error /
+                       AOMMAX(max_coded_error, 0.001);
+          ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+        } else {
+          double max_coded_error =
+              AOMMAX(stats[regions[k - 1].last].coded_error,
+                     stats[regions[k - 1].last - 1].coded_error);
+          last_ratio = stats[regions[k - 1].last].sr_coded_error /
+                       AOMMAX(max_coded_error, 0.001);
+          double prev_ratio =
+              (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+              (prev_length - 1.0);
+          ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+        }
+        if (last_ratio > ratio_thres) {
+          to_merge = 1;
         }
       }
+
+      if (to_merge) {
+        remove_region(0, regions, num_regions, &k);
+        analyze_region(stats, k - 1, regions);
+        continue;
+      } else {
+        // These are possibly two separate blending regions. Mark the boundary
+        // frame as HIGH_VAR_REGION to separate the two.
+        int prev_k = k - 1;
+        insert_region(regions[prev_k].last, regions[prev_k].last,
+                      HIGH_VAR_REGION, regions, num_regions, &prev_k);
+        analyze_region(stats, prev_k, regions);
+        k = prev_k + 1;
+        analyze_region(stats, k, regions);
+      }
     }
+    k++;
   }
-  if (prev_lows) {
-    // try with shrinking previous all_zero interval with unsure points
-    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
-      if ((is_high[n - cur_start + before_pad] == 0) ||
-          (is_high[n - cur_start + before_pad] &&
-           si[n - cur_start + before_pad] < SI_LOW)) {
-        int tentative_start = n - MIN_SHRINK_LEN;
-        // check if the previous interval can shrink this much
-        int available =
-            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
-            cur_start - tentative_start < prev_lows;
-        // shrinking too agressively may worsen performance
-        double ratio_thres =
-            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
-            1.0;
-
-        if (available && (ratio > ratio_thres)) {
-          cut_pos[count_cuts - 1] = tentative_start;
-          *cur_start_ptr = tentative_start;
-          *cur_last_ptr = n;
-          return;
+  cleanup_regions(regions, num_regions);
+}
+
+// Clean up decision for blendings. Remove blending regions that are too short.
+// Also if a very short high var region is between a blending and a stable
+// region, just merge it with one of them.
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+  int k = 0;
+  while (k<*num_regions && * num_regions> 1) {
+    int is_short_blending = regions[k].type == BLENDING_REGION &&
+                            regions[k].last - regions[k].start + 1 < 5;
+    int is_short_hv = regions[k].type == HIGH_VAR_REGION &&
+                      regions[k].last - regions[k].start + 1 < 5;
+    int has_stable_neighbor =
+        ((k > 0 && regions[k - 1].type == STABLE_REGION) ||
+         (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION));
+    int has_blend_neighbor =
+        ((k > 0 && regions[k - 1].type == BLENDING_REGION) ||
+         (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION));
+    int total_neighbors = (k > 0) + (k < *num_regions - 1);
+
+    if (is_short_blending ||
+        (is_short_hv &&
+         has_stable_neighbor + has_blend_neighbor >= total_neighbors)) {
+      // Remove this region.Try to determine whether to combine it with the
+      // previous or next region.
+      int merge;
+      double prev_diff =
+          (k > 0)
+              ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff)
+              : 1;
+      double next_diff =
+          (k < *num_regions - 1)
+              ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff)
+              : 1;
+      // merge == 0 means to merge with previous, 1 means to merge with next
+      merge = prev_diff > next_diff;
+      remove_region(merge, regions, num_regions, &k);
+    } else {
+      k++;
+    }
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
+                          int total_frames, int offset, REGIONS *regions,
+                          int *total_regions) {
+  int k;
+  if (total_frames <= 1) return;
+
+  // store the initial decisions
+  REGIONS temp_regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  av1_zero_array(temp_regions, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+  // buffers for filtered stats
+  double filt_intra_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+  double filt_coded_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+  double grad_coded[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
+
+  int cur_region = 0, this_start = 0, this_last;
+
+  int next_scenecut = -1;
+  do {
+    // first get the obvious scenecuts
+    next_scenecut =
+        find_next_scenecut(stats_start, this_start, total_frames - 1);
+    this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
+
+    // low-pass filter the needed stats
+    smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err,
+                        filt_coded_err);
+    get_gradient(filt_coded_err, this_start, this_last, grad_coded);
+
+    // find tentative stable regions and unstable regions
+    int num_regions = find_stable_regions(stats_start, grad_coded, this_start,
+                                          this_last, temp_regions);
+
+    adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions);
+
+    get_region_stats(stats_start, temp_regions, num_regions);
+
+    // Try to identify blending regions in the unstable regions
+    find_blending_regions(stats_start, temp_regions, &num_regions);
+    cleanup_blendings(temp_regions, &num_regions);
+
+    // The flash points should all be considered high variance points
+    k = 0;
+    while (k < num_regions) {
+      if (temp_regions[k].type != STABLE_REGION) {
+        k++;
+        continue;
+      }
+      int start = temp_regions[k].start;
+      int last = temp_regions[k].last;
+      for (int i = start; i <= last; i++) {
+        if (stats_start[i].is_flash) {
+          insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
         }
       }
+      k++;
     }
-  }  // prev_lows
-  return;
+    cleanup_regions(temp_regions, &num_regions);
+
+    // copy the regions in the scenecut group
+    for (k = 0; k < num_regions; k++) {
+      if (temp_regions[k].last < temp_regions[k].start &&
+          k == num_regions - 1) {
+        num_regions--;
+        break;
+      }
+      regions[k + cur_region] = temp_regions[k];
+    }
+    cur_region += num_regions;
+
+    // add the scenecut region
+    if (next_scenecut > -1) {
+      // add the scenecut region, and find the next scenecut
+      regions[cur_region].type = SCENECUT_REGION;
+      regions[cur_region].start = next_scenecut;
+      regions[cur_region].last = next_scenecut;
+      cur_region++;
+      this_start = next_scenecut + 1;
+    }
+  } while (next_scenecut >= 0);
+
+  *total_regions = cur_region;
+  get_region_stats(stats_start, regions, *total_regions);
+
+  for (k = 0; k < *total_regions; k++) {
+    // If scenecuts are very minor, mark them as high variance.
+    if (regions[k].type != SCENECUT_REGION ||
+        regions[k].avg_cor_coeff *
+                (1 - stats_start[regions[k].start].noise_var /
+                         regions[k].avg_intra_err) <
+            0.8) {
+      continue;
+    }
+    regions[k].type = HIGH_VAR_REGION;
+  }
+  cleanup_regions(regions, total_regions);
+  get_region_stats(stats_start, regions, *total_regions);
+
+  for (k = 0; k < *total_regions; k++) {
+    regions[k].start += offset;
+    regions[k].last += offset;
+  }
+}
+
+static int find_regions_index(const REGIONS *regions, int num_regions,
+                              int frame_idx) {
+  for (int k = 0; k < num_regions; k++) {
+    if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) {
+      return k;
+    }
+  }
+  return -1;
 }
 
-// This function decides the gf group length of future frames in batch
-// rc->gf_intervals is modified to store the group lengths
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group length of future frames in batch
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    max_gop_length   Maximum length of the GF group
+ * \param[in]    max_intervals    Maximum number of intervals to decide
+ *
+ * \return Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is
+ * changed to store the decided GF group lengths.
+ */
 static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
                                 int max_intervals) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FIRSTPASS_STATS next_frame;
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  FRAME_INFO *frame_info = &cpi->frame_info;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0);
+
+  const int f_w = cpi->common.width;
+  const int f_h = cpi->common.height;
   int i;
 
   int flash_detected;
 
-  aom_clear_system_state();
   av1_zero(next_frame);
 
   if (has_no_stats_stage(cpi)) {
     for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
-      rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+      p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
     }
-    rc->cur_gf_index = 0;
+    p_rc->cur_gf_index = 0;
     rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
     return;
   }
@@ -1284,45 +1901,37 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
   const int active_min_gf_interval = rc->min_gf_interval;
   const int active_max_gf_interval =
       AOMMIN(rc->max_gf_interval, max_gop_length);
+  const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
 
-  i = 0;
-  max_intervals = cpi->lap_enabled ? 1 : max_intervals;
-  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { 0 };
+  i = (rc->frames_since_key == 0);
+  max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals;
   int count_cuts = 1;
-  int cur_start = 0, cur_last;
+  // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF.
+  int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last;
+  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
   int cut_here;
-  int prev_lows = 0;
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
   while (count_cuts < max_intervals + 1) {
-    ++i;
-
     // reaches next key frame, break here
     if (i >= rc->frames_to_key) {
-      cut_pos[count_cuts] = i - 1;
-      count_cuts++;
-      break;
-    }
-
-    // reached maximum len, but nothing special yet (almost static)
-    // let's look at the next interval
-    if (i - cur_start >= rc->static_scene_max_gf_interval) {
+      cut_here = 2;
+    } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
+      // reached maximum len, but nothing special yet (almost static)
+      // let's look at the next interval
       cut_here = 1;
-    } else {
+    } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) {
       // reaches last frame, break
-      if (EOF == input_stats(twopass, &next_frame)) {
-        cut_pos[count_cuts] = i - 1;
-        count_cuts++;
-        break;
-      }
+      cut_here = 2;
+    } else {
       // Test for the case where there is a brief flash but the prediction
       // quality back to an earlier frame is then restored.
-      flash_detected = detect_flash(twopass, 0);
+      flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
       // TODO(bohanli): remove redundant accumulations here, or unify
       // this and the ones in define_gf_group
-      accumulate_next_frame_stats(&next_frame, frame_info, twopass,
-                                  flash_detected, rc->frames_since_key, i, 0,
-                                  rc->min_gf_interval, &gf_stats);
+      av1_accumulate_next_frame_stats(&next_frame, flash_detected,
+                                      rc->frames_since_key, i, &gf_stats, f_w,
+                                      f_h);
 
       cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
                                active_max_gf_interval, active_min_gf_interval,
@@ -1330,146 +1939,254 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
     }
     if (cut_here) {
       cur_last = i - 1;  // the current last frame in the gf group
+      int ori_last = cur_last;
+      // The region frame idx does not start from the same frame as cur_start
+      // and cur_last. Need to offset them.
+      int offset = rc->frames_since_key - p_rc->regions_offset;
+      REGIONS *regions = p_rc->regions;
+      int num_regions = p_rc->num_regions;
+
+      int scenecut_idx = -1;
       // only try shrinking if interval smaller than active_max_gf_interval
-      if (cur_last - cur_start <= active_max_gf_interval) {
-        // determine in the current decided gop the higher and lower errs
-        int n;
-        double ratio;
-
-        // load neighboring coded errs
-        int is_high[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        double errs[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        double si[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
-        int before_pad =
-            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_since_key - 1 + cur_start);
-        int after_pad =
-            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_to_key - cur_last - 1);
-        for (n = cur_start - before_pad; n <= cur_last + after_pad; n++) {
-          if (start_pos + n - 1 > twopass->stats_buf_ctx->stats_in_end) {
-            after_pad = n - cur_last - 1;
-            assert(after_pad >= 0);
+      if (cur_last - cur_start <= active_max_gf_interval &&
+          cur_last > cur_start) {
+        // find the region indices of where the first and last frame belong.
+        int k_start =
+            find_regions_index(regions, num_regions, cur_start + offset);
+        int k_last =
+            find_regions_index(regions, num_regions, cur_last + offset);
+        if (cur_start + offset == 0) k_start = 0;
+
+        // See if we have a scenecut in between
+        for (int r = k_start + 1; r <= k_last; r++) {
+          if (regions[r].type == SCENECUT_REGION &&
+              regions[r].last - offset - cur_start > active_min_gf_interval) {
+            scenecut_idx = r;
             break;
-          } else if (start_pos + n - 1 <
-                     twopass->stats_buf_ctx->stats_in_start) {
-            before_pad = cur_start - n - 1;
-            continue;
           }
-          errs[n + before_pad - cur_start] = (start_pos + n - 1)->coded_error;
         }
-        const int len = before_pad + after_pad + cur_last - cur_start + 1;
-        const int reset = determine_high_err_gf(
-            errs, is_high, si, len, &ratio, cur_start, cur_last, before_pad);
-
-        // if the current frame may have high error, try shrinking
-        if (is_high[cur_last - cur_start + before_pad] == 1 ||
-            (!reset && si[cur_last - cur_start + before_pad] < SI_LOW)) {
-          // try not to cut in high err area
-          set_last_prev_low_err(&cur_start, &cur_last, cut_pos, count_cuts,
-                                before_pad, ratio, is_high, si, prev_lows);
-        }  // if current frame high error
-        // count how many trailing lower error frames we have in this decided
-        // gf group
-        prev_lows = 0;
-        for (n = cur_last - 1; n > cur_start + MIN_SHRINK_LEN; n--) {
-          if (is_high[n - cur_start + before_pad] == 0 &&
-              (si[n - cur_start + before_pad] > SI_HIGH || reset)) {
-            prev_lows++;
-          } else {
-            break;
+
+        // if the found scenecut is very close to the end, ignore it.
+        if (regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+          scenecut_idx = -1;
+        }
+
+        if (scenecut_idx != -1) {
+          // If we have a scenecut, then stop at it.
+          // TODO(bohanli): add logic here to stop before the scenecut and for
+          // the next gop start from the scenecut with GF
+          int is_minor_sc =
+              (regions[scenecut_idx].avg_cor_coeff *
+                   (1 - stats[regions[scenecut_idx].start - offset].noise_var /
+                            regions[scenecut_idx].avg_intra_err) >
+               0.6);
+          cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+        } else {
+          int is_last_analysed = (k_last == num_regions - 1) &&
+                                 (cur_last + offset == regions[k_last].last);
+          int not_enough_regions =
+              k_last - k_start <=
+              1 + (regions[k_start].type == SCENECUT_REGION);
+          // if we are very close to the end, then do not shrink since it may
+          // introduce intervals that are too short
+          if (!(is_last_analysed && not_enough_regions)) {
+            const double arf_length_factor = 0.1;
+            double best_score = 0;
+            int best_j = -1;
+            const int first_frame = regions[0].start - offset;
+            const int last_frame = regions[num_regions - 1].last - offset;
+            // score of how much the arf helps the whole GOP
+            double base_score = 0.0;
+            // Accumulate base_score in
+            for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
+            }
+            int met_blending = 0;   // Whether we have met blending areas before
+            int last_blending = 0;  // Whether the previous frame if blending
+            for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
+              int this_reg =
+                  find_regions_index(regions, num_regions, j + offset);
+              if (this_reg < 0) continue;
+              // A GOP should include at most 1 blending region.
+              if (regions[this_reg].type == BLENDING_REGION) {
+                last_blending = 1;
+                if (met_blending) {
+                  break;
+                } else {
+                  base_score = 0;
+                  continue;
+                }
+              } else {
+                if (last_blending) met_blending = 1;
+                last_blending = 0;
+              }
+
+              // Add the factor of how good the neighborhood is for this
+              // candidate arf.
+              double this_score = arf_length_factor * base_score;
+              double temp_accu_coeff = 1.0;
+              // following frames
+              int count_f = 0;
+              for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+                if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
+                this_score +=
+                    temp_accu_coeff *
+                    (1 - stats[n].noise_var /
+                             AOMMAX(regions[this_reg].avg_intra_err, 0.001));
+                count_f++;
+              }
+              // preceding frames
+              temp_accu_coeff = 1.0;
+              for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+                if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
+                this_score +=
+                    temp_accu_coeff *
+                    (1 - stats[n].noise_var /
+                             AOMMAX(regions[this_reg].avg_intra_err, 0.001));
+              }
+
+              if (this_score > best_score) {
+                best_score = this_score;
+                best_j = j;
+              }
+            }
+
+            // For blending areas, move one more frame in case we missed the
+            // first blending frame.
+            int best_reg =
+                find_regions_index(regions, num_regions, best_j + offset);
+            if (best_reg < num_regions - 1 && best_reg > 0) {
+              if (regions[best_reg - 1].type == BLENDING_REGION &&
+                  regions[best_reg + 1].type == BLENDING_REGION) {
+                if (best_j + offset == regions[best_reg].start &&
+                    best_j + offset < regions[best_reg].last) {
+                  best_j += 1;
+                } else if (best_j + offset == regions[best_reg].last &&
+                           best_j + offset > regions[best_reg].start) {
+                  best_j -= 1;
+                }
+              }
+            }
+
+            if (cur_last - best_j < 2) best_j = cur_last;
+            if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+            // if cannot find anything, just cut at the original place.
           }
         }
       }
       cut_pos[count_cuts] = cur_last;
       count_cuts++;
 
-      // reset pointers to the shrinked location
-      twopass->stats_in = start_pos + cur_last;
+      // reset pointers to the shrunken location
+      cpi->twopass_frame.stats_in = start_pos + cur_last;
       cur_start = cur_last;
+      int cur_region_idx =
+          find_regions_index(regions, num_regions, cur_start + 1 + offset);
+      if (cur_region_idx >= 0)
+        if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
       i = cur_last;
 
+      if (cut_here > 1 && cur_last == ori_last) break;
+
       // reset accumulators
       init_gf_stats(&gf_stats);
     }
+    ++i;
   }
 
   // save intervals
   rc->intervals_till_gf_calculate_due = count_cuts - 1;
   for (int n = 1; n < count_cuts; n++) {
-    rc->gf_intervals[n - 1] = cut_pos[n] + 1 - cut_pos[n - 1];
-  }
-  rc->cur_gf_index = 0;
-  twopass->stats_in = start_pos;
-
-#if GF_SHRINK_OUTPUT
-  printf("\nf_to_key: %d, count_cut: %d. ", rc->frames_to_key, count_cuts);
-  for (int n = 0; n < count_cuts; n++) {
-    printf("%d ", cut_pos[n]);
+    p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
   }
-  printf("\n");
-
-  for (int n = 0; n < rc->intervals_till_gf_calculate_due; n++) {
-    printf("%d ", rc->gf_intervals[n]);
-  }
-  printf("\n\n");
-#endif
+  p_rc->cur_gf_index = 0;
+  cpi->twopass_frame.stats_in = start_pos;
 }
 
 static void correct_frames_to_key(AV1_COMP *cpi) {
   int lookahead_size =
-      (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage) + 1;
+      (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
   if (lookahead_size <
-      av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage)) {
+      av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
+    assert(
+        IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+                lookahead_size == cpi->ppi->frames_left));
     cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+  } else if (cpi->ppi->frames_left > 0) {
+    // Correct frames to key based on limit
+    cpi->rc.frames_to_key =
+        AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
   }
 }
 
-static void define_gf_group_pass0(AV1_COMP *cpi,
-                                  const EncodeFrameParams *const frame_params) {
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are avialable.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ *
+ * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group_pass0(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
   int target;
 
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
     av1_cyclic_refresh_set_golden_update(cpi);
   } else {
-    rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index];
+    p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index];
     rc->intervals_till_gf_calculate_due--;
-    rc->cur_gf_index++;
+    p_rc->cur_gf_index++;
   }
 
   // correct frames_to_key when lookahead queue is flushing
   correct_frames_to_key(cpi);
 
-  if (rc->baseline_gf_interval > rc->frames_to_key)
-    rc->baseline_gf_interval = rc->frames_to_key;
+  if (p_rc->baseline_gf_interval > rc->frames_to_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
 
-  rc->gfu_boost = DEFAULT_GF_BOOST;
-  rc->constrained_gf_group =
-      (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+  p_rc->gfu_boost = DEFAULT_GF_BOOST;
+  p_rc->constrained_gf_group =
+      (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
 
-  gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+  gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height;
 
   // Rare case when the look-ahead is less than the target GOP length, can't
   // generate ARF frame.
-  if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames ||
-      !is_altref_enabled(cpi) || rc->baseline_gf_interval < rc->min_gf_interval)
+  if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+      !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
+      p_rc->baseline_gf_interval < rc->min_gf_interval)
     gf_group->max_layer_depth_allowed = 0;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
-  av1_gop_setup_structure(cpi, frame_params);
+  av1_gop_setup_structure(cpi);
 
   // Allocate bits to each of the frames in the GF group.
   // TODO(sarahparker) Extend this to work with pyramid structure.
   for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) {
     const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index];
-    if (cpi->oxcf.rc_mode == AOM_CBR) {
-      if (cur_update_type == KEY_FRAME) {
+    if (oxcf->rc_cfg.mode == AOM_CBR) {
+      if (cur_update_type == KF_UPDATE) {
         target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
       } else {
         target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type);
       }
     } else {
-      if (cur_update_type == KEY_FRAME) {
+      if (cur_update_type == KF_UPDATE) {
         target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
       } else {
         target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type);
@@ -1479,41 +2196,9 @@ static void define_gf_group_pass0(AV1_COMP *cpi,
   }
 }
 
-static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position,
-                                            int active_max_gf_interval,
-                                            int use_alt_ref,
-                                            int is_final_pass) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  // Set the interval until the next gf.
-  // If forward keyframes are enabled, ensure the final gf group obeys the
-  // MIN_FWD_KF_INTERVAL.
-  if (cpi->oxcf.fwd_kf_enabled && use_alt_ref &&
-      ((twopass->stats_in - arf_position + rc->frames_to_key) <
-       twopass->stats_buf_ctx->stats_in_end) &&
-      cpi->rc.next_is_fwd_key) {
-    if (arf_position == rc->frames_to_key) {
-      rc->baseline_gf_interval = arf_position;
-      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
-    } else if ((rc->frames_to_key - arf_position <
-                AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
-               (rc->frames_to_key != arf_position)) {
-      // if possible, merge the last two gf groups
-      if (rc->frames_to_key <= active_max_gf_interval) {
-        rc->baseline_gf_interval = rc->frames_to_key;
-        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
-        // if merging the last two gf groups creates a group that is too long,
-        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
-      } else {
-        rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
-        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
-      }
-    } else {
-      rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
-    }
-  } else {
-    rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending;
-  }
+static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc,
+                                            int arf_position) {
+  p_rc->baseline_gf_interval = arf_position;
 }
 
 // initialize GF_GROUP_STATS
@@ -1533,189 +2218,305 @@ static void init_gf_stats(GF_GROUP_STATS *gf_stats) {
   gf_stats->abs_mv_in_out_accumulator = 0.0;
 
   gf_stats->avg_sr_coded_error = 0.0;
-  gf_stats->avg_tr_coded_error = 0.0;
   gf_stats->avg_pcnt_second_ref = 0.0;
-  gf_stats->avg_pcnt_third_ref = 0.0;
-  gf_stats->avg_pcnt_third_ref_nolast = 0.0;
   gf_stats->avg_new_mv_count = 0.0;
   gf_stats->avg_wavelet_energy = 0.0;
   gf_stats->avg_raw_err_stdev = 0.0;
   gf_stats->non_zero_stdev_count = 0;
+}
+
+static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w,
+                                 int f_h, FIRSTPASS_STATS *next_frame,
+                                 const FIRSTPASS_STATS *start_pos,
+                                 GF_GROUP_STATS *gf_stats, int *idx) {
+  int i, flash_detected;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  RATE_CONTROL *const rc = &cpi->rc;
+  FRAME_INFO *frame_info = &cpi->frame_info;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  init_gf_stats(gf_stats);
+  av1_zero(*next_frame);
+
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  i = is_intra_only;
+  // get the determined gf group length from p_rc->gf_intervals
+  while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+    // read in the next frame
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+    // Accumulate error score of frames in this gf group.
+    double mod_frame_err =
+        calculate_modified_err(frame_info, twopass, oxcf, next_frame);
+    // accumulate stats for this frame
+    accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats);
+    ++i;
+  }
+
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+  i = is_intra_only;
+  input_stats(twopass, &cpi->twopass_frame, next_frame);
+  while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+    // read in the next frame
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+
+    // accumulate stats for next frame
+    av1_accumulate_next_frame_stats(next_frame, flash_detected,
+                                    rc->frames_since_key, i, gf_stats, f_w,
+                                    f_h);
+
+    ++i;
+  }
+
+  i = p_rc->gf_intervals[p_rc->cur_gf_index];
+  average_gf_stats(i, gf_stats);
+
+  *idx = i;
+}
+
+static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc,
+                              int idx, int is_final_pass) {
+  if (is_final_pass) {
+    rc->intervals_till_gf_calculate_due--;
+    p_rc->cur_gf_index++;
+  }
 
-  gf_stats->allow_alt_ref = 0;
+  // Was the group length constrained by the requirement for a new KF?
+  p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0;
+
+  set_baseline_gf_interval(p_rc, idx);
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
 }
 
-// Analyse and define a gf/arf group.
 #define MAX_GF_BOOST 5400
-static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                            const EncodeFrameParams *const frame_params,
-                            int max_gop_length, int is_final_pass) {
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
+                               int is_final_pass, int use_alt_ref,
+                               int alt_offset, const FIRSTPASS_STATS *start_pos,
+                               GF_GROUP_STATS *gf_stats) {
+  // Should we use the alternate reference frame.
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS next_frame;
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  GF_GROUP *gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   FRAME_INFO *frame_info = &cpi->frame_info;
-  int i;
-
-  int flash_detected;
-  int64_t gf_group_bits;
-  const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
-                            frame_params->frame_type == INTRA_ONLY_FRAME;
-  const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
 
-  cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1);
+  int ext_len = i - is_intra_only;
+  if (use_alt_ref) {
+    const int forward_frames = (rc->frames_to_key - i >= ext_len)
+                                   ? ext_len
+                                   : AOMMAX(0, rc->frames_to_key - i);
 
-  // Reset the GF group data structures unless this is a key
-  // frame in which case it will already have been done.
-  if (!is_intra_only) {
-    av1_zero(cpi->gf_group);
+    // Calculate the boost for alt ref.
+    p_rc->gfu_boost = av1_calc_arf_boost(
+        twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset,
+        forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost,
+        &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled);
+  } else {
+    reset_fpf_position(&cpi->twopass_frame, start_pos);
+    p_rc->gfu_boost = AOMMIN(
+        MAX_GF_BOOST,
+        av1_calc_arf_boost(
+            twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len,
+            0, &p_rc->num_stats_used_for_gfu_boost,
+            &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled));
   }
 
-  aom_clear_system_state();
-  av1_zero(next_frame);
+#define LAST_ALR_BOOST_FACTOR 0.2f
+  p_rc->arf_boost_factor = 1.0;
+  if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
+    // Reduce the boost of altref in the last gf group
+    if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - ext_len == 0) {
+      p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+    }
+  }
 
-  if (has_no_stats_stage(cpi)) {
-    define_gf_group_pass0(cpi, frame_params);
-    return;
+  // Reset the file position.
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
+  if (cpi->ppi->lap_enabled) {
+    // Since we don't have enough stats to know the actual error of the
+    // gf group, we assume error of each frame to be equal to 1 and set
+    // the error of the group as baseline_gf_interval.
+    gf_stats->gf_group_err = p_rc->baseline_gf_interval;
   }
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  p_rc->gf_group_bits =
+      calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err);
 
-  // correct frames_to_key when lookahead queue is emptying
-  if (cpi->lap_enabled) {
-    correct_frames_to_key(cpi);
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more agressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) &&
+      is_final_pass) {
+    const int vbr_group_bits_per_frame =
+        (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval);
+    const double group_av_err =
+        gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+        gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+        ((gf_stats->gf_group_inactive_zone_rows * 2) /
+         (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+    int tmp_q;
+    tmp_q = get_twopass_worst_quality(
+        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+        vbr_group_bits_per_frame);
+    rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
   }
+#endif
 
-  GF_GROUP_STATS gf_stats;
-  init_gf_stats(&gf_stats);
-  GF_FRAME_STATS first_frame_stats, last_frame_stats;
+  // Adjust KF group bits and error remaining.
+  if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err;
 
-  gf_stats.allow_alt_ref = is_altref_enabled(cpi);
-  const int can_disable_arf = (oxcf->gf_min_pyr_height == MIN_PYRAMID_LVL);
+  // Reset the file position.
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
 
-  // Load stats for the current frame.
-  double mod_frame_err =
-      calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (rc->frames_since_key != 0) {
+    twopass->section_intra_rating = calculate_section_intra_ratio(
+        start_pos, twopass->stats_buf_ctx->stats_in_end,
+        p_rc->baseline_gf_interval);
+  }
 
-  // Note the error of the frame at the start of the group. This will be
-  // the GF frame error if we code a normal gf.
-  first_frame_stats.frame_err = mod_frame_err;
-  first_frame_stats.frame_coded_error = this_frame->coded_error;
-  first_frame_stats.frame_sr_coded_error = this_frame->sr_coded_error;
-  first_frame_stats.frame_tr_coded_error = this_frame->tr_coded_error;
+  av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
+                         use_alt_ref, p_rc->gf_group_bits);
 
-  // If this is a key frame or the overlay from a previous arf then
-  // the error score / cost of this frame has already been accounted for.
-  if (arf_active_or_kf) {
-    gf_stats.gf_group_err -= first_frame_stats.frame_err;
-#if GROUP_ADAPTIVE_MAXQ
-    gf_stats.gf_group_raw_error -= this_frame->coded_error;
-#endif
-    gf_stats.gf_group_skip_pct -= this_frame->intra_skip_pct;
-    gf_stats.gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+  // TODO(jingning): Generalize this condition.
+  if (is_final_pass) {
+    cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref;
+
+    // Reset rolling actual and target bits counters for ARF groups.
+    twopass->rolling_arf_group_target_bits = 1;
+    twopass->rolling_arf_group_actual_bits = 1;
+  }
+#if CONFIG_BITRATE_ACCURACY
+  if (is_final_pass) {
+    av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info,
+                                  p_rc->baseline_gf_interval);
   }
+#endif
+}
 
-  // TODO(urvang): Try logic to vary min and max interval based on q.
-  const int active_min_gf_interval = rc->min_gf_interval;
-  const int active_max_gf_interval =
-      AOMMIN(rc->max_gf_interval, max_gop_length);
+/*!\brief Define a GF group.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    frame_params    Structure with frame parameters
+ * \param[in]    is_final_pass   Whether this is the final pass for the
+ *                               GF group, or a trial (non-zero)
+ *
+ * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+                            int is_final_pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  const int f_w = cm->width;
+  const int f_h = cm->height;
+  int i;
+  const int is_intra_only = rc->frames_since_key == 0;
 
-  i = 0;
-  // get the determined gf group length from rc->gf_intervals
-  while (i < rc->gf_intervals[rc->cur_gf_index]) {
-    ++i;
-    // Accumulate error score of frames in this gf group.
-    mod_frame_err =
-        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
-    // accumulate stats for this frame
-    accumulate_this_frame_stats(this_frame, mod_frame_err, &gf_stats);
+  cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
 
-    // read in the next frame
-    if (EOF == input_stats(twopass, &next_frame)) break;
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (!is_intra_only) {
+    av1_zero(cpi->ppi->gf_group);
+    cpi->gf_frame_index = 0;
+  }
 
-    // Test for the case where there is a brief flash but the prediction
-    // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(twopass, 0);
+  if (has_no_stats_stage(cpi)) {
+    define_gf_group_pass0(cpi);
+    return;
+  }
 
-    // accumulate stats for next frame
-    accumulate_next_frame_stats(
-        &next_frame, frame_info, twopass, flash_detected, rc->frames_since_key,
-        i, can_disable_arf, rc->min_gf_interval, &gf_stats);
+  if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+    int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass);
+    if (ret == 0) return;
 
-    *this_frame = next_frame;
+    av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+    cpi->third_pass_ctx = NULL;
   }
-  // save the errs for the last frame
-  last_frame_stats.frame_coded_error = next_frame.coded_error;
-  last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
-  last_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error;
 
-  if (is_final_pass) {
-    rc->intervals_till_gf_calculate_due--;
-    rc->cur_gf_index++;
+  // correct frames_to_key when lookahead queue is emptying
+  if (cpi->ppi->lap_enabled) {
+    correct_frames_to_key(cpi);
   }
 
-  // Was the group length constrained by the requirement for a new KF?
-  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+  GF_GROUP_STATS gf_stats;
+  accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+                       &gf_stats, &i);
 
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                          ? cpi->initial_mbs
-                          : cm->mi_params.MBs;
-  assert(num_mbs > 0);
+  const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
 
-  average_gf_stats(i, &next_frame, &gf_stats);
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  const int active_min_gf_interval = rc->min_gf_interval;
 
   // Disable internal ARFs for "still" gf groups.
   //   zero_motion_accumulator: minimum percentage of (0,0) motion;
   //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
   //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
   //                            motion error per block of each frame.
-  const int can_disable_internal_arfs =
-      (oxcf->gf_min_pyr_height <= MIN_PYRAMID_LVL + 1);
+  const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
   if (can_disable_internal_arfs &&
       gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
-      gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR &&
       gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
-    cpi->internal_altref_allowed = 0;
+    cpi->ppi->internal_altref_allowed = 0;
   }
 
   int use_alt_ref;
   if (can_disable_arf) {
-    use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator,
-                                    twopass->kf_zeromotion_pct) &&
-                  gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-                  (i >= MIN_GF_INTERVAL) &&
-                  (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
-
-    // TODO(urvang): Improve and use model for VBR, CQ etc as well.
-    if (use_alt_ref && cpi->oxcf.rc_mode == AOM_Q &&
-        cpi->oxcf.cq_level <= 200) {
-      aom_clear_system_state();
-      float features[21];
-      get_features_from_gf_stats(
-          &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs,
-          rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features);
-      // Infer using ML model.
-      float score;
-      av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
-      use_alt_ref = (score <= 0.0);
-    }
-  } else {
-    assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
     use_alt_ref =
-        gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i > 2);
+        !is_almost_static(gf_stats.zero_motion_accumulator,
+                          twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) &&
+        p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+        (i >= MIN_GF_INTERVAL);
+  } else {
+    use_alt_ref = p_rc->use_arf_in_this_kf_group &&
+                  (i < gf_cfg->lag_in_frames) && (i > 2);
+  }
+  if (use_alt_ref) {
+    gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+  } else {
+    gf_group->max_layer_depth_allowed = 0;
   }
 
-#define REDUCE_GF_LENGTH_THRESH 4
-#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
-#define REDUCE_GF_LENGTH_BY 1
   int alt_offset = 0;
   // The length reduction strategy is tweaked for certain cases, and doesn't
   // work well for certain other cases.
   const int allow_gf_length_reduction =
-      ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
-       !cpi->internal_altref_allowed) &&
-      !is_lossless_requested(&cpi->oxcf);
+      ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
+       !cpi->ppi->internal_altref_allowed) &&
+      !is_lossless_requested(rc_cfg);
 
   if (allow_gf_length_reduction && use_alt_ref) {
     // adjust length of this gf group if one of the following condition met
@@ -1740,135 +2541,100 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
         alt_offset = -roll_back;
         i -= roll_back;
         if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+        p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back;
+        reset_fpf_position(&cpi->twopass_frame, start_pos);
+        accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame,
+                             start_pos, &gf_stats, &i);
       }
     }
   }
 
-  // Should we use the alternate reference frame.
-  if (use_alt_ref) {
-    rc->source_alt_ref_pending = 1;
-    gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
-    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
-                             is_final_pass);
-
-    const int forward_frames = (rc->frames_to_key - i >= i - 1)
-                                   ? i - 1
-                                   : AOMMAX(0, rc->frames_to_key - i);
-
-    // Calculate the boost for alt ref.
-    rc->gfu_boost = av1_calc_arf_boost(
-        twopass, rc, frame_info, alt_offset, forward_frames, (i - 1),
-        cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
-        cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL);
-  } else {
-    reset_fpf_position(twopass, start_pos);
-    rc->source_alt_ref_pending = 0;
-    gf_group->max_layer_depth_allowed = 0;
-    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
-                             is_final_pass);
-
-    rc->gfu_boost = AOMMIN(
-        MAX_GF_BOOST,
-        av1_calc_arf_boost(
-            twopass, rc, frame_info, alt_offset, (i - 1), 0,
-            cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
-            cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
-  }
+  update_gop_length(rc, p_rc, i, is_final_pass);
 
-  // rc->gf_intervals assumes the usage of alt_ref, therefore adding one overlay
-  // frame to the next gf. If no alt_ref is used, should substract 1 frame from
-  // the next gf group.
-  // TODO(bohanli): should incorporate the usage of alt_ref into
-  // calculate_gf_length
-  if (is_final_pass && rc->source_alt_ref_pending == 0 &&
-      rc->intervals_till_gf_calculate_due > 0) {
-    rc->gf_intervals[rc->cur_gf_index]--;
-  }
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi);
 
-#define LAST_ALR_BOOST_FACTOR 0.2f
-  rc->arf_boost_factor = 1.0;
-  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
-    // Reduce the boost of altref in the last gf group
-    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
-        rc->frames_to_key - i == 0) {
-      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
-    }
-  }
+  set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref,
+                     alt_offset, start_pos, &gf_stats);
 
-  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+  frame_params->frame_type =
+      rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+  frame_params->show_frame =
+      !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+        gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+}
 
-  // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
+/*!\brief Define a GF group for the third apss.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group for the third pass, along
+ * with various parameters regarding bit-allocation and quality setup based on
+ * the two-pass bitstream.
+ * Much of the function still uses the strategies used for the second pass and
+ * relies on first pass statistics. It is expected that over time these portions
+ * would be replaced with strategies specific to the third pass.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    frame_params    Structure with frame parameters
+ * \param[in]    is_final_pass   Whether this is the final pass for the
+ *                               GF group, or a trial (non-zero)
+ *
+ * \return       0: Success;
+ *              -1: There are conflicts between the bitstream and current config
+ *               The values in cpi->ppi->gf_group are also changed.
+ */
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+                                 int is_final_pass) {
+  if (!cpi->third_pass_ctx) return -1;
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+  const int f_w = cm->width;
+  const int f_h = cm->height;
+  int i;
+  const int is_intra_only = rc->frames_since_key == 0;
 
-  // Calculate the bits to be allocated to the gf/arf group as a whole
-  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
-  rc->gf_group_bits = gf_group_bits;
+  cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
 
-#if GROUP_ADAPTIVE_MAXQ
-  // Calculate an estimate of the maxq needed for the group.
-  // We are more agressive about correcting for sections
-  // where there could be significant overshoot than for easier
-  // sections where we do not wish to risk creating an overshoot
-  // of the allocated bit budget.
-  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
-    const int vbr_group_bits_per_frame =
-        (int)(gf_group_bits / rc->baseline_gf_interval);
-    const double group_av_err =
-        gf_stats.gf_group_raw_error / rc->baseline_gf_interval;
-    const double group_av_skip_pct =
-        gf_stats.gf_group_skip_pct / rc->baseline_gf_interval;
-    const double group_av_inactive_zone =
-        ((gf_stats.gf_group_inactive_zone_rows * 2) /
-         (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (!is_intra_only) {
+    av1_zero(cpi->ppi->gf_group);
+    cpi->gf_frame_index = 0;
+  }
 
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    int64_t bits = cpi->oxcf.target_bandwidth;
+  GF_GROUP_STATS gf_stats;
+  accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+                       &gf_stats, &i);
 
-    if (bits > 0) {
-      int rate_error;
+  const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
 
-      rate_error = (int)((rc->vbr_bits_off_target * 100) / bits);
-      rate_error = clamp(rate_error, -100, 100);
-      if (rate_error > 0) {
-        rc_factor = AOMMAX(RC_FACTOR_MIN, (double)(100 - rate_error) / 100.0);
-      } else {
-        rc_factor = AOMMIN(RC_FACTOR_MAX, (double)(100 - rate_error) / 100.0);
-      }
-    }
+  // TODO(any): set cpi->ppi->internal_altref_allowed accordingly;
 
-    tmp_q = get_twopass_worst_quality(
-        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
-        vbr_group_bits_per_frame, rc_factor);
-    rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+  int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx);
+  if (use_alt_ref == 0 && !can_disable_arf) return -1;
+  if (use_alt_ref) {
+    gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+  } else {
+    gf_group->max_layer_depth_allowed = 0;
   }
-#endif
 
-  // Adjust KF group bits and error remaining.
-  if (is_final_pass)
-    twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
+  update_gop_length(rc, p_rc, i, is_final_pass);
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
-  av1_gop_setup_structure(cpi, frame_params);
-
-  // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
+  av1_gop_setup_structure(cpi);
 
-  // Calculate a section intra ratio used in setting max loop filter.
-  if (frame_params->frame_type != KEY_FRAME) {
-    twopass->section_intra_rating = calculate_section_intra_ratio(
-        start_pos, twopass->stats_buf_ctx->stats_in_end,
-        rc->baseline_gf_interval);
-  }
-
-  // Reset rolling actual and target bits counters for ARF groups.
-  twopass->rolling_arf_group_target_bits = 1;
-  twopass->rolling_arf_group_actual_bits = 1;
+  set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0,
+                     start_pos, &gf_stats);
 
-  av1_gop_bit_allocation(cpi, rc, gf_group,
-                         frame_params->frame_type == KEY_FRAME, use_alt_ref,
-                         gf_group_bits);
+  frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type;
+  frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame;
+  return 0;
 }
 
 // #define FIXED_ARF_BITS
@@ -1878,20 +2644,22 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
 void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
                             GF_GROUP *gf_group, int is_key_frame, int use_arf,
                             int64_t gf_group_bits) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   // Calculate the extra bits to be used for boosted frame(s)
 #ifdef FIXED_ARF_BITS
   int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
 #else
-  int gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                         rc->gfu_boost, gf_group_bits);
+  int gf_arf_bits = calculate_boost_bits(
+      p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost,
+      gf_group_bits);
 #endif
 
   gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
                                                    gf_group_bits, 1);
 
   // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame,
-                         use_arf);
+  allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits,
+                         is_key_frame, use_arf);
 }
 
 // Minimum % intra coding observed in first pass (1.0 = 100%)
@@ -1907,7 +2675,7 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
 #define VERY_LOW_INTER_THRESH 0.05
 // Maximum threshold for the relative ratio of intra error score vs best
 // inter error score.
-#define KF_II_ERR_THRESHOLD 2.5
+#define KF_II_ERR_THRESHOLD 1.9
 // In real scene cuts there is almost always a sharp change in the intra
 // or inter error score.
 #define ERR_CHANGE_THRESHOLD 0.4
@@ -1915,6 +2683,25 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
 // ratio in the next frame.
 #define II_IMPROVEMENT_THRESHOLD 3.5
 #define KF_II_MAX 128.0
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// Clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for case where there is very low error either side of the current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+                            const FIRSTPASS_STATS *last_frame,
+                            const FIRSTPASS_STATS *next_frame) {
+  return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+         (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+         (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
 
 // Threshold for use of the lagging second reference frame. High second ref
 // usage may point to a transient event like a flash or occlusion rather than
@@ -1933,90 +2720,114 @@ static double get_second_ref_usage_thresh(int frame_count_so_far) {
              second_ref_usage_thresh_max_delta;
 }
 
-static int test_candidate_kf(TWO_PASS *twopass,
-                             const FIRSTPASS_STATS *last_frame,
-                             const FIRSTPASS_STATS *this_frame,
-                             const FIRSTPASS_STATS *next_frame,
-                             int frame_count_so_far, enum aom_rc_mode rc_mode) {
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+                             int this_stats_index, int frame_count_so_far,
+                             enum aom_rc_mode rc_mode, int scenecut_mode,
+                             int num_mbs) {
+  const FIRSTPASS_STATS *last_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+  const FIRSTPASS_STATS *this_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index);
+  const FIRSTPASS_STATS *next_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+  if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+    return 0;
+  }
+
   int is_viable_kf = 0;
-  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double pcnt_intra = 1.0 - this_stats->pcnt_inter;
   double modified_pcnt_inter =
-      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+      this_stats->pcnt_inter - this_stats->pcnt_neutral;
   const double second_ref_usage_thresh =
       get_second_ref_usage_thresh(frame_count_so_far);
+  int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
+  int count_for_tolerable_prediction = 3;
+
+  // We do "-1" because the candidate key is not counted.
+  int stats_after_this_stats =
+      av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
+
+  if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+    if (stats_after_this_stats < 3) {
+      return 0;
+    } else {
+      frames_to_test_after_candidate_key = 3;
+      count_for_tolerable_prediction = 1;
+    }
+  }
+  // Make sure we have enough stats after the candidate key.
+  frames_to_test_after_candidate_key =
+      AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats);
 
   // Does the frame satisfy the primary criteria of a key frame?
   // See above for an explanation of the test criteria.
   // If so, then examine how well it predicts subsequent frames.
   if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
-      (this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
-      (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
-      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+      (this_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+      (next_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+      ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       slide_transition(this_stats, last_stats, next_stats) ||
        ((pcnt_intra > MIN_INTRA_LEVEL) &&
         (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
-        ((this_frame->intra_error /
-          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+        ((this_stats->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) <
          KF_II_ERR_THRESHOLD) &&
-        ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+        ((fabs(last_stats->coded_error - this_stats->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_stats->coded_error) >
           ERR_CHANGE_THRESHOLD) ||
-         (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+         (fabs(last_stats->intra_error - this_stats->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_stats->intra_error) >
           ERR_CHANGE_THRESHOLD) ||
-         ((next_frame->intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+         ((next_stats->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) >
           II_IMPROVEMENT_THRESHOLD))))) {
     int i;
-    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
-    FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
 
     // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < SCENE_CUT_KEY_TEST_INTERVAL; ++i) {
-      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
-                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+    for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
+      // Get the next frame details
+      const FIRSTPASS_STATS *local_next_frame =
+          av1_firstpass_info_peek(firstpass_info, this_stats_index + i);
+      double next_iiratio =
+          (BOOST_FACTOR * local_next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error));
 
       if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
 
       // Cumulative effect of decay in prediction quality.
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator *= local_next_frame.pcnt_inter;
+      if (local_next_frame->pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame->pcnt_inter;
       else
-        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+        decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0;
 
       // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
       // Test various breakout clauses.
-      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+      // TODO(any): Test of intra error should be normalized to an MB.
+      if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) <
             0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)) {
+          (local_next_frame->intra_error < (200.0 / (double)num_mbs))) {
         break;
       }
 
       old_boost_score = boost_score;
-
-      // Get the next frame details
-      if (EOF == input_stats(twopass, &local_next_frame)) break;
     }
 
     // If there is tolerable prediction for at least the next 3 frames then
     // break out else discard this potential key frame and move on
-    if (boost_score > 30.0 && (i > 3)) {
+    if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
       is_viable_kf = 1;
     } else {
-      // Reset the file position
-      reset_fpf_position(twopass, start_pos);
-
       is_viable_kf = 0;
     }
   }
-
   return is_viable_kf;
 }
 
@@ -2028,10 +2839,8 @@ static int test_candidate_kf(TWO_PASS *twopass,
 #define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
 
 static int detect_app_forced_key(AV1_COMP *cpi) {
-  if (cpi->oxcf.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
   int num_frames_to_app_forced_key = is_forced_keyframe_pending(
-      cpi->lookahead, cpi->lookahead->max_sz, cpi->compressor_stage);
-  if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
+      cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage);
   return num_frames_to_app_forced_key;
 }
 
@@ -2041,33 +2850,49 @@ static int get_projected_kf_boost(AV1_COMP *cpi) {
    * all stats needed for prior boost calculation are available.
    * Hence projecting the prior boost is not needed in this cases.
    */
-  if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
-    return cpi->rc.kf_boost;
+  if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+    return cpi->ppi->p_rc.kf_boost;
 
   // Get the current tpl factor (number of frames = frames_to_key).
   double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
   // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
-  double tpl_factor_num_stats =
-      av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost);
+  double tpl_factor_num_stats = av1_get_kf_boost_projection_factor(
+      cpi->ppi->p_rc.num_stats_used_for_kf_boost);
   int projected_kf_boost =
-      (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats);
+      (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats);
   return projected_kf_boost;
 }
 
-static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                              double *kf_group_err,
-                              int num_frames_to_detect_scenecut) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    firstpass_info   struct for firstpass info
+ * \param[in]    num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in]    search_start_idx   the start index for searching key frame.
+ *                                  Set it to one if we already know the
+ *                                  current frame is key frame. Otherwise,
+ *                                  set it to zero.
+ *
+ * \return       Number of frames to the next key including the current frame.
+ */
+static int define_kf_interval(AV1_COMP *cpi,
+                              const FIRSTPASS_INFO *firstpass_info,
+                              int num_frames_to_detect_scenecut,
+                              int search_start_idx) {
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
-  FIRSTPASS_STATS last_frame;
   double decay_accumulator = 1.0;
   int i = 0, j;
-  int frames_to_key = 1;
+  int frames_to_key = search_start_idx;
   int frames_since_key = rc->frames_since_key + 1;
-  FRAME_INFO *const frame_info = &cpi->frame_info;
-  int num_stats_used_for_kf_boost = 1;
   int scenecut_detected = 0;
 
   int num_frames_to_next_key = detect_app_forced_key(cpi);
@@ -2087,35 +2912,33 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
   for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
 
   i = 0;
-  while (twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
+  const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cpi->common.mi_params.MBs;
+  const int future_stats_count =
+      av1_firstpass_info_future_count(firstpass_info, 0);
+  while (frames_to_key < future_stats_count &&
          frames_to_key < num_frames_to_detect_scenecut) {
-    // Accumulate total number of stats available till next key frame
-    num_stats_used_for_kf_boost++;
-
-    // Accumulate kf group error.
-    if (kf_group_err != NULL)
-      *kf_group_err +=
-          calculate_modified_err(frame_info, twopass, oxcf, this_frame);
-
-    // Load the next frame's stats.
-    last_frame = *this_frame;
-    input_stats(twopass, this_frame);
-
     // Provided that we are not at the end of the file...
-    if (cpi->rc.enable_scenecut_detection && cpi->oxcf.auto_key &&
-        twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) {
+    if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+        frames_to_key + 1 < future_stats_count) {
       double loop_decay_rate;
 
       // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
-                            frames_since_key, oxcf->rc_mode)) {
-        scenecut_detected = 1;
-        break;
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = test_candidate_kf(
+            &twopass->firstpass_info, frames_to_key, frames_since_key,
+            oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+            num_mbs);
+        if (scenecut_detected) {
+          break;
+        }
       }
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate =
-          get_prediction_decay_rate(frame_info, twopass->stats_in);
+      const FIRSTPASS_STATS *next_stats =
+          av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+      loop_decay_rate = get_prediction_decay_rate(next_stats);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -2127,11 +2950,17 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(twopass, rc->min_gf_interval, i,
-                                     cpi->oxcf.key_freq - i, loop_decay_rate,
-                                     decay_accumulator)) {
-        scenecut_detected = 1;
-        break;
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = detect_transition_to_still(
+            firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+            kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+        if (scenecut_detected) {
+          // In the case of transition followed by a static scene, the key frame
+          // could be a good predictor for the following frames, therefore we
+          // do not use an arf.
+          p_rc->use_arf_in_this_kf_group = 0;
+          break;
+        }
       }
 
       // Step on to the next frame.
@@ -2139,76 +2968,234 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
       ++frames_since_key;
 
       // If we don't have a real key frame within the next two
-      // key_freq intervals then break out of the loop.
-      if (frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+      // key_freq_max intervals then break out of the loop.
+      if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+        break;
+      }
     } else {
       ++frames_to_key;
       ++frames_since_key;
     }
     ++i;
   }
-
-  if (kf_group_err != NULL)
-    rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost;
-
-  if (cpi->lap_enabled && !scenecut_detected)
+  if (cpi->ppi->lap_enabled && !scenecut_detected)
     frames_to_key = num_frames_to_next_key;
 
   return frames_to_key;
 }
 
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+                                     TWO_PASS_FRAME *twopass_frame,
+                                     const FIRSTPASS_STATS *first_frame,
+                                     const FIRSTPASS_STATS *start_position,
+                                     int frames_to_key) {
+  FIRSTPASS_STATS cur_frame = *first_frame;
+  int num_frames, i;
+  double kf_group_avg_error = 0.0;
+
+  reset_fpf_position(twopass_frame, start_position);
+
+  for (i = 0; i < frames_to_key; ++i) {
+    kf_group_avg_error += cur_frame.coded_error;
+    if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break;
+  }
+  num_frames = i + 1;
+  num_frames = AOMMIN(num_frames, frames_to_key);
+  kf_group_avg_error = kf_group_avg_error / num_frames;
+
+  return (kf_group_avg_error);
+}
+
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+                                 double kf_group_avg_error) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  int64_t kf_group_bits;
+  if (cpi->ppi->lap_enabled) {
+    kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+    if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+      double vbr_corpus_complexity_lap =
+          cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+      /* Get the average corpus complexity of the frame */
+      kf_group_bits = (int64_t)(
+          kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
+    }
+  } else {
+    kf_group_bits = (int64_t)(twopass->bits_left *
+                              (kf_group_err / twopass->modified_error_left));
+  }
+
+  return kf_group_bits;
+}
+
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS cur_frame;
+  av1_zero(cur_frame);
+  int num_frames = 0;
+  // Accumulate total stat using available number of stats.
+  for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) {
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break;
+    av1_accumulate_stats(avg_frame_stat, &cur_frame);
+  }
+
+  if (num_frames < 2) {
+    return num_frames;
+  }
+  // Average the total stat
+  avg_frame_stat->weight = avg_frame_stat->weight / num_frames;
+  avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames;
+  avg_frame_stat->frame_avg_wavelet_energy =
+      avg_frame_stat->frame_avg_wavelet_energy / num_frames;
+  avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames;
+  avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames;
+  avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames;
+  avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames;
+  avg_frame_stat->pcnt_second_ref =
+      avg_frame_stat->pcnt_second_ref / num_frames;
+  avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames;
+  avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames;
+  avg_frame_stat->inactive_zone_rows =
+      avg_frame_stat->inactive_zone_rows / num_frames;
+  avg_frame_stat->inactive_zone_cols =
+      avg_frame_stat->inactive_zone_cols / num_frames;
+  avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames;
+  avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames;
+  avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames;
+  avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames;
+  avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames;
+  avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames;
+  avg_frame_stat->mv_in_out_count =
+      avg_frame_stat->mv_in_out_count / num_frames;
+  avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames;
+  avg_frame_stat->count = avg_frame_stat->count / num_frames;
+  avg_frame_stat->duration = avg_frame_stat->duration / num_frames;
+
+  return num_frames;
+}
+
+static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
+                                 double *zero_motion_accumulator,
+                                 double *sr_accumulator, int use_avg_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  FIRSTPASS_STATS frame_stat;
+  av1_zero(frame_stat);
+  int i = 0, num_stat_used = 0;
+  double boost_score = 0.0;
+  const double kf_max_boost =
+      cpi->oxcf.rc_cfg.mode == AOM_Q
+          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+                   KF_MAX_FRAME_BOOST)
+          : KF_MAX_FRAME_BOOST;
+
+  // Calculate the average using available number of stats.
+  if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
+
+  for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
+    if (!use_avg_stat &&
+        EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat))
+      break;
+
+    // Monitor for static sections.
+    // For the first frame in kf group, the second ref indicator is invalid.
+    if (i > 0) {
+      *zero_motion_accumulator =
+          AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat));
+    } else {
+      *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
+    }
+
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((*sr_accumulator < (kf_raw_err * 1.50)) &&
+        (i <= rc->max_gf_interval * 2)) {
+      double frame_boost;
+      double zm_factor;
+
+      // Factor 0.75-1.25 based on how much of frame is static.
+      zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
+
+      if (i < 2) *sr_accumulator = 0.0;
+      frame_boost =
+          calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat,
+                              sr_accumulator, kf_max_boost);
+      boost_score += frame_boost * zm_factor;
+    }
+  }
+  return boost_score;
+}
+
+/*!\brief Interval(in seconds) to clip key-frame distance to in LAP.
+ */
+#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5
+
+/*!\brief Determine the next key frame group
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame, and
+ * calculates the bit allocation of the KF group and the keyframe itself.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    this_frame       Pointer to first pass stats
+ *
+ * \return Nothing is returned.
+ */
 static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   const FIRSTPASS_STATS first_frame = *this_frame;
   FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info;
   av1_zero(next_frame);
 
   rc->frames_since_key = 0;
+  // Use arfs if possible.
+  p_rc->use_arf_in_this_kf_group = is_altref_enabled(
+      oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
 
   // Reset the GF group data structures.
   av1_zero(*gf_group);
-
-  // Clear the alt ref active flag and last group multi arf flags as they
-  // can never be set for a key frame.
-  rc->source_alt_ref_active = 0;
+  cpi->gf_frame_index = 0;
 
   // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
 
-  rc->frames_to_key = 1;
-
   if (has_no_stats_stage(cpi)) {
     int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
-    rc->this_key_frame_forced =
+    p_rc->this_key_frame_forced =
         current_frame->frame_number != 0 && rc->frames_to_key == 0;
     if (num_frames_to_app_forced_key != -1)
       rc->frames_to_key = num_frames_to_app_forced_key;
     else
-      rc->frames_to_key = AOMMAX(1, cpi->oxcf.key_freq);
+      rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
     correct_frames_to_key(cpi);
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
+    p_rc->kf_boost = DEFAULT_KF_BOOST;
     gf_group->update_type[0] = KF_UPDATE;
     return;
   }
   int i;
-  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in;
   int kf_bits = 0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0.0;
   double kf_raw_err = 0.0;
   double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
   double sr_accumulator = 0.0;
-  int frames_to_key;
+  double kf_group_avg_error = 0.0;
+  int frames_to_key, frames_to_key_clipped = INT_MAX;
+  int64_t kf_group_bits_clipped = INT64_MAX;
+
   // Is this a forced key frame by interval.
-  rc->this_key_frame_forced = rc->next_key_frame_forced;
+  p_rc->this_key_frame_forced = p_rc->next_key_frame_forced;
 
   twopass->kf_group_bits = 0;        // Total bits available to kf group
   twopass->kf_group_error_left = 0;  // Group modified error score.
@@ -2216,65 +3203,79 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   kf_raw_err = this_frame->intra_error;
   kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
 
-  frames_to_key =
-      define_kf_interval(cpi, this_frame, &kf_group_err, oxcf->key_freq);
+  // We assume the current frame is a key frame and we are looking for the next
+  // key frame. Therefore search_start_idx = 1
+  frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max,
+                                     /*search_start_idx=*/1);
 
-  if (frames_to_key != -1)
-    rc->frames_to_key = AOMMIN(oxcf->key_freq, frames_to_key);
-  else
-    rc->frames_to_key = oxcf->key_freq;
+  if (frames_to_key != -1) {
+    rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
+  } else {
+    rc->frames_to_key = kf_cfg->key_freq_max;
+  }
+
+  rc->frames_to_fwd_kf = kf_cfg->fwd_kf_dist;
 
-  if (cpi->lap_enabled) correct_frames_to_key(cpi);
+  if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural interval
   // is between 1x and 2x.
-  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+  if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) {
     FIRSTPASS_STATS tmp_frame = first_frame;
 
     rc->frames_to_key /= 2;
 
     // Reset to the start of the group.
-    reset_fpf_position(twopass, start_position);
-
-    kf_group_err = 0.0;
-
+    reset_fpf_position(&cpi->twopass_frame, start_position);
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err +=
-          calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame);
-      if (EOF == input_stats(twopass, &tmp_frame)) break;
+      if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break;
     }
-    rc->next_key_frame_forced = 1;
-  } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end &&
+    p_rc->next_key_frame_forced = 1;
+  } else if ((cpi->twopass_frame.stats_in ==
+                  twopass->stats_buf_ctx->stats_in_end &&
               is_stat_consumption_stage_twopass(cpi)) ||
-             rc->frames_to_key >= cpi->oxcf.key_freq) {
-    rc->next_key_frame_forced = 1;
+             rc->frames_to_key >= kf_cfg->key_freq_max) {
+    p_rc->next_key_frame_forced = 1;
   } else {
-    rc->next_key_frame_forced = 0;
+    p_rc->next_key_frame_forced = 0;
   }
 
-  // Special case for the last key frame of the file.
-  if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) {
-    // Accumulate kf group error.
-    kf_group_err +=
-        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+  double kf_group_err = 0;
+  for (i = 0; i < rc->frames_to_key; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&twopass->firstpass_info, i);
+    if (this_stats != NULL) {
+      // Accumulate kf group error.
+      kf_group_err += calculate_modified_err_new(
+          frame_info, &firstpass_info->total_stats, this_stats,
+          oxcf->rc_cfg.vbrbias, twopass->modified_error_min,
+          twopass->modified_error_max);
+      ++p_rc->num_stats_used_for_kf_boost;
+    }
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+  if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
+      (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
     // Maximum number of bits for a single normal frame (not key frame).
-    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+    const int max_bits = frame_max_bits(rc, oxcf);
 
     // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
 
+    if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
+      kf_group_avg_error =
+          get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame,
+                                 start_position, rc->frames_to_key);
+    }
+
     // Default allocation based on bits left and relative
     // complexity of the section.
-    twopass->kf_group_bits = (int64_t)(
-        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
-
+    twopass->kf_group_bits =
+        get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error);
     // Clip based on maximum per frame rate defined by the user.
     max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
     if (twopass->kf_group_bits > max_grp_bits)
@@ -2284,48 +3285,30 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
   twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
 
-  // Reset the first pass file position.
-  reset_fpf_position(twopass, start_position);
-
-  // Scan through the kf group collating various stats used to determine
-  // how many bits to spend on it.
-  boost_score = 0.0;
-  const double kf_max_boost =
-      cpi->oxcf.rc_mode == AOM_Q
-          ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
-                   KF_MAX_FRAME_BOOST)
-          : KF_MAX_FRAME_BOOST;
-  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
-    if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Monitor for static sections.
-    // For the first frame in kf group, the second ref indicator is invalid.
-    if (i > 0) {
-      zero_motion_accumulator =
-          AOMMIN(zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, &next_frame));
-    } else {
-      zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
-    }
-
-    // Not all frames in the group are necessarily used in calculating boost.
-    if ((sr_accumulator < (kf_raw_err * 1.50)) &&
-        (i <= rc->max_gf_interval * 2)) {
-      double frame_boost;
-      double zm_factor;
-
-      // Factor 0.75-1.25 based on how much of frame is static.
-      zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
-
-      if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(rc, frame_info, &next_frame,
-                                        &sr_accumulator, kf_max_boost);
-      boost_score += frame_boost * zm_factor;
+  if (cpi->ppi->lap_enabled) {
+    // In the case of single pass based on LAP, frames to  key may have an
+    // inaccurate value, and hence should be clipped to an appropriate
+    // interval.
+    frames_to_key_clipped =
+        (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate);
+
+    // This variable calculates the bits allocated to kf_group with a clipped
+    // frames_to_key.
+    if (rc->frames_to_key > frames_to_key_clipped) {
+      kf_group_bits_clipped =
+          (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped /
+                    rc->frames_to_key);
     }
   }
 
-  reset_fpf_position(twopass, start_position);
+  // Reset the first pass file position.
+  reset_fpf_position(&cpi->twopass_frame, start_position);
 
+  // Scan through the kf group collating various stats used to determine
+  // how many bits to spend on it.
+  boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
+                                   &sr_accumulator, 0);
+  reset_fpf_position(&cpi->twopass_frame, start_position);
   // Store the zero motion percentage
   twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
@@ -2333,30 +3316,43 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->section_intra_rating = calculate_section_intra_ratio(
       start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
 
-  rc->kf_boost = (int)boost_score;
+  p_rc->kf_boost = (int)boost_score;
 
-  if (cpi->lap_enabled) {
-    rc->kf_boost = get_projected_kf_boost(cpi);
+  if (cpi->ppi->lap_enabled) {
+    if (oxcf->rc_cfg.mode == AOM_Q) {
+      p_rc->kf_boost = get_projected_kf_boost(cpi);
+    } else {
+      // TODO(any): Explore using average frame stats for AOM_Q as well.
+      boost_score = get_kf_boost_score(
+          cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
+      reset_fpf_position(&cpi->twopass_frame, start_position);
+      p_rc->kf_boost += (int)boost_score;
+    }
   }
 
   // Special case for static / slide show content but don't apply
   // if the kf group is very short.
   if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
       (rc->frames_to_key > 8)) {
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST);
   } else {
     // Apply various clamps for min and max boost
-    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3));
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST);
 #ifdef STRICT_RC
-    rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST);
+    p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST);
 #endif
   }
 
   // Work out how many bits to allocate for the key frame itself.
-  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
-                                 twopass->kf_group_bits);
-  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  // In case of LAP enabled for VBR, if the frames_to_key value is
+  // very high, we calculate the bits based on a clipped value of
+  // frames_to_key.
+  kf_bits = calculate_boost_bits(
+      AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost,
+      AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n",
+  // p_rc->kf_boost,
   //        kf_bits, twopass->kf_zeromotion_pct);
   kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
                                                twopass->kf_group_bits, 0);
@@ -2368,7 +3364,13 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   gf_group->update_type[0] = KF_UPDATE;
 
   // Note the total error score of the kf group minus the key frame itself.
-  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  if (cpi->ppi->lap_enabled)
+    // As we don't have enough stats to know the actual error of the group,
+    // we assume the complexity of each frame to be equal to 1, and set the
+    // error as the number of frames in the group(minus the keyframe).
+    twopass->kf_group_error_left = (double)(rc->frames_to_key - 1);
+  else
+    twopass->kf_group_error_left = kf_group_err - kf_mod_err;
 
   // Adjust the count of total modified error left.
   // The count of bits left is adjusted elsewhere based on real coded frame
@@ -2376,56 +3378,70 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->modified_error_left -= kf_group_err;
 }
 
-static int is_skippable_frame(const AV1_COMP *cpi) {
-  if (has_no_stats_stage(cpi)) return 0;
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const TWO_PASS *const twopass = &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start &&
-          twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
 #define ARF_STATS_OUTPUT 0
 #if ARF_STATS_OUTPUT
 unsigned int arf_count = 0;
 #endif
-#define DEFAULT_GRP_WEIGHT 1.0
+
+static int get_section_target_bandwidth(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  int section_target_bandwidth;
+  const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                current_frame->frame_number);
+  if (cpi->ppi->lap_enabled)
+    section_target_bandwidth = (int)rc->avg_frame_bandwidth;
+  else
+    section_target_bandwidth = (int)(twopass->bits_left / frames_left);
+  return section_target_bandwidth;
+}
+
+static INLINE void set_twopass_params_based_on_fp_stats(
+    AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) {
+  if (this_frame_ptr == NULL) return;
+
+  TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame;
+  // The multiplication by 256 reverses a scaling factor of (>> 8)
+  // applied when combining MB error values for the frame.
+  twopass_frame->mb_av_energy = log((this_frame_ptr->intra_error) + 1.0);
+
+  const FIRSTPASS_STATS *const total_stats =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats;
+  if (is_fp_wavelet_energy_invalid(total_stats) == 0) {
+    twopass_frame->frame_avg_haar_energy =
+        log((this_frame_ptr->frame_avg_wavelet_energy) + 1.0);
+  }
+
+  // Set the frame content type flag.
+  if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass_frame->fr_content_type = FC_NORMAL;
+}
 
 static void process_first_pass_stats(AV1_COMP *cpi,
                                      FIRSTPASS_STATS *this_frame) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-
-  if (cpi->oxcf.rc_mode != AOM_Q && current_frame->frame_number == 0 &&
-      cpi->twopass.stats_buf_ctx->total_stats &&
-      cpi->twopass.stats_buf_ctx->total_left_stats) {
-    if (cpi->lap_enabled) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+
+  if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
+      cpi->gf_frame_index == 0 && total_stats &&
+      cpi->ppi->twopass.stats_buf_ctx->total_left_stats) {
+    if (cpi->ppi->lap_enabled) {
       /*
        * Accumulate total_stats using available limited number of stats,
        * and assign it to total_left_stats.
        */
-      *cpi->twopass.stats_buf_ctx->total_left_stats =
-          *cpi->twopass.stats_buf_ctx->total_stats;
+      *cpi->ppi->twopass.stats_buf_ctx->total_left_stats = *total_stats;
     }
-    const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
-                                  current_frame->frame_number);
-
     // Special case code for first frame.
-    const int section_target_bandwidth =
-        (int)(twopass->bits_left / frames_left);
+    const int section_target_bandwidth = get_section_target_bandwidth(cpi);
     const double section_length =
         twopass->stats_buf_ctx->total_left_stats->count;
     const double section_error =
@@ -2438,52 +3454,30 @@ static void process_first_pass_stats(AV1_COMP *cpi,
         ((double)cm->mi_params.mb_rows * section_length);
     const int tmp_q = get_twopass_worst_quality(
         cpi, section_error, section_intra_skip + section_inactive_zone,
-        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+        section_target_bandwidth);
 
     rc->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
-    rc->last_q[INTER_FRAME] = tmp_q;
-    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
-    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
-    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+    p_rc->last_q[INTER_FRAME] = tmp_q;
+    p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+    p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+    p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
   }
 
-  int err = 0;
-  if (cpi->lap_enabled) {
-    err = input_stats_lap(twopass, this_frame);
-  } else {
-    err = input_stats(twopass, this_frame);
-  }
-  if (err == EOF) return;
-
-  {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cm->mi_params.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0);
-    twopass->frame_avg_haar_energy =
-        log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
+  if (cpi->twopass_frame.stats_in <
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+    *this_frame = *cpi->twopass_frame.stats_in;
+    ++cpi->twopass_frame.stats_in;
   }
-
-  // Update the total stats remaining structure.
-  if (twopass->stats_buf_ctx->total_left_stats)
-    subtract_stats(twopass->stats_buf_ctx->total_left_stats, this_frame);
-
-  // Set the frame content type flag.
-  if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
-    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
-  else
-    twopass->fr_content_type = FC_NORMAL;
+  set_twopass_params_based_on_fp_stats(cpi, this_frame);
 }
 
 static void setup_target_rate(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
 
-  int target_rate = gf_group->bit_allocation[gf_group->index];
+  int target_rate = gf_group->bit_allocation[cpi->gf_frame_index];
 
   if (has_no_stats_stage(cpi)) {
     av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
@@ -2493,153 +3487,371 @@ static void setup_target_rate(AV1_COMP *cpi) {
   rc->base_frame_target = target_rate;
 }
 
+static void mark_flashes(FIRSTPASS_STATS *first_stats,
+                         FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
+  while (this_stats < last_stats - 1) {
+    next_stats = this_stats + 1;
+    if (next_stats->pcnt_second_ref > next_stats->pcnt_inter &&
+        next_stats->pcnt_second_ref >= 0.5) {
+      this_stats->is_flash = 1;
+    } else {
+      this_stats->is_flash = 0;
+    }
+    this_stats = next_stats;
+  }
+  // We always treat the last one as none flash.
+  if (last_stats - 1 >= first_stats) {
+    (last_stats - 1)->is_flash = 0;
+  }
+}
+
+// Estimate the noise variance of each frame from the first pass stats
+static void estimate_noise(FIRSTPASS_STATS *first_stats,
+                           FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats, *next_stats;
+  double C1, C2, C3, noise;
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    this_stats->noise_var = 0.0;
+    // flashes tend to have high correlation of innovations, so ignore them.
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash)
+      continue;
+
+    C1 = (this_stats - 1)->intra_error *
+         (this_stats->intra_error - this_stats->coded_error);
+    C2 = (this_stats - 2)->intra_error *
+         ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error);
+    C3 = (this_stats - 2)->intra_error *
+         (this_stats->intra_error - this_stats->sr_coded_error);
+    if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
+    C1 = sqrt(C1);
+    C2 = sqrt(C2);
+    C3 = sqrt(C3);
+
+    noise = (this_stats - 1)->intra_error - C1 * C2 / C3;
+    noise = AOMMAX(noise, 0.01);
+    this_stats->noise_var = noise;
+  }
+
+  // Copy noise from the neighbor if the noise value is not trustworthy
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash)
+      continue;
+    if (this_stats->noise_var < 1.0) {
+      int found = 0;
+      // TODO(bohanli): consider expanding to two directions at the same time
+      for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+          continue;
+        found = 1;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+      if (found) continue;
+      for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+           next_stats--) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+          continue;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+    }
+  }
+
+  // copy the noise if this is a flash
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash) {
+      int found = 0;
+      for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash)
+          continue;
+        found = 1;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+      if (found) continue;
+      for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+           next_stats--) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash)
+          continue;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+    }
+  }
+
+  // if we are at the first 2 frames, copy the noise
+  for (this_stats = first_stats;
+       this_stats < first_stats + 2 && (first_stats + 2) < last_stats;
+       this_stats++) {
+    this_stats->noise_var = (first_stats + 2)->noise_var;
+  }
+}
+
+// Estimate correlation coefficient of each frame with its previous frame.
+static void estimate_coeff(FIRSTPASS_STATS *first_stats,
+                           FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats;
+  for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
+    const double C =
+        sqrt(AOMMAX((this_stats - 1)->intra_error *
+                        (this_stats->intra_error - this_stats->coded_error),
+                    0.001));
+    const double cor_coeff =
+        C /
+        AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001);
+
+    this_stats->cor_coeff =
+        cor_coeff *
+        sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var,
+                    0.001) /
+             AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001));
+    // clip correlation coefficient.
+    this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1);
+  }
+  first_stats->cor_coeff = 1.0;
+}
+
 void av1_get_second_pass_params(AV1_COMP *cpi,
                                 EncodeFrameParams *const frame_params,
-                                const EncodeFrameInput *const frame_input,
                                 unsigned int frame_flags) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &cpi->gf_group;
-  AV1_COMMON *cm = &cpi->common;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
-  if (frame_is_intra_only(cm)) {
-    FeatureFlags *const features = &cm->features;
-    av1_set_screen_content_options(cpi, features);
-    cpi->is_screen_content_type = features->allow_screen_content_tools;
-  }
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  int update_total_stats = 0;
+
+  if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
 
-  if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
+  assert(cpi->twopass_frame.stats_in != NULL);
+  const int update_type = gf_group->update_type[cpi->gf_frame_index];
+  frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
 
-  if (rc->frames_till_gf_update_due > 0 && !(frame_flags & FRAMEFLAGS_KEY)) {
-    assert(gf_group->index < gf_group->size);
-    const int update_type = gf_group->update_type[gf_group->index];
+  if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
+    assert(cpi->gf_frame_index < gf_group->size);
 
     setup_target_rate(cpi);
 
     // If this is an arf frame then we dont want to read the stats file or
     // advance the input pointer as we already have what we need.
     if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
-      if (cpi->no_show_kf) {
-        assert(update_type == ARF_UPDATE);
-        frame_params->frame_type = KEY_FRAME;
-      } else {
-        frame_params->frame_type = INTER_FRAME;
-      }
-
-      // Do the firstpass stats indicate that this frame is skippable for the
-      // partition search?
-      if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
-        cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-      }
-
+      const FIRSTPASS_STATS *const this_frame_ptr =
+          read_frame_stats(twopass, &cpi->twopass_frame,
+                           gf_group->arf_src_offset[cpi->gf_frame_index]);
+      set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
       return;
     }
   }
 
-  aom_clear_system_state();
-
-  if (cpi->oxcf.rc_mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.cq_level;
+  if (oxcf->rc_cfg.mode == AOM_Q)
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   FIRSTPASS_STATS this_frame;
   av1_zero(this_frame);
   // call above fn
   if (is_stat_consumption_stage(cpi)) {
-    process_first_pass_stats(cpi, &this_frame);
+    if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
+      process_first_pass_stats(cpi, &this_frame);
+      update_total_stats = 1;
+    }
   } else {
-    rc->active_worst_quality = cpi->oxcf.cq_level;
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+  }
+
+  if (cpi->gf_frame_index == gf_group->size) {
+    if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+      const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+      const int frames_to_key = define_kf_interval(
+          cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+          /*search_start_idx=*/0);
+      if (frames_to_key != -1)
+        rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+    }
   }
 
   // Keyframe and section processing.
-  if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
-    FIRSTPASS_STATS this_frame_copy;
-    this_frame_copy = this_frame;
-    frame_params->frame_type = KEY_FRAME;
+  FIRSTPASS_STATS this_frame_copy;
+  this_frame_copy = this_frame;
+  if (rc->frames_to_key <= 0) {
+    assert(rc->frames_to_key == 0);
     // Define next KF group and assign bits to it.
+    frame_params->frame_type = KEY_FRAME;
     find_next_key_frame(cpi, &this_frame);
     this_frame = this_frame_copy;
-  } else {
-    frame_params->frame_type = INTER_FRAME;
-    const int altref_enabled = is_altref_enabled(cpi);
-    const int sframe_dist = cpi->oxcf.sframe_dist;
-    const int sframe_mode = cpi->oxcf.sframe_mode;
-    const int sframe_enabled = cpi->oxcf.sframe_enabled;
-    const int update_type = gf_group->update_type[gf_group->index];
-    CurrentFrame *const current_frame = &cpi->common.current_frame;
-    if (sframe_enabled) {
-      if (altref_enabled) {
-        if (sframe_mode == 1) {
-          // sframe_mode == 1: insert sframe if it matches altref frame.
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0 && update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-          }
-        } else {
-          // sframe_mode != 1: if sframe will be inserted at the next available
-          // altref frame
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0) {
-            rc->sframe_due = 1;
-          }
-          if (rc->sframe_due && update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-            rc->sframe_due = 0;
-          }
-        }
+  }
+
+  if (rc->frames_to_fwd_kf <= 0)
+    rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (cpi->gf_frame_index == gf_group->size) {
+    av1_tf_info_reset(&cpi->ppi->tf_info);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+    vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif  // CONFIG_BITRATE_ACCURACY
+    int max_gop_length =
+        (oxcf->gf_cfg.lag_in_frames >= 32)
+            ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+                                          oxcf->algo_cfg.arnr_max_frames / 2)
+            : MAX_GF_LENGTH_LAP;
+
+    // Use the provided gop size in low delay setting
+    if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
+    // Identify regions if needed.
+    // TODO(bohanli): identify regions for all stats available.
+    if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
+        (p_rc->frames_till_regions_update - rc->frames_since_key <
+             rc->frames_to_key &&
+         p_rc->frames_till_regions_update - rc->frames_since_key <
+             max_gop_length + 1)) {
+      // how many frames we can analyze from this frame
+      int rest_frames =
+          AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+      rest_frames =
+          AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end -
+                                    cpi->twopass_frame.stats_in +
+                                    (rc->frames_since_key == 0)));
+      p_rc->frames_till_regions_update = rest_frames;
+
+      if (cpi->ppi->lap_enabled) {
+        mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+                     twopass->stats_buf_ctx->stats_in_end);
+        estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+                       twopass->stats_buf_ctx->stats_in_end);
+        estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+                       twopass->stats_buf_ctx->stats_in_end);
+        av1_identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+                             (rc->frames_since_key == 0), p_rc->regions,
+                             &p_rc->num_regions);
       } else {
-        if (current_frame->frame_number % sframe_dist == 0 &&
-            current_frame->frame_number != 0) {
-          frame_params->frame_type = S_FRAME;
-        }
+        av1_identify_regions(
+            cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
+            rest_frames, 0, p_rc->regions, &p_rc->num_regions);
       }
     }
-  }
 
-  // Define a new GF/ARF group. (Should always enter here for key frames).
-  if (rc->frames_till_gf_update_due == 0) {
-    assert(cpi->common.current_frame.frame_number == 0 ||
-           gf_group->index == gf_group->size);
-    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-    int num_frames_to_detect_scenecut, frames_to_key;
-    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection)
-      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
-    else
-      num_frames_to_detect_scenecut = 0;
-    frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
-                                       num_frames_to_detect_scenecut);
-    reset_fpf_position(twopass, start_position);
-    if (frames_to_key != -1)
-      rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
-
-    int max_gop_length = (cpi->oxcf.lag_in_frames >= 32 &&
-                          is_stat_consumption_stage_twopass(cpi))
-                             ? MAX_GF_INTERVAL
-                             : MAX_GF_LENGTH_LAP;
-    if (rc->intervals_till_gf_calculate_due == 0) {
-      calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
-    }
-
-    if (max_gop_length > 16) {
-      if (rc->gf_intervals[rc->cur_gf_index] - 1 > 16) {
-        // The calculate_gf_length function is previously used with
-        // max_gop_length = 32 with look-ahead gf intervals.
-        define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
-        if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
-          // Tpl decides that a shorter gf interval is better.
-          // TODO(jingning): Remove redundant computations here.
-          max_gop_length = 16;
-          calculate_gf_length(cpi, max_gop_length, 1);
+    int cur_region_idx =
+        find_regions_index(p_rc->regions, p_rc->num_regions,
+                           rc->frames_since_key - p_rc->regions_offset);
+    if ((cur_region_idx >= 0 &&
+         p_rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+        rc->frames_since_key == 0) {
+      // If we start from a scenecut, then the last GOP's arf boost is not
+      // needed for this GOP.
+      cpi->ppi->gf_state.arf_gf_boost_lst = 0;
+    }
+
+    int need_gf_len = 1;
+    if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+      // set up bitstream to read
+      if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) {
+        cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output;
+      }
+      av1_open_second_pass_log(cpi, 1);
+      THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info;
+      // Read in GOP information from the second pass file.
+      av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info,
+                                    cpi->common.error);
+#if CONFIG_BITRATE_ACCURACY
+      TPL_INFO *tpl_info;
+      AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info,
+                          aom_malloc(sizeof(*tpl_info)));
+      av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream,
+                        cpi->common.error);
+      aom_free(tpl_info);
+#if CONFIG_THREE_PASS
+      // TODO(angiebird): Put this part into a func
+      cpi->vbr_rc_info.cur_gop_idx++;
+#endif  // CONFIG_THREE_PASS
+#endif  // CONFIG_BITRATE_ACCURACY
+      // Read in third_pass_info from the bitstream.
+      av1_set_gop_third_pass(cpi->third_pass_ctx);
+      // Read in per-frame info from second-pass encoding
+      av1_read_second_pass_per_frame_info(
+          cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info,
+          gop_info->num_frames, cpi->common.error);
+
+      p_rc->cur_gf_index = 0;
+      p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length;
+      need_gf_len = 0;
+    }
+
+    if (need_gf_len) {
+      // If we cannot obtain GF group length from second_pass_file
+      // TODO(jingning): Resolve the redundant calls here.
+      if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+        calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+      }
+
+      if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+          oxcf->gf_cfg.lag_in_frames >= 32 &&
+          cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+        int this_idx = rc->frames_since_key +
+                       p_rc->gf_intervals[p_rc->cur_gf_index] -
+                       p_rc->regions_offset - 1;
+        int this_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+        int next_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+        // TODO(angiebird): Figure out why this_region and next_region are -1 in
+        // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+        int is_last_scenecut =
+            p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+            (this_region != -1 &&
+             p_rc->regions[this_region].type == SCENECUT_REGION) ||
+            (next_region != -1 &&
+             p_rc->regions[next_region].type == SCENECUT_REGION);
+
+        int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+        if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+            rc->min_gf_interval <= 16) {
+          // The calculate_gf_length function is previously used with
+          // max_gop_length = 32 with look-ahead gf intervals.
+          define_gf_group(cpi, frame_params, 0);
+          av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+          this_frame = this_frame_copy;
+
+          if (is_shorter_gf_interval_better(cpi, frame_params)) {
+            // A shorter gf interval is better.
+            // TODO(jingning): Remove redundant computations here.
+            max_gop_length = 16;
+            calculate_gf_length(cpi, max_gop_length, 1);
+            if (is_last_scenecut &&
+                (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+              p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+            }
+          }
         }
-      } else {
-        // Even based on 32 we still decide to use a short gf interval.
-        // Better to re-decide based on 16 then
-        max_gop_length = 16;
-        calculate_gf_length(cpi, max_gop_length, 1);
       }
     }
-    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    cpi->num_gf_group_show_frames = 0;
-    assert(gf_group->index == 0);
 
+    define_gf_group(cpi, frame_params, 0);
+
+    if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
+        rc->frames_since_key > 0)
+      process_first_pass_stats(cpi, &this_frame);
+
+    define_gf_group(cpi, frame_params, 1);
+
+    // write gop info if needed for third pass. Per-frame info is written after
+    // each frame is encoded.
+    av1_write_second_pass_gop_info(cpi);
+
+    av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    assert(cpi->gf_frame_index == 0);
 #if ARF_STATS_OUTPUT
     {
       FILE *fpfile;
@@ -2647,33 +3859,48 @@ void av1_get_second_pass_params(AV1_COMP *cpi,
       ++arf_count;
       fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
               cpi->common.current_frame.frame_number,
-              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
-              rc->gfu_boost);
+              rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+              p_rc->gfu_boost);
 
       fclose(fpfile);
     }
 #endif
   }
-  assert(gf_group->index < gf_group->size);
+  assert(cpi->gf_frame_index < gf_group->size);
+
+  if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    reset_fpf_position(&cpi->twopass_frame, start_pos);
 
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    const FIRSTPASS_STATS *const this_frame_ptr =
+        read_frame_stats(twopass, &cpi->twopass_frame,
+                         gf_group->arf_src_offset[cpi->gf_frame_index]);
+    set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+  } else {
+    // Back up this frame's stats for updating total stats during post encode.
+    cpi->twopass_frame.this_frame = update_total_stats ? start_pos : NULL;
   }
 
+  frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
   setup_target_rate(cpi);
 }
 
 void av1_init_second_pass(AV1_COMP *cpi) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   double frame_rate;
   FIRSTPASS_STATS *stats;
 
   if (!twopass->stats_buf_ctx->stats_in_end) return;
 
+  mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+               twopass->stats_buf_ctx->stats_in_end);
+  estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+                 twopass->stats_buf_ctx->stats_in_end);
+  estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+                 twopass->stats_buf_ctx->stats_in_end);
+
   stats = twopass->stats_buf_ctx->total_stats;
 
   *stats = *twopass->stats_buf_ctx->stats_in_end;
@@ -2687,7 +3914,16 @@ void av1_init_second_pass(AV1_COMP *cpi) {
   // first pass.
   av1_new_framerate(cpi, frame_rate);
   twopass->bits_left =
-      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+      (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
+
+#if CONFIG_BITRATE_ACCURACY
+  av1_vbr_rc_init(&cpi->vbr_rc_info, cpi->ppi->twopass.bits_left,
+                  (int)round(stats->count));
+#endif
+
+#if CONFIG_RATECTRL_LOG
+  rc_log_init(&cpi->rc_log);
+#endif
 
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
@@ -2697,12 +3933,12 @@ void av1_init_second_pass(AV1_COMP *cpi) {
   {
     const double avg_error =
         stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
-    const FIRSTPASS_STATS *s = twopass->stats_in;
+    const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in;
     double modified_error_total = 0.0;
     twopass->modified_error_min =
-        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+        (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
     twopass->modified_error_max =
-        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+        (avg_error * oxcf->rc_cfg.vbrmax_section) / 100;
     while (s < twopass->stats_buf_ctx->stats_in_end) {
       modified_error_total +=
           calculate_modified_err(frame_info, twopass, oxcf, s);
@@ -2712,10 +3948,10 @@ void av1_init_second_pass(AV1_COMP *cpi) {
   }
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
 
-  cpi->rc.rate_error_estimate = 0;
+  cpi->ppi->p_rc.rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
@@ -2730,7 +3966,7 @@ void av1_init_second_pass(AV1_COMP *cpi) {
 }
 
 void av1_init_single_pass_lap(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
 
   if (!twopass->stats_buf_ctx->stats_in_end) return;
 
@@ -2743,10 +3979,10 @@ void av1_init_single_pass_lap(AV1_COMP *cpi) {
   twopass->modified_error_left = 0.0;
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
 
-  cpi->rc.rate_error_estimate = 0;
+  cpi->ppi->p_rc.rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
@@ -2764,42 +4000,98 @@ void av1_init_single_pass_lap(AV1_COMP *cpi) {
 #define MINQ_ADJ_LIMIT_CQ 20
 #define HIGH_UNDERSHOOT_RATIO 2
 void av1_twopass_postencode_update(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
-  const int bits_used = rc->base_frame_target;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+
+  // Increment the stats_in pointer.
+  if (is_stat_consumption_stage(cpi) &&
+      (cpi->gf_frame_index < cpi->ppi->gf_group.size ||
+       rc->frames_to_key == 0)) {
+    const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+    if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
+      FIRSTPASS_STATS this_frame;
+      --cpi->twopass_frame.stats_in;
+      if (cpi->ppi->lap_enabled) {
+        input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
+      } else {
+        input_stats(twopass, &cpi->twopass_frame, &this_frame);
+      }
+    } else if (cpi->ppi->lap_enabled) {
+      cpi->twopass_frame.stats_in =
+          cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+    }
+  }
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
   // sign of this value, a limited % adjustment is made to the target rate
   // of subsequent frames, to try and push it back towards 0. This method
   // is designed to prevent extreme behaviour at the end of a clip
   // or group of frames.
-  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
-  twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+  p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+  twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0);
+
+  if (cpi->do_update_vbr_bits_off_target_fast) {
+    // Subtract current frame's fast_extra_bits.
+    p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+    rc->frame_level_fast_extra_bits = 0;
+  }
 
   // Target vs actual bits for this arf group.
-  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_target_bits += rc->base_frame_target;
   twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
 
   // Calculate the pct rc error.
-  if (rc->total_actual_bits) {
-    rc->rate_error_estimate =
-        (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
-    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  if (p_rc->total_actual_bits) {
+    p_rc->rate_error_estimate =
+        (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits);
+    p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100);
   } else {
-    rc->rate_error_estimate = 0;
+    p_rc->rate_error_estimate = 0;
   }
 
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /* The variables temp_vbr_bits_off_target, temp_bits_left,
+   * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits
+   * temp_rate_error_estimate are introduced for quality simulation purpose,
+   * it retains the value previous to the parallel encode frames. The
+   * variables are updated based on the update flag.
+   *
+   * If there exist show_existing_frames between parallel frames, then to
+   * retain the temp state do not update it. */
+  const int simulate_parallel_frame =
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int show_existing_between_parallel_frames =
+      (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+           INTNL_OVERLAY_UPDATE &&
+       cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+  if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+      simulate_parallel_frame) {
+    cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target;
+    cpi->ppi->p_rc.temp_bits_left = twopass->bits_left;
+    cpi->ppi->p_rc.temp_rolling_arf_group_target_bits =
+        twopass->rolling_arf_group_target_bits;
+    cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits =
+        twopass->rolling_arf_group_actual_bits;
+    cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate;
+  }
+#endif
   // Update the active best quality pyramid.
   if (!rc->is_src_frame_alt_ref) {
-    const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index];
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
     int i;
     for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
-      rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
-      // if (pyramid_level >= 2) {
-      //   rc->active_best_quality[pyramid_level] =
-      //     AOMMAX(rc->active_best_quality[pyramid_level],
-      //            cpi->common.base_qindex);
-      // }
+      p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+#if CONFIG_TUNE_VMAF
+      if (cpi->vmaf_info.original_qindex != -1 &&
+          (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+           cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+        p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+      }
+#endif
     }
   }
 
@@ -2813,55 +4105,54 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
             " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
             cm->current_frame.frame_number, rc->base_frame_target,
             rc->projected_frame_size, rc->total_actual_bits,
-            rc->vbr_bits_off_target, rc->rate_error_estimate,
+            rc->vbr_bits_off_target, p_rc->rate_error_estimate,
             twopass->rolling_arf_group_target_bits,
             twopass->rolling_arf_group_actual_bits,
             (double)twopass->rolling_arf_group_actual_bits /
                 (double)twopass->rolling_arf_group_target_bits,
             twopass->bpm_factor,
-            av1_convert_qindex_to_q(quant_params->base_qindex,
-                                    cm->seq_params.bit_depth),
+            av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
+                                    cm->seq_params->bit_depth),
             av1_convert_qindex_to_q(rc->active_worst_quality,
-                                    cm->seq_params.bit_depth));
+                                    cm->seq_params->bit_depth));
     fclose(fpfile);
   }
 #endif
 
   if (cpi->common.current_frame.frame_type != KEY_FRAME) {
-    twopass->kf_group_bits -= bits_used;
+    twopass->kf_group_bits -= rc->base_frame_target;
     twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
   }
   twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
 
   // If the rate control is drifting consider adjustment to min or maxq.
-  if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
-    const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
-    const int minq_adj_limit =
-        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
-
+  if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+    int maxq_adj_limit;
+    int minq_adj_limit;
+    maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+    minq_adj_limit =
+        (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
     // Undershoot.
-    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+    if (p_rc->rate_error_estimate > rc_cfg->under_shoot_pct) {
       --twopass->extend_maxq;
-      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits >= p_rc->rolling_actual_bits)
         ++twopass->extend_minq;
       // Overshoot.
-    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+    } else if (p_rc->rate_error_estimate < -rc_cfg->over_shoot_pct) {
       --twopass->extend_minq;
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits < p_rc->rolling_actual_bits)
         ++twopass->extend_maxq;
     } else {
       // Adjustment for extreme local overshoot.
       if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
           rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
         ++twopass->extend_maxq;
-
       // Unwind undershoot or overshoot adjustment.
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits < p_rc->rolling_actual_bits)
         --twopass->extend_minq;
-      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+      else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
         --twopass->extend_maxq;
     }
-
     twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
     twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
 
@@ -2872,24 +4163,214 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
     if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
       int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
       if (rc->projected_frame_size < fast_extra_thresh) {
-        rc->vbr_bits_off_target_fast +=
+        p_rc->vbr_bits_off_target_fast +=
             fast_extra_thresh - rc->projected_frame_size;
-        rc->vbr_bits_off_target_fast =
-            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+        p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+                                                (4 * rc->avg_frame_bandwidth));
 
         // Fast adaptation of minQ if necessary to use up the extra bits.
         if (rc->avg_frame_bandwidth) {
-          twopass->extend_minq_fast =
-              (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+          twopass->extend_minq_fast = (int)(p_rc->vbr_bits_off_target_fast * 8 /
+                                            rc->avg_frame_bandwidth);
         }
         twopass->extend_minq_fast = AOMMIN(
             twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
-      } else if (rc->vbr_bits_off_target_fast) {
+      } else if (p_rc->vbr_bits_off_target_fast) {
         twopass->extend_minq_fast = AOMMIN(
             twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
       } else {
         twopass->extend_minq_fast = 0;
       }
     }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+        simulate_parallel_frame) {
+      cpi->ppi->p_rc.temp_vbr_bits_off_target_fast =
+          p_rc->vbr_bits_off_target_fast;
+      cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq;
+      cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq;
+      cpi->ppi->p_rc.temp_extend_minq_fast = twopass->extend_minq_fast;
+    }
+#endif
+  }
+
+  // Update the frame probabilities obtained from parallel encode frames
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /* The variable temp_active_best_quality is introduced only for quality
+   * simulation purpose, it retains the value previous to the parallel
+   * encode frames. The variable is updated based on the update flag.
+   *
+   * If there exist show_existing_frames between parallel frames, then to
+   * retain the temp state do not update it. */
+  if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+      simulate_parallel_frame) {
+    int i;
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+    if (!rc->is_src_frame_alt_ref) {
+      for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i)
+        cpi->ppi->p_rc.temp_active_best_quality[i] =
+            p_rc->active_best_quality[i];
+    }
+  }
+
+  // Update the frame probabilities obtained from parallel encode frames
+  FrameProbInfo *const temp_frame_probs_simulation =
+      simulate_parallel_frame ? &cpi->ppi->temp_frame_probs_simulation
+                              : frame_probs;
+  FrameProbInfo *const temp_frame_probs =
+      simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL;
+#endif
+  int i, j, loop;
+  // Sequentially do average on temp_frame_probs_simulation which holds
+  // probabilities of last frame before parallel encode
+  for (loop = 0; loop <= cpi->num_frame_recode; loop++) {
+    // Sequentially update tx_type_probs
+    if (cpi->do_update_frame_probs_txtype[loop] &&
+        (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+      for (i = 0; i < TX_SIZES_ALL; i++) {
+        int left = 1024;
+
+        for (j = TX_TYPES - 1; j >= 0; j--) {
+          const int new_prob =
+              cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j];
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+          int prob =
+              (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+               new_prob) >>
+              1;
+          left -= prob;
+          if (j == 0) prob += left;
+          temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob;
+#else
+          int prob =
+              (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->tx_type_probs[update_type][i][j] = prob;
+#endif
+        }
+      }
+    }
+
+    // Sequentially update obmc_probs
+    if (cpi->do_update_frame_probs_obmc[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+      for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+        const int new_prob =
+            cpi->frame_new_probs[loop].obmc_probs[update_type][i];
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+        temp_frame_probs_simulation->obmc_probs[update_type][i] =
+            (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+             new_prob) >>
+            1;
+#else
+        frame_probs->obmc_probs[update_type][i] =
+            (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#endif
+      }
+    }
+
+    // Sequentially update warped_probs
+    if (cpi->do_update_frame_probs_warp[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+      const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type];
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      temp_frame_probs_simulation->warped_probs[update_type] =
+          (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >>
+          1;
+#else
+      frame_probs->warped_probs[update_type] =
+          (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#endif
+    }
+
+    // Sequentially update switchable_interp_probs
+    if (cpi->do_update_frame_probs_interpfilter[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+        int left = 1536;
+
+        for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+          const int new_prob = cpi->frame_new_probs[loop]
+                                   .switchable_interp_probs[update_type][i][j];
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+          int prob = (temp_frame_probs_simulation
+                          ->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+
+          temp_frame_probs_simulation
+              ->switchable_interp_probs[update_type][i][j] = prob;
+#else
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#endif
+        }
+      }
+    }
+  }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  // Copying temp_frame_probs_simulation to temp_frame_probs based on
+  // the flag
+  if (cpi->do_frame_data_update &&
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      simulate_parallel_frame) {
+    for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+         update_type_idx++) {
+      for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+        temp_frame_probs->obmc_probs[update_type_idx][i] =
+            temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+      }
+      temp_frame_probs->warped_probs[update_type_idx] =
+          temp_frame_probs_simulation->warped_probs[update_type_idx];
+      for (i = 0; i < TX_SIZES_ALL; i++) {
+        for (j = 0; j < TX_TYPES; j++) {
+          temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+              temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j];
+        }
+      }
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+        for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+          temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+              temp_frame_probs_simulation
+                  ->switchable_interp_probs[update_type_idx][i][j];
+        }
+      }
+    }
   }
+#endif
+  // Update framerate obtained from parallel encode frames
+  if (cpi->common.show_frame &&
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+    cpi->framerate = cpi->new_framerate;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  // SIMULATION PURPOSE
+  int show_existing_between_parallel_frames_cndn =
+      (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+           INTNL_OVERLAY_UPDATE &&
+       cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+  if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn &&
+      cpi->do_frame_data_update && simulate_parallel_frame)
+    cpi->temp_framerate = cpi->framerate;
+#endif
 }
diff --git a/media/libaom/src/av1/encoder/pass2_strategy.h b/media/libaom/src/av1/encoder/pass2_strategy.h
index 437fb8f79d..6234623a57 100644
--- a/media/libaom/src/av1/encoder/pass2_strategy.h
+++ b/media/libaom/src/av1/encoder/pass2_strategy.h
@@ -18,8 +18,15 @@ extern "C" {
 
 struct AV1_COMP;
 struct EncodeFrameParams;
-// structure of accumulated stats and features in a gf group
+
+#include "av1/encoder/encoder.h"
+
+/*!\endcond */
+/*!
+ * \brief accumulated stats and features in a gf group
+ */
 typedef struct {
+  /*!\cond */
   double gf_group_err;
   double gf_group_raw_error;
   double gf_group_skip_pct;
@@ -35,40 +42,113 @@ typedef struct {
   double abs_mv_in_out_accumulator;
 
   double avg_sr_coded_error;
-  double avg_tr_coded_error;
   double avg_pcnt_second_ref;
-  double avg_pcnt_third_ref;
-  double avg_pcnt_third_ref_nolast;
   double avg_new_mv_count;
   double avg_wavelet_energy;
   double avg_raw_err_stdev;
   int non_zero_stdev_count;
-
-  unsigned int allow_alt_ref;
+  /*!\endcond */
 } GF_GROUP_STATS;
 
+/*!
+ * \brief accumulated stats and features for a frame
+ */
 typedef struct {
+  /*!\cond */
   double frame_err;
   double frame_coded_error;
   double frame_sr_coded_error;
-  double frame_tr_coded_error;
+  /*!\endcond */
 } GF_FRAME_STATS;
+/*!cond */
 
 void av1_init_second_pass(struct AV1_COMP *cpi);
 
 void av1_init_single_pass_lap(AV1_COMP *cpi);
 
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in]    cpi           Top - level encoder instance structure
+ * \param[in]    frame_params  Per frame encoding parameters
+ * \param[in]    frame_flags   Frame type and coding flags
+ *
+ * \return No return but analyses first pass stats and assigns a target
+ *         number of bits to the current frame and a target Q range.
+ */
 void av1_get_second_pass_params(struct AV1_COMP *cpi,
                                 struct EncodeFrameParams *const frame_params,
-                                const EncodeFrameInput *const frame_input,
                                 unsigned int frame_flags);
 
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in]    cpi       Top - level encoder instance structure
+ *
+ * \return No return value but this function updates various rate control
+ *         related data structures that for example track overshoot and
+ *         undershoot.
+ */
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top - level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled or is this a GF only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \return No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
 void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
                             GF_GROUP *gf_group, int is_key_frame, int use_arf,
                             int64_t gf_group_bits);
 
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+                       const TWO_PASS_FRAME *twopass_frame,
+                       const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+                       int offset, int f_frames, int b_frames,
+                       int *num_fpstats_used, int *num_fpstats_required,
+                       int project_gfu_boost);
+
+void av1_accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+                                     const int flash_detected,
+                                     const int frames_since_key,
+                                     const int cur_idx,
+                                     GF_GROUP_STATS *gf_stats, int f_w,
+                                     int f_h);
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+void av1_identify_regions(const FIRSTPASS_STATS *const stats_start,
+                          int total_frames, int offset, REGIONS *regions,
+                          int *total_regions);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/pickcdef.c b/media/libaom/src/av1/encoder/pickcdef.c
index a1092fd595..557c9eebc0 100644
--- a/media/libaom/src/av1/encoder/pickcdef.c
+++ b/media/libaom/src/av1/encoder/pickcdef.c
@@ -10,44 +10,64 @@
  */
 
 #include <math.h>
+#include <stdbool.h>
 #include <string.h>
 
+#include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
-#include "av1/common/cdef.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/pickcdef.h"
+
+// Get primary and secondary filter strength for the given strength index and
+// search method
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+                                             int *pri_strength,
+                                             int *sec_strength,
+                                             int strength_idx) {
+  const int tot_sec_filter =
+      (pick_method == CDEF_FAST_SEARCH_LVL5)
+          ? REDUCED_SEC_STRENGTHS_LVL5
+          : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+                                                    : CDEF_SEC_STRENGTHS);
+  const int pri_idx = strength_idx / tot_sec_filter;
+  const int sec_idx = strength_idx % tot_sec_filter;
+  *pri_strength = pri_idx;
+  *sec_strength = sec_idx;
+  if (pick_method == CDEF_FULL_SEARCH) return;
 
-#define REDUCED_PRI_STRENGTHS_LVL1 8
-#define REDUCED_PRI_STRENGTHS_LVL2 5
-
-#define REDUCED_TOTAL_STRENGTHS_LVL1 \
-  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
-#define REDUCED_TOTAL_STRENGTHS_LVL2 \
-  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
-#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
-
-static const int priconv_lvl1[REDUCED_TOTAL_STRENGTHS_LVL1] = { 0, 1, 2,  3,
-                                                                5, 7, 10, 13 };
-static const int priconv_lvl2[REDUCED_TOTAL_STRENGTHS_LVL2] = { 0, 2, 4, 8,
-                                                                14 };
-static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
-  TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2,
-  TOTAL_STRENGTHS
-};
-
-// Get primary strength value for the given index and search method
-static INLINE int get_pri_strength(CDEF_PICK_METHOD pick_method, int pri_idx) {
   switch (pick_method) {
-    case CDEF_FAST_SEARCH_LVL1: return priconv_lvl1[pri_idx];
-    case CDEF_FAST_SEARCH_LVL2: return priconv_lvl2[pri_idx];
-    default: assert(0 && "Invalid CDEF primary index"); return -1;
+    case CDEF_FAST_SEARCH_LVL1: *pri_strength = priconv_lvl1[pri_idx]; break;
+    case CDEF_FAST_SEARCH_LVL2: *pri_strength = priconv_lvl2[pri_idx]; break;
+    case CDEF_FAST_SEARCH_LVL3:
+      *pri_strength = priconv_lvl2[pri_idx];
+      *sec_strength = secconv_lvl3[sec_idx];
+      break;
+    case CDEF_FAST_SEARCH_LVL4:
+      *pri_strength = priconv_lvl4[pri_idx];
+      *sec_strength = secconv_lvl3[sec_idx];
+      break;
+    case CDEF_FAST_SEARCH_LVL5:
+      *pri_strength = priconv_lvl5[pri_idx];
+      *sec_strength = secconv_lvl5[sec_idx];
+      break;
+    default: assert(0 && "Invalid CDEF search method");
   }
 }
 
+// Store CDEF filter strength calculated from strength index for given search
+// method
+#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \
+  do {                                                                       \
+    get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength,   \
+                              (strength_idx));                               \
+    cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength;        \
+  } while (0)
+
 /* Search for the best strength to add as an option, knowing we
    already selected nb_strengths options. */
 static uint64_t search_one(int *lev, int nb_strengths,
@@ -141,8 +161,8 @@ static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
                                       int sb_count,
                                       CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
-  int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
-              pick_method == CDEF_FAST_SEARCH_LVL2);
+  int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+              pick_method <= CDEF_FAST_SEARCH_LVL5);
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
@@ -190,14 +210,7 @@ static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
   return best_tot_mse;
 }
 
-typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
-                          int src_voffset, int src_hoffset, int sstride,
-                          int vsize, int hsize);
-typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
-                                        cdef_list *dlist, int cdef_count,
-                                        BLOCK_SIZE bsize, int coeff_shift,
-                                        int row, int col);
-
+#if CONFIG_AV1_HIGHBITDEPTH
 static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
                                 int src_voffset, int src_hoffset, int sstride,
                                 int vsize, int hsize) {
@@ -207,6 +220,7 @@ static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
   for (r = 0; r < vsize; r++)
     memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
 }
+#endif
 
 static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
                          int src_voffset, int src_hoffset, int sstride,
@@ -219,33 +233,6 @@ static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
       dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
 }
 
-static INLINE uint64_t mse_wxh_16bit_highbd(uint16_t *dst, int dstride,
-                                            uint16_t *src, int sstride, int w,
-                                            int h) {
-  uint64_t sum = 0;
-  int i, j;
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int e = dst[i * dstride + j] - src[i * sstride + j];
-      sum += e * e;
-    }
-  }
-  return sum;
-}
-
-static INLINE uint64_t mse_wxh_16bit(uint8_t *dst, int dstride, uint16_t *src,
-                                     int sstride, int w, int h) {
-  uint64_t sum = 0;
-  int i, j;
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
-      sum += e * e;
-    }
-  }
-  return sum;
-}
-
 static INLINE void init_src_params(int *src_stride, int *width, int *height,
                                    int *width_log2, int *height_log2,
                                    BLOCK_SIZE bsize) {
@@ -255,7 +242,7 @@ static INLINE void init_src_params(int *src_stride, int *width, int *height,
   *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
   *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
 }
-
+#if CONFIG_AV1_HIGHBITDEPTH
 /* Compute MSE only on the blocks we filtered. */
 static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
                                          cdef_list *dlist, int cdef_count,
@@ -273,13 +260,13 @@ static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
   for (bi = 0; bi < cdef_count; bi++) {
     by = dlist[bi].by;
     bx = dlist[bi].bx;
-    sum += mse_wxh_16bit_highbd(
+    sum += aom_mse_wxh_16bit_highbd(
         &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
         &src[bi << (height_log2 + width_log2)], src_stride, width, height);
   }
   return sum >> 2 * coeff_shift;
 }
-
+#endif
 static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
                                   cdef_list *dlist, int cdef_count,
                                   BLOCK_SIZE bsize, int coeff_shift, int row,
@@ -296,41 +283,250 @@ static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
   for (bi = 0; bi < cdef_count; bi++) {
     by = dlist[bi].by;
     bx = dlist[bi].bx;
-    sum += mse_wxh_16bit(
+    sum += aom_mse_wxh_16bit(
         &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
         &src[bi << (height_log2 + width_log2)], src_stride, width, height);
   }
   return sum >> 2 * coeff_shift;
 }
 
-static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row,
-                       int mi_col) {
-  const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
-  const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
-  const int stride = mi_params->mi_stride;
-  MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
-  for (int r = 0; r < maxr; ++r, mbmi += stride) {
-    for (int c = 0; c < maxc; ++c) {
-      if (!mbmi[c]->skip) return 0;
+// Calculates MSE at block level.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   fbr: Row index in units of 64x64 block
+//   fbc: Column index in units of 64x64 block
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
+                             int sb_count) {
+  const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
+  const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
+  const int coeff_shift = cdef_search_ctx->coeff_shift;
+  const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
+  const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
+
+  // Declare and initialize the temporary buffers.
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+  cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+  int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+  int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+  int hb_step = 1, vb_step = 1;
+  BLOCK_SIZE bs;
+
+  const MB_MODE_INFO *const mbmi =
+      mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+
+  uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer,
+                                        ref->v_buffer };
+  int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride,
+                                   ref->uv_stride };
+
+  if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 ||
+      mbmi->bsize == BLOCK_64X128) {
+    bs = mbmi->bsize;
+    if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+      nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      hb_step = 2;
+    }
+    if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+      nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+      vb_step = 2;
+    }
+  } else {
+    bs = BLOCK_64X64;
+  }
+  // Get number of 8x8 blocks which are not skip. Cdef processing happens for
+  // 8x8 blocks which are not skip.
+  const int cdef_count = av1_cdef_compute_sb_list(
+      mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+
+  const int yoff = CDEF_VBORDER * (fbr != 0);
+  const int xoff = CDEF_HBORDER * (fbc != 0);
+  int dirinit = 0;
+  for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
+    for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
+    /* We avoid filtering the pixels for which some of the pixels to
+    average are outside the frame. We could change the filter instead,
+    but it would add special cases for any future vectorization. */
+    const int ysize = (nvb << mi_high_l2[pli]) +
+                      CDEF_VBORDER * (fbr + vb_step < cdef_search_ctx->nvfb) +
+                      yoff;
+    const int xsize = (nhb << mi_wide_l2[pli]) +
+                      CDEF_HBORDER * (fbc + hb_step < cdef_search_ctx->nhfb) +
+                      xoff;
+    const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+    const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+    struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
+    cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                             pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
+                             ysize, xsize);
+    for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
+      int pri_strength, sec_strength;
+      get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
+                                &sec_strength, gi);
+      av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+                         cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+                         dir, &dirinit, var, pli, dlist, cdef_count,
+                         pri_strength, sec_strength + (sec_strength == 3),
+                         cdef_search_ctx->damping, coeff_shift);
+      const uint64_t curr_mse = cdef_search_ctx->compute_cdef_dist_fn(
+          ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
+          cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+      if (pli < 2)
+        cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
+      else
+        cdef_search_ctx->mse[1][sb_count][gi] += curr_mse;
+    }
+  }
+  cdef_search_ctx->sb_index[sb_count] =
+      MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
+}
+
+// MSE calculation at frame level.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
+  // Loop over each sb.
+  for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
+    for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
+      // Checks if cdef processing can be skipped for particular sb.
+      if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
+      // Calculate mse for each sb and store the relevant sb index.
+      av1_cdef_mse_calc_block(cdef_search_ctx, fbr, fbc,
+                              cdef_search_ctx->sb_count);
+      cdef_search_ctx->sb_count++;
     }
   }
-  return 1;
 }
 
-static void pick_cdef_from_qp(AV1_COMMON *const cm) {
-  const int bd = cm->seq_params.bit_depth;
+// Allocates memory for members of CdefSearchCtx.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters
+//   related to CDEF search context.
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE bool cdef_alloc_data(CdefSearchCtx *cdef_search_ctx) {
+  const int nvfb = cdef_search_ctx->nvfb;
+  const int nhfb = cdef_search_ctx->nhfb;
+  cdef_search_ctx->sb_index =
+      aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index));
+  cdef_search_ctx->sb_count = 0;
+  cdef_search_ctx->mse[0] =
+      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
+  cdef_search_ctx->mse[1] =
+      aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb);
+  if (!(cdef_search_ctx->sb_index && cdef_search_ctx->mse[0] &&
+        cdef_search_ctx->mse[1])) {
+    aom_free(cdef_search_ctx->sb_index);
+    aom_free(cdef_search_ctx->mse[0]);
+    aom_free(cdef_search_ctx->mse[1]);
+    return false;
+  }
+  return true;
+}
+
+// Deallocates the memory allocated for members of CdefSearchCtx.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters
+//   related to CDEF search context.
+// Returns:
+//   Nothing will be returned.
+static AOM_INLINE void cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+  aom_free(cdef_search_ctx->mse[0]);
+  aom_free(cdef_search_ctx->mse[1]);
+  aom_free(cdef_search_ctx->sb_index);
+}
+
+// Initialize the parameters related to CDEF search context.
+// Inputs:
+//   frame: Pointer to compressed frame buffer
+//   ref: Pointer to the frame buffer holding the source frame
+//   cm: Pointer to top level common structure
+//   xd: Pointer to common current coding block structure
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   pick_method: Search method used to select CDEF parameters
+// Returns:
+//   Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
+                                        const YV12_BUFFER_CONFIG *ref,
+                                        AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        CdefSearchCtx *cdef_search_ctx,
+                                        CDEF_PICK_METHOD pick_method) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  cdef_search_ctx->mi_params = &cm->mi_params;
+  cdef_search_ctx->ref = ref;
+  cdef_search_ctx->nvfb =
+      (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  cdef_search_ctx->nhfb =
+      (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
+  cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
+  cdef_search_ctx->num_planes = num_planes;
+  cdef_search_ctx->pick_method = pick_method;
+  cdef_search_ctx->sb_count = 0;
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+  // Initialize plane wise information.
+  for (int pli = 0; pli < num_planes; pli++) {
+    cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x;
+    cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y;
+    cdef_search_ctx->bsize[pli] =
+        cdef_search_ctx->ydec[pli]
+            ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+            : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+    cdef_search_ctx->mi_wide_l2[pli] =
+        MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    cdef_search_ctx->mi_high_l2[pli] =
+        MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+    cdef_search_ctx->plane[pli] = xd->plane[pli];
+  }
+  // Function pointer initialization.
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (cm->seq_params->use_highbitdepth) {
+    cdef_search_ctx->copy_fn = copy_sb16_16_highbd;
+    cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
+  } else {
+    cdef_search_ctx->copy_fn = copy_sb16_16;
+    cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+  }
+#else
+  cdef_search_ctx->copy_fn = copy_sb16_16;
+  cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+#endif
+}
+
+static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+                              int frames_since_key) {
+  const int bd = cm->seq_params->bit_depth;
   const int q =
       av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
   CdefInfo *const cdef_info = &cm->cdef_info;
-  cdef_info->cdef_bits = 0;
-  cdef_info->nb_cdef_strengths = 1;
+  // Check the speed feature to avoid extra signaling.
+  if (skip_cdef) {
+    cdef_info->cdef_bits = 1;
+    cdef_info->nb_cdef_strengths = 2;
+  } else {
+    cdef_info->cdef_bits = 0;
+    cdef_info->nb_cdef_strengths = 1;
+  }
   cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
 
   int predicted_y_f1 = 0;
   int predicted_y_f2 = 0;
   int predicted_uv_f1 = 0;
   int predicted_uv_f2 = 0;
-  aom_clear_system_state();
   if (!frame_is_intra_only(cm)) {
     predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
                                        q * 0.0068615186f + 0.02709886f),
@@ -363,160 +559,78 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm) {
   cdef_info->cdef_uv_strengths[0] =
       predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
 
+  if (skip_cdef) {
+    cdef_info->cdef_strengths[1] = 0;
+    cdef_info->cdef_uv_strengths[1] = 0;
+  }
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
   for (int r = 0; r < nvfb; ++r) {
     for (int c = 0; c < nhfb; ++c) {
-      mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0;
+      MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
+      current_mbmi->cdef_strength = 0;
+      if (skip_cdef && current_mbmi->skip_cdef_curr_sb &&
+          frames_since_key > 10) {
+        current_mbmi->cdef_strength = 1;
+      }
     }
     mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
   }
 }
 
-void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
-                     int rdmult) {
-  if (pick_method == CDEF_PICK_FROM_Q) {
-    pick_cdef_from_qp(cm);
+void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
+                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
+                     int skip_cdef_feature, int frames_since_key,
+                     CDEF_CONTROL cdef_control, int non_reference_frame) {
+  assert(cdef_control != CDEF_NONE);
+  if (cdef_control == CDEF_REFERENCE && non_reference_frame) {
+    CdefInfo *const cdef_info = &cm->cdef_info;
+    cdef_info->nb_cdef_strengths = 1;
+    cdef_info->cdef_bits = 0;
+    cdef_info->cdef_strengths[0] = 0;
+    cdef_info->cdef_uv_strengths[0] = 0;
     return;
   }
 
-  cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
-  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  if (pick_method == CDEF_PICK_FROM_Q) {
+    pick_cdef_from_qp(cm, skip_cdef_feature, frames_since_key);
+    return;
+  }
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   const int damping = 3 + (cm->quant_params.base_qindex >> 6);
-  const int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
-                    pick_method == CDEF_FAST_SEARCH_LVL2);
-  const int total_strengths = nb_cdef_strengths[pick_method];
-  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+                    pick_method <= CDEF_FAST_SEARCH_LVL5);
   const int num_planes = av1_num_planes(cm);
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
-  uint64_t(*mse[2])[TOTAL_STRENGTHS];
-  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-
-  int bsize[3];
-  int mi_wide_l2[3];
-  int mi_high_l2[3];
-  int xdec[3];
-  int ydec[3];
-  uint8_t *ref_buffer[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer };
-  int ref_stride[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride };
-
-  for (int pli = 0; pli < num_planes; pli++) {
-    xdec[pli] = xd->plane[pli].subsampling_x;
-    ydec[pli] = xd->plane[pli].subsampling_y;
-    bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
-                           : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
-    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
-    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+  CdefSearchCtx cdef_search_ctx;
+  // Initialize parameters related to CDEF search context.
+  cdef_params_init(frame, ref, cm, xd, &cdef_search_ctx, pick_method);
+  // Allocate CDEF search context buffers.
+  if (!cdef_alloc_data(&cdef_search_ctx)) {
+    CdefInfo *const cdef_info = &cm->cdef_info;
+    cdef_info->nb_cdef_strengths = 0;
+    cdef_info->cdef_bits = 0;
+    cdef_info->cdef_strengths[0] = 0;
+    cdef_info->cdef_uv_strengths[0] = 0;
+    return;
   }
-
-  copy_fn_t copy_fn;
-  compute_cdef_dist_t compute_cdef_dist_fn;
-
-  if (cm->seq_params.use_highbitdepth) {
-    copy_fn = copy_sb16_16_highbd;
-    compute_cdef_dist_fn = compute_cdef_dist_highbd;
+  // Frame level mse calculation.
+  if (mt_info->num_workers > 1) {
+    av1_cdef_mse_calc_frame_mt(cm, mt_info, &cdef_search_ctx);
   } else {
-    copy_fn = copy_sb16_16;
-    compute_cdef_dist_fn = compute_cdef_dist;
+    cdef_mse_calc_frame(&cdef_search_ctx);
   }
 
-  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
-  uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
-  const int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
-  int sb_count = 0;
-  for (int fbr = 0; fbr < nvfb; ++fbr) {
-    for (int fbc = 0; fbc < nhfb; ++fbc) {
-      // No filtering if the entire filter block is skipped
-      if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
-        continue;
-
-      const MB_MODE_INFO *const mbmi =
-          mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
-                                  MI_SIZE_64X64 * fbc];
-      if (((fbc & 1) &&
-           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
-          ((fbr & 1) &&
-           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
-        continue;
-
-      int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-      int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-      int hb_step = 1;
-      int vb_step = 1;
-      BLOCK_SIZE bs;
-      if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
-          mbmi->sb_type == BLOCK_64X128) {
-        bs = mbmi->sb_type;
-        if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
-          nhb =
-              AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-          hb_step = 2;
-        }
-        if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
-          nvb =
-              AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-          vb_step = 2;
-        }
-      } else {
-        bs = BLOCK_64X64;
-      }
-
-      const int cdef_count = av1_cdef_compute_sb_list(
-          mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
-
-      const int yoff = CDEF_VBORDER * (fbr != 0);
-      const int xoff = CDEF_HBORDER * (fbc != 0);
-      int dirinit = 0;
-      for (int pli = 0; pli < num_planes; pli++) {
-        for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
-        /* We avoid filtering the pixels for which some of the pixels to
-           average are outside the frame. We could change the filter instead,
-           but it would add special cases for any future vectorization. */
-        const int ysize = (nvb << mi_high_l2[pli]) +
-                          CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
-        const int xsize = (nhb << mi_wide_l2[pli]) +
-                          CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
-        const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
-        const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
-        for (int gi = 0; gi < total_strengths; gi++) {
-          int pri_strength = gi / CDEF_SEC_STRENGTHS;
-          if (fast) pri_strength = get_pri_strength(pick_method, pri_strength);
-          const int sec_strength = gi % CDEF_SEC_STRENGTHS;
-          copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
-                  xd->plane[pli].dst.buf, row - yoff, col - xoff,
-                  xd->plane[pli].dst.stride, ysize, xsize);
-          av1_cdef_filter_fb(
-              NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli], dir,
-              &dirinit, var, pli, dlist, cdef_count, pri_strength,
-              sec_strength + (sec_strength == 3), damping, coeff_shift);
-          const uint64_t curr_mse = compute_cdef_dist_fn(
-              ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count,
-              bsize[pli], coeff_shift, row, col);
-          if (pli < 2)
-            mse[pli][sb_count][gi] = curr_mse;
-          else
-            mse[1][sb_count][gi] += curr_mse;
-        }
-      }
-      sb_index[sb_count++] =
-          MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
-    }
-  }
-
-  /* Search for different number of signalling bits. */
+  /* Search for different number of signaling bits. */
   int nb_strength_bits = 0;
   uint64_t best_rd = UINT64_MAX;
   CdefInfo *const cdef_info = &cm->cdef_info;
+  int sb_count = cdef_search_ctx.sb_count;
+  uint64_t(*mse[2])[TOTAL_STRENGTHS];
+  mse[0] = cdef_search_ctx.mse[0];
+  mse[1] = cdef_search_ctx.mse[1];
   for (int i = 0; i <= 3; i++) {
     int best_lev0[CDEF_MAX_STRENGTHS];
     int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
@@ -560,28 +674,23 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
         best_mse = curr;
       }
     }
-    mi_params->mi_grid_base[sb_index[i]]->cdef_strength = best_gi;
+    mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
+        best_gi;
   }
-
   if (fast) {
     for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
       const int luma_strength = cdef_info->cdef_strengths[j];
       const int chroma_strength = cdef_info->cdef_uv_strengths[j];
-      int pri_strength;
-      pri_strength =
-          get_pri_strength(pick_method, luma_strength / CDEF_SEC_STRENGTHS);
-      cdef_info->cdef_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
-                                     (luma_strength % CDEF_SEC_STRENGTHS);
-      pri_strength =
-          get_pri_strength(pick_method, chroma_strength / CDEF_SEC_STRENGTHS);
-      cdef_info->cdef_uv_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
-                                        (chroma_strength % CDEF_SEC_STRENGTHS);
+      int pri_strength, sec_strength;
+
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method,
+                                 luma_strength);
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method,
+                                 chroma_strength);
     }
   }
 
   cdef_info->cdef_damping = damping;
-
-  aom_free(mse[0]);
-  aom_free(mse[1]);
-  aom_free(sb_index);
+  // Deallocate CDEF search context buffers.
+  cdef_dealloc_data(&cdef_search_ctx);
 }
diff --git a/media/libaom/src/av1/encoder/pickcdef.h b/media/libaom/src/av1/encoder/pickcdef.h
new file mode 100644
index 0000000000..d52cb4bc66
--- /dev/null
+++ b/media/libaom/src/av1/encoder/pickcdef.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKCDEF_H_
+#define AOM_AV1_ENCODER_PICKCDEF_H_
+
+#include "av1/common/cdef.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\enum CDEF_CONTROL
+ * \brief This enum controls to which frames CDEF is applied.
+ */
+typedef enum {
+  CDEF_NONE = 0,      /*!< Disable CDEF on all frames. */
+  CDEF_ALL = 1,       /*!< Enable CDEF for all frames. */
+  CDEF_REFERENCE = 2, /*!< Disable CDEF on non reference frames. */
+} CDEF_CONTROL;
+
+/*!\cond */
+struct MultiThreadInfo;
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+#define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
+#define REDUCED_PRI_STRENGTHS_LVL4 2
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL3 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL4 \
+  (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+  (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2,  3,
+                                                              5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
+static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
+static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+  TOTAL_STRENGTHS,
+  REDUCED_TOTAL_STRENGTHS_LVL1,
+  REDUCED_TOTAL_STRENGTHS_LVL2,
+  REDUCED_TOTAL_STRENGTHS_LVL3,
+  REDUCED_TOTAL_STRENGTHS_LVL4,
+  REDUCED_TOTAL_STRENGTHS_LVL5,
+  TOTAL_STRENGTHS
+};
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+                          int src_voffset, int src_hoffset, int sstride,
+                          int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+                                        cdef_list *dlist, int cdef_count,
+                                        BLOCK_SIZE bsize, int coeff_shift,
+                                        int row, int col);
+
+/*! \brief CDEF search context.
+ */
+typedef struct {
+  /*!
+   * Pointer to the frame buffer holding the source frame
+   */
+  const YV12_BUFFER_CONFIG *ref;
+  /*!
+   * Pointer to params related to MB_MODE_INFO arrays and related info
+   */
+  CommonModeInfoParams *mi_params;
+  /*!
+   * Info specific to each plane
+   */
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+  /*!
+   * Function pointer of copy_fn
+   */
+  copy_fn_t copy_fn;
+  /*!
+   * Function pointer of compute_cdef_dist_fn
+   */
+  compute_cdef_dist_t compute_cdef_dist_fn;
+  /*!
+   *  Number of strenghts evaluated in CDEF filter search
+   */
+  int total_strengths;
+  /*!
+   * Bit-depth dependent shift
+   */
+  int coeff_shift;
+  /*!
+   * CDEF damping factor
+   */
+  int damping;
+  /*!
+   * Search method used to select CDEF parameters
+   */
+  int pick_method;
+  /*!
+   * Number of planes
+   */
+  int num_planes;
+  /*!
+   * Log2 of width of the MI unit in pixels. mi_wide_l2[i]
+   * indicates the width of the MI unit in pixels for the ith plane
+   */
+  int mi_wide_l2[MAX_MB_PLANE];
+  /*!
+   * Log2 of height of the MI unit in pixels. mi_high_l2[i]
+   * indicates the height of the MI unit in pixels for the ith plane
+   */
+  int mi_high_l2[MAX_MB_PLANE];
+  /*!
+   * Subsampling in x direction. xdec[i] indicates the subsampling
+   * for the ith plane
+   */
+  int xdec[MAX_MB_PLANE];
+  /*!
+   * Subsampling in y direction. ydec[i] indicates the subsampling
+   * for the ith plane
+   */
+  int ydec[MAX_MB_PLANE];
+  /*!
+   * bsize[i] indicates the block size of ith plane
+   */
+  int bsize[MAX_MB_PLANE];
+  /*!
+   * Number of 64x64 blocks in vertical direction of a frame
+   */
+  int nvfb;
+  /*!
+   * Number of 64x64 blocks in horizontal direction of a frame
+   */
+  int nhfb;
+  /*!
+   * Pointer to the mean squared error between the CDEF filtered block and the
+   * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds
+   * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength
+   * index
+   */
+  uint64_t (*mse[2])[TOTAL_STRENGTHS];
+  /*!
+   * Holds the position (in units of mi's) of the cdef filtered
+   * block in raster scan order
+   */
+  int *sb_index;
+  /*!
+   * Holds the count of cdef filtered blocks
+   */
+  int sb_count;
+} CdefSearchCtx;
+
+static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
+                              int mi_row, int mi_col) {
+  const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+  const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+  const int stride = mi_params->mi_stride;
+  MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
+  for (int r = 0; r < maxr; ++r, mbmi += stride) {
+    for (int c = 0; c < maxc; ++c) {
+      if (!mbmi[c]->skip_txfm) return 0;
+    }
+  }
+  return 1;
+}
+
+// Checks if cdef processing can be skipped for particular sb.
+// Inputs:
+//   cdef_search_ctx: Pointer to the structure containing parameters related to
+//   CDEF search context.
+//   fbr: Row index in units of 64x64 block
+//   fbc: Column index in units of 64x64 block
+// Returns:
+//   1/0 will be returned to indicate skip/don't skip cdef processing of sb
+//   respectively.
+static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
+                               int fbr, int fbc) {
+  const MB_MODE_INFO *const mbmi =
+      mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+  // No filtering if the entire filter block is skipped.
+  if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+    return 1;
+  // Skip odd numbered 64x64 block rows(cols) when bsize is BLOCK_128X128,
+  // BLOCK_64X128(BLOCK_128X128, BLOCK_128X64) as for such blocks CDEF filtering
+  // is done at the corresponding block sizes.
+  if (((fbc & 1) &&
+       (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) ||
+      ((fbr & 1) &&
+       (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128)))
+    return 1;
+  return 0;
+}
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
+                             int sb_count);
+/*!\endcond */
+
+/*!\brief AV1 CDEF parameter search
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Searches for optimal CDEF parameters for frame
+ *
+ * \param[in]      mt_info      Pointer to multi-threading parameters
+ * \param[in]      frame        Compressed frame buffer
+ * \param[in]      ref          Source frame buffer
+ * \param[in,out]  cm           Pointer to top level common structure
+ * \param[in]      xd           Pointer to common current coding block structure
+ * \param[in]      pick_method  The method used to select params
+ * \param[in]      rdmult       rd multiplier to use in making param choices
+ * \param[in]      skip_cdef_feature Speed feature to skip cdef
+ * \param[in]      frames_since_key Number of frames since key frame
+ * \param[in]      cdef_control  Parameter that controls CDEF application
+ * \param[in]      non_reference_frame Indicates if current frame is
+ * non-reference
+ *
+ * \return Nothing is returned. Instead, optimal CDEF parameters are stored
+ * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
+ * \arg \c cdef_bits: Bits of strength parameters
+ * \arg \c nb_cdef_strengths: Number of strength parameters
+ * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the luma plane.
+ * \arg \c uv_cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the chroma planes.
+ * \arg \c damping_factor: CDEF damping factor.
+ *
+ */
+void av1_cdef_search(struct MultiThreadInfo *mt_info,
+                     const YV12_BUFFER_CONFIG *frame,
+                     const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
+                     int skip_cdef_feature, int frames_since_key,
+                     CDEF_CONTROL cdef_control, int non_reference_frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_ENCODER_PICKCDEF_H_
diff --git a/media/libaom/src/av1/encoder/picklpf.c b/media/libaom/src/av1/encoder/picklpf.c
index 17c9965519..3aebe22f8f 100644
--- a/media/libaom/src/av1/encoder/picklpf.c
+++ b/media/libaom/src/av1/encoder/picklpf.c
@@ -39,8 +39,8 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
 
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
   if (is_stat_consumption_stage_twopass(cpi)) {
-    return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
-                                                 : MAX_LOOP_FILTER;
+    return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+                                                      : MAX_LOOP_FILTER;
   } else {
     return MAX_LOOP_FILTER;
   }
@@ -49,6 +49,8 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) {
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 AV1_COMP *const cpi, int filt_level,
                                 int partial_frame, int plane, int dir) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  int num_workers = mt_info->num_mod_workers[MOD_LPF];
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
@@ -67,24 +69,15 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
     case 2: cm->lf.filter_level_v = filter_level[0]; break;
   }
 
-  // TODO(any): please enable multi-thread and remove the flag when loop
-  // filter mask is compatible with multi-thread.
-  if (cpi->num_workers > 1)
-    av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
-                             plane + 1, partial_frame,
-#if CONFIG_LPF_MASK
-                             0,
-#endif
-                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
-  else
-    av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
-#if CONFIG_LPF_MASK
-                          0,
-#endif
-                          plane, plane + 1, partial_frame);
+  // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+  int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf);
+
+  av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
+                           plane + 1, partial_frame, mt_info->workers,
+                           num_workers, &mt_info->lf_row_sync, lpf_opt_level);
 
   filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
-                               cm->seq_params.use_highbitdepth);
+                               cm->seq_params->use_highbitdepth);
 
   // Re-instate the unfiltered frame
   yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
@@ -94,15 +87,14 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
 
 static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                                int partial_frame,
-                               const int *last_frame_filter_level,
-                               double *best_cost_ret, int plane, int dir) {
+                               const int *last_frame_filter_level, int plane,
+                               int dir) {
   const AV1_COMMON *const cm = &cpi->common;
   const int min_filter_level = 0;
   const int max_filter_level = av1_get_max_filter_level(cpi);
   int filt_direction = 0;
   int64_t best_err;
   int filt_best;
-  MACROBLOCK *x = &cpi->td.mb;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
@@ -128,6 +120,13 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
 
+  const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search;
+  assert(use_coarse_search <= 1);
+  static const int min_filter_step_lookup[2] = { 0, 2 };
+  // min_filter_step_thesh determines the stopping criteria for the search.
+  // The search is terminated when filter_step equals min_filter_step_thesh.
+  const int min_filter_step_thesh = min_filter_step_lookup[use_coarse_search];
+
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
   yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
@@ -135,7 +134,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
-  while (filter_step > 0) {
+  while (filter_step > min_filter_step_thesh) {
     const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
     const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
 
@@ -143,8 +142,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
     if ((is_stat_consumption_stage_twopass(cpi)) &&
-        (cpi->twopass.section_intra_rating < 20))
-      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+        (cpi->ppi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20;
 
     // yx, bias less for large block size
     if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
@@ -190,23 +189,35 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     }
   }
 
-  // Update best error
-  best_err = ss_err[filt_best];
-
-  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
   return filt_best;
 }
 
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   struct loopfilter *const lf = &cm->lf;
+  int disable_filter_rt_screen = 0;
   (void)sd;
 
   lf->sharpness_level = 0;
   cpi->td.mb.rdmult = cpi->rd.RDMULT;
 
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->sf.rt_sf.skip_lf_screen)
+    disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi);
+
+  if (disable_filter_rt_screen ||
+      cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE ||
+      (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE &&
+       cpi->svc.non_reference_frame)) {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+    return;
+  }
+
   if (method == LPF_PICK_MINIMAL_LPF) {
     lf->filter_level[0] = 0;
     lf->filter_level[1] = 0;
@@ -214,12 +225,16 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
     const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
-                                   cm->seq_params.bit_depth);
+                                   seq_params->bit_depth);
     // based on tests result for rtc test set
     // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
-    const int strength_boost_q_treshold = 700;
-    const int inter_frame_multiplier =
-        q > strength_boost_q_treshold ? 12034 : 6017;
+    const int strength_boost_q_treshold = 0;
+    int inter_frame_multiplier =
+        (q > strength_boost_q_treshold ||
+         (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+          cpi->common.width * cpi->common.height > 352 * 288))
+            ? 12034
+            : 6017;
     // These values were determined by linear fitting the result of the
     // searched level for 8 bit depth:
     // Keyframes: filt_guess = q * 0.06699 - 1.60817
@@ -228,7 +243,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     // And high bit depth separately:
     // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
-    switch (cm->seq_params.bit_depth) {
+    switch (seq_params->bit_depth) {
       case AOM_BITS_8:
         filt_guess =
             (cm->current_frame.frame_type == KEY_FRAME)
@@ -247,7 +262,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                "or AOM_BITS_12");
         return;
     }
-    if (cm->seq_params.bit_depth != AOM_BITS_8 &&
+    if (seq_params->bit_depth != AOM_BITS_8 &&
         cm->current_frame.frame_type == KEY_FRAME)
       filt_guess -= 4;
     // TODO(chengchen): retrain the model for Y, U, V filter levels
@@ -255,31 +270,60 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
     lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
     lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
+    if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+        !frame_is_intra_only(cm)) {
+      if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+        lf->filter_level[0] = 0;
+        lf->filter_level[1] = 0;
+      } else {
+        const int num4x4 = (cm->width >> 2) * (cm->height >> 2);
+        const int newmv_thresh = 7;
+        const int distance_since_key_thresh = 5;
+        if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) <
+                newmv_thresh &&
+            cpi->rc.frames_since_key > distance_since_key_thresh) {
+          lf->filter_level[0] = 0;
+          lf->filter_level[1] = 0;
+        }
+      }
+    }
   } else {
-    const int last_frame_filter_level[4] = { lf->filter_level[0],
-                                             lf->filter_level[1],
-                                             lf->filter_level_u,
-                                             lf->filter_level_v };
+    int last_frame_filter_level[4] = { 0 };
+    if (!frame_is_intra_only(cm)) {
+      last_frame_filter_level[0] = cpi->ppi->filter_level[0];
+      last_frame_filter_level[1] = cpi->ppi->filter_level[1];
+      last_frame_filter_level[2] = cpi->ppi->filter_level_u;
+      last_frame_filter_level[3] = cpi->ppi->filter_level_v;
+    }
+    // The frame buffer last_frame_uf is used to store the non-loop filtered
+    // reconstructed frame in search_filter_level().
+    if (aom_realloc_frame_buffer(
+            &cpi->last_frame_uf, cm->width, cm->height,
+            seq_params->subsampling_x, seq_params->subsampling_y,
+            seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+            cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate last frame buffer");
 
     lf->filter_level[0] = lf->filter_level[1] =
         search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                            last_frame_filter_level, NULL, 0, 2);
+                            last_frame_filter_level, 0, 2);
     if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
       lf->filter_level[0] =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 0);
+                              last_frame_filter_level, 0, 0);
       lf->filter_level[1] =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 1);
+                              last_frame_filter_level, 0, 1);
     }
 
     if (num_planes > 1) {
       lf->filter_level_u =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 1, 0);
+                              last_frame_filter_level, 1, 0);
       lf->filter_level_v =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 2, 0);
+                              last_frame_filter_level, 2, 0);
     }
   }
 }
diff --git a/media/libaom/src/av1/encoder/picklpf.h b/media/libaom/src/av1/encoder/picklpf.h
index 357097ae1b..727335517b 100644
--- a/media/libaom/src/av1/encoder/picklpf.h
+++ b/media/libaom/src/av1/encoder/picklpf.h
@@ -21,6 +21,141 @@ extern "C" {
 struct yv12_buffer_config;
 struct AV1_COMP;
 int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+/*!\brief Algorithm for AV1 loop filter level selection.
+ *
+ * \ingroup in_loop_filter
+ * This function determines proper filter levels used for in-loop filter
+ * (deblock filter).
+ *
+ * \param[in]    sd             The pointer of frame buffer
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    method         The method used to select filter levels
+ *
+ * \par
+ * method includes:
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE:  Try the full image with different values.
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search
+ * with non-dual filter only.
+ * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with
+ * different values.
+ * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type
+ * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
+ * frame
+ *
+ * \return Nothing is returned. Instead, filter levels below are stored in the
+ * "loopfilter" structure inside "cpi":
+ * \arg \c filter_level[0]: the vertical filter level for Y plane
+ * \arg \c filter_level[1]: the horizontal filter level for Y plane
+ * \arg \c filter_level_u: the filter level for U plane
+ * \arg \c filter_level_v: the filter level for V plane
+ *
+ * \n
+ * \b Overview
+ * \par
+ * The workflow of deblock filter is shown in Fig.1. \n
+ * Boundary pixels pass through a non-flatness check, followed by a step that
+ * determines smoothness and selects proper types of filters
+ * (4-, 6-, 8-, 14-tap filter). \n
+ * If non-flatness criteria is not satisfied, the encoder will not apply
+ * deblock filtering on these boundary pixels.
+ * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70%
+ *
+ * \par
+ * The non-flatness is determined by the boundary pixels and thresholds as shown
+ * in Fig.2. \n
+ * Filtering is applied when \n
+ * \f$|p_0-p_1|<thr_1\f$   and   \f$|q_0-q_1|<thr_1\f$   and
+ * \f$2*|p_0-q_0|+|p_1-q_1|/2<thr_2\f$ \n
+ * \image html filter_thr.png "Fig.2. Non-flatness of pixel boundary" height=40%
+ *
+ * \par
+ * Thresholds ("thr_1" and "thr_2") are determined by the filter level. \n
+ * In AV1, for each frame, we employ the four filter levels, based on these
+ * observations: \n
+ * Luma and chroma planes have different characteristics, including subsampling
+ * (different plane size), coding quality (chroma planes are better coded). \n
+ * Therefore chroma planes need less deblocking filtering than luma plane. \n
+ * In addition, content texture has different spatial characteristics: vertical
+ * and horizontal direction may need different level of filtering. \n
+ * The selection of these filter levels is described in the following section.
+ *
+ * \par
+ * \b Algorithm
+ * \par
+ * The encoder selects filter levels given the current frame buffer, and the
+ * method. \n
+ * By default, "LPF_PICK_FROM_FULL_IMAGE" is used, which should provide
+ * the most appropriate filter levels. \n
+ * For video on demand (VOD) mode, if speed setting is larger than 5,
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" is used. \n
+ * For real-time mode, if speed setting is larger than 5, "LPF_PICK_FROM_Q" is
+ * used.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE" method: determine filter levels sequentially
+ * by a filter level search procedure (function "search_filter_level"). \n
+ * The order is: \n
+ * First search and determine the filter level for Y plane.
+ * Let vertical filter level (filter_level[0]) and the horizontal filter level
+ * (filter_level[1]) be equal to it. \n
+ * Keep the horizontal filter level the same and search and determine the
+ * vertical filter level. \n
+ * Search and determine the horizontal filter level. \n
+ * Search and determine filter level for U plane. \n
+ * Search and determine filter level for V plane.
+ *
+ * \par
+ * Search and determine filter level is fulfilled by function
+ * "search_filter_level". \n
+ * It starts with a base filter level ("filt_mid") initialized by the
+ * corresponding last frame's filter level. \n
+ * A filter step ("filter_step") is determined as:
+ * filter_step = filt_mid < 16 ? 4 : filt_mid / 4. \n
+ * Then a modified binary search strategy is employed to find a proper
+ * filter level. \n
+ * In each iteration, set filt_low = filt_mid - filter_step,
+ * filt_high = filt_mid + filter_step. \n
+ * We now have three candidate levels, "filt_mid", "filt_low" and "filt_high".
+ * \n
+ * Deblock filtering is applied on the current frame with candidate filter
+ * levels and the sum of squared error (SSE) between source and filtered frame
+ * is computed. \n
+ * Set "filt_best" to the filter level of the smallest SSE. If "filter_best"
+ * equals to "filt_mid", halve the filter_step. Otherwise, set filt_mid =
+ * filt_best. \n
+ * Go to the next iteration until "filter_step" is 0. \n
+ * Note that in the comparison of SSEs between SSE[filt_low] and SSE[filt_mid],
+ * a "bias" is introduced to slightly raise the filter level. \n
+ * It is based on the observation that low filter levels tend to yield a smaller
+ * SSE and produce a higher PSNR for the current frame, \n
+ * while oversmoothing it and degradating the quality for prediction for future
+ * frames and leanding to a suboptimal performance overall. \n
+ * Function "try_filter_frame" is the referrence for applying deblock filtering
+ * with a given filter level and computatition of SSE.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" method: almost the same as
+ * "LPF_PICK_FROM_FULL_IMAGE", \n
+ * just without separately searching for appropriate filter levels for vertical
+ * and horizontal filters.
+ *
+ * \par
+ * "LPF_PICK_FROM_Q" method: filter levels are determined by the
+ * quantization factor (q). \n
+ * For 8 bit: \n
+ *   Keyframes: filt_guess = q * 0.06699 - 1.60817 \n
+ *   Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 \n
+ *   inter_frame_multiplier = q > 700 ? 0.04590 : 0.02295 \n
+ * For 10 bit and 12 bit: \n
+ * filt_guess = q * 0.316206 + 3.87252 \n
+ * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v =
+ * clamp(filt_guess, min_filter_level, max_filter_level) \n
+ * Where min_filter_level = 0, max_filter_level = 64 \n
+ * The equations were determined by linear fitting using filter levels
+ * generated by "LPF_PICK_FROM_FULL_IMAGE" method.
+ *
+ */
 void av1_pick_filter_level(const struct yv12_buffer_config *sd,
                            struct AV1_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/encoder/pickrst.c b/media/libaom/src/av1/encoder/pickrst.c
index ccbe1cc3ea..008c469be5 100644
--- a/media/libaom/src/av1/encoder/pickrst.c
+++ b/media/libaom/src/av1/encoder/pickrst.c
@@ -19,17 +19,16 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/mathutils.h"
 #include "aom_dsp/psnr.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/restoration.h"
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
-#include "av1/encoder/mathutils.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 
@@ -133,7 +132,7 @@ typedef struct {
   RestUnitSearchInfo *rusi;
 
   // Speed features
-  const SPEED_FEATURES *sf;
+  const LOOP_FILTER_SPEED_FEATURES *lpf_sf;
 
   uint8_t *dgd_buffer;
   int dgd_stride;
@@ -166,8 +165,8 @@ static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
 
 static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
                                 const AV1_COMMON *cm, const MACROBLOCK *x,
-                                const SPEED_FEATURES *sf, int plane,
-                                RestUnitSearchInfo *rusi,
+                                const LOOP_FILTER_SPEED_FEATURES *lpf_sf,
+                                int plane, RestUnitSearchInfo *rusi,
                                 YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
   rsc->src = src;
   rsc->dst = dst;
@@ -175,7 +174,7 @@ static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
   rsc->x = x;
   rsc->plane = plane;
   rsc->rusi = rusi;
-  rsc->sf = sf;
+  rsc->lpf_sf = lpf_sf;
 
   const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
   const int is_uv = plane != AOM_PLANE_Y;
@@ -199,8 +198,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
   const int is_uv = plane > 0;
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationLineBuffers rlbs;
-  const int bit_depth = cm->seq_params.bit_depth;
-  const int highbd = cm->seq_params.use_highbitdepth;
+  const int bit_depth = cm->seq_params->bit_depth;
+  const int highbd = cm->seq_params->use_highbitdepth;
 
   const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
   // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
@@ -209,8 +208,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
 
   av1_loop_restoration_filter_unit(
       limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
-      is_uv && cm->seq_params.subsampling_x,
-      is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
+      is_uv && cm->seq_params->subsampling_x,
+      is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
       fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
       rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
 
@@ -631,11 +630,13 @@ void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
   }
 }
 
-static AOM_INLINE void av1_calc_proj_params_high_bd_c(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2],
-    const sgr_params_type *params) {
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int32_t *flt0,
+                                    int flt0_stride, int32_t *flt1,
+                                    int flt1_stride, int64_t H[2][2],
+                                    int64_t C[2],
+                                    const sgr_params_type *params) {
   if ((params->r[0] > 0) && (params->r[1] > 0)) {
     calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
                                      dat_stride, flt0, flt0_stride, flt1,
@@ -672,11 +673,20 @@ static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
                              flt0, flt0_stride, flt1, flt1_stride, H, C,
                              params);
     }
-  } else {
-    av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+  }
+#if CONFIG_AV1_HIGHBITDEPTH
+  else {  // NOLINT
+    if ((width & 0x7) == 0) {
+      av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
                                    dat_stride, flt0, flt0_stride, flt1,
                                    flt1_stride, H, C, params);
+    } else {
+      av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, H, C, params);
+    }
   }
+#endif
 
   if (params->r[0] == 0) {
     // H matrix is now only the scalar H[1][1]
@@ -763,12 +773,10 @@ static AOM_INLINE void compute_sgrproj_err(
   int exq[2];
   apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
             pu_width, pu_height, flt0, flt1, flt_stride);
-  aom_clear_system_state();
   const sgr_params_type *const params = &av1_sgr_params[ep];
   get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
                     use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
                     params);
-  aom_clear_system_state();
   encode_xq(exq, exqd, params);
   *err = finer_search_pixel_proj_error(
       src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
@@ -875,10 +883,10 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
 
   const MACROBLOCK *const x = rsc->x;
   const AV1_COMMON *const cm = rsc->cm;
-  const int highbd = cm->seq_params.use_highbitdepth;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int highbd = cm->seq_params->use_highbitdepth;
+  const int bit_depth = cm->seq_params->bit_depth;
 
-  const int64_t bits_none = x->sgrproj_restore_cost[0];
+  const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
   // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
   if (rusi->skip_sgr_eval) {
     rsc->bits += bits_none;
@@ -894,8 +902,8 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
       rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
 
   const int is_uv = rsc->plane > 0;
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
   const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
 
@@ -903,7 +911,7 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
       dgd_start, limits->h_end - limits->h_start,
       limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
       rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
-      tmpbuf, rsc->sf->lpf_sf.enable_sgr_ep_pruning);
+      tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning);
 
   RestorationUnitInfo rui;
   rui.restoration_type = RESTORE_SGRPROJ;
@@ -911,17 +919,16 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
 
   rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
 
-  const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
+  const int64_t bits_sgr = x->mode_costs.sgrproj_restore_cost[1] +
                            (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
                             << AV1_PROB_COST_SHIFT);
-
-  double cost_none =
-      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
-  double cost_sgr =
-      RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
+  double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], bit_depth);
+  double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ], bit_depth);
   if (rusi->sgrproj.ep < 10)
     cost_sgr *=
-        (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
+        (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
 
   RestorationType rtype =
       (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
@@ -932,40 +939,77 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits,
   if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
 }
 
+void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src, int dgd_stride,
+                       int h_start, int h_end, uint8_t avg,
+                       const int wiener_halfwin, const int wiener_win2,
+                       int32_t *M_int32, int32_t *H_int32, int count) {
+  int j, k, l;
+  int16_t Y[WIENER_WIN2];
+
+  for (j = h_start; j < h_end; j++) {
+    const int16_t X = (int16_t)src[j] - (int16_t)avg;
+    int idx = 0;
+    for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+      for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+        Y[idx] =
+            (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg;
+        idx++;
+      }
+    }
+    assert(idx == wiener_win2);
+    for (k = 0; k < wiener_win2; ++k) {
+      M_int32[k] += (int32_t)Y[k] * X;
+      for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. We can copy it down to the lower triangle outside
+        // the (i, j) loops.
+        H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l];
+      }
+    }
+  }
+}
+
 void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
                          int h_start, int h_end, int v_start, int v_end,
-                         int dgd_stride, int src_stride, int64_t *M,
-                         int64_t *H) {
-  int i, j, k, l;
-  int16_t Y[WIENER_WIN2];
+                         int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+                         int use_downsampled_wiener_stats) {
+  int i, k, l;
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin = (wiener_win >> 1);
   uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+  int32_t M_row[WIENER_WIN2] = { 0 };
+  int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 };
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
 
   memset(M, 0, sizeof(*M) * wiener_win2);
   memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
-  for (i = v_start; i < v_end; i++) {
-    for (j = h_start; j < h_end; j++) {
-      const int16_t X = (int16_t)src[i * src_stride + j] - (int16_t)avg;
-      int idx = 0;
-      for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
-        for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
-          Y[idx] = (int16_t)dgd[(i + l) * dgd_stride + (j + k)] - (int16_t)avg;
-          idx++;
-        }
-      }
-      assert(idx == wiener_win2);
-      for (k = 0; k < wiener_win2; ++k) {
-        M[k] += (int32_t)Y[k] * X;
-        for (l = k; l < wiener_win2; ++l) {
-          // H is a symmetric matrix, so we only need to fill out the upper
-          // triangle here. We can copy it down to the lower triangle outside
-          // the (i, j) loops.
-          H[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l];
-        }
+
+  for (i = v_start; i < v_end; i = i + downsample_factor) {
+    if (use_downsampled_wiener_stats &&
+        (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+      downsample_factor = v_end - i;
+    }
+
+    memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2);
+    memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2);
+    acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end,
+                      avg, wiener_halfwin, wiener_win2, M_row, H_row, i);
+
+    for (k = 0; k < wiener_win2; ++k) {
+      // Scale M matrix based on the downsampling factor
+      M[k] += ((int64_t)M_row[k] * downsample_factor);
+      for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. We can copy it down to the lower triangle outside
+        // the (i, j) loops.
+        // Scale H Matrix based on the downsampling factor
+        H[k * wiener_win2 + l] +=
+            ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor);
       }
     }
   }
+
   for (k = 0; k < wiener_win2; ++k) {
     for (l = k + 1; l < wiener_win2; ++l) {
       H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
@@ -1037,7 +1081,7 @@ static INLINE int wrap_index(int i, int wiener_win) {
 // Solve linear equations to find Wiener filter tap values
 // Taps are output scaled by WIENER_FILT_STEP
 static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
-                           int32_t *x) {
+                           int64_t *x) {
   for (int k = 0; k < n - 1; k++) {
     // Partial pivoting: bring the row with the largest pivot to the top
     for (int i = n - 1; i > k; i--) {
@@ -1072,7 +1116,7 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
       c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR;
     }
     // Store filter taps x in scaled form.
-    x[i] = (int32_t)(WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]);
+    x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i];
   }
 
   return 1;
@@ -1082,7 +1126,7 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
 static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
                                         int64_t **Hc, int32_t *a, int32_t *b) {
   int i, j;
-  int32_t S[WIENER_WIN];
+  int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
@@ -1130,7 +1174,10 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
       S[i] = S[wiener_win - 1 - i];
       S[wiener_halfwin1 - 1] -= 2 * S[i];
     }
-    memcpy(a, S, wiener_win * sizeof(*a));
+    for (i = 0; i < wiener_win; ++i) {
+      a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+                           (1 << (WIENER_FILT_BITS - 1)) - 1);
+    }
   }
 }
 
@@ -1138,7 +1185,7 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
 static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
                                         int64_t **Hc, int32_t *a, int32_t *b) {
   int i, j;
-  int32_t S[WIENER_WIN];
+  int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
@@ -1187,12 +1234,15 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
       S[i] = S[wiener_win - 1 - i];
       S[wiener_halfwin1 - 1] -= 2 * S[i];
     }
-    memcpy(b, S, wiener_win * sizeof(*b));
+    for (i = 0; i < wiener_win; ++i) {
+      b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+                           (1 << (WIENER_FILT_BITS - 1)) - 1);
+    }
   }
 }
 
-static int wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
-                                    int32_t *a, int32_t *b) {
+static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
+                                     int32_t *a, int32_t *b) {
   static const int32_t init_filt[WIENER_WIN] = {
     WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
     WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
@@ -1221,7 +1271,6 @@ static int wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
     update_b_sep_sym(wiener_win, Mc, Hc, a, b);
     iter++;
   }
-  return 1;
 }
 
 // Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares
@@ -1238,8 +1287,6 @@ static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H,
   const int plane_off = (WIENER_WIN - wiener_win) >> 1;
   const int wiener_win2 = wiener_win * wiener_win;
 
-  aom_clear_system_state();
-
   a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP;
   for (i = 0; i < WIENER_HALFWIN; ++i) {
     a[i] = a[WIENER_WIN - i - 1] = vfilt[i];
@@ -1274,7 +1321,7 @@ static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f,
   const int wiener_halfwin = (wiener_win >> 1);
 
   for (i = 0; i < wiener_halfwin; ++i) {
-    const int64_t dividend = f[i] * WIENER_FILT_STEP;
+    const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP;
     const int64_t divisor = WIENER_TAP_SCALE_FACTOR;
     // Perform this division with proper rounding rather than truncation
     if (dividend < 0) {
@@ -1458,19 +1505,19 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
   const MACROBLOCK *const x = rsc->x;
-  const int64_t bits_none = x->wiener_restore_cost[0];
+  const int64_t bits_none = x->mode_costs.wiener_restore_cost[0];
 
   // Skip Wiener search for low variance contents
-  if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) {
+  if (rsc->lpf_sf->prune_wiener_based_on_src_var) {
     const int scale[3] = { 0, 1, 2 };
     // Obtain the normalized Qscale
     const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
-                                    rsc->cm->seq_params.bit_depth) >>
+                                    rsc->cm->seq_params->bit_depth) >>
                    3;
     // Derive threshold as sqr(normalized Qscale) * scale / 16,
     const uint64_t thresh =
-        (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4;
-    const int highbd = rsc->cm->seq_params.use_highbitdepth;
+        (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
+    const int highbd = rsc->cm->seq_params->use_highbitdepth;
     const uint64_t src_var =
         var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
     // Do not perform Wiener search if source variance is lower than threshold
@@ -1481,8 +1528,7 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
       rsc->sse += rusi->sse[RESTORE_NONE];
       rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
       rusi->sse[RESTORE_WIENER] = INT64_MAX;
-      if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2)
-        rusi->skip_sgr_eval = 1;
+      if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
       return;
     }
   }
@@ -1491,7 +1537,7 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
       (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
 
   int reduced_wiener_win = wiener_win;
-  if (rsc->sf->lpf_sf.reduce_wiener_window_size) {
+  if (rsc->lpf_sf->reduce_wiener_window_size) {
     reduced_wiener_win =
         (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
   }
@@ -1502,30 +1548,27 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
 
 #if CONFIG_AV1_HIGHBITDEPTH
   const AV1_COMMON *const cm = rsc->cm;
-  if (cm->seq_params.use_highbitdepth) {
+  if (cm->seq_params->use_highbitdepth) {
+    // TODO(any) : Add support for use_downsampled_wiener_stats SF in HBD
+    // functions
     av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
                              rsc->src_buffer, limits->h_start, limits->h_end,
                              limits->v_start, limits->v_end, rsc->dgd_stride,
-                             rsc->src_stride, M, H, cm->seq_params.bit_depth);
+                             rsc->src_stride, M, H, cm->seq_params->bit_depth);
   } else {
     av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                       limits->h_start, limits->h_end, limits->v_start,
-                      limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+                      limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H,
+                      rsc->lpf_sf->use_downsampled_wiener_stats);
   }
 #else
   av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                     limits->h_start, limits->h_end, limits->v_start,
-                    limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+                    limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H,
+                    rsc->lpf_sf->use_downsampled_wiener_stats);
 #endif
 
-  if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) {
-    rsc->bits += bits_none;
-    rsc->sse += rusi->sse[RESTORE_NONE];
-    rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
-    rusi->sse[RESTORE_WIENER] = INT64_MAX;
-    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
-    return;
-  }
+  wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
 
   RestorationUnitInfo rui;
   memset(&rui, 0, sizeof(rui));
@@ -1542,12 +1585,10 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
     rsc->sse += rusi->sse[RESTORE_NONE];
     rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
     rusi->sse[RESTORE_WIENER] = INT64_MAX;
-    if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
+    if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1;
     return;
   }
 
-  aom_clear_system_state();
-
   rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener(
       rsc, limits, tile_rect, &rui, reduced_wiener_win);
   rusi->wiener = rui.wiener_info;
@@ -1560,14 +1601,16 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
   }
 
   const int64_t bits_wiener =
-      x->wiener_restore_cost[1] +
+      x->mode_costs.wiener_restore_cost[1] +
       (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
        << AV1_PROB_COST_SHIFT);
 
-  double cost_none =
-      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
-  double cost_wiener =
-      RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]);
+  double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE],
+      rsc->cm->seq_params->bit_depth);
+  double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER],
+      rsc->cm->seq_params->bit_depth);
 
   RestorationType rtype =
       (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
@@ -1575,9 +1618,9 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits,
 
   // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
   // RESTORE_NONE or based on best_rtype
-  if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 1) {
+  if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
     rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
-  } else if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) {
+  } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
     rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
   }
 
@@ -1598,7 +1641,7 @@ static AOM_INLINE void search_norestore(const RestorationTileLimits *limits,
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-  const int highbd = rsc->cm->seq_params.use_highbitdepth;
+  const int highbd = rsc->cm->seq_params->use_highbitdepth;
   rusi->sse[RESTORE_NONE] = sse_restoration_unit(
       limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
 
@@ -1649,11 +1692,11 @@ static AOM_INLINE void search_switchable(const RestorationTileLimits *limits,
       default: assert(0); break;
     }
     const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
-    const int64_t bits = x->switchable_restore_cost[r] + coeff_bits;
-    double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse);
+    const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits;
+    double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth);
     if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
-      cost *=
-          (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level);
+      cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
     if (r == 0 || cost < best_cost) {
       best_cost = cost;
       best_bits = bits;
@@ -1690,7 +1733,8 @@ static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
 
   av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
                                  &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
-  return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);
+  return RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth);
 }
 
 static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
@@ -1700,16 +1744,22 @@ static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
 
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   assert(!cm->features.all_lossless);
 
+  av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
   int ntiles[2];
   for (int is_uv = 0; is_uv < 2; ++is_uv)
     ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
 
   assert(ntiles[1] <= ntiles[0]);
-  RestUnitSearchInfo *rusi =
-      (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]);
+  RestUnitSearchInfo *rusi;
+  CHECK_MEM_ERROR(
+      cm, rusi,
+      (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]));
 
   // If the restoration unit dimensions are not multiples of
   // rsi->restoration_unit_size then some elements of the rusi array may be
@@ -1717,13 +1767,24 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   // problem, as these elements are ignored later, but in order to quiet
   // Valgrind's warnings we initialise the array below.
   memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
-  cpi->td.mb.rdmult = cpi->rd.RDMULT;
+  x->rdmult = cpi->rd.RDMULT;
+
+  // Allocate the frame buffer trial_frame_rst, which is used to temporarily
+  // store the loop restored frame.
+  if (aom_realloc_frame_buffer(
+          &cpi->trial_frame_rst, cm->superres_upscaled_width,
+          cm->superres_upscaled_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL,
+          NULL, 0, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate trial restored frame buffer");
 
   RestSearchCtxt rsc;
   const int plane_start = AOM_PLANE_Y;
   const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
   for (int plane = plane_start; plane <= plane_end; ++plane) {
-    init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi,
+    init_rsc(src, &cpi->common, x, &cpi->sf.lpf_sf, plane, rusi,
              &cpi->trial_frame_rst, &rsc);
 
     const int plane_ntiles = ntiles[plane > 0];
@@ -1733,8 +1794,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
     double best_cost = 0;
     RestorationType best_rtype = RESTORE_NONE;
 
-    const int highbd = rsc.cm->seq_params.use_highbitdepth;
-    if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) {
+    const int highbd = rsc.cm->seq_params->use_highbitdepth;
+    if ((plane && !cpi->sf.lpf_sf.disable_loop_restoration_chroma) ||
+        (!plane && !cpi->sf.lpf_sf.disable_loop_restoration_luma)) {
       av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
                        rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
                        highbd);
diff --git a/media/libaom/src/av1/encoder/pickrst.h b/media/libaom/src/av1/encoder/pickrst.h
index eee30553d4..46a4b48f2c 100644
--- a/media/libaom/src/av1/encoder/pickrst.h
+++ b/media/libaom/src/av1/encoder/pickrst.h
@@ -16,7 +16,6 @@ extern "C" {
 #endif
 
 #include "av1/encoder/encoder.h"
-#include "aom_ports/system_state.h"
 
 struct yv12_buffer_config;
 struct AV1_COMP;
@@ -57,6 +56,39 @@ static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
 }
 #endif
 
+/*!\brief Algorithm for AV1 loop restoration search and estimation.
+ *
+ * \ingroup in_loop_restoration
+ * This function determines proper restoration filter types and
+ * associated parameters for each restoration unit in a frame.
+ *
+ * \param[in]       sd           Source frame buffer
+ * \param[in,out]   cpi          Top-level encoder structure
+ *
+ * \return Nothing is returned. Instead, chosen restoration filter
+ * types and parameters are stored per plane in the \c rst_info structure
+ * of type \ref RestorationInfo inside \c cpi->common:
+ * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
+ * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists
+ * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists
+ * \par
+ * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2
+ * are populated:
+ * \arg \c rst_info[ \c p ].\c frame_restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * for each \c u in 0, 1, ..., \c n( \c p ) - 1,
+ * where \c n( \c p ) is the number of restoration units in plane \c p.
+ * \par
+ * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type
+ * \ref RestorationUnitInfo are populated:
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR
+ *      \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR
+ *      neither, depending on
+ *      \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ *
+ */
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/encoder/ransac.c b/media/libaom/src/av1/encoder/ransac.c
index 07e1a5f5f8..e849b34f0c 100644
--- a/media/libaom/src/av1/encoder/ransac.c
+++ b/media/libaom/src/av1/encoder/ransac.c
@@ -15,8 +15,8 @@
 #include <stdlib.h>
 #include <assert.h>
 
+#include "aom_dsp/mathutils.h"
 #include "av1/encoder/ransac.h"
-#include "av1/encoder/mathutils.h"
 #include "av1/encoder/random.h"
 
 #define MAX_MINPTS 4
@@ -225,6 +225,7 @@ static int find_translation(int np, double *pts1, double *pts2, double *mat) {
 static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) {
   const int np2 = np * 2;
   double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20));
+  if (a == NULL) return 1;
   double *b = a + np2 * 4;
   double *temp = b + np2;
   int i;
diff --git a/media/libaom/src/av1/encoder/ratectrl.c b/media/libaom/src/av1/encoder/ratectrl.c
index 433163f2e7..40da4f4564 100644
--- a/media/libaom/src/av1/encoder/ratectrl.c
+++ b/media/libaom/src/av1/encoder/ratectrl.c
@@ -19,7 +19,7 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
+#include "aom_ports/aom_once.h"
 
 #include "av1/common/alloccommon.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
@@ -95,8 +95,9 @@ static int kf_low = 400;
 
 // How many times less pixels there are to encode given the current scaling.
 // Temporary replacement for rcf_mult and rate_thresh_mult.
-static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
-  return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height);
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+                                 int width, int height) {
+  return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height);
 }
 
 // Functions to compute the active minq lookup table entries based on a
@@ -129,7 +130,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
   }
 }
 
-void av1_rc_init_minq_luts(void) {
+static void rc_init_minq_luts(void) {
   init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
                  arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
                  inter_minq_8, rtc_minq_8, AOM_BITS_8);
@@ -141,6 +142,8 @@ void av1_rc_init_minq_luts(void) {
                  inter_minq_12, rtc_minq_12, AOM_BITS_12);
 }
 
+void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); }
+
 // These functions use formulaic calculations to make playing with the
 // quantizer tables easier. If necessary they can be replaced by lookup
 // tables if and when things settle down in the experimental bitstream
@@ -157,9 +160,13 @@ double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
 }
 
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor, aom_bit_depth_t bit_depth) {
+                       double correction_factor, aom_bit_depth_t bit_depth,
+                       const int is_screen_content_type) {
   const double q = av1_convert_qindex_to_q(qindex, bit_depth);
   int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000;
+  if (is_screen_content_type) {
+    enumerator = frame_type == KEY_FRAME ? 1000000 : 750000;
+  }
 
   assert(correction_factor <= MAX_BPB_FACTOR &&
          correction_factor >= MIN_BPB_FACTOR);
@@ -169,10 +176,10 @@ int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
 }
 
 int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
-                           double correction_factor,
-                           aom_bit_depth_t bit_depth) {
-  const int bpm =
-      (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+                           double correction_factor, aom_bit_depth_t bit_depth,
+                           const int is_screen_content_type) {
+  const int bpm = (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor,
+                                           bit_depth, is_screen_content_type));
   return AOMMAX(FRAME_OVERHEAD_BITS,
                 (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
@@ -197,25 +204,25 @@ int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
 
   // Clip the frame target to the maximum allowed value.
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
-  if (oxcf->rc_max_inter_bitrate_pct) {
+  if (oxcf->rc_cfg.max_inter_bitrate_pct) {
     const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+        rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
 
   return target;
 }
 
-int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
   const RATE_CONTROL *rc = &cpi->rc;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->rc_max_intra_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  if (rc_cfg->max_intra_bitrate_pct) {
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
-  return target;
+  return (int)target;
 }
 
 // Update the buffer level for higher temporal layers, given the encoded current
@@ -227,31 +234,58 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
     const int layer =
         LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
     LAYER_CONTEXT *lc = &svc->layer_context[layer];
-    RATE_CONTROL *lrc = &lc->rc;
-    lrc->bits_off_target +=
-        (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+    PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+    lp_rc->bits_off_target +=
+        (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
     // Clip buffer level to maximum buffer size for the layer.
-    lrc->bits_off_target =
-        AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-    lrc->buffer_level = lrc->bits_off_target;
+    lp_rc->bits_off_target =
+        AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+    lp_rc->buffer_level = lp_rc->bits_off_target;
   }
 }
 // Update the buffer level: leaky bucket model.
 static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
   if (!cm->show_frame)
-    rc->bits_off_target -= encoded_frame_size;
+    p_rc->bits_off_target -= encoded_frame_size;
   else
-    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+    p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
 
   // Clip the buffer level to the maximum specified buffer size.
-  rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = rc->bits_off_target;
-
-  if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  // For screen-content mode: don't let buffel level go below threshold,
+  // given here as -rc->maximum_ buffer_size, to allow buffer to come back
+  // up sooner after slide change with big oveshoot.
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+    p_rc->bits_off_target =
+        AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size);
+  p_rc->buffer_level = p_rc->bits_off_target;
+
+  if (cpi->ppi->use_svc)
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /* The variable temp_buffer_level is introduced for quality
+   * simulation purpose, it retains the value previous to the parallel
+   * encode frames. The variable is updated based on the update flag.
+   *
+   * If there exist show_existing_frames between parallel frames, then to
+   * retain the temp state do not update it. */
+  int show_existing_between_parallel_frames =
+      (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+           INTNL_OVERLAY_UPDATE &&
+       cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+  if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    p_rc->temp_buffer_level = p_rc->buffer_level;
+  }
+#endif
 }
 
 int av1_rc_get_default_min_gf_interval(int width, int height,
@@ -280,81 +314,116 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
   return AOMMAX(interval, min_gf_interval);
 }
 
-void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
-  int i;
-
-  if (pass == 0 && oxcf->rc_mode == AOM_CBR) {
-    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+void av1_primary_rc_init(const AV1EncoderConfig *oxcf,
+                         PRIMARY_RATE_CONTROL *p_rc) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  int worst_allowed_q = rc_cfg->worst_allowed_q;
+
+  int min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+  int max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+  if (min_gf_interval == 0)
+    min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        oxcf->input_cfg.init_framerate);
+  if (max_gf_interval == 0)
+    max_gf_interval = av1_rc_get_default_max_gf_interval(
+        oxcf->input_cfg.init_framerate, min_gf_interval);
+  p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2;
+  p_rc->this_key_frame_forced = 0;
+  p_rc->next_key_frame_forced = 0;
+  p_rc->ni_frames = 0;
+
+  p_rc->tot_q = 0.0;
+  p_rc->total_actual_bits = 0;
+  p_rc->total_target_bits = 0;
+  p_rc->buffer_level = p_rc->starting_buffer_level;
+
+  if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) {
+    worst_allowed_q = 255;
+  }
+  if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) {
+    p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q;
+    p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q;
   } else {
-    rc->avg_frame_qindex[KEY_FRAME] =
-        (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
-    rc->avg_frame_qindex[INTER_FRAME] =
-        (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
-  }
-
-  rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
-  rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
-
-  rc->buffer_level = rc->starting_buffer_level;
-  rc->bits_off_target = rc->starting_buffer_level;
-
-  rc->rolling_target_bits = rc->avg_frame_bandwidth;
-  rc->rolling_actual_bits = rc->avg_frame_bandwidth;
-  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
-  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+    p_rc->avg_frame_qindex[KEY_FRAME] =
+        (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+  }
+  p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+                                        oxcf->tool_cfg.bit_depth);
+  p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+  p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
+
+  for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+    p_rc->rate_correction_factors[i] = 0.7;
+  }
+  p_rc->rate_correction_factors[KF_STD] = 1.0;
+  p_rc->bits_off_target = p_rc->starting_buffer_level;
+
+  p_rc->rolling_target_bits =
+      (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+  p_rc->rolling_actual_bits =
+      (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+}
 
-  rc->total_actual_bits = 0;
-  rc->total_target_bits = 0;
-  rc->total_target_vs_actual = 0;
+void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
-  rc->this_key_frame_forced = 0;
-  rc->next_key_frame_forced = 0;
-  rc->source_alt_ref_pending = 0;
-  rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
-  rc->ni_av_qi = oxcf->worst_allowed_q;
+  rc->ni_av_qi = rc_cfg->worst_allowed_q;
   rc->ni_tot_qi = 0;
-  rc->ni_frames = 0;
 
-  rc->tot_q = 0.0;
-  rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
-
-  for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-    rc->rate_correction_factors[i] = 0.7;
-  }
-  rc->rate_correction_factors[KF_STD] = 1.0;
-  rc->min_gf_interval = oxcf->min_gf_interval;
-  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+  rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
   if (rc->min_gf_interval == 0)
     rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
-        oxcf->width, oxcf->height, oxcf->init_framerate);
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        oxcf->input_cfg.init_framerate);
   if (rc->max_gf_interval == 0)
     rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
-        oxcf->init_framerate, rc->min_gf_interval);
-  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+        oxcf->input_cfg.init_framerate, rc->min_gf_interval);
+  rc->avg_frame_low_motion = 0;
+
+  rc->resize_state = ORIG;
+  rc->resize_avg_qp = 0;
+  rc->resize_buffer_underflow = 0;
+  rc->resize_count = 0;
+  rc->rtc_external_ratectrl = 0;
+  rc->frame_level_fast_extra_bits = 0;
 }
 
 int av1_rc_drop_frame(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int64_t buffer_level =
+      simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level;
+#else
+  int64_t buffer_level = p_rc->buffer_level;
+#endif
 
-  if (!oxcf->drop_frames_water_mark) {
+  if (!oxcf->rc_cfg.drop_frames_water_mark) {
     return 0;
   } else {
-    if (rc->buffer_level < 0) {
+    if (buffer_level < 0) {
       // Always drop if buffer is below 0.
       return 1;
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
-      int drop_mark =
-          (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
-      if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+      int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+                            p_rc->optimal_buffer_level / 100);
+      if ((buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
-      } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+      } else if (buffer_level <= drop_mark && rc->decimation_factor == 0) {
         rc->decimation_factor = 1;
       }
       if (rc->decimation_factor > 0) {
@@ -375,8 +444,12 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
 
 static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1_COMMON *const cm = &cpi->common;
-  const int max_delta = 16;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const int max_delta_down =
+      (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 8 : 16;
+  const int max_delta_up = 20;
   const int change_avg_frame_bandwidth =
       abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
       0.1 * (rc->avg_frame_bandwidth);
@@ -387,18 +460,53 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
       (cm->width != cm->prev_frame->width ||
        cm->height != cm->prev_frame->height || change_avg_frame_bandwidth);
   // Apply some control/clamp to QP under certain conditions.
-  if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc &&
+  if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc &&
       rc->frames_since_key > 1 && !change_target_bits_mb &&
-      (!cpi->oxcf.gf_cbr_boost_pct ||
-       !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame))) {
+      (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+       !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
     // Make sure q is between oscillating Qs to prevent resonance.
     if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
         rc->q_1_frame != rc->q_2_frame) {
-      q = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
-                AOMMAX(rc->q_1_frame, rc->q_2_frame));
+      int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+                         AOMMAX(rc->q_1_frame, rc->q_2_frame));
+      // If the previous frame had overshoot and the current q needs to
+      // increase above the clamped value, reduce the clamp for faster reaction
+      // to overshoot.
+      if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10)
+        q = (q + qclamp) >> 1;
+      else
+        q = qclamp;
+    }
+    // Adjust Q base on source content change from scene detection.
+    if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+        rc->frames_since_key > 10 && rc->frame_source_sad > 0 &&
+        !cpi->ppi->use_svc) {
+      const int bit_depth = cm->seq_params->bit_depth;
+      double delta =
+          (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+      // Push Q downwards if content change is decreasing and buffer level
+      // is stable (at least 1/4-optimal level), so not overshooting. Do so
+      // only for high Q to avoid excess overshoot.
+      // Else reduce decrease in Q from previous frame if content change is
+      // increasing and buffer is below max (so not undershooting).
+      if (delta < 0.0 &&
+          p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+          q > (rc->worst_quality >> 1)) {
+        double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+        double q_val = av1_convert_qindex_to_q(q, bit_depth);
+        q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+      } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+                 p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+                                             p_rc->optimal_buffer_level << 1)) {
+        q = (3 * q + rc->q_1_frame) >> 2;
+      }
     }
     // Limit the decrease in Q from previous frame.
-    if (rc->q_1_frame - q > max_delta) q = rc->q_1_frame - max_delta;
+    if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down;
+    // Limit the increase in Q from previous frame.
+    else if (q - rc->q_1_frame > max_delta_up &&
+             cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
+      q = rc->q_1_frame + max_delta_up;
   }
   // For single spatial layer: if resolution has increased push q closer
   // to the active_worst to avoid excess overshoot.
@@ -419,85 +527,154 @@ static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
   GF_ARF_LOW,    // INTNL_ARF_UPDATE
 };
 
-static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) {
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group,
+                                               int gf_frame_index) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
   assert(update_type < FRAME_UPDATE_TYPES);
   return rate_factor_levels[update_type];
 }
 
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamilcally adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
 static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
                                          int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   double rcf;
+  double rate_correction_factors_kfstd;
+  double rate_correction_factors_gfarfstd;
+  double rate_correction_factors_internormal;
+
+  rate_correction_factors_kfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[KF_STD]
+          : p_rc->rate_correction_factors[KF_STD];
+  rate_correction_factors_gfarfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[GF_ARF_STD]
+          : p_rc->rate_correction_factors[GF_ARF_STD];
+  rate_correction_factors_internormal =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[INTER_NORMAL]
+          : p_rc->rate_correction_factors[INTER_NORMAL];
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
-    rcf = rc->rate_correction_factors[KF_STD];
+    rcf = rate_correction_factors_kfstd;
   } else if (is_stat_consumption_stage(cpi)) {
-    const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
-    rcf = rc->rate_correction_factors[rf_lvl];
+    const RATE_FACTOR_LEVEL rf_lvl =
+        get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    double rate_correction_factors_rflvl =
+        (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+            ? rc->frame_level_rate_correction_factors[rf_lvl]
+            : p_rc->rate_correction_factors[rf_lvl];
+    rcf = rate_correction_factors_rflvl;
   } else {
-    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
-        (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
-      rcf = rc->rate_correction_factors[GF_ARF_STD];
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+        !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+        (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
+      rcf = rate_correction_factors_gfarfstd;
     else
-      rcf = rc->rate_correction_factors[INTER_NORMAL];
+      rcf = rate_correction_factors_internormal;
   }
-  rcf *= resize_rate_factor(cpi, width, height);
+  rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
   return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
-static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
-                                       int height) {
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * This function updates the current value of a correction factor used to
+ * dynamilcally adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   is_encode_stage       Indicates if recode loop or post-encode
+ * \param[in]   factor                New correction factor
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return None but updates the rate correction factor for the
+ *         current frame type in cpi->rc.
+ */
+static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage,
+                                       double factor, int width, int height) {
   RATE_CONTROL *const rc = &cpi->rc;
-
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  int update_default_rcf = 1;
   // Normalize RCF to account for the size-dependent scaling factor.
-  factor /= resize_rate_factor(cpi, width, height);
+  factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
 
   factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
-    rc->rate_correction_factors[KF_STD] = factor;
+    p_rc->rate_correction_factors[KF_STD] = factor;
   } else if (is_stat_consumption_stage(cpi)) {
-    const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
-    rc->rate_correction_factors[rf_lvl] = factor;
+    const RATE_FACTOR_LEVEL rf_lvl =
+        get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    if (is_encode_stage &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      rc->frame_level_rate_correction_factors[rf_lvl] = factor;
+      update_default_rcf = 0;
+    }
+    if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor;
   } else {
-    if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
-        (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
-      rc->rate_correction_factors[GF_ARF_STD] = factor;
-    else
-      rc->rate_correction_factors[INTER_NORMAL] = factor;
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+        !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+        (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+      p_rc->rate_correction_factors[GF_ARF_STD] = factor;
+    } else {
+      if (is_encode_stage &&
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+        rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor;
+        update_default_rcf = 0;
+      }
+      if (update_default_rcf)
+        p_rc->rate_correction_factors[INTER_NORMAL] = factor;
+    }
   }
 }
 
-void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
-                                           int height) {
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
+                                           int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
   double rate_correction_factor =
       get_rate_correction_factor(cpi, width, height);
   double adjustment_limit;
   const int MBs = av1_get_MBs(width, height);
-
   int projected_size_based_on_q = 0;
 
   // Do not update the rate factors for arf overlay frames.
   if (cpi->rc.is_src_frame_alt_ref) return;
 
   // Clear down mmx registers to allow floating point in what follows
-  aom_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
     projected_size_based_on_q =
         av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
     projected_size_based_on_q = av1_estimate_bits_at_q(
         cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs,
-        rate_correction_factor, cm->seq_params.bit_depth);
+        rate_correction_factor, cm->seq_params->bit_depth,
+        cpi->is_screen_content_type);
   }
   // Work out a size correction factor.
   if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
@@ -542,7 +719,8 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  set_rate_correction_factor(cpi, rate_correction_factor, width, height);
+  set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor,
+                             width, height);
 }
 
 // Calculate rate for the given 'q'.
@@ -552,18 +730,32 @@ static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
   return use_cyclic_refresh
              ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
              : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
-                                  correction_factor, cm->seq_params.bit_depth);
+                                  correction_factor, cm->seq_params->bit_depth,
+                                  cpi->is_screen_content_type);
 }
 
-// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q
-// index with rate just above or below the desired rate, depending on which of
-// the two rates is closer to the desired rate.
-// Also, respects the selected aq_mode when computing the rate.
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   desired_bits_per_mb   Target bits per mb
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   correction_factor     Current Q to rate correction factor
+ * \param[in]   best_qindex           Min allowed Q value.
+ * \param[in]   worst_qindex          Max allowed Q value.
+ *
+ * \return Returns a correction factor for the current frame
+ */
 static int find_closest_qindex_by_rate(int desired_bits_per_mb,
                                        const AV1_COMP *cpi,
                                        double correction_factor,
                                        int best_qindex, int worst_qindex) {
-  const int use_cyclic_refresh = cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
                                  cpi->cyclic_refresh->apply_cyclic_refresh;
 
   // Find 'qindex' based on 'desired_bits_per_mb'.
@@ -621,7 +813,7 @@ int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
   int q =
       find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
                                   active_best_quality, active_worst_quality);
-  if (cpi->oxcf.rc_mode == AOM_CBR && has_no_stats_stage(cpi))
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi))
     return adjust_q_cbr(cpi, q, active_worst_quality);
 
   return q;
@@ -642,56 +834,77 @@ static int get_active_quality(int q, int gfu_boost, int low, int high,
   }
 }
 
-static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
                                  aom_bit_depth_t bit_depth) {
   int *kf_low_motion_minq;
   int *kf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
-  return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+  return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
-                                 aom_bit_depth_t bit_depth) {
+static int get_gf_active_quality_no_rc(int gfu_boost, int q,
+                                       aom_bit_depth_t bit_depth) {
   int *arfgf_low_motion_minq;
   int *arfgf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
-  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+  return get_active_quality(q, gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
+static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+                                 aom_bit_depth_t bit_depth) {
+  return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth);
+}
+
 static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
   int *arfgf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
   return arfgf_high_motion_minq[q];
 }
 
-static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const unsigned int curr_frame = cpi->common.current_frame.frame_number;
   int active_worst_quality;
+  int last_q_key_frame;
+  int last_q_inter_frame;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  last_q_key_frame = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME]
+                                             : p_rc->last_q[KEY_FRAME];
+  last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME]
+                                               : p_rc->last_q[INTER_FRAME];
+#else
+  last_q_key_frame = p_rc->last_q[KEY_FRAME];
+  last_q_inter_frame = p_rc->last_q[INTER_FRAME];
+#endif
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     active_worst_quality =
-        curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
+        curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2;
   } else {
     if (!rc->is_src_frame_alt_ref &&
-        (cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame ||
-         cpi->refresh_alt_ref_frame)) {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
-                                             : rc->last_q[INTER_FRAME];
+        (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+         refresh_frame->alt_ref_frame)) {
+      active_worst_quality =
+          curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame;
     } else {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
-                                             : rc->last_q[INTER_FRAME] * 2;
+      active_worst_quality =
+          curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2;
     }
   }
   return AOMMIN(active_worst_quality, rc->worst_quality);
 }
 
 // Adjust active_worst_quality level based on buffer level.
-static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) {
   // Adjust active_worst_quality: If buffer is above the optimal/target level,
   // bring active_worst_quality down depending on fullness of buffer.
   // If buffer is below the optimal level, let the active_worst_quality go from
@@ -699,8 +912,11 @@ static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
   // (at buffer = critical level).
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const SVC *const svc = &cpi->svc;
+  unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
   // Buffer level below which we push active_worst to worst_quality.
-  int64_t critical_level = rc->optimal_buffer_level >> 3;
+  int64_t critical_level = p_rc->optimal_buffer_level >> 3;
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
@@ -710,31 +926,42 @@ static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
   // for the first few frames following key frame. These are both initialized
   // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
   // So for first few frames following key, the qp of that key frame is weighted
-  // into the active_worst_quality setting.
-  ambient_qp = (cm->current_frame.frame_number < 5)
-                   ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
-                            rc->avg_frame_qindex[KEY_FRAME])
-                   : rc->avg_frame_qindex[INTER_FRAME];
+  // into the active_worst_quality setting. For SVC the key frame should
+  // correspond to layer (0, 0), so use that for layer context.
+  int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+  if (svc->number_temporal_layers > 1) {
+    int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+    const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+    avg_qindex_key = lp_rc->avg_frame_qindex[KEY_FRAME];
+    if (svc->temporal_layer_id == 0)
+      avg_qindex_key =
+          AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+  }
+  ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+                   ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
+                   : p_rc->avg_frame_qindex[INTER_FRAME];
   active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
-  if (rc->buffer_level > rc->optimal_buffer_level) {
+  if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
-                       max_adjustment_down);
+      buff_lvl_step =
+          ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) /
+           max_adjustment_down);
       if (buff_lvl_step)
-        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+        adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) /
                            buff_lvl_step);
       active_worst_quality -= adjustment;
     }
-  } else if (rc->buffer_level > critical_level) {
+  } else if (p_rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+      buff_lvl_step = (p_rc->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
         adjustment = (int)((rc->worst_quality - ambient_qp) *
-                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           (p_rc->optimal_buffer_level - p_rc->buffer_level) /
                            buff_lvl_step);
       }
       active_worst_quality = ambient_qp + adjustment;
@@ -746,26 +973,26 @@ static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
   return active_worst_quality;
 }
 
-static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
-                                             int height, int *bottom_index,
-                                             int *top_index) {
+// Calculate the active_best_quality level.
+static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi,
+                                                 int active_worst_quality,
+                                                 int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const CurrentFrame *const current_frame = &cm->current_frame;
-  int active_best_quality;
-  int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
-  int q;
   int *rtc_minq;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
+  int active_best_quality = rc->best_quality;
   ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = rc->best_quality;
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
-      int qindex = rc->last_boosted_qindex;
+    if (p_rc->this_key_frame_forced) {
+      int qindex = p_rc->last_boosted_qindex;
       double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
                                             (last_boosted_q * 0.75), bit_depth);
@@ -774,48 +1001,95 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
-
-      active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
-
+      active_best_quality = get_kf_active_quality(
+          p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
       }
-
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
       q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
       active_best_quality +=
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
-  } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
-             cpi->oxcf.gf_cbr_boost_pct &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+  } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+             cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
+    int q = active_worst_quality;
     if (rc->frames_since_key > 1 &&
-        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = rc->avg_frame_qindex[INTER_FRAME];
-    } else {
-      q = active_worst_quality;
+        p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = p_rc->avg_frame_qindex[INTER_FRAME];
     }
-    active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+    active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
-    if (current_frame->frame_number > 1) {
-      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
-        active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
-      else
-        active_best_quality = rtc_minq[active_worst_quality];
-    } else {
-      if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
-        active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
-      else
-        active_best_quality = rtc_minq[active_worst_quality];
-    }
+    FRAME_TYPE frame_type =
+        (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+    if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality)
+      active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]];
+    else
+      active_best_quality = rtc_minq[active_worst_quality];
   }
+  return active_best_quality;
+}
+
+#if RT_PASSIVE_STRATEGY
+static int get_q_passive_strategy(const AV1_COMP *const cpi,
+                                  const int q_candidate, const int threshold) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  int sum = 0;
+  int count = 0;
+  int i = 1;
+  while (i < MAX_Q_HISTORY) {
+    int frame_id = current_frame->frame_number - i;
+    if (frame_id <= 0) break;
+    sum += p_rc->q_history[frame_id % MAX_Q_HISTORY];
+    ++count;
+    ++i;
+  }
+  if (count > 0) {
+    const int avg_q = sum / count;
+    if (abs(avg_q - q_candidate) <= threshold) return avg_q;
+  }
+  return q_candidate;
+}
+#endif  // RT_PASSIVE_STRATEGY
+
+/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
+                                             int height, int *bottom_index,
+                                             int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  int q;
+  const int bit_depth = cm->seq_params->bit_depth;
+  int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  int active_best_quality = calc_active_best_quality_no_stats_cbr(
+      cpi, active_worst_quality, width, height);
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CBR);
 
   // Clip the active best and worst quality values to limits
   active_best_quality =
@@ -827,22 +1101,28 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
   *bottom_index = active_best_quality;
 
   // Limit Q range for the adaptive loop.
-  if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-      !(current_frame->frame_number == 0)) {
+  if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced &&
+      current_frame->frame_number != 0) {
     int qdelta = 0;
-    aom_clear_system_state();
     qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                        active_worst_quality, 2.0, bit_depth);
+                                        active_worst_quality, 2.0,
+                                        cpi->is_screen_content_type, bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
   }
 
   // Special case code to try and match quality with forced key frames
-  if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
-    q = rc->last_boosted_qindex;
+  if (current_frame->frame_type == KEY_FRAME && p_rc->this_key_frame_forced) {
+    q = p_rc->last_boosted_qindex;
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality, width, height);
+#if RT_PASSIVE_STRATEGY
+    if (current_frame->frame_type != KEY_FRAME &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+      q = get_q_passive_strategy(cpi, q, 50);
+    }
+#endif  // RT_PASSIVE_STRATEGY
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
@@ -851,6 +1131,14 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
         q = *top_index;
     }
   }
+  // Special case: we force the first few frames to use low q such that
+  // these frames are encoded at a high quality, which provides good
+  // references for following frames.
+  if (current_frame->frame_type != KEY_FRAME && !cpi->ppi->use_svc &&
+      current_frame->frame_number >= 10 && current_frame->frame_number <= 15) {
+    q = AOMMIN(p_rc->last_kf_qindex + 108, AOMMAX(5, q - 9));
+    q = AOMMAX(q, rc->best_quality);
+  }
 
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
@@ -864,16 +1152,18 @@ static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) {
 }
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const PRIMARY_RATE_CONTROL *p_rc,
                                const AV1EncoderConfig *const oxcf,
-                               int intra_only, SUPERRES_MODE superres_mode,
+                               int intra_only, aom_superres_mode superres_mode,
                                int superres_denom) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   static const double cq_adjust_threshold = 0.1;
-  int active_cq_level = oxcf->cq_level;
-  (void)intra_only;
-  if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) {
+  int active_cq_level = rc_cfg->cq_level;
+  if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) {
     // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
     //        rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
-    if ((superres_mode == SUPERRES_QTHRESH || superres_mode == SUPERRES_AUTO) &&
+    if ((superres_mode == AOM_SUPERRES_QTHRESH ||
+         superres_mode == AOM_SUPERRES_AUTO) &&
         superres_denom != SCALE_NUMERATOR) {
       int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO;
       if (intra_only && rc->frames_to_key <= 1) {
@@ -887,8 +1177,8 @@ static int get_active_cq_level(const RATE_CONTROL *rc,
           active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
     }
   }
-  if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
-    const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+  if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) {
+    const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits;
     if (x < cq_adjust_threshold) {
       active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
     }
@@ -896,76 +1186,66 @@ static int get_active_cq_level(const RATE_CONTROL *rc,
   return active_cq_level;
 }
 
-static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf,
-                                     const RATE_CONTROL *const rc,
-                                     const GF_GROUP *const gf_group,
-                                     int gf_index, int cq_level,
-                                     int bit_depth) {
-  assert(oxcf->use_fixed_qp_offsets);
-  assert(oxcf->rc_mode == AOM_Q);
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index];
-
-  int offset_idx = -1;
-  if (update_type == KF_UPDATE) {
-    if (rc->frames_to_key == 1) {
-      // Image / intra-only coding: ignore offsets.
-      return cq_level;
-    }
-    offset_idx = 0;
-  } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) {
-    offset_idx = 1;
-  } else if (update_type == INTNL_ARF_UPDATE) {
-    offset_idx =
-        AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1);
-  } else {  // Leaf level / overlay frame.
-    assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE ||
-           update_type == INTNL_OVERLAY_UPDATE);
-    return cq_level;  // Directly Return worst quality allowed.
-  }
-  assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
-  assert(oxcf->fixed_qp_offsets[offset_idx] >= 0);
-
-  // Get qindex offset, by first converting to 'q' and then back.
-  const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
-  const double q_val_target =
-      AOMMAX(q_val_orig - oxcf->fixed_qp_offsets[offset_idx], 0.0);
-  const int delta_qindex =
-      av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
-  return AOMMAX(cq_level + delta_qindex, 0);
-}
-
-static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
-                                             int height, int *bottom_index,
-                                             int *top_index) {
+/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Any rate control other than constant bit-rate mode:
+ * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+                                         int height, int *bottom_index,
+                                         int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level =
-      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
-                          cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
 
-  if (oxcf->use_fixed_qp_offsets) {
-    return get_q_using_fixed_offsets(oxcf, rc, &cpi->gf_group,
-                                     cpi->gf_group.index, cq_level, bit_depth);
-  }
+  assert(has_no_stats_stage(cpi));
+  assert(rc_mode == AOM_VBR ||
+         (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+         rc_mode == AOM_Q);
+
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params->bit_depth;
 
   int active_best_quality;
-  int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+  int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
   int q;
   int *inter_minq;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-    if (oxcf->rc_mode == AOM_Q) {
+    if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
           av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-    } else if (rc->this_key_frame_forced) {
-      const int qindex = rc->last_boosted_qindex;
+    } else if (p_rc->this_key_frame_forced) {
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      const int simulate_parallel_frame =
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+          cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+      int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+                                           : p_rc->last_boosted_qindex;
+#else
+      int qindex = p_rc->last_boosted_qindex;
+#endif
       const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex = av1_compute_qdelta(
           rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
@@ -973,8 +1253,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
     } else {  // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
 
-      active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+      active_best_quality = get_kf_active_quality(
+          p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
@@ -990,33 +1270,33 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
       }
     }
   } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
     q = (rc->frames_since_key > 1 &&
-         rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
-            ? rc->avg_frame_qindex[INTER_FRAME]
-            : rc->avg_frame_qindex[KEY_FRAME];
+         p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+            ? p_rc->avg_frame_qindex[INTER_FRAME]
+            : p_rc->avg_frame_qindex[KEY_FRAME];
     // For constrained quality dont allow Q less than the cq level
-    if (oxcf->rc_mode == AOM_CQ) {
+    if (rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+      active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
-    } else if (oxcf->rc_mode == AOM_Q) {
+    } else if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
-          (cpi->refresh_alt_ref_frame)
+          (refresh_frame->alt_ref_frame)
               ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
               : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+      active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
     }
   } else {
-    if (oxcf->rc_mode == AOM_Q) {
+    if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
@@ -1028,12 +1308,13 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Use the lower of active_worst_quality and recent/average Q.
-      active_best_quality = (current_frame->frame_number > 1)
-                                ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
-                                : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      active_best_quality =
+          (current_frame->frame_number > 1)
+              ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]]
+              : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]];
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
-      if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+      if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
         active_best_quality = cq_level;
       }
     }
@@ -1051,27 +1332,35 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
   // Limit Q range for the adaptive loop.
   {
     int qdelta = 0;
-    aom_clear_system_state();
-    if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-        !(current_frame->frame_number == 0)) {
-      qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                          active_worst_quality, 2.0, bit_depth);
+    if (current_frame->frame_type == KEY_FRAME &&
+        !p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
+      qdelta = av1_compute_qdelta_by_rate(
+          &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0,
+          cpi->is_screen_content_type, bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
-               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      qdelta =
-          av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
-                                     active_worst_quality, 1.75, bit_depth);
+               (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+      qdelta = av1_compute_qdelta_by_rate(
+          &cpi->rc, current_frame->frame_type, active_worst_quality, 1.75,
+          cpi->is_screen_content_type, bit_depth);
     }
     *top_index = active_worst_quality + qdelta;
     *top_index = AOMMAX(*top_index, *bottom_index);
   }
 
-  if (oxcf->rc_mode == AOM_Q) {
+  if (rc_mode == AOM_Q) {
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames
   } else if ((current_frame->frame_type == KEY_FRAME) &&
-             rc->this_key_frame_forced) {
-    q = rc->last_boosted_qindex;
+             p_rc->this_key_frame_forced) {
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    const int simulate_parallel_frame =
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+        cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+    q = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+                                : p_rc->last_boosted_qindex;
+#else
+    q = p_rc->last_boosted_qindex;
+#endif
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality, width, height);
@@ -1091,42 +1380,42 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
   return q;
 }
 
-static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
-  1.00,  // INTER_NORMAL
-  1.50,  // GF_ARF_LOW
-  2.00,  // GF_ARF_STD
-  2.00,  // KF_STD
-};
-
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75,
+                                                             1.50, 1.25, 1.15,
+                                                             1.0 };
 int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
-  const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group);
-  const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME;
-  double rate_factor;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const RATE_FACTOR_LEVEL rf_lvl =
+      get_rate_factor_level(gf_group, cpi->gf_frame_index);
+  const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+  const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+  const double rate_factor =
+      (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
 
-  rate_factor = rate_factor_deltas[rf_lvl];
-  if (rf_lvl == GF_ARF_LOW) {
-    rate_factor -= (cpi->gf_group.layer_depth[cpi->gf_group.index] - 2) * 0.1;
-    rate_factor = AOMMAX(rate_factor, 1.0);
-  }
   return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
-                                    cpi->common.seq_params.bit_depth);
+                                    cpi->is_screen_content_type,
+                                    cpi->common.seq_params->bit_depth);
 }
 
 // This unrestricted Q selection on CQ mode is useful when testing new features,
 // but may lead to Q being out of range on current RC restrictions
 #if USE_UNRESTRICTED_Q_IN_CQ_MODE
-static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
                                             int height, int *bottom_index,
                                             int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
-                                           cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int cq_level =
+      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+                          cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params->bit_depth;
   const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
   (void)width;
   (void)height;
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
   *top_index = q;
   *bottom_index = q;
 
@@ -1135,43 +1424,43 @@ static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
 #endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
 
 #define STATIC_MOTION_THRESH 95
-static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
-                                            int height, int *active_best,
-                                            int *active_worst, int cq_level,
-                                            int is_fwd_kf) {
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                                   int *active_best, int *active_worst,
+                                   int cq_level) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int active_best_quality;
   int active_worst_quality = *active_worst;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
 
-  if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
+  if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
     // If the next frame is also a key frame or the current frame is the
     // only frame in the sequence in AOM_Q mode, just use the cq_level
     // as q.
     active_best_quality = cq_level;
     active_worst_quality = cq_level;
-  } else if (is_fwd_kf) {
-    // Handle the special case for forward reference key frames.
-    // Increase the boost because this keyframe is used as a forward and
-    // backward reference.
-    const int qindex = rc->last_boosted_qindex;
-    const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-    const int delta_qindex = av1_compute_qdelta(
-        rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
-    active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-  } else if (rc->this_key_frame_forced) {
+  } else if (p_rc->this_key_frame_forced) {
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     double last_boosted_q;
     int delta_qindex;
     int qindex;
-
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    const int simulate_parallel_frame =
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+        cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+    int last_boosted_qindex = simulate_parallel_frame
+                                  ? p_rc->temp_last_boosted_qindex
+                                  : p_rc->last_boosted_qindex;
+#else
+    int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
     if (is_stat_consumption_stage_twopass(cpi) &&
-        cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-      qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
       active_best_quality = qindex;
       last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
@@ -1179,7 +1468,7 @@ static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       active_worst_quality =
           AOMMIN(qindex + delta_qindex, active_worst_quality);
     } else {
-      qindex = rc->last_boosted_qindex;
+      qindex = last_boosted_qindex;
       last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
                                         last_boosted_q * 0.50, bit_depth);
@@ -1192,10 +1481,13 @@ static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
     // Baseline value derived from cpi->active_worst_quality and kf boost.
     active_best_quality =
-        get_kf_active_quality(rc, active_worst_quality, bit_depth);
+        get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
+    if (cpi->is_screen_content_type) {
+      active_best_quality /= 2;
+    }
 
     if (is_stat_consumption_stage_twopass(cpi) &&
-        cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+        cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
       active_best_quality /= 3;
     }
 
@@ -1206,7 +1498,8 @@ static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
     // Make a further adjustment based on the kf zero motion measure.
     if (is_stat_consumption_stage_twopass(cpi))
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+      q_adj_factor +=
+          0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct);
 
     // Convert the adjustment factor to a qindex delta
     // on active_best_quality.
@@ -1216,9 +1509,9 @@ static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
 
     // Tweak active_best_quality for AOM_Q mode when superres is on, as this
     // will be used directly as 'q' later.
-    if (oxcf->rc_mode == AOM_Q &&
-        (cpi->superres_mode == SUPERRES_QTHRESH ||
-         cpi->superres_mode == SUPERRES_AUTO) &&
+    if (oxcf->rc_cfg.mode == AOM_Q &&
+        (cpi->superres_mode == AOM_SUPERRES_QTHRESH ||
+         cpi->superres_mode == AOM_SUPERRES_AUTO) &&
         cm->superres_scale_denominator != SCALE_NUMERATOR) {
       active_best_quality =
           AOMMAX(active_best_quality -
@@ -1237,31 +1530,55 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
                                                  int *active_best) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
-  const int bit_depth = cpi->common.seq_params.bit_depth;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const int bit_depth = cpi->common.seq_params->bit_depth;
   int active_best_quality = *active_best;
   int active_worst_quality = *active_worst;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int extend_minq_fast = simulate_parallel_frame
+                             ? p_rc->temp_extend_minq_fast
+                             : cpi->ppi->twopass.extend_minq_fast;
+  int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq
+                                            : cpi->ppi->twopass.extend_minq;
+  int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq
+                                            : cpi->ppi->twopass.extend_maxq;
+#endif
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if (cpi->oxcf.rc_mode != AOM_Q) {
+  if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || is_intrl_arf_boost ||
-          cpi->refresh_alt_ref_frame))) {
+         (refresh_frame->golden_frame || is_intrl_arf_boost ||
+          refresh_frame->alt_ref_frame))) {
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      active_best_quality -= (extend_minq + extend_minq_fast);
+      active_worst_quality += (extend_maxq / 2);
+#else
       active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
-      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+          (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
+#endif
     } else {
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      active_best_quality -= (extend_minq + extend_minq_fast) / 2;
+      active_worst_quality += extend_maxq;
+#else
       active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
-      active_worst_quality += cpi->twopass.extend_maxq;
+          (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast) /
+          2;
+      active_worst_quality += cpi->ppi->twopass.extend_maxq;
+#endif
     }
   }
 
-  aom_clear_system_state();
 #ifndef STRICT_RC
   // Static forced key frames Q restrictions dealt with elsewhere.
-  if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
-      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+  if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced ||
+      (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
     const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
     active_worst_quality =
         AOMMAX(active_worst_quality + qdelta, active_best_quality);
@@ -1271,7 +1588,8 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
   // Modify active_best_quality for downscaled normal frames.
   if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
     int qdelta = av1_compute_qdelta_by_rate(
-        rc, cm->current_frame.frame_type, active_best_quality, 2.0, bit_depth);
+        rc, cm->current_frame.frame_type, active_best_quality, 2.0,
+        cpi->is_screen_content_type, bit_depth);
     active_best_quality =
         AOMMAX(active_best_quality + qdelta, rc->best_quality);
   }
@@ -1285,25 +1603,51 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
   *active_worst = active_worst_quality;
 }
 
+/*!\brief Gets a Q value to use  for the current frame
+ *
+ *
+ * Selects a Q value from a permitted range that we estimate
+ * will result in approximately the target number of bits.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Width of frame
+ * \param[in]   height                Height of frame
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ *
+ * \return The suggested Q for this frame.
+ */
 static int get_q(const AV1_COMP *cpi, const int width, const int height,
                  const int active_worst_quality,
                  const int active_best_quality) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int q;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg;
+  int last_boosted_qindex = simulate_parallel_frame
+                                ? p_rc->temp_last_boosted_qindex
+                                : p_rc->last_boosted_qindex;
+#else
+  int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
 
-  if (cpi->oxcf.rc_mode == AOM_Q ||
-      (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
-       cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
+      (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced &&
+       cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
        rc->frames_to_key > 1)) {
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames.
-  } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+  } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) {
     // If static since last kf use better of last boosted and last kf q.
-    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-      q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
     } else {
-      q = AOMMIN(rc->last_boosted_qindex,
+      q = AOMMIN(last_boosted_qindex,
                  (active_best_quality + active_worst_quality) / 2);
     }
     q = clamp(q, active_best_quality, active_worst_quality);
@@ -1330,18 +1674,28 @@ static int get_active_best_quality(const AV1_COMP *const cpi,
                                    const int active_worst_quality,
                                    const int cq_level, const int gf_index) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  const int rc_mode = oxcf->rc_mode;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
   int *inter_minq;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
   int active_best_quality = 0;
   const int is_intrl_arf_boost =
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
-  const int is_leaf_frame = !(cpi->refresh_golden_frame ||
-                              cpi->refresh_alt_ref_frame || is_intrl_arf_boost);
+  int is_leaf_frame =
+      !(gf_group->update_type[gf_index] == ARF_UPDATE ||
+        gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost);
+
+  // TODO(jingning): Consider to rework this hack that covers issues incurred
+  // in lightfield setting.
+  if (cm->tiles.large_scale) {
+    is_leaf_frame = !(refresh_frame->golden_frame ||
+                      refresh_frame->alt_ref_frame || is_intrl_arf_boost);
+  }
   const int is_overlay_frame = rc->is_src_frame_alt_ref;
 
   if (is_leaf_frame || is_overlay_frame) {
@@ -1356,30 +1710,25 @@ static int get_active_best_quality(const AV1_COMP *const cpi,
     return active_best_quality;
   }
 
-  // TODO(chengchen): can we remove this condition?
-  if (rc_mode == AOM_Q && !cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
-    return cq_level;
-  }
-
   // Determine active_best_quality for frames that are not leaf or overlay.
   int q = active_worst_quality;
   // Use the lower of active_worst_quality and recent
   // average Q as basis for GF/ARF best Q limit unless last frame was
   // a key frame.
   if (rc->frames_since_key > 1 &&
-      rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-    q = rc->avg_frame_qindex[INTER_FRAME];
+      p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+    q = p_rc->avg_frame_qindex[INTER_FRAME];
   }
   if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
-  active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+  active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
   // Constrained quality use slightly lower active best.
   if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
   const int min_boost = get_gf_high_motion_quality(q, bit_depth);
   const int boost = min_boost - active_best_quality;
-  active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+  active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor);
   if (!is_intrl_arf_boost) return active_best_quality;
 
-  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q;
+  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q;
   int this_height = gf_group_pyramid_level(gf_group, gf_index);
   while (this_height > 1) {
     active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
@@ -1388,21 +1737,118 @@ static int get_active_best_quality(const AV1_COMP *const cpi,
   return active_best_quality;
 }
 
-static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
-                                         int height, int gf_index,
-                                         int *bottom_index, int *top_index) {
+// Returns the q_index for a single frame in the GOP.
+// This function assumes that rc_mode == AOM_Q mode.
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q) {
+  const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE;
+  int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE ||
+                                 gf_update_type == OVERLAY_UPDATE ||
+                                 gf_update_type == INTNL_OVERLAY_UPDATE;
+
+  if (is_leaf_or_overlay_frame) return base_q_index;
+
+  if (!is_intrl_arf_boost) return arf_q;
+
+  int active_best_quality = arf_q;
+  int active_worst_quality = base_q_index;
+
+  while (gf_pyramid_level > 1) {
+    active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+    --gf_pyramid_level;
+  }
+  return active_best_quality;
+}
+
+// Returns the q_index for the ARF in the GOP.
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor) {
+  int active_best_quality =
+      get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth);
+  const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth);
+  const int boost = min_boost - active_best_quality;
+  return min_boost - (int)(boost * arf_boost_factor);
+}
+
+static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width,
+                                       int height, int gf_index,
+                                       int *bottom_index, int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const GF_GROUP *gf_group = &cpi->gf_group;
   const int cq_level =
-      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
-                          cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  int active_best_quality = 0;
+  int active_worst_quality = rc->active_worst_quality;
+  int q;
+
+  if (frame_is_intra_only(cm)) {
+    get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+                           &active_worst_quality, cq_level);
+  } else {
+    //  Active best quality limited by previous layer.
+    active_best_quality =
+        get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+  }
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+  *top_index = AOMMAX(*top_index, rc->best_quality);
+  *top_index = AOMMIN(*top_index, rc->worst_quality);
+
+  *bottom_index = AOMMAX(*bottom_index, rc->best_quality);
+  *bottom_index = AOMMIN(*bottom_index, rc->worst_quality);
+
+  q = active_best_quality;
+
+  q = AOMMAX(q, rc->best_quality);
+  q = AOMMIN(q, rc->worst_quality);
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+  return q;
+}
 
-  if (oxcf->use_fixed_qp_offsets) {
-    return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index,
-                                     cq_level, bit_depth);
+/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
+ *
+ * Handles the the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                                int gf_index, int *bottom_index,
+                                int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(has_no_stats_stage(cpi),
+                 cpi->oxcf.rc_cfg.mode == AOM_Q &&
+                     gf_group->update_type[gf_index] != ARF_UPDATE));
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+
+  if (oxcf->rc_cfg.mode == AOM_Q) {
+    return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+                                       bottom_index, top_index);
   }
 
   int active_best_quality = 0;
@@ -1413,24 +1859,39 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
 
   if (frame_is_intra_only(cm)) {
-    const int is_fwd_kf =
-        cm->current_frame.frame_type == KEY_FRAME && cm->show_frame == 0;
-    get_intra_q_and_bounds_two_pass(cpi, width, height, &active_best_quality,
-                                    &active_worst_quality, cq_level, is_fwd_kf);
+    get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+                           &active_worst_quality, cq_level);
 #ifdef STRICT_RC
     active_best_quality = 0;
 #endif
   } else {
-#ifdef STRICT_RC
     //  Active best quality limited by previous layer.
     const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
-    active_best_quality =
-        rc->active_best_quality[pyramid_level - 1] +
-        AOMMAX((rc->active_best_quality[pyramid_level - 1] / 10), 5);
+
+    if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+      active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+                                                    cq_level, gf_index);
+    } else {
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      const int simulate_parallel_frame =
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+          cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+      int local_active_best_quality =
+          simulate_parallel_frame
+              ? p_rc->temp_active_best_quality[pyramid_level - 1]
+              : p_rc->active_best_quality[pyramid_level - 1];
+      active_best_quality = local_active_best_quality + 1;
 #else
-    active_best_quality =
-        get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+      active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+      active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+      active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+      active_best_quality += (active_worst_quality - active_best_quality) / 2;
 #endif
+    }
 
     // For alt_ref and GF frames (including internal arf frames) adjust the
     // worst allowed quality as well. This insures that even on hard
@@ -1438,7 +1899,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
     // leaf (non arf) frames. This is important to the TPL model which assumes
     // Q drops with each arf level.
     if (!(rc->is_src_frame_alt_ref) &&
-        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
+        (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
          is_intrl_arf_boost)) {
       active_worst_quality =
           (active_best_quality + (3 * active_worst_quality) + 2) / 4;
@@ -1455,11 +1916,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
     active_worst_quality = q;
   }
 
-#ifdef STRICT_RC
-  *top_index = rc->worst_quality;
-#else
   *top_index = active_worst_quality;
-#endif
   *bottom_index = active_best_quality;
 
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
@@ -1470,32 +1927,33 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   return q;
 }
 
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
-                             int height, int gf_index, int *bottom_index,
-                             int *top_index) {
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                             int gf_index, int *bottom_index, int *top_index) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int q;
-  // TODO(sarahparker) merge onepass vbr and altref q computation
-  // with two pass
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if ((cpi->oxcf.rc_mode != AOM_Q ||
+  // TODO(sarahparker) merge no-stats vbr and altref q computation
+  // with rc_pick_q_and_bounds().
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
        gf_group->update_type[gf_index] == ARF_UPDATE) &&
       has_no_stats_stage(cpi)) {
-    if (cpi->oxcf.rc_mode == AOM_CBR)
-      q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+      q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
                                             top_index);
 #if USE_UNRESTRICTED_Q_IN_CQ_MODE
-    else if (cpi->oxcf.rc_mode == AOM_CQ)
-      q = rc_pick_q_and_bounds_one_pass_cq(cpi, width, height, bottom_index,
+    } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+      q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
                                            top_index);
 #endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
-    else
-      q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
-                                            top_index);
+    } else {
+      q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+                                        top_index);
+    }
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, width, height, gf_index,
-                                      bottom_index, top_index);
+    q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+                             top_index);
   }
-  if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
+  if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
 
   return q;
 }
@@ -1503,14 +1961,15 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
 void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit) {
-  if (cpi->oxcf.rc_mode == AOM_Q) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit = INT_MAX;
   } else {
     // For very small rate targets where the fractional adjustment
     // may be tiny make sure there is at least a minimum range.
-    const int tolerance =
-        AOMMAX(100, (cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+    assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+    const int tolerance = (int)AOMMAX(
+        100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
     *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
     *frame_over_shoot_limit =
         AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
@@ -1524,9 +1983,11 @@ void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
   rc->this_frame_target = target;
 
   // Modify frame size target when down-scaled.
-  if (av1_frame_scaled(cm))
+  if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
     rc->this_frame_target =
-        (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+        (int)(rc->this_frame_target *
+              resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+  }
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate =
@@ -1537,27 +1998,14 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
   // this frame refreshes means next frames don't unless specified by user
   RATE_CONTROL *const rc = &cpi->rc;
   rc->frames_since_golden = 0;
-
-  // Mark the alt ref as done (setting to 0 means no further alt refs pending).
-  rc->source_alt_ref_pending = 0;
-
-  // Set the alternate reference frame active flag
-  rc->source_alt_ref_active = 1;
 }
 
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
 
   // Update the Golden frame usage counts.
-  if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+  if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) {
     rc->frames_since_golden = 0;
-
-    // If we are not using alt ref in the up and coming group clear the arf
-    // active flag. In multi arf group case, if the index is not 0 then
-    // we are overlaying a mid group arf so should not reset the flag.
-    if (!rc->source_alt_ref_pending && (gf_group->index == 0))
-      rc->source_alt_ref_active = 0;
   } else if (cpi->common.show_frame) {
     rc->frames_since_golden++;
   }
@@ -1567,55 +2015,61 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   const AV1_COMMON *const cm = &cpi->common;
   const CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   const int is_intrnl_arf =
-      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+      gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
 
   const int qindex = cm->quant_params.base_qindex;
 
+#if RT_PASSIVE_STRATEGY
+  const int frame_number = current_frame->frame_number % MAX_Q_HISTORY;
+  p_rc->q_history[frame_number] = qindex;
+#endif  // RT_PASSIVE_STRATEGY
+
   // Update rate control heuristics
   rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height);
 
   // Keep a record of last Q and ambient average Q.
   if (current_frame->frame_type == KEY_FRAME) {
-    rc->last_q[KEY_FRAME] = qindex;
-    rc->avg_frame_qindex[KEY_FRAME] =
-        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+    p_rc->last_q[KEY_FRAME] = qindex;
+    p_rc->avg_frame_qindex[KEY_FRAME] =
+        ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
-    if ((cpi->use_svc && cpi->oxcf.rc_mode == AOM_CBR) ||
+    if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
         (!rc->is_src_frame_alt_ref &&
-         !(cpi->refresh_golden_frame || is_intrnl_arf ||
-           cpi->refresh_alt_ref_frame))) {
-      rc->last_q[INTER_FRAME] = qindex;
-      rc->avg_frame_qindex[INTER_FRAME] =
-          ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
-      rc->ni_frames++;
-      rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
-      rc->avg_q = rc->tot_q / rc->ni_frames;
+         !(refresh_frame->golden_frame || is_intrnl_arf ||
+           refresh_frame->alt_ref_frame))) {
+      p_rc->last_q[INTER_FRAME] = qindex;
+      p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+          3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+      p_rc->ni_frames++;
+      p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth);
+      p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames;
       // Calculate the average Q for normal inter frames (not key or GFU
       // frames).
       rc->ni_tot_qi += qindex;
-      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+      rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames;
     }
   }
-
   // Keep record of last boosted (KF/GF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
-  if ((qindex < rc->last_boosted_qindex) ||
+  if ((qindex < p_rc->last_boosted_qindex) ||
       (current_frame->frame_type == KEY_FRAME) ||
-      (!rc->constrained_gf_group &&
-       (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
-        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
-    rc->last_boosted_qindex = qindex;
+      (!p_rc->constrained_gf_group &&
+       (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+        (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
+    p_rc->last_boosted_qindex = qindex;
   }
-  if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+  if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
 
   update_buffer_level(cpi, rc->projected_frame_size);
   rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
@@ -1623,40 +2077,62 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
   if (av1_frame_scaled(cm))
-    rc->this_frame_target =
-        (int)(rc->this_frame_target /
-              resize_rate_factor(cpi, cm->width, cm->height));
+    rc->this_frame_target = (int)(rc->this_frame_target /
+                                  resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+                                                     cm->width, cm->height));
   if (current_frame->frame_type != KEY_FRAME) {
-    rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
-    rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
-    rc->long_rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
-    rc->long_rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+    p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+        p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+        p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
   }
 
   // Actual bits spent
-  rc->total_actual_bits += rc->projected_frame_size;
-  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+  p_rc->total_actual_bits += rc->projected_frame_size;
+  p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
 
-  rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
-
-  if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
-      (current_frame->frame_type != KEY_FRAME))
+  if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+                        cpi->oxcf.gf_cfg.enable_auto_arf) &&
+      refresh_frame->alt_ref_frame &&
+      (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
   else
     // Update the Golden frame stats as appropriate.
     update_golden_frame_stats(cpi);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /*The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q,
+   * temp_last_boosted_qindex are introduced only for quality simulation
+   * purpose, it retains the value previous to the parallel encode frames. The
+   * variables are updated based on the update flag.
+   *
+   * If there exist show_existing_frames between parallel frames, then to
+   * retain the temp state do not update it. */
+  int show_existing_between_parallel_frames =
+      (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+           INTNL_OVERLAY_UPDATE &&
+       cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+  if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    for (int i = 0; i < FRAME_TYPES; i++) {
+      p_rc->temp_last_q[i] = p_rc->last_q[i];
+    }
+    p_rc->temp_avg_q = p_rc->avg_q;
+    p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex;
+    p_rc->temp_total_actual_bits = p_rc->total_actual_bits;
+    p_rc->temp_projected_frame_size = rc->projected_frame_size;
+    for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+      p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i];
+  }
+#endif
   if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
   // if (current_frame->frame_number == 1 && cm->show_frame)
   /*
   rc->this_frame_target =
-      (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width,
-  cm->height));
+      (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+  cm->width, cm->height));
       */
 }
 
@@ -1667,6 +2143,7 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
   cpi->rc.frames_to_key--;
   cpi->rc.rc_2_frame = 0;
   cpi->rc.rc_1_frame = 0;
+  cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
 }
 
 int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
@@ -1705,14 +2182,15 @@ int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
 // If no such q index is found, returns 'worst_qindex'.
 static int find_qindex_by_rate(int desired_bits_per_mb,
                                aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+                               const int is_screen_content_type,
                                int best_qindex, int worst_qindex) {
   assert(best_qindex <= worst_qindex);
   int low = best_qindex;
   int high = worst_qindex;
   while (low < high) {
     const int mid = (low + high) >> 1;
-    const int mid_bits_per_mb =
-        av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth);
+    const int mid_bits_per_mb = av1_rc_bits_per_mb(
+        frame_type, mid, 1.0, bit_depth, is_screen_content_type);
     if (mid_bits_per_mb > desired_bits_per_mb) {
       low = mid + 1;
     } else {
@@ -1720,25 +2198,26 @@ static int find_qindex_by_rate(int desired_bits_per_mb,
     }
   }
   assert(low == high);
-  assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <=
-             desired_bits_per_mb ||
+  assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth,
+                            is_screen_content_type) <= desired_bits_per_mb ||
          low == worst_qindex);
   return low;
 }
 
 int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
+                               const int is_screen_content_type,
                                aom_bit_depth_t bit_depth) {
   // Look up the current projected bits per block for the base index
-  const int base_bits_per_mb =
-      av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+  const int base_bits_per_mb = av1_rc_bits_per_mb(
+      frame_type, qindex, 1.0, bit_depth, is_screen_content_type);
 
   // Find the target bits per mb based on the base value and given ratio.
   const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
-  const int target_index =
-      find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
-                          rc->best_quality, rc->worst_quality);
+  const int target_index = find_qindex_by_rate(
+      target_bits_per_mb, bit_depth, frame_type, is_screen_content_type,
+      rc->best_quality, rc->worst_quality);
   return target_index - qindex;
 }
 
@@ -1747,17 +2226,17 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Special case code for 1 pass fixed Q mode tests
-  if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
-    rc->max_gf_interval = FIXED_GF_INTERVAL;
-    rc->min_gf_interval = FIXED_GF_INTERVAL;
-    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+  if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+    rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
   } else {
     // Set Maximum gf/arf interval
-    rc->max_gf_interval = oxcf->max_gf_interval;
-    rc->min_gf_interval = oxcf->min_gf_interval;
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
     if (rc->min_gf_interval == 0)
       rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, cpi->framerate);
+          oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
     if (rc->max_gf_interval == 0)
       rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
           cpi->framerate, rc->min_gf_interval);
@@ -1766,7 +2245,7 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
      * The no.of.stats available in the case of LAP is limited,
      * hence setting to max_gf_interval.
      */
-    if (cpi->lap_enabled)
+    if (cpi->ppi->lap_enabled)
       rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
     else
       rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
@@ -1785,9 +2264,10 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
   int vbr_max_bits;
   const int MBs = av1_get_MBs(width, height);
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  rc->avg_frame_bandwidth =
+      (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
   rc->min_frame_bandwidth =
-      (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+      (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
 
   rc->min_frame_bandwidth =
       AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
@@ -1800,7 +2280,7 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
   // be acheived because of a user specificed max q (e.g. when the user
   // specifies lossless encode.
   vbr_max_bits =
-      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
             100);
   rc->max_frame_bandwidth =
       AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
@@ -1812,36 +2292,73 @@ void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
 // For VBR...adjustment to the frame target based on error from previous frames
 static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
-  int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int64_t vbr_bits_off_target = simulate_parallel_frame
+                                    ? cpi->ppi->p_rc.temp_vbr_bits_off_target
+                                    : p_rc->vbr_bits_off_target;
+#else
+  int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
   const int stats_count =
-      cpi->twopass.stats_buf_ctx->total_stats != NULL
-          ? (int)cpi->twopass.stats_buf_ctx->total_stats->count
+      cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
           : 0;
   const int frame_window = AOMMIN(
       16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
-
+  assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
   if (frame_window > 0) {
-    const int max_delta =
-        AOMMIN(abs((int)(vbr_bits_off_target / frame_window)),
-               (*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+    const int max_delta = (int)AOMMIN(
+        abs((int)(vbr_bits_off_target / frame_window)),
+        ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
 
     // vbr_bits_off_target > 0 means we have extra bits to spend
     // vbr_bits_off_target < 0 we are currently overshooting
     *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
   }
 
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  int64_t vbr_bits_off_target_fast =
+      simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+                              : p_rc->vbr_bits_off_target_fast;
+#endif
   // Fast redistribution of bits arising from massive local undershoot.
   // Dont do it for kf,arf,gf or overlay frames.
-  if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
-      rc->vbr_bits_off_target_fast) {
+  if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+      vbr_bits_off_target_fast &&
+#else
+      p_rc->vbr_bits_off_target_fast &&
+#endif
+      !rc->is_src_frame_alt_ref) {
     int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
     int fast_extra_bits;
-    fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        (int)AOMMIN(fast_extra_bits,
+                    AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+    fast_extra_bits =
+        (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
     fast_extra_bits = (int)AOMMIN(
         fast_extra_bits,
-        AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
-    *this_frame_target += (int)fast_extra_bits;
-    rc->vbr_bits_off_target_fast -= fast_extra_bits;
+        AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+    if (fast_extra_bits > 0) {
+      // Update this_frame_target only if additional bits are available from
+      // local undershoot.
+      *this_frame_target += (int)fast_extra_bits;
+    }
+    // Store the fast_extra_bits of the frame and reduce it from
+    // vbr_bits_off_target_fast during postencode stage.
+    rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retaining the condition to udpate during postencode stage since
+    // fast_extra_bits are calculated based on vbr_bits_off_target_fast.
+    cpi->do_update_vbr_bits_off_target_fast = 1;
   }
 }
 
@@ -1850,7 +2367,7 @@ void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
   int target_rate = rc->base_frame_target;
 
   // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+  if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ)
     vbr_rate_correction(cpi, &target_rate);
   av1_rc_set_frame_target(cpi, target_rate, width, height);
 }
@@ -1859,16 +2376,17 @@ int av1_calc_pframe_target_size_one_pass_vbr(
     const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
   static const int af_ratio = 10;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int64_t target;
 #if USE_ALTREF_FOR_ONE_PASS
   if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
       frame_update_type == ARF_UPDATE) {
-    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+    target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
               af_ratio) /
-             (rc->baseline_gf_interval + af_ratio - 1);
+             (p_rc->baseline_gf_interval + af_ratio - 1);
   } else {
-    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
-             (rc->baseline_gf_interval + af_ratio - 1);
+    target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) /
+             (p_rc->baseline_gf_interval + af_ratio - 1);
   }
   if (target > INT_MAX) target = INT_MAX;
 #else
@@ -1880,7 +2398,7 @@ int av1_calc_pframe_target_size_one_pass_vbr(
 int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
   static const int kf_ratio = 25;
   const RATE_CONTROL *rc = &cpi->rc;
-  const int target = rc->avg_frame_bandwidth * kf_ratio;
+  const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio;
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
@@ -1888,26 +2406,28 @@ int av1_calc_pframe_target_size_one_pass_cbr(
     const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
-  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
-  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
+  const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level;
+  const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100;
   int min_frame_target =
       AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target;
 
-  if (oxcf->gf_cbr_boost_pct) {
-    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+  if (rc_cfg->gf_cbr_boost_pct) {
+    const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
     if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
-      target =
-          (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
-          (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+      target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+                af_ratio_pct) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
     } else {
-      target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
-               (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+      target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
     }
   } else {
     target = rc->avg_frame_bandwidth;
   }
-  if (cpi->use_svc) {
+  if (cpi->ppi->use_svc) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -1920,17 +2440,18 @@ int av1_calc_pframe_target_size_one_pass_cbr(
   }
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    const int pct_low =
+        (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct);
     target -= (target * pct_low) / 200;
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
     const int pct_high =
-        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+        (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
-  if (oxcf->rc_max_inter_bitrate_pct) {
+  if (rc_cfg->max_inter_bitrate_pct) {
     const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+        rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
     target = AOMMIN(target, max_rate);
   }
   return AOMMAX(min_frame_target, target);
@@ -1938,11 +2459,15 @@ int av1_calc_pframe_target_size_one_pass_cbr(
 
 int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *rc = &cpi->rc;
-  int target;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  int64_t target;
   if (cpi->common.current_frame.frame_number == 0) {
-    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+    target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
                  ? INT_MAX
-                 : (int)(rc->starting_buffer_level / 2);
+                 : (int)(p_rc->starting_buffer_level / 2);
+    if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+      target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+    }
   } else {
     int kf_boost = 32;
     double framerate = cpi->framerate;
@@ -1956,33 +2481,155 @@ int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
-static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_set_golden_update(cpi);
+  else
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  if (p_rc->baseline_gf_interval > rc->frames_to_key &&
+      cpi->oxcf.kf_cfg.auto_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
+  p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+  p_rc->constrained_gf_group =
+      (p_rc->baseline_gf_interval >= rc->frames_to_key &&
+       cpi->oxcf.kf_cfg.auto_key)
+          ? 1
+          : 0;
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+  cpi->gf_frame_index = 0;
+  // SVC does not use GF as periodic boost.
+  // TODO(marpan): Find better way to disable this for SVC.
+  if (cpi->ppi->use_svc) {
+    SVC *const svc = &cpi->svc;
+    p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+    p_rc->gfu_boost = 1;
+    p_rc->constrained_gf_group = 0;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    for (int layer = 0;
+         layer < svc->number_spatial_layers * svc->number_temporal_layers;
+         ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+      lc->p_rc.gfu_boost = p_rc->gfu_boost;
+      lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+      lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+      lc->group_index = 0;
+    }
+  }
+  gf_group->size = p_rc->baseline_gf_interval;
+  gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+  gf_group->refbuf_state[cpi->gf_frame_index] =
+      (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  if (!resize_pending && !rc->high_source_sad) {
+    // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through
+    // period) based on QP. Look into add info on segment deltaq.
+    PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+    const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+    const int allow_gf_update =
+        rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+    int gf_update_changed = 0;
+    int thresh = 87;
+    if (rc->frames_till_gf_update_due == 1 &&
+        cm->quant_params.base_qindex > avg_qp) {
+      // Disable GF refresh since QP is above the runninhg average QP.
+      svc->refresh[svc->gld_idx_1layer] = 0;
+      gf_update_changed = 1;
+      cpi->refresh_frame.golden_frame = 0;
+    } else if (allow_gf_update &&
+               ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+                (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
+      // Force refresh since QP is well below average QP or this is a high
+      // motion frame.
+      svc->refresh[svc->gld_idx_1layer] = 1;
+      gf_update_changed = 1;
+      cpi->refresh_frame.golden_frame = 1;
+    }
+    if (gf_update_changed) {
+      set_baseline_gf_interval(cpi, INTER_FRAME);
+      int refresh_mask = 0;
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        int ref_frame_map_idx = svc->ref_idx[i];
+        refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+      }
+      cm->current_frame.refresh_frame_flags = refresh_mask;
+    }
+  }
+}
+
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * Current structue is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF always behind current by lag_alt frames, and GOLDEN is
+ * either updated on LAST with period baseline_gf_interval (fixed slot)
+ * or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       gf_update    Flag to indicate if GF is updated
+ *
+ * \return Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi-ext_flags; and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
+ */
+void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
   AV1_COMMON *const cm = &cpi->common;
   ExternalFlags *const ext_flags = &cpi->ext_flags;
+  RATE_CONTROL *const rc = &cpi->rc;
+  ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &ext_flags->refresh_frame;
   SVC *const svc = &cpi->svc;
-  // Specify the reference prediction structure, for 1 layer nonrd mode.
-  // Current structue is to use 3 references (LAST, GOLDEN, ALTREF),
-  // where ALT_REF always behind current by lag_alt frames, and GOLDEN is
-  // either updated on LAST with period baseline_gf_interval (fixed slot)
-  // or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
   const int gld_fixed_slot = 1;
-  const unsigned int lag_alt = 4;
+  unsigned int lag_alt = 4;
   int last_idx = 0;
   int last_idx_refresh = 0;
   int gld_idx = 0;
   int alt_ref_idx = 0;
-  ext_flags->refresh_frame_flags_pending = 1;
-  svc->external_ref_frame_config = 1;
+  int last2_idx = 0;
+  ext_refresh_frame_flags->update_pending = 1;
+  svc->set_ref_frame_config = 1;
   ext_flags->ref_frame_flags = 0;
-  ext_flags->refresh_last_frame = 1;
-  ext_flags->refresh_golden_frame = 0;
-  ext_flags->refresh_alt_ref_frame = 0;
+  ext_refresh_frame_flags->last_frame = 1;
+  ext_refresh_frame_flags->golden_frame = 0;
+  ext_refresh_frame_flags->alt_ref_frame = 0;
+  // Decide altref lag adaptively for rt
+  if (cpi->sf.rt_sf.sad_based_adp_altref_lag) {
+    lag_alt = 6;
+    const uint64_t th_frame_sad[4][3] = {
+      { 18000, 18000, 18000 },  // HDRES CPU 9
+      { 25000, 25000, 25000 },  // MIDRES CPU 9
+      { 40000, 30000, 20000 },  // HDRES CPU10
+      { 30000, 25000, 20000 }   // MIDRES CPU 10
+    };
+    int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1;
+    assert(th_idx < 4);
+    if (rc->avg_source_sad > th_frame_sad[th_idx][0])
+      lag_alt = 3;
+    else if (rc->avg_source_sad > th_frame_sad[th_idx][1])
+      lag_alt = 4;
+    else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
+      lag_alt = 5;
+  }
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
   for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
-  // Always reference LAST, GOLDEN, ALTREF
+  // Set the reference frame flags.
   ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
-  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
   ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+    ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
   const int sh = 7 - gld_fixed_slot;
   // Moving index slot for last: 0 - (sh - 1).
   if (cm->current_frame.frame_number > 1)
@@ -2000,118 +2647,574 @@ static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
   // Moving index for alt_ref, lag behind LAST by lag_alt frames.
   if (cm->current_frame.frame_number > lag_alt)
     alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    // Moving index for LAST2, lag behind LAST by 2 frames.
+    if (cm->current_frame.frame_number > 2)
+      last2_idx = ((cm->current_frame.frame_number - 2) % sh);
+  }
   svc->ref_idx[0] = last_idx;          // LAST
   svc->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
-  svc->ref_idx[3] = gld_idx;           // GOLDEN
-  svc->ref_idx[6] = alt_ref_idx;       // ALT_REF
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    svc->ref_idx[1] = last2_idx;         // LAST2
+    svc->ref_idx[2] = last_idx_refresh;  // LAST3 (for refresh of last).
+  }
+  svc->ref_idx[3] = gld_idx;      // GOLDEN
+  svc->ref_idx[6] = alt_ref_idx;  // ALT_REF
   // Refresh this slot, which will become LAST on next frame.
   svc->refresh[last_idx_refresh] = 1;
   // Update GOLDEN on period for fixed slot case.
-  if (gld_fixed_slot && gf_update) {
-    ext_flags->refresh_golden_frame = 1;
+  if (gld_fixed_slot && gf_update &&
+      cm->current_frame.frame_type != KEY_FRAME) {
+    ext_refresh_frame_flags->golden_frame = 1;
     svc->refresh[gld_idx] = 1;
   }
+  svc->gld_idx_1layer = gld_idx;
 }
 
-#define DEFAULT_KF_BOOST_RT 2300
-#define DEFAULT_GF_BOOST_RT 2000
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this is detect big changes
+ * in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  YV12_BUFFER_CONFIG const *unscaled_src = cpi->unscaled_source;
+  YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source;
+  uint8_t *src_y;
+  int src_ystride;
+  int src_width;
+  int src_height;
+  uint8_t *last_src_y;
+  int last_src_ystride;
+  int last_src_width;
+  int last_src_height;
+  if (cm->spatial_layer_id != 0 || cm->width != cm->render_width ||
+      cm->height != cm->render_height || cpi->unscaled_source == NULL ||
+      cpi->unscaled_last_source == NULL) {
+    if (cpi->src_sad_blk_64x64) {
+      aom_free(cpi->src_sad_blk_64x64);
+      cpi->src_sad_blk_64x64 = NULL;
+    }
+  }
+  if (cpi->unscaled_source == NULL || cpi->unscaled_last_source == NULL) return;
+  src_y = unscaled_src->y_buffer;
+  src_ystride = unscaled_src->y_stride;
+  src_width = unscaled_src->y_width;
+  src_height = unscaled_src->y_height;
+  last_src_y = unscaled_last_src->y_buffer;
+  last_src_ystride = unscaled_last_src->y_stride;
+  last_src_width = unscaled_last_src->y_width;
+  last_src_height = unscaled_last_src->y_height;
+  if (src_width != last_src_width || src_height != last_src_height) {
+    if (cpi->src_sad_blk_64x64) {
+      aom_free(cpi->src_sad_blk_64x64);
+      cpi->src_sad_blk_64x64 = NULL;
+    }
+    return;
+  }
+  rc->high_source_sad = 0;
+  rc->high_num_blocks_with_motion = 0;
+  rc->prev_avg_source_sad = rc->avg_source_sad;
+  if (src_width == last_src_width && src_height == last_src_height) {
+    const int num_mi_cols = cm->mi_params.mi_cols;
+    const int num_mi_rows = cm->mi_params.mi_rows;
+    int num_zero_temp_sad = 0;
+    uint32_t min_thresh = 10000;
+    if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) min_thresh = 100000;
+    const BLOCK_SIZE bsize = BLOCK_64X64;
+    int full_sampling = (cm->width * cm->height < 640 * 360) ? 1 : 0;
+    // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
+    uint64_t avg_sad = 0;
+    uint64_t tmp_sad = 0;
+    int num_samples = 0;
+    const int thresh = 6;
+    // SAD is computed on 64x64 blocks
+    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                  ? (cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
+    const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+    uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
+    int num_low_var_high_sumdiff = 0;
+    int light_change = 0;
+    // Flag to check light change or not.
+    const int check_light_change = 0;
+    // Store blkwise SAD for later use
+    if (cpi->sf.rt_sf.sad_based_comp_prune && (cm->spatial_layer_id == 0) &&
+        (cm->width == cm->render_width) && (cm->height == cm->render_height)) {
+      full_sampling = 1;
+      if (cpi->src_sad_blk_64x64 == NULL) {
+        CHECK_MEM_ERROR(
+            cm, cpi->src_sad_blk_64x64,
+            (uint64_t *)aom_calloc(sb_cols * sb_rows,
+                                   sizeof(*cpi->src_sad_blk_64x64)));
+      }
+    }
+    for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+      for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+        // Checker-board pattern, ignore boundary.
+        if (full_sampling ||
+            ((sbi_row > 0 && sbi_col > 0) &&
+             (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+             ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+              (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
+          tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                                last_src_ystride);
+          if (cpi->src_sad_blk_64x64 != NULL)
+            cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+          if (check_light_change) {
+            unsigned int sse, variance;
+            variance = cpi->ppi->fn_ptr[bsize].vf(
+                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+            // Note: sse - variance = ((sum * sum) >> 12)
+            // Detect large lighting change.
+            if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+              num_low_var_high_sumdiff++;
+            }
+          }
+          avg_sad += tmp_sad;
+          num_samples++;
+          if (tmp_sad == 0) num_zero_temp_sad++;
+        }
+        src_y += 64;
+        last_src_y += 64;
+      }
+      src_y += (src_ystride << 6) - (sb_cols << 6);
+      last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+    }
+    if (check_light_change && num_samples > 0 &&
+        num_low_var_high_sumdiff > (num_samples >> 1))
+      light_change = 1;
+    if (num_samples > 0) avg_sad = avg_sad / num_samples;
+    // Set high_source_sad flag if we detect very high increase in avg_sad
+    // between current and previous frame value(s). Use minimum threshold
+    // for cases where there is small change from content that is completely
+    // static.
+    if (!light_change &&
+        avg_sad >
+            AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+        rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+        num_zero_temp_sad < 3 * (num_samples >> 2))
+      rc->high_source_sad = 1;
+    else
+      rc->high_source_sad = 0;
+    rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+    rc->frame_source_sad = avg_sad;
+
+    if (num_zero_temp_sad < (3 * num_samples >> 2))
+      rc->high_num_blocks_with_motion = 1;
+  }
+  cpi->svc.high_source_sad_superframe = rc->high_source_sad;
+}
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_type   frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+                                             FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int gf_update = 0;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  // GF update based on frames_till_gf_update_due, also
+  // force upddate on resize pending frame or for scene change.
+  if ((resize_pending || rc->high_source_sad ||
+       rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    set_baseline_gf_interval(cpi, frame_type);
+    gf_update = 1;
+  }
+  return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+                            int prev_width, int prev_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SVC *const svc = &cpi->svc;
+  double tot_scale_change = 1.0;
+  int target_bits_per_frame;
+  int active_worst_quality;
+  int qindex;
+  tot_scale_change = (double)(resize_width * resize_height) /
+                     (double)(prev_width * prev_height);
+  // Reset buffer level to optimal, update target size.
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  rc->this_frame_target =
+      av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+  target_bits_per_frame = rc->this_frame_target;
+  if (tot_scale_change > 4.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+  else if (tot_scale_change > 1.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+  active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+                             active_worst_quality, resize_width, resize_height);
+  // If resize is down, check if projected q index is close to worst_quality,
+  // and if so, reduce the rate correction factor (since likely can afford
+  // lower q for resized frame).
+  if (tot_scale_change < 1.0 && qindex > 90 * cpi->rc.worst_quality / 100)
+    p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // Apply the same rate control reset to all temporal layers.
+  for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+    LAYER_CONTEXT *lc = NULL;
+    lc = &svc->layer_context[svc->spatial_layer_id *
+                                 svc->number_temporal_layers +
+                             tl];
+    lc->rc.resize_state = rc->resize_state;
+    lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+    lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+    lc->p_rc.rate_correction_factors[INTER_FRAME] =
+        p_rc->rate_correction_factors[INTER_FRAME];
+  }
+  // If resize is back up: check if projected q index is too much above the
+  // previous index, and if so, reduce the rate correction factor
+  // (since prefer to keep q for resized frame at least closet to previous q).
+  // Also check if projected qindex is close to previous qindex, if so
+  // increase correction factor (to push qindex higher and avoid overshoot).
+  if (tot_scale_change >= 1.0) {
+    if (tot_scale_change < 4.0 &&
+        qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 2.0;
+  }
+}
+
+/*!\brief ChecK for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on average QP from past x frames.
+ * Only allow for resize at most 1/2 scale down for now, Scaling factor
+ * for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \return Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  const int avg_qp_thr1 = 70;
+  const int avg_qp_thr2 = 50;
+  // Don't allow for resized frame to go below 160x90, resize in steps of 3/4.
+  const int min_width = (160 * 4) / 3;
+  const int min_height = (90 * 4) / 3;
+  int down_size_on = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    rc->resize_avg_qp = 0;
+    rc->resize_count = 0;
+    rc->resize_buffer_underflow = 0;
+    return;
+  }
+  // No resizing down if frame size is below some limit.
+  if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+    rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+    if (cpi->ppi->p_rc.buffer_level <
+        (int)(30 * p_rc->optimal_buffer_level / 100))
+      ++rc->resize_buffer_underflow;
+    ++rc->resize_count;
+    // Check for resize action every "window" frames.
+    if (rc->resize_count >= window) {
+      int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
+      if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+          down_size_on) {
+        if (rc->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          rc->resize_state = ONE_HALF;
+        } else if (rc->resize_state == ORIG) {
+          resize_action = DOWN_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      } else if (rc->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (rc->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+          resize_action = UP_ORIG;
+          rc->resize_state = ORIG;
+        } else if (rc->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      rc->resize_avg_qp = 0;
+      rc->resize_count = 0;
+      rc->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check is we should
+  // reduce rate correction factor,
+  if (resize_action != NO_RESIZE) {
+    int resize_width = cpi->oxcf.frm_dim_cfg.width;
+    int resize_height = cpi->oxcf.frm_dim_cfg.height;
+    int resize_scale_num = 1;
+    int resize_scale_den = 1;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      resize_scale_num = 3;
+      resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      resize_scale_num = 1;
+      resize_scale_den = 2;
+    }
+    resize_width = resize_width * resize_scale_num / resize_scale_den;
+    resize_height = resize_height * resize_scale_num / resize_scale_den;
+    resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+  }
+  return;
+}
+
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  // Very first frame has to be key frame.
+  if (cm->current_frame.frame_number == 0) return 1;
+  // Set key frame if forced by frame flags.
+  if (frame_flags & FRAMEFLAGS_KEY) return 1;
+  if (!cpi->ppi->use_svc) {
+    // Non-SVC
+    if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+  } else {
+    // SVC
+    if (svc->spatial_layer_id == 0 &&
+        (cpi->oxcf.kf_cfg.auto_key &&
+         (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+          svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+      return 1;
+  }
+
+  return 0;
+}
 
 void av1_get_one_pass_rt_params(AV1_COMP *cpi,
                                 EncodeFrameParams *const frame_params,
                                 unsigned int frame_flags) {
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   AV1_COMMON *const cm = &cpi->common;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  SVC *const svc = &cpi->svc;
   ResizePendingParams *const resize_pending_params =
       &cpi->resize_pending_params;
-  int gf_update = 0;
   int target;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cm->width != resize_pending_params->width ||
-        cm->height != resize_pending_params->height));
+  const int layer =
+      LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                       svc->number_temporal_layers);
   // Turn this on to explicitly set the reference structure rather than
   // relying on internal/default structure.
-  const int set_reference_structure = 1;
-  if (cpi->use_svc) {
+  if (cpi->ppi->use_svc) {
     av1_update_temporal_layer_framerate(cpi);
     av1_restore_layer_context(cpi);
   }
-  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
-      (cpi->use_svc && cpi->svc.spatial_layer_id == 0 &&
-       cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) ||
-      (frame_flags & FRAMEFLAGS_KEY)) {
+  // Set frame type.
+  if (set_key_frame(cpi, frame_flags)) {
     frame_params->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
+    p_rc->this_key_frame_forced =
         cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST_RT;
-    rc->source_alt_ref_active = 0;
-    gf_group->update_type[gf_group->index] = KF_UPDATE;
-    if (cpi->use_svc && cm->current_frame.frame_number > 0)
-      av1_svc_reset_temporal_layers(cpi, 1);
+    rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
+    p_rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE;
+    gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET;
+    if (cpi->ppi->use_svc) {
+      if (cm->current_frame.frame_number > 0)
+        av1_svc_reset_temporal_layers(cpi, 1);
+      svc->layer_context[layer].is_key_frame = 1;
+    }
   } else {
     frame_params->frame_type = INTER_FRAME;
-    gf_group->update_type[gf_group->index] = LF_UPDATE;
-  }
-  // GF update based on frames_till_gf_update_due, also
-  // force upddate on resize pending frame.
-  if ((resize_pending || rc->frames_till_gf_update_due == 0) &&
-      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      av1_cyclic_refresh_set_golden_update(cpi);
-    else
-      rc->baseline_gf_interval = MAX_GF_INTERVAL;
-    if (rc->baseline_gf_interval > rc->frames_to_key)
-      rc->baseline_gf_interval = rc->frames_to_key;
-    rc->gfu_boost = DEFAULT_GF_BOOST_RT;
-    rc->constrained_gf_group =
-        (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    gf_group->index = 0;
-    // SVC does not use GF as periodid boost.
-    // TODO(marpan): Find better way to disable this for SVC.
-    if (cpi->use_svc) {
-      SVC *const svc = &cpi->svc;
-      rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
-      rc->gfu_boost = 1;
-      rc->constrained_gf_group = 0;
-      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      for (int layer = 0;
-           layer < svc->number_spatial_layers * svc->number_temporal_layers;
-           ++layer) {
-        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-        lc->rc.baseline_gf_interval = rc->baseline_gf_interval;
-        lc->rc.gfu_boost = rc->gfu_boost;
-        lc->rc.constrained_gf_group = rc->constrained_gf_group;
-        lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
-        lc->group_index = 0;
+    gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
+    gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
+    if (cpi->ppi->use_svc) {
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      lc->is_key_frame =
+          svc->spatial_layer_id == 0
+              ? 0
+              : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+      // If the user is setting the SVC pattern with set_ref_frame_config and
+      // did not set any references, set the frame type to Intra-only.
+      if (svc->set_ref_frame_config) {
+        int no_references_set = 1;
+        for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+          if (svc->reference[i]) {
+            no_references_set = 0;
+            break;
+          }
+        }
+        // Set to intra_only_frame if no references are set.
+        // The stream can start decoding on INTRA_ONLY_FRAME so long as the
+        // layer with the intra_only_frame doesn't signal a reference to a slot
+        // that hasn't been set yet.
+        if (no_references_set) frame_params->frame_type = INTRA_ONLY_FRAME;
       }
     }
-    gf_group->size = rc->baseline_gf_interval;
-    gf_group->update_type[0] =
-        (frame_params->frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
-    gf_update = 1;
   }
-  if (cpi->oxcf.rc_mode == AOM_CBR) {
-    if (frame_params->frame_type == KEY_FRAME) {
+  // Check for scene change: for SVC check on base spatial layer only.
+  if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0)
+    rc_scene_detection_onepass_rt(cpi);
+  // Check for dynamic resize, for single spatial layer for now.
+  // For temporal layers only check on base temporal layer.
+  if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+    if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+      dynamic_resize_one_pass_cbr(cpi);
+    if (rc->resize_state == THREE_QUARTER) {
+      resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+      resize_pending_params->height =
+          (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+    } else if (rc->resize_state == ONE_HALF) {
+      resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+      resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+    } else {
+      resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+      resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+    }
+  } else if (is_frame_resize_pending(cpi)) {
+    resize_reset_rc(cpi, resize_pending_params->width,
+                    resize_pending_params->height, cm->width, cm->height);
+  }
+  // Set the GF interval and update flag.
+  if (!rc->rtc_external_ratectrl)
+    set_gf_interval_update_onepass_rt(cpi, frame_params->frame_type);
+  // Set target size.
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+    if (frame_params->frame_type == KEY_FRAME ||
+        frame_params->frame_type == INTRA_ONLY_FRAME) {
       target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
     } else {
       target = av1_calc_pframe_target_size_one_pass_cbr(
-          cpi, gf_group->update_type[gf_group->index]);
+          cpi, gf_group->update_type[cpi->gf_frame_index]);
     }
   } else {
-    if (frame_params->frame_type == KEY_FRAME) {
+    if (frame_params->frame_type == KEY_FRAME ||
+        frame_params->frame_type == INTRA_ONLY_FRAME) {
       target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
     } else {
       target = av1_calc_pframe_target_size_one_pass_vbr(
-          cpi, gf_group->update_type[gf_group->index]);
+          cpi, gf_group->update_type[cpi->gf_frame_index]);
     }
   }
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q)
+    rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level;
+
   av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
   rc->base_frame_target = target;
-  if (set_reference_structure && cpi->oxcf.speed >= 6 &&
-      cm->number_spatial_layers == 1 && cm->number_temporal_layers == 1)
-    set_reference_structure_one_pass_rt(cpi, gf_update);
   cm->current_frame.frame_type = frame_params->frame_type;
+  // For fixed mode SVC: if KSVC is enabled remove inter layer
+  // prediction on spatial enhancement layer frames for frames
+  // whose base is not KEY frame.
+  if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+      svc->number_spatial_layers > 1 &&
+      !svc->layer_context[layer].is_key_frame) {
+    ExternalFlags *const ext_flags = &cpi->ext_flags;
+    ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  }
+}
+
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  int thresh_qp = 7 * (rc->worst_quality >> 3);
+  // Lower thresh_qp for video (more overshoot at lower Q) to be
+  // more conservative for video.
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
+    thresh_qp = 3 * (rc->worst_quality >> 2);
+  if (sf->rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+      cm->quant_params.base_qindex < thresh_qp) {
+    double rate_correction_factor =
+        cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
+    const int target_size = cpi->rc.avg_frame_bandwidth;
+    double new_correction_factor;
+    int target_bits_per_mb;
+    double q2;
+    int enumerator;
+    *q = (3 * cpi->rc.worst_quality + *q) >> 2;
+    // For screen content use the max-q set by the user to allow for less
+    // overshoot on slide changes.
+    if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+      *q = cpi->rc.worst_quality;
+    cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+    // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+    // these parameters will affect QP selection for subsequent frames. If they
+    // have settled down to a very different (low QP) state, then not adjusting
+    // them may cause next frame to select low QP and overshoot again.
+    p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+    p_rc->buffer_level = p_rc->optimal_buffer_level;
+    p_rc->bits_off_target = p_rc->optimal_buffer_level;
+    // Reset rate under/over-shoot flags.
+    cpi->rc.rc_1_frame = 0;
+    cpi->rc.rc_2_frame = 0;
+    // Adjust rate correction factor.
+    target_bits_per_mb =
+        (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+    // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
+    // This comes from the inverse computation of vp9_rc_bits_per_mb().
+    q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
+    enumerator = 1800000;  // Factor for inter frame.
+    enumerator += (int)(enumerator * q2) >> 12;
+    new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+    if (new_correction_factor > rate_correction_factor) {
+      rate_correction_factor =
+          AOMMIN(2.0 * rate_correction_factor, new_correction_factor);
+      if (rate_correction_factor > MAX_BPB_FACTOR)
+        rate_correction_factor = MAX_BPB_FACTOR;
+      cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
+          rate_correction_factor;
+    }
+    // For temporal layers: reset the rate control parameters across all
+    // temporal layers.
+    if (cpi->svc.number_temporal_layers > 1) {
+      SVC *svc = &cpi->svc;
+      for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+        int sl = svc->spatial_layer_id;
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+        lp_rc->avg_frame_qindex[INTER_FRAME] = *q;
+        lp_rc->buffer_level = lp_rc->optimal_buffer_level;
+        lp_rc->bits_off_target = lp_rc->optimal_buffer_level;
+        lrc->rc_1_frame = 0;
+        lrc->rc_2_frame = 0;
+        lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+      }
+    }
+    return 1;
+  } else {
+    return 0;
+  }
 }
diff --git a/media/libaom/src/av1/encoder/ratectrl.h b/media/libaom/src/av1/encoder/ratectrl.h
index c463786635..5ac9660ab8 100644
--- a/media/libaom/src/av1/encoder/ratectrl.h
+++ b/media/libaom/src/av1/encoder/ratectrl.h
@@ -24,6 +24,8 @@
 extern "C" {
 #endif
 
+/*!\cond */
+
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
@@ -39,21 +41,32 @@ extern "C" {
 // The maximum duration of a GF group that is static (e.g. a slide show).
 #define MAX_STATIC_GF_GROUP_LENGTH 250
 
-// Minimum and maximum height for the new pyramid structure.
-// (Old structure supports height = 1, but does NOT support height = 4).
-#define MIN_PYRAMID_LVL 0
-#define MAX_PYRAMID_LVL 4
-
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 32
-#define FIXED_GF_INTERVAL 8  // Used in some testing modes only
+#define FIXED_GF_INTERVAL 16
 #define MAX_GF_LENGTH_LAP 16
 
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
 #define MAX_NUM_GF_INTERVALS 15
 
 #define MAX_ARF_LAYERS 6
 // #define STRICT_RC
 
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+// A passive rate control strategy for screen content type in real-time mode.
+// When it is turned on, the compression performance is improved by
+// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains
+// over 20% on metric.
+// The downside is that it does not guarantee frame size.
+// Since RT mode has a tight restriction on buffer overflow control, we
+// turn it off by default.
+#define RT_PASSIVE_STRATEGY 0
+#define MAX_Q_HISTORY 1000
+
 typedef struct {
   int resize_width;
   int resize_height;
@@ -79,50 +92,108 @@ enum {
   FRAME_UPDATE_TYPES
 } UENUM1BYTE(FRAME_UPDATE_TYPE);
 
+enum {
+  REFBUF_RESET,   // Clear reference frame buffer
+  REFBUF_UPDATE,  // Refresh reference frame buffer
+  REFBUF_STATES
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {
+  NO_RESIZE = 0,
+  DOWN_THREEFOUR = 1,  // From orig to 3/4.
+  DOWN_ONEHALF = 2,    // From orig or 3/4 to 1/2.
+  UP_THREEFOUR = -1,   // From 1/2 to 3/4.
+  UP_ORIG = -2,        // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150
+typedef enum region_types {
+  STABLE_REGION = 0,
+  HIGH_VAR_REGION = 1,
+  SCENECUT_REGION = 2,
+  BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {
+  int start;
+  int last;
+  double avg_noise_var;
+  double avg_cor_coeff;
+  double avg_sr_fr_ratio;
+  double avg_intra_err;
+  double avg_coded_err;
+  REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief  Rate Control parameters and status
+ */
 typedef struct {
   // Rate targetting variables
-  int base_frame_target;  // A baseline frame target before adjustment
-                          // for previous under or over shoot.
-  int this_frame_target;  // Actual frame target after rc adjustment.
 
-  // gop bit budget
-  int64_t gf_group_bits;
+  /*!
+   * Baseline target rate for frame before adjustment for previous under or
+   * over shoot.
+   */
+  int base_frame_target;
+  /*!
+   * Target rate for frame after adjustment for previous under or over shoot.
+   */
+  int this_frame_target;  // Actual frame target after rc adjustment.
 
+  /*!
+   * Projected size for current frame
+   */
   int projected_frame_size;
-  int sb64_target_rate;
-  int last_q[FRAME_TYPES];  // Separate values for Intra/Inter
-  int last_boosted_qindex;  // Last boosted GF/KF/ARF q
-  int last_kf_qindex;       // Q index of the last key frame coded.
 
-  int gfu_boost;
-  int kf_boost;
+  /*!
+   * Bit size of transform coefficient for current frame.
+   */
+  int coefficient_size;
 
-  double rate_correction_factors[RATE_FACTOR_LEVELS];
+  /*!
+   * Super block rate target used with some adaptive quantization strategies.
+   */
+  int sb64_target_rate;
 
+  /*!
+   * Number of frames since the last ARF / GF.
+   */
   int frames_since_golden;
+
+  /*!
+   * Number of frames till the next ARF / GF is due.
+   */
   int frames_till_gf_update_due;
 
-  // number of determined gf group length left
+  /*!
+   * Number of determined gf groups left
+   */
   int intervals_till_gf_calculate_due;
-  // stores gf group length intervals
-  int gf_intervals[MAX_NUM_GF_INTERVALS];
-  // the current index in gf_intervals
-  int cur_gf_index;
 
+  /*!\cond */
   int min_gf_interval;
   int max_gf_interval;
   int static_scene_max_gf_interval;
-  int baseline_gf_interval;
-  int constrained_gf_group;
+  /*!\endcond */
+  /*!
+   * Frames before the next key frame
+   */
   int frames_to_key;
+  /*!\cond */
   int frames_since_key;
-  int this_key_frame_forced;
-  int next_key_frame_forced;
-  int source_alt_ref_pending;
-  int source_alt_ref_active;
+  int frames_to_fwd_kf;
   int is_src_frame_alt_ref;
   int sframe_due;
 
+  int high_source_sad;
+  uint64_t avg_source_sad;
+  uint64_t prev_avg_source_sad;
+  uint64_t frame_source_sad;
+
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
@@ -130,72 +201,355 @@ typedef struct {
 
   int ni_av_qi;
   int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex[FRAME_TYPES];
-  double tot_q;
-  double avg_q;
-
-  int64_t buffer_level;
-  int64_t bits_off_target;
-  int64_t vbr_bits_off_target;
-  int64_t vbr_bits_off_target_fast;
 
   int decimation_factor;
   int decimation_count;
 
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int rate_error_estimate;
-
-  int64_t total_actual_bits;
-  int64_t total_target_bits;
-  int64_t total_target_vs_actual;
-
+  /*!\endcond */
+  /*!
+   * User specified maximum Q allowed for current frame
+   */
   int worst_quality;
+  /*!
+   * User specified minimum Q allowed for current frame
+   */
   int best_quality;
 
-  int64_t starting_buffer_level;
-  int64_t optimal_buffer_level;
-  int64_t maximum_buffer_size;
+  /*!\cond */
 
   // rate control history for last frame(1) and the frame before(2).
-  // -1: undershot
-  //  1: overshoot
+  // -1: overshoot
+  //  1: undershoot
   //  0: not initialized.
   int rc_1_frame;
   int rc_2_frame;
   int q_1_frame;
   int q_2_frame;
 
-  float_t arf_boost_factor;
-  // Q index used for ALT frame
-  int arf_q;
+  /*!\endcond */
+  /*!
+   * Proposed maximum alloed Q for current frame
+   */
   int active_worst_quality;
-  int active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!\cond */
+  // Track amount of low motion in scene
+  int avg_frame_low_motion;
+
+  // signals if number of blocks with motion is high
+  int high_num_blocks_with_motion;
+
+  // For dynamic resize, 1 pass cbr.
+  RESIZE_STATE resize_state;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+
+  // Flag to disable content related qp adjustment.
+  int rtc_external_ratectrl;
+
+  // Stores fast_extra_bits of the current frame.
+  int frame_level_fast_extra_bits;
+
+  double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+  /*!\endcond */
+} RATE_CONTROL;
+
+/*!
+ * \brief  Primary Rate Control parameters and status
+ */
+typedef struct {
+  // Sub-gop level Rate targetting variables
+
+  /*!
+   * Target bit budget for the current GF / ARF group of frame.
+   */
+  int64_t gf_group_bits;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to the key frame
+   */
+  int kf_boost;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+   */
+  int gfu_boost;
+
+  /*!
+   * Stores the determined gf group lengths for a set of gf groups
+   */
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+  /*!
+   * The current group's index into gf_intervals[]
+   */
+  int cur_gf_index;
+
+  /*!\cond */
+  int num_regions;
+
+  REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  int regions_offset;  // offset of regions from the last keyframe
+  int frames_till_regions_update;
+
+  int baseline_gf_interval;
+
+  int constrained_gf_group;
+
+  int this_key_frame_forced;
+
+  int next_key_frame_forced;
+  /*!\endcond */
+
+  /*!
+   * Initial buffuer level in ms for CBR / low delay encoding
+   */
+  int64_t starting_buffer_level;
+
+  /*!
+   * Optimum / target buffuer level in ms for CBR / low delay encoding
+   */
+  int64_t optimal_buffer_level;
+
+  /*!
+   * Maximum target buffuer level in ms for CBR / low delay encoding
+   */
+  int64_t maximum_buffer_size;
+
+  /*!
+   * Q index used for ALT frame
+   */
+  int arf_q;
+
+  /*!\cond */
+  float_t arf_boost_factor;
+
   int base_layer_qp;
 
   // Total number of stats used only for kf_boost calculation.
   int num_stats_used_for_kf_boost;
+
   // Total number of stats used only for gfu_boost calculation.
   int num_stats_used_for_gfu_boost;
+
   // Total number of stats required by gfu_boost calculation.
   int num_stats_required_for_gfu_boost;
-  int next_is_fwd_key;
+
   int enable_scenecut_detection;
-} RATE_CONTROL;
+
+  int use_arf_in_this_kf_group;
+
+  int ni_frames;
+
+  double tot_q;
+  /*!\endcond */
+
+  /*!
+   * Q used for last boosted (non leaf) frame
+   */
+  int last_kf_qindex;
+
+  /*!
+   * Average of q index of previous encoded frames in a sequence.
+   */
+  int avg_frame_qindex[FRAME_TYPES];
+
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * active_best_quality.
+   */
+  int temp_active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * last_boosted_qindex.
+   */
+  int temp_last_boosted_qindex;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * avg_q.
+   */
+  double temp_avg_q;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * last_q.
+   */
+  int temp_last_q[FRAME_TYPES];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * projected_frame_size.
+   */
+  int temp_projected_frame_size;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * total_actual_bits.
+   */
+  int64_t temp_total_actual_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * buffer_level.
+   */
+  int64_t temp_buffer_level;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * vbr_bits_off_target.
+   */
+  int64_t temp_vbr_bits_off_target;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * vbr_bits_off_target_fast.
+   */
+  int64_t temp_vbr_bits_off_target_fast;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rate_correction_factors.
+   */
+  double temp_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rate_error_estimate.
+   */
+  int temp_rate_error_estimate;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_target_bits.
+   */
+  int temp_rolling_arf_group_target_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_actual_bits;.
+   */
+  int temp_rolling_arf_group_actual_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * bits_left;.
+   */
+  int64_t temp_bits_left;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * extend_minq.
+   */
+  int temp_extend_minq;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * extend_maxq.
+   */
+  int temp_extend_maxq;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * extend_minq_fast.
+   */
+  int temp_extend_minq_fast;
+#endif
+  /*!
+   * Proposed minimum allowed Q different layers in a coding pyramid
+   */
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!
+   * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+   */
+  int last_boosted_qindex;
+
+  /*!
+   * Average Q value of previous inter frames
+   */
+  double avg_q;
+
+  /*!
+   * Q used on last encoded frame of the given type.
+   */
+  int last_q[FRAME_TYPES];
+
+  /*!
+   * Correction factors used to adjust the q estimate for a given target rate
+   * in the encode loop.
+   */
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  /*!
+   * Current total consumed bits.
+   */
+  int64_t total_actual_bits;
+
+  /*!
+   * Current total target bits.
+   */
+  int64_t total_target_bits;
+
+  /*!
+   * Current buffer level.
+   */
+  int64_t buffer_level;
+
+  /*!
+   * PCT rc error.
+   */
+  int rate_error_estimate;
+
+  /*!
+   * Error bits available from previously encoded frames.
+   */
+  int64_t vbr_bits_off_target;
+
+  /*!
+   * Error bits available from previously encoded frames undershoot.
+   */
+  int64_t vbr_bits_off_target_fast;
+
+  /*!
+   * Total bits deviated from the average frame target, from previously
+   * encoded frames.
+   */
+  int64_t bits_off_target;
+
+  /*!
+   * Rolling monitor target bits updated based on current frame target size.
+   */
+  int rolling_target_bits;
+
+  /*!
+   * Rolling monitor actual bits updated based on current frame final projected
+   * size.
+   */
+  int rolling_actual_bits;
+
+  /*!
+   * The history of qindex for each frame.
+   * Only used when RT_PASSIVE_STRATEGY = 1.
+   */
+  int q_history[MAX_Q_HISTORY];
+} PRIMARY_RATE_CONTROL;
 
 struct AV1_COMP;
 struct AV1EncoderConfig;
+struct GF_GROUP;
+
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+                         PRIMARY_RATE_CONTROL *p_rc);
 
-void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
-                 RATE_CONTROL *rc);
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
 
 int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
-                           double correction_factor, aom_bit_depth_t bit_depth);
+                           double correction_factor, aom_bit_depth_t bit_depth,
+                           const int is_screen_content_type);
 
 double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
 
@@ -210,18 +564,17 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
 // First call per frame, one of:
-//   av1_rc_get_first_pass_params()
-//   av1_rc_get_second_pass_params()
+//   av1_get_one_pass_rt_params()
+//   av1_get_second_pass_params()
 // depending on the usage to set the rate control encode parameters desired.
 //
 // Then, call encode_frame_to_data_rate() to perform the
 // actual encode. This function will in turn call encode_frame()
-// one or more times, followed by one of:
-//   av1_rc_postencode_update()
+// one or more times, followed by:
 //   av1_rc_postencode_update_drop_frame()
 //
 // The majority of rate control parameters are only expected
-// to be set in the av1_rc_get_..._params() functions and
+// to be set in the av1_get_..._params() functions and
 // updated during the av1_rc_postencode_update...() functions.
 // The only exceptions are av1_rc_drop_frame() and
 // av1_rc_update_rate_correction_factors() functions.
@@ -236,10 +589,24 @@ void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
 // Post encode update of the rate control parameters for dropped frames
 void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
 
-// Updates rate correction factors
-// Changes only the rate correction factors in the rate control structure.
-void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   is_encode_stage       Indicates if recode loop or post-encode
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return None but updates the relevant rate correction factor in cpi->rc
+ */
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+                                           int is_encode_stage, int width,
                                            int height);
+/*!\cond */
 
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
@@ -251,23 +618,48 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
-// Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
-                             int width, int height, int gf_index,
-                             int *bottom_index, int *top_index);
+/*!\endcond */
 
-// Estimates q to achieve a target bits per frame
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c rc->arf_q.
+ */
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+                             int gf_index, int *bottom_index, int *top_index);
+
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   target_bits_per_frame Frame rate target
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return Returns a q index value
+ */
 int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality,
                       int width, int height);
 
+/*!\cond */
 // Estimates bits per mb for a given qindex and correction factor.
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor, aom_bit_depth_t bit_depth);
+                       double correction_factor, aom_bit_depth_t bit_depth,
+                       const int is_screen_content_type);
 
 // Clamping utilities for bitrate targets for iframes and pframes.
 int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
-                                    int target);
+                                    int64_t target);
 int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
                                     int target, uint8_t frame_update_type);
 
@@ -287,6 +679,7 @@ int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
 // to a value that should equate to the given rate ratio.
 int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
                                int qindex, double rate_target_ratio,
+                               const int is_screen_content_type,
                                aom_bit_depth_t bit_depth);
 
 int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
@@ -303,20 +696,138 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
 void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
                              int height);
 
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
+void av1_set_reference_structure_one_pass_rt(struct AV1_COMP *cpi,
+                                             int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi                 Top level encoder structure
+ * \param[in]       frame_update_type   Type of frame
+ *
+ * \return	Returns the target number of bits for this frame.
+ */
 int av1_calc_pframe_target_size_one_pass_vbr(
     const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
 
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return	Returns the target number of bits for this frame.
+ */
 int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
 
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi                 Top level encoder structure
+ * \param[in]       frame_update_type   Type of frame
+ *
+ * \return  Returns the target number of bits for this frame.
+ */
 int av1_calc_pframe_target_size_one_pass_cbr(
     const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
 
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Returns the target number of bits for this frame.
+ */
 int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
 
+/*!\brief Setup the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layers (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_params Encoder frame parameters
+ * \param[in]       frame_flags  Emcoder frame flags
+ *
+ * \return Nothing is returned. Instead the settings computed in this
+ * funtion are set in: \c frame_params, \c cpi->common, \c cpi->rc, \c cpi->svc.
+ */
 void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
                                 struct EncodeFrameParams *const frame_params,
                                 unsigned int frame_flags);
 
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ *  Handles the case when encoder is expected to create a large frame:
+ *  - q is increased to value closer to \c cpi->rc.worst_quality
+ *  - avg_frame_qindex is reset
+ *  - buffer levels are reset
+ *  - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]        q           Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
+/*!\brief Compute the q_indices for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in]       base_q_index      Base q index
+ * \param[in]       gf_update_type    GOP update type
+ * \param[in]       gf_pyramid_level  GOP level of the current frame
+ * \param[in]       arf_q             ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_indices for the ARF of a GOP.
+ *
+ * \param[in]       base_q_index      Base q index
+ * \param[in]       gfu_boost         GFU boost
+ * \param[in]       bit_depth         Bit depth
+ * \param[in]       arf_boost_factor  ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_indices for the ARF of a GOP in Q mode.
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       tpl_frame         Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+                               struct TplDepFrame *tpl_frame);
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/rc_utils.h b/media/libaom/src/av1/encoder/rc_utils.h
new file mode 100644
index 0000000000..fe22ee5afb
--- /dev/null
+++ b/media/libaom/src/av1/encoder/rc_utils.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RC_UTILS_H_
+#define AOM_AV1_ENCODER_RC_UTILS_H_
+
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/psnr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) {
+  RATE_CONTROL *rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  if (cpi->common.current_frame.frame_number >
+      (unsigned int)cpi->svc.number_spatial_layers) {
+    if (cpi->ppi->use_svc) {
+      av1_svc_check_reset_layer_rc_flag(cpi);
+    } else {
+      if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) ||
+          rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) {
+        rc->rc_1_frame = 0;
+        rc->rc_2_frame = 0;
+        p_rc->bits_off_target = p_rc->optimal_buffer_level;
+        p_rc->buffer_level = p_rc->optimal_buffer_level;
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf,
+                                                   AV1_PRIMARY *ppi) {
+  PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  const int64_t bandwidth = rc_cfg->target_bandwidth;
+  const int64_t starting = rc_cfg->starting_buffer_level_ms;
+  const int64_t optimal = rc_cfg->optimal_buffer_level_ms;
+  const int64_t maximum = rc_cfg->maximum_buffer_size_ms;
+
+  p_rc->starting_buffer_level = starting * bandwidth / 1000;
+  p_rc->optimal_buffer_level =
+      (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+  p_rc->maximum_buffer_size =
+      (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+}
+
+static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
+                                           AV1_LEVEL target_level, int tier) {
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SequenceHeader *const seq_params = cpi->common.seq_params;
+  TileConfig *const tile_cfg = &oxcf->tile_cfg;
+  RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  // Adjust target bitrate to be no larger than 70% of level limit.
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const double level_bitrate_limit =
+      av1_get_max_bitrate_for_level(target_level, tier, profile);
+  const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+  rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate);
+  // Also need to update cpi->ppi->twopass.bits_left.
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+  if (stats != NULL)
+    cpi->ppi->twopass.bits_left =
+        (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0);
+
+  // Adjust max over-shoot percentage.
+  rc_cfg->over_shoot_pct = 0;
+
+  // Adjust max quantizer.
+  rc_cfg->worst_allowed_q = 255;
+
+  // Adjust number of tiles and tile columns to be under level limit.
+  int max_tiles, max_tile_cols;
+  av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+  while (tile_cfg->tile_columns > 0 &&
+         (1 << tile_cfg->tile_columns) > max_tile_cols) {
+    --tile_cfg->tile_columns;
+  }
+  const int tile_cols = (1 << tile_cfg->tile_columns);
+  while (tile_cfg->tile_rows > 0 &&
+         tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) {
+    --tile_cfg->tile_rows;
+  }
+
+  // Adjust min compression ratio.
+  const int still_picture = seq_params->still_picture;
+  const double min_cr =
+      av1_get_min_cr_for_level(target_level, tier, still_picture);
+  rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100));
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in]     cpi         Top-level encoder structure
+ * \param[in]     high_limit  Upper rate threshold
+ * \param[in]     low_limit   Lower rate threshold
+ * \param[in]     q           Current q index
+ * \param[in]     maxq        Maximum allowed q index
+ * \param[in]     minq        Minimum allowed q index
+ *
+ * \return        Indicates if a recode is required.
+ * \retval        1           Recode Required
+ * \retval        0           No Recode required
+ */
+static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit,
+                                       int low_limit, int q, int maxq,
+                                       int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+  int force_recode = 0;
+
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+      (frame_is_kfgfarf &&
+       (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+    // TODO(agrange) high_limit could be greater than the scale-down threshold.
+    if ((rc->projected_frame_size > high_limit && q < maxq) ||
+        (rc->projected_frame_size < low_limit && q > minq)) {
+      force_recode = 1;
+    } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+      // Deal with frame undershoot and whether or not we are
+      // below the automatically set cq level.
+      if (q > oxcf->rc_cfg.cq_level &&
+          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+        force_recode = 1;
+      }
+    }
+  }
+  return force_recode;
+}
+
+static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor,
+                                                             double max_factor,
+                                                             int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor = (200.0 + 10.0 * factor);
+  return factor;
+}
+
+static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor,
+                                                double max_factor, double r0,
+                                                int frames_to_key) {
+  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+                                                      frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+
+static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, 10.0);
+  factor = AOMMAX(factor, 4.0);
+  factor = (75.0 + 14.0 * factor);
+  return factor;
+}
+
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi,
+                                                int is_encode_stage, int q_low,
+                                                int q_high, int top_index,
+                                                int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                        cm->height);
+
+  int q_regulated =
+      av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                        AOMMAX(q_high, top_index), cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated < q_low && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                          cm->height);
+    q_regulated =
+        av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                          AOMMAX(q_high, top_index), cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+                                                 int is_encode_stage,
+                                                 int q_high, int top_index,
+                                                 int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                        cm->height);
+  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                      top_index, cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated > q_high && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                          cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    top_index, cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ *  target and adjusts q as appropriate. It also decides whether or not
+ *  we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in]     cpi             Top-level encoder structure
+ * \param[out]    loop            Should we go around the recode loop again
+ * \param[in,out] q               New q index value
+ * \param[in,out] q_low           Low q index limit for this loop itteration
+ * \param[in,out] q_high          High q index limit for this loop itteration
+ * \param[in]     top_index       Max permited new value for q index
+ * \param[in]     bottom_index    Min permited new value for q index
+ * \param[in,out] undershoot_seen Have we seen undershoot on this frame
+ * \param[in,out] overshoot_seen  Have we seen overshoot on this frame
+ * \param[in,out] low_cr_seen     Have we previously trriggered recode
+ *                                because the compression ration was less
+ *                                than a given minimum threshold.
+ * \param[in]     loop_count      Loop itterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+    int *const q_high, const int top_index, const int bottom_index,
+    int *const undershoot_seen, int *const overshoot_seen,
+    int *const low_cr_seen, const int loop_count) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  *loop = 0;
+
+  // Special case for overlay frame.
+  if (rc->is_src_frame_alt_ref &&
+      rc->projected_frame_size < rc->max_frame_bandwidth)
+    return;
+
+  const int min_cr = rc_cfg->min_cr;
+  if (min_cr > 0) {
+    const double compression_ratio =
+        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+    const double target_cr = min_cr / 100.0;
+    if (compression_ratio < target_cr) {
+      *low_cr_seen = 1;
+      if (*q < rc->worst_quality) {
+        const double cr_ratio = target_cr / compression_ratio;
+        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+        *q_low = AOMMAX(*q, *q_low);
+        *q_high = AOMMAX(*q, *q_high);
+        *loop = 1;
+      }
+    }
+    if (*low_cr_seen) return;
+  }
+
+  if (cpi->ppi->level_params.keep_level_stats &&
+      !is_stat_generation_stage(cpi)) {
+    // Initialize level info. at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+    // TODO(any): currently only checking operating point 0
+    const AV1LevelInfo *const level_info = level_params->level_info[0];
+    const DECODER_MODEL *const decoder_models = level_info->decoder_models;
+    const AV1_LEVEL target_level = level_params->target_seq_level_idx[0];
+
+    if (target_level < SEQ_LEVELS &&
+        decoder_models[target_level].status == DECODER_MODEL_OK) {
+      DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf(
+          cpi, rc->projected_frame_size, &decoder_models[target_level]);
+
+      if ((status == SMOOTHING_BUFFER_UNDERFLOW ||
+           status == SMOOTHING_BUFFER_OVERFLOW) &&
+          *q < rc->worst_quality) {
+        *q = AOMMIN(*q + 10, rc->worst_quality);
+        *q_low = AOMMAX(*q, *q_low);
+        *q_high = AOMMAX(*q, *q_high);
+        *loop = 1;
+        return;
+      }
+    }
+  }
+
+  if (rc_cfg->mode == AOM_Q) return;
+
+  const int last_q = *q;
+  int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+  av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                   &frame_under_shoot_limit,
+                                   &frame_over_shoot_limit);
+  if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+  if (cm->current_frame.frame_type == KEY_FRAME &&
+      p_rc->this_key_frame_forced &&
+      rc->projected_frame_size < rc->max_frame_bandwidth) {
+    int64_t kf_err;
+    const int64_t high_err_target = cpi->ambient_err;
+    const int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (cm->seq_params->use_highbitdepth) {
+      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    } else {
+      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    }
+#else
+    kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+    // Prevent possible divide by zero error below for perfect KF
+    kf_err += !kf_err;
+
+    // The key frame is not good enough or we can afford
+    // to make it better without undue risk of popping.
+    if ((kf_err > high_err_target &&
+         rc->projected_frame_size <= frame_over_shoot_limit) ||
+        (kf_err > low_err_target &&
+         rc->projected_frame_size <= frame_under_shoot_limit)) {
+      // Lower q_high
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      // Adjust Q
+      *q = (int)((*q * high_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+    } else if (kf_err < low_err_target &&
+               rc->projected_frame_size >= frame_under_shoot_limit) {
+      // The key frame is much better than the previous frame
+      // Raise q_low
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      // Adjust Q
+      *q = (int)((*q * low_err_target) / kf_err);
+      *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+    *loop = (*q != last_q);
+    return;
+  }
+
+  if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+                       AOMMAX(*q_high, top_index), bottom_index)) {
+    // Is the projected frame size out of range and are we allowed
+    // to attempt to recode.
+
+    // Frame size out of permitted range:
+    // Update correction factor & compute new Q to try...
+    // Frame is too large
+    if (rc->projected_frame_size > rc->this_frame_target) {
+      // Special case if the projected size is > the max allowed.
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
+
+      // Raise Qlow as to at least the current value
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      if (*undershoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, 1, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
+
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      if (*overshoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated = get_regulated_q_undershoot(
+            cpi, 1, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index,
+                                        bottom_index);
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passsed in value.
+        if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+          *q_low = *q;
+        }
+      }
+
+      *undershoot_seen = 1;
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+  }
+
+  *loop = (*q != last_q);
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/media/libaom/src/av1/encoder/rd.c b/media/libaom/src/av1/encoder/rd.c
index e48c771194..17c7960393 100644
--- a/media/libaom/src/av1/encoder/rd.c
+++ b/media/libaom/src/av1/encoder/rd.c
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
 #include <stdio.h>
 
@@ -19,7 +20,7 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
+#include "aom_ports/aom_once.h"
 
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
@@ -84,68 +85,72 @@ static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
   },
 };
 
-void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
                          FRAME_CONTEXT *fc) {
   int i, j;
 
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
-    av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+                             fc->partition_cdf[i], NULL);
 
   if (cm->current_frame.skip_mode_info.skip_mode_flag) {
-    for (i = 0; i < SKIP_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
-                               NULL);
+    for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+                               fc->skip_mode_cdfs[i], NULL);
     }
   }
 
   for (i = 0; i < SKIP_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+                             fc->skip_txfm_cdfs[i], NULL);
   }
 
   for (i = 0; i < KF_MODE_CONTEXTS; ++i)
     for (j = 0; j < KF_MODE_CONTEXTS; ++j)
-      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+                               fc->kf_y_cdf[i][j], NULL);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-    av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+                             NULL);
   for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
     for (j = 0; j < INTRA_MODES; ++j)
-      av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
                                fc->uv_mode_cdf[i][j], NULL);
 
-  av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
-                           NULL);
+  av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+                           fc->filter_intra_mode_cdf, NULL);
   for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
     if (av1_filter_intra_allowed_bsize(cm, i))
-      av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
                                fc->filter_intra_cdfs[i], NULL);
   }
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+    av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
                              fc->switchable_interp_cdf[i], NULL);
 
   for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
-    av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
                              fc->palette_y_size_cdf[i], NULL);
-    av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
                              fc->palette_uv_size_cdf[i], NULL);
     for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j],
                                fc->palette_y_mode_cdf[i][j], NULL);
     }
   }
 
   for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i],
                              fc->palette_uv_mode_cdf[i], NULL);
   }
 
   for (i = 0; i < PALETTE_SIZES; ++i) {
     for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j],
                                fc->palette_y_color_index_cdf[i][j], NULL);
-      av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j],
+      av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j],
                                fc->palette_uv_color_index_cdf[i][j], NULL);
     }
   }
@@ -153,8 +158,8 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
   int sign_cost[CFL_JOINT_SIGNS];
   av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
   for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-    int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
-    int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
+    int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U];
+    int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V];
     if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
       memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
     } else {
@@ -173,11 +178,11 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
 
   for (i = 0; i < MAX_TX_CATS; ++i)
     for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
-      av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j],
+                               fc->tx_size_cdf[i][j], NULL);
 
   for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
-    av1_cost_tokens_from_cdf(x->txfm_partition_cost[i],
+    av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i],
                              fc->txfm_partition_cdf[i], NULL);
   }
 
@@ -186,7 +191,7 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
     for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
       if (use_inter_ext_tx_for_txsize[s][i]) {
         av1_cost_tokens_from_cdf(
-            x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+            mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
             av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
       }
     }
@@ -194,123 +199,142 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
       if (use_intra_ext_tx_for_txsize[s][i]) {
         for (j = 0; j < INTRA_MODES; ++j) {
           av1_cost_tokens_from_cdf(
-              x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j],
+              mode_costs->intra_tx_type_costs[s][i][j],
+              fc->intra_ext_tx_cdf[s][i][j],
               av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
         }
       }
     }
   }
   for (i = 0; i < DIRECTIONAL_MODES; ++i) {
-    av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i],
+    av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i],
+                             fc->angle_delta_cdf[i], NULL);
+  }
+  av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL);
+
+  for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) {
+    av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i],
+                             fc->seg.spatial_pred_seg_cdf[i], NULL);
+  }
+
+  for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+    av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i],
                              NULL);
   }
-  av1_cost_tokens_from_cdf(x->switchable_restore_cost,
-                           fc->switchable_restore_cdf, NULL);
-  av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf,
-                           NULL);
-  av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf,
-                           NULL);
-  av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
 
   if (!frame_is_intra_only(cm)) {
     for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i],
+                               fc->comp_inter_cdf[i], NULL);
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < SINGLE_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->single_ref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j],
                                  fc->single_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i],
                                fc->comp_ref_type_cdf[i], NULL);
     }
 
     for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
       for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j],
                                  fc->uni_comp_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < FWD_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j],
-                                 NULL);
+        av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j],
+                                 fc->comp_ref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < BWD_REFS - 1; ++j) {
-        av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j],
+        av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j],
                                  fc->comp_bwdref_cdf[i][j], NULL);
       }
     }
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i],
+                               fc->intra_inter_cdf[i], NULL);
     }
 
     for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i],
+                               NULL);
     }
 
     for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i],
+                               fc->zeromv_cdf[i], NULL);
     }
 
     for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i],
+                               NULL);
     }
 
     for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i],
+                               NULL);
     }
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i],
                                fc->inter_compound_mode_cdf[i], NULL);
     for (i = 0; i < BLOCK_SIZES_ALL; ++i)
-      av1_cost_tokens_from_cdf(x->compound_type_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i],
                                fc->compound_type_cdf[i], NULL);
     for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
       if (av1_is_wedge_used(i)) {
-        av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
-                                 NULL);
+        av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i],
+                                 fc->wedge_idx_cdf[i], NULL);
       }
     }
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
-      av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i],
-                               NULL);
-      av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i],
+                               fc->interintra_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i],
                                fc->interintra_mode_cdf[i], NULL);
     }
     for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-      av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i],
                                fc->wedge_interintra_cdf[i], NULL);
     }
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-      av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i],
+                               fc->motion_mode_cdf[i], NULL);
     }
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-      av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
+      av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i],
+                               fc->obmc_cdf[i], NULL);
     }
     for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i],
-                               NULL);
+      av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i],
+                               fc->compound_index_cdf[i], NULL);
     }
     for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
-      av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i],
+      av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i],
                                fc->comp_group_idx_cdf[i], NULL);
     }
   }
 }
 
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) {
+  av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost,
+                           fc->switchable_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost,
+                           fc->wiener_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost,
+                           fc->sgrproj_restore_cdf, NULL);
+}
+
 // Values are now correlated to quantizer.
 static int sad_per_bit_lut_8[QINDEX_RANGE];
 static int sad_per_bit_lut_10[QINDEX_RANGE];
@@ -328,23 +352,59 @@ static void init_me_luts_bd(int *bit16lut, int range,
   }
 }
 
-void av1_init_me_luts(void) {
+static void init_me_luts(void) {
   init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8);
   init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10);
   init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12);
 }
 
+void av1_init_me_luts(void) { aom_once(init_me_luts); }
+
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
-static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
-                                                              128, 144, 144,
-                                                              128 };
-
-int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
-  const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-  int rdmult = q * q;
-  rdmult = rdmult * 3 + (rdmult * 2 / 3);
-  switch (cpi->common.seq_params.bit_depth) {
+
+static const int rd_layer_depth_factor[7] = {
+  160, 160, 160, 160, 192, 208, 224
+};
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_inter_rd_multiplier(int qindex) {
+  return 3.2 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_arf_rd_multiplier(int qindex) {
+  return 3.25 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_kf_rd_multiplier(int qindex) {
+  return 3.3 + (0.0015 * (double)qindex);
+}
+
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+                                        FRAME_UPDATE_TYPE update_type,
+                                        int qindex) {
+  const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+  int64_t rdmult = q * q;
+  if (update_type == KF_UPDATE) {
+    double def_rd_q_mult = def_kf_rd_multiplier(q);
+    rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+  } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) {
+    double def_rd_q_mult = def_arf_rd_multiplier(q);
+    rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+  } else {
+    double def_rd_q_mult = def_inter_rd_multiplier(q);
+    rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+  }
+
+  switch (bit_depth) {
     case AOM_BITS_8: break;
     case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
     case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
@@ -352,71 +412,79 @@ int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-  return rdmult > 0 ? rdmult : 1;
+  return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1;
 }
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
-  int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, qindex);
-  if (is_stat_consumption_stage(cpi) &&
+  const aom_bit_depth_t bit_depth = cpi->common.seq_params->bit_depth;
+  const FRAME_UPDATE_TYPE update_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  int64_t rdmult =
+      av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+  if (is_stat_consumption_stage(cpi) && !cpi->oxcf.q_cfg.use_fixed_qp_offsets &&
       (cpi->common.current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->gf_group;
-    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
-    const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
+    const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+    const int layer_depth =
+        AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+    // Layer depth adjustment
+    rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
 
-    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
+    // ARF boost adjustment
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
   }
   return (int)rdmult;
 }
 
-int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) {
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
   assert(beta > 0.0);
-  int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
+  int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
   int newq = (int)rint(q / sqrt(beta));
   int orig_qindex = qindex;
+  if (newq == q) {
+    return 0;
+  }
   if (newq < q) {
-    do {
+    while (qindex > 0) {
       qindex--;
-      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-    } while (newq < q && qindex > 0);
+      q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (newq >= q) {
+        break;
+      }
+    }
   } else {
-    do {
+    while (qindex < MAXQ) {
       qindex++;
-      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-    } while (newq > q && qindex < MAXQ);
+      q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (newq <= q) {
+        break;
+      }
+    }
   }
   return qindex - orig_qindex;
 }
 
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+                                  int curr_qindex) {
+  curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res);
+  const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1;
+  const int deltaq_deadzone = delta_q_res / 4;
+  const int qmask = ~(delta_q_res - 1);
+  int abs_deltaq_index = abs(curr_qindex - prev_qindex);
+  abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
+  int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index;
+  adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1);
+  return adjust_qindex;
+}
+
 int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
   assert(beta > 0.0);
   const AV1_COMMON *cm = &cpi->common;
-  int64_t q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
-                               cm->seq_params.bit_depth);
-  int64_t rdmult = 0;
+  int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                           cm->seq_params->bit_depth);
 
-  switch (cm->seq_params.bit_depth) {
-    case AOM_BITS_8: rdmult = (int)((88 * q * q / beta) / 24); break;
-    case AOM_BITS_10:
-      rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 4);
-      break;
-    default:
-      assert(cm->seq_params.bit_depth == AOM_BITS_12);
-      rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 8);
-      break;
-  }
-
-  if (is_stat_consumption_stage(cpi) &&
-      (cm->current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->gf_group;
-    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
-    const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
-
-    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
-    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
-  }
-  if (rdmult < 1) rdmult = 1;
-  return (int)rdmult;
+  return (int)(av1_compute_rd_mult(cpi, q) / beta);
 }
 
 static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
@@ -437,11 +505,11 @@ static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
   return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
-void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
-  switch (cpi->common.seq_params.bit_depth) {
-    case AOM_BITS_8: x->sadperbit = sad_per_bit_lut_8[qindex]; break;
-    case AOM_BITS_10: x->sadperbit = sad_per_bit_lut_10[qindex]; break;
-    case AOM_BITS_12: x->sadperbit = sad_per_bit_lut_12[qindex]; break;
+void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
+  switch (cpi->common.seq_params->bit_depth) {
+    case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break;
+    case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break;
+    case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
   }
@@ -455,7 +523,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
         av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
             cm->quant_params.y_dc_delta_q,
         0, MAXQ);
-    const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
+    const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
       // Threshold here seems unnecessarily harsh but fine given actual
@@ -471,12 +539,12 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
   }
 }
 
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
                           const int num_planes) {
   const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
   for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
     for (int plane = 0; plane < nplanes; ++plane) {
-      LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane];
+      LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane];
 
       for (int ctx = 0; ctx < 2; ++ctx) {
         aom_cdf_prob *pcdf;
@@ -496,7 +564,7 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
   }
   for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
     for (int plane = 0; plane < nplanes; ++plane) {
-      LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
+      LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane];
 
       for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
         av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
@@ -564,62 +632,130 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
   }
 }
 
-void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
-                       MACROBLOCK *x) {
-  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
-  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
-  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
-  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+                       MvCosts *mv_costs) {
+  // Avoid accessing 'mv_costs' when it is not allocated.
+  if (mv_costs == NULL) return;
+
+  mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+  mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+  mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+  mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
   if (integer_mv) {
-    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc,
-                             MV_SUBPEL_NONE);
-    x->mv_cost_stack = (int **)&x->nmvcost;
+    mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             nmvc, MV_SUBPEL_NONE);
   } else {
-    int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost;
-    x->mv_cost_stack = *src;
-    av1_build_nmv_cost_table(
-        x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp);
+    mv_costs->mv_cost_stack =
+        usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             nmvc, usehp);
   }
 }
 
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) {
+  dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX];
+  dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX];
+  av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc,
+                           MV_SUBPEL_NONE);
+}
+
+// Populates speed features based on codec control settings (of type
+// COST_UPDATE_TYPE) and expected speed feature settings (of type
+// INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update.
+// The populated/updated speed features are used for cost updates in the
+// encoder.
+// WARNING: Population of unified cost update frequency needs to be taken care
+// accordingly, in case of any modifications/additions to the enum
+// COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE.
+static INLINE void populate_unified_cost_update_freq(
+    const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) {
+  INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+  // Mapping of entropy cost update frequency from the encoder's codec control
+  // settings of type COST_UPDATE_TYPE to speed features of type
+  // INTERNAL_COST_UPDATE_TYPE.
+  static const INTERNAL_COST_UPDATE_TYPE
+      map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = {
+        INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE,
+        INTERNAL_COST_UPD_OFF
+      };
+
+  inter_sf->mv_cost_upd_level =
+      AOMMIN(inter_sf->mv_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]);
+  inter_sf->coeff_cost_upd_level =
+      AOMMIN(inter_sf->coeff_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]);
+  inter_sf->mode_cost_upd_level =
+      AOMMIN(inter_sf->mode_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]);
+  sf->intra_sf.dv_cost_upd_level =
+      AOMMIN(sf->intra_sf.dv_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]);
+}
+
+// Checks if entropy costs should be initialized/updated at frame level or not.
+static INLINE int is_frame_level_cost_upd_freq_set(
+    const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level,
+    const int use_nonrd_pick_mode, const int frames_since_key) {
+  const int fill_costs =
+      frame_is_intra_only(cm) ||
+      (use_nonrd_pick_mode ? frames_since_key < 2
+                           : (cm->current_frame.frame_number & 0x07) == 1);
+  return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) ||
+          cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs);
+}
+
 void av1_initialize_rd_consts(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
+  SPEED_FEATURES *const sf = &cpi->sf;
   RD_OPT *const rd = &cpi->rd;
-
-  aom_clear_system_state();
+  int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+  int frames_since_key = cpi->rc.frames_since_key;
 
   rd->RDMULT = av1_compute_rd_mult(
       cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q);
+#if CONFIG_RD_COMMAND
+  if (cpi->oxcf.pass == 2) {
+    const RD_COMMAND *rd_command = &cpi->rd_command;
+    if (rd_command->option_ls[rd_command->frame_index] ==
+        RD_OPTION_SET_Q_RDMULT) {
+      rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index];
+    }
+  }
+#endif  // CONFIG_RD_COMMAND
 
-  set_error_per_bit(x, rd->RDMULT);
+  av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
 
   set_block_thresholds(cm, rd);
 
-  if ((!cpi->sf.rt_sf.use_nonrd_pick_mode &&
-       cpi->oxcf.mv_cost_upd_freq != COST_UPD_OFF) ||
-      frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1)
-    av1_fill_mv_costs(cm->fc, cm->features.cur_frame_force_integer_mv,
-                      cm->features.allow_high_precision_mv, x);
-
-  if (!cpi->sf.rt_sf.use_nonrd_pick_mode && frame_is_intra_only(cm) &&
-      cm->features.allow_screen_content_tools &&
-      !is_stat_generation_stage(cpi)) {
-    IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
-    int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX],
-                       &dv_costs->mv_component[1][MV_MAX] };
-    av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, &cm->fc->ndvc,
-                             MV_SUBPEL_NONE);
-  }
-
-  if (!is_stat_generation_stage(cpi)) {
-    for (int i = 0; i < TRANS_TYPES; ++i)
-      // IDENTITY: 1 bit
-      // TRANSLATION: 3 bits
-      // ROTZOOM: 2 bits
-      // AFFINE: 3 bits
-      cpi->gm_info.type_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
-                                  << AV1_PROB_COST_SHIFT;
+  populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf);
+  const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+  // Frame level mv cost update
+  if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level,
+                                       use_nonrd_pick_mode, frames_since_key))
+    av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+                      cm->features.allow_high_precision_mv, x->mv_costs);
+
+  // Frame level coefficient cost update
+  if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level,
+                                       use_nonrd_pick_mode, frames_since_key))
+    av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm));
+
+  // Frame level mode cost update
+  if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level,
+                                       use_nonrd_pick_mode, frames_since_key))
+    av1_fill_mode_rates(cm, &x->mode_costs, cm->fc);
+
+  // Frame level dv cost update
+  if (av1_need_dv_costs(cpi)) {
+    if (cpi->td.mb.dv_costs == NULL) {
+      CHECK_MEM_ERROR(
+          cm, cpi->td.mb.dv_costs,
+          (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs)));
+    }
+    av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
   }
 }
 
@@ -989,19 +1125,15 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
   const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
   const int_mv ref_mv =
-      av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
+      av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext);
   const int_mv ref_mv1 =
-      av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
+      av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext);
   MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
   int num_mv_refs = 0;
   pred_mv[num_mv_refs++] = ref_mv.as_mv;
   if (ref_mv.as_int != ref_mv1.as_int) {
     pred_mv[num_mv_refs++] = ref_mv1.as_mv;
   }
-  if (cpi->sf.mv_sf.adaptive_motion_search &&
-      block_size < x->max_partition_size) {
-    pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
-  }
 
   assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
 
@@ -1022,12 +1154,16 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
     const uint8_t *const ref_y_ptr =
         &ref_y_buffer[ref_y_stride * fp_row + fp_col];
     // Find sad for current vector.
-    const int this_sad = cpi->fn_ptr[block_size].sdf(
+    const int this_sad = cpi->ppi->fn_ptr[block_size].sdf(
         src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
     // Note if it is the best so far.
     if (this_sad < best_sad) {
       best_sad = this_sad;
     }
+    if (i == 0)
+      x->pred_mv0_sad[ref_frame] = this_sad;
+    else if (i == 1)
+      x->pred_mv1_sad[ref_frame] = this_sad;
   }
 
   // Note the index of the mv that worked best in the reference list.
@@ -1050,7 +1186,7 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   for (int i = 0; i < num_planes; ++i) {
-    setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
+    setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf,
                      i ? src->uv_crop_width : src->y_crop_width,
                      i ? src->uv_crop_height : src->y_crop_height,
                      dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
@@ -1069,17 +1205,16 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
 }
 
 int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
-                            InterpFilter interp_filter) {
+                            InterpFilter interp_filter, int dual_filter) {
   if (interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = xd->mi[0];
     int inter_filter_cost = 0;
-    int dir;
-
-    for (dir = 0; dir < 2; ++dir) {
+    for (int dir = 0; dir < 2; ++dir) {
+      if (dir && !dual_filter) break;
       const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
       const InterpFilter filter =
           av1_extract_interp_filter(mbmi->interp_filters, dir);
-      inter_filter_cost += x->switchable_interp_costs[ctx][filter];
+      inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter];
     }
     return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
   } else {
@@ -1286,15 +1421,32 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_D45_PRED] = 2500;
 }
 
-void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
-                               int (*factor_buf)[MAX_MODES],
-                               int use_adaptive_rd_thresh, BLOCK_SIZE bsize,
-                               THR_MODES best_mode_index) {
+static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES],
+                                   THR_MODES best_mode_index,
+                                   THR_MODES mode_start, THR_MODES mode_end,
+                                   BLOCK_SIZE min_size, BLOCK_SIZE max_size,
+                                   int max_rd_thresh_factor) {
+  for (THR_MODES mode = mode_start; mode < mode_end; ++mode) {
+    for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
+      int *const fact = &factor_buf[bs][mode];
+      if (mode == best_mode_index) {
+        *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR);
+      } else {
+        *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor);
+      }
+    }
+  }
+}
+
+void av1_update_rd_thresh_fact(
+    const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+    int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+    THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+    THR_MODES intra_mode_start, THR_MODES intra_mode_end) {
   assert(use_adaptive_rd_thresh > 0);
-  const THR_MODES top_mode = MAX_MODES;
   const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
 
-  const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size;
+  const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size;
   BLOCK_SIZE min_size, max_size;
   if (bsize_is_1_to_4) {
     // This part handles block sizes with 1:4 and 4:1 aspect ratios
@@ -1303,19 +1455,13 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
     max_size = bsize;
   } else {
     min_size = AOMMAX(bsize - 2, BLOCK_4X4);
-    max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
+    max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size);
   }
 
-  for (THR_MODES mode = 0; mode < top_mode; ++mode) {
-    for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
-      int *const fact = &factor_buf[bs][mode];
-      if (mode == best_mode_index) {
-        *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR);
-      } else {
-        *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor);
-      }
-    }
-  }
+  update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
+  update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
 }
 
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
diff --git a/media/libaom/src/av1/encoder/rd.h b/media/libaom/src/av1/encoder/rd.h
index 1addbaeb96..8d0277e3bf 100644
--- a/media/libaom/src/av1/encoder/rd.h
+++ b/media/libaom/src/av1/encoder/rd.h
@@ -19,6 +19,7 @@
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,9 +36,9 @@ extern "C" {
   (((D) * (1 << RDDIV_BITS)) - \
    ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
 
-#define RDCOST_DBL(RM, R, D)                                       \
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD)               \
   (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
-   ((double)(D) * (1 << RDDIV_BITS)))
+   ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS)))
 
 #define QIDX_SKIP_THRESH 115
 
@@ -78,24 +79,9 @@ typedef struct RD_OPT {
 
   int RDMULT;
 
-  double r0, arf_r0;
-  double mc_saved_base, mc_count_base;
+  double r0;
 } RD_OPT;
 
-typedef struct {
-  // Cost of transmitting the actual motion vector.
-  // mv_component[0][i] is the cost of motion vector with horizontal component
-  // (mv_row) equal to i - MV_MAX.
-  // mv_component[1][i] is the cost of motion vector with vertical component
-  // (mv_col) equal to i - MV_MAX.
-  int mv_component[2][MV_VALS];
-
-  // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of
-  // type i.
-  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
-  int joint_mv[MV_JOINTS];
-} IntraBCMVCosts;
-
 static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
 #if CONFIG_RD_DEBUG
   int plane;
@@ -104,19 +90,13 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
   rd_stats->dist = 0;
   rd_stats->rdcost = 0;
   rd_stats->sse = 0;
-  rd_stats->skip = 1;
+  rd_stats->skip_txfm = 1;
   rd_stats->zero_rate = 0;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = 0;
-    {
-      int r, c;
-      for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
-        for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
-          rd_stats->txb_coeff_cost_map[plane][r][c] = 0;
-    }
   }
 #endif
 }
@@ -129,62 +109,49 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
   rd_stats->dist = INT64_MAX;
   rd_stats->rdcost = INT64_MAX;
   rd_stats->sse = INT64_MAX;
-  rd_stats->skip = 0;
+  rd_stats->skip_txfm = 0;
   rd_stats->zero_rate = 0;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = INT_MAX;
-    {
-      int r, c;
-      for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
-        for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
-          rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX;
-    }
   }
 #endif
 }
 
 static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
                                       const RD_STATS *rd_stats_src) {
-  assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX);
+  if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) {
+    // If rd_stats_dst or rd_stats_src has invalid rate, we will make
+    // rd_stats_dst invalid.
+    av1_invalid_rd_stats(rd_stats_dst);
+    return;
+  }
   rd_stats_dst->rate = (int)AOMMIN(
       ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX);
   if (!rd_stats_dst->zero_rate)
     rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
-  rd_stats_dst->skip &= rd_stats_src->skip;
+  rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
 #if CONFIG_RD_DEBUG
   // This may run into problems when monochrome video is
   // encoded, as there will only be 1 plane
   for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
-    {
-      // TODO(angiebird): optimize this part
-      int r, c;
-      int ref_txb_coeff_cost = 0;
-      for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
-        for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
-          rd_stats_dst->txb_coeff_cost_map[plane][r][c] +=
-              rd_stats_src->txb_coeff_cost_map[plane][r][c];
-          ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c];
-        }
-      assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]);
-    }
   }
 #endif
 }
 
 static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
-                                           int rate, int skip, int64_t sse,
+                                           int rate, int skip_txfm, int64_t sse,
                                            int zero_rate) {
   assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
   rd_stats->rate += rate;
   if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate;
   rd_stats->dist += dist;
-  rd_stats->skip &= skip;
+  rd_stats->skip_txfm &= skip_txfm;
   rd_stats->sse += sse;
 }
 
@@ -225,14 +192,25 @@ struct TileDataEnc;
 struct AV1_COMP;
 struct macroblock;
 
-int av1_compute_rd_mult_based_on_qindex(const struct AV1_COMP *cpi, int qindex);
+/*!\brief Compute rdmult based on q index and frame update type
+ *
+ * \param[in]       bit_depth       bit depth
+ * \param[in]       update_type     frame update type
+ * \param[in]       qindex          q index
+ *
+ * \return rdmult
+ */
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+                                        FRAME_UPDATE_TYPE update_type,
+                                        int qindex);
 
 int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
 
 void av1_initialize_rd_consts(struct AV1_COMP *cpi);
 
-void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                              int qindex);
+// Sets the multiplier to convert mv cost to l1 error during motion search.
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
+                         int qindex);
 
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);
@@ -243,7 +221,7 @@ void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
                           double yl, double *rate_f, double *distbysse_f);
 
 int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
-                            InterpFilter interp_filter);
+                            InterpFilter interp_filter, int dual_filter);
 
 YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
                                              int ref_frame);
@@ -261,7 +239,11 @@ void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
 
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*fact)[MAX_MODES], int rd_thresh,
-                               BLOCK_SIZE bsize, THR_MODES best_mode_index);
+                               BLOCK_SIZE bsize, THR_MODES best_mode_index,
+                               THR_MODES inter_mode_start,
+                               THR_MODES inter_mode_end,
+                               THR_MODES intra_mode_start,
+                               THR_MODES intra_mode_end);
 
 static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
@@ -271,73 +253,57 @@ static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
   }
 }
 
-static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh,
                                       int thresh_fact) {
-  return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+  return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }
 
 void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
                  BLOCK_SIZE block_size);
 
-static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
-  x->errorperbit = rdmult >> RD_EPB_SHIFT;
-  x->errorperbit += (x->errorperbit == 0);
+// Sets the multiplier to convert mv cost to l2 error during motion search.
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) {
+  *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1);
 }
 
 // Get the threshold for R-D optimization of coefficients depending upon mode
 // decision/winner mode processing
-static INLINE uint32_t get_rd_opt_coeff_thresh(
-    const uint32_t *const coeff_opt_dist_threshold,
-    int enable_winner_mode_for_coeff_opt, int is_winner_mode) {
-  // Default initialization of threshold
-  uint32_t coeff_opt_thresh = coeff_opt_dist_threshold[DEFAULT_EVAL];
+static INLINE void get_rd_opt_coeff_thresh(
+    const uint32_t (*const coeff_opt_threshold)[2],
+    TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt,
+    int is_winner_mode) {
+  if (!enable_winner_mode_for_coeff_opt) {
+    // Default initialization of threshold
+    txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0];
+    txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1];
+    return;
+  }
   // TODO(any): Experiment with coeff_opt_dist_threshold values when
   // enable_winner_mode_for_coeff_opt is ON
   // TODO(any): Skip the winner mode processing for blocks with lower residual
   // energy as R-D optimization of coefficients would have been enabled during
   // mode decision
-  if (enable_winner_mode_for_coeff_opt) {
-    // Use conservative threshold during mode decision and perform R-D
-    // optimization of coeffs always for winner modes
-    if (is_winner_mode)
-      coeff_opt_thresh = coeff_opt_dist_threshold[WINNER_MODE_EVAL];
-    else
-      coeff_opt_thresh = coeff_opt_dist_threshold[MODE_EVAL];
-  }
-  return coeff_opt_thresh;
-}
 
-// Used to reset the state of tx/mb rd hash information
-static INLINE void reset_hash_records(MACROBLOCK *const x,
-                                      int use_inter_txb_hash) {
-  int32_t record_idx;
-
-  // Reset the state for use_inter_txb_hash
-  if (use_inter_txb_hash) {
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
-      x->txb_rd_record_8X8[record_idx].num =
-          x->txb_rd_record_8X8[record_idx].index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
-      x->txb_rd_record_16X16[record_idx].num =
-          x->txb_rd_record_16X16[record_idx].index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
-      x->txb_rd_record_32X32[record_idx].num =
-          x->txb_rd_record_32X32[record_idx].index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
-      x->txb_rd_record_64X64[record_idx].num =
-          x->txb_rd_record_64X64[record_idx].index_start = 0;
+  // Use conservative threshold during mode decision and perform R-D
+  // optimization of coeffs always for winner modes
+  if (is_winner_mode) {
+    txfm_params->coeff_opt_thresholds[0] =
+        coeff_opt_threshold[WINNER_MODE_EVAL][0];
+    txfm_params->coeff_opt_thresholds[1] =
+        coeff_opt_threshold[WINNER_MODE_EVAL][1];
+  } else {
+    txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0];
+    txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1];
   }
+}
 
-  // Reset the state for use_intra_txb_hash
-  x->txb_rd_record_intra.num = x->txb_rd_record_intra.index_start = 0;
+// Used to reset the state of mb rd hash information
+static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) {
+  if (!mb_rd_record) return;
 
   // Reset the state for use_mb_rd_hash
-  x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+  mb_rd_record->num = mb_rd_record->index_start = 0;
 }
 
 void av1_setup_pred_block(const MACROBLOCKD *xd,
@@ -350,18 +316,33 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth);
 
-void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
                          FRAME_CONTEXT *fc);
 
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
                           const int num_planes);
 
-void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
-                       MACROBLOCK *x);
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+                       MvCosts *mv_costs);
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs);
 
 int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
 
-int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta);
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta);
+
+/*!\brief Adjust current superblock's q_index based on delta q resolution
+ *
+ * \param[in]       delta_q_res       delta q resolution
+ * \param[in]       prev_qindex       previous superblock's q index
+ * \param[in]       curr_qindex       current superblock's q index
+ *
+ * \return the current superblock's adjusted q_index
+ */
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+                                  int curr_qindex);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/encoder/rdopt.c b/media/libaom/src/av1/encoder/rdopt.c
index 02afcd1ff0..4ec7f77d66 100644
--- a/media/libaom/src/av1/encoder/rdopt.c
+++ b/media/libaom/src/av1/encoder/rdopt.c
@@ -22,10 +22,10 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
+#include "av1/common/blockd.h"
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
@@ -53,6 +53,7 @@
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/interp_search.h"
 #include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ml.h"
 #include "av1/encoder/mode_prune_model_weights.h"
@@ -68,6 +69,7 @@
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/tx_search.h"
+#include "av1/encoder/var_based_part.h"
 
 #define LAST_NEW_MV_INDEX 6
 
@@ -152,132 +154,132 @@ static const THR_MODES av1_default_mode_order[MAX_MODES] = {
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
 
+  THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_GLOBAL_GLOBALLB,
+
   THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEWLA,
   THR_COMP_NEW_NEARESTLA,
   THR_COMP_NEAREST_NEWLA,
   THR_COMP_NEW_NEARLA,
   THR_COMP_NEAR_NEWLA,
-  THR_COMP_NEW_NEWLA,
   THR_COMP_GLOBAL_GLOBALLA,
 
   THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEWL2A,
   THR_COMP_NEW_NEARESTL2A,
   THR_COMP_NEAREST_NEWL2A,
   THR_COMP_NEW_NEARL2A,
   THR_COMP_NEAR_NEWL2A,
-  THR_COMP_NEW_NEWL2A,
   THR_COMP_GLOBAL_GLOBALL2A,
 
   THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEWL3A,
   THR_COMP_NEW_NEARESTL3A,
   THR_COMP_NEAREST_NEWL3A,
   THR_COMP_NEW_NEARL3A,
   THR_COMP_NEAR_NEWL3A,
-  THR_COMP_NEW_NEWL3A,
   THR_COMP_GLOBAL_GLOBALL3A,
 
   THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEWGA,
   THR_COMP_NEW_NEARESTGA,
   THR_COMP_NEAREST_NEWGA,
   THR_COMP_NEW_NEARGA,
   THR_COMP_NEAR_NEWGA,
-  THR_COMP_NEW_NEWGA,
   THR_COMP_GLOBAL_GLOBALGA,
 
-  THR_COMP_NEAR_NEARLB,
-  THR_COMP_NEW_NEARESTLB,
-  THR_COMP_NEAREST_NEWLB,
-  THR_COMP_NEW_NEARLB,
-  THR_COMP_NEAR_NEWLB,
-  THR_COMP_NEW_NEWLB,
-  THR_COMP_GLOBAL_GLOBALLB,
-
   THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEWL2B,
   THR_COMP_NEW_NEARESTL2B,
   THR_COMP_NEAREST_NEWL2B,
   THR_COMP_NEW_NEARL2B,
   THR_COMP_NEAR_NEWL2B,
-  THR_COMP_NEW_NEWL2B,
   THR_COMP_GLOBAL_GLOBALL2B,
 
   THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEWL3B,
   THR_COMP_NEW_NEARESTL3B,
   THR_COMP_NEAREST_NEWL3B,
   THR_COMP_NEW_NEARL3B,
   THR_COMP_NEAR_NEWL3B,
-  THR_COMP_NEW_NEWL3B,
   THR_COMP_GLOBAL_GLOBALL3B,
 
   THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEWGB,
   THR_COMP_NEW_NEARESTGB,
   THR_COMP_NEAREST_NEWGB,
   THR_COMP_NEW_NEARGB,
   THR_COMP_NEAR_NEWGB,
-  THR_COMP_NEW_NEWGB,
   THR_COMP_GLOBAL_GLOBALGB,
 
   THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEWLA2,
   THR_COMP_NEW_NEARESTLA2,
   THR_COMP_NEAREST_NEWLA2,
   THR_COMP_NEW_NEARLA2,
   THR_COMP_NEAR_NEWLA2,
-  THR_COMP_NEW_NEWLA2,
   THR_COMP_GLOBAL_GLOBALLA2,
 
   THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEWL2A2,
   THR_COMP_NEW_NEARESTL2A2,
   THR_COMP_NEAREST_NEWL2A2,
   THR_COMP_NEW_NEARL2A2,
   THR_COMP_NEAR_NEWL2A2,
-  THR_COMP_NEW_NEWL2A2,
   THR_COMP_GLOBAL_GLOBALL2A2,
 
   THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEWL3A2,
   THR_COMP_NEW_NEARESTL3A2,
   THR_COMP_NEAREST_NEWL3A2,
   THR_COMP_NEW_NEARL3A2,
   THR_COMP_NEAR_NEWL3A2,
-  THR_COMP_NEW_NEWL3A2,
   THR_COMP_GLOBAL_GLOBALL3A2,
 
   THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEWGA2,
   THR_COMP_NEW_NEARESTGA2,
   THR_COMP_NEAREST_NEWGA2,
   THR_COMP_NEW_NEARGA2,
   THR_COMP_NEAR_NEWGA2,
-  THR_COMP_NEW_NEWGA2,
   THR_COMP_GLOBAL_GLOBALGA2,
 
   THR_COMP_NEAR_NEARLL2,
+  THR_COMP_NEW_NEWLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
   THR_COMP_NEW_NEARLL2,
   THR_COMP_NEAR_NEWLL2,
-  THR_COMP_NEW_NEWLL2,
   THR_COMP_GLOBAL_GLOBALLL2,
 
   THR_COMP_NEAR_NEARLL3,
+  THR_COMP_NEW_NEWLL3,
   THR_COMP_NEW_NEARESTLL3,
   THR_COMP_NEAREST_NEWLL3,
   THR_COMP_NEW_NEARLL3,
   THR_COMP_NEAR_NEWLL3,
-  THR_COMP_NEW_NEWLL3,
   THR_COMP_GLOBAL_GLOBALLL3,
 
   THR_COMP_NEAR_NEARLG,
+  THR_COMP_NEW_NEWLG,
   THR_COMP_NEW_NEARESTLG,
   THR_COMP_NEAREST_NEWLG,
   THR_COMP_NEW_NEARLG,
   THR_COMP_NEAR_NEWLG,
-  THR_COMP_NEW_NEWLG,
   THR_COMP_GLOBAL_GLOBALLG,
 
   THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
   THR_COMP_NEW_NEARESTBA,
   THR_COMP_NEAREST_NEWBA,
   THR_COMP_NEW_NEARBA,
   THR_COMP_NEAR_NEWBA,
-  THR_COMP_NEW_NEWBA,
   THR_COMP_GLOBAL_GLOBALBA,
 
   THR_DC,
@@ -295,23 +297,7 @@ static const THR_MODES av1_default_mode_order[MAX_MODES] = {
   THR_D45_PRED,
 };
 
-static int find_last_single_ref_mode_idx(const THR_MODES *mode_order) {
-  uint8_t mode_found[NUM_SINGLE_REF_MODES];
-  av1_zero(mode_found);
-  int num_single_ref_modes_left = NUM_SINGLE_REF_MODES;
-
-  for (int idx = 0; idx < MAX_MODES; idx++) {
-    const THR_MODES curr_mode = mode_order[idx];
-    if (curr_mode < SINGLE_REF_MODE_END) {
-      num_single_ref_modes_left--;
-    }
-    if (!num_single_ref_modes_left) {
-      return idx;
-    }
-  }
-  return -1;
-}
-
+/*!\cond */
 typedef struct SingleInterModeState {
   int64_t rd;
   MV_REFERENCE_FRAME ref_frame;
@@ -333,7 +319,11 @@ typedef struct InterModeSearchState {
   int64_t mode_threshold[MAX_MODES];
   int64_t best_intra_rd;
   unsigned int best_pred_sse;
-  int64_t best_pred_diff[REFERENCE_MODES];
+
+  /*!
+   * \brief Keep track of best intra rd for use in compound mode.
+   */
+  int64_t best_pred_rd[REFERENCE_MODES];
   // Save a set of single_newmv for each checked ref_mv.
   int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
   int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES];
@@ -341,6 +331,8 @@ typedef struct InterModeSearchState {
   int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
   // The rd of simple translation in single inter modes
   int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+  int64_t best_single_rd[REF_FRAMES];
+  PREDICTION_MODE best_single_mode[REF_FRAMES];
 
   // Single search results by [directions][modes][reference frames]
   SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
@@ -350,7 +342,9 @@ typedef struct InterModeSearchState {
   int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
   MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
   IntraModeSearchState intra_search_state;
+  RD_STATS best_y_rdcost;
 } InterModeSearchState;
+/*!\endcond */
 
 void av1_inter_mode_data_init(TileDataEnc *tile_data) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
@@ -368,7 +362,6 @@ void av1_inter_mode_data_init(TileDataEnc *tile_data) {
 static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                              int64_t sse, int *est_residue_cost,
                              int64_t *est_dist) {
-  aom_clear_system_state();
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   if (md->ready) {
     if (sse < md->dist_mean) {
@@ -401,7 +394,6 @@ static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
 }
 
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
-  aom_clear_system_state();
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const int block_idx = inter_mode_data_block_idx(bsize);
     InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
@@ -459,7 +451,6 @@ static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data,
   if (block_idx == -1) return;
   InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
   if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
-    aom_clear_system_state();
     const double ld = (sse - dist) * 1. / residue_cost;
     ++rd_model->num;
     rd_model->dist_sum += dist;
@@ -490,7 +481,14 @@ static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info,
 
 static int compare_rd_idx_pair(const void *a, const void *b) {
   if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) {
-    return 0;
+    // To avoid inconsistency in qsort() ordering when two elements are equal,
+    // using idx as tie breaker. Refer aomedia:2928
+    if (((RdIdxPair *)a)->idx == ((RdIdxPair *)b)->idx)
+      return 0;
+    else if (((RdIdxPair *)a)->idx > ((RdIdxPair *)b)->idx)
+      return 1;
+    else
+      return -1;
   } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) {
     return 1;
   } else {
@@ -621,12 +619,12 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
     if (plane && !xd->is_chroma_ref) break;
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                               pd->subsampling_y);
+    const BLOCK_SIZE bs =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
+    cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, &sse);
     total_sse += sse;
     if (!plane && sse_y) *sse_y = sse;
   }
@@ -701,10 +699,10 @@ static int conditional_skipintra(PREDICTION_MODE mode,
   return 0;
 }
 
-static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
                        int16_t mode_context) {
   if (is_inter_compound_mode(mode)) {
-    return x
+    return mode_costs
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
   }
 
@@ -714,19 +712,19 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
   assert(is_inter_mode(mode));
 
   if (mode == NEWMV) {
-    mode_cost = x->newmv_mode_cost[mode_ctx][0];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
     return mode_cost;
   } else {
-    mode_cost = x->newmv_mode_cost[mode_ctx][1];
+    mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
     mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
     if (mode == GLOBALMV) {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][0];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
-      mode_cost += x->zeromv_mode_cost[mode_ctx][1];
+      mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-      mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
@@ -739,7 +737,7 @@ static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
 }
 
 static AOM_INLINE void estimate_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
     int segment_id, unsigned int *ref_costs_single,
     unsigned int (*ref_costs_comp)[REF_FRAMES]) {
   int seg_ref_active =
@@ -752,8 +750,9 @@ static AOM_INLINE void estimate_ref_frame_costs(
              REF_FRAMES * sizeof((*ref_costs_comp)[0]));
   } else {
     int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
-    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+    ref_costs_single[INTRA_FRAME] =
+        mode_costs->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       ref_costs_single[i] = base_cost;
@@ -768,38 +767,41 @@ static AOM_INLINE void estimate_ref_frame_costs(
     // Determine cost of a single ref frame, where frame types are represented
     // by a tree:
     // Level 0: add cost whether this ref is a forward or backward ref
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
 
     // Level 1: if this ref is forward ref,
     // add cost whether it is last/last2 or last3/golden
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
 
     // Level 1: if this ref is backward ref
     // then add cost whether this ref is altref or backward ref
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
-    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1];
 
     // Level 2: further add cost whether this ref is last or last2
-    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
-    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0];
+    ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1];
 
     // Level 2: last3 or golden
-    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
-    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+    ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1];
 
     // Level 2: bwdref or altref2
-    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
-    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+    ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0];
+    ref_costs_single[ALTREF2_FRAME] +=
+        mode_costs->single_ref_cost[ctx_p6][5][1];
 
     if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
       // Similar to single ref, determine cost of compound ref frames.
@@ -815,34 +817,42 @@ static AOM_INLINE void estimate_ref_frame_costs(
 
       ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
           ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+              base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
       ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
       ref_bicomp_costs[ALTREF_FRAME] = 0;
 
       // cost of first ref frame
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
-
-      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
-
-      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
-      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+      ref_bicomp_costs[LAST_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[LAST2_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+      ref_bicomp_costs[LAST3_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+      ref_bicomp_costs[GOLDEN_FRAME] +=
+          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
 
       // cost of second ref frame
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
       ref_bicomp_costs[ALTREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
 
       ref_bicomp_costs[BWDREF_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
       ref_bicomp_costs[ALTREF2_FRAME] +=
-          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
 
       // cost: if one ref frame is forward ref, the other ref is backward ref
       int ref0, ref1;
@@ -858,22 +868,22 @@ static AOM_INLINE void estimate_ref_frame_costs(
       const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
       const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
       ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
     } else {
       int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
@@ -894,22 +904,19 @@ static AOM_INLINE void store_coding_context(
 #else
     MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
 #endif  // CONFIG_INTERNAL_STATS
-    int64_t comp_pred_diff[REFERENCE_MODES], int skippable) {
+    int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  ctx->rd_stats.skip = x->force_skip;
+  ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm;
   ctx->skippable = skippable;
 #if CONFIG_INTERNAL_STATS
   ctx->best_mode_index = mode_index;
 #endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
-  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
-  ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
-  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 }
 
 static AOM_INLINE void setup_buffer_ref_mvs_inter(
@@ -921,7 +928,7 @@ static AOM_INLINE void setup_buffer_ref_mvs_inter(
       av1_get_scaled_ref_frame(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const struct scale_factors *const sf =
       get_ref_scale_factors_const(cm, ref_frame);
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
@@ -979,7 +986,7 @@ static int skip_repeated_mv(const AV1_COMMON *const cm,
                             InterModeSearchState *search_state) {
   const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
   const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
   PREDICTION_MODE compare_mode = MB_MODE_COUNT;
   if (!is_comp_pred) {
@@ -1012,8 +1019,9 @@ static int skip_repeated_mv(const AV1_COMMON *const cm,
           INT64_MAX) {
         const int16_t mode_ctx =
             av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
-        const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
-        const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+        const int compare_cost =
+            cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx);
+        const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx);
 
         // Only skip if the mode cost is larger than compare mode cost
         if (this_cost > compare_cost) {
@@ -1055,8 +1063,8 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                             const BLOCK_SIZE bsize, int_mv *cur_mv,
                             int *const rate_mv, HandleInterModeArgs *const args,
                             inter_mode_info *mode_info) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int refs[2] = { mbmi->ref_frame[0],
@@ -1066,7 +1074,6 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
   if (is_comp_pred) {
     const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]];
     const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]];
-
     if (this_mode == NEW_NEWMV) {
       if (valid_mv0) {
         cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
@@ -1076,55 +1083,32 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
         cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
         clamp_mv_in_range(x, &cur_mv[1], 1);
       }
-
-      // aomenc1
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv0 || !valid_mv1) {
-        av1_joint_motion_search(cpi, x, bsize, cur_mv, NULL, 0, rate_mv);
-      } else {
-        *rate_mv = 0;
-        for (int i = 0; i < 2; ++i) {
-          const int_mv ref_mv = av1_get_ref_mv(x, i);
-          *rate_mv +=
-              av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                              x->mv_cost_stack, MV_COST_WEIGHT);
-        }
+      *rate_mv = 0;
+      for (int i = 0; i < 2; ++i) {
+        const int_mv ref_mv = av1_get_ref_mv(x, i);
+        *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv,
+                                    x->mv_costs->nmv_joint_cost,
+                                    x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
       }
     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
       if (valid_mv1) {
         cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
         clamp_mv_in_range(x, &cur_mv[1], 1);
       }
-
-      // aomenc2
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv1) {
-        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
-                                                     NULL, 0, rate_mv, 1);
-      } else {
-        const int_mv ref_mv = av1_get_ref_mv(x, 1);
-        *rate_mv =
-            av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                            x->mv_cost_stack, MV_COST_WEIGHT);
-      }
+      const int_mv ref_mv = av1_get_ref_mv(x, 1);
+      *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+                                 x->mv_costs->nmv_joint_cost,
+                                 x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
     } else {
       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
       if (valid_mv0) {
         cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
         clamp_mv_in_range(x, &cur_mv[0], 0);
       }
-
-      // aomenc3
-      if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize ||
-          !valid_mv0) {
-        av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv,
-                                                     NULL, 0, rate_mv, 0);
-      } else {
-        const int_mv ref_mv = av1_get_ref_mv(x, 0);
-        *rate_mv =
-            av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                            x->mv_cost_stack, MV_COST_WEIGHT);
-      }
+      const int_mv ref_mv = av1_get_ref_mv(x, 0);
+      *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+                                 x->mv_costs->nmv_joint_cost,
+                                 x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
     }
   } else {
     // Single ref case.
@@ -1138,7 +1122,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
       MV prev_ref_mv[2] = { { 0 } };
       for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
         prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame,
-                                                     idx, x->mbmi_ext)
+                                                     idx, &x->mbmi_ext)
                                .as_mv;
         const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row),
                                        abs(ref_mv.col - prev_ref_mv[idx].col));
@@ -1165,47 +1149,25 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
 
     int_mv best_mv;
     av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
-                             mode_info, &best_mv);
+                             mode_info, &best_mv, args);
     if (best_mv.as_int == INVALID_MV) return INT64_MAX;
 
     args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
     args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
     args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
     cur_mv[0].as_int = best_mv.as_int;
+
+    // Return after single_newmv is set.
+    if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX;
   }
 
   return 0;
 }
 
-// If number of valid neighbours is 1,
-// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
-// one neighbouring MV)
-// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to
-// a different interpolation filter being used. However the quality
-// gains (due to the same) may not be much
-// For above 2 cases warp evaluation is skipped
-
-static int check_if_optimal_warp(const AV1_COMP *cpi,
-                                 WarpedMotionParams *wm_params,
-                                 int num_proj_ref) {
-  int is_valid_warp = 1;
-  if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
-    TransformationType wmtype = get_wmtype(wm_params);
-    if (num_proj_ref == 1) {
-      if (wmtype != ROTZOOM) is_valid_warp = 0;
-    } else {
-      if (wmtype < ROTZOOM) is_valid_warp = 0;
-    }
-  }
-  return is_valid_warp;
-}
-
-static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
-                                               int *mode_index_start,
-                                               int *mode_index_end,
-                                               int last_motion_mode_allowed,
-                                               int interintra_allowed,
-                                               int eval_motion_mode) {
+static INLINE void update_mode_start_end_index(
+    const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+    int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+    int interintra_allowed, int eval_motion_mode) {
   *mode_index_start = (int)SIMPLE_TRANSLATION;
   *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
   if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
@@ -1217,93 +1179,194 @@ static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
       *mode_index_start = 1;
     }
   }
+  if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16)
+    *mode_index_end = SIMPLE_TRANSLATION;
 }
 
-// TODO(afergs): Refactor the MBMI references in here - there's four
-// TODO(afergs): Refactor optional args - add them to a struct or remove
+/*!\brief AV1 motion mode search
+ *
+ * \ingroup inter_mode_search
+ * Function to search over and determine the motion mode. It will update
+ * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or
+ * WARPED_CAUSAL and determine any necessary side information for the selected
+ * motion mode. It will also perform the full transform search, unless the
+ * input parameter do_tx_search indicates to do an estimation of the RD rather
+ * than an RD corresponding to a full transform search. It will return the
+ * RD for the final motion_mode.
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] rd_stats_y        Struct to keep track of the RD information
+ *                                  for only the Y plane.
+ * \param[in,out] rd_stats_uv       Struct to keep track of the RD information
+ *                                  for only the UV planes.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in,out] ref_skip_rd       A length 2 array, where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in,out] rate_mv           The rate associated with the motion vectors.
+ *                                  This will be modified if a motion search is
+ *                                  done in the motion mode search.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in,out] best_est_rd       Estimated RD for motion mode search if
+ *                                  do_tx_search (see below) is 0.
+ * \param[in]     do_tx_search      Parameter to indicate whether or not to do
+ *                                  a full transform search. This will compute
+ *                                  an estimated RD for the modes without the
+ *                                  transform search and later perform the full
+ *                                  transform search on the best candidates.
+ * \param[in]     inter_modes_info  InterModesInfo struct to hold inter mode
+ *                                  information to perform a full transform
+ *                                  search only on winning candidates searched
+ *                                  with an estimate for transform coding RD.
+ * \param[in]     eval_motion_mode  Boolean whether or not to evaluate motion
+ *                                  motion modes other than SIMPLE_TRANSLATION.
+ * \param[out]    yrd               Stores the rdcost corresponding to encoding
+ *                                  the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
 static int64_t motion_mode_rd(
     const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *const args,
-    int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv,
-    const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search,
-    InterModesInfo *inter_modes_info, int eval_motion_mode) {
+    RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+    int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+    int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+    int eval_motion_mode, int64_t *yrd) {
   const AV1_COMMON *const cm = &cpi->common;
   const FeatureFlags *const features = &cm->features;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
   const int rate2_nocoeff = rd_stats->rate;
-  int best_xskip = 0, best_disable_skip = 0;
+  int best_xskip_txfm = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   const int rate_mv0 = *rate_mv;
-  const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+  const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
                                  is_interintra_allowed(mbmi) &&
                                  mbmi->compound_idx;
-  int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mbmi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
 
   assert(mbmi->ref_frame[1] != INTRA_FRAME);
   const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
-  (void)tile_data;
   av1_invalid_rd_stats(&best_rd_stats);
-  aom_clear_system_state();
   mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  *yrd = INT64_MAX;
   if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
     last_motion_mode_allowed = motion_mode_allowed(
         xd->global_motion, xd, mbmi, features->allow_warped_motion);
   }
 
   if (last_motion_mode_allowed == WARPED_CAUSAL) {
-    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts0, pts_inref0);
+    // Collect projection samples used in least squares approximation of
+    // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+    if (warp_sample_info->num < 0) {
+      warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+    }
+    mbmi->num_proj_ref = warp_sample_info->num;
   }
   const int total_samples = mbmi->num_proj_ref;
   if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
     last_motion_mode_allowed = OBMC_CAUSAL;
   }
 
   const MB_MODE_INFO base_mbmi = *mbmi;
   MB_MODE_INFO best_mbmi;
-  SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
   const int interp_filter = features->interp_filter;
   const int switchable_rate =
-      av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter)
-                               : 0;
+      av1_is_interp_needed(xd)
+          ? av1_get_switchable_rate(x, xd, interp_filter,
+                                    cm->seq_params->enable_dual_filter)
+          : 0;
   int64_t best_rd = INT64_MAX;
   int best_rate_mv = rate_mv0;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   int mode_index_start, mode_index_end;
-  update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end,
+  // Modify the start and end index according to speed features. For example,
+  // if SIMPLE_TRANSLATION has already been searched according to
+  // the motion_mode_for_winner_cand speed feature, update the mode_index_start
+  // to avoid searching it again.
+  update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
                               last_motion_mode_allowed, interintra_allowed,
                               eval_motion_mode);
+  // Main function loop. This loops over all of the possible motion modes and
+  // computes RD to determine the best one. This process includes computing
+  // any necessary side information for the motion mode and performing the
+  // transform search.
   for (int mode_index = mode_index_start; mode_index <= mode_index_end;
        mode_index++) {
     if (args->skip_motion_mode && mode_index) continue;
-    if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-        args->single_ref_first_pass && mode_index)
-      break;
     int tmp_rate2 = rate2_nocoeff;
     const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
     int tmp_rate_mv = rate_mv0;
 
     *mbmi = base_mbmi;
     if (is_interintra_mode) {
+      // Only use SIMPLE_TRANSLATION for interintra
       mbmi->motion_mode = SIMPLE_TRANSLATION;
     } else {
       mbmi->motion_mode = (MOTION_MODE)mode_index;
       assert(mbmi->ref_frame[1] != INTRA_FRAME);
     }
 
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-    const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
-                           cpi->sf.inter_sf.prune_obmc_prob_thresh;
-    if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.inter_sf.disable_obmc ||
-         cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) &&
+    // Do not search OBMC if the probability of selecting it is below a
+    // predetermined threshold for this update_type and block size.
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    int use_actual_frame_probs = 1;
+    int prune_obmc;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+    use_actual_frame_probs =
+        (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+    if (!use_actual_frame_probs) {
+      prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+                   cpi->sf.inter_sf.prune_obmc_prob_thresh;
+    }
+#endif
+    if (use_actual_frame_probs) {
+      prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+                   cpi->sf.inter_sf.prune_obmc_prob_thresh;
+    }
+    if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) &&
         mbmi->motion_mode == OBMC_CAUSAL)
       continue;
 
@@ -1311,45 +1374,27 @@ static int64_t motion_mode_rd(
       // SIMPLE_TRANSLATION mode: no need to recalculate.
       // The prediction is calculated before motion_mode_rd() is called in
       // handle_inter_mode()
-      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-          !is_comp_pred) {
-        if (args->single_ref_first_pass == 0) {
-          if (simple_states->early_skipped) {
-            assert(simple_states->rd_stats.rdcost == INT64_MAX);
-            return INT64_MAX;
-          }
-          if (simple_states->rd_stats.rdcost != INT64_MAX) {
-            best_rd = simple_states->rd_stats.rdcost;
-            best_rd_stats = simple_states->rd_stats;
-            best_rd_stats_y = simple_states->rd_stats_y;
-            best_rd_stats_uv = simple_states->rd_stats_uv;
-            memcpy(best_blk_skip, simple_states->blk_skip,
-                   sizeof(x->blk_skip[0]) * xd->height * xd->width);
-            av1_copy_array(best_tx_type_map, simple_states->tx_type_map,
-                           xd->height * xd->width);
-            best_xskip = simple_states->skip;
-            best_disable_skip = simple_states->disable_skip;
-            best_mbmi = *mbmi;
-          }
-          continue;
-        }
-        simple_states->early_skipped = 0;
-      }
     } else if (mbmi->motion_mode == OBMC_CAUSAL) {
       const uint32_t cur_mv = mbmi->mv[0].as_int;
+      // OBMC_CAUSAL not allowed for compound prediction
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
         av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
-                                 &mbmi->mv[0]);
+                                 &mbmi->mv[0], NULL);
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
       if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
+        // Build the predictor according to the current motion vector if it has
+        // not already been built
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
                                       0, av1_num_planes(cm) - 1);
       }
+      // Build the inter predictor by blending the predictor corresponding to
+      // this MV, and the neighboring blocks using the OBMC model
       av1_build_obmc_inter_prediction(
           cm, xd, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
+#if !CONFIG_REALTIME_ONLY
     } else if (mbmi->motion_mode == WARPED_CAUSAL) {
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
       mbmi->motion_mode = WARPED_CAUSAL;
@@ -1365,21 +1410,18 @@ static int64_t motion_mode_rd(
             &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
       }
 
+      // Compute the warped motion parameters with a least squares fit
+      //  using the collected samples
       if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
                                mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
                                &mbmi->wm_params, mi_row, mi_col)) {
-        // Refine MV for NEWMV mode
         assert(!is_comp_pred);
         if (have_newmv_in_inter_mode(this_mode)) {
+          // Refine MV for NEWMV mode
           const int_mv mv0 = mbmi->mv[0];
           const WarpedMotionParams wm_params0 = mbmi->wm_params;
           const int num_proj_ref0 = mbmi->num_proj_ref;
 
-          if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
-            TransformationType wmtype = get_wmtype(&mbmi->wm_params);
-            if (wmtype < ROTZOOM) continue;
-          }
-
           const int_mv ref_mv = av1_get_ref_mv(x, 0);
           SUBPEL_MOTION_SEARCH_PARAMS ms_params;
           av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
@@ -1389,14 +1431,11 @@ static int64_t motion_mode_rd(
           av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
                                total_samples);
 
-          // Keep the refined MV and WM parameters.
           if (mv0.as_int != mbmi->mv[0].as_int) {
-            tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
-                                          x->nmv_vec_cost, x->mv_cost_stack,
-                                          MV_COST_WEIGHT);
-            if (cpi->sf.mv_sf.adaptive_motion_search) {
-              x->pred_mv[mbmi->ref_frame[0]] = mbmi->mv[0].as_mv;
-            }
+            // Keep the refined MV and WM parameters.
+            tmp_rate_mv = av1_mv_bit_cost(
+                &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+                x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
             tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
           } else {
             // Restore the old MV and WM parameters.
@@ -1404,16 +1443,15 @@ static int64_t motion_mode_rd(
             mbmi->wm_params = wm_params0;
             mbmi->num_proj_ref = num_proj_ref0;
           }
-        } else {
-          if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref))
-            continue;
         }
 
+        // Build the warped predictor
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                       av1_num_planes(cm) - 1);
       } else {
         continue;
       }
+#endif  // !CONFIG_REALTIME_ONLY
     } else if (is_interintra_mode) {
       const int ret =
           av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
@@ -1423,50 +1461,38 @@ static int64_t motion_mode_rd(
 
     // If we are searching newmv and the mv is the same as refmv, skip the
     // current mode
-    if (this_mode == NEW_NEWMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
-          mbmi->mv[1].as_int == ref_mv_1.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-      const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
-      if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
-        continue;
-      }
-    } else if (this_mode == NEWMV) {
-      const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
-      if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
-        continue;
-      }
-    }
+    if (!av1_check_newmv_joint_nonzero(cm, x)) continue;
 
-    x->force_skip = 0;
+    // Update rd_stats for the current motion mode
+    txfm_info->skip_txfm = 0;
     rd_stats->dist = 0;
     rd_stats->sse = 0;
-    rd_stats->skip = 1;
+    rd_stats->skip_txfm = 1;
     rd_stats->rate = tmp_rate2;
+    const ModeCosts *mode_costs = &x->mode_costs;
     if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
     if (interintra_allowed) {
-      rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
-                                          [mbmi->ref_frame[1] == INTRA_FRAME];
+      rd_stats->rate +=
+          mode_costs->interintra_cost[size_group_lookup[bsize]]
+                                     [mbmi->ref_frame[1] == INTRA_FRAME];
     }
     if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
         (mbmi->ref_frame[1] != INTRA_FRAME)) {
       if (last_motion_mode_allowed == WARPED_CAUSAL) {
-        rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
+        rd_stats->rate +=
+            mode_costs->motion_mode_cost[bsize][mbmi->motion_mode];
       } else {
-        rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
+        rd_stats->rate +=
+            mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode];
       }
     }
 
+    int64_t this_yrd = INT64_MAX;
+
     if (!do_tx_search) {
+      // Avoid doing a transform search here to speed up the overall mode
+      // search. It will be done later in the mode search if the current
+      // motion mode seems promising.
       int64_t curr_sse = -1;
       int64_t sse_y = -1;
       int est_residue_cost = 0;
@@ -1474,9 +1500,6 @@ static int64_t motion_mode_rd(
       int64_t est_rd = 0;
       if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
         curr_sse = get_sse(cpi, x, &sse_y);
-        // Scale luma SSE as per bit depth so as to be consistent with
-        // model_rd_sb_fn and compound type rd
-        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
                                                  &est_residue_cost, &est_dist);
         (void)has_est_rd;
@@ -1517,17 +1540,15 @@ static int64_t motion_mode_rd(
                               rd_stats->rdcost, rd_stats, rd_stats_y,
                               rd_stats_uv, mbmi);
       }
-      mbmi->skip = 0;
+      mbmi->skip_txfm = 0;
     } else {
+      // Perform full transform search
       int64_t skip_rd = INT64_MAX;
       int64_t skip_rdy = INT64_MAX;
       if (cpi->sf.inter_sf.txfm_rd_gate_level) {
         // Check if the mode is good enough based on skip RD
         int64_t sse_y = INT64_MAX;
         int64_t curr_sse = get_sse(cpi, x, &sse_y);
-        // Scale luma SSE as per bit depth so as to be consistent with
-        // model_rd_sb_fn and compound type rd
-        sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
         skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
         int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
@@ -1535,17 +1556,21 @@ static int64_t motion_mode_rd(
         if (!eval_txfm) continue;
       }
 
+      // Do transform search
+      const int mode_rate = rd_stats->rate;
       if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
                            rd_stats->rate, ref_best_rd)) {
         if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
-          if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-              !is_comp_pred) {
-            simple_states->early_skipped = 1;
-          }
           return INT64_MAX;
         }
         continue;
       }
+      const int skip_ctx = av1_get_skip_txfm_context(xd);
+      const int y_rate =
+          rd_stats->skip_txfm
+              ? x->mode_costs.skip_txfm_cost[skip_ctx][1]
+              : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]);
+      this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist);
 
       const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
       if (curr_rd < ref_best_rd) {
@@ -1553,13 +1578,11 @@ static int64_t motion_mode_rd(
         ref_skip_rd[0] = skip_rd;
         ref_skip_rd[1] = skip_rdy;
       }
-      *disable_skip = 0;
       if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
-        const int skip_ctx = av1_get_skip_context(xd);
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
-                             rd_stats->dist,
-                             rd_stats_y->rate + rd_stats_uv->rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
+        inter_mode_data_push(
+            tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist,
+            rd_stats_y->rate + rd_stats_uv->rate +
+                mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
       }
     }
 
@@ -1573,39 +1596,26 @@ static int64_t motion_mode_rd(
     const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
     if (mode_index == 0) {
       args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
-      if (!is_comp_pred) {
-        simple_states->rd_stats = *rd_stats;
-        simple_states->rd_stats.rdcost = tmp_rd;
-        simple_states->rd_stats_y = *rd_stats_y;
-        simple_states->rd_stats_uv = *rd_stats_uv;
-        memcpy(simple_states->blk_skip, x->blk_skip,
-               sizeof(x->blk_skip[0]) * xd->height * xd->width);
-        av1_copy_array(simple_states->tx_type_map, xd->tx_type_map,
-                       xd->height * xd->width);
-        simple_states->skip = mbmi->skip;
-        simple_states->disable_skip = *disable_skip;
-      }
     }
     if (mode_index == 0 || tmp_rd < best_rd) {
+      // Update best_rd data if this is the best motion mode so far
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rd_stats = *rd_stats;
       best_rd_stats_y = *rd_stats_y;
       best_rate_mv = tmp_rate_mv;
+      *yrd = this_yrd;
       if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      memcpy(best_blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
       av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
-      best_xskip = mbmi->skip;
-      best_disable_skip = *disable_skip;
-      // TODO(anyone): evaluate the quality and speed trade-off of the early
-      // termination logic below.
-      // if (best_xskip) break;
+      best_xskip_txfm = mbmi->skip_txfm;
     }
   }
+  // Update RD and mbmi stats for selected motion mode
   mbmi->ref_frame[1] = ref_frame_1;
   *rate_mv = best_rate_mv;
-  if (best_rd == INT64_MAX) {
+  if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) {
     av1_invalid_rd_stats(rd_stats);
     restore_dst_buf(xd, *orig_dst, num_planes);
     return INT64_MAX;
@@ -1614,11 +1624,10 @@ static int64_t motion_mode_rd(
   *rd_stats = best_rd_stats;
   *rd_stats_y = best_rd_stats_y;
   if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
-  memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  memcpy(txfm_info->blk_skip, best_blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
-  x->force_skip = best_xskip;
-  *disable_skip = best_disable_skip;
+  txfm_info->skip_txfm = best_xskip_txfm;
 
   restore_dst_buf(xd, *orig_dst, num_planes);
   return 0;
@@ -1647,11 +1656,12 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
 
     av1_subtract_plane(x, plane_bsize, plane);
     int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
+    sse >>= ((cpi->frame_info.bit_depth - 8) * 2);
     total_sse += sse;
   }
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
   rd_stats->dist = rd_stats->sse = total_sse;
-  rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1];
+  rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1];
   rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
 
   restore_dst_buf(xd, *orig_dst, num_planes);
@@ -1734,6 +1744,41 @@ static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
   return 1;
 }
 
+// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list
+// population
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+    const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+    const int8_t ref_frame_type) {
+  if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
+
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
+  const int ref_mv_count =
+      AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+
+  if (ref_mv_count == 0) return 0;
+  // If ref mv list has at least one nearest candidate do not prune NEARESTMV
+  if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;
+
+  // Count number of ref mvs populated from nearest candidates
+  int nearest_refmv_count = 0;
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
+    if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
+  }
+
+  // nearest_refmv_count indicates the closeness of block motion characteristics
+  // with respect to its spatial neighbor. Smaller value of nearest_refmv_count
+  // w.r.t to ref_mv_count means less correlation with its spatial neighbors.
+  // Hence less possibility for NEARESTMV and NEARMV modes becoming the best
+  // mode since these modes work well for blocks that shares similar motion
+  // characteristics with its neighbor. Thus, NEARMV mode is pruned when
+  // nearest_refmv_count is relatively smaller than ref_mv_count and NEARESTMV
+  // mode is pruned if none of the ref mvs are populated from nearest candidate.
+  const int prune_thresh = 1 + (ref_mv_count >= 2);
+  if (nearest_refmv_count < prune_thresh) return 1;
+  return 0;
+}
+
 // This function update the non-new mv for the current prediction mode
 static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
                                const AV1_COMMON *cm, const MACROBLOCK *x,
@@ -1747,15 +1792,15 @@ static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
     int_mv this_mv;
     this_mv.as_int = INVALID_MV;
     ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
-                      skip_repeated_ref_mv, mbmi->ref_frame, x->mbmi_ext);
+                      skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
     if (!ret) return 0;
     const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
     if (single_mode == NEWMV) {
       const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
       cur_mv[i] =
-          (i == 0) ? x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+          (i == 0) ? x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
                          .this_mv
-                   : x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                   : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
                          .comp_mv;
     } else {
       ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
@@ -1810,7 +1855,7 @@ static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args,
 static int get_drl_refmv_count(const MACROBLOCK *const x,
                                const MV_REFERENCE_FRAME *ref_frame,
                                PREDICTION_MODE mode) {
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
   const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
   const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
@@ -1823,15 +1868,32 @@ static int get_drl_refmv_count(const MACROBLOCK *const x,
   return ref_set;
 }
 
+// Checks if particular ref_mv_idx should be pruned.
+static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes,
+                                         const int qindex,
+                                         const int ref_mv_idx) {
+  if (reduce_inter_modes >= 3) return 1;
+  // Q-index logic based pruning is enabled only for
+  // reduce_inter_modes = 2.
+  assert(reduce_inter_modes == 2);
+  // When reduce_inter_modes=2, pruning happens as below based on q index.
+  // For q index range between 0 and 85: prune if ref_mv_idx >= 1.
+  // For q index range between 86 and 170: prune if ref_mv_idx == 2.
+  // For q index range between 171 and 255: no pruning.
+  const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1;
+  return (ref_mv_idx >= min_prune_ref_mv_idx);
+}
+
 // Whether this reference motion vector can be skipped, based on initial
 // heuristics.
-static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                      const HandleInterModeArgs *const args,
-                                      int64_t ref_best_rd, int ref_mv_idx) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
+static bool ref_mv_idx_early_breakout(
+    const SPEED_FEATURES *const sf,
+    const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x,
+    const HandleInterModeArgs *const args, int64_t ref_best_rd,
+    int ref_mv_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   const int is_comp_pred = has_second_ref(mbmi);
   if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
@@ -1848,29 +1910,27 @@ static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x,
     // TODO(any): Experiment with reduce_inter_modes for compound prediction
     if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
         have_newmv_in_inter_mode(mbmi->mode)) {
-      if (mbmi->ref_frame[0] != cpi->nearest_past_ref &&
-          mbmi->ref_frame[0] != cpi->nearest_future_ref) {
+      if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref &&
+          mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) {
         const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
-        if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
-            REF_CAT_LEVEL) {
+        const int do_prune = prune_ref_mv_idx_using_qindex(
+            sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx);
+        if (do_prune &&
+            (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+             REF_CAT_LEVEL)) {
           return true;
         }
       }
     }
   }
-  if (sf->inter_sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred &&
-      args->single_ref_first_pass == 0) {
-    if (args->simple_rd_state[ref_mv_idx].early_skipped) {
-      return true;
-    }
-  }
+
   mbmi->ref_mv_idx = ref_mv_idx;
   if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
     return true;
   }
   size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
-  const int drl_cost =
-      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+  const int drl_cost = get_drl_cost(
+      mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type);
   est_rd_rate += drl_cost;
   if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
       mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
@@ -1880,16 +1940,18 @@ static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x,
 }
 
 // Compute the estimated RD cost for the motion vector with simple translation.
-static int64_t simple_translation_pred_rd(
-    AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
-    HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info,
-    int64_t ref_best_rd, BLOCK_SIZE bsize) {
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          RD_STATS *rd_stats,
+                                          HandleInterModeArgs *args,
+                                          int ref_mv_idx, int64_t ref_best_rd,
+                                          BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   const AV1_COMMON *cm = &cpi->common;
   const int is_comp_pred = has_second_ref(mbmi);
+  const ModeCosts *mode_costs = &x->mode_costs;
 
   struct macroblockd_plane *p = xd->plane;
   const BUFFER_SET orig_dst = {
@@ -1913,9 +1975,8 @@ static int64_t simple_translation_pred_rd(
 
   rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
   const int drl_cost =
-      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+      get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
   rd_stats->rate += drl_cost;
-  mode_info[ref_mv_idx].drl_cost = drl_cost;
 
   int_mv cur_mv[2];
   if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
@@ -1925,7 +1986,7 @@ static int64_t simple_translation_pred_rd(
   for (int i = 0; i < is_comp_pred + 1; ++i) {
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
-  const int ref_mv_cost = cost_mv_ref(x, mbmi->mode, mode_ctx);
+  const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
   rd_stats->rate += ref_mv_cost;
 
   if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
@@ -1969,8 +2030,8 @@ static INLINE bool mask_check_bit(int mask, int index) {
 static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
                                 RD_STATS *rd_stats,
                                 HandleInterModeArgs *const args,
-                                int64_t ref_best_rd, inter_mode_info *mode_info,
-                                BLOCK_SIZE bsize, const int ref_set) {
+                                int64_t ref_best_rd, BLOCK_SIZE bsize,
+                                const int ref_set) {
   AV1_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -1979,7 +2040,8 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
   // Only search indices if they have some chance of being good.
   int good_indices = 0;
   for (int i = 0; i < ref_set; ++i) {
-    if (ref_mv_idx_early_breakout(cpi, x, args, ref_best_rd, i)) {
+    if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args,
+                                  ref_best_rd, i)) {
       continue;
     }
     mask_set_bit(&good_indices, i);
@@ -2008,7 +2070,7 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
       continue;
     }
     idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
-        cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize);
+        cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize);
   }
   // Find the index with the best RD cost.
   int best_idx = 0;
@@ -2033,14 +2095,37 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
   return result;
 }
 
+/*!\brief Motion mode information for inter mode search speedup.
+ *
+ * Used in a speed feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning candidates.
+ */
 typedef struct motion_mode_candidate {
+  /*!
+   * Mode info for the motion mode candidate.
+   */
   MB_MODE_INFO mbmi;
+  /*!
+   * Rate describing the cost of the motion vectors for this candidate.
+   */
   int rate_mv;
+  /*!
+   * Rate before motion mode search and transform coding is applied.
+   */
   int rate2_nocoeff;
+  /*!
+   * An integer value 0 or 1 which indicates whether or not to skip the motion
+   * mode search and default to SIMPLE_TRANSLATION as a speed feature for this
+   * candidate.
+   */
   int skip_motion_mode;
+  /*!
+   * Total RD cost for this candidate.
+   */
   int64_t rd_cost;
 } motion_mode_candidate;
 
+/*!\cond */
 typedef struct motion_mode_best_st_candidate {
   motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
   int num_motion_mode_cand;
@@ -2067,7 +2152,7 @@ static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi,
 
 static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
                                                   MACROBLOCKD *xd) {
-  if (!xd->up_available) return 0;
+  if (!xd->up_available) return 1;
   const int mi_col = xd->mi_col;
   MB_MODE_INFO **cur_mbmi = xd->mi;
   // prev_row_mi points into the mi array, starting at the beginning of the
@@ -2078,7 +2163,7 @@ static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
   for (int above_mi_col = mi_col; above_mi_col < end_col;
        above_mi_col += mi_step) {
     MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
-    mi_step = mi_size_wide[above_mi[0]->sb_type];
+    mi_step = mi_size_wide[above_mi[0]->bsize];
     int match_found = 0;
     if (is_inter_block(*above_mi))
       match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi);
@@ -2089,7 +2174,7 @@ static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
 
 static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
                                                  MACROBLOCKD *xd) {
-  if (!xd->left_available) return 0;
+  if (!xd->left_available) return 1;
   const int mi_row = xd->mi_row;
   MB_MODE_INFO **cur_mbmi = xd->mi;
   // prev_col_mi points into the mi array, starting at the top of the
@@ -2100,7 +2185,7 @@ static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
   for (int left_mi_row = mi_row; left_mi_row < end_row;
        left_mi_row += mi_step) {
     MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
-    mi_step = mi_size_high[left_mi[0]->sb_type];
+    mi_step = mi_size_high[left_mi[0]->bsize];
     int match_found = 0;
     if (is_inter_block(*left_mi))
       match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi);
@@ -2108,9 +2193,19 @@ static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
   }
   return 0;
 }
+/*!\endcond */
 
+/*! \brief Struct used to hold TPL data to
+ * narrow down parts of the inter mode search.
+ */
 typedef struct {
+  /*!
+   * The best inter cost out of all of the reference frames.
+   */
   int64_t best_inter_cost;
+  /*!
+   * The inter cost for each reference frame.
+   */
   int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
 } PruneInfoFromTpl;
 
@@ -2119,17 +2214,14 @@ typedef struct {
 static AOM_INLINE void get_block_level_tpl_stats(
     AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs,
     PruneInfoFromTpl *inter_cost_info_from_tpl) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
   AV1_COMMON *const cm = &cpi->common;
 
-  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
-  const int tpl_idx = gf_group->index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return;
   const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  if (tpl_idx >= MAX_LAG_BUFFERS || !tpl_frame->is_valid) {
-    return;
-  }
-
   const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   const int mi_wide = mi_size_wide[bsize];
   const int mi_high = mi_size_high[bsize];
@@ -2141,10 +2233,13 @@ static AOM_INLINE void get_block_level_tpl_stats(
       coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
 
+  const int row_step = step;
+  const int col_step_sr =
+      coded_to_superres_mi(step, cm->superres_scale_denominator);
   for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
-       row += step) {
+       row += row_step) {
     for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
-         col += step) {
+         col += col_step_sr) {
       const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
           row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
 
@@ -2175,10 +2270,12 @@ static AOM_INLINE int prune_modes_based_on_tpl_stats(
     PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
     const PREDICTION_MODE this_mode, int prune_mode_level) {
   const int have_newmv = have_newmv_in_inter_mode(this_mode);
-  if ((prune_mode_level < 3) && have_newmv) return 0;
+  if ((prune_mode_level < 2) && have_newmv) return 0;
 
-  static const int prune_level_idx[3] = { 0, 1, 1 };
-  const int prune_level = prune_level_idx[prune_mode_level - 1];
+  const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost;
+  if (best_inter_cost == INT64_MAX) return 0;
+
+  const int prune_level = prune_mode_level - 1;
   int64_t cur_inter_cost;
 
   const int is_globalmv =
@@ -2190,8 +2287,8 @@ static AOM_INLINE int prune_modes_based_on_tpl_stats(
   // conservative pruning which is set based on ref_mv_idx and speed feature.
   // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. prune_index
   // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV
-  static const int tpl_inter_mode_prune_mul_factor[2][MAX_REF_MV_SEARCH + 1] = {
-    { 3, 3, 3, 2 }, { 3, 2, 2, 2 }
+  static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+    { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
   };
 
   const int is_comp_pred = (refs[1] > INTRA_FRAME);
@@ -2209,74 +2306,480 @@ static AOM_INLINE int prune_modes_based_on_tpl_stats(
 
   // Prune the mode if cur_inter_cost is greater than threshold times
   // best_inter_cost
-  const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost;
   if (cur_inter_cost >
       ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
         best_inter_cost) >>
-       1))
+       2))
+    return 1;
+  return 0;
+}
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in,out] cur_mv            Current motion vector.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] compmode_interinter_cost  RD of the selected interinter
+                                    compound mode.
+ * \param[in,out] rd_buffers        CompoundTypeRdBuffers struct to hold all
+ *                                  allocated buffers for the compound
+ *                                  predictors and masks in the compound type
+ *                                  search.
+ * \param[in,out] orig_dst          A prediction buffer to hold a computed
+ *                                  prediction. This will eventually hold the
+ *                                  final prediction, and the tmp_dst info will
+ *                                  be copied here.
+ * \param[in]     tmp_dst           A temporary prediction buffer to hold a
+ *                                  computed prediction.
+ * \param[in,out] rate_mv           The rate associated with the motion vectors.
+ *                                  This will be modified if a motion search is
+ *                                  done in the motion mode search.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] skip_rd           An array of length 2 where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in,out] skip_build_pred   Indicates whether or not to build the inter
+ *                                  predictor. If this is 0, the inter predictor
+ *                                  has already been built and thus we can avoid
+ *                                  repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
+ */
+static int process_compound_inter_mode(
+    AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+    int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize,
+    int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers,
+    const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv,
+    RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const AV1_COMMON *cm = &cpi->common;
+  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                   cm->seq_params->enable_masked_compound;
+  int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+  const int num_planes = av1_num_planes(cm);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int is_luma_interp_done = 0;
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  int64_t best_rd_compound;
+  int64_t rd_thresh;
+  const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+  const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+  rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift),
+                                         comp_type_rd_scale);
+  // Select compound type and any parameters related to that type
+  // (for example, the mask parameters if it is a masked mode) and compute
+  // the RD
+  *compmode_interinter_cost = av1_compound_type_rd(
+      cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+      orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+      ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+  if (ref_best_rd < INT64_MAX &&
+      (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+          ref_best_rd) {
+    restore_dst_buf(xd, *orig_dst, num_planes);
+    return 1;
+  }
+
+  // Build only uv predictor for COMPOUND_AVERAGE.
+  // Note there is no need to call av1_enc_build_inter_predictor
+  // for luma if COMPOUND_AVERAGE is selected because it is the first
+  // candidate in av1_compound_type_rd, which means it used the dst_buf
+  // rather than the tmp_buf.
+  if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) {
+    if (num_planes > 1) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                    AOM_PLANE_U, num_planes - 1);
+    }
+    *skip_build_pred = 1;
+  }
+  return 0;
+}
+
+// Speed feature to prune out MVs that are similar to previous MVs if they
+// don't achieve the best RD advantage.
+static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx,
+                                   int_mv save_mv[MAX_REF_MV_SEARCH - 1][2],
+                                   MB_MODE_INFO *mbmi, int pruning_factor) {
+  int i;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const int thr = (1 + is_comp_pred) << (pruning_factor + 1);
+
+  // Skip the evaluation if an MV match is found.
+  if (ref_mv_idx > 0) {
+    for (int idx = 0; idx < ref_mv_idx; ++idx) {
+      if (save_mv[idx][0].as_int == INVALID_MV) continue;
+
+      int mv_diff = 0;
+      for (i = 0; i < 1 + is_comp_pred; ++i) {
+        mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
+                   abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
+      }
+
+      // If this mode is not the best one, and current MV is similar to
+      // previous stored MV, terminate this ref_mv_idx evaluation.
+      if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1;
+    }
+  }
+
+  if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+    for (i = 0; i < is_comp_pred + 1; ++i)
+      save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
+  }
+
+  return 0;
+}
+
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of zero mv and the best sse found in single new_mv. If the
+ * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped.
+ * Else returns 0.
+ *
+ * Note that the sse of here comes from single_motion_search. So it is
+ * interpolated with the filter in motion search, not the actual interpolation
+ * filter used in encoding.
+ *
+ * \param[in]     fn_ptr            A table of function pointers to compute SSE.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             The current block_size.
+ * \param[in]     args              The args to handle_inter_mode, used to track
+ *                                  the best SSE.
+ * \param[in]    prune_zero_mv_with_sse  The argument holds speed feature
+ *                                       prune_zero_mv_with_sse value
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+static AOM_INLINE int prune_zero_mv_with_sse(
+    const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+    const HandleInterModeArgs *args, int prune_zero_mv_with_sse) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  const int is_comp_pred = has_second_ref(mbmi);
+  const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+
+  // Check that the global mv is the same as ZEROMV
+  assert(mbmi->mv[0].as_int == 0);
+  assert(IMPLIES(is_comp_pred, mbmi->mv[0].as_int == 0));
+  assert(xd->global_motion[refs[0]].wmtype == TRANSLATION ||
+         xd->global_motion[refs[0]].wmtype == IDENTITY);
+
+  // Don't prune if we have invalid data
+  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+    assert(mbmi->mv[0].as_int == 0);
+    if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
+      return 0;
+    }
+  }
+
+  // Sum up the sse of ZEROMV and best NEWMV
+  unsigned int this_sse_sum = 0;
+  unsigned int best_sse_sum = 0;
+  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+    const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+    const struct macroblockd_plane *pd = xd->plane;
+    const struct buf_2d *src_buf = &p->src;
+    const struct buf_2d *ref_buf = &pd->pre[idx];
+    const uint8_t *src = src_buf->buf;
+    const uint8_t *ref = ref_buf->buf;
+    const int src_stride = src_buf->stride;
+    const int ref_stride = ref_buf->stride;
+
+    unsigned int this_sse;
+    fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
+    this_sse_sum += this_sse;
+
+    const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
+    best_sse_sum += best_sse;
+  }
+
+  const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
+  if ((double)this_sse_sum > (mul * (double)best_sse_sum)) {
     return 1;
+  }
+
   return 0;
 }
 
+/*!\brief Searches for interpolation filter in realtime mode during winner eval
+ *
+ * \ingroup inter_mode_search
+ *
+ * Does a simple interpolation filter search during winner mode evaluation. This
+ * is currently only used by realtime mode as \ref
+ * av1_interpolation_filter_search is not called during realtime encoding.
+ *
+ * This funciton only searches over two possible filters. EIGHTTAP_REGULAR is
+ * always search. For lowres clips (<= 240p), MULTITAP_SHARP is also search. For
+ * higher  res slips (>240p), EIGHTTAP_SMOOTH is also searched.
+ *  *
+ * \param[in]     cpi               Pointer to the compressor. Used for feature
+ *                                  flags.
+ * \param[in,out] x                 Pointer to macroblock. This is primarily
+ *                                  used to access the buffers.
+ * \param[in]     mi_row            The current row in mi unit (4X4 pixels).
+ * \param[in]     mi_col            The current col in mi unit (4X4 pixels).
+ * \param[in]     bsize             The current block_size.
+ * \return Returns true if a predictor is built in xd->dst, false otherwise.
+ */
+static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize) {
+  static const InterpFilters filters_ref_set[3] = {
+    { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+    { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+    { MULTITAP_SHARP, MULTITAP_SHARP }
+  };
+
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  int64_t best_cost = INT64_MAX;
+  int best_filter_index = -1;
+  // dst_bufs[0] sores the new predictor, and dist_bifs[1] stores the best
+  const int num_planes = av1_num_planes(cm);
+  const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
+  assert(is_inter_mode(mi->mode));
+  assert(mi->motion_mode == SIMPLE_TRANSLATION);
+  assert(!is_inter_compound_mode(mi->mode));
+
+  if (!av1_is_interp_needed(xd)) {
+    return false;
+  }
+
+  struct macroblockd_plane *pd = xd->plane;
+  const BUFFER_SET orig_dst = {
+    { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+    { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+  };
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+  const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+                                 tmp_buf + 2 * MAX_SB_SQUARE },
+                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+  const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };
+
+  for (int i = 0; i < 3; ++i) {
+    if (is_240p_or_lesser) {
+      if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
+        continue;
+      }
+    } else {
+      if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
+        continue;
+      }
+    }
+    int64_t cost;
+    RD_STATS tmp_rd = { 0 };
+
+    mi->interp_filters.as_filters = filters_ref_set[i];
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+
+    model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+                       ? MODELRD_LEGACY
+                       : MODELRD_TYPE_INTERP_FILTER](
+        cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
+        &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);
+
+    tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+                                           cm->seq_params->enable_dual_filter);
+    cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
+    if (cost < best_cost) {
+      best_filter_index = i;
+      best_cost = cost;
+      swap_dst_buf(xd, dst_bufs, num_planes);
+    }
+  }
+  assert(best_filter_index >= 0);
+
+  mi->interp_filters.as_filters = filters_ref_set[best_filter_index];
+
+  const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];
+
+  if (is_best_pred_in_orig) {
+    swap_dst_buf(xd, dst_bufs, num_planes);
+  } else {
+    // Note that xd->pd's bufers are kept in sync with dst_bufs[0]. So if
+    // is_best_pred_in_orig is false, that means the current buffer is the
+    // original one.
+    assert(&orig_dst == dst_bufs[0]);
+    assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]);
+    const int width = block_size_wide[bsize];
+    const int height = block_size_high[bsize];
+#if CONFIG_AV1_HIGHBITDEPTH
+    const bool is_hbd = is_cur_buf_hbd(xd);
+    if (is_hbd) {
+      aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]),
+                               tmp_dst.stride[AOM_PLANE_Y],
+                               CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]),
+                               orig_dst.stride[AOM_PLANE_Y], width, height);
+    } else {
+      aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+                        orig_dst.plane[AOM_PLANE_Y],
+                        orig_dst.stride[AOM_PLANE_Y], width, height);
+    }
+#else
+    aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+                      orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y],
+                      width, height);
+#endif
+  }
+
+  // Build the YUV predictor.
+  if (num_planes > 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  AOM_PLANE_U, AOM_PLANE_V);
+  }
+
+  return true;
+}
+
+/*!\brief AV1 inter mode RD computation
+ *
+ * \ingroup inter_mode_search
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     tile_data         Pointer to struct holding adaptive
+ *                                  data/contexts/models for the tile during
+ *                                  encoding.
+ * \param[in]     x                 Pointer to structure holding all the data
+ *                                  for the current macroblock.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] rd_stats          Struct to keep track of the overall RD
+ *                                  information.
+ * \param[in,out] rd_stats_y        Struct to keep track of the RD information
+ *                                  for only the Y plane.
+ * \param[in,out] rd_stats_uv       Struct to keep track of the RD information
+ *                                  for only the UV planes.
+ * \param[in]     args              HandleInterModeArgs struct holding
+ *                                  miscellaneous arguments for inter mode
+ *                                  search. See the documentation for this
+ *                                  struct for a description of each member.
+ * \param[in]     ref_best_rd       Best RD found so far for this block.
+ *                                  It is used for early termination of this
+ *                                  search if the RD exceeds this value.
+ * \param[in]     tmp_buf           Temporary buffer used to hold predictors
+ *                                  built in this search.
+ * \param[in,out] rd_buffers        CompoundTypeRdBuffers struct to hold all
+ *                                  allocated buffers for the compound
+ *                                  predictors and masks in the compound type
+ *                                  search.
+ * \param[in,out] best_est_rd       Estimated RD for motion mode search if
+ *                                  do_tx_search (see below) is 0.
+ * \param[in]     do_tx_search      Parameter to indicate whether or not to do
+ *                                  a full transform search. This will compute
+ *                                  an estimated RD for the modes without the
+ *                                  transform search and later perform the full
+ *                                  transform search on the best candidates.
+ * \param[in,out] inter_modes_info  InterModesInfo struct to hold inter mode
+ *                                  information to perform a full transform
+ *                                  search only on winning candidates searched
+ *                                  with an estimate for transform coding RD.
+ * \param[in,out] motion_mode_cand  A motion_mode_candidate struct to store
+ *                                  motion mode information used in a speed
+ *                                  feature to search motion modes other than
+ *                                  SIMPLE_TRANSLATION only on winning
+ *                                  candidates.
+ * \param[in,out] skip_rd           A length 2 array, where skip_rd[0] is the
+ *                                  best total RD for a skip mode so far, and
+ *                                  skip_rd[1] is the best RD for a skip mode so
+ *                                  far in luma. This is used as a speed feature
+ *                                  to skip the transform search if the computed
+ *                                  skip RD for the current mode is not better
+ *                                  than the best skip_rd so far.
+ * \param[in]     inter_cost_info_from_tpl A PruneInfoFromTpl struct used to
+ *                                         narrow down the search based on data
+ *                                         collected in the TPL model.
+ * \param[out]    yrd               Stores the rdcost corresponding to encoding
+ *                                  the luma plane.
+ *
+ * \return The RD cost for the mode being searched.
+ */
 static int64_t handle_inter_mode(
     AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
     BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
-    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *args,
-    int64_t ref_best_rd, uint8_t *const tmp_buf,
-    const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
-    const int do_tx_search, InterModesInfo *inter_modes_info,
-    motion_mode_candidate *motion_mode_cand, int64_t *skip_rd,
-    PruneInfoFromTpl *inter_cost_info_from_tpl) {
+    RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd,
+    uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers,
+    int64_t *best_est_rd, const int do_tx_search,
+    InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand,
+    int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl,
+    int64_t *yrd) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
 
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  const int tpl_idx = gf_group->index;
-  TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+#if CONFIG_REALTIME_ONLY
+  const int prune_modes_based_on_tpl = 0;
+#else   // CONFIG_REALTIME_ONLY
+  const TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const int prune_modes_based_on_tpl =
       cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
-      tpl_idx >= MAX_LAG_BUFFERS && tpl_frame->is_valid;
+      av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index);
+#endif  // CONFIG_REALTIME_ONLY
   int i;
+  // Reference frames for this mode
   const int refs[2] = { mbmi->ref_frame[0],
                         (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int rate_mv = 0;
   int64_t rd = INT64_MAX;
-  // do first prediction into the destination buffer. Do the next
+  // Do first prediction into the destination buffer. Do the next
   // prediction into a temporary buffer. Then keep track of which one
   // of these currently holds the best predictor, and use the other
   // one for future predictions. In the end, copy from tmp_buf to
   // dst if necessary.
-  struct macroblockd_plane *p = xd->plane;
+  struct macroblockd_plane *pd = xd->plane;
   const BUFFER_SET orig_dst = {
-    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
-    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+    { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+    { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
   };
   const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
                                  tmp_buf + 2 * MAX_SB_SQUARE },
                                { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
 
-  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                   cm->seq_params.enable_masked_compound;
   int64_t ret_val = INT64_MAX;
   const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   int64_t best_rd = INT64_MAX;
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  int64_t best_yrd = INT64_MAX;
   MB_MODE_INFO best_mbmi = *mbmi;
-  int best_disable_skip = 0;
-  int best_xskip = 0;
+  int best_xskip_txfm = 0;
   int64_t newmv_ret_val = INT64_MAX;
   inter_mode_info mode_info[MAX_REF_MV_SEARCH];
 
-  int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
-                         (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
-
   // Do not prune the mode based on inter cost from tpl if the current ref frame
   // is the winner ref in neighbouring blocks.
   int ref_match_found_in_above_nb = 0;
@@ -2289,22 +2792,56 @@ static int64_t handle_inter_mode(
   }
 
   // First, perform a simple translation search for each of the indices. If
-  // an index performs well, it will be fully searched here.
+  // an index performs well, it will be fully searched in the main loop
+  // of this function.
   const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
   // Save MV results from first 2 ref_mv_idx.
-  int_mv save_mv[MAX_REF_MV_SEARCH - 1][2] = { { { 0 } } };
+  int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
   int best_ref_mv_idx = -1;
-  const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd,
-                                            mode_info, bsize, ref_set);
+  const int idx_mask =
+      ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set);
   const int16_t mode_ctx =
       av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
-  const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx);
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
+
+  // As per the experiments, in real-time preset impact of model rd based
+  // breakouts is less on encoding time if the following conditions are true.
+  //    (1) compound mode is disabled
+  //    (2) interpolation filter search is disabled
+  // TODO(any): Check the impact of model rd based breakouts in other presets
+  const int skip_interp_search_modelrd_calc =
+      cpi->oxcf.mode == REALTIME &&
+      cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+      (cpi->sf.rt_sf.skip_interp_filter_search ||
+       cpi->sf.winner_mode_sf.winner_mode_ifs);
+
+  for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
+    save_mv[i][0].as_int = INVALID_MV;
+    save_mv[i][1].as_int = INVALID_MV;
+  }
+  args->start_mv_cnt = 0;
+
+  // Main loop of this function. This will  iterate over all of the ref mvs
+  // in the dynamic reference list and do the following:
+  //    1.) Get the current MV. Create newmv MV if necessary
+  //    2.) Search compound type and parameters if applicable
+  //    3.) Do interpolation filter search
+  //    4.) Build the inter predictor
+  //    5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL,
+  //        WARPED_CAUSAL)
+  //    6.) Update stats if best so far
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mbmi->ref_mv_idx = ref_mv_idx;
+
     mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
-    mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
-    mode_info[ref_mv_idx].rd = INT64_MAX;
+    mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX;
+    const int drl_cost = get_drl_cost(
+        mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+    mode_info[ref_mv_idx].drl_cost = drl_cost;
+    mode_info[ref_mv_idx].skip = 0;
 
     if (!mask_check_bit(idx_mask, ref_mv_idx)) {
       // MV did not perform well in simple translation search. Skip it.
@@ -2312,6 +2849,7 @@ static int64_t handle_inter_mode(
     }
     if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
         !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+      // Skip mode if TPL model indicates it will not be beneficial.
       if (prune_modes_based_on_tpl_stats(
               inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
               cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
@@ -2319,6 +2857,7 @@ static int64_t handle_inter_mode(
     }
     av1_init_rd_stats(rd_stats);
 
+    // Initialize compound mode data
     mbmi->interinter_comp.type = COMPOUND_AVERAGE;
     mbmi->comp_group_idx = 0;
     mbmi->compound_idx = 1;
@@ -2326,13 +2865,10 @@ static int64_t handle_inter_mode(
 
     mbmi->num_proj_ref = 0;
     mbmi->motion_mode = SIMPLE_TRANSLATION;
-    mbmi->ref_mv_idx = ref_mv_idx;
 
+    // Compute cost for signalling this DRL index
     rd_stats->rate = base_rate;
-    const int drl_cost =
-        get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
     rd_stats->rate += drl_cost;
-    mode_info[ref_mv_idx].drl_cost = drl_cost;
 
     int rs = 0;
     int compmode_interinter_cost = 0;
@@ -2342,102 +2878,46 @@ static int64_t handle_inter_mode(
     // TODO(Cherma): Extend this speed feature to support compound mode
     int skip_repeated_ref_mv =
         is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+    // Generate the current mv according to the prediction mode
     if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
       continue;
     }
 
+    // The above call to build_cur_mv does not handle NEWMV modes. Build
+    // the mv here if we have NEWMV for any predictors.
     if (have_newmv_in_inter_mode(this_mode)) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
       start_timing(cpi, handle_newmv_time);
 #endif
-      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
-          args->single_ref_first_pass == 0 && !is_comp_pred) {
-        const int ref0 = mbmi->ref_frame[0];
-        newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
-        cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
-        rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
-      } else {
-        newmv_ret_val =
-            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
-      }
+      newmv_ret_val =
+          handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
 #if CONFIG_COLLECT_COMPONENT_TIMING
       end_timing(cpi, handle_newmv_time);
 #endif
 
       if (newmv_ret_val != 0) continue;
 
-      rd_stats->rate += rate_mv;
-
-      if (cpi->sf.inter_sf.skip_repeated_newmv) {
-        if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
-          int skip = 0;
-          int this_rate_mv = 0;
-          for (i = 0; i < ref_mv_idx; ++i) {
-            // Check if the motion search result same as previous results
-            if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
-                args->single_newmv_valid[i][refs[0]]) {
-              // If the compared mode has no valid rd, it is unlikely this
-              // mode will be the best mode
-              if (mode_info[i].rd == INT64_MAX) {
-                skip = 1;
-                break;
-              }
-              // Compare the cost difference including drl cost and mv cost
-              if (mode_info[i].mv.as_int != INVALID_MV) {
-                const int compare_cost =
-                    mode_info[i].rate_mv + mode_info[i].drl_cost;
-                const int_mv ref_mv = av1_get_ref_mv(x, 0);
-                this_rate_mv = av1_mv_bit_cost(
-                    &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                    x->mv_cost_stack, MV_COST_WEIGHT);
-                const int this_cost = this_rate_mv + drl_cost;
-
-                if (compare_cost <= this_cost) {
-                  skip = 1;
-                  break;
-                } else {
-                  // If the cost is less than current best result, make this
-                  // the best and update corresponding variables unless the
-                  // best_mv is the same as ref_mv. In this case we skip and
-                  // rely on NEAR(EST)MV instead
-                  if (best_mbmi.ref_mv_idx == i &&
-                      mode_info[i].mv.as_int != ref_mv.as_int) {
-                    assert(best_rd != INT64_MAX);
-                    best_mbmi.ref_mv_idx = ref_mv_idx;
-                    motion_mode_cand->rate_mv = this_rate_mv;
-                    best_rd_stats.rate += this_cost - compare_cost;
-                    best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
-                                     best_rd_stats.dist);
-                    if (best_rd < ref_best_rd) ref_best_rd = best_rd;
-                    break;
-                  }
-                }
-              }
-            }
-          }
-          if (skip) {
-            const THR_MODES mode_enum = get_prediction_mode_idx(
-                best_mbmi.mode, best_mbmi.ref_frame[0], best_mbmi.ref_frame[1]);
-            // Collect mode stats for multiwinner mode processing
-            store_winner_mode_stats(
-                &cpi->common, x, &best_mbmi, &best_rd_stats, &best_rd_stats_y,
-                &best_rd_stats_uv, mode_enum, NULL, bsize, best_rd,
-                cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-                do_tx_search);
-            args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
-                args->modelled_rd[this_mode][i][refs[0]];
-            args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
-                args->simple_rd[this_mode][i][refs[0]];
-            mode_info[ref_mv_idx].rd = mode_info[i].rd;
-            mode_info[ref_mv_idx].rate_mv = this_rate_mv;
-            mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+      if (is_inter_singleref_mode(this_mode) &&
+          cur_mv[0].as_int != INVALID_MV) {
+        const MV_REFERENCE_FRAME ref = refs[0];
+        const unsigned int this_sse = x->pred_sse[ref];
+        if (this_sse < args->best_single_sse_in_refs[ref]) {
+          args->best_single_sse_in_refs[ref] = this_sse;
+        }
 
-            restore_dst_buf(xd, orig_dst, num_planes);
+        if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) {
+          const double scale_factor[11] = { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8,
+                                            0.8, 0.9, 0.9, 0.9, 0.9 };
+          assert(num_pels_log2_lookup[bsize] >= 4);
+          if (args->best_pred_sse <
+              scale_factor[num_pels_log2_lookup[bsize] - 4] * this_sse)
             continue;
-          }
         }
       }
+
+      rd_stats->rate += rate_mv;
     }
+    // Copy the motion vector for this mode into mbmi struct
     for (i = 0; i < is_comp_pred + 1; ++i) {
       mbmi->mv[i].as_int = cur_mv[i].as_int;
     }
@@ -2447,120 +2927,84 @@ static int64_t handle_inter_mode(
       continue;
     }
 
-    if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred) {
-      // TODO(yunqing): Move this part to a separate function when it is done.
-      // Store MV result.
-      if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
-        for (i = 0; i < is_comp_pred + 1; ++i)
-          save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
-      }
-      // Skip the evaluation if an MV match is found.
-      if (ref_mv_idx > 0) {
-        int match = 0;
-        for (int idx = 0; idx < ref_mv_idx; ++idx) {
-          int mv_diff = 0;
-          for (i = 0; i < 1 + is_comp_pred; ++i) {
-            mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
-                       abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
-          }
+    // Skip the rest of the search if prune_ref_mv_idx_search speed feature
+    // is enabled, and the current MV is similar to a previous one.
+    if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred &&
+        prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi,
+                                cpi->sf.inter_sf.prune_ref_mv_idx_search))
+      continue;
 
-          // If this mode is not the best one, and current MV is similar to
-          // previous stored MV, terminate this ref_mv_idx evaluation.
-          if (best_ref_mv_idx == -1 && mv_diff < 1) {
-            match = 1;
-            break;
-          }
-        }
-        if (match == 1) continue;
+    if (cpi->sf.gm_sf.prune_zero_mv_with_sse &&
+        cpi->sf.gm_sf.gm_search_type == GM_DISABLE_SEARCH &&
+        (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+      if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args,
+                                 cpi->sf.gm_sf.prune_zero_mv_with_sse)) {
+        continue;
       }
     }
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, compound_type_rd_time);
-#endif
     int skip_build_pred = 0;
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
-    if (is_comp_pred) {
-      // Find matching interp filter or set to default interp filter
-      const int need_search = av1_is_interp_needed(xd);
-      const InterpFilter assign_filter = cm->features.interp_filter;
-      int is_luma_interp_done = 0;
-      av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
-                                   args->interp_filter_stats,
-                                   args->interp_filter_stats_idx);
-
-      int64_t best_rd_compound;
-      int64_t rd_thresh;
-      const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
-      const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
-      rd_thresh = get_rd_thresh_from_best_rd(
-          ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale);
-      compmode_interinter_cost = av1_compound_type_rd(
-          cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used,
-          &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
-          rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
-      if (ref_best_rd < INT64_MAX &&
-          (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
-              ref_best_rd) {
-        restore_dst_buf(xd, orig_dst, num_planes);
-        continue;
-      }
-      // No need to call av1_enc_build_inter_predictor for luma if
-      // COMPOUND_AVERAGE is selected because it is the first
-      // candidate in av1_compound_type_rd, and the following
-      // compound types searching uses tmp_dst buffer
-
-      if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
-          is_luma_interp_done) {
-        if (num_planes > 1) {
-          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
-                                        bsize, AOM_PLANE_U, num_planes - 1);
-        }
-        skip_build_pred = 1;
-      }
-    }
 
+    // Handle a compound predictor, continue if it is determined this
+    // cannot be the best compound mode
+    if (is_comp_pred) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, compound_type_rd_time);
+      start_timing(cpi, compound_type_rd_time);
 #endif
+      const int not_best_mode = process_compound_inter_mode(
+          cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
+          rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
+          &skip_build_pred);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, compound_type_rd_time);
+#endif
+      if (not_best_mode) continue;
+    }
 
+    if (!skip_interp_search_modelrd_calc) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, interpolation_filter_search_time);
+      start_timing(cpi, interpolation_filter_search_time);
 #endif
-    ret_val = av1_interpolation_filter_search(
-        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
-        &skip_build_pred, args, ref_best_rd);
+      // Determine the interpolation filter for this mode
+      ret_val = av1_interpolation_filter_search(
+          x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+          &skip_build_pred, args, ref_best_rd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, interpolation_filter_search_time);
+      end_timing(cpi, interpolation_filter_search_time);
 #endif
-    if (args->modelled_rd != NULL && !is_comp_pred) {
-      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
-    }
-    if (ret_val != 0) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
-               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    }
+      if (args->modelled_rd != NULL && !is_comp_pred) {
+        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+      }
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
 
-    if (args->modelled_rd != NULL) {
-      if (is_comp_pred) {
-        const int mode0 = compound_ref0_mode(this_mode);
-        const int mode1 = compound_ref1_mode(this_mode);
-        const int64_t mrd =
-            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
-                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
-        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
-          restore_dst_buf(xd, orig_dst, num_planes);
-          continue;
+      // Compute modelled RD if enabled
+      if (args->modelled_rd != NULL) {
+        if (is_comp_pred) {
+          const int mode0 = compound_ref0_mode(this_mode);
+          const int mode1 = compound_ref1_mode(this_mode);
+          const int64_t mrd =
+              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
         }
       }
     }
+
     rd_stats->rate += compmode_interinter_cost;
     if (skip_build_pred != 1) {
+      // Build this inter predictor if it has not been previously built
       av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
                                     av1_num_planes(cm) - 1);
     }
@@ -2569,35 +3013,38 @@ static int64_t handle_inter_mode(
     start_timing(cpi, motion_mode_rd_time);
 #endif
     int rate2_nocoeff = rd_stats->rate;
+    // Determine the motion mode. This will be one of SIMPLE_TRANSLATION,
+    // OBMC_CAUSAL or WARPED_CAUSAL
+    int64_t this_yrd;
     ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
-                             rd_stats_uv, disable_skip, args, ref_best_rd,
-                             skip_rd, &rate_mv, &orig_dst, best_est_rd,
-                             do_tx_search, inter_modes_info, 0);
+                             rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv,
+                             &orig_dst, best_est_rd, do_tx_search,
+                             inter_modes_info, 0, &this_yrd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, motion_mode_rd_time);
 #endif
+    assert(
+        IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX));
 
-    mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
-    mode_info[ref_mv_idx].rate_mv = rate_mv;
     if (ret_val != INT64_MAX) {
       int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      mode_info[ref_mv_idx].rd = tmp_rd;
       const THR_MODES mode_enum = get_prediction_mode_idx(
           mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
       // Collect mode stats for multiwinner mode processing
-      store_winner_mode_stats(
-          &cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum,
-          NULL, bsize, tmp_rd,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+      store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y,
+                              rd_stats_uv, mode_enum, NULL, bsize, tmp_rd,
+                              cpi->sf.winner_mode_sf.multi_winner_mode_type,
+                              do_tx_search);
       if (tmp_rd < best_rd) {
+        best_yrd = this_yrd;
+        // Update the best rd stats if we found the best mode so far
         best_rd_stats = *rd_stats;
         best_rd_stats_y = *rd_stats_y;
         best_rd_stats_uv = *rd_stats_uv;
         best_rd = tmp_rd;
         best_mbmi = *mbmi;
-        best_disable_skip = *disable_skip;
-        best_xskip = x->force_skip;
-        memcpy(best_blk_skip, x->blk_skip,
+        best_xskip_txfm = txfm_info->skip_txfm;
+        memcpy(best_blk_skip, txfm_info->blk_skip,
                sizeof(best_blk_skip[0]) * xd->height * xd->width);
         av1_copy_array(best_tx_type_map, xd->tx_type_map,
                        xd->height * xd->width);
@@ -2619,12 +3066,12 @@ static int64_t handle_inter_mode(
   *rd_stats = best_rd_stats;
   *rd_stats_y = best_rd_stats_y;
   *rd_stats_uv = best_rd_stats_uv;
+  *yrd = best_yrd;
   *mbmi = best_mbmi;
-  *disable_skip = best_disable_skip;
-  x->force_skip = best_xskip;
+  txfm_info->skip_txfm = best_xskip_txfm;
   assert(IMPLIES(mbmi->comp_group_idx == 1,
                  mbmi->interinter_comp.type != COMPOUND_AVERAGE));
-  memcpy(x->blk_skip, best_blk_skip,
+  memcpy(txfm_info->blk_skip, best_blk_skip,
          sizeof(best_blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
 
@@ -2633,26 +3080,39 @@ static int64_t handle_inter_mode(
   return rd_stats->rdcost;
 }
 
+/*!\brief Search for the best intrabc predictor
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function performs a motion search to find the best intrabc predictor.
+ *
+ * \returns Returns the best overall rdcost (including the non-intrabc modes
+ * search before this function).
+ */
 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                                        PICK_MODE_CONTEXT *ctx,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX;
+  if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
+      cpi->sf.rt_sf.use_nonrd_pick_mode)
+    return INT64_MAX;
   const int num_planes = av1_num_planes(cm);
 
   MACROBLOCKD *const xd = &x->e_mbd;
   const TileInfo *tile = &xd->tile;
   MB_MODE_INFO *mbmi = xd->mi[0];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-  const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
-  const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
+  const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+  const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
 
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
   av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                    xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                    mbmi_ext->mode_context);
@@ -2672,7 +3132,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
 
   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
   if (dv_ref.as_int == 0) {
-    av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row);
+    av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row);
   }
   // Ref DV should not have sub-pel.
   assert((dv_ref.as_mv.col & 7) == 0);
@@ -2699,10 +3159,12 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
 
   FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
   const search_site_config *lookahead_search_sites =
-      &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+      cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
   av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
-                                     &dv_ref.as_mv, lookahead_search_sites);
-  fullms_params.is_intra_mode = 1;
+                                     &dv_ref.as_mv, lookahead_search_sites,
+                                     /*fine_search_interval=*/0);
+  const IntraBCMVCosts *const dv_costs = x->dv_costs;
+  av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
 
   for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
        dir < IBC_MOTION_DIRECTIONS; ++dir) {
@@ -2715,19 +3177,19 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
         fullms_params.mv_limits.row_min =
             (tile->mi_row_start - mi_row) * MI_SIZE;
         fullms_params.mv_limits.row_max =
-            (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;
+            (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h;
         break;
       case IBC_MOTION_LEFT:
         fullms_params.mv_limits.col_min =
             (tile->mi_col_start - mi_col) * MI_SIZE;
         fullms_params.mv_limits.col_max =
-            (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;
+            (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
         // TODO(aconverse@google.com): Minimize the overlap between above and
         // left areas.
         fullms_params.mv_limits.row_min =
             (tile->mi_row_start - mi_row) * MI_SIZE;
         int bottom_coded_mi_edge =
-            AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+            AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
         fullms_params.mv_limits.row_max =
             (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
         break;
@@ -2765,7 +3227,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                                 get_fullmv_from_mv(&dv)))
       continue;
     if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
-                         cm->seq_params.mib_size_log2))
+                         cm->seq_params->mib_size_log2))
       continue;
 
     // DV should not have sub-pel.
@@ -2779,18 +3241,15 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->mv[0].as_mv = dv;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-    mbmi->skip = 0;
+    mbmi->skip_txfm = 0;
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                   av1_num_planes(cm) - 1);
 
-    const IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
-    int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX],
-                       (int *)&dv_costs->mv_component[1][MV_MAX] };
     // TODO(aconverse@google.com): The full motion field defining discount
     // in MV_COST_WEIGHT is too large. Explore other values.
     const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
-                                        dvcost, MV_COST_WEIGHT_SUB);
-    const int rate_mode = x->intrabc_cost[1];
+                                        dv_costs->dv_costs, MV_COST_WEIGHT_SUB);
+    const int rate_mode = x->mode_costs.intrabc_cost[1];
     RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
     if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
                          &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
@@ -2801,15 +3260,15 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
       best_rd = rd_stats_yuv.rdcost;
       best_mbmi = *mbmi;
       best_rdstats = rd_stats_yuv;
-      memcpy(best_blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * xd->height * xd->width);
+      memcpy(best_blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
       av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
     }
   }
   *mbmi = best_mbmi;
   *rd_stats = best_rdstats;
-  memcpy(x->blk_skip, best_blk_skip,
-         sizeof(x->blk_skip[0]) * xd->height * xd->width);
+  memcpy(txfm_info->blk_skip, best_blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
   av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
 #if CONFIG_RD_DEBUG
   mbmi->rd_stats = *rd_stats;
@@ -2817,18 +3276,24 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
+// TODO(chiyotsai@google.com): We are using struct $struct_name instead of their
+// typedef here because Doxygen doesn't know about the typedefs yet. So using
+// the typedef will prevent doxygen from finding this function and generating
+// the callgraph. Once documents for AV1_COMP and MACROBLOCK are added to
+// doxygen, we can revert back to using the typedefs.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int num_planes = av1_num_planes(cm);
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
-  int y_skip = 0, uv_skip = 0;
+  int y_skip_txfm = 0, uv_skip_txfm = 0;
   int64_t dist_y = 0, dist_uv = 0;
 
-  ctx->rd_stats.skip = 0;
+  ctx->rd_stats.skip_txfm = 0;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
   mbmi->use_intrabc = 0;
@@ -2837,40 +3302,34 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
 
   const int64_t intra_yrd =
       av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
-                                 &y_skip, bsize, best_rd, ctx);
+                                 &y_skip_txfm, bsize, best_rd, ctx);
 
   // Initialize default mode evaluation params
   set_mode_eval_params(cpi, x, DEFAULT_EVAL);
 
   if (intra_yrd < best_rd) {
-    // Only store reconstructed luma when there's chroma RDO. When there's no
-    // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-    xd->cfl.store_y = store_cfl_required_rdo(cm, x);
-    if (xd->cfl.store_y) {
-      // Restore reconstructed luma values.
-      memcpy(x->blk_skip, ctx->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-      av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
-      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, DRY_RUN_NORMAL,
-                                   cpi->optimize_seg_arr[mbmi->segment_id]);
-      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-      xd->cfl.store_y = 0;
-    }
+    // Search intra modes for uv planes if needed
     if (num_planes > 1) {
-      init_sbuv_mode(mbmi);
-      if (xd->is_chroma_ref) {
-        const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-        av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                    &dist_uv, &uv_skip, bsize, max_uv_tx_size);
+      // Set up the tx variables for reproducing the y predictions in case we
+      // need it for chroma-from-luma.
+      if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) {
+        memcpy(txfm_info->blk_skip, ctx->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+        av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
       }
+      const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                  &dist_uv, &uv_skip_txfm, bsize,
+                                  max_uv_tx_size);
     }
 
     // Intra block is always coded as non-skip
     rd_cost->rate =
-        rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
+        rate_y + rate_uv +
+        x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
     rd_cost->dist = dist_y + dist_uv;
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-    rd_cost->skip = 0;
+    rd_cost->skip_txfm = 0;
   } else {
     rd_cost->rate = INT_MAX;
   }
@@ -2878,15 +3337,15 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
     best_rd = rd_cost->rdcost;
   if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) {
-    ctx->rd_stats.skip = mbmi->skip;
-    memcpy(ctx->blk_skip, x->blk_skip,
-           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    ctx->rd_stats.skip_txfm = mbmi->skip_txfm;
+    memcpy(ctx->blk_skip, txfm_info->blk_skip,
+           sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
     assert(rd_cost->rate != INT_MAX);
   }
   if (rd_cost->rate == INT_MAX) return;
 
   ctx->mic = *xd->mi[0];
-  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext,
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
   av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 }
@@ -2927,7 +3386,7 @@ static AOM_INLINE void rd_pick_skip_mode(
     return;
   }
 
-  if ((!cpi->oxcf.enable_onesided_comp ||
+  if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
        cpi->sf.inter_sf.disable_onesided_comp) &&
       cpi->all_one_sided_refs) {
     return;
@@ -2938,12 +3397,12 @@ static AOM_INLINE void rd_pick_skip_mode(
   mbmi->ref_frame[0] = ref_frame;
   mbmi->ref_frame[1] = second_ref_frame;
   const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-  if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
-    if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
-        x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+  if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) {
+    MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext;
+    if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+        mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
       return;
     }
-    MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
     av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
                      xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                      mbmi_ext->mode_context);
@@ -2964,7 +3423,9 @@ static AOM_INLINE void rd_pick_skip_mode(
   mbmi->interinter_comp.type = COMPOUND_AVERAGE;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   mbmi->ref_mv_idx = 0;
-  mbmi->skip_mode = mbmi->skip = 1;
+  mbmi->skip_mode = mbmi->skip_txfm = 1;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
 
   set_default_interp_filters(mbmi, cm->features.interp_filter);
 
@@ -2987,11 +3448,12 @@ static AOM_INLINE void rd_pick_skip_mode(
   const int skip_mode_ctx = av1_get_skip_mode_context(xd);
   int64_t best_intra_inter_mode_cost = INT64_MAX;
   if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
-    best_intra_inter_mode_cost =
-        RDCOST(x->rdmult, rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
-               rd_cost->dist);
+    const ModeCosts *mode_costs = &x->mode_costs;
+    best_intra_inter_mode_cost = RDCOST(
+        x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0],
+        rd_cost->dist);
     // Account for non-skip mode rate in total rd stats
-    rd_cost->rate += x->skip_mode_cost[skip_mode_ctx][0];
+    rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0];
     av1_rd_cost_update(x->rdmult, rd_cost);
   }
 
@@ -3000,43 +3462,12 @@ static AOM_INLINE void rd_pick_skip_mode(
     assert(mode_index != THR_INVALID);
     search_state->best_mbmode.skip_mode = 1;
     search_state->best_mbmode = *mbmi;
-
-    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
-    search_state->best_mbmode.mode = NEAREST_NEARESTMV;
-    search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
-    search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
-    search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
-    search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
-    search_state->best_mbmode.ref_mv_idx = 0;
-
-    // Set up tx_size related variables for skip-specific loop filtering.
-    search_state->best_mbmode.tx_size =
-        block_signals_txsize(bsize)
-            ? tx_size_from_tx_mode(bsize, x->tx_mode_search_type)
-            : max_txsize_rect_lookup[bsize];
     memset(search_state->best_mbmode.inter_tx_size,
            search_state->best_mbmode.tx_size,
            sizeof(search_state->best_mbmode.inter_tx_size));
     set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
-                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
-
-    // Set up color-related variables for skip mode.
-    search_state->best_mbmode.uv_mode = UV_DC_PRED;
-    search_state->best_mbmode.palette_mode_info.palette_size[0] = 0;
-    search_state->best_mbmode.palette_mode_info.palette_size[1] = 0;
-
-    search_state->best_mbmode.comp_group_idx = 0;
-    search_state->best_mbmode.compound_idx = x->compound_idx;
-    search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE;
-    search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION;
-
-    search_state->best_mbmode.interintra_mode =
-        (INTERINTRA_MODE)(II_DC_PRED - 1);
-    search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0;
-
-    set_default_interp_filters(&search_state->best_mbmode,
-                               cm->features.interp_filter);
-
+                  search_state->best_mbmode.skip_txfm && is_inter_block(mbmi),
+                  xd);
     search_state->best_mode_index = mode_index;
 
     // Update rd_cost
@@ -3048,7 +3479,7 @@ static AOM_INLINE void rd_pick_skip_mode(
     search_state->best_skip2 = 1;
     search_state->best_mode_skippable = 1;
 
-    x->force_skip = 1;
+    x->txfm_search_info.skip_txfm = 1;
   }
 }
 
@@ -3057,10 +3488,10 @@ static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats(
     MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
     int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
     RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
-    THR_MODES *winner_mode_index, int enable_multiwinner_mode_process,
+    THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type,
     int mode_idx) {
   MB_MODE_INFO *winner_mbmi;
-  if (enable_multiwinner_mode_process) {
+  if (multi_winner_mode_type) {
     assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
     WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx];
     winner_mbmi = &winner_mode_stat->mbmi;
@@ -3092,10 +3523,13 @@ static AOM_INLINE void refine_winner_mode_tx(
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int64_t best_rd;
   const int num_planes = av1_num_planes(cm);
 
-  if (!is_winner_mode_processing_enabled(cpi, best_mbmode, best_mbmode->mode))
+  if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode,
+                                         rd_cost->skip_txfm))
     return;
 
   // Set params for winner mode evaluation
@@ -3115,16 +3549,16 @@ static AOM_INLINE void refine_winner_mode_tx(
     MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
         x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
         &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, mode_idx);
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
 
     if (xd->lossless[winner_mbmi->segment_id] == 0 &&
         winner_mode_index != THR_INVALID &&
-        is_winner_mode_processing_enabled(cpi, winner_mbmi,
-                                          winner_mbmi->mode)) {
+        is_winner_mode_processing_enabled(cpi, x, winner_mbmi,
+                                          rd_cost->skip_txfm)) {
       RD_STATS rd_stats = *winner_rd_stats;
       int skip_blk = 0;
       RD_STATS rd_stats_y, rd_stats_uv;
-      const int skip_ctx = av1_get_skip_context(xd);
+      const int skip_ctx = av1_get_skip_txfm_context(xd);
 
       *mbmi = *winner_mbmi;
 
@@ -3140,13 +3574,27 @@ static AOM_INLINE void refine_winner_mode_tx(
       if (is_inter_mode(mbmi->mode)) {
         const int mi_row = xd->mi_row;
         const int mi_col = xd->mi_col;
-        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                      av1_num_planes(cm) - 1);
+        bool is_predictor_built = false;
+        const PREDICTION_MODE prediction_mode = mbmi->mode;
+        // Do interpolation filter search for realtime mode if applicable.
+        if (cpi->sf.winner_mode_sf.winner_mode_ifs &&
+            cpi->oxcf.mode == REALTIME &&
+            cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+            is_inter_mode(prediction_mode) &&
+            mbmi->motion_mode == SIMPLE_TRANSLATION &&
+            !is_inter_compound_mode(prediction_mode)) {
+          is_predictor_built =
+              fast_interp_search(cpi, x, mi_row, mi_col, bsize);
+        }
+        if (!is_predictor_built) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                        av1_num_planes(cm) - 1);
+        }
         if (mbmi->motion_mode == OBMC_CAUSAL)
           av1_build_obmc_inter_predictors_sb(cm, xd);
 
         av1_subtract_plane(x, bsize, 0);
-        if (x->tx_mode_search_type == TX_MODE_SELECT &&
+        if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
             !xd->lossless[mbmi->segment_id]) {
           av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
                                               INT64_MAX);
@@ -3157,7 +3605,7 @@ static AOM_INLINE void refine_winner_mode_tx(
           memset(mbmi->inter_tx_size, mbmi->tx_size,
                  sizeof(mbmi->inter_tx_size));
           for (int i = 0; i < xd->height * xd->width; ++i)
-            set_blk_skip(x, 0, i, rd_stats_y.skip);
+            set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm);
         }
       } else {
         av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
@@ -3170,20 +3618,22 @@ static AOM_INLINE void refine_winner_mode_tx(
         av1_init_rd_stats(&rd_stats_uv);
       }
 
+      const ModeCosts *mode_costs = &x->mode_costs;
       if (is_inter_mode(mbmi->mode) &&
           RDCOST(x->rdmult,
-                 x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate,
+                 mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate +
+                     rd_stats_uv.rate,
                  (rd_stats_y.dist + rd_stats_uv.dist)) >
-              RDCOST(x->rdmult, x->skip_cost[skip_ctx][1],
+              RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1],
                      (rd_stats_y.sse + rd_stats_uv.sse))) {
         skip_blk = 1;
-        rd_stats_y.rate = x->skip_cost[skip_ctx][1];
+        rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1];
         rd_stats_uv.rate = 0;
         rd_stats_y.dist = rd_stats_y.sse;
         rd_stats_uv.dist = rd_stats_uv.sse;
       } else {
         skip_blk = 0;
-        rd_stats_y.rate += x->skip_cost[skip_ctx][0];
+        rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
       }
       int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
                       winner_rate_y - winner_rate_uv;
@@ -3192,7 +3642,7 @@ static AOM_INLINE void refine_winner_mode_tx(
       if (best_rd > this_rd) {
         *best_mbmode = *mbmi;
         *best_mode_index = winner_mode_index;
-        av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk);
+        av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk);
         av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
         rd_cost->rate = this_rate;
         rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
@@ -3205,6 +3655,7 @@ static AOM_INLINE void refine_winner_mode_tx(
   }
 }
 
+/*!\cond */
 typedef struct {
   // Mask for each reference frame, specifying which prediction modes to NOT try
   // during search.
@@ -3215,6 +3666,7 @@ typedef struct {
   // (NONE_FRAME).
   bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
 } mode_skip_mask_t;
+/*!\endcond */
 
 // Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
 static AOM_INLINE void disable_reference(
@@ -3307,7 +3759,7 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
 
   if (sf->rt_sf.use_real_time_ref_set)
     ref_set = REF_SET_REALTIME;
-  else if (cpi->oxcf.enable_reduced_reference_set)
+  else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set)
     ref_set = REF_SET_REDUCED;
 
   default_skip_mask(mask, ref_set);
@@ -3355,15 +3807,17 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
-    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+    if (cpi->rc.is_src_frame_alt_ref &&
+        (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) {
       disable_inter_references_except_altref(mask->ref_combo);
 
       mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
       int_mv near_mv, nearest_mv, global_mv;
-      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
-      get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames,
+                  &x->mbmi_ext);
+      get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
 
       if (near_mv.as_int != global_mv.as_int)
         mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
@@ -3373,8 +3827,8 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
-    if (sf->inter_sf.alt_ref_search_fp) {
-      assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+    if (sf->inter_sf.alt_ref_search_fp &&
+        (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
       mask->pred_modes[ALTREF_FRAME] = 0;
       disable_inter_references_except_altref(mask->ref_combo);
       disable_reference(INTRA_FRAME, mask->ref_combo);
@@ -3382,56 +3836,128 @@ static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
   }
 
   if (sf->inter_sf.alt_ref_search_fp) {
-    if (!cm->show_frame && x->best_pred_mv_sad < INT_MAX) {
-      int sad_thresh = x->best_pred_mv_sad + (x->best_pred_mv_sad >> 3);
+    if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
+      int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
       // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
       // those are past frames
-      for (ref_frame = BWDREF_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-        if (cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+      MV_REFERENCE_FRAME start_frame =
+          sf->inter_sf.alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+      for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
+        if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+            0) {
+          // Prune inter modes when relative dist of ALTREF2 and ALTREF is close
+          // to the relative dist of LAST_FRAME.
+          if (sf->inter_sf.alt_ref_search_fp == 1 &&
+              (abs(cpi->ref_frame_dist_info
+                       .ref_relative_dist[ref_frame - LAST_FRAME]) >
+               1.5 * abs(cpi->ref_frame_dist_info
+                             .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) {
+            continue;
+          }
           if (x->pred_mv_sad[ref_frame] > sad_thresh)
             mask->pred_modes[ref_frame] |= INTER_ALL;
+        }
       }
     }
   }
 
-  if (sf->inter_sf.adaptive_mode_search) {
-    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
-        cpi->rc.frames_since_golden >= 3)
-      if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
-        mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL;
+  if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+    if (x->best_pred_mv_sad[0] < INT_MAX) {
+      int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1);
+      const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME };
+
+      // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references
+      for (int ref_idx = 0; ref_idx < 2; ref_idx++) {
+        ref_frame = prune_ref_list[ref_idx];
+        if (x->pred_mv_sad[ref_frame] > sad_thresh)
+          mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+      }
+    }
   }
 
   if (bsize > sf->part_sf.max_intra_bsize) {
     disable_reference(INTRA_FRAME, mask->ref_combo);
   }
 
+  if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+      mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+    }
+  }
+
   mask->pred_modes[INTRA_FRAME] |=
       ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
 }
 
-static AOM_INLINE void init_pred_buf(const MACROBLOCK *const x,
-                                     HandleInterModeArgs *const args) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd)) {
+static AOM_INLINE void init_neighbor_pred_buf(
+    const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
+    int is_hbd) {
+  if (is_hbd) {
     const int len = sizeof(uint16_t);
-    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
-    args->above_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
+    args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred +
+                                                 (MAX_SB_SQUARE >> 1) * len);
     args->above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
-    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+        CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
     args->left_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
     args->left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
   } else {
-    args->above_pred_buf[0] = x->above_pred_buf;
-    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
-    args->left_pred_buf[0] = x->left_pred_buf;
-    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
-    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+    args->above_pred_buf[0] = obmc_buffer->above_pred;
+    args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = obmc_buffer->left_pred;
+    args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE;
+  }
+}
+
+static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                      MV_REFERENCE_FRAME ref_frame) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME rf[2];
+  av1_set_ref_frame(rf, ref_frame);
+
+  if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1;
+
+  if (prune_ref_by_selective_ref_frame(cpi, x, rf,
+                                       cm->cur_frame->ref_display_order_hint)) {
+    return 1;
+  }
+
+  return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+    int ref_frame, int skip_ref_frame_mask) {
+  for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+    if (!(skip_ref_frame_mask & (1 << r))) {
+      const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+      if (rf[0] == ref_frame || rf[1] == ref_frame) {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+                                                 const MB_MODE_INFO *mi_cache) {
+  if (!mi_cache) {
+    return 0;
+  }
+
+  if (ref_frame < REF_FRAMES) {
+    return (ref_frame == mi_cache->ref_frame[0] ||
+            ref_frame == mi_cache->ref_frame[1]);
   }
+
+  // if we are here, then the current mode is compound.
+  MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame);
+  return ref_frame == cached_ref_type;
 }
 
 // Please add/modify parameter setting in this function, making it consistent
@@ -3444,53 +3970,54 @@ static AOM_INLINE void set_params_rd_pick_inter_mode(
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   unsigned char segment_id = mbmi->segment_id;
 
-  init_pred_buf(x, args);
+  init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
   av1_collect_neighbors_ref_counts(xd);
-  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+  estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
                            ref_costs_comp);
 
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-  MV_REFERENCE_FRAME ref_frame;
-  x->best_pred_mv_sad = INT_MAX;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  x->best_pred_mv_sad[0] = INT_MAX;
+  x->best_pred_mv_sad[1] = INT_MAX;
+
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
-    x->mbmi_ext->mode_context[ref_frame] = 0;
+    mbmi_ext->mode_context[ref_frame] = 0;
     mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
-      if (mbmi->partition != PARTITION_NONE &&
-          mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          int skip = 1;
-          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-            if (!(skip_ref_frame_mask & (1 << r))) {
-              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-              if (rf[0] == ref_frame || rf[1] == ref_frame) {
-                skip = 0;
-                break;
-              }
-            }
-          }
-          if (skip) continue;
-        }
+      // Skip the ref frame if the mask says skip and the ref is not used by
+      // compound ref.
+      if (skip_ref_frame_mask & (1 << ref_frame) &&
+          !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) &&
+          !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+        continue;
       }
       assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
       setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
     }
-    // Store the best pred_mv_sad across all past frames
-    if (cpi->sf.inter_sf.alt_ref_search_fp &&
-        cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
-      x->best_pred_mv_sad =
-          AOMMIN(x->best_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+    if (cpi->sf.inter_sf.alt_ref_search_fp ||
+        cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+      // Store the best pred_mv_sad across all past frames
+      if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+          0)
+        x->best_pred_mv_sad[0] =
+            AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]);
+      else
+        // Store the best pred_mv_sad across all future frames
+        x->best_pred_mv_sad[1] =
+            AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]);
+    }
   }
-  // ref_frame = ALTREF_FRAME
-  if (!cpi->sf.rt_sf.use_real_time_ref_set) {
+
+  if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) {
     // No second reference on RT ref set, so no need to initialize
-    for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
-      x->mbmi_ext->mode_context[ref_frame] = 0;
+    for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME;
+         ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+      mbmi_ext->mode_context[ref_frame] = 0;
       mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
       const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
       if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
@@ -3498,12 +4025,14 @@ static AOM_INLINE void set_params_rd_pick_inter_mode(
         continue;
       }
 
-      if (mbmi->partition != PARTITION_NONE &&
-          mbmi->partition != PARTITION_SPLIT) {
-        if (skip_ref_frame_mask & (1 << ref_frame)) {
-          continue;
-        }
+      if (skip_ref_frame_mask & (1 << ref_frame) &&
+          !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+        continue;
       }
+      // Ref mv list population is not required, when compound references are
+      // pruned.
+      if (prune_ref_frame(cpi, x, ref_frame)) continue;
+
       av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
                        xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                        mbmi_ext->mode_context);
@@ -3514,10 +4043,23 @@ static AOM_INLINE void set_params_rd_pick_inter_mode(
   }
 
   av1_count_overlappable_neighbors(cm, xd);
-  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-  const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
-                         cpi->sf.inter_sf.prune_obmc_prob_thresh;
-  if (cpi->oxcf.enable_obmc && !cpi->sf.inter_sf.disable_obmc && !prune_obmc) {
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  int use_actual_frame_probs = 1;
+  int prune_obmc;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  use_actual_frame_probs =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+  if (!use_actual_frame_probs) {
+    prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+                 cpi->sf.inter_sf.prune_obmc_prob_thresh;
+  }
+#endif
+  if (use_actual_frame_probs) {
+    prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+                 cpi->sf.inter_sf.prune_obmc_prob_thresh;
+  }
+  if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) {
     if (check_num_overlappable_neighbors(mbmi) &&
         is_motion_variation_allowed_bsize(bsize)) {
       int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -3547,24 +4089,45 @@ static AOM_INLINE void set_params_rd_pick_inter_mode(
   set_mode_eval_params(cpi, x, MODE_EVAL);
 
   x->comp_rd_stats_idx = 0;
+
+  for (int idx = 0; idx < REF_FRAMES; idx++) {
+    args->best_single_sse_in_refs[idx] = INT32_MAX;
+  }
 }
 
-static AOM_INLINE void init_intra_mode_search_state(
-    IntraModeSearchState *intra_search_state) {
-  intra_search_state->skip_intra_modes = 0;
-  intra_search_state->best_intra_mode = DC_PRED;
-  intra_search_state->angle_stats_ready = 0;
-  av1_zero(intra_search_state->directional_mode_skip_mask);
-  intra_search_state->rate_uv_intra = INT_MAX;
-  av1_zero(intra_search_state->pmi_uv);
-  for (int i = 0; i < REFERENCE_MODES; ++i)
-    intra_search_state->best_pred_rd[i] = INT64_MAX;
+static AOM_INLINE void init_single_inter_mode_search_state(
+    InterModeSearchState *search_state) {
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+        SingleInterModeState *state;
+
+        state = &search_state->single_state[dir][mode][ref_frame];
+        state->ref_frame = NONE_FRAME;
+        state->rd = INT64_MAX;
+
+        state = &search_state->single_state_modelled[dir][mode][ref_frame];
+        state->ref_frame = NONE_FRAME;
+        state->rd = INT64_MAX;
+
+        search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+      }
+    }
+  }
+
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+    search_state->best_single_rd[ref_frame] = INT64_MAX;
+    search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID;
+  }
+  av1_zero(search_state->single_state_cnt);
+  av1_zero(search_state->single_state_modelled_cnt);
 }
 
 static AOM_INLINE void init_inter_mode_search_state(
     InterModeSearchState *search_state, const AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
   init_intra_mode_search_state(&search_state->intra_search_state);
+  av1_invalid_rd_stats(&search_state->best_y_rdcost);
 
   search_state->best_rd = best_rd_so_far;
   search_state->best_skip_rd[0] = INT64_MAX;
@@ -3594,7 +4157,7 @@ static AOM_INLINE void init_inter_mode_search_state(
   for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i)
     search_state->mode_threshold[i] = 0;
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
-  for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+  for (int i = LAST_NEW_MV_INDEX + 1; i < SINGLE_REF_MODE_END; ++i)
     search_state->mode_threshold[i] =
         ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
         RD_THRESH_FAC_FRAC_BITS;
@@ -3606,7 +4169,7 @@ static AOM_INLINE void init_inter_mode_search_state(
   av1_zero(search_state->single_newmv);
   av1_zero(search_state->single_newmv_rate);
   av1_zero(search_state->single_newmv_valid);
-  for (int i = 0; i < MB_MODE_COUNT; ++i) {
+  for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) {
     for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
       for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
         search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
@@ -3615,30 +4178,27 @@ static AOM_INLINE void init_inter_mode_search_state(
     }
   }
 
-  for (int dir = 0; dir < 2; ++dir) {
-    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
-      for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
-        SingleInterModeState *state;
+  for (int i = 0; i < REFERENCE_MODES; ++i) {
+    search_state->best_pred_rd[i] = INT64_MAX;
+  }
 
-        state = &search_state->single_state[dir][mode][ref_frame];
-        state->ref_frame = NONE_FRAME;
-        state->rd = INT64_MAX;
+  if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) {
+    for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i)
+      search_state->mode_threshold[i] =
+          ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+          RD_THRESH_FAC_FRAC_BITS;
 
-        state = &search_state->single_state_modelled[dir][mode][ref_frame];
-        state->ref_frame = NONE_FRAME;
-        state->rd = INT64_MAX;
-      }
-    }
-  }
-  for (int dir = 0; dir < 2; ++dir) {
-    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
-      for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
-        search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+    for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) {
+      for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
+        for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+          search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+          search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+        }
       }
     }
+
+    init_single_inter_mode_search_state(search_state);
   }
-  av1_zero(search_state->single_state_cnt);
-  av1_zero(search_state->single_state_modelled_cnt);
 }
 
 static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
@@ -3703,6 +4263,14 @@ static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
   return picked_ref_frames_mask;
 }
 
+// Check if reference frame pair of the current block matches with the given
+// block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+                                       const MV_REFERENCE_FRAME *ref_frames) {
+  return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+          (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
 // Case 1: return 0, means don't skip this mode
 // Case 2: return 1, means skip this mode completely
 // Case 3: return 2, means skip compound only, but still try single motion modes
@@ -3715,10 +4283,12 @@ static int inter_mode_search_order_independent_skip(
   }
 
   const int ref_type = av1_ref_frame_type(ref_frame);
-  if ((cpi->prune_ref_frame_mask >> ref_type) & 1) return 1;
+  if (!cpi->sf.rt_sf.use_real_time_ref_set)
+    if (prune_ref_frame(cpi, x, ref_type)) return 1;
 
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+      ref_frame[0] == INTRA_FRAME)
     return 1;
 
   const AV1_COMMON *const cm = &cpi->common;
@@ -3726,11 +4296,49 @@ static int inter_mode_search_order_independent_skip(
     return 1;
   }
 
-  const int comp_pred = ref_frame[1] > INTRA_FRAME;
-  if ((!cpi->oxcf.enable_onesided_comp ||
-       cpi->sf.inter_sf.disable_onesided_comp) &&
-      comp_pred && cpi->all_one_sided_refs) {
-    return 1;
+  // Reuse the prediction mode in cache
+  if (x->use_mb_mode_cache) {
+    const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+    const PREDICTION_MODE cached_mode = cached_mi->mode;
+    const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+    const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+    // If the cached mode is intra, then we just need to match the mode.
+    if (is_mode_intra(cached_mode) && mode != cached_mode) {
+      return 1;
+    }
+
+    // If the cached mode is single inter mode, then we match the mode and
+    // reference frame.
+    if (cached_mode_is_single) {
+      if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+        return 1;
+      }
+    } else {
+      // If the cached mode is compound, then we need to consider several cases.
+      const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+      if (mode_is_single) {
+        // If the mode is single, we know the modes can't match. But we might
+        // still want to search it if compound mode depends on the current mode.
+        int skip_motion_mode_only = 0;
+        if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+        } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+        } else if (cached_mode == NEW_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+                                   ref_frame[0] == cached_frame[1]);
+        }
+
+        return 1 + skip_motion_mode_only;
+      } else {
+        // If both modes are compound, then everything must match.
+        if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+            ref_frame[1] != cached_frame[1]) {
+          return 1;
+        }
+      }
+    }
   }
 
   const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
@@ -3740,32 +4348,67 @@ static int inter_mode_search_order_independent_skip(
       x->must_find_valid_partition)
     return 0;
 
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on q index and neighbor's reference
+  // frames
+  if (sf->inter_sf.prune_nearmv_using_neighbors &&
+      (mode == NEAR_NEARMV || mode == NEARMV)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    if (search_state->best_rd != INT64_MAX && xd->left_available &&
+        xd->up_available) {
+      const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+                                                    { 1, 1, 0 },
+                                                    { 2, 1, 0 } };
+      const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+      assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+             qindex_sub_range < 3);
+      const int num_ref_frame_pair_match_thresh =
+          thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+                    [qindex_sub_range];
+
+      assert(num_ref_frame_pair_match_thresh <= 2 &&
+             num_ref_frame_pair_match_thresh >= 0);
+      int num_ref_frame_pair_match = 0;
+
+      num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+      num_ref_frame_pair_match +=
+          match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+      // Pruning based on ref frame pair match with neighbors.
+      if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+    }
+  }
+
   int skip_motion_mode = 0;
-  if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+  if (mbmi->partition != PARTITION_NONE) {
     int skip_ref = skip_ref_frame_mask & (1 << ref_type);
     if (ref_type <= ALTREF_FRAME && skip_ref) {
       // Since the compound ref modes depends on the motion estimation result of
-      // two single ref modes( best mv of single ref modes as the start point )
-      // If current single ref mode is marked skip, we need to check if it will
+      // two single ref modes (best mv of single ref modes as the start point),
+      // if current single ref mode is marked skip, we need to check if it will
       // be used in compound ref modes.
-      for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
-        if (skip_ref_frame_mask & (1 << r)) continue;
-        const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
-        if (rf[0] == ref_type || rf[1] == ref_type) {
-          // Found a not skipped compound ref mode which contains current
-          // single ref. So this single ref can't be skipped completly
-          // Just skip it's motion mode search, still try it's simple
-          // transition mode.
-          skip_motion_mode = 1;
-          skip_ref = 0;
-          break;
-        }
+      if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // Found a not skipped compound ref mode which contains current
+        // single ref. So this single ref can't be skipped completely
+        // Just skip its motion mode search, still try its simple
+        // transition mode.
+        skip_motion_mode = 1;
+        skip_ref = 0;
       }
     }
+    // If we are reusing the prediction from cache, and the current frame is
+    // required by the cache, then we cannot prune it.
+    if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+      skip_ref = 0;
+      // If the cache only needs the current reference type for compound
+      // prediction, then we can skip motion mode search.
+      skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+                          x->mb_mode_cache->ref_frame[1] > INTRA_FRAME);
+    }
     if (skip_ref) return 1;
   }
 
-  const SPEED_FEATURES *const sf = &cpi->sf;
   if (ref_frame[0] == INTRA_FRAME) {
     if (mode != DC_PRED) {
       // Disable intra modes other than DC_PRED for blocks with low variance
@@ -3778,10 +4421,6 @@ static int inter_mode_search_order_independent_skip(
     }
   }
 
-  if (prune_ref_by_selective_ref_frame(cpi, x, ref_frame,
-                                       cm->cur_frame->ref_display_order_hint))
-    return 1;
-
   if (skip_motion_mode) return 2;
 
   return 0;
@@ -4011,8 +4650,8 @@ static int compound_skip_by_single_states(
       int_mv single_mv;
       int_mv comp_mv;
       get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
-                  x->mbmi_ext);
-      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, x->mbmi_ext);
+                  &x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
       if (single_mv.as_int != comp_mv.as_int) {
         ref_mv_match[i] = 0;
         break;
@@ -4056,12 +4695,14 @@ static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
 // Prune compound mode using ref frames of neighbor blocks.
 static INLINE int compound_skip_using_neighbor_refs(
     MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
-    const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) {
+    const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
   // Exclude non-extended compound modes from pruning
   if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
       this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
     return 0;
 
+  if (prune_ext_comp_using_neighbors >= 3) return 1;
+
   int is_ref_match[2] = { 0 };  // 0 - match for forward refs
                                 // 1 - match for backward refs
   // Check if ref frames of this block matches with left neighbor.
@@ -4076,7 +4717,51 @@ static INLINE int compound_skip_using_neighbor_refs(
   const int track_ref_match = is_ref_match[0] + is_ref_match[1];
 
   // Pruning based on ref frame match with neighbors.
-  if (track_ref_match >= prune_compound_using_neighbors) return 0;
+  if (track_ref_match >= prune_ext_comp_using_neighbors) return 0;
+  return 1;
+}
+
+// Update best single mode for the given reference frame based on simple rd.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+                                           const PREDICTION_MODE this_mode,
+                                           const MV_REFERENCE_FRAME ref_frame,
+                                           int64_t this_rd) {
+  if (this_rd < search_state->best_single_rd[ref_frame]) {
+    search_state->best_single_rd[ref_frame] = this_rd;
+    search_state->best_single_mode[ref_frame] = this_mode;
+  }
+}
+
+// Prune compound mode using best single mode for the same reference.
+static INLINE int skip_compound_using_best_single_mode_ref(
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+    const PREDICTION_MODE *best_single_mode,
+    int prune_comp_using_best_single_mode_ref) {
+  // Exclude non-extended compound modes from pruning
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+  const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode);
+  // Get ref frame direction corresponding to NEWMV
+  // 0 - NEWMV corresponding to forward direction
+  // 1 - NEWMV corresponding to backward direction
+  const int newmv_dir = comp_mode_ref0 != NEWMV;
+
+  // Avoid pruning the compound mode when ref frame corresponding to NEWMV
+  // have NEWMV as single mode winner.
+  // Example: For an extended-compound mode,
+  // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}}
+  // - Ref frame corresponding to NEWMV is ALTREF_FRAME
+  // - Avoid pruning this mode, if best single mode corresponding to ref frame
+  //   ALTREF_FRAME is NEWMV
+  const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]];
+  if (single_mode == NEWMV) return 0;
+
+  // Avoid pruning the compound mode when best single mode is not available
+  if (prune_comp_using_best_single_mode_ref == 1)
+    if (single_mode == MB_MODE_COUNT) return 0;
   return 1;
 }
 
@@ -4099,28 +4784,31 @@ static INLINE void update_search_state(
     THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int mode_is_intra =
-      (av1_mode_defs[new_best_mode].mode < INTRA_MODE_END);
-  const int skip = mbmi->skip && !mode_is_intra;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int skip_txfm =
+      mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode);
+  const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
 
   search_state->best_rd = new_best_rd_stats->rdcost;
   search_state->best_mode_index = new_best_mode;
   *best_rd_stats_dst = *new_best_rd_stats;
   search_state->best_mbmode = *mbmi;
-  search_state->best_skip2 = skip;
-  search_state->best_mode_skippable = new_best_rd_stats->skip;
+  search_state->best_skip2 = skip_txfm;
+  search_state->best_mode_skippable = new_best_rd_stats->skip_txfm;
   // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and
   // rate_uv because av1_txfm_search process is replaced by rd estimation.
-  // Therfore, we should avoid updating best_rate_y and best_rate_uv here.
+  // Therefore, we should avoid updating best_rate_y and best_rate_uv here.
   // These two values will be updated when av1_txfm_search is called.
   if (txfm_search_done) {
     search_state->best_rate_y =
         new_best_rd_stats_y->rate +
-        x->skip_cost[skip_ctx][new_best_rd_stats->skip || skip];
+        x->mode_costs.skip_txfm_cost[skip_ctx]
+                                    [new_best_rd_stats->skip_txfm || skip_txfm];
     search_state->best_rate_uv = new_best_rd_stats_uv->rate;
   }
-  memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  search_state->best_y_rdcost = *new_best_rd_stats_y;
+  memcpy(ctx->blk_skip, txfm_info->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
   av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
 }
 
@@ -4158,7 +4846,7 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
     struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
     const motion_mode_best_st_candidate *const best_motion_mode_cands,
     int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
-    InterModeSearchState *const search_state) {
+    InterModeSearchState *const search_state, int64_t *yrd) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -4173,7 +4861,7 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
     av1_init_rd_stats(&rd_stats);
     av1_init_rd_stats(&rd_stats_y);
     av1_init_rd_stats(&rd_stats_uv);
-    int disable_skip = 0, rate_mv;
+    int rate_mv;
 
     rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
     args->skip_motion_mode =
@@ -4185,17 +4873,14 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
     // Continue if the best candidate is compound.
     if (!is_inter_singleref_mode(mbmi->mode)) continue;
 
-    x->force_skip = 0;
-    const int mode_index = get_prediction_mode_idx(
-        mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    struct macroblockd_plane *p = xd->plane;
+    x->txfm_search_info.skip_txfm = 0;
+    struct macroblockd_plane *pd = xd->plane;
     const BUFFER_SET orig_dst = {
-      { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
-      { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+      { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+      { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
     };
 
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    args->simple_rd_state = x->simple_rd_state[mode_index];
     // Initialize motion mode to simple translation
     // Calculation of switchable rate depends on it.
     mbmi->motion_mode = 0;
@@ -4207,10 +4892,11 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
 
     int64_t skip_rd[2] = { search_state->best_skip_rd[0],
                            search_state->best_skip_rd[1] };
+    int64_t this_yrd = INT64_MAX;
     int64_t ret_value = motion_mode_rd(
-        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-        &disable_skip, args, search_state->best_rd, skip_rd, &rate_mv,
-        &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1);
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args,
+        search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd,
+        do_tx_search, inter_modes_info, 1, &this_yrd);
 
     if (ret_value != INT64_MAX) {
       rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
@@ -4220,8 +4906,9 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
       store_winner_mode_stats(
           &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
           mode_enum, NULL, bsize, rd_stats.rdcost,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search);
+          cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
       if (rd_stats.rdcost < search_state->best_rd) {
+        *yrd = this_yrd;
         update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                             &rd_stats_uv, mode_enum, x, do_tx_search);
         if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
@@ -4230,6 +4917,7 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
   }
 }
 
+/*!\cond */
 // Arguments for speed feature pruning of inter mode search
 typedef struct {
   int *skip_motion_mode;
@@ -4238,17 +4926,16 @@ typedef struct {
   int skip_ref_frame_mask;
   int reach_first_comp_mode;
   int mode_thresh_mul_fact;
-  int *intra_mode_idx_ls;
-  int *intra_mode_num;
+  int num_single_modes_processed;
   int prune_cpd_using_sr_stats_ready;
 } InterModeSFArgs;
+/*!\endcond */
 
 static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
                            int64_t *ref_frame_rd, int midx,
-                           InterModeSFArgs *args) {
+                           InterModeSFArgs *args, int is_low_temp_var) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
   // Get the actual prediction mode we are trying in this iteration
   const THR_MODES mode_enum = av1_default_mode_order[midx];
   const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
@@ -4257,18 +4944,21 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
   const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
   const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
   const int comp_pred = second_ref_frame > INTRA_FRAME;
-  const int last_single_ref_mode_idx =
-      find_last_single_ref_mode_idx(av1_default_mode_order);
 
-  // After we done with single reference modes, find the 2nd best RD
-  // for a reference frame. Only search compound modes that have a reference
-  // frame at least as good as the 2nd best.
-  if (sf->inter_sf.prune_compound_using_single_ref &&
-      midx == last_single_ref_mode_idx + 1) {
-    find_top_ref(ref_frame_rd);
-    args->prune_cpd_using_sr_stats_ready = 1;
+  if (ref_frame == INTRA_FRAME) return 1;
+
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE &&
+      comp_pred) {
+    return 1;
   }
 
+  // This is for real time encoding.
+  if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME &&
+      this_mode != NEARESTMV)
+    return 1;
+
   // Check if this mode should be skipped because it is incompatible with the
   // current frame
   if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
@@ -4305,37 +4995,56 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
       return 1;
   }
 
-  // Speed features to prune out INTRA frames
-  if (ref_frame == INTRA_FRAME) {
-    if ((!cpi->oxcf.enable_smooth_intra || sf->intra_sf.disable_smooth_intra) &&
-        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-         mbmi->mode == SMOOTH_V_PRED))
+  if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+    // After we done with single reference modes, find the 2nd best RD
+    // for a reference frame. Only search compound modes that have a reference
+    // frame at least as good as the 2nd best.
+    if (!args->prune_cpd_using_sr_stats_ready &&
+        args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+      find_top_ref(ref_frame_rd);
+      args->prune_cpd_using_sr_stats_ready = 1;
+    }
+    if (args->prune_cpd_using_sr_stats_ready &&
+        !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
       return 1;
-    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) return 1;
-    if (sf->inter_sf.adaptive_mode_search > 1)
-      if ((x->source_variance << num_pels_log2_lookup[bsize]) >
-          args->search_state->best_pred_sse)
-        return 1;
-
-    // Intra modes will be handled in another loop later.
-    assert(*args->intra_mode_num < INTRA_MODES);
-    args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum;
-    return 1;
   }
 
-  if (sf->inter_sf.prune_compound_using_single_ref &&
-      args->prune_cpd_using_sr_stats_ready && comp_pred &&
-      !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) {
+  // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+  if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+      (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
     return 1;
   }
 
-  if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) {
+  if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
     if (compound_skip_using_neighbor_refs(
             xd, this_mode, ref_frames,
-            sf->inter_sf.prune_compound_using_neighbors))
+            sf->inter_sf.prune_ext_comp_using_neighbors))
+      return 1;
+  }
+
+  if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+    if (skip_compound_using_best_single_mode_ref(
+            this_mode, ref_frames, args->search_state->best_single_mode,
+            sf->inter_sf.prune_comp_using_best_single_mode_ref))
+      return 1;
+  }
+
+  if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+    const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+    if (skip_nearest_near_mv_using_refmv_weight(x, this_mode, ref_frame_type))
       return 1;
   }
 
+  if (sf->rt_sf.prune_inter_modes_with_golden_ref &&
+      ref_frame == GOLDEN_FRAME && !comp_pred) {
+    const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL);
+    if (cpi->rc.frames_since_golden > (subgop_size >> 2) &&
+        args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) {
+      if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV)
+        return 1;
+    }
+  }
+
   return 0;
 }
 
@@ -4357,20 +5066,198 @@ static void record_best_compound(REFERENCE_MODE reference_mode,
   hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
 
   if (!comp_pred) {
-    if (single_rd <
-        search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE])
-      search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE] =
-          single_rd;
+    if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE])
+      search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd;
   } else {
-    if (single_rd <
-        search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE])
-      search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE] =
-          single_rd;
-  }
-  if (hybrid_rd <
-      search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT])
-    search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT] =
-        hybrid_rd;
+    if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE])
+      search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+  }
+  if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT])
+    search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+}
+
+// Does a transform search over a list of the best inter mode candidates.
+// This is called if the original mode search computed an RD estimate
+// for the transform search rather than doing a full search.
+static void tx_search_best_inter_candidates(
+    AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+    int64_t best_rd_so_far, BLOCK_SIZE bsize,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col,
+    InterModeSearchState *search_state, RD_STATS *rd_cost,
+    PICK_MODE_CONTEXT *ctx, int64_t *yrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int num_planes = av1_num_planes(cm);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+  search_state->best_rd = best_rd_so_far;
+  search_state->best_mode_index = THR_INVALID;
+  // Initialize best mode stats for winner mode processing
+  x->winner_mode_count = 0;
+  store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+                          NULL, bsize, best_rd_so_far,
+                          cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
+  inter_modes_info->num =
+      inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+          ? inter_modes_info->num
+          : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+  const int64_t top_est_rd =
+      inter_modes_info->num > 0
+          ? inter_modes_info
+                ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+          : INT64_MAX;
+  *yrd = INT64_MAX;
+  int64_t best_rd_in_this_partition = INT64_MAX;
+  int num_inter_mode_cands = inter_modes_info->num;
+  int newmv_mode_evaled = 0;
+  int max_allowed_cands = INT_MAX;
+  if (cpi->sf.inter_sf.limit_inter_mode_cands) {
+    // The bound on the no. of inter mode candidates, beyond which the
+    // candidates are limited if a newmv mode got evaluated, is set as
+    // max_allowed_cands + 1.
+    const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 };
+    assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4);
+    max_allowed_cands =
+        num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands];
+  }
+
+  int num_mode_thresh = INT_MAX;
+  if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) {
+    // Bound the no. of transform searches per prediction mode beyond a
+    // threshold.
+    const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 };
+    assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3);
+    num_mode_thresh =
+        num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode];
+  }
+
+  int num_tx_cands = 0;
+  int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 };
+  // Iterate over best inter mode candidates and perform tx search
+  for (int j = 0; j < num_inter_mode_cands; ++j) {
+    const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+    *mbmi = inter_modes_info->mbmi_arr[data_idx];
+    const PREDICTION_MODE prediction_mode = mbmi->mode;
+    int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+    if (curr_est_rd * 0.80 > top_est_rd) break;
+
+    if (num_tx_cands > num_mode_thresh) {
+      if ((prediction_mode != NEARESTMV &&
+           num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) ||
+          (prediction_mode == NEARESTMV &&
+           num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2))
+        continue;
+    }
+
+    txfm_info->skip_txfm = 0;
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Select prediction reference frames.
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    bool is_predictor_built = false;
+
+    // Initialize RD stats
+    RD_STATS rd_stats;
+    RD_STATS rd_stats_y;
+    RD_STATS rd_stats_uv;
+    const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+    int64_t skip_rd = INT64_MAX;
+    if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+      // Check if the mode is good enough based on skip RD
+      int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+      skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+      int eval_txfm =
+          check_txfm_eval(x, bsize, search_state->best_skip_rd[0], skip_rd,
+                          cpi->sf.inter_sf.txfm_rd_gate_level, 0);
+      if (!eval_txfm) continue;
+    }
+
+    // Build the prediction for this mode
+    if (!is_predictor_built) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                    av1_num_planes(cm) - 1);
+    }
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      av1_build_obmc_inter_predictors_sb(cm, xd);
+    }
+
+    num_tx_cands++;
+    if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1;
+    num_tx_search_modes[prediction_mode - INTER_MODE_START]++;
+    int64_t this_yrd = INT64_MAX;
+    // Do the transform search
+    if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+                         mode_rate, search_state->best_rd)) {
+      continue;
+    } else {
+      const int y_rate =
+          rd_stats.skip_txfm
+              ? mode_costs->skip_txfm_cost[skip_ctx][1]
+              : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]);
+      this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist);
+
+      if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+        inter_mode_data_push(
+            tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist,
+            rd_stats_y.rate + rd_stats_uv.rate +
+                mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+      }
+    }
+    rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+    if (rd_stats.rdcost < best_rd_in_this_partition) {
+      best_rd_in_this_partition = rd_stats.rdcost;
+      *yrd = this_yrd;
+    }
+
+    const THR_MODES mode_enum = get_prediction_mode_idx(
+        prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum,
+        NULL, bsize, rd_stats.rdcost,
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+
+    if (rd_stats.rdcost < search_state->best_rd) {
+      update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                          &rd_stats_uv, mode_enum, x, txfm_search_done);
+      search_state->best_skip_rd[0] = skip_rd;
+      // Limit the total number of modes to be evaluated if the first is valid
+      // and transform skip or compound
+      if (cpi->sf.inter_sf.inter_mode_txfm_breakout) {
+        if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) {
+          // Evaluate more candidates at high quantizers where occurrence of
+          // transform skip is high.
+          const int max_cands_cap[5] = { 2, 3, 5, 7, 9 };
+          const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+          num_inter_mode_cands =
+              AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num);
+        } else if (!j && has_second_ref(&search_state->best_mbmode)) {
+          const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1;
+          // Evaluate more candidates at low quantizers where occurrence of
+          // single reference mode is high.
+          const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 },
+                                                { 10, 7, 5, 3 } };
+          const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS;
+          num_inter_mode_cands = AOMMIN(
+              max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num);
+        }
+      }
+    }
+    // If the number of candidates evaluated exceeds max_allowed_cands, break if
+    // a newmv mode was evaluated already.
+    if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break;
+  }
 }
 
 // Indicates number of winner simple translation modes to be used
@@ -4419,19 +5306,381 @@ static void handle_winner_cand(
   }
 }
 
-void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
-                               MACROBLOCK *x, RD_STATS *rd_cost,
-                               const BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far) {
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. This function however does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
+ *
+ * This function will first iterate through the luma mode candidates to find the
+ * best luma intra mode. Once the best luma mode it's found, it will then search
+ * for the best chroma mode. Because palette mode is currently not handled by
+ * here, a cache of uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * \return Returns the rdcost of the current intra-mode if it's available,
+ * otherwise returns INT64_MAX. The corresponding values in x->e_mbd.mi[0],
+ * rd_stats, rd_stats_y/uv, and best_intra_rd are also updated. Moreover, in the
+ * first evocation of the function, the chroma intra mode result is cached in
+ * intra_search_state to be used in subsequent calls. In the first evaluation
+ * with directional mode, a prune_mask computed with histogram of gradient is
+ * also stored in intra_search_state.
+ *
+ * \param[in,out] search_state      Struct keep track of the prediction mode
+ *                                  search state in interframe.
+ *
+ * \param[in]     cpi               Top-level encoder structure.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current prediction block.
+ * \param[out]    rd_cost           Stores the best rd_cost among all the
+ *                                  prediction modes searched.
+ * \param[in]     bsize             Current block size.
+ * \param[in,out] ctx               Structure to hold the number of 4x4 blks to
+ *                                  copy the tx_type and txfm_skip arrays.
+ *                                  for only the Y plane.
+ * \param[in,out] sf_args           Stores the list of intra mode candidates
+ *                                  to be searched.
+ * \param[in]     intra_ref_frame_cost  The entropy cost for signaling that the
+ *                                      current ref frame is an intra frame.
+ * \param[in]     yrd_threshold     The rdcost threshold for luma intra mode to
+ *                                  terminate chroma intra mode search.
+ *
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static AOM_INLINE void search_intra_modes_in_interframe(
+    InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
+    RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+    InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+    int64_t yrd_threshold) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  IntraModeSearchState *intra_search_state = &search_state->intra_search_state;
+
+  int is_best_y_mode_intra = 0;
+  RD_STATS best_intra_rd_stats_y;
+  int64_t best_rd_y = INT64_MAX;
+  int best_mode_cost_y = -1;
+  MB_MODE_INFO best_mbmi = *xd->mi[0];
+  THR_MODES best_mode_enum = THR_INVALID;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int num_4x4 = bsize_to_num_blk(bsize);
+
+  // Performs luma search
+  int64_t best_model_rd = INT64_MAX;
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) {
+    if (sf->intra_sf.skip_intra_in_interframe &&
+        search_state->intra_search_state.skip_intra_modes)
+      break;
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
+    assert(mbmi->mode < INTRA_MODE_END);
+
+    // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+    if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode))
+      continue;
+
+    const THR_MODES mode_enum =
+        get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME);
+    if ((!intra_mode_cfg->enable_smooth_intra ||
+         cpi->sf.intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      continue;
+    if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+      continue;
+    if (av1_is_directional_mode(mbmi->mode) &&
+        !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+        mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
+    const PREDICTION_MODE this_mode = mbmi->mode;
+
+    assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+    assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+    init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+    x->txfm_search_info.skip_txfm = 0;
+
+    if (this_mode != DC_PRED) {
+      // Only search the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+          (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+        if (search_state->best_mode_index != THR_INVALID &&
+            search_state->best_mbmode.ref_frame[0] > INTRA_FRAME)
+          continue;
+      }
+      if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(
+                this_mode, search_state->intra_search_state.best_intra_mode))
+          continue;
+      }
+    }
+
+    RD_STATS intra_rd_stats_y;
+    int mode_cost_y;
+    int64_t intra_rd_y = INT64_MAX;
+    const int is_luma_result_valid = av1_handle_intra_y_mode(
+        intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
+        &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y,
+        &best_model_rd, top_intra_model_rd);
+    if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
+      is_best_y_mode_intra = 1;
+      if (intra_rd_y < best_rd_y) {
+        best_intra_rd_stats_y = intra_rd_stats_y;
+        best_mode_cost_y = mode_cost_y;
+        best_rd_y = intra_rd_y;
+        best_mbmi = *mbmi;
+        best_mode_enum = mode_enum;
+        memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+               sizeof(best_blk_skip[0]) * num_4x4);
+        av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4);
+      }
+    }
+  }
+
+  if (!is_best_y_mode_intra) {
+    return;
+  }
+
+  assert(best_rd_y < INT64_MAX);
+
+  // Restores the best luma mode
+  *mbmi = best_mbmi;
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * num_4x4);
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4);
+
+  // Performs chroma search
+  RD_STATS intra_rd_stats, intra_rd_stats_uv;
+  av1_init_rd_stats(&intra_rd_stats);
+  av1_init_rd_stats(&intra_rd_stats_uv);
+  const int num_planes = av1_num_planes(cm);
+  if (num_planes > 1) {
+    const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe(
+        intra_search_state, cpi, x, bsize, &intra_rd_stats,
+        &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd);
+
+    if (!intra_uv_mode_valid) {
+      return;
+    }
+  }
+
+  // Merge the luma and chroma rd stats
+  assert(best_mode_cost_y >= 0);
+  intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+    // in the tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const PREDICTION_MODE mode = mbmi->mode;
+  if (num_planes > 1 && xd->is_chroma_ref) {
+    const int uv_mode_cost =
+        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+    intra_rd_stats.rate +=
+        intra_rd_stats_uv.rate +
+        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+  }
+
+  // Intra block is always coded as non-skip
+  intra_rd_stats.skip_txfm = 0;
+  intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist;
+  // Add in the cost of the no skip flag.
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+  // Calculate the final RD estimate for this mode.
+  const int64_t this_rd =
+      RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist);
+  // Keep record of best intra rd
+  if (this_rd < search_state->best_intra_rd) {
+    search_state->best_intra_rd = this_rd;
+    intra_search_state->best_intra_mode = mode;
+  }
+
+  for (int i = 0; i < REFERENCE_MODES; ++i) {
+    search_state->best_pred_rd[i] =
+        AOMMIN(search_state->best_pred_rd[i], this_rd);
+  }
+
+  intra_rd_stats.rdcost = this_rd;
+
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y,
+      &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+      cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+  if (intra_rd_stats.rdcost < search_state->best_rd) {
+    update_search_state(search_state, rd_cost, ctx, &intra_rd_stats,
+                        &best_intra_rd_stats_y, &intra_rd_stats_uv,
+                        best_mode_enum, x, txfm_search_done);
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Prepare inter_cost and intra_cost from TPL stats, which are used as ML
+// features in intra mode pruning.
+static AOM_INLINE void calculate_cost_from_tpl_data(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // Only consider full SB.
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+  const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                  (block_size_high[sb_size] / tpl_bsize_1d);
+  SuperBlockEnc *sb_enc = &x->sb_enc;
+  if (sb_enc->tpl_data_count == len) {
+    const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+    const int tpl_stride = sb_enc->tpl_stride;
+    const int tplw = mi_size_wide[tpl_bsize];
+    const int tplh = mi_size_high[tpl_bsize];
+    const int nw = mi_size_wide[bsize] / tplw;
+    const int nh = mi_size_high[bsize] / tplh;
+    if (nw >= 1 && nh >= 1) {
+      const int of_h = mi_row % mi_size_high[sb_size];
+      const int of_w = mi_col % mi_size_wide[sb_size];
+      const int start = of_h / tplh * tpl_stride + of_w / tplw;
+
+      for (int k = 0; k < nh; k++) {
+        for (int l = 0; l < nw; l++) {
+          *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+          *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
+        }
+      }
+      *inter_cost /= nw * nh;
+      *intra_cost /= nw * nh;
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// When the speed feature skip_intra_in_interframe > 0, enable ML model to prune
+// intra mode search.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+    AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+    InterModeSearchState *search_state, const SPEED_FEATURES *const sf,
+    int64_t inter_cost, int64_t intra_cost) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME;
+  if (sf->rt_sf.prune_intra_mode_based_on_mv_range &&
+      bsize > sf->part_sf.max_intra_bsize && !comp_pred) {
+    const MV best_mv = search_state->best_mbmode.mv[0].as_mv;
+    const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range;
+    if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh &&
+        x->source_variance > 128) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+
+  const unsigned int src_var_thresh_intra_skip = 1;
+  const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe;
+  if (!(skip_intra_in_interframe &&
+        (x->source_variance > src_var_thresh_intra_skip)))
+    return;
+
+  // Prune intra search based on best inter mode being transfrom skip.
+  if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) {
+    const int qindex_thresh[2] = { 200, MAXQ };
+    const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0;
+    if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+        (x->qindex <= qindex_thresh[ind])) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    } else if ((skip_intra_in_interframe >= 4) &&
+               (inter_cost < 0 || intra_cost < 0)) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+  // Use ML model to prune intra search.
+  if (inter_cost >= 0 && intra_cost >= 0) {
+    const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+                                     ? &av1_intrap_nn_config
+                                     : &av1_intrap_hd_nn_config;
+    float nn_features[6];
+    float scores[2] = { 0.0f };
+
+    nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+    nn_features[1] = (float)mi_size_wide_log2[bsize];
+    nn_features[2] = (float)mi_size_high_log2[bsize];
+    nn_features[3] = (float)intra_cost;
+    nn_features[4] = (float)inter_cost;
+    const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+    const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+    nn_features[5] = (float)(ac_q_max / ac_q);
+
+    av1_nn_predict(nn_features, nn_config, 1, scores);
+
+    // For two parameters, the max prob returned from av1_nn_softmax equals
+    // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the
+    // calling of av1_nn_softmax.
+    const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+    assert(skip_intra_in_interframe <= 5);
+    if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+    }
+  }
+}
+
+static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
+                                         const MACROBLOCK *x,
+                                         BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+
+  if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION ||
+      !sf->rt_sf.short_circuit_low_temp_var ||
+      !sf->rt_sf.prune_inter_modes_using_temp_var) {
+    return 0;
+  }
+
+  const int mi_row = x->e_mbd.mi_row;
+  const int mi_col = x->e_mbd.mi_col;
+  int is_low_temp_var = 0;
+
+  if (cm->seq_params->sb_size == BLOCK_64X64)
+    is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+        &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+  else
+    is_low_temp_var = av1_get_force_skip_low_temp_var(
+        &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+
+  return is_low_temp_var;
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far) {
   AV1_COMMON *const cm = &cpi->common;
   const FeatureFlags *const features = &cm->features;
   const int num_planes = av1_num_planes(cm);
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int i;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int *comp_inter_cost =
-      x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+      mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
 
   InterModeSearchState search_state;
   init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
@@ -4453,15 +5702,24 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                search_state.simple_rd,
                                0,
                                interintra_modes,
-                               1,
-                               NULL,
                                { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
-                               0 };
+                               { { 0, 0 } },
+                               0,
+                               0,
+                               -1,
+                               -1,
+                               -1,
+                               { 0 },
+                               { 0 },
+                               UINT_MAX };
+  // Currently, is_low_temp_var is used in real time encoding.
+  const int is_low_temp_var = get_block_temp_var(cpi, x, bsize);
+
+  for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
   // Indicates the appropriate number of simple translation winner modes for
   // exhaustive motion mode evaluation
   const int max_winner_motion_mode_cand =
-      num_winner_motion_modes[cpi->sf.winner_mode_sf
-                                  .motion_mode_for_winner_cand];
+      num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand];
   assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
   motion_mode_candidate motion_mode_cand;
   motion_mode_best_st_candidate best_motion_mode_cands;
@@ -4474,21 +5732,28 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 
   av1_invalid_rd_stats(rd_cost);
 
+  for (i = 0; i < REF_FRAMES; ++i) {
+    x->warp_sample_info[i].num = -1;
+  }
+
   // Ref frames that are selected by square partition blocks.
   int picked_ref_frames_mask = 0;
-  if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions &&
-      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+  if (sf->inter_sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE) {
     // prune_ref_frame_for_rect_partitions = 1 implies prune only extended
     // partition blocks. prune_ref_frame_for_rect_partitions >=2
     // implies prune for vert, horiz and extended partition blocks.
     if ((mbmi->partition != PARTITION_VERT &&
          mbmi->partition != PARTITION_HORZ) ||
-        cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+        sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
       picked_ref_frames_mask =
-          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size);
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
     }
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
   // Skip ref frames that never selected by square blocks.
   const int skip_ref_frame_mask =
       picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
@@ -4500,24 +5765,23 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
                                 skip_ref_frame_mask, ref_costs_single,
                                 ref_costs_comp, yv12_mb);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
 
   int64_t best_est_rd = INT64_MAX;
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   // If do_tx_search is 0, only estimated RD should be computed.
   // If do_tx_search is 1, all modes have TX search performed.
   const int do_tx_search =
-      !((cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
-        (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 &&
-         num_pels_log2_lookup[bsize] > 8) ||
-        cpi->sf.rt_sf.force_tx_search_off);
+      !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (sf->inter_sf.inter_mode_rd_model_estimation == 2 &&
+         num_pels_log2_lookup[bsize] > 8));
   InterModesInfo *inter_modes_info = x->inter_modes_info;
   inter_modes_info->num = 0;
 
-  int intra_mode_num = 0;
-  int intra_mode_idx_ls[INTRA_MODES];
-
   // Temporary buffers used by handle_inter_mode().
-  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
 
   // The best RD found for the reference frame, among single reference modes.
   // Note that the 0-th element will contain a cut-off that is later used
@@ -4525,7 +5789,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
                                        INT64_MAX, INT64_MAX, INT64_MAX,
                                        INT64_MAX, INT64_MAX };
-  const int skip_ctx = av1_get_skip_context(xd);
 
   // Prepared stats used later to check if we could skip intra mode eval.
   int64_t inter_cost = -1;
@@ -4537,10 +5800,10 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
   // Obtain the relevant tpl stats for pruning inter modes
   PruneInfoFromTpl inter_cost_info_from_tpl;
 #if !CONFIG_REALTIME_ONLY
-  if (cpi->sf.inter_sf.prune_inter_modes_based_on_tpl) {
-    // x->search_ref_frame[id] = 1 => no pruning in
+  if (sf->inter_sf.prune_inter_modes_based_on_tpl) {
+    // x->tpl_keep_ref_frame[id] = 1 => no pruning in
     // prune_ref_by_selective_ref_frame()
-    // x->search_ref_frame[id] = 0  => ref frame can be pruned in
+    // x->tpl_keep_ref_frame[id] = 0  => ref frame can be pruned in
     // prune_ref_by_selective_ref_frame()
     // Populating valid_refs[idx] = 1 ensures that
     // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
@@ -4549,7 +5812,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
       const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
       valid_refs[frame - 1] =
-          x->search_ref_frame[frame] ||
+          x->tpl_keep_ref_frame[frame] ||
           !prune_ref_by_selective_ref_frame(
               cpi, x, refs, cm->cur_frame->ref_display_order_hint);
     }
@@ -4557,42 +5820,23 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs,
                               &inter_cost_info_from_tpl);
   }
-#endif
+
   const int do_pruning =
       (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
-  if (do_pruning && sf->intra_sf.skip_intra_in_interframe) {
-    // Only consider full SB.
-    int len = tpl_blocks_in_sb(cm->seq_params.sb_size);
-    if (len == x->valid_cost_b) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
-      const int tplw = mi_size_wide[tpl_bsize];
-      const int tplh = mi_size_high[tpl_bsize];
-      const int nw = mi_size_wide[bsize] / tplw;
-      const int nh = mi_size_high[bsize] / tplh;
-      if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
-        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
-
-        for (int k = 0; k < nh; k++) {
-          for (int l = 0; l < nw; l++) {
-            inter_cost += x->inter_cost_b[start + k * x->cost_stride + l];
-            intra_cost += x->intra_cost_b[start + k * x->cost_stride + l];
-          }
-        }
-        inter_cost /= nw * nh;
-        intra_cost /= nw * nh;
-      }
-    }
-  }
-
-  // Initialize best mode stats for winner mode processing
-  av1_zero(x->winner_mode_stats);
+  if (do_pruning && sf->intra_sf.skip_intra_in_interframe &&
+      cpi->oxcf.algo_cfg.enable_tpl_model)
+    calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost,
+                                 &intra_cost);
+#endif  // !CONFIG_REALTIME_ONLY
+
+  // Initialize best mode stats for winner mode processing.
+  const int max_winner_mode_count =
+      winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+  zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
   x->winner_mode_count = 0;
-  store_winner_mode_stats(
-      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
-      best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-      0);
+  store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+                          NULL, bsize, best_rd_so_far,
+                          sf->winner_mode_sf.multi_winner_mode_type, 0);
 
   int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
   if (sf->inter_sf.prune_inter_modes_if_skippable) {
@@ -4607,16 +5851,29 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                               skip_ref_frame_mask,
                               0,
                               mode_thresh_mul_fact,
-                              intra_mode_idx_ls,
-                              &intra_mode_num,
+                              0,
                               0 };
+  int64_t best_inter_yrd = INT64_MAX;
 
+  // This is the main loop of this function. It loops over all possible inter
+  // modes and calls handle_inter_mode() to compute the RD for each.
   // Here midx is just an iterator index that should not be used by itself
   // except to keep track of the number of modes searched. It should be used
   // with av1_default_mode_order to get the enum that defines the mode, which
   // can be used with av1_mode_defs to get the prediction mode and the ref
   // frames.
-  for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) {
+  // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings
+  // good speedup for real time case. If we decide to use compound mode in real
+  // time, maybe we can modify av1_default_mode_order table.
+  THR_MODES mode_start = THR_INTER_MODE_START;
+  THR_MODES mode_end = THR_INTER_MODE_END;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  if (current_frame->reference_mode == SINGLE_REFERENCE) {
+    mode_start = SINGLE_REF_MODE_START;
+    mode_end = SINGLE_REF_MODE_END;
+  }
+
+  for (THR_MODES midx = mode_start; midx < mode_end; ++midx) {
     // Get the actual prediction mode we are trying in this iteration
     const THR_MODES mode_enum = av1_default_mode_order[midx];
     const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
@@ -4631,11 +5888,19 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 
     init_mbmi(mbmi, this_mode, ref_frames, cm);
 
-    x->force_skip = 0;
+    txfm_info->skip_txfm = 0;
+    sf_args.num_single_modes_processed += is_single_pred;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, skip_inter_mode_time);
+#endif
     // Apply speed features to decide if this inter mode can be skipped
-    if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue;
+    const int is_skip_inter_mode = skip_inter_mode(
+        cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, skip_inter_mode_time);
+#endif
+    if (is_skip_inter_mode) continue;
 
     // Select prediction reference frames.
     for (i = 0; i < num_planes; i++) {
@@ -4649,7 +5914,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     mbmi->ref_mv_idx = 0;
 
     const int64_t ref_best_rd = search_state.best_rd;
-    int disable_skip = 0;
     RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
     av1_init_rd_stats(&rd_stats);
 
@@ -4657,7 +5921,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                    ? ref_costs_comp[ref_frame][second_ref_frame]
                                    : ref_costs_single[ref_frame];
     const int compmode_cost =
-        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+        is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0;
     const int real_compmode_cost =
         cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
             ? compmode_cost
@@ -4668,26 +5932,36 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     args.single_newmv_valid = search_state.single_newmv_valid;
     args.single_comp_cost = real_compmode_cost;
     args.ref_frame_cost = ref_frame_cost;
-    if (is_single_pred) {
-      args.simple_rd_state = x->simple_rd_state[mode_enum];
-    }
+    args.best_pred_sse = search_state.best_pred_sse;
 
     int64_t skip_rd[2] = { search_state.best_skip_rd[0],
                            search_state.best_skip_rd[1] };
+    int64_t this_yrd = INT64_MAX;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, handle_inter_mode_time);
+#endif
     int64_t this_rd = handle_inter_mode(
-        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-        &disable_skip, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer,
-        &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand,
-        skip_rd, &inter_cost_info_from_tpl);
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args,
+        ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search,
+        inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl,
+        &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, handle_inter_mode_time);
+#endif
+    if (current_frame->reference_mode != SINGLE_REFERENCE) {
+      if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+          is_inter_singleref_mode(this_mode)) {
+        collect_single_states(x, &search_state, mbmi);
+      }
 
-    if (sf->inter_sf.prune_comp_search_by_single_result > 0 &&
-        is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
-      collect_single_states(x, &search_state, mbmi);
+      if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 &&
+          is_inter_singleref_mode(this_mode))
+        update_best_single_mode(&search_state, this_mode, ref_frame, this_rd);
     }
 
     if (this_rd == INT64_MAX) continue;
 
-    if (mbmi->skip) {
+    if (mbmi->skip_txfm) {
       rd_stats_y.rate = 0;
       rd_stats_uv.rate = 0;
     }
@@ -4702,12 +5976,13 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       assert(IMPLIES(comp_pred,
                      cm->current_frame.reference_mode != SINGLE_REFERENCE));
       search_state.best_pred_sse = x->pred_sse[ref_frame];
+      best_inter_yrd = this_yrd;
       update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                           &rd_stats_uv, mode_enum, x, do_tx_search);
       if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
       search_state.best_skip_rd[1] = skip_rd[1];
     }
-    if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
       // Add this mode to motion mode candidate list for motion mode search
       // if using motion_mode_for_winner_cand speed feature
       handle_winner_cand(mbmi, &best_motion_mode_cands,
@@ -4716,108 +5991,34 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     }
 
     /* keep record of best compound/single-only prediction */
-    if (!disable_skip) {
-      record_best_compound(cm->current_frame.reference_mode, &rd_stats,
-                           comp_pred, x->rdmult, &search_state, compmode_cost);
-    }
+    record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+                         x->rdmult, &search_state, compmode_cost);
   }
 
-  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+  if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
     // For the single ref winner candidates, evaluate other motion modes (non
     // simple translation).
     evaluate_motion_mode_for_winner_candidates(
         cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
         &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
-        &search_state);
+        &search_state, &best_inter_yrd);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, do_tx_search_time);
 #endif
   if (do_tx_search != 1) {
-    inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
-    search_state.best_rd = best_rd_so_far;
-    search_state.best_mode_index = THR_INVALID;
-    // Initialize best mode stats for winner mode processing
-    x->winner_mode_count = 0;
-    store_winner_mode_stats(
-        &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize,
-        best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        do_tx_search);
-    inter_modes_info->num =
-        inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
-            ? inter_modes_info->num
-            : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
-    const int64_t top_est_rd =
-        inter_modes_info->num > 0
-            ? inter_modes_info
-                  ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
-            : INT64_MAX;
-    for (int j = 0; j < inter_modes_info->num; ++j) {
-      const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
-      *mbmi = inter_modes_info->mbmi_arr[data_idx];
-      int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
-      if (curr_est_rd * 0.80 > top_est_rd) break;
-
-      x->force_skip = 0;
-      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-      // Select prediction reference frames.
-      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
-      for (i = 0; i < num_planes; i++) {
-        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-      }
-
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
-                                    av1_num_planes(cm) - 1);
-      if (mbmi->motion_mode == OBMC_CAUSAL) {
-        av1_build_obmc_inter_predictors_sb(cm, xd);
-      }
-
-      RD_STATS rd_stats;
-      RD_STATS rd_stats_y;
-      RD_STATS rd_stats_uv;
-      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-      int64_t skip_rd = INT64_MAX;
-      if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-        // Check if the mode is good enough based on skip RD
-        int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
-        skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
-        int eval_txfm =
-            check_txfm_eval(x, bsize, search_state.best_skip_rd[0], skip_rd,
-                            cpi->sf.inter_sf.txfm_rd_gate_level, 0);
-        if (!eval_txfm) continue;
-      }
-
-      if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
-                           mode_rate, search_state.best_rd)) {
-        continue;
-      } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
-        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
-                             rd_stats.dist,
-                             rd_stats_y.rate + rd_stats_uv.rate +
-                                 x->skip_cost[skip_ctx][mbmi->skip]);
-      }
-      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
-
-      const THR_MODES mode_enum = get_prediction_mode_idx(
-          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-      // Collect mode stats for multiwinner mode processing
-      const int txfm_search_done = 1;
-      store_winner_mode_stats(
-          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
-          mode_enum, NULL, bsize, rd_stats.rdcost,
-          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-          txfm_search_done);
-
-      if (rd_stats.rdcost < search_state.best_rd) {
-        update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
-                            &rd_stats_uv, mode_enum, x, txfm_search_done);
-        search_state.best_skip_rd[0] = skip_rd;
-      }
-    }
+    // A full tx search has not yet been done, do tx search for
+    // top mode candidates
+    tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+                                    yv12_mb, mi_row, mi_col, &search_state,
+                                    rd_cost, ctx, &best_inter_yrd);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, do_tx_search_time);
@@ -4826,117 +6027,53 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, handle_intra_mode_time);
 #endif
-
   // Gate intra mode evaluation if best of inter is skip except when source
-  // variance is extremely low
-  if (sf->intra_sf.skip_intra_in_interframe &&
-      (x->source_variance > sf->intra_sf.src_var_thresh_intra_skip)) {
-    if (inter_cost >= 0 && intra_cost >= 0) {
-      aom_clear_system_state();
-      const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
-                                       ? &av1_intrap_nn_config
-                                       : &av1_intrap_hd_nn_config;
-      float nn_features[6];
-      float scores[2] = { 0.0f };
-      float probs[2] = { 0.0f };
-
-      nn_features[0] = (float)search_state.best_mbmode.skip;
-      nn_features[1] = (float)mi_size_wide_log2[bsize];
-      nn_features[2] = (float)mi_size_high_log2[bsize];
-      nn_features[3] = (float)intra_cost;
-      nn_features[4] = (float)inter_cost;
-      const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
-      const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
-      nn_features[5] = (float)(ac_q_max / ac_q);
-
-      av1_nn_predict(nn_features, nn_config, 1, scores);
-      aom_clear_system_state();
-      av1_nn_softmax(scores, probs, 2);
-
-      if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1;
-    } else if ((search_state.best_mbmode.skip) &&
-               (sf->intra_sf.skip_intra_in_interframe >= 2)) {
-      search_state.intra_search_state.skip_intra_modes = 1;
-    }
-  }
-
-  const int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
-  for (int j = 0; j < intra_mode_num; ++j) {
-    if (sf->intra_sf.skip_intra_in_interframe &&
-        search_state.intra_search_state.skip_intra_modes)
-      break;
-    const THR_MODES mode_enum = intra_mode_idx_ls[j];
-    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
-    const PREDICTION_MODE this_mode = mode_def->mode;
-
-    assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
-    assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
-    init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
-    x->force_skip = 0;
-
-    if (this_mode != DC_PRED) {
-      // Only search the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
-        if (search_state.best_mode_index != THR_INVALID &&
-            search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
-          continue;
-      }
-      if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(
-                this_mode, search_state.intra_search_state.best_intra_mode))
-          continue;
-      }
-    }
-
-    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-    intra_rd_stats.rdcost = av1_handle_intra_mode(
-        &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
-        ctx, 0, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv,
-        search_state.best_rd, &search_state.best_intra_rd,
-        search_state.best_mbmode.skip);
-    // Collect mode stats for multiwinner mode processing
-    const int txfm_search_done = 1;
-    store_winner_mode_stats(
-        &cpi->common, x, mbmi, &intra_rd_stats, &intra_rd_stats_y,
-        &intra_rd_stats_uv, mode_enum, NULL, bsize, intra_rd_stats.rdcost,
-        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-        txfm_search_done);
-    if (intra_rd_stats.rdcost < search_state.best_rd) {
-      update_search_state(&search_state, rd_cost, ctx, &intra_rd_stats,
-                          &intra_rd_stats_y, &intra_rd_stats_uv, mode_enum, x,
-                          txfm_search_done);
-    }
-  }
+  // variance is extremely low and also based on max intra bsize.
+  skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost,
+                                 intra_cost);
+
+  const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+  search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+                                   &sf_args, intra_ref_frame_cost,
+                                   best_inter_yrd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, handle_intra_mode_time);
 #endif
 
-  int winner_mode_count = cpi->sf.winner_mode_sf.enable_multiwinner_mode_process
-                              ? x->winner_mode_count
-                              : 1;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, refine_winner_mode_tx_time);
+#endif
+  int winner_mode_count =
+      sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
   // In effect only when fast tx search speed features are enabled.
   refine_winner_mode_tx(
       cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
       &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
       search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, refine_winner_mode_tx_time);
+#endif
 
   // Initialize default mode evaluation params
   set_mode_eval_params(cpi, x, DEFAULT_EVAL);
 
   // Only try palette mode when the best mode so far is an intra mode.
   const int try_palette =
-      cpi->oxcf.enable_palette &&
-      av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) &&
-      !is_inter_mode(search_state.best_mbmode.mode);
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      cpi->oxcf.tool_cfg.enable_palette &&
+      av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) &&
+      !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX;
   RD_STATS this_rd_cost;
   int this_skippable = 0;
   if (try_palette) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_search_palette_mode_time);
+#endif
     this_skippable = av1_search_palette_mode(
-        cpi, x, &this_rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single,
-        &search_state.intra_search_state, search_state.best_rd);
+        &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+        ctx, &this_rd_cost, search_state.best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_search_palette_mode_time);
+#endif
     if (this_rd_cost.rdcost < search_state.best_rd) {
       search_state.best_mode_index = THR_DC;
       mbmi->mv[0].as_int = 0;
@@ -4947,8 +6084,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
       search_state.best_mbmode = *mbmi;
       search_state.best_skip2 = 0;
       search_state.best_mode_skippable = this_skippable;
-      memcpy(ctx->blk_skip, x->blk_skip,
-             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, txfm_info->blk_skip,
+             sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
       av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
     }
   }
@@ -4989,15 +6126,15 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
           search_state.best_mbmode.interp_filters.as_filters.x_filter) ||
          !is_inter_block(&search_state.best_mbmode));
 
-  if (!cpi->rc.is_src_frame_alt_ref && cpi->sf.inter_sf.adaptive_rd_thresh) {
-    av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
-                              sf->inter_sf.adaptive_rd_thresh, bsize,
-                              search_state.best_mode_index);
+  if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) {
+    av1_update_rd_thresh_fact(
+        cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize,
+        search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES);
   }
 
   // macroblock modes
   *mbmi = search_state.best_mbmode;
-  x->force_skip |= search_state.best_skip2;
+  txfm_info->skip_txfm |= search_state.best_skip2;
 
   // Note: this section is needed since the mode may have been forced to
   // GLOBALMV by the all-zero mode handling of ref-mv.
@@ -5011,30 +6148,18 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
     }
   }
 
-  for (i = 0; i < REFERENCE_MODES; ++i) {
-    if (search_state.intra_search_state.best_pred_rd[i] == INT64_MAX) {
-      search_state.best_pred_diff[i] = INT_MIN;
-    } else {
-      search_state.best_pred_diff[i] =
-          search_state.best_rd -
-          search_state.intra_search_state.best_pred_rd[i];
-    }
-  }
-
-  x->force_skip |= search_state.best_mode_skippable;
+  txfm_info->skip_txfm |= search_state.best_mode_skippable;
 
   assert(search_state.best_mode_index != THR_INVALID);
 
 #if CONFIG_INTERNAL_STATS
   store_coding_context(x, ctx, search_state.best_mode_index,
-                       search_state.best_pred_diff,
                        search_state.best_mode_skippable);
 #else
-  store_coding_context(x, ctx, search_state.best_pred_diff,
-                       search_state.best_mode_skippable);
+  store_coding_context(x, ctx, search_state.best_mode_skippable);
 #endif  // CONFIG_INTERNAL_STATS
 
-  if (pmi->palette_size[1] > 0) {
+  if (mbmi->palette_mode_info.palette_size[1] > 0) {
     assert(try_palette);
     av1_restore_uv_color_map(cpi, x);
   }
@@ -5053,10 +6178,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   unsigned char segment_id = mbmi->segment_id;
   const int comp_pred = 0;
   int i;
-  int64_t best_pred_diff[REFERENCE_MODES];
   unsigned int ref_costs_single[REF_FRAMES];
   unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
-  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *comp_inter_cost =
+      mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
   InterpFilter best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -5067,7 +6193,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 
   av1_collect_neighbors_ref_counts(xd);
 
-  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+  estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single,
                            ref_costs_comp);
 
   for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
@@ -5094,7 +6220,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
                            mi_row, features->cur_frame_force_integer_mv)
           .as_int;
   mbmi->tx_size = max_txsize_lookup[bsize];
-  x->force_skip = 1;
+  x->txfm_search_info.skip_txfm = 1;
 
   mbmi->ref_mv_idx = 0;
 
@@ -5104,9 +6230,10 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
     mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
     // Select the samples according to motion vector difference
-    if (mbmi->num_proj_ref > 1)
+    if (mbmi->num_proj_ref > 1) {
       mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
                                              mbmi->num_proj_ref, bsize);
+    }
   }
 
   const InterpFilter interp_filter = features->interp_filter;
@@ -5116,14 +6243,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
     best_filter = interp_filter;
   } else {
     best_filter = EIGHTTAP_REGULAR;
-    if (av1_is_interp_needed(xd) &&
-        x->source_variance >=
-            cpi->sf.interp_sf.disable_filter_search_var_thresh) {
+    if (av1_is_interp_needed(xd)) {
       int rs;
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         mbmi->interp_filters = av1_broadcast_interp_filter(i);
-        rs = av1_get_switchable_rate(x, xd, interp_filter);
+        rs = av1_get_switchable_rate(x, xd, interp_filter,
+                                     cm->seq_params->enable_dual_filter);
         if (rs < best_rs) {
           best_rs = rs;
           best_filter = mbmi->interp_filters.as_filters.y_filter;
@@ -5133,7 +6259,8 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   }
   // Set the appropriate filter
   mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
-  rate2 += av1_get_switchable_rate(x, xd, interp_filter);
+  rate2 += av1_get_switchable_rate(x, xd, interp_filter,
+                                   cm->seq_params->enable_dual_filter);
 
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
     rate2 += comp_inter_cost[comp_pred];
@@ -5159,24 +6286,25 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   if (cpi->sf.inter_sf.adaptive_rd_thresh) {
     av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
                               cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
-                              THR_GLOBALMV);
+                              THR_GLOBALMV, THR_INTER_MODE_START,
+                              THR_INTER_MODE_END, THR_DC, MAX_MODES);
   }
 
-  av1_zero(best_pred_diff);
-
 #if CONFIG_INTERNAL_STATS
-  store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0);
+  store_coding_context(x, ctx, THR_GLOBALMV, 0);
 #else
-  store_coding_context(x, ctx, best_pred_diff, 0);
+  store_coding_context(x, ctx, 0);
 #endif  // CONFIG_INTERNAL_STATS
 }
 
+/*!\cond */
 struct calc_target_weighted_pred_ctxt {
-  const MACROBLOCK *x;
+  const OBMCBuffer *obmc_buffer;
   const uint8_t *tmp;
   int tmp_stride;
   int overlap;
 };
+/*!\endcond */
 
 static INLINE void calc_target_weighted_pred_above(
     MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
@@ -5192,8 +6320,8 @@ static INLINE void calc_target_weighted_pred_above(
   const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
-  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
-  int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+  int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE);
+  int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE);
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
   const int is_hbd = is_cur_buf_hbd(xd);
 
@@ -5240,8 +6368,8 @@ static INLINE void calc_target_weighted_pred_left(
   const int bw = xd->width << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
-  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
-  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
   const int is_hbd = is_cur_buf_hbd(xd);
 
@@ -5318,11 +6446,12 @@ static AOM_INLINE void calc_target_weighted_pred(
     const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
     const uint8_t *above, int above_stride, const uint8_t *left,
     int left_stride) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int bw = xd->width << MI_SIZE_LOG2;
   const int bh = xd->height << MI_SIZE_LOG2;
-  int32_t *mask_buf = x->mask_buf;
-  int32_t *wsrc_buf = x->wsrc_buf;
+  const OBMCBuffer *obmc_buffer = &x->obmc_buffer;
+  int32_t *mask_buf = obmc_buffer->mask;
+  int32_t *wsrc_buf = obmc_buffer->wsrc;
 
   const int is_hbd = is_cur_buf_hbd(xd);
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
@@ -5338,8 +6467,8 @@ static AOM_INLINE void calc_target_weighted_pred(
   if (xd->up_available) {
     const int overlap =
         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
-    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
-                                                   overlap };
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above,
+                                                   above_stride, overlap };
     foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
                                   max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                   calc_target_weighted_pred_above, &ctxt);
@@ -5354,8 +6483,8 @@ static AOM_INLINE void calc_target_weighted_pred(
   if (xd->left_available) {
     const int overlap =
         AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
-                                                   overlap };
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left,
+                                                   left_stride, overlap };
     foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
                                  max_neighbor_obmc[mi_size_high_log2[bsize]],
                                  calc_target_weighted_pred_left, &ctxt);
@@ -5383,123 +6512,3 @@ static AOM_INLINE void calc_target_weighted_pred(
     }
   }
 }
-
-/* Use standard 3x3 Sobel matrix. Macro so it can be used for either high or
-   low bit-depth arrays. */
-#define SOBEL_X(src, stride, i, j)                       \
-  ((src)[((i)-1) + (stride) * ((j)-1)] -                 \
-   (src)[((i) + 1) + (stride) * ((j)-1)] +  /* NOLINT */ \
-   2 * (src)[((i)-1) + (stride) * (j)] -    /* NOLINT */ \
-   2 * (src)[((i) + 1) + (stride) * (j)] +  /* NOLINT */ \
-   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
-   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
-#define SOBEL_Y(src, stride, i, j)                       \
-  ((src)[((i)-1) + (stride) * ((j)-1)] +                 \
-   2 * (src)[(i) + (stride) * ((j)-1)] +    /* NOLINT */ \
-   (src)[((i) + 1) + (stride) * ((j)-1)] -  /* NOLINT */ \
-   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
-   2 * (src)[(i) + (stride) * ((j) + 1)] -  /* NOLINT */ \
-   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
-
-sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
-                   bool high_bd) {
-  int16_t s_x;
-  int16_t s_y;
-  if (high_bd) {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(input);
-    s_x = SOBEL_X(src, stride, i, j);
-    s_y = SOBEL_Y(src, stride, i, j);
-  } else {
-    s_x = SOBEL_X(input, stride, i, j);
-    s_y = SOBEL_Y(input, stride, i, j);
-  }
-  sobel_xy r = { .x = s_x, .y = s_y };
-  return r;
-}
-
-// 8-tap Gaussian convolution filter with sigma = 1.3, sums to 128,
-// all co-efficients must be even.
-DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 2,  12, 30, 40,
-                                                               30, 12, 2,  0 };
-
-void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                       uint8_t *dst, bool high_bd, int bd) {
-  ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  InterpFilterParams filter = { .filter_ptr = gauss_filter,
-                                .taps = 8,
-                                .subpel_shifts = 0,
-                                .interp_filter = EIGHTTAP_REGULAR };
-  // Requirements from the vector-optimized implementations.
-  assert(h % 4 == 0);
-  assert(w % 8 == 0);
-  // Because we use an eight tap filter, the stride should be at least 7 + w.
-  assert(src_stride >= w + 7);
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (high_bd) {
-    av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride,
-                              CONVERT_TO_SHORTPTR(dst), w, w, h, &filter,
-                              &filter, 0, 0, &conv_params, bd);
-  } else {
-    av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0,
-                       &conv_params);
-  }
-#else
-  (void)high_bd;
-  av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0,
-                     &conv_params);
-#endif
-}
-
-static EdgeInfo edge_probability(const uint8_t *input, int w, int h,
-                                 bool high_bd, int bd) {
-  // The probability of an edge in the whole image is the same as the highest
-  // probability of an edge for any individual pixel. Use Sobel as the metric
-  // for finding an edge.
-  uint16_t highest = 0;
-  uint16_t highest_x = 0;
-  uint16_t highest_y = 0;
-  // Ignore the 1 pixel border around the image for the computation.
-  for (int j = 1; j < h - 1; ++j) {
-    for (int i = 1; i < w - 1; ++i) {
-      sobel_xy g = av1_sobel(input, w, i, j, high_bd);
-      // Scale down to 8-bit to get same output regardless of bit depth.
-      int16_t g_x = g.x >> (bd - 8);
-      int16_t g_y = g.y >> (bd - 8);
-      uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y);
-      highest = AOMMAX(highest, magnitude);
-      highest_x = AOMMAX(highest_x, g_x);
-      highest_y = AOMMAX(highest_y, g_y);
-    }
-  }
-  EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y };
-  return ei;
-}
-
-/* Uses most of the Canny edge detection algorithm to find if there are any
- * edges in the image.
- */
-EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
-                         bool high_bd, int bd) {
-  if (w < 3 || h < 3) {
-    EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 };
-    return n;
-  }
-  uint8_t *blurred;
-  if (high_bd) {
-    blurred = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * w * h));
-  } else {
-    blurred = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * w * h);
-  }
-  av1_gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd);
-  // Skip the non-maximum suppression step in Canny edge detection. We just
-  // want a probability of an edge existing in the buffer, which is determined
-  // by the strongest edge in it -- we don't need to eliminate the weaker
-  // edges. Use Sobel for the edge detection.
-  EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd);
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(blurred));
-  } else {
-    aom_free(blurred);
-  }
-  return prob;
-}
diff --git a/media/libaom/src/av1/encoder/rdopt.h b/media/libaom/src/av1/encoder/rdopt.h
index c7c99ac4bd..2fead8fc7e 100644
--- a/media/libaom/src/av1/encoder/rdopt.h
+++ b/media/libaom/src/av1/encoder/rdopt.h
@@ -35,99 +35,147 @@ struct TileInfo;
 struct macroblock;
 struct RD_STATS;
 
-// Returns the number of colors in 'src'.
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                     int *val_count);
-// Same as av1_count_colors(), but for high-bitdepth mode.
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth, int *val_count);
-
-static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
-                                    int plane, TX_SIZE tx_size) {
-  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const LV_MAP_COEFF_COST *const coeff_costs =
-      &x->coeff_costs[txs_ctx][plane_type];
-  return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-}
-
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock.
+ * \param[in]    rd_cost        Struct to keep track of the RD information.
+ * \param[in]    bsize          Current block size.
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process.
+ * \param[in]    best_rd Best   RD seen for this block so far.
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
 void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
                                struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs);
-unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
-                                                const struct buf_2d *ref,
-                                                BLOCK_SIZE bs, int bd);
-
-void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
-                               struct TileDataEnc *tile_data,
-                               struct macroblock *x, struct RD_STATS *rd_cost,
-                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far);
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ * \param[in]    best_rd_so_far Best RD seen for this block so far
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far);
 
-void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
-                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This finction will loop over subset of intra modes and select the best one
+ * based on calculated modelled RD cost. Only 4 intra modes are checked as
+ * specified in \c intra_mode_list. When calculating RD cost Hadamard transform
+ * of residual is used to calculate rate. Estmation of RD cost is performed
+ * in \c estimate_block_intra which is called from this function
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This finction will loop over subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ *  need to be checked.
+ *  *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \return Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
 void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
                                   struct TileDataEnc *tile_data,
                                   struct macroblock *x,
                                   struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far);
+                                  PICK_MODE_CONTEXT *ctx);
 
 void av1_rd_pick_inter_mode_sb_seg_skip(
     const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
     BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
-// The best edge strength seen in the block, as well as the best x and y
-// components of edge strength seen.
-typedef struct {
-  uint16_t magnitude;
-  uint16_t x;
-  uint16_t y;
-} EdgeInfo;
-
-/** Returns an integer indicating the strength of the edge.
- * 0 means no edge found, 556 is the strength of a solid black/white edge,
- * and the number may range higher if the signal is even stronger (e.g., on a
- * corner). high_bd is a bool indicating the source should be treated
- * as a 16-bit array. bd is the bit depth.
- */
-EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
-                         bool high_bd, int bd);
-
-/** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
- * tests.
- */
-void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                       uint8_t *dst, bool high_bd, int bd);
-
-/* Applies standard 3x3 Sobel matrix. */
-typedef struct {
-  int16_t x;
-  int16_t y;
-} sobel_xy;
-
-sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
-                   bool high_bd);
-
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
 
-#if !CONFIG_REALTIME_ONLY
+void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                   int mi_col, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type,
+                   int is_inter_mode);
+
 static INLINE int coded_to_superres_mi(int mi_col, int denom) {
   return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
 }
-#endif
-
-static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
-                                                int b) {
-  if (!oh->enable_order_hint) return 0;
 
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
   assert(a >= 0 && b >= 0);
   return (a - b);
 }
@@ -136,31 +184,18 @@ static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
 static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
   const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
   int sb_mi_rows =
-      (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) /
+      (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) /
       mi_alloc_size_1d;
-  assert(mi_size_wide[cm->seq_params.sb_size] ==
-         mi_size_high[cm->seq_params.sb_size]);
+  assert(mi_size_wide[cm->seq_params->sb_size] ==
+         mi_size_high[cm->seq_params->sb_size]);
   int sb_mi_size = sb_mi_rows * sb_mi_rows;
 
   return sb_mi_size;
 }
 
-// This function will copy usable ref_mv_stack[ref_frame][4] and
-// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and
-// weight[ref_frame][8].
-static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
-    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
-    MV_REFERENCE_FRAME ref_frame) {
-  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
-         USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
-  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
-         USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
-}
-
 // This function prunes the mode if either of the reference frame falls in the
 // pruning list
 static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
-                            const OrderHintInfo *const order_hint_info,
                             const unsigned int *const ref_display_order_hint,
                             const unsigned int frame_display_order_hint,
                             const int *ref_frame_list) {
@@ -170,7 +205,6 @@ static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
     if (ref_frame[0] == ref_frame_list[i] ||
         ref_frame[1] == ref_frame_list[i]) {
       if (av1_encoder_get_relative_dist(
-              order_hint_info,
               ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
               frame_display_order_hint) < 0)
         return 1;
@@ -179,6 +213,31 @@ static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
   return 0;
 }
 
+static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame,
+                                         int8_t closest_past_ref,
+                                         int8_t closest_future_ref) {
+  int has_closest_past_ref =
+      (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref);
+  int has_closest_future_ref = (ref_frame[0] == closest_future_ref) ||
+                               (ref_frame[1] == closest_future_ref);
+  return (has_closest_past_ref && has_closest_future_ref);
+}
+
+static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame,
+                                       const MACROBLOCK *const x) {
+  int has_best_past_pred_mv_sad = 0;
+  int has_best_future_pred_mv_sad = 0;
+  if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) {
+    has_best_past_pred_mv_sad =
+        (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[0]) ||
+        (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]);
+    has_best_future_pred_mv_sad =
+        (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) ||
+        (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]);
+  }
+  return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad);
+}
+
 static INLINE int prune_ref_by_selective_ref_frame(
     const AV1_COMP *const cpi, const MACROBLOCK *const x,
     const MV_REFERENCE_FRAME *const ref_frame,
@@ -186,8 +245,6 @@ static INLINE int prune_ref_by_selective_ref_frame(
   const SPEED_FEATURES *const sf = &cpi->sf;
   if (!sf->inter_sf.selective_ref_frame) return 0;
 
-  const AV1_COMMON *const cm = &cpi->common;
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
   const int comp_pred = ref_frame[1] > INTRA_FRAME;
 
   if (sf->inter_sf.selective_ref_frame >= 2 ||
@@ -195,11 +252,19 @@ static INLINE int prune_ref_by_selective_ref_frame(
     int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
 
     if (x != NULL) {
-      if (x->search_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
-      if (x->search_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
+      // Disable pruning if either tpl suggests that we keep the frame or
+      // the pred_mv gives us the best sad
+      if (x->tpl_keep_ref_frame[LAST3_FRAME] ||
+          x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) {
+        ref_frame_list[0] = NONE_FRAME;
+      }
+      if (x->tpl_keep_ref_frame[LAST2_FRAME] ||
+          x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) {
+        ref_frame_list[1] = NONE_FRAME;
+      }
     }
 
-    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+    if (prune_ref(ref_frame, ref_display_order_hint,
                   ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
                   ref_frame_list))
       return 1;
@@ -209,16 +274,39 @@ static INLINE int prune_ref_by_selective_ref_frame(
     int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
 
     if (x != NULL) {
-      if (x->search_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] = NONE_FRAME;
-      if (x->search_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME;
+      // Disable pruning if either tpl suggests that we keep the frame or
+      // the pred_mv gives us the best sad
+      if (x->tpl_keep_ref_frame[ALTREF2_FRAME] ||
+          x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) {
+        ref_frame_list[0] = NONE_FRAME;
+      }
+      if (x->tpl_keep_ref_frame[BWDREF_FRAME] ||
+          x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) {
+        ref_frame_list[1] = NONE_FRAME;
+      }
     }
 
-    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+    if (prune_ref(ref_frame, ref_display_order_hint,
                   ref_display_order_hint[LAST_FRAME - LAST_FRAME],
                   ref_frame_list))
       return 1;
   }
 
+  if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) {
+    int closest_ref_frames = has_closest_ref_frames(
+        ref_frame, cpi->ref_frame_dist_info.nearest_past_ref,
+        cpi->ref_frame_dist_info.nearest_future_ref);
+    if (closest_ref_frames == 0) {
+      // Prune reference frames which are not the closest to the current frame.
+      if (sf->inter_sf.prune_comp_ref_frames >= 2) {
+        return 1;
+      } else if (sf->inter_sf.prune_comp_ref_frames == 1) {
+        // Prune reference frames with non minimum pred_mv_sad.
+        if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1;
+      }
+    }
+  }
+
   return 0;
 }
 
diff --git a/media/libaom/src/av1/encoder/rdopt_utils.h b/media/libaom/src/av1/encoder/rdopt_utils.h
index 53b410a224..ece3da7d05 100644
--- a/media/libaom/src/av1/encoder/rdopt_utils.h
+++ b/media/libaom/src/av1/encoder/rdopt_utils.h
@@ -86,132 +86,132 @@ static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
   { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
 
+  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
   { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
 
   { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
-
   { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
 
   { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
 
   // intra modes
@@ -230,6 +230,14 @@ static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
   { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
 };
 
+// Number of winner modes allowed for different values of the speed feature
+// multi_winner_mode_type.
+static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = {
+  1,  // MULTI_WINNER_MODE_OFF
+  2,  // MULTI_WINNER_MODE_FAST
+  3   // MULTI_WINNER_MODE_DEFAULT
+};
+
 static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
                                        const int num_planes) {
   for (int i = 0; i < num_planes; i++) {
@@ -238,6 +246,15 @@ static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
   }
 }
 
+static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd,
+                                    const BUFFER_SET *dst_bufs[2],
+                                    int num_planes) {
+  const BUFFER_SET *buf0 = dst_bufs[0];
+  dst_bufs[0] = dst_bufs[1];
+  dst_bufs[1] = buf0;
+  restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
+
 /* clang-format on */
 // Calculate rd threshold based on ref best rd and relevant scaling factors
 static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd,
@@ -266,7 +283,8 @@ get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame,
     return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
                                    [ref_frame];
   }
-  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
+  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END &&
+      second_ref_frame != NONE_FRAME) {
     assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
     assert((second_ref_frame > INTRA_FRAME) &&
            (second_ref_frame <= ALTREF_FRAME));
@@ -334,31 +352,34 @@ static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize,
   // Derive aggressiveness factor for gating the transform search
   // Lower value indicates more aggressiveness. Be more conservative (high
   // value) for (i) low quantizers (ii) regions where prediction is poor
-  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int scale[5] = { INT_MAX, 4, 3, 2, 2 };
   const int qslope = 2 * (!is_luma_only);
-  int aggr_factor = 1;
-  if (!is_luma_only) {
-    aggr_factor = AOMMAX(
-        1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  const int level_to_qindex_map[5] = { 0, 0, 0, 80, 100 };
+  int aggr_factor = 4;
+  const int pred_qindex_thresh = level_to_qindex_map[level];
+  if (!is_luma_only && level <= 2) {
+    aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope,
+                                                   QINDEX_BITS));
   }
-  if (best_skip_rd >
-      (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
+  if ((best_skip_rd >
+       (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) &&
+      (x->qindex >= pred_qindex_thresh))
     aggr_factor *= scale[level];
-  // For level setting 1, be more conservative for luma only case even when
-  // prediction is good
+  // For level setting 1, be more conservative for non-luma-only case even when
+  // prediction is good.
   else if ((level <= 1) && !is_luma_only)
-    aggr_factor *= 2;
+    aggr_factor = (aggr_factor >> 2) * 6;
 
   // Be more conservative for luma only cases (called from compound type rd)
   // since best_skip_rd is computed after and skip_rd is computed (with 8-bit
   // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before
   // interpolation filter search
-  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  const int luma_mul[5] = { INT_MAX, 32, 29, 17, 17 };
   int mul_factor = is_luma_only ? luma_mul[level] : 16;
   int64_t rd_thresh =
       (best_skip_rd == INT64_MAX)
           ? best_skip_rd
-          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 4);
+          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6);
   if (skip_rd > rd_thresh) eval_txfm = 0;
   return eval_txfm;
 }
@@ -374,21 +395,70 @@ static TX_MODE select_tx_mode(
     return TX_MODE_SELECT;
   }
 }
+
+// Checks the conditions to disable winner mode processing
+static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x,
+                                                const SPEED_FEATURES *sf,
+                                                int use_txfm_skip,
+                                                int actual_txfm_skip,
+                                                PREDICTION_MODE best_mode) {
+  const int prune_winner_mode_eval_level =
+      sf->winner_mode_sf.prune_winner_mode_eval_level;
+
+  // Disable winner mode processing for blocks with low source variance.
+  // The aggressiveness of this pruning logic reduces as qindex increases.
+  // The threshold decreases linearly from 64 as qindex varies from 0 to 255.
+  if (prune_winner_mode_eval_level == 1) {
+    const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1);
+    if (x->source_variance < src_var_thresh) return 1;
+  } else if (prune_winner_mode_eval_level == 2) {
+    // Skip winner mode processing of blocks for which transform turns out to be
+    // skip due to nature of eob alone except NEWMV mode.
+    if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1;
+  } else if (prune_winner_mode_eval_level == 3) {
+    // Skip winner mode processing of blocks for which transform turns out to be
+    // skip except NEWMV mode and considered based on the quantizer.
+    // At high quantizers: Take conservative approach by considering transform
+    // skip based on eob alone.
+    // At low quantizers: Consider transform skip based on eob nature or RD cost
+    // evaluation.
+    const int is_txfm_skip =
+        x->qindex > 127 ? actual_txfm_skip : actual_txfm_skip || use_txfm_skip;
+
+    if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1;
+  } else if (prune_winner_mode_eval_level >= 4) {
+    // Do not skip winner mode evaluation at low quantizers if normal mode's
+    // transform search was too aggressive.
+    if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0;
+
+    if (use_txfm_skip || actual_txfm_skip) return 1;
+  }
+
+  return 0;
+}
+
 // Checks the conditions to enable winner mode processing
-static INLINE int is_winner_mode_processing_enabled(
-    const struct AV1_COMP *cpi, MB_MODE_INFO *const mbmi,
-    const PREDICTION_MODE best_mode) {
+static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi,
+                                                    const MACROBLOCK *const x,
+                                                    MB_MODE_INFO *const mbmi,
+                                                    int actual_txfm_skip) {
   const SPEED_FEATURES *sf = &cpi->sf;
+  const PREDICTION_MODE best_mode = mbmi->mode;
+
+  if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip,
+                                    best_mode))
+    return 0;
 
   // TODO(any): Move block independent condition checks to frame level
   if (is_inter_block(mbmi)) {
     if (is_inter_mode(best_mode) &&
-        sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
-        !cpi->oxcf.use_inter_dct_only)
+        (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) &&
+        !cpi->oxcf.txfm_cfg.use_inter_dct_only)
       return 1;
   } else {
     if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
-        !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only)
+        !cpi->oxcf.txfm_cfg.use_intra_default_tx_only &&
+        !cpi->oxcf.txfm_cfg.use_intra_dct_only)
       return 1;
   }
 
@@ -404,55 +474,67 @@ static INLINE int is_winner_mode_processing_enabled(
 
 static INLINE void set_tx_size_search_method(
     const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
-    MACROBLOCK *x, int enable_winner_mode_for_tx_size_srch,
+    TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch,
     int is_winner_mode) {
   // Populate transform size search method/transform mode appropriately
-  x->tx_size_search_method =
+  txfm_params->tx_size_search_method =
       winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
   if (enable_winner_mode_for_tx_size_srch) {
     if (is_winner_mode)
-      x->tx_size_search_method =
+      txfm_params->tx_size_search_method =
           winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
     else
-      x->tx_size_search_method =
+      txfm_params->tx_size_search_method =
           winner_mode_params->tx_size_search_methods[MODE_EVAL];
   }
-  x->tx_mode_search_type = select_tx_mode(cm, x->tx_size_search_method);
+  txfm_params->tx_mode_search_type =
+      select_tx_mode(cm, txfm_params->tx_size_search_method);
 }
 
-static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, MACROBLOCK *x,
-                                     int enable_winner_mode_tx_type_pruning,
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf,
+                                     TxfmSearchParams *txfm_params,
+                                     int winner_mode_tx_type_pruning,
                                      int is_winner_mode) {
   // Populate prune transform mode appropriately
-  x->prune_mode = sf->tx_sf.tx_type_search.prune_mode;
-  if (enable_winner_mode_tx_type_pruning) {
-    if (is_winner_mode)
-      x->prune_mode = NO_PRUNE;
-    else
-      x->prune_mode = PRUNE_2D_AGGRESSIVE;
-  }
+  txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
+  if (!winner_mode_tx_type_pruning) return;
+
+  const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 },
+                                 { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 },
+                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } };
+  txfm_params->prune_2d_txfm_mode =
+      prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
 }
 
 static INLINE void set_tx_domain_dist_params(
-    const WinnerModeParams *winner_mode_params, MACROBLOCK *x,
+    const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
     int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+  if (txfm_params->use_qm_dist_metric) {
+    // QM-weighted PSNR is computed in transform space, so we need to forcibly
+    // enable the use of tx domain distortion.
+    txfm_params->use_transform_domain_distortion = 1;
+    txfm_params->tx_domain_dist_threshold = 0;
+    return;
+  }
+
   if (!enable_winner_mode_for_tx_domain_dist) {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL];
     return;
   }
 
   if (is_winner_mode) {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL];
   } else {
-    x->use_transform_domain_distortion =
+    txfm_params->use_transform_domain_distortion =
         winner_mode_params->use_transform_domain_distortion[MODE_EVAL];
-    x->tx_domain_dist_threshold =
+    txfm_params->tx_domain_dist_threshold =
         winner_mode_params->tx_domain_dist_threshold[MODE_EVAL];
   }
 }
@@ -464,86 +546,99 @@ static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi,
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *sf = &cpi->sf;
   const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+
+  txfm_params->use_qm_dist_metric =
+      cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR;
 
   switch (mode_eval_type) {
     case DEFAULT_EVAL:
-      x->use_default_inter_tx_type = 0;
-      x->use_default_intra_tx_type = 0;
-      x->predict_skip_level =
-          winner_mode_params->predict_skip_level[DEFAULT_EVAL];
+      txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+      txfm_params->use_default_intra_tx_type = 0;
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[DEFAULT_EVAL];
       // Set default transform domain distortion type
-      set_tx_domain_dist_params(winner_mode_params, x, 0, 0);
+      set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
 
       // Get default threshold for R-D optimization of coefficients
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
-          winner_mode_params->coeff_opt_dist_threshold, 0, 0);
+      get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds,
+                              txfm_params, 0, 0);
+
       // Set default transform size search method
-      set_tx_size_search_method(cm, winner_mode_params, x, 0, 0);
+      set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
       // Set default transform type prune
-      set_tx_type_prune(sf, x, 0, 0);
+      set_tx_type_prune(sf, txfm_params, 0, 0);
       break;
     case MODE_EVAL:
-      x->use_default_intra_tx_type =
+      txfm_params->use_default_intra_tx_type =
           (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
-           cpi->oxcf.use_intra_default_tx_only);
-      x->use_default_inter_tx_type =
-          cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
-      x->predict_skip_level = winner_mode_params->predict_skip_level[MODE_EVAL];
-
+           cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
+      txfm_params->default_inter_tx_type_prob_thresh =
+          cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh;
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[MODE_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[MODE_EVAL];
       // Set transform domain distortion type for mode evaluation
       set_tx_domain_dist_params(
-          winner_mode_params, x,
+          winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
 
       // Get threshold for R-D optimization of coefficients during mode
       // evaluation
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
-          winner_mode_params->coeff_opt_dist_threshold,
+      get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_thresholds, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
       // Set the transform size search method for mode evaluation
       set_tx_size_search_method(
-          cm, winner_mode_params, x,
+          cm, winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
       // Set transform type prune for mode evaluation
-      set_tx_type_prune(
-          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
-          0);
+      set_tx_type_prune(sf, txfm_params,
+                        sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+                        0);
       break;
     case WINNER_MODE_EVAL:
-      x->use_default_inter_tx_type = 0;
-      x->use_default_intra_tx_type = 0;
-      x->predict_skip_level =
-          winner_mode_params->predict_skip_level[WINNER_MODE_EVAL];
+      txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+      txfm_params->use_default_intra_tx_type = 0;
+      txfm_params->skip_txfm_level =
+          winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
 
       // Set transform domain distortion type for winner mode evaluation
       set_tx_domain_dist_params(
-          winner_mode_params, x,
+          winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
 
       // Get threshold for R-D optimization of coefficients for winner mode
       // evaluation
-      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
-          winner_mode_params->coeff_opt_dist_threshold,
+      get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_thresholds, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
       // Set the transform size search method for winner mode evaluation
       set_tx_size_search_method(
-          cm, winner_mode_params, x,
+          cm, winner_mode_params, txfm_params,
           sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
       // Set default transform type prune mode for winner mode evaluation
-      set_tx_type_prune(
-          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
-          1);
-
-      // Reset hash state for winner mode processing. Winner mode and subsequent
-      // transform/mode evaluations (palette/IntraBC) cann't reuse old data as
-      // the decisions would have been sub-optimal
-      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
-      // mode is processed and clean-up the code below
-      reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
-
+      set_tx_type_prune(sf, txfm_params,
+                        sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+                        1);
       break;
     default: assert(0);
   }
+
+  // Rd record collected at a specific mode evaluation stage can not be used
+  // across other evaluation stages as the transform parameters are different.
+  // Hence, reset mb rd record whenever mode evaluation stage type changes.
+  if (txfm_params->mode_eval_type != mode_eval_type)
+    reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+
+  txfm_params->mode_eval_type = mode_eval_type;
 }
 
 // Similar to store_cfl_required(), but for use during the RDO process,
@@ -552,7 +647,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
                                                       const MACROBLOCK *x) {
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  if (cm->seq_params.monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED;
+  if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED;
 
   if (!xd->is_chroma_ref) {
     // For non-chroma-reference blocks, we should always store the luma pixels,
@@ -575,24 +670,22 @@ static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
 
 // Store best mode stats for winner mode processing
 static INLINE void store_winner_mode_stats(
-    const AV1_COMMON *const cm, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi,
     RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
     THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
-    int enable_multiwinner_mode_process, int txfm_search_done) {
+    int multi_winner_mode_type, int txfm_search_done) {
   WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
   int mode_idx = 0;
   int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
   // Mode stat is not required when multiwinner mode processing is disabled
-  if (!enable_multiwinner_mode_process) return;
+  if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return;
   // Ignore mode with maximum rd
   if (this_rd == INT64_MAX) return;
   // TODO(any): Winner mode processing is currently not applicable for palette
   // mode in Inter frames. Clean-up the following code, once support is added
   if (!frame_is_intra_only(cm) && is_palette_mode) return;
 
-  const int max_winner_mode_count = frame_is_intra_only(cm)
-                                        ? MAX_WINNER_MODE_COUNT_INTRA
-                                        : MAX_WINNER_MODE_COUNT_INTER;
+  int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type];
   assert(x->winner_mode_count >= 0 &&
          x->winner_mode_count <= max_winner_mode_count);
 
@@ -619,14 +712,16 @@ static INLINE void store_winner_mode_stats(
   // Update rd stats required for inter frame
   if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
     const MACROBLOCKD *xd = &x->e_mbd;
-    const int skip_ctx = av1_get_skip_context(xd);
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
     const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
-    const int skip = mbmi->skip && !is_intra_mode;
+    const int skip_txfm = mbmi->skip_txfm && !is_intra_mode;
 
     winner_mode_stats[mode_idx].rd_cost = *rd_cost;
     if (txfm_search_done) {
       winner_mode_stats[mode_idx].rate_y =
-          rd_cost_y->rate + x->skip_cost[skip_ctx][rd_cost->skip || skip];
+          rd_cost_y->rate +
+          x->mode_costs
+              .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm];
       winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
     }
   }
@@ -645,6 +740,30 @@ static INLINE void store_winner_mode_stats(
       AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
 }
 
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+  return mode < INTRA_MODE_END;
+}
+
+// This function will copy usable ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8].
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+    MV_REFERENCE_FRAME ref_frame) {
+  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/reconinter_enc.c b/media/libaom/src/av1/encoder/reconinter_enc.c
index 231b02091e..7eadbda945 100644
--- a/media/libaom/src/av1/encoder/reconinter_enc.c
+++ b/media/libaom/src/av1/encoder/reconinter_enc.c
@@ -31,7 +31,8 @@
 static void enc_calc_subpel_params(const MV *const src_mv,
                                    InterPredParams *const inter_pred_params,
                                    MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
-                                   uint8_t **pre, SubpelParams *subpel_params,
+                                   uint8_t **mc_buf, uint8_t **pre,
+                                   SubpelParams *subpel_params,
                                    int *src_stride) {
   // These are part of the function signature to use this function through a
   // function pointer. See typedef of 'CalcSubpelParamsFunc'.
@@ -39,6 +40,7 @@ static void enc_calc_subpel_params(const MV *const src_mv,
   (void)mi_x;
   (void)mi_y;
   (void)ref;
+  (void)mc_buf;
 
   const struct scale_factors *sf = inter_pred_params->scale_factors;
 
@@ -73,16 +75,18 @@ static void enc_calc_subpel_params(const MV *const src_mv,
 void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
                                        const MV *src_mv,
                                        InterPredParams *inter_pred_params) {
-  av1_build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params,
-                                NULL /* xd */, 0 /* mi_x */, 0 /* mi_y */,
-                                0 /* ref */, enc_calc_subpel_params);
+  av1_build_one_inter_predictor(
+      dst, dst_stride, src_mv, inter_pred_params, NULL /* xd */, 0 /* mi_x */,
+      0 /* mi_y */, inter_pred_params->conv_params.do_average /* ref */,
+      NULL /* mc_buf */, enc_calc_subpel_params);
 }
 
 static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                        int plane, const MB_MODE_INFO *mi,
                                        int bw, int bh, int mi_x, int mi_y) {
   av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
-                             mi_x, mi_y, enc_calc_subpel_params);
+                             mi_x, mi_y, NULL /* mc_buf */,
+                             enc_calc_subpel_params);
 }
 
 void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -136,18 +140,49 @@ void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
+static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+                                   int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+                                   struct build_prediction_ctxt *ctxt,
+                                   const int num_planes) {
+  const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize);
+  const int ref_mi_row = xd->mi_row + mi_row_offset;
+  const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+                     ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+                     ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+
+  const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+  const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(ctxt->cm, frame);
+
+  xd->block_ref_scale_factors[0] = sf;
+  if ((!av1_is_valid_scale(sf)))
+    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Reference frame has invalid dimensions");
+
+  av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+                       num_planes);
+}
+
 static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
                                          int rel_mi_col, uint8_t op_mi_size,
                                          int dir, MB_MODE_INFO *above_mbmi,
                                          void *fun_ctxt, const int num_planes) {
   struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
-  av1_setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
-                             num_planes);
+  setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+                         num_planes);
 
   const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
   const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 
   InterPredParams inter_pred_params;
 
@@ -190,10 +225,10 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->up_available) return;
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_above(cm, xd,
                                 max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 build_obmc_prediction, &ctxt);
@@ -205,10 +240,10 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]) {
   if (!xd->left_available) return;
-  struct build_prediction_ctxt ctxt = { cm,         tmp_buf,
-                                        tmp_width,  tmp_height,
-                                        tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  struct build_prediction_ctxt ctxt = {
+    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->bsize;
   foreach_overlappable_nb_left(cm, xd,
                                max_neighbor_obmc[mi_size_high_log2[bsize]],
                                build_obmc_prediction, &ctxt);
@@ -224,26 +259,7 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  if (is_cur_buf_hbd(xd)) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-    dst_buf1[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-    dst_buf1[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-    dst_buf2[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-    dst_buf2[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
-  } else {
-    dst_buf1[0] = xd->tmp_obmc_bufs[0];
-    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = xd->tmp_obmc_bufs[1];
-    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-  }
+  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
 
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -251,15 +267,15 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
                                       dst_stride1);
   av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
                                      dst_stride2);
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf,
-                       mi_row, mi_col, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+                       mi_col, 0, num_planes);
   av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
                                   dst_stride2);
 }
 
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
-    uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+    uint8_t *ext_dst[], int ext_dst_stride[]) {
   assert(bsize < BLOCK_SIZES_ALL);
   const MB_MODE_INFO *mi = xd->mi[0];
   const int mi_row = xd->mi_row;
@@ -343,6 +359,7 @@ static void build_wedge_inter_predictor_from_buf(
 
   if (is_compound && is_masked_compound_type(comp_data->type)) {
     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+#if CONFIG_AV1_HIGHBITDEPTH
       if (is_hbd) {
         av1_build_compound_diffwtd_mask_highbd(
             comp_data->seg_mask, comp_data->mask_type,
@@ -353,46 +370,49 @@ static void build_wedge_inter_predictor_from_buf(
             comp_data->seg_mask, comp_data->mask_type, ext_dst0,
             ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
       }
+#else
+      (void)is_hbd;
+      av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type,
+                                      ext_dst0, ext_dst_stride0, ext_dst1,
+                                      ext_dst_stride1, h, w);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
     }
 #if CONFIG_AV1_HIGHBITDEPTH
     if (is_hbd) {
       build_masked_compound_highbd(
           dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
-          mbmi->sb_type, h, w, xd->bd);
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize,
+          h, w, xd->bd);
     } else {
       build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                            ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize,
                             h, w);
     }
 #else
     build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                          ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
-                          h, w);
+                          ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h,
+                          w);
 #endif
   } else {
 #if CONFIG_AV1_HIGHBITDEPTH
     if (is_hbd) {
-      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
-                               xd->bd);
+      aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+                               CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
     } else {
-      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
-                        0, NULL, 0, w, h);
+      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
     }
 #else
-    aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0,
-                      NULL, 0, w, h);
+    aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
 #endif
   }
 }
 
 void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]) {
+                                              uint8_t *ext_dst0[],
+                                              int ext_dst_stride0[],
+                                              uint8_t *ext_dst1[],
+                                              int ext_dst_stride1[]) {
   int plane;
   assert(bsize < BLOCK_SIZES_ALL);
   for (plane = plane_from; plane <= plane_to; ++plane) {
@@ -405,3 +425,283 @@ void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
         ext_dst1[plane], ext_dst_stride1[plane]);
   }
 }
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                          int mi_row, int mi_col, const MV *const mv,
+                          uint8_t *comp_pred, int width, int height,
+                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                          int ref_stride, int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    for (int i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+                          -1, width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+                         16, width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                          width, intermediate_height);
+    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                         width, height);
+  }
+}
+
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   int mi_row, int mi_col, const MV *const mv,
+                                   uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, int subpel_x_q3,
+                                   int subpel_y_q3, const uint8_t *ref,
+                                   int ref_stride, int subpel_search) {
+  int i, j;
+
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, int subpel_x_q3,
+                                    int subpel_y_q3, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask,
+                                    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+                       mask_stride, invert_mask);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+                                 const struct AV1Common *const cm, int mi_row,
+                                 int mi_col, const MV *const mv,
+                                 uint8_t *comp_pred8, int width, int height,
+                                 int subpel_x_q3, int subpel_y_q3,
+                                 const uint8_t *ref8, int ref_stride, int bd,
+                                 int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    for (int i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+                                 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                                kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
+                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                                 intermediate_height, bd);
+    aom_highbd_convolve8_vert_c(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+        bd);
+  }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  int i, j;
+
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                              height, subpel_x_q3, subpel_y_q3, ref8,
+                              ref_stride, bd, subpel_search);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+                            mask, mask_stride, invert_mask);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/av1/encoder/reconinter_enc.h b/media/libaom/src/av1/encoder/reconinter_enc.h
index fdc1f31c84..5d32545f50 100644
--- a/media/libaom/src/av1/encoder/reconinter_enc.h
+++ b/media/libaom/src/av1/encoder/reconinter_enc.h
@@ -24,6 +24,13 @@
 extern "C" {
 #endif
 
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd, int subpel_search);
+
 // Build single or compound reference inter predictors for all planes.
 // Can build inter-intra predictors, masked predictors etc as well.
 void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -54,16 +61,18 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
 
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
-    uint8_t *ext_dst[3], int ext_dst_stride[3]);
+    uint8_t *ext_dst[], int ext_dst_stride[]);
 
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
 void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int plane_from, int plane_to,
-                                              uint8_t *ext_dst0[3],
-                                              int ext_dst_stride0[3],
-                                              uint8_t *ext_dst1[3],
-                                              int ext_dst_stride1[3]);
+                                              uint8_t *ext_dst0[],
+                                              int ext_dst_stride0[],
+                                              uint8_t *ext_dst1[],
+                                              int ext_dst_stride1[]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/av1/encoder/segmentation.c b/media/libaom/src/av1/encoder/segmentation.c
index 0c029c0e6e..d315838855 100644
--- a/media/libaom/src/av1/encoder/segmentation.c
+++ b/media/libaom/src/av1/encoder/segmentation.c
@@ -43,203 +43,6 @@ void av1_clear_segdata(struct segmentation *seg, int segment_id,
   seg->feature_data[segment_id][feature_id] = 0;
 }
 
-static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                       const TileInfo *tile, MB_MODE_INFO **mi,
-                       unsigned *no_pred_segcounts,
-                       unsigned (*temporal_predictor_count)[2],
-                       unsigned *t_unpred_seg_counts, int bw, int bh,
-                       int mi_row, int mi_col) {
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-  xd->mi = mi;
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
-                 mi_params->mi_cols);
-
-  // Count the number of hits on each segment with no prediction
-  const int segment_id = xd->mi[0]->segment_id;
-  no_pred_segcounts[segment_id]++;
-
-  // Temporal prediction not allowed on key frames
-  if (cm->current_frame.frame_type != KEY_FRAME) {
-    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-    // Test to see if the segment id matches the predicted value.
-    const int pred_segment_id =
-        cm->last_frame_seg_map
-            ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
-                             mi_col)
-            : 0;
-    const int pred_flag = pred_segment_id == segment_id;
-    const int pred_context = av1_get_pred_context_seg_id(xd);
-
-    // Store the prediction status for this mb and update counts
-    // as appropriate
-    xd->mi[0]->seg_id_predicted = pred_flag;
-    temporal_predictor_count[pred_context][pred_flag]++;
-
-    // Update the "unpredicted" segment count
-    if (!pred_flag) t_unpred_seg_counts[segment_id]++;
-  }
-}
-
-static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                          const TileInfo *tile, MB_MODE_INFO **mi,
-                          unsigned *no_pred_segcounts,
-                          unsigned (*temporal_predictor_count)[2],
-                          unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
-                          BLOCK_SIZE bsize) {
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int mis = mi_params->mi_stride;
-  const int bs = mi_size_wide[bsize], hbs = bs / 2;
-  PARTITION_TYPE partition;
-  const int qbs = bs / 4;
-
-  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
-
-#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff)                              \
-  count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff),               \
-             no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
-             (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
-
-  if (bsize == BLOCK_8X8)
-    partition = PARTITION_NONE;
-  else
-    partition = get_partition(cm, mi_row, mi_col, bsize);
-  switch (partition) {
-    case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
-    case PARTITION_HORZ:
-      CSEGS(bs, hbs, 0, 0);
-      CSEGS(bs, hbs, hbs, 0);
-      break;
-    case PARTITION_VERT:
-      CSEGS(hbs, bs, 0, 0);
-      CSEGS(hbs, bs, 0, hbs);
-      break;
-    case PARTITION_HORZ_A:
-      CSEGS(hbs, hbs, 0, 0);
-      CSEGS(hbs, hbs, 0, hbs);
-      CSEGS(bs, hbs, hbs, 0);
-      break;
-    case PARTITION_HORZ_B:
-      CSEGS(bs, hbs, 0, 0);
-      CSEGS(hbs, hbs, hbs, 0);
-      CSEGS(hbs, hbs, hbs, hbs);
-      break;
-    case PARTITION_VERT_A:
-      CSEGS(hbs, hbs, 0, 0);
-      CSEGS(hbs, hbs, hbs, 0);
-      CSEGS(hbs, bs, 0, hbs);
-      break;
-    case PARTITION_VERT_B:
-      CSEGS(hbs, bs, 0, 0);
-      CSEGS(hbs, hbs, 0, hbs);
-      CSEGS(hbs, hbs, hbs, hbs);
-      break;
-    case PARTITION_HORZ_4:
-      CSEGS(bs, qbs, 0, 0);
-      CSEGS(bs, qbs, qbs, 0);
-      CSEGS(bs, qbs, 2 * qbs, 0);
-      if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
-      break;
-
-    case PARTITION_VERT_4:
-      CSEGS(qbs, bs, 0, 0);
-      CSEGS(qbs, bs, 0, qbs);
-      CSEGS(qbs, bs, 0, 2 * qbs);
-      if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
-      break;
-
-    case PARTITION_SPLIT: {
-      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-      int n;
-      assert(subsize < BLOCK_SIZES_ALL);
-
-      for (n = 0; n < 4; n++) {
-        const int mi_dc = hbs * (n & 1);
-        const int mi_dr = hbs * (n >> 1);
-
-        count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
-                      temporal_predictor_count, t_unpred_seg_counts,
-                      mi_row + mi_dr, mi_col + mi_dc, subsize);
-      }
-    } break;
-    default: assert(0);
-  }
-
-#undef CSEGS
-}
-
-void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
-  struct segmentation *seg = &cm->seg;
-  struct segmentation_probs *segp = &cm->fc->seg;
-  int no_pred_cost;
-  int t_pred_cost = INT_MAX;
-  int tile_col, tile_row, mi_row, mi_col;
-  unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } };
-  unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
-  unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
-  (void)xd;
-  int scale_up = cm->prev_frame && (cm->width > cm->prev_frame->width ||
-                                    cm->height > cm->prev_frame->height);
-  // First of all generate stats regarding how well the last segment map
-  // predicts this one
-  if (!scale_up) {
-    for (tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
-      TileInfo tile_info;
-      av1_tile_set_row(&tile_info, cm, tile_row);
-      for (tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
-        MB_MODE_INFO **mi_ptr;
-        av1_tile_set_col(&tile_info, cm, tile_col);
-        mi_ptr = cm->mi_params.mi_grid_base +
-                 tile_info.mi_row_start * cm->mi_params.mi_stride +
-                 tile_info.mi_col_start;
-        for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-             mi_row += cm->seq_params.mib_size,
-            mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) {
-          MB_MODE_INFO **mi = mi_ptr;
-          for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-               mi_col += cm->seq_params.mib_size,
-              mi += cm->seq_params.mib_size) {
-            count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
-                          temporal_predictor_count, t_unpred_seg_counts, mi_row,
-                          mi_col, cm->seq_params.sb_size);
-          }
-        }
-      }
-    }
-  }
-
-  int seg_id_cost[MAX_SEGMENTS];
-  av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL);
-  no_pred_cost = 0;
-  for (int i = 0; i < MAX_SEGMENTS; ++i)
-    no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i];
-
-  // Frames without past dependency cannot use temporal prediction
-  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
-    int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2];
-    for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i)
-      av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL);
-    t_pred_cost = 0;
-    // Cost for signaling the prediction flag.
-    for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
-      for (int j = 0; j < 2; ++j)
-        t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j];
-    }
-    // Cost for signaling the unpredicted segment id.
-    for (int i = 0; i < MAX_SEGMENTS; ++i)
-      t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i];
-  }
-
-  // Now choose which coding method to use.
-  if (t_pred_cost < no_pred_cost) {
-    assert(!cm->features.error_resilient_mode);
-    seg->temporal_update = 1;
-  } else {
-    seg->temporal_update = 0;
-  }
-}
-
 void av1_reset_segment_features(AV1_COMMON *cm) {
   struct segmentation *seg = &cm->seg;
 
diff --git a/media/libaom/src/av1/encoder/sorting_network.h b/media/libaom/src/av1/encoder/sorting_network.h
new file mode 100644
index 0000000000..54f4c19dcd
--- /dev/null
+++ b/media/libaom/src/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * Sorting network is a (potentially branch-less) way to quickly sort small
+ * arrays with known size. For more details, consult
+ * (https://en.wikipedia.org/wiki/Sorting_network).
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
+#define SWAP(i, j)                                   \
+  do {                                               \
+    const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \
+    const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \
+    const int maxi = (k[i] >= k[j]) ? v[i] : v[j];   \
+    const int mini = (k[i] >= k[j]) ? v[j] : v[i];   \
+    k[i] = maxf;                                     \
+    k[j] = minf;                                     \
+    v[i] = maxi;                                     \
+    v[j] = mini;                                     \
+  } while (0)
+
+/*!\brief Sorts two size-16 arrays of keys and values in descending order of
+ * keys.
+ *
+ * \param[in,out]    k          An length-16 array of float serves as the keys.
+ * \param[in,out]    v          An length-16 array of int32 serves as the
+ *                              value.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(8, 9);
+  SWAP(10, 11);
+  SWAP(12, 13);
+  SWAP(14, 15);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(8, 10);
+  SWAP(9, 11);
+  SWAP(12, 14);
+  SWAP(13, 15);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(9, 10);
+  SWAP(13, 14);
+  SWAP(8, 12);
+  SWAP(11, 15);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(9, 13);
+  SWAP(10, 14);
+  SWAP(0, 8);
+  SWAP(7, 15);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(9, 12);
+  SWAP(11, 14);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(10, 12);
+  SWAP(11, 13);
+  SWAP(1, 9);
+  SWAP(6, 14);
+  SWAP(3, 4);
+  SWAP(11, 12);
+  SWAP(1, 8);
+  SWAP(2, 10);
+  SWAP(5, 13);
+  SWAP(7, 14);
+  SWAP(3, 11);
+  SWAP(2, 8);
+  SWAP(4, 12);
+  SWAP(7, 13);
+  SWAP(3, 10);
+  SWAP(5, 12);
+  SWAP(3, 9);
+  SWAP(6, 12);
+  SWAP(3, 8);
+  SWAP(7, 12);
+  SWAP(5, 9);
+  SWAP(6, 10);
+  SWAP(4, 8);
+  SWAP(7, 11);
+  SWAP(5, 8);
+  SWAP(7, 10);
+  SWAP(6, 8);
+  SWAP(7, 9);
+  SWAP(7, 8);
+}
+
+/*!\brief Sorts two size-8 arrays of keys and values in descending order of
+ * keys.
+ *
+ * \param[in,out]    k          An length-8 array of float serves as the keys.
+ * \param[in,out]    v          An length-8 array of int32 serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(3, 4);
+}
+#undef SWAP
+#endif  // AOM_AV1_ENCODER_SORTING_NETWORK_H_
diff --git a/media/libaom/src/av1/encoder/sparse_linear_solver.c b/media/libaom/src/av1/encoder/sparse_linear_solver.c
new file mode 100644
index 0000000000..e47c78e148
--- /dev/null
+++ b/media/libaom/src/av1/encoder/sparse_linear_solver.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "config/aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/alloccommon.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+/*
+ * Input:
+ * rows: array of row positions
+ * cols: array of column positions
+ * values: array of element values
+ * num_elem: total number of elements in the matrix
+ * num_rows: number of rows in the matrix
+ * num_cols: number of columns in the matrix
+ *
+ * Output:
+ * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm) {
+  sm->n_elem = num_elem;
+  sm->n_rows = num_rows;
+  sm->n_cols = num_cols;
+  if (num_elem == 0) {
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return 0;
+  }
+  sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(num_elem, sizeof(*sm->value));
+
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
+  memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
+  memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
+  memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+  return 0;
+}
+
+/*
+ * Combines two sparse matrices (allocating new space).
+ *
+ * Input:
+ * sm1, sm2: matrices to be combined
+ * row_offset1, row_offset2: row offset of each matrix in the new matrix
+ * col_offset1, col_offset2: column offset of each matrix in the new matrix
+ * new_n_rows, new_n_cols: number of rows and columns in the new matrix
+ *
+ * Output:
+ * sm: the combined matrix
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols) {
+  sm->n_elem = sm1->n_elem + sm2->n_elem;
+  sm->n_cols = new_n_cols;
+  sm->n_rows = new_n_rows;
+
+  if (sm->n_elem == 0) {
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return 0;
+  }
+
+  sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
+
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
+  for (int i = 0; i < sm1->n_elem; i++) {
+    sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
+    sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
+  }
+  memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
+  int n_elem1 = sm1->n_elem;
+  for (int i = 0; i < sm2->n_elem; i++) {
+    sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
+    sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
+  }
+  memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
+  return 0;
+}
+
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
+  sm->n_cols = 0;
+  sm->n_rows = 0;
+  if (sm->n_elem != 0) {
+    aom_free(sm->row_pos);
+    aom_free(sm->col_pos);
+    aom_free(sm->value);
+  }
+  sm->n_elem = 0;
+}
+
+/*
+ * Calculate matrix and vector multiplication: A*b
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b to be multiplied to
+ * dstl: the length of vectors
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+  for (int i = 0; i < sm->n_elem; i++) {
+    dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
+  }
+}
+/*
+ * Calculate matrix and vector multiplication: b*A
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b to be multiplied to
+ * dstl: the length of vectors
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+  for (int i = 0; i < sm->n_elem; i++) {
+    dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
+  }
+}
+
+/*
+ * Calculate inner product of two vectors
+ *
+ * Input:
+ * src1, scr2: the vectors to be multiplied
+ * src1l: length of the vectors
+ *
+ * Output:
+ * the inner product
+ */
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
+  double result = 0;
+  for (int i = 0; i < src1l; i++) {
+    result += src1[i] * src2[i];
+  }
+  return result;
+}
+
+/*
+ * Multiply each element in the matrix sm with a constant c
+ */
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
+  for (int i = 0; i < sm->n_elem; i++) {
+    sm->value[i] *= c;
+  }
+}
+
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+                                         double *buf3, double *buf4,
+                                         double *buf5, double *buf6,
+                                         double *buf7) {
+  aom_free(buf1);
+  aom_free(buf2);
+  aom_free(buf3);
+  aom_free(buf4);
+  aom_free(buf5);
+  aom_free(buf6);
+  aom_free(buf7);
+}
+
+/*
+ * Solve for Ax = b
+ * no requirement on A
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x) {
+  double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+         *p_hatA = NULL, *x_hat = NULL;
+  double alpha, beta, rtr, r_norm_2;
+  double denormtemp;
+
+  // initialize
+  r = aom_calloc(bl, sizeof(*r));
+  r_hat = aom_calloc(bl, sizeof(*r_hat));
+  p = aom_calloc(bl, sizeof(*p));
+  p_hat = aom_calloc(bl, sizeof(*p_hat));
+  Ap = aom_calloc(bl, sizeof(*Ap));
+  p_hatA = aom_calloc(bl, sizeof(*p_hatA));
+  x_hat = aom_calloc(bl, sizeof(*x_hat));
+  if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+    free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+    return -1;
+  }
+
+  int i;
+  for (i = 0; i < bl; i++) {
+    r[i] = b[i];
+    r_hat[i] = b[i];
+    p[i] = r[i];
+    p_hat[i] = r_hat[i];
+    x[i] = 0;
+    x_hat[i] = 0;
+  }
+  r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
+  for (int k = 0; k < MAX_CG_SP_ITER; k++) {
+    rtr = r_norm_2;
+    av1_mtx_vect_multi_right(A, p, Ap, bl);
+    av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);
+
+    denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
+    if (denormtemp < 1e-10) break;
+    alpha = rtr / denormtemp;
+    r_norm_2 = 0;
+    for (i = 0; i < bl; i++) {
+      x[i] += alpha * p[i];
+      x_hat[i] += alpha * p_hat[i];
+      r[i] -= alpha * Ap[i];
+      r_hat[i] -= alpha * p_hatA[i];
+      r_norm_2 += r_hat[i] * r[i];
+    }
+    if (sqrt(r_norm_2) < 1e-2) {
+      break;
+    }
+    if (rtr < 1e-10) break;
+    beta = r_norm_2 / rtr;
+    for (i = 0; i < bl; i++) {
+      p[i] = r[i] + beta * p[i];
+      p_hat[i] = r_hat[i] + beta * p_hat[i];
+    }
+  }
+  // free
+  free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+  return 0;
+}
+
+/*
+ * Solve for Ax = b when A is symmetric and positive definite
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x) {
+  double *r = NULL, *p = NULL, *Ap = NULL;
+  double alpha, beta, rtr, r_norm_2;
+  double denormtemp;
+
+  // initialize
+  r = aom_calloc(bl, sizeof(*r));
+  p = aom_calloc(bl, sizeof(*p));
+  Ap = aom_calloc(bl, sizeof(*Ap));
+  if (!r || !p || !Ap) {
+    free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  for (i = 0; i < bl; i++) {
+    r[i] = b[i];
+    p[i] = r[i];
+    x[i] = 0;
+  }
+  r_norm_2 = av1_vect_vect_multi(r, bl, r);
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    rtr = r_norm_2;
+    av1_mtx_vect_multi_right(A, p, Ap, bl);
+    denormtemp = av1_vect_vect_multi(p, bl, Ap);
+    if (denormtemp < 1e-10) break;
+    alpha = rtr / denormtemp;
+    r_norm_2 = 0;
+    for (i = 0; i < bl; i++) {
+      x[i] += alpha * p[i];
+      r[i] -= alpha * Ap[i];
+      r_norm_2 += r[i] * r[i];
+    }
+    if (r_norm_2 < 1e-8 * bl) break;
+    if (rtr < 1e-10) break;
+    beta = r_norm_2 / rtr;
+    for (i = 0; i < bl; i++) {
+      p[i] = r[i] + beta * p[i];
+    }
+  }
+  // free
+  free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using Jacobi method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+  double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+         *tempx = NULL;
+  double resi2;
+
+  diags = aom_calloc(bl, sizeof(*diags));
+  Rx = aom_calloc(bl, sizeof(*Rx));
+  x_last = aom_calloc(bl, sizeof(*x_last));
+  x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+  if (!diags || !Rx || !x_last || !x_cur) {
+    free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  memset(x_last, 0, sizeof(*x_last) * bl);
+  // get the diagonals of A
+  memset(diags, 0, sizeof(*diags) * bl);
+  for (int c = 0; c < A->n_elem; c++) {
+    if (A->row_pos[c] != A->col_pos[c]) continue;
+    diags[A->row_pos[c]] = A->value[c];
+  }
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // R = A - diag(diags)
+    // get R*x_last
+    memset(Rx, 0, sizeof(*Rx) * bl);
+    for (int c = 0; c < A->n_elem; c++) {
+      if (A->row_pos[c] == A->col_pos[c]) continue;
+      Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
+    }
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      x_cur[i] = (b[i] - Rx[i]) / diags[i];
+      resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
+    }
+    if (resi2 <= 1e-10 * bl) break;
+    // swap last & cur buffer ptrs
+    tempx = x_last;
+    x_last = x_cur;
+    x_cur = tempx;
+  }
+  printf("\n numiter: %d\n", k);
+  for (i = 0; i < bl; i++) {
+    x[i] = x_cur[i];
+  }
+  free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using Steepest descent method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
+ */
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x) {
+  double *d = NULL, *Ad = NULL, *Ax = NULL;
+  double resi2, resi2_last, dAd, temp;
+
+  d = aom_calloc(bl, sizeof(*d));
+  Ax = aom_calloc(bl, sizeof(*Ax));
+  Ad = aom_calloc(bl, sizeof(*Ad));
+
+  if (!d || !Ax || !Ad) {
+    free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  // initialize with 0s
+  resi2 = 0;
+  for (i = 0; i < bl; i++) {
+    x[i] = 0;
+    d[i] = b[i];
+    resi2 += d[i] * d[i] / bl;
+  }
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // get A*x_last
+    av1_mtx_vect_multi_right(A, d, Ad, bl);
+    dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
+    for (i = 0; i < bl; i++) {
+      temp = dAd * d[i];
+      x[i] = x[i] + temp;
+    }
+    av1_mtx_vect_multi_right(A, x, Ax, bl);
+    resi2_last = resi2;
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      d[i] = b[i] - Ax[i];
+      resi2 += d[i] * d[i] / bl;
+    }
+    if (resi2 <= 1e-8) break;
+    if (resi2_last - resi2 < 1e-8) {
+      break;
+    }
+  }
+  free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+#endif  // CONFIG_OPTICAL_FLOW_API
diff --git a/media/libaom/src/av1/encoder/sparse_linear_solver.h b/media/libaom/src/av1/encoder/sparse_linear_solver.h
new file mode 100644
index 0000000000..f30fc0f5b1
--- /dev/null
+++ b/media/libaom/src/av1/encoder/sparse_linear_solver.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Number of iterations for solving linear equations.
+#define MAX_CG_SP_ITER 100
+
+typedef struct {
+  int n_elem;  // number of non-zero elements
+  int n_rows;
+  int n_cols;
+  // using arrays to represent non-zero elements.
+  int *col_pos;
+  int *row_pos;  // starts with 0
+  double *value;
+} SPARSE_MTX;
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols);
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl);
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl);
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x);
+
+#endif  // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/media/libaom/src/av1/encoder/speed_features.c b/media/libaom/src/av1/encoder/speed_features.c
index e03faeccc6..a671ea77d1 100644
--- a/media/libaom/src/av1/encoder/speed_features.c
+++ b/media/libaom/src/av1/encoder/speed_features.c
@@ -50,8 +50,11 @@ static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
 // Index 2: Winner mode evaluation. Index 1 and 2 are applicable when
 // enable_winner_mode_for_use_tx_domain_dist speed feature is ON
 // TODO(any): Experiment the threshold logic based on variance metric
-static unsigned int tx_domain_dist_thresholds[3][MODE_EVAL_TYPES] = {
-  { UINT_MAX, UINT_MAX, UINT_MAX }, { 22026, 22026, 22026 }, { 0, 0, 0 }
+static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX },
+  { 22026, 22026, 22026 },
+  { 1377, 1377, 1377 },
+  { 0, 0, 0 }
 };
 
 // Transform domain distortion type to be used for default, mode and winner mode
@@ -66,6 +69,7 @@ static unsigned int tx_domain_dist_types[3][MODE_EVAL_TYPES] = { { 0, 2, 0 },
 // Threshold values to be used for disabling coeff RD-optimization
 // based on block MSE / qstep^2.
 // TODO(any): Experiment the threshold logic based on variance metric.
+// Table has satd and dist threshold value index 0 : dist,index 1: satd
 // For each row, the indices are as follows.
 // Index 0: Default mode evaluation, Winner mode processing is not applicable
 // (Eg : IntraBc)
@@ -73,14 +77,17 @@ static unsigned int tx_domain_dist_types[3][MODE_EVAL_TYPES] = { { 0, 2, 0 },
 // Index 2: Winner mode evaluation.
 // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
 // feature is ON
-// There are 6 levels with increasing speed, mapping to vertical indices.
-static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = {
-  { UINT_MAX, UINT_MAX, UINT_MAX },
-  { 3200, 250, UINT_MAX },
-  { 1728, 142, UINT_MAX },
-  { 864, 142, UINT_MAX },
-  { 432, 86, UINT_MAX },
-  { 216, 86, UINT_MAX }
+// There are 7 levels with increasing speed, mapping to vertical indices.
+static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = {
+  { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } },
+  { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } },
+  { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } },
+  { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } }
 };
 
 // Transform size to be used for default, mode and winner mode evaluation
@@ -88,10 +95,11 @@ static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = {
 // (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation.
 // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
 // feature is ON
-static TX_SIZE_SEARCH_METHOD tx_size_search_methods[3][MODE_EVAL_TYPES] = {
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = {
   { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
   { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
-  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }
+  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+  { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
 };
 
 // Predict transform skip levels to be used for default, mode and winner mode
@@ -105,30 +113,448 @@ static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
                                                                 { 1, 1, 1 },
                                                                 { 1, 2, 1 } };
 
+// Predict DC block levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of skip flag prediction.
+// 0 : no early DC block prediction
+// 1 : Early DC block prediction based on error variance
+static unsigned int predict_dc_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+                                                              { 1, 1, 0 },
+                                                              { 1, 1, 1 } };
+
+#if !CONFIG_FRAME_PARALLEL_ENCODE || \
+    (CONFIG_FRAME_PARALLEL_ENCODE && !CONFIG_FPMT_TEST)
+// This table holds the maximum number of reference frames for global motion.
+// The table is indexed as per the speed feature 'gm_search_type'.
+// 0 : All reference frames are allowed.
+// 1 : All reference frames except L2 and L3 are allowed.
+// 2 : All reference frames except L2, L3 and ARF2 are allowed.
+// 3 : No reference frame is allowed.
+static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = {
+  INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0
+};
+#endif
+
+// Qindex threshold levels used for selecting full-pel motion search.
+// ms_qthresh[i][j][k] indicates the qindex boundary value for 'k'th qindex band
+// for resolution index 'j' for aggressiveness level 'i'.
+// Aggressiveness increases from i = 0 to 2.
+// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution.
+// Currently invoked only for speed 0, 1 and 2.
+static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } },
+                                         { { 170, 50 }, { MAXQ, 200 } },
+                                         { { 170, 40 }, { 200, 40 } } };
+
+// Full-pel search methods for aggressive search based on qindex.
+// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger
+// resolutions. Currently invoked only for speed 1 and 2.
+static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND };
+
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const AV1_COMP *cpi) {
   return frame_is_kf_gf_arf(cpi);
 }
 
-static BLOCK_SIZE dim_to_size(int dim) {
-  switch (dim) {
-    case 4: return BLOCK_4X4;
-    case 8: return BLOCK_8X8;
-    case 16: return BLOCK_16X16;
-    case 32: return BLOCK_32X32;
-    case 64: return BLOCK_64X64;
-    case 128: return BLOCK_128X128;
-    default: assert(0); return 0;
+static void set_allintra_speed_feature_framesize_dependent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+  const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+  const bool use_hbd = cpi->oxcf.use_highbitdepth;
+
+  if (is_480p_or_larger) {
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+    if (is_720p_or_larger)
+      sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+    else
+      sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+  } else {
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+    sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+  }
+
+  if (is_4k_or_larger) {
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+  }
+
+  // TODO(huisu@google.com): train models for 720P and above.
+  if (!is_720p_or_larger) {
+    sf->part_sf.ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+    sf->part_sf.ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+    sf->part_sf.ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+    sf->part_sf.ml_partition_search_breakout_thresh[3] = 500;  // BLOCK_64X64
+    sf->part_sf.ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
+    sf->part_sf.ml_early_term_after_part_split_level = 1;
+  }
+
+  if (is_720p_or_larger) {
+    // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // current block's vertical texture instead of hardcoded with resolution
+    sf->mv_sf.use_downsampled_sad = 1;
+  }
+
+  if (speed >= 1) {
+    if (is_720p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+    } else if (is_480p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+    } else {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->part_sf.ml_partition_search_breakout_thresh[0] = 200;  // BLOCK_8X8
+      sf->part_sf.ml_partition_search_breakout_thresh[1] = 250;  // BLOCK_16X16
+      sf->part_sf.ml_partition_search_breakout_thresh[2] = 300;  // BLOCK_32X32
+      sf->part_sf.ml_partition_search_breakout_thresh[3] = 300;  // BLOCK_64X64
+      sf->part_sf.ml_partition_search_breakout_thresh[4] = -1;  // BLOCK_128X128
+    }
+    sf->part_sf.ml_early_term_after_part_split_level = 2;
+  }
+
+  if (speed >= 2) {
+    if (is_720p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+    } else if (is_480p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+    } else {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+    }
+
+    if (is_720p_or_larger) {
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+      sf->part_sf.partition_search_breakout_rate_thr = 120;
+    } else {
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+      sf->part_sf.partition_search_breakout_rate_thr = 100;
+    }
+
+    if (is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+    } else {
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+    }
+  }
+
+  if (speed >= 3) {
+    sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+    if (is_720p_or_larger) {
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+      sf->part_sf.partition_search_breakout_rate_thr = 200;
+    } else {
+      sf->part_sf.max_intra_bsize = BLOCK_32X32;
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+      sf->part_sf.partition_search_breakout_rate_thr = 120;
+    }
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+  }
+
+  if (speed >= 4) {
+    if (is_720p_or_larger) {
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+    } else {
+      sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+    }
+
+    if (is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+    }
+  }
+
+  if (speed >= 6) {
+    if (is_720p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+    } else if (is_480p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    }
+
+    if (is_1080p_or_larger) {
+      sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    }
+
+    sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+  }
+
+  if (speed >= 7) {
+    // TODO(kyslov): add more speed features to control speed/quality
+  }
+
+  if (speed >= 8) {
+    if (!is_480p_or_larger) {
+      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+    }
+    if (is_720p_or_larger) {
+      sf->rt_sf.force_large_partition_blocks_intra = 1;
+    }
+  }
+
+  if (speed >= 9) {
+    // TODO(kyslov): add more speed features to control speed/quality
+    if (!is_4k_or_larger) {
+      sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF;
+      sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF;
+    }
+  }
+}
+
+static void set_allintra_speed_features_framesize_independent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int allow_screen_content_tools =
+      cm->features.allow_screen_content_tools;
+  const int use_hbd = cpi->oxcf.use_highbitdepth;
+
+  sf->part_sf.less_rectangular_check_level = 1;
+  sf->part_sf.ml_prune_partition = 1;
+  sf->part_sf.prune_ext_partition_types_search_level = 1;
+  sf->part_sf.prune_part4_search = 2;
+  sf->part_sf.simple_motion_search_prune_rect = 1;
+  sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.use_best_rd_for_pruning = 1;
+
+  sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.prune_luma_palette_size_search_level = 1;
+  sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+  sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+  sf->tx_sf.adaptive_txb_search_level = 1;
+  sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+  sf->tx_sf.model_based_prune_tx_search_level = 1;
+  sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+  sf->rt_sf.use_nonrd_pick_mode = 0;
+  sf->rt_sf.use_real_time_ref_set = 0;
+
+  if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+      cpi->use_screen_content_tools) {
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+  } else {
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+  }
+
+  sf->rd_sf.perform_coeff_opt = 1;
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+  if (speed >= 1) {
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 0 : 2;
+    sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean-up frame type dependency for
+    // simple_motion_search_split in partition search function and set the
+    // speed feature accordingly
+    sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+    sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+    sf->part_sf.reuse_best_prediction_for_part_ab = 1;
+
+    sf->mv_sf.exhaustive_searches_thresh <<= 1;
+
+    sf->intra_sf.prune_palette_search_level = 1;
+    sf->intra_sf.prune_luma_palette_size_search_level = 2;
+    sf->intra_sf.top_intra_model_count_allowed = 3;
+
+    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+    sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+    sf->tx_sf.model_based_prune_tx_search_level = 0;
+    sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+    sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+    sf->rd_sf.perform_coeff_opt = 2;
+    sf->rd_sf.tx_domain_dist_level = 1;
+    sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+    sf->lpf_sf.dual_sgr_penalty_level = 1;
+    sf->lpf_sf.enable_sgr_ep_pruning = 1;
+  }
+
+  if (speed >= 2) {
+    sf->mv_sf.auto_mv_step_size = 1;
+
+    sf->intra_sf.disable_smooth_intra = 1;
+    sf->intra_sf.intra_pruning_with_hog = 2;
+    sf->intra_sf.prune_filter_intra_level = 1;
+
+    sf->rd_sf.perform_coeff_opt = 3;
+
+    sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+    sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+  }
+
+  if (speed >= 3) {
+    sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+    sf->part_sf.less_rectangular_check_level = 2;
+    sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1;
+    sf->part_sf.prune_ext_part_using_split_info = 1;
+
+    sf->mv_sf.full_pixel_search_level = 1;
+    sf->mv_sf.search_method = DIAMOND;
+
+    // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+    // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a bayesian optimizer to find the Pareto frontier.
+    sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+    sf->intra_sf.intra_pruning_with_hog = 3;
+    sf->intra_sf.prune_palette_search_level = 2;
+
+    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+
+    // TODO(any): evaluate if these lpf features can be moved to speed 2.
+    // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
+    // loss.
+    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+    sf->lpf_sf.disable_loop_restoration_chroma = 0;
+    sf->lpf_sf.reduce_wiener_window_size = 1;
+    sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+  }
+
+  if (speed >= 4) {
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+    sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2;
+    sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+    sf->part_sf.prune_ext_part_using_split_info = 2;
+    sf->part_sf.early_term_after_none_split = 1;
+    sf->part_sf.ml_predict_breakout_level = 3;
+
+    sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+    sf->tpl_sf.prune_starting_mv = 2;
+    sf->tpl_sf.subpel_force_stop = HALF_PEL;
+    sf->tpl_sf.search_method = FAST_BIGDIA;
+
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+
+    sf->rd_sf.perform_coeff_opt = 5;
+    sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+
+    sf->mv_sf.reduce_search_range = 1;
+
+    sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+  }
+
+  if (speed >= 5) {
+    sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3;
+    sf->part_sf.ext_partition_eval_thresh =
+        allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 1 : 2;
+
+    sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+    sf->lpf_sf.use_coarse_filter_level_search = 0;
+    sf->lpf_sf.disable_lr_filter = 1;
+
+    sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST;
+  }
+
+  if (speed >= 6) {
+    sf->intra_sf.prune_filter_intra_level = 2;
+    sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_pruning_with_hog = 4;
+    sf->intra_sf.cfl_search_range = 1;
+    sf->intra_sf.top_intra_model_count_allowed = 2;
+    sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1;
+
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        allow_screen_content_tools ? 0 : 2;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        allow_screen_content_tools ? 0 : 1;
+    sf->part_sf.prune_part4_search = 3;
+    // TODO(jingning): This might not be a good trade off if the
+    // target image quality is very low.
+    sf->part_sf.default_max_partition_size = BLOCK_32X32;
+
+    sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
+
+    sf->rd_sf.perform_coeff_opt = 6;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+    sf->winner_mode_sf.prune_winner_mode_eval_level = 1;
+  }
+  // The following should make all-intra mode speed 7 approximately equal
+  // to real-time speed 6,
+  // all-intra speed 8 close to real-time speed 7, and all-intra speed 9
+  // close to real-time speed 8
+  if (speed >= 7) {
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->rt_sf.var_part_split_threshold_shift = 7;
+  }
+
+  if (speed >= 8) {
+    sf->rt_sf.hybrid_intra_pickmode = 1;
+    sf->rt_sf.use_nonrd_pick_mode = 1;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+    sf->rt_sf.var_part_split_threshold_shift = 8;
+    // Set mask for intra modes.
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      if (i >= BLOCK_32X32)
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+      else
+        // Use DC, H, V intra mode for block sizes < 32X32.
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 9) {
+    sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+    sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+
+    sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+    sf->rt_sf.hybrid_intra_pickmode = 0;
+    sf->rt_sf.var_part_split_threshold_shift = 9;
+    sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
   }
 }
 
 static void set_good_speed_feature_framesize_dependent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
   const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+  const bool use_hbd = cpi->oxcf.use_highbitdepth;
+  const int boosted = frame_is_boosted(cpi);
+  const int is_boosted_arf2_bwd_type =
+      boosted ||
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+  const int is_lf_frame =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE;
 
   if (is_480p_or_larger) {
     sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
@@ -139,6 +565,7 @@ static void set_good_speed_feature_framesize_dependent(
   } else {
     sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
     sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
   }
 
   if (is_4k_or_larger) {
@@ -155,7 +582,22 @@ static void set_good_speed_feature_framesize_dependent(
     sf->part_sf.ml_early_term_after_part_split_level = 1;
   }
 
+  if (is_720p_or_larger) {
+    // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // current block's vertical texture instead of hardcoded with resolution
+    sf->mv_sf.use_downsampled_sad = 1;
+  }
+
+  if (!is_720p_or_larger) {
+    const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+    const int rate_tolerance =
+        AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+    sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2);
+  }
+
   if (speed >= 1) {
+    if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1;
+
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
     } else if (is_480p_or_larger) {
@@ -172,6 +614,8 @@ static void set_good_speed_feature_framesize_dependent(
       sf->part_sf.ml_partition_search_breakout_thresh[4] = -1;  // BLOCK_128X128
     }
     sf->part_sf.ml_early_term_after_part_split_level = 2;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
   }
 
   if (speed >= 2) {
@@ -198,35 +642,118 @@ static void set_good_speed_feature_framesize_dependent(
     }
 
     if (is_480p_or_larger) {
+      sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
+    } else {
+      sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+    }
+
+    if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0;
+    } else {
+      sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0;
+    }
+
+    if (is_480p_or_larger) {
       sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+    } else {
+      if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+      sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1;
+      sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->mv_sf.disable_second_mv = 1;
+      sf->mv_sf.auto_mv_step_size = 2;
+    } else {
+      sf->mv_sf.disable_second_mv = boosted ? 0 : 2;
+      sf->mv_sf.auto_mv_step_size = 1;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->hl_sf.recode_tolerance = 50;
+      sf->inter_sf.disable_interinter_wedge_newmv_search =
+          is_boosted_arf2_bwd_type ? 0 : 1;
+      sf->inter_sf.enable_fast_wedge_mask_search = 1;
     }
   }
 
   if (speed >= 3) {
+    sf->inter_sf.enable_fast_wedge_mask_search = 1;
+    sf->inter_sf.skip_newmv_in_drl = 2;
+    sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+    sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0;
+    sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
+        frame_is_intra_only(&cpi->common) ? 0 : 1;
+
     sf->part_sf.ml_early_term_after_part_split_level = 0;
 
     if (is_720p_or_larger) {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
       sf->part_sf.partition_search_breakout_rate_thr = 200;
+      sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
     } else {
       sf->part_sf.max_intra_bsize = BLOCK_32X32;
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
       sf->part_sf.partition_search_breakout_rate_thr = 120;
+      sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0;
+    }
+    if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+
+    if (is_480p_or_larger) {
+      sf->part_sf.early_term_after_none_split = 1;
+    } else {
+      sf->part_sf.early_term_after_none_split = 0;
     }
+    if (is_720p_or_larger) {
+      sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2;
+    } else {
+      sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+      sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1;
+    } else {
+      sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+      sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+      sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2;
+    }
+
+    sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
   }
 
   if (speed >= 4) {
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
     if (is_720p_or_larger) {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
     } else {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
     }
+    sf->part_sf.early_term_after_none_split = 1;
 
     if (is_480p_or_larger) {
       sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
     }
 
-    sf->inter_sf.prune_obmc_prob_thresh = 16;
+    sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+    sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+    sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+    if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3;
+
+    if (is_720p_or_larger)
+      sf->hl_sf.recode_tolerance = 32;
+    else
+      sf->hl_sf.recode_tolerance = 55;
+
+    sf->intra_sf.skip_intra_in_interframe = 4;
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
   }
 
   if (speed >= 5) {
@@ -235,137 +762,169 @@ static void set_good_speed_feature_framesize_dependent(
     } else if (is_480p_or_larger) {
       sf->inter_sf.prune_warped_prob_thresh = 8;
     }
-  }
-}
+    if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40;
 
-static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
-                                                     SPEED_FEATURES *const sf,
-                                                     int speed) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
-  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
-  const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+    sf->inter_sf.skip_newmv_in_drl = 4;
 
-  (void)is_720p_or_larger;  // Not used so far
+    if (!is_720p_or_larger) {
+      sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+    }
 
-  if (!is_360p_or_larger) {
-    if (speed >= 6) sf->rt_sf.force_tx_search_off = 1;
-    if (speed >= 8) {
-      sf->rt_sf.use_modeled_non_rd_cost = 0;
-      sf->rt_sf.use_nonrd_filter_search = 0;
+    if (!is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+          boosted ? INT_MAX : 250;
     }
-  }
-  if (is_360p_or_larger) {
-    if (speed >= 7) {
-      sf->interp_sf.disable_filter_search_var_thresh = 0;
+
+    if (is_480p_or_lesser) {
+      sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1;
+    } else {
+      sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
     }
   }
-  if (!is_480p_or_larger) {
-    if (speed == 7) {
-      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+
+  if (speed >= 6) {
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+    sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+    if (is_720p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+    } else if (is_480p_or_larger) {
+      sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
     }
-    if (speed >= 8) {
-      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
 
-      sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+    if (is_1080p_or_larger) {
+      sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.disable_masked_comp = 1;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+      sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
     }
+
+    if (is_720p_or_larger) {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+    } else {
+      sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.prune_ref_mv_idx_search = 2;
+    } else {
+      sf->inter_sf.prune_ref_mv_idx_search = 1;
+    }
+
+    if (!is_720p_or_larger) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 150;
+    }
+
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+
+    if (!is_480p_or_larger) sf->hl_sf.num_frames_used_in_tf = 3;
   }
 }
 
 static void set_good_speed_features_framesize_independent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int boosted = frame_is_boosted(cpi);
   const int is_boosted_arf2_bwd_type =
-      boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+      boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+  const int is_inter_frame =
+      gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME;
   const int allow_screen_content_tools =
       cm->features.allow_screen_content_tools;
-  if (!cpi->oxcf.large_scale_tile) {
+  const int use_hbd = cpi->oxcf.use_highbitdepth;
+  if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) {
     sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
   }
 
   // Speed 0 for all speed features that give neutral coding performance change.
-  sf->gm_sf.gm_disable_recode = 1;
   sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
 
   sf->part_sf.less_rectangular_check_level = 1;
-  sf->part_sf.ml_prune_4_partition = 1;
-  sf->part_sf.ml_prune_ab_partition = 1;
-  sf->part_sf.ml_prune_rect_partition = 1;
+  sf->part_sf.ml_prune_partition = 1;
   sf->part_sf.prune_ext_partition_types_search_level = 1;
+  sf->part_sf.prune_part4_search = 2;
   sf->part_sf.simple_motion_search_prune_rect = 1;
+  sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.use_best_rd_for_pruning = 1;
+  sf->part_sf.simple_motion_search_prune_agg =
+      allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0;
 
-  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-  sf->inter_sf.disable_wedge_search_var_thresh = 0;
   // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 1;
   sf->inter_sf.model_based_post_interp_filter_breakout = 1;
   sf->inter_sf.prune_compound_using_single_ref = 1;
   sf->inter_sf.prune_mode_search_simple_translation = 1;
-  sf->inter_sf.prune_motion_mode_level = 1;
   sf->inter_sf.prune_ref_frame_for_rect_partitions =
       (boosted || (allow_screen_content_tools))
           ? 0
           : (is_boosted_arf2_bwd_type ? 1 : 2);
-  sf->inter_sf.prune_wedge_pred_diff_based = 1;
-  sf->inter_sf.reduce_inter_modes = 1;
+  sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
   sf->inter_sf.selective_ref_frame = 1;
   sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
 
-  sf->interp_sf.cb_pred_filter_search = 0;
   sf->interp_sf.use_fast_interpolation_filter_search = 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
-  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
 
   sf->tx_sf.adaptive_txb_search_level = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
   sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
 
+  sf->tpl_sf.search_method = NSTEP_8PT;
+
   sf->rt_sf.use_nonrd_pick_mode = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
-    sf->mv_sf.exhaustive_searches_thresh = (1 << 24);
-  else
+  if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+      cpi->use_screen_content_tools) {
+    sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+  } else {
     sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+  }
 
   sf->rd_sf.perform_coeff_opt = 1;
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
 
   if (speed >= 1) {
-    sf->gm_sf.disable_adaptive_warp_error_thresh = 0;
     sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
     sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
 
-    sf->part_sf.intra_cnn_split = 1;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 0 : 2;
     sf->part_sf.simple_motion_search_early_term_none = 1;
     // TODO(Venkat): Clean-up frame type dependency for
     // simple_motion_search_split in partition search function and set the
     // speed feature accordingly
     sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+    sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
 
     sf->mv_sf.exhaustive_searches_thresh <<= 1;
     sf->mv_sf.obmc_full_pixel_search_level = 1;
     sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
+    sf->mv_sf.disable_extensive_joint_motion_search = 1;
 
-    sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
     sf->inter_sf.prune_comp_type_by_comp_avg = 1;
     sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
-    sf->inter_sf.prune_motion_mode_level = 2;
     sf->inter_sf.prune_ref_frame_for_rect_partitions =
         (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
             ? 0
             : (boosted ? 1 : 2);
-    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
     sf->inter_sf.reuse_inter_intra_mode = 1;
     sf->inter_sf.selective_ref_frame = 2;
-    sf->inter_sf.skip_repeated_newmv = 1;
+    sf->inter_sf.skip_arf_compound = 1;
 
-    sf->interp_sf.cb_pred_filter_search = 0;
     sf->interp_sf.use_interp_filter = 1;
+
     sf->intra_sf.prune_palette_search_level = 1;
 
     sf->tx_sf.adaptive_txb_search_level = 2;
@@ -374,15 +933,13 @@ static void set_good_speed_features_framesize_independent(
     sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
     sf->tx_sf.model_based_prune_tx_search_level = 0;
     sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
     sf->tx_sf.tx_type_search.skip_tx_search = 1;
-    sf->tx_sf.use_intra_txb_hash = 1;
 
     sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
     sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
     sf->rd_sf.tx_domain_dist_thres_level = 1;
 
-    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
     sf->lpf_sf.dual_sgr_penalty_level = 1;
     sf->lpf_sf.enable_sgr_ep_pruning = 1;
 
@@ -391,489 +948,709 @@ static void set_good_speed_features_framesize_independent(
   }
 
   if (speed >= 2) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
+    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
 
-    sf->part_sf.allow_partition_search_skip = 1;
+    sf->fp_sf.skip_motion_search_threshold = 25;
 
-    sf->mv_sf.auto_mv_step_size = 1;
+    sf->gm_sf.disable_gm_search_based_on_stats = 1;
+
+    sf->part_sf.reuse_best_prediction_for_part_ab =
+        !frame_is_intra_only(&cpi->common);
+
+    sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
     sf->mv_sf.subpel_iters_per_step = 1;
+    sf->mv_sf.reduce_search_range = 1;
 
     // TODO(chiyotsai@google.com): We can get 10% speed up if we move
     // adaptive_rd_thresh to speed 1. But currently it performs poorly on some
     // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
     // bit more closely to figure out why.
     sf->inter_sf.adaptive_rd_thresh = 1;
-    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_interinter_wedge_newmv_search = 1;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-    sf->inter_sf.disable_wedge_search_var_thresh = 100;
+    sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
     sf->inter_sf.fast_interintra_wedge_search = 1;
-    sf->inter_sf.fast_wedge_sign_estimate = 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
-    sf->inter_sf.prune_compound_using_neighbors = 1;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 1;
+    sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
     sf->inter_sf.prune_comp_type_by_comp_avg = 2;
-    sf->inter_sf.prune_warp_using_wmtype = 1;
     sf->inter_sf.selective_ref_frame = 3;
     sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+    // Enable fast search only for COMPOUND_DIFFWTD type.
+    sf->inter_sf.enable_fast_compound_mode_search = 1;
+    sf->inter_sf.reuse_mask_search_results = 1;
+    sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 1;
+    sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1;
+    sf->inter_sf.alt_ref_search_fp = 1;
 
-    // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
     sf->interp_sf.adaptive_interp_filter_search = 1;
     sf->interp_sf.disable_dual_filter = 1;
-    sf->interp_sf.disable_filter_search_var_thresh = 100;
 
     sf->intra_sf.disable_smooth_intra =
-        !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+        !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1);
+    sf->intra_sf.intra_pruning_with_hog = 2;
+    sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1;
+    sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+    sf->tpl_sf.prune_starting_mv = 1;
+    sf->tpl_sf.search_method = DIAMOND;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+    sf->rd_sf.use_mb_rd_hash = 1;
 
     sf->lpf_sf.prune_wiener_based_on_src_var = 1;
-    sf->lpf_sf.prune_sgr_based_on_wiener = !allow_screen_content_tools;
+    sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+    sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
+    sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1;
+
+    // TODO(any): Re-evaluate this feature set to 1 in speed 2.
+    sf->tpl_sf.allow_compound_pred = 0;
+    sf->tpl_sf.prune_ref_frames_in_tpl = 1;
   }
 
   if (speed >= 3) {
     sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
-    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
 
     sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+    sf->gm_sf.prune_zero_mv_with_sse = 1;
 
     sf->part_sf.less_rectangular_check_level = 2;
-    sf->part_sf.simple_motion_search_prune_agg = 1;
-    sf->part_sf.prune_4_partition_using_split_info =
-        !allow_screen_content_tools;
-
-    // adaptive_motion_search breaks encoder multi-thread tests.
-    // The values in x->pred_mv[] differ for single and multi-thread cases.
-    // See aomedia:1778.
-    // sf->mv_sf.adaptive_motion_search = 1;
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools
+            ? SIMPLE_AGG_LVL0
+            : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1);
+    sf->part_sf.prune_ext_part_using_split_info = 1;
+    sf->part_sf.simple_motion_search_rect_split = 1;
+
     sf->mv_sf.full_pixel_search_level = 1;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
-    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
     sf->mv_sf.search_method = DIAMOND;
+    sf->mv_sf.disable_second_mv = 2;
+    sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
 
-    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
-    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
-    // it with cpi->sf.disable_wedge_search_var_thresh.
-    sf->inter_sf.disable_wedge_interintra_search = 1;
+    sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+    sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+    sf->inter_sf.disable_onesided_comp = 1;
+    sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
     // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
     // and clean-up the speed feature
     sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
-    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
-    sf->inter_sf.selective_ref_frame = 4;
+    sf->inter_sf.selective_ref_frame = 5;
     sf->inter_sf.skip_repeated_ref_mv = 1;
-    sf->inter_sf.skip_repeated_full_newmv = 1;
-    if (cpi->oxcf.enable_smooth_interintra)
-      sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
-    sf->inter_sf.txfm_rd_gate_level = (boosted || allow_screen_content_tools)
-                                          ? 0
-                                          : (is_boosted_arf2_bwd_type ? 1 : 2);
+    sf->inter_sf.txfm_rd_gate_level =
+        boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
+    sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2;
 
+    sf->interp_sf.adaptive_interp_filter_search = 2;
+
+    // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+    // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a bayesian optimizer to find the Pareto frontier.
+    sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+    sf->intra_sf.intra_pruning_with_hog = 3;
     sf->intra_sf.prune_palette_search_level = 2;
+    sf->intra_sf.top_intra_model_count_allowed = 2;
 
+    sf->tpl_sf.prune_starting_mv = 2;
     sf->tpl_sf.skip_alike_starting_mv = 2;
     sf->tpl_sf.prune_intra_modes = 1;
     sf->tpl_sf.reduce_first_step_size = 6;
+    sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+    sf->tpl_sf.gop_length_decision_method = 1;
 
     sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
-    sf->tx_sf.tx_type_search.use_skip_flag_prediction =
-        allow_screen_content_tools ? 1 : 2;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
 
     // TODO(any): Refactor the code related to following winner mode speed
     // features
     sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
-    // TODO(any): Experiment with this speed feature by enabling for key frames
-    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
-        frame_is_intra_only(&cpi->common) ? 0 : 1;
-    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist =
-        !allow_screen_content_tools;
+    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
     sf->winner_mode_sf.motion_mode_for_winner_cand =
-        boosted
-            ? 0
-            : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1
-                                                                         : 2;
-
-    // TODO(any): evaluate if these lpf features can be moved to speed 2.
-    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 0 : 2;
-    sf->lpf_sf.disable_loop_restoration_chroma =
-        (boosted || allow_screen_content_tools) ? 0 : 1;
-    sf->lpf_sf.reduce_wiener_window_size = !boosted;
+        boosted ? 0
+                : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE
+                      ? 1
+                      : 2;
+    sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4;
+
+    // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
+    // loss.
+    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
     sf->lpf_sf.prune_wiener_based_on_src_var = 2;
-
-    sf->hl_sf.second_alt_ref_filtering = 0;
+    sf->lpf_sf.use_coarse_filter_level_search =
+        frame_is_intra_only(&cpi->common) ? 0 : 1;
+    sf->lpf_sf.use_downsampled_wiener_stats = 1;
   }
 
   if (speed >= 4) {
+    sf->gm_sf.prune_zero_mv_with_sse = 2;
+
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
 
-    sf->part_sf.simple_motion_search_prune_agg = 2;
-    sf->part_sf.prune_ab_partition_using_split_info =
-        !allow_screen_content_tools;
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2;
+    sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+    sf->part_sf.prune_ext_part_using_split_info = 2;
+    sf->part_sf.ml_predict_breakout_level = 3;
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+                                                                          : 1;
 
-    sf->inter_sf.adaptive_mode_search = 1;
-    sf->inter_sf.alt_ref_search_fp = 1;
-    sf->inter_sf.prune_ref_mv_idx_search = 1;
-    sf->inter_sf.txfm_rd_gate_level =
-        (boosted || allow_screen_content_tools) ? 0 : 3;
+    sf->inter_sf.alt_ref_search_fp = 2;
+    sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 3;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
-    sf->inter_sf.prune_compound_using_neighbors = 2;
-    sf->inter_sf.disable_smooth_interintra = 1;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 2;
+    sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+    sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+    sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = boosted ? 0 : 1;
 
     sf->interp_sf.cb_pred_filter_search = 1;
     sf->interp_sf.skip_sharp_interp_filter_search = 1;
     sf->interp_sf.use_interp_filter = 2;
-    sf->interp_sf.adaptive_interp_filter_search = 2;
 
     sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
     sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
     sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
-    // TODO(any): Experiment with this speed feature set to 2 for higher quality
-    // presets as well
-    sf->intra_sf.skip_intra_in_interframe = 2;
-
-    sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1;
+    // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4.
+    // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    sf->intra_sf.skip_intra_in_interframe = 4;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+    sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+    sf->tpl_sf.subpel_force_stop = HALF_PEL;
+    sf->tpl_sf.search_method = FAST_BIGDIA;
+
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
-    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
-    // TODO(any): Experiment with enabling of this speed feature as hash state
-    // is reset during winner mode processing
-    sf->tx_sf.use_intra_txb_hash = 0;
 
-    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 5;
-    sf->rd_sf.tx_domain_dist_thres_level = 2;
+    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
 
     // TODO(any): Extend multi-winner mode processing support for inter frames
-    sf->winner_mode_sf.enable_multiwinner_mode_process =
-        frame_is_intra_only(&cpi->common) ? 1 : 0;
-    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+    sf->winner_mode_sf.multi_winner_mode_type =
+        frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT
+                                          : MULTI_WINNER_MODE_OFF;
+    sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 1;
 
-    sf->lpf_sf.cdef_pick_method = allow_screen_content_tools
-                                      ? CDEF_FAST_SEARCH_LVL1
-                                      : CDEF_FAST_SEARCH_LVL2;
-
-    // TODO(any): The following features have no impact on quality and speed,
-    // and are disabled.
-    // sf->part_sf.partition_search_breakout_rate_thr = 300;
-    // sf->interp_sf.disable_filter_search_var_thresh = 200;
-    // sf->rd_sf.use_fast_coef_costing = 1;
-
-    // TODO(any): The following features give really bad quality/speed trade
-    // off. Needs to be re-worked.
-    // sf->mv_sf.search_method = BIGDIA;
-    // sf->inter_sf.adaptive_rd_thresh = 4;
-    // sf->rd_sf.tx_domain_dist_level = 2;
-    // sf->rt_sf.mode_search_skip_flags =
-    //     (cm->current_frame.frame_type == KEY_FRAME)
-    //     ? 0
-    //     : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
-    //     FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
-    //     FLAG_EARLY_TERMINATE;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
   }
 
   if (speed >= 5) {
-    sf->part_sf.simple_motion_search_prune_agg = 3;
+    sf->fp_sf.reduce_mv_step_param = 4;
+
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
     sf->part_sf.ext_partition_eval_thresh =
         allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+                                                                          : 2;
 
-    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
-    sf->inter_sf.disable_interinter_wedge = 1;
-    sf->inter_sf.disable_obmc = 1;
-    sf->inter_sf.disable_onesided_comp = 1;
-    sf->inter_sf.txfm_rd_gate_level =
-        (boosted || allow_screen_content_tools) ? 0 : 4;
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
+    sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 4;
+    // Enable fast search for all valid compound modes.
+    sf->inter_sf.enable_fast_compound_mode_search = 2;
+    sf->inter_sf.prune_comp_ref_frames = 1;
+
+    sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+    // TODO(any): Extend multi-winner mode processing support for inter frames
+    sf->winner_mode_sf.multi_winner_mode_type =
+        frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
+                                          : MULTI_WINNER_MODE_OFF;
 
-    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
     sf->lpf_sf.disable_lr_filter = 1;
 
-    sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
-    sf->mv_sf.prune_mesh_search = 1;
-    sf->mv_sf.reduce_search_range = 1;
+    sf->tpl_sf.prune_starting_mv = 3;
+    sf->tpl_sf.use_y_only_rate_distortion = 1;
+    sf->tpl_sf.subpel_force_stop = FULL_PEL;
+    sf->tpl_sf.gop_length_decision_method = 2;
 
-    sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+    sf->winner_mode_sf.dc_blk_pred_level = 1;
+
+    sf->fp_sf.disable_recon = 1;
   }
 
   if (speed >= 6) {
+    sf->hl_sf.disable_extra_sc_testing = 1;
+    sf->hl_sf.second_alt_ref_filtering = 0;
+    sf->hl_sf.recode_tolerance = 55;
+
+    sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+    sf->inter_sf.selective_ref_frame = 6;
+    sf->inter_sf.prune_comp_ref_frames = 2;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 3;
+
+    sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+    sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC;
+    sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        boosted || allow_screen_content_tools ? 0 : 2;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        allow_screen_content_tools ? 0
+                                   : frame_is_intra_only(&cpi->common) ? 1 : 2;
+    sf->part_sf.prune_part4_search = 3;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
+    sf->mv_sf.use_bsize_dependent_search_method = 1;
+    sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+
+    sf->tpl_sf.gop_length_decision_method = 3;
+    sf->tpl_sf.disable_filtered_key_tpl = 1;
+
+    sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
+
+    sf->winner_mode_sf.dc_blk_pred_level = 2;
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+
+    sf->fp_sf.skip_zeromv_motion_search = 1;
+  }
+}
+
+static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
+                                                     SPEED_FEATURES *const sf,
+                                                     int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int boosted = frame_is_boosted(cpi);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+
+  if (!is_360p_or_larger) {
+    sf->rt_sf.prune_intra_mode_based_on_mv_range = 1;
+    sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+    if (speed >= 6)
+      sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+    if (speed >= 7) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    if (speed >= 8) {
+      sf->rt_sf.use_nonrd_filter_search = 0;
+      sf->rt_sf.tx_size_level_based_on_qstep = 1;
+    }
+    if (speed >= 9) {
+      sf->rt_sf.use_comp_ref_nonrd = 0;
+      sf->rt_sf.nonrd_agressive_skip = 1;
+      sf->rt_sf.skip_intra_pred = 1;
+// TODO(kyslov) Re-enable when AV1 models are trained
+#if 0
+#if CONFIG_RT_ML_PARTITIONING
+      if (!frame_is_intra_only(cm)) {
+        sf->part_sf.partition_search_type = ML_BASED_PARTITION;
+        sf->rt_sf.reuse_inter_pred_nonrd = 0;
+      }
+#endif
+#endif
+    }
+    if (speed >= 10) {
+      sf->rt_sf.skip_intra_pred = 2;
+      sf->rt_sf.hybrid_intra_pickmode = 3;
+    }
+  } else {
+    sf->rt_sf.prune_intra_mode_based_on_mv_range = 2;
+    sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+    if (speed <= 5) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+          boosted ? INT_MAX : 350;
+      sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+    }
+    if (speed == 8 && !cpi->ppi->use_svc) {
+      sf->rt_sf.short_circuit_low_temp_var = 0;
+      sf->rt_sf.use_nonrd_altref_frame = 1;
+    }
+    if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2;
+    if (speed >= 9) {
+      sf->rt_sf.gf_length_lvl = 1;
+      sf->rt_sf.skip_cdef_sb = 1;
+      sf->rt_sf.sad_based_adp_altref_lag = 2;
+    }
+
+    if (speed >= 10) {
+      // TODO(yunqing): extend this sf to other speeds and/or other resolutions.
+      sf->rt_sf.use_rtc_tf = 1;
+      sf->rt_sf.hybrid_intra_pickmode = 2;
+      sf->rt_sf.sad_based_adp_altref_lag = 4;
+      sf->rt_sf.tx_size_level_based_on_qstep = 0;
+    }
+  }
+  if (!is_480p_or_larger) {
+    if (speed == 7) {
+      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+    }
+    if (speed >= 8) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+    }
+    if (speed >= 9) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+      sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    }
+  }
+  if (!is_720p_or_larger) {
+    if (speed >= 9) {
+      sf->rt_sf.force_large_partition_blocks_intra = 1;
+    }
+  } else {
+    if (speed >= 9) {
+      sf->rt_sf.sad_based_adp_altref_lag = 1;
+      sf->rt_sf.sad_based_comp_prune = 1;
+    }
+    if (speed >= 10) {
+      sf->rt_sf.sad_based_adp_altref_lag = 3;
+      sf->rt_sf.sad_based_comp_prune = 2;
+    }
+  }
+  if (cpi->ppi->use_svc) {
+    if (cpi->svc.ref_frame_comp[0] || cpi->svc.ref_frame_comp[1] ||
+        cpi->svc.ref_frame_comp[2]) {
+      sf->rt_sf.use_comp_ref_nonrd = 1;
+      sf->rt_sf.ref_frame_comp_nonrd[0] =
+          cpi->svc.ref_frame_comp[0] && cpi->svc.reference[GOLDEN_FRAME - 1];
+      sf->rt_sf.ref_frame_comp_nonrd[1] =
+          cpi->svc.ref_frame_comp[1] && cpi->svc.reference[LAST2_FRAME - 1];
+      sf->rt_sf.ref_frame_comp_nonrd[2] =
+          cpi->svc.ref_frame_comp[2] && cpi->svc.reference[ALTREF_FRAME - 1];
+    } else {
+      sf->rt_sf.use_comp_ref_nonrd = 0;
+      sf->rt_sf.sad_based_comp_prune = 0;
+    }
+  }
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    if (speed >= 10) {
+      sf->rt_sf.prune_idtx_nonrd = 1;
+      sf->rt_sf.part_early_exit_zeromv = 1;
+      sf->rt_sf.use_nonrd_filter_search = 0;
+    }
+    if (speed >= 9) sf->rt_sf.skip_lf_screen = 1;
+    sf->rt_sf.skip_cdef_sb = 1;
+    sf->rt_sf.use_rtc_tf = 0;
+    sf->rt_sf.use_comp_ref_nonrd = 0;
+    sf->rt_sf.sad_based_comp_prune = 0;
+    sf->rt_sf.source_metrics_sb_nonrd = 1;
+    if (cpi->rc.high_source_sad == 1) {
+      sf->rt_sf.force_large_partition_blocks = 0;
+      for (int i = 0; i < BLOCK_SIZES; ++i)
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+    }
+    if (cpi->rc.high_num_blocks_with_motion && speed >= 6) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->rt_sf.fullpel_search_step_param = 2;
+    }
+    sf->rt_sf.partition_direct_merging = 0;
   }
 }
 
 // TODO(kyslov): now this is very similar to
 // set_good_speed_features_framesize_independent
-//               except it sets non-rd flag on speed8. This function will likely
-//               be modified in the future with RT-specific speed features
+// except it sets non-rd flag on speed 8. This function will likely
+// be modified in the future with RT-specific speed features.
 static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
                                                         SPEED_FEATURES *sf,
                                                         int speed) {
   AV1_COMMON *const cm = &cpi->common;
   const int boosted = frame_is_boosted(cpi);
 
-  // Speed 0 for all speed features that give neutral coding performance change.
-  sf->gm_sf.gm_disable_recode = 1;
-  sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
-
-  sf->part_sf.less_rectangular_check_level = 1;
-  sf->part_sf.ml_prune_4_partition = 1;
-  sf->part_sf.ml_prune_ab_partition = 1;
-  sf->part_sf.ml_prune_rect_partition = 1;
-  sf->part_sf.prune_ext_partition_types_search_level = 1;
+  // Currently, rt speed 0, 1, 2, 3, 4, 5 are the same.
+  // Following set of speed features are not impacting encoder's decisions as
+  // the relevant tools are disabled by default.
+  sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+  sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+  sf->inter_sf.reuse_inter_intra_mode = 1;
+  sf->inter_sf.prune_compound_using_single_ref = 0;
+  sf->inter_sf.prune_comp_search_by_single_result = 2;
+  sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+  sf->inter_sf.fast_wedge_sign_estimate = 1;
+  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+  sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+  sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+  sf->interp_sf.cb_pred_filter_search = 0;
+  sf->part_sf.ml_prune_partition = 1;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.prune_ext_partition_types_search_level = 2;
+  sf->part_sf.less_rectangular_check_level = 2;
+  sf->mv_sf.obmc_full_pixel_search_level = 1;
+  sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+  sf->tx_sf.model_based_prune_tx_search_level = 0;
+  sf->lpf_sf.dual_sgr_penalty_level = 1;
+  sf->lpf_sf.disable_lr_filter = 1;
+  sf->rt_sf.skip_interp_filter_search = 1;
+  sf->intra_sf.prune_palette_search_level = 2;
+  sf->intra_sf.prune_luma_palette_size_search_level = 2;
+
+  // End of set
+
+  // TODO(any, yunqing): tune these features for real-time use cases.
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
+  sf->hl_sf.frame_parameter_update = 0;
 
-  // TODO(debargha): Test, tweak and turn on either 1 or 2
-  sf->inter_sf.inter_mode_rd_model_estimation = 0;
-  sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-  sf->inter_sf.disable_wedge_search_var_thresh = 0;
   sf->inter_sf.model_based_post_interp_filter_breakout = 1;
-  sf->inter_sf.prune_compound_using_single_ref = 0;
-  sf->inter_sf.prune_mode_search_simple_translation = 1;
-  sf->inter_sf.prune_motion_mode_level = 1;
+  // TODO(any): As per the experiments, this speed feature is doing redundant
+  // computation since the model rd based pruning logic is similar to model rd
+  // based gating when inter_mode_rd_model_estimation = 2. Enable this SF if
+  // either of the condition becomes true.
+  //    (1) inter_mode_rd_model_estimation != 2
+  //    (2) skip_interp_filter_search == 0
+  //    (3) Motion mode or compound mode is enabled */
+  sf->inter_sf.prune_mode_search_simple_translation = 0;
   sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
-  sf->inter_sf.prune_wedge_pred_diff_based = 1;
-  sf->inter_sf.reduce_inter_modes = 1;
-  sf->inter_sf.selective_ref_frame = 1;
-  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+  sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+  sf->inter_sf.selective_ref_frame = 4;
+  sf->inter_sf.alt_ref_search_fp = 2;
+  sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 4;
+  sf->inter_sf.limit_txfm_eval_per_mode = 3;
+
+  sf->inter_sf.adaptive_rd_thresh = 4;
+  sf->inter_sf.inter_mode_rd_model_estimation = 2;
+  sf->inter_sf.prune_inter_modes_if_skippable = 1;
+  sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+  sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+  sf->inter_sf.skip_newmv_in_drl = 4;
 
-  sf->interp_sf.cb_pred_filter_search = 0;
   sf->interp_sf.use_fast_interpolation_filter_search = 1;
-
-  sf->intra_sf.intra_pruning_with_hog = 1;
-  sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
+  sf->interp_sf.use_interp_filter = 1;
+  sf->interp_sf.adaptive_interp_filter_search = 1;
+  sf->interp_sf.disable_dual_filter = 1;
+
+  sf->part_sf.default_max_partition_size = BLOCK_128X128;
+  sf->part_sf.default_min_partition_size = BLOCK_8X8;
+  sf->part_sf.use_best_rd_for_pruning = 1;
+  sf->part_sf.early_term_after_none_split = 1;
+  sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+  sf->part_sf.max_intra_bsize = BLOCK_16X16;
+  sf->part_sf.partition_search_breakout_rate_thr = 500;
+  sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+  sf->part_sf.adjust_var_based_rd_partitioning = 2;
 
   sf->mv_sf.full_pixel_search_level = 1;
   sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+  sf->mv_sf.auto_mv_step_size = 1;
+  sf->mv_sf.subpel_iters_per_step = 1;
+  sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+  sf->mv_sf.search_method = FAST_DIAMOND;
+  sf->mv_sf.subpel_force_stop = EIGHTH_PEL;
+  sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
+  for (int i = 0; i < TX_SIZES; ++i) {
+    sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
+    sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+  }
+  sf->intra_sf.skip_intra_in_interframe = 5;
+  sf->intra_sf.disable_smooth_intra = 1;
+  sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
 
-  sf->rt_sf.check_intra_pred_nonrd = 1;
-  sf->rt_sf.estimate_motion_for_var_based_partition = 1;
-  sf->rt_sf.hybrid_intra_pickmode = 0;
-  sf->rt_sf.nonrd_prune_ref_frame_search = 0;
-  sf->rt_sf.reuse_inter_pred_nonrd = 0;
-  sf->rt_sf.use_comp_ref_nonrd = 1;
-  sf->rt_sf.use_nonrd_filter_search = 1;
-  sf->rt_sf.use_nonrd_pick_mode = 0;
-  sf->rt_sf.use_real_time_ref_set = 0;
-  sf->tx_sf.adaptive_txb_search_level = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
-  sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+  sf->tx_sf.adaptive_txb_search_level = 2;
+  sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+  sf->tx_sf.tx_size_search_lgr_block = 1;
+  sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+  sf->tx_sf.tx_type_search.skip_tx_search = 1;
+  sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+  sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+  sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+  sf->tx_sf.refine_fast_tx_search_results = 0;
+  sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+  sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+  sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+
+  sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+  sf->rd_sf.simple_model_rd_from_var = 1;
+  sf->rd_sf.tx_domain_dist_level = 2;
+  sf->rd_sf.tx_domain_dist_thres_level = 2;
+
+  sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+  sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+  sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 2;
+  sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+  sf->winner_mode_sf.tx_size_search_level = 1;
+  sf->winner_mode_sf.winner_mode_ifs = 1;
 
-  if (speed >= 1) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_1;
-    sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
-
-    sf->part_sf.prune_ext_partition_types_search_level = 2;
-    sf->part_sf.simple_motion_search_prune_rect = 1;
-
-    sf->mv_sf.obmc_full_pixel_search_level = 1;
-    sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
-
-    sf->inter_sf.prune_comp_search_by_single_result = 1;
-    sf->inter_sf.reuse_inter_intra_mode = 1;
-    sf->inter_sf.selective_ref_frame = 2;
-    sf->inter_sf.skip_repeated_newmv = 1;
-    sf->inter_sf.disable_wedge_search_var_thresh = 0;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-    sf->inter_sf.prune_comp_type_by_comp_avg = 1;
-    sf->inter_sf.prune_motion_mode_level = 2;
-    sf->inter_sf.prune_single_motion_modes_by_simple_trans = 1;
-
-    sf->interp_sf.cb_pred_filter_search = 1;
-    sf->interp_sf.use_interp_filter = 1;
-
-    sf->tx_sf.adaptive_txb_search_level = 2;
-    sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
-    sf->tx_sf.tx_size_search_lgr_block = 1;
-    sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
-    sf->tx_sf.tx_type_search.skip_tx_search = 1;
-    sf->tx_sf.use_intra_txb_hash = 1;
-
-    sf->rd_sf.optimize_b_precheck = 1;
-    sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1;
-    sf->rd_sf.tx_domain_dist_thres_level = 1;
-
-    sf->lpf_sf.dual_sgr_penalty_level = 1;
-  }
-
-  if (speed >= 2) {
-    sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2;
-
-    sf->part_sf.allow_partition_search_skip = 1;
-    sf->part_sf.partition_search_breakout_rate_thr = 80;
-
-    sf->mv_sf.auto_mv_step_size = 1;
-    sf->mv_sf.subpel_iters_per_step = 1;
-
-    sf->inter_sf.adaptive_rd_thresh = 1;
-    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_wedge_search_edge_thresh = 0;
-    sf->inter_sf.disable_wedge_search_var_thresh = 100;
-    sf->inter_sf.fast_wedge_sign_estimate = 1;
-    sf->inter_sf.prune_comp_type_by_comp_avg = 2;
-    sf->inter_sf.selective_ref_frame = 3;
-    sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
-
-    sf->interp_sf.adaptive_interp_filter_search = 1;
-    sf->interp_sf.cb_pred_filter_search = 0;
-    sf->interp_sf.disable_dual_filter = 1;
-    sf->interp_sf.disable_filter_search_var_thresh = 100;
-
-    sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
-    sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
-    sf->tx_sf.model_based_prune_tx_search_level = 0;
-
-    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
-  }
-
-  if (speed >= 3) {
-    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
-
-    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
-
-    sf->part_sf.less_rectangular_check_level = 2;
-
-    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
-    // adaptive_motion_search breaks encoder multi-thread tests.
-    // The values in x->pred_mv[] differ for single and multi-thread cases.
-    // See aomedia:1778.
-    // sf->mv_sf.adaptive_motion_search = 1;
-
-    sf->inter_sf.adaptive_rd_thresh = 2;
-    sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
-    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
-    // it with cpi->sf.disable_wedge_search_var_thresh.
-    sf->inter_sf.disable_wedge_interintra_search = 1;
-    sf->inter_sf.prune_comp_search_by_single_result = 2;
-    sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
-    sf->inter_sf.prune_warp_using_wmtype = 1;
-    sf->inter_sf.selective_ref_frame = 4;
-
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST;
-
-    sf->rd_sf.tx_domain_dist_level = 1;
-
-    sf->winner_mode_sf.tx_size_search_level = boosted ? 0 : 2;
-  }
-
-  if (speed >= 4) {
-    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
-
-    sf->inter_sf.adaptive_mode_search = 1;
-    sf->inter_sf.alt_ref_search_fp = 1;
-
-    sf->interp_sf.skip_sharp_interp_filter_search = 1;
-
-    sf->tx_sf.tx_type_search.fast_inter_tx_type_search = 1;
-    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_sf.use_intra_txb_hash = 0;
-
-    sf->rd_sf.use_mb_rd_hash = 0;
-
-    sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 0 : 2;
+  sf->rt_sf.check_intra_pred_nonrd = 1;
+  sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+  sf->rt_sf.hybrid_intra_pickmode = 1;
+  sf->rt_sf.use_comp_ref_nonrd = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[0] = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[1] = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[2] = 0;
+  sf->rt_sf.use_nonrd_filter_search = 1;
+  sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+  sf->rt_sf.num_inter_modes_for_tx_search = 5;
+  sf->rt_sf.prune_inter_modes_using_temp_var = 1;
+  sf->rt_sf.use_real_time_ref_set = 1;
+  sf->rt_sf.use_simple_rd_model = 1;
+  sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1;
+  // TODO(any): This sf could be removed.
+  sf->rt_sf.short_circuit_low_temp_var = 1;
+  sf->rt_sf.check_scene_detection = 1;
+  if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0;
+  if (cm->current_frame.frame_type != KEY_FRAME &&
+      cpi->oxcf.rc_cfg.mode == AOM_CBR)
+    sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ;
+  // Enable noise estimation only for high resolutions for now.
+  //
+  // Since use_temporal_noise_estimate has no effect for all-intra frame
+  // encoding, it is disabled for this case.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480)
+    sf->rt_sf.use_temporal_noise_estimate = 1;
+  sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+  sf->rt_sf.skip_newmv_mode_based_on_sse = 1;
+  sf->rt_sf.mode_search_skip_flags =
+      (cm->current_frame.frame_type == KEY_FRAME)
+          ? 0
+          : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+                FLAG_EARLY_TERMINATE;
+  sf->rt_sf.var_part_split_threshold_shift = 5;
+  if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1;
+
+  // For SVC: use better mv search on base temporal layers, and only
+  // on base spatial layer if highest resolution is above 640x360.
+  if (cpi->svc.number_temporal_layers > 1 &&
+      cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+      (cpi->svc.spatial_layer_id == 0 ||
+       cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+           640 * 360)) {
+    sf->mv_sf.search_method = NSTEP;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+    sf->rt_sf.fullpel_search_step_param = 6;
   }
 
-  if (speed >= 5) {
-    sf->hl_sf.recode_loop = ALLOW_RECODE_KFMAXBW;
-
-    sf->inter_sf.adaptive_rd_thresh = 4;
-    sf->interp_sf.disable_filter_search_var_thresh = 200;
-
-    sf->rd_sf.use_fast_coef_costing = 1;
-    sf->rd_sf.tx_domain_dist_level = 2;
-    sf->rd_sf.tx_domain_dist_thres_level = 2;
-    sf->winner_mode_sf.tx_size_search_level = 1;
-
-    sf->rt_sf.mode_search_skip_flags =
-        (cm->current_frame.frame_type == KEY_FRAME)
-            ? 0
-            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
-                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
-                  FLAG_EARLY_TERMINATE;
-    sf->hl_sf.frame_parameter_update = 0;
-
-    sf->part_sf.default_max_partition_size = BLOCK_128X128;
-    sf->part_sf.default_min_partition_size = BLOCK_8X8;
-    sf->part_sf.max_intra_bsize = BLOCK_32X32;
-    sf->part_sf.partition_search_breakout_rate_thr = 500;
-    sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
-    sf->part_sf.adjust_var_based_rd_partitioning = 2;
-
-    sf->mv_sf.search_method = FAST_DIAMOND;
-    sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+  if (speed >= 6) {
     sf->mv_sf.use_fullpel_costlist = 1;
-    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-
-    sf->inter_sf.adaptive_mode_search = 2;
-    sf->inter_sf.inter_mode_rd_model_estimation = 2;
-
-    for (int i = 0; i < TX_SIZES; ++i) {
-      sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
-      sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
-    }
 
-    sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
-    sf->tx_sf.use_inter_txb_hash = 0;
-    sf->tx_sf.refine_fast_tx_search_results = 0;
+    sf->rd_sf.tx_domain_dist_thres_level = 3;
 
-    sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
-    sf->rd_sf.simple_model_rd_from_var = 1;
+    sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
+    sf->inter_sf.limit_inter_mode_cands = 4;
+    sf->inter_sf.prune_warped_prob_thresh = 8;
+    sf->inter_sf.extra_prune_warped = 1;
 
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
-    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+    sf->rt_sf.gf_refresh_based_on_qp = 1;
+    sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+    sf->rt_sf.var_part_split_threshold_shift = 7;
+    if (!frame_is_intra_only(&cpi->common))
+      sf->rt_sf.var_part_based_on_qidx = 2;
 
-    sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
-    sf->rt_sf.num_inter_modes_for_tx_search = 5;
-    sf->rt_sf.skip_interp_filter_search = 1;
-    sf->rt_sf.use_comp_ref_nonrd = 0;
-    sf->rt_sf.use_real_time_ref_set = 1;
-    sf->rt_sf.use_simple_rd_model = 1;
-  }
-
-  if (speed >= 6) {
-    sf->part_sf.adjust_var_based_rd_partitioning = 1;
+    sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3;
   }
 
   if (speed >= 7) {
-    sf->hl_sf.frame_parameter_update = 0;
-
-    sf->part_sf.default_max_partition_size = BLOCK_128X128;
-    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1;
+    sf->rt_sf.use_comp_ref_nonrd = 1;
+    sf->rt_sf.ref_frame_comp_nonrd[2] = 1;  // LAST_ALTREF
+    sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
     sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+    sf->part_sf.max_intra_bsize = BLOCK_32X32;
 
     sf->mv_sf.search_method = FAST_DIAMOND;
     sf->mv_sf.subpel_force_stop = QUARTER_PEL;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
 
     sf->inter_sf.inter_mode_rd_model_estimation = 2;
+    // This sf is not applicable in non-rd path.
+    sf->inter_sf.skip_newmv_in_drl = 0;
+
+    // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
+    // good. May need more study.
+    for (int i = 0; i < TX_SIZES; ++i) {
+      sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
+    }
 
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->rt_sf.nonrd_prune_ref_frame_search = 1;
-    sf->rt_sf.reuse_inter_pred_nonrd = 0;
+    // This is for rd path only.
+    sf->rt_sf.prune_inter_modes_using_temp_var = 0;
+    sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+    sf->rt_sf.prune_intra_mode_based_on_mv_range = 0;
+#if !CONFIG_REALTIME_ONLY
+    sf->rt_sf.reuse_inter_pred_nonrd =
+        (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0);
+#else
+    sf->rt_sf.reuse_inter_pred_nonrd = 1;
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+    sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
+#endif
     sf->rt_sf.short_circuit_low_temp_var = 0;
     sf->rt_sf.skip_interp_filter_search = 0;
-    sf->rt_sf.use_comp_ref_nonrd = 0;
-    sf->rt_sf.use_nonrd_altref_frame = 1;
+    // For spatial layers, only LAST and GOLDEN are currently used in the SVC
+    // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
+    // get_ref_frame_flags() for some patterns, so disable it here for
+    // spatial layers.
+    sf->rt_sf.use_nonrd_altref_frame =
+        (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
     sf->rt_sf.use_nonrd_pick_mode = 1;
-    sf->rt_sf.nonrd_check_partition_merge_mode = 1;
-    sf->rt_sf.nonrd_check_partition_split = 0;
-    sf->rt_sf.hybrid_intra_pickmode = 1;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+    sf->rt_sf.skip_intra_pred = 1;
+    sf->rt_sf.source_metrics_sb_nonrd = 1;
+    // For SVC: use better mv search on base temporal layers, and only
+    // on base spatial layer if highest resolution is above 640x360.
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+             640 * 360)) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.fullpel_search_step_param = 6;
+    } else if (cpi->svc.non_reference_frame) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+      sf->rt_sf.fullpel_search_step_param = 10;
+    }
+    // Set mask for intra modes.
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      if (i >= BLOCK_32X32)
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+      else
+        // Use DC, H, V intra mode for block sizes < 32X32.
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+
+    sf->winner_mode_sf.dc_blk_pred_level = 0;
+    sf->rt_sf.var_part_based_on_qidx = 3;
   }
 
   if (speed >= 8) {
-    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2;
+    sf->intra_sf.intra_pruning_with_hog = 1;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 1;
     sf->rt_sf.short_circuit_low_temp_var = 1;
-    sf->rt_sf.reuse_inter_pred_nonrd = 1;
     sf->rt_sf.use_nonrd_altref_frame = 0;
     sf->rt_sf.nonrd_prune_ref_frame_search = 2;
     sf->rt_sf.nonrd_check_partition_merge_mode = 0;
-    sf->rt_sf.nonrd_check_partition_split = 0;
-    sf->rt_sf.use_modeled_non_rd_cost = 1;
-    sf->rt_sf.source_metrics_sb_nonrd = 1;
+    sf->rt_sf.var_part_split_threshold_shift = 8;
     sf->interp_sf.cb_pred_filter_search = 1;
+    sf->rt_sf.var_part_based_on_qidx = 4;
+    sf->rt_sf.partition_direct_merging = 1;
+  }
+  if (speed >= 9) {
+    sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.force_large_partition_blocks = 1;
+    sf->rt_sf.skip_intra_pred = 2;
+    sf->rt_sf.var_part_split_threshold_shift = 9;
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+    sf->rt_sf.var_part_based_on_qidx = 0;
+  }
+  if (speed >= 10) {
+    sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
+    sf->rt_sf.nonrd_agressive_skip = 1;
+    sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+    sf->rt_sf.var_part_split_threshold_shift = 10;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+    sf->rt_sf.force_half_pel_block = 1;
+    sf->rt_sf.reduce_zeromv_mvres = true;
   }
 }
 
@@ -881,28 +1658,41 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
   // best quality defaults
   hl_sf->frame_parameter_update = 1;
   hl_sf->recode_loop = ALLOW_RECODE;
-  hl_sf->disable_overlay_frames = 0;
-  hl_sf->adaptive_overlay_encoding = 1;
   // Recode loop tolerance %.
   hl_sf->recode_tolerance = 25;
   hl_sf->high_precision_mv_usage = CURRENT_Q;
+  hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL;
+  hl_sf->disable_extra_sc_testing = 0;
   hl_sf->second_alt_ref_filtering = 1;
+  hl_sf->num_frames_used_in_tf = INT_MAX;
+}
+
+static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
+  fp_sf->reduce_mv_step_param = 3;
+  fp_sf->skip_motion_search_threshold = 0;
+  fp_sf->disable_recon = 0;
+  fp_sf->skip_zeromv_motion_search = 0;
 }
 
 static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+  tpl_sf->gop_length_decision_method = 0;
   tpl_sf->prune_intra_modes = 0;
+  tpl_sf->prune_starting_mv = 0;
   tpl_sf->reduce_first_step_size = 0;
   tpl_sf->skip_alike_starting_mv = 0;
   tpl_sf->subpel_force_stop = EIGHTH_PEL;
+  tpl_sf->search_method = NSTEP;
+  tpl_sf->disable_filtered_key_tpl = 0;
+  tpl_sf->prune_ref_frames_in_tpl = 0;
+  tpl_sf->allow_compound_pred = 1;
+  tpl_sf->use_y_only_rate_distortion = 0;
 }
 
 static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
-  gm_sf->gm_erroradv_type = GM_ERRORADV_TR_0;
-  gm_sf->disable_adaptive_warp_error_thresh = 1;
-  gm_sf->selective_ref_gm = 1;
   gm_sf->gm_search_type = GM_FULL_SEARCH;
-  gm_sf->gm_disable_recode = 0;
   gm_sf->prune_ref_frame_for_gm_search = 0;
+  gm_sf->prune_zero_mv_with_sse = 0;
+  gm_sf->disable_gm_search_based_on_stats = 0;
 }
 
 static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
@@ -910,44 +1700,50 @@ static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
   part_sf->less_rectangular_check_level = 0;
   part_sf->use_square_partition_only_threshold = BLOCK_128X128;
   part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
-  part_sf->auto_min_partition_based_on_simple_motion = 0;
   part_sf->default_max_partition_size = BLOCK_LARGEST;
   part_sf->default_min_partition_size = BLOCK_4X4;
   part_sf->adjust_var_based_rd_partitioning = 0;
-  part_sf->allow_partition_search_skip = 0;
   part_sf->max_intra_bsize = BLOCK_LARGEST;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
-  part_sf->always_this_block_size = BLOCK_16X16;
+  part_sf->fixed_partition_size = BLOCK_16X16;
   // Recode loop tolerance %.
   part_sf->partition_search_breakout_dist_thr = 0;
   part_sf->partition_search_breakout_rate_thr = 0;
   part_sf->prune_ext_partition_types_search_level = 0;
-  part_sf->ml_prune_rect_partition = 0;
-  part_sf->ml_prune_ab_partition = 0;
-  part_sf->ml_prune_4_partition = 0;
+  part_sf->prune_part4_search = 0;
+  part_sf->ml_prune_partition = 0;
   part_sf->ml_early_term_after_part_split_level = 0;
   for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
     part_sf->ml_partition_search_breakout_thresh[i] =
         -1;  // -1 means not enabled.
   }
-  part_sf->simple_motion_search_prune_agg = 0;
+  part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
   part_sf->simple_motion_search_split = 0;
   part_sf->simple_motion_search_prune_rect = 0;
   part_sf->simple_motion_search_early_term_none = 0;
-  part_sf->intra_cnn_split = 0;
+  part_sf->simple_motion_search_reduce_search_steps = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
-  part_sf->prune_4_partition_using_split_info = 0;
-  part_sf->prune_ab_partition_using_split_info = 0;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->prune_ext_part_using_split_info = 0;
+  part_sf->prune_rectangular_split_based_on_qidx = 0;
+  part_sf->early_term_after_none_split = 0;
+  part_sf->ml_predict_breakout_level = 0;
+  part_sf->prune_sub_8x8_partition_level = 0;
+  part_sf->simple_motion_search_rect_split = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
+  part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
   mv_sf->full_pixel_search_level = 0;
-  mv_sf->adaptive_motion_search = 0;
   mv_sf->auto_mv_step_size = 0;
   mv_sf->exhaustive_searches_thresh = 0;
   mv_sf->obmc_full_pixel_search_level = 0;
-  mv_sf->prune_mesh_search = 0;
+  mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED;
   mv_sf->reduce_search_range = 0;
   mv_sf->search_method = NSTEP;
   mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL;
@@ -955,77 +1751,93 @@ static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
   mv_sf->subpel_iters_per_step = 2;
   mv_sf->subpel_search_method = SUBPEL_TREE;
   mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+  mv_sf->use_bsize_dependent_search_method = 0;
   mv_sf->use_fullpel_costlist = 0;
+  mv_sf->use_downsampled_sad = 0;
+  mv_sf->disable_extensive_joint_motion_search = 0;
+  mv_sf->disable_second_mv = 0;
+  mv_sf->skip_fullpel_search_using_startmv = 0;
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
-  inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   inter_sf->adaptive_rd_thresh = 0;
   inter_sf->model_based_post_interp_filter_breakout = 0;
   inter_sf->reduce_inter_modes = 0;
-  inter_sf->adaptive_mode_search = 0;
   inter_sf->alt_ref_search_fp = 0;
+  inter_sf->prune_comp_ref_frames = 0;
   inter_sf->selective_ref_frame = 0;
   inter_sf->prune_ref_frame_for_rect_partitions = 0;
-  inter_sf->disable_wedge_search_edge_thresh = 0;
-  inter_sf->disable_wedge_search_var_thresh = 0;
   inter_sf->fast_wedge_sign_estimate = 0;
-  inter_sf->prune_wedge_pred_diff_based = 0;
   inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
   inter_sf->reuse_inter_intra_mode = 0;
-  inter_sf->disable_sb_level_coeff_cost_upd = 0;
-  inter_sf->disable_sb_level_mv_cost_upd = 0;
+  inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB;
+  inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB;
+  inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB;
   inter_sf->prune_inter_modes_based_on_tpl = 0;
+  inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF;
   inter_sf->prune_comp_search_by_single_result = 0;
   inter_sf->skip_repeated_ref_mv = 0;
-  inter_sf->skip_repeated_newmv = 0;
-  inter_sf->skip_repeated_full_newmv = 0;
-  inter_sf->prune_single_motion_modes_by_simple_trans = 0;
+  inter_sf->skip_newmv_in_drl = 0;
   inter_sf->inter_mode_rd_model_estimation = 0;
   inter_sf->prune_compound_using_single_ref = 0;
-  inter_sf->prune_compound_using_neighbors = 0;
+  inter_sf->prune_ext_comp_using_neighbors = 0;
+  inter_sf->skip_ext_comp_nearmv_mode = 0;
+  inter_sf->prune_comp_using_best_single_mode_ref = 0;
+  inter_sf->prune_nearest_near_mv_using_refmv_weight = 0;
   inter_sf->disable_onesided_comp = 0;
   inter_sf->prune_mode_search_simple_translation = 0;
   inter_sf->prune_comp_type_by_comp_avg = 0;
   inter_sf->disable_interinter_wedge_newmv_search = 0;
-  inter_sf->enable_interinter_diffwtd_newmv_search = 0;
-  inter_sf->disable_smooth_interintra = 0;
-  inter_sf->prune_motion_mode_level = 0;
-  inter_sf->prune_warp_using_wmtype = 0;
-  inter_sf->disable_wedge_interintra_search = 0;
   inter_sf->fast_interintra_wedge_search = 0;
   inter_sf->prune_comp_type_by_model_rd = 0;
   inter_sf->perform_best_rd_based_gating_for_chroma = 0;
   inter_sf->prune_obmc_prob_thresh = 0;
-  inter_sf->disable_obmc = 0;
-  inter_sf->disable_interinter_wedge = 0;
+  inter_sf->disable_interinter_wedge_var_thresh = 0;
+  inter_sf->disable_interintra_wedge_var_thresh = 0;
   inter_sf->prune_ref_mv_idx_search = 0;
   inter_sf->prune_warped_prob_thresh = 0;
   inter_sf->reuse_compound_type_decision = 0;
   inter_sf->txfm_rd_gate_level = 0;
   inter_sf->prune_inter_modes_if_skippable = 0;
+  inter_sf->disable_masked_comp = 0;
+  inter_sf->enable_fast_compound_mode_search = 0;
+  inter_sf->reuse_mask_search_results = 0;
+  inter_sf->enable_fast_wedge_mask_search = 0;
+  inter_sf->inter_mode_txfm_breakout = 0;
+  inter_sf->limit_inter_mode_cands = 0;
+  inter_sf->limit_txfm_eval_per_mode = 0;
+  inter_sf->skip_arf_compound = 0;
 }
 
 static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
-  interp_sf->disable_filter_search_var_thresh = 0;
   interp_sf->adaptive_interp_filter_search = 0;
-  interp_sf->use_fast_interpolation_filter_search = 0;
+  interp_sf->cb_pred_filter_search = 0;
   interp_sf->disable_dual_filter = 0;
-  interp_sf->use_interp_filter = 0;
   interp_sf->skip_sharp_interp_filter_search = 0;
+  interp_sf->use_fast_interpolation_filter_search = 0;
+  interp_sf->use_interp_filter = 0;
 }
 
 static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+  intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB;
   intra_sf->skip_intra_in_interframe = 1;
   intra_sf->intra_pruning_with_hog = 0;
-  intra_sf->src_var_thresh_intra_skip = 1;
+  intra_sf->chroma_intra_pruning_with_hog = 0;
   intra_sf->prune_palette_search_level = 0;
+  intra_sf->prune_luma_palette_size_search_level = 0;
 
   for (int i = 0; i < TX_SIZES; i++) {
     intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
     intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
   }
   intra_sf->disable_smooth_intra = 0;
+  intra_sf->prune_filter_intra_level = 0;
+  intra_sf->prune_chroma_modes_using_luma_winner = 0;
+  intra_sf->cfl_search_range = 3;
+  intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT;
+  intra_sf->adapt_top_model_rd_count_using_neighbors = 0;
+  intra_sf->early_term_chroma_palette_size_search = 0;
+  intra_sf->skip_filter_intra_in_inter_frames = 0;
 }
 
 static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
@@ -1035,53 +1847,48 @@ static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
   tx_sf->intra_tx_size_search_init_depth_sqr = 0;
   tx_sf->tx_size_search_lgr_block = 0;
   tx_sf->model_based_prune_tx_search_level = 0;
-  tx_sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+  tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1;
   tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
   tx_sf->tx_type_search.use_skip_flag_prediction = 1;
   tx_sf->tx_type_search.use_reduced_intra_txset = 0;
   tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
-  tx_sf->tx_type_search.fast_inter_tx_type_search = 0;
+  tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX;
   tx_sf->tx_type_search.skip_tx_search = 0;
   tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
   tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
-  tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0;
+  tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
   tx_sf->adaptive_txb_search_level = 0;
-  tx_sf->use_intra_txb_hash = 0;
-  tx_sf->use_inter_txb_hash = 1;
   tx_sf->refine_fast_tx_search_results = 1;
+  tx_sf->prune_tx_size_level = 0;
 }
 
 static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
-                                  const AV1_COMP *cpi) {
-  if (cpi->oxcf.disable_trellis_quant == 3) {
-    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                  const AV1EncoderConfig *oxcf) {
+  const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant;
+  if (disable_trellis_quant == 3) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
                                        ? NO_ESTIMATE_YRD_TRELLIS_OPT
                                        : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 2) {
-    rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+  } else if (disable_trellis_quant == 2) {
+    rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
                                        ? FINAL_PASS_TRELLIS_OPT
                                        : NO_TRELLIS_OPT;
-  } else if (cpi->oxcf.disable_trellis_quant == 0) {
-    if (is_lossless_requested(&cpi->oxcf)) {
+  } else if (disable_trellis_quant == 0) {
+    if (is_lossless_requested(&oxcf->rc_cfg)) {
       rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
     } else {
       rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
     }
-  } else if (cpi->oxcf.disable_trellis_quant == 1) {
+  } else if (disable_trellis_quant == 1) {
     rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
   } else {
     assert(0 && "Invalid disable_trellis_quant value");
   }
-  // TODO(sarahparker) Pair this with a speed setting once experiments are done
-  rd_sf->trellis_eob_fast = 0;
-  rd_sf->use_mb_rd_hash = 1;
-  rd_sf->optimize_b_precheck = 0;
-  rd_sf->use_fast_coef_costing = 0;
+  rd_sf->use_mb_rd_hash = 0;
   rd_sf->simple_model_rd_from_var = 0;
   rd_sf->tx_domain_dist_level = 0;
   rd_sf->tx_domain_dist_thres_level = 0;
-  rd_sf->use_hash_based_trellis = 0;
   rd_sf->perform_coeff_opt = 0;
 }
 
@@ -1089,71 +1896,135 @@ static AOM_INLINE void init_winner_mode_sf(
     WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
   winner_mode_sf->motion_mode_for_winner_cand = 0;
   // Set this at the appropriate speed levels
-  winner_mode_sf->tx_size_search_level = USE_FULL_RD;
+  winner_mode_sf->tx_size_search_level = 0;
   winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
   winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
   winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
-  winner_mode_sf->enable_multiwinner_mode_process = 0;
+  winner_mode_sf->multi_winner_mode_type = 0;
+  winner_mode_sf->dc_blk_pred_level = 0;
+  winner_mode_sf->winner_mode_ifs = 0;
+  winner_mode_sf->prune_winner_mode_eval_level = 0;
 }
 
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
   lpf_sf->disable_loop_restoration_chroma = 0;
+  lpf_sf->disable_loop_restoration_luma = 0;
   lpf_sf->prune_wiener_based_on_src_var = 0;
   lpf_sf->prune_sgr_based_on_wiener = 0;
   lpf_sf->enable_sgr_ep_pruning = 0;
   lpf_sf->reduce_wiener_window_size = 0;
   lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  lpf_sf->use_coarse_filter_level_search = 0;
   lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
   // Set decoder side speed feature to use less dual sgr modes
   lpf_sf->dual_sgr_penalty_level = 0;
   lpf_sf->disable_lr_filter = 0;
+  lpf_sf->use_downsampled_wiener_stats = 0;
 }
 
 static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
+  rt_sf->check_intra_pred_nonrd = 0;
+  rt_sf->skip_intra_pred = 0;
+  rt_sf->estimate_motion_for_var_based_partition = 0;
+  rt_sf->nonrd_check_partition_merge_mode = 0;
+  rt_sf->nonrd_check_partition_split = 0;
   rt_sf->mode_search_skip_flags = 0;
-  rt_sf->skip_interp_filter_search = 0;
-  rt_sf->force_tx_search_off = 0;
+  rt_sf->nonrd_prune_ref_frame_search = 0;
+  rt_sf->use_nonrd_pick_mode = 0;
+  rt_sf->use_nonrd_altref_frame = 0;
+  rt_sf->use_comp_ref_nonrd = 0;
+  rt_sf->use_real_time_ref_set = 0;
+  rt_sf->short_circuit_low_temp_var = 0;
+  rt_sf->use_modeled_non_rd_cost = 0;
+  rt_sf->reuse_inter_pred_nonrd = 0;
   rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+  rt_sf->use_nonrd_filter_search = 0;
   rt_sf->use_simple_rd_model = 0;
-  rt_sf->nonrd_check_partition_merge_mode = 0;
-  rt_sf->nonrd_check_partition_split = 0;
+  rt_sf->skip_interp_filter_search = 0;
+  rt_sf->hybrid_intra_pickmode = 0;
+  rt_sf->source_metrics_sb_nonrd = 0;
+  rt_sf->overshoot_detection_cbr = NO_DETECTION;
+  rt_sf->check_scene_detection = 0;
+  rt_sf->force_large_partition_blocks = 0;
+  rt_sf->use_temporal_noise_estimate = 0;
+  rt_sf->fullpel_search_step_param = 0;
+  for (int i = 0; i < BLOCK_SIZES; ++i)
+    rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
+  rt_sf->nonrd_agressive_skip = 0;
+  rt_sf->skip_cdef_sb = 0;
+  rt_sf->force_large_partition_blocks_intra = 0;
+  rt_sf->skip_tx_no_split_var_based_partition = 0;
+  rt_sf->skip_newmv_mode_based_on_sse = 0;
+  rt_sf->gf_length_lvl = 0;
+  rt_sf->prune_inter_modes_with_golden_ref = 0;
+  rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+  rt_sf->prune_inter_modes_using_temp_var = 0;
+  rt_sf->force_half_pel_block = 0;
+  rt_sf->prune_intra_mode_based_on_mv_range = 0;
+  rt_sf->var_part_split_threshold_shift = 7;
+  rt_sf->gf_refresh_based_on_qp = 0;
+  rt_sf->use_rtc_tf = 0;
+  rt_sf->prune_idtx_nonrd = 0;
+  rt_sf->part_early_exit_zeromv = 0;
+  rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
+  rt_sf->skip_lf_screen = 0;
+  rt_sf->sad_based_adp_altref_lag = 0;
+  rt_sf->partition_direct_merging = 0;
+  rt_sf->var_part_based_on_qidx = 0;
+  rt_sf->sad_based_comp_prune = 0;
+  rt_sf->tx_size_level_based_on_qstep = 0;
+  rt_sf->reduce_zeromv_mvres = false;
+  rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
-  if (oxcf->mode == GOOD) {
-    set_good_speed_feature_framesize_dependent(cpi, sf, speed);
-  } else if (oxcf->mode == REALTIME) {
-    set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+  switch (oxcf->mode) {
+    case GOOD:
+      set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+      break;
+    case ALLINTRA:
+      set_allintra_speed_feature_framesize_dependent(cpi, sf, speed);
+      break;
+    case REALTIME:
+      set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+      break;
+  }
+
+  if (!cpi->ppi->seq_params_locked) {
+    cpi->common.seq_params->enable_masked_compound &=
+        !sf->inter_sf.disable_masked_comp;
+    cpi->common.seq_params->enable_interintra_compound &=
+        (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
   }
 
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test == 1)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
     cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv;
-  else if (cpi->oxcf.motion_vector_unit_test == 2)
+  else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
     cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 
-  MACROBLOCK *const x = &cpi->td.mb;
-  AV1_COMMON *const cm = &cpi->common;
-  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
-                                 dim_to_size(cpi->oxcf.min_partition_size));
-  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
-                                 dim_to_size(cpi->oxcf.max_partition_size));
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
+  // For multi-thread use case with row_mt enabled, cost update for a set of
+  // SB rows is not desirable. Hence, the sf mv_cost_upd_level is set to
+  // INTERNAL_COST_UPD_SBROW in such cases.
+  if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) {
+    if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+      // Set mv_cost_upd_level to use row level update.
+      sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+    }
+  }
 }
 
 void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
-  AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  MACROBLOCK *const x = &cpi->td.mb;
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
 
   init_hl_sf(&sf->hl_sf);
+  init_fp_sf(&sf->fp_sf);
   init_tpl_sf(&sf->tpl_sf);
   init_gm_sf(&sf->gm_sf);
   init_part_sf(&sf->part_sf);
@@ -1162,20 +2033,36 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   init_interp_sf(&sf->interp_sf);
   init_intra_sf(&sf->intra_sf);
   init_tx_sf(&sf->tx_sf);
-  init_rd_sf(&sf->rd_sf, cpi);
+  init_rd_sf(&sf->rd_sf, oxcf);
   init_winner_mode_sf(&sf->winner_mode_sf);
   init_lpf_sf(&sf->lpf_sf);
   init_rt_sf(&sf->rt_sf);
 
-  if (oxcf->mode == GOOD)
-    set_good_speed_features_framesize_independent(cpi, sf, speed);
-  else if (oxcf->mode == REALTIME)
-    set_rt_speed_features_framesize_independent(cpi, sf, speed);
+  switch (oxcf->mode) {
+    case GOOD:
+      set_good_speed_features_framesize_independent(cpi, sf, speed);
+      break;
+    case ALLINTRA:
+      set_allintra_speed_features_framesize_independent(cpi, sf, speed);
+      break;
+    case REALTIME:
+      set_rt_speed_features_framesize_independent(cpi, sf, speed);
+      break;
+  }
 
-  if (!cpi->seq_params_locked) {
-    cpi->common.seq_params.enable_dual_filter &=
+  if (!oxcf->txfm_cfg.enable_tx_size_search) {
+    sf->winner_mode_sf.tx_size_search_level = 3;
+  }
+
+  if (!cpi->ppi->seq_params_locked) {
+    cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &=
+        (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+    cpi->common.seq_params->enable_dual_filter &=
         !sf->interp_sf.disable_dual_filter;
-    cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+    cpi->common.seq_params->enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+
+    cpi->common.seq_params->enable_interintra_compound &=
+        (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
   }
 
   // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64
@@ -1208,8 +2095,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   if (is_stat_generation_stage(cpi))
     sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
 
-  // No recode or trellis for 1 pass.
-  if (oxcf->pass == 0) sf->hl_sf.recode_loop = DISALLOW_RECODE;
+  // No recode for 1 pass.
+  if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
+    sf->hl_sf.recode_loop = DISALLOW_RECODE;
 
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
   if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) {
@@ -1220,27 +2108,17 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
     mv_search_params->find_fractional_mv_step =
         av1_find_best_sub_pixel_tree_pruned_more;
-  } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
-    mv_search_params->find_fractional_mv_step =
-        av1_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-  x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size,
-                                 dim_to_size(cpi->oxcf.min_partition_size));
-  x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size,
-                                 dim_to_size(cpi->oxcf.max_partition_size));
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
-
   // This is only used in motion vector unit test.
-  if (cpi->oxcf.motion_vector_unit_test == 1)
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 1)
     mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
-  else if (cpi->oxcf.motion_vector_unit_test == 2)
+  else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
     mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 
   // assert ensures that tx_domain_dist_level is accessed correctly
   assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
-         cpi->sf.rd_sf.tx_domain_dist_thres_level < 3);
+         cpi->sf.rd_sf.tx_domain_dist_thres_level < 4);
   memcpy(winner_mode_params->tx_domain_dist_threshold,
          tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level],
          sizeof(winner_mode_params->tx_domain_dist_threshold));
@@ -1251,20 +2129,20 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
          tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level],
          sizeof(winner_mode_params->use_transform_domain_distortion));
 
-  // assert ensures that coeff_opt_dist_thresholds is accessed correctly
+  // assert ensures that coeff_opt_thresholds is accessed correctly
   assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
-         cpi->sf.rd_sf.perform_coeff_opt < 6);
-  memcpy(winner_mode_params->coeff_opt_dist_threshold,
-         coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
-         sizeof(winner_mode_params->coeff_opt_dist_threshold));
+         cpi->sf.rd_sf.perform_coeff_opt < 9);
+  memcpy(winner_mode_params->coeff_opt_thresholds,
+         &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
+         sizeof(winner_mode_params->coeff_opt_thresholds));
 
   // assert ensures that predict_skip_levels is accessed correctly
   assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
          cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
-  memcpy(winner_mode_params->predict_skip_level,
+  memcpy(winner_mode_params->skip_txfm_level,
          predict_skip_levels[cpi->sf.tx_sf.tx_type_search
                                  .use_skip_flag_prediction],
-         sizeof(winner_mode_params->predict_skip_level));
+         sizeof(winner_mode_params->skip_txfm_level));
 
   // assert ensures that tx_size_search_level is accessed correctly
   assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
@@ -1272,13 +2150,32 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
   memcpy(winner_mode_params->tx_size_search_methods,
          tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
          sizeof(winner_mode_params->tx_size_search_methods));
+  memcpy(winner_mode_params->predict_dc_level,
+         predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
+         sizeof(winner_mode_params->predict_dc_level));
 
-  if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
+  if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) {
     if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
       // Revert to type 2
       sf->inter_sf.inter_mode_rd_model_estimation = 2;
     }
+
+#if !CONFIG_FRAME_PARALLEL_ENCODE || \
+    (CONFIG_FRAME_PARALLEL_ENCODE && !CONFIG_FPMT_TEST)
+    // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve
+    // better parallelism when number of threads available are greater than or
+    // equal to maximum number of reference frames allowed for global motion.
+    if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
+        (cpi->mt_info.num_workers >=
+         gm_available_reference_frames[sf->gm_sf.gm_search_type]))
+      sf->gm_sf.prune_ref_frame_for_gm_search = 0;
+#endif
   }
+
+  // This only applies to the real time mode. Adaptive gf refresh is disabled if
+  // gf_cbr_boost_pct that is set by the user is larger than 0.
+  if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0)
+    sf->rt_sf.gf_refresh_based_on_qp = 0;
 }
 
 // Override some speed features based on qindex
@@ -1287,36 +2184,131 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const int boosted = frame_is_boosted(cpi);
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
-  if (is_720p_or_larger && cpi->oxcf.mode == GOOD && speed == 0) {
-    if (cm->quant_params.base_qindex <= 80) {
-      sf->rd_sf.perform_coeff_opt = 2;
-      memcpy(winner_mode_params->coeff_opt_dist_threshold,
-             coeff_opt_dist_thresholds[sf->rd_sf.perform_coeff_opt],
-             sizeof(winner_mode_params->coeff_opt_dist_threshold));
+  const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+  const int is_arf2_bwd_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+  if (cpi->oxcf.mode == REALTIME) {
+    if (speed >= 6) {
+      const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+      sf->part_sf.adjust_var_based_rd_partitioning =
+          frame_is_intra_only(cm)
+              ? 0
+              : cm->quant_params.base_qindex > qindex_thresh;
+    }
+    return;
+  }
+
+  if (speed == 0) {
+    // qindex_thresh for resolution < 720p
+    const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
+    if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
+      sf->part_sf.simple_motion_search_split =
+          cm->features.allow_screen_content_tools ? 1 : 2;
+      sf->part_sf.simple_motion_search_early_term_none = 1;
+      sf->tx_sf.model_based_prune_tx_search_level = 0;
+    }
+
+    if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) {
+      sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger;
+      memcpy(winner_mode_params->coeff_opt_thresholds,
+             &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt],
+             sizeof(winner_mode_params->coeff_opt_thresholds));
       sf->part_sf.simple_motion_search_split =
           cm->features.allow_screen_content_tools ? 1 : 2;
       sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
       sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
       sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+      sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+      if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
+        sf->inter_sf.selective_ref_frame = 2;
+        sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+        sf->rd_sf.tx_domain_dist_thres_level = 1;
+        sf->part_sf.simple_motion_search_early_term_none = 1;
+        sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+        sf->interp_sf.cb_pred_filter_search = 0;
+        sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+        sf->tx_sf.tx_type_search.skip_tx_search = 1;
+      }
     }
   }
 
-  if (cpi->oxcf.mode == GOOD && speed >= 3) {
+  if (speed >= 2) {
     // Disable extended partitions for lower quantizers
-    if (cm->quant_params.base_qindex <= 100 &&
-        !cm->features.allow_screen_content_tools && !boosted) {
+    const int aggr = AOMMIN(3, speed - 2);
+    const int qindex_thresh1[4] = { 50, 50, 80, 100 };
+    const int qindex_thresh2[4] = { 80, 100, 120, 160 };
+    int qindex_thresh;
+    int disable_ext_part;
+    if (aggr <= 1) {
+      const int qthresh2 =
+          (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
+      qindex_thresh = cm->features.allow_screen_content_tools
+                          ? qindex_thresh1[aggr]
+                          : qthresh2;
+      disable_ext_part = !boosted;
+    } else {
+      qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+      disable_ext_part = !frame_is_intra_only(cm);
+    }
+    if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) {
       sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
 
-  if (cpi->oxcf.mode == GOOD && speed >= 4) {
-    // Disable extended partitions for lower quantizers
-    const int qindex_thresh = boosted ? 80 : 120;
-    if (cm->quant_params.base_qindex <= qindex_thresh &&
-        !cm->features.allow_screen_content_tools &&
-        !frame_is_intra_only(&cpi->common)) {
-      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+  if (speed >= 4) {
+    // Disable rectangular partitions for lower quantizers
+    const int aggr = AOMMIN(1, speed - 4);
+    const int qindex_thresh[2] = { 65, 80 };
+    int disable_rect_part;
+    disable_rect_part = !boosted;
+    if (cm->quant_params.base_qindex <= qindex_thresh[aggr] &&
+        disable_rect_part && is_480p_or_larger) {
+      sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8;
+    }
+  }
+
+  if (speed <= 2) {
+    if (!is_stat_generation_stage(cpi)) {
+      // Use faster full-pel motion search for high quantizers.
+      // Also use reduced total search range for low resolutions at high
+      // quantizers.
+      const int aggr = speed;
+      const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0];
+      const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1];
+      const SEARCH_METHODS search_method =
+          motion_search_method[is_720p_or_larger];
+      if (cm->quant_params.base_qindex > qindex_thresh1) {
+        sf->mv_sf.search_method = search_method;
+        sf->tpl_sf.search_method = search_method;
+      } else if (cm->quant_params.base_qindex > qindex_thresh2) {
+        sf->mv_sf.search_method = NSTEP_8PT;
+      }
+    }
+  }
+
+  if (speed >= 4) {
+    // Disable LR search at low and high quantizers and enable only for
+    // mid-quantizer range.
+    if (!boosted && !is_arf2_bwd_type) {
+      const int qindex_low[2] = { 100, 60 };
+      const int qindex_high[2] = { 180, 160 };
+      if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] ||
+          cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) {
+        sf->lpf_sf.disable_loop_restoration_luma = 1;
+      }
+    }
+  }
+
+  if (speed == 1) {
+    // Reuse interinter wedge mask search from first search for non-boosted
+    // non-internal-arf frames, except at very high quantizers.
+    if (cm->quant_params.base_qindex <= 200) {
+      if (!boosted && !is_arf2_bwd_type)
+        sf->inter_sf.reuse_mask_search_results = 1;
     }
   }
 }
diff --git a/media/libaom/src/av1/encoder/speed_features.h b/media/libaom/src/av1/encoder/speed_features.h
index d12c3c02e1..97a5cf6918 100644
--- a/media/libaom/src/av1/encoder/speed_features.h
+++ b/media/libaom/src/av1/encoder/speed_features.h
@@ -13,11 +13,17 @@
 #define AOM_AV1_ENCODER_SPEED_FEATURES_H_
 
 #include "av1/common/enums.h"
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encodemb.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*! @file */
+
+/*!\cond */
 #define MAX_MESH_STEP 4
 
 typedef struct MESH_PATTERN {
@@ -33,18 +39,6 @@ enum {
 } UENUM1BYTE(GM_SEARCH_TYPE);
 
 enum {
-  GM_ERRORADV_TR_0,
-  GM_ERRORADV_TR_1,
-  GM_ERRORADV_TR_2,
-  GM_ERRORADV_TR_TYPES,
-} UENUM1BYTE(GM_ERRORADV_TYPE);
-
-enum {
-  FULL_TXFM_RD,
-  LOW_TXFM_RD,
-} UENUM1BYTE(TXFM_RD_MODEL);
-
-enum {
   DIST_WTD_COMP_ENABLED,
   DIST_WTD_COMP_SKIP_MV_SEARCH,
   DIST_WTD_COMP_DISABLED,
@@ -77,6 +71,8 @@ enum {
   INTRA_DC = (1 << DC_PRED),
   INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
   INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_H_V_SMOOTH =
+      (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED),
   INTRA_DC_PAETH_H_V =
       (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
 };
@@ -116,32 +112,31 @@ enum {
   RESERVE_3_SF = 128,
 } UENUM1BYTE(DEV_SPEED_FEATURES);
 
+/* This enumeration defines when the rate control recode loop will be
+ * enabled.
+ */
 enum {
-  // No recode.
+  /*
+   * No recodes allowed
+   */
   DISALLOW_RECODE = 0,
-  // Allow recode for KF and exceeding maximum frame bandwidth.
-  ALLOW_RECODE_KFMAXBW = 1,
-  // Allow recode only for KF/ARF/GF frames.
-  ALLOW_RECODE_KFARFGF = 2,
-  // Allow recode for all frames based on bitrate constraints.
-  ALLOW_RECODE = 3,
+  /*
+   * Allow recode only for KF/ARF/GF frames
+   */
+  ALLOW_RECODE_KFARFGF = 1,
+  /*
+   * Allow recode for all frame types based on bitrate constraints.
+   */
+  ALLOW_RECODE = 2,
 } UENUM1BYTE(RECODE_LOOP_TYPE);
 
 enum {
   SUBPEL_TREE = 0,
-  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
-  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
-  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
-  // Other methods to come
+  SUBPEL_TREE_PRUNED = 1,       // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,  // Prunes 1/2-pel searches more aggressively
 } UENUM1BYTE(SUBPEL_SEARCH_METHODS);
 
 enum {
-  USE_FULL_RD = 0,
-  USE_FAST_RD,
-  USE_LARGESTALL,
-} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
-
-enum {
   // Try the full image with different values.
   LPF_PICK_FROM_FULL_IMAGE,
   // Try the full image filter search with non-dual filter only.
@@ -153,15 +148,24 @@ enum {
   // Pick 0 to disable LPF if LPF was enabled last frame
   LPF_PICK_MINIMAL_LPF
 } UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
 
-enum {
-  CDEF_FULL_SEARCH,
-  CDEF_FAST_SEARCH_LVL1,  // Search among a subset of all possible filters.
-  CDEF_FAST_SEARCH_LVL2,  // Search reduced subset of filters than Level 1.
-  CDEF_PICK_FROM_Q,       // Estimate filter strength based on quantizer.
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+  CDEF_FULL_SEARCH,      /**< Full search */
+  CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+  CDEF_FAST_SEARCH_LVL2, /**< Search reduced subset of filters than Level 1. */
+  CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than
+                              Level 2. */
+  CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */
+  CDEF_FAST_SEARCH_LVL5, /**< Search reduced subset of filters than Level 4. */
+  CDEF_PICK_FROM_Q,      /**< Estimate filter strength based on quantizer. */
   CDEF_PICK_METHODS
-} UENUM1BYTE(CDEF_PICK_METHOD);
+} CDEF_PICK_METHOD;
 
+/*!\cond */
 enum {
   // Terminate search early based on distortion so far compared to
   // qp step, distortion in the neighborhood of the frame, etc.
@@ -182,23 +186,66 @@ enum {
 } UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
 
 enum {
-  NO_PRUNE = 0,
+  // No tx type pruning
+  TX_TYPE_PRUNE_0 = 0,
   // adaptively prunes the least perspective tx types out of all 16
   // (tuned to provide negligible quality loss)
-  PRUNE_2D_ACCURATE = 1,
+  TX_TYPE_PRUNE_1 = 1,
   // similar, but applies much more aggressive pruning to get better speed-up
-  PRUNE_2D_FAST = 2,
-  PRUNE_2D_MORE = 3,
+  TX_TYPE_PRUNE_2 = 2,
+  TX_TYPE_PRUNE_3 = 3,
   // More aggressive pruning based on tx type score and allowed tx count
-  PRUNE_2D_AGGRESSIVE = 4,
+  TX_TYPE_PRUNE_4 = 4,
+  TX_TYPE_PRUNE_5 = 5,
 } UENUM1BYTE(TX_TYPE_PRUNE_MODE);
 
+enum {
+  // No reaction to rate control on a detected slide/scene change.
+  NO_DETECTION = 0,
+
+  // Set to larger Q based only on the detected slide/scene change and
+  // current/past Q.
+  FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+  // Turns off multi-winner mode. So we will do txfm search on either all modes
+  // if winner mode is off, or we will only on txfm search on a single winner
+  // mode.
+  MULTI_WINNER_MODE_OFF = 0,
+
+  // Limits the number of winner modes to at most 2
+  MULTI_WINNER_MODE_FAST = 1,
+
+  // Uses the default number of winner modes, which is 3 for intra mode, and 1
+  // for inter mode.
+  MULTI_WINNER_MODE_DEFAULT = 2,
+
+  // Maximum number of winner modes allowed.
+  MULTI_WINNER_MODE_LEVELS,
+} UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
+
+enum {
+  PRUNE_NEARMV_OFF = 0,     // Turn off nearmv pruning
+  PRUNE_NEARMV_LEVEL1 = 1,  // Prune nearmv for qindex (0-85)
+  PRUNE_NEARMV_LEVEL2 = 2,  // Prune nearmv for qindex (0-170)
+  PRUNE_NEARMV_LEVEL3 = 3,  // Prune nearmv more aggressively for qindex (0-170)
+  PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3,
+} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
+
 typedef struct {
-  TX_TYPE_PRUNE_MODE prune_mode;
+  TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
   int fast_intra_tx_type_search;
-  int fast_inter_tx_type_search;
 
-  // prune two least frequently chosen transforms for each intra mode
+  // INT_MAX: Disable fast search.
+  // 1 - 1024: Probability threshold used for conditionally forcing tx type,
+  // during mode search.
+  // 0: Force tx type to be DCT_DCT unconditionally, during
+  // mode search.
+  int fast_inter_tx_type_prob_thresh;
+
+  // Prune less likely chosen transforms for each intra mode. The speed
+  // feature ranges from 0 to 2, for different speed / compression trade offs.
   int use_reduced_intra_txset;
 
   // Use a skip flag prediction model to detect blocks with skip = 1 early
@@ -221,7 +268,7 @@ typedef struct {
   // inter blocks. It enables further tx type mode pruning based on ML model for
   // mode evaluation and disables tx type mode pruning for winner mode
   // processing.
-  int enable_winner_mode_tx_type_pruning;
+  int winner_mode_tx_type_pruning;
 } TX_TYPE_SEARCH;
 
 enum {
@@ -231,9 +278,13 @@ enum {
   // Always use a fixed size partition
   FIXED_PARTITION,
 
-  REFERENCE_PARTITION,
+  // Partition using source variance
+  VAR_BASED_PARTITION,
 
-  VAR_BASED_PARTITION
+#if CONFIG_RT_ML_PARTITIONING
+  // Partition using ML model
+  ML_BASED_PARTITION
+#endif
 } UENUM1BYTE(PARTITION_SEARCH_TYPE);
 
 enum {
@@ -249,42 +300,181 @@ enum {
   QTR_ONLY,
 } UENUM1BYTE(MV_PREC_LOGIC);
 
+enum {
+  SUPERRES_AUTO_ALL,   // Tries all possible superres ratios
+  SUPERRES_AUTO_DUAL,  // Tries no superres and q-based superres ratios
+  SUPERRES_AUTO_SOLO,  // Only apply the q-based superres ratio
+} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
+/*!\endcond */
+
+/*!\enum INTERNAL_COST_UPDATE_TYPE
+ * \brief This enum decides internally how often to update the entropy costs
+ *
+ * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly
+ * more flexibility in update frequency. This enum is separate from \ref
+ * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its
+ * values are public so it cannot be modified without breaking public API.
+ * Due to the use of AOMMIN() in populate_unified_cost_update_freq() to
+ * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and
+ * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in
+ * the order of increasing frequencies.
+ *
+ * \warning  In case of any updates/modifications to the enum COST_UPDATE_TYPE,
+ * update the enum INTERNAL_COST_UPDATE_TYPE as well.
+ */
+typedef enum {
+  INTERNAL_COST_UPD_OFF,       /*!< Turn off cost updates. */
+  INTERNAL_COST_UPD_TILE,      /*!< Update every tile. */
+  INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */
+  INTERNAL_COST_UPD_SBROW,     /*!< Update every sb rows inside a tile. */
+  INTERNAL_COST_UPD_SB,        /*!< Update every sb. */
+} INTERNAL_COST_UPDATE_TYPE;
+
+/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL
+ * \brief This enumeration defines a variety of simple motion search based
+ * partition prune levels
+ */
+typedef enum {
+  NO_PRUNING = -1,
+  SIMPLE_AGG_LVL0,     /*!< Simple prune aggressiveness level 0. */
+  SIMPLE_AGG_LVL1,     /*!< Simple prune aggressiveness level 1. */
+  SIMPLE_AGG_LVL2,     /*!< Simple prune aggressiveness level 2. */
+  SIMPLE_AGG_LVL3,     /*!< Simple prune aggressiveness level 3. */
+  QIDX_BASED_AGG_LVL1, /*!< Qindex based prune aggressiveness level, aggressive
+                          level maps to simple agg level 1 or 2 based on qindex.
+                        */
+  TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune
+                                                  aggressiveness levels. */
+  TOTAL_QINDEX_BASED_AGG_LVLS =
+      QIDX_BASED_AGG_LVL1 -
+      SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune
+                          aggressiveness levels. */
+  TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS +
+                   TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */
+} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL;
+
+/*!\enum PRUNE_MESH_SEARCH_LEVEL
+ * \brief This enumeration defines a variety of mesh search prune levels.
+ */
+typedef enum {
+  PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */
+  PRUNE_MESH_SEARCH_LVL_1 = 1,    /*!< Prune mesh search level 1. */
+  PRUNE_MESH_SEARCH_LVL_2 = 2,    /*!< Prune mesh search level 2. */
+} PRUNE_MESH_SEARCH_LEVEL;
+
+/*!\enum INTER_SEARCH_EARLY_TERM_IDX
+ * \brief This enumeration defines inter search early termination index in
+ * non-rd path based on sse value.
+ */
+typedef enum {
+  EARLY_TERM_DISABLED =
+      0, /*!< Early terminate inter mode search based on sse disabled. */
+  EARLY_TERM_IDX_1 =
+      1, /*!< Early terminate inter mode search based on sse, index 1. */
+  EARLY_TERM_IDX_2 =
+      2, /*!< Early terminate inter mode search based on sse, index 2. */
+  EARLY_TERM_IDX_3 =
+      3, /*!< Early terminate inter mode search based on sse, index 3. */
+  EARLY_TERM_IDX_4 =
+      4, /*!< Early terminate inter mode search based on sse, index 4. */
+  EARLY_TERM_INDICES, /*!< Total number of early terminate indices */
+} INTER_SEARCH_EARLY_TERM_IDX;
+
+/*!
+ * \brief Sequence/frame level speed vs quality features
+ */
 typedef struct HIGH_LEVEL_SPEED_FEATURES {
-  // Frame level coding parameter update
+  /*! Frame level coding parameter update. */
   int frame_parameter_update;
 
+  /*!
+   * Cases and frame types for which the recode loop is enabled.
+   */
   RECODE_LOOP_TYPE recode_loop;
 
-  // This feature controls the tolerence vs target used in deciding whether to
-  // recode a frame. It has no meaning if recode is disabled.
+  /*!
+   * Controls the tolerance vs target rate used in deciding whether to
+   * recode a frame. It has no meaning if recode is disabled.
+   */
   int recode_tolerance;
 
-  // Determine how motion vector precision is chosen. The possibilities are:
-  // LAST_MV_DATA: use the mv data from the last coded frame
-  // CURRENT_Q: use the current q as a threshold
-  // QTR_ONLY: use quarter pel precision only.
+  /*!
+   * Determine how motion vector precision is chosen. The possibilities are:
+   * LAST_MV_DATA: use the mv data from the last coded frame
+   * CURRENT_Q: use the current q as a threshold
+   * QTR_ONLY: use quarter pel precision only.
+   */
   MV_PREC_LOGIC high_precision_mv_usage;
 
-  // Whether to disable overlay frames for filtered Altref frames,
-  // overiding oxcf->enable_overlay flag set as 1.
-  int disable_overlay_frames;
+  /*!
+   * Always set to 0. If on it enables 0 cost background transmission
+   * (except for the initial transmission of the segmentation). The feature is
+   * disabled because the addition of very large block sizes make the
+   * backgrounds very to cheap to encode, and the segmentation we have
+   * adds overhead.
+   */
+  int static_segmentation;
 
-  // Enable/disable adaptively deciding whether or not to encode ALTREF overlay
-  // frame.
-  int adaptive_overlay_encoding;
+  /*!
+   * Superres-auto mode search type:
+   */
+  SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
 
-  // Always set to 0. If on it enables 0 cost background transmission
-  // (except for the initial transmission of the segmentation). The feature is
-  // disabled because the addition of very large block sizes make the
-  // backgrounds very to cheap to encode, and the segmentation we have
-  // adds overhead.
-  int static_segmentation;
+  /*!
+   * Enable/disable extra screen content test by encoding key frame twice.
+   */
+  int disable_extra_sc_testing;
 
-  // Enable/disable second_alt_ref temporal filtering.
+  /*!
+   * Enable/disable second_alt_ref temporal filtering.
+   */
   int second_alt_ref_filtering;
+
+  /*!
+   * Number of frames to be used in temporal filtering controlled based on noise
+   * levels and arf-q.
+   */
+  int num_frames_used_in_tf;
 } HIGH_LEVEL_SPEED_FEATURES;
 
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+  /*!
+   * \brief Reduces the mv search window.
+   * By default, the initial search window is around
+   * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+   * Each step reduction decrease the window size by about a factor of 2.
+   */
+  int reduce_mv_step_param;
+
+  /*!
+   * \brief Skips the motion search when the zero mv has small sse.
+   */
+  int skip_motion_search_threshold;
+
+  /*!
+   * \brief Skips reconstruction by using source buffers for prediction
+   */
+  int disable_recon;
+
+  /*!
+   * \brief Skips the motion search centered on 0,0 mv.
+   */
+  int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
 typedef struct TPL_SPEED_FEATURES {
+  // GOP length adaptive decision.
+  // If set to 0, tpl model decides whether a shorter gf interval is better.
+  // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+  // (base+2) layer decide whether a shorter gf interval is better.
+  // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+  // decide whether a shorter gf interval is better.
+  // If set to 3, gop length adaptive decision is disabled.
+  int gop_length_decision_method;
   // Prune the intra modes search by tpl.
   // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
   // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
@@ -301,50 +491,63 @@ typedef struct TPL_SPEED_FEATURES {
 
   // When to stop subpel search.
   SUBPEL_FORCE_STOP subpel_force_stop;
-} TPL_SPEED_FEATURES;
 
-typedef struct GLOBAL_MOTION_SPEED_FEATURES {
-  // Global motion warp error threshold
-  GM_ERRORADV_TYPE gm_erroradv_type;
+  // Which search method to use.
+  SEARCH_METHODS search_method;
 
-  // Disable adaptive threshold for global motion warp error
-  int disable_adaptive_warp_error_thresh;
+  // Prune starting mvs in TPL based on sad scores.
+  int prune_starting_mv;
 
-  // Do not compute the global motion parameters for a LAST2_FRAME or
-  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
-  // global model.
-  int selective_ref_gm;
+  // Not run TPL for filtered Key frame.
+  int disable_filtered_key_tpl;
 
-  GM_SEARCH_TYPE gm_search_type;
+  // Prune reference frames in TPL.
+  int prune_ref_frames_in_tpl;
+
+  // Support compound predictions.
+  int allow_compound_pred;
 
-  // whether to disable the global motion recode loop
-  int gm_disable_recode;
+  // Calculate rate and distortion based on Y plane only.
+  int use_y_only_rate_distortion;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+  GM_SEARCH_TYPE gm_search_type;
 
   // During global motion estimation, prune remaining reference frames in a
   // given direction(past/future), if the evaluated ref_frame in that direction
   // yields gm_type as INVALID/TRANSLATION/IDENTITY
   int prune_ref_frame_for_gm_search;
+
+  // When the current GM type is set to ZEROMV, prune ZEROMV if its performance
+  // is worse than NEWMV under SSE metric.
+  // 0 : no pruning
+  // 1 : conservative pruning
+  // 2 : aggressive pruning
+  int prune_zero_mv_with_sse;
+
+  // Disable global motion estimation based on stats of previous frames in the
+  // GF group
+  int disable_gm_search_based_on_stats;
 } GLOBAL_MOTION_SPEED_FEATURES;
 
 typedef struct PARTITION_SPEED_FEATURES {
   PARTITION_SEARCH_TYPE partition_search_type;
 
-  // Used if partition_search_type = FIXED_SIZE_PARTITION
-  BLOCK_SIZE always_this_block_size;
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE fixed_partition_size;
 
   // Prune extended partition types search
   // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
   // aggressiveness of pruning in order.
   int prune_ext_partition_types_search_level;
 
-  // Use a ML model to prune horz and vert partitions
-  int ml_prune_rect_partition;
-
-  // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
-  int ml_prune_ab_partition;
+  // Prune part4 based on block size
+  int prune_part4_search;
 
-  // Use a ML model to prune horz4 and vert4 partitions.
-  int ml_prune_4_partition;
+  // Use a ML model to prune rectangular, ab and 4-way horz
+  // and vert partitions
+  int ml_prune_partition;
 
   // Use a ML model to adaptively terminate partition search after trying
   // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
@@ -359,21 +562,20 @@ typedef struct PARTITION_SPEED_FEATURES {
   // Use square partition only beyond this block size.
   BLOCK_SIZE use_square_partition_only_threshold;
 
-  // Sets min and max square partition levels for this superblock based on
+  // Sets max square partition levels for this superblock based on
   // motion vector and prediction error distribution produced from 16x16
   // simple motion search
   MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
-  int auto_min_partition_based_on_simple_motion;
 
   // Min and max square partition size we enable (block_size) as per auto
   // min max, but also used by adjust partitioning, and pick_partitioning.
   BLOCK_SIZE default_min_partition_size;
   BLOCK_SIZE default_max_partition_size;
 
-  // Sets level of adjustmet of variace-based partitioning during
-  // rd_use_partition 0 - no partition adjusment, 1 - try to merge partitions
-  // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
-  // try to merge and split leaf partitions
+  // Sets level of adjustment of variance-based partitioning during
+  // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions
+  // for small blocks and high QP, 2 - try to merge partitions, 3 - try to merge
+  // and split leaf partitions and 0 - 3 decreasing aggressiveness in order.
   int adjust_var_based_rd_partitioning;
 
   // Partition search early breakout thresholds.
@@ -383,11 +585,10 @@ typedef struct PARTITION_SPEED_FEATURES {
   // Thresholds for ML based partition search breakout.
   int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
 
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
-
-  // The aggresiveness of pruning with simple_motion_search.
-  // Currently 0 is the lowest, and 2 the highest.
+  // Aggressiveness levels for pruning split and rectangular partitions based on
+  // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to
+  // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to
+  // qindex based and simple motion search based pruning.
   int simple_motion_search_prune_agg;
 
   // Perform simple_motion_search on each possible subblock and use it to prune
@@ -405,29 +606,128 @@ typedef struct PARTITION_SPEED_FEATURES {
   // partition after PARTITION_NONE
   int simple_motion_search_early_term_none;
 
+  // Controls whether to reduce the number of motion search steps. If this is 0,
+  // then simple_motion_search has the same number of steps as
+  // single_motion_search (assuming no other speed features). Otherwise, reduce
+  // the number of steps by the value contained in this variable.
+  int simple_motion_search_reduce_search_steps;
+
   // This variable controls the maximum block size where intra blocks can be
   // used in inter frames.
   // TODO(aconverse): Fold this into one of the other many mode skips
   BLOCK_SIZE max_intra_bsize;
 
   // Use CNN with luma pixels on source frame on each of the 64x64 subblock to
-  // perform split/no_split decision on intra-frames.
-  int intra_cnn_split;
+  // perform partition pruning in intra frames.
+  // 0: No Pruning
+  // 1: Prune split and rectangular partitions only
+  // 2: Prune none, split and rectangular partitions
+  int intra_cnn_based_part_prune_level;
 
   // Disable extended partition search for lower block sizes.
   int ext_partition_eval_thresh;
 
-  // Prune 1:4 partition search based on winner info from split partitions
-  int prune_4_partition_using_split_info;
+  // Disable rectangular partitions for larger block sizes.
+  int rect_partition_eval_thresh;
 
-  // Prune AB partition search using split and HORZ/VERT info
-  int prune_ab_partition_using_split_info;
+  // prune extended partition search
+  // 0 : no pruning
+  // 1 : prune 1:4 partition search using winner info from split partitions
+  // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+  int prune_ext_part_using_split_info;
+
+  // Prunt rectangular, AB and 4-way partition based on q index and block size
+  // 0 : no pruning
+  // 1 : prune sub_8x8 at very low quantizers
+  // 2 : prune all block size based on qindex
+  int prune_rectangular_split_based_on_qidx;
+
+  // Terminate partition search for child partition,
+  // when NONE and SPLIT partition rd_costs are INT64_MAX.
+  int early_term_after_none_split;
+
+  // Level used to adjust threshold for av1_ml_predict_breakout(). At lower
+  // levels, more conservative threshold is used, and value of 0 indicates
+  // av1_ml_predict_breakout() is disabled. Value of 3 corresponds to default
+  // case with no adjustment to lbd thresholds.
+  int ml_predict_breakout_level;
+
+  // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+  // 0 : no pruning
+  // 1 : pruning based on neighbour block information
+  // 2 : prune always
+  int prune_sub_8x8_partition_level;
+
+  // Prune rectangular split based on simple motion search split/no_split score.
+  // 0: disable pruning, 1: enable pruning
+  int simple_motion_search_rect_split;
+
+  // The current encoder adopts a DFS search for block partitions.
+  // Therefore the mode selection and associated rdcost is ready for smaller
+  // blocks before the mode selection for some partition types.
+  // AB partition could use previous rd information and skip mode search.
+  // An example is:
+  //
+  //  current block
+  //  +---+---+
+  //  |       |
+  //  +       +
+  //  |       |
+  //  +-------+
+  //
+  //  SPLIT partition has been searched first before trying HORZ_A
+  //  +---+---+
+  //  | R | R |
+  //  +---+---+
+  //  | R | R |
+  //  +---+---+
+  //
+  //  HORZ_A
+  //  +---+---+
+  //  |   |   |
+  //  +---+---+
+  //  |       |
+  //  +-------+
+  //
+  //  With this speed feature, the top two sub blocks can directly use rdcost
+  //  searched in split partition, and the mode info is also copied from
+  //  saved info. Similarly, the bottom rectangular block can also use
+  //  the available information from previous rectangular search.
+  int reuse_prev_rd_results_for_part_ab;
+
+  // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+  // when encoding PARTITION_AB.
+  int reuse_best_prediction_for_part_ab;
+
+  // The current partition search records the best rdcost so far and uses it
+  // in mode search and transform search to early skip when some criteria is
+  // met. For example, when the current rdcost is larger than the best rdcost,
+  // or the model rdcost is larger than the best rdcost times some thresholds.
+  // By default, this feature is turned on to speed up the encoder partition
+  // search.
+  // If disabling it, at speed 0, 30 frames, we could get
+  // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown.
+  int use_best_rd_for_pruning;
+
+  // Skip evaluation of non-square partitions based on the corresponding NONE
+  // partition.
+  // 0: no pruning
+  // 1: prune extended partitions if NONE is skippable
+  // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv
+  // mode and skippable
+  int skip_non_sq_part_based_on_none;
 } PARTITION_SPEED_FEATURES;
 
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
 
+  // Enable the use of faster, less accurate mv search method on bsize >=
+  // BLOCK_32X32.
+  // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
+  // account.
+  int use_bsize_dependent_search_method;
+
   // If this is set to 1, we limit the motion search range to 2 times the
   // largest motion vector found in the last frame.
   int auto_mv_step_size;
@@ -451,11 +751,6 @@ typedef struct MV_SPEED_FEATURES {
   // encoding and decoding; otherwise, it uses bilinear interpolation.
   SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
 
-  // TODO(jingning): combine the related motion search speed features
-  // This allows us to use motion search at other sizes as a starting
-  // point for this motion search and limits the search range around it.
-  int adaptive_motion_search;
-
   // Threshold for allowing exhaustive motion search.
   int exhaustive_searches_thresh;
 
@@ -469,7 +764,7 @@ typedef struct MV_SPEED_FEATURES {
   int reduce_search_range;
 
   // Prune mesh search.
-  int prune_mesh_search;
+  PRUNE_MESH_SEARCH_LEVEL prune_mesh_search;
 
   // Use the rd cost around the best FULLPEL_MV to speed up subpel search
   int use_fullpel_costlist;
@@ -481,6 +776,22 @@ typedef struct MV_SPEED_FEATURES {
 
   // Accurate full pixel motion search based on TPL stats.
   int full_pixel_search_level;
+
+  // Whether to downsample the rows in sad calculation during motion search.
+  // This is only active when there are at least 16 rows.
+  int use_downsampled_sad;
+
+  // Enable/disable extensive joint motion search.
+  int disable_extensive_joint_motion_search;
+
+  // Enable second best mv check in joint mv search.
+  // 0: allow second MV (use rd cost as the metric)
+  // 1: use var as the metric
+  // 2: disable second MV
+  int disable_second_mv;
+
+  // Skips full pixel search based on start mv of prior ref_mv_idx.
+  int skip_fullpel_search_using_startmv;
 } MV_SPEED_FEATURES;
 
 typedef struct INTER_MODE_SPEED_FEATURES {
@@ -498,9 +809,6 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // Limit the inter mode tested in the RD loop
   int reduce_inter_modes;
 
-  // Adaptive prediction mode search
-  int adaptive_mode_search;
-
   // This variable is used to cap the maximum number of times we skip testing a
   // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
@@ -509,8 +817,8 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   int prune_inter_modes_if_skippable;
 
   // Drop less likely to be picked reference frames in the RD search.
-  // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
-  // aggressively than lower ones. (0 means no pruning).
+  // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6 where higher levels prune
+  // more aggressively than lower ones. (0 means no pruning).
   int selective_ref_frame;
 
   // Prune reference frames for rectangular partitions.
@@ -519,18 +827,36 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // 2 implies prune horiz, vert and extended partition
   int prune_ref_frame_for_rect_partitions;
 
+  // Prune inter modes w.r.t past reference frames
+  // 0 no pruning
+  // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames
+  // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
   int alt_ref_search_fp;
 
-  // flag to skip NEWMV mode in drl if the motion search result is the same
-  int skip_repeated_newmv;
-
-  // Skip the current ref_mv in NEW_MV mode if we have already encountered
-  // another ref_mv in the drl such that:
-  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
-  //     search process as the current fullpel_mv.
-  //  2. The rate needed to encode the current fullpel_mv is larger than that
-  //     for the other ref_mv.
-  int skip_repeated_full_newmv;
+  // Prune compound reference frames
+  // 0 no pruning
+  // 1 prune compound references which do not satisfy the two conditions:
+  //   a) The references are at a nearest distance from the current frame in
+  //   both past and future direction.
+  //   b) The references have minimum pred_mv_sad in both past and future
+  //   direction.
+  // 2 prune compound references except the one with nearest distance from the
+  //   current frame in both past and future direction.
+  int prune_comp_ref_frames;
+
+  // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc.
+  // This speed feature equaling 0 means no skipping.
+  // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode
+  // if we have already encountered ref_mv in the drl such that:
+  //  1. The other drl has the same mv during the SIMPLE_TRANSLATION search
+  //     process as the current mv.
+  //  2. The rate needed to encode the current mv is larger than that for the
+  //     other ref_mv.
+  // The speed feature equaling 1 means using subpel mv in the comparison.
+  // The speed feature equaling 2 means using fullpel mv in the comparison.
+  // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on
+  // known full_mv bestsme and drl cost.
+  int skip_newmv_in_drl;
 
   // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
   // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
@@ -541,10 +867,6 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // Flag used to control the ref_best_rd based gating for chroma
   int perform_best_rd_based_gating_for_chroma;
 
-  // Skip certain motion modes (OBMC, warped, interintra) for single reference
-  // motion search, using the results of single ref SIMPLE_TRANSLATION
-  int prune_single_motion_modes_by_simple_trans;
-
   // Reuse the inter_intra_mode search result from NEARESTMV mode to other
   // single ref modes
   int reuse_inter_intra_mode;
@@ -565,12 +887,6 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // same single inter mode as a group.
   int prune_comp_search_by_single_result;
 
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  BLOCK_SIZE comp_inter_joint_search_thresh;
-
   // Instead of performing a full MV search, do a simple translation first
   // and only perform a full MV search on the motion vectors that performed
   // well.
@@ -581,12 +897,27 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // the single reference modes, it is one of the two best performers.
   int prune_compound_using_single_ref;
 
-  // Skip extended compound mode using ref frames of above and left neighbor
+  // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+  // NEW_NEARMV) using ref frames of above and left neighbor
   // blocks.
   // 0 : no pruning
+  // 1 : prune ext compound modes using neighbor blocks (less aggressiveness)
+  // 2 : prune ext compound modes using neighbor blocks (high aggressiveness)
+  // 3 : prune ext compound modes unconditionally (highest aggressiveness)
+  int prune_ext_comp_using_neighbors;
+
+  // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+  int skip_ext_comp_nearmv_mode;
+
+  // Skip extended compound mode when ref frame corresponding to NEWMV does not
+  // have NEWMV as single mode winner.
+  // 0 : no pruning
   // 1 : prune extended compound mode (less aggressiveness)
   // 2 : prune extended compound mode (high aggressiveness)
-  int prune_compound_using_neighbors;
+  int prune_comp_using_best_single_mode_ref;
+
+  // Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+  int prune_nearest_near_mv_using_refmv_weight;
 
   // Based on previous ref_mv_idx search result, prune the following search.
   int prune_ref_mv_idx_search;
@@ -594,73 +925,49 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // Disable one sided compound modes.
   int disable_onesided_comp;
 
-  // Prune/gate motion mode evaluation based on token based rd
-  // during transform search for inter blocks
-  // Values are 0 (not used) , 1 - 3 with progressively increasing
-  // aggressiveness
-  int prune_motion_mode_level;
-
   // Prune obmc search using previous frame stats.
+  // INT_MAX : disable obmc search
   int prune_obmc_prob_thresh;
 
-  // Disable obmc.
-  int disable_obmc;
-
-  // Gate warp evaluation for motions of type IDENTITY,
-  // TRANSLATION and AFFINE(based on number of warp neighbors)
-  int prune_warp_using_wmtype;
-
   // Prune warped motion search using previous frame stats.
   int prune_warped_prob_thresh;
 
-  // Enable/disable interintra wedge search.
-  int disable_wedge_interintra_search;
+  // Variance threshold to enable/disable Interintra wedge search
+  unsigned int disable_interintra_wedge_var_thresh;
+
+  // Variance threshold to enable/disable Interinter wedge search
+  unsigned int disable_interinter_wedge_var_thresh;
 
   // De-couple wedge and mode search during interintra RDO.
   int fast_interintra_wedge_search;
 
-  // Only enable wedge search if the edge strength is greater than
-  // this threshold. A value of 0 signals that this check is disabled.
-  unsigned int disable_wedge_search_edge_thresh;
-
-  // Only enable wedge search if the variance is above this threshold.
-  unsigned int disable_wedge_search_var_thresh;
-
   // Whether fast wedge sign estimate is used
   int fast_wedge_sign_estimate;
 
-  // Whether to prune wedge search based on predictor difference
-  int prune_wedge_pred_diff_based;
-
   // Enable/disable ME for interinter wedge search.
   int disable_interinter_wedge_newmv_search;
 
-  // Enable/disable ME for interinter diffwtd search. PSNR BD-rate gain of
-  // ~0.1 on the lowres test set, but ~15% slower computation.
-  int enable_interinter_diffwtd_newmv_search;
-
-  // Enable/disable smooth inter-intra mode
-  int disable_smooth_interintra;
-
-  // Disable interinter_wedge
-  int disable_interinter_wedge;
-
   // Decide when and how to use joint_comp.
   DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
 
-  // Whether to override and disable sb level coeff cost updates, if
-  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
-  int disable_sb_level_coeff_cost_upd;
+  // Clip the frequency of updating the mv cost.
+  INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level;
+
+  // Clip the frequency of updating the coeff cost.
+  INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level;
 
-  // Whether to override and disable sb level mv cost updates, if
-  // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level)
-  int disable_sb_level_mv_cost_upd;
+  // Clip the frequency of updating the mode cost.
+  INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level;
 
   // Prune inter modes based on tpl stats
   // 0 : no pruning
   // 1 - 3 indicate increasing aggressiveness in order.
   int prune_inter_modes_based_on_tpl;
 
+  // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
+  // neighbor blocks and qindex.
+  PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors;
+
   // Model based breakout after interpolation filter search
   // 0: no breakout
   // 1: use model based rd breakout
@@ -670,13 +977,48 @@ typedef struct INTER_MODE_SPEED_FEATURES {
   // 0: No reuse
   // 1: Reuse the compound type decision
   int reuse_compound_type_decision;
+
+  // Enable/disable masked compound.
+  int disable_masked_comp;
+
+  // Enable/disable the fast compound mode search.
+  int enable_fast_compound_mode_search;
+
+  // Reuse masked compound type search results
+  int reuse_mask_search_results;
+
+  // Enable/disable fast search for wedge masks
+  int enable_fast_wedge_mask_search;
+
+  // Early breakout from transform search of inter modes
+  int inter_mode_txfm_breakout;
+
+  // Limit number of inter modes for txfm search if a newmv mode gets
+  // evaluated among the top modes.
+  // 0: no pruning
+  // 1 to 3 indicate increasing order of aggressiveness
+  int limit_inter_mode_cands;
+
+  // Cap the no. of txfm searches for a given prediction mode.
+  // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
+  int limit_txfm_eval_per_mode;
+
+  // Prune warped motion search based on block size.
+  int extra_prune_warped;
+
+  // Do not search compound modes for ARF.
+  // The intuition is that ARF is predicted by frames far away from it,
+  // whose temporal correlations with the ARF are likely low.
+  // It is therefore likely that compound modes do not work as well for ARF
+  // as other inter frames.
+  // Speed/quality impact:
+  // Speed 1: 12% faster, 0.1% psnr loss.
+  // Speed 2: 2%  faster, 0.05% psnr loss.
+  // No change for speed 3 and up, because |disable_onesided_comp| is true.
+  int skip_arf_compound;
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {
-  // A source variance threshold below which filter search is disabled
-  // Choose a very large value (UINT_MAX) to use 8-tap always
-  unsigned int disable_filter_search_var_thresh;
-
   // Do limited interpolation filter search for dual filters, since best choice
   // usually includes EIGHTTAP_REGULAR.
   int use_fast_interpolation_filter_search;
@@ -708,19 +1050,27 @@ typedef struct INTRA_MODE_SPEED_FEATURES {
   // flag to allow skipping intra mode for inter frame prediction
   int skip_intra_in_interframe;
 
-  // variance threshold for intra mode gating when inter turned out to be skip
-  // in inter frame prediction
-  unsigned int src_var_thresh_intra_skip;
-
   // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to luma plane only.
+  // Feasible values are 0..4. The feature is disabled for 0. An increasing
+  // value indicates more aggressive pruning threshold.
   int intra_pruning_with_hog;
 
-  // TODO(anyone): tune intra_pruning_with_hog_thresh for various speeds.
-  float intra_pruning_with_hog_thresh;
+  // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to chroma plane only.
+  // Feasible values are 0..4. The feature is disabled for 0. An increasing
+  // value indicates more aggressive pruning threshold.
+  int chroma_intra_pruning_with_hog;
 
   // Enable/disable smooth intra modes.
   int disable_smooth_intra;
 
+  // Prune filter intra modes in intra frames.
+  // 0 : No pruning
+  // 1 : Evaluate applicable filter intra modes based on best intra mode so far
+  // 2 : Do not evaluate filter intra modes
+  int prune_filter_intra_level;
+
   // prune palette search
   // 0: No pruning
   // 1: Perform coarse search to prune the palette colors. For winner colors,
@@ -729,6 +1079,73 @@ typedef struct INTRA_MODE_SPEED_FEATURES {
   // colors to remaining colors) and terminate the search if current number of
   // palette colors is not the winner.
   int prune_palette_search_level;
+
+  // Terminate early in luma palette_size search. Speed feature values indicate
+  // increasing level of pruning.
+  // 0: No early termination
+  // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+  // palette_size is more than 2 * best_rd. This level of pruning is more
+  // conservative when compared to sf level 2 as the cases which will get pruned
+  // with sf level 1 is a subset of the cases which will get pruned with sf
+  // level 2.
+  // 2: Terminate early for higher luma palette_size, if header rd cost of lower
+  // palette_size is more than best_rd.
+  // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+  // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+  // 5, 6, 7 and 8 on screen content set with coding performance change less
+  // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+  // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+  // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+  // 7 and 8 on a typical image dataset with coding performance change less than
+  // 0.01%.
+  int prune_luma_palette_size_search_level;
+
+  // Prune chroma intra modes based on luma intra mode winner.
+  // 0: No pruning
+  // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
+  // UV_CFL_PRED and the mode that corresponds to luma intra mode winner.
+  int prune_chroma_modes_using_luma_winner;
+
+  // Clip the frequency of updating the mv cost for intrabc.
+  INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level;
+
+  // We use DCT_DCT transform followed by computing SATD (Sum of Absolute
+  // Transformed Differences) as an estimation of RD score to quickly find the
+  // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search
+  // near the best possible parameter. The search range is set here.
+  // The range of cfl_searh_range should be [1, 33], and the following are the
+  // recommended values.
+  // 1: Fastest mode.
+  // 3: Default mode that provides good speedup without losing compression
+  // performance at speed 0.
+  // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
+  // be used for debugging purpose.
+  int cfl_search_range;
+
+  // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in
+  // intra mode decision. Here, add a speed feature to reduce this number for
+  // higher speeds.
+  int top_intra_model_count_allowed;
+
+  // Adapt top_intra_model_count_allowed locally to prune luma intra modes using
+  // neighbor block and quantizer information.
+  int adapt_top_model_rd_count_using_neighbors;
+
+  // Terminate early in chroma palette_size search.
+  // 0: No early termination
+  // 1: Terminate early for higher palette_size, if header rd cost of lower
+  // palette_size is more than best_rd.
+  // For allintra encode, this sf reduces instruction count by 0.45%,
+  // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+  // content set with coding performance change less than 0.01%.
+  // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+  // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+  // dataset with no quality drop.
+  int early_term_chroma_palette_size_search;
+
+  // Skips the evaluation of filter intra modes in inter frames if rd evaluation
+  // of luma intra dc mode results in invalid rd stats.
+  int skip_filter_intra_in_inter_frames;
 } INTRA_MODE_SPEED_FEATURES;
 
 typedef struct TX_SPEED_FEATURES {
@@ -761,23 +1178,16 @@ typedef struct TX_SPEED_FEATURES {
   // 1-2: progressively increasing aggressiveness of pruning
   int model_based_prune_tx_search_level;
 
-  // Use hash table to store intra(keyframe only) txb transform search results
-  // to avoid repeated search on the same residue signal.
-  int use_intra_txb_hash;
-
-  // Use hash table to store inter txb transform search results
-  // to avoid repeated search on the same residue signal.
-  int use_inter_txb_hash;
-
   // Refine TX type after fast TX search.
   int refine_fast_tx_search_results;
+
+  // Prune transform split/no_split eval based on residual properties. A value
+  // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+  // increases from levels 1 to 3.
+  int prune_tx_size_level;
 } TX_SPEED_FEATURES;
 
 typedef struct RD_CALC_SPEED_FEATURES {
-  // This feature controls whether we do the expensive context update and
-  // calculation in the rd coefficient costing loop.
-  int use_fast_coef_costing;
-
   // Fast approximation of av1_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
@@ -795,20 +1205,10 @@ typedef struct RD_CALC_SPEED_FEATURES {
   // Trellis (dynamic programming) optimization of quantized values
   TRELLIS_OPT_TYPE optimize_coefficients;
 
-  // Use a hash table to store previously computed optimized qcoeffs from
-  // expensive calls to optimize_txb.
-  int use_hash_based_trellis;
-
   // Use hash table to store macroblock RD search results
   // to avoid repeated search on the same residue signal.
   int use_mb_rd_hash;
 
-  // Flag used to control the speed of the eob selection in trellis.
-  int trellis_eob_fast;
-
-  // Calculate RD cost before doing optimize_b, and skip if the cost is large.
-  int optimize_b_precheck;
-
   // Flag used to control the extent of coeff R-D optimization
   int perform_coeff_opt;
 } RD_CALC_SPEED_FEATURES;
@@ -827,6 +1227,7 @@ typedef struct WINNER_MODE_SPEED_FEATURES {
   // Level 0  : FULL RD     LARGEST ALL   FULL RD
   // Level 1  : FAST RD     LARGEST ALL   FULL RD
   // Level 2  : LARGEST ALL LARGEST ALL   FULL RD
+  // Level 3 :  LARGEST ALL LARGEST ALL   LARGEST ALL
   int tx_size_search_level;
 
   // Flag used to control the winner mode processing for use transform
@@ -834,18 +1235,46 @@ typedef struct WINNER_MODE_SPEED_FEATURES {
   int enable_winner_mode_for_use_tx_domain_dist;
 
   // Flag used to enable processing of multiple winner modes
-  int enable_multiwinner_mode_process;
+  MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
 
   // Motion mode for winner candidates:
   // 0: speed feature OFF
   // 1 / 2 : Use configured number of winner candidates
   int motion_mode_for_winner_cand;
+
+  // Early DC only txfm block prediction
+  // 0: speed feature OFF
+  // 1 / 2 : Use the configured level for different modes
+  int dc_blk_pred_level;
+
+  // If on, disables interpolation filter search in handle_inter_mode loop, and
+  // performs it during winner mode processing by \ref
+  // tx_search_best_inter_candidates.
+  int winner_mode_ifs;
+
+  // Controls the disabling of winner mode processing. Speed feature levels
+  // are ordered in increasing aggressiveness of pruning. The method considered
+  // for disabling, depends on the sf level value and it is described as below.
+  // 0: Do not disable
+  // 1: Disable for blocks with low source variance.
+  // 2: Disable for blocks which turn out to be transform skip (skipped based on
+  // eob) during MODE_EVAL stage except NEWMV mode.
+  // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL
+  // stage except NEWMV mode. For high quantizers, prune conservatively based on
+  // transform skip (skipped based on eob) except for NEWMV mode.
+  // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL
+  // stage.
+  int prune_winner_mode_eval_level;
 } WINNER_MODE_SPEED_FEATURES;
 
 typedef struct LOOP_FILTER_SPEED_FEATURES {
   // This feature controls how the loop filter level is determined.
   LPF_PICK_METHOD lpf_pick;
 
+  // Skip some final iterations in the determination of the best loop filter
+  // level.
+  int use_coarse_filter_level_search;
+
   // Control how the CDEF strength is determined.
   CDEF_PICK_METHOD cdef_pick_method;
 
@@ -860,6 +1289,9 @@ typedef struct LOOP_FILTER_SPEED_FEATURES {
   // Disable loop restoration for Chroma plane
   int disable_loop_restoration_chroma;
 
+  // Disable loop restoration for luma plane
+  int disable_loop_restoration_luma;
+
   // Prune RESTORE_WIENER evaluation based on source variance
   // 0 : no pruning
   // 1 : conservative pruning
@@ -878,12 +1310,22 @@ typedef struct LOOP_FILTER_SPEED_FEATURES {
 
   // Disable loop restoration filter
   int disable_lr_filter;
+
+  // Whether to downsample the rows in computation of wiener stats.
+  int use_downsampled_wiener_stats;
 } LOOP_FILTER_SPEED_FEATURES;
 
 typedef struct REAL_TIME_SPEED_FEATURES {
   // check intra prediction for non-RD mode.
   int check_intra_pred_nonrd;
 
+  // Skip checking intra prediction.
+  // 0 - don't skip
+  // 1 - skip if TX is skipped and best mode is not NEWMV
+  // 2 - skip if TX is skipped
+  // Skipping aggressiveness increases from level 1 to 2.
+  int skip_intra_pred;
+
   // Perform coarse ME before calculating variance in variance-based partition
   int estimate_motion_for_var_based_partition;
 
@@ -891,6 +1333,7 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // 0 - don't check merge
   // 1 - always check merge
   // 2 - check merge and prune checking final split
+  // 3 - check merge and prune checking final split based on bsize and qindex
   int nonrd_check_partition_merge_mode;
 
   // For nonrd_use_partition: check of leaf partition extra split
@@ -916,6 +1359,10 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // Use compound reference for non-RD mode.
   int use_comp_ref_nonrd;
 
+  // Reference frames for compound prediction for nonrd pickmode:
+  // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+  int ref_frame_comp_nonrd[3];
+
   // use reduced ref set for real-time mode
   int use_real_time_ref_set;
 
@@ -932,9 +1379,6 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // Number of best inter modes to search transform. INT_MAX - search all.
   int num_inter_modes_for_tx_search;
 
-  // Forces TX search off for RDCost calulation.
-  int force_tx_search_off;
-
   // Use interpolation filter search in non-RD mode decision.
   int use_nonrd_filter_search;
 
@@ -944,87 +1388,257 @@ typedef struct REAL_TIME_SPEED_FEATURES {
   // If set forces interpolation filter to EIGHTTAP_REGULAR
   int skip_interp_filter_search;
 
-  // Use hybrid (rd for bsize < 16x16, otherwise nonrd) intra search for intra
-  // only frames.
+  // For nonrd mode: use hybrid intra mode search for intra only frames based on
+  // block properties.
+  // 0 : use nonrd pick intra for all blocks
+  // 1 : use rd for bsize < 16x16, nonrd otherwise
+  // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
   int hybrid_intra_pickmode;
 
   // Compute variance/sse on source difference, prior to encoding superblock.
   int source_metrics_sb_nonrd;
+
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+  // Check for scene/content change detection on every frame before encoding.
+  int check_scene_detection;
+
+  // Forces larger partition blocks in variance based partitioning
+  int force_large_partition_blocks;
+
+  // uses results of temporal noise estimate
+  int use_temporal_noise_estimate;
+
+  // Parameter indicating initial search window to be used in full-pixel search
+  // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+  // indicates larger window. If set to 0, step_param is set based on internal
+  // logic in set_mv_search_params().
+  int fullpel_search_step_param;
+
+  // Bit mask to enable or disable intra modes for each prediction block size
+  // separately, for nonrd pickmode.
+  int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Skips mode checks more agressively in nonRD mode
+  int nonrd_agressive_skip;
+
+  // Skip cdef on 64x64 blocks when NEWMV or INTRA is not picked or color
+  // sensitivity is off. When color sensitivity is on for a superblock, all
+  // 64x64 blocks within will not skip.
+  int skip_cdef_sb;
+
+  // Forces larger partition blocks in variance based partitioning for intra
+  // frames
+  int force_large_partition_blocks_intra;
+
+  // Skip evaluation of no split in tx size selection for merge partition
+  int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of newMV mode evaluation based on so far best mode
+  // sse
+  int skip_newmv_mode_based_on_sse;
+
+  // Define gf length multiplier.
+  // Level 0: use large multiplier, level 1: use medium multiplier.
+  int gf_length_lvl;
+
+  // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes
+  int prune_inter_modes_with_golden_ref;
+
+  // Prune inter modes w.r.t golden or alt-ref frame based on sad
+  int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+  // Prune inter mode search in rd path based on current block's temporal
+  // variance wrt LAST reference.
+  int prune_inter_modes_using_temp_var;
+
+  // Force half_pel at block level.
+  int force_half_pel_block;
+
+  // Prune intra mode evaluation in inter frames based on mv range.
+  BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+  // The number of times to left shift the splitting thresholds in variance
+  // based partitioning. The minimum values should be 7 to avoid left shifting
+  // by a negative number.
+  int var_part_split_threshold_shift;
+
+  // Qindex based variance partition threshold index, which determines
+  // the aggressiveness of partition pruning
+  // 0: disabled for speeds 9,10
+  // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+  // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+  int var_part_based_on_qidx;
+
+  // Enable GF refresh based on Q value.
+  int gf_refresh_based_on_qp;
+
+  // Temporal filtering
+  int use_rtc_tf;
+
+  // Prune the use of the identity transform in nonrd_pickmode,
+  // used for screen content mode: only for smaller blocks
+  // and higher spatial variance, and when skip_txfm is not
+  // already set.
+  int prune_idtx_nonrd;
+
+  // Skip loopfilter, for static content after slide change
+  // or key frame, once quality has ramped up.
+  int skip_lf_screen;
+
+  // For nonrd: early exit out of variance partition that sets the
+  // block size to superblock size, and sets mode to zeromv-last skip.
+  int part_early_exit_zeromv;
+
+  // Early terminate inter mode search based on sse in non-rd path.
+  INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search;
+
+  // SAD based adaptive altref selection
+  int sad_based_adp_altref_lag;
+
+  // Enable/disable partition direct merging.
+  int partition_direct_merging;
+
+  // SAD based compound mode pruning
+  int sad_based_comp_prune;
+
+  // Level of aggressiveness for obtaining tx size based on qstep
+  int tx_size_level_based_on_qstep;
+
+  // Reduce the mv resolution for zero mv if the variance is low.
+  bool reduce_zeromv_mvres;
+
+  // Avoid the partitioning of a 16x16 block in variance based partitioning
+  // (VBP) by making use of minimum and maximum sub-block variances.
+  // For allintra encode, this speed feature reduces instruction count by 5.39%
+  // for speed 9 on a typical video dataset with coding performance gain
+  // of 1.44%.
+  // For AVIF image encode, this speed feature reduces encode time
+  // by 8.44% for speed 9 on a typical image dataset with coding performance
+  // gain of 0.78%.
+  bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
 } REAL_TIME_SPEED_FEATURES;
 
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data struture.
+ */
 typedef struct SPEED_FEATURES {
-  /*
+  /*!
    * Sequence/frame level speed features:
    */
   HIGH_LEVEL_SPEED_FEATURES hl_sf;
 
-  /*
+  /*!
+   * Speed features for the first pass.
+   */
+  FIRST_PASS_SPEED_FEATURES fp_sf;
+
+  /*!
    * Speed features related to how tpl's searches are done.
    */
   TPL_SPEED_FEATURES tpl_sf;
 
-  /*
+  /*!
    * Global motion speed features:
    */
   GLOBAL_MOTION_SPEED_FEATURES gm_sf;
 
-  /*
+  /*!
    * Partition search speed features:
    */
   PARTITION_SPEED_FEATURES part_sf;
 
-  /*
+  /*!
    * Motion search speed features:
    */
   MV_SPEED_FEATURES mv_sf;
 
-  /*
+  /*!
    * Inter mode search speed features:
    */
   INTER_MODE_SPEED_FEATURES inter_sf;
 
-  /*
+  /*!
    * Interpolation filter search speed features:
    */
   INTERP_FILTER_SPEED_FEATURES interp_sf;
 
-  /*
+  /*!
    * Intra mode search speed features:
    */
   INTRA_MODE_SPEED_FEATURES intra_sf;
 
-  /*
+  /*!
    * Transform size/type search speed features:
    */
   TX_SPEED_FEATURES tx_sf;
 
-  /*
+  /*!
    * RD calculation speed features:
    */
   RD_CALC_SPEED_FEATURES rd_sf;
 
-  /*
+  /*!
    * Two-pass mode evaluation features:
    */
   WINNER_MODE_SPEED_FEATURES winner_mode_sf;
 
-  /*
+  /*!
    * In-loop filter speed features:
    */
   LOOP_FILTER_SPEED_FEATURES lpf_sf;
 
-  /*
+  /*!
    * Real-time mode speed features:
    */
   REAL_TIME_SPEED_FEATURES rt_sf;
 } SPEED_FEATURES;
+/*!\cond */
 
 struct AV1_COMP;
 
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting. (Higher speed gives lower
+ *         quality)
+ */
 void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
                                                   int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and frame size. (Higher speed
+ *         corresponds to lower quality)
+ */
 void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
                                                 int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top - level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command  line
+ *
+ * \return No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and current frame's Q index.
+ *         (Higher speed corresponds to lower quality)
+ */
 void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/av1/encoder/superres_scale.c b/media/libaom/src/av1/encoder/superres_scale.c
new file mode 100644
index 0000000000..283faabe61
--- /dev/null
+++ b/media/libaom/src/av1/encoder/superres_scale.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Compute the horizontal frequency components' energy in a frame
+// by calculuating the 16x4 Horizontal DCT. This is to be used to
+// decide the superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+  uint64_t freq_energy[16] = { 0 };
+  const YV12_BUFFER_CONFIG *buf = cpi->source;
+  const int bd = cpi->td.mb.e_mbd.bd;
+  const int width = buf->y_crop_width;
+  const int height = buf->y_crop_height;
+  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+  int n = 0;
+  memset(freq_energy, 0, sizeof(freq_energy));
+  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+                            H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+        }
+        n++;
+      }
+    }
+  } else {
+    assert(bd == 8);
+    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        for (int ii = 0; ii < 4; ++ii)
+          for (int jj = 0; jj < 16; ++jj)
+            src16[ii * 16 + jj] =
+                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+        }
+        n++;
+      }
+    }
+  }
+  if (n) {
+    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+    // Convert to cumulative energy
+    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+  } else {
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+  }
+}
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 56789;
+  const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR;
+  switch (resize_cfg->resize_mode) {
+    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+    case RESIZE_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = resize_cfg->resize_kf_scale_denominator;
+      else
+        new_denom = resize_cfg->resize_scale_denominator;
+      break;
+    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Empirically found to not be beneficial for image coding.
+  return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO &&
+         cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO &&
+         cpi->rc.frames_to_key > 1;
+}
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc,
+                                      int gf_frame_index) {
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
+  if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) {
+    return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+  } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) {
+    if (rc->frames_to_key <= 1)
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
+    else
+      return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+  } else {
+    assert(0);
+  }
+  return 0;
+}
+
+static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
+                                                     double threshq,
+                                                     double threshp) {
+  const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
+  const double tq = threshq * q * q;
+  const double tp = threshp * energy[1];
+  const double thresh = AOMMIN(tq, tp);
+  int k;
+  for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
+    if (energy[k - 1] > thresh) break;
+  }
+  return 3 * SCALE_NUMERATOR - k;
+}
+
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+                                             int sr_kf, int sr_arf) {
+  // Use superres for Key-frames and Alt-ref frames only.
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+      gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) {
+    return SCALE_NUMERATOR;
+  }
+  if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) {
+    return SCALE_NUMERATOR;
+  }
+
+  double energy[16];
+  analyze_hor_freq(cpi, energy);
+
+  const double energy_by_q2_thresh =
+      get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index);
+  int denom = get_superres_denom_from_qindex_energy(
+      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
+  /*
+  printf("\nenergy = [");
+  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+  printf("]\n");
+  printf("boost = %d\n",
+         (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE)
+             ? cpi->ppi->p_rc.kf_boost
+             : cpi->rc.gfu_boost);
+  printf("denom = %d\n", denom);
+  */
+  if (av1_superres_in_recode_allowed(cpi)) {
+    assert(cpi->superres_mode != AOM_SUPERRES_NONE);
+    // Force superres to be tried in the recode loop, as full-res is also going
+    // to be tried anyway.
+    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+  }
+  return denom;
+}
+
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 34567;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  // Make sure that superres mode of the frame is consistent with the
+  // sequence-level flag.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
+                 cpi->common.seq_params->enable_superres));
+  assert(IMPLIES(!cpi->common.seq_params->enable_superres,
+                 superres_cfg->superres_mode == AOM_SUPERRES_NONE));
+  // Make sure that superres mode for current encoding is consistent with user
+  // provided superres mode.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO,
+                 cpi->superres_mode == superres_cfg->superres_mode));
+
+  // Note: we must look at the current superres_mode to be tried in 'cpi' here,
+  // not the user given mode in 'oxcf'.
+  switch (cpi->superres_mode) {
+    case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+    case AOM_SUPERRES_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = superres_cfg->superres_kf_scale_denominator;
+      else
+        new_denom = superres_cfg->superres_scale_denominator;
+      break;
+    case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    case AOM_SUPERRES_QTHRESH: {
+      // Do not use superres when screen content tools are used.
+      if (cpi->common.features.allow_screen_content_tools) break;
+      if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+        av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+      // Now decide the use of superres based on 'q'.
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+          &bottom_index, &top_index);
+
+      const int qthresh = (frame_is_intra_only(&cpi->common))
+                              ? superres_cfg->superres_kf_qthresh
+                              : superres_cfg->superres_qthresh;
+      if (q <= qthresh) {
+        new_denom = SCALE_NUMERATOR;
+      } else {
+        new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+      }
+      break;
+    }
+    case AOM_SUPERRES_AUTO: {
+      if (cpi->common.features.allow_screen_content_tools) break;
+      if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+        av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+      // Now decide the use of superres based on 'q'.
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+          &bottom_index, &top_index);
+
+      const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
+          cpi->sf.hl_sf.superres_auto_search_type;
+      const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0;
+      if (q <= qthresh) {
+        new_denom = SCALE_NUMERATOR;  // Don't use superres.
+      } else {
+        if (sr_search_type == SUPERRES_AUTO_ALL) {
+          if (cpi->common.current_frame.frame_type == KEY_FRAME)
+            new_denom = superres_cfg->superres_kf_scale_denominator;
+          else
+            new_denom = superres_cfg->superres_scale_denominator;
+        } else {
+          new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+        }
+      }
+      break;
+    }
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+  return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+  // Only need to check the width, as scaling is horizontal only.
+  (void)oheight;
+  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
+}
+
+static int validate_size_scales(RESIZE_MODE resize_mode,
+                                aom_superres_mode superres_mode, int owidth,
+                                int oheight, size_params_type *rsz) {
+  if (dimensions_are_ok(owidth, oheight, rsz)) {  // Nothing to do.
+    return 1;
+  }
+
+  // Calculate current resize scale.
+  int resize_denom =
+      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+  if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) {
+    // Alter superres scale as needed to enforce conformity.
+    rsz->superres_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+    }
+  } else if (resize_mode == RESIZE_RANDOM &&
+             superres_mode != AOM_SUPERRES_RANDOM) {
+    // Alter resize scale as needed to enforce conformity.
+    resize_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+    rsz->resize_width = owidth;
+    rsz->resize_height = oheight;
+    av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                              resize_denom);
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (resize_denom > SCALE_NUMERATOR) {
+        --resize_denom;
+        rsz->resize_width = owidth;
+        rsz->resize_height = oheight;
+        av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                  resize_denom);
+      }
+    }
+  } else if (resize_mode == RESIZE_RANDOM &&
+             superres_mode == AOM_SUPERRES_RANDOM) {
+    // Alter both resize and superres scales as needed to enforce conformity.
+    do {
+      if (resize_denom > rsz->superres_denom)
+        --resize_denom;
+      else
+        --rsz->superres_denom;
+      rsz->resize_width = owidth;
+      rsz->resize_height = oheight;
+      av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                resize_denom);
+    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+             (resize_denom > SCALE_NUMERATOR ||
+              rsz->superres_denom > SCALE_NUMERATOR));
+  } else {  // We are allowed to alter neither resize scale nor superres
+            // scale.
+    return 0;
+  }
+  return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
+                           SCALE_NUMERATOR };
+  int resize_denom = SCALE_NUMERATOR;
+  if (has_no_stats_stage(cpi) && cpi->ppi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
+    rsz.resize_width = cpi->common.width;
+    rsz.resize_height = cpi->common.height;
+    return rsz;
+  }
+  if (is_stat_generation_stage(cpi)) return rsz;
+  if (resize_pending_params->width && resize_pending_params->height) {
+    rsz.resize_width = resize_pending_params->width;
+    rsz.resize_height = resize_pending_params->height;
+    resize_pending_params->width = resize_pending_params->height = 0;
+    if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz;
+  } else {
+    resize_denom = calculate_next_resize_scale(cpi);
+    rsz.resize_width = frm_dim_cfg->width;
+    rsz.resize_height = frm_dim_cfg->height;
+    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+                              resize_denom);
+  }
+  rsz.superres_denom = calculate_next_superres_scale(cpi);
+  if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode,
+                            frm_dim_cfg->width, frm_dim_cfg->height, &rsz))
+    assert(0 && "Invalid scale parameters");
+  return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+                                         const size_params_type *rsz) {
+  int encode_width = rsz->resize_width;
+  int encode_height = rsz->resize_height;
+
+  AV1_COMMON *cm = &cpi->common;
+  cm->superres_upscaled_width = encode_width;
+  cm->superres_upscaled_height = encode_height;
+  cm->superres_scale_denominator = rsz->superres_denom;
+  av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+                                     rsz->superres_denom);
+  av1_set_frame_size(cpi, encode_width, encode_height);
+}
+
+void av1_setup_frame_size(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  // Reset superres params from previous frame.
+  cm->superres_scale_denominator = SCALE_NUMERATOR;
+  const size_params_type rsz = calculate_next_size_params(cpi);
+  setup_frame_size_from_params(cpi, &rsz);
+
+  assert(av1_is_min_tile_width_satisfied(cm));
+}
+
+void av1_superres_post_encode(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (!av1_superres_scaled(cm)) return;
+
+  assert(cpi->oxcf.superres_cfg.enable_superres);
+  assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
+  assert(!cm->features.all_lossless);
+
+  av1_superres_upscale(cm, NULL);
+
+  // If regular resizing is occurring the source will need to be downscaled to
+  // match the upscaled superres resolution. Otherwise the original source is
+  // used.
+  if (!av1_resize_scaled(cm)) {
+    cpi->source = cpi->unscaled_source;
+    if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+  } else {
+    assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+    assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+    // Do downscale. cm->(width|height) has been updated by
+    // av1_superres_upscale
+    cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width,
+                                           cm->superres_upscaled_height);
+  }
+}
diff --git a/media/libaom/src/av1/common/cdef_block_neon.c b/media/libaom/src/av1/encoder/superres_scale.h
index 2d6bc65e31..450a4ed902 100644
--- a/media/libaom/src/av1/common/cdef_block_neon.c
+++ b/media/libaom/src/av1/encoder/superres_scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,6 +9,20 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_neon
-#include "av1/common/cdef_block_simd.h"
+#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi);
+void av1_superres_post_encode(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_SUPERRES_SCALE_H_
diff --git a/media/libaom/src/av1/encoder/svc_layercontext.c b/media/libaom/src/av1/encoder/svc_layercontext.c
index b72d8aa733..d0b7d235b5 100644
--- a/media/libaom/src/av1/encoder/svc_layercontext.c
+++ b/media/libaom/src/av1/encoder/svc_layercontext.c
@@ -11,6 +11,7 @@
 #include <math.h>
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
 
 static void swap_ptr(void *a, void *b) {
   void **a_p = (void **)a;
@@ -28,65 +29,70 @@ void av1_init_layer_context(AV1_COMP *const cpi) {
   int mi_cols = cpi->common.mi_params.mi_cols;
   svc->base_framerate = 30.0;
   svc->current_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 1;
+  svc->num_encoded_top_layer = 0;
+  svc->use_flexible_mode = 0;
 
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
     for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       RATE_CONTROL *const lrc = &lc->rc;
-      lrc->ni_av_qi = oxcf->worst_allowed_q;
-      lrc->total_actual_bits = 0;
-      lrc->total_target_vs_actual = 0;
+      PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+      lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
+      lp_rc->total_actual_bits = 0;
       lrc->ni_tot_qi = 0;
-      lrc->tot_q = 0.0;
-      lrc->avg_q = 0.0;
-      lrc->ni_frames = 0;
+      lp_rc->tot_q = 0.0;
+      lp_rc->avg_q = 0.0;
+      lp_rc->ni_frames = 0;
       lrc->decimation_count = 0;
       lrc->decimation_factor = 0;
       lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
       lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+      lrc->rtc_external_ratectrl = 0;
       for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-        lrc->rate_correction_factors[i] = 1.0;
+        lp_rc->rate_correction_factors[i] = 1.0;
       }
       lc->target_bandwidth = lc->layer_target_bitrate;
-      lrc->last_q[INTER_FRAME] = lrc->worst_quality;
-      lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
-      lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
-      lrc->buffer_level =
-          oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
-      lrc->bits_off_target = lrc->buffer_level;
+      lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lp_rc->buffer_level =
+          oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
+      lp_rc->bits_off_target = lp_rc->buffer_level;
       // Initialize the cyclic refresh parameters. If spatial layers are used
       // (i.e., ss_number_layers > 1), these need to be updated per spatial
       // layer. Cyclic refresh is only applied on base temporal layer.
       if (svc->number_spatial_layers > 1 && tl == 0) {
-        size_t last_coded_q_map_size;
         lc->sb_index = 0;
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
+        if (lc->map) aom_free(lc->map);
         CHECK_MEM_ERROR(cm, lc->map,
-                        aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
-        memset(lc->map, 0, mi_rows * mi_cols);
-        last_coded_q_map_size =
-            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
-                        aom_malloc(last_coded_q_map_size));
-        assert(MAXQ <= 255);
-        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+                        aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
       }
     }
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;
   }
+  if (svc->number_spatial_layers == 3) {
+    svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
+  }
+  svc->ref_frame_comp[0] = 0;
+  svc->ref_frame_comp[1] = 0;
+  svc->ref_frame_comp[2] = 0;
 }
 
 // Update the layer context from a change_config() call.
 void av1_update_layer_context_change_config(AV1_COMP *const cpi,
                                             const int64_t target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   SVC *const svc = &cpi->svc;
   int layer = 0;
   int64_t spatial_layer_target = 0;
   float bitrate_alloc = 1.0;
-
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
     for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
       layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
@@ -98,26 +104,37 @@ void av1_update_layer_context_change_config(AV1_COMP *const cpi,
       LAYER_CONTEXT *const lc =
           &svc->layer_context[sl * svc->number_temporal_layers + tl];
       RATE_CONTROL *const lrc = &lc->rc;
+      PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
       lc->spatial_layer_target_bandwidth = spatial_layer_target;
       bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
-      lrc->starting_buffer_level =
-          (int64_t)(rc->starting_buffer_level * bitrate_alloc);
-      lrc->optimal_buffer_level =
-          (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
-      lrc->maximum_buffer_size =
-          (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
-      lrc->bits_off_target =
-          AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-      lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+      lp_rc->starting_buffer_level =
+          (int64_t)(p_rc->starting_buffer_level * bitrate_alloc);
+      lp_rc->optimal_buffer_level =
+          (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc);
+      lp_rc->maximum_buffer_size =
+          (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc);
+      lp_rc->bits_off_target =
+          AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+      lp_rc->buffer_level =
+          AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size);
       lc->framerate = cpi->framerate / lc->framerate_factor;
-      lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+      lrc->avg_frame_bandwidth =
+          (int)round(lc->target_bandwidth / lc->framerate);
       lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+      lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl;
       lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
       lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
     }
   }
 }
 
+/*!\brief Return layer context for current layer.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi   Top level encoder structure
+ *
+ * \return LAYER_CONTEXT for current layer.
+ */
 static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
   return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                      cpi->svc.number_temporal_layers +
@@ -130,7 +147,7 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
   RATE_CONTROL *const lrc = &lc->rc;
   const int tl = svc->temporal_layer_id;
   lc->framerate = cpi->framerate / lc->framerate_factor;
-  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
   if (tl == 0) {
@@ -143,73 +160,75 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
         cpi->framerate / lcprev->framerate_factor;
     const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate;
     lc->avg_frame_size =
-        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
-              (lc->framerate - prev_layer_framerate));
+        (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+                   (lc->framerate - prev_layer_framerate));
   }
 }
 
 void av1_restore_layer_context(AV1_COMP *const cpi) {
-  GF_GROUP *const gf_group = &cpi->gf_group;
   SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
   LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
   // Restore layer rate control.
   cpi->rc = lc->rc;
-  cpi->oxcf.target_bandwidth = lc->target_bandwidth;
-  gf_group->index = lc->group_index;
+  cpi->ppi->p_rc = lc->p_rc;
+  cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
+  cpi->gf_frame_index = 0;
+  cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
+  if (cpi->mv_search_params.max_mv_magnitude == 0)
+    cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   cpi->rc.frames_since_key = old_frame_since_key;
   cpi->rc.frames_to_key = old_frame_to_key;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     swap_ptr(&cr->map, &lc->map);
-    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
     cr->sb_index = lc->sb_index;
     cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
     cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
   }
-  svc->skip_nonzeromv_last = 0;
-  svc->skip_nonzeromv_gf = 0;
-  // For each reference (LAST/GOLDEN) set the skip_nonzero_last/gf frame flags.
-  // This is to skip testing nonzero-mv for that reference if it was last
+  svc->skip_mvsearch_last = 0;
+  svc->skip_mvsearch_gf = 0;
+  // For each reference (LAST/GOLDEN) set the skip_mvsearch_last/gf frame flags.
+  // This is to skip searching mv for that reference if it was last
   // refreshed (i.e., buffer slot holding that reference was refreshed) on the
-  // previous spatial layer at the same time (current_superframe).
-  if (svc->external_ref_frame_config) {
+  // previous spatial layer(s) at the same time (current_superframe).
+  if (svc->set_ref_frame_config && svc->force_zero_mode_spatial_ref) {
     int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1];
     if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
-        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
-      svc->skip_nonzeromv_last = 1;
+        svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1)
+      svc->skip_mvsearch_last = 1;
     ref_frame_idx = svc->ref_idx[GOLDEN_FRAME - 1];
     if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
-        svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1)
-      svc->skip_nonzeromv_gf = 1;
+        svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1)
+      svc->skip_mvsearch_gf = 1;
   }
 }
 
 void av1_save_layer_context(AV1_COMP *const cpi) {
-  GF_GROUP *const gf_group = &cpi->gf_group;
   SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
   LAYER_CONTEXT *lc = get_layer_context(cpi);
   lc->rc = cpi->rc;
-  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
-  lc->group_index = gf_group->index;
+  lc->p_rc = cpi->ppi->p_rc;
+  lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth;
+  lc->group_index = cpi->gf_frame_index;
+  lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
   if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = lc->map;
-    uint8_t *temp2 = lc->last_coded_q_map;
     lc->map = cr->map;
     cr->map = temp;
-    lc->last_coded_q_map = cr->last_coded_q_map;
-    cr->last_coded_q_map = temp2;
     lc->sb_index = cr->sb_index;
     lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
     lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
@@ -222,7 +241,7 @@ void av1_save_layer_context(AV1_COMP *const cpi) {
       svc->buffer_time_index[i] = svc->current_superframe;
       svc->buffer_spatial_layer[i] = svc->spatial_layer_id;
     }
-  } else if (cpi->svc.external_ref_frame_config) {
+  } else if (cpi->svc.set_ref_frame_config) {
     for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
       int ref_frame_map_idx = svc->ref_idx[i];
       if (cpi->svc.refresh[ref_frame_map_idx]) {
@@ -231,10 +250,36 @@ void av1_save_layer_context(AV1_COMP *const cpi) {
       }
     }
   }
+  for (unsigned int i = 0; i < REF_FRAMES; i++) {
+    if (frame_is_intra_only(cm) ||
+        cm->current_frame.refresh_frame_flags & (1 << i)) {
+      svc->spatial_layer_fb[i] = svc->spatial_layer_id;
+      svc->temporal_layer_fb[i] = svc->temporal_layer_id;
+    }
+  }
   if (svc->spatial_layer_id == svc->number_spatial_layers - 1)
     svc->current_superframe++;
 }
 
+int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
+  const SVC *const svc = &cpi->svc;
+  const AV1_COMMON *const cm = &cpi->common;
+  int fb_idx = -1;
+  int primary_ref_frame = PRIMARY_REF_NONE;
+  // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
+  // was last updated on a lower temporal layer (or base TL0) and for the
+  // same spatial layer. For RTC patterns this allows for continued decoding
+  // when set of enhancement layers are dropped (continued decoding starting
+  // at next base TL0), so error_resilience can be off/0 for all layers.
+  fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+  if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id &&
+      (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id ||
+       svc->temporal_layer_fb[fb_idx] == 0)) {
+    primary_ref_frame = 0;  // LAST_FRAME
+  }
+  return primary_ref_frame;
+}
+
 void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
@@ -242,12 +287,10 @@ void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       if (lc->map) aom_free(lc->map);
-      if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
     }
   }
 }
 
-// Reset on key frame: reset counters, references and buffer updates.
 void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
   SVC *const svc = &cpi->svc;
   LAYER_CONTEXT *lc = NULL;
@@ -261,9 +304,9 @@ void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
   av1_restore_layer_context(cpi);
 }
 
-static void get_layer_resolution(const int width_org, const int height_org,
-                                 const int num, const int den, int *width_out,
-                                 int *height_out) {
+void av1_get_layer_resolution(const int width_org, const int height_org,
+                              const int num, const int den, int *width_out,
+                              int *height_out) {
   int w, h;
   if (width_out == NULL || height_out == NULL || den == 0) return;
   w = width_org * num / den;
@@ -281,8 +324,185 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
   int width = 0, height = 0;
   lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
                            svc->temporal_layer_id];
-  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
-                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
-                       &height);
-  av1_set_size_literal(cpi, width, height);
+  av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+                           cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+                           lc->scaling_factor_den, &width, &height);
+  // Use Eightap_smooth for low resolutions.
+  if (width * height <= 320 * 240)
+    svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+
+  cpi->common.width = width;
+  cpi->common.height = height;
+  alloc_mb_mode_info_buffers(cpi);
+  av1_update_frame_size(cpi);
+  if (svc->spatial_layer_id == 0) svc->high_source_sad_superframe = 0;
+}
+
+enum {
+  SVC_LAST_FRAME = 0,
+  SVC_LAST2_FRAME,
+  SVC_LAST3_FRAME,
+  SVC_GOLDEN_FRAME,
+  SVC_BWDREF_FRAME,
+  SVC_ALTREF2_FRAME,
+  SVC_ALTREF_FRAME
+};
+
+// For fixed svc mode: fixed pattern is set based on the number of
+// spatial and temporal layers, and the ksvc_fixed_mode.
+void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int i;
+  assert(svc->use_flexible_mode == 0);
+  // Fixed SVC mode only supports at most 3 spatial or temporal layers.
+  assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
+         svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
+  svc->set_ref_frame_config = 1;
+  int superframe_cnt = svc->current_superframe;
+  // Set the reference map buffer idx for the 7 references:
+  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = i;
+  for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->reference[i] = 0;
+  for (i = 0; i < REF_FRAMES; i++) svc->refresh[i] = 0;
+  // Always reference LAST, and reference GOLDEN on SL > 0.
+  // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
+  // when frame_type is set.
+  svc->reference[SVC_LAST_FRAME] = 1;
+  if (svc->spatial_layer_id > 0) svc->reference[SVC_GOLDEN_FRAME] = 1;
+  if (svc->temporal_layer_id == 0) {
+    // Base temporal layer.
+    if (svc->spatial_layer_id == 0) {
+      // Set all buffer_idx to 0. Update slot 0 (LAST).
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      svc->refresh[0] = 1;
+    } else if (svc->spatial_layer_id == 1) {
+      // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
+      // slot 0. Update slot 1 (LAST).
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      svc->ref_idx[SVC_LAST_FRAME] = 1;
+      svc->refresh[1] = 1;
+    } else if (svc->spatial_layer_id == 2) {
+      // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
+      // slot 1. Update slot 2 (LAST).
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 1;
+      svc->ref_idx[SVC_LAST_FRAME] = 2;
+      svc->refresh[2] = 1;
+    }
+  } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
+    // First top temporal enhancement layer.
+    if (svc->spatial_layer_id == 0) {
+      // Reference LAST (slot 0).
+      // Set GOLDEN to slot 3 and update slot 3.
+      // Set all other buffer_idx to slot 0.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+        svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
+        svc->refresh[3] = 1;
+      }
+    } else if (svc->spatial_layer_id == 1) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+      // GOLDEN (and all other refs) to slot 3.
+      // Set LAST2 to slot 4 and Update slot 4.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 3;
+      svc->ref_idx[SVC_LAST_FRAME] = 1;
+      if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+        svc->ref_idx[SVC_LAST2_FRAME] = 4;
+        svc->refresh[4] = 1;
+      }
+    } else if (svc->spatial_layer_id == 2) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+      // GOLDEN (and all other refs) to slot 4.
+      // No update.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 4;
+      svc->ref_idx[SVC_LAST_FRAME] = 2;
+    }
+  } else if (svc->temporal_layer_id == 1) {
+    // Middle temporal enhancement layer.
+    if (svc->spatial_layer_id == 0) {
+      // Reference LAST.
+      // Set all buffer_idx to 0.
+      // Set GOLDEN to slot 5 and update slot 5.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+        svc->ref_idx[SVC_GOLDEN_FRAME] = 5;
+        svc->refresh[5] = 1;
+      }
+    } else if (svc->spatial_layer_id == 1) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+      // GOLDEN (and all other refs) to slot 5.
+      // Set LAST3 to slot 6 and update slot 6.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 5;
+      svc->ref_idx[SVC_LAST_FRAME] = 1;
+      if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+        svc->ref_idx[SVC_LAST3_FRAME] = 6;
+        svc->refresh[6] = 1;
+      }
+    } else if (svc->spatial_layer_id == 2) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+      // GOLDEN (and all other refs) to slot 6.
+      // Set LAST3 to slot 7 and update slot 7.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 6;
+      svc->ref_idx[SVC_LAST_FRAME] = 2;
+      if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+        svc->ref_idx[SVC_LAST3_FRAME] = 7;
+        svc->refresh[7] = 1;
+      }
+    }
+  } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) {
+    // Second top temporal enhancement layer.
+    if (svc->spatial_layer_id == 0) {
+      // Set LAST to slot 5 and reference LAST.
+      // Set GOLDEN to slot 3 and update slot 3.
+      // Set all other buffer_idx to 0.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      svc->ref_idx[SVC_LAST_FRAME] = 5;
+      if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+        svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
+        svc->refresh[3] = 1;
+      }
+    } else if (svc->spatial_layer_id == 1) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+      // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      svc->ref_idx[SVC_LAST_FRAME] = 6;
+      svc->ref_idx[SVC_GOLDEN_FRAME] = 3;
+      if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+        svc->ref_idx[SVC_LAST2_FRAME] = 4;
+        svc->refresh[4] = 1;
+      }
+    } else if (svc->spatial_layer_id == 2) {
+      // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+      // GOLDEN to slot 4. No update.
+      for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0;
+      svc->ref_idx[SVC_LAST_FRAME] = 7;
+      svc->ref_idx[SVC_GOLDEN_FRAME] = 4;
+    }
+  }
+}
+
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+                                 svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    RATE_CONTROL *lrc = &lc->rc;
+    if (lrc->avg_frame_bandwidth > (3 * lrc->prev_avg_frame_bandwidth >> 1) ||
+        lrc->avg_frame_bandwidth < (lrc->prev_avg_frame_bandwidth >> 1)) {
+      // Reset for all temporal layers with spatial layer sl.
+      for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+        int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+        RATE_CONTROL *lrc2 = &lc2->rc;
+        PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc;
+        PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc;
+        lrc2->rc_1_frame = 0;
+        lrc2->rc_2_frame = 0;
+        lp_rc2->bits_off_target = lp_rc->optimal_buffer_level;
+        lp_rc2->buffer_level = lp_rc->optimal_buffer_level;
+      }
+    }
+  }
 }
diff --git a/media/libaom/src/av1/encoder/svc_layercontext.h b/media/libaom/src/av1/encoder/svc_layercontext.h
index 7cb85a3c93..dc6906d438 100644
--- a/media/libaom/src/av1/encoder/svc_layercontext.h
+++ b/media/libaom/src/av1/encoder/svc_layercontext.h
@@ -19,8 +19,14 @@
 extern "C" {
 #endif
 
+/*!
+ * \brief The stucture of quantities related to each spatial and temporal layer.
+ * \ingroup SVC
+ */
 typedef struct {
+  /*!\cond */
   RATE_CONTROL rc;
+  PRIMARY_RATE_CONTROL p_rc;
   int framerate_factor;
   int64_t layer_target_bitrate;
   int scaling_factor_num;
@@ -32,66 +38,245 @@ typedef struct {
   int max_q;
   int min_q;
   int frames_from_key_frame;
-  // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  /*!\endcond */
+
+  /*!
+   * Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+   */
   int sb_index;
+  /*!
+   * Segmentation map
+   */
   int8_t *map;
-  uint8_t *last_coded_q_map;
+  /*!
+   * Number of blocks on segment 1
+   */
   int actual_num_seg1_blocks;
+
+  /*!
+   * Number of blocks on segment 2
+   */
   int actual_num_seg2_blocks;
+  /*!
+   * Counter used to detect scene change.
+   */
   int counter_encode_maxq_scene_change;
+
+  /*!
+   * Speed settings for each layer.
+   */
   uint8_t speed;
+  /*!
+   * GF group index.
+   */
   unsigned char group_index;
+  /*!
+   * If current layer is key frame.
+   */
+  int is_key_frame;
+  /*!
+   * Maximum motion magnitude of previous encoded layer.
+   */
+  int max_mv_magnitude;
 } LAYER_CONTEXT;
 
+/*!
+ * \brief The stucture of SVC.
+ * \ingroup SVC
+ */
 typedef struct SVC {
+  /*!\cond */
   int spatial_layer_id;
   int temporal_layer_id;
   int number_spatial_layers;
   int number_temporal_layers;
-  int external_ref_frame_config;
+  int set_ref_frame_config;
   int non_reference_frame;
-  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
-  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  int use_flexible_mode;
+  int ksvc_fixed_mode;
+  int ref_frame_comp[3];
+  /*!\endcond */
+
+  /*!
+   * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+   * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+   */
   int reference[INTER_REFS_PER_FRAME];
+  /*!\cond */
   int ref_idx[INTER_REFS_PER_FRAME];
   int refresh[REF_FRAMES];
+  int gld_idx_1layer;
   double base_framerate;
   unsigned int current_superframe;
   unsigned int buffer_time_index[REF_FRAMES];
   unsigned char buffer_spatial_layer[REF_FRAMES];
-  int skip_nonzeromv_last;
-  int skip_nonzeromv_gf;
-  // Layer context used for rate control in one pass temporal CBR mode or
-  // two pass spatial mode.
+  int skip_mvsearch_last;
+  int skip_mvsearch_gf;
+  int spatial_layer_fb[REF_FRAMES];
+  int temporal_layer_fb[REF_FRAMES];
+  int num_encoded_top_layer;
+  int first_layer_denoise;
+  int high_source_sad_superframe;
+  /*!\endcond */
+
+  /*!
+   * Layer context used for rate control in CBR mode.
+   */
   LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+
+  /*!
+   * EIGHTTAP_SMOOTH or BILINEAR
+   */
+  InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+  /*!
+   * Downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+   * = 8 will center the target pixel and get a symmetric averaging filter.
+   */
+  int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+  /*!
+   * Force zero-mv in mode search for the spatial/inter-layer reference.
+   */
+  int force_zero_mode_spatial_ref;
 } SVC;
 
 struct AV1_COMP;
 
-// Initialize layer context data from init_config().
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Set cpi->svc.
+ */
 void av1_init_layer_context(struct AV1_COMP *const cpi);
 
-// Update the layer context from a change_config() call.
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ * \param[in]       target_bandwidth  Total target bandwidth
+ *
+ * \return  Nothing returned. Buffer level for each layer is set.
+ */
 void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
                                             const int64_t target_bandwidth);
 
-// Prior to encoding the frame, update framerate-related quantities
-// for the current temporal layer.
+/*!\brief Prior to encoding the frame, update framerate-related quantities
+          for the current temporal layer.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Frame related quantities for current temporal
+ layer are updated.
+ */
 void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
 
-// Prior to encoding the frame, set the layer context, for the current layer
-// to be encoded, to the cpi struct.
+/*!\brief Prior to encoding the frame, set the layer context, for the current
+ layer to be encoded, to the cpi struct.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned. Layer context for current layer is set.
+ */
 void av1_restore_layer_context(struct AV1_COMP *const cpi);
 
-// Save the layer context after encoding the frame.
+/*!\brief Save the layer context after encoding the frame.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_save_layer_context(struct AV1_COMP *const cpi);
 
+/*!\brief Free the memory used for cyclic refresh in layer context.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
 
+/*!\brief Reset on key frame: reset counters, references and buffer updates.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ * \param[in]       is_key  Whether current layer is key frame
+ *
+ * \return  Nothing returned.
+ */
 void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
 
+/*!\brief Before encoding, set resolutions and allocate compressor data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  Nothing returned.
+ */
 void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
 
+/*!\brief Get primary reference frame for current layer
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi  Top level encoder structure
+ *
+ * \return  The primary reference frame for current layer.
+ */
+int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
+
+/*!\brief Get resolution for current layer.
+ *
+ * \ingroup SVC
+ * \param[in]       width_org    Original width, unscaled
+ * \param[in]       height_org   Original height, unscaled
+ * \param[in]       num          Numerator for the scale ratio
+ * \param[in]       den          Denominator for the scale ratio
+ * \param[in]       width_out    Output width, scaled for current layer
+ * \param[in]       height_out   Output height, scaled for current layer
+ *
+ * \return Nothing is returned. Instead the scaled width and height are set.
+ */
+void av1_get_layer_resolution(const int width_org, const int height_org,
+                              const int num, const int den, int *width_out,
+                              int *height_out);
+
+void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
+
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/temporal_filter.c b/media/libaom/src/av1/encoder/temporal_filter.c
index a637df559e..5f76b1890e 100644
--- a/media/libaom/src/av1/encoder/temporal_filter.c
+++ b/media/libaom/src/av1/encoder/temporal_filter.c
@@ -13,56 +13,82 @@
 #include <limits.h>
 
 #include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
 #include "av1/common/alloccommon.h"
 #include "av1/common/av1_common_int.h"
-#include "av1/common/odintrin.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
 #include "av1/encoder/mcomp.h"
+#include "av1/encoder/pass2_strategy.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/temporal_filter.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_scale/aom_scale.h"
+
+/*!\cond */
 
 // NOTE: All `tf` in this file means `temporal filtering`.
 
-// Does motion search for blocks in temporal filtering. This is the first step
-// for temporal filtering. More specifically, given a frame to be filtered and
-// another frame as reference, this function searches the reference frame to
-// find out the most alike block as that from the frame to be filtered. This
-// found block will be further used for weighted averaging.
-// NOTE: Besides doing motion search for the entire block, this function will
-// also do motion search for each 1/4 sub-block to get more precise prediction.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   frame_to_filter: Pointer to the frame to be filtered.
-//   ref_frame: Pointer to the reference frame.
-//   block_size: Block size used for motion search.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   ref_mv: Reference motion vector, which is commonly inherited from the
-//           motion search result of previous frame.
-//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-// Returns:
-//   Search error (MSE) of the entire block.
-static int tf_motion_search(AV1_COMP *cpi,
-                            const YV12_BUFFER_CONFIG *frame_to_filter,
-                            const YV12_BUFFER_CONFIG *ref_frame,
-                            const BLOCK_SIZE block_size, const int mb_row,
-                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
-                            int *subblock_mses) {
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses);
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ *  the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame to find out the most similar block as that from the frame
+ * to be filtered. This found block will be further used for weighted
+ * averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ *       also do motion search for each 1/4 sub-block to get more precise
+ *       predictions. Then, this function will determines whether to use 4
+ *       sub-blocks to replace the entire block. If we do need to split the
+ *       entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ *       the searched motion vector and search error (MSE) w.r.t. each sub-block
+ *       respectively. Otherwise, the 4 elements will be the same, all of which
+ *       are assigned as the searched motion vector and search error (MSE) for
+ *       the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi             Top level encoder instance structure
+ * \param[in]   mb              Pointer to macroblock
+ * \param[in]   frame_to_filter Pointer to the frame to be filtered
+ * \param[in]   ref_frame       Pointer to the reference frame
+ * \param[in]   block_size      Block size used for motion search
+ * \param[in]   mb_row          Row index of the block in the frame
+ * \param[in]   mb_col          Column index of the block in the frame
+ * \param[in]   ref_mv          Reference motion vector, which is commonly
+ *                              inherited from the motion search result of
+ *                              previous frame.
+ * \param[out]  subblock_mvs    Pointer to the motion vectors for 4 sub-blocks
+ * \param[out]  subblock_mses   Pointer to the search errors (MSE) for 4
+ *                              sub-blocks
+ *
+ * \return Nothing will be returned. Results are saved in subblock_mvs and
+ *         subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+                             const YV12_BUFFER_CONFIG *frame_to_filter,
+                             const YV12_BUFFER_CONFIG *ref_frame,
+                             const BLOCK_SIZE block_size, const int mb_row,
+                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                             int *subblock_mses) {
   // Frame information
   const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
 
@@ -75,19 +101,16 @@ static int tf_motion_search(AV1_COMP *cpi,
   const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
 
   // Save input state.
-  MACROBLOCK *const mb = &cpi->td.mb;
   MACROBLOCKD *const mbd = &mb->e_mbd;
   const struct buf_2d ori_src_buf = mb->plane[0].src;
   const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
-  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;
 
   // Parameters used for motion search.
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   SUBPEL_MOTION_SEARCH_PARAMS ms_params;
-
-  const search_site_config ss_cfg =
-      cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
-  const SEARCH_METHODS full_search_method = NSTEP;
+  const SEARCH_METHODS search_method = NSTEP;
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
   const int step_param = av1_init_search_range(
       AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
   const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
@@ -113,45 +136,54 @@ static int tf_motion_search(AV1_COMP *cpi,
   int cost_list[5];
 
   // Do motion search.
-  // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the
-  // searched result will be stored in `mb->best_mv`.
-  int_mv best_mv;
+  int_mv best_mv;  // Searched motion vector.
   int block_mse = INT_MAX;
-  mb->mv_cost_type = mv_cost_type;
+  MV block_mv = kZeroMv;
+  const int q = av1_get_q(cpi);
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
-                                     &baseline_mv, &ss_cfg);
+                                     &baseline_mv, search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
   full_ms_params.run_mesh_search = 1;
-  full_ms_params.search_method = full_search_method;
+  full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+  if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+    // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+    full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+    full_ms_params.mesh_search_mv_diff_threshold = 2;
+  }
+
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
                         NULL);
 
-  // Since we are merely refining the result from full pixel search, we don't
-  // need regularization for subpel search
-  mb->mv_cost_type = MV_COST_NONE;
   if (force_integer_mv == 1) {  // Only do full search on the entire block.
     const int mv_row = best_mv.as_mv.row;
     const int mv_col = best_mv.as_mv.col;
     best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
     best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
     const int mv_offset = mv_row * y_stride + mv_col;
-    error = cpi->fn_ptr[block_size].vf(
+    error = cpi->ppi->fn_ptr[block_size].vf(
         ref_frame->y_buffer + y_offset + mv_offset, y_stride,
         frame_to_filter->y_buffer + y_offset, y_stride, &sse);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
   } else {  // Do fractional search on the entire block and all sub-blocks.
     av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                       &baseline_mv, cost_list);
     ms_params.forced_stop = EIGHTH_PEL;
     ms_params.var_params.subpel_search_type = subpel_search_type;
+    // Since we are merely refining the result from full pixel search, we don't
+    // need regularization for subpel search
+    ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+
     MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     error = cpi->mv_search_params.find_fractional_mv_step(
         &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
         &distortion, &sse, NULL);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
     *ref_mv = best_mv.as_mv;
     // On 4 sub-blocks.
     const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
@@ -166,23 +198,33 @@ static int tf_motion_search(AV1_COMP *cpi,
         const int offset = i * y_stride + j;
         mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
         mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
-        mb->mv_cost_type = mv_cost_type;
-
-        av1_make_default_fullpel_ms_params(
-            &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg);
+        av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
+                                           subblock_size, &baseline_mv,
+                                           search_site_cfg,
+                                           /*fine_search_interval=*/0);
+        av1_set_mv_search_method(&full_ms_params, search_site_cfg,
+                                 search_method);
         full_ms_params.run_mesh_search = 1;
-        full_ms_params.search_method = full_search_method;
+        full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+        if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+          // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+          full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+          full_ms_params.mesh_search_mv_diff_threshold = 2;
+        }
+
         av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                               cond_cost_list(cpi, cost_list),
                               &best_mv.as_fullmv, NULL);
 
-        // Since we are merely refining the result from full pixel search, we
-        // don't need regularization for subpel search
-        mb->mv_cost_type = MV_COST_NONE;
         av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
                                           &baseline_mv, cost_list);
         ms_params.forced_stop = EIGHTH_PEL;
         ms_params.var_params.subpel_search_type = subpel_search_type;
+        // Since we are merely refining the result from full pixel search, we
+        // don't need regularization for subpel search
+        ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+
         subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
         error = cpi->mv_search_params.find_fractional_mv_step(
             &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
@@ -197,72 +239,54 @@ static int tf_motion_search(AV1_COMP *cpi,
   // Restore input state.
   mb->plane[0].src = ori_src_buf;
   mbd->plane[0].pre[0] = ori_pre_buf;
-  mb->mv_cost_type = ori_mv_cost_type;
 
-  return block_mse;
-}
+  // Make partition decision.
+  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+                               subblock_mses);
 
-// Helper function to get weight according to thresholds.
-static INLINE int get_weight_by_thresh(const int value, const int low,
-                                       const int high) {
-  return value < low ? 2 : value < high ? 1 : 0;
+  // Do not pass down the reference motion vector if error is too large.
+  const int thresh = (min_frame_size >= 720) ? 12 : 3;
+  if (block_mse > (thresh << (mbd->bd - 8))) {
+    *ref_mv = kZeroMv;
+  }
 }
+/*!\cond */
 
-// Gets filter weight for blocks in temporal filtering. The weights will be
-// assigned based on the motion search errors.
-// NOTE: Besides assigning filter weight for the block, this function will also
-// determine whether to split the entire block into 4 sub-blocks for further
-// filtering.
-// TODO(any): Many magic numbers are used in this function. They may be tuned
-// to improve the performance.
+// Determines whether to split the entire block to 4 sub-blocks for filtering.
+// In particular, this decision is made based on the comparison between the
+// motion search error of the entire block and the errors of all sub-blocks.
 // Inputs:
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  will affect the filter weight for the to-filter frame.
-//   subblock_filter_weights: Pointer to the assigned filter weight for each
-//                            sub-block. If not using sub-blocks, the first
-//                            element will be used for the entire block.
-// Returns: Whether to use 4 sub-blocks to replace the original block.
-static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
-                                const int is_second_arf,
-                                int *subblock_filter_weights) {
-  // `block_mse` is initialized as INT_MAX and will be overwritten after the
-  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
-  // by to-filter frame.
-  if (block_mse == INT_MAX) {
-    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
-                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
-                           : is_second_arf ? 64 : 32;
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
-  }
-
-  const int thresh_low = is_second_arf ? 20 : 40;
-  const int thresh_high = is_second_arf ? 40 : 80;
-
+//   block_mv: Motion vector for the entire block (ONLY as reference).
+//   block_mse: Motion search error (MSE) for the entire block (ONLY as
+//              reference).
+//   subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+//                 modified based on the partition decision).
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+//                  be modified based on the partition decision).
+// Returns:
+//   Nothing will be returned. Results are saved in `subblock_mvs` and
+//   `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses) {
   int min_subblock_mse = INT_MAX;
   int max_subblock_mse = INT_MIN;
-  int sum_subblock_mse = 0;
+  int64_t sum_subblock_mse = 0;
   for (int i = 0; i < 4; ++i) {
     sum_subblock_mse += subblock_mses[i];
     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
-    subblock_filter_weights[i] =
-        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
   }
 
+  // TODO(any): The following magic numbers may be tuned to improve the
+  // performance OR find a way to get rid of these magic numbers.
   if (((block_mse * 15 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 48) ||
       ((block_mse * 14 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 24)) {  // No split.
-    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
-  } else {  // Do split.
-    return 1;
+    for (int i = 0; i < 4; ++i) {
+      subblock_mvs[i] = block_mv;
+      subblock_mses[i] = block_mse;
+    }
   }
 }
 
@@ -271,58 +295,53 @@ static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
   return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 }
 
-// Builds predictor for blocks in temporal filtering. This is the second step
-// for temporal filtering, which is to construct predictions from all reference
-// frames INCLUDING the frame to be filtered itself. These predictors are built
-// based on the motion search results (motion vector is set as 0 for the frame
-// to be filtered), and will be futher used for weighted averaging.
-// Inputs:
-//   ref_frame: Pointer to the reference frame (or the frame to be filtered).
-//   mbd: Pointer to the block for filtering. Besides containing the subsampling
-//        information of all planes, this field also gives the searched motion
-//        vector for the entire block, i.e., `mbd->mi[0]->mv[0]`. This vector
-//        should be 0 if the `ref_frame` itself is the frame to be filtered.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   scale: Scaling factor.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_mvs: The motion vectors for each sub-block (row-major order).
-//   pred: Pointer to the predictor to build.
-// Returns:
-//   Nothing will be returned. But the content to which `pred` points will be
-//   modified.
+/*!\endcond */
+/*!\brief Builds predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (motion vector is
+ * set as 0 for the frame to be filtered), and will be futher used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   ref_frame      Pointer to the reference frame (or the frame
+ *                             to be filtered)
+ * \param[in]   mbd            Pointer to the block for filtering. Besides
+ *                             containing the subsampling information of all
+ *                             planes, this field also gives the searched
+ *                             motion vector for the entire block, i.e.,
+ *                             `mbd->mi[0]->mv[0]`. This vector  should be 0
+ *                             if the `ref_frame` itself is the frame to be
+ *                             filtered.
+ * \param[in]   block_size     Size of the block
+ * \param[in]   mb_row         Row index of the block in the frame
+ * \param[in]   mb_col         Column index of the block in the frame
+ * \param[in]   num_planes     Number of planes in the frame
+ * \param[in]   scale          Scaling factor
+ * \param[in]   subblock_mvs   The motion vectors for each sub-block (row-major
+ *                             order)
+ * \param[out]  pred           Pointer to the predictor to be built
+ *
+ * \return Nothing returned, But the contents of `pred` will be modified
+ */
 static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
                                const MACROBLOCKD *mbd,
                                const BLOCK_SIZE block_size, const int mb_row,
                                const int mb_col, const int num_planes,
                                const struct scale_factors *scale,
-                               const int use_subblock, const MV *subblock_mvs,
-                               uint8_t *pred) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+                               const MV *subblock_mvs, uint8_t *pred) {
   // Information of the entire block.
   const int mb_height = block_size_high[block_size];  // Height.
   const int mb_width = block_size_wide[block_size];   // Width.
-  const int mb_pels = mb_height * mb_width;           // Number of pixels.
   const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
   const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
   const int bit_depth = mbd->bd;                      // Bit depth.
   const int is_intrabc = 0;                           // Is intra-copied?
-  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
-  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
-  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
   const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
 
-  // Information of each sub-block (actually in use).
-  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
-  const int block_height = mb_height >> (num_blocks - 1);  // Height.
-  const int block_width = mb_width >> (num_blocks - 1);    // Width.
-
   // Default interpolation filters.
   const int_interpfilters interp_filters =
-      av1_broadcast_interp_filter(MULTITAP_SHARP);
+      av1_broadcast_interp_filter(MULTITAP_SHARP2);
 
   // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
   int plane_offset = 0;
@@ -334,8 +353,8 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
     const int plane_w = mb_width >> subsampling_x;   // Plane width.
     const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
     const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
-    const int h = block_height >> subsampling_y;     // Sub-block height.
-    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int h = plane_h >> 1;                      // Sub-block height.
+    const int w = plane_w >> 1;                      // Sub-block width.
     const int is_y_plane = (plane == 0);             // Is Y-plane?
 
     const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
@@ -343,12 +362,12 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
                                     ref_frame->heights[is_y_plane ? 0 : 1],
                                     ref_frame->strides[is_y_plane ? 0 : 1] };
 
-    // Handle entire block or sub-blocks if needed.
+    // Handle each subblock.
     int subblock_idx = 0;
     for (int i = 0; i < plane_h; i += h) {
       for (int j = 0; j < plane_w; j += w) {
         // Choose proper motion vector.
-        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        const MV mv = subblock_mvs[subblock_idx++];
         assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
                mv.col >= INT16_MIN && mv.col <= INT16_MAX);
 
@@ -363,13 +382,12 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
         inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
         av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
                                           plane_w, &mv, &inter_pred_params);
-
-        ++subblock_idx;
       }
     }
-    plane_offset += mb_pels;
+    plane_offset += plane_h * plane_w;
   }
 }
+/*!\cond */
 
 // Computes temporal filter weights and accumulators for the frame to be
 // filtered. More concretely, the filter weights for all pixels are the same.
@@ -378,27 +396,22 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
 //        subsampling information of all planes as well as the bit-depth.
 //   block_size: Size of the block.
 //   num_planes: Number of planes in the frame.
-//   filter_weight: Weight used for filtering.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
 // Returns:
 //   Nothing will be returned. But the content to which `accum` and `pred`
 //   point will be modified.
-void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
-                                    const BLOCK_SIZE block_size,
-                                    const int num_planes,
-                                    const int filter_weight,
-                                    const uint8_t *pred, uint32_t *accum,
-                                    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+                                   const MACROBLOCKD *mbd,
+                                   const BLOCK_SIZE block_size,
+                                   const int mb_row, const int mb_col,
+                                   const int num_planes, uint32_t *accum,
+                                   uint16_t *count) {
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_cur_buf_hbd(mbd);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
 
   int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -407,17 +420,27 @@ void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
     const int h = mb_height >> subsampling_y;  // Plane height.
     const int w = mb_width >> subsampling_x;   // Plane width.
 
+    const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const uint8_t *buf8 = ref_frame->buffers[plane];
+    const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
+    const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+
     int pred_idx = 0;
+    int pixel_idx = 0;
     for (int i = 0; i < h; ++i) {
       for (int j = 0; j < w; ++j) {
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        accum[idx] += filter_weight * pred_value;
-        count[idx] += filter_weight;
+        const int pred_value = is_high_bitdepth
+                                   ? buf16[frame_offset + pixel_idx]
+                                   : buf8[frame_offset + pixel_idx];
+        accum[idx] += TF_WEIGHT_SCALE * pred_value;
+        count[idx] += TF_WEIGHT_SCALE;
         ++pred_idx;
+        ++pixel_idx;
       }
+      pixel_idx += (frame_stride - w);
     }
-    plane_offset += mb_pels;
+    plane_offset += h * w;
   }
 }
 
@@ -468,239 +491,171 @@ static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
   }
 }
 
-// Function to adjust the filter weight when use YUV strategy.
+// Function to accumulate pixel-wise squared difference between two luma buffers
+// to be consumed while filtering the chroma planes.
 // Inputs:
-//   filter_weight: Original filter weight.
-//   sum_square_diff: Sum of squared difference between input frame and
-//                    prediction. This field is computed pixel by pixel, and
-//                    is used as a reference for the filter weight adjustment.
-//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
-//                   This field should align with the above lookup tables
-//                   `filter_weight_adjustment_lookup_table_yuv` and
-//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
-//   strength: Strength for filter weight adjustment.
+//   square_diff: Pointer to squared differences from luma plane.
+//   luma_sse_sum: Pointer to save the sum of luma squared differences.
+//   block_height: Height of block for computation.
+//   block_width: Width of block for computation.
+//   ss_x_shift: Chroma subsampling shift in 'X' direction
+//   ss_y_shift: Chroma subsampling shift in 'Y' direction
 // Returns:
-//   Adjusted filter weight which will finally be used for filtering.
-static INLINE int adjust_filter_weight_yuv(const int filter_weight,
-                                           const uint64_t sum_square_diff,
-                                           const int num_ref_pixels,
-                                           const int strength) {
-  int modifier =
-      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
-      num_ref_pixels;
-  const int rounding = (1 << strength) >> 1;
-  modifier = (modifier + rounding) >> strength;
-  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
-}
-
-// Applies temporal filter with YUV strategy.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all YUV planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_yuv_c(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  // Block information.
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
-
-  // Allocate memory for pixel-wise squared differences for all planes. They,
-  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
-  uint32_t *square_diff =
-      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
-
-  int plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    // Locate pixel on reference frame.
-    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-    const uint8_t *ref = frame_to_filter->buffers[plane];
-    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
-    plane_offset += mb_pels;
-  }
-
-  // Get window size for pixel-wise filtering.
-  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
-
-  // Handle planes in sequence.
-  plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    const int subsampling_y = mbd->plane[plane].subsampling_y;
-    const int subsampling_x = mbd->plane[plane].subsampling_x;
-    const int h = mb_height >> subsampling_y;  // Plane height.
-    const int w = mb_width >> subsampling_x;   // Plane width.
-
-    // Perform filtering.
-    int pred_idx = 0;
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        // non-local mean approach
-        uint64_t sum_square_diff = 0;
-        int num_ref_pixels = 0;
-
-        for (int wi = -half_window; wi <= half_window; ++wi) {
-          for (int wj = -half_window; wj <= half_window; ++wj) {
-            const int y = i + wi;  // Y-coord on the current plane.
-            const int x = j + wj;  // X-coord on the current plane.
-            if (y >= 0 && y < h && x >= 0 && x < w) {
-              sum_square_diff += square_diff[plane_offset + y * w + x];
-              ++num_ref_pixels;
-            }
-          }
-        }
-
-        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
-          for (int p = 1; p < num_planes; ++p) {
-            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
-            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
-            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
-            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
-            const int ww = w >> ss_x_shift;  // Width of UV-plane.
-            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        } else {  // Filter U-plane and V-plane using Y-plane.
-          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
-          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = w << ss_x_shift;         // Width of Y-plane.
-              sum_square_diff += square_diff[yy * ww + xx];
-              ++num_ref_pixels;
-            }
-          }
+//   Nothing will be returned. But the content to which `luma_sse_sum` points
+//   will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+                               int block_height, int block_width,
+                               int ss_x_shift, int ss_y_shift) {
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+        for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+          const int yy = (i << ss_y_shift) + ii;     // Y-coord on Y-plane.
+          const int xx = (j << ss_x_shift) + jj;     // X-coord on Y-plane.
+          const int ww = block_width << ss_x_shift;  // Width of Y-plane.
+          luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx];
         }
-
-        // Base filter weight estimated by motion search error.
-        const int subblock_idx =
-            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
-        const int filter_weight = subblock_filter_weights[subblock_idx];
-
-        const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        const int adjusted_weight = adjust_filter_weight_yuv(
-            filter_weight, sum_square_diff, num_ref_pixels, strength);
-        accum[idx] += adjusted_weight * pred_value;
-        count[idx] += adjusted_weight;
-
-        ++pred_idx;
       }
     }
-    plane_offset += mb_pels;
   }
-
-  aom_free(square_diff);
 }
 
-// Applies temporal filter with plane-wise strategy.
-// The strategy of filter weight adjustment is different from the function
-// `av1_apply_temporal_filter_yuv_c()`.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order).
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   q_factor: Quantization factor. This is actually the `q` defined in libaom,
-//             which is converted from `qindex`.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_planewise_c(
+/*!\endcond */
+/*!\brief Applies temporal filtering. NOTE that there are various optimised
+ * versions of this function called where the appropriate instruction set is
+ * supported.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   frame_to_filter Pointer to the frame to be filtered, which is
+ *                              used as reference to compute squared
+ *                              difference from the predictor.
+ * \param[in]   mbd             Pointer to the block for filtering, ONLY used
+ *                              to get subsampling information for the  planes
+ * \param[in]   block_size      Size of the block
+ * \param[in]   mb_row          Row index of the block in the frame
+ * \param[in]   mb_col          Column index of the block in the frame
+ * \param[in]   num_planes      Number of planes in the frame
+ * \param[in]   noise_levels    Estimated noise levels for each plane
+ *                              in the frame (Y,U,V)
+ * \param[in]   subblock_mvs    Pointer to the motion vectors for 4 sub-blocks
+ * \param[in]   subblock_mses   Pointer to the search errors (MSE) for 4
+ *                              sub-blocks
+ * \param[in]   q_factor        Quantization factor. This is actually the `q`
+ *                              defined in libaom, converted from `qindex`
+ * \param[in]   filter_strength Filtering strength. This value lies in range
+ *                              [0, 6] where 6 is the maximum strength.
+ * \param[out]  pred            Pointer to the well-built predictors
+ * \param[out]  accum           Pointer to the pixel-wise accumulator for
+ *                              filtering
+ * \param[out]  count           Pointer to the pixel-wise counter for
+ *                              filtering
+ *
+ * \return Nothing returned, But the contents of `accum`, `pred` and 'count'
+ *         will be modified
+ */
+void av1_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+  // Frame information.
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Decay factors for non-local mean approach.
+  double decay_factor[MAX_MB_PLANE] = { 0 };
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  for (int plane = 0; plane < num_planes; plane++) {
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+  }
+  double d_factor[4] = { 0 };
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
 
-  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // Allocate memory for pixel-wise squared differences. They,
   // regardless of the subsampling, are assigned with memory of size `mb_pels`.
-  uint32_t *square_diff =
-      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
-
-  int plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    // Locate pixel on reference frame.
-    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-    const uint8_t *ref = frame_to_filter->buffers[plane];
-    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
-    plane_offset += mb_pels;
+  uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+  if (!square_diff) {
+    aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Error allocating temporal filter data");
+  }
+  memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
+
+  // Allocate memory for accumulated luma squared error. This value will be
+  // consumed while filtering the chroma planes.
+  uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+  if (!luma_sse_sum) {
+    aom_free(square_diff);
+    aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Error allocating temporal filter data");
   }
+  memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
 
   // Get window size for pixel-wise filtering.
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
-
-  // Hyper-parameter for filter weight adjustment.
-  const int frame_height = frame_to_filter->heights[0]
-                           << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
+  assert(TF_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_WINDOW_LENGTH >> 1;
 
   // Handle planes in sequence.
-  plane_offset = 0;
+  int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
     const int subsampling_y = mbd->plane[plane].subsampling_y;
     const int subsampling_x = mbd->plane[plane].subsampling_x;
     const int h = mb_height >> subsampling_y;  // Plane height.
     const int w = mb_width >> subsampling_x;   // Plane width.
+    const int frame_stride =
+        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    const int ss_y_shift =
+        subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int ss_x_shift =
+        subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane will
+    // be more accurate. The luma sse sum is reused in both chroma planes.
+    if (plane == AOM_PLANE_U)
+      compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
+                                ss_y_shift);
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
+                        h, w, is_high_bitdepth, square_diff);
 
     // Perform filtering.
     int pred_idx = 0;
@@ -708,166 +663,86 @@ void av1_apply_temporal_filter_planewise_c(
       for (int j = 0; j < w; ++j) {
         // non-local mean approach
         uint64_t sum_square_diff = 0;
-        int num_ref_pixels = 0;
 
         for (int wi = -half_window; wi <= half_window; ++wi) {
           for (int wj = -half_window; wj <= half_window; ++wj) {
             const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
             const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
-            sum_square_diff += square_diff[plane_offset + y * w + x];
-            ++num_ref_pixels;
+            sum_square_diff += square_diff[y * w + x];
           }
         }
 
-        // Filter U-plane and V-plane using Y-plane. This is because motion
-        // search is only done on Y-plane, so the information from Y-plane will
-        // be more accurate.
-        if (plane != 0) {
-          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
-          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = w << ss_x_shift;         // Width of Y-plane.
-              sum_square_diff += square_diff[yy * ww + xx];
-              ++num_ref_pixels;
-            }
-          }
-        }
+        sum_square_diff += luma_sse_sum[i * w + j];
 
         // Scale down the difference for high bit depth input.
-        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
-        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
-        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
-        const double block_error =
-            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+        if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
 
-        // Control factor for non-local mean approach.
-        const double r =
-            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
-        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
+        // Combine window error and block error, and normalize it.
+        const double window_error = sum_square_diff * inv_num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error = (double)subblock_mses[subblock_idx];
+        const double combined_error =
+            weight_factor * window_error + block_error * inv_factor;
 
         // Compute filter weight.
-        const double scaled_diff =
-            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
-        const int adjusted_weight =
-            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor[plane];
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        accum[idx] += adjusted_weight * pred_value;
-        count[idx] += adjusted_weight;
+        accum[idx] += weight * pred_value;
+        count[idx] += weight;
 
         ++pred_idx;
       }
     }
-    plane_offset += mb_pels;
+    plane_offset += h * w;
   }
 
   aom_free(square_diff);
+  aom_free(luma_sse_sum);
 }
-
-// Computes temporal filter weights and accumulators from all reference frames
-// excluding the current frame to be filtered.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes and the bit-depth.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//                 (Used in YUV strategy)
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block. (Used
-//                            in YUV strategy)
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order). (Used in plane-wise
-//                 strategy)
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   q_factor: Quantization factor.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_others(
+#if CONFIG_AV1_HIGHBITDEPTH
+// Calls High bit-depth temporal filter
+void av1_highbd_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const double *noise_levels,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  if (TF_ENABLE_PLANEWISE_STRATEGY) {
-    // TODO(any): avx2 and sse2 version should be changed to align with C
-    // function before using.
-    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
-      av1_apply_temporal_filter_planewise_c(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    } else {
-      av1_apply_temporal_filter_planewise(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    }
-  } else {  // Commonly used for low-resolution video.
-    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
-        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
-      return;
-    }
-    const int adj_strength = strength + 2 * (mbd->bd - 8);
-    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
-        block_size != BLOCK_32X32) {
-      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
-                                    mb_col, num_planes, adj_strength,
-                                    use_subblock, subblock_filter_weights, pred,
-                                    accum, count);
-    } else {
-      // TODO(any): sse4 version should be changed to align with C function
-      // before using.
-      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
-                                      mb_col, num_planes, adj_strength,
-                                      use_subblock, subblock_filter_weights,
-                                      pred, accum, count);
-    }
-  }
+  av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+                              num_planes, noise_levels, subblock_mvs,
+                              subblock_mses, q_factor, filter_strength, pred,
+                              accum, count);
 }
-
-// Normalizes the accumulated filtering result to produce the filtered frame.
-// Inputs:
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   accum: Pointer to the pre-computed accumulator.
-//   count: Pointer to the pre-computed count.
-//   result_buffer: Pointer to result buffer.
-// Returns:
-//   Nothing will be returned. But the content to which `result_buffer` point
-//   will be modified.
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Normalizes the accumulated filtering result to produce the filtered
+ *        frame
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   mbd            Pointer to the block for filtering, which is
+ *                             ONLY used to get subsampling information for
+ *                             all the planes
+ * \param[in]   block_size     Size of the block
+ * \param[in]   mb_row         Row index of the block in the frame
+ * \param[in]   mb_col         Column index of the block in the frame
+ * \param[in]   num_planes     Number of planes in the frame
+ * \param[in]   accum          Pointer to the pre-computed accumulator
+ * \param[in]   count          Pointer to the pre-computed count
+ * \param[out]  result_buffer  Pointer to result buffer
+ *
+ * \return Nothing returned, but the content to which `result_buffer` pointer
+ *         will be modified
+ */
 static void tf_normalize_filtered_frame(
     const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
     const int mb_col, const int num_planes, const uint32_t *accum,
     const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
   const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
 
   int plane_offset = 0;
@@ -896,192 +771,345 @@ static void tf_normalize_filtered_frame(
       }
       frame_idx += (frame_stride - plane_w);
     }
-    plane_offset += mb_pels;
+    plane_offset += plane_h * plane_w;
   }
 }
 
-// Helper function to compute number of blocks on either side of the frame.
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
-  return (frame_length + mb_length - 1) / mb_length;
+int av1_get_q(const AV1_COMP *cpi) {
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+  const int q =
+      (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+                                   cpi->common.seq_params->bit_depth);
+  return q;
 }
 
-typedef struct {
-  int64_t sum;
-  int64_t sse;
-} FRAME_DIFF;
-
-// Does temporal filter for a particular frame.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   frames: Frame buffers used for temporal filtering.
-//   num_frames: Number of frames in the frame buffer.
-//   filter_frame_idx: Index of the frame to be filtered.
-//   is_key_frame: Whether the to-filter is a key frame.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  is ONLY used for assigning filter weight.
-//   block_size: Block size used for temporal filtering.
-//   scale: Scaling factor.
-//   strength: Pre-estimated strength for filter weight adjustment.
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order).
-// Returns:
-//   Difference between filtered frame and the original frame.
-static FRAME_DIFF tf_do_filtering(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
-    const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
-    const BLOCK_SIZE block_size, const struct scale_factors *scale,
-    const int strength, const double *noise_levels) {
-  // Basic information.
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  const int num_frames = tf_ctx->num_frames;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const int compute_frame_diff = tf_ctx->compute_frame_diff;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const double *noise_levels = tf_ctx->noise_levels;
+  const int num_pels = tf_ctx->num_pels;
+  const int q_factor = tf_ctx->q_factor;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
   const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
-  const int frame_height = frame_to_filter->y_crop_height;
-  const int frame_width = frame_to_filter->y_crop_width;
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  TemporalFilterData *const tf_data = &td->tf_data;
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int mb_rows = get_num_blocks(frame_height, mb_height);
-  const int mb_cols = get_num_blocks(frame_width, mb_width);
-  const int num_planes = av1_num_planes(&cpi->common);
   const int mi_h = mi_size_high_log2[block_size];
   const int mi_w = mi_size_wide_log2[block_size];
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
-
-  // Save input state.
-  MACROBLOCK *const mb = &cpi->td.mb;
-  MACROBLOCKD *const mbd = &mb->e_mbd;
-  uint8_t *input_buffer[MAX_MB_PLANE];
-  for (int i = 0; i < num_planes; i++) {
-    input_buffer[i] = mbd->plane[i].pre[0].buf;
-  }
-  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+  const int num_planes = av1_num_planes(&cpi->common);
+  uint32_t *accum = tf_data->accum;
+  uint16_t *count = tf_data->count;
+  uint8_t *pred = tf_data->pred;
 
-  // Setup.
-  mbd->block_ref_scale_factors[0] = scale;
-  mbd->block_ref_scale_factors[1] = scale;
-  // A temporary block info used to store state in temporal filtering process.
-  MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
-  memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
-  mbd->mi = &tmp_mb_mode_info;
-  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
-  // Allocate memory for predictor, accumulator and count.
-  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
-  uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
-  uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
-  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
-  memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
-  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+  // Factor to control the filering strength.
+  const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
 
   // Do filtering.
-  FRAME_DIFF diff = { 0, 0 };
-  // Perform temporal filtering block by block.
-  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
-    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
-                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+  FRAME_DIFF *diff = &td->tf_data.diff;
+  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                        (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+  for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+    av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
                           cpi->oxcf.border_in_pixels);
-    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
-      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
-                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
-                            cpi->oxcf.border_in_pixels);
-      memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
-      memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
-      MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
-      // Perform temporal filtering frame by frame.
-      for (int frame = 0; frame < num_frames; frame++) {
-        if (frames[frame] == NULL) continue;
-
-        // Motion search.
-        MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
-        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
-        int block_mse = INT_MAX;
-        int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-
-        if (frame == filter_frame_idx) {  // Frame to be filtered.
-          // Set motion vector as 0 for the frame to be filtered.
-          mbd->mi[0]->mv[0].as_mv = kZeroMv;
-          // Change ref_mv sign for following frames.
-          ref_mv.row *= -1;
-          ref_mv.col *= -1;
-        } else {  // Other reference frames.
-          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
-                                       block_size, mb_row, mb_col, &ref_mv,
-                                       subblock_mvs, subblock_mses);
-          // Do not pass down the reference motion vector if error is too large.
-          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
-          if (block_mse > (thresh << (mbd->bd - 8))) {
-            ref_mv = kZeroMv;
-          }
-        }
+    memset(accum, 0, num_pels * sizeof(accum[0]));
+    memset(count, 0, num_pels * sizeof(count[0]));
+    MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+                          // Perform temporal filtering frame by frame.
+    for (int frame = 0; frame < num_frames; frame++) {
+      if (frames[frame] == NULL) continue;
+
+      // Motion search.
+      MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+      int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      if (frame ==
+          filter_frame_idx) {  // Frame to be filtered.
+                               // Change ref_mv sign for following frames.
+        ref_mv.row *= -1;
+        ref_mv.col *= -1;
+      } else {  // Other reference frames.
+        tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
+                         mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
+      }
 
-        // Build predictor.
-        int use_subblock = tf_get_filter_weight(
-            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
+      // Perform weighted averaging.
+      if (frame == filter_frame_idx) {  // Frame to be filtered.
+        tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
+                                      mb_col, num_planes, accum, count);
+      } else {  // Other reference frames.
         tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
-                           num_planes, scale, use_subblock, subblock_mvs, pred);
-
-        // Perform weighted averaging.
-        if (frame == filter_frame_idx) {  // Frame to be filtered.
-          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
-                                         subblock_filter_weights[0], pred,
-                                         accum, count);
-        } else {  // Other reference frames.
-          const FRAME_TYPE frame_type =
-              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
-                                                           : KEY_FRAME;
-          const int q_factor =
-              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
-                                           cpi->common.seq_params.bit_depth);
-          av1_apply_temporal_filter_others(
-              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-              strength, use_subblock, subblock_filter_weights, noise_levels,
-              block_mse, subblock_mses, q_factor, pred, accum, count);
+                           num_planes, scale, subblock_mvs, pred);
+
+        // All variants of av1_apply_temporal_filter() contain floating point
+        // operations. Hence, clear the system state.
+
+        // TODO(any): avx2/sse2 version should be changed to align with C
+        // function before using. In particular, current avx2/sse2 function
+        // only supports 32x32 block size and 5x5 filtering window.
+        if (is_frame_high_bitdepth(frame_to_filter)) {  // for high bit-depth
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+            av1_highbd_apply_temporal_filter(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+          } else {
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+            av1_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+#if CONFIG_AV1_HIGHBITDEPTH
+          }
+#endif            // CONFIG_AV1_HIGHBITDEPTH
+        } else {  // for 8-bit
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+            av1_apply_temporal_filter(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, noise_levels,
+                                      subblock_mvs, subblock_mses, q_factor,
+                                      filter_strength, pred, accum, count);
+          } else {
+            av1_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+          }
         }
       }
+    }
+    tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+                                accum, count, tf_ctx->output_frame);
+
+    if (compute_frame_diff) {
+      const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+      const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+      const int source_y_stride = frame_to_filter->y_stride;
+      const int filter_y_stride = tf_ctx->output_frame->y_stride;
+      const int source_offset =
+          mb_row * y_height * source_y_stride + mb_col * y_width;
+      const int filter_offset =
+          mb_row * y_height * filter_y_stride + mb_col * y_width;
+      unsigned int sse = 0;
+      cpi->ppi->fn_ptr[block_size].vf(
+          frame_to_filter->y_buffer + source_offset, source_y_stride,
+          tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+          &sse);
+      diff->sum += sse;
+      diff->sse += sse * (int64_t)sse;
+    }
+  }
+}
 
-      tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
-                                  accum, count, &cpi->alt_ref_buffer);
-
-      if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
-        const int y_height = mb_height >> mbd->plane[0].subsampling_y;
-        const int y_width = mb_width >> mbd->plane[0].subsampling_x;
-        const int source_y_stride = frame_to_filter->y_stride;
-        const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
-        const int source_offset =
-            mb_row * y_height * source_y_stride + mb_col * y_width;
-        const int filter_offset =
-            mb_row * y_height * filter_y_stride + mb_col * y_width;
-        unsigned int sse = 0;
-        cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
-                                   source_y_stride,
-                                   cpi->alt_ref_buffer.y_buffer + filter_offset,
-                                   filter_y_stride, &sse);
-        diff.sum += sse;
-        diff.sse += sse * sse;
-      }
+/*!\brief Does temporal filter for a given frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi                   Top level encoder instance structure
+ *
+ * \return Nothing will be returned, but the contents of td->diff will be
+ modified.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+  // Basic information.
+  ThreadData *td = &cpi->td;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  MB_MODE_INFO **input_mb_mode_info;
+  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+  tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+  // Perform temporal filtering for each row.
+  for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
+    av1_tf_do_filtering_row(cpi, td, mb_row);
+
+  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+}
+
+/*!\brief Setups the frame buffer for temporal filtering. This fuction
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi             Top level encoder instance structure
+ * \param[in]   filter_frame_lookahead_idx  The index of the to-filter frame
+ *                              in the lookahead buffer cpi->lookahead
+ * \param[in]   gf_frame_index  GOP index
+ *
+ * \return Nothing will be returned. But the fields `frames`, `num_frames`,
+ *         `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+                                      int filter_frame_lookahead_idx,
+                                      int gf_frame_index) {
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+  const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
+  const int is_forward_keyframe =
+      av1_gop_check_forward_keyframe(gf_group, gf_frame_index);
+
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
+  // temporal filtering.
+  int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+  int num_before = 0;  // Number of filtering frames before the to-filter frame.
+  int num_after = 0;   // Number of filtering frames after the to-filer frame.
+  const int lookahead_depth =
+      av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+  // Temporal filtering should not go beyond key frames
+  const int key_to_curframe =
+      AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
+  const int curframe_to_key =
+      AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);
+
+  // Number of buffered frames before the to-filter frame.
+  int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+
+  // Number of buffered frames after the to-filter frame.
+  int max_after =
+      AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
+
+  // Estimate noises for each plane.
+  const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
+      cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
+  assert(to_filter_buf != NULL);
+  const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
+  const int num_planes = av1_num_planes(&cpi->common);
+  double *noise_levels = tf_ctx->noise_levels;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        to_filter_frame, plane, cpi->common.seq_params->bit_depth,
+        NOISE_ESTIMATION_EDGE_THRESHOLD);
+  }
+  // Get quantization factor.
+  const int q = av1_get_q(cpi);
+  // Get correlation estimates from first-pass;
+  const FIRSTPASS_STATS *stats =
+      cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
+  double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
+  for (int i = 1; i <= max_after; i++) {
+    if (stats + filter_frame_lookahead_idx + i >=
+        cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+      max_after = i - 1;
+      break;
     }
+    accu_coeff1 *=
+        AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
+  }
+  if (max_after >= 1) {
+    accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
+  }
+  for (int i = 1; i <= max_before; i++) {
+    if (stats + filter_frame_lookahead_idx - i + 1 <=
+        cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
+      max_before = i - 1;
+      break;
+    }
+    accu_coeff0 *=
+        AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
+  }
+  if (max_before >= 1) {
+    accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
   }
 
-  // Restore input state
-  for (int i = 0; i < num_planes; i++) {
-    mbd->plane[i].pre[0].buf = input_buffer[i];
+  // Adjust number of filtering frames based on quantization factor. When the
+  // quantization factor is small enough (lossless compression), we will not
+  // change the number of frames for key frame filtering, which is to avoid
+  // visual quality drop.
+  int adjust_num = 6;
+  if (num_frames == 1) {  // `arnr_max_frames = 1` is used to disable filtering.
+    adjust_num = 0;
+  } else if ((update_type == KF_UPDATE) && q <= 10) {
+    adjust_num = 0;
   }
-  mbd->mi = input_mb_mode_info;
+  num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
 
-  free(tmp_mb_mode_info);
-  aom_free(pred8);
-  aom_free(pred16);
-  aom_free(accum);
-  aom_free(count);
+  if (frame_type == KEY_FRAME) {
+    num_before = is_forward_keyframe ? num_frames / 2 : 0;
+    num_after = AOMMIN(num_frames - 1, max_after);
+  } else {
+    int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
+                                       &cpi->ppi->p_rc, &cpi->frame_info,
+                                       filter_frame_lookahead_idx, max_before,
+                                       max_after, NULL, NULL, 0);
+
+    num_frames = AOMMIN(num_frames, gfu_boost / 150);
+    num_frames += !(num_frames & 1);  // Make the number odd.
+
+    // Limit the number of frames if noise levels are low and high quantizers.
+    if (noise_levels[AOM_PLANE_Y] < 1.9 && cpi->ppi->p_rc.arf_q > 40)
+      num_frames = AOMMIN(num_frames, cpi->sf.hl_sf.num_frames_used_in_tf);
+
+    // Only use 2 neighbours for the second ARF.
+    if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
+    if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+      // just use half half
+      num_before = num_frames / 2;
+      num_after = num_frames / 2;
+    } else {
+      if (max_after < num_frames / 2) {
+        num_after = max_after;
+        num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+      } else {
+        num_before = max_before;
+        num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+      }
+      // Adjust insymmetry based on frame-level correlation
+      if (max_after > 0 && max_before > 0) {
+        if (num_after < num_before) {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+          num_before = AOMMIN(num_before, num_after + insym);
+        } else {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+          num_after = AOMMIN(num_after, num_before + insym);
+        }
+      }
+    }
+  }
+  num_frames = num_before + 1 + num_after;
 
-  return diff;
+  // Setup the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
+    assert(buf != NULL);
+    frames[frame] = &buf->img;
+  }
+  tf_ctx->num_frames = num_frames;
+  tf_ctx->filter_frame_idx = num_before;
+  assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
+
+  av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+                       cpi->common.seq_params->sb_size);
+  av1_setup_block_planes(&cpi->td.mb.e_mbd,
+                         cpi->common.seq_params->subsampling_x,
+                         cpi->common.seq_params->subsampling_y, num_planes);
 }
 
+/*!\cond */
+
 // A constant number, sqrt(pi / 2),  used for noise estimation.
 static const double SQRT_PI_BY_2 = 1.25331413732;
 
 double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
                                             const int plane,
-                                            const int bit_depth) {
+                                            const int bit_depth,
+                                            const int edge_thresh) {
   const int is_y_plane = (plane == 0);
   const int height = frame->crop_heights[is_y_plane ? 0 : 1];
   const int width = frame->crop_widths[is_y_plane ? 0 : 1];
@@ -1110,7 +1138,7 @@ double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
                      2 * (mat[0][1] - mat[2][1]);
       const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
       // Accumulate Laplacian.
-      if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) {  // Only count smooth pixels.
+      if (Ga < edge_thresh) {  // Only count smooth pixels.
         const int v = 4 * mat[1][1] -
                       2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
                       (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
@@ -1124,215 +1152,215 @@ double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
   return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
 }
 
-// Estimates the strength for filter weight adjustment, which is used in YUV
-// strategy. This estimation is based on the pre-estimated noise level of the
-// to-filter frame.
+// Initializes the members of TemporalFilterCtx
 // Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
-//   group_boost: Boost level for the current group of frames.
+//   cpi: Top level encoder instance structure
+//   check_show_existing: If 1, check whether the filtered frame is similar
+//                        to the original frame.
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
 // Returns:
-//   Estimated strength which will be used for filter weight adjustment.
-static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
-                                const int group_boost) {
-  int strength = cpi->oxcf.arnr_strength;
-
-  // Adjust the strength based on the estimated noise level.
-  if (noise_level > 0) {       // Adjust when the noise level is reliable.
-    if (noise_level < 0.75) {  // Noise level lies in range (0, 0.75).
-      strength = strength - 2;
-    } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
-      strength = strength - 1;
-    } else if (noise_level < 4.0) {  // Noise level lies in range [1.75, 4.0).
-      strength = strength + 0;
-    } else {  // Noise level lies in range [4.0, +inf).
-      strength = strength + 1;
-    }
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+                        int gf_frame_index, int compute_frame_diff,
+                        YV12_BUFFER_CONFIG *output_frame) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  tf_ctx->num_frames = 0;
+  tf_ctx->filter_frame_idx = -1;
+  tf_ctx->output_frame = output_frame;
+  tf_ctx->compute_frame_diff = compute_frame_diff;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
+  assert(tf_ctx->num_frames > 0);
+  assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
+
+  // Setup scaling factors. Scaling on each of the arnr frames is not
+  // supported.
+  // ARF is produced at the native frame size and resized when coded.
+  struct scale_factors *sf = &tf_ctx->sf;
+  av1_setup_scale_factors_for_frame(
+      sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+      frames[0]->y_crop_width, frames[0]->y_crop_height);
+
+  // Initialize temporal filter parameters.
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_width = block_size_wide[block_size];
+  const int mb_height = block_size_high[block_size];
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int mb_pels = mb_width * mb_height;
+  const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const int num_planes = av1_num_planes(&cpi->common);
+  int num_pels = 0;
+  for (int i = 0; i < num_planes; i++) {
+    const int subsampling_x = mbd->plane[i].subsampling_x;
+    const int subsampling_y = mbd->plane[i].subsampling_y;
+    num_pels += mb_pels >> (subsampling_x + subsampling_y);
   }
-
-  // Adjust the strength based on active max q.
-  const FRAME_TYPE frame_type =
-      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
-  const int q = (int)av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
-  strength = strength - AOMMAX(0, (16 - q) / 2);
-
-  return CLIP(strength, 0, group_boost / 300);
+  tf_ctx->num_pels = num_pels;
+  tf_ctx->mb_rows = mb_rows;
+  tf_ctx->mb_cols = mb_cols;
+  tf_ctx->is_highbitdepth = is_highbitdepth;
+  tf_ctx->q_factor = av1_get_q(cpi);
 }
 
-// Setups the frame buffer for temporal filtering. Basically, this fuction
-// determines how many frames will be used for temporal filtering and then
-// groups them into a buffer.
-// Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   filter_frame_lookahead_idx: The index of the to-filter frame in the
-//                               lookahead buffer `cpi->lookahead`.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  will affect the number of frames used for filtering.
-//   frames: Pointer to the frame buffer to setup.
-//   num_frames_for_filtering: Number of frames used for filtering.
-//   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
-// Returns:
-//   Nothing will be returned. But the frame buffer `frames`, number of frames
-//   in the buffer `num_frames_for_filtering`, and the index of the to-filter
-//   frame in the buffer `filter_frame_idx` will be updated in this function.
-static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
-                                      const int filter_frame_lookahead_idx,
-                                      const int is_second_arf,
-                                      YV12_BUFFER_CONFIG **frames,
-                                      int *num_frames_for_filtering,
-                                      int *filter_frame_idx) {
-  int num_frames = 0;          // Number of frames used for filtering.
-  int num_frames_before = -1;  // Number of frames before the to-filter frame.
-  int filter_frame_offset;
-
-  if (filter_frame_lookahead_idx == -1) {  // Key frame.
-    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
-    num_frames_before = 0;
-    filter_frame_offset = filter_frame_lookahead_idx;
-  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
-    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
-    num_frames_before = num_frames - 1;
-    filter_frame_offset = -filter_frame_lookahead_idx;
-  } else {
-    num_frames = cpi->oxcf.arnr_max_frames;
-    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
-      num_frames = AOMMIN(num_frames, 3);
-    }
-    if (num_frames > cpi->rc.gfu_boost / 150) {
-      num_frames = cpi->rc.gfu_boost / 150;
-      num_frames += !(num_frames & 1);
-    }
-    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
-    const int lookahead_depth =
-        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
-    const int num_frames_after =
-        AOMMIN((num_frames - 1) >> 1,
-               lookahead_depth - filter_frame_lookahead_idx - 1);
-    num_frames = num_frames_before + 1 + num_frames_after;
-    filter_frame_offset = filter_frame_lookahead_idx;
-  }
-  *num_frames_for_filtering = num_frames;
-  *filter_frame_idx = num_frames_before;
-
-  // Setup the frame buffer.
-  for (int frame = 0; frame < num_frames; ++frame) {
-    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
-    struct lookahead_entry *buf = av1_lookahead_peek(
-        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
-    frames[frame] = (buf == NULL) ? NULL : &buf->img;
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+                                  const FRAME_DIFF *frame_diff, int q_index,
+                                  aom_bit_depth_t bit_depth) {
+  const int frame_height = frame->y_crop_height;
+  const int frame_width = frame->y_crop_width;
+  const int block_height = block_size_high[TF_BLOCK_SIZE];
+  const int block_width = block_size_wide[TF_BLOCK_SIZE];
+  const int mb_rows = get_num_blocks(frame_height, block_height);
+  const int mb_cols = get_num_blocks(frame_width, block_width);
+  const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+  const float mean = (float)frame_diff->sum / num_mbs;
+  const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean);
+
+  const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
+  const float threshold = 0.7f * ac_q_step * ac_q_step;
+
+  if (mean < threshold && std < mean * 1.2) {
+    return 1;
   }
+  return 0;
 }
 
-int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
-                        int *show_existing_arf) {
+void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                         int gf_frame_index, FRAME_DIFF *frame_diff,
+                         YV12_BUFFER_CONFIG *output_frame) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
   // Basic informaton of the current frame.
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  const uint8_t group_idx = gf_group->index;
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
-  // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
-  // This frame is ALWAYS a show existing frame.
-  const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
-                            (filter_frame_lookahead_idx >= 7) &&
-                            cpi->sf.hl_sf.second_alt_ref_filtering;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  TemporalFilterData *tf_data = &cpi->td.tf_data;
+  const int compute_frame_diff = frame_diff != NULL;
   // TODO(anyone): Currently, we enforce the filtering strength on internal
   // ARFs except the second ARF to be zero. We should investigate in which case
   // it is more beneficial to use non-zero strength filtering.
-  if (update_type == INTNL_ARF_UPDATE && !is_second_arf) {
-    return 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Only parallel level 0 frames go through temporal filtering.
+  assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  // Initialize temporal filter context structure.
+  init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index,
+              compute_frame_diff, output_frame);
+
+  // Allocate and reset temporal filter buffers.
+  const int is_highbitdepth = tf_ctx->is_highbitdepth;
+  if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating temporal filter data");
   }
 
-  // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
-  // is used somewhere unexpectedly. Should be resolved later.
-  // Initialize errorperbit, sadperbit16 and sadperbit4.
-  const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
-  av1_fill_mv_costs(cpi->common.fc,
-                    cpi->common.features.cur_frame_force_integer_mv,
-                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);
+  // Perform temporal filtering process.
+  if (mt_info->num_workers > 1)
+    av1_tf_do_filtering_mt(cpi);
+  else
+    tf_do_filtering(cpi);
 
-  // Setup frame buffer for filtering.
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-  int num_frames_for_filtering = 0;
-  int filter_frame_idx = -1;
-  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
-                            frames, &num_frames_for_filtering,
-                            &filter_frame_idx);
-
-  // Estimate noise and strength.
-  const int bit_depth = cpi->common.seq_params.bit_depth;
-  const int num_planes = av1_num_planes(&cpi->common);
-  double noise_levels[MAX_MB_PLANE] = { 0 };
-  for (int plane = 0; plane < num_planes; ++plane) {
-    noise_levels[plane] = av1_estimate_noise_from_single_plane(
-        frames[filter_frame_idx], plane, bit_depth);
+  if (compute_frame_diff) {
+    *frame_diff = tf_data->diff;
   }
-  const int strength =
-      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
-  if (filter_frame_lookahead_idx >= 0) {
-    cpi->common.showable_frame =
-        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
-        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
+  // Deallocate temporal filter buffers.
+  tf_dealloc_data(tf_data, is_highbitdepth);
+}
+
+int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) {
+  return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1;
+}
+
+void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf);
+  if (tf_info->is_temporal_filter_on == 0) return;
+
+  AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  int ret;
+  for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+    ret = aom_realloc_frame_buffer(
+        &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        seq_params->subsampling_x, seq_params->subsampling_y,
+        seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+        cm->features.byte_alignment, NULL, NULL, NULL,
+        cpi->oxcf.tool_cfg.enable_global_motion, 0);
+    if (ret) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate tf_info");
+    }
   }
 
-  // Do filtering.
-  const int is_key_frame = (filter_frame_lookahead_idx < 0);
-  FRAME_DIFF diff = { 0, 0 };
-  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
-    // Setup scaling factors. Scaling on each of the arnr frames is not
-    // supported.
-    // ARF is produced at the native frame size and resized when coded.
-    struct scale_factors sf;
-    av1_setup_scale_factors_for_frame(
-        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
-        frames[0]->y_crop_width, frames[0]->y_crop_height);
-    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
-                           filter_frame_idx, is_key_frame, is_second_arf,
-                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
+  ret = aom_realloc_frame_buffer(
+      &tf_info->tf_buf_second_arf, oxcf->frm_dim_cfg.width,
+      oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
+      seq_params->subsampling_y, seq_params->use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL,
+      cpi->oxcf.tool_cfg.enable_global_motion, 0);
+  if (ret) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate tf_info");
   }
+}
 
-  if (is_key_frame) {  // Key frame should always be filtered.
-    return 1;
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
+  if (tf_info->is_temporal_filter_on == 0) return;
+  for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+    aom_free_frame_buffer(&tf_info->tf_buf[i]);
   }
+  aom_free_frame_buffer(&tf_info->tf_buf_second_arf);
+}
 
-  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
-      is_second_arf) {
-    const int frame_height = frames[filter_frame_idx]->y_crop_height;
-    const int frame_width = frames[filter_frame_idx]->y_crop_width;
-    const int block_height = block_size_high[TF_BLOCK_SIZE];
-    const int block_width = block_size_wide[TF_BLOCK_SIZE];
-    const int mb_rows = get_num_blocks(frame_height, block_height);
-    const int mb_cols = get_num_blocks(frame_width, block_width);
-    const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
-    const float mean = (float)diff.sum / num_mbs;
-    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);
-
-    aom_clear_system_state();
-    // TODO(yunqing): This can be combined with TPL q calculation later.
-    cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
-    av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
-    int top_index = 0;
-    int bottom_index = 0;
-    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
-                                           cpi->oxcf.height, group_idx,
-                                           &bottom_index, &top_index);
-    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
-    const float threshold = 0.7f * ac_q * ac_q;
-
-    if (!is_second_arf) {
-      *show_existing_arf = 0;
-      if (mean < threshold && std < mean * 1.2) {
-        *show_existing_arf = 1;
-      }
-      cpi->common.showable_frame |= *show_existing_arf;
-    } else {
-      // Use source frame if the filtered frame becomes very different.
-      if (!(mean < threshold && std < mean * 1.2)) {
-        return 0;
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) {
+  av1_zero(tf_info->tf_buf_valid);
+  av1_zero(tf_info->tf_buf_gf_index);
+  av1_zero(tf_info->tf_buf_display_index_offset);
+}
+
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi,
+                           const GF_GROUP *gf_group) {
+  if (tf_info->is_temporal_filter_on == 0) return;
+  const AV1_COMMON *const cm = &cpi->common;
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    int update_type = gf_group->update_type[gf_index];
+    if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+      int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME;
+      int lookahead_idx = gf_group->arf_src_offset[gf_index] +
+                          gf_group->cur_frame_idx[gf_index];
+      // This function is designed to be called multiple times after
+      // av1_tf_info_reset(). It will only generate the filtered frame that does
+      // not exist yet.
+      if (tf_info->tf_buf_valid[buf_idx] == 0 ||
+          tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) {
+        YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx];
+        av1_temporal_filter(cpi, lookahead_idx, gf_index,
+                            &tf_info->frame_diff[buf_idx], out_buf);
+        aom_extend_frame_borders(out_buf, av1_num_planes(cm));
+        tf_info->tf_buf_gf_index[buf_idx] = gf_index;
+        tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx;
+        tf_info->tf_buf_valid[buf_idx] = 1;
       }
     }
   }
+}
 
-  return 1;
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+                                                 int gf_index,
+                                                 FRAME_DIFF *frame_diff) {
+  if (tf_info->is_temporal_filter_on == 0) return NULL;
+  YV12_BUFFER_CONFIG *out_buf = NULL;
+  for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+    if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) {
+      out_buf = &tf_info->tf_buf[i];
+      *frame_diff = tf_info->frame_diff[i];
+    }
+  }
+  return out_buf;
 }
+/*!\endcond */
diff --git a/media/libaom/src/av1/encoder/temporal_filter.h b/media/libaom/src/av1/encoder/temporal_filter.h
index 5a6bde2594..8225dd9832 100644
--- a/media/libaom/src/av1/encoder/temporal_filter.h
+++ b/media/libaom/src/av1/encoder/temporal_filter.h
@@ -12,10 +12,15 @@
 #ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
 #define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
 
+#include <stdbool.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-
+/*!\cond */
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct ThreadData;
 // TODO(any): These two variables are only used in avx2, sse2, sse4
 // implementations, where the block size is still hard coded. This should be
 // fixed to align with the c implementation.
@@ -25,21 +30,235 @@ extern "C" {
 // Block size used in temporal filtering.
 #define TF_BLOCK_SIZE BLOCK_32X32
 
-// Window size for YUV temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_yuv()`.
-#define TF_YUV_FILTER_WINDOW_LENGTH 3
-// A scale factor used in YUV temporal filtering for weight adjustment.
-#define TF_YUV_FILTER_WEIGHT_SCALE 3
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
 
-#define TF_ENABLE_PLANEWISE_STRATEGY 1
-// Window size for plane-wise temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_planewise()`
-#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
-// A scale factor used in plane-wise temporal filtering to raise the filter
-// weight from `double` with range [0, 1] to `int` with range [0, 1000].
-#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+// Hyper-parameters used to compute filtering weight. These hyper-parameters can
+// be tuned for a better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight from
+//    `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+//    and block error. The weight is for window error while the weight for block
+//    error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+//    using a small q (high bitrate), we would like to reduce the filtering
+//    strength such that more detailed information can be preserved. Hence, when
+//    q is smaller than this threshold, we will adjust the filtering weight
+//    based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+//    motion search error can be large and uncontrollable, we will simply
+//    normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+//    Concretely, users can use `arnr_strength` arguments to control the
+//    strength of temporal filtering. When `arnr_strength` is small enough (
+//    i.e., smaller than this threshold), we will adjust the filtering weight
+//    based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering weight.
+//    Concretely, larger motion search vector leads to a higher probability of
+//    unreliable search. Hence, we would like to reduce the filtering strength
+//    when the distance is large enough. Considering that the distance actually
+//    relies on the frame size, this threshold is also a resolution-based
+//    threshold. Taking 720p videos as an instance, if this field equals to 0.1,
+//    then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
+//    for 360p videos will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relative high range.
+//    Above this cutoff q, a stronger filtering is applied.
+//    For a high q, the quantization throws away more information, and thus a
+//    stronger filtering is less likely to distort the encoded quality, while a
+//    stronger filtering could reduce bit rates.
+//    Ror a low q, more details are expected to be retained. Filtering is thus
+//    more conservative.
+#define TF_QINDEX_CUTOFF 128
 
 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+// Sum and SSE source vs filtered frame difference returned by
+// temporal filter.
+typedef struct {
+  int64_t sum;
+  int64_t sse;
+} FRAME_DIFF;
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ */
+typedef struct {
+  /*!
+   * Frame buffers used for temporal filtering.
+   */
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  /*!
+   * Number of frames in the frame buffer.
+   */
+  int num_frames;
+
+  /*!
+   * Output filtered frame
+   */
+  YV12_BUFFER_CONFIG *output_frame;
+
+  /*!
+   * Index of the frame to be filtered.
+   */
+  int filter_frame_idx;
+  /*!
+   * Whether to accumulate diff for show existing condition check.
+   */
+  int compute_frame_diff;
+  /*!
+   * Frame scaling factor.
+   */
+  struct scale_factors sf;
+  /*!
+   * Estimated noise levels for each plane in the frame.
+   */
+  double noise_levels[MAX_MB_PLANE];
+  /*!
+   * Number of pixels in the temporal filtering block across all planes.
+   */
+  int num_pels;
+  /*!
+   * Number of temporal filtering block rows.
+   */
+  int mb_rows;
+  /*!
+   * Number of temporal filtering block columns.
+   */
+  int mb_cols;
+  /*!
+   * Whether the frame is high-bitdepth or not.
+   */
+  int is_highbitdepth;
+  /*!
+   * Quantization factor used in temporal filtering.
+   */
+  int q_factor;
+} TemporalFilterCtx;
+
+/*!
+ * buffer count in TEMPORAL_FILTER_INFO
+ * Currently we only apply filtering on KEY and ARF after
+ * define_gf_group(). Hence, the count is two.
+ */
+#define TF_INFO_BUF_COUNT 2
+
+/*!
+ * \brief Temporal filter info for a gop
+ */
+typedef struct TEMPORAL_FILTER_INFO {
+  /*!
+   * A flag indicate whether temporal filter shoud be applied.
+   * This flag will stored the result of
+   * av1_is_temporal_filter_on()
+   */
+  int is_temporal_filter_on;
+  /*!
+   * buffers used for temporal filtering in a GOP
+   * index 0 for key frame and index 1 for ARF
+   */
+  YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT];
+
+  /*!
+   * buffers used for temporal filtering for
+   * INTNL_ARF_UPDATE
+   * Check av1_gop_is_second_arf() for the
+   * definition of second_arf in detail
+   */
+  YV12_BUFFER_CONFIG tf_buf_second_arf;
+  /*!
+   * whether to show the buffer directly or not.
+   */
+  FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT];
+  /*!
+   * the corresponding gf_index for the buffer.
+   */
+  int tf_buf_gf_index[TF_INFO_BUF_COUNT];
+  /*!
+   * the display_index offset between next show frame and the frames in the GOP
+   */
+  int tf_buf_display_index_offset[TF_INFO_BUF_COUNT];
+  /*!
+   * whether the buf is valid or not.
+   */
+  int tf_buf_valid[TF_INFO_BUF_COUNT];
+} TEMPORAL_FILTER_INFO;
+
+/*!\brief Check whether we should apply temporal filter at all.
+ * \param[in]   oxcf           AV1 encoder config
+ *
+ * \return 1: temporal filter is on 0: temporal is off
+ */
+int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf);
+
+/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info           Temporal filter info for a gop
+ * \param[in,out]   cpi               Top level encoder instance structure
+ */
+void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi);
+
+/*!\brief Free buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info           Temporal filter info for a gop
+ */
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info           Temporal filter info for a gop
+ */
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Apply temporal filter for key frame and ARF in a gop
+ * \param[in,out]   tf_info           Temporal filter info for a gop
+ * \param[in,out]   cpi               Top level encoder instance structure
+ * \param[in]       gf_group          GF/ARF group data structure
+ */
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi,
+                           const GF_GROUP *gf_group);
+
+/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info           Temporal filter info for a gop
+ * \param[in]       gf_index          gf_index for the target buffer
+ * \param[out]      show_tf_buf       whether the target buffer can be shown
+ * directly
+ */
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+                                                 int gf_index,
+                                                 FRAME_DIFF *frame_diff);
+
+/*!\cond */
+
+// Data related to temporal filtering.
+typedef struct {
+  // Source vs filtered frame error.
+  FRAME_DIFF diff;
+  // Pointer to temporary block info used to store state in temporal filtering
+  // process.
+  MB_MODE_INFO *tmp_mbmi;
+  // Pointer to accumulator buffer used in temporal filtering process.
+  uint32_t *accum;
+  // Pointer to count buffer used in temporal filtering process.
+  uint16_t *count;
+  // Pointer to predictor used in temporal filtering process.
+  uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used for dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Next temporal filter block row to be filtered.
+  int next_tf_row;
+} AV1TemporalFilterSync;
+
 // Estimates noise level from a given frame using a single plane (Y, U, or V).
 // This is an adaptation of the mehtod in the following paper:
 // Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
@@ -55,31 +274,173 @@ extern "C" {
 //   The estimated noise, or -1.0 if there are too few smooth pixels.
 double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
                                             const int plane,
-                                            const int bit_depth);
-
-#define TF_QINDEX 128  // Q-index used in temporal filtering.
-#define TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME 7
-// Performs temporal filtering if needed.
-// NOTE: In this function, the lookahead index is different from the 0-based
-// real index. For example, if we want to filter the first frame in the
-// pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
-// of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
-// second frame in the pre-fetched buffer. Another example: if we want to filter
-// the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
-// Futhermore, negative number is used for key frame in one-pass mode, where key
-// frame is filtered with the frames before it instead of after it. For example,
-// -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+                                            const int bit_depth,
+                                            const int edge_thresh);
+/*!\endcond */
+
+/*!\brief Does temporal filter for a given macroblock row.
+*
+* \ingroup src_frame_proc
+* \param[in]   cpi                   Top level encoder instance structure
+* \param[in]   td                    Pointer to thread data
+* \param[in]   mb_row                Macroblock row to be filtered
+filtering
+*
+* \return Nothing will be returned, but the contents of td->diff will be
+modified.
+*/
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                             int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to filter
+ * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+ * Futhermore, negative number is used for key frame in one-pass mode, where key
+ * frame is filtered with the frames before it instead of after it. For example,
+ * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]      cpi                        Top level encoder instance
+ *                                            structure
+ * \param[in]      filter_frame_lookahead_idx The index of the
+ *                                            to-filter frame in the lookahead
+ *                                            buffer cpi->lookahead.
+ * \param[in]      gf_frame_index             Index of GOP
+ * \param[in,out]  frame_diff                 structure of sse and sum of the
+ *                                            filtered frame.
+ * \param[out]     output_frame               Ouput filtered frame.
+ */
+void av1_temporal_filter(struct AV1_COMP *cpi,
+                         const int filter_frame_lookahead_idx,
+                         int gf_frame_index, FRAME_DIFF *frame_diff,
+                         YV12_BUFFER_CONFIG *output_frame);
+
+/*!\brief Check whether a filtered frame can be show directly
+ *
+ * This function will use the filtered frame's sse and current q index
+ * to make decision.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]  frame        filtered frame's buffer
+ * \param[in]  frame_diff   structure of sse and sum of the
+ *                          filtered frame.
+ * \param[in]  q_index      q_index used for this frame
+ * \param[in]  bit_depth    bit depth
+ * \return     return 1 if this frame can be shown directly, otherwise
+ *             return 0
+ */
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+                                  const FRAME_DIFF *frame_diff, int q_index,
+                                  aom_bit_depth_t bit_depth);
+
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates memory for members of TemporalFilterData.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   num_pels: Number of pixels in the block across all planes.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   Nothing will be returned. But the contents of tf_data will be modified.
+static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+                                               int num_pels,
+                                               int is_high_bitdepth) {
+  tf_data->tmp_mbmi = (MB_MODE_INFO *)malloc(sizeof(*tf_data->tmp_mbmi));
+  memset(tf_data->tmp_mbmi, 0, sizeof(*tf_data->tmp_mbmi));
+  tf_data->accum =
+      (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
+  tf_data->count =
+      (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
+  memset(&tf_data->diff, 0, sizeof(tf_data->diff));
+  if (is_high_bitdepth)
+    tf_data->pred = CONVERT_TO_BYTEPTR(
+        aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
+  else
+    tf_data->pred =
+        (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
+  if (!(tf_data->accum && tf_data->count && tf_data->pred)) {
+    aom_free(tf_data->accum);
+    aom_free(tf_data->count);
+    aom_free(tf_data->pred);
+    return false;
+  }
+  return true;
+}
+
+// Setup macroblockd params for temporal filtering process.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   scale: Scaling factor.
+// Returns:
+//   Nothing will be returned. Contents of mbd will be modified.
+static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd,
+                                            TemporalFilterData *tf_data,
+                                            const struct scale_factors *scale) {
+  mbd->block_ref_scale_factors[0] = scale;
+  mbd->block_ref_scale_factors[1] = scale;
+  mbd->mi = &tf_data->tmp_mbmi;
+  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+}
+
+// Deallocates the memory allocated for members of TemporalFilterData.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   Nothing will be returned.
+static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
+                                       int is_high_bitdepth) {
+  if (is_high_bitdepth)
+    tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
+  free(tf_data->tmp_mbmi);
+  aom_free(tf_data->accum);
+  aom_free(tf_data->count);
+  aom_free(tf_data->pred);
+}
+
+// Saves the state prior to temporal filter process.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info to save input state.
+//   input_buffer: Backup buffer pointer to save input state.
+//   num_planes: Number of planes.
+// Returns:
+//   Nothing will be returned. Contents of input_mbmi and input_buffer will be
+//   modified.
+static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi,
+                                 uint8_t **input_buffer, int num_planes) {
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  *input_mbmi = mbd->mi;
+}
+
+// Restores the initial state after temporal filter process.
 // Inputs:
-//   cpi: Pointer to the composed information of input video.
-//   filter_frame_lookahead_idx: The index of the to-filter frame in the
-//                               lookahead buffer `cpi->lookahead`.
-//   show_existing_arf: Whether to show existing ARF. This field will be updated
-//                      in this function.
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info from where input state is restored.
+//   input_buffer: Backup buffer pointer from where input state is restored.
+//   num_planes: Number of planes.
 // Returns:
-//   Whether temporal filtering is successfully done.
-int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
-                        int *show_existing_arf);
+//   Nothing will be returned. Contents of mbd will be modified.
+static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi,
+                                    uint8_t **input_buffer, int num_planes) {
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mbmi;
+}
 
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/thirdpass.c b/media/libaom/src/av1/encoder/thirdpass.c
new file mode 100644
index 0000000000..d5265540d1
--- /dev/null
+++ b/media/libaom/src/av1/encoder/thirdpass.c
@@ -0,0 +1,780 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/common/blockd.h"
+
+#if CONFIG_THREE_PASS
+#include "common/ivfdec.h"
+#endif
+
+#if CONFIG_THREE_PASS
+static void setup_two_pass_stream_input(
+    struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
+    struct aom_internal_error_info *err_info) {
+  FILE *infile;
+  infile = fopen(input_file_name, "rb");
+  if (!infile) {
+    aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+                       "Failed to open input file '%s'.", input_file_name);
+  }
+  struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx));
+  if (!aom_input_ctx) {
+    fclose(infile);
+    aom_internal_error(err_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate memory for third-pass context.");
+  }
+  memset(aom_input_ctx, 0, sizeof(*aom_input_ctx));
+  aom_input_ctx->filename = input_file_name;
+  aom_input_ctx->file = infile;
+
+  if (file_is_ivf(aom_input_ctx)) {
+    aom_input_ctx->file_type = FILE_TYPE_IVF;
+  } else {
+    fclose(infile);
+    aom_free(aom_input_ctx);
+    aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+                       "Unrecognized input file type.");
+  }
+  *input_ctx_ptr = aom_input_ctx;
+}
+
+static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+  if (!ctx->input_ctx) {
+    if (ctx->input_file_name == NULL) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM,
+                         "No third pass input specified.");
+    }
+    setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name,
+                                ctx->err_info);
+  }
+
+#if CONFIG_AV1_DECODER
+  if (!ctx->decoder.iface) {
+    aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo;
+    if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                         "Failed to initialize decoder.");
+    }
+  }
+#else
+  aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                     "To utilize three-pass encoding, libaom must be built "
+                     "with CONFIG_AV1_DECODER=1.");
+#endif
+}
+#endif  // CONFIG_THREE_PASS
+
+// Return 0: success
+//        1: cannot read because this is end of file
+//       -1: failure to read the frame
+static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
+#if CONFIG_THREE_PASS
+  if (!ctx->input_ctx || !ctx->decoder.iface) {
+    init_third_pass(ctx);
+  }
+  if (!ctx->have_frame) {
+    if (ivf_read_frame(ctx->input_ctx->file, &ctx->buf, &ctx->bytes_in_buffer,
+                       &ctx->buffer_size, NULL) != 0) {
+      if (feof(ctx->input_ctx->file)) {
+        return 1;
+      } else {
+        return -1;
+      }
+    }
+    ctx->frame = ctx->buf;
+    ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
+    ctx->have_frame = 1;
+  }
+#else
+  aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                     "Cannot parse bitstream without CONFIG_THREE_PASS.");
+#endif
+  Av1DecodeReturn adr;
+  if (aom_codec_decode(&ctx->decoder, ctx->frame,
+                       (unsigned int)ctx->bytes_in_buffer,
+                       &adr) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to decode frame for third pass.");
+  }
+  ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3;
+  ctx->frame = adr.buf;
+  ctx->bytes_in_buffer = ctx->end_frame - ctx->frame;
+  if (ctx->frame == ctx->end_frame) ctx->have_frame = 0;
+  return 0;
+}
+
+static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) {
+  if (!frame_info) return;
+  aom_free(frame_info->mi_info);
+  frame_info->mi_info = NULL;
+}
+
+// This function gets the information needed from the recently decoded frame,
+// via various decoder APIs, and saves the info into ctx->frame_info.
+// Return 0: success
+//        1: cannot read because this is end of file
+//       -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+  int ret = read_frame(ctx);
+  if (ret != 0) return ret;
+  int cur = ctx->frame_info_count;
+
+  ctx->frame_info[cur].actual_bits = ctx->this_frame_bits;
+
+  if (cur >= MAX_THIRD_PASS_BUF) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Third pass frame info ran out of available slots.");
+  }
+  int frame_type_flags = 0;
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+                        &frame_type_flags) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame flags.");
+  }
+  if (frame_type_flags & AOM_FRAME_IS_KEY) {
+    ctx->frame_info[cur].frame_type = KEY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+    ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+    ctx->frame_info[cur].frame_type = S_FRAME;
+  } else {
+    ctx->frame_info[cur].frame_type = INTER_FRAME;
+  }
+
+  // Get frame width and height
+  int frame_size[2];
+  if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame size.");
+  }
+
+  // Check if we need to re-alloc the mi fields.
+  const int mi_cols = (frame_size[0] + 3) >> 2;
+  const int mi_rows = (frame_size[1] + 3) >> 2;
+  ctx->frame_info[cur].mi_stride = mi_cols;
+  ctx->frame_info[cur].mi_rows = mi_rows;
+  ctx->frame_info[cur].mi_cols = mi_cols;
+
+  if (ctx->frame_info[cur].width != frame_size[0] ||
+      ctx->frame_info[cur].height != frame_size[1] ||
+      !ctx->frame_info[cur].mi_info) {
+    free_frame_info(&ctx->frame_info[cur]);
+
+    ctx->frame_info[cur].mi_info =
+        aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info));
+
+    if (!ctx->frame_info[cur].mi_info) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate mi buffer for the third pass.");
+    }
+  }
+
+  ctx->frame_info[cur].width = frame_size[0];
+  ctx->frame_info[cur].height = frame_size[1];
+
+  // Get frame base q idx
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+                        &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read base q index.");
+  }
+
+  // Get show existing frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_existing_frame) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show existing frame flag.");
+  }
+
+  // Get show frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show frame flag.");
+  }
+
+  // Get order hint
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+                        &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read order hint.");
+  }
+
+  // Clear MI info
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize =
+          BLOCK_INVALID;
+    }
+  }
+
+  // Get relevant information regarding each 4x4 MI
+  MB_MODE_INFO cur_mi_info;
+  THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info;
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      const int offset = mi_row * mi_cols + mi_col;
+      if (this_mi[offset].bsize != BLOCK_INVALID) {
+        continue;
+      }
+      // Get info of this MI
+      if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col,
+                            &cur_mi_info) != AOM_CODEC_OK) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read mi info.");
+      }
+      const int blk_mi_rows = mi_size_high[cur_mi_info.bsize];
+      const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize];
+
+      for (int h = 0; h < blk_mi_rows; h++) {
+        for (int w = 0; w < blk_mi_cols; w++) {
+          if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) {
+            continue;
+          }
+          const int this_offset = offset + h * mi_cols + w;
+          this_mi[this_offset].bsize = cur_mi_info.bsize;
+          this_mi[this_offset].partition = cur_mi_info.partition;
+          this_mi[this_offset].mi_row_start = mi_row;
+          this_mi[this_offset].mi_col_start = mi_col;
+          this_mi[this_offset].mv[0] = cur_mi_info.mv[0];
+          this_mi[this_offset].mv[1] = cur_mi_info.mv[1];
+          this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0];
+          this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1];
+          this_mi[this_offset].pred_mode = cur_mi_info.mode;
+        }
+      }
+    }
+  }
+
+  ctx->frame_info_count++;
+
+  return 0;
+}
+
+#define USE_SECOND_PASS_FILE 1
+
+#if !USE_SECOND_PASS_FILE
+// Parse the frames in the gop and determine the last frame of the current GOP.
+// Decode more frames if necessary. The variable max_num is the maximum static
+// GOP length if we detect an IPPP structure, and it is expected that max_mum >=
+// MAX_GF_INTERVAL.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+                                int *last_idx) {
+  assert(max_num >= MAX_GF_INTERVAL);
+  *last_idx = 0;
+  int cur_idx = 0;
+  int arf_order_hint = -1;
+  int num_show_frames = 0;
+  while (num_show_frames < max_num) {
+    assert(cur_idx < MAX_THIRD_PASS_BUF);
+    // Read in from bitstream if needed.
+    if (cur_idx >= ctx->frame_info_count) {
+      int ret = get_frame_info(ctx);
+      if (ret == 1) {
+        // At the end of the file, GOP ends in the prev frame.
+        if (arf_order_hint >= 0) {
+          aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                             "Failed to derive GOP length.");
+        }
+        *last_idx = cur_idx - 1;
+        return;
+      }
+      if (ret < 0) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read frame for third pass.");
+      }
+    }
+
+    // TODO(bohanli): verify that fwd_kf works here.
+    if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+        ctx->frame_info[cur_idx].is_show_frame) {
+      if (cur_idx != 0) {
+        // If this is a key frame and is not the first kf in this kf group, we
+        // have reached the next key frame. Stop here.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+               arf_order_hint == -1) {
+      // If this is an arf (the first no show)
+      if (num_show_frames <= 1) {
+        // This is an arf and we should end the GOP with its overlay.
+        arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+      } else {
+        // There are multiple show frames before the this arf, so we treat the
+        // frames previous to this arf as a GOP.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+                                          (unsigned int)arf_order_hint) {
+      // If this is the overlay/show existing of the arf
+      assert(ctx->frame_info[cur_idx].is_show_frame);
+      *last_idx = cur_idx;
+      return;
+    } else {
+      // This frame is part of the GOP.
+      if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++;
+    }
+    cur_idx++;
+  }
+  // This is a long IPPP GOP and we will use a length of max_num here.
+  assert(arf_order_hint < 0);
+  *last_idx = max_num - 1;
+  return;
+}
+#endif
+
+static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) {
+  int cur_idx = 0;
+  while (cur_idx < ctx->gop_info.num_frames) {
+    assert(cur_idx < MAX_THIRD_PASS_BUF);
+    // Read in from bitstream if needed.
+    if (cur_idx >= ctx->frame_info_count) {
+      int ret = get_frame_info(ctx);
+      if (ret != 0) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read frame for third pass.");
+      }
+    }
+    cur_idx++;
+  }
+  return;
+}
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+  // Read in future frames in the current GOP.
+  read_gop_frames(ctx);
+
+  int gf_len = 0;
+  // Check the GOP length against the value read from second_pass_file
+  for (int i = 0; i < ctx->gop_info.num_frames; i++) {
+    if (ctx->frame_info[i].is_show_frame) gf_len++;
+  }
+
+  if (gf_len != ctx->gop_info.gf_length) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Mismatch in third pass GOP length!");
+  }
+}
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx->frame_info_count == 0) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "No available frame info for third pass.");
+  }
+  ctx->frame_info_count--;
+  free_frame_info(&ctx->frame_info[0]);
+  for (int i = 0; i < ctx->frame_info_count; i++) {
+    ctx->frame_info[i] = ctx->frame_info[i + 1];
+  }
+  ctx->frame_info[ctx->frame_info_count].mi_info = NULL;
+}
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+                            const char *file) {
+  av1_free_thirdpass_ctx(*ctx);
+  CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx)));
+  THIRD_PASS_DEC_CTX *ctx_ptr = *ctx;
+  ctx_ptr->input_file_name = file;
+  ctx_ptr->prev_gop_end = -1;
+  ctx_ptr->err_info = cm->error;
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx == NULL) return;
+  if (ctx->decoder.iface) {
+    aom_codec_destroy(&ctx->decoder);
+  }
+#if CONFIG_THREE_PASS
+  if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file);
+  aom_free(ctx->input_ctx);
+#endif
+  if (ctx->buf) free(ctx->buf);
+  for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) {
+    free_frame_info(&ctx->frame_info[i]);
+  }
+  aom_free(ctx);
+}
+
+void av1_write_second_pass_gop_info(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+  if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+    // Write the GOP length to a log file.
+    av1_open_second_pass_log(cpi, 0);
+
+    THIRD_PASS_GOP_INFO gop_info;
+
+    gop_info.num_frames = gf_group->size;
+    gop_info.use_arf = (gf_group->arf_index >= 0);
+    gop_info.gf_length = p_rc->baseline_gf_interval;
+
+    size_t count =
+        fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not write to second pass log file!");
+    }
+  }
+}
+
+void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+    // write target bitrate
+    int bits = gf_group->bit_allocation[gf_index];
+    size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not write to second pass log file!");
+    }
+
+    // write sse
+    uint64_t sse = 0;
+    int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1;
+    if (pkt_idx >= 0 &&
+        cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) {
+      sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0];
+#if CONFIG_INTERNAL_STATS
+    } else if (cpi->ppi->b_calculate_psnr) {
+      sse = cpi->ppi->total_sq_error[0];
+#endif
+    } else {
+      const YV12_BUFFER_CONFIG *orig = cpi->source;
+      const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+      PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+      const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+      const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+      aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+      aom_calc_psnr(orig, recon, &psnr);
+#endif
+      sse = psnr.sse[0];
+    }
+
+    count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not write to second pass log file!");
+    }
+
+    // write bpm_factor
+    double factor = cpi->ppi->twopass.bpm_factor;
+    count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not write to second pass log file!");
+    }
+  }
+}
+void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  if (oxcf->second_pass_log == NULL) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM,
+                       "No second pass log file specified for the third pass!");
+  }
+  // Read the GOP length from a file.
+  if (!cpi->second_pass_log_stream) {
+    if (is_read) {
+      cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb");
+    } else {
+      cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb");
+    }
+    if (!cpi->second_pass_log_stream) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not open second pass log file!");
+    }
+  }
+}
+
+void av1_close_second_pass_log(AV1_COMP *cpi) {
+  if (cpi->second_pass_log_stream) {
+    int ret = fclose(cpi->second_pass_log_stream);
+    if (ret != 0) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Could not close second pass log file!");
+    }
+    cpi->second_pass_log_stream = 0;
+  }
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+                                   THIRD_PASS_GOP_INFO *gop_info,
+                                   struct aom_internal_error_info *error) {
+  size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream);
+  if (count < 1) {
+    aom_internal_error(error, AOM_CODEC_ERROR,
+                       "Could not read from second pass log file!");
+  }
+}
+
+void av1_read_second_pass_per_frame_info(
+    FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+    int frame_info_count, struct aom_internal_error_info *error) {
+  for (int i = 0; i < frame_info_count; i++) {
+    // read target bits
+    int bits = 0;
+    size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    frame_info_arr[i].bits_allocated = bits;
+
+    // read distortion
+    uint64_t sse;
+    count = fread(&sse, sizeof(sse), 1, second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    frame_info_arr[i].sse = sse;
+
+    // read bpm factor
+    double factor;
+    count = fread(&factor, sizeof(factor), 1, second_pass_log_stream);
+    if (count < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    frame_info_arr[i].bpm_factor = factor;
+  }
+}
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx == NULL) return -1;
+  int use_arf = 0;
+  for (int i = 0; i < ctx->gop_info.gf_length; i++) {
+    if (ctx->frame_info[i].order_hint != 0 &&
+        ctx->frame_info[i].is_show_frame == 0) {
+      use_arf = 1;
+    }
+  }
+  if (use_arf != ctx->gop_info.use_arf) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Mismatch in third pass GOP length!");
+  }
+  return use_arf;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+                              int fwidth, double *ratio_h, double *ratio_w) {
+  assert(ctx);
+  assert(fidx < ctx->frame_info_count);
+  const int fheight_second_pass = ctx->frame_info[fidx].height;
+  const int fwidth_second_pass = ctx->frame_info[fidx].width;
+  assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth);
+
+  *ratio_h = (double)fheight / fheight_second_pass;
+  *ratio_w = (double)fwidth / fwidth_second_pass;
+}
+
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+                                          int mi_row, int mi_col,
+                                          double ratio_h, double ratio_w) {
+  assert(ctx);
+  assert(fidx < ctx->frame_info_count);
+
+  const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows;
+  const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols;
+
+  const int mi_row_second_pass =
+      clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1);
+  const int mi_col_second_pass =
+      clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1);
+
+  const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride;
+  THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info +
+                                mi_row_second_pass * mi_stride_second_pass +
+                                mi_col_second_pass;
+  return this_mi;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+                                    double ratio_h, double ratio_w, int *mi_row,
+                                    int *mi_col) {
+  *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h);
+  *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w);
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+                                      double ratio_h, double ratio_w,
+                                      MV_REFERENCE_FRAME frame) {
+  assert(this_mi != NULL);
+  int_mv cur_mv;
+  cur_mv.as_int = INVALID_MV;
+
+  if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv;
+
+  for (int r = 0; r < 2; r++) {
+    if (this_mi->ref_frame[r] == frame) {
+      cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h);
+      cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w);
+    }
+  }
+
+  return cur_mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+                                                double ratio_h,
+                                                double ratio_w) {
+  assert(this_mi != NULL);
+  BLOCK_SIZE bsize = BLOCK_INVALID;
+
+  const BLOCK_SIZE bsize_second_pass = this_mi->bsize;
+  assert(bsize_second_pass != BLOCK_INVALID);
+
+  const int w_second_pass = block_size_wide[bsize_second_pass];
+  const int h_second_pass = block_size_high[bsize_second_pass];
+
+  int part_type;
+
+  if (w_second_pass == h_second_pass) {
+    part_type = PARTITION_NONE;
+  } else if (w_second_pass / h_second_pass == 2) {
+    part_type = PARTITION_HORZ;
+  } else if (w_second_pass / h_second_pass == 4) {
+    part_type = PARTITION_HORZ_4;
+  } else if (h_second_pass / w_second_pass == 2) {
+    part_type = PARTITION_VERT;
+  } else if (h_second_pass / w_second_pass == 4) {
+    part_type = PARTITION_VERT_4;
+  } else {
+    part_type = PARTITION_INVALID;
+  }
+  assert(part_type != PARTITION_INVALID);
+
+  const int w = (int)(round(w_second_pass * ratio_w));
+  const int h = (int)(round(h_second_pass * ratio_h));
+
+  for (int i = 0; i < SQR_BLOCK_SIZES; i++) {
+    const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i];
+    if (this_bsize == BLOCK_INVALID) continue;
+
+    const int this_w = block_size_wide[this_bsize];
+    const int this_h = block_size_high[this_bsize];
+
+    if (this_w >= w && this_h >= h) {
+      // find the smallest block size that contains the mapped block
+      bsize = this_bsize;
+      break;
+    }
+  }
+  if (bsize == BLOCK_INVALID) {
+    // could not find a proper one, just use the largest then.
+    bsize = BLOCK_128X128;
+  }
+
+  return bsize;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+                                               THIRD_PASS_MI_INFO *this_mi) {
+  int mi_stride = ctx->frame_info[0].mi_stride;
+
+  int mi_row = this_mi->mi_row_start;
+  int mi_col = this_mi->mi_col_start;
+
+  THIRD_PASS_MI_INFO *corner_mi =
+      &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col];
+
+  return corner_mi->partition;
+}
+
+#if CONFIG_BITRATE_ACCURACY
+static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb,
+                             FILE *stream,
+                             struct aom_internal_error_info *error) {
+  size_t count = fwrite(ptr, size, nmemb, stream);
+  if (count < nmemb) {
+    aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n");
+  }
+}
+
+static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream,
+                            struct aom_internal_error_info *error) {
+  size_t count = fread(ptr, size, nmemb, stream);
+  if (count < nmemb) {
+    aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n");
+  }
+}
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+                       const TplParams *tpl_data) {
+  tpl_info->tpl_ready = tpl_data->ready;
+  if (tpl_info->tpl_ready) {
+    tpl_info->gf_length = gf_group->size;
+    for (int i = 0; i < tpl_info->gf_length; ++i) {
+      tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i];
+      tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+      tpl_info->update_type_list[i] = gf_group->update_type[i];
+    }
+  }
+}
+
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+                        struct aom_internal_error_info *error) {
+  fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+                   log_stream, error);
+  if (tpl_info->tpl_ready) {
+    fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+                     log_stream, error);
+    assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+    fwrite_and_check(&tpl_info->txfm_stats_list,
+                     sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+                     log_stream, error);
+    fwrite_and_check(&tpl_info->qstep_ratio_ls,
+                     sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+                     log_stream, error);
+    fwrite_and_check(&tpl_info->update_type_list,
+                     sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+                     log_stream, error);
+  }
+}
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+                       struct aom_internal_error_info *error) {
+  av1_zero(*tpl_info);
+  fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+                  log_stream, error);
+  if (tpl_info->tpl_ready) {
+    fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+                    log_stream, error);
+    assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+    fread_and_check(&tpl_info->txfm_stats_list,
+                    sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+                    log_stream, error);
+    fread_and_check(&tpl_info->qstep_ratio_ls,
+                    sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+                    log_stream, error);
+    fread_and_check(&tpl_info->update_type_list,
+                    sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+                    log_stream, error);
+  }
+}
+#endif  // CONFIG_BITRATE_ACCURACY
diff --git a/media/libaom/src/av1/encoder/thirdpass.h b/media/libaom/src/av1/encoder/thirdpass.h
new file mode 100644
index 0000000000..8080c06cb6
--- /dev/null
+++ b/media/libaom/src/av1/encoder/thirdpass.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_THIRDPASS_H_
+#define AOM_AV1_ENCODER_THIRDPASS_H_
+
+#include "av1/common/enums.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+
+struct AV1_COMP;
+
+// TODO(bohanli): optimize this number
+#define MAX_THIRD_PASS_BUF \
+  (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH))
+
+// Struct to store useful information related to a GOP, in addition to what is
+// available in the bitstream
+typedef struct {
+  int gf_length;
+  int num_frames;
+  int use_arf;
+} THIRD_PASS_GOP_INFO;
+
+#if CONFIG_BITRATE_ACCURACY
+typedef struct TPL_INFO {
+  int gf_length;
+  int tpl_ready;
+  TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS];
+  double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS];
+} TPL_INFO;
+#endif  // CONFIG_BITRATE_ACCURACY
+
+typedef struct {
+  BLOCK_SIZE bsize;
+  PARTITION_TYPE partition;
+  int mi_row_start;
+  int mi_col_start;
+  int_mv mv[2];
+  MV_REFERENCE_FRAME ref_frame[2];
+  PREDICTION_MODE pred_mode;
+} THIRD_PASS_MI_INFO;
+
+// Struct to store useful information about a frame for the third pass.
+// The members are extracted from the decoder by function get_frame_info.
+typedef struct {
+  int width;
+  int height;
+  int mi_stride;
+  int mi_rows;
+  int mi_cols;
+  int base_q_idx;
+  int is_show_existing_frame;
+  int is_show_frame;
+  int bits_allocated;
+  int actual_bits;
+  uint64_t sse;
+  double bpm_factor;
+  FRAME_TYPE frame_type;
+  unsigned int order_hint;
+  THIRD_PASS_MI_INFO *mi_info;
+} THIRD_PASS_FRAME_INFO;
+
+typedef struct {
+  /* --- Input and decoding related members --- */
+  // the input file
+  const char *input_file_name;
+#if CONFIG_THREE_PASS
+  // input context
+  struct AvxInputContext *input_ctx;
+#endif
+  // decoder codec context
+  aom_codec_ctx_t decoder;
+  // start of the frame in buf
+  const unsigned char *frame;
+  // end of the frame(s) in buf
+  const unsigned char *end_frame;
+  // whether we still have following frames in buf
+  int have_frame;
+  // pointer to buffer for the read frames
+  uint8_t *buf;
+  // size of data in buffer
+  size_t bytes_in_buffer;
+  // current buffer size
+  size_t buffer_size;
+  // error info pointer
+  struct aom_internal_error_info *err_info;
+
+  int this_frame_bits;
+
+  /* --- Members for third pass encoding --- */
+  // Array to store info about each frame.
+  // frame_info[0] should point to the current frame.
+  THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+  // number of frames available in frame_info
+  int frame_info_count;
+  // the end of the previous GOP (order hint)
+  int prev_gop_end;
+  THIRD_PASS_GOP_INFO gop_info;
+} THIRD_PASS_DEC_CTX;
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+                            const char *file);
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx);
+
+// Set the GOP structure from the twopass bitstream.
+// TODO(bohanli): this is currently a skeleton and we only return the gop
+// length. This function also saves all frame information in the array
+// ctx->frame_info for this GOP.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx);
+
+// Pop one frame out of the array ctx->frame_info. This function is used to make
+// sure that frame_info[0] always corresponds to the current frame.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx);
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read);
+void av1_close_second_pass_log(struct AV1_COMP *cpi);
+
+// Write the current GOP information into the second pass log file.
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi);
+// Write the information of the frames in this GOP into the second pass log
+// file.
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index);
+
+// Read the next GOP information from the second pass log file.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+                                   THIRD_PASS_GOP_INFO *gop_info,
+                                   struct aom_internal_error_info *error);
+// read the information of the frames in next GOP from the second pass log file.
+void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream,
+                                         THIRD_PASS_FRAME_INFO *frame_info_arr,
+                                         int frame_info_count,
+                                         struct aom_internal_error_info *error);
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx);
+
+// Calculate the ratio of third pass frame dimensions over second pass frame
+// dimensions. Return them in ratio_h and ratio_w.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+                              int fwidth, double *ratio_h, double *ratio_w);
+
+// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi
+// location in the thirdpass frame.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+                                          int mi_row, int mi_col,
+                                          double ratio_h, double ratio_w);
+
+// Get the adjusted MVs of this_mi, associated with the reference frame. If no
+// MV is found with the reference frame, INVALID_MV is returned.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+                                      double ratio_h, double ratio_w,
+                                      MV_REFERENCE_FRAME frame);
+
+// Get the adjusted block size of this_mi.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+                                                double ratio_h, double ratio_w);
+
+// Get the adjusted mi position in the third pass frame, of a given
+// third_pass_mi. Location is returned in mi_row and mi_col.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+                                    double ratio_h, double ratio_w, int *mi_row,
+                                    int *mi_col);
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+                                               THIRD_PASS_MI_INFO *this_mi);
+
+#if CONFIG_BITRATE_ACCURACY
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+                       const TplParams *tpl_data);
+
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+                        struct aom_internal_error_info *error);
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+                       struct aom_internal_error_info *error);
+
+#endif  // CONFIG_BITRATE_ACCURACY
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_THIRDPASS_H_
diff --git a/media/libaom/src/av1/encoder/tokenize.c b/media/libaom/src/av1/encoder/tokenize.c
index e67415349e..de6d452aba 100644
--- a/media/libaom/src/av1/encoder/tokenize.c
+++ b/media/libaom/src/av1/encoder/tokenize.c
@@ -27,9 +27,9 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
-static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
                                  int plane, int calc_rate, int allow_update_cdf,
-                                 FRAME_COUNTS *counts, MapCdf map_pb_cdf) {
+                                 FRAME_COUNTS *counts) {
   const uint8_t *const color_map = param->color_map;
   MapCdf map_cdf = param->map_cdf;
   ColorCost color_cost = param->color_cost;
@@ -39,7 +39,6 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
   const int n = param->n_colors;
   const int palette_size_idx = n - PALETTE_MIN_SIZE;
   int this_rate = 0;
-  uint8_t color_order[PALETTE_MAX_SIZE];
 
   (void)plane;
   (void)counts;
@@ -48,14 +47,14 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
     for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
       int i = k - j;
       int color_new_idx;
-      const int color_ctx = av1_get_palette_color_index_context(
-          color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+      const int color_ctx = av1_fast_palette_color_index_context(
+          color_map, plane_block_width, i, j, &color_new_idx);
       assert(color_new_idx >= 0 && color_new_idx < n);
       if (calc_rate) {
-        this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
+        this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx];
       } else {
         (*t)->token = color_new_idx;
-        (*t)->color_map_cdf = map_pb_cdf[palette_size_idx][color_ctx];
+        (*t)->color_ctx = color_ctx;
         ++(*t);
         if (allow_update_cdf)
           update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
@@ -83,13 +82,14 @@ static void get_palette_params(const MACROBLOCK *const x, int plane,
   params->color_map = xd->plane[plane].color_index_map;
   params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
                           : xd->tile_ctx->palette_y_color_index_cdf;
-  params->color_cost =
-      plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+  params->color_cost = plane ? x->mode_costs.palette_uv_color_cost
+                             : x->mode_costs.palette_y_color_cost;
   params->n_colors = pmi->palette_size[plane];
   av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
                            &params->rows, &params->cols);
 }
 
+// TODO(any): Remove this function
 static void get_color_map_params(const MACROBLOCK *const x, int plane,
                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
                                  COLOR_MAP_TYPE type,
@@ -107,14 +107,11 @@ int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
   assert(plane == 0 || plane == 1);
   Av1ColorMapParam color_map_params;
   get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
-  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
-                            : x->tile_pb_ctx->palette_y_color_index_cdf;
-  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL,
-                               map_pb_cdf);
+  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
 }
 
 void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
-                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
                             COLOR_MAP_TYPE type, int allow_update_cdf,
                             FRAME_COUNTS *counts) {
   assert(plane == 0 || plane == 1);
@@ -122,12 +119,10 @@ void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
   get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
   // The first color index does not use context or entropy.
   (*t)->token = color_map_params.color_map[0];
-  (*t)->color_map_cdf = NULL;
+  (*t)->color_ctx = -1;
   ++(*t);
-  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
-                            : x->tile_pb_ctx->palette_y_color_index_cdf;
   cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
-                        counts, map_pb_cdf);
+                        counts);
 }
 
 static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
@@ -143,16 +138,22 @@ static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   const TX_SIZE plane_tx_size =
-      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
                                     pd->subsampling_y)
             : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
                                                          blk_col)];
 
   if (tx_size == plane_tx_size || plane) {
-    plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                       pd->subsampling_y);
-    av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
-                                      plane_bsize, tx_size, arg);
+    plane_bsize =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+    struct tokenize_b_args *args = arg;
+    if (args->allow_update_cdf)
+      av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+                                        plane_bsize, tx_size, arg);
+    else
+      av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize,
+                             tx_size, arg);
 
   } else {
     // Half the block size in transform block unit.
@@ -160,16 +161,18 @@ static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
     const int bsw = tx_size_wide_unit[sub_txs];
     const int bsh = tx_size_high_unit[sub_txs];
     const int step = bsw * bsh;
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
 
     assert(bsw > 0 && bsh > 0);
 
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
-        const int offsetr = blk_row + row;
+    for (int row = 0; row < row_end; row += bsh) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += bsw) {
         const int offsetc = blk_col + col;
 
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
         tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
                        arg);
         block += step;
@@ -194,7 +197,7 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
 
-  if (mbmi->skip) {
+  if (mbmi->skip_txfm) {
     av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
diff --git a/media/libaom/src/av1/encoder/tokenize.h b/media/libaom/src/av1/encoder/tokenize.h
index 52caacbaee..f675c489ae 100644
--- a/media/libaom/src/av1/encoder/tokenize.h
+++ b/media/libaom/src/av1/encoder/tokenize.h
@@ -20,10 +20,47 @@
 extern "C" {
 #endif
 
+// The token and color_ctx members of the TokenExtra structure are used
+// to store the indices of color and color context of each pixel in
+// case of palette mode.
+// 1) token can take values in the range of [0, 7] as maximum number of possible
+// colors is 8 (PALETTE_COLORS). Hence token requires 3 bits (unsigned).
+// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the
+// most significant bits and token occupies the least significant bits of the
+// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is
+// defined as:
+//   typedef struct {
+//     int8_t color_ctx : 4;
+//     uint8_t token : 3;
+//   } TokenExtra;
+// then read of color_ctx requires an extra left shift to facilitate sign
+// extension and write of token requires an extra masking.
+// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e.,
+// from 0 to 4. As per the current implementation it can take values in the
+// range of [-1, 4]. Here -1 corresponds to invalid color index context and is
+// used for default initialization. Hence color_ctx requires 4 bits (signed).
 typedef struct {
-  aom_cdf_prob *color_map_cdf;
-  uint8_t token;
-} TOKENEXTRA;
+  uint8_t token : 3;
+  uint8_t reserved : 1;
+  int8_t color_ctx : 4;
+} TokenExtra;
+
+typedef struct {
+  TokenExtra *start;
+  unsigned int count;
+} TokenList;
+
+typedef struct {
+  // Number of tile tokens for which memory is allocated.
+  unsigned int tokens_allocated;
+  // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith
+  // tile row, jth tile column.
+  TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+  // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of
+  // palette tokens for the kth superblock row of the ith tile row, jth tile
+  // column.
+  TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+} TokenInfo;
 
 struct AV1_COMP;
 struct ThreadData;
@@ -54,7 +91,7 @@ int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
                        TX_SIZE tx_size, COLOR_MAP_TYPE type);
 
 void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
-                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
                             COLOR_MAP_TYPE type, int allow_update_cdf,
                             struct FRAME_COUNTS *counts);
 
@@ -64,6 +101,57 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Calculate the maximum number of max superblocks in the image.
+  const int shift = sb_size_log2 - 4;
+  const int sb_size = 1 << sb_size_log2;
+  const int sb_size_square = sb_size * sb_size;
+  const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift);
+  const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift);
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+  return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Allocate memory for token related info.
+static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info,
+                                        unsigned int tokens_required) {
+  int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+  token_info->tokens_allocated = tokens_required;
+
+  CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0],
+                  (TokenExtra *)aom_calloc(
+                      tokens_required, sizeof(*token_info->tile_tok[0][0])));
+
+  CHECK_MEM_ERROR(
+      cm, token_info->tplist[0][0],
+      (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                              sizeof(*token_info->tplist[0][0])));
+}
+
+// Check if memory allocation has been done for token related info.
+static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) {
+  return ((token_info->tile_tok[0][0] != NULL) &&
+          (token_info->tplist[0][0] != NULL));
+}
+
+// Free memory from token related variables.
+static AOM_INLINE void free_token_info(TokenInfo *token_info) {
+  aom_free(token_info->tile_tok[0][0]);
+  token_info->tile_tok[0][0] = NULL;
+
+  aom_free(token_info->tplist[0][0]);
+  token_info->tplist[0][0] = NULL;
+
+  token_info->tokens_allocated = 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/tpl_model.c b/media/libaom/src/av1/encoder/tpl_model.c
index 79b94f3739..238be92f34 100644
--- a/media/libaom/src/av1/encoder/tpl_model.c
+++ b/media/libaom/src/av1/encoder/tpl_model.c
@@ -12,12 +12,12 @@
 #include <stdint.h>
 #include <float.h>
 
+#include "av1/encoder/thirdpass.h"
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/enums.h"
@@ -25,12 +25,75 @@
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tpl_model.h"
 
+static INLINE double exp_bounded(double v) {
+  // When v > 700 or <-700, the exp function will be close to overflow
+  // For details, see the "Notes" in the following link.
+  // https://en.cppreference.com/w/c/numeric/math/exp
+  if (v > 700) {
+    return DBL_MAX;
+  } else if (v < -700) {
+    return 0;
+  }
+  return exp(v);
+}
+
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) {
+  tpl_txfm_stats->ready = 0;
+  tpl_txfm_stats->coeff_num = 256;
+  tpl_txfm_stats->txfm_block_count = 0;
+  memset(tpl_txfm_stats->abs_coeff_sum, 0,
+         sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num);
+  memset(tpl_txfm_stats->abs_coeff_mean, 0,
+         sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num);
+}
+
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats) {
+  accumulated_stats->txfm_block_count += sub_stats->txfm_block_count;
+  for (int i = 0; i < accumulated_stats->coeff_num; ++i) {
+    accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i];
+  }
+}
+
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff) {
+  // For transform larger than 16x16, the scale of coeff need to be adjusted.
+  // It's not LOSSLESS_Q_STEP.
+  assert(tpl_txfm_stats->coeff_num <= 256);
+  for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) {
+    tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP;
+  }
+  ++tpl_txfm_stats->txfm_block_count;
+}
+
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) {
+  if (txfm_stats->txfm_block_count > 0) {
+    for (int j = 0; j < txfm_stats->coeff_num; j++) {
+      txfm_stats->abs_coeff_mean[j] =
+          txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count;
+    }
+    txfm_stats->ready = 1;
+  } else {
+    txfm_stats->ready = 0;
+  }
+}
+
+static AOM_INLINE void av1_tpl_store_txfm_stats(
+    TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats,
+    const int frame_index) {
+  tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats;
+}
+
 static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
                                           const tran_low_t *coeff,
                                           tran_low_t *qcoeff,
@@ -38,60 +101,124 @@ static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
                                           uint16_t *eob, int64_t *recon_error,
                                           int64_t *sse) {
   const struct macroblock_plane *const p = &x->plane[plane];
-  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
   int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
   const int shift = tx_size == TX_32X32 ? 0 : 2;
 
-  av1_quantize_fp(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
-                  p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
-                  scan_order->scan, scan_order->iscan);
+  QUANT_PARAM quant_param;
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
 
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+                                  scan_order, &quant_param);
+    *recon_error =
+        av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
+  } else {
+    av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+                           &quant_param);
+    *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  }
+#else
+  (void)xd;
+  av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+                         &quant_param);
   *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
   *recon_error = AOMMAX(*recon_error, 1);
 
   *sse = (*sse) >> shift;
   *sse = AOMMAX(*sse, 1);
 }
 
-static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
-                                    tran_low_t *coeff, TX_SIZE tx_size,
-                                    int bit_depth, int is_hbd) {
-  TxfmParam txfm_param;
-  txfm_param.tx_type = DCT_DCT;
-  txfm_param.tx_size = tx_size;
-  txfm_param.lossless = 0;
-  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
-
-  txfm_param.bd = bit_depth;
-  txfm_param.is_hbd = is_hbd;
-  av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
+static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2,
+                                                uint8_t *tpl_bsize_1d) {
+  // tpl stats bsize: 2 means 16x16
+  *block_mis_log2 = 2;
+  // Block size used in tpl motion estimation
+  *tpl_bsize_1d = 16;
+  // MIN_TPL_BSIZE_1D = 16;
+  assert(*tpl_bsize_1d >= 16);
 }
 
-static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
+void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
+                           CommonModeInfoParams *const mi_params, int width,
+                           int height, int byte_alignment, int lag_in_frames) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  TplParams *const tpl_data = &ppi->tpl_data;
+  set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+                           &tpl_data->tpl_bsize_1d);
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  tpl_data->border_in_pixels =
+      ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
+
+  const int alloc_y_plane_only =
+      ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0;
+  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+    const int mi_cols =
+        ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+    const int mi_rows =
+        ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+    TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame];
+    tpl_frame->is_valid = 0;
+    tpl_frame->width = mi_cols >> block_mis_log2;
+    tpl_frame->height = mi_rows >> block_mis_log2;
+    tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width;
+    tpl_frame->mi_rows = mi_params->mi_rows;
+    tpl_frame->mi_cols = mi_params->mi_cols;
+  }
+  tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+
+  // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory
+  // allocations are avoided for buffers in tpl_data.
+  if (lag_in_frames <= 1) return;
+
+  AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list,
+                      aom_calloc(MAX_LENGTH_TPL_FRAME_STATS,
+                                 sizeof(*tpl_data->txfm_stats_list)));
+
+  for (int frame = 0; frame < lag_in_frames; ++frame) {
+    AOM_CHECK_MEM_ERROR(
+        &ppi->error, tpl_data->tpl_stats_pool[frame],
+        aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+                       tpl_data->tpl_stats_buffer[frame].height,
+                   sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+
+    if (aom_alloc_frame_buffer(
+            &tpl_data->tpl_rec_pool[frame], width, height,
+            seq_params->subsampling_x, seq_params->subsampling_y,
+            seq_params->use_highbitdepth, tpl_data->border_in_pixels,
+            byte_alignment, alloc_y_plane_only))
+      aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate frame buffer");
+  }
+}
+
+static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info,
                                             int16_t *src_diff, int diff_stride,
                                             const uint8_t *src, int src_stride,
                                             const uint8_t *dst, int dst_stride,
                                             tran_low_t *coeff, int bw, int bh,
                                             TX_SIZE tx_size) {
-  const MACROBLOCKD *xd = &x->e_mbd;
   const int pix_num = bw * bh;
 
-  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
-                     dst_stride);
-  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+                     dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
   return aom_satd(coeff, pix_num);
 }
 
 static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
-  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
 
   assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
-
   int rate_cost = 1;
 
   for (int idx = 0; idx < eob; ++idx) {
     int abs_level = abs(qcoeff[scan_order->scan[idx]]);
-    rate_cost += (int)(log(abs_level + 1.0) / log(2.0)) + 1;
+    rate_cost += (int)(log(abs_level + 1.0) / log(2.0)) + 1 + (abs_level > 0);
   }
 
   return (rate_cost << AV1_PROB_COST_SHIFT);
@@ -103,11 +230,11 @@ static AOM_INLINE void txfm_quant_rdcost(
     tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
     int *rate_cost, int64_t *recon_error, int64_t *sse) {
   const MACROBLOCKD *xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
   uint16_t eob;
-  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
-                     dst_stride);
-  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
-               is_cur_buf_hbd(xd));
+  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+                     dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
 
   get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
                      sse);
@@ -142,15 +269,18 @@ static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
   step_param = tpl_sf->reduce_first_step_size;
   step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
-  search_site_config *ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
-  if (ss_cfg->stride != stride_ref)
-    ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
-
-  assert(ss_cfg->stride == stride_ref);
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+  if (search_site_cfg->stride != stride_ref)
+    search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+  assert(search_site_cfg->stride == stride_ref);
 
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     ss_cfg);
+                                     search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg,
+                           tpl_sf->search_method);
 
   av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                         cond_cost_list(cpi, cost_list), &best_mv->as_fullmv,
@@ -170,7 +300,21 @@ static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }
 
-static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
+typedef struct {
+  int_mv mv;
+  int sad;
+} center_mv_t;
+
+static int compare_sad(const void *a, const void *b) {
+  const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs,
                        int center_mvs_count, int skip_alike_starting_mv) {
   // MV difference threshold is in 1/8 precision.
   const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
@@ -178,26 +322,133 @@ static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
   int i;
 
   for (i = 0; i < center_mvs_count; i++) {
-    if (abs(center_mvs[i].as_mv.col - candidate_mv.as_mv.col) < thr &&
-        abs(center_mvs[i].as_mv.row - candidate_mv.as_mv.row) < thr)
+    if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr &&
+        abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr)
       return 1;
   }
 
   return 0;
 }
 
-static AOM_INLINE void mode_estimation(
-    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf,
-    int frame_idx, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-    const YV12_BUFFER_CONFIG *ref_frame[],
-    const YV12_BUFFER_CONFIG *src_ref_frame[], TplDepStats *tpl_stats) {
+static void get_rate_distortion(
+    int *rate_cost, int64_t *recon_error, int64_t *pred_error,
+    int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff,
+    tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
+    const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
+    const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
+    int mi_row, int mi_col, int use_y_only_rate_distortion,
+    TplTxfmStats *tpl_txfm_stats) {
+  const SequenceHeader *seq_params = cm->seq_params;
+  *rate_cost = 0;
+  *recon_error = 1;
+  *pred_error = 1;
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  int is_compound = (best_mode == NEW_NEWMV);
+  int num_planes = use_y_only_rate_distortion ? 1 : MAX_MB_PLANE;
+
+  uint8_t *src_buffer_pool[MAX_MB_PLANE] = {
+    xd->cur_buf->y_buffer,
+    xd->cur_buf->u_buffer,
+    xd->cur_buf->v_buffer,
+  };
+  const int src_stride_pool[MAX_MB_PLANE] = {
+    xd->cur_buf->y_stride,
+    xd->cur_buf->uv_stride,
+    xd->cur_buf->uv_stride,
+  };
+
+  const int_interpfilters kernel =
+      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    BLOCK_SIZE bsize_plane =
+        ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
+                      [pd->subsampling_y];
+
+    int dst_buffer_stride = rec_stride_pool[plane];
+    int dst_mb_offset =
+        ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) +
+        ((mi_col * MI_SIZE) >> pd->subsampling_x);
+    uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset;
+    for (int ref = 0; ref < 1 + is_compound; ++ref) {
+      if (!is_inter_mode(best_mode)) {
+        av1_predict_intra_block(
+            xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+            block_size_wide[bsize_plane], block_size_high[bsize_plane],
+            max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0,
+            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer,
+            dst_buffer_stride, 0, 0, plane);
+      } else {
+        int_mv best_mv = xd->mi[0]->mv[ref];
+        uint8_t *ref_buffer_pool[MAX_MB_PLANE] = {
+          ref_frame_ptr[ref]->y_buffer,
+          ref_frame_ptr[ref]->u_buffer,
+          ref_frame_ptr[ref]->v_buffer,
+        };
+        InterPredParams inter_pred_params;
+        struct buf_2d ref_buf = {
+          NULL, ref_buffer_pool[plane],
+          plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width,
+          plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height,
+          plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride
+        };
+        av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane],
+                              block_size_high[bsize_plane],
+                              (mi_row * MI_SIZE) >> pd->subsampling_y,
+                              (mi_col * MI_SIZE) >> pd->subsampling_x,
+                              pd->subsampling_x, pd->subsampling_y, xd->bd,
+                              is_cur_buf_hbd(xd), 0,
+                              xd->block_ref_scale_factors[0], &ref_buf, kernel);
+        if (is_compound) av1_init_comp_mode(&inter_pred_params);
+        inter_pred_params.conv_params = get_conv_params_no_round(
+            ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+        av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
+                                          &best_mv.as_mv, &inter_pred_params);
+      }
+    }
+
+    int src_stride = src_stride_pool[plane];
+    int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) +
+                        ((mi_col * MI_SIZE) >> pd->subsampling_x);
+
+    int this_rate = 1;
+    int64_t this_recon_error = 1;
+    int64_t sse;
+    txfm_quant_rdcost(
+        x, src_diff, block_size_wide[bsize_plane],
+        src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer,
+        dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane],
+        block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
+        &this_rate, &this_recon_error, &sse);
+
+    if (plane == 0 && tpl_txfm_stats) {
+      // We only collect Y plane's transform coefficient
+      av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+    }
+
+    *recon_error += this_recon_error;
+    *pred_error += sse;
+    *rate_cost += this_rate;
+  }
+}
+
+static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
+                                       TplTxfmStats *tpl_txfm_stats,
+                                       MACROBLOCK *x, int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                       TplDepStats *tpl_stats) {
   AV1_COMMON *cm = &cpi->common;
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
 
   (void)gf_group;
 
-  TplParams *tpl_data = &cpi->tpl_data;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  MACROBLOCKD *xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
   const int bw = 4 << mi_size_wide_log2[bsize];
@@ -205,31 +456,66 @@ static AOM_INLINE void mode_estimation(
   const int_interpfilters kernel =
       av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
+  int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
+
   int64_t best_intra_cost = INT64_MAX;
   int64_t intra_cost;
   PREDICTION_MODE best_mode = DC_PRED;
 
   int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
   uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
-  const int src_stride = xd->cur_buf->y_stride;
+  int src_stride = xd->cur_buf->y_stride;
 
-  const int dst_mb_offset =
+  int dst_mb_offset =
       mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
   uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
-  const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
-
-  // Temporaray buffers
-  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
-  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+  int use_y_only_rate_distortion = cpi->sf.tpl_sf.use_y_only_rate_distortion;
+
+  uint8_t *rec_buffer_pool[3] = {
+    tpl_frame->rec_picture->y_buffer,
+    tpl_frame->rec_picture->u_buffer,
+    tpl_frame->rec_picture->v_buffer,
+  };
+
+  const int rec_stride_pool[3] = {
+    tpl_frame->rec_picture->y_stride,
+    tpl_frame->rec_picture->uv_stride,
+    tpl_frame->rec_picture->uv_stride,
+  };
+
+  for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    pd->subsampling_x = xd->cur_buf->subsampling_x;
+    pd->subsampling_y = xd->cur_buf->subsampling_y;
+  }
+
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_data->tpl_bsize_1d * tpl_data->tpl_bsize_1d;
+  // Allocate temporary buffers used in motion estimation.
+  uint8_t *predictor8 = aom_memalign(32, tpl_block_pels * 2 * sizeof(uint8_t));
+  int16_t *src_diff = aom_memalign(32, tpl_block_pels * sizeof(int16_t));
+  tran_low_t *coeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *qcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
   uint8_t *predictor =
       is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
-  int64_t recon_error = 1, sse = 1;
+  int64_t recon_error = 1;
+  int64_t pred_error = 1;
+
+  if (!(predictor8 && src_diff && coeff && qcoeff && dqcoeff)) {
+    aom_free(predictor8);
+    aom_free(src_diff);
+    aom_free(coeff);
+    aom_free(qcoeff);
+    aom_free(dqcoeff);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating tpl data");
+  }
 
   memset(tpl_stats, 0, sizeof(*tpl_stats));
+  tpl_stats->ref_frame_index[0] = -1;
+  tpl_stats->ref_frame_index[1] = -1;
 
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
@@ -239,7 +525,7 @@ static AOM_INLINE void mode_estimation(
                  cm->mi_params.mi_rows, cm->mi_params.mi_cols);
   set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
                av1_num_planes(cm));
-  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->bsize = bsize;
   xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
 
   // Intra prediction search
@@ -248,7 +534,6 @@ static AOM_INLINE void mode_estimation(
   // Pre-load the bottom left line.
   if (xd->left_available &&
       mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
-#if CONFIG_AV1_HIGHBITDEPTH
     if (is_cur_buf_hbd(xd)) {
       uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
       for (int i = 0; i < bw; ++i)
@@ -259,26 +544,24 @@ static AOM_INLINE void mode_estimation(
         dst_buffer[(bw + i) * dst_buffer_stride - 1] =
             dst_buffer[(bw - 1) * dst_buffer_stride - 1];
     }
-#else
-    for (int i = 0; i < bw; ++i)
-      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
-          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
-#endif
   }
 
   // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
   // H_PRED, and V_PRED
   const PREDICTION_MODE last_intra_mode =
       cpi->sf.tpl_sf.prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+  const SequenceHeader *seq_params = cm->seq_params;
   for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
        ++mode) {
-    av1_predict_intra_block(cm, xd, block_size_wide[bsize],
-                            block_size_high[bsize], tx_size, mode, 0, 0,
-                            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
-                            predictor, bw, 0, 0, 0);
+    av1_predict_intra_block(xd, seq_params->sb_size,
+                            seq_params->enable_intra_edge_filter,
+                            block_size_wide[bsize], block_size_high[bsize],
+                            tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+                            dst_buffer_stride, predictor, bw, 0, 0, 0);
 
-    intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
-                                   predictor, bw, coeff, bw, bh, tx_size);
+    intra_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
 
     if (intra_cost < best_intra_cost) {
       best_intra_cost = intra_cost;
@@ -286,24 +569,60 @@ static AOM_INLINE void mode_estimation(
     }
   }
 
+  if (cpi->third_pass_ctx &&
+      frame_offset < cpi->third_pass_ctx->frame_info_count &&
+      tpl_data->frame_idx < gf_group->size) {
+    double ratio_h, ratio_w;
+    av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+                             cm->width, &ratio_h, &ratio_w);
+    THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+        cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+    PREDICTION_MODE third_pass_mode = this_mi->pred_mode;
+
+    if (third_pass_mode >= last_intra_mode &&
+        third_pass_mode < INTRA_MODE_END) {
+      av1_predict_intra_block(
+          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+          block_size_wide[bsize], block_size_high[bsize], tx_size,
+          third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+          dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+      intra_cost =
+          tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                            predictor, bw, coeff, bw, bh, tx_size);
+
+      if (intra_cost < best_intra_cost) {
+        best_intra_cost = intra_cost;
+        best_mode = third_pass_mode;
+      }
+    }
+  }
+
   // Motion compensated prediction
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->ref_frame[1] = NONE_FRAME;
+  xd->mi[0]->compound_idx = 1;
 
   int best_rf_idx = -1;
-  int_mv best_mv;
+  int_mv best_mv[2];
   int64_t inter_cost;
   int64_t best_inter_cost = INT64_MAX;
   int rf_idx;
+  int_mv single_mv[INTER_REFS_PER_FRAME];
 
-  best_mv.as_int = INVALID_MV;
+  best_mv[0].as_int = INVALID_MV;
+  best_mv[1].as_int = INVALID_MV;
 
   for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
-    if (ref_frame[rf_idx] == NULL || src_ref_frame[rf_idx] == NULL) {
+    single_mv[rf_idx].as_int = INVALID_MV;
+    if (tpl_data->ref_frame[rf_idx] == NULL ||
+        tpl_data->src_ref_frame[rf_idx] == NULL) {
       tpl_stats->mv[rf_idx].as_int = INVALID_MV;
       continue;
     }
 
-    const YV12_BUFFER_CONFIG *ref_frame_ptr = src_ref_frame[rf_idx];
+    const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
     int ref_mb_offset =
         mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
     uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
@@ -312,15 +631,19 @@ static AOM_INLINE void mode_estimation(
     int_mv best_rfidx_mv = { 0 };
     uint32_t bestsme = UINT32_MAX;
 
-    int_mv center_mvs[4] = { { 0 } };
+    center_mv_t center_mvs[4] = { { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX },
+                                  { { 0 }, INT_MAX } };
     int refmv_count = 1;
+    int idx;
 
     if (xd->up_available) {
       TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
           mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
@@ -330,7 +653,7 @@ static AOM_INLINE void mode_estimation(
           mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
@@ -341,16 +664,59 @@ static AOM_INLINE void mode_estimation(
           block_mis_log2)];
       if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
                        cpi->sf.tpl_sf.skip_alike_starting_mv)) {
-        center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int;
+        center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
         ++refmv_count;
       }
     }
 
-    for (int idx = 0; idx < refmv_count; ++idx) {
+    if (cpi->third_pass_ctx &&
+        frame_offset < cpi->third_pass_ctx->frame_info_count &&
+        tpl_data->frame_idx < gf_group->size) {
+      double ratio_h, ratio_w;
+      av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+                               cm->width, &ratio_h, &ratio_w);
+      THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+          cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+      int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w,
+                                                    rf_idx + LAST_FRAME);
+      if (tp_mv.as_int != INVALID_MV &&
+          !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1,
+                       cpi->sf.tpl_sf.skip_alike_starting_mv)) {
+        center_mvs[0].mv = tp_mv;
+      }
+    }
+
+    // Prune starting mvs
+    if (cpi->sf.tpl_sf.prune_starting_mv) {
+      // Get each center mv's sad.
+      for (idx = 0; idx < refmv_count; ++idx) {
+        FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
+        clamp_fullmv(&mv, &x->mv_limits);
+        center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf(
+            src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
+            ref_stride);
+      }
+
+      // Rank center_mv using sad.
+      if (refmv_count > 1) {
+        qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+      }
+      refmv_count = AOMMIN(4 - cpi->sf.tpl_sf.prune_starting_mv, refmv_count);
+      // Further reduce number of refmv based on sad difference.
+      if (refmv_count > 1) {
+        int last_sad = center_mvs[refmv_count - 1].sad;
+        int second_to_last_sad = center_mvs[refmv_count - 2].sad;
+        if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad)
+          refmv_count--;
+      }
+    }
+
+    for (idx = 0; idx < refmv_count; ++idx) {
       int_mv this_mv;
-      uint32_t thissme =
-          motion_estimation(cpi, x, src_mb_buffer, ref_mb, src_stride,
-                            ref_stride, bsize, center_mvs[idx].as_mv, &this_mv);
+      uint32_t thissme = motion_estimation(cpi, x, src_mb_buffer, ref_mb,
+                                           src_stride, ref_stride, bsize,
+                                           center_mvs[idx].mv.as_mv, &this_mv);
 
       if (thissme < bestsme) {
         bestsme = thissme;
@@ -359,6 +725,7 @@ static AOM_INLINE void mode_estimation(
     }
 
     tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
+    single_mv[rf_idx] = best_rfidx_mv;
 
     struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
                               ref_frame_ptr->y_width, ref_frame_ptr->y_height,
@@ -366,90 +733,249 @@ static AOM_INLINE void mode_estimation(
     InterPredParams inter_pred_params;
     av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
                           mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          sf, &ref_buf, kernel);
+                          &tpl_data->sf, &ref_buf, kernel);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
                                       &inter_pred_params);
 
-    inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
-                                   predictor, bw, coeff, bw, bh, tx_size);
+    inter_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
     // Store inter cost for each ref frame
     tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
 
     if (inter_cost < best_inter_cost) {
-      memcpy(best_coeff, coeff, sizeof(best_coeff));
       best_rf_idx = rf_idx;
 
       best_inter_cost = inter_cost;
-      best_mv.as_int = best_rfidx_mv.as_int;
+      best_mv[0].as_int = best_rfidx_mv.as_int;
       if (best_inter_cost < best_intra_cost) {
         best_mode = NEWMV;
         xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME;
-        xd->mi[0]->mv[0].as_int = best_mv.as_int;
+        xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
       }
     }
   }
 
-  if (best_inter_cost < INT64_MAX) {
-    uint16_t eob;
-    get_quantize_error(x, 0, best_coeff, qcoeff, dqcoeff, tx_size, &eob,
-                       &recon_error, &sse);
+  int comp_ref_frames[3][2] = {
+    { 0, 4 },
+    { 0, 6 },
+    { 3, 6 },
+  };
+
+  int start_rf = 0;
+  int end_rf = 3;
+  if (!cpi->sf.tpl_sf.allow_compound_pred) end_rf = 0;
+  if (cpi->third_pass_ctx &&
+      frame_offset < cpi->third_pass_ctx->frame_info_count &&
+      tpl_data->frame_idx < gf_group->size) {
+    double ratio_h, ratio_w;
+    av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+                             cm->width, &ratio_h, &ratio_w);
+    THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+        cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+    if (this_mi->ref_frame[0] >= LAST_FRAME &&
+        this_mi->ref_frame[1] >= LAST_FRAME) {
+      int found = 0;
+      for (int i = 0; i < 3; i++) {
+        if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] &&
+            comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) {
+          found = 1;
+          break;
+        }
+      }
+      if (!found || !cpi->sf.tpl_sf.allow_compound_pred) {
+        comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME;
+        comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME;
+        if (!cpi->sf.tpl_sf.allow_compound_pred) {
+          start_rf = 2;
+          end_rf = 3;
+        }
+      }
+    }
+  }
 
-    const int rate_cost = rate_estimator(qcoeff, eob, tx_size);
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
+  int best_cmp_rf_idx = -1;
+  for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
+    int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
+    int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
+
+    if (tpl_data->ref_frame[rf_idx0] == NULL ||
+        tpl_data->src_ref_frame[rf_idx0] == NULL ||
+        tpl_data->ref_frame[rf_idx1] == NULL ||
+        tpl_data->src_ref_frame[rf_idx1] == NULL) {
+      continue;
+    }
+
+    const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+      tpl_data->src_ref_frame[rf_idx0],
+      tpl_data->src_ref_frame[rf_idx1],
+    };
+
+    xd->mi[0]->ref_frame[0] = LAST_FRAME;
+    xd->mi[0]->ref_frame[1] = ALTREF_FRAME;
+
+    struct buf_2d yv12_mb[2][MAX_MB_PLANE];
+    for (int i = 0; i < 2; ++i) {
+      av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i],
+                           xd->block_ref_scale_factors[i],
+                           xd->block_ref_scale_factors[i], MAX_MB_PLANE);
+      for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        xd->plane[plane].pre[i] = yv12_mb[i][plane];
+      }
+    }
+
+    int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] };
+    int rate_mv;
+    av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv,
+                            !cpi->sf.mv_sf.disable_second_mv);
+
+    for (int ref = 0; ref < 2; ++ref) {
+      struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer,
+                                ref_frame_ptr[ref]->y_width,
+                                ref_frame_ptr[ref]->y_height,
+                                ref_frame_ptr[ref]->y_stride };
+      InterPredParams inter_pred_params;
+      av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+                            mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
+                            0, &tpl_data->sf, &ref_buf, kernel);
+      av1_init_comp_mode(&inter_pred_params);
+
+      inter_pred_params.conv_params = get_conv_params_no_round(
+          ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd);
+
+      av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv,
+                                        &inter_pred_params);
+    }
+    inter_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
+    if (inter_cost < best_inter_cost) {
+      best_cmp_rf_idx = cmp_rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv[0] = tmp_mv[0];
+      best_mv[1] = tmp_mv[1];
+
+      if (best_inter_cost < best_intra_cost) {
+        best_mode = NEW_NEWMV;
+        xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME;
+        xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME;
+      }
+    }
+  }
+
+  if (best_inter_cost < INT64_MAX) {
+    xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+    xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
+    const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+      best_cmp_rf_idx >= 0
+          ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+          : tpl_data->src_ref_frame[best_rf_idx],
+      best_cmp_rf_idx >= 0
+          ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+          : NULL,
+    };
+    int rate_cost = 1;
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+                        rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
   }
 
   best_intra_cost = AOMMAX(best_intra_cost, 1);
-  if (frame_idx == 0) {
-    best_inter_cost = 0;
-  } else {
-    best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
-  }
+  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
   tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
 
-  tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
+  tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
 
   // Final encode
-  if (is_inter_mode(best_mode)) {
-    const YV12_BUFFER_CONFIG *ref_frame_ptr = ref_frame[best_rf_idx];
-
-    InterPredParams inter_pred_params;
-    struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
-                              ref_frame_ptr->y_width, ref_frame_ptr->y_height,
-                              ref_frame_ptr->y_stride };
-    av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
-                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          sf, &ref_buf, kernel);
-    inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
-
-    av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
-                                      &best_mv.as_mv, &inter_pred_params);
-  } else {
-    av1_predict_intra_block(cm, xd, block_size_wide[bsize],
-                            block_size_high[bsize], tx_size, best_mode, 0, 0,
-                            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
-                            dst_buffer, dst_buffer_stride, 0, 0, 0);
-  }
-
-  int rate_cost;
-  txfm_quant_rdcost(x, src_diff, bw, src_mb_buffer, src_stride, dst_buffer,
-                    dst_buffer_stride, coeff, qcoeff, dqcoeff, bw, bh, tx_size,
-                    &rate_cost, &recon_error, &sse);
+  int rate_cost = 0;
+  const YV12_BUFFER_CONFIG *ref_frame_ptr[2];
+
+  ref_frame_ptr[0] =
+      best_mode == NEW_NEWMV
+          ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+          : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx] : NULL;
+  ref_frame_ptr[1] =
+      best_mode == NEW_NEWMV
+          ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+          : NULL;
+  get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                      qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+                      rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+                      use_y_only_rate_distortion, tpl_txfm_stats);
 
   tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
   tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+
   if (!is_inter_mode(best_mode)) {
     tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
     tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
   }
+
   tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
   tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
 
-  if (best_rf_idx >= 0) {
-    tpl_stats->mv[best_rf_idx].as_int = best_mv.as_int;
-    tpl_stats->ref_frame_index = best_rf_idx;
+  if (best_mode == NEW_NEWMV) {
+    ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+    ref_frame_ptr[1] =
+        tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+                        rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+                        use_y_only_rate_distortion, NULL);
+    tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->cmp_recrf_rate[0] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+
+    tpl_stats->cmp_recrf_dist[0] =
+        AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
+    tpl_stats->cmp_recrf_rate[0] =
+        AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+    tpl_stats->cmp_recrf_dist[0] =
+        AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]);
+    tpl_stats->cmp_recrf_rate[0] =
+        AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+    rate_cost = 0;
+    ref_frame_ptr[0] =
+        tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+    ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+                        rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+                        use_y_only_rate_distortion, NULL);
+    tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->cmp_recrf_rate[1] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+
+    tpl_stats->cmp_recrf_dist[1] =
+        AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
+    tpl_stats->cmp_recrf_rate[1] =
+        AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]);
+
+    tpl_stats->cmp_recrf_dist[1] =
+        AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]);
+    tpl_stats->cmp_recrf_rate[1] =
+        AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]);
+  }
+
+  if (best_mode == NEWMV) {
+    tpl_stats->mv[best_rf_idx] = best_mv[0];
+    tpl_stats->ref_frame_index[0] = best_rf_idx;
+    tpl_stats->ref_frame_index[1] = NONE_FRAME;
+  } else if (best_mode == NEW_NEWMV) {
+    tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0];
+    tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1];
+    tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0];
+    tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1];
   }
 
   for (int idy = 0; idy < mi_height; ++idy) {
@@ -460,6 +986,13 @@ static AOM_INLINE void mode_estimation(
       }
     }
   }
+
+  // Free temporary buffers.
+  aom_free(predictor8);
+  aom_free(src_diff);
+  aom_free(coeff);
+  aom_free(qcoeff);
+  aom_free(dqcoeff);
 }
 
 static int round_floor(int ref_pos, int bsize_pix) {
@@ -472,41 +1005,24 @@ static int round_floor(int ref_pos, int bsize_pix) {
   return round;
 }
 
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
-                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int width = 0, height = 0;
-  int bw = 4 << mi_size_wide_log2[bsize];
-  int bh = 4 << mi_size_high_log2[bsize];
-
-  switch (block) {
-    case 0:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 1:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 2:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    case 3:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    default: assert(0);
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                         int height) {
+  int min_row = AOMMAX(row_a, row_b);
+  int max_row = AOMMIN(row_a + height, row_b + height);
+  int min_col = AOMMAX(col_a, col_b);
+  int max_col = AOMMIN(col_a + width, col_b + width);
+  if (min_row < max_row && min_col < max_col) {
+    return (max_row - min_row) * (max_col - min_col);
   }
-
-  return width * height;
+  return 0;
 }
 
 int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
   return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
 }
 
-static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
-                               int64_t srcrf_dist, int pix_num) {
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                            int64_t srcrf_dist, int pix_num) {
   double beta = (double)srcrf_dist / recrf_dist;
   int64_t rate_cost = delta_rate;
 
@@ -536,7 +1052,7 @@ static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
 
 static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
                                           int mi_col, const BLOCK_SIZE bsize,
-                                          int frame_idx) {
+                                          int frame_idx, int ref) {
   TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
   TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
   TplDepFrame *tpl_frame = tpl_data->tpl_frame;
@@ -544,8 +1060,10 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
   TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos(
       mi_row, mi_col, tpl_frame->stride, block_mis_log2)];
 
-  if (tpl_stats_ptr->ref_frame_index < 0) return;
-  const int ref_frame_index = tpl_stats_ptr->ref_frame_index;
+  int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0;
+
+  if (tpl_stats_ptr->ref_frame_index[ref] < 0) return;
+  const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref];
   TplDepFrame *ref_tpl_frame =
       &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]];
   TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr;
@@ -568,15 +1086,20 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
   int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
   int block;
 
-  int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist;
-  int64_t mc_dep_dist = (int64_t)(
-      tpl_stats_ptr->mc_dep_dist *
-      ((double)(tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist) /
-       tpl_stats_ptr->recrf_dist));
-  int64_t delta_rate = tpl_stats_ptr->recrf_rate - tpl_stats_ptr->srcrf_rate;
+  int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
+                                   : tpl_stats_ptr->srcrf_dist;
+  int64_t srcrf_rate = is_compound ? tpl_stats_ptr->cmp_recrf_rate[!ref]
+                                   : tpl_stats_ptr->srcrf_rate;
+
+  int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
+  int64_t mc_dep_dist =
+      (int64_t)(tpl_stats_ptr->mc_dep_dist *
+                ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
+                 tpl_stats_ptr->recrf_dist));
+  int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate;
   int64_t mc_dep_rate =
-      delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
-                      tpl_stats_ptr->srcrf_dist, pix_num);
+      av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+                          srcrf_dist, pix_num);
 
   for (block = 0; block < 4; ++block) {
     int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
@@ -584,145 +1107,137 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
 
     if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
         grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
-      int overlap_area = get_overlap_area(
-          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col,
+                                              ref_pos_row, ref_pos_col, bw, bh);
       int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
       int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-      const int step = 1 << block_mis_log2;
-
-      for (int idy = 0; idy < mi_height; idy += step) {
-        for (int idx = 0; idx < mi_width; idx += step) {
-          TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
-              ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride,
-              block_mis_log2)];
-          des_stats->mc_dep_dist +=
-              ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
-          des_stats->mc_dep_rate +=
-              ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
-
-          assert(overlap_area >= 0);
-        }
-      }
+      assert((1 << block_mis_log2) == mi_height);
+      assert((1 << block_mis_log2) == mi_width);
+      TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+          ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)];
+      des_stats->mc_dep_dist +=
+          ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+      des_stats->mc_dep_rate +=
+          ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
     }
   }
 }
 
 static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row,
-                                        int mi_col, const BLOCK_SIZE bsize,
-                                        int frame_idx) {
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-  const BLOCK_SIZE tpl_block_size =
+                                        int mi_col, int frame_idx) {
+  const BLOCK_SIZE tpl_stats_block_size =
       convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
-
-  for (int idy = 0; idy < mi_height; idy += step) {
-    for (int idx = 0; idx < mi_width; idx += step) {
-      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size,
-                         frame_idx);
-    }
-  }
+  tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+                     0);
+  tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+                     1);
 }
 
 static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row,
-                                       int mi_col, BLOCK_SIZE bsize, int stride,
+                                       int mi_col, int stride,
                                        const TplDepStats *src_stats,
                                        uint8_t block_mis_log2) {
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-  const int step = 1 << block_mis_log2;
-
-  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
-  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-  int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width);
-  int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width);
-  int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width);
-  int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width);
-
-  intra_cost = AOMMAX(1, intra_cost);
-  inter_cost = AOMMAX(1, inter_cost);
-  srcrf_dist = AOMMAX(1, srcrf_dist);
-  recrf_dist = AOMMAX(1, recrf_dist);
-  srcrf_rate = AOMMAX(1, srcrf_rate);
-  recrf_rate = AOMMAX(1, recrf_rate);
-
-  for (int idy = 0; idy < mi_height; idy += step) {
-    TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos(
-        mi_row + idy, mi_col, stride, block_mis_log2)];
-    for (int idx = 0; idx < mi_width; idx += step) {
-      tpl_ptr->intra_cost = intra_cost;
-      tpl_ptr->inter_cost = inter_cost;
-      tpl_ptr->srcrf_dist = srcrf_dist;
-      tpl_ptr->recrf_dist = recrf_dist;
-      tpl_ptr->srcrf_rate = srcrf_rate;
-      tpl_ptr->recrf_rate = recrf_rate;
-      memcpy(tpl_ptr->mv, src_stats->mv, sizeof(tpl_ptr->mv));
-      memcpy(tpl_ptr->pred_error, src_stats->pred_error,
-             sizeof(tpl_ptr->pred_error));
-      tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
-      ++tpl_ptr;
-    }
+  int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2);
+  TplDepStats *tpl_ptr = &tpl_stats_ptr[index];
+  *tpl_ptr = *src_stats;
+  tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost);
+  tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost);
+  tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist);
+  tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse);
+  tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist);
+  tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate);
+  tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate);
+  tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]);
+  tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]);
+  tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]);
+  tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]);
+}
+
+// Reset the ref and source frame pointers of tpl_data.
+static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    tpl_data->ref_frame[i] = NULL;
+    tpl_data->src_ref_frame[i] = NULL;
   }
 }
 
-static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
-                                         int pframe_qindex) {
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (frame_idx == gf_group->size) return;
-  TplParams *const tpl_data = &cpi->tpl_data;
+static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) {
+  int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1);
+  return gop_length;
+}
+
+// Initialize the mc_flow parameters used in computing tpl data.
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+                                              int pframe_qindex) {
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
-  const YV12_BUFFER_CONFIG *ref_frame[7] = { NULL, NULL, NULL, NULL,
-                                             NULL, NULL, NULL };
   const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+  uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+      gf_group, cpi->sf.inter_sf.selective_ref_frame,
+      cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx);
+  int gop_length = get_gop_length(gf_group);
   int ref_frame_flags;
-  const YV12_BUFFER_CONFIG *src_frame[7] = { NULL, NULL, NULL, NULL,
-                                             NULL, NULL, NULL };
-
   AV1_COMMON *cm = &cpi->common;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  struct scale_factors sf;
   int rdmult, idx;
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  int mi_row, mi_col;
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+  tpl_data->frame_idx = frame_idx;
+  tpl_reset_src_ref_frames(tpl_data);
   av1_tile_init(&xd->tile, cm, 0, 0);
 
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int mi_width = mi_size_wide[bsize];
-
   // Setup scaling factor
   av1_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
       this_frame->y_crop_width, this_frame->y_crop_height);
 
   xd->cur_buf = this_frame;
 
   for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
-    ref_frame[idx] =
-        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].rec_picture;
-    src_frame[idx] =
-        tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].gf_picture;
+    TplDepFrame *tpl_ref_frame =
+        &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]];
+    tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture;
+    tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture;
+    ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index;
   }
 
   // Store the reference frames based on priority order
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-    ref_frames_ordered[i] = ref_frame[ref_frame_priority_order[i] - 1];
+    ref_frames_ordered[i] =
+        tpl_data->ref_frame[ref_frame_priority_order[i] - 1];
   }
 
   // Work out which reference frame slots may be used.
-  ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered,
-                                        cpi->ext_flags.ref_frame_flags);
+  ref_frame_flags =
+      get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi),
+                          ref_frames_ordered, cpi->ext_flags.ref_frame_flags);
 
-  enforce_max_ref_frames(cpi, &ref_frame_flags);
+  enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
+                         tpl_frame->frame_display_index);
 
   // Prune reference frames
   for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
     if ((ref_frame_flags & (1 << idx)) == 0) {
-      ref_frame[idx] = NULL;
+      tpl_data->ref_frame[idx] = NULL;
+    }
+  }
+
+  // Skip motion estimation w.r.t. reference frames which are not
+  // considered in RD search, using "selective_ref_frame" speed feature.
+  // The reference frame pruning is not enabled for frames beyond the gop
+  // length, as there are fewer reference frames and the reference frames
+  // differ from the frames considered during RD search.
+  if (ref_pruning_enabled && (frame_idx < gop_length)) {
+    for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+      const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME };
+      if (prune_ref_by_selective_ref_frame(cpi, NULL, refs,
+                                           ref_frame_display_indices)) {
+        tpl_data->ref_frame[idx] = NULL;
+      }
     }
   }
 
@@ -732,89 +1247,135 @@ static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
   MB_MODE_INFO *mbmi_ptr = &mbmi;
   xd->mi = &mbmi_ptr;
 
-  xd->block_ref_scale_factors[0] = &sf;
+  xd->block_ref_scale_factors[0] = &tpl_data->sf;
+  xd->block_ref_scale_factors[1] = &tpl_data->sf;
 
   const int base_qindex = pframe_qindex;
   // Get rd multiplier set up.
   rdmult = (int)av1_compute_rd_mult(cpi, base_qindex);
   if (rdmult < 1) rdmult = 1;
-  set_error_per_bit(x, rdmult);
-  av1_initialize_me_consts(cpi, x, base_qindex);
+  av1_set_error_per_bit(&x->errorperbit, rdmult);
+  av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex);
 
   tpl_frame->is_valid = 1;
 
   cm->quant_params.base_qindex = base_qindex;
   av1_frame_init_quantizer(cpi);
 
-  tpl_frame->base_rdmult =
-      av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+  tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
+                               bd_info.bit_depth, update_type, pframe_qindex) /
+                           6;
+
+  av1_init_tpl_txfm_stats(tpl_txfm_stats);
+}
+
+// This function stores the motion estimation dependencies of all the blocks in
+// a row
+void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
+                               MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize,
+                               TX_SIZE tx_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_width = mi_size_wide[bsize];
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+  MACROBLOCKD *xd = &x->e_mbd;
 
-  for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
+  const int tplb_cols_in_tile =
+      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+  const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+  assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+  assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+  for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+       mi_col += mi_width, tplb_col_in_tile++) {
+    (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                 tplb_col_in_tile);
+    TplDepStats tpl_stats;
+
+    // Motion estimation column boundary
+    av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+                          tpl_data->border_in_pixels);
+    xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+    xd->mb_to_right_edge =
+        GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
+    mode_estimation(cpi, tpl_txfm_stats, x, mi_row, mi_col, bsize, tx_size,
+                    &tpl_stats);
+
+    // Motion flow dependency dispenser.
+    tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
+                    &tpl_stats, tpl_data->tpl_stats_block_mis_log2);
+    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  tplb_col_in_tile, tplb_cols_in_tile);
+  }
+}
+
+static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const BLOCK_SIZE bsize =
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = mi_size_high[bsize];
+  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
-                          cpi->oxcf.border_in_pixels);
+                          cpi->ppi->tpl_data.border_in_pixels);
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
-    for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += mi_width) {
-      TplDepStats tpl_stats;
-
-      // Motion estimation column boundary
-      av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
-                            cpi->oxcf.border_in_pixels);
-      xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
-      xd->mb_to_right_edge =
-          GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
-      mode_estimation(cpi, x, xd, &sf, frame_idx, mi_row, mi_col, bsize,
-                      tx_size, ref_frame, src_frame, &tpl_stats);
-
-      // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride, &tpl_stats,
-                      tpl_data->tpl_stats_block_mis_log2);
-    }
+    av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, x, mi_row, bsize,
+                              tx_size);
   }
 }
 
-static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
-  AV1_COMMON *cm = &cpi->common;
-
-  const GF_GROUP *gf_group = &cpi->gf_group;
-  if (frame_idx == gf_group->size) return;
-
-  TplParams *const tpl_data = &cpi->tpl_data;
-
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows,
+                                int mi_cols) {
+  if (!frame_idx) {
+    return;
+  }
+  const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
+  assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2));
+  assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2));
 
-  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) {
-    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) {
-      if (frame_idx) {
-        tpl_model_update(tpl_data, mi_row, mi_col, bsize, frame_idx);
-      }
+  for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) {
+      tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
     }
   }
 }
 
 static AOM_INLINE void init_gop_frames_for_tpl(
     AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
-    GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames,
-    const EncodeFrameInput *const frame_input, int *pframe_qindex) {
+    GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) {
   AV1_COMMON *cm = &cpi->common;
-  int cur_frame_idx = gf_group->index;
+  assert(cpi->gf_frame_index == 0);
   *pframe_qindex = 0;
 
-  RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack;
+  RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+  init_ref_map_pair(cpi, ref_frame_map_pairs);
+
+  int remapped_ref_idx[REF_FRAMES];
+
   EncodeFrameParams frame_params = *init_frame_params;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
 
   int ref_picture_map[REF_FRAMES];
 
   for (int i = 0; i < REF_FRAMES; ++i) {
-    if (frame_params.frame_type == KEY_FRAME || gop_eval) {
+    if (frame_params.frame_type == KEY_FRAME) {
       tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
-      tpl_data->tpl_frame[-1 - 1].rec_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
       tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
     } else {
       tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
@@ -826,86 +1387,102 @@ static AOM_INLINE void init_gop_frames_for_tpl(
     ref_picture_map[i] = -i - 1;
   }
 
-  *tpl_group_frames = cur_frame_idx;
+  *tpl_group_frames = 0;
 
   int gf_index;
-  int use_arf = gf_group->update_type[1] == ARF_UPDATE;
-  int anc_frame_offset = gf_group->cur_frame_idx[cur_frame_idx] + 1;
   int process_frame_count = 0;
-  const int gop_length =
-      AOMMIN(gf_group->size - 1 + use_arf, MAX_LENGTH_TPL_FRAME_STATS - 1);
-  for (gf_index = cur_frame_idx; gf_index <= gop_length; ++gf_index) {
+  const int gop_length = get_gop_length(gf_group);
+
+  for (gf_index = 0; gf_index < gop_length; ++gf_index) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
     FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
-
+    int lookahead_index =
+        gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index];
     frame_params.show_frame = frame_update_type != ARF_UPDATE &&
                               frame_update_type != INTNL_ARF_UPDATE;
     frame_params.show_existing_frame =
         frame_update_type == INTNL_OVERLAY_UPDATE ||
         frame_update_type == OVERLAY_UPDATE;
-    frame_params.frame_type =
-        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
+    frame_params.frame_type = gf_group->frame_type[gf_index];
 
     if (frame_update_type == LF_UPDATE)
       *pframe_qindex = gf_group->q_val[gf_index];
 
-    if (gf_index == cur_frame_idx) {
-      tpl_frame->gf_picture = frame_input->source;
-      // frame display index = frame offset within the gf group + start frame of
-      // the gf group
-      tpl_frame->frame_display_index =
-          gf_group->frame_disp_idx[gf_index] +
-          cpi->common.current_frame.display_order_hint;
-    } else {
-      int frame_display_index = gf_index == gf_group->size
-                                    ? cpi->rc.baseline_gf_interval
-                                    : gf_group->frame_disp_idx[gf_index];
-      struct lookahead_entry *buf = av1_lookahead_peek(
-          cpi->lookahead, frame_display_index - anc_frame_offset,
-          cpi->compressor_stage);
-      if (buf == NULL) break;
-      tpl_frame->gf_picture = &buf->img;
-      // frame display index = frame offset within the gf group + start frame of
-      // the gf group
-      tpl_frame->frame_display_index =
-          frame_display_index + cpi->common.current_frame.display_order_hint;
+    const struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+    if (buf == NULL) break;
+    tpl_frame->gf_picture = &buf->img;
+
+    // Use filtered frame buffer if available. This will make tpl stats more
+    // precise.
+    FRAME_DIFF frame_diff;
+    const YV12_BUFFER_CONFIG *tf_buf =
+        av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff);
+    if (tf_buf != NULL) {
+      tpl_frame->gf_picture = tf_buf;
     }
 
+    // 'cm->current_frame.frame_number' is the display number
+    // of the current frame.
+    // 'lookahead_index' is frame offset within the gf group.
+    // 'lookahead_index + cm->current_frame.frame_number'
+    // is the display index of the frame.
+    tpl_frame->frame_display_index =
+        lookahead_index + cm->current_frame.frame_number;
+    assert(buf->display_idx ==
+           cpi->frame_index_set.show_frame_count + lookahead_index);
+
     if (frame_update_type != OVERLAY_UPDATE &&
         frame_update_type != INTNL_OVERLAY_UPDATE) {
       tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
       tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
       ++process_frame_count;
     }
+    const int true_disp = (int)(tpl_frame->frame_display_index);
 
-    av1_get_ref_frames(cpi, &ref_buffer_stack);
-    int refresh_mask = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+    av1_get_ref_frames(ref_frame_map_pairs, true_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                       cpi, gf_index, 0,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                       remapped_ref_idx);
+
+    int refresh_mask =
+        av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+                                    gf_index, true_disp, ref_frame_map_pairs);
+
+    // Make the frames marked as is_frame_non_ref to non-reference frames.
+    if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0;
 
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
-                             frame_params.show_existing_frame,
-                             refresh_frame_map_index, &ref_buffer_stack);
+
+    if (refresh_frame_map_index < REF_FRAMES &&
+        refresh_frame_map_index != INVALID_IDX) {
+      ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+          AOMMAX(0, true_disp);
+      ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+          get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+                             cpi->ppi->gf_group.max_layer_depth);
+    }
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       tpl_frame->ref_map_index[i - LAST_FRAME] =
-          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+          ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
 
     if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
 
     ++*tpl_group_frames;
   }
 
-  if (cur_frame_idx == 0) return;
-
+  const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL;
   int extend_frame_count = 0;
-  int extend_frame_length =
-      AOMMIN(cpi->rc.baseline_gf_interval,
-             cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval);
-  int frame_display_index = cpi->rc.baseline_gf_interval + 1;
+  int extend_frame_length = AOMMIN(
+      tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval);
+
+  int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
+                            gf_group->arf_src_offset[gop_length - 1] + 1;
 
-  for (; gf_index < MAX_LENGTH_TPL_FRAME_STATS &&
-         extend_frame_count < extend_frame_length;
+  for (;
+       gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length;
        ++gf_index) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
     FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
@@ -915,36 +1492,64 @@ static AOM_INLINE void init_gop_frames_for_tpl(
         frame_update_type == INTNL_OVERLAY_UPDATE;
     frame_params.frame_type = INTER_FRAME;
 
+    int lookahead_index = frame_display_index;
     struct lookahead_entry *buf = av1_lookahead_peek(
-        cpi->lookahead, frame_display_index - anc_frame_offset,
-        cpi->compressor_stage);
+        cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
 
     if (buf == NULL) break;
 
     tpl_frame->gf_picture = &buf->img;
     tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
     tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
-    ++process_frame_count;
-
-    // frame display index = frame offset within the gf group + start frame of
-    // the gf group
+    // 'cm->current_frame.frame_number' is the display number
+    // of the current frame.
+    // 'frame_display_index' is frame offset within the gf group.
+    // 'frame_display_index + cm->current_frame.frame_number'
+    // is the display index of the frame.
     tpl_frame->frame_display_index =
-        frame_display_index + cpi->common.current_frame.display_order_hint;
+        frame_display_index + cm->current_frame.frame_number;
+
+    ++process_frame_count;
 
     gf_group->update_type[gf_index] = LF_UPDATE;
-    gf_group->q_val[gf_index] = *pframe_qindex;
 
-    av1_get_ref_frames(cpi, &ref_buffer_stack);
-    int refresh_mask = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+    if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) {
+      if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+        *pframe_qindex = cpi->oxcf.rc_cfg.cq_level;
+      } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+        // TODO(angiebird): Find a more adaptive method to decide pframe_qindex
+        // override the pframe_qindex in the second pass when bitrate accuracy
+        // is on. We found that setting this pframe_qindex make the tpl stats
+        // more stable.
+        *pframe_qindex = 128;
+      }
+    }
+#endif  // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+    gf_group->q_val[gf_index] = *pframe_qindex;
+    const int true_disp = (int)(tpl_frame->frame_display_index);
+    av1_get_ref_frames(ref_frame_map_pairs, true_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                       cpi, gf_index, 0,
+#endif
+                       remapped_ref_idx);
+    int refresh_mask =
+        av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+                                    gf_index, true_disp, ref_frame_map_pairs);
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
-                             frame_params.show_existing_frame,
-                             refresh_frame_map_index, &ref_buffer_stack);
+
+    if (refresh_frame_map_index < REF_FRAMES &&
+        refresh_frame_map_index != INVALID_IDX) {
+      ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+          AOMMAX(0, true_disp);
+      ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+          get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+                             cpi->ppi->gf_group.max_layer_depth);
+    }
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       tpl_frame->ref_map_index[i - LAST_FRAME] =
-          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+          ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
 
     tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
     tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
@@ -957,133 +1562,231 @@ static AOM_INLINE void init_gop_frames_for_tpl(
     ++extend_frame_count;
     ++frame_display_index;
   }
-
-  av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
 }
 
-static AOM_INLINE void init_tpl_stats(TplParams *const tpl_data) {
+void av1_init_tpl_stats(TplParams *const tpl_data) {
+  tpl_data->ready = 0;
+  set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+                           &tpl_data->tpl_bsize_1d);
+  for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+    tpl_frame->is_valid = 0;
+  }
   for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+    if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue;
     memset(tpl_data->tpl_stats_pool[frame_idx], 0,
            tpl_frame->height * tpl_frame->width *
                sizeof(*tpl_frame->tpl_stats_ptr));
-    tpl_frame->is_valid = 0;
   }
 }
 
-int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
-                        const EncodeFrameParams *const frame_params,
-                        const EncodeFrameInput *const frame_input) {
-  AV1_COMMON *cm = &cpi->common;
-  GF_GROUP *gf_group = &cpi->gf_group;
-  int bottom_index, top_index;
-  EncodeFrameParams this_frame_params = *frame_params;
-  TplParams *const tpl_data = &cpi->tpl_data;
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
+  if (tpl_data->ready == 0) {
+    return 0;
+  }
+  if (gf_frame_index >= MAX_TPL_FRAME_IDX) {
+    assert(gf_frame_index < MAX_TPL_FRAME_IDX && "Invalid gf_frame_index\n");
+    return 0;
+  }
+  return tpl_data->tpl_frame[gf_frame_index].is_valid;
+}
 
-  if (cpi->superres_mode != SUPERRES_NONE) return 0;
+static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+  switch (gop_eval) {
+    case 1:
+      // Allow larger GOP size if the base layer ARF has higher dependency
+      // factor than the intermediate ARF and both ARFs have reasonably high
+      // dependency factors.
+      return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+    case 2:
+      if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6)
+        return 1;  // Don't shorten the gf interval
+      else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4)
+        return 0;  // Shorten the gf interval
+      else
+        return 2;  // Cannot decide the gf interval, so redo the
+                   // tpl stats calculation.
+    case 3: return beta[0] > 1.1;
+    default: return 2;
+  }
+}
 
+// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down
+// the scope of input arguments.
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
+                                 const EncodeFrameParams *const frame_params) {
+  AV1_COMMON *cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int bottom_index, top_index;
   cm->current_frame.frame_type = frame_params->frame_type;
-  for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) {
-    av1_configure_buffer_updates(cpi, &this_frame_params,
-                                 gf_group->update_type[gf_index], 0);
-
-    cpi->refresh_golden_frame = this_frame_params.refresh_golden_frame;
-    cpi->refresh_bwd_ref_frame = this_frame_params.refresh_bwd_ref_frame;
-    cpi->refresh_alt_ref_frame = this_frame_params.refresh_alt_ref_frame;
-
+  for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+       ++gf_index) {
+    cm->current_frame.frame_type = gf_group->frame_type[gf_index];
     cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
                      gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
+    gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds(
+        cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index);
+  }
+}
 
-    gf_group->q_val[gf_index] =
-        av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index,
-                                 &bottom_index, &top_index);
+int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+                        const EncodeFrameParams *const frame_params) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+  assert(cpi->gf_frame_index == 0);
+  AV1_COMMON *cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  EncodeFrameParams this_frame_params = *frame_params;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  int approx_gop_eval = (gop_eval > 1);
+  int num_arf_layers = MAX_ARF_LAYERS;
+
+  // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+  // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+  // tpl stats calculation is limited to ARFs from base layer and (base+1)
+  // layer.
+  if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 3 : 2;
+
+  if (cpi->superres_mode != AOM_SUPERRES_NONE) {
+    assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
+    av1_init_tpl_stats(tpl_data);
+    return 0;
+  }
+
+  cm->current_frame.frame_type = frame_params->frame_type;
+  for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+       ++gf_index) {
+    cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+    av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                                 gf_group->update_type[gf_index],
+                                 gf_group->refbuf_state[gf_index], 0);
 
-    cm->current_frame.frame_type = INTER_FRAME;
+    memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
+           sizeof(cpi->refresh_frame));
   }
 
   int pframe_qindex;
   int tpl_gf_group_frames;
-  init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval,
-                          &tpl_gf_group_frames, frame_input, &pframe_qindex);
+  init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames,
+                          &pframe_qindex);
 
-  cpi->rc.base_layer_qp = pframe_qindex;
+  cpi->ppi->p_rc.base_layer_qp = pframe_qindex;
 
-  init_tpl_stats(tpl_data);
+  av1_init_tpl_stats(tpl_data);
 
+  tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
+  tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
+
+  av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+                                    cm->width, cm->height);
+
+  if (frame_params->frame_type == KEY_FRAME) {
+    av1_init_mv_probs(cm);
+  }
+  av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+                    cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
+
+  const int gop_length = get_gop_length(gf_group);
+  const int num_planes =
+      cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
   // Backward propagation from tpl_group_frames to 1.
-  for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames;
+  for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
        ++frame_idx) {
     if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
-    mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+    // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+    // frames and for frames beyond gop length.
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
+    init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+    if (mt_info->num_workers > 1) {
+      tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
+      tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write;
+      av1_mc_flow_dispenser_mt(cpi);
+    } else {
+      mc_flow_dispenser(cpi);
+    }
+    av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats);
+    av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx);
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+    if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+      int frame_coding_idx =
+          av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx);
+      rc_log_frame_stats(&cpi->rc_log, frame_coding_idx,
+                         &cpi->td.tpl_txfm_stats);
+    }
+#endif  // CONFIG_RATECTRL_LOG
 
     aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
-                             av1_num_planes(cm));
+                             num_planes);
   }
 
-  for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index;
-       --frame_idx) {
+  for (int frame_idx = tpl_gf_group_frames - 1;
+       frame_idx >= cpi->gf_frame_index; --frame_idx) {
     if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
-    mc_flow_synthesizer(cpi, frame_idx);
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
+    mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
+                        cm->mi_params.mi_cols);
   }
 
-  av1_configure_buffer_updates(cpi, &this_frame_params,
-                               gf_group->update_type[gf_group->index], 0);
+  av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                               gf_group->update_type[cpi->gf_frame_index],
+                               gf_group->update_type[cpi->gf_frame_index], 0);
   cm->current_frame.frame_type = frame_params->frame_type;
   cm->show_frame = frame_params->show_frame;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // Record the time if the function returns.
+  if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 ||
+      !gop_eval)
+    end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+
+  if (!approx_gop_eval) {
+    tpl_data->ready = 1;
+  }
   if (cpi->common.tiles.large_scale) return 0;
   if (gf_group->max_layer_depth_allowed == 0) return 1;
+  if (!gop_eval) return 0;
+  assert(gf_group->arf_index >= 0);
 
   double beta[2] = { 0.0 };
-  for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2);
-       ++frame_idx) {
-    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
-    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-    int tpl_stride = tpl_frame->stride;
-    int64_t intra_cost_base = 0;
-    int64_t mc_dep_cost_base = 0;
-    const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-    const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-
-    for (int row = 0; row < cm->mi_params.mi_rows; row += step) {
-      for (int col = 0; col < mi_cols_sr; col += step) {
-        TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
-            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-        int64_t mc_dep_delta =
-            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
-                   this_stats->mc_dep_dist);
-        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
-        mc_dep_cost_base +=
-            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
-      }
-    }
-    beta[frame_idx - 1] = (double)mc_dep_cost_base / intra_cost_base;
-  }
-
-  // Allow larger GOP size if the base layer ARF has higher dependency factor
-  // than the intermediate ARF and both ARFs have reasonably high dependency
-  // factors.
-  return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+  const int frame_idx_0 = gf_group->arf_index;
+  const int frame_idx_1 =
+      AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
+  beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0);
+  beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+  return eval_gop_length(beta, gop_eval);
 }
 
 void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  const int tpl_idx = gf_group->index;
+  const int tpl_idx = cpi->gf_frame_index;
 
-  assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size));
+  assert(
+      IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size));
 
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
 
   if (!tpl_frame->is_valid) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
 
   const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
   const int tpl_stride = tpl_frame->stride;
@@ -1097,8 +1800,6 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
   const double c = 1.2;
   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
 
-  aom_clear_system_state();
-
   // Loop through each 'block_size' X 'block_size' block.
   for (int row = 0; row < num_rows; row++) {
     for (int col = 0; col < num_cols; col++) {
@@ -1124,66 +1825,479 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
       cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
     }
   }
-  aom_clear_system_state();
 }
 
 void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_group.index < cpi->gf_group.size));
-  const int tpl_idx = cpi->gf_group.index;
-  TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
 
-  if (tpl_frame->is_valid == 0) return;
-  if (!is_frame_tpl_eligible(cpi)) return;
-  if (tpl_idx >= MAX_LAG_BUFFERS) return;
-  if (cpi->superres_mode != SUPERRES_NONE) return;
-  if (cpi->oxcf.aq_mode != NO_AQ) return;
+  if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+  TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
+  if (!tpl_frame->is_valid) return;
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int sb_mi_width_sr = coded_to_superres_mi(
+      mi_size_wide[sb_size], cm->superres_scale_denominator);
 
   const int bsize_base = BLOCK_16X16;
   const int num_mi_w = mi_size_wide[bsize_base];
   const int num_mi_h = mi_size_high[bsize_base];
-  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
   const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
-  const int num_bcols = (mi_size_wide[sb_size] + num_mi_w - 1) / num_mi_w;
+  const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w;
   const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
   int row, col;
 
   double base_block_count = 0.0;
   double log_sum = 0.0;
 
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
       log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
       base_block_count += 1.0;
     }
   }
 
-  MACROBLOCKD *const xd = &x->e_mbd;
   const CommonQuantParams *quant_params = &cm->quant_params;
   const int orig_rdmult = av1_compute_rd_mult(
       cpi, quant_params->base_qindex + quant_params->y_dc_delta_q);
   const int new_rdmult =
-      av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex +
+      av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
                                    quant_params->y_dc_delta_q);
   const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
 
   double scale_adj = log(scaling_factor) - log_sum / base_block_count;
-  scale_adj = exp(scale_adj);
+  scale_adj = exp_bounded(scale_adj);
 
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
-    for (col = mi_col / num_mi_h;
-         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+    for (col = mi_col_sr / num_mi_h;
+         col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
-      cpi->tpl_sb_rdmult_scaling_factors[index] =
+      cpi->ppi->tpl_sb_rdmult_scaling_factors[index] =
           scale_adj * cpi->tpl_rdmult_scaling_factors[index];
     }
   }
-  aom_clear_system_state();
+}
+
+double av1_exponential_entropy(double q_step, double b) {
+  b = AOMMAX(b, TPL_EPSILON);
+  double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+  return -log2(1 - z) - z * log2(z) / (1 - z);
+}
+
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) {
+  // zero bin's size is zero_bin_ratio * q_step
+  // non-zero bin's size is q_step
+  b = AOMMAX(b, TPL_EPSILON);
+  double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  double h = av1_exponential_entropy(q_step, b);
+  double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1);
+  return r;
+}
+
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+                                       const double *abs_coeff_mean,
+                                       int coeff_num) {
+  double zero_bin_ratio = 2;
+  double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double est_rate = 0;
+  // dc coeff
+  est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio);
+  // ac coeff
+  for (int i = 1; i < coeff_num; ++i) {
+    est_rate +=
+        av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio);
+  }
+  est_rate *= block_count;
+  return est_rate;
+}
+
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff) {
+  b = AOMMAX(b, TPL_EPSILON);
+  int abs_qcoeff = abs(qcoeff);
+  double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  if (abs_qcoeff == 0) {
+    double r = -log2(1 - z0);
+    return r;
+  } else {
+    double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+    double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z);
+    return r;
+  }
+}
+
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num) {
+  double zero_bin_ratio = 2;
+  double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double est_rate = 0;
+  // dc coeff
+  est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0],
+                                         zero_bin_ratio, qcoeff_arr[0]);
+  // ac coeff
+  for (int i = 1; i < coeff_num; ++i) {
+    est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i],
+                                           zero_bin_ratio, qcoeff_arr[i]);
+  }
+  return est_rate;
+}
+
+#if CONFIG_RD_COMMAND
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) {
+  FILE *fptr = fopen(filepath, "r");
+  fscanf(fptr, "%d", &rd_command->frame_count);
+  rd_command->frame_index = 0;
+  for (int i = 0; i < rd_command->frame_count; ++i) {
+    int option;
+    fscanf(fptr, "%d", &option);
+    rd_command->option_ls[i] = (RD_OPTION)option;
+    if (option == RD_OPTION_SET_Q) {
+      fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+    } else if (option == RD_OPTION_SET_Q_RDMULT) {
+      fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+      fscanf(fptr, "%d", &rd_command->rdmult_ls[i]);
+    }
+  }
+  fclose(fptr);
+}
+#endif  // CONFIG_RD_COMMAND
+
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+                                    int gf_frame_index) {
+  const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
+  const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  const int tpl_stride = tpl_frame->stride;
+  double intra_cost_base = 0;
+  double mc_dep_cost_base = 0;
+  double cbcmp_base = 1;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      double cbcmp = (double)this_stats->srcrf_dist;
+      const int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+      intra_cost_base += log(dist_scaled) * cbcmp;
+      mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+      cbcmp_base += cbcmp;
+    }
+  }
+  return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base);
+}
+
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+  if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+    return 1;
+  }
+  const double frame_importance =
+      av1_tpl_get_frame_importance(tpl_data, gf_frame_index);
+  return sqrt(1 / frame_importance);
+}
+
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth) {
+  const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+  const double target_qstep = leaf_qstep * qstep_ratio;
+  int qindex = leaf_qindex;
+  for (qindex = leaf_qindex; qindex > 0; --qindex) {
+    const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+    if (qstep <= target_qstep) break;
+  }
+  return qindex;
+}
+
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth) {
+  const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+  return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+                     int show_frame_count) {
+  av1_zero(*vbr_rc_info);
+  vbr_rc_info->ready = 0;
+  vbr_rc_info->total_bit_budget = total_bit_budget;
+  vbr_rc_info->show_frame_count = show_frame_count;
+  const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1,
+                                                     0.94559, 1,       1,
+                                                     0.94559 };
+
+  // TODO(angiebird): Based on the previous code, only the scale factor 0.94559
+  // will be used in most of the cases with --limi=17. Figure out if the
+  // following scale factors works better.
+  // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+  //                                                    1.10199, 1,       1,
+  //                                                    0.16393 };
+
+  const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
+  memcpy(vbr_rc_info->scale_factors, scale_factors,
+         sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES);
+  memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors,
+         sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES);
+
+  vbr_rc_reset_gop_data(vbr_rc_info);
+#if CONFIG_THREE_PASS
+  // TODO(angiebird): Explain why we use -1 here
+  vbr_rc_info->cur_gop_idx = -1;
+  vbr_rc_info->gop_count = 0;
+  vbr_rc_info->total_frame_count = 0;
+#endif  // CONFIG_THREE_PASS
+}
+
+#if CONFIG_THREE_PASS
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+                                int gf_frame_index) {
+  int gop_idx = vbr_rc_info->cur_gop_idx;
+  int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx];
+  return gop_start_idx + gf_frame_index;
+}
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+                                const TPL_INFO *tpl_info) {
+  int gop_start_idx = vbr_rc_info->total_frame_count;
+  vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx;
+  vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length;
+  assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES);
+  for (int i = 0; i < tpl_info->gf_length; ++i) {
+    vbr_rc_info->txfm_stats_list[gop_start_idx + i] =
+        tpl_info->txfm_stats_list[i];
+    vbr_rc_info->qstep_ratio_list[gop_start_idx + i] =
+        tpl_info->qstep_ratio_ls[i];
+    vbr_rc_info->update_type_list[gop_start_idx + i] =
+        tpl_info->update_type_list[i];
+  }
+  vbr_rc_info->total_frame_count += tpl_info->gf_length;
+  vbr_rc_info->gop_count++;
+}
+#endif  // CONFIG_THREE_PASS
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+                                   int gop_showframe_count) {
+  vbr_rc_info->gop_showframe_count = gop_showframe_count;
+  vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget *
+                                gop_showframe_count /
+                                vbr_rc_info->show_frame_count;
+}
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+                                  const double *qstep_ratio_list,
+                                  aom_bit_depth_t bit_depth,
+                                  int *q_index_list) {
+  for (int i = 0; i < frame_count; ++i) {
+    q_index_list[i] = av1_get_q_index_from_qstep_ratio(
+        base_q_index, qstep_ratio_list[i], bit_depth);
+  }
+}
+
+double av1_vbr_rc_info_estimate_gop_bitrate(
+    int base_q_index, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe) {
+  av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list,
+                               bit_depth, q_index_list);
+  double estimated_gop_bitrate = 0;
+  for (int frame_index = 0; frame_index < frame_count; frame_index++) {
+    const TplTxfmStats *frame_stats = &stats_list[frame_index];
+    double frame_bitrate = 0;
+    if (frame_stats->ready) {
+      int q_index = q_index_list[frame_index];
+
+      frame_bitrate = av1_laplace_estimate_frame_rate(
+          q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean,
+          frame_stats->coeff_num);
+    }
+    FRAME_UPDATE_TYPE update_type = update_type_list[frame_index];
+    estimated_gop_bitrate +=
+        frame_bitrate * update_type_scale_factors[update_type];
+    if (estimated_bitrate_byframe != NULL) {
+      estimated_bitrate_byframe[frame_index] = frame_bitrate;
+    }
+  }
+  return estimated_gop_bitrate;
+}
+
+int av1_vbr_rc_info_estimate_base_q(
+    double bit_budget, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe) {
+  int q_max = 255;  // Maximum q value.
+  int q_min = 0;    // Minimum q value.
+  int q = (q_max + q_min) / 2;
+
+  double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_max, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+
+  double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_min, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+  while (q_min + 1 < q_max) {
+    double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+        q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+        qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+    if (estimate > bit_budget) {
+      q_min = q;
+      q_min_estimate = estimate;
+    } else {
+      q_max = q;
+      q_max_estimate = estimate;
+    }
+    q = (q_max + q_min) / 2;
+  }
+  // Pick the estimate that lands closest to the budget.
+  if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) {
+    q = q_max;
+  } else {
+    q = q_min;
+  }
+  // Update q_index_list and vbr_rc_info.
+  av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+      qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+  return q;
+}
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const GF_GROUP *gf_group,
+                                    aom_bit_depth_t bit_depth) {
+  vbr_rc_info->q_index_list_ready = 1;
+  double gop_bit_budget = vbr_rc_info->gop_bit_budget;
+
+  for (int i = 0; i < gf_group->size; i++) {
+    vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+  }
+
+  double mv_bits = 0;
+  for (int i = 0; i < gf_group->size; i++) {
+    double frame_mv_bits = 0;
+    if (av1_tpl_stats_ready(tpl_data, i)) {
+      TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i];
+      frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+          tpl_frame, tpl_data->tpl_stats_block_mis_log2);
+      FRAME_UPDATE_TYPE updae_type = gf_group->update_type[i];
+      mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[updae_type];
+    }
+  }
+
+  mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget);
+  gop_bit_budget -= mv_bits;
+
+  vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+      gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size,
+      gf_group->update_type, vbr_rc_info->qstep_ratio_list,
+      tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+}
+
+#endif  // CONFIG_BITRATE_ACCURACY
+
+// Use upper and left neighbor block as the reference MVs.
+// Compute the minimum difference between current MV and reference MV.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift) {
+  const TplDepStats *tpl_stats =
+      &tpl_frame
+           ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+  int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+  int current_mv_magnitude =
+      abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col);
+
+  // Retrieve the up and left neighbors.
+  int up_error = INT_MAX;
+  int_mv up_mv_diff;
+  if (row - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row - step, col, tpl_stride, right_shift)];
+    up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+    up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+    up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+  }
+
+  int left_error = INT_MAX;
+  int_mv left_mv_diff;
+  if (col - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row, col - step, tpl_stride, right_shift)];
+    left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+    left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+    left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+  }
+
+  // Return the MV with the minimum distance from current.
+  if (up_error < left_error && up_error < current_mv_magnitude) {
+    return up_mv_diff;
+  } else if (left_error < up_error && left_error < current_mv_magnitude) {
+    return left_mv_diff;
+  }
+  return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift) {
+  if (!tpl_frame->is_valid) {
+    return 0;
+  }
+
+  int count_row[500] = { 0 };
+  int count_col[500] = { 0 };
+  int n = 0;  // number of MVs to process
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << right_shift;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+                                            tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.row, 0, 499)] += 1;
+      n += 1;
+    }
+  }
+
+  // Estimate the bits used using the entropy formula.
+  double rate_row = 0;
+  double rate_col = 0;
+  for (int i = 0; i < 500; i++) {
+    if (count_row[i] != 0) {
+      double p = count_row[i] / (double)n;
+      rate_row += count_row[i] * -log2(p);
+    }
+    if (count_col[i] != 0) {
+      double p = count_col[i] / (double)n;
+      rate_col += count_col[i] * -log2(p);
+    }
+  }
+
+  return rate_row + rate_col;
 }
diff --git a/media/libaom/src/av1/encoder/tpl_model.h b/media/libaom/src/av1/encoder/tpl_model.h
index 11a61b649d..b77a19ff71 100644
--- a/media/libaom/src/av1/encoder/tpl_model.h
+++ b/media/libaom/src/av1/encoder/tpl_model.h
@@ -16,6 +16,26 @@
 extern "C" {
 #endif
 
+/*!\cond */
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1_SEQ_CODING_TOOLS;
+struct EncodeFrameParams;
+struct EncodeFrameInput;
+struct GF_GROUP;
+struct TPL_INFO;
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
 static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
   switch (length) {
     case 64: return BLOCK_64X64;
@@ -29,17 +49,676 @@ static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
   }
 }
 
-int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
-                        const EncodeFrameParams *const frame_params,
-                        const EncodeFrameInput *const frame_input);
+typedef struct AV1TplRowMultiThreadSync {
+#if CONFIG_MULTITHREAD
+  // Synchronization objects for top-right dependency.
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  // Buffer to store the macroblock whose encoding is complete.
+  // num_finished_cols[i] stores the number of macroblocks which finished
+  // encoding in the ith macroblock row.
+  int *num_finished_cols;
+  // Number of extra macroblocks of the top row to be complete for encoding
+  // of the current macroblock to start. A value of 1 indicates top-right
+  // dependency.
+  int sync_range;
+  // Number of macroblock rows.
+  int rows;
+  // Number of threads processing the current tile.
+  int num_threads_working;
+} AV1TplRowMultiThreadSync;
+
+typedef struct AV1TplRowMultiThreadInfo {
+  // Row synchronization related function pointers.
+  void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
+  void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
+                         int cols);
+} AV1TplRowMultiThreadInfo;
+
+// TODO(jingning): This needs to be cleaned up next.
+
+// TPL stats buffers are prepared for every frame in the GOP,
+// including (internal) overlays and (internal) arfs.
+// In addition, frames in the lookahead that are outside of the GOP
+// are also used.
+// Thus it should use
+// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) =
+// MAX_LAG_BUFFERS + (# overlays)
+// 2 * MAX_LAG_BUFFERS is therefore a safe estimate.
+// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER
+#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS)
+// The first REF_FRAMES + 1 buffers are reserved.
+// tpl_data->tpl_frame starts after REF_FRAMES + 1
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+#define TPL_EPSILON 0.0000001
+
+typedef struct TplTxfmStats {
+  int ready;                  // Whether abs_coeff_mean is ready
+  double abs_coeff_sum[256];  // Assume we are using 16x16 transform block
+  double abs_coeff_mean[256];
+  int txfm_block_count;
+  int coeff_num;
+} TplTxfmStats;
+
+typedef struct TplDepStats {
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t srcrf_dist;
+  int64_t recrf_dist;
+  int64_t cmp_recrf_dist[2];
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
+  int64_t srcrf_sse;
+  int64_t cmp_recrf_rate[2];
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
+  int_mv mv[INTER_REFS_PER_FRAME];
+  int ref_frame_index[2];
+  int64_t pred_error[INTER_REFS_PER_FRAME];
+} TplDepStats;
+
+typedef struct TplDepFrame {
+  uint8_t is_valid;
+  TplDepStats *tpl_stats_ptr;
+  const YV12_BUFFER_CONFIG *gf_picture;
+  YV12_BUFFER_CONFIG *rec_picture;
+  int ref_map_index[REF_FRAMES];
+  int stride;
+  int width;
+  int height;
+  int mi_rows;
+  int mi_cols;
+  int base_rdmult;
+  uint32_t frame_display_index;
+} TplDepFrame;
+
+/*!\endcond */
+/*!
+ * \brief Params related to temporal dependency model.
+ */
+typedef struct TplParams {
+  /*!
+   * Whether the tpl stats is ready.
+   */
+  int ready;
+
+  /*!
+   * Block granularity of tpl score storage.
+   */
+  uint8_t tpl_stats_block_mis_log2;
+
+  /*!
+   * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16.
+   */
+  uint8_t tpl_bsize_1d;
+
+  /*!
+   * Buffer to store the frame level tpl information for each frame in a gf
+   * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+   * group
+   */
+  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+  /*!
+   * Buffer to store tpl stats at block granularity.
+   * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+   * group.
+   */
+  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+  /*!
+   * Pointer to the buffer which stores tpl transform stats per frame.
+   * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group.
+   * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when
+   * tpl is enabled.
+   */
+  TplTxfmStats *txfm_stats_list;
+
+  /*!
+   * Buffer to store tpl reconstructed frame.
+   * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+   */
+  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+  /*!
+   * Pointer to tpl_stats_buffer.
+   */
+  TplDepFrame *tpl_frame;
+
+  /*!
+   * Scale factors for the current frame.
+   */
+  struct scale_factors sf;
+
+  /*!
+   * GF group index of the current frame.
+   */
+  int frame_idx;
+
+  /*!
+   * Array of pointers to the frame buffers holding the source frame.
+   * src_ref_frame[i] stores the pointer to the source frame of the ith
+   * reference frame type.
+   */
+  const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME];
+
+  /*!
+   * Array of pointers to the frame buffers holding the tpl reconstructed frame.
+   * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith
+   * reference frame type.
+   */
+  const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
+
+  /*!
+   * Parameters related to synchronization for top-right dependency in row based
+   * multi-threading of tpl
+   */
+  AV1TplRowMultiThreadSync tpl_mt_sync;
+
+  /*!
+   * Frame border for tpl frame.
+   */
+  int border_in_pixels;
+
+} TplParams;
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+#define VBR_RC_INFO_MAX_FRAMES 500
+#endif  //  CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+
+#if CONFIG_BITRATE_ACCURACY
+
+/*!
+ * \brief This structure stores information needed for bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+  int ready;
+  double total_bit_budget;  // The total bit budget of the entire video
+  int show_frame_count;     // Number of show frames in the entire video
+
+  int gop_showframe_count;  // The number of show frames in the current gop
+  double gop_bit_budget;    // The bitbudget for the current gop
+  double scale_factors[FRAME_UPDATE_TYPES];     // Scale factors to improve the
+                                                // budget estimation
+  double mv_scale_factors[FRAME_UPDATE_TYPES];  // Scale factors to improve
+                                                // MV entropy estimation
+
+  // === Below this line are GOP related data that will be updated per GOP ===
+  int base_q_index;  // Stores the base q index.
+  int q_index_list_ready;
+  int q_index_list[VBR_RC_INFO_MAX_FRAMES];  // q indices for the current
+                                             // GOP
+
+  // Array to store qstep_ratio for each frame in a GOP
+  double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+
+#if CONFIG_THREE_PASS
+  TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+  FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+  int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES];
+  int gop_length_list[VBR_RC_INFO_MAX_FRAMES];
+  int cur_gop_idx;
+  int total_frame_count;
+  int gop_count;
+#endif  // CONFIG_THREE_PASS
+} VBR_RATECTRL_INFO;
+
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+  vbr_rc_info->q_index_list_ready = 0;
+  av1_zero(vbr_rc_info->q_index_list);
+}
+
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+                     int show_frame_count);
+
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+                                int gf_frame_index);
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+                                const struct TPL_INFO *tpl_info);
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+                                   int gop_showframe_count);
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+                                  const double *qstep_ratio_list,
+                                  aom_bit_depth_t bit_depth, int *q_index_list);
+
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out]      vbr_rc_info    Rate control info for BITRATE_ACCURACY
+ *                                 experiment
+ * \param[in]       tpl_data       TPL struct
+ * \param[in]       gf_group       GOP struct
+ * \param[in]       bit_depth      bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const struct GF_GROUP *gf_group,
+                                    aom_bit_depth_t bit_depth);
+/*
+ *!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in]    base_q_index              base layer q_index
+ * \param[in]    bit_depth                 bit depth
+ * \param[in]    update_type_scale_factors array of scale factors for each
+ *                                         update_type
+ * \param[in]    frame_count               size of update_type_list,
+ *                                         qstep_ratio_list stats_list,
+ *                                         q_index_list and
+ *                                         estimated_bitrate_byframe
+ * \param[in]    update_type_list          array of update_type, one per frame
+ * \param[in]    qstep_ratio_list          array of qstep_ratio, one per frame
+ * \param[in]    stats_list                array of transform stats, one per
+ *                                         frame
+ * \param[out]   q_index_list              array of q_index, one per frame
+ * \param[out]   estimated_bitrate_byframe array to keep track of frame
+ *                                         bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_vbr_rc_info_estimate_gop_bitrate(
+    int base_q_index, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function uses a binary search to find base layer q index to
+ * achieve the specified bit budget.
+ *
+ * \param[in]    bit_budget        target bit budget
+ * \param[in]    bit_depth         bit depth
+ * \param[in]    update_type_scale_factors array of scale factors for each
+ *                                 update_type
+ * \param[in]    frame_count       size of update_type_list, qstep_ratio_list
+ *                                 stats_list, q_index_list and
+ *                                 estimated_bitrate_byframe
+ * \param[in]    update_type_list  array of update_type, one per frame
+ * \param[in]    qstep_ratio_list  array of qstep_ratio, one per frame
+ * \param[in]    stats_list        array of transform stats, one per frame
+ * \param[out]   q_index_list      array of q_index, one per frame
+ * \param[out]   estimated_bitrate_byframe Array to keep track of frame
+ * bitrate
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_vbr_rc_info_estimate_base_q(
+    double bit_budget, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe);
+
+#endif  // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+typedef enum {
+  RD_OPTION_NONE,
+  RD_OPTION_SET_Q,
+  RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+typedef struct RD_COMMAND {
+  RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int frame_count;
+  int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif  // CONFIG_RD_COMMAND
+
+/*!\brief Allocate buffers used by tpl model
+ *
+ * \param[in]    Top-level encode/decode structure
+ * \param[in]    lag_in_frames  number of lookahead frames
+ *
+ * \param[out]   tpl_data  tpl data structure
+ */
+
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+                           CommonModeInfoParams *const mi_params, int width,
+                           int height, int byte_alignment, int lag_in_frames);
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    cpi           Top - level encoder instance structure
+ * \param[in]    gop_eval      Flag if it is in the GOP length decision stage
+ * \param[in]    frame_params  Per frame encoding parameters
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+                        const struct EncodeFrameParams *const frame_params);
+
+/*!\cond */
+
+void av1_tpl_preload_rc_estimate(
+    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
 
 int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
 
-void av1_tpl_rdmult_setup(AV1_COMP *cpi);
+void av1_init_tpl_stats(TplParams *const tpl_data);
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
 
-void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
                              BLOCK_SIZE sb_size, int mi_row, int mi_col);
 
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
+                               TplTxfmStats *tpl_txfm_stats, MACROBLOCK *x,
+                               int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+/*!\brief  Compute the entropy of an exponential probability distribution
+ * function (pdf) subjected to uniform quantization.
+ *
+ * pdf(x) = b*exp(-b*x)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_step        quantizer step size
+ * \param[in]    b             parameter of exponential distribution
+ *
+ * \return entropy cost
+ */
+double av1_exponential_entropy(double q_step, double b);
+
+/*!\brief  Compute the entropy of a Laplace probability distribution
+ * function (pdf) subjected to non-uniform quantization.
+ *
+ * pdf(x) = 0.5*b*exp(-0.5*b*|x|)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_step          quantizer step size for non-zero bins
+ * \param[in]    b               parameter of Laplace distribution
+ * \param[in]    zero_bin_ratio  zero bin's size is zero_bin_ratio * q_step
+ *
+ * \return entropy cost
+ */
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio);
+
+/*!\brief  Compute the frame rate using transform block stats
+ *
+ * Assume each position i in the transform block is of Laplace distribution
+ * with mean absolute deviation abs_coeff_mean[i]
+ *
+ * Then we can use av1_laplace_entropy() to compute the expected frame
+ * rate.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_index         quantizer index
+ * \param[in]    block_count     number of transform blocks
+ * \param[in]    abs_coeff_mean  array of mean absolute deviation
+ * \param[in]    coeff_num       number of coefficients per transform block
+ *
+ * \return expected frame rate
+ */
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+                                       const double *abs_coeff_mean,
+                                       int coeff_num);
+
+/*
+ *!\brief Init TplTxfmStats
+ *
+ * \param[in]    tpl_txfm_stats  a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+/*
+ *!\brief Accumulate TplTxfmStats
+ *
+ * \param[in]  sub_stats          a structure for storing sub transform stats
+ * \param[out] accumulated_stats  a structure for storing accumulated
+ *transform stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats);
+
+/*
+ *!\brief Record a transform block into  TplTxfmStats
+ *
+ * \param[in]  tpl_txfm_stats     A structure for storing transform stats
+ * \param[out] coeff              An array of transform coefficients. Its size
+ *                                should equal to tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff);
+
+/*
+ *!\brief Update abs_coeff_mean and ready of txfm_stats
+ * If txfm_block_count > 0, this function will use abs_coeff_sum and
+ * txfm_block_count to compute abs_coeff_mean. Moreover, reday flag
+ * will be set to one.
+ *
+ * \param[in]  txfm_stats     A structure for storing transform stats
+ */
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats);
+
+/*!\brief  Estimate coefficient entropy using Laplace dsitribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob()
+ *is defined in tpl_model_test.cc
+ *
+ * \param[in]    q_step          quantizer step size without any scaling
+ * \param[in]    b               mean absolute deviation of Laplace
+ *distribution \param[in]    zero_bin_ratio  zero bin's size is zero_bin_ratio
+ ** q_step \param[in]    qcoeff          quantized coefficient
+ *
+ * \return estimated coefficient entropy
+ *
+ */
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff);
+
+/*!\brief  Estimate entropy of a transform block using Laplace dsitribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_index         quantizer index
+ * \param[in]    abs_coeff_mean  array of mean absolute deviations
+ * \param[in]    qcoeff_arr      array of quantized coefficients
+ * \param[in]    coeff_num       number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                            int64_t srcrf_dist, int pix_num);
+
+/*!\brief  Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
+ *
+ * \param[in]    row_a  row position of the first block
+ * \param[in]    col_a  column position of the first block
+ * \param[in]    row_b  row position of the second block
+ * \param[in]    col_b  column position of the second block
+ * \param[in]    width  width shared by the two blocks
+ * \param[in]    height height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                         int height);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_frame_index    current frame index in the GOP
+ * \param[in]       leaf_qindex       q index of leaf frame
+ * \param[in]       bit_depth         bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the frame importance from TPL stats
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_frame_index    current frame index in the GOP
+ *
+ * \return frame_importance
+ */
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+                                    int gf_frame_index);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on
+ * TPL stats
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_frame_index    current frame index in the GOP
+ * \param[in]       leaf_qindex       q index of leaf frame
+ * \param[in]       bit_depth         bit depth
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in]       leaf_qindex       q index of leaf frame
+ * \param[in]       qstep_ratio       step ratio between target q index and
+ * leaf q index \param[in]       bit_depth         bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth);
+
+/*!\brief Improve the motion vector estimation by taking neighbors into
+ * account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in]       tpl_frame         Tpl frame struct
+ * \param[in]       row               Current row
+ * \param[in]       col               Current column
+ * \param[in]       step              Step parameter for av1_tpl_ptr_pos
+ * \param[in]       tpl_stride        Stride parameter for av1_tpl_ptr_pos
+ * \param[in]       right_shift       Right shift parameter for
+ * av1_tpl_ptr_pos
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in]       tpl_frame         TPL frame struct
+ * \param[in]       right_shift       right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift);
+
+#if CONFIG_RATECTRL_LOG
+typedef struct {
+  int coding_frame_count;
+  int base_q_index;
+
+  // Encode decision
+  int q_index_list[VBR_RC_INFO_MAX_FRAMES];
+  double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+  FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+
+  // Frame stats
+  TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+
+  // Estimated encode results
+  double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+
+  // Actual encode results
+  double act_rate_list[VBR_RC_INFO_MAX_FRAMES];
+  double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+} RATECTRL_LOG;
+
+static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); }
+
+static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index,
+                                      const TplTxfmStats *txfm_stats) {
+  rc_log->txfm_stats_list[coding_index] = *txfm_stats;
+}
+
+static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log,
+                                             int coding_index,
+                                             double qstep_ratio, int q_index,
+                                             FRAME_UPDATE_TYPE update_type) {
+  rc_log->qstep_ratio_list[coding_index] = qstep_ratio;
+  rc_log->q_index_list[coding_index] = q_index;
+  rc_log->update_type_list[coding_index] = update_type;
+  const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index];
+  rc_log->est_coeff_rate_list[coding_index] = 0;
+  if (txfm_stats->ready) {
+    rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate(
+        q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean,
+        txfm_stats->coeff_num);
+  }
+}
+
+static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index,
+                                        double act_rate,
+                                        double act_coeff_rate) {
+  rc_log->act_rate_list[coding_index] = act_rate;
+  rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate;
+}
+
+static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log,
+                                            int base_q_index,
+                                            int coding_frame_count) {
+  rc_log->base_q_index = base_q_index;
+  rc_log->coding_frame_count = coding_frame_count;
+}
+
+static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) {
+  printf("= chunk 1\n");
+  printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count,
+         rc_log->base_q_index);
+  printf("= frame %d\n", rc_log->coding_frame_count);
+  for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count;
+       coding_idx++) {
+    printf(
+        "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f "
+        "act_coeff_rate %f act_rate %f\n",
+        coding_idx, rc_log->update_type_list[coding_idx],
+        rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx],
+        rc_log->est_coeff_rate_list[coding_idx],
+        rc_log->act_coeff_rate_list[coding_idx],
+        rc_log->act_rate_list[coding_idx]);
+  }
+}
+#endif  // CONFIG_RATECTRL_LOG
+
+/*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/tune_butteraugli.c b/media/libaom/src/av1/encoder/tune_butteraugli.c
new file mode 100644
index 0000000000..2f057e1fc8
--- /dev/null
+++ b/media/libaom/src/av1/encoder/tune_butteraugli.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+static const int resize_factor = 2;
+
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *source,
+                                              const YV12_BUFFER_CONFIG *recon,
+                                              const double K) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const aom_color_range_t color_range =
+      seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  float *diffmap;
+  CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap)));
+  if (!aom_calc_butteraugli(source, recon, bit_depth,
+                            seq_params->matrix_coefficients, color_range,
+                            diffmap)) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to calculate Butteraugli distances.");
+  }
+
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+  const int num_cols =
+      (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+  const int num_rows =
+      (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+  const int block_w = num_mi_w << 2;
+  const int block_h = num_mi_h << 2;
+  double log_sum = 0.0;
+  double blk_count = 0.0;
+
+  // Loop through each block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int y_start = row * block_h;
+      const int x_start = col * block_w;
+      float dbutteraugli = 0.0f;
+      float dmse = 0.0f;
+      float px_count = 0.0f;
+
+      // Loop through each pixel.
+      for (int y = y_start; y < y_start + block_h && y < height; y++) {
+        for (int x = x_start; x < x_start + block_w && x < width; x++) {
+          dbutteraugli += powf(diffmap[y * width + x], 12.0f);
+          float px_diff = source->y_buffer[y * source->y_stride + x] -
+                          recon->y_buffer[y * recon->y_stride + x];
+          dmse += px_diff * px_diff;
+          px_count += 1.0f;
+        }
+      }
+      const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+                               (height + ss_y) >> ss_y);
+      for (int y = y_start >> ss_y; y < y_end; y++) {
+        const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+                                 (width + ss_x) >> ss_x);
+        for (int x = x_start >> ss_x; x < x_end; x++) {
+          const int src_px_index = y * source->uv_stride + x;
+          const int recon_px_index = y * recon->uv_stride + x;
+          const float px_diff_u = (float)(source->u_buffer[src_px_index] -
+                                          recon->u_buffer[recon_px_index]);
+          const float px_diff_v = (float)(source->v_buffer[src_px_index] -
+                                          recon->v_buffer[recon_px_index]);
+          dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+          px_count += 2.0f;
+        }
+      }
+
+      dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+      dmse = dmse / px_count;
+      const float eps = 0.01f;
+      double weight;
+      if (dbutteraugli < eps || dmse < eps) {
+        weight = -1.0;
+      } else {
+        blk_count += 1.0;
+        weight = dmse / dbutteraugli;
+        weight = AOMMIN(weight, 5.0);
+        weight += K;
+        log_sum += log(weight);
+      }
+      cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
+    }
+  }
+  // Geometric average of the weights.
+  log_sum = exp(log_sum / blk_count);
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index];
+      if (*weight <= 0.0) {
+        *weight = 1.0;
+      } else {
+        *weight /= log_sum;
+      }
+      *weight = AOMMIN(*weight, 2.5);
+      *weight = AOMMAX(*weight, 0.4);
+    }
+  }
+
+  aom_free(diffmap);
+}
+
+void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                int *rdmult) {
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI);
+  if (!cpi->butteraugli_info.recon_set) {
+    return;
+  }
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize];
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  for (int row = mi_row / num_mi_w;
+       row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+    for (int col = mi_col / num_mi_h;
+         col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale +=
+          log(cpi->butteraugli_info.rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
+
+static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
+                       int dst_stride, int w, int h) {
+  for (int row = 0; row < h; row++) {
+    memcpy(dst, src, w);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     int width, int height) {
+  copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
+             height);
+  const int width_uv = (width + src->subsampling_x) >> src->subsampling_x;
+  const int height_uv = (height + src->subsampling_y) >> src->subsampling_y;
+  copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+             width_uv, height_uv);
+  copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+             width_uv, height_uv);
+}
+
+static void zero_plane(uint8_t *dst, int dst_stride, int h) {
+  for (int row = 0; row < h; row++) {
+    memset(dst, 0, dst_stride);
+    dst += dst_stride;
+  }
+}
+
+static void zero_img(YV12_BUFFER_CONFIG *dst) {
+  zero_plane(dst->y_buffer, dst->y_stride, dst->y_height);
+  zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height);
+  zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height);
+}
+
+void av1_setup_butteraugli_source(AV1_COMP *cpi) {
+  YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source;
+  AV1_COMMON *const cm = &cpi->common;
+  const int width = cpi->source->y_crop_width;
+  const int height = cpi->source->y_crop_height;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
+  if (dst->buffer_alloc_sz == 0) {
+    aom_alloc_frame_buffer(
+        dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+        cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0);
+  }
+  av1_copy_and_extend_frame(cpi->source, dst);
+
+  YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source;
+  if (resized_dst->buffer_alloc_sz == 0) {
+    aom_alloc_frame_buffer(
+        resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y,
+        cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+        cm->features.byte_alignment, 0);
+  }
+  av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth,
+                                           av1_num_planes(cm));
+
+  zero_img(cpi->source);
+  copy_img(resized_dst, cpi->source, width / resize_factor,
+           height / resize_factor);
+}
+
+void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
+  av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source);
+  AV1_COMMON *const cm = &cpi->common;
+  const int width = cpi->source->y_crop_width;
+  const int height = cpi->source->y_crop_height;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
+
+  YV12_BUFFER_CONFIG resized_recon;
+  memset(&resized_recon, 0, sizeof(resized_recon));
+  aom_alloc_frame_buffer(
+      &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0);
+  copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+           height / resize_factor);
+
+  set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source,
+                                    &resized_recon, K);
+  cpi->butteraugli_info.recon_set = true;
+  aom_free_frame_buffer(&resized_recon);
+}
+
+void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+  const int q_index = 96;
+
+  // Setup necessary params for encoding, including frame source, etc.
+  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+  av1_set_frame_size(cpi, cm->superres_upscaled_width,
+                     cm->superres_upscaled_height);
+
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+      0, false, false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
+  if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+        cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
+  }
+
+  av1_setup_butteraugli_source(cpi);
+  av1_setup_frame(cpi);
+
+  if (cm->seg.enabled) {
+    if (!cm->seg.update_data && cm->prev_frame) {
+      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+      cm->seg.enabled = cm->prev_frame->seg.enabled;
+    } else {
+      av1_calculate_segdata(&cm->seg);
+    }
+  } else {
+    memset(&cm->seg, 0, sizeof(cm->seg));
+  }
+  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+  cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  const PARTITION_SEARCH_TYPE partition_search_type =
+      cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size;
+  // Enable a quicker pass by uncommenting the following lines:
+  // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+
+  av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index,
+                    q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+  av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+  if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
+    av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                       cm->seq_params->bit_depth);
+
+  av1_set_variance_partition_thresholds(cpi, q_index, 0);
+  av1_encode_frame(cpi);
+
+  av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3);
+  cpi->sf.part_sf.partition_search_type = partition_search_type;
+  cpi->sf.part_sf.fixed_partition_size = fixed_partition_size;
+}
diff --git a/media/libaom/src/av1/encoder/tune_butteraugli.h b/media/libaom/src/av1/encoder/tune_butteraugli.h
new file mode 100644
index 0000000000..bae5d2a882
--- /dev/null
+++ b/media/libaom/src/av1/encoder/tune_butteraugli.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+  // Stores the scaling factors for rdmult when tuning for Butteraugli.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+  // 4x4 block at (row, col).
+  double *rdmult_scaling_factors;
+  YV12_BUFFER_CONFIG source, resized_source;
+  bool recon_set;
+} TuneButteraugliInfo;
+
+struct AV1_COMP;
+static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16;
+
+void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                int *rdmult);
+
+void av1_setup_butteraugli_source(struct AV1_COMP *cpi);
+
+// 'K' is used to balance the rate-distortion distribution between PSNR
+// and Butteraugli.
+void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi,
+                                                     double K);
+
+void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi);
+
+#endif  // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
diff --git a/media/libaom/src/av1/encoder/tune_vmaf.c b/media/libaom/src/av1/encoder/tune_vmaf.c
index 997f78e27c..0477bbd759 100644
--- a/media/libaom/src/av1/encoder/tune_vmaf.c
+++ b/media/libaom/src/av1/encoder/tune_vmaf.c
@@ -12,13 +12,184 @@
 #include "av1/encoder/tune_vmaf.h"
 
 #include "aom_dsp/psnr.h"
-#include "aom_dsp/vmaf.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/rdopt.h"
+#include "config/aom_scale_rtcd.h"
 
 static const double kBaselineVmaf = 97.42773;
 
+static double get_layer_value(const double *array, int layer) {
+  while (array[layer] < 0.0 && layer > 0) layer--;
+  return AOMMAX(array[layer], 0.0);
+}
+
+static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
+                          const YV12_BUFFER_CONFIG *ref,
+                          const BLOCK_SIZE block_size, const int mb_row,
+                          const int mb_col, FULLPEL_MV *ref_mv) {
+  // Block information (ONLY Y-plane is used for motion search).
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const struct buf_2d ori_src_buf = mb->plane[0].src;
+  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+  // Parameters used for motion search.
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  const SEARCH_METHODS search_method = NSTEP;
+  const search_site_config *search_site_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+  const int step_param =
+      av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height));
+
+  // Baseline position for motion search (used for rate distortion comparison).
+  const MV baseline_mv = kZeroMv;
+
+  // Setup.
+  mb->plane[0].src.buf = src->y_buffer + y_offset;
+  mb->plane[0].src.stride = y_stride;
+  mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset;
+  mbd->plane[0].pre[0].stride = y_stride;
+
+  // Unused intermediate results for motion search.
+  int cost_list[5];
+
+  // Do motion search.
+  // Only do full search on the entire block.
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+                                     &baseline_mv, search_site_cfg,
+                                     /*fine_search_interval=*/0);
+  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
+  av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), ref_mv, NULL);
+
+  // Restore input state.
+  mb->plane[0].src = ori_src_buf;
+  mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+static unsigned int residual_variance(const AV1_COMP *cpi,
+                                      const YV12_BUFFER_CONFIG *src,
+                                      const YV12_BUFFER_CONFIG *ref,
+                                      const BLOCK_SIZE block_size,
+                                      const int mb_row, const int mb_col,
+                                      FULLPEL_MV ref_mv, unsigned int *sse) {
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+  const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+  const unsigned int var = cpi->ppi->fn_ptr[block_size].vf(
+      ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset,
+      y_stride, sse);
+  return var;
+}
+
+static double frame_average_variance(const AV1_COMP *const cpi,
+                                     const YV12_BUFFER_CONFIG *const frame) {
+  const uint8_t *const y_buffer = frame->y_buffer;
+  const int y_stride = frame->y_stride;
+  const BLOCK_SIZE block_size = BLOCK_64X64;
+
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  int row, col;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  double var = 0.0, var_count = 0.0;
+
+  // Loop through each block.
+  for (row = 0; row < frame->y_height / block_h; ++row) {
+    for (col = 0; col < frame->y_width / block_w; ++col) {
+      struct buf_2d buf;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+
+      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+      buf.stride = y_stride;
+
+      if (cpi->common.seq_params->use_highbitdepth) {
+        assert(frame->flags & YV12_FLAG_HIGHBITDEPTH);
+        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
+                                                  bit_depth);
+      } else {
+        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
+      }
+      var_count += 1.0;
+    }
+  }
+  var /= var_count;
+  return var;
+}
+
+static double residual_frame_average_variance(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *src,
+                                              const YV12_BUFFER_CONFIG *ref,
+                                              FULLPEL_MV *mvs) {
+  if (ref == NULL) return frame_average_variance(cpi, src);
+  const BLOCK_SIZE block_size = BLOCK_16X16;
+  const int frame_height = src->y_height;
+  const int frame_width = src->y_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+  const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  bool do_motion_search = false;
+  if (mvs == NULL) {
+    do_motion_search = true;
+    CHECK_MEM_ERROR(&cpi->common, mvs,
+                    (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs)));
+  }
+
+  unsigned int variance = 0;
+  // Perform temporal filtering block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+      if (do_motion_search) {
+        motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+      }
+      unsigned int mv_sse;
+      const unsigned int blk_var = residual_variance(
+          cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+      variance += blk_var;
+    }
+  }
+
+  // Restore input state
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
 // TODO(sdeng): Add the SIMD implementation.
 static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
                                            int source_stride,
@@ -60,7 +231,10 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
                                const YV12_BUFFER_CONFIG *blurred,
                                const YV12_BUFFER_CONFIG *dst, double amount) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  if (bit_depth > 8) {
+  if (cpi->common.seq_params->use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
     highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
                         CONVERT_TO_SHORTPTR(blurred->y_buffer),
                         blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
@@ -90,7 +264,6 @@ static AOM_INLINE void gaussian_blur(const int bit_depth,
   ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
   InterpFilterParams filter = { .filter_ptr = gauss_filter,
                                 .taps = 8,
-                                .subpel_shifts = 0,
                                 .interp_filter = EIGHTTAP_REGULAR };
 
   for (row = 0; row < num_rows; ++row) {
@@ -103,7 +276,7 @@ static AOM_INLINE void gaussian_blur(const int bit_depth,
       uint8_t *dst_buf =
           dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
 
-      if (bit_depth > 8) {
+      if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
         av1_highbd_convolve_2d_sr(
             CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
             CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
@@ -117,48 +290,18 @@ static AOM_INLINE void gaussian_blur(const int bit_depth,
   }
 }
 
-static double frame_average_variance(const AV1_COMP *const cpi,
-                                     const YV12_BUFFER_CONFIG *const frame) {
-  const uint8_t *const y_buffer = frame->y_buffer;
-  const int y_stride = frame->y_stride;
-  const BLOCK_SIZE block_size = BLOCK_64X64;
-
-  const int block_w = mi_size_wide[block_size] * 4;
-  const int block_h = mi_size_high[block_size] * 4;
-  int row, col;
+static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi,
+                                         double source_variance,
+                                         YV12_BUFFER_CONFIG *const source,
+                                         YV12_BUFFER_CONFIG *const sharpened) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  double var = 0.0, var_count = 0.0;
-
-  // Loop through each block.
-  for (row = 0; row < frame->y_height / block_h; ++row) {
-    for (col = 0; col < frame->y_width / block_w; ++col) {
-      struct buf_2d buf;
-      const int row_offset_y = row * block_h;
-      const int col_offset_y = col * block_w;
-
-      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
-      buf.stride = y_stride;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  double new_vmaf;
 
-      if (bit_depth > 8) {
-        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
-                                                  bit_depth);
-      } else {
-        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
-      }
-      var_count += 1.0;
-    }
-  }
-  var /= var_count;
-  return var;
-}
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth,
+                cal_vmaf_neg, &new_vmaf);
 
-static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance,
-                              YV12_BUFFER_CONFIG *const source,
-                              YV12_BUFFER_CONFIG *const sharpened) {
-  const int bit_depth = cpi->td.mb.e_mbd.bd;
-  double new_vmaf;
-  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth,
-                &new_vmaf);
   const double sharpened_var = frame_average_variance(cpi, sharpened);
   return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
 }
@@ -197,12 +340,12 @@ static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
-
   YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
   aom_alloc_frame_buffer(
-      &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+      &sharpened, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0);
 
   const double baseline_variance = frame_average_variance(cpi, source);
   double unsharp_amount;
@@ -234,9 +377,35 @@ static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
   return unsharp_amount;
 }
 
+void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
+                                YV12_BUFFER_CONFIG *const source) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_width;
+  const int height = source->y_height;
+
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  const double best_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+  if (best_frame_unsharp_amount <= 0.0) return;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(
+      &blurred, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0);
+
+  gaussian_blur(bit_depth, source, &blurred);
+  unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+  aom_free_frame_buffer(&blurred);
+}
+
 void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
                                   YV12_BUFFER_CONFIG *const source) {
-  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int width = source->y_width;
@@ -246,50 +415,69 @@ void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
   memset(&source_extended, 0, sizeof(source_extended));
   memset(&blurred, 0, sizeof(blurred));
   aom_alloc_frame_buffer(
-      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+      &source_extended, width, height, source->subsampling_x,
+      source->subsampling_y, cm->seq_params->use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0);
   aom_alloc_frame_buffer(
-      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+      &blurred, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0);
 
   av1_copy_and_extend_frame(source, &source_extended);
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  const double last_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
   const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
-      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
-  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+      cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+  cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+      best_frame_unsharp_amount;
 
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
   aom_free_frame_buffer(&blurred);
-  aom_clear_system_state();
 }
 
 void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
                                 YV12_BUFFER_CONFIG *const source) {
-  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
 
   YV12_BUFFER_CONFIG source_extended, blurred;
   memset(&blurred, 0, sizeof(blurred));
   memset(&source_extended, 0, sizeof(source_extended));
   aom_alloc_frame_buffer(
-      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+      &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
 
   av1_copy_and_extend_frame(source, &source_extended);
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  const double last_frame_unsharp_amount =
+      get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
   const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
-      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
-  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
+      cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+  cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+      best_frame_unsharp_amount;
 
   const int block_size = BLOCK_64X64;
   const int block_w = mi_size_wide[block_size] * 4;
@@ -297,19 +485,23 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
   const int num_cols = (source->y_width + block_w - 1) / block_w;
   const int num_rows = (source->y_height + block_h - 1) / block_h;
   double *best_unsharp_amounts =
-      aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows);
-  memset(best_unsharp_amounts, 0,
-         sizeof(*best_unsharp_amounts) * num_cols * num_rows);
+      aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts));
+  if (!best_unsharp_amounts) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating vmaf data");
+  }
 
   YV12_BUFFER_CONFIG source_block, blurred_block;
   memset(&source_block, 0, sizeof(source_block));
   memset(&blurred_block, 0, sizeof(blurred_block));
-  aom_alloc_frame_buffer(
-      &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
 
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
@@ -319,7 +511,9 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
       const int block_height = AOMMIN(height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (bit_depth > 8) {
+      if (cm->seq_params->use_highbitdepth) {
+        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
                                   row_offset_y * source->y_stride +
                                   col_offset_y;
@@ -386,7 +580,9 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (bit_depth > 8) {
+      if (cm->seq_params->use_highbitdepth) {
+        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
                             row_offset_y * source->y_stride + col_offset_y;
         uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
@@ -411,92 +607,6 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
   aom_free_frame_buffer(&blurred_block);
   aom_free_frame_buffer(&blurred);
   aom_free(best_unsharp_amounts);
-  aom_clear_system_state();
-}
-
-typedef struct FrameData {
-  const YV12_BUFFER_CONFIG *source, *blurred;
-  int block_w, block_h, num_rows, num_cols, row, col, bit_depth;
-} FrameData;
-
-// A callback function used to pass data to VMAF.
-// Returns 0 after reading a frame.
-// Returns 2 when there is no more frame to read.
-static int update_frame(float *ref_data, float *main_data, float *temp_data,
-                        int stride, void *user_data) {
-  FrameData *frames = (FrameData *)user_data;
-  const int width = frames->source->y_width;
-  const int height = frames->source->y_height;
-  const int row = frames->row;
-  const int col = frames->col;
-  const int num_rows = frames->num_rows;
-  const int num_cols = frames->num_cols;
-  const int block_w = frames->block_w;
-  const int block_h = frames->block_h;
-  const YV12_BUFFER_CONFIG *source = frames->source;
-  const YV12_BUFFER_CONFIG *blurred = frames->blurred;
-  const int bit_depth = frames->bit_depth;
-  const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
-  (void)temp_data;
-  stride /= (int)sizeof(*ref_data);
-
-  for (int i = 0; i < height; ++i) {
-    float *ref, *main;
-    ref = ref_data + i * stride;
-    main = main_data + i * stride;
-    if (bit_depth == 8) {
-      uint8_t *src;
-      src = source->y_buffer + i * source->y_stride;
-      for (int j = 0; j < width; ++j) {
-        ref[j] = main[j] = (float)src[j];
-      }
-    } else {
-      uint16_t *src;
-      src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride;
-      for (int j = 0; j < width; ++j) {
-        ref[j] = main[j] = scale_factor * (float)src[j];
-      }
-    }
-  }
-  if (row < num_rows && col < num_cols) {
-    // Set current block
-    const int row_offset = row * block_h;
-    const int col_offset = col * block_w;
-    const int block_width = AOMMIN(width - col_offset, block_w);
-    const int block_height = AOMMIN(height - row_offset, block_h);
-
-    float *main_buf = main_data + col_offset + row_offset * stride;
-    if (bit_depth == 8) {
-      uint8_t *blurred_buf =
-          blurred->y_buffer + row_offset * blurred->y_stride + col_offset;
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          main_buf[j] = (float)blurred_buf[j];
-        }
-        main_buf += stride;
-        blurred_buf += blurred->y_stride;
-      }
-    } else {
-      uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) +
-                              row_offset * blurred->y_stride + col_offset;
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          main_buf[j] = scale_factor * (float)blurred_buf[j];
-        }
-        main_buf += stride;
-        blurred_buf += blurred->y_stride;
-      }
-    }
-
-    frames->col++;
-    if (frames->col >= num_cols) {
-      frames->col = 0;
-      frames->row++;
-    }
-    return 0;
-  } else {
-    return 2;
-  }
 }
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
@@ -506,16 +616,17 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
   const int resized_block_size = BLOCK_32X32;
   const int resize_factor = 2;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
 
-  aom_clear_system_state();
   YV12_BUFFER_CONFIG resized_source;
   memset(&resized_source, 0, sizeof(resized_source));
   aom_alloc_frame_buffer(
-      &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
-      cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
-      cm->features.byte_alignment);
-  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
-                              av1_num_planes(cm));
+      &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
+      ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0);
+  av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
+                                           bit_depth, av1_num_planes(cm));
 
   const int resized_y_width = resized_source.y_width;
   const int resized_y_height = resized_source.y_height;
@@ -528,27 +639,29 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
 
   YV12_BUFFER_CONFIG blurred;
   memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1,
-                         cm->seq_params.use_highbitdepth,
+  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x,
+                         ss_y, cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
-                         cm->features.byte_alignment);
+                         cm->features.byte_alignment, 0);
   gaussian_blur(bit_depth, &resized_source, &blurred);
 
-  double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols));
-  memset(scores, 0, sizeof(*scores) * (num_rows * num_cols));
-  FrameData frame_data;
-  frame_data.source = &resized_source;
-  frame_data.blurred = &blurred;
-  frame_data.block_w = resized_block_w;
-  frame_data.block_h = resized_block_h;
-  frame_data.num_rows = num_rows;
-  frame_data.num_cols = num_cols;
-  frame_data.row = 0;
-  frame_data.col = 0;
-  frame_data.bit_depth = bit_depth;
-  aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path,
-                            update_frame, resized_y_width, resized_y_height,
-                            bit_depth, scores);
+  YV12_BUFFER_CONFIG recon;
+  memset(&recon, 0, sizeof(recon));
+  aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_yv12_copy_frame(&resized_source, &recon, 1);
+
+  VmafContext *vmaf_context;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg);
+  unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses));
+  if (!sses) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating vmaf data");
+  }
 
   // Loop through each 'block_size' block.
   for (int row = 0; row < num_rows; ++row) {
@@ -563,14 +676,51 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
       uint8_t *const blurred_buf =
           blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
 
-      const double vmaf = scores[index];
+      cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                              blurred_buf, blurred.y_stride,
+                                              &sses[index]);
+
+      uint8_t *const recon_buf =
+          recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
+      // Set recon buf
+      if (cpi->common.seq_params->use_highbitdepth) {
+        highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+                            resized_block_w, resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
+                     blurred.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+
+      aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth,
+                          index);
+
+      // Restore recon buf
+      if (cpi->common.seq_params->use_highbitdepth) {
+        highbd_unsharp_rect(
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+            resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
+                     resized_source.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+    }
+  }
+  aom_flush_vmaf_context(vmaf_context);
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const double vmaf = aom_calc_vmaf_at_index(
+          vmaf_context, cpi->vmaf_info.vmaf_model, index);
       const double dvmaf = kBaselineVmaf - vmaf;
-      unsigned int sse;
-      cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
-                                         blurred_buf, blurred.y_stride, &sse);
 
       const double mse =
-          (double)sse / (double)(resized_y_width * resized_y_height);
+          (double)sses[index] / (double)(resized_y_width * resized_y_height);
       double weight;
       const double eps = 0.01 / (num_rows * num_cols);
       if (dvmaf < eps || mse < eps) {
@@ -581,14 +731,14 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
 
       // Normalize it with a data fitted model.
       weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
-      cpi->vmaf_rdmult_scaling_factors[index] = weight;
+      cpi->vmaf_info.rdmult_scaling_factors[index] = weight;
     }
   }
 
   aom_free_frame_buffer(&resized_source);
   aom_free_frame_buffer(&blurred);
-  aom_free(scores);
-  aom_clear_system_state();
+  aom_close_vmaf_context(vmaf_context);
+  aom_free(sses);
 }
 
 void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -607,13 +757,12 @@ void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
   double num_of_mi = 0.0;
   double geom_mean_of_scale = 0.0;
 
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col / num_mi_h;
          col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->vmaf_rdmult_scaling_factors[index]);
+      geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]);
       num_of_mi += 1.0;
     }
   }
@@ -621,8 +770,7 @@ void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
 
   *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
   *rdmult = AOMMAX(*rdmult, 0);
-  set_error_per_bit(x, *rdmult);
-  aom_clear_system_state();
+  av1_set_error_per_bit(&x->errorperbit, *rdmult);
 }
 
 // TODO(sdeng): replace them with the SIMD versions.
@@ -662,35 +810,43 @@ static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride,
   return accum / (double)(h * w);
 }
 
-static AOM_INLINE double calc_vmaf_motion_score(
-    const AV1_COMP *const cpi, const AV1_COMMON *const cm,
-    const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last,
-    const YV12_BUFFER_CONFIG *const next) {
+static double calc_vmaf_motion_score(const AV1_COMP *const cpi,
+                                     const AV1_COMMON *const cm,
+                                     const YV12_BUFFER_CONFIG *const cur,
+                                     const YV12_BUFFER_CONFIG *const last,
+                                     const YV12_BUFFER_CONFIG *const next) {
   const int y_width = cur->y_width;
   const int y_height = cur->y_height;
   YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = cur->subsampling_x;
+  const int ss_y = cur->subsampling_y;
 
   memset(&blurred_cur, 0, sizeof(blurred_cur));
   memset(&blurred_last, 0, sizeof(blurred_last));
   memset(&blurred_next, 0, sizeof(blurred_next));
 
-  aom_alloc_frame_buffer(
-      &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
 
   gaussian_blur(bit_depth, cur, &blurred_cur);
   gaussian_blur(bit_depth, last, &blurred_last);
   if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-  if (bit_depth > 8) {
+  if (cm->seq_params->use_highbitdepth) {
+    assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
     const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
     motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
                                  blurred_cur.y_stride,
@@ -698,6 +854,7 @@ static AOM_INLINE double calc_vmaf_motion_score(
                                  blurred_last.y_stride, y_width, y_height) *
               scale_factor;
     if (next) {
+      assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
       motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
                                    blurred_cur.y_stride,
                                    CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
@@ -722,6 +879,21 @@ static AOM_INLINE double calc_vmaf_motion_score(
   return AOMMIN(motion1, motion2);
 }
 
+static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi,
+                                           YV12_BUFFER_CONFIG **last,
+                                           YV12_BUFFER_CONFIG **next) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const int src_index =
+      cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index];
+  struct lookahead_entry *last_entry = av1_lookahead_peek(
+      cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage);
+  struct lookahead_entry *next_entry = av1_lookahead_peek(
+      cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage);
+  *next = &next_entry->img;
+  *last = cm->show_frame ? cpi->last_source : &last_entry->img;
+}
+
 // Calculates the new qindex from the VMAF motion score. This is based on the
 // observation: when the motion score becomes higher, the VMAF score of the
 // same source and distorted frames would become higher.
@@ -730,37 +902,34 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
   if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
     return current_qindex;
   }
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  const double last_frame_ysse =
+      get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth);
+  const double last_frame_vmaf =
+      get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth);
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  const double approx_sse =
-      cpi->last_frame_ysse /
-      (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8)));
-  const double approx_dvmaf = kBaselineVmaf - cpi->last_frame_vmaf;
+  const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) *
+                                                       (1 << (bit_depth - 8)));
+  const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf;
   const double sse_threshold =
       0.01 * cpi->source->y_width * cpi->source->y_height;
   const double vmaf_threshold = 0.01;
   if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
     return current_qindex;
   }
-  aom_clear_system_state();
-  const GF_GROUP *gf_group = &cpi->gf_group;
   YV12_BUFFER_CONFIG *cur_buf = cpi->source;
-  int src_index = 0;
   if (cm->show_frame == 0) {
-    src_index = gf_group->arf_src_offset[gf_group->index];
-    struct lookahead_entry *cur_entry =
-        av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage);
+    const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+    struct lookahead_entry *cur_entry = av1_lookahead_peek(
+        cpi->ppi->lookahead, src_index, cpi->compressor_stage);
     cur_buf = &cur_entry->img;
   }
   assert(cur_buf);
 
-  const struct lookahead_entry *last_entry =
-      av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage);
-  const struct lookahead_entry *next_entry =
-      av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage);
-  const YV12_BUFFER_CONFIG *next_buf = &next_entry->img;
-  const YV12_BUFFER_CONFIG *last_buf =
-      cm->show_frame ? cpi->last_source : &last_entry->img;
-
+  YV12_BUFFER_CONFIG *next_buf, *last_buf;
+  get_neighbor_frames(cpi, &last_buf, &next_buf);
   assert(last_buf);
 
   const double motion =
@@ -771,24 +940,171 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
   const double dsse = dvmaf * approx_sse / approx_dvmaf;
 
   const double beta = approx_sse / (dsse + approx_sse);
-  const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta);
+  const int offset =
+      av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta);
   int qindex = current_qindex + offset;
 
   qindex = AOMMIN(qindex, MAXQ);
   qindex = AOMMAX(qindex, MINQ);
 
-  aom_clear_system_state();
   return qindex;
 }
 
-void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *recon) {
+static AOM_INLINE double cal_approx_score(
+    AV1_COMP *const cpi, double src_variance, double new_variance,
+    double src_score, YV12_BUFFER_CONFIG *const src,
+    YV12_BUFFER_CONFIG *const recon_sharpened) {
+  double score;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth,
+                cal_vmaf_neg, &score);
+  return src_variance / new_variance * (score - src_score);
+}
+
+static double find_best_frame_unsharp_amount_loop_neg(
+    AV1_COMP *const cpi, double src_variance, double base_score,
+    YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon,
+    YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred,
+    YV12_BUFFER_CONFIG *const recon_blurred,
+    YV12_BUFFER_CONFIG *const src_sharpened,
+    YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs,
+    double best_score, const double unsharp_amount_start,
+    const double step_size, const int max_loop_count, const double max_amount) {
+  const double min_amount = 0.0;
+  int loop_count = 0;
+  double approx_score = best_score;
+  double unsharp_amount = unsharp_amount_start;
+
+  do {
+    best_score = approx_score;
+    unsharp_amount += step_size;
+    if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+    unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount);
+    unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount);
+    const double new_variance =
+        residual_frame_average_variance(cpi, src_sharpened, ref, mvs);
+    approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score,
+                                    src, recon_sharpened);
+
+    loop_count++;
+  } while (approx_score > best_score && loop_count < max_loop_count);
+  unsharp_amount =
+      approx_score > best_score ? unsharp_amount : unsharp_amount - step_size;
+
+  return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount_neg(
+    AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src,
+    YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref,
+    double base_score, const double unsharp_amount_start,
+    const double step_size, const int max_loop_count,
+    const double max_filter_amount) {
+  FULLPEL_MV *mvs = NULL;
+  const double src_variance =
+      residual_frame_average_variance(cpi, src, ref, mvs);
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int width = recon->y_width;
+  const int height = recon->y_height;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, recon, bit_depth,
-                &cpi->last_frame_vmaf);
-  if (bit_depth > 8) {
-    cpi->last_frame_ysse = (double)aom_highbd_get_y_sse(source, recon);
+  const int ss_x = recon->subsampling_x;
+  const int ss_y = recon->subsampling_y;
+
+  YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened;
+  memset(&recon_sharpened, 0, sizeof(recon_sharpened));
+  memset(&src_sharpened, 0, sizeof(src_sharpened));
+  memset(&recon_blurred, 0, sizeof(recon_blurred));
+  memset(&src_blurred, 0, sizeof(src_blurred));
+  aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0);
+  aom_alloc_frame_buffer(
+      &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0);
+
+  gaussian_blur(bit_depth, recon, &recon_blurred);
+  gaussian_blur(bit_depth, src, &src_blurred);
+
+  unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start);
+  unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start);
+  const double variance_start =
+      residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+  const double score_start = cal_approx_score(
+      cpi, src_variance, variance_start, base_score, src, &recon_sharpened);
+
+  const double unsharp_amount_next = unsharp_amount_start + step_size;
+  unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next);
+  unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next);
+  const double variance_next =
+      residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+  const double score_next = cal_approx_score(cpi, src_variance, variance_next,
+                                             base_score, src, &recon_sharpened);
+
+  double unsharp_amount;
+  if (score_next > score_start) {
+    unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+        cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+        &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next,
+        unsharp_amount_next, step_size, max_loop_count, max_filter_amount);
   } else {
-    cpi->last_frame_ysse = (double)aom_get_y_sse(source, recon);
+    unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+        cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+        &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start,
+        unsharp_amount_start, -step_size, max_loop_count, max_filter_amount);
+  }
+
+  aom_free_frame_buffer(&recon_sharpened);
+  aom_free_frame_buffer(&src_sharpened);
+  aom_free_frame_buffer(&recon_blurred);
+  aom_free_frame_buffer(&src_blurred);
+  aom_free(mvs);
+  return unsharp_amount;
+}
+
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+  YV12_BUFFER_CONFIG *source = cpi->source;
+  YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  double base_score;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
+                cal_vmaf_neg, &base_score);
+  cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+  if (cpi->common.seq_params->use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_highbd_get_y_sse(source, recon);
+  } else {
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_get_y_sse(source, recon);
+  }
+
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    YV12_BUFFER_CONFIG *last, *next;
+    get_neighbor_frames(cpi, &last, &next);
+    double best_unsharp_amount_start =
+        get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+    const int max_loop_count = 5;
+    cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+        find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score,
+                                           best_unsharp_amount_start, 0.025,
+                                           max_loop_count, 1.01);
   }
 }
diff --git a/media/libaom/src/av1/encoder/tune_vmaf.h b/media/libaom/src/av1/encoder/tune_vmaf.h
index c4cf072242..a04a29e6fe 100644
--- a/media/libaom/src/av1/encoder/tune_vmaf.h
+++ b/media/libaom/src/av1/encoder/tune_vmaf.h
@@ -12,21 +12,52 @@
 #ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
 #define AOM_AV1_ENCODER_TUNE_VMAF_H_
 
+#include "aom_dsp/vmaf.h"
 #include "aom_scale/yv12config.h"
-#include "av1/encoder/encoder.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
 
-void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+typedef struct {
+  // Stores the scaling factors for rdmult when tuning for VMAF.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+  // 64x64 block at (row, col).
+  double *rdmult_scaling_factors;
 
-void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+  // Stores the luma sse of the last frame.
+  double last_frame_ysse[MAX_ARF_LAYERS];
 
-void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi);
+  // Stores the VMAF of the last frame.
+  double last_frame_vmaf[MAX_ARF_LAYERS];
 
-void av1_set_vmaf_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                         int mi_row, int mi_col, int *rdmult);
+  // Stores the filter strength of the last frame.
+  double last_frame_unsharp_amount[MAX_ARF_LAYERS];
 
-int av1_get_vmaf_base_qindex(const AV1_COMP *cpi, int current_qindex);
+  // Stores the origial qindex before scaling.
+  int original_qindex;
 
-void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *recon);
+  // VMAF model used in VMAF caculations.
+  VmafModel *vmaf_model;
+} TuneVMAFInfo;
+
+struct AV1_COMP;
+
+void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi,
+                                YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi,
+                                  YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi,
+                                YV12_BUFFER_CONFIG *source);
+
+void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi);
+
+void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                         BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult);
+
+int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex);
+
+void av1_update_vmaf_curve(struct AV1_COMP *cpi);
 
 #endif  // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/media/libaom/src/av1/encoder/tx_search.c b/media/libaom/src/av1/encoder/tx_search.c
index 65b9a24726..e24800b8ce 100644
--- a/media/libaom/src/av1/encoder/tx_search.c
+++ b/media/libaom/src/av1/encoder/tx_search.c
@@ -11,14 +11,18 @@
 
 #include "av1/common/cfl.h"
 #include "av1/common/reconintra.h"
-#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/block.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/common/idct.h"
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/sorting_network.h"
 #include "av1/encoder/tx_prune_model_weights.h"
 #include "av1/encoder/tx_search.h"
+#include "av1/encoder/txb_rdopt.h"
+
+#define PROB_THRESH_OFFSET_TX_TYPE 100
 
 struct rdcost_block_args {
   const AV1_COMP *cpi;
@@ -30,7 +34,6 @@ struct rdcost_block_args {
   int64_t best_rd;
   int exit_early;
   int incomplete_exit;
-  int use_fast_coef_costing;
   FAST_TX_SEARCH_MODE ftxs_mode;
   int skip_trellis;
 };
@@ -41,11 +44,6 @@ typedef struct {
   TX_TYPE tx_type;
 } TxCandidateInfo;
 
-typedef struct {
-  int leaf;
-  int8_t children[4];
-} RD_RECORD_IDX_NODE;
-
 // origin_threshold * 128 / 100
 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   {
@@ -62,7 +60,7 @@ static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   },
 };
 
-// lookup table for predict_skip_flag
+// lookup table for predict_skip_txfm
 // int max_tx_size = max_txsize_rect_lookup[bsize];
 // if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
 //   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
@@ -73,259 +71,19 @@ static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
   TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
 };
 
-static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
-                                const uint32_t hash) {
-  // Linear search through the circular buffer to find matching hash.
-  for (int i = cur_record->index_start - 1; i >= 0; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  int index;
-  // If not found - add new RD info into the buffer and return its index
-  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
-    index = (cur_record->index_start + cur_record->num) %
-            TX_SIZE_RD_RECORD_BUFFER_LEN;
-    cur_record->num++;
-  } else {
-    index = cur_record->index_start;
-    cur_record->index_start =
-        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
-  }
-
-  cur_record->hash_vals[index] = hash;
-  av1_zero(cur_record->tx_rd_info[index]);
-  return index;
-}
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0, 0, 0, 0 } },
-  { 1, { 0, 0, 0, 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0 } },
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
-  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 5, 6 } },
-  { 0, { 7, 8, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 7, 8 } },
-  { 0, { 5, 6, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
-  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
-  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
-  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
-  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
-  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
-  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
-  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
-  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
-  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
-  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
-  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
-  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
-  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
-  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
-  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
-  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
-  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
-  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
-  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
-  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
-  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
-  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
-  { 0, { 1, -1, 2, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
-  NULL,                    // BLOCK_4X4
-  NULL,                    // BLOCK_4X8
-  NULL,                    // BLOCK_8X4
-  rd_record_tree_8x8,      // BLOCK_8X8
-  rd_record_tree_8x16,     // BLOCK_8X16
-  rd_record_tree_16x8,     // BLOCK_16X8
-  rd_record_tree_16x16,    // BLOCK_16X16
-  rd_record_tree_1_2,      // BLOCK_16X32
-  rd_record_tree_2_1,      // BLOCK_32X16
-  rd_record_tree_sqr,      // BLOCK_32X32
-  rd_record_tree_1_2,      // BLOCK_32X64
-  rd_record_tree_2_1,      // BLOCK_64X32
-  rd_record_tree_sqr,      // BLOCK_64X64
-  rd_record_tree_64x128,   // BLOCK_64X128
-  rd_record_tree_128x64,   // BLOCK_128X64
-  rd_record_tree_128x128,  // BLOCK_128X128
-  NULL,                    // BLOCK_4X16
-  NULL,                    // BLOCK_16X4
-  rd_record_tree_1_4,      // BLOCK_8X32
-  rd_record_tree_4_1,      // BLOCK_32X8
-  rd_record_tree_1_4,      // BLOCK_16X64
-  rd_record_tree_4_1,      // BLOCK_64X16
-};
-
-static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
-  0,                                                            // BLOCK_4X4
-  0,                                                            // BLOCK_4X8
-  0,                                                            // BLOCK_8X4
-  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
-  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
-  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
-  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
-  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
-  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
-  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
-  0,                                                            // BLOCK_4X16
-  0,                                                            // BLOCK_16X4
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
-};
-
-static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
-                                       BLOCK_SIZE bsize) {
-  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
-  const int size = rd_record_tree_size[bsize];
-  for (int i = 0; i < size; ++i) {
-    if (rd_record[i].leaf) {
-      av1_zero(tree[i].children);
-    } else {
-      for (int j = 0; j < 4; ++j) {
-        const int8_t idx = rd_record[i].children[j];
-        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
-      }
-    }
-  }
-}
-
-// Go through all TX blocks that could be used in TX size search, compute
-// residual hash values for them and find matching RD info that stores previous
-// RD search results for these TX blocks. The idea is to prevent repeated
-// rate/distortion computations that happen because of the combination of
-// partition and TX size search. The resulting RD info records are returned in
-// the form of a quadtree for easier access in actual TX size search.
-static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
-                                   TXB_RD_INFO_NODE *dst_rd_info) {
-  TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
-                                         x->txb_rd_record_16X16,
-                                         x->txb_rd_record_32X32,
-                                         x->txb_rd_record_64X64 };
-  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-
-  // Hashing is performed only for square TX sizes larger than TX_4X4
-  if (max_square_tx_size < TX_8X8) return 0;
-  const int diff_stride = bw;
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int16_t *diff = &p->src_diff[0];
-  init_rd_record_tree(dst_rd_info, bsize);
-  // Coordinates of the top-left corner of current block within the superblock
-  // measured in pixels:
-  const int mi_row = x->e_mbd.mi_row;
-  const int mi_col = x->e_mbd.mi_col;
-  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  int cur_rd_info_idx = 0;
-  int cur_tx_depth = 0;
-  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
-  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
-    const int cur_tx_bw = tx_size_wide[cur_tx_size];
-    const int cur_tx_bh = tx_size_high[cur_tx_size];
-    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
-    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
-    const int tx_size_idx = cur_tx_size - TX_8X8;
-    for (int row = 0; row < bh; row += cur_tx_bh) {
-      for (int col = 0; col < bw; col += cur_tx_bw) {
-        if (cur_tx_bw != cur_tx_bh) {
-          // Use dummy nodes for all rectangular transforms within the
-          // TX size search tree.
-          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
-        } else {
-          // Get spatial location of this TX block within the superblock
-          // (measured in cur_tx_bsize units).
-          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
-          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
-
-          int16_t hash_data[MAX_SB_SQUARE];
-          int16_t *cur_hash_row = hash_data;
-          const int16_t *cur_diff_row = diff + row * diff_stride + col;
-          for (int i = 0; i < cur_tx_bh; i++) {
-            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
-            cur_hash_row += cur_tx_bw;
-            cur_diff_row += diff_stride;
-          }
-          const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                                (uint8_t *)hash_data,
-                                                2 * cur_tx_bw * cur_tx_bh);
-          // Find corresponding RD info based on the hash value.
-          const int record_idx =
-              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
-          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
-          int idx = find_tx_size_rd_info(records, hash);
-          dst_rd_info[cur_rd_info_idx].rd_info_array =
-              &records->tx_rd_info[idx];
-        }
-        ++cur_rd_info_idx;
-      }
-    }
-    cur_tx_size = next_tx_size;
-    ++cur_tx_depth;
-  }
-  return 1;
-}
+// look-up table for sqrt of number of pixels in a transform block
+// rounded up to the nearest integer.
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4,  8,  16, 32, 32, 6,  6,
+                                                     12, 12, 23, 23, 32, 32, 8,
+                                                     8,  16, 16, 23, 23 };
 
 static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
   const int16_t *diff = x->plane[0].src_diff;
-  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
-                                             (uint8_t *)diff, 2 * rows * cols);
+  const uint32_t hash =
+      av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator,
+                           (uint8_t *)diff, 2 * rows * cols);
   return (hash << 5) + bsize;
 }
 
@@ -336,9 +94,9 @@ static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
   if (ref_best_rd != INT64_MAX) {
     for (int i = 0; i < mb_rd_record->num; ++i) {
       const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
-      // If there is a match in the tx_rd_record, fetch the RD decision and
+      // If there is a match in the mb_rd_record, fetch the RD decision and
       // terminate early.
-      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+      if (mb_rd_record->mb_rd_info[index].hash_value == hash) {
         match_index = index;
         break;
       }
@@ -347,18 +105,18 @@ static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
   return match_index;
 }
 
-static AOM_INLINE void fetch_tx_rd_info(int n4,
-                                        const MB_RD_INFO *const tx_rd_info,
+static AOM_INLINE void fetch_mb_rd_info(int n4,
+                                        const MB_RD_INFO *const mb_rd_info,
                                         RD_STATS *const rd_stats,
                                         MACROBLOCK *const x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  mbmi->tx_size = tx_rd_info->tx_size;
-  memcpy(x->blk_skip, tx_rd_info->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
-  av1_copy_array(xd->tx_type_map, tx_rd_info->tx_type_map, n4);
-  *rd_stats = tx_rd_info->rd_stats;
+  mbmi->tx_size = mb_rd_info->tx_size;
+  memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip,
+         sizeof(mb_rd_info->blk_skip[0]) * n4);
+  av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size);
+  av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4);
+  *rd_stats = mb_rd_info->rd_stats;
 }
 
 // Compute the pixel domain distortion from diff on all visible 4x4s in the
@@ -388,11 +146,43 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
   return sse;
 }
 
+// Computes the residual block's SSE and mean on all visible 4x4s in the
+// transform block
+static INLINE int64_t pixel_diff_stats(
+    MACROBLOCK *x, int plane, int blk_row, int blk_col,
+    const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+    unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+  int visible_rows, visible_cols;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &visible_cols, &visible_rows);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+
+  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+  uint64_t sse = 0;
+  int sum = 0;
+  sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+  if (visible_cols > 0 && visible_rows > 0) {
+    double norm_factor = 1.0 / (visible_cols * visible_rows);
+    int sign_sum = sum > 0 ? 1 : -1;
+    // Conversion to transform domain
+    *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+    *per_px_mean = sign_sum * (*per_px_mean);
+    *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+    *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+  } else {
+    *block_mse_q8 = UINT_MAX;
+  }
+  return sse;
+}
+
 // Uses simple features on top of DCT coefficients to quickly predict
 // whether optimal RD decision is to skip encoding the residual.
 // The sse value is stored in dist.
-static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
                              int reduced_tx_set) {
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -408,12 +198,12 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
   // For faster early skip decision, use dist to compare against threshold so
   // that quality risk is less for the skip=1 decision. Otherwise, use mse
   // since the fwd_txfm coeff checks will take care of quality
-  // TODO(any): Use dist to return 0 when predict_skip_level is 1
-  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // TODO(any): Use dist to return 0 when skip_txfm_level is 1
+  int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse;
   // Predict not to skip when error is larger than threshold.
   if (pred_err > mse_thresh) return 0;
   // Return as skip otherwise for aggressive early skip
-  else if (x->predict_skip_level >= 2)
+  else if (txfm_params->skip_txfm_level >= 2)
     return 1;
 
   const int max_tx_size = max_predict_sf_tx_size[bsize];
@@ -452,7 +242,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
 }
 
 // Used to set proper context for early termination with skip = 1.
-static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
+static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
                                      int bsize, int64_t dist) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -461,8 +251,9 @@ static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
   memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
   memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
   mbmi->tx_size = tx_size;
-  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
-  rd_stats->skip = 1;
+  for (int i = 0; i < n4; ++i)
+    set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
+  rd_stats->skip_txfm = 1;
   if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
   rd_stats->dist = rd_stats->sse = (dist << 4);
   // Though decision is to make the block as skip based on luma stats,
@@ -484,37 +275,37 @@ static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   TXB_CTX txb_ctx;
   get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+  const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   rd_stats->rate = zero_blk_rate *
                    (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
                    (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
 }
 
-static AOM_INLINE void save_tx_rd_info(int n4, uint32_t hash,
+static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash,
                                        const MACROBLOCK *const x,
                                        const RD_STATS *const rd_stats,
-                                       MB_RD_RECORD *tx_rd_record) {
+                                       MB_RD_RECORD *mb_rd_record) {
   int index;
-  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+  if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) {
     index =
-        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
-    ++tx_rd_record->num;
+        (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++mb_rd_record->num;
   } else {
-    index = tx_rd_record->index_start;
-    tx_rd_record->index_start =
-        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+    index = mb_rd_record->index_start;
+    mb_rd_record->index_start =
+        (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
   }
-  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+  MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index];
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
-  tx_rd_info->hash_value = hash;
-  tx_rd_info->tx_size = mbmi->tx_size;
-  memcpy(tx_rd_info->blk_skip, x->blk_skip,
-         sizeof(tx_rd_info->blk_skip[0]) * n4);
-  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
-  av1_copy_array(tx_rd_info->tx_type_map, xd->tx_type_map, n4);
-  tx_rd_info->rd_stats = *rd_stats;
+  mb_rd_info->hash_value = hash;
+  mb_rd_info->tx_size = mbmi->tx_size;
+  memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip,
+         sizeof(mb_rd_info->blk_skip[0]) * n4);
+  av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size);
+  av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4);
+  mb_rd_info->rd_stats = *rd_stats;
 }
 
 static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
@@ -544,8 +335,7 @@ static AOM_INLINE void select_tx_block(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node);
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
 
 // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
 // 0: Do not collect any RD stats
@@ -569,7 +359,7 @@ static AOM_INLINE void get_energy_distribution_fine(
     assert(bw <= 32);
     assert(bh <= 32);
     assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
-    if (cpi->common.seq_params.use_highbitdepth) {
+    if (cpi->common.seq_params->use_highbitdepth) {
       const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
       const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
       for (int i = 0; i < bh; ++i)
@@ -594,43 +384,43 @@ static AOM_INLINE void get_energy_distribution_fine(
     const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
     assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
     assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[1]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[2]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[3]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[1]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[2]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[3]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[5]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[6]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[7]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[5]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[6]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[7]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[9]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[10]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[11]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[9]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[10]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[11]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[13]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[14]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[15]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[13]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[14]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[15]);
   }
 
   double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
@@ -720,13 +510,13 @@ static AOM_INLINE void get_2x2_normalized_sses_and_sads(
 
         if (sse_norm_arr) {
           unsigned int this_sse;
-          cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
-                                        dst_stride, &this_sse);
+          cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+                                             dst_stride, &this_sse);
           sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
         }
 
         if (sad_norm_arr) {
-          const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+          const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf(
               this_src, src_stride, this_dst, dst_stride);
           sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
         }
@@ -783,11 +573,11 @@ static AOM_INLINE void PrintTransformUnitStats(
   const uint8_t *const dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   unsigned int sse;
-  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
   const double sse_norm = (double)sse / num_samples;
 
   const unsigned int sad =
-      cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+      cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
   const double sad_norm = (double)sad / num_samples;
 
   fprintf(fout, " %g %g", sse_norm, sad_norm);
@@ -850,14 +640,14 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
   for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
-                                               pd->subsampling_y);
+    const BLOCK_SIZE bs =
+        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
     if (x->skip_chroma_rd && plane) continue;
 
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
+    cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, &sse);
     total_sse += sse;
   }
   total_sse <<= 4;
@@ -867,7 +657,6 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
 static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                              int64_t sse, int *est_residue_cost,
                              int64_t *est_dist) {
-  aom_clear_system_state();
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   if (md->ready) {
     if (sse < md->dist_mean) {
@@ -981,7 +770,7 @@ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
   const double sse_norm = (double)sse / num_samples;
 
   const unsigned int sad =
-      cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+      cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
   const double sad_norm =
       (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
 
@@ -1056,19 +845,21 @@ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
 #endif  // CONFIG_COLLECT_RD_STATS >= 2
 #endif  // CONFIG_COLLECT_RD_STATS
 
-static AOM_INLINE void inverse_transform_block_facade(MACROBLOCKD *xd,
+static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x,
                                                       int plane, int block,
                                                       int blk_row, int blk_col,
                                                       int eob,
                                                       int reduced_tx_set) {
   if (!eob) return;
-
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  struct macroblock_plane *const p = &x->plane[plane];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
   const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
                                           tx_size, reduced_tx_set);
+
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dst_stride = pd->dst.stride;
   uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
@@ -1099,18 +890,18 @@ static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                           ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
                                                     : AV1_XFORM_QUANT_FP)
                           : AV1_XFORM_QUANT_FP,
-                      cpi->oxcf.quant_b_adapt, &quant_param_intra);
+                      cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
       av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
                         &quant_param_intra);
       av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
                       &txfm_param_intra, &quant_param_intra);
       if (quant_param_intra.use_optimize_b) {
         av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
-                       cpi->sf.rd_sf.trellis_eob_fast, rate_cost);
+                       rate_cost);
       }
     }
 
-    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+    inverse_transform_block_facade(x, plane, block, blk_row, blk_col,
                                    x->plane[plane].eobs[block],
                                    cm->features.reduced_tx_set_used);
 
@@ -1132,7 +923,7 @@ static unsigned pixel_dist_visible_only(
   unsigned sse;
 
   if (txb_rows == visible_rows && txb_cols == visible_cols) {
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
     return sse;
   }
 
@@ -1181,7 +972,6 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
                                            TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
   const uint16_t eob = p->eobs[block];
   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
   const int bsw = block_size_wide[tx_bsize];
@@ -1193,7 +983,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
   const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
   const uint8_t *src = &x->plane[plane].src.buf[src_idx];
   const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
-  const tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
 
   assert(cpi != NULL);
   assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
@@ -1204,18 +994,15 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     recon = CONVERT_TO_BYTEPTR(recon16);
-    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
-                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
-                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                             CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
   } else {
     recon = (uint8_t *)recon16;
-    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
-                            NULL, 0, 0, NULL);
+    aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
   }
 #else
   recon = (uint8_t *)recon16;
-  av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
-                          NULL, 0, 0, NULL);
+  aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
 #endif
 
   const PLANE_TYPE plane_type = get_plane_type(plane);
@@ -1229,62 +1016,10 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
 
-static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
-                                   int blk_col, BLOCK_SIZE plane_bsize,
-                                   TX_SIZE tx_size) {
-  int16_t tmp_data[64 * 64];
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
-  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  const int txb_w = tx_size_wide[tx_size];
-  const int txb_h = tx_size_high[tx_size];
-  uint8_t *hash_data = (uint8_t *)cur_diff_row;
-  if (txb_w != diff_stride) {
-    int16_t *cur_hash_row = tmp_data;
-    for (int i = 0; i < txb_h; i++) {
-      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
-      cur_hash_row += txb_w;
-      cur_diff_row += diff_stride;
-    }
-    hash_data = (uint8_t *)tmp_data;
-  }
-  CRC32C *crc = &x->mb_rd_record.crc_calculator;
-  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
-  return (hash << 5) + tx_size;
-}
-
 // pruning thresholds for prune_txk_type and prune_txk_type_separ
 static const int prune_factors[5] = { 200, 200, 120, 80, 40 };  // scale 1000
 static const int mul_factors[5] = { 80, 80, 70, 50, 30 };       // scale 100
 
-static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x,
-                                      int plane, int blk_row, int blk_col,
-                                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                      const TXB_CTX *const txb_ctx,
-                                      TXB_RD_INFO **intra_txb_rd_info,
-                                      const int tx_type_map_idx,
-                                      uint16_t *cur_joint_ctx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  assert(cpi->sf.tx_sf.use_intra_txb_hash &&
-         frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
-         plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
-  const uint32_t intra_hash =
-      get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-  const int intra_hash_idx =
-      find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
-  *intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
-  *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
-      x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
-    xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    return (ref_tx_type == (*intra_txb_rd_info)->tx_type);
-  }
-  return 0;
-}
-
 // R-D costs are sorted in ascending order.
 static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
   int i, j, k;
@@ -1311,12 +1046,40 @@ static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
   }
 }
 
+static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff,
+                                         const tran_low_t *dqcoeff,
+                                         intptr_t block_size,
+                                         const qm_val_t *qmatrix,
+                                         const int16_t *scan, int64_t *ssz) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int64_t weight = qmatrix[scan[i]];
+    int64_t dd = coeff[i] - dqcoeff[i];
+    dd *= weight;
+    int64_t cc = coeff[i];
+    cc *= weight;
+    // The ranges of coeff and dqcoeff are
+    //  bd8 : 18 bits (including sign)
+    //  bd10: 20 bits (including sign)
+    //  bd12: 22 bits (including sign)
+    // As AOM_QM_BITS is 5, the intermediate quantities in the calculation
+    // below should fit in 54 bits, thus no overflow should happen.
+    error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+    sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+  }
+
+  *ssz = sqcoeff;
+  return error;
+}
+
 static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
-                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        TX_SIZE tx_size,
+                                        const qm_val_t *qmatrix,
+                                        const int16_t *scan, int64_t *out_dist,
                                         int64_t *out_sse) {
-  MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
   // Transform domain distortion computation is more efficient as it does
   // not involve an inverse transform, but it is less accurate.
   const int buffer_length = av1_get_max_eob(tx_size);
@@ -1326,16 +1089,25 @@ static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
   int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
   const int block_offset = BLOCK_OFFSET(block);
   tran_low_t *const coeff = p->coeff + block_offset;
-  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+  tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd))
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if (is_cur_buf_hbd(xd)) {
+    // TODO(veluca): handle use_qm_dist_metric for HBD too.
     *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
                                        xd->bd);
-  else
-    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-#else
-  *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+  } else {
 #endif
+    if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) {
+      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+    } else {
+      *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix,
+                                     scan, &this_sse);
+    }
+#if CONFIG_AV1_HIGHBITDEPTH
+  }
+#endif
+
   *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
   *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
 }
@@ -1348,6 +1120,7 @@ uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                               int reduced_tx_set_used, int64_t ref_best_rd,
                               int num_sel) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
 
   int idx;
 
@@ -1374,7 +1147,7 @@ uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   QUANT_PARAM quant_param;
   TxfmParam txfm_param;
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
   int tx_type;
   // to ensure we can try ones even outside of ext_tx_set of current block
@@ -1389,10 +1162,16 @@ uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     tx_type = idx_map[idx];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
@@ -1419,10 +1198,16 @@ uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     tx_type = idx_map_v[idx_v[idx] * 4];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
@@ -1479,6 +1264,7 @@ uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                         uint16_t allowed_tx_mask, int prune_factor,
                         const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
   int tx_type;
 
   int64_t rds[TX_TYPES];
@@ -1489,7 +1275,7 @@ uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   TxfmParam txfm_param;
   QUANT_PARAM quant_param;
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
-  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt,
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
                   &quant_param);
 
   for (int idx = 0; idx < TX_TYPES; idx++) {
@@ -1503,6 +1289,9 @@ uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     }
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     // do txfm and quantization
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
@@ -1510,7 +1299,10 @@ uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
                                               txb_ctx, reduced_tx_set_used, 0);
     // tx domain dist
-    dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse);
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
 
     txk_map[num_cand] = tx_type;
     rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
@@ -1596,43 +1388,19 @@ static const float *prune_2D_adaptive_thresholds[] = {
   NULL,
 };
 
-// Probablities are sorted in descending order.
-static INLINE void sort_probability(float prob[], int txk[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; ++i) {
-    for (j = 0; j < i; ++j) {
-      if (prob[j] < prob[i]) {
-        float temp;
-        int tempi;
-
-        temp = prob[i];
-        tempi = txk[i];
-
-        for (k = i; k > j; k--) {
-          prob[k] = prob[k - 1];
-          txk[k] = txk[k - 1];
-        }
-
-        prob[j] = temp;
-        txk[j] = tempi;
-        break;
-      }
-    }
-  }
-}
-
-static INLINE float get_adaptive_thresholds(TX_SIZE tx_size,
-                                            TxSetType tx_set_type,
-                                            TX_TYPE_PRUNE_MODE prune_mode) {
-  const int prune_aggr_table[4][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 } };
+static INLINE float get_adaptive_thresholds(
+    TX_SIZE tx_size, TxSetType tx_set_type,
+    TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
+  const int prune_aggr_table[5][2] = {
+    { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 }
+  };
   int pruning_aggressiveness = 0;
   if (tx_set_type == EXT_TX_SET_ALL16)
     pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+        prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0];
   else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
     pruning_aggressiveness =
-        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+        prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1];
 
   return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness];
 }
@@ -1701,11 +1469,25 @@ static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff,
   for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
 }
 
+static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) {
+  return mask & (1 << val);
+}
+
+static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) {
+  *mask |= (1 << val);
+}
+
+static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) {
+  *mask &= ~(1 << val);
+}
+
 static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
                         int blk_row, int blk_col, TxSetType tx_set_type,
-                        TX_TYPE_PRUNE_MODE prune_mode, int *txk_map,
+                        TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
                         uint16_t *allowed_tx_mask) {
-  int tx_type_table_2D[16] = {
+  // This table is used because the search order is different from the enum
+  // order.
+  static const int tx_type_table_2D[16] = {
     DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
     ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
     FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
@@ -1723,11 +1505,9 @@ static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
 #endif
   if (!nn_config_hor || !nn_config_ver) return;  // Model not established yet.
 
-  aom_clear_system_state();
   float hfeatures[16], vfeatures[16];
   float hscores[4], vscores[4];
   float scores_2D_raw[16];
-  float scores_2D[16];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
   const int hfeatures_num = bw <= 8 ? bw : bw / 2;
@@ -1740,10 +1520,11 @@ static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
   const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
                                 vfeatures);
+
   av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
                                   &hfeatures[hfeatures_num - 1],
                                   &vfeatures[vfeatures_num - 1]);
-  aom_clear_system_state();
+
 #if CONFIG_NN_V2
   av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
   av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
@@ -1751,7 +1532,6 @@ static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
   av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
   av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
 #endif
-  aom_clear_system_state();
 
   for (int i = 0; i < 4; i++) {
     float *cur_scores_2D = scores_2D_raw + i * 4;
@@ -1761,10 +1541,14 @@ static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
     cur_scores_2D[3] = vscores[i] * hscores[3];
   }
 
-  av1_nn_softmax(scores_2D_raw, scores_2D, 16);
+  assert(TX_TYPES == 16);
+  // This version of the function only works when there are at most 16 classes.
+  // So we will need to change the optimization or use av1_nn_softmax instead if
+  // this ever gets changed.
+  av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
 
   const float score_thresh =
-      get_adaptive_thresholds(tx_size, tx_set_type, prune_mode);
+      get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
 
   // Always keep the TX type with the highest score, prune all others with
   // score below score_thresh.
@@ -1774,56 +1558,82 @@ static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
   float sum_score = 0.0;
   // Calculate sum of allowed tx type score and Populate allow bit mask based
   // on score_thresh and allowed_tx_mask
+  int allow_count = 0;
+  int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID };
+  float scores_2D[16] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  };
   for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
-    int allow_tx_type = *allowed_tx_mask & (1 << tx_type_table_2D[tx_idx]);
-    if (scores_2D[tx_idx] > max_score && allow_tx_type) {
-      max_score = scores_2D[tx_idx];
+    const int allow_tx_type =
+        check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]);
+    if (!allow_tx_type) {
+      continue;
+    }
+    if (scores_2D_raw[tx_idx] > max_score) {
+      max_score = scores_2D_raw[tx_idx];
       max_score_i = tx_idx;
     }
-    if (scores_2D[tx_idx] >= score_thresh && allow_tx_type) {
+    if (scores_2D_raw[tx_idx] >= score_thresh) {
       // Set allow mask based on score_thresh
-      allow_bitmask |= (1 << tx_type_table_2D[tx_idx]);
+      set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
 
       // Accumulate score of allowed tx type
-      sum_score += scores_2D[tx_idx];
+      sum_score += scores_2D_raw[tx_idx];
+
+      scores_2D[allow_count] = scores_2D_raw[tx_idx];
+      tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
+      allow_count += 1;
     }
   }
-  if (!((allow_bitmask >> max_score_i) & 0x01)) {
-    // Set allow mask based on tx type with max score
-    allow_bitmask |= (1 << tx_type_table_2D[max_score_i]);
-    sum_score += scores_2D[max_score_i];
+  if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) {
+    // If even the tx_type with max score is pruned, this means that no other
+    // tx_type is feasible. When this happens, we force enable max_score_i and
+    // end the search.
+    set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
+    memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+    *allowed_tx_mask = allow_bitmask;
+    return;
   }
+
   // Sort tx type probability of all types
-  sort_probability(scores_2D, tx_type_table_2D, TX_TYPES);
+  if (allow_count <= 8) {
+    av1_sort_fi32_8(scores_2D, tx_type_allowed);
+  } else {
+    av1_sort_fi32_16(scores_2D, tx_type_allowed);
+  }
 
   // Enable more pruning based on tx type probability and number of allowed tx
   // types
-  if (prune_mode == PRUNE_2D_AGGRESSIVE) {
+  if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) {
     float temp_score = 0.0;
     float score_ratio = 0.0;
     int tx_idx, tx_count = 0;
     const float inv_sum_score = 100 / sum_score;
     // Get allowed tx types based on sorted probability score and tx count
-    for (tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+    for (tx_idx = 0; tx_idx < allow_count; tx_idx++) {
       // Skip the tx type which has more than 30% of cumulative
       // probability and allowed tx type count is more than 2
       if (score_ratio > 30.0 && tx_count >= 2) break;
 
-      // Calculate cumulative probability of allowed tx types
-      if (allow_bitmask & (1 << tx_type_table_2D[tx_idx])) {
-        // Calculate cumulative probability
-        temp_score += scores_2D[tx_idx];
+      assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx]));
+      // Calculate cumulative probability
+      temp_score += scores_2D[tx_idx];
 
-        // Calculate percentage of cumulative probability of allowed tx type
-        score_ratio = temp_score * inv_sum_score;
-        tx_count++;
-      }
+      // Calculate percentage of cumulative probability of allowed tx type
+      score_ratio = temp_score * inv_sum_score;
+      tx_count++;
     }
     // Set remaining tx types as pruned
-    for (; tx_idx < TX_TYPES; tx_idx++)
-      allow_bitmask &= ~(1 << tx_type_table_2D[tx_idx]);
+    for (; tx_idx < allow_count; tx_idx++)
+      unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
   }
-  memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+
+  memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D));
   *allowed_tx_mask = allow_bitmask;
 }
 
@@ -1860,7 +1670,6 @@ static AOM_INLINE void get_mean_dev_features(const int16_t *data, int stride,
       total_x_sum += x_sum;
       total_x2_sum += x2_sum;
 
-      aom_clear_system_state();
       const float mean = (float)x_sum / sub_num;
       const float dev = get_dev(mean, (double)x2_sum, sub_num);
       feature[feature_idx++] = mean;
@@ -1893,14 +1702,12 @@ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
       x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
-  aom_clear_system_state();
 
   float features[64] = { 0.0f };
   get_mean_dev_features(diff, diff_stride, bw, bh, features);
 
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   int int_score = (int)(score * 10000);
   return clamp(int_score, -80000, 80000);
@@ -1914,16 +1721,56 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const int is_inter = is_inter_block(mbmi);
   const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
   // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed <
   // TX_TYPES, only that specific tx type is allowed.
   TX_TYPE txk_allowed = TX_TYPES;
 
-  if ((!is_inter && x->use_default_intra_tx_type) ||
-      (is_inter && x->use_default_inter_tx_type)) {
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  int use_actual_frame_probs = 1;
+  const int *tx_type_probs;
+#if CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST
+  use_actual_frame_probs =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+  if (!use_actual_frame_probs) {
+    tx_type_probs =
+        (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size];
+  }
+#endif
+  if (use_actual_frame_probs) {
+    tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size];
+  }
+
+  if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
+      (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) {
     txk_allowed =
-        get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
+        get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
+  } else if (is_inter &&
+             txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) {
+    if (tx_type_probs[DEFAULT_INTER_TX_TYPE] >
+        txfm_params->default_inter_tx_type_prob_thresh) {
+      txk_allowed = DEFAULT_INTER_TX_TYPE;
+    } else {
+      int force_tx_type = 0;
+      int max_prob = 0;
+      const int tx_type_prob_threshold =
+          txfm_params->default_inter_tx_type_prob_thresh +
+          PROB_THRESH_OFFSET_TX_TYPE;
+      for (int i = 1; i < TX_TYPES; i++) {  // find maximum probability.
+        if (tx_type_probs[i] > max_prob) {
+          max_prob = tx_type_probs[i];
+          force_tx_type = i;
+        }
+      }
+      if (max_prob > tx_type_prob_threshold)  // force tx type with max prob.
+        txk_allowed = force_tx_type;
+      else if (x->rd_model == LOW_TXFM_RD) {
+        if (plane == 0) txk_allowed = DCT_DCT;
+      }
+    }
   } else if (x->rd_model == LOW_TXFM_RD) {
     if (plane == 0) txk_allowed = DCT_DCT;
   }
@@ -1943,18 +1790,23 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
           ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
           : mbmi->mode;
   uint16_t ext_tx_used_flag =
-      cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset &&
+      cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 &&
               tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
           ? av1_reduced_intra_tx_used_flag[intra_dir]
           : av1_ext_tx_used_flag[tx_set_type];
+
+  if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2)
+    ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir];
+
   if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
       ext_tx_used_flag == 0x0001 ||
-      (is_inter && cpi->oxcf.use_inter_dct_only) ||
-      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
+      (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||
+      (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) {
     txk_allowed = DCT_DCT;
   }
 
-  if (cpi->oxcf.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK;
+  if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
+    ext_tx_used_flag &= DCT_ADST_TX_MASK;
 
   uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
   if (txk_allowed < TX_TYPES) {
@@ -1967,9 +1819,6 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
     assert(plane == 0);
     allowed_tx_mask = ext_tx_used_flag;
     int num_allowed = 0;
-    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
-    const int *tx_type_probs =
-        cpi->frame_probs.tx_type_probs[update_type][tx_size];
     int i;
 
     if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
@@ -1997,8 +1846,8 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
     assert(num_allowed > 0);
 
     if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
-      int pf = prune_factors[x->prune_mode];
-      int mf = mul_factors[x->prune_mode];
+      int pf = prune_factors[txfm_params->prune_2d_txfm_mode];
+      int mf = mul_factors[txfm_params->prune_2d_txfm_mode];
       if (num_allowed <= 7) {
         const uint16_t prune =
             prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
@@ -2016,12 +1865,13 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
       }
     } else {
       assert(num_allowed > 0);
-      int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
+      int allowed_tx_count =
+          (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5;
       // !fast_tx_search && txk_end != txk_start && plane == 0
-      if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
+      if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
           num_allowed > allowed_tx_count) {
         prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
-                    x->prune_mode, txk_map, &allowed_tx_mask);
+                    txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
       }
     }
   }
@@ -2039,38 +1889,19 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
 
 #if CONFIG_RD_DEBUG
 static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
-                                         TX_SIZE tx_size, int blk_row,
-                                         int blk_col, int txb_coeff_cost) {
-  (void)blk_row;
-  (void)blk_col;
-  (void)tx_size;
+                                         int txb_coeff_cost) {
   rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
-
-  {
-    const int txb_h = tx_size_high_unit[tx_size];
-    const int txb_w = tx_size_wide_unit[tx_size];
-    int idx, idy;
-    for (idy = 0; idy < txb_h; ++idy)
-      for (idx = 0; idx < txb_w; ++idx)
-        rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
-
-    rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
-  }
-  assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
-  assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
 }
 #endif
 
 static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
                               TX_SIZE tx_size, const TX_TYPE tx_type,
                               const TXB_CTX *const txb_ctx,
-                              int use_fast_coef_costing,
                               int reduced_tx_set_used) {
 #if TXCOEFF_COST_TIMER
   struct aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 #endif
-  (void)use_fast_coef_costing;
   const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
                                        txb_ctx, reduced_tx_set_used);
 #if TXCOEFF_COST_TIMER
@@ -2083,19 +1914,111 @@ static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
   return cost;
 }
 
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+                                          QUANT_PARAM *quant_param, int plane,
+                                          int block, TX_SIZE tx_size,
+                                          int quant_b_adapt, int qstep,
+                                          unsigned int coeff_opt_satd_threshold,
+                                          int skip_trellis, int dc_only_blk) {
+  if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+    return skip_trellis;
+
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff_ptr = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(tx_size);
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+  int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
+  satd = RIGHT_SIGNED_SHIFT(satd, shift);
+  satd >>= (x->e_mbd.bd - 8);
+
+  const int skip_block_trellis =
+      ((uint64_t)satd >
+       (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+  av1_setup_quant(
+      tx_size, !skip_block_trellis,
+      skip_block_trellis
+          ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+          : AV1_XFORM_QUANT_FP,
+      quant_b_adapt, quant_param);
+
+  return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold.For such blocks, transform type search is bypassed.
+static INLINE void predict_dc_only_block(
+    MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+    int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+    int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+    int *dc_only_blk) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+  uint64_t block_var = UINT64_MAX;
+  const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+  *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+                                txsize_to_bsize[tx_size], block_mse_q8,
+                                per_px_mean, &block_var);
+  assert((*block_mse_q8) != UINT_MAX);
+  uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+  if (is_cur_buf_hbd(xd))
+    block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+  // Early prediction of skip block if residual mean and variance are less
+  // than qstep based threshold
+  if (((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) &&
+      (block_var < var_threshold)) {
+    // If the normalized mean of residual block is less than the dc qstep and
+    // the  normalized block variance is less than ac qstep, then the block is
+    // assumed to be a skip block and its rdcost is updated accordingly.
+    best_rd_stats->skip_txfm = 1;
+
+    x->plane[plane].eobs[block] = 0;
+
+    if (is_cur_buf_hbd(xd))
+      *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+
+    best_rd_stats->dist = (*block_sse) << 4;
+    best_rd_stats->sse = best_rd_stats->dist;
+
+    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+    av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+    ENTROPY_CONTEXT *ta = ctxa;
+    ENTROPY_CONTEXT *tl = ctxl;
+    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+    TXB_CTX txb_ctx_tmp;
+    const PLANE_TYPE plane_type = get_plane_type(plane);
+    get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+    const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+                                  .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+    best_rd_stats->rate = zero_blk_rate;
+
+    best_rd_stats->rdcost =
+        RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+    x->plane[plane].txb_entropy_ctx[block] = 0;
+  } else if (block_var < var_threshold) {
+    // Predict DC only blocks based on residual variance.
+    // For chroma plane, this early prediction is disabled for intra blocks.
+    if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
+  }
+}
+
 // Search for the best transform type for a given transform block.
 // This function can be used for both inter and intra, both luma and chroma.
 static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                            int block, int blk_row, int blk_col,
                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                            const TXB_CTX *const txb_ctx,
-                           FAST_TX_SEARCH_MODE ftxs_mode,
-                           int use_fast_coef_costing, int skip_trellis,
+                           FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
                            int64_t ref_best_rd, RD_STATS *best_rd_stats) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   int64_t best_rd = INT64_MAX;
   uint16_t best_eob = 0;
   TX_TYPE best_tx_type = DCT_DCT;
@@ -2103,7 +2026,8 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
   // of the best tx_type
   DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
-  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  struct macroblock_plane *const p = &x->plane[plane];
+  tran_low_t *orig_dqcoeff = p->dqcoeff;
   tran_low_t *best_dqcoeff = this_dqcoeff;
   const int tx_type_map_idx =
       plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
@@ -2112,45 +2036,6 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
                                    DRY_RUN_NORMAL);
 
-  // Hashing based speed feature for intra block. If the hash of the residue
-  // is found in the hash table, use the previous RD search results stored in
-  // the table and terminate early.
-  TXB_RD_INFO *intra_txb_rd_info = NULL;
-  uint16_t cur_joint_ctx = 0;
-  const int is_inter = is_inter_block(mbmi);
-  const int use_intra_txb_hash =
-      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
-      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
-  if (use_intra_txb_hash) {
-    const int mi_row = xd->mi_row;
-    const int mi_col = xd->mi_col;
-    const int within_border =
-        mi_row >= xd->tile.mi_row_start &&
-        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-        mi_col >= xd->tile.mi_col_start &&
-        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-    if (within_border &&
-        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
-                            tx_size, txb_ctx, &intra_txb_rd_info,
-                            tx_type_map_idx, &cur_joint_ctx)) {
-      best_rd_stats->rate = intra_txb_rd_info->rate;
-      best_rd_stats->dist = intra_txb_rd_info->dist;
-      best_rd_stats->sse = intra_txb_rd_info->sse;
-      best_rd_stats->skip = intra_txb_rd_info->eob == 0;
-      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
-      x->plane[plane].txb_entropy_ctx[block] =
-          intra_txb_rd_info->txb_entropy_ctx;
-      best_eob = intra_txb_rd_info->eob;
-      best_tx_type = intra_txb_rd_info->tx_type;
-      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
-      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
-      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
-      pd->dqcoeff = orig_dqcoeff;
-      return;
-    }
-  }
-
   uint8_t best_txb_ctx = 0;
   // txk_allowed = TX_TYPES: >1 tx types are allowed
   // txk_allowed < TX_TYPES: only that specific tx type is allowed.
@@ -2158,22 +2043,49 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   int txk_map[TX_TYPES] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   };
-  // Bit mask to indicate which transform types are allowed in the RD search.
-  const uint16_t allowed_tx_mask =
-      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
 
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  int64_t block_sse;
   unsigned int block_mse_q8;
-  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
-                                      txsize_to_bsize[tx_size], &block_mse_q8);
-  assert(block_mse_q8 != UINT_MAX);
+  int dc_only_blk = 0;
+  const bool predict_dc_block =
+      txfm_params->predict_dc_level && txw != 64 && txh != 64;
+  int64_t per_px_mean = INT64_MAX;
+  if (predict_dc_block) {
+    predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+                          blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+                          &per_px_mean, &dc_only_blk);
+    if (best_rd_stats->skip_txfm == 1) {
+      const TX_TYPE tx_type = DCT_DCT;
+      if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+      return;
+    }
+  } else {
+    block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                txsize_to_bsize[tx_size], &block_mse_q8);
+    assert(block_mse_q8 != UINT_MAX);
+  }
+
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  uint16_t tx_mask;
+
+  // Use DCT_DCT transform for DC only block.
+  if (dc_only_blk)
+    tx_mask = 1 << DCT_DCT;
+  else
+    tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+                          tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+                          &txk_allowed, txk_map);
+  const uint16_t allowed_tx_mask = tx_mask;
+
   if (is_cur_buf_hbd(xd)) {
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
     block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
   }
   block_sse *= 16;
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
   // Use mse / qstep^2 based threshold logic to take decision of R-D
   // optimization of coeffs. For smaller residuals, coeff optimization
   // would be helpful. For larger residuals, R-D optimization may not be
@@ -2181,7 +2093,7 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   // TODO(any): Experiment with variance and mean based thresholds
   const int perform_block_coeff_opt =
       ((uint64_t)block_mse_q8 <=
-       (uint64_t)x->coeff_opt_dist_threshold * qstep * qstep);
+       (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
   skip_trellis |= !perform_block_coeff_opt;
 
   // Flag to indicate if distortion should be calculated in transform domain or
@@ -2189,17 +2101,19 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   // Transform domain distortion is accurate for higher residuals.
   // TODO(any): Experiment with variance and mean based thresholds
   int use_transform_domain_distortion =
-      (x->use_transform_domain_distortion > 0) &&
-      (block_mse_q8 >= x->tx_domain_dist_threshold) &&
+      (txfm_params->use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
       // Any 64-pt transforms only preserves half the coefficients.
       // Therefore transform domain distortion is not valid for these
       // transform sizes.
-      txsize_sqr_up_map[tx_size] != TX_64X64;
+      (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+      // Use pixel domain distortion for DC only blocks
+      !dc_only_blk;
   // Flag to indicate if an extra calculation of distortion in the pixel domain
   // should be performed at the end, after the best transform type has been
   // decided.
   int calc_pixel_domain_distortion_final =
-      x->use_transform_domain_distortion == 1 &&
+      txfm_params->use_transform_domain_distortion == 1 &&
       use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
   if (calc_pixel_domain_distortion_final &&
       (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
@@ -2209,17 +2123,19 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   TxfmParam txfm_param;
   QUANT_PARAM quant_param;
+  int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
   av1_setup_quant(tx_size, !skip_trellis,
                   skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
                                                          : AV1_XFORM_QUANT_FP)
                                : AV1_XFORM_QUANT_FP,
-                  cpi->oxcf.quant_b_adapt, &quant_param);
+                  cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
 
   // Iterate through all transform type candidates.
   for (int idx = 0; idx < TX_TYPES; ++idx) {
     const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
-    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type))
+      continue;
     txfm_param.tx_type = tx_type;
     if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
       av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
@@ -2229,28 +2145,26 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
 
-    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
-                    &quant_param);
+    if (!dc_only_blk)
+      av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+    else
+      av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
+
+    skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+        x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
+        qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk);
+
+    av1_quant(x, plane, block, &txfm_param, &quant_param);
 
     // Calculate rate cost of quantized coefficients.
     if (quant_param.use_optimize_b) {
-      if (cpi->sf.rd_sf.optimize_b_precheck && best_rd < INT64_MAX &&
-          eobs_ptr[block] >= 4) {
-        // Calculate distortion quickly in transform domain.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
-                             &this_rd_stats.sse);
-
-        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
-        const int64_t dist_cost_estimate =
-            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
-        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
-      }
+      // TODO(aomedia:3209): update Trellis quantization to take into account
+      // quantization matrices.
       av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
-                     cpi->sf.rd_sf.trellis_eob_fast, &rate_cost);
+                     &rate_cost);
     } else {
-      rate_cost =
-          cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
-                      use_fast_coef_costing, cm->features.reduced_tx_set_used);
+      rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+                              cm->features.reduced_tx_set_used);
     }
 
     // If rd cost based on coeff rate alone is already more than best_rd,
@@ -2261,16 +2175,23 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     if (eobs_ptr[block] == 0) {
       // When eob is 0, pixel domain distortion is more efficient and accurate.
       this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (dc_only_blk) {
+      this_rd_stats.sse = block_sse;
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
     } else if (use_transform_domain_distortion) {
-      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+      const SCAN_ORDER *const scan_order =
+          get_scan(txfm_param.tx_size, txfm_param.tx_type);
+      dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                           scan_order->scan, &this_rd_stats.dist,
                            &this_rd_stats.sse);
     } else {
       int64_t sse_diff = INT64_MAX;
       // high_energy threshold assumes that every pixel within a txfm block
       // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
-      // for 8 bit, then the threshold is scaled based on input bit depth.
+      // for 8 bit.
       const int64_t high_energy_thresh =
-          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]);
       const int is_high_energy = (block_sse >= high_energy_thresh);
       if (tx_size == TX_64X64 || is_high_energy) {
         // Because 3 out 4 quadrants of transform coefficients are forced to
@@ -2279,7 +2200,10 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
         // to decide if we should do pixel domain distortion. If the energy
         // is mostly in first quadrant, then it is unlikely that we have
         // overflow issue in inverse transform.
-        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+        const SCAN_ORDER *const scan_order =
+            get_scan(txfm_param.tx_size, txfm_param.tx_type);
+        dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                             scan_order->scan, &this_rd_stats.dist,
                              &this_rd_stats.sse);
         sse_diff = block_sse - this_rd_stats.sse;
       }
@@ -2314,8 +2238,8 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
       best_eob = x->plane[plane].eobs[block];
       // Swap dqcoeff buffers
       tran_low_t *const tmp_dqcoeff = best_dqcoeff;
-      best_dqcoeff = pd->dqcoeff;
-      pd->dqcoeff = tmp_dqcoeff;
+      best_dqcoeff = p->dqcoeff;
+      p->dqcoeff = tmp_dqcoeff;
     }
 
 #if CONFIG_COLLECT_RD_STATS == 1
@@ -2376,15 +2300,16 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   assert(best_rd != INT64_MAX);
 
-  best_rd_stats->skip = best_eob == 0;
+  best_rd_stats->skip_txfm = best_eob == 0;
   if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
   x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
   x->plane[plane].eobs[block] = best_eob;
+  skip_trellis = skip_trellis_based_on_satd[best_tx_type];
 
   // Point dqcoeff to the quantized coefficients corresponding to the best
   // transform type, then we can skip transform and quantization, e.g. in the
   // final pixel domain distortion calculation and recon_intra().
-  pd->dqcoeff = best_dqcoeff;
+  p->dqcoeff = best_dqcoeff;
 
   if (calc_pixel_domain_distortion_final && best_eob) {
     best_rd_stats->dist = dist_block_px_domain(
@@ -2392,23 +2317,11 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     best_rd_stats->sse = block_sse;
   }
 
-  if (intra_txb_rd_info != NULL) {
-    intra_txb_rd_info->valid = 1;
-    intra_txb_rd_info->entropy_context = cur_joint_ctx;
-    intra_txb_rd_info->rate = best_rd_stats->rate;
-    intra_txb_rd_info->dist = best_rd_stats->dist;
-    intra_txb_rd_info->sse = best_rd_stats->sse;
-    intra_txb_rd_info->eob = best_eob;
-    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
-    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
-    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
-  }
-
   // Intra mode needs decoded pixels such that the next transform block
   // can use them for prediction.
   recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
               txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
-  pd->dqcoeff = orig_dqcoeff;
+  p->dqcoeff = orig_dqcoeff;
 }
 
 // Pick transform type for a luma transform block of tx_size. Note this function
@@ -2418,52 +2331,14 @@ static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
                                   int block, int plane_bsize, TXB_CTX *txb_ctx,
                                   RD_STATS *rd_stats,
                                   FAST_TX_SEARCH_MODE ftxs_mode,
-                                  int64_t ref_rdcost,
-                                  TXB_RD_INFO *rd_info_array) {
-  const struct macroblock_plane *const p = &x->plane[0];
-  const uint16_t cur_joint_ctx =
-      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  MACROBLOCKD *xd = &x->e_mbd;
-  assert(is_inter_block(xd->mi[0]));
-  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
-  // Look up RD and terminate early in case when we've already processed exactly
-  // the same residue with exactly the same entropy context.
-  if (rd_info_array != NULL && rd_info_array->valid &&
-      rd_info_array->entropy_context == cur_joint_ctx) {
-    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    if (ref_tx_type == rd_info_array->tx_type) {
-      rd_stats->rate += rd_info_array->rate;
-      rd_stats->dist += rd_info_array->dist;
-      rd_stats->sse += rd_info_array->sse;
-      rd_stats->skip &= rd_info_array->eob == 0;
-      p->eobs[block] = rd_info_array->eob;
-      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
-      return;
-    }
-  }
-
+                                  int64_t ref_rdcost) {
+  assert(is_inter_block(x->e_mbd.mi[0]));
   RD_STATS this_rd_stats;
   const int skip_trellis = 0;
   search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
-                 txb_ctx, ftxs_mode, 0, skip_trellis, ref_rdcost,
-                 &this_rd_stats);
+                 txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
-
-  // Save RD results for possible reuse in future.
-  if (rd_info_array != NULL) {
-    rd_info_array->valid = 1;
-    rd_info_array->entropy_context = cur_joint_ctx;
-    rd_info_array->rate = this_rd_stats.rate;
-    rd_info_array->dist = this_rd_stats.dist;
-    rd_info_array->sse = this_rd_stats.sse;
-    rd_info_array->eob = p->eobs[block];
-    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
-    rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx];
-  }
 }
 
 static AOM_INLINE void try_tx_block_no_split(
@@ -2471,8 +2346,7 @@ static AOM_INLINE void try_tx_block_no_split(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
     const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
     int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    TxCandidateInfo *no_split) {
+    FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -2482,35 +2356,35 @@ static AOM_INLINE void try_tx_block_no_split(
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
-  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+  const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   rd_stats->zero_rate = zero_blk_rate;
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
   mbmi->inter_tx_size[index] = tx_size;
   tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-             rd_stats, ftxs_mode, ref_best_rd,
-             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+             rd_stats, ftxs_mode, ref_best_rd);
   assert(rd_stats->rate < INT_MAX);
 
-  const int pick_skip = !xd->lossless[mbmi->segment_id] &&
-                        (rd_stats->skip == 1 ||
-                         RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
-                             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
-  if (pick_skip) {
+  const int pick_skip_txfm =
+      !xd->lossless[mbmi->segment_id] &&
+      (rd_stats->skip_txfm == 1 ||
+       RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+           RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
+  if (pick_skip_txfm) {
 #if CONFIG_RD_DEBUG
-    update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
-                          zero_blk_rate - rd_stats->rate);
+    update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate);
 #endif  // CONFIG_RD_DEBUG
     rd_stats->rate = zero_blk_rate;
     rd_stats->dist = rd_stats->sse;
     p->eobs[block] = 0;
     update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
   }
-  rd_stats->skip = pick_skip;
-  set_blk_skip(x, 0, blk_row * bw + blk_col, pick_skip);
+  rd_stats->skip_txfm = pick_skip_txfm;
+  set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+               pick_skip_txfm);
 
   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-    rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+    rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0];
 
   no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
   no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
@@ -2523,8 +2397,7 @@ static AOM_INLINE void try_tx_block_split(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    RD_STATS *split_rd_stats) {
+    FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
   assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
@@ -2539,22 +2412,23 @@ static AOM_INLINE void try_tx_block_split(
   const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width);
   assert(nblks > 0);
   av1_init_rd_stats(split_rd_stats);
-  split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+  split_rd_stats->rate =
+      x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1];
 
   for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) {
+    const int offsetr = blk_row + r;
+    if (offsetr >= max_blocks_high) break;
     for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) {
       assert(blk_idx < 4);
-      const int offsetr = blk_row + r;
       const int offsetc = blk_col + c;
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+      if (offsetc >= max_blocks_wide) continue;
 
       RD_STATS this_rd_stats;
       int this_cost_valid = 1;
-      select_tx_block(
-          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
-          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
-          ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode,
-          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+      select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
+                      plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
+                      no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
+                      &this_cost_valid, ftxs_mode);
       if (!this_cost_valid) {
         split_rd_stats->rdcost = INT64_MAX;
         return;
@@ -2571,6 +2445,96 @@ static AOM_INLINE void try_tx_block_split(
   }
 }
 
+static float get_var(float mean, double x2_sum, int num) {
+  const float e_x2 = (float)(x2_sum / num);
+  const float diff = e_x2 - mean * mean;
+  return diff;
+}
+
+static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw,
+                                       int bh, float *dev_of_mean,
+                                       float *var_of_vars) {
+  const int16_t *const data_ptr = &data[0];
+  const int subh = (bh >= bw) ? (bh >> 1) : bh;
+  const int subw = (bw >= bh) ? (bw >> 1) : bw;
+  const int num = bw * bh;
+  const int sub_num = subw * subh;
+  int total_x_sum = 0;
+  int64_t total_x2_sum = 0;
+  int blk_idx = 0;
+  float var_sum = 0.0f;
+  float mean_sum = 0.0f;
+  double var2_sum = 0.0f;
+  double mean2_sum = 0.0f;
+
+  for (int row = 0; row < bh; row += subh) {
+    for (int col = 0; col < bw; col += subw) {
+      int x_sum;
+      int64_t x2_sum;
+      aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+                          &x_sum, &x2_sum);
+      total_x_sum += x_sum;
+      total_x2_sum += x2_sum;
+
+      const float mean = (float)x_sum / sub_num;
+      const float var = get_var(mean, (double)x2_sum, sub_num);
+      mean_sum += mean;
+      mean2_sum += (double)(mean * mean);
+      var_sum += var;
+      var2_sum += var * var;
+      blk_idx++;
+    }
+  }
+
+  const float lvl0_mean = (float)total_x_sum / num;
+  const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+  mean_sum += lvl0_mean;
+  mean2_sum += (double)(lvl0_mean * lvl0_mean);
+  var_sum += block_var;
+  var2_sum += block_var * block_var;
+  const float av_mean = mean_sum / 5;
+
+  if (blk_idx > 1) {
+    // Deviation of means.
+    *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1));
+    // Variance of variances.
+    const float mean_var = var_sum / (blk_idx + 1);
+    *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1));
+  }
+}
+
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    int blk_row, int blk_col, TX_SIZE tx_size,
+                                    int *try_no_split, int *try_split,
+                                    int pruning_level) {
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff =
+      x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  float dev_of_means = 0.0f;
+  float var_of_vars = 0.0f;
+
+  // This function calculates the deviation of means, and the variance of pixel
+  // variances of the block as well as it's sub-blocks.
+  get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars);
+  const int dc_q = x->plane[0].dequant_QTX[0] >> 3;
+  const int ac_q = x->plane[0].dequant_QTX[1] >> 3;
+  const int no_split_thresh_scales[4] = { 0, 24, 8, 8 };
+  const int no_split_thresh_scale = no_split_thresh_scales[pruning_level];
+  const int split_thresh_scales[4] = { 0, 24, 10, 8 };
+  const int split_thresh_scale = split_thresh_scales[pruning_level];
+
+  if ((dev_of_means <= dc_q) &&
+      (split_thresh_scale * var_of_vars <= ac_q * ac_q)) {
+    *try_split = 0;
+  }
+  if ((dev_of_means > no_split_thresh_scale * dc_q) &&
+      (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) {
+    *try_no_split = 0;
+  }
+}
+
 // Search for the best transform partition(recursive)/type for a given
 // inter-predicted luma block. The obtained transform selection will be saved
 // in xd->mi[0], the corresponding RD stats will be saved in rd_stats.
@@ -2579,8 +2543,7 @@ static AOM_INLINE void select_tx_block(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node) {
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
   assert(tx_size < TX_SIZES_ALL);
   av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) {
@@ -2593,19 +2556,33 @@ static AOM_INLINE void select_tx_block(
          blk_col < max_block_wide(xd, plane_bsize, 0));
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
   struct macroblock_plane *const p = &x->plane[0];
 
-  const int try_no_split =
-      cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
+  int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 ||
+                      txsize_sqr_up_map[tx_size] != TX_64X64) &&
+                     (cpi->oxcf.txfm_cfg.enable_rect_tx ||
+                      tx_size_wide[tx_size] == tx_size_high[tx_size]);
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
   TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
 
+  // Prune tx_split and no-split based on sub-block properties.
+  if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 &&
+      cpi->sf.tx_sf.prune_tx_size_level > 0) {
+    prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
+                            &try_no_split, &try_split,
+                            cpi->sf.tx_sf.prune_tx_size_level);
+  }
+
+  if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) {
+    if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0;
+  }
+
   // Try using current block as a single transform block without split.
   if (try_no_split) {
     try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                           plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
-                          ftxs_mode, rd_info_node, &no_split);
+                          ftxs_mode, &no_split);
 
     // Speed features for early termination.
     const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
@@ -2641,7 +2618,7 @@ static AOM_INLINE void select_tx_block(
     try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                        plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
                        AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
-                       rd_info_node, &split_rd_stats);
+                       &split_rd_stats);
   }
 
   if (no_split.rd < split_rd_stats.rdcost) {
@@ -2661,7 +2638,8 @@ static AOM_INLINE void select_tx_block(
     mbmi->tx_size = tx_size;
     update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
     const int bw = mi_size_wide[plane_bsize];
-    set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
+    set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+                 rd_stats->skip_txfm);
   } else {
     *rd_stats = split_rd_stats;
     if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
@@ -2674,10 +2652,11 @@ static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
                                               BLOCK_SIZE bs) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  mbmi->tx_size = tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
 
   // If tx64 is not enabled, we need to go down to the next available size
-  if (!cpi->oxcf.enable_tx64) {
+  if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) {
     static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
       TX_4X4,    // 4x4 transform
       TX_8X8,    // 8x8 transform
@@ -2699,22 +2678,69 @@ static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
       TX_16X32,  // 16x64 transform
       TX_32X16,  // 64x16 transform
     };
-
     mbmi->tx_size = tx_size_max_32[mbmi->tx_size];
+  } else if (cpi->oxcf.txfm_cfg.enable_tx64 &&
+             !cpi->oxcf.txfm_cfg.enable_rect_tx) {
+    static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = {
+      TX_4X4,    // 4x4 transform
+      TX_8X8,    // 8x8 transform
+      TX_16X16,  // 16x16 transform
+      TX_32X32,  // 32x32 transform
+      TX_64X64,  // 64x64 transform
+      TX_4X4,    // 4x8 transform
+      TX_4X4,    // 8x4 transform
+      TX_8X8,    // 8x16 transform
+      TX_8X8,    // 16x8 transform
+      TX_16X16,  // 16x32 transform
+      TX_16X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X4,    // 4x16 transform
+      TX_4X4,    // 16x4 transform
+      TX_8X8,    // 8x32 transform
+      TX_8X8,    // 32x8 transform
+      TX_16X16,  // 16x64 transform
+      TX_16X16,  // 64x16 transform
+    };
+    mbmi->tx_size = tx_size_max_square[mbmi->tx_size];
+  } else if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+             !cpi->oxcf.txfm_cfg.enable_rect_tx) {
+    static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = {
+      TX_4X4,    // 4x4 transform
+      TX_8X8,    // 8x8 transform
+      TX_16X16,  // 16x16 transform
+      TX_32X32,  // 32x32 transform
+      TX_32X32,  // 64x64 transform
+      TX_4X4,    // 4x8 transform
+      TX_4X4,    // 8x4 transform
+      TX_8X8,    // 8x16 transform
+      TX_8X8,    // 16x8 transform
+      TX_16X16,  // 16x32 transform
+      TX_16X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X4,    // 4x16 transform
+      TX_4X4,    // 16x4 transform
+      TX_8X8,    // 8x32 transform
+      TX_8X8,    // 32x8 transform
+      TX_16X16,  // 16x64 transform
+      TX_16X16,  // 64x16 transform
+    };
+
+    mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size];
   }
 
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
   // Skip RDcost is used only for Inter blocks
-  const int64_t skip_rd =
-      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
-  const int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_rate, 0);
+  const int64_t skip_txfm_rd =
+      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+  const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0);
   const int skip_trellis = 0;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
-                       AOMMIN(no_skip_rd, skip_rd), AOM_PLANE_Y, bs,
-                       mbmi->tx_size, cpi->sf.rd_sf.use_fast_coef_costing,
-                       FTXS_NONE, skip_trellis);
+                       AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       mbmi->tx_size, FTXS_NONE, skip_trellis);
 }
 
 static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
@@ -2729,8 +2755,7 @@ static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
   // TODO(any) : Pass this_rd based on skip/non-skip cost
   const int skip_trellis = 0;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
-                       cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
-                       skip_trellis);
+                       FTXS_NONE, skip_trellis);
 }
 
 // Search for the best uniform transform size and type for current coding block.
@@ -2743,8 +2768,9 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
 
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
-  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT;
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
   int start_tx;
   // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls
   // how many times of splitting is allowed during the RD search.
@@ -2754,10 +2780,14 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
     start_tx = max_rect_tx_size;
     init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
                                        is_inter_block(mbmi), &cpi->sf,
-                                       x->tx_size_search_method);
+                                       txfm_params->tx_size_search_method);
+    if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+        txsize_sqr_up_map[start_tx] == TX_64X64) {
+      start_tx = sub_tx_size_map[start_tx];
+    }
   } else {
     const TX_SIZE chosen_tx_size =
-        tx_size_from_tx_mode(bs, x->tx_mode_search_type);
+        tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
     start_tx = chosen_tx_size;
     init_depth = MAX_TX_DEPTH;
   }
@@ -2770,9 +2800,13 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
   const int num_blks = bsize_to_num_blk(bs);
   x->rd_model = FULL_TXFM_RD;
   int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
        depth++, tx_size = sub_tx_size_map[tx_size]) {
-    if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+    if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+         txsize_sqr_up_map[tx_size] == TX_64X64) ||
+        (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+         tx_size_wide[tx_size] != tx_size_high[tx_size])) {
       continue;
     }
 
@@ -2780,7 +2814,7 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
     rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs,
                                      tx_size, FTXS_NONE, skip_trellis);
     if (rd[depth] < best_rd) {
-      av1_copy_array(best_blk_skip, x->blk_skip, num_blks);
+      av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
       av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
       best_tx_size = tx_size;
       best_rd = rd[depth];
@@ -2798,7 +2832,7 @@ static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
   if (rd_stats->rate != INT_MAX) {
     mbmi->tx_size = best_tx_size;
     av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
-    av1_copy_array(x->blk_skip, best_blk_skip, num_blks);
+    av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
   }
 }
 
@@ -2831,9 +2865,8 @@ static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                 &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
-                 args->skip_trellis, args->best_rd - args->current_rd,
-                 &this_rd_stats);
+                 &txb_ctx, args->ftxs_mode, args->skip_trellis,
+                 args->best_rd - args->current_rd, &this_rd_stats);
 
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
     assert(!is_inter || plane_bsize < BLOCK_8X8);
@@ -2841,29 +2874,31 @@ static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
   }
 
 #if CONFIG_RD_DEBUG
-  update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
-                        this_rd_stats.rate);
+  update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate);
 #endif  // CONFIG_RD_DEBUG
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
   const int blk_idx =
       blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col;
+
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   if (plane == 0)
-    set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+    set_blk_skip(txfm_info->blk_skip, plane, blk_idx,
+                 x->plane[plane].eobs[block] == 0);
   else
-    set_blk_skip(x, plane, blk_idx, 0);
+    set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0);
 
   int64_t rd;
   if (is_inter) {
-    const int64_t no_skip_rd =
+    const int64_t no_skip_txfm_rd =
         RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-    const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
-    rd = AOMMIN(no_skip_rd, skip_rd);
-    this_rd_stats.skip &= !x->plane[plane].eobs[block];
+    const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+    rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd);
+    this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block];
   } else {
-    // Signal non-skip for Intra blocks
+    // Signal non-skip_txfm for Intra blocks
     rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-    this_rd_stats.skip = 0;
+    this_rd_stats.skip_txfm = 0;
   }
 
   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
@@ -2872,8 +2907,130 @@ static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
   if (args->current_rd > args->best_rd) args->exit_early = 1;
 }
 
-// Search for the best transform type and return the transform coefficients RD
-// cost of current luma coding block with the given uniform transform size.
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              RD_STATS *rd_stats, int64_t ref_best_rd,
+                              BLOCK_SIZE bs, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int is_inter = is_inter_block(mbmi);
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+                        block_signals_txsize(mbmi->bsize);
+  int tx_size_rate = 0;
+  if (tx_select) {
+    const int ctx = txfm_partition_context(
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+    tx_size_rate = mode_costs->txfm_partition_cost[ctx][0];
+  }
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0);
+  const int64_t no_this_rd =
+      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+  mbmi->tx_size = tx_size;
+
+  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+  const uint8_t txh_unit = tx_size_high_unit[tx_size];
+  const int step = txw_unit * txh_unit;
+  const int max_blocks_wide = max_block_wide(xd, bs, 0);
+  const int max_blocks_high = max_block_high(xd, bs, 0);
+
+  struct rdcost_block_args args;
+  av1_zero(args);
+  args.x = x;
+  args.cpi = cpi;
+  args.best_rd = ref_best_rd;
+  args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd);
+  av1_init_rd_stats(&args.rd_stats);
+  av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left);
+  int i = 0;
+  for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit;
+       blk_row += txh_unit) {
+    for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+      RD_STATS this_rd_stats;
+      av1_init_rd_stats(&this_rd_stats);
+
+      if (args.exit_early) {
+        args.incomplete_exit = 1;
+        break;
+      }
+
+      ENTROPY_CONTEXT *a = args.t_above + blk_col;
+      ENTROPY_CONTEXT *l = args.t_left + blk_row;
+      TXB_CTX txb_ctx;
+      get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx);
+
+      TxfmParam txfm_param;
+      QUANT_PARAM quant_param;
+      av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
+      av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
+
+      av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
+      av1_quant(x, 0, i, &txfm_param, &quant_param);
+
+      this_rd_stats.rate =
+          cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
+
+      const SCAN_ORDER *const scan_order =
+          get_scan(txfm_param.tx_size, txfm_param.tx_type);
+      dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+                           scan_order->scan, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+
+      const int64_t no_skip_txfm_rd =
+          RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+      const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+      this_rd_stats.skip_txfm &= !x->plane[0].eobs[i];
+
+      av1_merge_rd_stats(&args.rd_stats, &this_rd_stats);
+      args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd);
+
+      if (args.current_rd > ref_best_rd) {
+        args.exit_early = 1;
+        break;
+      }
+
+      av1_set_txb_context(x, 0, i, tx_size, a, l);
+      i += step;
+    }
+  }
+
+  if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
+
+  *rd_stats = args.rd_stats;
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  int64_t rd;
+  // rdstats->rate should include all the rate except skip/non-skip cost as the
+  // same is accounted in the caller functions after rd evaluation of all
+  // planes. However the decisions should be done after considering the
+  // skip/non-skip header cost
+  if (rd_stats->skip_txfm && is_inter) {
+    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+  } else {
+    // Intra blocks are always signalled as non-skip
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+                rd_stats->dist);
+    rd_stats->rate += tx_size_rate;
+  }
+  // Check if forcing the block to skip transform leads to smaller RD cost.
+  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_txfm_rd =
+        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+    if (temp_skip_txfm_rd <= rd) {
+      rd = temp_skip_txfm_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip_txfm = 1;
+    }
+  }
+
+  return rd;
+}
+
 int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              RD_STATS *rd_stats, int64_t ref_best_rd,
                              BLOCK_SIZE bs, TX_SIZE tx_size,
@@ -2881,29 +3038,30 @@ int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const ModeCosts *mode_costs = &x->mode_costs;
   const int is_inter = is_inter_block(mbmi);
-  const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT &&
-                        block_signals_txsize(mbmi->sb_type);
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+                        block_signals_txsize(mbmi->bsize);
   int tx_size_rate = 0;
   if (tx_select) {
     const int ctx = txfm_partition_context(
-        xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
-    tx_size_rate = is_inter ? x->txfm_partition_cost[ctx][0]
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+    tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
                             : tx_size_cost(x, bs, tx_size);
   }
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
-  const int64_t skip_rd =
-      is_inter ? RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd =
+      is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
   const int64_t no_this_rd =
-      RDCOST(x->rdmult, no_skip_flag_rate + tx_size_rate, 0);
+      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
 
   mbmi->tx_size = tx_size;
   av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
-                       AOMMIN(no_this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size,
-                       cpi->sf.rd_sf.use_fast_coef_costing, ftxs_mode,
-                       skip_trellis);
+                       AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       tx_size, ftxs_mode, skip_trellis);
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
   int64_t rd;
@@ -2911,22 +3069,23 @@ int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   // same is accounted in the caller functions after rd evaluation of all
   // planes. However the decisions should be done after considering the
   // skip/non-skip header cost
-  if (rd_stats->skip && is_inter) {
-    rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  if (rd_stats->skip_txfm && is_inter) {
+    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
   } else {
     // Intra blocks are always signalled as non-skip
-    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate + tx_size_rate,
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
                 rd_stats->dist);
     rd_stats->rate += tx_size_rate;
   }
   // Check if forcing the block to skip transform leads to smaller RD cost.
-  if (is_inter && !rd_stats->skip && !xd->lossless[mbmi->segment_id]) {
-    int64_t temp_skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
-    if (temp_skip_rd <= rd) {
-      rd = temp_skip_rd;
+  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_txfm_rd =
+        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+    if (temp_skip_txfm_rd <= rd) {
+      rd = temp_skip_txfm_rd;
       rd_stats->rate = 0;
       rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
+      rd_stats->skip_txfm = 1;
     }
   }
 
@@ -2954,7 +3113,7 @@ static AOM_INLINE void tx_block_yrd(
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
       plane_bsize, blk_row, blk_col)];
   const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                         mbmi->sb_type, tx_size);
+                                         mbmi->bsize, tx_size);
 
   av1_init_rd_stats(rd_stats);
   if (tx_size == plane_tx_size) {
@@ -2964,28 +3123,30 @@ static AOM_INLINE void tx_block_yrd(
     TXB_CTX txb_ctx;
     get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
 
-    const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
-                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+    const int zero_blk_rate =
+        x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)]
+            .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-               rd_stats, ftxs_mode, ref_best_rd, NULL);
+               rd_stats, ftxs_mode, ref_best_rd);
     const int mi_width = mi_size_wide[plane_bsize];
+    TxfmSearchInfo *txfm_info = &x->txfm_search_info;
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
-        rd_stats->skip == 1) {
+        rd_stats->skip_txfm == 1) {
       rd_stats->rate = zero_blk_rate;
       rd_stats->dist = rd_stats->sse;
-      rd_stats->skip = 1;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
+      rd_stats->skip_txfm = 1;
+      set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1);
       x->plane[0].eobs[block] = 0;
       x->plane[0].txb_entropy_ctx[block] = 0;
       update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
     } else {
-      rd_stats->skip = 0;
-      set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
+      rd_stats->skip_txfm = 0;
+      set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0);
     }
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+      rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0];
     av1_set_txb_context(x, 0, block, tx_size, ta, tl);
     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
                           tx_size);
@@ -2994,15 +3155,18 @@ static AOM_INLINE void tx_block_yrd(
     const int txb_width = tx_size_wide_unit[sub_txs];
     const int txb_height = tx_size_high_unit[sub_txs];
     const int step = txb_height * txb_width;
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
     RD_STATS pn_rd_stats;
     int64_t this_rd = 0;
     assert(txb_width > 0 && txb_height > 0);
 
-    for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) {
-      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) {
-        const int offsetr = blk_row + row;
+    for (int row = 0; row < row_end; row += txb_height) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += txb_width) {
         const int offsetc = blk_col + col;
-        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
         av1_init_rd_stats(&pn_rd_stats);
         tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
@@ -3019,7 +3183,7 @@ static AOM_INLINE void tx_block_yrd(
     }
 
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate += x->txfm_partition_cost[ctx][1];
+      rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
   }
 }
 
@@ -3038,6 +3202,7 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_init_rd_stats(rd_stats);
 
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   const struct macroblockd_plane *const pd = &xd->plane[0];
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
@@ -3045,8 +3210,8 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   const int step = bw * bh;
-  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
-                                               x->tx_size_search_method);
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
   ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
   ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
   TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
@@ -3075,17 +3240,17 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
-  const int skip_flag_rate = x->skip_cost[skip_ctx][1];
-  const int64_t skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
   this_rd =
-      RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate, rd_stats->dist);
-  if (skip_rd < this_rd) {
-    this_rd = skip_rd;
+      RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+  if (skip_txfm_rd < this_rd) {
+    this_rd = skip_txfm_rd;
     rd_stats->rate = 0;
     rd_stats->dist = rd_stats->sse;
-    rd_stats->skip = 1;
+    rd_stats->skip_txfm = 1;
   }
 
   const int is_cost_valid = this_rd > ref_best_rd;
@@ -3102,13 +3267,17 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 // will be saved in rd_stats. The returned value is the corresponding RD cost.
 static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd,
-                                       TXB_RD_INFO_NODE *rd_info_tree) {
+                                       int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
   assert(bsize < BLOCK_SIZES_ALL);
-  const int fast_tx_search = x->tx_size_search_method > USE_FULL_RD;
+  const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
   int64_t rd_thresh = ref_best_rd;
+  if (rd_thresh == 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return INT64_MAX;
+  }
   if (fast_tx_search && rd_thresh < INT64_MAX) {
     if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
   }
@@ -3126,17 +3295,17 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
   memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
   memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
-  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
-                                               x->tx_size_search_method);
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
   const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   const int step = bw * bh;
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int no_skip_flag_cost = x->skip_cost[skip_ctx][0];
-  const int skip_flag_cost = x->skip_cost[skip_ctx][1];
-  int64_t skip_rd = RDCOST(x->rdmult, skip_flag_cost, 0);
-  int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_cost, 0);
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+  int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
   int block = 0;
 
   av1_init_rd_stats(rd_stats);
@@ -3145,29 +3314,28 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
       const int64_t best_rd_sofar =
           (rd_thresh == INT64_MAX)
               ? INT64_MAX
-              : (rd_thresh - (AOMMIN(skip_rd, no_skip_rd)));
+              : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
       int is_cost_valid = 1;
       RD_STATS pn_rd_stats;
       // Search for the best transform block size and type for the sub-block.
       select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
                       ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
-                      best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree);
+                      best_rd_sofar, &is_cost_valid, ftxs_mode);
       if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
         av1_invalid_rd_stats(rd_stats);
         return INT64_MAX;
       }
       av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-      skip_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
-      no_skip_rd =
-          RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+      skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+      no_skip_txfm_rd =
+          RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
       block += step;
-      if (rd_info_tree != NULL) rd_info_tree += 1;
     }
   }
 
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
-  rd_stats->skip = (skip_rd <= no_skip_rd);
+  rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
 
   // If fast_tx_search is true, only DCT and 1D DCT were tested in
   // select_inter_block_yrd() above. Do a better search for tx type with
@@ -3178,14 +3346,14 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
   }
 
   int64_t final_rd;
-  if (rd_stats->skip) {
-    final_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse);
+  if (rd_stats->skip_txfm) {
+    final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
   } else {
     final_rd =
-        RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist);
+        RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
     if (!xd->lossless[xd->mi[0]->segment_id]) {
       final_rd =
-          AOMMIN(final_rd, RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse));
+          AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse));
     }
   }
 
@@ -3216,14 +3384,11 @@ static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
   return ((model_rd * factor) >> 3) > ref_best_rd;
 }
 
-// Search for best transform size and type for luma inter blocks. The transform
-// block partitioning can be recursive resulting in non-uniform transform sizes.
-// The best transform size and type, if found, will be saved in the MB_MODE_INFO
-// structure, and the corresponding RD stats will be saved in rd_stats.
 void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                          RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                          int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
 
   av1_invalid_rd_stats(rd_stats);
@@ -3250,11 +3415,11 @@ void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   const int n4 = bsize_to_num_blk(bsize);
   if (is_mb_rd_hash_enabled) {
     hash = get_block_residue_hash(x, bsize);
-    mb_rd_record = &x->mb_rd_record;
+    mb_rd_record = x->txfm_search_info.mb_rd_record;
     const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
     if (match_index != -1) {
-      MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
-      fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+      MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+      fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x);
       return;
     }
   }
@@ -3262,32 +3427,21 @@ void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (x->predict_skip_level &&
-      predict_skip_flag(x, bsize, &dist,
+  if (txfm_params->skip_txfm_level &&
+      predict_skip_txfm(x, bsize, &dist,
                         cpi->common.features.reduced_tx_set_used)) {
-    set_skip_flag(x, rd_stats, bsize, dist);
-    // Save the RD search results into tx_rd_record.
+    set_skip_txfm(x, rd_stats, bsize, dist);
+    // Save the RD search results into mb_rd_record.
     if (is_mb_rd_hash_enabled)
-      save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+      save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
     return;
   }
 #if CONFIG_SPEED_STATS
-  ++x->tx_search_count;
+  ++x->txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
-  // Pre-compute residue hashes (transform block level) and find existing or
-  // add new RD records to store and reuse rate and distortion values to speed
-  // up TX size/type search.
-  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
-  int found_rd_info = 0;
-  if (ref_best_rd != INT64_MAX && within_border &&
-      cpi->sf.tx_sf.use_inter_txb_hash) {
-    found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info);
-  }
-
   const int64_t rd =
-      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd,
-                              found_rd_info ? matched_rd_info : NULL);
+      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
 
   if (rd == INT64_MAX) {
     // We should always find at least one candidate unless ref_best_rd is less
@@ -3298,24 +3452,20 @@ void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     return;
   }
 
-  // Save the RD search results into tx_rd_record.
+  // Save the RD search results into mb_rd_record.
   if (is_mb_rd_hash_enabled) {
     assert(mb_rd_record != NULL);
-    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
   }
 }
 
-// Search for the best transform size and type for current coding block, with
-// the assumption that all the transform blocks have a uniform size (VP9 style).
-// The selected transform size and type will be saved in the MB_MODE_INFO
-// structure; the corresponding RD stats will be saved in rd_stats.
-// This function may be used for both intra and inter predicted blocks.
 void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bs,
                                        int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(bs == mbmi->sb_type);
+  const TxfmSearchParams *tx_params = &x->txfm_search_params;
+  assert(bs == mbmi->bsize);
   const int is_inter = is_inter_block(mbmi);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -3336,11 +3486,11 @@ void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
         (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
     if (within_border) {
       hash = get_block_residue_hash(x, bs);
-      mb_rd_record = &x->mb_rd_record;
+      mb_rd_record = x->txfm_search_info.mb_rd_record;
       const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
       if (match_index != -1) {
-        MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
-        fetch_tx_rd_info(num_blks, tx_rd_info, rd_stats, x);
+        MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+        fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x);
         return;
       }
     }
@@ -3349,14 +3499,15 @@ void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (x->predict_skip_level && is_inter && !xd->lossless[mbmi->segment_id] &&
-      predict_skip_flag(x, bs, &dist,
+  if (tx_params->skip_txfm_level && is_inter &&
+      !xd->lossless[mbmi->segment_id] &&
+      predict_skip_txfm(x, bs, &dist,
                         cpi->common.features.reduced_tx_set_used)) {
     // Populate rdstats as per skip decision
-    set_skip_flag(x, rd_stats, bs, dist);
-    // Save the RD search results into tx_rd_record.
+    set_skip_txfm(x, rd_stats, bs, dist);
+    // Save the RD search results into mb_rd_record.
     if (mb_rd_record) {
-      save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+      save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
     }
     return;
   }
@@ -3364,21 +3515,18 @@ void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (xd->lossless[mbmi->segment_id]) {
     // Lossless mode can only pick the smallest (4x4) transform size.
     choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
-  } else if (x->tx_size_search_method == USE_LARGESTALL) {
+  } else if (tx_params->tx_size_search_method == USE_LARGESTALL) {
     choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
   } else {
     choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
   }
 
-  // Save the RD search results into tx_rd_record for possible reuse in future.
+  // Save the RD search results into mb_rd_record for possible reuse in future.
   if (mb_rd_record) {
-    save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+    save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
   }
 }
 
-// Calculate the transform coefficient RD cost for the given chroma coding block
-// Return value 0: early termination triggered, no valid rd cost available;
-//              1: rd cost values are valid.
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
                   BLOCK_SIZE bsize, int64_t ref_best_rd) {
   av1_init_rd_stats(rd_stats);
@@ -3389,7 +3537,7 @@ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
   const int is_inter = is_inter_block(mbmi);
-  int64_t this_rd = 0, skip_rd = 0;
+  int64_t this_rd = 0, skip_txfm_rd = 0;
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
@@ -3411,19 +3559,17 @@ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
     // TODO(any): Extend the early exit mechanism for intra modes as well
     if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
         chroma_ref_best_rd != INT64_MAX)
-      chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
+      chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd);
     av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
-                         plane_bsize, uv_tx_size,
-                         cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE,
-                         skip_trellis);
+                         plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis);
     if (this_rd_stats.rate == INT_MAX) {
       is_cost_valid = 0;
       break;
     }
     av1_merge_rd_stats(rd_stats, &this_rd_stats);
     this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-    skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
-    if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
+    skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+    if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) {
       is_cost_valid = 0;
       break;
     }
@@ -3437,17 +3583,15 @@ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
   return is_cost_valid;
 }
 
-// Search for the best transform type and calculate the transform coefficients
-// RD cost of the current coding block with the specified (uniform) transform
-// size and channel. The RD results will be saved in rd_stats.
 void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                           RD_STATS *rd_stats, int64_t ref_best_rd,
                           int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
-                          TX_SIZE tx_size, int use_fast_coef_costing,
-                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis) {
   assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
 
-  if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+  if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+      txsize_sqr_up_map[tx_size] == TX_64X64) {
     av1_invalid_rd_stats(rd_stats);
     return;
   }
@@ -3465,7 +3609,6 @@ void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   args.cpi = cpi;
   args.best_rd = ref_best_rd;
   args.current_rd = current_rd;
-  args.use_fast_coef_costing = use_fast_coef_costing;
   args.ftxs_mode = ftxs_mode;
   args.skip_trellis = skip_trellis;
   av1_init_rd_stats(&args.rd_stats);
@@ -3485,23 +3628,16 @@ void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   }
 }
 
-// This function combines y and uv planes' transform search processes together
-// for inter-predicted blocks (including IntraBC), when the prediction is
-// already generated. It first does subtraction to obtain the prediction error.
-// Then it calls
-// av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
-// av1_txfm_uvrd sequentially and handles the early terminations
-// happening in those functions. At the end, it computes the
-// rd_stats/_y/_uv accordingly.
 int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                     RD_STATS *rd_stats, RD_STATS *rd_stats_y,
                     RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int skip_ctx = av1_get_skip_context(xd);
-  const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
-                                  x->skip_cost[skip_ctx][1] };
+  TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0],
+                                  x->mode_costs.skip_txfm_cost[skip_ctx][1] };
   const int64_t min_header_rate =
-      mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
+      mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]);
   // Account for minimum skip and non_skip rd.
   // Eventually either one of them will be added to mode_rate
   const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
@@ -3521,7 +3657,7 @@ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
   // cost and distortion
   av1_subtract_plane(x, bsize, 0);
-  if (x->tx_mode_search_type == TX_MODE_SELECT &&
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
       !xd->lossless[mbmi->segment_id]) {
     av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
 #if CONFIG_COLLECT_RD_STATS == 2
@@ -3531,30 +3667,19 @@ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
     memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     for (int i = 0; i < xd->height * xd->width; ++i)
-      set_blk_skip(x, 0, i, rd_stats_y->skip);
+      set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm);
   }
 
   if (rd_stats_y->rate == INT_MAX) return 0;
 
   av1_merge_rd_stats(rd_stats, rd_stats_y);
 
-  const int64_t non_skip_rdcosty =
-      RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
-  const int64_t skip_rdcosty =
-      RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
-  const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
-  if (min_rdcosty > ref_best_rd) {
-    const int64_t tokenonly_rdy =
-        AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
-               RDCOST(x->rdmult, 0, rd_stats_y->sse));
-    // Invalidate rd_stats_y to skip the rest of the motion modes search
-    if (tokenonly_rdy -
-            (tokenonly_rdy >> cpi->sf.inter_sf.prune_motion_mode_level) >
-        rd_thresh) {
-      av1_invalid_rd_stats(rd_stats_y);
-    }
-    return 0;
-  }
+  const int64_t non_skip_txfm_rdcosty =
+      RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist);
+  const int64_t skip_txfm_rdcosty =
+      RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse);
+  const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty);
+  if (min_rdcosty > ref_best_rd) return 0;
 
   av1_init_rd_stats(rd_stats_uv);
   const int num_planes = av1_num_planes(cm);
@@ -3563,8 +3688,8 @@ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     // Calculate best rd cost possible for chroma
     if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
         (ref_best_chroma_rd != INT64_MAX)) {
-      ref_best_chroma_rd =
-          (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
+      ref_best_chroma_rd = (ref_best_chroma_rd -
+                            AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));
     }
     const int is_cost_valid_uv =
         av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
@@ -3572,30 +3697,30 @@ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     av1_merge_rd_stats(rd_stats, rd_stats_uv);
   }
 
-  int choose_skip = rd_stats->skip;
-  if (!choose_skip && !xd->lossless[mbmi->segment_id]) {
-    const int64_t rdcost_no_skip = RDCOST(
-        x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
+  int choose_skip_txfm = rd_stats->skip_txfm;
+  if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    const int64_t rdcost_no_skip_txfm = RDCOST(
+        x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0],
         rd_stats->dist);
-    const int64_t rdcost_skip =
-        RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse);
-    if (rdcost_no_skip >= rdcost_skip) choose_skip = 1;
+    const int64_t rdcost_skip_txfm =
+        RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse);
+    if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1;
   }
-  if (choose_skip) {
+  if (choose_skip_txfm) {
     rd_stats_y->rate = 0;
     rd_stats_uv->rate = 0;
-    rd_stats->rate = mode_rate + skip_flag_cost[1];
+    rd_stats->rate = mode_rate + skip_txfm_cost[1];
     rd_stats->dist = rd_stats->sse;
     rd_stats_y->dist = rd_stats_y->sse;
     rd_stats_uv->dist = rd_stats_uv->sse;
-    mbmi->skip = 1;
-    if (rd_stats->skip) {
+    mbmi->skip_txfm = 1;
+    if (rd_stats->skip_txfm) {
       const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
       if (tmprd > ref_best_rd) return 0;
     }
   } else {
-    rd_stats->rate += skip_flag_cost[0];
-    mbmi->skip = 0;
+    rd_stats->rate += skip_txfm_cost[0];
+    mbmi->skip_txfm = 0;
   }
 
   return 1;
diff --git a/media/libaom/src/av1/encoder/tx_search.h b/media/libaom/src/av1/encoder/tx_search.h
index 82d56719d0..e3caf5bf4c 100644
--- a/media/libaom/src/av1/encoder/tx_search.h
+++ b/media/libaom/src/av1/encoder/tx_search.h
@@ -35,39 +35,165 @@ enum {
 
 static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
                                    TX_SIZE tx_size) {
-  assert(bsize == x->e_mbd.mi[0]->sb_type);
-  if (x->tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize))
+  assert(bsize == x->e_mbd.mi[0]->bsize);
+  if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT ||
+      !block_signals_txsize(bsize))
     return 0;
 
   const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
   const int depth = tx_size_to_depth(tx_size, bsize);
   const MACROBLOCKD *const xd = &x->e_mbd;
   const int tx_size_ctx = get_tx_size_context(xd);
-  return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+  return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 }
 
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              RD_STATS *rd_stats, int64_t ref_best_rd,
+                              BLOCK_SIZE bs, TX_SIZE tx_size);
+
+/*!\brief Transform type search for luma macroblock with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and return the transform coefficients RD
+ * cost of current luma macroblock with the given uniform transform size.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    bs             Size of the current macroblock
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
+                                and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
+                                should be skipped
+ * \return       An int64_t value that is the best RD cost found.
+ */
 int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                              RD_STATS *rd_stats, int64_t ref_best_rd,
                              BLOCK_SIZE bs, TX_SIZE tx_size,
                              FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
 
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for best transform size and type for luma inter blocks. The transform
+ * block partitioning can be recursive resulting in non-uniform transform sizes.
+ * The best transform size and type, if found, will be saved in the MB_MODE_INFO
+ * structure, and the corresponding RD stats will be saved in rd_stats.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       Nothing is returned. The selected transform size and type will
+                 be saved in the MB_MODE_INFO structure
+ */
 void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                          RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                          int64_t ref_best_rd);
 
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for current macroblock block,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bs             Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       Nothing is returned. The selected transform size and type will
+                 be saved in the MB_MODE_INFO structure
+ */
 void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bs,
                                        int64_t ref_best_rd);
 
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       An integer value is returned. 0: early termination triggered,
+                 no valid rd cost available; 1: rd cost values are valid.
+ */
 int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
                   BLOCK_SIZE bsize, int64_t ref_best_rd);
 
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    current_rd     Current RD cost for this block so far
+ * \param[in]    plane          Plane index
+ * \param[in]    plane_bsize    Size of the current macroblock considering
+                                sup-sampling
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
+                                and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
+                                should be skipped
+ *
+ * \return       Nothing is returned. The RD results will be saved in rd_stats.
+ */
 void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                           RD_STATS *rd_stats, int64_t ref_best_rd,
-                          int64_t this_rd, int plane, BLOCK_SIZE plane_bsize,
-                          TX_SIZE tx_size, int use_fast_coef_costing,
-                          FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+                          int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis);
 
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * This function combines y and uv planes' transform search processes together
+ * for inter-predicted blocks (including IntraBC), when the prediction is
+ * already generated. It first does subtraction to obtain the prediction error.
+ * Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    rd_stats       Pointer to struct to keep track of the overal RD
+                                stats
+ * \param[in]    rd_stats_y     Pointer to struct to keep track of the RD
+                                stats for the luma plane
+ * \param[in]    rd_stats_uv    Pointer to struct to keep track of the RD
+                                stats for the chroma planes
+ * \param[in]    mode_rate      Rate cost to encode the prediction mode info. of
+                                the current macroblock
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ *
+ * \return       An integer value is returned indicating if a valid transform
+                 candidate is found (1) or not (0).
+ */
 int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                     RD_STATS *rd_stats, RD_STATS *rd_stats_y,
                     RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
diff --git a/media/libaom/src/av1/encoder/txb_rdopt.c b/media/libaom/src/av1/encoder/txb_rdopt.c
new file mode 100644
index 0000000000..2f2b8fd9ce
--- /dev/null
+++ b/media/libaom/src/av1/encoder/txb_rdopt.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/txb_rdopt_utils.h"
+
+#include "av1/common/idct.h"
+
+static INLINE void update_coeff_general(
+    int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+    TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
+    int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+    const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int is_last = si == (eob - 1);
+  const int coeff_ctx = get_lower_levels_ctx_general(
+      is_last, si, bwl, height, levels, ci, tx_size, tx_class);
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+  } else {
+    const int sign = (qc < 0) ? 1 : 0;
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+    const int rate =
+        get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    const int64_t rd = RDCOST(rdmult, rate, dist);
+
+    tran_low_t qc_low, dqc_low;
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = qc_low = dqc_low = 0;
+      dist_low = dist0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
+      rate_low =
+          get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    }
+
+    rd_low = RDCOST(rdmult, rate_low, dist_low);
+    if (rd_low < rd) {
+      qcoeff[ci] = qc_low;
+      dqcoeff[ci] = dqc_low;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+      *accu_rate += rate_low;
+      *accu_dist += dist_low - dist0;
+    } else {
+      *accu_rate += rate;
+      *accu_dist += dist - dist0;
+    }
+  }
+}
+
+static AOM_FORCE_INLINE void update_coeff_simple(
+    int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+    int bwl, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+    const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+    uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+  (void)eob;
+  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+  // and not the last (scan_idx != eob - 1)
+  assert(si != eob - 1);
+  assert(si > 0);
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int coeff_ctx =
+      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+  } else {
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t abs_tqc = abs(tcoeff[ci]);
+    const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+    int rate_low = 0;
+    const int rate = get_two_coeff_cost_simple(
+        ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
+    if (abs_dqc < abs_tqc) {
+      *accu_rate += rate;
+      return;
+    }
+
+    const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
+    const int64_t rd = RDCOST(rdmult, rate, dist);
+
+    const tran_low_t abs_qc_low = abs_qc - 1;
+    const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+    const int64_t dist_low =
+        get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
+    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+    if (rd_low < rd) {
+      const int sign = (qc < 0) ? 1 : 0;
+      qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+      dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+      *accu_rate += rate_low;
+    } else {
+      *accu_rate += rate;
+    }
+  }
+}
+
+static AOM_FORCE_INLINE void update_coeff_eob(
+    int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+    int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
+    int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+    const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+  assert(si != *eob - 1);
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int coeff_ctx =
+      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+  } else {
+    int lower_level = 0;
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int sign = (qc < 0) ? 1 : 0;
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+    int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+    int rate =
+        get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+                               txb_costs, bwl, tx_class, levels);
+    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+    tran_low_t qc_low, dqc_low;
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+
+    if (abs_qc == 1) {
+      abs_qc_low = 0;
+      dqc_low = qc_low = 0;
+      dist_low = 0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
+      rate_low =
+          get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    }
+
+    int lower_level_new_eob = 0;
+    const int new_eob = si + 1;
+    const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
+    const int new_eob_cost =
+        get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+    int rate_coeff_eob =
+        new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+                                          dc_sign_ctx, txb_costs, bwl,
+                                          tx_class);
+    int64_t dist_new_eob = dist;
+    int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+    if (abs_qc_low > 0) {
+      const int rate_coeff_eob_low =
+          new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+                                            coeff_ctx_new_eob, dc_sign_ctx,
+                                            txb_costs, bwl, tx_class);
+      const int64_t dist_new_eob_low = dist_low;
+      const int64_t rd_new_eob_low =
+          RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+      if (rd_new_eob_low < rd_new_eob) {
+        lower_level_new_eob = 1;
+        rd_new_eob = rd_new_eob_low;
+        rate_coeff_eob = rate_coeff_eob_low;
+        dist_new_eob = dist_new_eob_low;
+      }
+    }
+
+    if (sharpness == 0 || abs_qc > 1) {
+      if (rd_low < rd) {
+        lower_level = 1;
+        rd = rd_low;
+        rate = rate_low;
+        dist = dist_low;
+      }
+    }
+
+    if (sharpness == 0 && rd_new_eob < rd) {
+      for (int ni = 0; ni < *nz_num; ++ni) {
+        int last_ci = nz_ci[ni];
+        levels[get_padded_idx(last_ci, bwl)] = 0;
+        qcoeff[last_ci] = 0;
+        dqcoeff[last_ci] = 0;
+      }
+      *eob = new_eob;
+      *nz_num = 0;
+      *accu_rate = rate_coeff_eob;
+      *accu_dist = dist_new_eob;
+      lower_level = lower_level_new_eob;
+    } else {
+      *accu_rate += rate;
+      *accu_dist += dist;
+    }
+
+    if (lower_level) {
+      qcoeff[ci] = qc_low;
+      dqcoeff[ci] = dqc_low;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+    }
+    if (qcoeff[ci]) {
+      nz_ci[*nz_num] = ci;
+      ++*nz_num;
+    }
+  }
+}
+
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+                               int nz_num, int *nz_ci, int64_t rdmult,
+                               int skip_cost, int non_skip_cost,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+  const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+  const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+  if (rd_new_eob < rd) {
+    for (int i = 0; i < nz_num; ++i) {
+      const int ci = nz_ci[i];
+      qcoeff[ci] = 0;
+      dqcoeff[ci] = 0;
+      // no need to set up levels because this is the last step
+      // levels[get_padded_idx(ci, bwl)] = 0;
+    }
+    *accu_rate = 0;
+    *eob = 0;
+  }
+}
+
+// TODO(angiebird): use this function whenever it's possible
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+                            int reduced_tx_set_used) {
+  if (plane > 0) return 0;
+
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+      !xd->lossless[xd->mi[0]->segment_id]) {
+    const int ext_tx_set =
+        get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        return x->mode_costs
+            .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+    } else {
+      if (ext_tx_set > 0) {
+        PREDICTION_MODE intra_dir;
+        if (mbmi->filter_intra_mode_info.use_filter_intra)
+          intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+                                             .filter_intra_mode];
+        else
+          intra_dir = mbmi->mode;
+        return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size]
+                                                [intra_dir][tx_type];
+      }
+    }
+  }
+  return 0;
+}
+
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                     int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                     const TXB_CTX *const txb_ctx, int *rate_cost,
+                     int sharpness) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int shift = av1_get_tx_scale(tx_size);
+  int eob = p->eobs[block];
+  const int16_t *dequant = p->dequant_QTX;
+  const qm_val_t *iqmatrix =
+      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+  const qm_val_t *qmatrix =
+      cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR
+          ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size,
+                            tx_type)
+          : NULL;
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  tran_low_t *dqcoeff = p->dqcoeff + block_offset;
+  const tran_low_t *tcoeff = p->coeff + block_offset;
+  const CoeffCosts *coeff_costs = &x->coeff_costs;
+
+  // This function is not called if eob = 0.
+  assert(eob > 0);
+
+  const AV1_COMMON *cm = &cpi->common;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  assert(width == (1 << bwl));
+  const int is_inter = is_inter_block(mbmi);
+  const LV_MAP_COEFF_COST *txb_costs =
+      &coeff_costs->coeff_costs[txs_ctx][plane_type];
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *txb_eob_costs =
+      &coeff_costs->eob_costs[eob_multi_size][plane_type];
+
+  const int rshift = 2;
+
+  const int64_t rdmult =
+      (((int64_t)x->rdmult *
+        (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+       2) >>
+      rshift;
+
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+
+  if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
+
+  // TODO(angirbird): check iqmatrix
+
+  const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+  const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+  int accu_rate = eob_cost;
+  int64_t accu_dist = 0;
+  int si = eob - 1;
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const tran_low_t abs_qc = abs(qc);
+  const int sign = qc < 0;
+  const int max_nz_num = 2;
+  int nz_num = 1;
+  int nz_ci[3] = { ci, 0, 0 };
+  if (abs_qc >= 2) {
+    update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+                         levels, iqmatrix, qmatrix);
+    --si;
+  } else {
+    assert(abs_qc == 1);
+    const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
+    accu_rate +=
+        get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+                           txb_costs, bwl, tx_class);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+    accu_dist += dist - dist0;
+    --si;
+  }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
+  case tx_class_literal:                                                   \
+    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
+      update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
+                       tx_size, tx_class_literal, bwl, height,             \
+                       txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+                       txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
+                       levels, sharpness, iqmatrix, qmatrix);              \
+    }                                                                      \
+    break
+  switch (tx_class) {
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+    default: assert(false);
+  }
+
+  if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
+    update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+                non_skip_cost, qcoeff, dqcoeff);
+  }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
+  case tx_class_literal:                                                       \
+    for (; si >= 1; --si) {                                                    \
+      update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
+                          rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
+                          qcoeff, dqcoeff, levels, iqmatrix, qmatrix);         \
+    }                                                                          \
+    break
+  switch (tx_class) {
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+    default: assert(false);
+  }
+
+  // DC position
+  if (si == 0) {
+    // no need to update accu_dist because it's not used after this point
+    int64_t dummy_dist = 0;
+    update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+                         levels, iqmatrix, qmatrix);
+  }
+
+  const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+                                            cm->features.reduced_tx_set_used);
+  if (eob == 0)
+    accu_rate += skip_cost;
+  else
+    accu_rate += non_skip_cost + tx_type_cost;
+
+  p->eobs[block] = eob;
+  p->txb_entropy_ctx[block] =
+      av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+  *rate_cost = accu_rate;
+  return eob;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+    const struct macroblock_plane *p, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  av1_txb_init_levels(qcoeff, width, height, levels);
+
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+  const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+      coeff_costs->lps_cost;
+  int c = eob - 1;
+  {
+    const int pos = scan[c];
+    const tran_low_t v = qcoeff[pos];
+    const int sign = AOMSIGN(v);
+    const int level = (v ^ sign) - sign;
+    const int coeff_ctx = coeff_contexts[pos];
+    cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+    if (v) {
+      // sign bit cost
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
+      }
+      if (c) {
+        cost += av1_cost_literal(1);
+      } else {
+        const int sign01 = (sign ^ sign) - sign;
+        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+        cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+        return cost;
+      }
+    }
+  }
+  const int(*base_cost)[8] = coeff_costs->base_cost;
+  for (c = eob - 2; c >= 1; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx = coeff_contexts[pos];
+    const tran_low_t v = qcoeff[pos];
+    const int level = abs(v);
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+    if (v) {
+      // sign bit cost
+      cost += av1_cost_literal(1);
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
+      }
+    }
+  }
+  // c == 0 after previous loop
+  {
+    const int pos = scan[c];
+    const tran_low_t v = qcoeff[pos];
+    const int coeff_ctx = coeff_contexts[pos];
+    const int sign = AOMSIGN(v);
+    const int level = (v ^ sign) - sign;
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+    if (v) {
+      // sign bit cost
+      const int sign01 = (sign ^ sign) - sign;
+      const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+      cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
+      }
+    }
+  }
+  return cost;
+}
+
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type) {
+  assert(plane == 0);
+
+  int cost = 0;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+  int eob = p->eobs[block];
+
+  // coeffs
+  int c = eob - 1;
+  // eob
+  {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]) - 1;
+    cost += (v << (AV1_PROB_COST_SHIFT + 2));
+  }
+  // other coeffs
+  for (c = eob - 2; c >= 0; c--) {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]);
+    const int idx = AOMMIN(v, 14);
+
+    cost += costLUT[idx];
+  }
+
+  // const_term does not contain DC, and log(e) does not contain eob, so both
+  // (eob-1)
+  cost += (const_term + loge_par) * (eob - 1);
+
+  return cost;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+  return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  const int eob = p->eobs[block];
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+                                  plane_type, coeff_costs, xd, tx_type,
+                                  tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  int eob = p->eobs[block];
+
+  if (adjust_eob) {
+    const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+    const int16_t *scan = scan_order->scan;
+    tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+    tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+    tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+    update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+                          tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
+  }
+
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb_laplacian(
+      x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+      tx_type, tx_class, reduced_tx_set_used);
+}
diff --git a/media/libaom/src/av1/encoder/txb_rdopt.h b/media/libaom/src/av1/encoder/txb_rdopt.h
new file mode 100644
index 0000000000..70b322a2e1
--- /dev/null
+++ b/media/libaom/src/av1/encoder/txb_rdopt.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily choose to lower
+ * the coefficient magnitude by 1 or not based on the RD score.
+ *
+ * The coefficients are processing in reversed scan order.
+ *
+ * Note that, the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out]   rate_cost      The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in]    sharpness      When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
+ */
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                     int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                     const TXB_CTX *const txb_ctx, int *rate_cost,
+                     int sharpness);
+
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in]    x                    Pointer to structure holding the data for
+ the current encoding macroblock.
+ * \param[in]    plane                The index of the current plane.
+ * \param[in]    block                The index of the current transform block
+ in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block.
+ * \param[in]    tx_size              The transform size.
+ * \param[in]    tx_type              The transform type.
+ * \param[in]    txb_ctx              Context info for entropy coding transform
+ block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ */
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function compute the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * Then using \ref av1_cost_coeffs_txb_estimate to estimate the entropy costs
+ * of coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in]    adjust_eob     Whether to adjust the end of block position
+ (eob)
+ * or not.
+ * \return       int            Estimated entropy cost of coding the transform
+ block.
+ */
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then return the sum of the estimated entropy cost for all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy cost of end of block (eob) and transform type (tx_type)
+ * are not included.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the currernt transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \return       int            Estimated entropy cost of coefficients in the
+ * transform block.
+ */
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_TXB_RDOPT_H_
diff --git a/media/libaom/src/av1/encoder/txb_rdopt_utils.h b/media/libaom/src/av1/encoder/txb_rdopt_utils.h
new file mode 100644
index 0000000000..d8158fd8e4
--- /dev/null
+++ b/media/libaom/src/av1/encoder/txb_rdopt_utils.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+
+#include "av1/encoder/encodetxb.h"
+
+static const int golomb_bits_cost[32] = {
+  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+
+static const int golomb_cost_diff[32] = {
+  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
+};
+
+// Look up table of individual cost of coefficient by its quantization level.
+// determined based on Laplacian distribution conditioned on estimated context
+static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
+                                 1209,  1393, 1577, 1763, 1947,
+                                 2132,  2317, 2501, 2686, 2871 };
+
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  int dqv = dequant[!!coeff_idx];
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return dqv;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+                                     int shift, const qm_val_t *qmatrix,
+                                     int coeff_idx) {
+  int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+  if (qmatrix == NULL) {
+    return diff * diff;
+  }
+  // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion
+  // computation done in av1_block_error_qm, improving visual quality.
+  // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22
+  // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The
+  // multiplication `diff * diff` then does not risk overflowing.
+  diff *= qmatrix[coeff_idx];
+  const int64_t error =
+      (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+  return error;
+}
+
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+                        const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+  int eob_extra;
+  const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+  int eob_cost = 0;
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+  eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+  if (av1_eob_offset_bits[eob_pt] > 0) {
+    const int eob_ctx = eob_pt - 3;
+    const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+    const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+    eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
+    const int offset_bits = av1_eob_offset_bits[eob_pt];
+    if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
+  }
+  return eob_cost;
+}
+
+static INLINE int get_golomb_cost(int abs_qc) {
+  if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
+    const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    const int length = get_msb(r) + 1;
+    return av1_cost_literal(2 * length - 1);
+  }
+  return 0;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+                                        int *diff) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  int golomb_bits = 0;
+  if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+    *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+  if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+    int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    if (r < 32) {
+      golomb_bits = golomb_bits_cost[r];
+      *diff += golomb_cost_diff[r];
+    } else {
+      golomb_bits = get_golomb_cost(level);
+      *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+    }
+  }
+
+  return coeff_lps[base_range] + golomb_bits;
+}
+
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
+    int ci, tran_low_t abs_qc, int coeff_ctx,
+    const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
+    const uint8_t *levels, int *cost_low) {
+  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+  // and not the last (scan_idx != eob - 1)
+  assert(ci > 0);
+  int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  int diff = 0;
+  if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
+  if (abs_qc) {
+    cost += av1_cost_literal(1);
+    if (abs_qc > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      int brcost_diff = 0;
+      cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+                                    &brcost_diff);
+      diff += brcost_diff;
+    }
+  }
+  *cost_low = cost - diff;
+
+  return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+                                     int coeff_ctx, int dc_sign_ctx,
+                                     const LV_MAP_COEFF_COST *txb_costs,
+                                     int bwl, TX_CLASS tx_class) {
+  int cost = 0;
+  cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  if (abs_qc != 0) {
+    if (ci == 0) {
+      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+    } else {
+      cost += av1_cost_literal(1);
+    }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      int br_ctx;
+      br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+    }
+  }
+  return cost;
+}
+
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+                                         int sign, int coeff_ctx,
+                                         int dc_sign_ctx,
+                                         const LV_MAP_COEFF_COST *txb_costs,
+                                         int bwl, TX_CLASS tx_class,
+                                         const uint8_t *levels) {
+  int cost = 0;
+  if (is_last) {
+    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  } else {
+    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  }
+  if (abs_qc != 0) {
+    if (ci == 0) {
+      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+    } else {
+      cost += av1_cost_literal(1);
+    }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      int br_ctx;
+      if (is_last)
+        br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      else
+        br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+    }
+  }
+  return cost;
+}
+
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+                                  int shift, tran_low_t *qc_low,
+                                  tran_low_t *dqc_low) {
+  tran_low_t abs_qc_low = abs_qc - 1;
+  *qc_low = (-sign ^ abs_qc_low) + sign;
+  assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+  tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+  *dqc_low = (-sign ^ abs_dqc_low) + sign;
+  assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+                                         const int16_t *dequant_ptr,
+                                         const int16_t *scan,
+                                         const tran_low_t *coeff_ptr,
+                                         tran_low_t *qcoeff_ptr,
+                                         tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker) make this work for aomqm
+  int eob_out = *eob;
+  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+  for (int i = *eob - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+      eob_out--;
+      qcoeff_ptr[rc] = 0;
+      dqcoeff_ptr[rc] = 0;
+    } else {
+      break;
+    }
+  }
+
+  *eob = eob_out;
+}
+#endif  // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
diff --git a/media/libaom/src/av1/encoder/use_flat_gop_model_params.h b/media/libaom/src/av1/encoder/use_flat_gop_model_params.h
deleted file mode 100644
index cf0776644f..0000000000
--- a/media/libaom/src/av1/encoder/use_flat_gop_model_params.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
-#define AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "av1/encoder/ml.h"
-
-// A binary classifier that returns true (score > 0) if it is better to use a
-// flat GOP structure, rather than a GOP structure that uses ALT-REFs and
-// internal ARFs.
-
-#define NUM_FEATURES 21
-#define NUM_HIDDEN_LAYERS 1
-#define NUM_HIDDEN_NODES_LAYER0 48
-#define NUM_LABELS 1
-
-static const float
-    av1_use_flat_gop_nn_weights_layer0[NUM_FEATURES *
-                                       NUM_HIDDEN_NODES_LAYER0] = {
-      0.3801f,  -2.1832f, 1.7469f,  2.0130f,  2.1264f,  -0.7293f, -0.2814f,
-      0.0692f,  -4.6589f, -1.4591f, 0.3023f,  -0.4310f, -0.1911f, -0.8284f,
-      -1.3322f, -0.4621f, -0.1148f, -0.3531f, -0.0794f, -0.3114f, -0.1664f,
-      -0.1615f, 0.2913f,  -0.0394f, -0.0620f, 0.1845f,  0.0204f,  -0.2124f,
-      -0.1233f, -0.1685f, 0.1215f,  -0.2372f, -0.2865f, -0.1976f, 0.2137f,
-      -0.1318f, -0.0324f, 0.0415f,  -0.1172f, 0.1077f,  -0.1135f, -0.2462f,
-      -0.0743f, -0.1584f, -0.3267f, -0.0566f, -0.1615f, -0.3931f, -0.5200f,
-      -0.1786f, -0.1811f, -0.2812f, -0.1986f, -0.4393f, -0.3941f, -0.2500f,
-      -0.2029f, -0.4605f, -0.4973f, -0.2238f, -0.2599f, -0.1951f, -0.2034f,
-      -0.3186f, -0.1368f, -0.5076f, -0.4718f, -0.1815f, -0.3338f, -0.0550f,
-      -0.3920f, -0.5328f, -0.1658f, -0.2194f, -0.2867f, -0.0916f, -0.1678f,
-      -0.1760f, -0.5055f, -0.2322f, -0.4668f, -0.0121f, -0.3903f, -0.2721f,
-      -0.1306f, 0.1199f,  0.2894f,  0.1098f,  -0.0155f, -0.0844f, 0.0421f,
-      -0.2364f, -0.1073f, -0.0878f, -0.2146f, -0.1713f, -0.2283f, 0.0342f,
-      0.0394f,  -0.2808f, -0.0048f, 0.2640f,  -0.1371f, 0.1709f,  0.0155f,
-      -0.3614f, -0.1843f, -0.3215f, -0.3121f, -0.2609f, -0.0254f, -0.2474f,
-      -0.4674f, -0.3674f, -0.2076f, 0.0149f,  -0.3304f, -0.2678f, -0.0465f,
-      -0.1326f, -0.4504f, -0.5101f, -0.1280f, -0.0416f, -0.4296f, -0.4568f,
-      -0.6762f, -2.8105f, 0.7249f,  1.4288f,  1.3731f,  0.3034f,  0.1841f,
-      -0.0912f, -0.1508f, 1.2637f,  -0.2009f, 0.3236f,  -0.2500f, -0.0736f,
-      0.8655f,  -0.2599f, 0.1150f,  -0.0368f, -0.1122f, -0.7650f, -0.2004f,
-      -0.0891f, -0.3832f, -0.2576f, -0.3532f, -0.1735f, -0.4018f, -0.0265f,
-      -0.2988f, 0.2555f,  -0.1041f, -0.3391f, -0.5316f, -0.0171f, -0.3232f,
-      -0.0565f, -0.3359f, -0.1842f, -0.0582f, 0.0073f,  -0.0278f, -0.5517f,
-      0.0892f,  -0.1354f, 0.0548f,  -0.0401f, -0.1697f, 0.0432f,  0.0832f,
-      -0.3538f, 0.2602f,  -0.0066f, -0.2130f, -0.3085f, 0.0025f,  0.2464f,
-      -0.0103f, -0.3082f, -0.1136f, -0.2359f, -0.3421f, 0.1335f,  -0.3016f,
-      -1.0355f, -1.0572f, -0.3316f, -0.1235f, -0.3730f, -0.1751f, -0.1921f,
-      0.0031f,  -0.6297f, -0.5179f, 0.1082f,  -0.3130f, -0.1120f, -0.5430f,
-      -0.1782f, 0.0534f,  -0.1052f, 0.1471f,  -0.7156f, -0.5453f, -0.5437f,
-      1.8709f,  1.9696f,  -1.0343f, -0.3150f, -0.8399f, -0.0052f, -0.1123f,
-      -0.1059f, 0.6755f,  1.2593f,  -0.2512f, -0.2053f, 0.0835f,  0.3261f,
-      -0.0172f, 0.1230f,  -0.3687f, 0.1993f,  0.9390f,  -0.0165f, 0.6856f,
-      -0.4372f, -0.4041f, -0.2869f, -0.3871f, -0.3587f, -0.2418f, 0.0518f,
-      0.0110f,  -1.4713f, -0.1307f, -0.3246f, -0.5091f, -0.4652f, -0.4288f,
-      -0.0763f, -0.1755f, 0.0662f,  -0.3026f, -0.4462f, -0.4123f, -0.2891f,
-      -0.2251f, -0.4925f, -0.3820f, -0.1840f, -0.2878f, -0.1973f, -0.1010f,
-      -0.1622f, -0.3108f, -0.5292f, -0.1017f, -0.0607f, -0.2426f, -0.6406f,
-      -0.3834f, -0.2313f, -0.2433f, -0.1773f, -0.1581f, -0.3295f, -0.3799f,
-      -0.4447f, -0.2389f, -0.4231f, -0.1498f, -0.0181f, -0.4429f, -0.3515f,
-      0.0425f,  -0.5280f, -0.3462f, -0.3659f, 0.0153f,  -0.1002f, -0.5057f,
-      -0.2134f, -0.2859f, -0.1988f, -0.4758f, 0.0967f,  -0.4784f, 0.1868f,
-      -0.4387f, -1.3376f, -0.4452f, 0.3837f,  0.1698f,  -0.7076f, -0.4320f,
-      0.0382f,  -1.8053f, -0.6589f, 0.1406f,  -0.4340f, 0.0641f,  -0.2558f,
-      -0.4496f, -0.5003f, -0.6241f, -0.2217f, -0.8312f, -0.6793f, -0.3563f,
-      0.5153f,  -0.7851f, 1.0570f,  0.9702f,  0.5238f,  -0.6932f, -0.4443f,
-      0.0407f,  -3.0961f, -0.8461f, 0.0562f,  -0.0642f, 0.2471f,  -0.5911f,
-      -0.7715f, -0.1574f, -0.0375f, -0.1951f, -0.3097f, -0.2040f, 0.0128f,
-      -0.0918f, -0.0698f, -0.0970f, -0.2946f, -0.1723f, -0.2569f, -0.4382f,
-      -0.5174f, -0.2058f, -0.2973f, -0.0858f, -0.2526f, -0.2648f, -0.2339f,
-      -0.3474f, 0.0607f,  0.0272f,  -0.3142f, -0.1306f, -0.4938f, -0.1894f,
-      -0.0551f, -0.1061f, -0.1613f, -0.1942f, 0.0590f,  -0.2009f, -0.1286f,
-      -0.2035f, -0.0393f, -0.0650f, -0.1110f, 0.0123f,  -0.1122f, -0.0246f,
-      -0.2042f, 0.0411f,  -0.2771f, -0.0189f, 0.0927f,  0.0286f,  -0.1559f,
-      -0.3217f, -0.1039f, 0.1471f,  0.2489f,  0.2085f,  -0.4199f, -0.2404f,
-      0.0358f,  -0.7567f, -0.2413f, -0.3437f, -0.2433f, -0.3687f, -0.1194f,
-      -0.4289f, -0.1138f, -0.0721f, -0.3461f, -0.0244f, -0.3530f, -0.2842f,
-      -0.3823f, -0.1238f, -0.5475f, -0.2688f, -0.0073f, 0.0491f,  -0.4500f,
-      0.0201f,  0.0303f,  -0.2160f, -0.4219f, -0.4831f, -0.4593f, -0.2304f,
-      -0.2082f, -0.0367f, -0.5226f, -0.0082f, -0.1867f, -0.1812f, -0.2753f,
-      2.6650f,  1.9698f,  -2.9425f, 1.2119f,  1.5000f,  0.3356f,  0.3905f,
-      -0.2006f, -1.4038f, -1.0917f, 0.1423f,  -0.3528f, 0.0888f,  0.5802f,
-      1.0977f,  0.1083f,  -0.0693f, -0.0784f, 0.4247f,  0.4108f,  0.4970f,
-      -0.7290f, -0.1659f, -0.0517f, 0.0776f,  -0.0550f, -0.2374f, -0.4245f,
-      -0.0165f, -0.6804f, -0.3211f, -0.3101f, -0.1883f, -0.0786f, -0.3971f,
-      -0.4130f, -0.0606f, 0.1432f,  -0.0518f, -0.4179f, -0.4949f, -0.3451f,
-      -0.7559f, -4.0792f, 1.5526f,  0.2824f,  0.6086f,  -0.2148f, 0.0959f,
-      0.0506f,  -5.5176f, -3.9702f, 0.1597f,  -0.1760f, -0.0627f, 0.1657f,
-      -1.2996f, -0.2899f, -0.0600f, -0.0531f, -1.5160f, -0.4837f, -1.6961f,
-      -0.1134f, -0.1838f, -0.3071f, -0.4215f, -0.4184f, 0.0192f,  -0.2128f,
-      -0.3094f, -0.2607f, -0.4855f, -0.1881f, 0.0258f,  -0.5085f, -0.3630f,
-      -0.4824f, -0.3762f, -0.3324f, -0.1134f, -0.3350f, 0.0217f,  -0.2803f,
-      -0.5669f, -0.5674f, -0.5441f, -0.5965f, -0.3062f, -0.4666f, -0.4079f,
-      -0.0065f, -0.7566f, -0.3437f, -0.2474f, -0.2360f, -0.5683f, -0.3853f,
-      -0.6670f, -0.4158f, -0.2831f, -0.3327f, -0.7419f, -0.6481f, -0.4004f,
-      -0.4025f, -0.6405f, -0.4265f, -0.0167f, 0.3195f,  -0.0822f, -0.4350f,
-      -0.0032f, -1.0448f, -0.4407f, 0.0488f,  0.0776f,  -0.3828f, -0.3380f,
-      -0.2983f, -0.2220f, -0.4105f, -0.2312f, -0.4166f, -0.3258f, -0.1424f,
-      -0.6588f, -0.9433f, 0.3402f,  0.5800f,  0.6368f,  -0.4298f, -0.5743f,
-      0.0822f,  -1.0843f, -0.1645f, -0.1990f, 0.0255f,  -0.1039f, -0.3673f,
-      0.4367f,  -0.5491f, -0.0932f, -0.0323f, -0.2405f, -0.2922f, -0.4019f,
-      -0.4936f, -1.2338f, 0.4681f,  0.7454f,  0.8181f,  -0.3680f, -0.1613f,
-      -0.0008f, -1.3326f, -0.0667f, 0.1569f,  -0.0978f, -0.3229f, -0.4222f,
-      0.0330f,  0.1064f,  -0.1325f, 0.0121f,  -0.3976f, -0.2254f, -0.3942f,
-      -0.4771f, -0.1887f, 0.1020f,  0.3331f,  0.3098f,  -0.1256f, -0.4736f,
-      0.0295f,  -0.3919f, -0.0931f, -0.2484f, -0.4629f, -0.2800f, -0.2851f,
-      -0.2243f, -0.3958f, -0.3053f, -0.6585f, -0.1159f, -0.2330f, -0.1989f,
-      0.2273f,  0.1963f,  0.0283f,  0.0198f,  -0.1298f, -0.0627f, -0.2753f,
-      -0.1552f, 0.2734f,  -0.0551f, -0.2927f, -0.3772f, -0.4522f, -0.0786f,
-      0.0079f,  0.1664f,  -0.0228f, -0.2908f, -0.1714f, 0.1223f,  -0.0680f,
-      -0.5048f, -0.0852f, -0.4653f, -0.5142f, -0.1818f, -0.1659f, 0.0678f,
-      -0.1296f, 0.0295f,  -0.3487f, -0.1224f, -0.2690f, -0.3217f, -0.1957f,
-      -0.3196f, -0.4530f, -0.1746f, -0.2307f, -0.0504f, -0.0131f, -0.4613f,
-      -0.1476f, -0.5596f, -0.3829f, -0.4302f, -0.2910f, -0.2182f, -0.0811f,
-      -0.3967f, -0.3912f, -0.0371f, -0.1109f, -0.0793f, -0.2063f, -0.0060f,
-      -0.0236f, -0.4098f, -0.0276f, -0.3352f, -0.1888f, -0.2439f, -0.3748f,
-      0.0371f,  0.8460f,  -0.5547f, -1.2680f, -1.1623f, -0.1740f, -0.4815f,
-      -0.0294f, 4.4764f,  0.3716f,  -0.2826f, -0.0549f, -0.2937f, 0.0632f,
-      0.0686f,  -0.4681f, -0.2555f, -0.2427f, -0.2261f, -0.1567f, -0.5199f,
-      -0.4079f, -0.0801f, -0.2075f, -0.3956f, -0.0307f, -0.3150f, -0.3490f,
-      -0.0379f, 0.3060f,  -0.1775f, -0.1651f, 0.0677f,  -0.1947f, 0.0032f,
-      -0.2014f, -0.1575f, -0.1289f, -0.0250f, -0.0762f, -0.2324f, -0.2895f,
-      -0.4531f, -0.4601f, -0.1718f, -0.3139f, -0.4350f, 0.0346f,  -0.0891f,
-      -0.1581f, 0.2123f,  -0.1074f, 0.0221f,  0.0951f,  0.1161f,  0.0245f,
-      -0.0701f, -0.1677f, -0.4170f, -0.2214f, -0.3419f, -0.4873f, -0.0701f,
-      -0.0613f, -0.1031f, 0.0141f,  -0.1299f, -0.3953f, -0.2182f, -0.2679f,
-      -0.0141f, 0.3392f,  -0.0722f, -0.2390f, 0.1638f,  -0.1596f, -0.1527f,
-      -0.3581f, -0.4037f, -0.0736f, 0.0397f,  -0.1288f, -0.1362f, -0.0249f,
-      -0.5099f, -0.4040f, -0.1893f, -0.0298f, -0.1332f, -0.1693f, -0.3301f,
-      -0.1058f, -0.1414f, -0.5737f, -0.2342f, -0.2560f, -0.3834f, -0.0917f,
-      -0.1334f, -0.5077f, -0.3666f, -0.2515f, -0.4824f, -0.4714f, -0.5723f,
-      -0.1361f, -0.5244f, -0.2468f, 0.0237f,  -0.1862f, -0.3124f, -0.0183f,
-      -0.4662f, -0.4444f, -0.5400f, -0.1730f, -0.0123f, -0.2134f, -0.1024f,
-      -0.0172f, -0.4430f, -0.1403f, -0.0751f, -0.2403f, -0.2100f, -0.0678f,
-      2.4232f,  1.9825f,  0.1260f,  1.9972f,  2.8061f,  0.3916f,  0.1842f,
-      -0.2603f, -1.6092f, -1.6037f, 0.1475f,  0.0516f,  -0.2593f, 0.0359f,
-      -0.1802f, 0.0159f,  -0.0529f, -0.0983f, 0.7638f,  0.5529f,  0.9662f,
-      -0.4049f, -0.6372f, 0.4907f,  0.7360f,  0.9271f,  -0.6879f, -0.1067f,
-      0.0323f,  -1.8447f, 0.2176f,  -0.1047f, -0.0048f, -0.1031f, -0.7931f,
-      -0.3059f, -0.4595f, -0.1287f, -0.4031f, 0.1441f,  -0.6651f, 0.2530f,
-      -0.4572f, -0.0614f, 0.0345f,  -0.0008f, 0.0333f,  -0.3431f, 0.0538f,
-      -0.2691f, 0.2930f,  -0.0820f, -0.0979f, -0.0307f, 0.1713f,  0.0783f,
-      -0.4337f, -0.2702f, -0.1677f, -0.1719f, -0.4669f, -0.2847f, -0.4495f,
-      -0.3692f, -0.2641f, -0.2833f, -0.1168f, -0.0523f, -0.2368f, -0.4922f,
-      -0.3453f, -0.4452f, -0.5212f, 0.0412f,  -0.3310f, -0.2656f, -0.4903f,
-      -0.3854f, -0.1009f, -0.1038f, -0.2350f, -0.4430f, -0.5097f, -0.1755f,
-      0.0110f,  -0.0712f, -0.0662f, -0.4493f, -0.2111f, -0.3402f, -0.3100f,
-      -0.2525f, -0.1856f, -0.2689f, -0.4288f, -0.3912f, -0.0754f, -0.5191f,
-      -0.0747f, -0.0626f, -0.4821f, -0.2014f, -0.3124f, -0.4858f, -0.1896f,
-      1.0673f,  -0.8529f, 13.7564f, 18.7299f, 19.0062f, -1.1047f, -0.8654f,
-      0.1089f,  -1.2958f, -0.7793f, 0.0780f,  -0.1679f, 0.0054f,  -1.2451f,
-      -0.1287f, 0.0082f,  -0.2960f, -0.0442f, 2.3817f,  0.4716f,  1.3862f,
-      -0.0782f, -0.1871f, -0.2596f, 0.0093f,  0.1451f,  -0.1124f, -0.2315f,
-      -0.2677f, -0.1086f, 0.2216f,  0.2928f,  0.0391f,  0.0372f,  -0.2551f,
-      0.0552f,  -0.1876f, -0.2361f, -0.1889f, -0.0279f, 0.1204f,  0.2016f,
-      -0.5787f, -0.5830f, 0.0530f,  -0.1452f, -0.4899f, -0.2937f, 0.1430f,
-      -0.2752f, -0.2320f, -0.1908f, -0.5538f, -0.0858f, -0.1378f, -0.1505f,
-      -0.3908f, -0.4732f, -0.3018f, 0.0244f,  -0.2392f, -0.2833f, -0.3997f,
-      -0.4495f, -0.2570f, -0.3189f, -0.1534f, -0.1040f, -0.5497f, -0.3524f,
-      -0.2053f, 0.2415f,  -0.5027f, 0.0288f,  -0.1904f, -0.2183f, -0.1062f,
-      -0.3560f, 0.0165f,  -0.4601f, -0.2144f, -0.0439f, -0.4913f, -0.3160f,
-      -0.1641f, 0.1010f,  -0.1044f, -0.4064f, -0.3580f, -0.4015f, 0.1010f,
-      -0.1973f, 0.6392f,  -0.5177f, -0.0472f, -0.1526f, 0.1533f,  -0.0819f,
-      -0.0252f, -0.0783f, 0.1301f,  0.0158f,  -0.2003f, -0.4700f, -0.2329f,
-    };
-
-static const float
-    av1_use_flat_gop_nn_biases_layer0[NUM_HIDDEN_NODES_LAYER0] = {
-      -1.113218f, 0.f,        -0.268537f, -0.268537f, 0.f,        -0.268534f,
-      -0.40681f,  -0.268537f, -0.061835f, -0.614956f, 0.984277f,  -0.280228f,
-      -0.354716f, -0.202312f, -0.772829f, -0.464005f, -0.230795f, 0.f,
-      -0.124187f, -0.265949f, 0.325168f,  -0.359008f, -2.455546f, -0.229222f,
-      -0.692233f, -0.29401f,  -0.632682f, -0.479061f, -0.166094f, 0.077291f,
-      -0.235293f, -0.268537f, 0.167899f,  -0.141991f, -0.210089f, -0.177294f,
-      -0.325401f, -0.268537f, 0.323627f,  -0.156593f, -0.218451f, -0.230792f,
-      -0.268537f, 0.833177f,  0.f,        -0.353177f, -0.260953f, -0.209537f,
-    };
-
-static const float
-    av1_use_flat_gop_nn_weights_layer1[NUM_HIDDEN_NODES_LAYER0 * NUM_LABELS] = {
-      -0.024695f, 0.146668f,  -0.02723f,  0.034577f,  -0.255426f, 0.22402f,
-      -0.112595f, -0.131262f, 0.091164f,  -0.045294f, 0.028304f,  -0.051683f,
-      0.310497f,  -0.077786f, -0.047873f, -0.057205f, -0.065119f, 0.227417f,
-      -0.051126f, -0.137241f, 0.035742f,  -0.058992f, -0.021466f, 0.107947f,
-      -0.077183f, -0.04144f,  0.003568f,  -0.027656f, 0.038196f,  0.19684f,
-      -0.128401f, 0.149629f,  0.024526f,  0.037376f,  0.090752f,  -0.061666f,
-      -0.15743f,  0.057773f,  -0.010582f, 0.120997f,  0.060368f,  0.210028f,
-      -0.192244f, -0.064764f, -0.237655f, 0.1852f,    -0.084281f, -0.010434f,
-    };
-
-static const float av1_use_flat_gop_nn_biases_layer1[NUM_LABELS] = {
-  -0.672434f,
-};
-
-static const NN_CONFIG av1_use_flat_gop_nn_config = {
-  NUM_FEATURES,
-  NUM_LABELS,
-  NUM_HIDDEN_LAYERS,
-  {
-      NUM_HIDDEN_NODES_LAYER0,
-  },
-  {
-      av1_use_flat_gop_nn_weights_layer0,
-      av1_use_flat_gop_nn_weights_layer1,
-  },
-  {
-      av1_use_flat_gop_nn_biases_layer0,
-      av1_use_flat_gop_nn_biases_layer1,
-  },
-};
-
-#undef NUM_FEATURES
-#undef NUM_HIDDEN_LAYERS
-#undef NUM_HIDDEN_NODES_LAYER0
-#undef NUM_LABELS
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
diff --git a/media/libaom/src/av1/encoder/var_based_part.c b/media/libaom/src/av1/encoder/var_based_part.c
index e3cb1fa8f6..b63ee03b06 100644
--- a/media/libaom/src/av1/encoder/var_based_part.c
+++ b/media/libaom/src/av1/encoder/var_based_part.c
@@ -22,7 +22,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/reconinter.h"
 #include "av1/common/blockd.h"
@@ -33,6 +32,17 @@
 
 extern const uint8_t AV1_VAR_OFFS[];
 
+// Possible values for the force_split variable while evaluating variance based
+// partitioning.
+enum {
+  // Evaluate all partition types
+  PART_EVAL_ALL = 0,
+  // Force PARTITION_SPLIT
+  PART_EVAL_ONLY_SPLIT = 1,
+  // Force PARTITION_NONE
+  PART_EVAL_ONLY_NONE = 2
+} UENUM1BYTE(PART_EVAL_STATUS);
+
 typedef struct {
   VPVariance *part_variances;
   VPartVar *split[4];
@@ -123,32 +133,56 @@ static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
                   &node.part_variances->none);
 }
 
-static AOM_INLINE void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
-                                      MACROBLOCKD *const xd, int mi_row,
+static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row,
                                       int mi_col, BLOCK_SIZE bsize) {
   if (cpi->common.mi_params.mi_cols > mi_col &&
       cpi->common.mi_params.mi_rows > mi_row) {
-    set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
-                          mi_row, mi_col);
-    xd->mi[0]->sb_type = bsize;
+    CommonModeInfoParams *mi_params = &cpi->common.mi_params;
+    const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+    const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+    MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] =
+        &mi_params->mi_alloc[mi_alloc_idx];
+    mi->bsize = bsize;
   }
 }
 
-static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
-                               MACROBLOCKD *const xd,
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd,
                                const TileInfo *const tile, void *data,
                                BLOCK_SIZE bsize, int mi_row, int mi_col,
                                int64_t threshold, BLOCK_SIZE bsize_min,
-                               int force_split) {
+                               PART_EVAL_STATUS force_split) {
   AV1_COMMON *const cm = &cpi->common;
   variance_node vt;
   const int block_width = mi_size_wide[bsize];
   const int block_height = mi_size_high[bsize];
+  int bs_width_check = block_width;
+  int bs_height_check = block_height;
+  int bs_width_vert_check = block_width >> 1;
+  int bs_height_horiz_check = block_height >> 1;
+  // On the right and bottom boundary we only need to check
+  // if half the bsize fits, because boundary is extended
+  // up to 64. So do this check only for sb_size = 64X64.
+  if (cm->seq_params->sb_size == BLOCK_64X64) {
+    if (tile->mi_col_end == cm->mi_params.mi_cols) {
+      bs_width_check = (block_width >> 1) + 1;
+      bs_width_vert_check = (block_width >> 2) + 1;
+    }
+    if (tile->mi_row_end == cm->mi_params.mi_rows) {
+      bs_height_check = (block_height >> 1) + 1;
+      bs_height_horiz_check = (block_height >> 2) + 1;
+    }
+  }
 
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  if (force_split == 1) return 0;
+  if (mi_col + bs_width_check <= tile->mi_col_end &&
+      mi_row + bs_height_check <= tile->mi_row_end &&
+      force_split == PART_EVAL_ONLY_NONE) {
+    set_block_size(cpi, mi_row, mi_col, bsize);
+    return 1;
+  }
+  if (force_split == PART_EVAL_ONLY_SPLIT) return 0;
 
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
@@ -156,10 +190,10 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
     if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
-    if (mi_col + block_width <= tile->mi_col_end &&
-        mi_row + block_height <= tile->mi_row_end &&
+    if (mi_col + bs_width_check <= tile->mi_col_end &&
+        mi_row + bs_height_check <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      set_block_size(cpi, mi_row, mi_col, bsize);
       return 1;
     }
     return 0;
@@ -173,15 +207,15 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
       return 0;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width <= tile->mi_col_end &&
-        mi_row + block_height <= tile->mi_row_end &&
+    if (mi_col + bs_width_check <= tile->mi_col_end &&
+        mi_row + bs_height_check <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      set_block_size(cpi, mi_row, mi_col, bsize);
       return 1;
     }
     // Check vertical split.
-    if (mi_row + block_height <= tile->mi_row_end &&
-        mi_col + block_width / 2 <= tile->mi_col_end) {
+    if (mi_row + bs_height_check <= tile->mi_row_end &&
+        mi_col + bs_width_vert_check <= tile->mi_col_end) {
       BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
       get_variance(&vt.part_variances->vert[0]);
       get_variance(&vt.part_variances->vert[1]);
@@ -189,14 +223,14 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
           vt.part_variances->vert[1].variance < threshold &&
           get_plane_block_size(subsize, xd->plane[1].subsampling_x,
                                xd->plane[1].subsampling_y) < BLOCK_INVALID) {
-        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        set_block_size(cpi, mi_row, mi_col, subsize);
+        set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
         return 1;
       }
     }
     // Check horizontal split.
-    if (mi_col + block_width <= tile->mi_col_end &&
-        mi_row + block_height / 2 <= tile->mi_row_end) {
+    if (mi_col + bs_width_check <= tile->mi_col_end &&
+        mi_row + bs_height_horiz_check <= tile->mi_row_end) {
       BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
       get_variance(&vt.part_variances->horz[0]);
       get_variance(&vt.part_variances->horz[1]);
@@ -204,8 +238,8 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
           vt.part_variances->horz[1].variance < threshold &&
           get_plane_block_size(subsize, xd->plane[1].subsampling_x,
                                xd->plane[1].subsampling_y) < BLOCK_INVALID) {
-        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        set_block_size(cpi, mi_row, mi_col, subsize);
+        set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
         return 1;
       }
     }
@@ -214,43 +248,98 @@ static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
   return 0;
 }
 
-static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp,
-                                            const uint8_t *d, int dp,
-                                            int x16_idx, int y16_idx,
-                                            VP16x16 *vst,
+static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide,
+                                      int pixels_high) {
+  int all_inside = 1;
+  for (int k = 0; k < 4; k++) {
+    all_inside &= ((x16_idx + ((k & 1) << 3)) < pixels_wide);
+    all_inside &= ((y16_idx + ((k >> 1) << 3)) < pixels_high);
+  }
+  return all_inside;
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
-                                            int highbd_flag,
-#endif
-                                            int pixels_wide, int pixels_high,
-                                            int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd
+static AOM_INLINE void fill_variance_8x8avg_highbd(
+    const uint8_t *s, int sp, const uint8_t *d, int dp, int x16_idx,
+    int y16_idx, VP16x16 *vst, int pixels_wide, int pixels_high,
+    int is_key_frame) {
+  for (int k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
     unsigned int sse = 0;
     int sum = 0;
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
       int s_avg;
       int d_avg = 128;
-#if CONFIG_AV1_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      } else {
-        s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      }
-#else
-      s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-      if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
+      s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      if (!is_key_frame)
+        d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+
       sum = s_avg - d_avg;
       sse = sum * sum;
     }
     fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
   }
 }
+#endif
+
+static AOM_INLINE void fill_variance_8x8avg_lowbd(const uint8_t *s, int sp,
+                                                  const uint8_t *d, int dp,
+                                                  int x16_idx, int y16_idx,
+                                                  VP16x16 *vst, int pixels_wide,
+                                                  int pixels_high,
+                                                  int is_key_frame) {
+  unsigned int sse[4] = { 0 };
+  int sum[4] = { 0 };
+  int d_avg[4] = { 128, 128, 128, 128 };
+  int s_avg[4];
+
+  if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) {
+    aom_avg_8x8_quad(s, sp, x16_idx, y16_idx, s_avg);
+    if (!is_key_frame) aom_avg_8x8_quad(d, dp, x16_idx, y16_idx, d_avg);
+    for (int k = 0; k < 4; k++) {
+      sum[k] = s_avg[k] - d_avg[k];
+      sse[k] = sum[k] * sum[k];
+    }
+  } else {
+    for (int k = 0; k < 4; k++) {
+      const int x8_idx = x16_idx + ((k & 1) << 3);
+      const int y8_idx = y16_idx + ((k >> 1) << 3);
+      if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+        s_avg[k] = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame) d_avg[k] = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        sum[k] = s_avg[k] - d_avg[k];
+        sse[k] = sum[k] * sum[k];
+      }
+    }
+  }
+
+  for (int k = 0; k < 4; k++) {
+    fill_variance(sse[k], sum[k], 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// Obtain parameters required to calculate variance (such as sum, sse, etc,.)
+// at 8x8 sub-block level for a given 16x16 block.
+static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp,
+                                            const uint8_t *d, int dp,
+                                            int x16_idx, int y16_idx,
+                                            VP16x16 *vst, int highbd_flag,
+                                            int pixels_wide, int pixels_high,
+                                            int is_key_frame) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd_flag) {
+    fill_variance_8x8avg_highbd(s, sp, d, dp, x16_idx, y16_idx, vst,
+                                pixels_wide, pixels_high, is_key_frame);
+    return;
+  }
+#else
+  (void)highbd_flag;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  fill_variance_8x8avg_lowbd(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide,
+                             pixels_high, is_key_frame);
+}
 
 static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
                               int dp, int x16_idx, int y16_idx,
@@ -326,82 +415,187 @@ static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp,
 }
 
 // TODO(kyslov) Bring back threshold adjustment based on content state
-static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
                                          int width, int height,
-                                         int content_state) {
+                                         int non_reference_frame) {
   (void)width;
   (void)height;
-  (void)content_state;
+  int64_t threshold = threshold_base;
+  if (non_reference_frame) threshold = (3 * threshold) >> 1;
   if (speed >= 8) {
-    return (5 * threshold_base) >> 2;
+    return (5 * threshold) >> 2;
   }
-  return threshold_base;
+  return threshold;
 }
 
-// Set the variance split thresholds for following the block sizes:
-// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
-// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
-// currently only used on key frame.
 static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
-                                          int q, int content_state) {
+                                          int q, int content_lowsumdiff,
+                                          int source_sad_nonrd,
+                                          int source_sad_rd, int segment_id) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
-  const int threshold_multiplier = is_key_frame ? 40 : 1;
-  int64_t threshold_base =
-      (int64_t)(threshold_multiplier *
-                cpi->enc_quant_dequant_params.dequants.y_dequant_QTX[q][1]);
+  const int threshold_multiplier = is_key_frame ? 120 : 1;
+  const int ac_q = av1_ac_quant_QTX(q, 0, cm->seq_params->bit_depth);
+  int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q);
+  const int current_qindex = cm->quant_params.base_qindex;
+  const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift;
 
   if (is_key_frame) {
+    if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+      const int shift_steps =
+          threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8);
+      assert(shift_steps >= 0);
+      threshold_base <<= shift_steps;
+    }
     thresholds[0] = threshold_base;
     thresholds[1] = threshold_base;
-    thresholds[2] = threshold_base >> 2;
-    thresholds[3] = threshold_base >> 2;
+    if (cm->width * cm->height < 1280 * 720) {
+      thresholds[2] = threshold_base / 3;
+      thresholds[3] = threshold_base >> 1;
+    } else {
+      int shift_val = 2;
+      if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+        shift_val = 0;
+      }
+
+      thresholds[2] = threshold_base >> shift_val;
+      thresholds[3] = threshold_base >> shift_val;
+    }
     thresholds[4] = threshold_base << 2;
-  } else {
-    // Increase base variance threshold based on content_state/sum_diff level.
-    threshold_base = scale_part_thresh_sumdiff(
-        threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+    return;
+  }
 
-    thresholds[0] = threshold_base >> 1;
-    thresholds[1] = threshold_base;
-    thresholds[3] = threshold_base << cpi->oxcf.speed;
-    if (cm->width >= 1280 && cm->height >= 720)
-      thresholds[3] = thresholds[3] << 1;
+  // Increase partition thresholds for noisy content. Apply it only for
+  // superblocks where sumdiff is low, as we assume the sumdiff of superblock
+  // whose only change is due to noise will be low (i.e, noise will average
+  // out over large block).
+  if (cpi->noise_estimate.enabled && content_lowsumdiff &&
+      (cm->width * cm->height > 640 * 480) &&
+      cm->current_frame.frame_number > 60) {
+    NOISE_LEVEL noise_level =
+        av1_noise_estimate_extract_level(&cpi->noise_estimate);
+    if (noise_level == kHigh)
+      threshold_base = (5 * threshold_base) >> 1;
+    else if (noise_level == kMedium &&
+             !cpi->sf.rt_sf.force_large_partition_blocks)
+      threshold_base = (5 * threshold_base) >> 2;
+  }
+  // TODO(kyslov) Enable var based partition adjusment on temporal denoising
+#if 0  // CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+      threshold_base =
+          av1_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level,
+                                content_state, cpi->svc.temporal_layer_id);
+  else
+    threshold_base =
+        scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
+                                  cm->height, cpi->svc.non_reference_frame);
+#else
+  // Increase base variance threshold based on content_state/sum_diff level.
+  threshold_base =
+      scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
+                                cm->height, cpi->svc.non_reference_frame);
+#endif
+  thresholds[0] = threshold_base >> 1;
+  thresholds[1] = threshold_base;
+  thresholds[3] = threshold_base << threshold_left_shift;
+  if (cm->width >= 1280 && cm->height >= 720)
+    thresholds[3] = thresholds[3] << 1;
+  if (cm->width * cm->height <= 352 * 288) {
+    const int qindex_thr[5][2] = {
+      { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 },
+    };
+    int th_idx = 0;
+    if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1)
+      th_idx =
+          (source_sad_rd <= kLowSad) ? cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+    if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+      th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+    const int qindex_low_thr = qindex_thr[th_idx][0];
+    const int qindex_high_thr = qindex_thr[th_idx][1];
+    if (current_qindex >= qindex_high_thr) {
+      threshold_base = (5 * threshold_base) >> 1;
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base << 2;
+      thresholds[3] = threshold_base << 5;
+    } else if (current_qindex < qindex_low_thr) {
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base >> 1;
+      thresholds[3] = threshold_base << 3;
+    } else {
+      int64_t qi_diff_low = current_qindex - qindex_low_thr;
+      int64_t qi_diff_high = qindex_high_thr - current_qindex;
+      int64_t threshold_diff = qindex_high_thr - qindex_low_thr;
+      int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+      threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+      threshold_base =
+          (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) /
+          threshold_diff;
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = ((qi_diff_low * threshold_base) +
+                       qi_diff_high * (threshold_base >> 1)) /
+                      threshold_diff;
+      thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+                       qi_diff_high * (threshold_base << 3)) /
+                      threshold_diff;
+    }
+  } else if (cm->width < 1280 && cm->height < 720) {
+    thresholds[2] = (5 * threshold_base) >> 2;
+  } else if (cm->width < 1920 && cm->height < 1080) {
+    thresholds[2] = threshold_base << 1;
+  } else {
+    thresholds[2] = (5 * threshold_base) >> 1;
+  }
+  if (cpi->sf.rt_sf.force_large_partition_blocks) {
+    double weight;
+    const int win = 20;
+    if (current_qindex < QINDEX_LARGE_BLOCK_THR - win)
+      weight = 1.0;
+    else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win)
+      weight = 0.0;
+    else
+      weight =
+          1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win);
+    if (cm->width * cm->height > 640 * 480) {
+      for (int i = 0; i < 4; i++) {
+        thresholds[i] <<= 1;
+      }
+    }
     if (cm->width * cm->height <= 352 * 288) {
-      int last_qindex = cpi->rc.last_q[INTER_FRAME];
-      if (last_qindex >= QINDEX_HIGH_THR) {
-        threshold_base = (5 * threshold_base) >> 1;
-        thresholds[1] = threshold_base >> 3;
-        thresholds[2] = threshold_base << 2;
-        thresholds[3] = threshold_base << 5;
-      } else if (last_qindex < QINDEX_LOW_THR) {
-        thresholds[1] = threshold_base >> 3;
-        thresholds[2] = threshold_base >> 1;
-        thresholds[3] = threshold_base << 3;
+      thresholds[3] = INT32_MAX;
+      if (segment_id == 0) {
+        thresholds[1] <<= 2;
+        thresholds[2] <<= (source_sad_nonrd == kLowSad) ? 5 : 4;
       } else {
-        int64_t qi_diff_low = last_qindex - QINDEX_LOW_THR;
-        int64_t qi_diff_high = QINDEX_HIGH_THR - last_qindex;
-        int64_t threshold_diff = QINDEX_HIGH_THR - QINDEX_LOW_THR;
-        int64_t threshold_base_high = (5 * threshold_base) >> 1;
-
-        threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
-        threshold_base = (qi_diff_low * threshold_base_high +
-                          qi_diff_high * threshold_base) /
-                         threshold_diff;
-        thresholds[1] = threshold_base >> 3;
-        thresholds[2] = ((qi_diff_low * threshold_base) +
-                         qi_diff_high * (threshold_base >> 1)) /
-                        threshold_diff;
-        thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
-                         qi_diff_high * (threshold_base << 3)) /
-                        threshold_diff;
+        thresholds[1] <<= 1;
+        thresholds[2] <<= 3;
       }
-    } else if (cm->width < 1280 && cm->height < 720) {
-      thresholds[2] = (5 * threshold_base) >> 2;
-    } else if (cm->width < 1920 && cm->height < 1080) {
-      thresholds[2] = threshold_base << 1;
-    } else {
-      thresholds[2] = (5 * threshold_base) >> 1;
+      // Condition the increase of partition thresholds on the segment
+      // and the content. Avoid the increase for superblocks which have
+      // high source sad, unless the whole frame has very high motion
+      // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks
+      // have high source sad).
+    } else if (cm->width * cm->height > 640 * 480 && segment_id == 0 &&
+               (source_sad_nonrd != kHighSad ||
+                cpi->rc.avg_source_sad > 50000)) {
+      thresholds[0] = (3 * thresholds[0]) >> 1;
+      thresholds[3] = INT32_MAX;
+      if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
+        thresholds[1] =
+            (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
+        thresholds[2] =
+            (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
+      }
+    } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0 &&
+               (source_sad_nonrd != kHighSad ||
+                cpi->rc.avg_source_sad > 50000)) {
+      thresholds[1] =
+          (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
+      thresholds[2] =
+          (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]);
+      thresholds[3] = INT32_MAX;
     }
   }
 }
@@ -409,20 +603,21 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
 // Set temporal variance low flag for superblock 64x64.
 // Only first 25 in the array are used in this case.
 static AOM_INLINE void set_low_temp_var_flag_64x64(
-    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
-    VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
-  if (xd->mi[0]->sb_type == BLOCK_64X64) {
+    CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+    MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col,
+    int mi_row) {
+  if (xd->mi[0]->bsize == BLOCK_64X64) {
     if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
-      x->variance_low[0] = 1;
-  } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+      part_info->variance_low[0] = 1;
+  } else if (xd->mi[0]->bsize == BLOCK_64X32) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 1] = 1;
+        part_info->variance_low[i + 1] = 1;
     }
-  } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+  } else if (xd->mi[0]->bsize == BLOCK_32X64) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 3] = 1;
+        part_info->variance_low[i + 3] = 1;
     }
   } else {
     static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
@@ -437,20 +632,20 @@ static AOM_INLINE void set_low_temp_var_flag_64x64(
 
       if (*this_mi == NULL) continue;
 
-      if ((*this_mi)->sb_type == BLOCK_32X32) {
+      if ((*this_mi)->bsize == BLOCK_32X32) {
         int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
         if (vt->split[i].part_variances.none.variance < threshold_32x32)
-          x->variance_low[i + 5] = 1;
+          part_info->variance_low[i + 5] = 1;
       } else {
         // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
         // inside.
-        if ((*this_mi)->sb_type == BLOCK_16X16 ||
-            (*this_mi)->sb_type == BLOCK_32X16 ||
-            (*this_mi)->sb_type == BLOCK_16X32) {
+        if ((*this_mi)->bsize == BLOCK_16X16 ||
+            (*this_mi)->bsize == BLOCK_32X16 ||
+            (*this_mi)->bsize == BLOCK_16X32) {
           for (int j = 0; j < 4; j++) {
             if (vt->split[i].split[j].part_variances.none.variance <
                 (thresholds[2] >> 8))
-              x->variance_low[(i << 2) + j + 9] = 1;
+              part_info->variance_low[(i << 2) + j + 9] = 1;
           }
         }
       }
@@ -459,20 +654,21 @@ static AOM_INLINE void set_low_temp_var_flag_64x64(
 }
 
 static AOM_INLINE void set_low_temp_var_flag_128x128(
-    CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd,
-    VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) {
-  if (xd->mi[0]->sb_type == BLOCK_128X128) {
+    CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+    MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col,
+    int mi_row) {
+  if (xd->mi[0]->bsize == BLOCK_128X128) {
     if (vt->part_variances.none.variance < (thresholds[0] >> 1))
-      x->variance_low[0] = 1;
-  } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
+      part_info->variance_low[0] = 1;
+  } else if (xd->mi[0]->bsize == BLOCK_128X64) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 1] = 1;
+        part_info->variance_low[i + 1] = 1;
     }
-  } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
+  } else if (xd->mi[0]->bsize == BLOCK_64X128) {
     for (int i = 0; i < 2; i++) {
       if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
-        x->variance_low[i + 3] = 1;
+        part_info->variance_low[i + 3] = 1;
     }
   } else {
     static const int idx64[4][2] = {
@@ -488,19 +684,19 @@ static AOM_INLINE void set_low_temp_var_flag_128x128(
           mi_params->mi_rows <= mi_row + idx64[i][0])
         continue;
       const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
-      if ((*mi_64)->sb_type == BLOCK_64X64) {
+      if ((*mi_64)->bsize == BLOCK_64X64) {
         if (vt->split[i].part_variances.none.variance < threshold_64x64)
-          x->variance_low[5 + i] = 1;
-      } else if ((*mi_64)->sb_type == BLOCK_64X32) {
+          part_info->variance_low[5 + i] = 1;
+      } else if ((*mi_64)->bsize == BLOCK_64X32) {
         for (int j = 0; j < 2; j++)
           if (vt->split[i].part_variances.horz[j].variance <
               (threshold_64x64 >> 1))
-            x->variance_low[9 + (i << 1) + j] = 1;
-      } else if ((*mi_64)->sb_type == BLOCK_32X64) {
+            part_info->variance_low[9 + (i << 1) + j] = 1;
+      } else if ((*mi_64)->bsize == BLOCK_32X64) {
         for (int j = 0; j < 2; j++)
           if (vt->split[i].part_variances.vert[j].variance <
               (threshold_64x64 >> 1))
-            x->variance_low[17 + (i << 1) + j] = 1;
+            part_info->variance_low[17 + (i << 1) + j] = 1;
       } else {
         for (int k = 0; k < 4; k++) {
           const int idx_str1 = mi_params->mi_stride * idx32[k][0] + idx32[k][1];
@@ -511,22 +707,22 @@ static AOM_INLINE void set_low_temp_var_flag_128x128(
               mi_params->mi_rows <= mi_row + idx64[i][0] + idx32[k][0])
             continue;
           const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
-          if ((*mi_32)->sb_type == BLOCK_32X32) {
+          if ((*mi_32)->bsize == BLOCK_32X32) {
             if (vt->split[i].split[k].part_variances.none.variance <
                 threshold_32x32)
-              x->variance_low[25 + (i << 2) + k] = 1;
+              part_info->variance_low[25 + (i << 2) + k] = 1;
           } else {
             // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
             // inside.
-            if ((*mi_32)->sb_type == BLOCK_16X16 ||
-                (*mi_32)->sb_type == BLOCK_32X16 ||
-                (*mi_32)->sb_type == BLOCK_16X32) {
+            if ((*mi_32)->bsize == BLOCK_16X16 ||
+                (*mi_32)->bsize == BLOCK_32X16 ||
+                (*mi_32)->bsize == BLOCK_16X32) {
               for (int j = 0; j < 4; j++) {
                 if (vt->split[i]
                         .split[k]
                         .split[j]
                         .part_variances.none.variance < (thresholds[3] >> 8))
-                  x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
+                  part_info->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
               }
             }
           }
@@ -537,39 +733,159 @@ static AOM_INLINE void set_low_temp_var_flag_128x128(
 }
 
 static AOM_INLINE void set_low_temp_var_flag(
-    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt,
-    int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col,
-    int mi_row) {
+    AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+    VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+    int mi_col, int mi_row) {
   AV1_COMMON *const cm = &cpi->common;
-  const int mv_thr = cm->width > 640 ? 8 : 4;
-  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
-  // int_pro mv is small. If the temporal variance is small set the flag
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected.
+  // If the temporal variance is small set the flag
   // variance_low for the block. The variance threshold can be adjusted, the
   // higher the more aggressive.
-  if (ref_frame_partition == LAST_FRAME &&
-      (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 ||
-       (cpi->sf.rt_sf.estimate_motion_for_var_based_partition &&
-        xd->mi[0]->mv[0].as_mv.col < mv_thr &&
-        xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
-        xd->mi[0]->mv[0].as_mv.row < mv_thr &&
-        xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
-    const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  if (ref_frame_partition == LAST_FRAME) {
+    const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
     if (is_small_sb)
-      set_low_temp_var_flag_64x64(&cm->mi_params, x, xd, &(vt->split[0]),
-                                  thresholds, mi_col, mi_row);
+      set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
+                                  &(vt->split[0]), thresholds, mi_col, mi_row);
     else
-      set_low_temp_var_flag_128x128(&cm->mi_params, x, xd, vt, thresholds,
-                                    mi_col, mi_row);
+      set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+                                    thresholds, mi_col, mi_row);
   }
 }
 
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize) {
+  // Relative indices of MB inside the superblock.
+  const int mi_x = mi_row & 0xF;
+  const int mi_y = mi_col & 0xF;
+  // Relative indices of 16x16 block inside the superblock.
+  const int i = mi_x >> 2;
+  const int j = mi_y >> 2;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_64X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[1];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[2];
+      }
+      break;
+    case BLOCK_32X64:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[3];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[4];
+      }
+      break;
+    case BLOCK_32X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[5];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[6];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[7];
+      } else if (mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[8];
+      }
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+      break;
+    default: break;
+  }
+
+  return force_skip_low_temp_var;
+}
+
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+                                    int mi_col, BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  int x, y;
+  x = (mi_col & 0x1F) >> 4;
+  // y = (mi_row & 0x1F) >> 4;
+  // const int idx64 = (y << 1) + x;
+  y = (mi_row & 0x17) >> 3;
+  const int idx64 = y + x;
+
+  x = (mi_col & 0xF) >> 3;
+  // y = (mi_row & 0xF) >> 3;
+  // const int idx32 = (y << 1) + x;
+  y = (mi_row & 0xB) >> 2;
+  const int idx32 = y + x;
+
+  x = (mi_col & 0x7) >> 2;
+  // y = (mi_row & 0x7) >> 2;
+  // const int idx16 = (y << 1) + x;
+  y = (mi_row & 0x5) >> 1;
+  const int idx16 = y + x;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_128X64:
+      assert((mi_col & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+      break;
+    case BLOCK_64X128:
+      assert((mi_row & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+      break;
+    case BLOCK_64X64:
+      // Location of this 64x64 block inside the 128x128 superblock
+      force_skip_low_temp_var = variance_low[5 + idx64];
+      break;
+    case BLOCK_64X32:
+      x = (mi_col & 0x1F) >> 4;
+      y = (mi_row & 0x1F) >> 3;
+      /*
+      .---------------.---------------.
+      | x=0,y=0,idx=0 | x=0,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
+    case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
-                                           int content_state) {
+                                           int content_lowsumdiff) {
   SPEED_FEATURES *const sf = &cpi->sf;
   if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
     return;
   } else {
-    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_state);
+    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0,
+                       0, 0);
     // The threshold below is not changed locally.
     cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
   }
@@ -581,7 +897,7 @@ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  if (is_key_frame || cpi->oxcf.monochrome) return;
+  if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
 
   for (i = 1; i <= 2; ++i) {
     unsigned int uv_sad = UINT_MAX;
@@ -591,208 +907,67 @@ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
     if (bs != BLOCK_INVALID)
-      uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
-                                   pd->dst.stride);
-
-    x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+      uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
+                                        pd->dst.stride);
+
+    if (uv_sad > (y_sad >> 1))
+      x->color_sensitivity_sb[i - 1] = 1;
+    else if (uv_sad < (y_sad >> 3))
+      x->color_sensitivity_sb[i - 1] = 0;
+    // Borderline case: to be refined at coding block level in nonrd_pickmode,
+    // for coding block size < sb_size.
+    else
+      x->color_sensitivity_sb[i - 1] = 2;
   }
 }
 
-// This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
-// selection and most of all - retune the thresholds
-int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                      ThreadData *td, MACROBLOCK *x, int mi_row,
-                                      int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
+static void fill_variance_tree_leaves(
+    AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, VP16x16 *vt2,
+    PART_EVAL_STATUS *force_split, int avg_16x16[][4], int maxvar_16x16[][4],
+    int minvar_16x16[][4], int *variance4x4downsample, int64_t *thresholds,
+    uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride) {
+  AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
-
-  int i, j, k, m;
-  VP128x128 *vt;
-  VP16x16 *vt2 = NULL;
-  unsigned char force_split[85];
-  int avg_32x32;
-  int max_var_32x32[4];
-  int min_var_32x32[4];
-  int var_32x32;
-  int var_64x64;
-  int min_var_64x64 = INT_MAX;
-  int max_var_64x64 = 0;
-  int avg_16x16[4][4];
-  int maxvar_16x16[4][4];
-  int minvar_16x16[4][4];
-  int64_t threshold_4x4avg;
-  int content_state = 0;
-  uint8_t *s;
-  const uint8_t *d;
-  int sp;
-  int dp;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
   // TODO(kyslov) Bring back compute_minmax_variance with content type detection
-  int compute_minmax_variance = 0;
-  int is_key_frame = frame_is_intra_only(cm);
+  const int compute_minmax_variance = 0;
+  const int segment_id = xd->mi[0]->segment_id;
   int pixels_wide = 128, pixels_high = 128;
-  assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-         cm->seq_params.sb_size == BLOCK_128X128);
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
-  const int num_64x64_blocks = is_small_sb ? 1 : 4;
-
-  unsigned int y_sad = UINT_MAX;
-  unsigned int y_sad_g = UINT_MAX;
-  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
-
-  // Ref frame used in partitioning.
-  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
-
-  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
-
-  vt->split = td->vt64x64;
-
-  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
-                            vbp_thresholds[2], vbp_thresholds[3],
-                            vbp_thresholds[4] };
-
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
-  int variance4x4downsample[64];
-  int segment_id;
-  const int num_planes = av1_num_planes(cm);
-
-  segment_id = xd->mi[0]->segment_id;
-
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-      cyclic_refresh_segment_id_boosted(segment_id) &&
-      cpi->sf.rt_sf.use_nonrd_pick_mode) {
-    int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
-    set_vbp_thresholds(cpi, thresholds, q, content_state);
-  } else {
-    set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
-                       content_state);
-  }
 
   if (is_small_sb) {
     pixels_wide = 64;
     pixels_high = 64;
   }
-
-  // For non keyframes, disable 4x4 average for low resolution when speed = 8
-  threshold_4x4avg = INT64_MAX;
-
   if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
-
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
-
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
-  memset(x->variance_low, 0, sizeof(x->variance_low));
-
-  if (!is_key_frame) {
-    // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
-    // is!!
-    MB_MODE_INFO *mi = xd->mi[0];
-    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
-    assert(yv12 != NULL);
-    const YV12_BUFFER_CONFIG *yv12_g = NULL;
-
-    // For non-SVC GOLDEN is another temporal reference. Check if it should be
-    // used as reference for partitioning.
-    if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
-        cpi->sf.rt_sf.use_nonrd_pick_mode) {
-      yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-      if (yv12_g && yv12_g != yv12) {
-        av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
-                             get_ref_scale_factors(cm, GOLDEN_FRAME),
-                             num_planes);
-        y_sad_g = cpi->fn_ptr[bsize].sdf(
-            x->plane[0].src.buf, x->plane[0].src.stride,
-            xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-      }
-    }
-
-    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
-                         get_ref_scale_factors(cm, LAST_FRAME), num_planes);
-    mi->ref_frame[0] = LAST_FRAME;
-    mi->ref_frame[1] = NONE_FRAME;
-    mi->sb_type = cm->seq_params.sb_size;
-    mi->mv[0].as_int = 0;
-    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-    if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
-      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-        const MV dummy_mv = { 0, 0 };
-        y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
-                                              mi_row, mi_col, &dummy_mv);
-      }
-    }
-    if (y_sad == UINT_MAX) {
-      y_sad = cpi->fn_ptr[bsize].sdf(
-          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
-          xd->plane[0].pre[0].stride);
-    }
-
-    // Pick the ref frame for partitioning, use golden frame only if its
-    // lower sad.
-    if (y_sad_g < 0.9 * y_sad) {
-      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
-                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
-      mi->ref_frame[0] = GOLDEN_FRAME;
-      mi->mv[0].as_int = 0;
-      y_sad = y_sad_g;
-      ref_frame_partition = GOLDEN_FRAME;
-      x->nonrd_prune_ref_frame_search = 0;
-    } else {
-      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
-      ref_frame_partition = LAST_FRAME;
-      x->nonrd_prune_ref_frame_search =
-          cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
-    }
-
-    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
-    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
-                                  cm->seq_params.sb_size, AOM_PLANE_Y,
-                                  AOM_PLANE_Y);
-
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
-  } else {
-    d = AV1_VAR_OFFS;
-    dp = 0;
-  }
-
-  if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
-  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
-  // for splits.
-  for (m = 0; m < num_64x64_blocks; m++) {
+  for (int m = 0; m < num_64x64_blocks; m++) {
     const int x64_idx = ((m & 1) << 6);
     const int y64_idx = ((m >> 1) << 6);
     const int m2 = m << 2;
-    force_split[m + 1] = 0;
-    max_var_32x32[m] = 0;
-    min_var_32x32[m] = INT_MAX;
-    for (i = 0; i < 4; i++) {
+    force_split[m + 1] = PART_EVAL_ALL;
+
+    for (int i = 0; i < 4; i++) {
       const int x32_idx = x64_idx + ((i & 1) << 5);
       const int y32_idx = y64_idx + ((i >> 1) << 5);
       const int i2 = (m2 + i) << 2;
-      force_split[5 + m2 + i] = 0;
+      force_split[5 + m2 + i] = PART_EVAL_ALL;
       avg_16x16[m][i] = 0;
       maxvar_16x16[m][i] = 0;
       minvar_16x16[m][i] = INT_MAX;
-      for (j = 0; j < 4; j++) {
+      for (int j = 0; j < 4; j++) {
         const int x16_idx = x32_idx + ((j & 1) << 4);
         const int y16_idx = y32_idx + ((j >> 1) << 4);
         const int split_index = 21 + i2 + j;
         VP16x16 *vst = &vt->split[m].split[i].split[j];
-        force_split[split_index] = 0;
+        force_split[split_index] = PART_EVAL_ALL;
         variance4x4downsample[i2 + j] = 0;
         if (!is_key_frame) {
-          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
-#if CONFIG_AV1_HIGHBITDEPTH
-                               xd->cur_buf->flags,
-#endif
-                               pixels_wide, pixels_high, is_key_frame);
+          fill_variance_8x8avg(src, src_stride, dst, dst_stride, x16_idx,
+                               y16_idx, vst, is_cur_buf_hbd(xd), pixels_wide,
+                               pixels_high, is_key_frame);
+
           fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16);
           get_variance(&vt->split[m].split[i].split[j].part_variances.none);
           avg_16x16[m][i] +=
@@ -810,42 +985,44 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
             // 16X16 variance is above threshold for split, so force split to
             // 8x8 for this 16x16 block (this also forces splits for upper
             // levels).
-            force_split[split_index] = 1;
-            force_split[5 + m2 + i] = 1;
-            force_split[m + 1] = 1;
-            force_split[0] = 1;
-          } else if (compute_minmax_variance &&
+            force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+            force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
+            force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
+            force_split[0] = PART_EVAL_ONLY_SPLIT;
+          } else if (!cyclic_refresh_segment_id_boosted(segment_id) &&
+                     compute_minmax_variance &&
                      vt->split[m]
                              .split[i]
                              .split[j]
-                             .part_variances.none.variance > thresholds[2] &&
-                     !cyclic_refresh_segment_id_boosted(segment_id)) {
+                             .part_variances.none.variance > thresholds[2]) {
             // We have some nominal amount of 16x16 variance (based on average),
             // compute the minmax over the 8x8 sub-blocks, and if above
             // threshold, force split to 8x8 block for this 16x16 block.
-            int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+            int minmax = compute_minmax_8x8(src, src_stride, dst, dst_stride,
+                                            x16_idx, y16_idx,
 #if CONFIG_AV1_HIGHBITDEPTH
                                             xd->cur_buf->flags,
 #endif
                                             pixels_wide, pixels_high);
             int thresh_minmax = (int)cpi->vbp_info.threshold_minmax;
             if (minmax > thresh_minmax) {
-              force_split[split_index] = 1;
-              force_split[5 + m2 + i] = 1;
-              force_split[m + 1] = 1;
-              force_split[0] = 1;
+              force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+              force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
+              force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
+              force_split[0] = PART_EVAL_ONLY_SPLIT;
             }
           }
         }
         if (is_key_frame) {
-          force_split[split_index] = 0;
+          force_split[split_index] = PART_EVAL_ALL;
           // Go down to 4x4 down-sampling for variance.
           variance4x4downsample[i2 + j] = 1;
-          for (k = 0; k < 4; k++) {
+          for (int k = 0; k < 4; k++) {
             int x8_idx = x16_idx + ((k & 1) << 3);
             int y8_idx = y16_idx + ((k >> 1) << 3);
             VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
-            fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+            fill_variance_4x4avg(src, src_stride, dst, dst_stride, x8_idx,
+                                 y8_idx, vst2,
 #if CONFIG_AV1_HIGHBITDEPTH
                                  xd->cur_buf->flags,
 #endif
@@ -855,10 +1032,265 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
       }
     }
   }
+}
+
+static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
+                         unsigned int *y_sad_g,
+                         MV_REFERENCE_FRAME *ref_frame_partition, int mi_row,
+                         int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int num_planes = av1_num_planes(cm);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+  // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
+  // is!!
+  MB_MODE_INFO *mi = xd->mi[0];
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  assert(yv12 != NULL);
+  const YV12_BUFFER_CONFIG *yv12_g = NULL;
+
+  // For non-SVC GOLDEN is another temporal reference. Check if it should be
+  // used as reference for partitioning.
+  if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG)) {
+    yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+    if (yv12_g && yv12_g != yv12) {
+      av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+      *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
+    }
+  }
+
+  av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+  mi->bsize = cm->seq_params->sb_size;
+  mi->mv[0].as_int = 0;
+  mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+  if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
+    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+      const MV dummy_mv = { 0, 0 };
+      *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
+                                             mi_row, mi_col, &dummy_mv);
+    }
+  }
+  if (*y_sad == UINT_MAX) {
+    *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+        x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+        xd->plane[0].pre[0].stride);
+  }
+
+  // Pick the ref frame for partitioning, use golden frame only if its
+  // lower sad.
+  if (*y_sad_g < 0.9 * *y_sad) {
+    av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                         get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+    mi->ref_frame[0] = GOLDEN_FRAME;
+    mi->mv[0].as_int = 0;
+    *y_sad = *y_sad_g;
+    *ref_frame_partition = GOLDEN_FRAME;
+    x->nonrd_prune_ref_frame_search = 0;
+  } else {
+    *ref_frame_partition = LAST_FRAME;
+    x->nonrd_prune_ref_frame_search =
+        cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+  }
+
+  // Only calculate the predictor for non-zero MV.
+  if (mi->mv[0].as_int != 0) {
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+                                  cm->seq_params->sb_size, AOM_PLANE_Y,
+                                  AOM_PLANE_Y);
+  }
+}
+
+// Decides whether to split or merge a 16x16 partition block in variance based
+// partitioning based on the 8x8 sub-block variances.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+    VP16x16 *var_16x16_info, int64_t threshold16) {
+  int max_8x8_var = 0, min_8x8_var = INT_MAX;
+  for (int k = 0; k < 4; k++) {
+    get_variance(&var_16x16_info->split[k].part_variances.none);
+    int this_8x8_var = var_16x16_info->split[k].part_variances.none.variance;
+    max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var);
+    min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var);
+  }
+  // If the difference between maximum and minimum sub-block variances is high,
+  // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate
+  // only PARTITION_NONE. The shift factor for threshold16 has been derived
+  // empirically.
+  return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+             ? PART_EVAL_ONLY_SPLIT
+             : PART_EVAL_ONLY_NONE;
+}
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, choose_var_based_partitioning_time);
+#endif
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
 
-  // Fill the rest of the variance tree by summing split partition values.
+  int i, j, k, m;
+  VP128x128 *vt;
+  VP16x16 *vt2 = NULL;
+  PART_EVAL_STATUS force_split[85];
+  int avg_64x64;
+  int max_var_32x32[4];
+  int min_var_32x32[4];
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4][4];
+  int maxvar_16x16[4][4];
+  int minvar_16x16[4][4];
+  int64_t threshold_4x4avg;
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  NOISE_LEVEL noise_level = kLow;
+
+  int is_key_frame =
+      (frame_is_intra_only(cm) ||
+       (cpi->ppi->use_svc &&
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+  assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+         cm->seq_params->sb_size == BLOCK_128X128);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+  unsigned int y_sad = UINT_MAX;
+  unsigned int y_sad_g = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+  CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
+
+  vt->split = td->vt64x64;
+
+  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+                            vbp_thresholds[2], vbp_thresholds[3],
+                            vbp_thresholds[4] };
+
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[64];
+  const int segment_id = xd->mi[0]->segment_id;
+
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id)) {
+    const int q =
+        av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+    set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
+                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_rd, 1);
+  } else {
+    set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
+                       x->content_state_sb.low_sumdiff,
+                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_rd, 0);
+  }
+
+  // For non keyframes, disable 4x4 average for low resolution when speed = 8
+  threshold_4x4avg = INT64_MAX;
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = PART_EVAL_ALL;
+  memset(x->part_search_info.variance_low, 0,
+         sizeof(x->part_search_info.variance_low));
+
+  // Check if LAST frame is NULL or if the resolution of LAST is
+  // different than the current frame resolution, and if so, treat this frame
+  // as a key frame, for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dyanmically and the only reference is the spatial(GOLDEN).
+  // TODO(marpan): Check se of scaled references for the different resoln.
+  if (!frame_is_intra_only(cm)) {
+    const YV12_BUFFER_CONFIG *const ref =
+        get_ref_frame_yv12_buf(cm, LAST_FRAME);
+    if (ref == NULL || ref->y_crop_height != cm->height ||
+        ref->y_crop_width != cm->width) {
+      is_key_frame = 1;
+    }
+  }
+
+  if (!is_key_frame) {
+    setup_planes(cpi, x, &y_sad, &y_sad_g, &ref_frame_partition, mi_row,
+                 mi_col);
+
+    MB_MODE_INFO *mi = xd->mi[0];
+    // Use reference SB directly for zero mv.
+    if (mi->mv[0].as_int != 0) {
+      d = xd->plane[0].dst.buf;
+      dp = xd->plane[0].dst.stride;
+    } else {
+      d = xd->plane[0].pre[0].buf;
+      dp = xd->plane[0].pre[0].stride;
+    }
+  } else {
+    d = AV1_VAR_OFFS;
+    dp = 0;
+  }
+
+  x->force_zeromv_skip = 0;
+  const unsigned int thresh_exit_part =
+      (cm->seq_params->sb_size == BLOCK_64X64) ? 5000 : 10000;
+  // If the superblock is completely static (zero source sad) and
+  // the y_sad (relative to LAST ref) is very small, take the sb_size partition
+  // and exit, and force zeromv_last skip mode for nonrd_pickmode.
+  // Only do this when the cyclic refresh is applied, and only on the base
+  // segment (so the QP-boosted segment can still contnue cleaning/ramping
+  // up the quality).
+  // TODO(marpan): Check color component for setting this skip.
+  if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->cyclic_refresh->apply_cyclic_refresh &&
+      segment_id == CR_SEGMENT_ID_BASE &&
+      x->content_state_sb.source_sad_nonrd == kZeroSad &&
+      ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0 &&
+      y_sad < thresh_exit_part) {
+    const int block_width = mi_size_wide[cm->seq_params->sb_size];
+    const int block_height = mi_size_high[cm->seq_params->sb_size];
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height <= tile->mi_row_end) {
+      set_block_size(cpi, mi_row, mi_col, bsize);
+      x->force_zeromv_skip = 1;
+      if (vt2) aom_free(vt2);
+      if (vt) aom_free(vt);
+      return 0;
+    }
+  }
+
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+
+  if (low_res && threshold_4x4avg < INT64_MAX)
+    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  fill_variance_tree_leaves(cpi, x, vt, vt2, force_split, avg_16x16,
+                            maxvar_16x16, minvar_16x16, variance4x4downsample,
+                            thresholds, s, sp, d, dp);
+
+  avg_64x64 = 0;
   for (m = 0; m < num_64x64_blocks; ++m) {
-    avg_32x32 = 0;
+    max_var_32x32[m] = 0;
+    min_var_32x32[m] = INT_MAX;
     const int m2 = m << 2;
     for (i = 0; i < 4; i++) {
       const int i2 = (m2 + i) << 2;
@@ -874,10 +1306,13 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
           // to split. This also forces a split on the upper levels.
           get_variance(&vtemp->part_variances.none);
           if (vtemp->part_variances.none.variance > thresholds[3]) {
-            force_split[split_index] = 1;
-            force_split[5 + m2 + i] = 1;
-            force_split[m + 1] = 1;
-            force_split[0] = 1;
+            force_split[split_index] =
+                cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+                    ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+                    : PART_EVAL_ONLY_SPLIT;
+            force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
+            force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
+            force_split[0] = PART_EVAL_ONLY_SPLIT;
           }
         }
       }
@@ -886,7 +1321,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
       // (some threshold of) the average variance over the sub-16x16 blocks,
       // then force this block to split. This also forces a split on the upper
       // (64x64) level.
-      if (!force_split[5 + m2 + i]) {
+      if (force_split[5 + m2 + i] == PART_EVAL_ALL) {
         get_variance(&vt->split[m].split[i].part_variances.none);
         var_32x32 = vt->split[m].split[i].part_variances.none.variance;
         max_var_32x32[m] = AOMMAX(var_32x32, max_var_32x32[m]);
@@ -898,49 +1333,63 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                  (thresholds[2] >> 1) &&
              vt->split[m].split[i].part_variances.none.variance >
                  (avg_16x16[m][i] >> 1))) {
-          force_split[5 + m2 + i] = 1;
-          force_split[m + 1] = 1;
-          force_split[0] = 1;
-        } else if (!is_key_frame && cm->height <= 360 &&
-                   (maxvar_16x16[m][i] - minvar_16x16[m][i]) >
-                       (thresholds[2] >> 1) &&
-                   maxvar_16x16[m][i] > thresholds[2]) {
-          force_split[5 + m2 + i] = 1;
-          force_split[m + 1] = 1;
-          force_split[0] = 1;
+          force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
+          force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        } else if (!is_key_frame && (cm->width * cm->height <= 640 * 360) &&
+                   (((maxvar_16x16[m][i] - minvar_16x16[m][i]) >
+                         (thresholds[2] >> 1) &&
+                     maxvar_16x16[m][i] > thresholds[2]) ||
+                    (cpi->sf.rt_sf.force_large_partition_blocks &&
+                     x->content_state_sb.source_sad_nonrd > kLowSad &&
+                     cpi->rc.frame_source_sad < 20000 &&
+                     maxvar_16x16[m][i] > (thresholds[2] >> 4) &&
+                     maxvar_16x16[m][i] > (minvar_16x16[m][i] << 2)))) {
+          force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
+          force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
         }
-        avg_32x32 += var_32x32;
       }
     }
-    if (!force_split[1 + m]) {
+    if (force_split[1 + m] == PART_EVAL_ALL) {
       fill_variance_tree(&vt->split[m], BLOCK_64X64);
       get_variance(&vt->split[m].part_variances.none);
       var_64x64 = vt->split[m].part_variances.none.variance;
       max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
       min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
-      // If variance of this 64x64 block is above (some threshold of) the
-      // average variance over the sub-32x32 blocks, then force this block to
-      // split. Only checking this for noise level >= medium for now.
+      // If the difference of the max-min variances of sub-blocks or max
+      // variance of a sub-block is above some threshold of then force this
+      // block to split. Only checking this for noise level >= medium, if
+      // encoder is in SVC or if we already forced large blocks.
 
       if (!is_key_frame &&
           (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) &&
-          max_var_32x32[m] > thresholds[1] >> 1)
-        force_split[1 + m] = 1;
+          max_var_32x32[m] > thresholds[1] >> 1 &&
+          (noise_level >= kMedium || cpi->ppi->use_svc ||
+           cpi->sf.rt_sf.force_large_partition_blocks)) {
+        force_split[1 + m] = PART_EVAL_ONLY_SPLIT;
+        force_split[0] = PART_EVAL_ONLY_SPLIT;
+      }
+      avg_64x64 += var_64x64;
     }
-    if (is_small_sb) force_split[0] = 1;
+    if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT;
   }
 
-  if (!force_split[0]) {
+  if (force_split[0] == PART_EVAL_ALL) {
     fill_variance_tree(vt, BLOCK_128X128);
     get_variance(&vt->part_variances.none);
     if (!is_key_frame &&
+        vt->part_variances.none.variance > (9 * avg_64x64) >> 5)
+      force_split[0] = PART_EVAL_ONLY_SPLIT;
+
+    if (!is_key_frame &&
         (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
         max_var_64x64 > thresholds[0] >> 1)
-      force_split[0] = 1;
+      force_split[0] = PART_EVAL_ONLY_SPLIT;
   }
 
   if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
-      !set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+      !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
                            thresholds[0], BLOCK_16X16, force_split[0])) {
     for (m = 0; m < num_64x64_blocks; ++m) {
       const int x64_idx = ((m & 1) << 4);
@@ -949,7 +1398,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
 
       // Now go through the entire structure, splitting every block size until
       // we get to one that's got a variance lower than our threshold.
-      if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64,
+      if (!set_vt_partitioning(cpi, xd, tile, &vt->split[m], BLOCK_64X64,
                                mi_row + y64_idx, mi_col + x64_idx,
                                thresholds[1], BLOCK_16X16,
                                force_split[1 + m])) {
@@ -957,7 +1406,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
           const int x32_idx = ((i & 1) << 3);
           const int y32_idx = ((i >> 1) << 3);
           const int i2 = (m2 + i) << 2;
-          if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i],
+          if (!set_vt_partitioning(cpi, xd, tile, &vt->split[m].split[i],
                                    BLOCK_32X32, (mi_row + y64_idx + y32_idx),
                                    (mi_col + x64_idx + x32_idx), thresholds[2],
                                    BLOCK_16X16, force_split[5 + m2 + i])) {
@@ -972,7 +1421,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                   (!is_key_frame && variance4x4downsample[i2 + j] == 1)
                       ? &vt2[i2 + j]
                       : &vt->split[m].split[i].split[j];
-              if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16,
+              if (!set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16,
                                        mi_row + y64_idx + y32_idx + y16_idx,
                                        mi_col + x64_idx + x32_idx + x16_idx,
                                        thresholds[3], BLOCK_8X8,
@@ -981,8 +1430,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                   const int x8_idx = (k & 1) << 1;
                   const int y8_idx = (k >> 1) << 1;
                   set_block_size(
-                      cpi, x, xd,
-                      (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+                      cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
                       (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
                       BLOCK_8X8);
                 }
@@ -995,12 +1443,15 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
   }
 
   if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
-    set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
-                          mi_col, mi_row);
+    set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds,
+                          ref_frame_partition, mi_col, mi_row);
   }
   chroma_check(cpi, x, bsize, y_sad, is_key_frame);
 
   if (vt2) aom_free(vt2);
   if (vt) aom_free(vt);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, choose_var_based_partitioning_time);
+#endif
   return 0;
 }
diff --git a/media/libaom/src/av1/encoder/var_based_part.h b/media/libaom/src/av1/encoder/var_based_part.h
index a80e25c329..5176751342 100644
--- a/media/libaom/src/av1/encoder/var_based_part.h
+++ b/media/libaom/src/av1/encoder/var_based_part.h
@@ -24,20 +24,73 @@
 extern "C" {
 #endif
 
-#define QINDEX_LOW_THR \
-  200  // Use low qindex variance partition thresholds when qindex is below this
-       // threshold
-#define QINDEX_HIGH_THR \
-  220  // Use high qindex variance partition thresholds when qindex is above
+#define QINDEX_LARGE_BLOCK_THR \
+  100  // Use increased thresholds for midres for speed 9 when qindex is above
        // this threshold
 
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for following the block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
+ * currently only used on key frame. The thresholds are based om Q, resolution,
+ * noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]      cpi                Top level encoder structure
+ * \param[in]      q                  q index
+ * \param[in]      content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \return Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
-                                           int content_state);
+                                           int content_lowsumdiff);
 
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * residual generated as the difference between the source and prediction.
+ * The prediction is the reconstructed LAST or reconstructed GOLDEN, whichever
+ * has lower y sad. For LAST, option exists (speed feature) to use motion
+ * compensation based on superblock motion via int_pro_motion_estimation. For
+ * key frames reference is fixed 128 level, so variance is the source variance.
+ * The variance is computed for downsampled inputs (8x8 or 4x4 downsampled),
+ * and selection is done top-down via as set of partition thresholds. defined
+ * for each block level, and set based on Q, resolution, noise level, and
+ * content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       tile         Pointer to TileInfo
+ * \param[in]       td           Pointer to ThreadData
+ * \param[in]       x            Pointer to MACROBLOCK
+ * \param[in]       mi_row       Row coordinate of the superblock in a step
+ size of MI_SIZE
+ * \param[in]       mi_col       Column coordinate of the super block in a step
+ size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
+ */
 int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                                       ThreadData *td, MACROBLOCK *x, int mi_row,
                                       int mi_col);
 
+// Read out the block's temporal variance for 64x64 SB case.
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize);
+// Read out the block's temporal variance for 128x128 SB case.
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+                                    int mi_col, BLOCK_SIZE bsize);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index 62eaa30747..494b0fdf15 100644
--- a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -408,22 +408,18 @@ void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
   (void)stage_range;
   for (col = 0; col < col_num; col++) {
     // stage 0;
-    int32_t stage_idx = 0;
     int j;
     for (j = 0; j < 4; ++j) {
       buf0[j] = input[j * col_num + col];
     }
 
     // stage 1
-    stage_idx++;
     buf1[0] = buf0[3];
     buf1[1] = buf0[0];
     buf1[2] = buf0[1];
     buf1[3] = buf0[2];
 
     // stage 2
-    stage_idx++;
-
     cospi = cospi_arr(cos_bit);
     btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
                         cos_bit);
@@ -431,15 +427,12 @@ void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
                         buf0[3], cos_bit);
 
     // stage 3
-    stage_idx++;
     buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
     buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
     buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
     buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
 
     // stage 4
-    stage_idx++;
-
     cospi = cospi_arr(cos_bit);
     buf0[0] = buf1[0];
     buf0[1] = buf1[1];
@@ -447,7 +440,6 @@ void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
                         buf0[3], cos_bit);
 
     // stage 5
-    stage_idx++;
     buf1[0] = buf0[0];
     buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
     buf1[2] = buf0[3];
diff --git a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 634d50bb22..fa5c66abf5 100644
--- a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1704,8 +1704,8 @@ static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
     }
     fdct64_new_avx2(bufA, bufA, cos_bit_row);
     fdct64_new_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
-    av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+    round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+    round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
@@ -1843,8 +1843,8 @@ static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
     }
     fdct64_new_avx2(bufA, bufA, cos_bit_row);
     fdct64_new_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
-    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
@@ -1893,8 +1893,8 @@ static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
     }
     fdct32_avx2(bufA, bufA, cos_bit_row);
     fdct32_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
-    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
@@ -2804,8 +2804,7 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
                              int diff_stride, TxfmParam *txfm_param) {
   FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
-  if ((fwd_txfm2d_func == NULL) ||
-      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
     av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
   } else {
     fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
diff --git a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index 0bc3fbc2d6..db554c4a50 100644
--- a/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -354,8 +354,7 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TxfmParam *txfm_param) {
   FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
-  if ((fwd_txfm2d_func == NULL) ||
-      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
     av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
   } else {
     fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
diff --git a/media/libaom/src/av1/encoder/x86/av1_k_means_avx2.c b/media/libaom/src/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000000..23a7369e99
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_calc_indices_dim1_avx2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  __m256i dist[PALETTE_MAX_SIZE];
+  const __m256i v_zero = _mm256_setzero_si256();
+
+  for (int i = 0; i < n; i += 8) {
+    __m256i ind = _mm256_loadu_si256((__m256i *)data);
+    for (int j = 0; j < k; j++) {
+      __m256i cent = _mm256_set1_epi32((uint32_t)centroids[j]);
+      __m256i d1 = _mm256_sub_epi32(ind, cent);
+      dist[j] = _mm256_mullo_epi32(d1, d1);
+    }
+
+    ind = _mm256_setzero_si256();
+    for (int j = 1; j < k; j++) {
+      __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
+      __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
+      __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
+      dist[0] = _mm256_or_si256(dist1, dist2);
+      __m256i ind1 = _mm256_set1_epi32(j);
+      ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+                            _mm256_and_si256(cmp, ind1));
+    }
+
+    __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+    __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+    __m256i p2 = _mm256_packus_epi16(px, v_zero);
+    __m128i d1 = _mm256_extracti128_si256(p2, 0);
+
+    _mm_storel_epi64((__m128i *)indices, d1);
+
+    indices += 8;
+    data += 8;
+  }
+}
+
+void av1_calc_indices_dim2_avx2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  __m256i dist[PALETTE_MAX_SIZE];
+  const __m256i v_zero = _mm256_setzero_si256();
+  const __m256i v_permute = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
+
+  for (int i = 0; i < n; i += 8) {
+    __m256i ind1 = _mm256_loadu_si256((__m256i *)data);
+    __m256i ind2 = _mm256_loadu_si256((__m256i *)(data + 8));
+    for (int j = 0; j < k; j++) {
+      __m128i cent0 = _mm_loadl_epi64((__m128i const *)&centroids[2 * j]);
+      __m256i cent1 = _mm256_inserti128_si256(v_zero, cent0, 0);
+      cent1 = _mm256_inserti128_si256(cent1, cent0, 1);
+      __m256i cent = _mm256_unpacklo_epi64(cent1, cent1);
+      __m256i d1 = _mm256_sub_epi32(ind1, cent);
+      __m256i d2 = _mm256_sub_epi32(ind2, cent);
+      __m256i d3 = _mm256_mullo_epi32(d1, d1);
+      __m256i d4 = _mm256_mullo_epi32(d2, d2);
+      __m256i d5 = _mm256_hadd_epi32(d3, d4);
+      dist[j] = _mm256_permutevar8x32_epi32(d5, v_permute);
+    }
+
+    __m256i ind = _mm256_setzero_si256();
+    for (int j = 1; j < k; j++) {
+      __m256i cmp = _mm256_cmpgt_epi32(dist[0], dist[j]);
+      __m256i dist1 = _mm256_andnot_si256(cmp, dist[0]);
+      __m256i dist2 = _mm256_and_si256(cmp, dist[j]);
+      dist[0] = _mm256_or_si256(dist1, dist2);
+      ind1 = _mm256_set1_epi32(j);
+      ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+                            _mm256_and_si256(cmp, ind1));
+    }
+
+    __m256i p1 = _mm256_packus_epi32(ind, v_zero);
+    __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+    __m256i p2 = _mm256_packus_epi16(px, v_zero);
+    __m128i d1 = _mm256_extracti128_si256(p2, 0);
+
+    _mm_storel_epi64((__m128i *)indices, d1);
+
+    indices += 8;
+    data += 16;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/av1_k_means_sse2.c b/media/libaom/src/av1/encoder/x86/av1_k_means_sse2.c
new file mode 100644
index 0000000000..43f661fdaf
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/av1_k_means_sse2.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_calc_indices_dim1_sse2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  int l = 1;
+  __m128i dist[PALETTE_MAX_SIZE];
+  __m128i ind[2];
+
+  for (int i = 0; i < n; i += 4) {
+    l = (l == 0) ? 1 : 0;
+    ind[l] = _mm_loadu_si128((__m128i *)data);
+    for (int j = 0; j < k; j++) {
+      __m128i cent = _mm_set1_epi32((uint32_t)centroids[j]);
+      __m128i d1 = _mm_sub_epi32(ind[l], cent);
+      __m128i d2 = _mm_packs_epi32(d1, d1);
+      __m128i d3 = _mm_mullo_epi16(d2, d2);
+      __m128i d4 = _mm_mulhi_epi16(d2, d2);
+      dist[j] = _mm_unpacklo_epi16(d3, d4);
+    }
+
+    ind[l] = _mm_setzero_si128();
+    for (int j = 1; j < k; j++) {
+      __m128i cmp = _mm_cmpgt_epi32(dist[0], dist[j]);
+      __m128i dist1 = _mm_andnot_si128(cmp, dist[0]);
+      __m128i dist2 = _mm_and_si128(cmp, dist[j]);
+      dist[0] = _mm_or_si128(dist1, dist2);
+      __m128i ind1 = _mm_set1_epi32(j);
+      ind[l] =
+          _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
+    }
+    ind[l] = _mm_packus_epi16(ind[l], v_zero);
+    if (l == 1) {
+      __m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
+      _mm_storel_epi64((__m128i *)indices, p2);
+      indices += 8;
+    }
+    data += 4;
+  }
+}
+
+void av1_calc_indices_dim2_sse2(const int *data, const int *centroids,
+                                uint8_t *indices, int n, int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  int l = 1;
+  __m128i dist[PALETTE_MAX_SIZE];
+  __m128i ind[2];
+
+  for (int i = 0; i < n; i += 4) {
+    l = (l == 0) ? 1 : 0;
+    __m128i ind1 = _mm_loadu_si128((__m128i *)data);
+    __m128i ind2 = _mm_loadu_si128((__m128i *)(data + 4));
+    __m128i indl = _mm_unpacklo_epi32(ind1, ind2);
+    __m128i indh = _mm_unpackhi_epi32(ind1, ind2);
+    ind1 = _mm_unpacklo_epi32(indl, indh);
+    ind2 = _mm_unpackhi_epi32(indl, indh);
+    for (int j = 0; j < k; j++) {
+      __m128i cent0 = _mm_set1_epi32(centroids[2 * j]);
+      __m128i cent1 = _mm_set1_epi32(centroids[2 * j + 1]);
+      __m128i d1 = _mm_sub_epi32(ind1, cent0);
+      __m128i d2 = _mm_sub_epi32(ind2, cent1);
+      __m128i d3 = _mm_madd_epi16(d1, d1);
+      __m128i d4 = _mm_madd_epi16(d2, d2);
+      dist[j] = _mm_add_epi32(d3, d4);
+    }
+
+    ind[l] = _mm_setzero_si128();
+    for (int j = 1; j < k; j++) {
+      __m128i cmp = _mm_cmpgt_epi32(dist[0], dist[j]);
+      __m128i dist1 = _mm_andnot_si128(cmp, dist[0]);
+      __m128i dist2 = _mm_and_si128(cmp, dist[j]);
+      dist[0] = _mm_or_si128(dist1, dist2);
+      ind1 = _mm_set1_epi32(j);
+      ind[l] =
+          _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1));
+    }
+    ind[l] = _mm_packus_epi16(ind[l], v_zero);
+    if (l == 1) {
+      __m128i p2 = _mm_packus_epi16(_mm_unpacklo_epi64(ind[0], ind[1]), v_zero);
+      _mm_storel_epi64((__m128i *)indices, p2);
+      indices += 8;
+    }
+    data += 8;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/av1_quantize_avx2.c b/media/libaom/src/av1/encoder/x86/av1_quantize_avx2.c
index f5f7ee115d..591edd7061 100644
--- a/media/libaom/src/av1/encoder/x86/av1_quantize_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/av1_quantize_avx2.c
@@ -154,22 +154,18 @@ static INLINE int16_t accumulate_eob(__m128i eob) {
   return _mm_extract_epi16(eob, 1);
 }
 
-static INLINE void store_zero_tran_low(int16_t *a) {
-  const __m256i zero = _mm256_setzero_si256();
-  _mm256_storeu_si256((__m256i *)(a), zero);
-}
-
 void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan) {
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
   __m128i eob;
   __m256i round256, quant256, dequant256;
-  __m256i eob256, thr256;
+  __m256i eob256;
 
   coeff_ptr += n_coeffs;
-  scan += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -205,7 +201,7 @@ void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
       _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
     }
 
-    eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256);
+    eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
     n_coeffs += 8 * 2;
   }
 
@@ -214,30 +210,22 @@ void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
   quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
   round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
 
-  thr256 = _mm256_srai_epi16(dequant256, 1);
-
   // AC only loop
   while (n_coeffs < 0) {
     __m256i coeff256 =
         _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
     __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
-    int32_t nzflag =
-        _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
 
-    if (nzflag) {
-      __m256i qtmp256;
-      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
-      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
-      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
-      _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
-      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
-      _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
-      eob256 = _mm256_max_epi16(
-          eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256));
-    } else {
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-    }
+    __m256i qtmp256;
+    qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+    qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+    qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+    _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
+    coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
+    eob256 = _mm256_max_epi16(
+        eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
+
     n_coeffs += 8 * 2;
   }
 
diff --git a/media/libaom/src/av1/encoder/x86/av1_quantize_sse2.c b/media/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
index 5497c7eb78..b533894015 100644
--- a/media/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
+++ b/media/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
@@ -15,6 +15,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m128i *c0, __m128i *c1) {
@@ -187,3 +188,102 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     *eob_ptr = _mm_extract_epi16(eob, 1);
   }
 }
+
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+                               const int16_t *coeff_ptr, intptr_t n_coeffs,
+                               int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                               const __m128i *round0, const __m128i *round1,
+                               const __m128i *quant0, const __m128i *quant1,
+                               const __m128i *dequant0, const __m128i *dequant1,
+                               __m128i *eob) {
+  const int16_t *read = coeff_ptr + n_coeffs;
+  __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+  __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+  const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+  const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+  // Reinsert signs
+  qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+  qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  int16_t *addr = qcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, qcoeff0);
+  _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+  coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+  coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+  addr = dqcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, coeff0);
+  _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+  const __m128i zero = _mm_setzero_si128();
+  // Scan for eob
+  const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+  const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+  const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+  const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+  const __m128i iscan0 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+  const __m128i iscan1 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+  // Add one to convert from indices to counts
+  const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+  const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+  const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+  const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+  const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+  *eob = _mm_max_epi16(*eob, eob2);
+}
+
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  __m128i eob = _mm_setzero_si128();
+
+  // DC and first 15 AC
+  quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+              &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+  n_coeffs += 8 * 2;
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+                &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+    n_coeffs += 8 * 2;
+  }
+
+  // Accumulate EOB
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/media/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm
index faa2a232a3..618758105a 100644
--- a/media/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm
+++ b/media/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -67,7 +67,7 @@ SECTION .text
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+globalsym(av1_ssim_parms_16x16_sse2)
 sym(av1_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -157,7 +157,7 @@ sym(av1_ssim_parms_16x16_sse2):
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+globalsym(av1_ssim_parms_8x8_sse2)
 sym(av1_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libaom/src/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/media/libaom/src/av1/encoder/x86/av1_temporal_denoiser_sse2.c
new file mode 100644
index 0000000000..830f40ecb0
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/av1_temporal_denoiser_sse2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba =
+      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba =
+      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+  return _mm_cvtsi128_si32(hgfedcba);
+}
+
+// Denoise a 16x1 vector.
+static INLINE __m128i av1_denoiser_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+    const __m128i *l21, __m128i acc_diff) {
+  // Calculate differences
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp absolute difference to 16 to be used to get mask. Doing this
+  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+  const __m128i clamped_absdiff =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+  // Get masks for l2 l1 and l0 adjustments.
+  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+  // Get adjustments for l2, l1, and l0.
+  __m128i adj2 = _mm_and_si128(mask2, *l32);
+  const __m128i adj1 = _mm_and_si128(mask1, *l21);
+  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+  __m128i adj, padj, nadj;
+
+  // Combine the adjustments and get absolute adjustments.
+  adj2 = _mm_add_epi8(adj2, adj1);
+  adj = _mm_sub_epi8(*l3, adj2);
+  adj = _mm_andnot_si128(mask0, adj);
+  adj = _mm_or_si128(adj, adj0);
+
+  // Restore the sign and get positive and negative adjustments.
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+
+  // Calculate filtered value.
+  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Adjustments <=7, and each element in acc_diff can fit in signed
+  // char.
+  acc_diff = _mm_adds_epi8(acc_diff, padj);
+  acc_diff = _mm_subs_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i av1_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+  // Calculate differences.
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+  // Clamp absolute difference to delta to get the adjustment.
+  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+  // Restore the sign and get positive and negative adjustments.
+  __m128i padj, nadj;
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+  // Calculate filtered value.
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the adjustments.
+  acc_diff = _mm_subs_epi8(acc_diff, padj);
+  acc_diff = _mm_adds_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+                                       const uint8_t *mc_running_avg_y,
+                                       int mc_avg_y_stride,
+                                       uint8_t *running_avg_y, int avg_y_stride,
+                                       int increase_denoising, BLOCK_SIZE bs,
+                                       int motion_magnitude, int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  const int b_height = block_size_high[bs] >> 1;
+
+  for (r = 0; r < b_height; ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+           width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+                                      running_buffer[r], &k_0, &k_4, &k_8,
+                                      &k_16, &l3, &l32, &l21, acc_diff);
+    memcpy(running_avg_y, running_buffer[r], width);
+    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  {
+    sum_diff = sum_diff_16x1(acc_diff);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
+          acc_diff = av1_denoiser_adj_16x1_sse2(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+              k_delta, acc_diff);
+          memcpy(running_avg_y, running_buffer[r], width);
+          memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+                 width);
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 1);
+        }
+        sum_diff = sum_diff_16x1(acc_diff);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+// Denoise 16x16 to 128x128 blocks.
+static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y, int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  int sum_diff_thresh, r, c, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  __m128i acc_diff[8][8];
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = block_size_wide[bs];
+  const int b_height = block_size_high[bs];
+  const int b_width_shift4 = b_width >> 4;
+
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r] = _mm_setzero_si128();
+    }
+  }
+
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2(
+          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+          &l32, &l21, acc_diff[c][r >> 4]);
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
+  }
+
+  {
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
+        sum_diff = 0;
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r >> 4] =
+                av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+                                           k_0, k_delta, acc_diff[c][r >> 4]);
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+            }
+          }
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+    return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                     avg, avg_stride, increase_denoising, bs,
+                                     motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising, bs,
+                                       motion_magnitude, 8);
+  } else {
+    return COPY_BLOCK;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/cnn_avx2.c b/media/libaom/src/av1/encoder/x86/cnn_avx2.c
new file mode 100644
index 0000000000..ee93b3d5a0
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/cnn_avx2.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// This mask rearranges source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks which need
+// source pixels in the following order.
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded like mentioned below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying masks, source bytes will be in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+//             consists 5 pixels needed for 1st 5x5 block and
+//             first 3 pixels needed for 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+//             consists last 2 pixels needed for 2nd 5x5 block and
+//             5 pixels needed for 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+                                              { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearrange the weights to match shuffled source pixels order.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+                                                 { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2.
+// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and
+// filter_height are equal to 2. So rearranging the weights in the
+// order shown below to match source pixels. Basically this mask replicates
+// the weights across the width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer_1_and_2[2][8]) = {
+  { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be jumbled. In order to store register into
+// output buffer in a proper way, the following mask is applied on output
+// register.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_5x5_convolve(
+    const float *layer_config_weights, int off, float weight[5][8],
+    const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0,
+    const __m256i weight_mask_1) {
+  for (int row = 0; row < 5; ++row) {
+    for (int col = 0; col < 5; ++col) {
+      weight[row][col] = layer_config_weights[off];
+      off += cstep;
+    }
+  }
+  shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
+  shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
+  shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
+  shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
+  shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
+
+  shuffle_weight[0] =
+      _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0);
+  shuffle_weight[1] =
+      _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0);
+  shuffle_weight[2] =
+      _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0);
+  shuffle_weight[3] =
+      _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0);
+  shuffle_weight[4] =
+      _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0);
+  shuffle_weight[5] =
+      _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1);
+  shuffle_weight[6] =
+      _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1);
+  shuffle_weight[7] =
+      _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1);
+  shuffle_weight[8] =
+      _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1);
+  shuffle_weight[9] =
+      _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1);
+}
+
+// For each row, loads source pixels 0 to 7(load_src_0), 7 to 14(load_src_1) and
+// arranges them appropriately to process 3 blocks.
+#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS()                            \
+  do {                                                                 \
+    for (int row = 0; row < 5; row++) {                                \
+      load_src_0 = _mm256_loadu_ps(input_ptr);                         \
+      load_src_1 = _mm256_loadu_ps(input_ptr + 7);                     \
+      load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1);     \
+      load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2);     \
+      load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \
+      load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \
+      accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0);            \
+      accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1);            \
+      input_ptr += in_stride;                                          \
+    }                                                                  \
+  } while (0)
+
+// Load masks needed for shuffling of output and weights.
+static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask,
+                                                       __m256i *weight_mask) {
+  // Load shuffle buffer needed to sort the output.
+  *output_mask =
+      _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2);
+
+  // Load shuffle buffers needed for weight.
+  weight_mask[0] =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]);
+  weight_mask[1] =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]);
+}
+
+// Load weights needed for layer 1 and 2 (for 2x2 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_2x2_convolve(
+    const float *layer_config_weights, int off, const int cstep,
+    __m256 *shuffle_weight, __m256i *weight_mask) {
+  // Weights needed for 2x2 block.
+  float weight[4] = { 0 };
+  for (int i = 0; i < 4; ++i) {
+    weight[i] = layer_config_weights[off];
+    off += cstep;
+  }
+
+  const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight));
+  shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]);
+  shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]);
+}
+
+// Do convolution of one 5x5 block.
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride)           \
+  do {                                                                   \
+    __m128 load_src[5];                                                  \
+    load_src[0] = _mm_loadu_ps(input_ptr);                               \
+    last_column_sum += input_ptr[4] * weight[0][4];                      \
+    input_ptr += in_stride;                                              \
+    load_src[1] = _mm_loadu_ps(input_ptr);                               \
+    last_column_sum += input_ptr[4] * weight[1][4];                      \
+    input_ptr += in_stride;                                              \
+    load_src[2] = _mm_loadu_ps(input_ptr);                               \
+    last_column_sum += input_ptr[4] * weight[2][4];                      \
+    input_ptr += in_stride;                                              \
+    load_src[3] = _mm_loadu_ps(input_ptr);                               \
+    last_column_sum += input_ptr[4] * weight[3][4];                      \
+    input_ptr += in_stride;                                              \
+    load_src[4] = _mm_loadu_ps(input_ptr);                               \
+    last_column_sum += input_ptr[4] * weight[4][4];                      \
+                                                                         \
+    load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \
+    load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \
+    load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \
+    load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \
+    load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \
+                                                                         \
+    accum0 = _mm_add_ps(load_src[0], accum0);                            \
+    load_src[1] = _mm_add_ps(load_src[1], load_src[2]);                  \
+    load_src[3] = _mm_add_ps(load_src[3], load_src[4]);                  \
+    load_src[1] = _mm_add_ps(load_src[1], load_src[3]);                  \
+    accum0 = _mm_add_ps(accum0, load_src[1]);                            \
+  } while (0)
+
+// Do convolution on 8 horizontal 2x2 blocks.
+static INLINE void perform_convolve_for_8h_2x2_blocks(
+    const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+    __m256i shuffle_output_mask) {
+  __m256 load_src[4];
+  // Load input into source registers.
+  load_src[0] = _mm256_loadu_ps(input_ptr);
+  load_src[1] = _mm256_loadu_ps(input_ptr + 8);
+  load_src[2] = _mm256_loadu_ps(input_ptr + in_stride);
+  load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8);
+
+  // Multiply the loaded input with corresponding weights.
+  load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+  load_src[1] = _mm256_mul_ps(load_src[1], weight[0]);
+  load_src[2] = _mm256_mul_ps(load_src[2], weight[1]);
+  load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+  // Accumulate across 2x2 blocks.
+  load_src[0] = _mm256_add_ps(load_src[0], load_src[2]);
+  load_src[1] = _mm256_add_ps(load_src[1], load_src[3]);
+  load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]);
+
+  // Sort the output in order to store into output buffer.
+  load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+  *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks.
+static INLINE void perform_convolve_for_4hx2v_2x2_blocks(
+    const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+    __m256i shuffle_output_mask) {
+  __m256 load_src[4];
+  // Load input into source registers.
+  load_src[0] = _mm256_loadu_ps(input_ptr);
+  load_src[1] = _mm256_loadu_ps(input_ptr + in_stride);
+  load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2));
+  load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3));
+
+  // Multiply the loaded input with corresponding weights.
+  load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+  load_src[1] = _mm256_mul_ps(load_src[1], weight[1]);
+  load_src[2] = _mm256_mul_ps(load_src[2], weight[0]);
+  load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+  // Accumulate across 2x2 blocks.
+  load_src[0] = _mm256_add_ps(load_src[0], load_src[1]);
+  load_src[2] = _mm256_add_ps(load_src[2], load_src[3]);
+  load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]);
+
+  // Sort the output in order to store into output buffer.
+  load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+  *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when
+// filter_width and filter_height are equal to 5.
+// CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config.
+// Based on the configuration set for each layer, the current encoder
+// always chooses the case of no_maxpool_padding_valid.
+// And also for layer 0 convolution happens at 5x5 level as the
+// filter_width and filter_height are set as 5.
+static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  const int kFilterWidth = 5;
+  const int kFilterHeight = 5;
+  const int kSkipWidth = 4;
+  const int kSkipHeight = 4;
+  assert(layer_config->filter_width == kFilterWidth &&
+         layer_config->filter_height == kFilterHeight);
+  assert(layer_config->skip_width == kSkipWidth &&
+         layer_config->skip_height == kSkipHeight);
+
+  // Load shuffle buffers needed for source.
+  const __m256i block0_1 =
+      _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]);
+  const __m256i block1_2 =
+      _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]);
+
+  // Load shuffle buffers needed for weight.
+  const __m256i weight_mask_0 =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]);
+  const __m256i weight_mask_1 =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]);
+
+  // Width needs to be moved to go to next iteration of processing 3 5x5 blocks.
+  const int kSkipWidthForNextIter = kSkipWidth * 3;
+
+  // Minimum width required to process 3 5x5 blocks at a time.
+  // min width (for processing 3 5x5 block) = 2*skip_width + filter_width
+  // Here, skip_width specifies how much width we should move while processing
+  // next block convolution and filter_width specifies for how many pixels
+  // filter needs to be applied.
+  const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    const float out_ch_bias = layer_config->bias[i];
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[10];
+
+      // Weights needed are 5x5, for SIMD purpose made this array as 5x8.
+      float weight[5][8] = { { 0 } };
+      int off = k * layer_config->out_channels + i;
+
+      // In layer 0, the convolution process happens at 5x5.
+      // The weights needed for 5x5 block are same across the in-channels,
+      // which is why the load of weights happens once for each in-channel.
+      prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
+                                       cstep, shuffle_weight, weight_mask_0,
+                                       weight_mask_1);
+
+      for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
+           h += kSkipHeight, ++u) {
+        const int out_h = u * out_stride;
+        int v = 0;
+        int w = 0;
+        int rem_width = in_width;
+        // Processing 3 5x5 blocks at a time, if sufficient width is present.
+        while (rem_width >= kMinWidthFor3_5x5Blocks) {
+          __m256 load_src_0, load_src_1;
+          __m256 accum_src_0 = _mm256_setzero_ps();
+          __m256 accum_src_1 = _mm256_setzero_ps();
+          const float *input_ptr = &input[k][h * in_stride + w];
+          PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
+
+          // Accumulate across column.
+          __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1);
+          __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1);
+          __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1);
+
+          __m128 accum_l = _mm256_castps256_ps128(accum);
+          __m128 accum_h = _mm256_extractf128_ps(accum, 1);
+
+          __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0);
+          __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h);
+          __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h);
+
+          // 1st 5x5 block output.
+          output[i][out_h + v] =
+              out_ch_bias + _mm_cvtss_f32(tmp_reg_2) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1));
+
+          // 2nd 5x5 block output.
+          output[i][out_h + v + 1] =
+              out_ch_bias +
+              _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2));
+
+          // 3rd 5x5 block output.
+          output[i][out_h + v + 2] =
+              out_ch_bias +
+              _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3));
+
+          v += 3;
+          w += kSkipWidthForNextIter;
+          rem_width -= kSkipWidthForNextIter;
+        }
+
+        // Process remaining blocks as single 5x5 block at a time.
+        while (rem_width >= kFilterWidth) {
+          float last_column_sum = 0;
+          __m128 accum = _mm_setzero_ps();
+          const float *input_ptr = &input[k][h * in_stride + w];
+          PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride);
+
+          // Accumulate across column.
+          accum = _mm_hadd_ps(accum, accum);
+          output[i][out_h + v] = out_ch_bias + last_column_sum +
+                                 _mm_cvtss_f32(accum) +
+                                 _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1));
+
+          v += 1;
+          w += kSkipWidth;
+          rem_width -= kSkipWidth;
+        }
+      }
+    }
+  }
+}
+
+// AVX2 implementation for layer 1.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  __m256i weight_mask[2];
+  __m256i shuffle_output_mask;
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  const int kInHeight = 16;
+  const int kFilterHeight = 2;
+  const int kSkipHeight = 2;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // out_accum registers are used to store the 2x2 convolve outputs
+    // (calculated over input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of for loop processes 8
+    // (horizontal) 2x2 blocks and stores in corresponding out_accum register
+    // (as input size is 16x16, a total of 64 2x2 blocks are present and 8
+    // out_accum registers are enough to store the outputs).
+    // Hence for loops corresponding to 'j' and 'h', below, run over the number
+    // of out_accum registers.
+    __m256 out_accum[8];
+    for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[2];
+      int off = k * layer_config->out_channels + i;
+      // In layer 1, the convolution process happens at 2x2.
+      // The weights needed for 2x2 block are same across the in-channels,
+      // which is why the load of weights happens once for each in-channel.
+      prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+                                       shuffle_weight, weight_mask);
+
+      for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+           h += kSkipHeight, ++u) {
+        const float *input_ptr = &input[k][h * in_stride];
+        perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight,
+                                           &out_accum[u], shuffle_output_mask);
+      }
+    }
+    // Store output of layer 1.
+    for (int j = 0; j < 8; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
+    }
+  }
+}
+
+// AVX2 implementation for layer 2.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  __m256i weight_mask[2];
+  __m256i shuffle_output_mask;
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  const int kInHeight = 8;
+  const int kFilterHeight = 2;
+  const int kSkipHeight = 2;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // out_accum registers are used to store the 2x2 convolve outputs
+    // (calculated over input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of for loop processes 8
+    // (4 horizontal x 2 vertical) 2x2 blocks and stores in corresponding
+    // out_accum register (as input size is 8x8, a total of 16 2x2 blocks are
+    // present and 2 out_accum registers are enough to store the outputs).
+    // Hence for loops corresponding to 'j' and 'h', below, run over the number
+    // of out_accum registers.
+    __m256 out_accum[2];
+
+    // Height needs to be moved to go to next iteration of processing
+    // while processing 2 2x2 blocks vertically.
+    const int kSkipHeightForNextIter = kSkipHeight * 2;
+    for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg;
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[2];
+      int off = k * layer_config->out_channels + i;
+      // In layer 2, the convolution process happens at 2x2.
+      // The weights needed for 2x2 block are same across the in-channels,
+      // which is why the load of weights happens once for each in-channel.
+      prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+                                       shuffle_weight, weight_mask);
+
+      for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+           h += kSkipHeightForNextIter, ++u) {
+        const float *input_ptr = &input[k][h * in_stride];
+        perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride,
+                                              shuffle_weight, &out_accum[u],
+                                              shuffle_output_mask);
+      }
+    }
+    // Store output of layer 2.
+    for (int j = 0; j < 2; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
+    }
+  }
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when
+// filter_width and filter_height are equal to 2.
+// As per the layer config set by av1_intra_mode_cnn_partition_cnn_config,
+// the filter_width and filter_height are equal to 2 for layer >= 1. So
+// convolution happens at 2x2 for layer >= 1.
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
+  assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
+
+  if (in_width == 16 && in_height == 16) {
+    // This case of in_width and in_height equal to 16 corresponds to layer 1.
+    // The output size of this layer is 8x8.
+    cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+  } else if (in_width == 8 && in_height == 8) {
+    // This case of in_width and in_height equal to 8 corresponds to layer 2.
+    // The output size of this layer is 4x4.
+    cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+  } else {
+    // For layer equal to 3 and 4, the input is of size 4x4 and 2x2
+    // respectively. Implementing SIMD for these cases might not be optimal,
+    // which is why we call C path for layer >= 3.
+    av1_cnn_convolve_no_maxpool_padding_valid_c(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+  }
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// As per the current encoder, av1_cnn_convolve function gets called for
+// block size equal to 64x64. av1_cnn_convolve() uses layer config values
+// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few
+// details related to each layer's config parameters.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+//     0         64x64    16x16      5         5         4       4
+//     1         16x16    8x8        2         2         2       2
+//     2         8x8      4x4        2         2         2       2
+//     3         4x4      2x2        2         2         2       2
+//     4         2x2      1x1        2         2         2       2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
+void av1_cnn_convolve_no_maxpool_padding_valid_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+      layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+    cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+  } else if (layer_config->filter_width == 2 &&
+             layer_config->filter_height == 2 &&
+             layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+    cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+  } else {
+    av1_cnn_convolve_no_maxpool_padding_valid_c(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/corner_match_avx2.c b/media/libaom/src/av1/encoder/x86/corner_match_avx2.c
index 8d7eb3f038..033ae3773e 100644
--- a/media/libaom/src/av1/encoder/x86/corner_match_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/corner_match_avx2.c
@@ -15,7 +15,6 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 DECLARE_ALIGNED(16, static const uint8_t,
@@ -76,6 +75,5 @@ double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
 
   int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
   int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/media/libaom/src/av1/encoder/x86/corner_match_sse4.c b/media/libaom/src/av1/encoder/x86/corner_match_sse4.c
index 5c9ca207e3..1a879dad3e 100644
--- a/media/libaom/src/av1/encoder/x86/corner_match_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/corner_match_sse4.c
@@ -19,7 +19,6 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 DECLARE_ALIGNED(16, static const uint8_t,
@@ -100,6 +99,5 @@ double av1_compute_cross_correlation_sse4_1(unsigned char *im1, int stride1,
 
   int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/media/libaom/src/av1/encoder/x86/error_intrin_sse2.c b/media/libaom/src/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000000..e876db123e
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+  __m128i reg_hi = _mm_srli_si128(reg, 8);
+  reg = _mm_add_epi64(reg, reg_hi);
+
+  return reg;
+}
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  assert(block_size % 16 == 0);
+  assert(block_size >= 16);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum_0 = zero;
+  __m128i accum_1 = zero;
+
+  for (int i = 0; i < block_size; i += 16) {
+    // Load 8 elements for coeff and dqcoeff.
+    const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+    const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+    // Compute the diff
+    const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+    const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+    // Compute the error
+    const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+    const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+    const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+    const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+    const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+    const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+    // Accumulate
+    accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+    accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+    accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+    accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+    // Advance
+    coeff += 16;
+    dqcoeff += 16;
+  }
+
+  __m128i accum = _mm_add_epi64(accum_0, accum_1);
+  // Reduce sum the register
+  accum = reduce_sum_epi64(accum);
+
+  // Store the results.
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(accum);
+#else
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, accum);
+  return result;
+#endif  // ARCH_X86_64
+}
diff --git a/media/libaom/src/av1/encoder/x86/hash_sse42.c b/media/libaom/src/av1/encoder/x86/hash_sse42.c
index 65fa463117..2f4e02df1a 100644
--- a/media/libaom/src/av1/encoder/x86/hash_sse42.c
+++ b/media/libaom/src/av1/encoder/x86/hash_sse42.c
@@ -41,11 +41,11 @@ uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
 
 #ifdef __x86_64__
   uint64_t crc64 = crc;
-  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len)
   crc = (uint32_t)crc64;
 #endif
-  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
-  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
-  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len)
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len)
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len)
   return (crc ^= 0xFFFFFFFF);
 }
diff --git a/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index a81378cfe3..1faa412b71 100644
--- a/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -1335,7 +1335,7 @@ void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
   row_txfm(in, out, bit, 2, 2);
   fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
   fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
-  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
   store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
@@ -1396,7 +1396,7 @@ void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
   row_txfm(in, out, bit, 1, 1);
   fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
   fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
-  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
   store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
diff --git a/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 73afc5d039..73f9b44d13 100644
--- a/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -11,16 +11,70 @@
 #include <assert.h>
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i in[4];
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  // Convert to int32_t.
+  __m128i op[4];
+  op[0] = _mm_cvtepi16_epi32(in[0]);
+  op[1] = _mm_cvtepi16_epi32(in[1]);
+  op[2] = _mm_cvtepi16_epi32(in[2]);
+  op[3] = _mm_cvtepi16_epi32(in[3]);
+
+  for (int i = 0; i < 2; ++i) {
+    __m128i a1 = op[0];
+    __m128i b1 = op[1];
+    __m128i c1 = op[2];
+    __m128i d1 = op[3];
+    __m128i e1;
+
+    a1 = _mm_add_epi32(a1, b1);  // a1 += b1
+    d1 = _mm_sub_epi32(d1, c1);  // d1 = d1 - c1
+    e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
+    e1 = _mm_srai_epi32(e1, 1);
+    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
+    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
+    a1 = _mm_sub_epi32(a1, c1);  // a1 -= c1
+    d1 = _mm_add_epi32(d1, b1);  // d1 += b1
+
+    op[0] = a1;
+    op[1] = c1;
+    op[2] = d1;
+    op[3] = b1;
+
+    transpose_32bit_4x4(op, op);
+  }
+
+  op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
+  op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
+  op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
+  op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
+
+  _mm_storeu_si128((__m128i *)(output + 0), op[0]);
+  _mm_storeu_si128((__m128i *)(output + 4), op[1]);
+  _mm_storeu_si128((__m128i *)(output + 8), op[2]);
+  _mm_storeu_si128((__m128i *)(output + 12), op[3]);
+}
+
+void av1_highbd_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  av1_fwht4x4_sse4_1(input, output, stride);
+}
 
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr,
@@ -827,6 +881,7 @@ static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
     out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
   }
 }
+#if !CONFIG_REALTIME_ONLY
 static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
   (void)bit;
   (void)col_num;
@@ -841,6 +896,7 @@ static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
     out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
   }
 }
+#endif
 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
@@ -1146,6 +1202,7 @@ static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
   load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
                                     const int stride, const int flipud,
                                     const int fliplr, const int shift) {
@@ -1162,6 +1219,7 @@ static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
   load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
   load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
 }
+#endif
 
 static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
                                      int stride, int flipud, int fliplr,
@@ -1943,6 +2001,7 @@ static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
   fadst8x8_sse4_1,  // V_FLIPADST
   idtx8x8_sse4_1    // H_FLIPADST
 };
+#if !CONFIG_REALTIME_ONLY
 static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
   fdct8x8_sse4_1,   // DCT_DCT
   NULL,             // ADST_DCT
@@ -1961,6 +2020,7 @@ static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
   NULL,             // V_FLIPADST
   NULL,             // H_FLIPADST
 };
+#endif
 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
   fdct4x8_sse4_1,   // DCT_DCT
   fadst8x8_sse4_1,  // ADST_DCT
@@ -2194,6 +2254,7 @@ void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
   (void)bd;
 }
 
+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16];
@@ -2222,6 +2283,7 @@ void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
   }
   (void)bd;
 }
+#endif
 
 void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
@@ -2394,6 +2456,7 @@ void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
   (void)bd;
 }
 
+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64];
@@ -2461,6 +2524,7 @@ void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
   (void)bd;
 }
+#endif
 
 void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
@@ -2522,6 +2586,7 @@ void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
   (void)bd;
 }
 
+#if !CONFIG_REALTIME_ONLY
 void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[256];
@@ -2602,3 +2667,4 @@ void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
   transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
   (void)bd;
 }
+#endif
diff --git a/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_avx2.c b/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..68509fa106
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint32_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse + 2;
+  for (int i = 0; i < block_height; i++) {
+    __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+    __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+    __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    __m256i diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    __m256i diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)dst, diff_lo);
+    dst += 8;
+    _mm256_storeu_si256((__m256i *)dst, diff_hi);
+
+    src1 += stride, src2 += stride2;
+    dst += sse_stride - 8;
+  }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint32_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse + 2;
+  for (int i = 0; i < block_height; i++) {
+    __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+    __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+    __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    __m256i diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    __m256i diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)dst, diff_lo);
+    _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+    v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+    v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+    v_diff = _mm256_sub_epi16(v_src1, v_src2);
+    v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+    v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+    v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+    v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+    diff_lo =
+        _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+    diff_hi =
+        _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+    _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+    _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+    src1 += stride;
+    src2 += stride2;
+    dst += sse_stride;
+  }
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src,
+                                                  __m256i *v256tmp) {
+  *v256tmp = _mm256_loadu_si256((__m256i *)src);
+  // For the first column, replicate the first element twice to the left
+  __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA);
+  *v256tmp = _mm256_inserti128_si256(*v256tmp,
+                                     _mm256_extracti128_si256(v256tmp1, 0), 0);
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src,
+                                                   __m256i *v256tmp) {
+  *v256tmp = _mm256_loadu_si256((__m256i *)src);
+  // For the last column, replicate the last element twice to the right
+  __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54);
+  *v256tmp = _mm256_inserti128_si256(*v256tmp,
+                                     _mm256_extracti128_si256(v256tmp1, 1), 1);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  // Mask the required 5 values inside the vector
+  __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+  __m128i v128a, v128b;
+  // Extract 256b as two 128b registers A and B
+  v128a = _mm256_castsi256_si128(vtmp);
+  v128b = _mm256_extracti128_si256(vtmp, 1);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A2+B2, A3+B3, 0, 0]
+  v128b = _mm_srli_si128(v128a, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  v128b = _mm_srli_si128(v128a, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  v128a = _mm_add_epi32(v128a, v128b);
+  return _mm_extract_epi32(v128a, 0);
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  if (block_width == 32) {
+    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  } else {
+    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  }
+
+  __m256i vsrc[5];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  int col;
+  uint32_t *src = frame_sse;
+  for (int i = 2; i < 5; i++) {
+    xx_load_and_pad_left(src, &vsrc[i]);
+    src += SSE_STRIDE;
+  }
+
+  // Copy first row to first 2 vectors
+  vsrc[0] = vsrc[2];
+  vsrc[1] = vsrc[2];
+
+  for (int row = 0; row < block_height - 3; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    xx_load_and_pad_left(src, &vsrc[4]);
+    src += SSE_STRIDE;
+
+    acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (int row = block_height - 3; row < block_height; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (col = 4; col < block_width - 4; col += 4) {
+    src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = _mm256_loadu_si256((__m256i *)src);
+      src += SSE_STRIDE;
+    }
+
+    // Copy first row to first 2 vectors
+    vsrc[0] = vsrc[2];
+    vsrc[1] = vsrc[2];
+
+    for (int row = 0; row < block_height - 3; row++) {
+      __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+      __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+      __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+      __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      vsrc[4] = _mm256_loadu_si256((__m256i *)src);
+
+      src += SSE_STRIDE;
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+    }
+    for (int row = block_height - 3; row < block_height; row++) {
+      __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+      __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+      __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+      __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+    }
+  }
+
+  src = frame_sse + col;
+
+  // Load and pad(for first and last col) 3 rows from the top
+  for (int i = 2; i < 5; i++) {
+    xx_load_and_pad_right(src, &vsrc[i]);
+    src += SSE_STRIDE;
+  }
+
+  // Copy first row to first 2 vectors
+  vsrc[0] = vsrc[2];
+  vsrc[1] = vsrc[2];
+
+  for (int row = 0; row < block_height - 3; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    xx_load_and_pad_right(src, &vsrc[4]);
+    src += SSE_STRIDE;
+
+    acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+  }
+  for (int row = block_height - 3; row < block_height; row++) {
+    __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+    __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+    __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+    __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+    for (int i = 0; i < 4; i++) {
+      vsrc[i] = vsrc[i + 1];
+    }
+
+    acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+    acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+    acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+    acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+      // Scale down the difference for high bit depth input.
+      diff_sse >>= ((bd - 8) * 2);
+
+      const double window_error = diff_sse * inv_num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
+
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void av1_highbd_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_sse2.c b/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_sse2.c
new file mode 100644
index 0000000000..1bfdaf72e1
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+                              const uint16_t *frame2,
+                              const unsigned int stride2, const int block_width,
+                              const int block_height, uint32_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const uint16_t *src1 = frame1;
+  const uint16_t *src2 = frame2;
+  uint32_t *dst = frame_sse;
+
+  for (int i = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j += 8) {
+      __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+      __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+      __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+      __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+      __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+      __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+      __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
+      _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+      _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+    }
+
+    src1 += stride;
+    src2 += stride2;
+    dst += dst_stride;
+  }
+}
+
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+  __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+  // For the first column, replicate the first element twice to the left
+  dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+  // For the last column, replicate the last element twice to the right
+  dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  __m128i veca, vecb;
+  // Mask and obtain the required 5 values inside the vector
+  veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+  vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A2+B2, A3+B3, 0, 0]
+  vecb = _mm_srli_si128(veca, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  vecb = _mm_srli_si128(veca, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  return _mm_cvtsi128_si32(veca);
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+                    frame_sse, SSE_STRIDE);
+
+  __m128i vsrc[5][2];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint32_t *src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      xx_load_and_pad(src, vsrc[i], col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Padding for top 2 rows
+    vsrc[0][0] = vsrc[2][0];
+    vsrc[0][1] = vsrc[2][1];
+    vsrc[1][0] = vsrc[2][0];
+    vsrc[1][1] = vsrc[2][1];
+
+    for (int row = 0; row < block_height - 3; row++) {
+      __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+      __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+      __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+      __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+      __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+      __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+      __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+      __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+      vsrc[0][0] = vsrc[1][0];
+      vsrc[0][1] = vsrc[1][1];
+      vsrc[1][0] = vsrc[2][0];
+      vsrc[1][1] = vsrc[2][1];
+      vsrc[2][0] = vsrc[3][0];
+      vsrc[2][1] = vsrc[3][1];
+      vsrc[3][0] = vsrc[4][0];
+      vsrc[3][1] = vsrc[4][1];
+
+      // Load next row
+      xx_load_and_pad(src, vsrc[4], col, block_width);
+      src += SSE_STRIDE;
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+    }
+    for (int row = block_height - 3; row < block_height; row++) {
+      __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+      __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+      __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+      __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+      __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+      __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+      __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+      __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+      vsrc[0][0] = vsrc[1][0];
+      vsrc[0][1] = vsrc[1][1];
+      vsrc[1][0] = vsrc[2][0];
+      vsrc[1][1] = vsrc[2][1];
+      vsrc[2][0] = vsrc[3][0];
+      vsrc[2][1] = vsrc[3][1];
+      vsrc[3][0] = vsrc[4][0];
+      vsrc[3][1] = vsrc[4][1];
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+    }
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+      // Scale down the difference for high bit depth input.
+      diff_sse >>= ((bd - 8) * 2);
+
+      const double window_error = diff_sse * inv_num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
+
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void av1_highbd_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/ml_sse3.c b/media/libaom/src/av1/encoder/x86/ml_sse3.c
index 89b1e6a05b..ab69088dce 100644
--- a/media/libaom/src/av1/encoder/x86/ml_sse3.c
+++ b/media/libaom/src/av1/encoder/x86/ml_sse3.c
@@ -242,3 +242,95 @@ void av1_nn_predict_sse3(const float *input_nodes,
   }
   if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
+
+// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential
+// Function. Neural Computation, 11(4):853–862, 1999.
+static AOM_INLINE __m128 approx_exp(__m128 y) {
+#define A ((1 << 23) / 0.69314718056f)  // (1 << 23) / ln(2)
+#define B \
+  127  // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number controls the accuracy of approximation
+  const __m128 multiplier = _mm_set1_ps(A);
+  const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C);
+
+  y = _mm_mul_ps(y, multiplier);
+  y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset));
+  return y;
+#undef A
+#undef B
+#undef C
+}
+
+static AOM_INLINE __m128 reduce_max(__m128 reg) {
+  __m128 tmp_reg;
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e);  // 01 00 11 10
+  reg = _mm_max_ps(reg, tmp_reg);
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1);  // 10 11 00 01
+  reg = _mm_max_ps(reg, tmp_reg);
+
+  return reg;
+}
+
+static AOM_INLINE __m128 reduce_sum(__m128 reg) {
+  __m128 tmp_reg;
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e);  // 01 00 11 10
+  reg = _mm_add_ps(reg, tmp_reg);
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1);  // 10 11 00 01
+  reg = _mm_add_ps(reg, tmp_reg);
+
+  return reg;
+}
+
+void av1_nn_fast_softmax_16_sse3(const float *input, float *output) {
+  // Clips at -10 to avoid underflowing
+  const __m128 clipper = _mm_set1_ps(-10.0f);
+
+  // Load in 16 values
+  __m128 in_0 = _mm_loadu_ps(&input[0]);
+  __m128 in_1 = _mm_loadu_ps(&input[4]);
+  __m128 in_2 = _mm_loadu_ps(&input[8]);
+  __m128 in_3 = _mm_loadu_ps(&input[12]);
+
+  // Get the max
+  __m128 max_0 = _mm_max_ps(in_0, in_1);
+  __m128 max_1 = _mm_max_ps(in_2, in_3);
+
+  max_0 = _mm_max_ps(max_0, max_1);
+  max_0 = reduce_max(max_0);
+
+  // Subtract the max off and clip
+  in_0 = _mm_sub_ps(in_0, max_0);
+  in_1 = _mm_sub_ps(in_1, max_0);
+  in_2 = _mm_sub_ps(in_2, max_0);
+  in_3 = _mm_sub_ps(in_3, max_0);
+
+  in_0 = _mm_max_ps(in_0, clipper);
+  in_1 = _mm_max_ps(in_1, clipper);
+  in_2 = _mm_max_ps(in_2, clipper);
+  in_3 = _mm_max_ps(in_3, clipper);
+
+  // Exponentiate and compute the denominator
+  __m128 sum = in_0 = approx_exp(in_0);
+  in_1 = approx_exp(in_1);
+  sum = _mm_add_ps(sum, in_1);
+  in_2 = approx_exp(in_2);
+  sum = _mm_add_ps(sum, in_2);
+  in_3 = approx_exp(in_3);
+  sum = _mm_add_ps(sum, in_3);
+  sum = reduce_sum(sum);
+
+  // Divide to get the probability
+  in_0 = _mm_div_ps(in_0, sum);
+  in_1 = _mm_div_ps(in_1, sum);
+  in_2 = _mm_div_ps(in_2, sum);
+  in_3 = _mm_div_ps(in_3, sum);
+
+  _mm_storeu_ps(&output[0], in_0);
+  _mm_storeu_ps(&output[4], in_1);
+  _mm_storeu_ps(&output[8], in_2);
+  _mm_storeu_ps(&output[12], in_3);
+}
diff --git a/media/libaom/src/av1/encoder/x86/pickrst_avx2.c b/media/libaom/src/av1/encoder/x86/pickrst_avx2.c
index f8703a23ca..d53b128567 100644
--- a/media/libaom/src/av1/encoder/x86/pickrst_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/pickrst_avx2.c
@@ -10,6 +10,7 @@
  */
 
 #include <immintrin.h>  // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_dsp/x86/transpose_sse2.h"
@@ -34,7 +35,16 @@ static INLINE void acc_stat_win7_one_line_avx2(
     int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint8_t X1 = src[j];
     const uint8_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -49,7 +59,36 @@ static INLINE void acc_stat_win7_one_line_avx2(
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint8_t X1 = src[j];
+    *sumX += X1;
+    const uint8_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_avx2` function wants its input to have interleaved
+        // copies of two pixels, but we only have one. However, the pixels
+        // are (effectively) used as inputs to a multiply-accumulate.
+        // So if we set the extra pixel slot to 0, then it is effectively
+        // ignored.
+        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -64,7 +103,8 @@ static INLINE void acc_stat_win7_one_line_avx2(
 
 static INLINE void compute_stats_win7_opt_avx2(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
   int i, j, k, l, m, n;
   const int wiener_win = WIENER_WIN;
   const int pixel_count = (h_end - h_start) * (v_end - v_start);
@@ -74,21 +114,51 @@ static INLINE void compute_stats_win7_opt_avx2(
 
   int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
 
   DECLARE_ALIGNED(32, int32_t,
                   H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
+  DECLARE_ALIGNED(32, int32_t,
+                  H_int32_row[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  int32_t sumX_row = 0;
+  int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
 
   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
   for (j = v_start; j < v_end; j += 64) {
     const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
+    for (i = j; i < vert_end; i = i + downsample_factor) {
+      if (use_downsampled_wiener_stats &&
+          (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = vert_end - i;
+      }
+      sumX_row = 0;
+      memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+      memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+      memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8));
       acc_stat_win7_one_line_avx2(
           dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+          dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+      sumX += sumX_row * downsample_factor;
+
+      // Scale M matrix based on the downsampling factor
+      for (k = 0; k < wiener_win; ++k) {
+        for (l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+          M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+        }
+      }
+      // Scale H matrix based on the downsampling factor
+      for (k = 0; k < WIENER_WIN2; ++k) {
+        for (l = 0; l < WIENER_WIN * 8; ++l) {
+          H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+        }
+      }
     }
     for (k = 0; k < wiener_win; ++k) {
       for (l = 0; l < wiener_win; ++l) {
@@ -165,7 +235,16 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2(
     int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint16_t X1 = src[j];
     const uint16_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -181,8 +260,7 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2(
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -202,6 +280,42 @@ static INLINE void acc_stat_highbd_win7_one_line_avx2(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t X1 = src[j];
+    *sumX += X1;
+    const uint16_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint16_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_highbd_avx2` function wants its input to have
+        // interleaved copies of two pixels, but we only have one. However, the
+        // pixels are (effectively) used as inputs to a multiply-accumulate. So
+        // if we set the extra pixel slot to 0, then it is effectively ignored.
+        const __m256i dgd_ijkl = _mm256_set1_epi32((uint32_t)D1);
+
+        acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_highbd_win7_opt_avx2(
@@ -269,7 +383,16 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2(
     int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN_CHROMA;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint16_t X1 = src[j];
     const uint16_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -285,8 +408,7 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2(
 
         // Load two u16 values from dgd_ijkl combined as a u32,
         // then broadcast to 8x u32 slots of a 256
-        const __m256i dgd_ijkl =
-            _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l)));
+        const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_uint32(dgd_ijk + l));
         // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
 
         acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
@@ -302,6 +424,38 @@ static INLINE void acc_stat_highbd_win5_one_line_avx2(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t X1 = src[j];
+    *sumX += X1;
+    const uint16_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint16_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_highbd_avx2` function wants its input to have
+        // interleaved copies of two pixels, but we only have one. However, the
+        // pixels are (effectively) used as inputs to a multiply-accumulate. So
+        // if we set the extra pixel slot to 0, then it is effectively ignored.
+        const __m256i dgd_ijkl = _mm256_set1_epi32((uint32_t)D1);
+
+        acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+        acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+                             &dgd_ijkl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_highbd_win5_opt_avx2(
@@ -391,7 +545,16 @@ static INLINE void acc_stat_win5_one_line_avx2(
     int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN_CHROMA;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint8_t X1 = src[j];
     const uint8_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -406,7 +569,34 @@ static INLINE void acc_stat_win5_one_line_avx2(
         M_int[k][l] += D1 * X1 + D2 * X2;
 
         const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
+        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint8_t X1 = src[j];
+    *sumX += X1;
+    const uint8_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_avx2` function wants its input to have interleaved
+        // copies of two pixels, but we only have one. However, the pixels
+        // are (effectively) used as inputs to a multiply-accumulate.
+        // So if we set the extra pixel slot to 0, then it is effectively
+        // ignored.
+        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
         acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
         acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
@@ -419,7 +609,8 @@ static INLINE void acc_stat_win5_one_line_avx2(
 
 static INLINE void compute_stats_win5_opt_avx2(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
   int i, j, k, l, m, n;
   const int wiener_win = WIENER_WIN_CHROMA;
   const int pixel_count = (h_end - h_start) * (v_end - v_start);
@@ -428,22 +619,56 @@ static INLINE void compute_stats_win5_opt_avx2(
   uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   DECLARE_ALIGNED(
       32, int32_t,
       H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int32_t,
+      H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
   int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;
   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  int32_t sumX_row = 0;
+  int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
 
   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
   for (j = v_start; j < v_end; j += 64) {
     const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
+    for (i = j; i < vert_end; i = i + downsample_factor) {
+      if (use_downsampled_wiener_stats &&
+          (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = vert_end - i;
+      }
+      sumX_row = 0;
+      memset(sumY_row, 0,
+             sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+      memset(M_int32_row, 0,
+             sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+      memset(H_int32_row, 0,
+             sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8));
       acc_stat_win5_one_line_avx2(
           dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+          dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+      sumX += sumX_row * downsample_factor;
+
+      // Scale M matrix based on the downsampling factor
+      for (k = 0; k < wiener_win; ++k) {
+        for (l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+          M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+        }
+      }
+      // Scale H matrix based on the downsampling factor
+      for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+        for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+          H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+        }
+      }
     }
     for (k = 0; k < wiener_win; ++k) {
       for (l = 0; l < wiener_win; ++l) {
@@ -480,16 +705,20 @@ static INLINE void compute_stats_win5_opt_avx2(
 void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
                             const uint8_t *src, int h_start, int h_end,
                             int v_start, int v_end, int dgd_stride,
-                            int src_stride, int64_t *M, int64_t *H) {
+                            int src_stride, int64_t *M, int64_t *H,
+                            int use_downsampled_wiener_stats) {
   if (wiener_win == WIENER_WIN) {
     compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
-                                dgd_stride, src_stride, M, H);
+                                dgd_stride, src_stride, M, H,
+                                use_downsampled_wiener_stats);
   } else if (wiener_win == WIENER_WIN_CHROMA) {
     compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
-                                dgd_stride, src_stride, M, H);
+                                dgd_stride, src_stride, M, H,
+                                use_downsampled_wiener_stats);
   } else {
     av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
+                        dgd_stride, src_stride, M, H,
+                        use_downsampled_wiener_stats);
   }
 }
 
@@ -861,6 +1090,229 @@ void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
   }
 }
 
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, h01, h11, c0, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+      const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h01 = _mm256_add_epi64(h01, h01_even);
+      h01 = _mm256_add_epi64(h01, h01_odd);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+  c_low = _mm256_add_epi64(c_low, c_high);
+  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+                                         _mm256_castsi256_si128(c_low));
+
+  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+                                           _mm256_castsi256_si128(h0x_low));
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+                                           _mm256_castsi256_si128(h1x_low));
+
+  xx_storeu_128(C, c_128bit);
+  xx_storeu_128(H[0], h0x_128bit);
+  xx_storeu_128(H[1], h1x_128bit);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, c0;
+  const __m256i zero = _mm256_setzero_si256();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+                                           _mm256_castsi256_si128(h00));
+  const __m128i h00_val =
+      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+                                          _mm256_castsi256_si128(c0));
+  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h11, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+                                           _mm256_castsi256_si128(h11));
+  const __m128i h11_val =
+      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+                                          _mm256_castsi256_si128(c1));
+  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                        dat_stride, flt0, flt0_stride, flt1,
+                                        flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
diff --git a/media/libaom/src/av1/encoder/x86/pickrst_sse4.c b/media/libaom/src/av1/encoder/x86/pickrst_sse4.c
index a2f65a50c1..3d496ef3cd 100644
--- a/media/libaom/src/av1/encoder/x86/pickrst_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/pickrst_sse4.c
@@ -38,7 +38,16 @@ static INLINE void acc_stat_win7_one_line_sse4_1(
     int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
   const int wiener_win = 7;
   int j, k, l;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint8_t *dgd_ij = dgd + j;
     const uint8_t X1 = src[j];
     const uint8_t X2 = src[j + 1];
@@ -64,11 +73,41 @@ static INLINE void acc_stat_win7_one_line_sse4_1(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    *sumX += X1;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_sse41` function wants its input to have interleaved
+        // copies of two pixels, but we only have one. However, the pixels
+        // are (effectively) used as inputs to a multiply-accumulate.
+        // So if we set the extra pixel slot to 0, then it is effectively
+        // ignored.
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_win7_opt_sse4_1(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
   int i, j, k, l, m, n;
   const int wiener_win = WIENER_WIN;
   const int pixel_count = (h_end - h_start) * (v_end - v_start);
@@ -78,20 +117,48 @@ static INLINE void compute_stats_win7_opt_sse4_1(
       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
   int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
   int32_t sumX = 0;
   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  int32_t sumX_row = 0;
+  int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
 
   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
   for (j = v_start; j < v_end; j += 64) {
     const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
+    for (i = j; i < vert_end; i = i + downsample_factor) {
+      if (use_downsampled_wiener_stats &&
+          (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = vert_end - i;
+      }
+      sumX_row = 0;
+      memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+      memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+      memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8));
       acc_stat_win7_one_line_sse4_1(
           dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+          dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+      sumX += sumX_row * downsample_factor;
+      // Scale M matrix based on the downsampling factor
+      for (k = 0; k < wiener_win; ++k) {
+        for (l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+          M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+        }
+      }
+      // Scale H matrix based on the downsampling factor
+      for (k = 0; k < WIENER_WIN2; ++k) {
+        for (l = 0; l < WIENER_WIN * 8; ++l) {
+          H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+        }
+      }
     }
     for (k = 0; k < wiener_win; ++k) {
       for (l = 0; l < wiener_win; ++l) {
@@ -173,7 +240,16 @@ static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
     int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint16_t X1 = src[j];
     const uint16_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -209,6 +285,42 @@ static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t X1 = src[j];
+    *sumX += X1;
+    const uint16_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint16_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_highbd_sse41` function wants its input to have
+        // interleaved copies of two pixels, but we only have one. However, the
+        // pixels are (effectively) used as inputs to a multiply-accumulate. So
+        // if we set the extra pixel slot to 0, then it is effectively ignored.
+        const __m128i dgd_ijkl = _mm_set1_epi32((uint32_t)D1);
+
+        acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_highbd_win7_opt_sse4_1(
@@ -277,7 +389,16 @@ static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
     int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
   int j, k, l;
   const int wiener_win = WIENER_WIN_CHROMA;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint16_t X1 = src[j];
     const uint16_t X2 = src[j + 1];
     *sumX += X1 + X2;
@@ -309,6 +430,38 @@ static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t X1 = src[j];
+    *sumX += X1;
+    const uint16_t *dgd_ij = dgd + j;
+    for (k = 0; k < wiener_win; k++) {
+      const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint16_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_highbd_sse41` function wants its input to have
+        // interleaved copies of two pixels, but we only have one. However, the
+        // pixels are (effectively) used as inputs to a multiply-accumulate. So
+        // if we set the extra pixel slot to 0, then it is effectively ignored.
+        const __m128i dgd_ijkl = _mm_set1_epi32((uint32_t)D1);
+
+        acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+        acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+                              &dgd_ijkl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_highbd_win5_opt_sse4_1(
@@ -397,7 +550,16 @@ static INLINE void acc_stat_win5_one_line_sse4_1(
     int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
   const int wiener_win = WIENER_WIN_CHROMA;
   int j, k, l;
-  for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+  for (j = h_start; j < h_end_even; j += 2) {
     const uint8_t *dgd_ij = dgd + j;
     const uint8_t X1 = src[j];
     const uint8_t X2 = src[j + 1];
@@ -421,11 +583,39 @@ static INLINE void acc_stat_win5_one_line_sse4_1(
       }
     }
   }
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    *sumX += X1;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        sumY[k][l] += D1;
+        M_int[k][l] += D1 * X1;
+
+        // The `acc_stat_sse41` function wants its input to have interleaved
+        // copies of two pixels, but we only have one. However, the pixels
+        // are (effectively) used as inputs to a multiply-accumulate.
+        // So if we set the extra pixel slot to 0, then it is effectively
+        // ignored.
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
 }
 
 static INLINE void compute_stats_win5_opt_sse4_1(
     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
   int i, j, k, l, m, n;
   const int wiener_win = WIENER_WIN_CHROMA;
   const int pixel_count = (h_end - h_start) * (v_end - v_start);
@@ -435,20 +625,51 @@ static INLINE void compute_stats_win5_opt_sse4_1(
       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
   int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
   int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
   int32_t sumX = 0;
   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  int32_t sumX_row = 0;
+  int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
 
   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
   for (j = v_start; j < v_end; j += 64) {
     const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
+    for (i = j; i < vert_end; i = i + downsample_factor) {
+      if (use_downsampled_wiener_stats &&
+          (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = vert_end - i;
+      }
+      sumX_row = 0;
+      memset(sumY_row, 0,
+             sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+      memset(M_int32_row, 0,
+             sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+      memset(H_int32_row, 0,
+             sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8));
       acc_stat_win5_one_line_sse4_1(
           dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+          dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+      sumX += sumX_row * downsample_factor;
+      // Scale M matrix based on the downsampling factor
+      for (k = 0; k < wiener_win; ++k) {
+        for (l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+          M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+        }
+      }
+      // Scale H matrix based on the downsampling factor
+      for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+        for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+          H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+        }
+      }
     }
     for (k = 0; k < wiener_win; ++k) {
       for (l = 0; l < wiener_win; ++l) {
@@ -484,16 +705,20 @@ static INLINE void compute_stats_win5_opt_sse4_1(
 void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
                               const uint8_t *src, int h_start, int h_end,
                               int v_start, int v_end, int dgd_stride,
-                              int src_stride, int64_t *M, int64_t *H) {
+                              int src_stride, int64_t *M, int64_t *H,
+                              int use_downsampled_wiener_stats) {
   if (wiener_win == WIENER_WIN) {
     compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
-                                  dgd_stride, src_stride, M, H);
+                                  dgd_stride, src_stride, M, H,
+                                  use_downsampled_wiener_stats);
   } else if (wiener_win == WIENER_WIN_CHROMA) {
     compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
-                                  dgd_stride, src_stride, M, H);
+                                  dgd_stride, src_stride, M, H,
+                                  use_downsampled_wiener_stats);
   } else {
     av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
+                        dgd_stride, src_stride, M, H,
+                        use_downsampled_wiener_stats);
   }
 }
 
@@ -624,6 +849,429 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
   return err;
 }
 
+// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
+// C and H need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, h01, h11, c0, c1;
+  const __m128i zero = _mm_setzero_si128();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i h01_even = _mm_mul_epi32(f1, f2);
+      const __m128i h01_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+      h01 = _mm_add_epi64(h01, h01_even);
+      h01 = _mm_add_epi64(h01, h01_odd);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+  const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+  c_low = _mm_add_epi64(c_low, c_high);
+
+  __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+  const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+  h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+  const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+  h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+  xx_storeu_128(C, c_low);
+  xx_storeu_128(H[0], h0x_low);
+  xx_storeu_128(H[1], h1x_low);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, c0;
+  const __m128i zero = _mm_setzero_si128();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+  const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h11, c1;
+  const __m128i zero = _mm_setzero_si128();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+      const __m128i s_load = _mm_cvtepu8_epi32(
+          _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+  const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_c.
+void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height,
+                                 int src_stride, const uint8_t *dat8,
+                                 int dat_stride, int32_t *flt0, int flt0_stride,
+                                 int32_t *flt1, int flt1_stride,
+                                 int64_t H[2][2], int64_t C[2],
+                                 const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt0, flt0_stride, flt1,
+                                  flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h00, h01, h11, c0, c1;
+  const __m128i zero = _mm_setzero_si128();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i h01_even = _mm_mul_epi32(f1, f2);
+      const __m128i h01_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+      h01 = _mm_add_epi64(h01, h01_even);
+      h01 = _mm_add_epi64(h01, h01_odd);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+  const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+  c_low = _mm_add_epi64(c_low, c_high);
+
+  __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+  const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+  h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Using the symmetric properties of H,  calculations of H[1][0] are not
+  // needed.
+  __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+  const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+  h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+  xx_storeu_128(C, c_low);
+  xx_storeu_128(H[0], h0x_low);
+  xx_storeu_128(H[1], h1x_low);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h00, c0;
+  const __m128i zero = _mm_setzero_si128();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+    }
+  }
+  const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+  const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m128i h11, c1;
+  const __m128i zero = _mm_setzero_si128();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i u_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+      const __m128i s_load = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+      __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+  const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width,
+                                         int height, int src_stride,
+                                         const uint8_t *dat8, int dat_stride,
+                                         int32_t *flt0, int flt0_stride,
+                                         int32_t *flt1, int flt1_stride,
+                                         int64_t H[2][2], int64_t C[2],
+                                         const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                          dat_stride, flt0, flt0_stride, flt1,
+                                          flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_sse4_1(
     const uint8_t *src8, int width, int height, int src_stride,
diff --git a/media/libaom/src/av1/encoder/x86/rdopt_avx2.c b/media/libaom/src/av1/encoder/x86/rdopt_avx2.c
index f588badc7c..3bc763c587 100644
--- a/media/libaom/src/av1/encoder/x86/rdopt_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/rdopt_avx2.c
@@ -11,8 +11,8 @@
 
 #include <assert.h>
 #include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -31,8 +31,8 @@ INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
   //                      [ m n o p ]
 
   const __m256i pixels = _mm256_set_epi64x(
-      *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride],
-      *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]);
+      loadu_uint64(&diff[0 * stride]), loadu_uint64(&diff[1 * stride]),
+      loadu_uint64(&diff[2 * stride]), loadu_uint64(&diff[3 * stride]));
   // pixels = [d c b a h g f e] [l k j i p o n m] as i16
 
   const __m256i slli = _mm256_slli_epi64(pixels, 16);
@@ -227,8 +227,6 @@ void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride,
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
-  aom_clear_system_state();
-
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/media/libaom/src/av1/encoder/x86/rdopt_sse4.c b/media/libaom/src/av1/encoder/x86/rdopt_sse4.c
index 67d94b4ca8..4c4ec1fa7d 100644
--- a/media/libaom/src/av1/encoder/x86/rdopt_sse4.c
+++ b/media/libaom/src/av1/encoder/x86/rdopt_sse4.c
@@ -12,7 +12,6 @@
 #include <assert.h>
 #include <emmintrin.h>
 #include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -246,8 +245,6 @@ void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride,
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
-  aom_clear_system_state();
-
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/media/libaom/src/av1/encoder/x86/reconinter_enc_sse2.c b/media/libaom/src/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000000..6455bf3d5a
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
+                             int subpel_x_q3, int subpel_y_q3,
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
+  // 2-tap yet.
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      int i;
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 16) {
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
+          comp_pred += 16;
+          ref += 16;
+        }
+        ref += ref_stride - width;
+      }
+    } else if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      assert(!(height & 1));
+      /*Read 8 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+        comp_pred += 16;
+        ref += 2 * ref_stride;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      assert(!(height & 3));
+      /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i++) {
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
+        comp_pred += 16;
+        ref += 4 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                    : temp;
+    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+                        kernel_x, 16, NULL, -1, width, intermediate_height);
+    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+                       kernel_y, 16, width, height);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+                                                    const __m128i *w0,
+                                                    const __m128i *w1,
+                                                    const __m128i *r,
+                                                    void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+  __m128i sum = _mm_adds_epu16(mult0, mult1);
+  __m128i round = _mm_adds_epu16(sum, *r);
+  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, shift);
+}
+
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred8, int width, int height,
+                                    int subpel_x_q3, int subpel_y_q3,
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      /*Read 8 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          comp_pred += 8;
+          ref += 8;
+        }
+        ref += ref_stride - width;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      /*Read 4 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        comp_pred += 8;
+        ref += 2 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+                               NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                              kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                     : temp;
+    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(
+        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
+                              height, bd);
+  }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+  assert(!(width * height & 7));
+  int n = width * height >> 3;
+  for (int i = 0; i < n; i++) {
+    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
+    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+    comp_pred16 += 8;
+    pred += 8;
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred16);
+    __m128i p1 = xx_loadu_128(pred);
+
+    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+
+    comp_pred16 += 8;
+    pred += 8;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, int subpel_search) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+  for (i = 0; i < n; i++) {
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+
+void aom_comp_mask_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
diff --git a/media/libaom/src/av1/encoder/x86/reconinter_enc_ssse3.c b/media/libaom/src/av1/encoder/x86/reconinter_enc_ssse3.c
new file mode 100644
index 0000000000..7ac0f0d037
--- /dev/null
+++ b/media/libaom/src/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
diff --git a/media/libaom/src/av1/encoder/x86/temporal_filter_avx2.c b/media/libaom/src/av1/encoder/x86/temporal_filter_avx2.c
index 847f7283ce..8aa07641aa 100644
--- a/media/libaom/src/av1/encoder/x86/temporal_filter_avx2.c
+++ b/media/libaom/src/av1/encoder/x86/temporal_filter_avx2.c
@@ -127,23 +127,17 @@ static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
   return _mm_extract_epi32(v128a, 0);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
   assert(((block_width == 16) || (block_width == 32)) &&
          ((block_height == 16) || (block_height == 32)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
 
   uint32_t acc_5x5_sse[BH][BW];
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   if (block_width == 32) {
     get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
@@ -201,84 +195,120 @@ static void apply_temporal_filter_planewise(
   for (int i = 0, k = 0; i < block_height; i++) {
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-            diff_sse += luma_sq_error[yy * SSE_STRIDE + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const double window_error = diff_sse * inv_num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
 
-      const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += adjusted_weight;
-      accumulator[k] += adjusted_weight * pixel_value;
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
     }
   }
 }
 
-void av1_apply_temporal_filter_planewise_avx2(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+void av1_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  if (is_high_bitdepth) {
-    assert(0 && "Only support low bit-depth with avx2!");
-  }
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
+  (void)is_high_bitdepth;
 
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
 
+  // Handle planes in sequence.
+  int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
-    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
     const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
     const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
-
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
   }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/media/libaom/src/av1/encoder/x86/temporal_filter_constants.h b/media/libaom/src/av1/encoder/x86/temporal_filter_constants.h
deleted file mode 100644
index 7cd61d75ef..0000000000
--- a/media/libaom/src/av1/encoder/x86/temporal_filter_constants.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-
-// Division using multiplication and shifting. The C implementation does:
-// modifier *= 3;
-// modifier /= index;
-// where 'modifier' is a set of summed values and 'index' is the number of
-// summed values.
-//
-// This equation works out to (m * 3) / i which reduces to:
-// m * 3/4
-// m * 1/2
-// m * 1/3
-//
-// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
-// m * C / 65536
-// we can create a C to replicate the division.
-//
-// m * 49152 / 65536 = m * 3/4
-// m * 32758 / 65536 = m * 1/2
-// m * 21846 / 65536 = m * 0.3333
-//
-// These are loaded using an instruction expecting int16_t values but are used
-// with _mm_mulhi_epu16(), which treats them as unsigned.
-#define NEIGHBOR_CONSTANT_4 (int16_t)49152
-#define NEIGHBOR_CONSTANT_5 (int16_t)39322
-#define NEIGHBOR_CONSTANT_6 (int16_t)32768
-#define NEIGHBOR_CONSTANT_7 (int16_t)28087
-#define NEIGHBOR_CONSTANT_8 (int16_t)24576
-#define NEIGHBOR_CONSTANT_9 (int16_t)21846
-#define NEIGHBOR_CONSTANT_10 (int16_t)19661
-#define NEIGHBOR_CONSTANT_11 (int16_t)17874
-#define NEIGHBOR_CONSTANT_13 (int16_t)15124
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
-};
-
-#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
-#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
-#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
-#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
-#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
-#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
-#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
-#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
-#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-#define DIST_STRIDE ((BW) + 2)
-#endif  // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/media/libaom/src/av1/encoder/x86/temporal_filter_sse2.c b/media/libaom/src/av1/encoder/x86/temporal_filter_sse2.c
index 1722fac86c..26c3926dca 100644
--- a/media/libaom/src/av1/encoder/x86/temporal_filter_sse2.c
+++ b/media/libaom/src/av1/encoder/x86/temporal_filter_sse2.c
@@ -102,23 +102,17 @@ static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
   return _mm_cvtsi128_si32(veca);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor) {
   assert(((block_width == 16) || (block_width == 32)) &&
          ((block_height == 16) || (block_height == 32)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
 
   uint32_t acc_5x5_sse[BH][BW];
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
                     frame_sse, SSE_STRIDE);
@@ -178,85 +172,120 @@ static void apply_temporal_filter_planewise(
   for (int i = 0, k = 0; i < block_height; i++) {
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
-            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
-            diff_sse += luma_sq_error[yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const double window_error = diff_sse * inv_num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          weight_factor * window_error + block_error * inv_factor;
 
-      const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      double scaled_error =
+          combined_error * d_factor[subblock_idx] * decay_factor;
+      scaled_error = AOMMIN(scaled_error, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
-      count[k] += adjusted_weight;
-      accumulator[k] += adjusted_weight * pixel_value;
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
     }
   }
 }
 
-void av1_apply_temporal_filter_planewise_sse2(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+void av1_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  if (is_high_bitdepth) {
-    assert(0 && "Only support low bit-depth with sse2!");
-  }
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 720 ? 4 : 3;
+  (void)is_high_bitdepth;
 
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
 
+  // Handle planes in sequence.
+  int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1];
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
-    const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
     const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
     const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
-
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor);
+    plane_offset += plane_h * plane_w;
   }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/media/libaom/src/av1/encoder/x86/temporal_filter_sse4.c b/media/libaom/src/av1/encoder/x86/temporal_filter_sse4.c
deleted file mode 100644
index e3f9f5f276..0000000000
--- a/media/libaom/src/av1/encoder/x86/temporal_filter_sse4.c
+++ /dev/null
@@ -1,2044 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "aom/aom_integer.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-#include "av1/encoder/x86/temporal_filter_constants.h"
-
-//////////////////////////
-// Low bit-depth Begins //
-//////////////////////////
-
-// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
-// difference squared, and store as unsigned 16-bit integer to dst.
-static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
-                                uint16_t *dst) {
-  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
-  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-
-  __m128i dist_first;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-}
-
-static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
-                                 uint16_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_second = _mm_sub_epi16(a_second, b_second);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-  dist_second = _mm_mullo_epi16(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
-}
-
-static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
-                                __m128i *reg_second) {
-  read_dist_8(dist, reg_first);
-  read_dist_8(dist + 8, reg_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static __m128i average_8(__m128i sum, const __m128i *mul_constants,
-                         const int strength, const int rounding,
-                         const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
-                           const int strength, const int rounding,
-                           const int weight_0, const int weight_1) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 =
-      _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
-                     weight_1, weight_1);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
-                              const __m128i *mul_constants_0,
-                              const __m128i *mul_constants_1,
-                              const int strength, const int rounding,
-                              const int weight) {
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  __m128i input_0, input_1;
-
-  input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
-  input_0 = _mm_adds_epu16(input_0, rounding_u16);
-
-  input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
-  input_1 = _mm_adds_epu16(input_1, rounding_u16);
-
-  input_0 = _mm_srl_epi16(input_0, strength_u128);
-  input_1 = _mm_srl_epi16(input_1, strength_u128);
-
-  input_0 = _mm_min_epu16(input_0, sixteen);
-  input_1 = _mm_min_epu16(input_1, sixteen);
-  input_0 = _mm_sub_epi16(sixteen, input_0);
-  input_1 = _mm_sub_epi16(sixteen, input_1);
-
-  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
-  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
-}
-
-// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
-                                   uint16_t *count, uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
-                                           const __m128i sum_1_u16,
-                                           const uint8_t *pred, uint16_t *count,
-                                           uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
-          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
-  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
-          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
-  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
-  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
-
-  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
-  _mm_storeu_si128((__m128i *)count, count_0_u16);
-
-  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
-  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
-
-  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
-  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
-  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
-  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
-  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
-  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
-}
-
-// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
-static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
-
-  *sum = _mm_adds_epu16(dist_reg, dist_left);
-  *sum = _mm_adds_epu16(*sum, dist_right);
-}
-
-// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
-// the rest in sum_second.
-static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
-                              __m128i *sum_second) {
-  get_sum_8(y_dist, sum_first);
-  get_sum_8(y_dist + 8, sum_second);
-}
-
-// Read in a row of chroma values corresponds to a row of 16 luma values.
-static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
-                                           const uint16_t *v_dist,
-                                           __m128i *u_first, __m128i *u_second,
-                                           __m128i *v_first,
-                                           __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 16 entries from chroma.
-    read_dist_16(u_dist, u_first, u_second);
-    read_dist_16(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    read_dist_8(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
-
-    read_dist_8(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
-  }
-}
-
-// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
-// int in dst.
-static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i shift_right = _mm_srli_si128(*src, 2);
-
-  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
-  const __m128i even = _mm_blend_epi16(*src, zero, 170);
-
-  *dst = _mm_add_epi32(even, odd);
-}
-
-// Add a row of luma distortion to 8 corresponding chroma mods.
-static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
-                                                 int ss_x, int ss_y,
-                                                 __m128i *u_mod,
-                                                 __m128i *v_mod) {
-  __m128i y_reg;
-  if (!ss_x) {
-    read_dist_8(y_dist, &y_reg);
-    if (ss_y == 1) {
-      __m128i y_tmp;
-      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
-
-      y_reg = _mm_adds_epu16(y_reg, y_tmp);
-    }
-  } else {
-    __m128i y_first, y_second;
-    read_dist_16(y_dist, &y_first, &y_second);
-    if (ss_y == 1) {
-      __m128i y_tmp_0, y_tmp_1;
-      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
-
-      y_first = _mm_adds_epu16(y_first, y_tmp_0);
-      y_second = _mm_adds_epu16(y_second, y_tmp_1);
-    }
-
-    hadd_epu16(&y_first, &y_first);
-    hadd_epu16(&y_second, &y_second);
-
-    y_reg = _mm_packus_epi32(y_first, y_second);
-  }
-
-  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
-  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
-}
-
-// Apply temporal filter to the luma components. This performs temporal
-// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
-// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_luma_16(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
-    const uint16_t *v_dist, const int16_t *const *neighbors_first,
-    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0);
-  assert(strength <= 6);
-
-  assert(block_width == 16);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
-  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                          &v_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
-                              &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      sum_row_first =
-          average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-      sum_row_second =
-          average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-    } else {
-      average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-                 strength, rounding, weight);
-    }
-    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                            y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                            &v_second);
-  }
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void apply_temporal_filter_luma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
-    const uint16_t *u_dist, const uint16_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors_first;
-  const int16_t *const *neighbors_second;
-
-  if (block_width == 16) {
-    // Special Case: The blockwidth is 16 and we are operating on a row of 16
-    // chroma pixels. In this case, we can't use the usualy left-midle-right
-    // pattern. We also don't support splitting now.
-    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-    if (use_whole_blk) {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-          bottom_weight, NULL);
-    } else {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  // Right
-  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_chroma_8(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
-    const int16_t *const *neighbors, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul;
-
-  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
-  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
-
-  __m128i u_sum_row, v_sum_row;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Add chroma values
-  get_sum_8(u_dist, &u_sum_row_2);
-  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-
-  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
-
-  get_sum_8(v_dist, &v_sum_row_2);
-  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-
-  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1 = u_sum_row_2;
-    u_sum_row_2 = u_sum_row_3;
-
-    v_sum_row_1 = v_sum_row_2;
-    v_sum_row_2 = v_sum_row_3;
-
-    // Add chroma values
-    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
-
-    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
-
-    // Add luma values
-    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-      v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-    } else {
-      u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-      v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-    }
-
-    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Shift the rows up
-  u_sum_row_1 = u_sum_row_2;
-  u_sum_row_2 = u_sum_row_3;
-
-  v_sum_row_1 = v_sum_row_2;
-  v_sum_row_2 = v_sum_row_3;
-
-  // Add chroma values
-  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void apply_temporal_filter_chroma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    } else {
-      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          top_weight, bottom_weight, NULL);
-    } else {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-}
-
-static void apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-  const int *blk_fw_ptr = blk_fw;
-
-  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
-  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference sqaured
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
-      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                    y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                   u_dist_ptr + blk_col);
-      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                   v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
-                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
-                             block_width, block_height, ss_x, ss_y, strength,
-                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
-                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
-      y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-////////////////////////
-// Low bit-depth Ends //
-////////////////////////
-
-///////////////////////////
-// High bit-depth Begins //
-///////////////////////////
-
-// Compute (a-b)**2 for 8 pixels with size 16-bit
-static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
-                                       uint32_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi32(a_first, b_first);
-  dist_second = _mm_sub_epi32(a_second, b_second);
-  dist_first = _mm_mullo_epi32(dist_first, dist_first);
-  dist_second = _mm_mullo_epi32(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
-}
-
-// Sum up three neighboring distortions for the pixels
-static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
-
-  *sum = _mm_add_epi32(dist_reg, dist_left);
-  *sum = _mm_add_epi32(*sum, dist_right);
-}
-
-static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
-                                    __m128i *sum_second) {
-  highbd_get_sum_4(dist, sum_first);
-  highbd_get_sum_4(dist + 4, sum_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values, plus
-// however many values from y/uv plane are).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
-                                    const __m128i *mul_constants,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
-  const __m128i weight_u32 = _mm_set1_epi32(weight);
-  const __m128i sixteen = _mm_set1_epi32(16);
-  const __m128i zero = _mm_setzero_si128();
-
-  // modifier * 3 / index;
-  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
-  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
-  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
-  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
-
-  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
-  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
-  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
-  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
-
-  // Now we have
-  //   mul_lo: 00 a1 00 a0
-  //   mul_hi: 00 a3 00 a2
-  // Unpack as 64 bit words to get even and odd elements
-  //   unpack_lo: 00 a2 00 a0
-  //   unpack_hi: 00 a3 00 a1
-  // Then we can shift and OR the results to get everything in 32-bits
-  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
-  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
-
-  // Round
-  *output = _mm_add_epi32(mul, rounding_u32);
-  *output = _mm_srl_epi32(*output, strength_u128);
-
-  // Multiply with the weight
-  *output = _mm_min_epu32(*output, sixteen);
-  *output = _mm_sub_epi32(sixteen, *output);
-  *output = _mm_mullo_epi32(*output, weight_u32);
-}
-
-static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
-                                    const __m128i *sum_0_u32,
-                                    const __m128i *sum_1_u32,
-                                    const __m128i *mul_constants_0,
-                                    const __m128i *mul_constants_1,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
-                   weight);
-  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
-                   weight);
-}
-
-// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
-                                                 const __m128i sum_second_u32,
-                                                 const uint16_t *pred,
-                                                 uint16_t *count,
-                                                 uint32_t *accumulator) {
-  // Cast down to 16-bit ints
-  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
-                                      __m128i *reg_second) {
-  highbd_read_dist_4(dist, reg_first);
-  highbd_read_dist_4(dist + 4, reg_second);
-}
-
-static INLINE void highbd_read_chroma_dist_row_8(
-    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
-    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 8 entries from chroma.
-    highbd_read_dist_8(u_dist, u_first, u_second);
-    highbd_read_dist_8(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    highbd_read_dist_4(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
-
-    highbd_read_dist_4(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
-  }
-}
-
-static void highbd_apply_temporal_filter_luma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
-    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
-    const uint32_t *const *neighbors_second, int top_weight,
-    int bottom_weight) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(block_width == 8);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
-  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
-  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
-  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                &v_first, &v_second);
-
-  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      weight = bottom_weight;
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                    &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-    // Get modifier and store result
-    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                     &sum_row_second, &mul_first, &mul_second, strength,
-                     rounding, weight);
-    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                  y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                  &v_first, &v_second);
-  }
-
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void highbd_apply_temporal_filter_luma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
-    const uint32_t *u_dist, const uint32_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_first;
-  const uint32_t *const *neighbors_second;
-
-  // Left
-  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  // Right
-  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-}
-
-// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
-// subsampling in x direction, then we have 16 lumas, else we have 8.
-static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
-    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
-    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
-  __m128i y_reg_fst, y_reg_snd;
-  if (!ss_x) {
-    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
-      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
-    }
-  } else {
-    // Temporary
-    __m128i y_fst, y_snd;
-
-    // First 8
-    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
-
-    // Second 8
-    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
-  }
-
-  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
-  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
-  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
-  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void highbd_apply_temporal_filter_chroma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
-    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
-    int top_weight, int bottom_weight, const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_fst, mul_snd;
-
-  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
-  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
-  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
-  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
-
-  __m128i u_sum_row_fst, v_sum_row_fst;
-  __m128i u_sum_row_snd, v_sum_row_snd;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Add chroma values
-  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
-  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
-
-  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
-  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1_fst = u_sum_row_2_fst;
-    u_sum_row_2_fst = u_sum_row_3_fst;
-    u_sum_row_1_snd = u_sum_row_2_snd;
-    u_sum_row_2_snd = u_sum_row_3_snd;
-
-    v_sum_row_1_fst = v_sum_row_2_fst;
-    v_sum_row_2_fst = v_sum_row_3_fst;
-    v_sum_row_1_snd = v_sum_row_2_snd;
-    v_sum_row_2_snd = v_sum_row_3_snd;
-
-    // Add chroma values
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
-
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
-
-    // Add luma values
-    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                         &u_sum_row_snd, &v_sum_row_fst,
-                                         &v_sum_row_snd);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-    } else {
-      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-    }
-
-    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                  u_accum);
-    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                  v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Shift the rows up
-  u_sum_row_1_fst = u_sum_row_2_fst;
-  u_sum_row_2_fst = u_sum_row_3_fst;
-  u_sum_row_1_snd = u_sum_row_2_snd;
-  u_sum_row_2_snd = u_sum_row_3_snd;
-
-  v_sum_row_1_fst = v_sum_row_2_fst;
-  v_sum_row_2_fst = v_sum_row_3_fst;
-  v_sum_row_1_snd = v_sum_row_2_snd;
-  v_sum_row_2_snd = v_sum_row_3_snd;
-
-  // Add chroma values
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void highbd_apply_temporal_filter_chroma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_fst;
-  const uint32_t *const *neighbors_snd;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    } else {
-      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-    } else {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-}
-
-static void highbd_apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-
-  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference squared
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
-      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                          y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                          u_dist_ptr + blk_col);
-      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                          v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  highbd_apply_temporal_filter_luma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
-      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  highbd_apply_temporal_filter_chroma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
-      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-/////////////////////////
-// High bit-depth Ends //
-/////////////////////////
-
-void av1_apply_temporal_filter_yuv_sse4_1(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  // TODO(any): Need to support when `num_planes != 3`, like C implementation.
-  assert(num_planes == 3);
-  (void)num_planes;
-  if (is_high_bitdepth) {
-    highbd_apply_temporal_filter_yuv(
-        ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock,
-        subblock_filter_weights, pred, accum, count);
-  } else {
-    apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col,
-                              strength, use_subblock, subblock_filter_weights,
-                              pred, accum, count);
-  }
-}
diff --git a/media/libaom/src/av1/ratectrl_qmode.cc b/media/libaom/src/av1/ratectrl_qmode.cc
new file mode 100644
index 0000000000..6ae3c3e1eb
--- /dev/null
+++ b/media/libaom/src/av1/ratectrl_qmode.cc
@@ -0,0 +1,1081 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/ratectrl_qmode.h"
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <numeric>
+#include <vector>
+
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/tpl_model.h"
+
+namespace aom {
+
+// This is used before division to ensure that the divisor isn't zero or
+// too close to zero.
+static double ModifyDivisor(double divisor) {
+  const double kEpsilon = 0.0000001;
+  return (divisor < 0 ? std::min(divisor, -kEpsilon)
+                      : std::max(divisor, kEpsilon));
+}
+
+GopFrame GopFrameInvalid() {
+  GopFrame gop_frame = {};
+  gop_frame.is_valid = false;
+  gop_frame.coding_idx = -1;
+  gop_frame.order_idx = -1;
+  return gop_frame;
+}
+
+void SetGopFrameByType(GopFrameType gop_frame_type, GopFrame *gop_frame) {
+  switch (gop_frame_type) {
+    case GopFrameType::kRegularKey:
+      gop_frame->is_key_frame = 1;
+      gop_frame->is_arf_frame = 0;
+      gop_frame->is_show_frame = 1;
+      gop_frame->is_golden_frame = 1;
+      gop_frame->encode_ref_mode = EncodeRefMode::kRegular;
+      break;
+    case GopFrameType::kRegularArf:
+      gop_frame->is_key_frame = 0;
+      gop_frame->is_arf_frame = 1;
+      gop_frame->is_show_frame = 0;
+      gop_frame->is_golden_frame = 1;
+      gop_frame->encode_ref_mode = EncodeRefMode::kRegular;
+      break;
+    case GopFrameType::kIntermediateArf:
+      gop_frame->is_key_frame = 0;
+      gop_frame->is_arf_frame = 1;
+      gop_frame->is_show_frame = 0;
+      gop_frame->is_golden_frame = 0;
+      gop_frame->encode_ref_mode = EncodeRefMode::kRegular;
+      break;
+    case GopFrameType::kRegularLeaf:
+      gop_frame->is_key_frame = 0;
+      gop_frame->is_arf_frame = 0;
+      gop_frame->is_show_frame = 1;
+      gop_frame->is_golden_frame = 0;
+      gop_frame->encode_ref_mode = EncodeRefMode::kRegular;
+      break;
+    case GopFrameType::kShowExisting:
+      gop_frame->is_key_frame = 0;
+      gop_frame->is_arf_frame = 0;
+      gop_frame->is_show_frame = 1;
+      gop_frame->is_golden_frame = 0;
+      gop_frame->encode_ref_mode = EncodeRefMode::kShowExisting;
+      break;
+    case GopFrameType::kOverlay:
+      gop_frame->is_key_frame = 0;
+      gop_frame->is_arf_frame = 0;
+      gop_frame->is_show_frame = 1;
+      gop_frame->is_golden_frame = 0;
+      gop_frame->encode_ref_mode = EncodeRefMode::kOverlay;
+      break;
+  }
+}
+
+GopFrame GopFrameBasic(int global_coding_idx_offset,
+                       int global_order_idx_offset, int coding_idx,
+                       int order_idx, int depth, GopFrameType gop_frame_type) {
+  GopFrame gop_frame = {};
+  gop_frame.is_valid = true;
+  gop_frame.coding_idx = coding_idx;
+  gop_frame.order_idx = order_idx;
+  gop_frame.global_coding_idx = global_coding_idx_offset + coding_idx;
+  gop_frame.global_order_idx = global_order_idx_offset + order_idx;
+  SetGopFrameByType(gop_frame_type, &gop_frame);
+  gop_frame.colocated_ref_idx = -1;
+  gop_frame.update_ref_idx = -1;
+  gop_frame.layer_depth = depth + kLayerDepthOffset;
+  return gop_frame;
+}
+
+// This function create gop frames with indices of display order from
+// order_start to order_end - 1. The function will recursively introduce
+// intermediate ARF untill maximum depth is met or the number of regular frames
+// in between two ARFs are less than 3. Than the regular frames will be added
+// into the gop_struct.
+void ConstructGopMultiLayer(GopStruct *gop_struct,
+                            RefFrameManager *ref_frame_manager, int max_depth,
+                            int depth, int order_start, int order_end) {
+  int coding_idx = static_cast<int>(gop_struct->gop_frame_list.size());
+  GopFrame gop_frame;
+  int num_frames = order_end - order_start;
+  const int global_coding_idx_offset = gop_struct->global_coding_idx_offset;
+  const int global_order_idx_offset = gop_struct->global_order_idx_offset;
+  // If there are less than kMinIntervalToAddArf frames, stop introducing ARF
+  if (depth < max_depth && num_frames >= kMinIntervalToAddArf) {
+    int order_mid = (order_start + order_end) / 2;
+    // intermediate ARF
+    gop_frame = GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
+                              coding_idx, order_mid, depth,
+                              GopFrameType::kIntermediateArf);
+    ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+    gop_struct->gop_frame_list.push_back(gop_frame);
+    ConstructGopMultiLayer(gop_struct, ref_frame_manager, max_depth, depth + 1,
+                           order_start, order_mid);
+    // show existing intermediate ARF
+    gop_frame = GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
+                              coding_idx, order_mid, max_depth,
+                              GopFrameType::kShowExisting);
+    ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+    gop_struct->gop_frame_list.push_back(gop_frame);
+    ConstructGopMultiLayer(gop_struct, ref_frame_manager, max_depth, depth + 1,
+                           order_mid + 1, order_end);
+  } else {
+    // regular frame
+    for (int i = order_start; i < order_end; ++i) {
+      coding_idx = static_cast<int>(gop_struct->gop_frame_list.size());
+      gop_frame =
+          GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
+                        coding_idx, i, max_depth, GopFrameType::kRegularLeaf);
+      ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+      gop_struct->gop_frame_list.push_back(gop_frame);
+    }
+  }
+}
+
+GopStruct ConstructGop(RefFrameManager *ref_frame_manager, int show_frame_count,
+                       bool has_key_frame, int global_coding_idx_offset,
+                       int global_order_idx_offset) {
+  GopStruct gop_struct;
+  gop_struct.show_frame_count = show_frame_count;
+  gop_struct.global_coding_idx_offset = global_coding_idx_offset;
+  gop_struct.global_order_idx_offset = global_order_idx_offset;
+  int order_start = 0;
+  int order_arf = show_frame_count - 1;
+  int coding_idx;
+  GopFrame gop_frame;
+  if (has_key_frame) {
+    const int key_frame_depth = -1;
+    ref_frame_manager->Reset();
+    coding_idx = static_cast<int>(gop_struct.gop_frame_list.size());
+    gop_frame = GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
+                              coding_idx, order_start, key_frame_depth,
+                              GopFrameType::kRegularKey);
+    ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+    gop_struct.gop_frame_list.push_back(gop_frame);
+    order_start++;
+  }
+  // ARF
+  const int arf_depth = 0;
+  coding_idx = static_cast<int>(gop_struct.gop_frame_list.size());
+  gop_frame = GopFrameBasic(global_coding_idx_offset, global_order_idx_offset,
+                            coding_idx, order_arf, arf_depth,
+                            GopFrameType::kRegularArf);
+  ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+  gop_struct.gop_frame_list.push_back(gop_frame);
+  ConstructGopMultiLayer(&gop_struct, ref_frame_manager,
+                         ref_frame_manager->ForwardMaxSize(), arf_depth + 1,
+                         order_start, order_arf);
+  // Overlay
+  coding_idx = static_cast<int>(gop_struct.gop_frame_list.size());
+  gop_frame = GopFrameBasic(
+      global_coding_idx_offset, global_order_idx_offset, coding_idx, order_arf,
+      ref_frame_manager->ForwardMaxSize(), GopFrameType::kOverlay);
+  ref_frame_manager->UpdateRefFrameTable(&gop_frame);
+  gop_struct.gop_frame_list.push_back(gop_frame);
+  return gop_struct;
+}
+
+void AV1RateControlQMode::SetRcParam(const RateControlParam &rc_param) {
+  rc_param_ = rc_param;
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on number of frames in this key-frame group so
+// far.
+static double GetSecondRefUsageThreshold(int frame_count_so_far) {
+  const int adapt_upto = 32;
+  const double min_second_ref_usage_thresh = 0.085;
+  const double second_ref_usage_thresh_max_delta = 0.035;
+  if (frame_count_so_far >= adapt_upto) {
+    return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+  }
+  return min_second_ref_usage_thresh +
+         ((double)frame_count_so_far / (adapt_upto - 1)) *
+             second_ref_usage_thresh_max_delta;
+}
+
+// Slide show transition detection.
+// Tests for case where there is very low error either side of the current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static bool DetectSlideTransition(const FIRSTPASS_STATS &this_frame,
+                                  const FIRSTPASS_STATS &last_frame,
+                                  const FIRSTPASS_STATS &next_frame) {
+  // Intra / Inter threshold very low
+  constexpr double kVeryLowII = 1.5;
+  // Clean slide transitions we expect a sharp single frame spike in error.
+  constexpr double kErrorSpike = 5.0;
+
+  // TODO(angiebird): Understand the meaning of these conditions.
+  return (this_frame.intra_error < (this_frame.coded_error * kVeryLowII)) &&
+         (this_frame.coded_error > (last_frame.coded_error * kErrorSpike)) &&
+         (this_frame.coded_error > (next_frame.coded_error * kErrorSpike));
+}
+
+// Check if there is a significant intra/inter error change between the current
+// frame and its neighbor. If so, we should further test whether the current
+// frame should be a key frame.
+static bool DetectIntraInterErrorChange(const FIRSTPASS_STATS &this_stats,
+                                        const FIRSTPASS_STATS &last_stats,
+                                        const FIRSTPASS_STATS &next_stats) {
+  // Minimum % intra coding observed in first pass (1.0 = 100%)
+  constexpr double kMinIntraLevel = 0.25;
+  // Minimum ratio between the % of intra coding and inter coding in the first
+  // pass after discounting neutral blocks (discounting neutral blocks in this
+  // way helps catch scene cuts in clips with very flat areas or letter box
+  // format clips with image padding.
+  constexpr double kIntraVsInterRatio = 2.0;
+
+  const double modified_pcnt_inter =
+      this_stats.pcnt_inter - this_stats.pcnt_neutral;
+  const double pcnt_intra_min =
+      std::max(kMinIntraLevel, kIntraVsInterRatio * modified_pcnt_inter);
+
+  // In real scene cuts there is almost always a sharp change in the intra
+  // or inter error score.
+  constexpr double kErrorChangeThreshold = 0.4;
+  const double last_this_error_ratio =
+      fabs(last_stats.coded_error - this_stats.coded_error) /
+      ModifyDivisor(this_stats.coded_error);
+
+  const double this_next_error_ratio =
+      fabs(last_stats.intra_error - this_stats.intra_error) /
+      ModifyDivisor(this_stats.intra_error);
+
+  // Maximum threshold for the relative ratio of intra error score vs best
+  // inter error score.
+  constexpr double kThisIntraCodedErrorRatioMax = 1.9;
+  const double this_intra_coded_error_ratio =
+      this_stats.intra_error / ModifyDivisor(this_stats.coded_error);
+
+  // For real scene cuts we expect an improvment in the intra inter error
+  // ratio in the next frame.
+  constexpr double kNextIntraCodedErrorRatioMin = 3.5;
+  const double next_intra_coded_error_ratio =
+      next_stats.intra_error / ModifyDivisor(next_stats.coded_error);
+
+  double pcnt_intra = 1.0 - this_stats.pcnt_inter;
+  return pcnt_intra > pcnt_intra_min &&
+         this_intra_coded_error_ratio < kThisIntraCodedErrorRatioMax &&
+         (last_this_error_ratio > kErrorChangeThreshold ||
+          this_next_error_ratio > kErrorChangeThreshold ||
+          next_intra_coded_error_ratio > kNextIntraCodedErrorRatioMin);
+}
+
+// Check whether the candidate can be a key frame.
+// This is a rewrite of test_candidate_kf().
+static bool TestCandidateKey(const FirstpassInfo &first_pass_info,
+                             int candidate_key_idx, int frames_since_prev_key) {
+  const auto &stats_list = first_pass_info.stats_list;
+  const int stats_count = static_cast<int>(stats_list.size());
+  if (candidate_key_idx + 1 >= stats_count || candidate_key_idx - 1 < 0) {
+    return false;
+  }
+  const auto &last_stats = stats_list[candidate_key_idx - 1];
+  const auto &this_stats = stats_list[candidate_key_idx];
+  const auto &next_stats = stats_list[candidate_key_idx + 1];
+
+  if (frames_since_prev_key < 3) return false;
+  const double second_ref_usage_threshold =
+      GetSecondRefUsageThreshold(frames_since_prev_key);
+  if (this_stats.pcnt_second_ref >= second_ref_usage_threshold) return false;
+  if (next_stats.pcnt_second_ref >= second_ref_usage_threshold) return false;
+
+  // Hard threshold where the first pass chooses intra for almost all blocks.
+  // In such a case even if the frame is not a scene cut coding a key frame
+  // may be a good option.
+  constexpr double kVeryLowInterThreshold = 0.05;
+  if (this_stats.pcnt_inter < kVeryLowInterThreshold ||
+      DetectSlideTransition(this_stats, last_stats, next_stats) ||
+      DetectIntraInterErrorChange(this_stats, last_stats, next_stats)) {
+    double boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // We do "-1" because the candidate key is not counted.
+    int stats_after_this_stats = stats_count - candidate_key_idx - 1;
+
+    // Number of frames required to test for scene cut detection
+    constexpr int kSceneCutKeyTestIntervalMax = 16;
+
+    // Make sure we have enough stats after the candidate key.
+    const int frames_to_test_after_candidate_key =
+        std::min(kSceneCutKeyTestIntervalMax, stats_after_this_stats);
+
+    // Examine how well the key frame predicts subsequent frames.
+    int i;
+    for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
+      // Get the next frame details
+      const auto &stats = stats_list[candidate_key_idx + i];
+
+      // Cumulative effect of decay in prediction quality.
+      if (stats.pcnt_inter > 0.85) {
+        decay_accumulator *= stats.pcnt_inter;
+      } else {
+        decay_accumulator *= (0.85 + stats.pcnt_inter) / 2.0;
+      }
+
+      constexpr double kBoostFactor = 12.5;
+      double next_iiratio =
+          (kBoostFactor * stats.intra_error / ModifyDivisor(stats.coded_error));
+      next_iiratio = std::min(next_iiratio, 128.0);
+      double boost_score_increment = decay_accumulator * next_iiratio;
+
+      // Keep a running total.
+      boost_score += boost_score_increment;
+
+      // Test various breakout clauses.
+      // TODO(any): Test of intra error should be normalized to an MB.
+      // TODO(angiebird): Investigate the following questions.
+      // Question 1: next_iiratio (intra_error / coded_error) * kBoostFactor
+      // We know intra_error / coded_error >= 1 and kBoostFactor = 12.5,
+      // therefore, (intra_error / coded_error) * kBoostFactor will always
+      // greater than 1.5. Is "next_iiratio < 1.5" always false?
+      // Question 2: Similar to question 1, is "next_iiratio < 3.0" always true?
+      // Question 3: Why do we need to divide 200 with num_mbs_16x16?
+      if ((stats.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((stats.pcnt_inter - stats.pcnt_neutral) < 0.20) &&
+           (next_iiratio < 3.0)) ||
+          (boost_score_increment < 3.0) ||
+          (stats.intra_error <
+           (200.0 / static_cast<double>(first_pass_info.num_mbs_16x16)))) {
+        break;
+      }
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out else discard this potential key frame and move on
+    const int count_for_tolerable_prediction = 3;
+    if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Compute key frame location from first_pass_info.
+std::vector<int> GetKeyFrameList(const FirstpassInfo &first_pass_info) {
+  std::vector<int> key_frame_list;
+  key_frame_list.push_back(0);  // The first frame is always a key frame
+  int candidate_key_idx = 1;
+  while (candidate_key_idx <
+         static_cast<int>(first_pass_info.stats_list.size())) {
+    const int frames_since_prev_key = candidate_key_idx - key_frame_list.back();
+    // Check for a scene cut.
+    const bool scenecut_detected = TestCandidateKey(
+        first_pass_info, candidate_key_idx, frames_since_prev_key);
+    if (scenecut_detected) {
+      key_frame_list.push_back(candidate_key_idx);
+    }
+    ++candidate_key_idx;
+  }
+  return key_frame_list;
+}
+
+// initialize GF_GROUP_STATS
+static void InitGFStats(GF_GROUP_STATS *gf_stats) {
+  gf_stats->gf_group_err = 0.0;
+  gf_stats->gf_group_raw_error = 0.0;
+  gf_stats->gf_group_skip_pct = 0.0;
+  gf_stats->gf_group_inactive_zone_rows = 0.0;
+
+  gf_stats->mv_ratio_accumulator = 0.0;
+  gf_stats->decay_accumulator = 1.0;
+  gf_stats->zero_motion_accumulator = 1.0;
+  gf_stats->loop_decay_rate = 1.0;
+  gf_stats->last_loop_decay_rate = 1.0;
+  gf_stats->this_frame_mv_in_out = 0.0;
+  gf_stats->mv_in_out_accumulator = 0.0;
+  gf_stats->abs_mv_in_out_accumulator = 0.0;
+
+  gf_stats->avg_sr_coded_error = 0.0;
+  gf_stats->avg_pcnt_second_ref = 0.0;
+  gf_stats->avg_new_mv_count = 0.0;
+  gf_stats->avg_wavelet_energy = 0.0;
+  gf_stats->avg_raw_err_stdev = 0.0;
+  gf_stats->non_zero_stdev_count = 0;
+}
+
+static int FindRegionIndex(const std::vector<REGIONS> &regions, int frame_idx) {
+  for (int k = 0; k < static_cast<int>(regions.size()); k++) {
+    if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) {
+      return k;
+    }
+  }
+  return -1;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static bool DetectFlash(const std::vector<FIRSTPASS_STATS> &stats_list,
+                        int index) {
+  int next_index = index + 1;
+  if (next_index >= static_cast<int>(stats_list.size())) return false;
+  const FIRSTPASS_STATS &next_frame = stats_list[next_index];
+
+  // What we are looking for here is a situation where there is a
+  // brief break in prediction (such as a flash) but subsequent frames
+  // are reasonably well predicted by an earlier (pre flash) frame.
+  // The recovery after a flash is indicated by a high pcnt_second_ref
+  // compared to pcnt_inter.
+  return next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+         next_frame.pcnt_second_ref >= 0.5;
+}
+
+#define MIN_SHRINK_LEN 6
+
+// This function takes in a suggesting gop interval from cur_start to cur_last,
+// analyzes firstpass stats and region stats and then return a better gop cut
+// location.
+// TODO(b/231517281): Simplify the indices once we have an unit test.
+// We are using four indices here, order_index, cur_start, cur_last, and
+// frames_since_key. Ideally, only three indices are needed.
+// 1) start_index = order_index + cur_start
+// 2) end_index = order_index + cur_end
+// 3) key_index
+int FindBetterGopCut(const std::vector<FIRSTPASS_STATS> &stats_list,
+                     const std::vector<REGIONS> &regions_list,
+                     int min_gop_show_frame_count, int max_gop_show_frame_count,
+                     int order_index, int cur_start, int cur_last,
+                     int frames_since_key) {
+  // only try shrinking if interval smaller than active_max_gf_interval
+  if (cur_last - cur_start > max_gop_show_frame_count ||
+      cur_start >= cur_last) {
+    return cur_last;
+  }
+  int num_regions = static_cast<int>(regions_list.size());
+  int num_stats = static_cast<int>(stats_list.size());
+  const int min_shrink_int = std::max(MIN_SHRINK_LEN, min_gop_show_frame_count);
+
+  // find the region indices of where the first and last frame belong.
+  int k_start = FindRegionIndex(regions_list, cur_start + frames_since_key);
+  int k_last = FindRegionIndex(regions_list, cur_last + frames_since_key);
+  if (cur_start + frames_since_key == 0) k_start = 0;
+
+  int scenecut_idx = -1;
+  // See if we have a scenecut in between
+  for (int r = k_start + 1; r <= k_last; r++) {
+    if (regions_list[r].type == SCENECUT_REGION &&
+        regions_list[r].last - frames_since_key - cur_start >
+            min_gop_show_frame_count) {
+      scenecut_idx = r;
+      break;
+    }
+  }
+
+  // if the found scenecut is very close to the end, ignore it.
+  if (regions_list[num_regions - 1].last - regions_list[scenecut_idx].last <
+      4) {
+    scenecut_idx = -1;
+  }
+
+  if (scenecut_idx != -1) {
+    // If we have a scenecut, then stop at it.
+    // TODO(bohanli): add logic here to stop before the scenecut and for
+    // the next gop start from the scenecut with GF
+    int is_minor_sc =
+        (regions_list[scenecut_idx].avg_cor_coeff *
+             (1 - stats_list[order_index + regions_list[scenecut_idx].start -
+                             frames_since_key]
+                          .noise_var /
+                      regions_list[scenecut_idx].avg_intra_err) >
+         0.6);
+    cur_last =
+        regions_list[scenecut_idx].last - frames_since_key - !is_minor_sc;
+  } else {
+    int is_last_analysed =
+        (k_last == num_regions - 1) &&
+        (cur_last + frames_since_key == regions_list[k_last].last);
+    int not_enough_regions =
+        k_last - k_start <= 1 + (regions_list[k_start].type == SCENECUT_REGION);
+    // if we are very close to the end, then do not shrink since it may
+    // introduce intervals that are too short
+    if (!(is_last_analysed && not_enough_regions)) {
+      const double arf_length_factor = 0.1;
+      double best_score = 0;
+      int best_j = -1;
+      const int first_frame = regions_list[0].start - frames_since_key;
+      const int last_frame =
+          regions_list[num_regions - 1].last - frames_since_key;
+      // score of how much the arf helps the whole GOP
+      double base_score = 0.0;
+      // Accumulate base_score in
+      for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+        if (order_index + j >= num_stats) break;
+        base_score = (base_score + 1.0) * stats_list[order_index + j].cor_coeff;
+      }
+      int met_blending = 0;   // Whether we have met blending areas before
+      int last_blending = 0;  // Whether the previous frame if blending
+      for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+        if (order_index + j >= num_stats) break;
+        base_score = (base_score + 1.0) * stats_list[order_index + j].cor_coeff;
+        int this_reg = FindRegionIndex(regions_list, j + frames_since_key);
+        if (this_reg < 0) continue;
+        // A GOP should include at most 1 blending region.
+        if (regions_list[this_reg].type == BLENDING_REGION) {
+          last_blending = 1;
+          if (met_blending) {
+            break;
+          } else {
+            base_score = 0;
+            continue;
+          }
+        } else {
+          if (last_blending) met_blending = 1;
+          last_blending = 0;
+        }
+
+        // Add the factor of how good the neighborhood is for this
+        // candidate arf.
+        double this_score = arf_length_factor * base_score;
+        double temp_accu_coeff = 1.0;
+        // following frames
+        int count_f = 0;
+        for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+          if (order_index + n >= num_stats) break;
+          temp_accu_coeff *= stats_list[order_index + n].cor_coeff;
+          this_score +=
+              temp_accu_coeff *
+              (1 - stats_list[order_index + n].noise_var /
+                       AOMMAX(regions_list[this_reg].avg_intra_err, 0.001));
+          count_f++;
+        }
+        // preceding frames
+        temp_accu_coeff = 1.0;
+        for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+          if (order_index + n < num_stats) break;
+          temp_accu_coeff *= stats_list[order_index + n].cor_coeff;
+          this_score +=
+              temp_accu_coeff *
+              (1 - stats_list[order_index + n].noise_var /
+                       AOMMAX(regions_list[this_reg].avg_intra_err, 0.001));
+        }
+
+        if (this_score > best_score) {
+          best_score = this_score;
+          best_j = j;
+        }
+      }
+
+      // For blending areas, move one more frame in case we missed the
+      // first blending frame.
+      int best_reg = FindRegionIndex(regions_list, best_j + frames_since_key);
+      if (best_reg < num_regions - 1 && best_reg > 0) {
+        if (regions_list[best_reg - 1].type == BLENDING_REGION &&
+            regions_list[best_reg + 1].type == BLENDING_REGION) {
+          if (best_j + frames_since_key == regions_list[best_reg].start &&
+              best_j + frames_since_key < regions_list[best_reg].last) {
+            best_j += 1;
+          } else if (best_j + frames_since_key == regions_list[best_reg].last &&
+                     best_j + frames_since_key > regions_list[best_reg].start) {
+            best_j -= 1;
+          }
+        }
+      }
+
+      if (cur_last - best_j < 2) best_j = cur_last;
+      if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+      // if cannot find anything, just cut at the original place.
+    }
+  }
+
+  return cur_last;
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static bool DetectTransitionToStill(
+    const std::vector<FIRSTPASS_STATS> &stats_list, int next_stats_index,
+    int min_gop_show_frame_count, int frame_interval, int still_interval,
+    double loop_decay_rate, double last_decay_rate) {
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  if (frame_interval > min_gop_show_frame_count && loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
+    int stats_count = static_cast<int>(stats_list.size());
+    int stats_left = stats_count - next_stats_index;
+    if (stats_left >= still_interval) {
+      // Look ahead a few frames to see if static condition persists...
+      int j;
+      for (j = 0; j < still_interval; ++j) {
+        const FIRSTPASS_STATS &stats = stats_list[next_stats_index + j];
+        if (stats.pcnt_inter - stats.pcnt_motion < 0.999) break;
+      }
+      // Only if it does do we signal a transition to still.
+      return j == still_interval;
+    }
+  }
+  return false;
+}
+
+static int DetectGopCut(const std::vector<FIRSTPASS_STATS> &stats_list,
+                        int start_idx, int candidate_cut_idx, int next_key_idx,
+                        int flash_detected, int min_gop_show_frame_count,
+                        int max_gop_show_frame_count, int frame_width,
+                        int frame_height, const GF_GROUP_STATS &gf_stats) {
+  (void)max_gop_show_frame_count;
+  const int candidate_gop_size = candidate_cut_idx - start_idx;
+
+  if (!flash_detected) {
+    // Break clause to detect very still sections after motion. For example,
+    // a static image after a fade or other transition.
+    if (DetectTransitionToStill(stats_list, start_idx, min_gop_show_frame_count,
+                                candidate_gop_size, 5, gf_stats.loop_decay_rate,
+                                gf_stats.last_loop_decay_rate)) {
+      return 1;
+    }
+    const double arf_abs_zoom_thresh = 4.4;
+    // Motion breakout threshold for loop below depends on image size.
+    const double mv_ratio_accumulator_thresh =
+        (frame_height + frame_width) / 4.0;
+    // Some conditions to breakout after min interval.
+    if (candidate_gop_size >= min_gop_show_frame_count &&
+        // If possible don't break very close to a kf
+        (next_key_idx - candidate_cut_idx >= min_gop_show_frame_count) &&
+        (candidate_gop_size & 0x01) &&
+        (gf_stats.mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+         gf_stats.abs_mv_in_out_accumulator > arf_abs_zoom_thresh)) {
+      return 1;
+    }
+  }
+
+  // TODO(b/231489624): Check if we need this part.
+  // If almost totally static, we will not use the the max GF length later,
+  // so we can continue for more frames.
+  // if ((candidate_gop_size >= active_max_gf_interval + 1) &&
+  //     !is_almost_static(gf_stats->zero_motion_accumulator,
+  //                       twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
+  //   return 0;
+  // }
+  return 0;
+}
+
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group length of future frames in batch
+ *
+ * \param[in]    rc_param         Rate control parameters
+ * \param[in]    stats_list       List of first pass stats
+ * \param[in]    regions_list     List of regions from av1_identify_regions
+ * \param[in]    order_index      Index of current frame in stats_list
+ * \param[in]    frames_since_key Number of frames since the last key frame
+ * \param[in]    frames_to_key    Number of frames to the next key frame
+ *
+ * \return Returns a vector of decided GF group lengths.
+ */
+static std::vector<int> PartitionGopIntervals(
+    const RateControlParam &rc_param,
+    const std::vector<FIRSTPASS_STATS> &stats_list,
+    const std::vector<REGIONS> &regions_list, int order_index,
+    int frames_since_key, int frames_to_key) {
+  int i = (frames_since_key == 0) ? 1 : 0;
+  // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF.
+  int cur_start = 0;
+  // Each element is the last frame of the previous GOP. If there are n GOPs,
+  // you need n + 1 cuts to find the durations. So cut_pos starts out with -1,
+  // which is the last frame of the previous GOP.
+  std::vector<int> cut_pos(1, -1);
+  int cut_here = 0;
+  GF_GROUP_STATS gf_stats;
+  InitGFStats(&gf_stats);
+  int num_stats = static_cast<int>(stats_list.size());
+  while (i + order_index < num_stats) {
+    // reaches next key frame, break here
+    if (i >= frames_to_key) {
+      cut_here = 2;
+    } else if (i - cur_start >= rc_param.max_gop_show_frame_count) {
+      // reached maximum len, but nothing special yet (almost static)
+      // let's look at the next interval
+      cut_here = 1;
+    } else {
+      // Test for the case where there is a brief flash but the prediction
+      // quality back to an earlier frame is then restored.
+      const int gop_start_idx = cur_start + order_index;
+      const int candidate_gop_cut_idx = i + order_index;
+      const int next_key_idx = frames_to_key + order_index;
+      const bool flash_detected =
+          DetectFlash(stats_list, candidate_gop_cut_idx);
+
+      // TODO(bohanli): remove redundant accumulations here, or unify
+      // this and the ones in define_gf_group
+      const FIRSTPASS_STATS *stats = &stats_list[candidate_gop_cut_idx];
+      av1_accumulate_next_frame_stats(stats, flash_detected, frames_since_key,
+                                      i, &gf_stats, rc_param.frame_width,
+                                      rc_param.frame_height);
+
+      // TODO(angiebird): Can we simplify this part? Looks like we are going to
+      // change the gop cut index with FindBetterGopCut() anyway.
+      cut_here = DetectGopCut(
+          stats_list, gop_start_idx, candidate_gop_cut_idx, next_key_idx,
+          flash_detected, rc_param.min_gop_show_frame_count,
+          rc_param.max_gop_show_frame_count, rc_param.frame_width,
+          rc_param.frame_height, gf_stats);
+    }
+
+    if (!cut_here) {
+      ++i;
+      continue;
+    }
+    int original_last = i - 1;  // the current last frame in the gf group
+    int cur_last = FindBetterGopCut(
+        stats_list, regions_list, rc_param.min_gop_show_frame_count,
+        rc_param.max_gop_show_frame_count, order_index, cur_start,
+        original_last, frames_since_key);
+    // only try shrinking if interval smaller than active_max_gf_interval
+    cut_pos.push_back(cur_last);
+
+    // reset pointers to the shrunken location
+    cur_start = cur_last;
+    int cur_region_idx =
+        FindRegionIndex(regions_list, cur_start + 1 + frames_since_key);
+    if (cur_region_idx >= 0)
+      if (regions_list[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
+    // TODO(angiebird): Why do we need to break here?
+    if (cut_here > 1 && cur_last == original_last) break;
+    // reset accumulators
+    InitGFStats(&gf_stats);
+    i = cur_last + 1;
+  }
+  std::vector<int> gf_intervals;
+  // save intervals
+  for (size_t n = 1; n < cut_pos.size(); n++) {
+    gf_intervals.push_back(cut_pos[n] - cut_pos[n - 1]);
+  }
+
+  return gf_intervals;
+}
+
+// TODO(angiebird): Add unit test to this function.
+GopStructList AV1RateControlQMode::DetermineGopInfo(
+    const FirstpassInfo &firstpass_info) {
+  const int stats_size = static_cast<int>(firstpass_info.stats_list.size());
+  GopStructList gop_list;
+  RefFrameManager ref_frame_manager(rc_param_.max_ref_frames);
+  int global_coding_idx_offset = 0;
+  int global_order_idx_offset = 0;
+  std::vector<int> key_frame_list = GetKeyFrameList(firstpass_info);
+  key_frame_list.push_back(stats_size);  // a sentinel value
+  for (size_t ki = 0; ki + 1 < key_frame_list.size(); ++ki) {
+    int frames_to_key = key_frame_list[ki + 1] - key_frame_list[ki];
+    int key_order_index = key_frame_list[ki];  // The key frame's display order
+
+    std::vector<REGIONS> regions_list(MAX_FIRSTPASS_ANALYSIS_FRAMES);
+    // TODO(angiebird): Assume frames_to_key <= MAX_FIRSTPASS_ANALYSIS_FRAMES
+    // for now.
+    // Handle the situation that frames_to_key > MAX_FIRSTPASS_ANALYSIS_FRAMES
+    // here or refactor av1_identify_regions() to make it support
+    // frames_to_key > MAX_FIRSTPASS_ANALYSIS_FRAMES
+    assert(frames_to_key <= MAX_FIRSTPASS_ANALYSIS_FRAMES);
+    int total_regions = 0;
+    av1_identify_regions(firstpass_info.stats_list.data() + key_order_index,
+                         frames_to_key, 0, regions_list.data(), &total_regions);
+    regions_list.resize(total_regions);
+    std::vector<int> gf_intervals = PartitionGopIntervals(
+        rc_param_, firstpass_info.stats_list, regions_list, key_order_index,
+        /*frames_since_key=*/0, frames_to_key);
+    for (size_t gi = 0; gi < gf_intervals.size(); ++gi) {
+      const bool has_key_frame = gi == 0;
+      const int show_frame_count = gf_intervals[gi];
+      GopStruct gop =
+          ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
+                       global_coding_idx_offset, global_order_idx_offset);
+      assert(gop.show_frame_count == show_frame_count);
+      global_coding_idx_offset += static_cast<int>(gop.gop_frame_list.size());
+      global_order_idx_offset += gop.show_frame_count;
+      gop_list.push_back(gop);
+    }
+  }
+  return gop_list;
+}
+
+TplFrameDepStats CreateTplFrameDepStats(int frame_height, int frame_width,
+                                        int min_block_size) {
+  const int unit_rows =
+      frame_height / min_block_size + !!(frame_height % min_block_size);
+  const int unit_cols =
+      frame_width / min_block_size + !!(frame_width % min_block_size);
+  TplFrameDepStats frame_dep_stats;
+  frame_dep_stats.unit_size = min_block_size;
+  frame_dep_stats.unit_stats = std::vector<std::vector<TplUnitDepStats>>(
+      unit_rows, std::vector<TplUnitDepStats>(unit_cols));
+  return frame_dep_stats;
+}
+
+TplUnitDepStats TplBlockStatsToDepStats(const TplBlockStats &block_stats,
+                                        int unit_count) {
+  TplUnitDepStats dep_stats = {};
+  dep_stats.intra_cost = block_stats.intra_cost * 1.0 / unit_count;
+  dep_stats.inter_cost = block_stats.inter_cost * 1.0 / unit_count;
+  // In rare case, inter_cost may be greater than intra_cost.
+  // If so, we need to modify inter_cost such that inter_cost <= intra_cost
+  // because it is required by GetPropagationFraction()
+  dep_stats.inter_cost = std::min(dep_stats.intra_cost, dep_stats.inter_cost);
+  dep_stats.mv = block_stats.mv;
+  dep_stats.ref_frame_index = block_stats.ref_frame_index;
+  return dep_stats;
+}
+
+TplFrameDepStats CreateTplFrameDepStatsWithoutPropagation(
+    const TplFrameStats &frame_stats) {
+  const int min_block_size = frame_stats.min_block_size;
+  TplFrameDepStats frame_dep_stats = CreateTplFrameDepStats(
+      frame_stats.frame_height, frame_stats.frame_width, min_block_size);
+  for (const TplBlockStats &block_stats : frame_stats.block_stats_list) {
+    const int block_unit_rows = block_stats.height / min_block_size;
+    const int block_unit_cols = block_stats.width / min_block_size;
+    const int unit_count = block_unit_rows * block_unit_cols;
+    const int block_unit_row = block_stats.row / min_block_size;
+    const int block_unit_col = block_stats.col / min_block_size;
+    TplUnitDepStats unit_stats =
+        TplBlockStatsToDepStats(block_stats, unit_count);
+    for (int r = 0; r < block_unit_rows; r++) {
+      for (int c = 0; c < block_unit_cols; c++) {
+        frame_dep_stats.unit_stats[block_unit_row + r][block_unit_col + c] =
+            unit_stats;
+      }
+    }
+  }
+  return frame_dep_stats;
+}
+
+int GetRefCodingIdxList(const TplUnitDepStats &unit_dep_stats,
+                        const RefFrameTable &ref_frame_table,
+                        int *ref_coding_idx_list) {
+  int ref_frame_count = 0;
+  for (int i = 0; i < kBlockRefCount; ++i) {
+    ref_coding_idx_list[i] = -1;
+    int ref_frame_index = unit_dep_stats.ref_frame_index[i];
+    if (ref_frame_index != -1) {
+      ref_coding_idx_list[i] = ref_frame_table[ref_frame_index].coding_idx;
+      ref_frame_count++;
+    }
+  }
+  return ref_frame_count;
+}
+
+int GetBlockOverlapArea(int r0, int c0, int r1, int c1, int size) {
+  const int r_low = std::max(r0, r1);
+  const int r_high = std::min(r0 + size, r1 + size);
+  const int c_low = std::max(c0, c1);
+  const int c_high = std::min(c0 + size, c1 + size);
+  if (r_high >= r_low && c_high >= c_low) {
+    return (r_high - r_low) * (c_high - c_low);
+  }
+  return 0;
+}
+
+// TODO(angiebird): Merge TplFrameDepStatsAccumulateIntraCost and
+// TplFrameDepStatsAccumulate.
+double TplFrameDepStatsAccumulateIntraCost(
+    const TplFrameDepStats &frame_dep_stats) {
+  auto getIntraCost = [](double sum, const TplUnitDepStats &unit) {
+    return sum + unit.intra_cost;
+  };
+  double sum = 0;
+  for (const auto &row : frame_dep_stats.unit_stats) {
+    sum = std::accumulate(row.begin(), row.end(), sum, getIntraCost);
+  }
+  return sum;
+}
+
+double TplFrameDepStatsAccumulate(const TplFrameDepStats &frame_dep_stats) {
+  auto getOverallCost = [](double sum, const TplUnitDepStats &unit) {
+    return sum + unit.propagation_cost + unit.intra_cost;
+  };
+  double sum = 0;
+  for (const auto &row : frame_dep_stats.unit_stats) {
+    sum = std::accumulate(row.begin(), row.end(), sum, getOverallCost);
+  }
+  return sum;
+}
+
+// This is a generalization of GET_MV_RAWPEL that allows for an arbitrary
+// number of fractional bits.
+// TODO(angiebird): Add unit test to this function
+int GetFullpelValue(int subpel_value, int subpel_bits) {
+  const int subpel_scale = (1 << subpel_bits);
+  const int sign = subpel_value >= 0 ? 1 : -1;
+  int fullpel_value = (abs(subpel_value) + subpel_scale / 2) >> subpel_bits;
+  fullpel_value *= sign;
+  return fullpel_value;
+}
+
+double GetPropagationFraction(const TplUnitDepStats &unit_dep_stats) {
+  assert(unit_dep_stats.intra_cost >= unit_dep_stats.inter_cost);
+  return (unit_dep_stats.intra_cost - unit_dep_stats.inter_cost) /
+         ModifyDivisor(unit_dep_stats.intra_cost);
+}
+
+void TplFrameDepStatsPropagate(int coding_idx,
+                               const RefFrameTable &ref_frame_table,
+                               TplGopDepStats *tpl_gop_dep_stats) {
+  assert(!tpl_gop_dep_stats->frame_dep_stats_list.empty());
+  TplFrameDepStats *frame_dep_stats =
+      &tpl_gop_dep_stats->frame_dep_stats_list[coding_idx];
+
+  const int unit_size = frame_dep_stats->unit_size;
+  const int frame_unit_rows =
+      static_cast<int>(frame_dep_stats->unit_stats.size());
+  const int frame_unit_cols =
+      static_cast<int>(frame_dep_stats->unit_stats[0].size());
+  for (int unit_row = 0; unit_row < frame_unit_rows; ++unit_row) {
+    for (int unit_col = 0; unit_col < frame_unit_cols; ++unit_col) {
+      TplUnitDepStats &unit_dep_stats =
+          frame_dep_stats->unit_stats[unit_row][unit_col];
+      int ref_coding_idx_list[kBlockRefCount] = { -1, -1 };
+      int ref_frame_count = GetRefCodingIdxList(unit_dep_stats, ref_frame_table,
+                                                ref_coding_idx_list);
+      if (ref_frame_count == 0) continue;
+      for (int i = 0; i < kBlockRefCount; ++i) {
+        if (ref_coding_idx_list[i] == -1) continue;
+        TplFrameDepStats &ref_frame_dep_stats =
+            tpl_gop_dep_stats->frame_dep_stats_list[ref_coding_idx_list[i]];
+        const auto &mv = unit_dep_stats.mv[i];
+        const int mv_row = GetFullpelValue(mv.row, mv.subpel_bits);
+        const int mv_col = GetFullpelValue(mv.col, mv.subpel_bits);
+        const int ref_pixel_r = unit_row * unit_size + mv_row;
+        const int ref_pixel_c = unit_col * unit_size + mv_col;
+        const int ref_unit_row_low =
+            (unit_row * unit_size + mv_row) / unit_size;
+        const int ref_unit_col_low =
+            (unit_col * unit_size + mv_col) / unit_size;
+        for (int j = 0; j < 2; ++j) {
+          for (int k = 0; k < 2; ++k) {
+            const int ref_unit_row = ref_unit_row_low + j;
+            const int ref_unit_col = ref_unit_col_low + k;
+            if (ref_unit_row >= 0 && ref_unit_row < frame_unit_rows &&
+                ref_unit_col >= 0 && ref_unit_col < frame_unit_cols) {
+              const int overlap_area = GetBlockOverlapArea(
+                  ref_pixel_r, ref_pixel_c, ref_unit_row * unit_size,
+                  ref_unit_col * unit_size, unit_size);
+              const double overlap_ratio =
+                  overlap_area * 1.0 / (unit_size * unit_size);
+              const double propagation_fraction =
+                  GetPropagationFraction(unit_dep_stats);
+              const double propagation_ratio =
+                  1.0 / ref_frame_count * overlap_ratio * propagation_fraction;
+              TplUnitDepStats &ref_unit_stats =
+                  ref_frame_dep_stats.unit_stats[ref_unit_row][ref_unit_col];
+              ref_unit_stats.propagation_cost +=
+                  (unit_dep_stats.intra_cost +
+                   unit_dep_stats.propagation_cost) *
+                  propagation_ratio;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// TODO(angiebird): Add unit test for this function
+std::vector<RefFrameTable> GetRefFrameTableList(const GopStruct &gop_struct,
+                                                RefFrameTable ref_frame_table) {
+  const int frame_count = static_cast<int>(gop_struct.gop_frame_list.size());
+  std::vector<RefFrameTable> ref_frame_table_list;
+  ref_frame_table_list.push_back(ref_frame_table);
+  for (int coding_idx = 0; coding_idx < frame_count; coding_idx++) {
+    const auto &gop_frame = gop_struct.gop_frame_list[coding_idx];
+    if (gop_frame.update_ref_idx != -1) {
+      ref_frame_table[gop_frame.update_ref_idx] = gop_frame;
+    }
+    ref_frame_table_list.push_back(ref_frame_table);
+  }
+  return ref_frame_table_list;
+}
+
+TplGopDepStats ComputeTplGopDepStats(
+    const TplGopStats &tpl_gop_stats,
+    const std::vector<RefFrameTable> &ref_frame_table_list) {
+  const int frame_count = static_cast<int>(ref_frame_table_list.size());
+
+  // Create the struct to store TPL dependency stats
+  TplGopDepStats tpl_gop_dep_stats;
+  for (int coding_idx = 0; coding_idx < frame_count; coding_idx++) {
+    tpl_gop_dep_stats.frame_dep_stats_list.push_back(
+        CreateTplFrameDepStatsWithoutPropagation(
+            tpl_gop_stats.frame_stats_list[coding_idx]));
+  }
+
+  // Back propagation
+  for (int coding_idx = frame_count - 1; coding_idx >= 0; coding_idx--) {
+    auto &ref_frame_table = ref_frame_table_list[coding_idx];
+    // TODO(angiebird): Handle/test the case where reference frame
+    // is in the previous GOP
+    TplFrameDepStatsPropagate(coding_idx, ref_frame_table, &tpl_gop_dep_stats);
+  }
+  return tpl_gop_dep_stats;
+}
+
+static int GetRDMult(const GopFrame &gop_frame, int qindex) {
+  // TODO(angiebird):
+  // 1) Check if these rdmult rules are good in our use case.
+  // 2) Support high-bit-depth mode
+  if (gop_frame.is_golden_frame) {
+    // Assume ARF_UPDATE/GF_UPDATE share the same remult rule.
+    return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, GF_UPDATE, qindex);
+  } else if (gop_frame.is_key_frame) {
+    return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, KF_UPDATE, qindex);
+  } else {
+    // Assume LF_UPDATE/OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE/INTNL_ARF_UPDATE
+    // share the same remult rule.
+    return av1_compute_rd_mult_based_on_qindex(AOM_BITS_8, LF_UPDATE, qindex);
+  }
+}
+
+GopEncodeInfo AV1RateControlQMode::GetGopEncodeInfo(
+    const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+    const RefFrameTable &ref_frame_table_snapshot_init) {
+  const std::vector<RefFrameTable> ref_frame_table_list =
+      GetRefFrameTableList(gop_struct, ref_frame_table_snapshot_init);
+
+  GopEncodeInfo gop_encode_info;
+  gop_encode_info.final_snapshot = ref_frame_table_list.back();
+  TplGopDepStats gop_dep_stats =
+      ComputeTplGopDepStats(tpl_gop_stats, ref_frame_table_list);
+  const int frame_count =
+      static_cast<int>(tpl_gop_stats.frame_stats_list.size());
+  for (int i = 0; i < frame_count; i++) {
+    const TplFrameDepStats &frame_dep_stats =
+        gop_dep_stats.frame_dep_stats_list[i];
+    const double cost_without_propagation =
+        TplFrameDepStatsAccumulateIntraCost(frame_dep_stats);
+    const double cost_with_propagation =
+        TplFrameDepStatsAccumulate(frame_dep_stats);
+    const double frame_importance =
+        cost_with_propagation / cost_without_propagation;
+    // Imitate the behavior of av1_tpl_get_qstep_ratio()
+    const double qstep_ratio = sqrt(1 / frame_importance);
+    FrameEncodeParameters param;
+    param.q_index = av1_get_q_index_from_qstep_ratio(rc_param_.base_q_index,
+                                                     qstep_ratio, AOM_BITS_8);
+    const GopFrame &gop_frame = gop_struct.gop_frame_list[i];
+    param.rdmult = GetRDMult(gop_frame, param.q_index);
+    gop_encode_info.param_list.push_back(param);
+  }
+  return gop_encode_info;
+}
+
+}  // namespace aom
diff --git a/media/libaom/src/av1/ratectrl_qmode.h b/media/libaom/src/av1/ratectrl_qmode.h
new file mode 100644
index 0000000000..760216943e
--- /dev/null
+++ b/media/libaom/src/av1/ratectrl_qmode.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_RATECTRL_QMODE_H_
+#define AOM_AV1_RATECTRL_QMODE_H_
+
+#include <deque>
+#include <queue>
+#include <vector>
+#include "av1/encoder/firstpass.h"
+#include "av1/ratectrl_qmode_interface.h"
+#include "av1/reference_manager.h"
+
+namespace aom {
+
+constexpr int kLayerDepthOffset = 1;
+constexpr int kMinIntervalToAddArf = 3;
+constexpr int kMinArfInterval = (kMinIntervalToAddArf + 1) / 2;
+
+struct TplUnitDepStats {
+  double propagation_cost;
+  double intra_cost;
+  double inter_cost;
+  std::array<MotionVector, kBlockRefCount> mv;
+  std::array<int, kBlockRefCount> ref_frame_index;
+};
+
+struct TplFrameDepStats {
+  int unit_size;  // equivalent to min_block_size
+  std::vector<std::vector<TplUnitDepStats>> unit_stats;
+};
+
+struct TplGopDepStats {
+  std::vector<TplFrameDepStats> frame_dep_stats_list;
+};
+
+GopFrame GopFrameInvalid();
+
+// gop frame type used for facilitate setting up GopFrame
+// TODO(angiebird): Define names for forward key frame and
+// key frame with overlay
+enum class GopFrameType {
+  kRegularKey,  // High quality key frame without overlay
+  kRegularArf,  // High quality arf with strong filtering followed by an overlay
+                // later
+  kIntermediateArf,  // Good quality arf with weak or no filtering followed by a
+                     // show_existing later
+  kRegularLeaf,      // Regular leaf frame
+  kShowExisting,     // Show_existing frame
+  kOverlay           // Overlay frame
+};
+
+// Set up is_key_frame, is_arf_frame, is_show_frame, is_golden_frame and
+// encode_ref_mode in GopFrame based on gop_frame_type
+void SetGopFrameByType(GopFrameType gop_frame_type, GopFrame *gop_frame);
+
+GopFrame GopFrameBasic(int global_coding_idx_offset,
+                       int global_order_idx_offset, int coding_idx,
+                       int order_idx, int depth, GopFrameType gop_frame_type);
+
+GopStruct ConstructGop(RefFrameManager *ref_frame_manager, int show_frame_count,
+                       bool has_key_frame, int global_coding_idx_offset,
+                       int global_order_idx_offset);
+
+TplFrameDepStats CreateTplFrameDepStats(int frame_height, int frame_width,
+                                        int min_block_size);
+
+TplUnitDepStats TplBlockStatsToDepStats(const TplBlockStats &block_stats,
+                                        int unit_count);
+
+TplFrameDepStats CreateTplFrameDepStatsWithoutPropagation(
+    const TplFrameStats &frame_stats);
+
+std::vector<int> GetKeyFrameList(const FirstpassInfo &first_pass_info);
+
+double TplFrameDepStatsAccumulateIntraCost(
+    const TplFrameDepStats &frame_dep_stats);
+
+double TplFrameDepStatsAccumulate(const TplFrameDepStats &frame_dep_stats);
+
+void TplFrameDepStatsPropagate(int coding_idx,
+                               const RefFrameTable &ref_frame_table,
+                               TplGopDepStats *tpl_gop_dep_stats);
+
+int GetBlockOverlapArea(int r0, int c0, int r1, int c1, int size);
+
+TplGopDepStats ComputeTplGopDepStats(
+    const TplGopStats &tpl_gop_stats,
+    const std::vector<RefFrameTable> &ref_frame_table_list);
+
+class AV1RateControlQMode : public AV1RateControlQModeInterface {
+ public:
+  void SetRcParam(const RateControlParam &rc_param) override;
+  GopStructList DetermineGopInfo(const FirstpassInfo &firstpass_info) override;
+  GopEncodeInfo GetGopEncodeInfo(
+      const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+      const RefFrameTable &ref_frame_table_snapshot) override;
+
+ private:
+  RateControlParam rc_param_;
+};
+}  // namespace aom
+
+#endif  // AOM_AV1_RATECTRL_QMODE_H_
diff --git a/media/libaom/src/av1/common/cdef_block_avx2.c b/media/libaom/src/av1/ratectrl_qmode_interface.cc
index e2b85b3e28..eb29e43303 100644
--- a/media/libaom/src/av1/common/cdef_block_avx2.c
+++ b/media/libaom/src/av1/ratectrl_qmode_interface.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,6 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_avx2
-#include "av1/common/cdef_block_simd.h"
+#include "av1/ratectrl_qmode_interface.h"
+
+namespace aom {
+
+AV1RateControlQModeInterface::AV1RateControlQModeInterface() = default;
+AV1RateControlQModeInterface::~AV1RateControlQModeInterface() = default;
+
+}  // namespace aom
diff --git a/media/libaom/src/av1/ratectrl_qmode_interface.h b/media/libaom/src/av1/ratectrl_qmode_interface.h
new file mode 100644
index 0000000000..5fa8492410
--- /dev/null
+++ b/media/libaom/src/av1/ratectrl_qmode_interface.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
+#define AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
+
+#include <array>
+#include <vector>
+
+#include "av1/encoder/firstpass.h"
+
+namespace aom {
+
+constexpr int kBlockRefCount = 2;
+constexpr int kRefFrameTableSize = 7;
+
+struct MotionVector {
+  int row;          // subpel row
+  int col;          // subpel col
+  int subpel_bits;  // number of fractional bits used by row/col
+};
+
+struct RateControlParam {
+  int max_gop_show_frame_count;
+  int min_gop_show_frame_count;
+  int max_ref_frames;
+  int base_q_index;
+  int frame_width;
+  int frame_height;
+};
+
+struct TplBlockStats {
+  int height;  // pixel height
+  int width;   // pixel width
+  int row;     // pixel row of the top left corner
+  int col;     // pixel col of the top lef corner
+  int64_t intra_cost;
+  int64_t inter_cost;
+  std::array<MotionVector, kBlockRefCount> mv;
+  std::array<int, kBlockRefCount> ref_frame_index;
+};
+
+enum class EncodeRefMode {
+  kRegular,
+  kOverlay,
+  kShowExisting,
+};
+
+enum class ReferenceName {
+  kNoneFrame = -1,
+  kIntraFrame = 0,
+  kLastFrame = 1,
+  kLast2Frame = 2,
+  kLast3Frame = 3,
+  kGoldenFrame = 4,
+  kBwdrefFrame = 5,
+  kAltref2Frame = 6,
+  kAltrefFrame = 7,
+};
+
+struct ReferenceFrame {
+  int index;  // Index of reference slot containing the reference frame
+  ReferenceName name;
+};
+
+struct GopFrame {
+  // basic info
+  bool is_valid;
+  int order_idx;   // Index in display order in a GOP
+  int coding_idx;  // Index in coding order in a GOP
+
+  int global_order_idx;   // Index in display order in the whole video chunk
+  int global_coding_idx;  // Index in coding order in the whole video chunk
+
+  bool is_key_frame;     // If this is key frame, reset reference buffers are
+                         // required
+  bool is_arf_frame;     // Is this a forward frame, a frame with order_idx
+                         // higher than the current display order
+  bool is_show_frame;    // Is this frame a show frame after coding
+  bool is_golden_frame;  // Is this a high quality frame
+
+  // reference frame info
+  EncodeRefMode encode_ref_mode;
+  int colocated_ref_idx;  // colocated_ref_idx == -1 when encode_ref_mode ==
+                          // EncodeRefMode::kRegular
+  int update_ref_idx;     // The reference index that this frame should be
+                          // updated to. update_ref_idx == -1 when this frame
+                          // will not serve as a reference frame
+  std::vector<ReferenceFrame>
+      ref_frame_list;  // A list of available reference frames in priority order
+                       // for the current to-be-coded frame. The list size
+                       // should be less or equal to kRefFrameTableSize. The
+                       // reference frames with smaller indices are more likely
+                       // to be a good reference frame. Therefore, they should
+                       // be prioritized when the reference frame count is
+                       // limited. For example, if we plan to use 3 reference
+                       // frames, we should choose ref_frame_list[0],
+                       // ref_frame_list[1] and ref_frame_list[2].
+  int layer_depth;     // Layer depth in the GOP structure
+  ReferenceFrame primary_ref_frame;  // We will use the primary reference frame
+                                     // to update current frame's initial
+                                     // probability model
+};
+
+struct GopStruct {
+  int show_frame_count;
+  int global_coding_idx_offset;
+  int global_order_idx_offset;
+  std::vector<GopFrame> gop_frame_list;
+};
+
+using GopStructList = std::vector<GopStruct>;
+
+struct FrameEncodeParameters {
+  int q_index;
+  int rdmult;
+};
+
+struct FirstpassInfo {
+  int num_mbs_16x16;  // Count of 16x16 unit blocks in each frame.
+                      // FIRSTPASS_STATS's unit block size is 16x16
+  std::vector<FIRSTPASS_STATS> stats_list;
+};
+
+using RefFrameTable = std::array<GopFrame, kRefFrameTableSize>;
+
+struct GopEncodeInfo {
+  std::vector<FrameEncodeParameters> param_list;
+  RefFrameTable final_snapshot;  // RefFrameTable snapshot after coding this GOP
+};
+
+struct TplFrameStats {
+  int min_block_size;
+  int frame_width;
+  int frame_height;
+  std::vector<TplBlockStats> block_stats_list;
+};
+
+struct TplGopStats {
+  std::vector<TplFrameStats> frame_stats_list;
+};
+
+class AV1RateControlQModeInterface {
+ public:
+  AV1RateControlQModeInterface();
+  virtual ~AV1RateControlQModeInterface();
+
+  virtual void SetRcParam(const RateControlParam &rc_param) = 0;
+  virtual GopStructList DetermineGopInfo(
+      const FirstpassInfo &firstpass_info) = 0;
+  // Accept firstpass and tpl info from the encoder and return q index and
+  // rdmult. This needs to be called with consecutive GOPs as returned by
+  // DetermineGopInfo.
+  virtual GopEncodeInfo GetGopEncodeInfo(
+      const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+      const RefFrameTable &ref_frame_table_snapshot_init) = 0;
+};  // class AV1RateCtrlQMode
+}  // namespace aom
+
+#endif  // AOM_AV1_RATECTRL_QMODE_INTERFACE_H_
diff --git a/media/libaom/src/av1/ratectrl_rtc.cc b/media/libaom/src/av1/ratectrl_rtc.cc
new file mode 100644
index 0000000000..f1af797352
--- /dev/null
+++ b/media/libaom/src/av1/ratectrl_rtc.cc
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/ratectrl_rtc.h"
+
+#include <memory>
+#include <new>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/svc_layercontext.h"
+
+namespace aom {
+
+AV1RateControlRtcConfig::AV1RateControlRtcConfig() {
+  width = 1280;
+  height = 720;
+  max_quantizer = 63;
+  min_quantizer = 2;
+  target_bandwidth = 1000;
+  buf_initial_sz = 600;
+  buf_optimal_sz = 600;
+  buf_sz = 1000;
+  undershoot_pct = overshoot_pct = 50;
+  max_intra_bitrate_pct = 50;
+  max_inter_bitrate_pct = 0;
+  framerate = 30.0;
+  ts_number_layers = 1;
+  aq_mode = 0;
+  layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+  ts_rate_decimator[0] = 1;
+  av1_zero(max_quantizers);
+  av1_zero(min_quantizers);
+  av1_zero(scaling_factor_den);
+  av1_zero(scaling_factor_num);
+  av1_zero(layer_target_bitrate);
+  av1_zero(ts_rate_decimator);
+  scaling_factor_num[0] = 1;
+  scaling_factor_den[0] = 1;
+  max_quantizers[0] = max_quantizer;
+  min_quantizers[0] = min_quantizer;
+}
+
+std::unique_ptr<AV1RateControlRTC> AV1RateControlRTC::Create(
+    const AV1RateControlRtcConfig &cfg) {
+  std::unique_ptr<AV1RateControlRTC> rc_api(new (std::nothrow)
+                                                AV1RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<AV1_COMP *>(aom_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  av1_zero(*rc_api->cpi_);
+  rc_api->cpi_->ppi =
+      static_cast<AV1_PRIMARY *>(aom_memalign(32, sizeof(AV1_PRIMARY)));
+  if (!rc_api->cpi_->ppi) return nullptr;
+  rc_api->cpi_->common.seq_params = &rc_api->cpi_->ppi->seq_params;
+  av1_zero(*rc_api->cpi_->common.seq_params);
+  rc_api->InitRateControl(cfg);
+  if (cfg.aq_mode) {
+    AV1_COMP *const cpi = rc_api->cpi_;
+    cpi->enc_seg.map = static_cast<uint8_t *>(aom_calloc(
+        cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols,
+        sizeof(*cpi->enc_seg.map)));
+    if (!cpi->enc_seg.map) return nullptr;
+    cpi->cyclic_refresh = av1_cyclic_refresh_alloc(
+        cpi->common.mi_params.mi_rows, cpi->common.mi_params.mi_cols);
+    if (!cpi->cyclic_refresh) return nullptr;
+  }
+  return rc_api;
+}
+
+AV1RateControlRTC::~AV1RateControlRTC() {
+  if (cpi_) {
+    if (cpi_->svc.number_spatial_layers > 1 ||
+        cpi_->svc.number_temporal_layers > 1) {
+      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+          int layer =
+              LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers);
+          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+          aom_free(lc->map);
+        }
+      }
+    }
+    if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+      aom_free(cpi_->enc_seg.map);
+      cpi_->enc_seg.map = nullptr;
+      av1_cyclic_refresh_free(cpi_->cyclic_refresh);
+    }
+    aom_free(cpi_->ppi);
+    aom_free(cpi_);
+  }
+}
+
+void AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) {
+  AV1_COMMON *cm = &cpi_->common;
+  AV1EncoderConfig *oxcf = &cpi_->oxcf;
+  RATE_CONTROL *const rc = &cpi_->rc;
+  cm->seq_params->profile = PROFILE_0;
+  cm->seq_params->bit_depth = AOM_BITS_8;
+  cm->show_frame = 1;
+  oxcf->profile = cm->seq_params->profile;
+  oxcf->mode = REALTIME;
+  oxcf->rc_cfg.mode = AOM_CBR;
+  oxcf->pass = AOM_RC_ONE_PASS;
+  oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
+  oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT;
+  oxcf->rc_cfg.drop_frames_water_mark = 0;
+  oxcf->tool_cfg.bit_depth = AOM_BITS_8;
+  oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
+  cm->current_frame.frame_number = 0;
+  cpi_->ppi->p_rc.kf_boost = DEFAULT_KF_BOOST_RT;
+  for (auto &lvl_idx : oxcf->target_seq_level_idx) lvl_idx = SEQ_LEVEL_MAX;
+
+  memcpy(cpi_->ppi->level_params.target_seq_level_idx,
+         oxcf->target_seq_level_idx, sizeof(oxcf->target_seq_level_idx));
+  UpdateRateControl(rc_cfg);
+  set_sb_size(cm->seq_params,
+              av1_select_sb_size(oxcf, cm->width, cm->height,
+                                 cpi_->svc.number_spatial_layers));
+  cpi_->ppi->use_svc = cpi_->svc.number_spatial_layers > 1 ||
+                       cpi_->svc.number_temporal_layers > 1;
+  av1_primary_rc_init(oxcf, &cpi_->ppi->p_rc);
+  rc->rc_1_frame = 0;
+  rc->rc_2_frame = 0;
+  av1_rc_init_minq_luts();
+  av1_rc_init(oxcf, rc);
+  // Enable external rate control.
+  cpi_->rc.rtc_external_ratectrl = 1;
+  cpi_->sf.rt_sf.use_nonrd_pick_mode = 1;
+}
+
+void AV1RateControlRTC::UpdateRateControl(
+    const AV1RateControlRtcConfig &rc_cfg) {
+  AV1_COMMON *cm = &cpi_->common;
+  AV1EncoderConfig *oxcf = &cpi_->oxcf;
+  RATE_CONTROL *const rc = &cpi_->rc;
+
+  initial_width_ = rc_cfg.width;
+  initial_height_ = rc_cfg.height;
+  cm->width = rc_cfg.width;
+  cm->height = rc_cfg.height;
+  oxcf->frm_dim_cfg.width = rc_cfg.width;
+  oxcf->frm_dim_cfg.height = rc_cfg.height;
+  oxcf->rc_cfg.worst_allowed_q = av1_quantizer_to_qindex(rc_cfg.max_quantizer);
+  oxcf->rc_cfg.best_allowed_q = av1_quantizer_to_qindex(rc_cfg.min_quantizer);
+  rc->worst_quality = oxcf->rc_cfg.worst_allowed_q;
+  rc->best_quality = oxcf->rc_cfg.best_allowed_q;
+  oxcf->input_cfg.init_framerate = rc_cfg.framerate;
+  oxcf->rc_cfg.target_bandwidth = rc_cfg.target_bandwidth > INT64_MAX / 1000
+                                      ? INT64_MAX
+                                      : 1000 * rc_cfg.target_bandwidth;
+  oxcf->rc_cfg.starting_buffer_level_ms = rc_cfg.buf_initial_sz;
+  oxcf->rc_cfg.optimal_buffer_level_ms = rc_cfg.buf_optimal_sz;
+  oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz;
+  oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
+  cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
+  set_primary_rc_buffer_sizes(oxcf, cpi_->ppi);
+  enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8);
+  int64_t target_bandwidth_svc = 0;
+  for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+      const int layer =
+          LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers);
+      LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lc->layer_target_bitrate = 1000 * rc_cfg.layer_target_bitrate[layer];
+      lc->max_q = rc_cfg.max_quantizers[layer];
+      lc->min_q = rc_cfg.min_quantizers[layer];
+      lrc->worst_quality =
+          av1_quantizer_to_qindex(rc_cfg.max_quantizers[layer]);
+      lrc->best_quality = av1_quantizer_to_qindex(rc_cfg.min_quantizers[layer]);
+      lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl];
+      lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl];
+      lc->framerate_factor = rc_cfg.ts_rate_decimator[tl];
+      if (tl == cpi_->svc.number_temporal_layers - 1)
+        target_bandwidth_svc += lc->layer_target_bitrate;
+    }
+  }
+  av1_new_framerate(cpi_, cpi_->framerate);
+  if (cpi_->svc.number_temporal_layers > 1 ||
+      cpi_->svc.number_spatial_layers > 1) {
+    if (cm->current_frame.frame_number == 0) av1_init_layer_context(cpi_);
+    // This is needed to initialize external RC flag in layer context structure.
+    cpi_->rc.rtc_external_ratectrl = 1;
+    av1_update_layer_context_change_config(cpi_, target_bandwidth_svc);
+  }
+  check_reset_rc_flag(cpi_);
+}
+
+void AV1RateControlRTC::ComputeQP(const AV1FrameParamsRTC &frame_params) {
+  AV1_COMMON *const cm = &cpi_->common;
+  int width, height;
+  GF_GROUP *const gf_group = &cpi_->ppi->gf_group;
+  cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+  cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+  if (cpi_->svc.number_spatial_layers > 1) {
+    const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                       cpi_->svc.temporal_layer_id,
+                                       cpi_->svc.number_temporal_layers);
+    LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+    av1_get_layer_resolution(initial_width_, initial_height_,
+                             lc->scaling_factor_num, lc->scaling_factor_den,
+                             &width, &height);
+    cm->width = width;
+    cm->height = height;
+  }
+  enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8);
+  cm->current_frame.frame_type = frame_params.frame_type;
+  cpi_->refresh_frame.golden_frame =
+      (cm->current_frame.frame_type == KEY_FRAME) ? 1 : 0;
+  cpi_->sf.rt_sf.use_nonrd_pick_mode = 1;
+
+  if (frame_params.frame_type == kKeyFrame) {
+    gf_group->update_type[cpi_->gf_frame_index] = KF_UPDATE;
+    gf_group->frame_type[cpi_->gf_frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_RESET;
+    cpi_->rc.frames_since_key = 0;
+    if (cpi_->ppi->use_svc) {
+      const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                         cpi_->svc.temporal_layer_id,
+                                         cpi_->svc.number_temporal_layers);
+      if (cm->current_frame.frame_number > 0)
+        av1_svc_reset_temporal_layers(cpi_, 1);
+      cpi_->svc.layer_context[layer].is_key_frame = 1;
+    }
+  } else {
+    gf_group->update_type[cpi_->gf_frame_index] = LF_UPDATE;
+    gf_group->frame_type[cpi_->gf_frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_UPDATE;
+    if (cpi_->ppi->use_svc) {
+      const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                         cpi_->svc.temporal_layer_id,
+                                         cpi_->svc.number_temporal_layers);
+      cpi_->svc.layer_context[layer].is_key_frame = 0;
+    }
+    cpi_->rc.frames_since_key++;
+  }
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1) {
+    av1_update_temporal_layer_framerate(cpi_);
+    av1_restore_layer_context(cpi_);
+  }
+  int target = 0;
+  if (cpi_->oxcf.rc_cfg.mode == AOM_CBR) {
+    if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+      av1_cyclic_refresh_update_parameters(cpi_);
+    if (frame_is_intra_only(cm))
+      target = av1_calc_iframe_target_size_one_pass_cbr(cpi_);
+    else
+      target = av1_calc_pframe_target_size_one_pass_cbr(
+          cpi_, gf_group->update_type[cpi_->gf_frame_index]);
+  }
+  av1_rc_set_frame_target(cpi_, target, cm->width, cm->height);
+
+  int bottom_index, top_index;
+  cpi_->common.quant_params.base_qindex =
+      av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height,
+                               cpi_->gf_frame_index, &bottom_index, &top_index);
+
+  if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_setup(cpi_);
+}
+
+int AV1RateControlRTC::GetQP() const {
+  return cpi_->common.quant_params.base_qindex;
+}
+
+signed char *AV1RateControlRTC::GetCyclicRefreshMap() const {
+  return cpi_->cyclic_refresh->map;
+}
+
+int *AV1RateControlRTC::GetDeltaQ() const {
+  return cpi_->cyclic_refresh->qindex_delta;
+}
+
+void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  av1_rc_postencode_update(cpi_, encoded_frame_size);
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1)
+    av1_save_layer_context(cpi_);
+  cpi_->common.current_frame.frame_number++;
+}
+
+}  // namespace aom
diff --git a/media/libaom/src/av1/ratectrl_rtc.h b/media/libaom/src/av1/ratectrl_rtc.h
new file mode 100644
index 0000000000..9843803a2f
--- /dev/null
+++ b/media/libaom/src/av1/ratectrl_rtc.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_RATECTRL_RTC_H_
+#define AOM_AV1_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+struct AV1_COMP;
+
+namespace aom {
+
+// These constants come from AV1 spec.
+static constexpr size_t kAV1MaxLayers = 32;
+static constexpr size_t kAV1MaxTemporalLayers = 8;
+static constexpr size_t kAV1MaxSpatialLayers = 4;
+
+enum FrameType { kKeyFrame, kInterFrame };
+
+struct AV1RateControlRtcConfig {
+ public:
+  AV1RateControlRtcConfig();
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  int layer_target_bitrate[kAV1MaxLayers];
+  int ts_rate_decimator[kAV1MaxTemporalLayers];
+  int aq_mode;
+  // Number of spatial layers
+  int ss_number_layers;
+  // Number of temporal layers
+  int ts_number_layers;
+  int max_quantizers[kAV1MaxLayers];
+  int min_quantizers[kAV1MaxLayers];
+  int scaling_factor_num[kAV1MaxSpatialLayers];
+  int scaling_factor_den[kAV1MaxSpatialLayers];
+};
+
+struct AV1FrameParamsRTC {
+  FrameType frame_type;
+  int spatial_layer_id;
+  int temporal_layer_id;
+};
+
+class AV1RateControlRTC {
+ public:
+  static std::unique_ptr<AV1RateControlRTC> Create(
+      const AV1RateControlRtcConfig &cfg);
+  ~AV1RateControlRTC();
+
+  void UpdateRateControl(const AV1RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  signed char *GetCyclicRefreshMap() const;
+  int *GetDeltaQ() const;
+  void ComputeQP(const AV1FrameParamsRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  AV1RateControlRTC() = default;
+  void InitRateControl(const AV1RateControlRtcConfig &cfg);
+  AV1_COMP *cpi_;
+  int initial_width_;
+  int initial_height_;
+};
+
+}  // namespace aom
+
+#endif  // AOM_AV1_RATECTRL_RTC_H_
diff --git a/media/libaom/src/av1/reference_manager.cc b/media/libaom/src/av1/reference_manager.cc
new file mode 100644
index 0000000000..74f0747a94
--- /dev/null
+++ b/media/libaom/src/av1/reference_manager.cc
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <algorithm>
+#include <set>
+#include <utility>
+#include <tuple>
+#include <vector>
+
+#include "av1/reference_manager.h"
+#include "av1/ratectrl_qmode.h"
+
+namespace aom {
+
+void RefFrameManager::Reset() {
+  free_ref_idx_list_.clear();
+  for (int i = 0; i < kRefFrameTableSize; ++i) {
+    free_ref_idx_list_.push_back(i);
+    ref_frame_table_[i] = GopFrameInvalid();
+  }
+  forward_stack_.clear();
+  backward_queue_.clear();
+  last_queue_.clear();
+}
+
+int RefFrameManager::AllocateRefIdx() {
+  if (free_ref_idx_list_.empty()) {
+    size_t backward_size = backward_queue_.size();
+    size_t last_size = last_queue_.size();
+    if (last_size >= backward_size) {
+      int ref_idx = last_queue_.front();
+      last_queue_.pop_front();
+      free_ref_idx_list_.push_back(ref_idx);
+    } else {
+      int ref_idx = backward_queue_.front();
+      backward_queue_.pop_front();
+      free_ref_idx_list_.push_back(ref_idx);
+    }
+  }
+
+  int ref_idx = free_ref_idx_list_.front();
+  free_ref_idx_list_.pop_front();
+  return ref_idx;
+}
+
+int RefFrameManager::GetRefFrameCountByType(
+    RefUpdateType ref_update_type) const {
+  size_t cnt = 0;
+  switch (ref_update_type) {
+    case RefUpdateType::kForward: cnt = forward_stack_.size(); break;
+    case RefUpdateType::kBackward: cnt = backward_queue_.size(); break;
+    case RefUpdateType::kLast: cnt = last_queue_.size(); break;
+    case RefUpdateType::kNone: cnt = 0; break;
+  }
+  return static_cast<int>(cnt);
+}
+
+int RefFrameManager::GetRefFrameCount() const {
+  return GetRefFrameCountByType(RefUpdateType::kForward) +
+         GetRefFrameCountByType(RefUpdateType::kBackward) +
+         GetRefFrameCountByType(RefUpdateType::kLast);
+}
+
+// TODO(angiebird): Add unit test.
+// Find the ref_idx corresponding to a ref_update_type.
+// Return -1 if no ref frame is found.
+// The priority_idx indicate closeness between the current frame and
+// the ref frame in display order.
+// For example, ref_update_type == kForward and priority_idx == 0 means
+// find the closest ref frame in forward_stack_.
+int RefFrameManager::GetRefFrameIdxByPriority(RefUpdateType ref_update_type,
+                                              int priority_idx) const {
+  if (ref_update_type == RefUpdateType::kForward) {
+    int size = static_cast<int>(forward_stack_.size());
+    if (priority_idx < size) {
+      return forward_stack_[size - priority_idx - 1];
+    }
+  } else if (ref_update_type == RefUpdateType::kBackward) {
+    int size = static_cast<int>(backward_queue_.size());
+    if (priority_idx < size) {
+      return backward_queue_[size - priority_idx - 1];
+    }
+  } else if (ref_update_type == RefUpdateType::kLast) {
+    int size = static_cast<int>(last_queue_.size());
+    if (priority_idx < size) {
+      return last_queue_[size - priority_idx - 1];
+    }
+  }
+  return -1;
+}
+
+// The priority_idx indicate closeness between the current frame and
+// the ref frame in display order.
+// For example, ref_update_type == kForward and priority_idx == 0 means
+// find the closest ref frame in forward_stack_.
+GopFrame RefFrameManager::GetRefFrameByPriority(RefUpdateType ref_update_type,
+                                                int priority_idx) const {
+  int ref_idx = GetRefFrameIdxByPriority(ref_update_type, priority_idx);
+  if (ref_idx == -1) {
+    return GopFrameInvalid();
+  }
+  assert(ref_frame_table_[ref_idx].update_ref_idx == ref_idx);
+  return ref_frame_table_[ref_idx];
+}
+
+GopFrame RefFrameManager::GetRefFrameByIndex(int ref_idx) const {
+  return ref_frame_table_[ref_idx];
+}
+
+ReferenceName get_ref_name(RefUpdateType ref_update_type, int priority_idx,
+                           const std::set<ReferenceName> &used_name_set) {
+  // TODO(angiebird): Find the better way to assign name lists.
+  // Maybe sort the names based on how frequent each name is being used in the
+  // past?
+  const std::vector<ReferenceName> forward_name_list{
+    ReferenceName::kBwdrefFrame, ReferenceName::kAltref2Frame,
+    ReferenceName::kAltrefFrame, ReferenceName::kGoldenFrame,
+    ReferenceName::kLastFrame,   ReferenceName::kLast2Frame,
+    ReferenceName::kLast3Frame
+  };
+  const std::vector<ReferenceName> backward_name_list{
+    ReferenceName::kGoldenFrame, ReferenceName::kLastFrame,
+    ReferenceName::kLast2Frame,  ReferenceName::kLast3Frame,
+    ReferenceName::kBwdrefFrame, ReferenceName::kAltref2Frame,
+    ReferenceName::kAltrefFrame
+  };
+  const std::vector<ReferenceName> last_name_list{
+    ReferenceName::kLastFrame,   ReferenceName::kLast2Frame,
+    ReferenceName::kLast3Frame,  ReferenceName::kGoldenFrame,
+    ReferenceName::kBwdrefFrame, ReferenceName::kAltref2Frame,
+    ReferenceName::kAltrefFrame
+  };
+
+  const std::vector<ReferenceName> *name_list = nullptr;
+  switch (ref_update_type) {
+    case RefUpdateType::kForward: name_list = &forward_name_list; break;
+    case RefUpdateType::kBackward: name_list = &backward_name_list; break;
+    case RefUpdateType::kLast: name_list = &last_name_list; break;
+    case RefUpdateType::kNone: break;
+  }
+
+  if (name_list) {
+    const int name_list_size = static_cast<int>(name_list->size());
+    for (int idx = priority_idx; idx < name_list_size; ++idx) {
+      ReferenceName ref_name = name_list->at(idx);
+      bool not_used = used_name_set.find(ref_name) == used_name_set.end();
+      if (not_used) return ref_name;
+    }
+  }
+  return ReferenceName::kNoneFrame;
+}
+
+// Generate a list of available reference frames in priority order for the
+// current to-be-coded frame. The list size should be less or equal to
+// kRefFrameTableSize. The reference frames with smaller indices are more likely
+// to be a good reference frame. Therefore, they should be prioritized when the
+// reference frame count is limited. For example, if we plan to use 3 reference
+// frames, we should choose ref_frame_list[0], ref_frame_list[1] and
+// ref_frame_list[2].
+std::vector<ReferenceFrame> RefFrameManager::GetRefFrameListByPriority() const {
+  constexpr int round_robin_size = 3;
+  const std::vector<RefUpdateType> round_robin_list{ RefUpdateType::kForward,
+                                                     RefUpdateType::kBackward,
+                                                     RefUpdateType::kLast };
+  std::vector<int> priority_idx_list(round_robin_size, 0);
+  int available_ref_frames = GetRefFrameCount();
+  std::vector<ReferenceFrame> ref_frame_list;
+  int ref_frame_count = 0;
+  int round_robin_idx = 0;
+  std::set<ReferenceName> used_name_set;
+  while (ref_frame_count < available_ref_frames) {
+    const RefUpdateType ref_update_type = round_robin_list[round_robin_idx];
+    int priority_idx = priority_idx_list[round_robin_idx];
+    int ref_idx = GetRefFrameIdxByPriority(ref_update_type, priority_idx);
+    if (ref_idx != -1) {
+      const ReferenceName name =
+          get_ref_name(ref_update_type, priority_idx, used_name_set);
+      assert(name != ReferenceName::kNoneFrame);
+      used_name_set.insert(name);
+      ReferenceFrame ref_frame = { ref_idx, name };
+      ref_frame_list.push_back(ref_frame);
+      ++ref_frame_count;
+      ++priority_idx_list[round_robin_idx];
+    }
+    round_robin_idx = (round_robin_idx + 1) % round_robin_size;
+  }
+  return ref_frame_list;
+}
+
+void RefFrameManager::UpdateOrder(int global_order_idx) {
+  cur_global_order_idx_ = global_order_idx;
+  if (forward_stack_.empty()) {
+    return;
+  }
+  int ref_idx = forward_stack_.back();
+  const GopFrame &gf_frame = ref_frame_table_[ref_idx];
+  if (gf_frame.global_order_idx <= global_order_idx) {
+    forward_stack_.pop_back();
+    if (gf_frame.is_golden_frame) {
+      // high quality frame
+      backward_queue_.push_back(ref_idx);
+    } else {
+      last_queue_.push_back(ref_idx);
+    }
+  }
+}
+
+int RefFrameManager::ColocatedRefIdx(int global_order_idx) {
+  if (forward_stack_.size() == 0) return -1;
+  int ref_idx = forward_stack_.back();
+  int arf_global_order_idx = ref_frame_table_[ref_idx].global_order_idx;
+  if (arf_global_order_idx == global_order_idx) {
+    return ref_idx;
+  }
+  return -1;
+}
+
+static RefUpdateType infer_ref_update_type(const GopFrame &gop_frame,
+                                           int cur_global_order_idx) {
+  if (gop_frame.global_order_idx > cur_global_order_idx) {
+    return RefUpdateType::kForward;
+  }
+  if (gop_frame.is_golden_frame) {
+    return RefUpdateType::kBackward;
+  }
+  if (gop_frame.encode_ref_mode == EncodeRefMode::kShowExisting ||
+      gop_frame.encode_ref_mode == EncodeRefMode::kOverlay) {
+    return RefUpdateType::kNone;
+  }
+  return RefUpdateType::kLast;
+}
+
+using PrimaryRefKey = std::tuple<int,   // abs layer_depth delta
+                                 bool,  // is_key_frame differs
+                                 bool,  // is_golden_frame differs
+                                 bool,  // is_arf_frame differs
+                                 bool,  // is_show_frame differs
+                                 bool,  // encode_ref_mode differs
+                                 int>;  // abs order_idx delta
+
+// Generate PrimaryRefKey based on abs layer_depth delta,
+// frame flags and abs order_idx delta. These are the fields that will
+// be used to pick the primary reference frame for probability model
+static PrimaryRefKey get_primary_ref_key(const GopFrame &cur_frame,
+                                         const GopFrame &ref_frame) {
+  return std::make_tuple(abs(cur_frame.layer_depth - ref_frame.layer_depth),
+                         cur_frame.is_key_frame != ref_frame.is_key_frame,
+                         cur_frame.is_golden_frame != ref_frame.is_golden_frame,
+                         cur_frame.is_arf_frame != ref_frame.is_arf_frame,
+                         cur_frame.is_show_frame != ref_frame.is_show_frame,
+                         cur_frame.encode_ref_mode != ref_frame.encode_ref_mode,
+                         abs(cur_frame.order_idx - ref_frame.order_idx));
+}
+
+// Pick primary_ref_idx for probability model.
+ReferenceFrame RefFrameManager::GetPrimaryRefFrame(
+    const GopFrame &gop_frame) const {
+  assert(gop_frame.is_valid);
+  std::vector<std::pair<PrimaryRefKey, int>> candidate_list;
+  for (int ref_idx = 0; ref_idx < static_cast<int>(ref_frame_table_.size());
+       ++ref_idx) {
+    const GopFrame &ref_frame = ref_frame_table_[ref_idx];
+    if (ref_frame.is_valid) {
+      assert(ref_idx == ref_frame.update_ref_idx);
+      PrimaryRefKey key = get_primary_ref_key(gop_frame, ref_frame);
+      std::pair<PrimaryRefKey, int> candidate = { key, ref_idx };
+      candidate_list.push_back(candidate);
+    }
+  }
+
+  std::sort(candidate_list.begin(), candidate_list.end());
+
+  ReferenceFrame ref_frame = { -1, ReferenceName::kNoneFrame };
+  std::vector<ReferenceFrame> ref_frame_list = GetRefFrameListByPriority();
+  assert(candidate_list.size() == ref_frame_list.size());
+  if (!candidate_list.empty()) {
+    int ref_idx = candidate_list[0].second;
+    for (const auto &frame : ref_frame_list) {
+      if (frame.index == ref_idx) {
+        ref_frame = frame;
+      }
+    }
+  }
+  return ref_frame;
+}
+
+void RefFrameManager::UpdateRefFrameTable(GopFrame *gop_frame) {
+  gop_frame->ref_frame_list = GetRefFrameListByPriority();
+  gop_frame->primary_ref_frame = GetPrimaryRefFrame(*gop_frame);
+  gop_frame->colocated_ref_idx = ColocatedRefIdx(gop_frame->global_order_idx);
+
+  if (gop_frame->is_show_frame) {
+    UpdateOrder(gop_frame->global_order_idx);
+  }
+  // Call infer_ref_update_type() after UpdateOrder() so that
+  // cur_global_order_idx_ is up-to-date
+  RefUpdateType ref_update_type =
+      infer_ref_update_type(*gop_frame, cur_global_order_idx_);
+  if (ref_update_type == RefUpdateType::kNone) {
+    gop_frame->update_ref_idx = -1;
+  } else {
+    const int ref_idx = AllocateRefIdx();
+    gop_frame->update_ref_idx = ref_idx;
+    switch (ref_update_type) {
+      case RefUpdateType::kForward: forward_stack_.push_back(ref_idx); break;
+      case RefUpdateType::kBackward: backward_queue_.push_back(ref_idx); break;
+      case RefUpdateType::kLast: last_queue_.push_back(ref_idx); break;
+      case RefUpdateType::kNone: break;
+    }
+    ref_frame_table_[ref_idx] = *gop_frame;
+  }
+}
+
+}  // namespace aom
diff --git a/media/libaom/src/av1/reference_manager.h b/media/libaom/src/av1/reference_manager.h
new file mode 100644
index 0000000000..50a7d67080
--- /dev/null
+++ b/media/libaom/src/av1/reference_manager.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_REFERENCE_MANAGER_H_
+#define AOM_AV1_REFERENCE_MANAGER_H_
+
+#include <deque>
+#include <iostream>
+#include <vector>
+
+#include "av1/ratectrl_qmode_interface.h"
+
+namespace aom {
+
+enum class RefUpdateType { kForward, kBackward, kLast, kNone };
+
+class RefFrameManager {
+ public:
+  explicit RefFrameManager(int max_ref_frames)
+      : max_ref_frames_(max_ref_frames) {
+    // forward_max_size_ define max number of arf frames that can exists at
+    // the same time. In the other words, it's the max size of forward_stack_.
+    // TODO(angiebird): Figure out if this number is optimal.
+    forward_max_size_ = kRefFrameTableSize - 2;
+    cur_global_order_idx_ = 0;
+    Reset();
+  }
+  ~RefFrameManager() = default;
+
+  RefFrameManager(const RefFrameManager &) = delete;
+  RefFrameManager &operator=(const RefFrameManager &) = delete;
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const RefFrameManager &rfm) {
+    os << "=" << std::endl;
+    os << "forward: ";
+    for (const auto &ref_idx : rfm.forward_stack_) {
+      os << rfm.ref_frame_table_[ref_idx].order_idx << " ";
+    }
+    os << std::endl;
+    os << "backward: ";
+    for (const auto &ref_idx : rfm.backward_queue_) {
+      os << rfm.ref_frame_table_[ref_idx].order_idx << " ";
+    }
+    os << std::endl;
+    os << "last: ";
+    for (const auto &ref_idx : rfm.last_queue_) {
+      os << rfm.ref_frame_table_[ref_idx].order_idx << " ";
+    }
+    os << std::endl;
+    return os;
+  }
+
+  void Reset();
+  int AllocateRefIdx();
+  int GetRefFrameCountByType(RefUpdateType ref_update_type) const;
+  int GetRefFrameCount() const;
+  std::vector<ReferenceFrame> GetRefFrameListByPriority() const;
+  int GetRefFrameIdxByPriority(RefUpdateType ref_update_type,
+                               int priority_idx) const;
+  GopFrame GetRefFrameByPriority(RefUpdateType ref_update_type,
+                                 int priority_idx) const;
+  GopFrame GetRefFrameByIndex(int ref_idx) const;
+  void UpdateOrder(int global_order_idx);
+  int ColocatedRefIdx(int global_order_idx);
+  int ForwardMaxSize() const { return forward_max_size_; }
+  int MaxRefFrames() const { return max_ref_frames_; }
+  int CurGlobalOrderIdx() const { return cur_global_order_idx_; }
+  void UpdateRefFrameTable(GopFrame *gop_frame);
+  ReferenceFrame GetPrimaryRefFrame(const GopFrame &gop_frame) const;
+
+ private:
+  // TOOD(angiebird): Do we still need max_ref_frames_?
+  int max_ref_frames_;
+  int forward_max_size_;
+  int cur_global_order_idx_;
+  RefFrameTable ref_frame_table_;
+  std::deque<int> free_ref_idx_list_;
+  std::vector<int> forward_stack_;
+  std::deque<int> backward_queue_;
+  std::deque<int> last_queue_;
+};
+
+}  // namespace aom
+
+#endif  // AOM_AV1_REFERENCE_MANAGER_H_
diff --git a/media/libaom/src/build/.gitignore b/media/libaom/src/build/.gitignore
new file mode 100644
index 0000000000..1350fcb5eb
--- /dev/null
+++ b/media/libaom/src/build/.gitignore
@@ -0,0 +1 @@
+x86*-win32-vs*
diff --git a/media/libaom/src/build/cmake/aom_config_defaults.cmake b/media/libaom/src/build/cmake/aom_config_defaults.cmake
index f9e70eb248..367001dc74 100644
--- a/media/libaom/src/build/cmake/aom_config_defaults.cmake
+++ b/media/libaom/src/build/cmake/aom_config_defaults.cmake
@@ -42,15 +42,15 @@ set_aom_detect_var(HAVE_MSA 0 "Enables MSA optimizations.")
 set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
 
 # x86/x86_64 feature flags.
-set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
-set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. ")
 set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.")
 set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.")
 set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.")
+set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.")
 set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.")
 set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
-set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.")
+set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
+set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
@@ -71,21 +71,34 @@ set_aom_config_var(AOM_RTCD_FLAGS ""
 set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.")
 set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.")
 set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.")
+set_aom_config_var(CONFIG_FPMT_TEST 0 "Enable FPMT testing.")
+set_aom_config_var(CONFIG_FRAME_PARALLEL_ENCODE 0
+                   "Enable frame parallelism during encode.")
+set_aom_config_var(
+  CONFIG_FRAME_PARALLEL_ENCODE_2 0
+  "Enable frame parallelism during encode for frames in lower layer depths.")
 set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).")
 set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.")
 set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.")
 set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.")
 
+set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1
+                   "Build with high bitdepth support.")
+set_aom_config_var(CONFIG_AV1_TEMPORAL_DENOISING 0
+                   "Build with temporal denoising support.")
 set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.")
 set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.")
 set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.")
+set_aom_config_var(CONFIG_REALTIME_ONLY 0
+                   "Build for RTC-only. See aomcx.h for all disabled features.")
 set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.")
 set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.")
 set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.")
 
 # Debugging flags.
-set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "Bitstream debugging flag.")
-set_aom_config_var(CONFIG_DEBUG 0 "Debug build flag.")
+set_aom_config_var(CONFIG_DEBUG 0 "Enable debug-only code.")
+set_aom_config_var(CONFIG_EXCLUDE_SIMD_MISMATCH 0
+                   "Exclude mismatch in SIMD functions for testing/debugging.")
 set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.")
 
 # AV1 feature flags.
@@ -102,39 +115,58 @@ set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0
 mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING)
 set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2
                    "Max profile to support decoding.")
-set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 "Only enables normal tile mode.")
+set_aom_config_var(
+  CONFIG_NORMAL_TILE_MODE 0
+  "Only enables general decoding (disables large scale tile decoding).")
 set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.")
 set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.")
+set_aom_config_var(CONFIG_TUNE_BUTTERAUGLI 0
+                   "Enable encoding tuning for Butteraugli.")
+set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.")
 set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.")
 set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.")
-set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.")
+set_aom_config_var(STATIC_LINK_JXL 0 "Statically link the JPEG-XL library.")
 
 # AV1 experiment flags.
-set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
-                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
-set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0
-                   "Collect stats on partition decisions.")
-set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0
-                   "Collect encoding component timing information.")
-set_aom_config_var(CONFIG_LPF_MASK 0
-                   "Enable the use loop filter bitmasks for optimizations.")
-set_aom_config_var(CONFIG_HTB_TRELLIS 0
-                   "Enable the use of hash table for trellis optimizations.")
-set_aom_config_var(CONFIG_REALTIME_ONLY 0
-                   "Build for RTC-only to reduce binary size.")
-set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1
-                   "Build with high bitdepth support.")
-set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.")
-set_aom_config_var(CONFIG_SUPERRES_IN_RECODE 1
-                   "Enable encoding both full-res and superres in recode loop"
-                   "when SUPERRES_AUTO mode is used.")
+set_aom_config_var(CONFIG_BITRATE_ACCURACY 0
+                   "AV1 experiment: Improve bitrate accuracy.")
+set_aom_config_var(
+  CONFIG_BITRATE_ACCURACY_BL 0
+  "AV1 experiment: Baseline of improve bitrate accuracy experiment.")
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0
+                   "AV1 experiment: Bitstream debugging.")
+set_aom_config_var(
+  CONFIG_COLLECT_COMPONENT_TIMING 0
+  "AV1 experiment: Collect encoding component timing information.")
+set_aom_config_var(
+  CONFIG_COLLECT_PARTITION_STATS 0
+  "AV1 experiment: Collect partition timing stats. Can be 1 or 2.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment.")
+set_aom_config_var(
+  CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
+  "AV1 experiment: Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_NN_V2 0
+                   "AV1 experiment: Fully-connected neural nets ver.2.")
+set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0
+                   "AV1 experiment: for optical flow API.")
+set_aom_config_var(CONFIG_PARTITION_SEARCH_ORDER 0
+                   "AV1 experiment: Use alternative partition search order.")
+set_aom_config_var(CONFIG_RATECTRL_LOG 0
+                   "AV1 experiment: Log rate control decision.")
+set_aom_config_var(CONFIG_RD_COMMAND 0
+                   "AV1 experiment: Use external rdmult and q_index.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment.")
+set_aom_config_var(
+  CONFIG_RT_ML_PARTITIONING 0
+  "AV1 experiment: Build with ML-based partitioning for Real Time.")
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_TFLITE 0
+                   "AV1 experiment: Enable tensorflow lite library.")
+set_aom_config_var(CONFIG_THREE_PASS 0
+                   "AV1 experiment: Enable three-pass encoding.")
+
 #
 # Variables in this section control optional features of the build system.
 #
diff --git a/media/libaom/src/build/cmake/aom_configure.cmake b/media/libaom/src/build/cmake/aom_configure.cmake
index 224a46587c..1279ad3d03 100644
--- a/media/libaom/src/build/cmake/aom_configure.cmake
+++ b/media/libaom/src/build/cmake/aom_configure.cmake
@@ -44,7 +44,7 @@ endif()
 list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
 foreach(cache_var ${aom_build_vars})
   get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
-  if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+  if(cache_var_helpstring STREQUAL cmake_cmdline_helpstring)
     set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
   endif()
 endforeach()
@@ -53,11 +53,10 @@ string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
 # Detect target CPU.
 if(NOT AOM_TARGET_CPU)
   string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
-  if("${cpu_lowercase}" STREQUAL "amd64"
-     OR "${cpu_lowercase}" STREQUAL "x86_64")
-    if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
+  if(cpu_lowercase STREQUAL "amd64" OR cpu_lowercase STREQUAL "x86_64")
+    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
       set(AOM_TARGET_CPU "x86")
-    elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+    elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
       set(AOM_TARGET_CPU "x86_64")
     else()
       message(
@@ -66,15 +65,13 @@ if(NOT AOM_TARGET_CPU)
                     "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
                     "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
     endif()
-  elseif("${cpu_lowercase}" STREQUAL "i386"
-         OR "${cpu_lowercase}" STREQUAL "x86")
+  elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86")
     set(AOM_TARGET_CPU "x86")
-  elseif("${cpu_lowercase}" MATCHES "^arm"
-         OR "${cpu_lowercase}" MATCHES "^mips")
+  elseif(cpu_lowercase MATCHES "^arm" OR cpu_lowercase MATCHES "^mips")
     set(AOM_TARGET_CPU "${cpu_lowercase}")
-  elseif("${cpu_lowercase}" MATCHES "aarch64")
+  elseif(cpu_lowercase MATCHES "aarch64")
     set(AOM_TARGET_CPU "arm64")
-  elseif("${cpu_lowercase}" MATCHES "^ppc")
+  elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
@@ -105,13 +102,37 @@ string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
 message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
 set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME})
 
-if("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
+string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+if(build_type_lowercase STREQUAL "debug")
   set(CONFIG_DEBUG 1)
 endif()
 
 if(BUILD_SHARED_LIBS)
   set(CONFIG_PIC 1)
   set(CONFIG_SHARED 1)
+elseif(NOT CONFIG_PIC)
+  # Update the variable only when it does not carry the CMake assigned help
+  # string for variables specified via the command line. This allows the user to
+  # force CONFIG_PIC=0.
+  unset(cache_helpstring)
+  get_property(cache_helpstring CACHE CONFIG_PIC PROPERTY HELPSTRING)
+  if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+    aom_check_c_compiles("pie_check" "
+                          #if !(__pie__ || __PIE__)
+                          #error Neither __pie__ or __PIE__ are set
+                          #endif
+                          extern void unused(void);
+                          void unused(void) {}" HAVE_PIE)
+
+    if(HAVE_PIE)
+      # If -fpie or -fPIE are used ensure the assembly code has PIC enabled to
+      # avoid DT_TEXTRELs: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE
+      set(CONFIG_PIC 1)
+      message(
+        "CONFIG_PIC enabled for position independent executable (PIE) build")
+    endif()
+  endif()
+  unset(cache_helpstring)
 endif()
 
 if(NOT MSVC)
@@ -120,8 +141,8 @@ if(NOT MSVC)
     # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
     # work.
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-    if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux"
-       AND "${AOM_TARGET_CPU}" MATCHES "^armv[78]")
+    if(AOM_TARGET_SYSTEM STREQUAL "Linux"
+       AND AOM_TARGET_CPU MATCHES "^armv[78]")
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
     else()
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
@@ -129,7 +150,7 @@ if(NOT MSVC)
   endif()
 endif()
 
-if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64")
   find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH})
   if(NOT AS_EXECUTABLE OR ENABLE_NASM)
     unset(AS_EXECUTABLE CACHE)
@@ -149,11 +170,11 @@ if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
   get_asm_obj_format("objformat")
   set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
   string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
-elseif("${AOM_TARGET_CPU}" MATCHES "arm")
-  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+elseif(AOM_TARGET_CPU MATCHES "arm")
+  if(AOM_TARGET_SYSTEM STREQUAL "Darwin")
     set(AS_EXECUTABLE as)
     set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
-  elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+  elseif(AOM_TARGET_SYSTEM STREQUAL "Windows")
     if(NOT AS_EXECUTABLE)
       set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
     endif()
@@ -197,14 +218,17 @@ if(CONFIG_GPROF)
   require_compiler_flag("-pg" YES)
 endif()
 
-if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows\|Android")
+if(AOM_TARGET_SYSTEM MATCHES "Darwin\|Linux\|Windows\|Android")
   set(CONFIG_OS_SUPPORT 1)
 endif()
 
-# The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set
-# it to 0x0601 (Windows 7).
-if("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+if(AOM_TARGET_SYSTEM STREQUAL "Windows")
+  # The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set
+  # it to 0x0601 (Windows 7).
   add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601")
+  # Prevent windows.h from defining the min and max macros. This allows us to
+  # use std::min and std::max.
+  add_compiler_flag_if_supported("-DNOMINMAX")
 endif()
 
 #
@@ -276,7 +300,10 @@ else()
   add_compiler_flag_if_supported("-Wall")
   add_compiler_flag_if_supported("-Wdisabled-optimization")
   add_compiler_flag_if_supported("-Wextra")
+  add_compiler_flag_if_supported("-Wextra-semi")
+  add_compiler_flag_if_supported("-Wextra-semi-stmt")
   add_compiler_flag_if_supported("-Wfloat-conversion")
+  add_compiler_flag_if_supported("-Wformat=2")
   add_c_flag_if_supported("-Wimplicit-function-declaration")
   add_compiler_flag_if_supported("-Wlogical-op")
   add_compiler_flag_if_supported("-Wpointer-arith")
@@ -288,21 +315,25 @@ else()
   add_compiler_flag_if_supported("-Wunused")
   add_compiler_flag_if_supported("-Wvla")
 
-  if(CMAKE_C_COMPILER_ID MATCHES "GNU"
-     AND "${SANITIZE}" MATCHES "address|undefined")
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address|undefined")
 
     # This combination has more stack overhead, so we account for it by
     # providing higher stack limit than usual.
     add_c_flag_if_supported("-Wstack-usage=170000")
     add_cxx_flag_if_supported("-Wstack-usage=270000")
   elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
-    add_c_flag_if_supported("-Wstack-usage=117000")
+    add_c_flag_if_supported("-Wstack-usage=135000")
     add_cxx_flag_if_supported("-Wstack-usage=240000")
   else()
     add_c_flag_if_supported("-Wstack-usage=100000")
     add_cxx_flag_if_supported("-Wstack-usage=240000")
   endif()
 
+  if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address")
+    # Disable no optimization warning when compiling with sanitizers
+    add_compiler_flag_if_supported("-Wno-disabled-optimization")
+  endif()
+
   # Add -Wshadow only for C files to avoid massive gtest warning spam.
   add_c_flag_if_supported("-Wshadow")
 
@@ -311,7 +342,7 @@ else()
 
   # Quiet gcc 6 vs 7 abi warnings:
   # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
-  if("${AOM_TARGET_CPU}" MATCHES "arm")
+  if(AOM_TARGET_CPU MATCHES "arm")
     add_cxx_flag_if_supported("-Wno-psabi")
   endif()
 
@@ -319,13 +350,33 @@ else()
     add_compiler_flag_if_supported("-Werror")
   endif()
 
-  if("${CMAKE_BUILD_TYPE}" MATCHES "Rel")
+  if(build_type_lowercase MATCHES "rel")
     add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0")
   endif()
   add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE")
   add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64")
 endif()
 
+# Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set,
+# android.toolchain.cmake would set normal (non-cache) versions of variables
+# like CMAKE_C_FLAGS_RELEASE which would mask the ones added to the cache
+# variable in add_compiler_flag_if_supported(), etc. As a workaround we add
+# everything accumulated in AOM_C/CXX_FLAGS to the normal versions. This could
+# also be addressed by reworking the flag tests and adding the results directly
+# to target_compile_options() as in e.g., libgav1, but that's a larger task.
+# https://github.com/android/ndk/wiki/Changelog-r23#changes
+if(ANDROID
+   AND ("${ANDROID_NDK_MAJOR}" LESS 23 OR ANDROID_USE_LEGACY_TOOLCHAIN_FILE))
+  foreach(lang C;CXX)
+    string(STRIP "${AOM_${lang}_FLAGS}" AOM_${lang}_FLAGS)
+    if(AOM_${lang}_FLAGS)
+      foreach(config ${AOM_${lang}_CONFIGS})
+        set(${config} "${${config}} ${AOM_${lang}_FLAGS}")
+      endforeach()
+    endif()
+  endforeach()
+endif()
+
 set(AOM_LIB_LINK_TYPE PUBLIC)
 if(EMSCRIPTEN)
 
diff --git a/media/libaom/src/build/cmake/aom_experiment_deps.cmake b/media/libaom/src/build/cmake/aom_experiment_deps.cmake
index 2e3615791e..3bbeb0c874 100644
--- a/media/libaom/src/build/cmake/aom_experiment_deps.cmake
+++ b/media/libaom/src/build/cmake/aom_experiment_deps.cmake
@@ -21,8 +21,4 @@ macro(fix_experiment_configs)
     change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
   endif()
 
-  if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD)
-    change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD)
-  endif()
-
 endmacro()
diff --git a/media/libaom/src/build/cmake/aom_install.cmake b/media/libaom/src/build/cmake/aom_install.cmake
index cd40fe4245..3b52a6872d 100644
--- a/media/libaom/src/build/cmake/aom_install.cmake
+++ b/media/libaom/src/build/cmake/aom_install.cmake
@@ -10,8 +10,7 @@
 #
 list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h"
             "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h"
-            "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h"
-            "${AOM_ROOT}/aom/aom.h")
+            "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h")
 
 if(CONFIG_AV1_DECODER)
   list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h"
@@ -20,14 +19,15 @@ endif()
 
 if(CONFIG_AV1_ENCODER)
   list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h"
-              "${AOM_ROOT}/aom/aom_encoder.h")
+              "${AOM_ROOT}/aom/aom_encoder.h"
+              "${AOM_ROOT}/aom/aom_external_partition.h")
 endif()
 
 # Generate aom.pc and setup dependencies to ensure it is created when necessary.
 # Note: aom.pc generation uses GNUInstallDirs:
 # https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html
 macro(setup_aom_install_targets)
-  if(NOT (MSVC OR XCODE))
+  if(NOT XCODE)
     include("GNUInstallDirs")
     set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc")
 
@@ -39,14 +39,19 @@ macro(setup_aom_install_targets)
     add_custom_command(
       OUTPUT "${AOM_PKG_CONFIG_FILE}"
       COMMAND ${CMAKE_COMMAND} ARGS
-              -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT}
+              -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+              -DAOM_ROOT=${AOM_ROOT}
               -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
               -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
               -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
               -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
               -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
               -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
-              -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+              -DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF}
+              -DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI}
+              -DCONFIG_TFLITE=${CONFIG_TFLITE}
+              -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
+              -P
               "${AOM_ROOT}/build/cmake/pkg_config.cmake"
       COMMENT "Writing aom.pc"
       VERBATIM)
@@ -78,19 +83,15 @@ macro(setup_aom_install_targets)
       set(AOM_INSTALL_LIBS aom)
     endif()
 
-    # Setup the install rules.
-    install(
-      FILES ${AOM_INSTALL_INCS}
-      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/aom")
-    install(
-      FILES "${AOM_PKG_CONFIG_FILE}"
-      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
-    install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION
-                    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
-
-    if(ENABLE_EXAMPLES)
-      install(TARGETS ${AOM_INSTALL_BINS} DESTINATION
-                      "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
-    endif()
+    # Setup the install rules. install() will automatically prepend
+    # CMAKE_INSTALL_PREFIX to relative paths
+    install(FILES ${AOM_INSTALL_INCS}
+            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/aom")
+    install(FILES "${AOM_PKG_CONFIG_FILE}"
+            DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+    install(TARGETS ${AOM_INSTALL_LIBS};${AOM_INSTALL_BINS}
+            RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+            LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+            ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")
   endif()
 endmacro()
diff --git a/media/libaom/src/build/cmake/aom_optimization.cmake b/media/libaom/src/build/cmake/aom_optimization.cmake
index d8b258f1e6..1dd6c3b94e 100644
--- a/media/libaom/src/build/cmake/aom_optimization.cmake
+++ b/media/libaom/src/build/cmake/aom_optimization.cmake
@@ -44,9 +44,10 @@ function(add_intrinsics_object_library flag opt_name target_to_update sources)
   endif()
   set(target_name ${target_to_update}_${opt_name}_intrinsics)
   add_library(${target_name} OBJECT ${${sources}})
+  set_property(TARGET ${target_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
 
   if(MSVC)
-    get_msvc_intrinsic_flag(${flag} "flag")
+    get_msvc_intrinsic_flag("${flag}" "flag")
   endif()
 
   if("${flag}" STREQUAL "-mavx2")
@@ -83,7 +84,7 @@ endfunction()
 function(add_intrinsics_source_to_target flag target sources)
   target_sources(${target} PRIVATE ${${sources}})
   if(MSVC)
-    get_msvc_intrinsic_flag(${flag} "flag")
+    get_msvc_intrinsic_flag("${flag}" "flag")
   endif()
   if(flag)
     foreach(source ${${sources}})
@@ -100,6 +101,7 @@ function(get_asm_obj_format out_format)
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho64")
     elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN"
            OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
       set(objformat "win64")
     else()
@@ -109,6 +111,7 @@ function(get_asm_obj_format out_format)
     if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho32")
     elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+           OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN"
            OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
       set(objformat "win32")
     else()
@@ -127,46 +130,59 @@ endfunction()
 # into the aom library target(s). Generates a dummy C file with a dummy function
 # to ensure that all cmake generators can determine the linker language, and
 # that build tools don't complain that an object exposes no symbols.
+#
+# In shared library configs every step described above happens twice, and
+# directory/target/object names are updated to include _shared and _static
+# suffixes.
 function(add_asm_library lib_name asm_sources)
   if("${${asm_sources}}" STREQUAL "")
     return()
   endif()
-  set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${lib_name}")
-  if(NOT EXISTS "${asm_lib_obj_dir}")
-    file(MAKE_DIRECTORY "${asm_lib_obj_dir}")
+
+  list(APPEND asm_configs "static")
+  if(BUILD_SHARED_LIBS)
+    list(APPEND asm_configs "shared")
   endif()
 
-  # TODO(tomfinegan): If cmake ever allows addition of .o files to OBJECT lib
-  # targets, make this OBJECT instead of STATIC to hide the target from
-  # consumers of the AOM cmake build.
-  add_library(${lib_name} STATIC ${${asm_sources}})
-
-  foreach(asm_source ${${asm_sources}})
-    get_filename_component(asm_source_name "${asm_source}" NAME)
-    set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o")
-    add_custom_command(OUTPUT "${asm_object}"
-                       COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS}
-                               -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o
-                               "${asm_object}" "${asm_source}"
-                       DEPENDS "${asm_source}"
-                       COMMENT "Building ASM object ${asm_object}"
-                       WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
-                       VERBATIM)
-    target_sources(aom PRIVATE "${asm_object}")
-    if(BUILD_SHARED_LIBS)
-      target_sources(aom_static PRIVATE "${asm_object}")
+  foreach(asm_config ${asm_configs})
+    set(asm_lib_name ${lib_name}_${asm_config})
+    set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${asm_lib_name}")
+    if(NOT EXISTS "${asm_lib_obj_dir}")
+      file(MAKE_DIRECTORY "${asm_lib_obj_dir}")
     endif()
-  endforeach()
 
-  # The above created a target containing only ASM sources. Cmake needs help
-  # here to determine the linker language. Add a dummy C file to force the
-  # linker language to C. We don't bother with setting the LINKER_LANGUAGE
-  # property on the library target because not all generators obey it (looking
-  # at you, xcode generator).
-  add_dummy_source_file_to_target("${lib_name}" "c")
+    add_library(${asm_lib_name} STATIC ${${asm_sources}})
+    set_property(TARGET ${asm_lib_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
+
+    foreach(asm_source ${${asm_sources}})
+      get_filename_component(asm_source_name "${asm_source}" NAME)
+      set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o")
+      add_custom_command(OUTPUT "${asm_object}"
+                         COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS}
+                                 -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o
+                                 "${asm_object}" "${asm_source}"
+                         DEPENDS "${asm_source}"
+                         COMMENT "Building ASM object ${asm_object}"
+                         WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
+                         VERBATIM)
+      if(BUILD_SHARED_LIBS AND "${asm_config}" STREQUAL "static")
+        target_sources(aom_static PRIVATE "${asm_object}")
+      else()
+        target_sources(aom PRIVATE "${asm_object}")
+      endif()
+    endforeach()
+
+    # The above created a target containing only ASM sources. CMake needs help
+    # here to determine the linker language. Add a dummy C file to force the
+    # linker language to C. We don't bother with setting the LINKER_LANGUAGE
+    # property on the library target because not all generators obey it (looking
+    # at you, Xcode generator).
+    add_dummy_source_file_to_target("${asm_lib_name}" "c")
+
+    # Add the new lib target to the global list of aom library targets.
+    list(APPEND AOM_LIB_TARGETS ${asm_lib_name})
+  endforeach()
 
-  # Add the new lib target to the global list of aom library targets.
-  list(APPEND AOM_LIB_TARGETS ${lib_name})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
 endfunction()
 
@@ -236,5 +252,5 @@ function(add_rtcd_build_step config output source symbol)
     WORKING_DIRECTORY ${AOM_CONFIG_DIR}
     VERBATIM)
   set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
-  set_property(SOURCE ${output} PROPERTY GENERATED)
+  set_property(SOURCE ${output} PROPERTY GENERATED TRUE)
 endfunction()
diff --git a/media/libaom/src/build/cmake/compiler_flags.cmake b/media/libaom/src/build/cmake/compiler_flags.cmake
index 24484bcade..f008b964f5 100644
--- a/media/libaom/src/build/cmake/compiler_flags.cmake
+++ b/media/libaom/src/build/cmake/compiler_flags.cmake
@@ -59,6 +59,12 @@ function(add_c_flag_if_supported c_flag)
     return()
   endif()
 
+  # Between 3.17.0 and 3.18.2 check_c_compiler_flag() sets a normal variable at
+  # parent scope while check_cxx_source_compiles() continues to set an internal
+  # cache variable, so we unset both to avoid the failure / success state
+  # persisting between checks. See
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+  unset(C_FLAG_SUPPORTED)
   unset(C_FLAG_SUPPORTED CACHE)
   message("Checking C compiler flag support for: " ${c_flag})
   check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED)
@@ -89,6 +95,12 @@ function(add_cxx_flag_if_supported cxx_flag)
     return()
   endif()
 
+  # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal variable
+  # at parent scope while check_cxx_source_compiles() continues to set an
+  # internal cache variable, so we unset both to avoid the failure / success
+  # state persisting between checks. See
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+  unset(CXX_FLAG_SUPPORTED)
   unset(CXX_FLAG_SUPPORTED CACHE)
   message("Checking C++ compiler flag support for: " ${cxx_flag})
   check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED)
diff --git a/media/libaom/src/build/cmake/exports.cmake b/media/libaom/src/build/cmake/exports.cmake
index fa7842c9d9..3fcdd0c1f2 100644
--- a/media/libaom/src/build/cmake/exports.cmake
+++ b/media/libaom/src/build/cmake/exports.cmake
@@ -17,9 +17,9 @@ include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
 
 # Creates the custom target which handles generation of the symbol export lists.
 function(setup_exports_target)
-  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+  if(APPLE)
     set(symbol_file_ext "syms")
-  elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
+  elseif(WIN32)
     set(symbol_file_ext "def")
   else()
     set(symbol_file_ext "ver")
@@ -27,23 +27,25 @@ function(setup_exports_target)
 
   set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
 
-  add_custom_target(generate_exports
-                    COMMAND ${CMAKE_COMMAND}
-                            -DAOM_ROOT="${AOM_ROOT}"
-                            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
-                            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
-                            -DAOM_SYM_FILE="${aom_sym_file}"
-                            -DAOM_MSVC=${MSVC}
-                            -DAOM_XCODE=${XCODE}
-                            -DCONFIG_NAME=$<CONFIG>
-                            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
-                            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
-                            -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
-                            -DENABLE_TESTS=${ENABLE_TESTS}
-                            -P
-                            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
-                    SOURCES ${AOM_EXPORTS_SOURCES}
-                    DEPENDS ${AOM_EXPORTS_SOURCES})
+  add_custom_target(
+    generate_exports
+    COMMAND ${CMAKE_COMMAND}
+            -DAOM_ROOT="${AOM_ROOT}"
+            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+            -DAOM_SYM_FILE="${aom_sym_file}"
+            -DAOM_MSVC=${MSVC}
+            -DAOM_XCODE=${XCODE}
+            -DCMAKE_SHARED_LIBRARY_PREFIX="${CMAKE_SHARED_LIBRARY_PREFIX}"
+            -DCONFIG_NAME=$<CONFIG>
+            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+            -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
+            -DENABLE_TESTS=${ENABLE_TESTS}
+            -P
+            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+    SOURCES ${AOM_EXPORTS_SOURCES}
+    DEPENDS ${AOM_EXPORTS_SOURCES} BYPRODUCTS ${aom_sym_file})
 
   # Make libaom depend on the exports file, and set flags to pick it up when
   # creating the dylib.
@@ -54,14 +56,12 @@ function(setup_exports_target)
                  APPEND_STRING
                  PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
   elseif(WIN32)
-    if(NOT MSVC)
-      set_property(TARGET aom
-                   APPEND_STRING
-                   PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
-    else()
+    if(MSVC)
       set_property(TARGET aom
                    APPEND_STRING
                    PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
+    else()
+      target_sources(aom PRIVATE "${aom_sym_file}")
     endif()
 
     # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
diff --git a/media/libaom/src/build/cmake/generate_exports.cmake b/media/libaom/src/build/cmake/generate_exports.cmake
index f1d15a0fa7..3a5f67cea6 100644
--- a/media/libaom/src/build/cmake/generate_exports.cmake
+++ b/media/libaom/src/build/cmake/generate_exports.cmake
@@ -10,6 +10,7 @@
 #
 cmake_minimum_required(VERSION 3.5)
 
+# CMAKE_SHARED_LIBRARY_PREFIX can be empty
 set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE"
                   "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
 
@@ -23,8 +24,9 @@ include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
 
 if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
   set(symbol_prefix "_")
-elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
-  file(WRITE "${AOM_SYM_FILE}" "LIBRARY aom\n" "EXPORTS\n")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+  file(WRITE "${AOM_SYM_FILE}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n"
+                               "EXPORTS\n")
 else()
   set(symbol_suffix ";")
 endif()
@@ -33,8 +35,9 @@ set(aom_sym_file "${AOM_SYM_FILE}")
 
 if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
   file(REMOVE "${aom_sym_file}")
-elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
-  file(WRITE "${aom_sym_file}" "LIBRARY aom\n" "EXPORTS\n")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+  file(WRITE "${aom_sym_file}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n"
+                               "EXPORTS\n")
 else()
   file(WRITE "${aom_sym_file}" "{\nglobal:\n")
 endif()
@@ -47,7 +50,7 @@ endforeach()
 
 foreach(exported_symbol ${exported_symbols})
   string(STRIP "${exported_symbol}" exported_symbol)
-  if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+  if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
     string(SUBSTRING ${exported_symbol} 0 4 export_type)
     string(COMPARE EQUAL "${export_type}" "data" is_data)
     if(is_data)
diff --git a/media/libaom/src/build/cmake/ios-Info.plist b/media/libaom/src/build/cmake/ios-Info.plist
deleted file mode 100644
index 300e3e310d..0000000000
--- a/media/libaom/src/build/cmake/ios-Info.plist
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-	<key>CFBundleDevelopmentRegion</key>
-	<string>en</string>
-	<key>CFBundleExecutable</key>
-	<string>AOM</string>
-	<key>CFBundleIdentifier</key>
-	<string>org.webmproject.AOM</string>
-	<key>CFBundleInfoDictionaryVersion</key>
-	<string>6.0</string>
-	<key>CFBundleName</key>
-	<string>AOM</string>
-	<key>CFBundlePackageType</key>
-	<string>FMWK</string>
-	<key>CFBundleShortVersionString</key>
-	<string>${VERSION}</string>
-	<key>CFBundleSignature</key>
-	<string>????</string>
-	<key>CFBundleSupportedPlatforms</key>
-	<array>
-		<string>iPhoneOS</string>
-	</array>
-	<key>CFBundleVersion</key>
-	<string>${VERSION}</string>
-	<key>MinimumOSVersion</key>
-	<string>${IOS_VERSION_MIN}</string>
-	<key>UIDeviceFamily</key>
-	<array>
-		<integer>1</integer>
-		<integer>2</integer>
-	</array>
-	<key>AOMFullVersion</key>
-	<string>${FULLVERSION}</string>
-</dict>
-</plist>
diff --git a/media/libaom/src/build/cmake/iosbuild.sh b/media/libaom/src/build/cmake/iosbuild.sh
deleted file mode 100644
index 167ece200a..0000000000
--- a/media/libaom/src/build/cmake/iosbuild.sh
+++ /dev/null
@@ -1,384 +0,0 @@
-#!/bin/sh
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-## This script generates 'AOM.framework'. An iOS app can encode and decode AVx
-## video by including 'AOM.framework'.
-##
-## Run iosbuild.sh to create 'AOM.framework' in the current directory.
-##
-set -e
-devnull='> /dev/null 2>&1'
-
-BUILD_ROOT="_iosbuild"
-CONFIGURE_ARGS="--disable-docs
-                --disable-examples
-                --disable-libyuv
-                --disable-unit-tests"
-DIST_DIR="_dist"
-FRAMEWORK_DIR="AOM.framework"
-FRAMEWORK_LIB="AOM.framework/AOM"
-HEADER_DIR="${FRAMEWORK_DIR}/Headers/aom"
-SCRIPT_DIR=$(dirname "$0")
-LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
-LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
-ORIG_PWD="$(pwd)"
-ARM_TARGETS="arm64-darwin-gcc
-             armv7-darwin-gcc
-             armv7s-darwin-gcc"
-SIM_TARGETS="x86-iphonesimulator-gcc
-             x86_64-iphonesimulator-gcc"
-OSX_TARGETS="x86-darwin16-gcc
-             x86_64-darwin16-gcc"
-TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
-
-# Configures for the target specified by $1, and invokes make with the dist
-# target using $ as the distribution output directory.
-build_target() {
-  local target="$1"
-  local old_pwd="$(pwd)"
-  local target_specific_flags=""
-
-  vlog "***Building target: ${target}***"
-
-  case "${target}" in
-    x86-*)
-      target_specific_flags="--enable-pic"
-      vlog "Enabled PIC for ${target}"
-      ;;
-  esac
-
-  mkdir "${target}"
-  cd "${target}"
-  # TODO(tomfinegan@google.com): switch to cmake.
-  eval "${LIBAOM_SOURCE_DIR}/configure" --target="${target}" \
-    ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \
-    ${devnull}
-  export DIST_DIR
-  eval make dist ${devnull}
-  cd "${old_pwd}"
-
-  vlog "***Done building target: ${target}***"
-}
-
-# Returns the preprocessor symbol for the target specified by $1.
-target_to_preproc_symbol() {
-  target="$1"
-  case "${target}" in
-    arm64-*)
-      echo "__aarch64__"
-      ;;
-    armv7-*)
-      echo "__ARM_ARCH_7A__"
-      ;;
-    armv7s-*)
-      echo "__ARM_ARCH_7S__"
-      ;;
-    x86-*)
-      echo "__i386__"
-      ;;
-    x86_64-*)
-      echo "__x86_64__"
-      ;;
-    *)
-      echo "#error ${target} unknown/unsupported"
-      return 1
-      ;;
-  esac
-}
-
-# Create a aom_config.h shim that, based on preprocessor settings for the
-# current target CPU, includes the real aom_config.h for the current target.
-# $1 is the list of targets.
-create_aom_framework_config_shim() {
-  local targets="$1"
-  local config_file="${HEADER_DIR}/aom_config.h"
-  local preproc_symbol=""
-  local target=""
-  local include_guard="AOM_FRAMEWORK_HEADERS_AOM_AOM_CONFIG_H_"
-
-  local file_header="/*
- *  Copyright (c) $(date +%Y), Alliance for Open Media. All rights reserved.
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* GENERATED FILE: DO NOT EDIT! */
-
-#ifndef ${include_guard}
-#define ${include_guard}
-
-#if defined"
-
-  printf "%s" "${file_header}" > "${config_file}"
-  for target in ${targets}; do
-    preproc_symbol=$(target_to_preproc_symbol "${target}")
-    printf " ${preproc_symbol}\n" >> "${config_file}"
-    printf "#define AOM_FRAMEWORK_TARGET \"${target}\"\n" >> "${config_file}"
-    printf "#include \"AOM/aom/${target}/aom_config.h\"\n" >> "${config_file}"
-    printf "#elif defined" >> "${config_file}"
-    mkdir "${HEADER_DIR}/${target}"
-    cp -p "${BUILD_ROOT}/${target}/aom_config.h" "${HEADER_DIR}/${target}"
-  done
-
-  # Consume the last line of output from the loop: We don't want it.
-  sed -i '' -e '$d' "${config_file}"
-
-  printf "#endif\n\n" >> "${config_file}"
-  printf "#endif  // ${include_guard}" >> "${config_file}"
-}
-
-# Verifies that $FRAMEWORK_LIB fat library contains requested builds.
-verify_framework_targets() {
-  local requested_cpus=""
-  local cpu=""
-
-  # Extract CPU from full target name.
-  for target; do
-    cpu="${target%%-*}"
-    if [ "${cpu}" = "x86" ]; then
-      # lipo -info outputs i386 for libaom x86 targets.
-      cpu="i386"
-    fi
-    requested_cpus="${requested_cpus}${cpu} "
-  done
-
-  # Get target CPUs present in framework library.
-  local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
-
-  # $LIPO -info outputs a string like the following:
-  #   Architectures in the fat file: $FRAMEWORK_LIB <architectures>
-  # Capture only the architecture strings.
-  targets_built=${targets_built##*: }
-
-  # Sort CPU strings to make the next step a simple string compare.
-  local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
-  local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
-
-  vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
-  vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
-
-  if [ "${requested}" != "${actual}" ]; then
-    elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
-    elog "  Requested target CPUs: ${requested}"
-    elog "  Actual target CPUs: ${actual}"
-    return 1
-  fi
-}
-
-# Configures and builds each target specified by $1, and then builds
-# AOM.framework.
-build_framework() {
-  local lib_list=""
-  local targets="$1"
-  local target=""
-  local target_dist_dir=""
-
-  # Clean up from previous build(s).
-  rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}"
-
-  # Create output dirs.
-  mkdir -p "${BUILD_ROOT}"
-  mkdir -p "${HEADER_DIR}"
-
-  cd "${BUILD_ROOT}"
-
-  for target in ${targets}; do
-    build_target "${target}"
-    target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
-    if [ "${ENABLE_SHARED}" = "yes" ]; then
-      local suffix="dylib"
-    else
-      local suffix="a"
-    fi
-    lib_list="${lib_list} ${target_dist_dir}/lib/libaom.${suffix}"
-  done
-
-  cd "${ORIG_PWD}"
-
-  # The basic libaom API includes are all the same; just grab the most recent
-  # set.
-  cp -p "${target_dist_dir}"/include/aom/* "${HEADER_DIR}"
-
-  # Build the fat library.
-  ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/AOM
-
-  # Create the aom_config.h shim that allows usage of aom_config.h from
-  # within AOM.framework.
-  create_aom_framework_config_shim "${targets}"
-
-  # Copy in aom_version.h.
-  cp -p "${BUILD_ROOT}/${target}/aom_version.h" "${HEADER_DIR}"
-
-  if [ "${ENABLE_SHARED}" = "yes" ]; then
-    # Adjust the dylib's name so dynamic linking in apps works as expected.
-    install_name_tool -id '@rpath/AOM.framework/AOM' ${FRAMEWORK_DIR}/AOM
-
-    # Copy in Info.plist.
-    cat "${SCRIPT_DIR}/ios-Info.plist" \
-      | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
-      | sed "s/\${VERSION}/${VERSION}/g" \
-      | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
-      > "${FRAMEWORK_DIR}/Info.plist"
-  fi
-
-  # Confirm AOM.framework/AOM contains the targets requested.
-  verify_framework_targets ${targets}
-
-  vlog "Created fat library ${FRAMEWORK_LIB} containing:"
-  for lib in ${lib_list}; do
-    vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
-  done
-}
-
-# Trap function. Cleans up the subtree used to build all targets contained in
-# $TARGETS.
-cleanup() {
-  local res=$?
-  cd "${ORIG_PWD}"
-
-  if [ $res -ne 0 ]; then
-    elog "build exited with error ($res)"
-  fi
-
-  if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
-    rm -rf "${BUILD_ROOT}"
-  fi
-}
-
-print_list() {
-  local indent="$1"
-  shift
-  local list="$@"
-  for entry in ${list}; do
-    echo "${indent}${entry}"
-  done
-}
-
-iosbuild_usage() {
-cat << EOF
-  Usage: ${0##*/} [arguments]
-    --help: Display this message and exit.
-    --enable-shared: Build a dynamic framework for use on iOS 8 or later.
-    --extra-configure-args <args>: Extra args to pass when configuring libaom.
-    --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86
-              and x86_64. Allows linking to framework when builds target MacOSX
-              instead of iOS.
-    --preserve-build-output: Do not delete the build directory.
-    --show-build-output: Show output from each library build.
-    --targets <targets>: Override default target list. Defaults:
-$(print_list "        " ${TARGETS})
-    --test-link: Confirms all targets can be linked. Functionally identical to
-                 passing --enable-examples via --extra-configure-args.
-    --verbose: Output information about the environment and each stage of the
-               build.
-EOF
-}
-
-elog() {
-  echo "${0##*/} failed because: $@" 1>&2
-}
-
-vlog() {
-  if [ "${VERBOSE}" = "yes" ]; then
-    echo "$@"
-  fi
-}
-
-trap cleanup EXIT
-
-# Parse the command line.
-while [ -n "$1" ]; do
-  case "$1" in
-    --extra-configure-args)
-      EXTRA_CONFIGURE_ARGS="$2"
-      shift
-      ;;
-    --help)
-      iosbuild_usage
-      exit
-      ;;
-    --enable-shared)
-      ENABLE_SHARED=yes
-      ;;
-    --preserve-build-output)
-      PRESERVE_BUILD_OUTPUT=yes
-      ;;
-    --show-build-output)
-      devnull=
-      ;;
-    --test-link)
-      EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples"
-      ;;
-    --targets)
-      TARGETS="$2"
-      shift
-      ;;
-    --macosx)
-      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
-      ;;
-    --verbose)
-      VERBOSE=yes
-      ;;
-    *)
-      iosbuild_usage
-      exit 1
-      ;;
-  esac
-  shift
-done
-
-if [ "${ENABLE_SHARED}" = "yes" ]; then
-  CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
-fi
-
-FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBAOM_SOURCE_DIR}")
-VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
-
-if [ "$ENABLE_SHARED" = "yes" ]; then
-  IOS_VERSION_OPTIONS="--enable-shared"
-  IOS_VERSION_MIN="8.0"
-else
-  IOS_VERSION_OPTIONS=""
-  IOS_VERSION_MIN="6.0"
-fi
-
-if [ "${VERBOSE}" = "yes" ]; then
-cat << EOF
-  BUILD_ROOT=${BUILD_ROOT}
-  DIST_DIR=${DIST_DIR}
-  CONFIGURE_ARGS=${CONFIGURE_ARGS}
-  EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
-  FRAMEWORK_DIR=${FRAMEWORK_DIR}
-  FRAMEWORK_LIB=${FRAMEWORK_LIB}
-  HEADER_DIR=${HEADER_DIR}
-  LIBAOM_SOURCE_DIR=${LIBAOM_SOURCE_DIR}
-  LIPO=${LIPO}
-  MAKEFLAGS=${MAKEFLAGS}
-  ORIG_PWD=${ORIG_PWD}
-  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
-  TARGETS="$(print_list "" ${TARGETS})"
-  ENABLE_SHARED=${ENABLE_SHARED}
-  OSX_TARGETS="${OSX_TARGETS}"
-  SIM_TARGETS="${SIM_TARGETS}"
-  SCRIPT_DIR="${SCRIPT_DIR}"
-  FULLVERSION="${FULLVERSION}"
-  VERSION="${VERSION}"
-  IOS_VERSION_MIN="${IOS_VERSION_MIN}"
-EOF
-fi
-
-build_framework "${TARGETS}"
-echo "Successfully built '${FRAMEWORK_DIR}' for:"
-print_list "" ${TARGETS}
diff --git a/media/libaom/src/build/cmake/msvc_runtime.cmake b/media/libaom/src/build/cmake/msvc_runtime.cmake
deleted file mode 100644
index 9e4cbea435..0000000000
--- a/media/libaom/src/build/cmake/msvc_runtime.cmake
+++ /dev/null
@@ -1,37 +0,0 @@
-#
-# Copyright (c) 2017, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and the
-# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
-# not distributed with this source code in the LICENSE file, you can obtain it
-# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
-# License 1.0 was not distributed with this source code in the PATENTS file, you
-# can obtain it at www.aomedia.org/license/patent.
-#
-if(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_)
-  return()
-endif() # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_
-set(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ 1)
-
-if(MSVC)
-
-  # CMake defaults to producing code linked to the DLL MSVC runtime. That will
-  # not work with googletest, and isn't what we want anyway.
-  if(NOT "${MSVC_RUNTIME}" STREQUAL "dll")
-    foreach(flag_var
-            CMAKE_C_FLAGS
-            CMAKE_C_FLAGS_DEBUG
-            CMAKE_C_FLAGS_RELEASE
-            CMAKE_C_FLAGS_MINSIZEREL
-            CMAKE_C_FLAGS_RELWITHDEBINFO
-            CMAKE_CXX_FLAGS
-            CMAKE_CXX_FLAGS_DEBUG
-            CMAKE_CXX_FLAGS_RELEASE
-            CMAKE_CXX_FLAGS_MINSIZEREL
-            CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-  endif()
-endif()
diff --git a/media/libaom/src/build/cmake/pkg_config.cmake b/media/libaom/src/build/cmake/pkg_config.cmake
index c3914d79ea..e8fff2e776 100644
--- a/media/libaom/src/build/cmake/pkg_config.cmake
+++ b/media/libaom/src/build/cmake/pkg_config.cmake
@@ -51,8 +51,14 @@ file(
   APPEND "${pkgconfig_file}"
   "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
 file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
-file(APPEND "${pkgconfig_file}" "Requires:\n")
-file(APPEND "${pkgconfig_file}" "Conflicts:\n")
+file(APPEND "${pkgconfig_file}" "Requires:")
+if(CONFIG_TUNE_VMAF)
+  file(APPEND "${pkgconfig_file}" " libvmaf")
+endif()
+if(CONFIG_TUNE_BUTTERAUGLI)
+  file(APPEND "${pkgconfig_file}" " libjxl")
+endif()
+file(APPEND "${pkgconfig_file}" "\nConflicts:\n")
 file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
 if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
   file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
diff --git a/media/libaom/src/build/cmake/rtcd.pl b/media/libaom/src/build/cmake/rtcd.pl
index dafccdca9c..e9f75dd44b 100644..100755
--- a/media/libaom/src/build/cmake/rtcd.pl
+++ b/media/libaom/src/build/cmake/rtcd.pl
@@ -91,7 +91,9 @@ sub specialize {
 
 sub add_proto {
   my $fn = splice(@_, -2, 1);
-  $ALL_FUNCS{$fn} = \@_;
+  my @proto = @_;
+  foreach (@proto) { tr/\t/ / }
+  $ALL_FUNCS{$fn} = \@proto;
   specialize $fn, "c";
 }
 
diff --git a/media/libaom/src/build/cmake/toolchains/arm64-android-clang.cmake b/media/libaom/src/build/cmake/toolchains/android.cmake
index c13b1d96c5..f0b9fab5b9 100644
--- a/media/libaom/src/build/cmake/toolchains/arm64-android-clang.cmake
+++ b/media/libaom/src/build/cmake/toolchains/android.cmake
@@ -8,21 +8,22 @@
 # License 1.0 was not distributed with this source code in the PATENTS file, you
 # can obtain it at www.aomedia.org/license/patent.
 #
-if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_)
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
   return()
-endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_
-set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_ANDROID_CLANG_CMAKE_ 1)
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_ 1)
 
 if(NOT ANDROID_PLATFORM)
-  set(ANDROID_PLATFORM android-21)
+  set(ANDROID_PLATFORM android-24)
 endif()
 
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
 if(NOT ANDROID_ABI)
   set(ANDROID_ABI arm64-v8a)
 endif()
 
-set(AS_EXECUTABLE as)
-
 # Toolchain files don't have access to cached variables:
 # https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
 # environment variable when loaded the first time.
@@ -39,10 +40,16 @@ endif()
 
 include("${AOM_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
 
-# No intrinsics flag required for arm64-android-clang.
-set(AOM_NEON_INTRIN_FLAG "")
+if(ANDROID_ABI MATCHES "^armeabi")
+  set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
+endif()
 
-# No runtime cpu detect for arm64-android-clang.
-set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
+if(ANDROID_ABI MATCHES "^arm")
+  set(AS_EXECUTABLE as)
+  # No runtime cpu detect for arm targets.
+  set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
+elseif(ANDROID_ABI MATCHES "^x86")
+  set(AS_EXECUTABLE yasm)
+endif()
 
 set(CMAKE_SYSTEM_NAME "Android")
diff --git a/media/libaom/src/build/cmake/toolchains/arm-ios-common.cmake b/media/libaom/src/build/cmake/toolchains/arm-ios-common.cmake
index 053e33a271..62ca1155ee 100644
--- a/media/libaom/src/build/cmake/toolchains/arm-ios-common.cmake
+++ b/media/libaom/src/build/cmake/toolchains/arm-ios-common.cmake
@@ -16,9 +16,10 @@ set(AOM_BUILD_CMAKE_ARM_IOS_COMMON_CMAKE_ 1)
 set(CMAKE_SYSTEM_NAME "Darwin")
 set(CMAKE_OSX_SYSROOT iphoneos)
 set(CMAKE_C_COMPILER clang)
-set(CMAKE_C_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_C_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 set(CMAKE_CXX_COMPILER clang++)
-set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
 # No runtime cpu detect for arm*-ios targets.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/media/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake b/media/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake
index a6c9543db3..fc4b277bb9 100644
--- a/media/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/media/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -24,8 +24,8 @@ endif()
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1 "-march=armv8-a")
-set(CMAKE_CXX_COMPILER_ARG1 "-march=armv8-a")
+set(CMAKE_C_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
 set(AOM_AS_FLAGS "-march=armv8-a")
 set(CMAKE_SYSTEM_PROCESSOR "arm64")
 
diff --git a/media/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake b/media/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake
index b898b4b789..26c028f11f 100644
--- a/media/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/media/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -28,13 +28,15 @@ endif()
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-set(CMAKE_CXX_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
+                          ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
+                            ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
                  ${AOM_EXTRA_TOOLCHAIN_FLAGS})
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 
-set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
+set(AOM_NEON_INTRIN_FLAG "-mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 
 # No runtime cpu detect for armv7-linux-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/media/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake b/media/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake
index 76e0bd140b..173c423c3d 100644
--- a/media/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake
+++ b/media/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake
@@ -16,8 +16,9 @@ set(AOM_BUILD_CMAKE_IOS_SIMULATOR_COMMON_CMAKE_ 1)
 set(CMAKE_SYSTEM_NAME "Darwin")
 set(CMAKE_OSX_SYSROOT iphonesimulator)
 set(CMAKE_C_COMPILER clang)
-set(CMAKE_C_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_C_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 set(CMAKE_CXX_COMPILER clang++)
-set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
 # TODO(tomfinegan): Handle bit code embedding.
diff --git a/media/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake b/media/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake
index c644eec8c0..ad5ebffdc6 100644
--- a/media/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake
+++ b/media/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake
@@ -65,8 +65,9 @@ endif()
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1 "-EL ${MIPS_CFLAGS}")
-set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_C_FLAGS_INIT "-EL ${MIPS_CFLAGS}")
+set(CMAKE_CXX_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
 set(CMAKE_SYSTEM_PROCESSOR "mips32")
 
 # No runtime cpu detect for mips32-linux-gcc.
diff --git a/media/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake b/media/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake
index 442d910995..0af992451c 100644
--- a/media/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake
+++ b/media/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake
@@ -42,8 +42,9 @@ endif()
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
 set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1 "-EL ${MIPS_CFLAGS}")
-set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_C_FLAGS_INIT "-EL ${MIPS_CFLAGS}")
+set(CMAKE_CXX_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-EL ${MIPS_CXXFLAGS}")
 set(CMAKE_SYSTEM_PROCESSOR "mips64")
 
 # No runtime cpu detect for mips64-linux-gcc.
diff --git a/media/libaom/src/build/cmake/toolchains/x86-linux.cmake b/media/libaom/src/build/cmake/toolchains/x86-linux.cmake
index c2a700bfef..a9c4f8c6b4 100644
--- a/media/libaom/src/build/cmake/toolchains/x86-linux.cmake
+++ b/media/libaom/src/build/cmake/toolchains/x86-linux.cmake
@@ -15,5 +15,6 @@ set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_PROCESSOR "x86")
 set(CMAKE_SYSTEM_NAME "Linux")
-set(CMAKE_C_COMPILER_ARG1 "-m32")
-set(CMAKE_CXX_COMPILER_ARG1 "-m32")
+set(CMAKE_C_FLAGS_INIT "-m32")
+set(CMAKE_CXX_FLAGS_INIT "-m32")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-m32")
diff --git a/media/libaom/src/build/cmake/toolchains/x86-macos.cmake b/media/libaom/src/build/cmake/toolchains/x86-macos.cmake
index 095ef18e77..68e1bb07ff 100644
--- a/media/libaom/src/build/cmake/toolchains/x86-macos.cmake
+++ b/media/libaom/src/build/cmake/toolchains/x86-macos.cmake
@@ -11,8 +11,9 @@
 set(CMAKE_SYSTEM_PROCESSOR "x86")
 set(CMAKE_SYSTEM_NAME "Darwin")
 set(CMAKE_OSX_ARCHITECTURES "i386")
-set(CMAKE_C_COMPILER_ARG1 "-arch i386")
-set(CMAKE_CXX_COMPILER_ARG1 "-arch i386")
+set(CMAKE_C_FLAGS_INIT "-arch i386")
+set(CMAKE_CXX_FLAGS_INIT "-arch i386")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch i386")
 
 # Apple tools always complain in 32 bit mode without PIC.
 set(CONFIG_PIC 1 CACHE STRING "")
diff --git a/media/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake b/media/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake
index 4839c9d455..2e9a9a84b6 100644
--- a/media/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake
+++ b/media/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -15,8 +15,9 @@ set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_PROCESSOR "x86")
 set(CMAKE_SYSTEM_NAME "Windows")
-set(CMAKE_C_COMPILER_ARG1 "-m32")
-set(CMAKE_CXX_COMPILER_ARG1 "-m32")
+set(CMAKE_C_FLAGS_INIT "-m32")
+set(CMAKE_CXX_FLAGS_INIT "-m32")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-m32")
 
 if("${CROSS}" STREQUAL "")
   set(CROSS i686-w64-mingw32-)
diff --git a/media/libaom/src/build/cmake/version.cmake b/media/libaom/src/build/cmake/version.cmake
index dd953a37a3..f4377a13e1 100644
--- a/media/libaom/src/build/cmake/version.cmake
+++ b/media/libaom/src/build/cmake/version.cmake
@@ -24,7 +24,9 @@ include("${AOM_ROOT}/build/cmake/util.cmake")
 # Generate the version string for this run.
 unset(aom_version)
 if(EXISTS "${GIT_EXECUTABLE}")
-  execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${AOM_ROOT}/.git describe
+  execute_process(COMMAND ${GIT_EXECUTABLE}
+                          --git-dir=${AOM_ROOT}/.git describe
+                          --match=v[0-9]*
                   OUTPUT_VARIABLE aom_version
                   ERROR_QUIET
                   RESULT_VARIABLE version_check_result)
diff --git a/media/libaom/src/build/cmake/version.pl b/media/libaom/src/build/cmake/version.pl
index 7d23f2b277..392815f81d 100644..100755
--- a/media/libaom/src/build/cmake/version.pl
+++ b/media/libaom/src/build/cmake/version.pl
@@ -62,7 +62,9 @@ my $version_patch = $version_components[2];
 my $version_extra = "";
 if (length($git_desc) > 0) {
   my @git_desc_components = split('-', $git_desc, 2);
-  $version_extra = $git_desc_components[1];
+  if (@git_desc_components > 1) {
+    $version_extra = $git_desc_components[1];
+  }
 }
 
 open(my $version_file, '>', $version_filename) or
diff --git a/media/libaom/src/common/args.c b/media/libaom/src/common/args.c
index ec2a863534..b5ede193b5 100644
--- a/media/libaom/src/common/args.c
+++ b/media/libaom/src/common/args.c
@@ -11,6 +11,7 @@
 
 #include "common/args.h"
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
@@ -18,30 +19,21 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
 #include "aom/aom_codec.h"
+#include "common/tools_common.h"
 
-#if defined(__GNUC__) && __GNUC__
-extern void die(const char *fmt, ...) __attribute__((noreturn));
-#else
-extern void die(const char *fmt, ...);
-#endif
-
-struct arg arg_init(char **argv) {
-  struct arg a;
-
-  a.argv = argv;
-  a.argv_step = 1;
-  a.name = NULL;
-  a.val = NULL;
-  a.def = NULL;
-  return a;
-}
+static const char kSbSizeWarningString[] =
+    "super_block_size has to be 64 or 128.";
+static const char kMinpartWarningString[] =
+    "min_partition_size has to be smaller or equal to max_partition_size.";
+static const char kMaxpartWarningString[] =
+    "max_partition_size has to be smaller or equal to super_block_size.";
 
-char *ignore_front_spaces(const char *str) {
+static char *ignore_front_spaces(const char *str) {
   while (str[0] == ' ' || str[0] == '\t') ++str;
   return (char *)str;
 }
 
-void ignore_end_spaces(char *str) {
+static void ignore_end_spaces(char *str) {
   char *end = str + strlen(str);
   while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' ||
                        end[0] == '\r' || end[0] == '\0'))
@@ -49,13 +41,6 @@ void ignore_end_spaces(char *str) {
   if (end >= str) end[1] = '\0';
 }
 
-static const char kSbSizeWarningString[] =
-    "super_block_size has to be 64 or 128.";
-static const char kMinpartWarningString[] =
-    "min_partition_size has to be smaller or equal to max_partition_size.";
-static const char kMaxpartWarningString[] =
-    "max_partition_size has to be smaller or equal to super_block_size.";
-
 int parse_cfg(const char *file, cfg_options_t *config) {
   char line[1024 * 10];
   FILE *f = fopen(file, "r");
@@ -86,40 +71,40 @@ int parse_cfg(const char *file, cfg_options_t *config) {
     ignore_end_spaces(left);
     ignore_end_spaces(right);
 
-    GET_PARAMS(super_block_size);
-    GET_PARAMS(max_partition_size);
-    GET_PARAMS(min_partition_size);
-    GET_PARAMS(disable_ab_partition_type);
-    GET_PARAMS(disable_rect_partition_type);
-    GET_PARAMS(disable_1to4_partition_type);
-    GET_PARAMS(disable_flip_idtx);
-    GET_PARAMS(disable_cdef);
-    GET_PARAMS(disable_lr);
-    GET_PARAMS(disable_obmc);
-    GET_PARAMS(disable_warp_motion);
-    GET_PARAMS(disable_global_motion);
-    GET_PARAMS(disable_dist_wtd_comp);
-    GET_PARAMS(disable_diff_wtd_comp);
-    GET_PARAMS(disable_inter_intra_comp);
-    GET_PARAMS(disable_masked_comp);
-    GET_PARAMS(disable_one_sided_comp);
-    GET_PARAMS(disable_palette);
-    GET_PARAMS(disable_intrabc);
-    GET_PARAMS(disable_cfl);
-    GET_PARAMS(disable_smooth_intra);
-    GET_PARAMS(disable_filter_intra);
-    GET_PARAMS(disable_dual_filter);
-    GET_PARAMS(disable_intra_angle_delta);
-    GET_PARAMS(disable_intra_edge_filter);
-    GET_PARAMS(disable_tx_64x64);
-    GET_PARAMS(disable_smooth_inter_intra);
-    GET_PARAMS(disable_inter_inter_wedge);
-    GET_PARAMS(disable_inter_intra_wedge);
-    GET_PARAMS(disable_paeth_intra);
-    GET_PARAMS(disable_trellis_quant);
-    GET_PARAMS(disable_ref_frame_mv);
-    GET_PARAMS(reduced_reference_set);
-    GET_PARAMS(reduced_tx_type_set);
+    GET_PARAMS(super_block_size)
+    GET_PARAMS(max_partition_size)
+    GET_PARAMS(min_partition_size)
+    GET_PARAMS(disable_ab_partition_type)
+    GET_PARAMS(disable_rect_partition_type)
+    GET_PARAMS(disable_1to4_partition_type)
+    GET_PARAMS(disable_flip_idtx)
+    GET_PARAMS(disable_cdef)
+    GET_PARAMS(disable_lr)
+    GET_PARAMS(disable_obmc)
+    GET_PARAMS(disable_warp_motion)
+    GET_PARAMS(disable_global_motion)
+    GET_PARAMS(disable_dist_wtd_comp)
+    GET_PARAMS(disable_diff_wtd_comp)
+    GET_PARAMS(disable_inter_intra_comp)
+    GET_PARAMS(disable_masked_comp)
+    GET_PARAMS(disable_one_sided_comp)
+    GET_PARAMS(disable_palette)
+    GET_PARAMS(disable_intrabc)
+    GET_PARAMS(disable_cfl)
+    GET_PARAMS(disable_smooth_intra)
+    GET_PARAMS(disable_filter_intra)
+    GET_PARAMS(disable_dual_filter)
+    GET_PARAMS(disable_intra_angle_delta)
+    GET_PARAMS(disable_intra_edge_filter)
+    GET_PARAMS(disable_tx_64x64)
+    GET_PARAMS(disable_smooth_inter_intra)
+    GET_PARAMS(disable_inter_inter_wedge)
+    GET_PARAMS(disable_inter_intra_wedge)
+    GET_PARAMS(disable_paeth_intra)
+    GET_PARAMS(disable_trellis_quant)
+    GET_PARAMS(disable_ref_frame_mv)
+    GET_PARAMS(reduced_reference_set)
+    GET_PARAMS(reduced_tx_type_set)
 
     fprintf(stderr, "\nInvalid parameter: %s", left);
     exit(-1);
@@ -145,43 +130,12 @@ int parse_cfg(const char *file, cfg_options_t *config) {
 }
 
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
-  struct arg arg;
-
-  if (!argv[0] || argv[0][0] != '-') return 0;
-
-  arg = arg_init(argv);
-
-  if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 &&
-      !strcmp(arg.argv[0] + 1, def->short_name)) {
-    arg.name = arg.argv[0] + 1;
-    arg.val = def->has_val ? arg.argv[1] : NULL;
-    arg.argv_step = def->has_val ? 2 : 1;
-  } else if (def->long_name) {
-    const size_t name_len = strlen(def->long_name);
-
-    if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' &&
-        !strncmp(arg.argv[0] + 2, def->long_name, name_len) &&
-        (arg.argv[0][name_len + 2] == '=' ||
-         arg.argv[0][name_len + 2] == '\0')) {
-      arg.name = arg.argv[0] + 2;
-      arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL;
-      arg.argv_step = 1;
-    }
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  int ret = arg_match_helper(arg_, def, argv, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
   }
-
-  if (arg.name && !arg.val && def->has_val)
-    die("Error: option %s requires argument.\n", arg.name);
-
-  if (arg.name && arg.val && !def->has_val)
-    die("Error: option %s requires no argument.\n", arg.name);
-
-  if (arg.name && (arg.val || !def->has_val)) {
-    arg.def = def;
-    *arg_ = arg;
-    return 1;
-  }
-
-  return 0;
+  return ret;
 }
 
 const char *arg_next(struct arg *arg) {
@@ -192,6 +146,7 @@ const char *arg_next(struct arg *arg) {
 
 char **argv_dup(int argc, const char **argv) {
   char **new_argv = malloc((argc + 1) * sizeof(*argv));
+  if (!new_argv) return NULL;
 
   memcpy(new_argv, argv, argc * sizeof(*argv));
   new_argv[argc] = NULL;
@@ -199,24 +154,31 @@ char **argv_dup(int argc, const char **argv) {
 }
 
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
-  char option_text[40] = { 0 };
-
   for (; *defs; defs++) {
     const struct arg_def *def = *defs;
     char *short_val = def->has_val ? " <arg>" : "";
     char *long_val = def->has_val ? "=<arg>" : "";
+    int n = 0;
 
+    // Short options are indented with two spaces. Long options are indented
+    // with 12 spaces.
     if (def->short_name && def->long_name) {
       char *comma = def->has_val ? "," : ",      ";
 
-      snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val,
-               comma, def->long_name, long_val);
+      n = fprintf(fp, "  -%s%s%s --%s%s", def->short_name, short_val, comma,
+                  def->long_name, long_val);
     } else if (def->short_name)
-      snprintf(option_text, 37, "-%s%s", def->short_name, short_val);
+      n = fprintf(fp, "  -%s%s", def->short_name, short_val);
     else if (def->long_name)
-      snprintf(option_text, 37, "          --%s%s", def->long_name, long_val);
+      n = fprintf(fp, "            --%s%s", def->long_name, long_val);
 
-    fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+    // Descriptions are indented with 40 spaces. If an option is 40 characters
+    // or longer, its description starts on the next line.
+    if (n < 40)
+      for (int i = 0; i < 40 - n; i++) fputc(' ', fp);
+    else
+      fputs("\n                                        ", fp);
+    fprintf(fp, "%s\n", def->desc);
 
     if (def->enums) {
       const struct arg_enum_list *listptr;
@@ -230,114 +192,57 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
 }
 
 unsigned int arg_parse_uint(const struct arg *arg) {
-  char *endptr;
-  const unsigned long rawval = strtoul(arg->val, &endptr, 10);  // NOLINT
-
-  if (arg->val[0] != '\0' && endptr[0] == '\0') {
-    if (rawval <= UINT_MAX) return (unsigned int)rawval;
-
-    die("Option %s: Value %lu out of range for unsigned int\n", arg->name,
-        rawval);
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  unsigned int ret = arg_parse_uint_helper(arg, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
   }
-
-  die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
+  return ret;
 }
 
 int arg_parse_int(const struct arg *arg) {
-  char *endptr;
-  const long rawval = strtol(arg->val, &endptr, 10);  // NOLINT
-
-  if (arg->val[0] != '\0' && endptr[0] == '\0') {
-    if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval;
-
-    die("Option %s: Value %ld out of range for signed int\n", arg->name,
-        rawval);
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  int ret = arg_parse_int_helper(arg, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
   }
-
-  die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
+  return ret;
 }
 
 struct aom_rational arg_parse_rational(const struct arg *arg) {
-  long int rawval;
-  char *endptr;
-  struct aom_rational rat;
-
-  /* parse numerator */
-  rawval = strtol(arg->val, &endptr, 10);
-
-  if (arg->val[0] != '\0' && endptr[0] == '/') {
-    if (rawval >= INT_MIN && rawval <= INT_MAX)
-      rat.num = (int)rawval;
-    else
-      die("Option %s: Value %ld out of range for signed int\n", arg->name,
-          rawval);
-  } else
-    die("Option %s: Expected / at '%c'\n", arg->name, *endptr);
-
-  /* parse denominator */
-  rawval = strtol(endptr + 1, &endptr, 10);
-
-  if (arg->val[0] != '\0' && endptr[0] == '\0') {
-    if (rawval >= INT_MIN && rawval <= INT_MAX)
-      rat.den = (int)rawval;
-    else
-      die("Option %s: Value %ld out of range for signed int\n", arg->name,
-          rawval);
-  } else
-    die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-
-  return rat;
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  struct aom_rational ret = arg_parse_rational_helper(arg, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
+  }
+  return ret;
 }
 
 int arg_parse_enum(const struct arg *arg) {
-  const struct arg_enum_list *listptr;
-  long int rawval;
-  char *endptr;
-
-  /* First see if the value can be parsed as a raw value */
-  rawval = strtol(arg->val, &endptr, 10);
-  if (arg->val[0] != '\0' && endptr[0] == '\0') {
-    /* Got a raw value, make sure it's valid */
-    for (listptr = arg->def->enums; listptr->name; listptr++)
-      if (listptr->val == rawval) return (int)rawval;
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  int ret = arg_parse_enum_helper(arg, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
   }
-
-  /* Next see if it can be parsed as a string */
-  for (listptr = arg->def->enums; listptr->name; listptr++)
-    if (!strcmp(arg->val, listptr->name)) return listptr->val;
-
-  die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
-  return 0;
+  return ret;
 }
 
 int arg_parse_enum_or_int(const struct arg *arg) {
-  if (arg->def->enums) return arg_parse_enum(arg);
-  return arg_parse_int(arg);
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  int ret = arg_parse_enum_or_int_helper(arg, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
+  }
+  return ret;
 }
 
 // parse a comma separated list of at most n integers
 // return the number of elements in the list
 int arg_parse_list(const struct arg *arg, int *list, int n) {
-  const char *ptr = arg->val;
-  char *endptr;
-  int i = 0;
-
-  while (ptr[0] != '\0') {
-    int32_t rawval = (int32_t)strtol(ptr, &endptr, 10);
-    if (rawval < INT_MIN || rawval > INT_MAX) {
-      die("Option %s: Value %ld out of range for signed int\n", arg->name,
-          rawval);
-    } else if (i >= n) {
-      die("Option %s: List has more than %d entries\n", arg->name, n);
-    } else if (*endptr == ',') {
-      endptr++;
-    } else if (*endptr != '\0') {
-      die("Option %s: Bad list separator '%c'\n", arg->name, *endptr);
-    }
-    list[i++] = (int)rawval;
-    ptr = endptr;
+  char err_msg[ARG_ERR_MSG_MAX_LEN];
+  int ret = arg_parse_list_helper(arg, list, n, err_msg);
+  if (err_msg[0] != '\0') {
+    die("%s", err_msg);
   }
-  return i;
+  return ret;
 }
diff --git a/media/libaom/src/common/args.h b/media/libaom/src/common/args.h
index 286f7dd1ac..1c5c437632 100644
--- a/media/libaom/src/common/args.h
+++ b/media/libaom/src/common/args.h
@@ -15,44 +15,13 @@
 
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
+#include "common/args_helper.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct arg {
-  char **argv;
-  const char *name;
-  const char *val;
-  unsigned int argv_step;
-  const struct arg_def *def;
-};
-
-struct arg_enum_list {
-  const char *name;
-  int val;
-};
-#define ARG_ENUM_LIST_END \
-  { 0 }
-
-typedef struct arg_def {
-  const char *short_name;
-  const char *long_name;
-  int has_val;
-  const char *desc;
-  const struct arg_enum_list *enums;
-} arg_def_t;
-#define ARG_DEF(s, l, v, d) \
-  { s, l, v, d, NULL }
-#define ARG_DEF_ENUM(s, l, v, d, e) \
-  { s, l, v, d, e }
-#define ARG_DEF_LIST_END \
-  { 0 }
-
-struct arg arg_init(char **argv);
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
-char *ignore_front_spaces(const char *str);
-void ignore_end_spaces(char *str);
 int parse_cfg(const char *file, cfg_options_t *config);
 const char *arg_next(struct arg *arg);
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
diff --git a/media/libaom/src/common/args_helper.c b/media/libaom/src/common/args_helper.c
new file mode 100644
index 0000000000..2201868335
--- /dev/null
+++ b/media/libaom/src/common/args_helper.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "common/args_helper.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#define SET_ERR_STRING(...) \
+  if (err_msg) snprintf(err_msg, ARG_ERR_MSG_MAX_LEN, __VA_ARGS__)
+
+struct arg arg_init(char **argv) {
+  struct arg a;
+
+  a.argv = argv;
+  a.argv_step = 1;
+  a.name = NULL;
+  a.val = NULL;
+  a.def = NULL;
+  return a;
+}
+
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+                     char *err_msg) {
+  struct arg arg;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  assert(def->has_val == 0 || def->has_val == 1 || def->has_val == -1);
+
+  if (!argv[0] || argv[0][0] != '-') return 0;
+
+  arg = arg_init(argv);
+
+  if (def->short_name && !strcmp(arg.argv[0] + 1, def->short_name)) {
+    arg.name = arg.argv[0] + 1;
+    arg.val = def->has_val ? arg.argv[1] : NULL;
+    arg.argv_step = def->has_val ? 2 : 1;
+  } else if (def->long_name) {
+    const size_t name_len = strlen(def->long_name);
+
+    if (arg.argv[0][1] == '-' &&
+        !strncmp(arg.argv[0] + 2, def->long_name, name_len) &&
+        (arg.argv[0][name_len + 2] == '=' ||
+         arg.argv[0][name_len + 2] == '\0')) {
+      arg.name = arg.argv[0] + 2;
+      arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL;
+      arg.argv_step = 1;
+    }
+  }
+
+  if (arg.name) {
+    if (def->has_val == -1) {
+      arg.def = def;
+      *arg_ = arg;
+      return 1;
+    }
+
+    if (!arg.val && def->has_val) {
+      SET_ERR_STRING("Error: option %s requires argument.\n", arg.name);
+      return 0;
+    }
+
+    if (arg.val && !def->has_val) {
+      SET_ERR_STRING("Error: option %s requires no argument.\n", arg.name);
+      return 0;
+    }
+
+    arg.def = def;
+    *arg_ = arg;
+    return 1;
+  }
+
+  return 0;
+}
+
+unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg) {
+  char *endptr;
+  const unsigned long rawval = strtoul(arg->val, &endptr, 10);  // NOLINT
+
+  if (err_msg) err_msg[0] = '\0';
+
+  if (arg->val[0] != '\0' && endptr[0] == '\0') {
+    if (rawval <= UINT_MAX) return (unsigned int)rawval;
+    SET_ERR_STRING("Option %s: Value %lu out of range for unsigned int\n",
+                   arg->name, rawval);
+    return 0;
+  }
+  SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+  return 0;
+}
+
+int arg_parse_int_helper(const struct arg *arg, char *err_msg) {
+  char *endptr;
+  const long rawval = strtol(arg->val, &endptr, 10);  // NOLINT
+
+  if (err_msg) err_msg[0] = '\0';
+
+  if (arg->val[0] != '\0' && endptr[0] == '\0') {
+    if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval;
+    SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                   arg->name, rawval);
+    return 0;
+  }
+  SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+  return 0;
+}
+
+struct aom_rational arg_parse_rational_helper(const struct arg *arg,
+                                              char *err_msg) {
+  long rawval;  // NOLINT
+  char *endptr;
+  struct aom_rational rat = { 0, 1 };
+
+  if (err_msg) err_msg[0] = '\0';
+
+  /* parse numerator */
+  rawval = strtol(arg->val, &endptr, 10);
+
+  if (arg->val[0] != '\0' && endptr[0] == '/') {
+    if (rawval >= INT_MIN && rawval <= INT_MAX) {
+      rat.num = (int)rawval;
+    } else {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, rawval);
+      return rat;
+    }
+  } else {
+    SET_ERR_STRING("Option %s: Expected / at '%c'\n", arg->name, *endptr);
+    return rat;
+  }
+
+  /* parse denominator */
+  rawval = strtol(endptr + 1, &endptr, 10);
+
+  if (arg->val[0] != '\0' && endptr[0] == '\0') {
+    if (rawval >= INT_MIN && rawval <= INT_MAX) {
+      rat.den = (int)rawval;
+    } else {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, rawval);
+      return rat;
+    }
+  } else {
+    SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+    return rat;
+  }
+
+  return rat;
+}
+
+int arg_parse_enum_helper(const struct arg *arg, char *err_msg) {
+  const struct arg_enum_list *listptr;
+  long rawval;  // NOLINT
+  char *endptr;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  /* First see if the value can be parsed as a raw value */
+  rawval = strtol(arg->val, &endptr, 10);
+  if (arg->val[0] != '\0' && endptr[0] == '\0') {
+    /* Got a raw value, make sure it's valid */
+    for (listptr = arg->def->enums; listptr->name; listptr++)
+      if (listptr->val == rawval) return (int)rawval;
+  }
+
+  /* Next see if it can be parsed as a string */
+  for (listptr = arg->def->enums; listptr->name; listptr++)
+    if (!strcmp(arg->val, listptr->name)) return listptr->val;
+
+  SET_ERR_STRING("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+  return 0;
+}
+
+int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg) {
+  if (arg->def->enums) return arg_parse_enum_helper(arg, err_msg);
+  return arg_parse_int_helper(arg, err_msg);
+}
+
+// parse a comma separated list of at most n integers
+// return the number of elements in the list
+int arg_parse_list_helper(const struct arg *arg, int *list, int n,
+                          char *err_msg) {
+  const char *ptr = arg->val;
+  char *endptr;
+  int i = 0;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  while (ptr[0] != '\0') {
+    long rawval = strtol(ptr, &endptr, 10);  // NOLINT
+    if (rawval < INT_MIN || rawval > INT_MAX) {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, rawval);
+      return 0;
+    } else if (i >= n) {
+      SET_ERR_STRING("Option %s: List has more than %d entries\n", arg->name,
+                     n);
+      return 0;
+    } else if (*endptr == ',') {
+      endptr++;
+    } else if (*endptr != '\0') {
+      SET_ERR_STRING("Option %s: Bad list separator '%c'\n", arg->name,
+                     *endptr);
+      return 0;
+    }
+    list[i++] = (int)rawval;
+    ptr = endptr;
+  }
+  return i;
+}
diff --git a/media/libaom/src/common/args_helper.h b/media/libaom/src/common/args_helper.h
new file mode 100644
index 0000000000..c86a6128d3
--- /dev/null
+++ b/media/libaom/src/common/args_helper.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_HELPER_H_
+#define AOM_COMMON_ARGS_HELPER_H_
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Maximum length of the error messages for the helper functions.
+#define ARG_ERR_MSG_MAX_LEN 200
+
+struct arg {
+  char **argv;
+  const char *name;
+  const char *val;
+  unsigned int argv_step;
+  const struct arg_def *def;
+};
+
+struct arg_enum_list {
+  const char *name;
+  int val;
+};
+#define ARG_ENUM_LIST_END \
+  { 0 }
+
+typedef struct arg_def {
+  const char *short_name;
+  const char *long_name;
+  int has_val;  //  0: The argument must not have a value.
+                //  1: The argument must have a value.
+                // -1: The argument may or may not have a value.
+  const char *desc;
+  const struct arg_enum_list *enums;
+} arg_def_t;
+#define ARG_DEF(s, l, v, d) \
+  { s, l, v, d, NULL }
+#define ARG_DEF_ENUM(s, l, v, d, e) \
+  { s, l, v, d, e }
+#define ARG_DEF_LIST_END \
+  { 0 }
+
+struct arg arg_init(char **argv);
+
+/*
+ * The helper functions below all take an optional parameter err_msg for
+ * error reporting. When err_msg is not NULL (must point to a buffer
+ * which is at least ARG_ERR_MSG_MAX_LEN bytes long), a related error message is
+ * stored in it if an error occurs. It will be set to an empty string if no
+ * error occurs.
+ */
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+                     char *err_msg);
+unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg);
+int arg_parse_int_helper(const struct arg *arg, char *err_msg);
+struct aom_rational arg_parse_rational_helper(const struct arg *arg,
+                                              char *err_msg);
+int arg_parse_enum_helper(const struct arg *arg, char *err_msg);
+int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg);
+int arg_parse_list_helper(const struct arg *arg, int *list, int n,
+                          char *err_msg);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_ARGS_HELPER_H_
diff --git a/media/libaom/src/common/ivf_dec.cmake b/media/libaom/src/common/ivf_dec.cmake
new file mode 100644
index 0000000000..fedeea7940
--- /dev/null
+++ b/media/libaom/src/common/ivf_dec.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_COMMON_IVF_DEC_CMAKE_)
+  return()
+endif() # AOM_COMMON_AOM_COMMON_CMAKE_
+set(AOM_COMMON_IVF_DEC_CMAKE_ 1)
+
+list(APPEND IVF_DEC_SOURCES "${AOM_ROOT}/common/ivfdec.c"
+            "${AOM_ROOT}/common/ivfdec.h")
+
+# Creates the aom_common build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_ivf_dec_targets)
+  add_library(ivf_dec OBJECT ${IVF_DEC_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} ivf_dec PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+  endif()
+endfunction()
diff --git a/media/libaom/src/common/ivfdec.c b/media/libaom/src/common/ivfdec.c
index 80d73b04c9..18f053e3ad 100644
--- a/media/libaom/src/common/ivfdec.c
+++ b/media/libaom/src/common/ivfdec.c
@@ -39,7 +39,7 @@ int file_is_ivf(struct AvxInputContext *input_ctx) {
       if (mem_get_le16(raw_hdr + 4) != 0) {
         fprintf(stderr,
                 "Error: Unrecognized IVF version! This file may not"
-                " decode properly.");
+                " decode properly.\n");
       }
 
       input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
@@ -67,12 +67,13 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
   size_t frame_size = 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read frame size");
+    if (!feof(infile)) fprintf(stderr, "Warning: Failed to read frame size\n");
   } else {
     frame_size = mem_get_le32(raw_header);
 
     if (frame_size > 256 * 1024 * 1024) {
-      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
+      fprintf(stderr, "Warning: Read invalid frame size (%u)\n",
+              (unsigned int)frame_size);
       frame_size = 0;
     }
 
@@ -83,7 +84,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
         *buffer = new_buffer;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer");
+        fprintf(stderr, "Warning: Failed to allocate compressed data buffer\n");
         frame_size = 0;
       }
     }
@@ -97,7 +98,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
   if (!feof(infile)) {
     ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame");
+      fprintf(stderr, "Warning: Failed to read full frame\n");
       return 1;
     }
 
diff --git a/media/libaom/src/common/rawenc.c b/media/libaom/src/common/rawenc.c
index b72132c2e9..aa80d2cae3 100644
--- a/media/libaom/src/common/rawenc.c
+++ b/media/libaom/src/common/rawenc.c
@@ -12,14 +12,8 @@
 #include <stdbool.h>
 #include "common/rawenc.h"
 
+// Number of bytes to write per batch in write_greyscale.
 #define BATCH_SIZE 8
-// When writing greyscale color, batch 8 writes for low bit-depth, 4 writes
-// for high bit-depth.
-static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128,
-                                             128, 128, 128, 128 };
-static const uint8_t batched_hbd[BATCH_SIZE] = {
-  0, 128, 0, 128, 0, 128, 0, 128
-};
 
 // Interface to writing to either a file or MD5Context. Takes a pointer to
 // either the file or MD5Context, the buffer, the size of each element, and
@@ -37,25 +31,34 @@ static void write_md5(void *md5, const uint8_t *buffer, unsigned int size,
   MD5Update((MD5Context *)md5, buffer, size * nmemb);
 }
 
-// Writes out n greyscale values.
-static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func,
+// Writes out n neutral chroma samples (for greyscale).
+static void write_greyscale(const aom_image_t *img, int n, WRITER writer_func,
                             void *file_or_md5) {
-  const uint8_t *b = batched;
-  if (high_bitdepth) {
-    b = batched_hbd;
+  // Batch 8 writes for low bit-depth, 4 writes for high bit-depth.
+  int bytes_per_sample;
+  union {
+    uint8_t u8[BATCH_SIZE];
+    uint16_t u16[BATCH_SIZE / 2];
+  } batched;
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    bytes_per_sample = 2;
+    for (int i = 0; i < BATCH_SIZE / 2; ++i) {
+      batched.u16[i] = 1 << (img->bit_depth - 1);
+    }
+  } else {
+    bytes_per_sample = 1;
+    for (int i = 0; i < BATCH_SIZE; ++i) {
+      batched.u8[i] = 0x80;
+    }
   }
-  const int num_batched_writes =
-      high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE;
+  const int samples_per_batch = BATCH_SIZE / bytes_per_sample;
+  const int num_batched_writes = n / samples_per_batch;
   for (int i = 0; i < num_batched_writes; ++i) {
-    writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE);
+    writer_func(file_or_md5, batched.u8, sizeof(uint8_t), BATCH_SIZE);
   }
-  const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE;
+  const int remaining = n % samples_per_batch;
   for (int i = 0; i < remaining; ++i) {
-    if (high_bitdepth) {
-      writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2);
-    } else {
-      writer_func(file_or_md5, batched, sizeof(uint8_t), 1);
-    }
+    writer_func(file_or_md5, batched.u8, sizeof(uint8_t), bytes_per_sample);
   }
 }
 
@@ -73,7 +76,7 @@ static void raw_write_image_file_or_md5(const aom_image_t *img,
     // If we're on a color plane and the output is monochrome, write a greyscale
     // value. Since there are only YUV planes, compare against Y.
     if (img->monochrome && plane != AOM_PLANE_Y) {
-      write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5);
+      write_greyscale(img, w * h, writer_func, file_or_md5);
       continue;
     }
     const unsigned char *buf = img->planes[plane];
diff --git a/media/libaom/src/common/tools_common.c b/media/libaom/src/common/tools_common.c
index 51c1c52a1a..5b70b1e234 100644
--- a/media/libaom/src/common/tools_common.c
+++ b/media/libaom/src/common/tools_common.c
@@ -9,14 +9,15 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "common/tools_common.h"
-
+#include <assert.h>
 #include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "common/tools_common.h"
+
 #if CONFIG_AV1_ENCODER
 #include "aom/aomcx.h"
 #endif
@@ -65,7 +66,7 @@ void fatal(const char *fmt, ...) {
   exit(EXIT_FAILURE);
 }
 
-void warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
+void aom_tools_warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
 
 void die_codec(aom_codec_ctx_t *ctx, const char *s) {
   const char *detail = aom_codec_error_detail(ctx);
@@ -84,10 +85,12 @@ int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) {
 
   for (plane = 0; plane < 3; ++plane) {
     uint8_t *ptr;
-    const int w = aom_img_plane_width(yuv_frame, plane);
+    int w = aom_img_plane_width(yuv_frame, plane);
     const int h = aom_img_plane_height(yuv_frame, plane);
     int r;
-
+    // Assuming that for nv12 we read all chroma data at one time
+    if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break;
+    if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2;
     /* Determine the correct plane based on the image format. The for-loop
      * always counts in Y,U,V order, but this may not match the order of
      * the data on disk.
@@ -128,66 +131,107 @@ int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) {
   return shortread;
 }
 
+struct CodecInfo {
+  // Pointer to a function of zero arguments that returns an aom_codec_iface_t.
+  aom_codec_iface_t *(*const interface)();
+  char *short_name;
+  uint32_t fourcc;
+};
+
 #if CONFIG_AV1_ENCODER
-static const AvxInterface aom_encoders[] = {
-  { "av1", AV1_FOURCC, &aom_codec_av1_cx },
+static const struct CodecInfo aom_encoders[] = {
+  { &aom_codec_av1_cx, "av1", AV1_FOURCC },
 };
 
 int get_aom_encoder_count(void) {
   return sizeof(aom_encoders) / sizeof(aom_encoders[0]);
 }
 
-const AvxInterface *get_aom_encoder_by_index(int i) { return &aom_encoders[i]; }
+aom_codec_iface_t *get_aom_encoder_by_index(int i) {
+  assert(i >= 0 && i < get_aom_encoder_count());
+  return aom_encoders[i].interface();
+}
 
-const AvxInterface *get_aom_encoder_by_name(const char *name) {
-  int i;
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (strcmp(info->short_name, name) == 0) return info->interface();
+  }
+  return NULL;
+}
 
-  for (i = 0; i < get_aom_encoder_count(); ++i) {
-    const AvxInterface *encoder = get_aom_encoder_by_index(i);
-    if (strcmp(encoder->name, name) == 0) return encoder;
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (info->interface() == iface) {
+      return info->fourcc;
+    }
   }
+  return 0;
+}
 
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_encoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_encoders[i];
+    if (info->interface() == iface) {
+      return info->short_name;
+    }
+  }
   return NULL;
 }
 
-// large scale tile encoding
-static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC,
-                                              &aom_codec_av1_cx };
-const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; }
 #endif  // CONFIG_AV1_ENCODER
 
 #if CONFIG_AV1_DECODER
-static const AvxInterface aom_decoders[] = {
-  { "av1", AV1_FOURCC, &aom_codec_av1_dx },
+static const struct CodecInfo aom_decoders[] = {
+  { &aom_codec_av1_dx, "av1", AV1_FOURCC },
 };
 
 int get_aom_decoder_count(void) {
   return sizeof(aom_decoders) / sizeof(aom_decoders[0]);
 }
 
-const AvxInterface *get_aom_decoder_by_index(int i) { return &aom_decoders[i]; }
-
-const AvxInterface *get_aom_decoder_by_name(const char *name) {
-  int i;
+aom_codec_iface_t *get_aom_decoder_by_index(int i) {
+  assert(i >= 0 && i < get_aom_decoder_count());
+  return aom_decoders[i].interface();
+}
 
-  for (i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    if (strcmp(decoder->name, name) == 0) return decoder;
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (strcmp(info->short_name, name) == 0) return info->interface();
   }
-
   return NULL;
 }
 
-const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) {
-  int i;
-
-  for (i = 0; i < get_aom_decoder_count(); ++i) {
-    const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    if (decoder->fourcc == fourcc) return decoder;
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->fourcc == fourcc) return info->interface();
   }
+  return NULL;
+}
 
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->interface() == iface) {
+      return info->short_name;
+    }
+  }
   return NULL;
 }
+
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
+    const struct CodecInfo *info = &aom_decoders[i];
+    if (info->interface() == iface) {
+      return info->fourcc;
+    }
+  }
+  return 0;
+}
+
 #endif  // CONFIG_AV1_DECODER
 
 void aom_img_write(const aom_image_t *img, FILE *file) {
@@ -208,7 +252,7 @@ void aom_img_write(const aom_image_t *img, FILE *file) {
   }
 }
 
-int aom_img_read(aom_image_t *img, FILE *file) {
+bool aom_img_read(aom_image_t *img, FILE *file) {
   int plane;
 
   for (plane = 0; plane < 3; ++plane) {
@@ -220,12 +264,12 @@ int aom_img_read(aom_image_t *img, FILE *file) {
     int y;
 
     for (y = 0; y < h; ++y) {
-      if (fread(buf, 1, w, file) != (size_t)w) return 0;
+      if (fread(buf, 1, w, file) != (size_t)w) return false;
       buf += stride;
     }
   }
 
-  return 1;
+  return true;
 }
 
 // TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
@@ -437,7 +481,7 @@ static int img_shifted_realloc_required(const aom_image_t *img,
          required_fmt != shifted->fmt;
 }
 
-void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
+bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
                    aom_image_t **img_shifted_ptr) {
   aom_image_t *img = *img_ptr;
   aom_image_t *img_shifted = *img_shifted_ptr;
@@ -457,6 +501,10 @@ void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
     }
     if (!img_shifted) {
       img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+      if (!img_shifted) {
+        *img_shifted_ptr = NULL;
+        return false;
+      }
       img_shifted->bit_depth = output_bit_depth;
       img_shifted->monochrome = img->monochrome;
       img_shifted->csp = img->csp;
@@ -469,6 +517,8 @@ void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
     *img_shifted_ptr = img_shifted;
     *img_ptr = img_shifted;
   }
+
+  return true;
 }
 
 // Related to I420, NV12 format has one luma "luminance" plane Y and one plane
diff --git a/media/libaom/src/common/tools_common.h b/media/libaom/src/common/tools_common.h
index 1ed004521a..77494dea37 100644
--- a/media/libaom/src/common/tools_common.h
+++ b/media/libaom/src/common/tools_common.h
@@ -11,6 +11,7 @@
 #ifndef AOM_COMMON_TOOLS_COMMON_H_
 #define AOM_COMMON_TOOLS_COMMON_H_
 
+#include <stdbool.h>
 #include <stdio.h>
 
 #include "config/aom_config.h"
@@ -78,13 +79,6 @@ enum VideoFileType {
   FILE_TYPE_WEBM
 };
 
-// Used in lightfield example.
-enum {
-  YUV1D,  // 1D tile output for conformance test.
-  YUV,    // Tile output in YUV format.
-  NV12,   // Tile output in NV12 format.
-} UENUM1BYTE(OUTPUT_FORMAT);
-
 // The fourcc for large_scale_tile encoding is "LSTC".
 #define LST_FOURCC 0x4354534c
 
@@ -116,6 +110,7 @@ struct AvxInputContext {
 #if CONFIG_AV1_ENCODER
   y4m_input y4m;
 #endif
+  aom_color_range_t color_range;
 };
 
 #ifdef __cplusplus
@@ -124,133 +119,69 @@ extern "C" {
 
 #if defined(__GNUC__)
 #define AOM_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define AOM_NO_RETURN __declspec(noreturn)
 #else
 #define AOM_NO_RETURN
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef AOM_TOOLS_FORMAT_PRINTF
+#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...) AOM_NO_RETURN;
-void fatal(const char *fmt, ...) AOM_NO_RETURN;
-void warn(const char *fmt, ...);
+AOM_NO_RETURN void die(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
+AOM_NO_RETURN void fatal(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
+void aom_tools_warn(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
 
-void die_codec(aom_codec_ctx_t *ctx, const char *s) AOM_NO_RETURN;
+AOM_NO_RETURN void die_codec(aom_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) AOM_NO_RETURN;
+AOM_NO_RETURN void usage_exit(void);
 
 #undef AOM_NO_RETURN
 
-int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
+// The AOM library can support different encoders / decoders. These
+// functions provide different ways to lookup / iterate through them.
+// The return result may be NULL to indicate no codec was found.
+int get_aom_encoder_count();
+aom_codec_iface_t *get_aom_encoder_by_index(int i);
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name);
+// If the interface is unknown, returns NULL.
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *encoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface);
+
+int get_aom_decoder_count();
+aom_codec_iface_t *get_aom_decoder_by_index(int i);
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name);
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc);
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *decoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface);
 
-///////////////////////////////////////////////////////////////////////////////
-// A description of the interfaces used to access the AOM codecs
-///////////////////////////////////////////////////////////////////////////////
-//
-// There are three levels of interfaces used to access the AOM codec: the
-// AVXInterface, the aom_codec_iface, and the aom_codec_ctx. Each of these
-// is described in detail here.
-//
-//
-// 1. AVXInterface
-//    (Related files: common/tools_common.c,  common/tools_common.h)
-//
-// The high-level interface to the AVx encoders / decoders. Each AvxInterface
-// contains the name of the codec (e.g., "av1"), the four character code
-// associated with it, and a function pointer to the actual interface (see the
-// documentation on aom_codec_iface_t for more info). This API
-// is meant for lookup / iteration over all known codecs.
-//
-// For the encoder, call get_aom_encoder_by_name(...) if you know the name
-// (e.g., "av1"); to iterate over all known encoders, use
-// get_aom_encoder_count() and get_aom_encoder_by_index(i). To get the
-// encoder specifically for large scale tile encoding, use
-// get_aom_lst_encoder().
-//
-// For the decoder, similar functions are available. There is also a
-// get_aom_decoder_by_fourcc(fourcc) to get the decoder based on the four
-// character codes.
-//
-// The main purpose of the AVXInterface is to get a reference to the
-// aom_codec_interface_t, pointed to by its codec_interface variable.
-//
-//
-// 2. aom_codec_iface_t
-//    (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
-//    aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
-//    av1/av1_dx_iface.c)
-//
-// Used to initialize the codec context, which contains the configuration for
-// for modifying the encoder/decoder during run-time. See the documentation of
-// aom/aom_codec.h for more details. For the most part, users will call the
-// helper functions listed there, such as aom_codec_iface_name,
-// aom_codec_get_caps, etc., to interact with it.
-//
-// The main purpose of the aom_codec_iface_t is to provide a way to generate
-// a default codec config, find out what capabilities the implementation has,
-// and create an aom_codec_ctx_t (which is actually used to interact with the
-// codec).
-//
-// Note that the implementations of the aom_codec_iface_t are located in
-// av1/av1_cx_iface.c and av1/av1_dx_iface.c
-//
-//
-// 3. aom_codec_ctx_t
-//  (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
-//   aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
-//
-// The actual interface between user code and the codec. It stores the name
-// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
-// initialization flags, a config for either encoder or the decoder, and a
-// pointer to internal data.
-//
-// The codec is configured / queried through calls to aom_codec_control,
-// which takes a control code (listed in aomcx.h and aomdx.h) and a parameter.
-// In the case of "getter" control codes, the parameter is modified to have
-// the requested value; in the case of "setter" control codes, the codec's
-// configuration is changed based on the parameter. Note that a aom_codec_err_t
-// is returned, which indicates if the operation was successful or not.
-//
-// Note that for the encoder, the aom_codec_alg_priv_t points to the
-// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
-// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
-// here and also used in the core algorithm.
-//
-// At the end, aom_codec_destroy should be called for each initialized
-// aom_codec_ctx_t.
-
-typedef struct AvxInterface {
-  const char *const name;
-  const uint32_t fourcc;
-  // Pointer to a function of zero arguments that returns an aom_codec_iface_t
-  // pointer. E.g.:
-  //   aom_codec_iface_t *codec = interface->codec_interface();
-  aom_codec_iface_t *(*const codec_interface)();
-} AvxInterface;
-
-int get_aom_encoder_count(void);
-// Lookup the interface by index -- it must be the case that
-// i < get_aom_encoder_count()
-const AvxInterface *get_aom_encoder_by_index(int i);
-// Lookup the interface by name -- returns NULL if no match.
-const AvxInterface *get_aom_encoder_by_name(const char *name);
-const AvxInterface *get_aom_lst_encoder(void);
-
-int get_aom_decoder_count(void);
-const AvxInterface *get_aom_decoder_by_index(int i);
-const AvxInterface *get_aom_decoder_by_name(const char *name);
-// Lookup the interface by the fourcc -- returns NULL if no match.
-const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc);
+int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
 
 void aom_img_write(const aom_image_t *img, FILE *file);
-int aom_img_read(aom_image_t *img, FILE *file);
+// Returns true on success, false on failure.
+bool aom_img_read(aom_image_t *img, FILE *file);
 
 double sse_to_psnr(double samples, double peak, double mse);
 void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
 void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
                        int down_shift);
-void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
+// Returns true on success, false on failure.
+bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
                    aom_image_t **img_shifted_ptr);
 void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
 
diff --git a/media/libaom/src/common/warnings.c b/media/libaom/src/common/warnings.c
index 2facee2526..a20531cb8b 100644
--- a/media/libaom/src/common/warnings.c
+++ b/media/libaom/src/common/warnings.c
@@ -86,7 +86,7 @@ void check_encoder_config(int disable_prompt,
   /* Count and print warnings. */
   for (warning = warning_list.warning_node; warning != NULL;
        warning = warning->next_warning, ++num_warnings) {
-    warn(warning->warning_string);
+    aom_tools_warn("%s", warning->warning_string);
   }
 
   free_warning_list(&warning_list);
diff --git a/media/libaom/src/common/webmenc.cc b/media/libaom/src/common/webmenc.cc
index 6ae7df646f..bb754e8119 100644
--- a/media/libaom/src/common/webmenc.cc
+++ b/media/libaom/src/common/webmenc.cc
@@ -12,7 +12,10 @@
 #include "common/webmenc.h"
 
 #include <stdio.h>
+#include <string.h>
 
+#include <memory>
+#include <new>
 #include <string>
 
 #include "common/av1_config.h"
@@ -23,21 +26,73 @@
 namespace {
 const uint64_t kDebugTrackUid = 0xDEADBEEF;
 const int kVideoTrackNumber = 1;
+
+// Simplistic mechanism to detect if an argv parameter refers to
+// an input or output file. Returns the total number of arguments that
+// should be skipped.
+int skip_input_output_arg(const char *arg, const char *input_fname) {
+  if (strcmp(arg, input_fname) == 0) {
+    return 1;
+  }
+  if (strcmp(arg, "-o") == 0 || strcmp(arg, "--output") == 0) {
+    return 2;
+  }
+  if (strncmp(arg, "--output=", strlen("--output=")) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
 }  // namespace
 
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+                               const char *input_fname) {
+  // + 9 for "version:" prefix and for null terminator.
+  size_t total_size = strlen(version) + 9;
+  int i = 1;
+  while (i < argc) {
+    int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      total_size += strlen(argv[i]) + 1;  // + 1 is for space separator.
+      ++i;
+    }
+  }
+  char *result = static_cast<char *>(malloc(total_size));
+  if (result == nullptr) {
+    return nullptr;
+  }
+  char *cur = result;
+  cur += snprintf(cur, total_size, "version:%s", version);
+  i = 1;
+  while (i < argc) {
+    int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      cur += snprintf(cur, total_size, " %s", argv[i]);
+      ++i;
+    }
+  }
+  *cur = '\0';
+  return result;
+}
+
 int write_webm_file_header(struct WebmOutputContext *webm_ctx,
                            aom_codec_ctx_t *encoder_ctx,
                            const aom_codec_enc_cfg_t *cfg,
                            stereo_format_t stereo_fmt, unsigned int fourcc,
-                           const struct AvxRational *par) {
-  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
-  mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
-  if (!writer || !segment) {
+                           const struct AvxRational *par,
+                           const char *encoder_settings) {
+  std::unique_ptr<mkvmuxer::MkvWriter> writer(
+      new (std::nothrow) mkvmuxer::MkvWriter(webm_ctx->stream));
+  std::unique_ptr<mkvmuxer::Segment> segment(new (std::nothrow)
+                                                 mkvmuxer::Segment());
+  if (writer == nullptr || segment == nullptr) {
     fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n");
     return -1;
   }
 
-  bool ok = segment->Init(writer);
+  bool ok = segment->Init(writer.get());
   if (!ok) {
     fprintf(stderr, "webmenc> mkvmuxer Init failed.\n");
     return -1;
@@ -116,13 +171,27 @@ int write_webm_file_header(struct WebmOutputContext *webm_ctx,
     video_track->set_display_height(cfg->g_h);
   }
 
+  if (encoder_settings != nullptr) {
+    mkvmuxer::Tag *tag = segment->AddTag();
+    if (tag == nullptr) {
+      fprintf(stderr,
+              "webmenc> Unable to allocate memory for encoder settings tag.\n");
+      return -1;
+    }
+    ok = tag->add_simple_tag("ENCODER_SETTINGS", encoder_settings);
+    if (!ok) {
+      fprintf(stderr,
+              "webmenc> Unable to allocate memory for encoder settings tag.\n");
+      return -1;
+    }
+  }
+
   if (webm_ctx->debug) {
     video_track->set_uid(kDebugTrackUid);
   }
 
-  webm_ctx->writer = writer;
-  webm_ctx->segment = segment;
-
+  webm_ctx->writer = writer.release();
+  webm_ctx->segment = segment.release();
   return 0;
 }
 
diff --git a/media/libaom/src/common/webmenc.h b/media/libaom/src/common/webmenc.h
index a4aa992b02..c912208b45 100644
--- a/media/libaom/src/common/webmenc.h
+++ b/media/libaom/src/common/webmenc.h
@@ -38,6 +38,16 @@ enum {
   STEREO_FORMAT_RIGHT_LEFT = 11
 } UENUM1BYTE(stereo_format_t);
 
+// Simplistic mechanism to extract encoder settings, without having
+// to re-invoke the entire flag-parsing logic. It lists the codec version
+// and then copies the arguments as-is from argv, but skips the binary name,
+// any arguments that match the input filename, and the output flags "-o"
+// and "--output" (and the following argument for those flags). The caller
+// is responsible for free-ing the returned string. If there is insufficient
+// memory, it returns nullptr.
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+                               const char *input_fname);
+
 // The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
 // success, or -1 upon failure.
 
@@ -45,7 +55,8 @@ int write_webm_file_header(struct WebmOutputContext *webm_ctx,
                            aom_codec_ctx_t *encoder_ctx,
                            const aom_codec_enc_cfg_t *cfg,
                            stereo_format_t stereo_fmt, unsigned int fourcc,
-                           const struct AvxRational *par);
+                           const struct AvxRational *par,
+                           const char *encoder_settings);
 
 int write_webm_block(struct WebmOutputContext *webm_ctx,
                      const aom_codec_enc_cfg_t *cfg,
diff --git a/media/libaom/src/common/y4menc.c b/media/libaom/src/common/y4menc.c
index e3f5d5b387..eaeedba57d 100644
--- a/media/libaom/src/common/y4menc.c
+++ b/media/libaom/src/common/y4menc.c
@@ -83,11 +83,16 @@ static const char *colorspace(unsigned int bit_depth,
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
                           const struct AvxRational *framerate, int monochrome,
                           aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
-                          unsigned int bit_depth) {
+                          unsigned int bit_depth, aom_color_range_t range) {
   const char *color = monochrome ? monochrome_colorspace(bit_depth)
                                  : colorspace(bit_depth, csp, fmt);
-  return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height,
-                  framerate->numerator, framerate->denominator, 'p', color);
+  const char *color_range = "";  // Default assumption is studio range.
+  if (range == AOM_CR_FULL_RANGE) {
+    color_range = " XCOLORRANGE=FULL";
+  }
+  return snprintf(buf, len, "YUV4MPEG2 W%d H%d F%d:%d Ip %s%s\n", width, height,
+                  framerate->numerator, framerate->denominator, color,
+                  color_range);
 }
 
 int y4m_write_frame_header(char *buf, size_t len) {
diff --git a/media/libaom/src/common/y4menc.h b/media/libaom/src/common/y4menc.h
index f6d5fd86be..6484efcc50 100644
--- a/media/libaom/src/common/y4menc.h
+++ b/media/libaom/src/common/y4menc.h
@@ -20,12 +20,12 @@
 extern "C" {
 #endif
 
-#define Y4M_BUFFER_SIZE 128
+#define Y4M_BUFFER_SIZE 256
 
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
                           const struct AvxRational *framerate, int monochrome,
                           aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
-                          unsigned int bit_depth);
+                          unsigned int bit_depth, aom_color_range_t range);
 int y4m_write_frame_header(char *buf, size_t len);
 void y4m_write_image_file(const aom_image_t *img, const int *planes,
                           FILE *file);
diff --git a/media/libaom/src/common/y4minput.c b/media/libaom/src/common/y4minput.c
index f3dfaafc68..d64b6e4792 100644
--- a/media/libaom/src/common/y4minput.c
+++ b/media/libaom/src/common/y4minput.c
@@ -11,6 +11,7 @@
  * Based on code from the OggTheora software codec source code,
  * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */
+#include <assert.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
@@ -52,16 +53,31 @@ static int file_read(void *buf, size_t size, FILE *file) {
   return len == size;
 }
 
+// Stores the color range in 'y4m_ctx', returning 1 if successfully parsed,
+// 0 otherwise.
+static int parse_color_range(y4m_input *y4m_ctx, const char *buf) {
+  // Note that default is studio range.
+  if (strcmp(buf, "LIMITED") == 0) {
+    return 1;
+  }
+  if (strcmp(buf, "FULL") == 0) {
+    y4m_ctx->color_range = AOM_CR_FULL_RANGE;
+    return 1;
+  }
+  fprintf(stderr, "Unknown color range value: %s\n", buf);
+  return 0;
+}
+
+static int parse_metadata(y4m_input *y4m_ctx, const char *buf) {
+  if (strncmp(buf, "COLORRANGE=", 11) == 0) {
+    return parse_color_range(y4m_ctx, buf + 11);
+  }
+  return 1;  // No support for other metadata, just ignore them.
+}
+
 static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
-  int got_w;
-  int got_h;
-  int got_fps;
-  int got_interlace;
-  int got_par;
-  int got_chroma;
   char *p;
   char *q;
-  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
   for (p = _tags;; p = q) {
     /*Skip any leading spaces.*/
     while (*p == ' ') p++;
@@ -74,47 +90,117 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
     switch (p[0]) {
       case 'W': {
         if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
-        got_w = 1;
       } break;
       case 'H': {
         if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
-        got_h = 1;
       } break;
       case 'F': {
         if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
           return -1;
         }
-        got_fps = 1;
       } break;
       case 'I': {
         _y4m->interlace = p[1];
-        got_interlace = 1;
       } break;
       case 'A': {
         if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
           return -1;
         }
-        got_par = 1;
       } break;
       case 'C': {
         if (q - p > 16) return -1;
         memcpy(_y4m->chroma_type, p + 1, q - p - 1);
         _y4m->chroma_type[q - p - 1] = '\0';
-        got_chroma = 1;
       } break;
-        /*Ignore unknown tags.*/
+      case 'X': {
+        if (!parse_metadata(_y4m, p + 1)) return -1;
+      } break;
+      default: break; /*Ignore unknown tags.*/
     }
   }
-  if (!got_w || !got_h || !got_fps) return -1;
-  if (!got_interlace) _y4m->interlace = '?';
-  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
-  /*Chroma-type is not specified in older files, e.g., those generated by
-     mplayer.*/
-  if (!got_chroma)
-    snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420");
   return 0;
 }
 
+// Copy a single tag into the buffer, along with a null character.
+// Returns 0 if any file IO errors occur.
+static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) {
+  size_t i;
+  assert(buf_len >= 1);
+  // Skip leading space characters.
+  do {
+    if (!file_read(buf, 1, file)) {
+      return 0;
+    }
+  } while (buf[0] == ' ');
+
+  // If we hit the newline, treat this as the "empty" tag.
+  if (buf[0] == '\n') {
+    buf[0] = '\0';
+    *end_tag = '\n';
+    return 1;
+  }
+
+  // Copy over characters until a space is hit, or the buffer is exhausted.
+  for (i = 1; i < buf_len; ++i) {
+    if (!file_read(buf + i, 1, file)) {
+      return 0;
+    }
+    if (buf[i] == ' ' || buf[i] == '\n') {
+      break;
+    }
+  }
+  if (i == buf_len) {
+    fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n",
+            (unsigned long)i);
+    return 0;
+  }
+  *end_tag = buf[i];
+  buf[i] = '\0';
+  return 1;
+}
+
+// Returns 1 if tags were parsed successfully, 0 otherwise.
+static int parse_tags(y4m_input *y4m_ctx, FILE *file) {
+  char tag[256];
+  char end;  // Character denoting the end of the tag, ' ' or '\n'.
+  // Set Y4M tags to defaults, updating them as processing occurs. Mandatory
+  // fields are marked with -1 and will be checked after the tags are parsed.
+  y4m_ctx->pic_w = -1;
+  y4m_ctx->pic_h = -1;
+  y4m_ctx->fps_n = -1;  // Also serves as marker for fps_d
+  y4m_ctx->par_n = 0;
+  y4m_ctx->par_d = 0;
+  y4m_ctx->interlace = '?';
+  y4m_ctx->color_range = AOM_CR_STUDIO_RANGE;
+  snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420");
+
+  // Find one tag at a time.
+  do {
+    if (!copy_tag(tag, sizeof(tag), &end, file)) {
+      return 0;
+    }
+    // y4m_parse_tags returns 0 on success.
+    if (y4m_parse_tags(y4m_ctx, tag)) {
+      return 0;
+    }
+  } while (end != '\n');
+
+  // Check the mandatory fields.
+  if (y4m_ctx->pic_w == -1) {
+    fprintf(stderr, "Width field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->pic_h == -1) {
+    fprintf(stderr, "Height field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->fps_n == -1) {
+    fprintf(stderr, "FPS field missing\n");
+    return 0;
+  }
+  return 1;
+}
+
 /*All anti-aliasing filters in the following conversion functions are based on
    one of two window functions:
   The 6-tap Lanczos window (for down-sampling and shifts):
@@ -221,26 +307,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
   }
 }
 
-/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/
-static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
-                                         unsigned char *_aux) {
-  int c_w;
-  int c_h;
-  int c_sz;
-  int pli;
-  /*Skip past the luma data.*/
-  _dst += _y4m->pic_w * _y4m->pic_h;
-  /*Compute the size of each chroma plane.*/
-  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
-  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
-  c_sz = c_w * c_h;
-  for (pli = 1; pli < 3; pli++) {
-    y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h);
-    _dst += c_sz;
-    _aux += c_sz;
-  }
-}
-
 /*This format is only used for interlaced content, but is included for
    completeness.
 
@@ -779,45 +845,41 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
   (void)_aux;
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   aom_chroma_sample_position_t csp, int only_420) {
-  char buffer[80] = { 0 };
-  int ret;
-  int i;
-  /*Read until newline, or 80 cols, whichever happens first.*/
-  for (i = 0; i < 79; i++) {
-    if (_nskip > 0) {
-      buffer[i] = *_skip++;
-      _nskip--;
-    } else {
-      if (!file_read(buffer + i, 1, _fin)) return -1;
-    }
-    if (buffer[i] == '\n') break;
+static const char TAG[] = "YUV4MPEG2";
+
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, aom_chroma_sample_position_t csp,
+                   int only_420) {
+  // File must start with |TAG|.
+  char tag_buffer[9];  // 9 == strlen(TAG)
+  // Read as much as possible from |skip_buffer|, which were characters
+  // that were previously read from the file to do input-type detection.
+  assert(num_skip >= 0 && num_skip <= 8);
+  if (num_skip > 0) {
+    memcpy(tag_buffer, skip_buffer, num_skip);
   }
-  /*We skipped too much header data.*/
-  if (_nskip > 0) return -1;
-  if (i == 79) {
-    fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n");
+  // Start reading from the file now that the |skip_buffer| is depleted.
+  if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) {
     return -1;
   }
-  buffer[i] = '\0';
-  if (memcmp(buffer, "YUV4MPEG", 8)) {
-    fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+  if (memcmp(TAG, tag_buffer, 9) != 0) {
+    fprintf(stderr, "Error parsing header: must start with %s\n", TAG);
     return -1;
   }
-  if (buffer[8] != '2') {
-    fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+  // Next character must be a space.
+  if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') {
+    fprintf(stderr, "Error parsing header: space must follow %s\n", TAG);
+    return -1;
   }
-  ret = y4m_parse_tags(_y4m, buffer + 5);
-  if (ret < 0) {
-    fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
-    return ret;
+  if (!parse_tags(y4m_ctx, file)) {
+    fprintf(stderr, "Error parsing %s header.\n", TAG);
+    return -1;
   }
-  if (_y4m->interlace == '?') {
+  if (y4m_ctx->interlace == '?') {
     fprintf(stderr,
             "Warning: Input video interlacing format unknown; "
             "assuming progressive scan.\n");
-  } else if (_y4m->interlace != 'p') {
+  } else if (y4m_ctx->interlace != 'p') {
     fprintf(stderr,
             "Input video is interlaced; "
             "Only progressive scan handled.\n");
@@ -826,263 +888,268 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
   /* Only support vertical chroma sample position if the input format is
    * already 420mpeg2. Colocated is not supported in Y4M.
    */
-  if (csp == AOM_CSP_VERTICAL && strcmp(_y4m->chroma_type, "420mpeg2") != 0) {
+  if (csp == AOM_CSP_VERTICAL &&
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") != 0) {
     fprintf(stderr,
             "Vertical chroma sample position only supported "
             "for 420mpeg2 input\n");
     return -1;
   }
   if (csp == AOM_CSP_COLOCATED) {
-    fprintf(stderr, "Colocated chroma sample position not supported in Y4M\n");
-    return -1;
+    // TODO(any): check the right way to handle this in y4m
+    fprintf(stderr,
+            "Ignoring colocated chroma sample position for reading in Y4M\n");
   }
-  _y4m->aom_fmt = AOM_IMG_FMT_I420;
-  _y4m->bps = 12;
-  _y4m->bit_depth = 8;
-  if (strcmp(_y4m->chroma_type, "420") == 0 ||
-      strcmp(_y4m->chroma_type, "420jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        _y4m->pic_w * _y4m->pic_h +
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+  y4m_ctx->aom_fmt = AOM_IMG_FMT_I420;
+  y4m_ctx->bps = 12;
+  y4m_ctx->bit_depth = 8;
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  if (strcmp(y4m_ctx->chroma_type, "420") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        y4m_ctx->pic_w * y4m_ctx->pic_h +
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 10;
-    _y4m->bps = 15;
-    _y4m->aom_fmt = AOM_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->bps = 15;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+  } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 12;
-    _y4m->bps = 18;
-    _y4m->aom_fmt = AOM_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->bps = 18;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_null;
-    if (csp != AOM_CSP_VERTICAL) {
-      _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
-      snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420");
-    }
-  } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    y4m_ctx->aux_buf_sz =
+        3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422jpeg_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_422_420jpeg;
+      y4m_ctx->aux_buf_read_sz =
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_422_420jpeg;
     } else {
-      _y4m->aom_fmt = AOM_IMG_FMT_I422;
-      _y4m->bps = 16;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz =
-          _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      y4m_ctx->aom_fmt = AOM_IMG_FMT_I422;
+      y4m_ctx->bps = 16;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz =
+          y4m_ctx->pic_w * y4m_ctx->pic_h +
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->aom_fmt = AOM_IMG_FMT_I42216;
-    _y4m->bps = 20;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216;
+    y4m_ctx->bps = 20;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->aom_fmt = AOM_IMG_FMT_I42216;
-    _y4m->bps = 24;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216;
+    y4m_ctx->bps = 24;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "411") == 0) {
-    _y4m->src_c_dec_h = 4;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) {
+    y4m_ctx->src_c_dec_h = 4;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
-    _y4m->aux_buf_sz =
-        _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_411_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "444") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz =
+        y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_411_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
+      y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
     } else {
-      _y4m->aom_fmt = AOM_IMG_FMT_I444;
-      _y4m->bps = 24;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->aom_fmt = AOM_IMG_FMT_I444;
+      y4m_ctx->bps = 24;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->aom_fmt = AOM_IMG_FMT_I44416;
-    _y4m->bps = 30;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416;
+    y4m_ctx->bps = 30;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->aom_fmt = AOM_IMG_FMT_I44416;
-    _y4m->bps = 36;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416;
+    y4m_ctx->bps = 36;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
+  } else if (strcmp(y4m_ctx->chroma_type, "444alpha") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.
         The extra plane also gets read into the aux buf.
         It will be discarded.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+          3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
     } else {
       fprintf(stderr, "Unsupported format: 444A\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
-    _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
-    _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*No extra space required, but we need to clear the chroma planes.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_mono_420jpeg;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_mono_420jpeg;
   } else {
-    fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type);
     return -1;
   }
   /*The size of the final frame buffers is always computed from the
      destination chroma decimation type.*/
-  _y4m->dst_buf_sz =
-      _y4m->pic_w * _y4m->pic_h +
-      2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
-          ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
-  if (_y4m->bit_depth == 8)
-    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  y4m_ctx->dst_buf_sz =
+      y4m_ctx->pic_w * y4m_ctx->pic_h +
+      2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) *
+          ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v);
+  if (y4m_ctx->bit_depth == 8)
+    y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
   else
-    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+    y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
+  if (!y4m_ctx->dst_buf) return -1;
 
-  if (_y4m->aux_buf_sz > 0)
-    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+  if (y4m_ctx->aux_buf_sz > 0) {
+    y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
+    if (!y4m_ctx->aux_buf) {
+      free(y4m_ctx->dst_buf);
+      return -1;
+    }
+  }
   return 0;
 }
 
diff --git a/media/libaom/src/common/y4minput.h b/media/libaom/src/common/y4minput.h
index f6c5a3d3ab..2472007b67 100644
--- a/media/libaom/src/common/y4minput.h
+++ b/media/libaom/src/common/y4minput.h
@@ -55,16 +55,23 @@ struct y4m_input {
   enum aom_img_fmt aom_fmt;
   int bps;
   unsigned int bit_depth;
+  aom_color_range_t color_range;
 };
 
 /**
- * Open the input file, treating it as Y4M. y4m_input is filled in after
- * reading it. Note that chroma-sample-position should only be set for 420
- * input, and the input chroma is shifted if necessary. The code does not
- * support the conversion from co-located to vertical.
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. Note that |csp| should only be set for 420 input, and the input
+ * chroma is shifted if necessary. The code does not support the conversion
+ * from co-located to vertical. The |skip_buffer| indicates bytes that were
+ * previously read from |file|, to do input-type detection; this buffer will
+ * be read before the |file| is read. It is of size |num_skip|, which *must*
+ * be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
  */
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   aom_chroma_sample_position_t csp, int only_420);
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, aom_chroma_sample_position_t csp,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img);
 
diff --git a/media/libaom/src/doc/AlgorithmDescription.md b/media/libaom/src/doc/AlgorithmDescription.md
new file mode 100644
index 0000000000..bfd64dad67
--- /dev/null
+++ b/media/libaom/src/doc/AlgorithmDescription.md
@@ -0,0 +1,799 @@
+<div style="font-size:3em; text-align:center;"> Algorithm Description </div>
+
+# Abstract
+This document describes technical aspects of coding tools included in
+the associated codec. This document is not a specification of the associated
+codec. Instead, it summarizes the highlighted features of coding tools for new
+developers. This document should be updated when significant new normative
+changes have been integrated into the associated codec.
+
+# Table of Contents
+
+[Abbreviations](#Abbreviations)
+
+[Algorithm description](#Algorithm-Description)
+
+- [Block Partitioning](#Block-Partitioning)
+  - [Coding block partition](#Coding-block-partition)
+  - [Transform block partition](#Transform-block-partition)
+- [Intra Prediction](#Intra-Prediction)
+  - [Directional intra prediction modes](#Directional-intra-prediction-modes)
+  - [Non-directional intra prediction modes](#Non-directional-intra-prediction-modes)
+  - [Recursive filtering modes](#Recursive-filtering-modes)
+  - [Chroma from Luma mode](#Chroma-from-Luma-mode)
+- [Inter Prediction](#Inter-Prediction)
+  - [Motion vector prediction](#Motion-vector-prediction)
+  - [Motion vector coding](#Motion-vector-coding)
+  - [Interpolation filter for motion compensation](#Interpolation-filter-for-motion-compensation)
+  - [Warped motion compensation](#Warped-motion-compensation)
+  - [Overlapped block motion compensation](#Overlapped-block-motion-compensation)
+  - [Reference frames](#Reference-frames)
+  - [Compound Prediction](#Compound-Prediction)
+- [Transform](#Transform)
+- [Quantization](#Quantization)
+- [Entropy Coding](#Entropy-Coding)
+- [Loop filtering and post-processing](#Loop-filtering-and-post-processing)
+  - [Deblocking](#Deblocking)
+  - [Constrained directional enhancement](#Constrained-directional-enhancement)
+  - [Loop Restoration filter](#Loop-Restoration-filter)
+  - [Frame super-resolution](#Frame-super-resolution)
+  - [Film grain synthesis](#Film-grain-synthesis)
+- [Screen content coding](#Screen-content-coding)
+  - [Intra block copy](#Intra-block-copy)
+  - [Palette mode](#Palette-mode)
+
+[References](#References)
+
+# Abbreviations
+
+CfL: Chroma from Luma\
+IntraBC: Intra block copy\
+LCU: Largest coding unit\
+OBMC: Overlapped Block Motion Compensation\
+CDEF: Constrained Directional Enhancement Filter
+
+# Algorithm Description
+
+## Block Partitioning
+
+### Coding block partition
+
+The largest coding block unit (LCU) applied in this codec is 128×128. In
+addition to no split mode `PARTITION_NONE`, the partition tree supports 9
+different partitioning patterns, as shown in below figure.
+
+<figure class="image"> <center><img src="img\partition_codingblock.svg"
+alt="Partition" width="360" /> <figcaption>Figure 1: Supported coding block
+partitions</figcaption> </figure>
+
+According to the number of sub-partitions, the 9 partition modes are summarized
+as follows: 1. Four partitions: `PARTITION_SPLIT`, `PARTITION_VERT_4`,
+`PARTITION_HORZ_4` 2. Three partitions (T-Shape): `PARTITION_HORZ_A`,
+`PARTITION_HORZ_B`, `PARTITION_VERT_A`, `PARTITION_HORZ_B` 3. Two partitions:
+`PARTITION_HORZ`, `PARTITION_VERT`
+
+Among all the 9 partitioning patterns, only `PARTITION_SPLIT` mode supports
+recursive partitioning, i.e., sub-partitions can be further split, other
+partitioning modes cannot further split. Particularly, for 8x8 and 128x128,
+`PARTITION_VERT_4`, `PARTITION_HORZ_4` are not used, and for 8x8, T-Shape
+partitions are not used either.
+
+### Transform block partition
+
+For both intra and inter coded blocks, the coding block can be further
+partitioned into multiple transform units with the partitioning depth up to 2
+levels. The mapping from the transform size of the current depth to the
+transform size of the next depth is shown in the following Table 1.
+
+<figure class="image"> <center><figcaption>Table 1: Transform partition size
+setting</figcaption> <img src="img\tx_partition.svg" alt="Partition" width="220"
+/> </figure>
+
+Furthermore, for intra coded blocks, the transform partition is done in a way
+that all the transform blocks have the same size, and the transform blocks are
+coded in a raster scan order. An example of the transform block partitioning for
+intra coded block is shown in the Figure 2.
+
+<figure class="image"> <center><img src="img\intra_tx_partition.svg"
+alt="Partition" width="600" /> <figcaption>Figure 2: Example of transform
+partitioning for intra coded block</figcaption> </figure>
+
+For inter coded blocks, the transform unit partitioning can be done in a
+recursive manner with the partitioning depth up to 2 levels. The transform
+partitioning supports 1:1 (square), 1:2/2:1, and 1:4/4:1 transform unit sizes
+ranging from 4×4 to 64×64. If the coding block is smaller than or equal to
+64x64, the transform block partitioning can only apply to luma component, for
+chroma blocks, the transform block size is identical to the coding block size.
+Otherwise, if the coding block width or height is greater than 64, then both the
+luma and chroma coding blocks will implicitly split into multiples of min(W,
+64)x min(H, 64) and min(W, 32)x min(H, 32) transform blocks, respectively.
+
+<figure class="image"> <center><img src="img\inter_tx_partition.svg"
+alt="Partition" width="400" /> <figcaption>Figure 3: Example of transform
+partitioning for inter coded block</figcaption> </figure>
+
+## Intra Prediction
+
+### Directional intra prediction modes
+
+Directional intra prediction modes are applied in intra prediction, which models
+local textures using a given direction pattern. Directional intra prediction
+modes are represented by nominal modes and angle delta. The nominal modes are
+similar set of intra prediction angles used in VP9, which includes 8 angles. The
+index value of angle delta is ranging from -3 ~ +3, and zero delta angle
+indicates a nominal mode. The prediction angle is represented by a nominal intra
+angle plus an angle delta. In total, there are 56 directional intra prediction
+modes, as shown in the following figure. In the below figure, solid arrows
+indicate directional intra prediction modes and dotted arrows represent non-zero
+angle delta.
+
+<figure class="image"> <center><img src="img\intra_directional.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 4: Directional intra
+prediction modes</figcaption> </figure>
+
+The nominal mode index and angle delta index is signalled separately, and
+nominal mode index is signalled before the associated angle delta index. It is
+noted that for small block sizes, where the coding gain from extending intra
+prediction angles may saturate, only the nominal modes are used and angle delta
+index is not signalled.
+
+### Non-directional intra prediction modes
+
+In addition to directional intra prediction modes, four non-directional intra
+modes which simulate smooth textures are also included. The four non-directional
+intra modes include `SMOOTH_V`, `SMOOTH_H`, `SMOOTH` and `PAETH predictor`.
+
+In `SMOOTH V`, `SMOOTH H` and `SMOOTH modes`, the prediction values are
+generated using quadratic interpolation along vertical, horizontal directions,
+or the average thereof. The samples used in the quadratic interpolation include
+reconstructed samples from the top and left neighboring blocks and samples from
+the right and bottom boundaries which are approximated by top reconstructed
+samples and the left reconstructed samples.
+
+In `PAETH predictor` mode, the prediction for each sample is assigned as one
+from the top (T), left (L) and top-left (TL) reference samples, which has the
+value closest to the Paeth predictor value, i.e., T + L -TL. The samples used in
+`PAETH predictor` are illustrated in below figure.
+
+<figure class="image"> <center><img src="img\intra_paeth.svg" alt="Directional
+intra" width="300" /> <figcaption>Figure 5: Paeth predictor</figcaption>
+</figure>
+
+### Recursive filtering modes
+
+Five filtering intra modes are defined, and each mode specify a set of eight
+7-tap filters. Given the selected filtering mode index (0~4), the current block
+is divided into 4x2 sub-blocks. For one 4×2 sub-block, each sample is predicted
+by 7-tap interpolation using the 7 top and left neighboring samples as inputs.
+Different filters are applied for samples located at different coordinates
+within a 4×2 sub-block. The prediction process can be done recursively in unit
+4x2 sub-block, which means that prediction samples generated for one 4x2
+prediction block can be used to predict another 4x2 sub-block.
+
+<figure class="image"> <center><img src="img\intra_recursive.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 6: Recursive filtering
+modes</figcaption> </figure>
+
+### Chroma from Luma mode
+
+Chroma from Luma (CfL) is a chroma intra prediction mode, which models chroma
+samples as a linear function of co-located reconstructed luma samples. To align
+the resolution between luma and chroma samples for different chroma sampling
+format, e.g., 4:2:0 and 4:2:2, reconstructed luma pixels may need to be
+sub-sampled before being used in CfL mode. In addition, the DC component is
+removed to form the AC contribution. In CfL mode, the model parameters which
+specify the linear function between two color components are optimized by
+encoder signalled in the bitstream.
+
+<figure class="image"> <center><img src="img\intra_cfl.svg" alt="Directional
+intra" width="700" /> <figcaption>Figure 7: CfL prediction</figcaption>
+</figure>
+
+## Inter Prediction
+
+### Motion vector prediction
+
+Motion vectors are predicted by neighboring blocks which can be either spatial
+neighboring blocks, or temporal neighboring blocks located in a reference frame.
+A set of MV predictors will be identified by checking all these blocks and
+utilized to encode the motion vector information.
+
+**Spatial motion vector prediction**
+
+There are two sets of spatial neighboring blocks that can be utilized for
+finding spatial MV predictors, including the adjacent spatial neighbors which
+are direct top and left neighbors of the current block, and second outer spatial
+neighbors which are close but not directly adjacent to the current block. The
+two sets of spatial neighboring blocks are illustrated in an example shown in
+Figure 8.
+
+<figure class="image"> <center><img src="img\inter_spatial_mvp.svg"
+alt="Directional intra" width="350" /><figcaption>Figure 8: Motion field
+estimation by linear projection</figcaption></figure>
+
+For each set of spatial neighbors, the top row will be checked from left to
+right and then the left column will be checked from top to down. For the
+adjacent spatial neighbors, an additional top-right block will be also checked
+after checking the left column neighboring blocks. For the non-adjacent spatial
+neighbors, the top-left block located at (-1, -1) position will be checked
+first, then the top row and left column in a similar manner as the adjacent
+neighbors. The adjacent neighbors will be checked first, then the temporal MV
+predictor that will be described in the next subsection will be checked second,
+after that, the non-adjacent spatial neighboring blocks will be checked.
+
+For compound prediction which utilizes a pair of reference frames, the
+non-adjacent spatial neighbors are not used for deriving the MV predictor.
+
+**Temporal motion vector prediction**
+
+In addition to spatial neighboring blocks, MV predictor can be also derived
+using co-located blocks of reference pictures, namely temporal MV predictor. To
+generate temporal MV predictor, the MVs of reference frames are first stored
+together with reference indices associated with the reference frame. Then for
+each 8x8 block of the current frame, the MVs of a reference frame which pass the
+8x8 block are identified and stored together with the reference frame index in a
+temporal MV buffer. In an example shown in Figure 5, the MV of reference frame 1
+(R1) pointing from R1 to a reference frame of R1 is identified, i.e., MVref,
+which passes a 8x8 block (shaded in blue dots) of current frame. Then this MVref
+is stored in the temporal MV buffer associated with this 8x8 block. <figure
+class="image"> <center><img src="img\inter_motion_field.svg" alt="Directional
+intra" width="800" /><figcaption>Figure 9: Motion field estimation by linear
+projection</figcaption></figure> Finally, given a couple of pre-defined block
+coordinates, the associated MVs stored in the temporal MV buffer are identified
+and projected accordingly to derive a temporal MV predictor which points from
+the current block to its reference frame, e.g., MV0 in Figure 5. In Figure 6,
+the pre-defined block positions for deriving temporal MV predictors of a 16x16
+block are shown and up to 7 blocks will be checked to find valid temporal MV
+predictors.<figure class="image"> <center><img
+src="img\inter_tmvp_positions.svg" alt="Directional intra" width="300"
+/><figcaption>Figure 10: Block positions for deriving temporal MV
+predictors</figcaption></figure> The temporal MV predictors are checked after
+the nearest spatial MV predictors but before the non-adjacent spatial MV
+predictors.
+
+All the spatial and temporal MV candidates will be put together in a pool, with
+each predictor associated with a weighting determined during the scanning of the
+spatial and temporal neighboring blocks. Based on the associated weightings, the
+candidates are sorted and ranked, and up to four candidates will be used as a
+list MV predictor list.
+
+### Motion vector coding
+
+### Interpolation filter for motion compensation
+
+<mark>[Ed.: to be added]</mark>
+
+### Warped motion compensation
+
+**Global warped motion**
+
+The global motion information is signalled at each inter frame, wherein the
+global motion type and motion parameters are included. The global motion types
+and the number of the associated parameters are listed in the following table.
+
+
+| Global motion type   | Number of parameters   |
+|:------------------:|:--------------------:|
+| Identity (zero motion)| 0 |
+| Translation | 2 |
+| Rotzoom  | 4 |
+| General affine | 6 |
+
+For an inter coded block, after the reference frame index is
+transmitted, if the motion of current block is indicated as global motion, the
+global motion type and the associated parameters of the given reference will be
+used for current block.
+
+**Local warped motion**
+
+For an inter coded block, local warped motion is allowed when the following
+conditions are all satisfied:
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the immediate neighbors uses same reference frame with current block
+
+If the local warped motion is used for current block, instead of signalling the
+affine parameters, they are estimated by using mean square minimization of the
+distance between the reference projection and modeled projection based on the
+motion vectors of current block and its immediate neighbors. To estimate the
+parameters of local warped motion, the projection sample pair of the center
+pixel in neighboring block and its corresponding pixel in the reference frame
+are collected if the neighboring block uses the same reference frame with
+current block. After that, 3 extra samples are created by shifting the center
+position by a quarter sample in one or two dimensions, and these samples are
+also considered as projection sample pairs to ensure the stability of the model
+parameter estimation process.
+
+
+### Overlapped block motion compensation
+
+For an inter-coded block, overlapped block motion compensation (OBMC) is allowed
+when the following conditions are all satisfied.
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the neighboring blocks are inter-coded blocks
+
+When OBMC is applied to current block, firstly, the initial inter prediction
+samples is generated by using the assigned motion vector of current block, then
+the inter predicted samples for the current block and inter predicted samples
+based on motion vectors from the above and left blocks are blended to generate
+the final prediction samples.The maximum number of neighboring motion vectors is
+limited based on the size of current block, and up to 4 motion vectors from each
+of upper and left blocks can be involved in the OBMC process of current block.
+
+One example of the processing order of neighboring blocks is shown in the
+following picture, wherein the values marked in each block indicate the
+processing order of the motion vectors of current block and neighboring blocks.
+To be specific, the motion vector of current block is firstly applied to
+generate inter prediction samples P0(x,y). Then motion vector of block 1 is
+applied to generate the prediction samples p1(x,y). After that, the prediction
+samples in the overlapping area between block 0 and block 1 is an weighted
+average of p0(x,y) and p1(x,y). The overlapping area of block 1 and block 0 is
+marked in grey in the following picture. The motion vectors of block 2, 3, 4 are
+further applied and blended in the same way.
+
+<figure class="image"> <center><img src="img\inter_obmc.svg" alt="Directional
+intra" width="300" /><figcaption>Figure 11: neighboring blocks for OBMC
+process</figcaption></figure>
+
+### Reference frames
+
+<mark>[Ed.: to be added]</mark>
+
+### Compound Prediction
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound wedge prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Difference-modulated masked prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Frame distance-based compound prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound inter-intra prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+## Transform
+
+The separable 2D transform process is applied on prediction residuals. For the
+forward transform, a 1-D vertical transform is performed first on each column of
+the input residual block, then a horizontal transform is performed on each row
+of the vertical transform output. For the backward transform, a 1-D horizontal
+transform is performed first on each row of the input de-quantized coefficient
+block, then a vertical transform is performed on each column of the horizontal
+transform output. The primary 1-D transforms include four different types of
+transform: a) 4-point, 8-point, 16-point, 32-point, 64-point DCT-2; b) 4-point,
+8-point, 16-point asymmetric DST’s (DST-4, DST-7) and c) their flipped
+versions; d) 4-point, 8-point, 16-point, 32-point identity transforms. When
+transform size is 4-point, ADST refers to DST-7, otherwise, when transform size
+is greater than 4-point, ADST refers to DST-4.
+
+<figure class="image"> <center><figcaption>Table 2: Transform basis functions
+(DCT-2, DST-4 and DST-7 for N-point input.</figcaption> <img src=
+"img\tx_basis.svg" alt="Partition" width="450" /> </figure>
+
+For luma component, each transform block can select one pair of horizontal and
+vertical transform combination given a pre-defined set of transform type
+candidates, and the selection is explicitly signalled into the bitstream.
+However, the selection is not signalled when Max(width,height) is 64. When
+the maximum of transform block width and height is greater than or equal to 32,
+the set of transform type candidates depend on the prediction mode, as described
+in Table 3. Otherwise, when the maximum of transform block width and height is
+smaller than 32, the set of transform type candidates depend on the prediction
+mode, as described in Table 4.
+
+<figure class="image"> <center><figcaption>Table 3: Transform type candidates
+for luma component when max(width, height) is greater than or equal to 32.
+</figcaption> <img src="img\tx_cands_large.svg" alt="Partition" width="370" />
+</figure>
+
+<figure class="image"> <center><figcaption>Table 4: Transform type candidates
+for luma component when max(width, height) is smaller than 32. </figcaption>
+<img src="img\tx_cands_small.svg" alt="Partition" width="440" /> </figure>
+
+The set of transform type candidates (namely transform set) is defined in Table
+5.
+
+<figure class="image"> <center><figcaption>Table 5: Definition of transform set.
+</figcaption> <img src="img\tx_set.svg" alt="Partition" width="450" /> </figure>
+
+For chroma component, the transform type selection is done in an implicit way.
+For intra prediction residuals, the transform type is selected according to the
+intra prediction mode, as specified in Table 4. For inter prediction residuals,
+the transform type is selected according to the transform type selection of the
+co-located luma block. Therefore, for chroma component, there is no transform
+type signalling in the bitstream.
+
+<figure class="image"> <center><figcaption>Table 6: Transform type selection for
+chroma component intra prediction residuals.</figcaption> <img src=
+"img\tx_chroma.svg" alt="Partition" width="500" /> </figure>
+
+The computational cost of large size (e.g., 64-point) transforms is further
+reduced by zeroing out all the coefficients except the following two cases:
+
+1. The top-left 32×32 quadrant for 64×64/64×32/32×64 DCT_DCT hybrid transforms
+2. The left 32×16 area for 64×16 and top 16×32 for16×64 DCT_DCT hybrid transforms.
+
+Both the DCT-2 and ADST (DST-4, DST-7) are implemented using butterfly structure
+[1], which included multiple stages of butterfly operations. Each butterfly
+operations can be calculated in parallel and different stages are cascaded in a
+sequential order.
+
+## Quantization
+Quantization of transform coefficients may apply different quantization step
+size for DC and AC transform coefficients, and different quantization step size
+for luma and chroma transform coefficients. To specify the quantization step
+size, in the frame header, a _**base_q_idx**_ syntax element is first signalled,
+which is a 8-bit fixed length code specifying the quantization step size for
+luma AC coefficients. The valid range of _**base_q_idx**_ is [0, 255].
+
+After that, the delta value relative to base_q_idx for Luma DC coefficients,
+indicated as DeltaQYDc is further signalled. Furthermore, if there are more than
+one color plane, then a flag _**diff_uv_delta**_ is signaled to indicate whether
+Cb and Cr color components apply different quantization index values. If
+_**diff_uv_delta**_ is signalled as 0, then only the delta values relative to
+base_q_idx for chroma DC coefficients (indicated as DeltaQUDc) and AC
+coefficients (indicated as DeltaQUAc) are signalled. Otherwise, the delta values
+relative to base_q_idx for both the Cb and Cr DC coefficients (indicated as
+DeltaQUDc and DeltaQVDc) and AC coefficients (indicated as DeltaQUAc and
+DeltaQVAc) are signalled.
+
+The above decoded DeltaQYDc, DeltaQUAc, DeltaQUDc, DeltaQVAc and DeltaQVDc are
+added to _base_q_idx_ to derive the quantization indices. Then these
+quantization indices are further mapped to quantization step size according to
+two tables. For DC coefficients, the mapping from quantization index to
+quantization step size for 8-bit, 10-bit and 12-bit internal bit depth is
+specified by a lookup table Dc_Qlookup[3][256], and the mapping from
+quantization index to quantization step size for 8-bit, 10-bit and 12-bit is
+specified by a lookup table Ac_Qlookup[3][256].
+
+<figure class="image"> <center><img src="img\quant_dc.svg" alt="quant_dc"
+width="800" /><figcaption>Figure 11: Quantization step size of DC coefficients
+for different internal bit-depth</figcaption></figure>
+
+<figure class="image"> <center><img src="img\quant_ac.svg" alt="quant_ac"
+width="800" /><figcaption>Figure 12: Quantization step size of AC coefficients
+for different internal bit-depth</figcaption></figure>
+
+Given the quantization step size, indicated as _Q<sub>step_, the input quantized
+coefficients is further de-quantized using the following formula:
+
+_F_ = sign * ( (_f_ * _Q<sub>step_) % 0xFFFFFF ) / _deNorm_
+
+, where _f_ is the input quantized coefficient, _F_ is the output dequantized
+coefficient, _deNorm_ is a constant value derived from the transform block area
+size, as indicated by the following table:
+
+| _deNorm_ | Tx block area size |
+|----------|:--------------------------|
+| 1| Less than 512 samples |
+| 2 | 512 or 1024 samples |
+| 4 | Greater than 1024 samples |
+
+When the quantization index is 0, the quantization is performed using a
+quantization step size equal to 1, which is lossless coding mode.
+
+## Entropy Coding
+
+**Entropy coding engine**
+
+<mark>[Ed.: to be added]</mark>
+
+**Coefficient coding**
+
+For each transform unit, the coefficient coding starts with coding a skip sign,
+which is followed by the signaling of primary transform kernel type and the
+end-of-block (EOB) position in case the transform coding is not skipped. After
+that, the coefficient values are coded in a multiple level map manner plus sign
+values. The level maps are coded as three level planes, namely lower-level,
+middle-level and higher-level planes, and the sign is coded as another separate
+plane. The lower-level, middle-level and higher-level planes correspond to
+correspond to different ranges of coefficient magnitudes. The lower level plane
+corresponds to the range of 0–2, the middle level plane takes care of the
+range of 3–14, and the higher-level plane covers the range of 15 and above.
+
+The three level planes are coded as follows. After the EOB position is coded,
+the lower-level and middle-level planes are coded together in backward scan
+order, and the scan order refers to zig-zag scan applied on the entire transform
+unit basis. Then the sign plane and higher-level plane are coded together in
+forward scan order. After that, the remainder (coefficient level minus 14) is
+entropy coded using Exp-Golomb code.
+
+The context model applied to the lower level plane depends on the primary
+transform directions, including: bi-directional, horizontal, and vertical, as
+well as transform size, and up to five neighbor (in frequency domain)
+coefficients are used to derive the context. The middle level plane uses a
+similar context model, but the number of context neighbor coefficients is
+reduced from 5 to 2. The higher-level plane is coded by Exp-Golomb code without
+using context model. For the sign plane, except the DC sign that is coded using
+the DC signs from its neighboring transform units, sign values of other
+coefficients are coded directly without using context model.
+
+## Loop filtering and post-processing
+
+### Deblocking
+
+There are four methods when picking deblocking filter level, which are listed
+below:
+
+* LPF_PICK_FROM_FULL_IMAGE: search the full image with different values
+* LPF_PICK_FROM_Q: estimate the filter level based on quantizer and frame type
+* LPF_PICK_FROM_SUBIMAGE: estimate the level from a portion of image
+* LPF_PICK_MINIMAL_LPF: set the filter level to 0 and disable the deblocking
+
+When estimating the filter level from the full image or sub-image, the searching
+starts from the previous frame filter level, ends when the filter step is less
+or equal to zero. In addition to filter level, there are some other parameters
+which control the deblocking filter such as sharpness level, mode deltas, and
+reference deltas.
+
+Deblocking is performed at 128x128 super block level, and the vertical and
+horizontal edges are filtered respectively. For a 128x128 super block, the
+vertical/horizontal edges aligned with each 8x8 block is firstly filtered. If
+the 4x4 transform is used, the internal edge aligned with a 4x4 block will be
+further filtered. The filter length is switchable from 4-tap, 6-tap, 8-tap,
+14-tap, and 0-tap (no filtering). The location of filter taps are identified
+based on the number of filter taps in order to compute the filter mask. When
+finally performing the filtering, outer taps are added if there is high edge
+variance.
+
+### Constrained directional enhancement filter
+
+**Edge Direction Estimation**\
+In CDEF, edge direction search is performed at 8x8 block-level. There are
+eight edge directions in total, as illustrated in Figure 13.
+<figure class="image"> <center><img src="img\edge_direction.svg"
+alt="Edge direction" width="700" /> <figcaption>Figure 13: Line number
+k for pixels following direction d=0:7 in an 8x8 block.</figcaption> </figure>
+
+The optimal edge direction d_opt is found by maximizing the following
+term [3]:
+
+<figure class="image"> <center><img src="img\equ_edge_direction.svg"
+alt="Equation edge direction" width="250" /> </figure>
+<!-- $$d_{opt}=\max_{d} s_d$$
+$$s_d = \sum_{k}\frac{1}{N_{d,k}}(\sum_{p\in P_{d,k}}x_p)^2,$$ -->
+
+where x_p is the value of pixel p, P_{d,k} is the set of pixels in
+line k following direction d, N_{d,k} is the cardinality of P_{d,k}.
+
+**Directional filter**\
+CDEF consists two filter taps: the primary tap and the secondary tap.
+The primary tap works along the edge direction (as shown in Figure 14),
+while the secondary tap forms an oriented 45 degree off the edge direction
+ (as shown in Figure 15).
+
+<figure class="image"> <center><img src="img\primary_tap.svg"
+alt="Primary tap" width="700" /> <figcaption>Figure 14: Primary filter
+taps following edge direction. For even strengths a = 2 and b = 4, for
+odd strengths a = 3 and b = 3. The filtered pixel is shown in the
+highlighted center.</figcaption> </figure>
+
+<figure class="image"> <center><img src="img\secondary_tap.svg"
+alt="Edge direction" width="700" /> <figcaption>Figure 15: Secondary
+filter taps. The filtered pixel is shown in the highlighted center.
+</figcaption> </figure>
+
+CDEF can be described by the following equation:
+
+<figure class="image"> <center><img src="img\equ_dir_search.svg"
+alt="Equation direction search" width="720" /> </figure>
+
+<!-- $$y(i,j)=x(i,j)+round(\sum_{m,n}w^{(p)}_{d,m,n}f(x(m,x)-x(i,j),S^{(p)},
+D)+\sum_{m,n}w^{(s)}_{d,m,n}f(x(m,x)-x(i,j),S^{(s)},D)),$$ -->
+
+where x(i,j) and y(i,j) are the input and output reconstructed values
+of CDEF. p denotes primary tap, and s denotes secondary tap, w is
+the weight between primary and secondary tap. f(d,S,D) is a non-linear
+filtering function, S denotes filter strength, D is a damping parameter.
+For 8-bit content, S^p ranges from 0 to 15, and S^s can be
+0, 1, 2, or 4. D ranges from 3 to 6 for luma, and 2 to 4 for chroma.
+
+**Non linear filter**\
+CDEF uses a non-linear filtering function to prevent excessive blurring
+when applied across an edge. It is achieved by ignoring pixels that are
+too different from the current pixels to be filtered. When the difference
+between current pixel and it's neighboring pixel d is within a threshold,
+f(d,S,D) = d, otherwise f(d,S,D) = 0. Specifically, the strength S
+determines the maximum difference allowed and damping D determines the
+point to ignore the filter tap.
+
+### Loop Restoration filter
+
+**Separable symmetric wiener filter**
+
+Let F be a w x w 2D filter taps around the pixel to be filtered, denoted as
+a w^2 x 1 column vector. When compared with traditional Wiener Filter,
+Separable Symmetric Wiener Filter has the following three constraints in order
+to save signaling bits and reduce complexity [4]:
+
+1) The w x w filter window of is separated into horizontal and vertical w-tap
+convolutions.
+
+2) The horizontal and vertical filters are constrained to be symmetric.
+
+3) It is assumed that the summation of horizontal/vertical filter coefficients
+is 1.
+
+As a result, F can be written as F = column_vectorize[ab^T], subject to a(i)
+= a(w - 1 - i), b(i) = b(w - 1 - i), for i = [0, r - 1], and sum(a(i)) =
+sum(b(i)) = 1, where a is the vertical filters and b is the horizontal filters.
+The derivation of the filters a and b starts from an initial guess of
+horizontal and vertical filters, optimizing one of the two while holding the
+other fixed. In the implementation w = 7, thus, 3 taps need to be sent for
+filters a and b, respectively. When signaling the filter coefficients, 4, 5 and
+6 bits are used for the first three filter taps, and the remaining ones are
+obtained from the normalization and symmetry constraints. 30 bits in total are
+transmitted for both vertical and horizontal filters.
+
+
+**Dual self-guided filter**
+
+Dual self-guided filter is designed to firstly obtain two coarse restorations
+X1 and X2 of the degraded frame X, and the final restoration Xr is obtained as
+a combination of the degraded samples, and the difference between the degraded
+samples and the coarse restorations [4]:
+
+<figure class="image"> <center><img src="img\equ_dual_self_guided.svg"
+alt="Equation dual self guided filter" width="300" /> </figure>
+<!-- $$X_r = X + \alpha (X_1 - X) + \beta (X_2 - X)$$ -->
+
+At encoder side, alpha and beta are computed using:
+
+<figure class="image"> <center><img src="img\equ_dual_self_para.svg"
+alt="Equation dual self guided filter parameter" width="220" /> </figure>
+<!-- $${\alpha, \beta}^T = (A^T A) ^{-1} A^T b,$$ -->
+
+where A = {X1 - X, X2 - X}, b = Y - X, and Y is the original source.
+
+X1 and X2 are obtained using guided filtering, and the filtering is controlled
+by a radius r and a noise parameter e, where a higher r implies a higher
+spatial variance and a higher e implies a higher range variance [4]. X1 and X2
+can be described by {r1, e1} and {r2, e2}, respectively.
+
+The encoder sends a 6-tuple {r1, e1, r2, e2, alpha, beta} to the decoder. In
+the implementation, {r1, e1, r2, e2} uses a 3-bit codebook, and {alpha, beta}
+uses 7-bit each due to much higher precision, resulting in a total of 17 bits.
+r is always less or equal to 3 [4].
+
+Guided filtering can be described by a local linear model:
+
+<figure class="image"> <center><img src="img\equ_guided_filter.svg"
+alt="Equation guided filter" width="155" /> </figure>
+<!-- $$y=Fx+G,$$ -->
+
+where x and y are the input and output samples, F and G are determined by the
+statistics in the neighboring of the pixel to be filtered. It is called
+self-guided filtering when the guidance image is the same as the degraded
+image[4].
+
+Following are three steps when deriving F and G of the self-guided filtering:
+
+1) Compute mean u and variance d of pixels in a (2r + 1) x (2r + 1) window
+around the pixel to be filtered.
+
+2) For each pixel, compute f = d / (d + e); g = (1 - f)u.
+
+3) Compute F and G for each pixel as averages of f and g values in a 3 x 3
+window around the pixel for use in step 2.
+
+### Frame super-resolution
+
+In order to improve the perceptual quality of decoded pictures, a
+super-resolution process is applied at low bit-rates [5]. First, at encoder
+side, the source video is downscaled as a non-normative procedure. Second,
+the downscaled video is encoded, followed by deblocking and CDEF process.
+Third, a linear upscaling process is applied as a normative procedure to bring
+the encoded video back to it's original spatial resolution. Lastly, the loop
+restoration is applied to resolve part of the high frequency lost. The last
+two steps together are called super-resolving process [5]. Similarly, decoding,
+deblocking and CDEF processes are applied at lower spatial resolution at
+decoder side. Then, the frames go through the super-resolving process.
+In order to reduce overheads in line-buffers with respect to hardware
+implementation, the upscaling and downscaling process are applied to
+horizontal dimension only.
+
+### Film grain synthesis
+
+At encoder side, film grain is removed from the input video as a denoising
+process. Then, the structure and intensity of the input video are analyzed
+by canny edge detector, and smooth areas are used to estimate the strength
+of film grain. Once the strength is estimated, the denoised video and film
+grain parameters are sent to decoder side. Those parameters are used to
+synthesis the grain and add it back to the decoded video, producing the final
+output video.
+
+In order to reconstruct the film grain, the following parameters are sent to
+decoder side: lag value, autoregressive coefficients, values for precomputed
+look-up table index of chroma components, and a set of points for a piece-wise
+linear scaling function [6]. Those parameters are signaled as quantized
+integers including 64 bytes for scaling function and 74 bytes for
+autoregressive coefficients. Once the parameters are received, an
+autoregressive process is applied in a raster scan order to generate one 64x64
+luma and two 32x32 chroma film grain templates [6]. Those templates are used
+to generate the grain for the remaining part of a picture.
+
+## Screen content coding
+
+To improve the coding performance of screen content coding, the associated video
+codec incorporates several coding tools，for example, intra block copy
+(IntraBC) is employed to handle the repeated patterns in a screen picture, and
+palette mode is used to handle the screen blocks with a limited number of
+different colors.
+
+### Intra block copy
+
+Intra Block Copy (IntraBC) [2] is a coding tool similar to inter-picture
+prediction. The main difference is that in IntraBC, a predictor block is
+formed from the reconstructed samples (before application of in-loop filtering)
+of the current picture. Therefore, IntraBC can be considered as "motion
+compensation" within current picture.
+
+A block vector (BV) was coded to specify the location of the predictor block.
+The BV precision is integer. The BV will be signalled in the bitstream since the
+decoder needs it to locate the predictor. For current block, the flag use
+IntraBC indicating whether current block is IntraBC mode is first transmitted in
+bit stream. Then, if the current block is IntraBC mode, the BV difference diff
+is obtained by subtracting the reference BV from the current BV, and then diff
+is classified into four types according to the diff values of horizontal and
+vertical component. Type information needs to be transmitted into the bitstream,
+after that, diff values of two components may be signalled based on the type
+info.
+
+IntraBC is very effective for screen content coding, but it also brings a lot of
+difficulties to hardware design. To facilitate the hardware design, the
+following modifications are adopted.
+
+1) when IntraBC is allowed, the loop filters are disabled, which are de-blocking
+filter, the CDEF (Constrained Directional Enhancement Filter), and the Loop
+Restoration. By doing this, picture buffer of reconstructed samples can be
+shared between IntraBC and inter prediction.
+
+2) To facilitate parallel decoding, the prediction cannot exceed the restricted
+areas. For one super block, if the coordinate of its top-left position is (x0,
+y0), the prediction at position (x, y) can be accessed by IntraBC, if y < y0 and
+x < x0 + 2 * (y0 - y)
+
+3) To allow hardware writing back delay, immediate reconstructed areas cannot be
+accessed by IntraBC prediction. The restricted immediate reconstructed area can
+be 1 ∼ n super blocks. So on top of modification 2, if the coordinate of one
+super block's top-left position is (x0, y0), the prediction at position (x, y)
+can be accessed by IntraBC, if y < y0 and x < x0 + 2 * (y0 - y) - D, where D
+denotes the restricted immediate reconstructed area. When D is one super block,
+the prediction area is shown in below figure.
+
+<figure class="image"> <center><img src="img\SCC_IntraBC.svg" alt="Intra block
+copy" width="600" /> <figcaption>Figure 13: the prediction area for IntraBC mode
+in one super block prediction</figcaption> </figure>
+
+### Palette mode
+
+# References
+
+[1] J. Han, Y. Xu and D. Mukherjee, "A butterfly structured design of the hybrid
+transform coding scheme," 2013 Picture Coding Symposium (PCS), San Jose, CA,
+2013, pp. 17-20.\
+[2] J. Li, H. Su, A. Converse, B. Li, R. Zhou, B. Lin, J. Xu, Y. Lu, and R.
+Xiong, "Intra Block Copy for Screen Content in the Emerging AV1 Video Codec,"
+2018 Data Compression Conference, Snowbird, Utah, USA.\
+[3] S. Midtskogen and J.M. Valin. "The AV1 constrained directional enhancement
+ filter (CDEF)." In 2018 IEEE International Conference on Acoustics, Speech
+  and Signal Processing (ICASSP), pp. 1193-1197. IEEE, 2018.\
+[4] D. Mukherjee, S. Li, Y. Chen, A. Anis, S. Parker, and
+J. Bankoski. "A switchable loop-restoration with side-information framework
+for the emerging AV1 video codec." In 2017 IEEE International Conference on
+Image Processing (ICIP), pp. 265-269. IEEE, 2017.\
+[5] Y. Chen, D. Murherjee, J. Han, A. Grange, Y. Xu, Z. Liu,... & C.H.Chiang,
+(2018, June). "An overview of core coding tools in the AV1 video codec.""
+In 2018 Picture Coding Symposium (PCS) (pp. 41-45). IEEE.\
+[6] A. Norkin, & N. Birkbeck, (2018, March). "Film grain synthesis for AV1
+video codec." In 2018 Data Compression Conference (pp. 3-12). IEEE.
diff --git a/media/libaom/src/doc/dev_guide/av1_decoder.dox b/media/libaom/src/doc/dev_guide/av1_decoder.dox
new file mode 100644
index 0000000000..f65ddb51ca
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/av1_decoder.dox
@@ -0,0 +1,11 @@
+/*!\page decoder_guide AV1 DECODER GUIDE
+
+  Describe AV1 decoding techniques here.
+
+  \cond
+  \if av1_md_support
+  [AV1 Algorithm Description](\ref LALGORITHMDESCRIPTION)
+  \endif
+  \endcond
+
+*/
diff --git a/media/libaom/src/doc/dev_guide/av1_encoder.dox b/media/libaom/src/doc/dev_guide/av1_encoder.dox
new file mode 100644
index 0000000000..0f7e8f87e2
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/av1_encoder.dox
@@ -0,0 +1,1617 @@
+/*!\page encoder_guide AV1 ENCODER GUIDE
+
+\tableofcontents
+
+\section architecture_introduction Introduction
+
+This document provides an architectural overview of the libaom AV1 encoder.
+
+It is intended as a high level starting point for anyone wishing to contribute
+to the project, that will help them to more quickly understand the structure
+of the encoder and find their way around the codebase.
+
+It stands above and will where necessary link to more detailed function
+level documents.
+
+\subsection  architecture_gencodecs Generic Block Transform Based Codecs
+
+Most modern video encoders including VP8, H.264, VP9, HEVC and AV1
+(in increasing order of complexity) share a common basic paradigm. This
+comprises separating a stream of raw video frames into a series of discrete
+blocks (of one or more sizes), then computing a prediction signal and a
+quantized, transform coded, residual error signal. The prediction and residual
+error signal, along with any side information needed by the decoder, are then
+entropy coded and packed to form the encoded bitstream. See Figure 1: below,
+where the blue blocks are, to all intents and purposes, the lossless parts of
+the encoder and the red block is the lossy part.
+
+This is of course a gross oversimplification, even in regard to the simplest
+of the above codecs.  For example, all of them allow for block based
+prediction at multiple different scales (i.e. different block sizes) and may
+use previously coded pixels in the current frame for prediction or pixels from
+one or more previously encoded frames. Further, they may support multiple
+different transforms and transform sizes and quality optimization tools like
+loop filtering.
+
+\image html genericcodecflow.png "" width=70%
+
+\subsection architecture_av1_structure AV1 Structure and Complexity
+
+As previously stated, AV1 adopts the same underlying paradigm as other block
+transform based codecs. However, it is much more complicated than previous
+generation codecs and supports many more block partitioning, prediction and
+transform options.
+
+AV1 supports block partitions of various sizes from 128x128 pixels down to 4x4
+pixels using a multi-layer recursive tree structure as illustrated in figure 2
+below.
+
+\image html av1partitions.png "" width=70%
+
+AV1 also provides 71 basic intra prediction modes, 56 single frame inter prediction
+modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion
+compensation)), 12768 compound inter prediction modes (that combine inter
+predictors from two reference frames) and 36708 compound inter / intra
+prediction modes. Furthermore, in addition to simple inter motion estimation,
+AV1 also supports warped motion prediction using affine transforms.
+
+In terms of transform coding, it has 16 separable 2-D transform kernels
+\f$(DCT, ADST, fADST, IDTX)^2\f$ that can be applied at up to 19 different
+scales from 64x64 down to 4x4 pixels.
+
+When combined together, this means that for any one 8x8 pixel block in a
+source frame, there are approximately 45,000,000 different ways that it can
+be encoded.
+
+Consequently, AV1 requires complex control processes. While not necessarily
+a normative part of the bitstream, these are the algorithms that turn a set
+of compression tools and a bitstream format specification, into a coherent
+and useful codec implementation. These may include but are not limited to
+things like :-
+
+- Rate distortion optimization (The process of trying to choose the most
+  efficient combination of block size, prediction mode, transform type
+  etc.)
+- Rate control (regulation of the output bitrate)
+- Encoder speed vs quality trade offs.
+- Features such as two pass encoding or optimization for low delay
+  encoding.
+
+For a more detailed overview of AV1's encoding tools and a discussion of some
+of the design considerations and hardware constraints that had to be
+accommodated, please refer to <a href="https://arxiv.org/abs/2008.06091">
+A Technical Overview of AV1</a>.
+
+Figure 3 provides a slightly expanded but still simplistic view of the
+AV1 encoder architecture with blocks that relate to some of the subsequent
+sections of this document. In this diagram, the raw uncompressed frame buffers
+are shown in dark green and the reconstructed frame buffers used for
+prediction in light green. Red indicates those parts of the codec that are
+(or may be) lossy, where fidelity can be traded off against compression
+efficiency, whilst light blue shows algorithms or coding tools that are
+lossless. The yellow blocks represent non-bitstream normative configuration
+and control algorithms.
+
+\image html av1encoderflow.png "" width=70%
+
+\section architecture_command_line The Libaom Command Line Interface
+
+ Add details or links here: TODO ? elliotk@
+
+\section architecture_enc_data_structures Main Encoder Data Structures
+
+The following are the main high level data structures used by the libaom AV1
+encoder and referenced elsewhere in this overview document:
+
+- \ref AV1_PRIMARY
+    - \ref AV1_PRIMARY.gf_group (\ref GF_GROUP)
+    - \ref AV1_PRIMARY.lap_enabled
+    - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+    - \ref AV1_PRIMARY.p_rc (\ref PRIMARY_RATE_CONTROL)
+    - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO)
+
+- \ref AV1_COMP
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+    - \ref AV1_COMP.speed
+    - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.pass
+    - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+    - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+    - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+    - \ref AlgoCfg.arnr_max_frames
+    - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+    - \ref KeyFrameCfg.enable_keyframe_filtering
+
+- \ref RateControlCfg (Rate control configuration)
+    - \ref RateControlCfg.mode
+    - \ref RateControlCfg.target_bandwidth
+    - \ref RateControlCfg.best_allowed_q
+    - \ref RateControlCfg.worst_allowed_q
+    - \ref RateControlCfg.cq_level
+    - \ref RateControlCfg.under_shoot_pct
+    - \ref RateControlCfg.over_shoot_pct
+    - \ref RateControlCfg.maximum_buffer_size_ms
+    - \ref RateControlCfg.starting_buffer_level_ms
+    - \ref RateControlCfg.optimal_buffer_level_ms
+    - \ref RateControlCfg.vbrbias
+    - \ref RateControlCfg.vbrmin_section
+    - \ref RateControlCfg.vbrmax_section
+
+- \ref PRIMARY_RATE_CONTROL (Primary Rate control status)
+    - \ref PRIMARY_RATE_CONTROL.gf_intervals[]
+    - \ref PRIMARY_RATE_CONTROL.cur_gf_index
+
+- \ref RATE_CONTROL (Rate control status)
+    - \ref RATE_CONTROL.intervals_till_gf_calculate_due
+    - \ref RATE_CONTROL.frames_till_gf_update_due
+    - \ref RATE_CONTROL.frames_to_key
+
+- \ref TWO_PASS (Two pass status and control data)
+
+- \ref GF_GROUP (Data related to the current GF/ARF group)
+
+- \ref FIRSTPASS_STATS (Defines entries in the first pass stats buffer)
+    - \ref FIRSTPASS_STATS.coded_error
+
+- \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+    - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+- \ref HIGH_LEVEL_SPEED_FEATURES
+    - \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop
+    - \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance
+
+- \ref TplParams
+
+\section architecture_enc_use_cases Encoder Use Cases
+
+The libaom AV1 encoder is configurable to support a number of different use
+cases and rate control strategies.
+
+The principle use cases for which it is optimised are as follows:
+
+ - <b>Video on Demand / Streaming</b>
+ - <b>Low Delay or Live Streaming</b>
+ - <b>Video Conferencing / Real Time Coding (RTC)</b>
+ - <b>Fixed Quality / Testing</b>
+
+Other examples of use cases for which the encoder could be configured but for
+which there is less by way of specific optimizations include:
+
+ - <b>Download and Play</b>
+ - <b>Disk Playback</b>>
+ - <b>Storage</b>
+ - <b>Editing</b>
+ - <b>Broadcast video</b>
+
+Specific use cases may have particular requirements or constraints. For
+example:
+
+<b>Video Conferencing:</b>  In a video conference we need to encode the video
+in real time and to avoid any coding tools that could increase latency, such
+as frame look ahead.
+
+<b>Live Streams:</b> In cases such as live streaming of games or events, it
+may be possible to allow some limited buffering of the video and use of
+lookahead coding tools to improve encoding quality. However,  whilst a lag of
+a second or two may be fine given the one way nature of this type of video,
+it is clearly not possible to use tools such as two pass coding.
+
+<b>Broadcast:</b> Broadcast video (e.g. digital TV over satellite) may have
+specific requirements such as frequent and regular key frames (e.g. once per
+second or more) as these are important as entry points to users when switching
+channels. There may also be  strict upper limits on bandwidth over a short
+window of time.
+
+<b>Download and Play:</b> Download and play applications may have less strict
+requirements in terms of local frame by frame rate control but there may be a
+requirement to accurately hit a file size target for the video clip as a
+whole. Similar considerations may apply to playback from mass storage devices
+such as DVD or disk drives.
+
+<b>Editing:</b> In certain special use cases such as offline editing, it may
+be desirable to have very high quality and data rate but also very frequent
+key frames or indeed to encode the video exclusively as key frames. Lossless
+video encoding may also be required in this use case.
+
+<b>VOD / Streaming:</b> One of the most important and common use cases for AV1
+is video on demand or streaming, for services such as YouTube and Netflix. In
+this use case it is possible to do two or even multi-pass encoding to improve
+compression efficiency. Streaming services will often store many encoded
+copies of a video at different resolutions and data rates to support users
+with different types of playback device and bandwidth limitations.
+Furthermore, these services support dynamic switching between multiple
+streams, so that they can respond to changing network conditions.
+
+Exact rate control when encoding for a specific format (e.g 360P or 1080P on
+YouTube) may not be critical, provided that the video bandwidth remains within
+allowed limits. Whilst a format may have a nominal target data rate, this can
+be considered more as the desired average egress rate over the video corpus
+rather than a strict requirement for any individual clip. Indeed, in order
+to maintain optimal quality of experience for the end user, it may be
+desirable to encode some easier videos or sections of video at a lower data
+rate and harder videos or sections at a higher rate.
+
+VOD / streaming does not usually require very frequent key frames (as in the
+broadcast case) but key frames are important in trick play (scanning back and
+forth to different points in a video) and for adaptive stream switching. As
+such, in a use case like YouTube, there is normally an upper limit on the
+maximum time between key frames of a few seconds, but within certain limits
+the encoder can try to align key frames with real scene cuts.
+
+Whilst encoder speed may not seem to be as critical in this use case, for
+services such as YouTube, where millions of new videos have to be encoded
+every day, encoder speed is still important, so libaom allows command line
+control of the encode speed vs quality trade off.
+
+<b>Fixed Quality / Testing Mode:</b> Libaom also has a fixed quality encoder
+pathway designed for testing under highly constrained conditions.
+
+\section architecture_enc_speed_quality Speed vs Quality Trade Off
+
+In any modern video encoder there are trade offs that can be made in regard to
+the amount of time spent encoding a video or video frame vs the quality of the
+final encode.
+
+These trade offs typically limit the scope of the search for an optimal
+prediction / transform combination with faster encode modes doing fewer
+partition, reference frame, prediction mode and transform searches at the cost
+of some reduction in coding efficiency.
+
+The pruning of the size of the search tree is typically based on assumptions
+about the likelihood of different search modes being selected based on what
+has gone before and features such as the dimensions of the video frames and
+the Q value selected for encoding the frame. For example certain intra modes
+are less likely to be chosen at high Q but may be more likely if similar
+modes were used for the previously coded blocks above and to the left of the
+current block.
+
+The speed settings depend both on the use case (e.g. Real Time encoding) and
+an explicit speed control passed in on the command line as <b>--cpu-used</b>
+and stored in the \ref AV1_COMP.speed field of the main compressor instance
+data structure (<b>cpi</b>).
+
+The control flags for the speed trade off are stored the \ref AV1_COMP.sf
+field of the compressor instancve and are set in the following functions:-
+
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+- \ref av1_set_speed_features_qindex_dependent()
+
+A second factor impacting the speed of encode is rate distortion optimisation
+(<b>rd vs non-rd</b> encoding).
+
+When rate distortion optimization is enabled each candidate combination of
+a prediction mode and transform coding strategy is fully encoded and the
+resulting error (or distortion) as compared to the original source and the
+number of bits used, are passed to a rate distortion function. This function
+converts the distortion and cost in bits to a single <b>RD</b> value (where
+lower is better). This <b>RD</b> value is used to decide between different
+encoding strategies for the current block where, for example, a one may
+result in a lower distortion but a larger number of bits.
+
+The calculation of this <b>RD</b> value is broadly speaking as follows:
+
+\f[
+  RD = (&lambda; * Rate) + Distortion
+\f]
+
+This assumes a linear relationship between the number of bits used and
+distortion (represented by the rate multiplier value <b>&lambda;</b>) which is
+not actually valid across a broad range of rate and distortion values.
+Typically, where distortion is high, expending a small number of extra bits
+will result in a large change in distortion. However, at lower values of
+distortion the cost in bits of each incremental improvement is large.
+
+To deal with this we scale the value of <b>&lambda;</b> based on the quantizer
+value chosen for the frame. This is assumed to be a proxy for our approximate
+position on the true rate distortion curve and it is further assumed that over
+a limited range of distortion values, a linear relationship between distortion
+and rate is a valid approximation.
+
+Doing a rate distortion test on each candidate prediction / transform
+combination is expensive in terms of cpu cycles. Hence, for cases where encode
+speed is critical, libaom implements a non-rd pathway where the <b>RD</b>
+value is estimated based on the prediction error and quantizer setting.
+
+\section architecture_enc_src_proc Source Frame Processing
+
+\subsection architecture_enc_frame_proc_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+    - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO)
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+    - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+    - \ref AlgoCfg.arnr_max_frames
+    - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+    - \ref KeyFrameCfg.enable_keyframe_filtering
+
+\subsection architecture_enc_frame_proc_ingest Frame Ingest / Coding Pipeline
+
+ To encode a frame, first call \ref av1_receive_raw_frame() to obtain the raw
+ frame data. Then call \ref av1_get_compressed_data() to encode raw frame data
+ into compressed frame data. The main body of \ref av1_get_compressed_data()
+ is \ref av1_encode_strategy(), which determines high-level encode strategy
+ (frame type, frame placement, etc.) and then encodes the frame by calling
+ \ref av1_encode(). In \ref av1_encode(), \ref av1_first_pass() will execute
+ the first_pass of two-pass encoding, while \ref encode_frame_to_data_rate()
+ will perform the final pass for either one-pass or two-pass encoding.
+
+ The main body of \ref encode_frame_to_data_rate() is
+ \ref encode_with_recode_loop_and_filter(), which handles encoding before
+ in-loop filters (with recode loops \ref encode_with_recode_loop(), or
+ without any recode loop \ref encode_without_recode()), followed by in-loop
+ filters (deblocking filters \ref loopfilter_frame(), CDEF filters and
+ restoration filters \ref cdef_restoration_frame()).
+
+ Except for rate/quality control, both \ref encode_with_recode_loop() and
+ \ref encode_without_recode() call \ref av1_encode_frame() to manage the
+ reference frame buffers and \ref encode_frame_internal() to perform the
+ rest of encoding that does not require access to external frames.
+ \ref encode_frame_internal() is the starting point for the partition search
+ (see \ref architecture_enc_partitions).
+
+\subsection architecture_enc_frame_proc_tf Temporal Filtering
+
+\subsubsection architecture_enc_frame_proc_tf_overview Overview
+
+Video codecs exploit the spatial and temporal correlations in video signals to
+achieve compression efficiency. The noise factor in the source signal
+attenuates such correlation and impedes the codec performance. Denoising the
+video signal is potentially a promising solution.
+
+One strategy for denoising a source is motion compensated temporal filtering.
+Unlike image denoising, where only the spatial information is available,
+video denoising can leverage a combination of the spatial and temporal
+information. Specifically, in the temporal domain, similar pixels can often be
+tracked along the motion trajectory of moving objects. Motion estimation is
+applied to neighboring frames to find similar patches or blocks of pixels that
+can be combined to create a temporally filtered output.
+
+AV1, in common with VP8 and VP9, uses an in-loop motion compensated temporal
+filter to generate what are referred to as alternate reference frames (or ARF
+frames). These can be encoded in the bitstream and stored as frame buffers for
+use in the prediction of subsequent frames, but are not usually directly
+displayed (hence they are sometimes referred to as non-display frames).
+
+The following command line parameters set the strength of the filter, the
+number of frames used and determine whether filtering is allowed for key
+frames.
+
+- <b>--arnr-strength</b> (\ref AlgoCfg.arnr_strength)
+- <b>--arnr-maxframes</b> (\ref AlgoCfg.arnr_max_frames)
+- <b>--enable-keyframe-filtering</b>
+  (\ref KeyFrameCfg.enable_keyframe_filtering)
+
+Note that in AV1, the temporal filtering scheme is designed around the
+hierarchical ARF based pyramid coding structure. We typically apply denoising
+only on key frame and ARF frames at the highest (and sometimes the second
+highest) layer in the hierarchical coding structure.
+
+\subsubsection architecture_enc_frame_proc_tf_algo Temporal Filtering Algorithm
+
+Our method divides the current frame into "MxM" blocks. For each block, a
+motion search is applied on frames before and after the current frame. Only
+the best matching patch with the smallest mean square error (MSE) is kept as a
+candidate patch for a neighbour frame. The current block is also a candidate
+patch. A total of N candidate patches are combined to generate the filtered
+output.
+
+Let f(i) represent the filtered sample value and \f$p_{j}(i)\f$ the sample
+value of the j-th patch. The filtering process is:
+
+\f[
+  f(i) = \frac{p_{0}(i) + \sum_{j=1}^{N} &omega;_{j}(i).p_{j}(i)}
+              {1 + \sum_{j=1}^{N} &omega;_{j}(i)}
+\f]
+
+where \f$ &omega;_{j}(i) \f$ is the weight of the j-th patch from a total of
+N patches. The weight is determined by the patch difference as:
+
+\f[
+  &omega;_{j}(i) = exp(-\frac{D_{j}(i)}{h^2})
+\f]
+
+where \f$ D_{j}(i) \f$ is the sum of squared difference between the current
+block and the j-th candidate patch:
+
+\f[
+  D_{j}(i) = \sum_{k\in&Omega;_{i}}||p_{0}(k) - p_{j}(k)||_{2}
+\f]
+
+where:
+- \f$p_{0}\f$ refers to the current frame.
+- \f$&Omega;_{i}\f$ is the patch window, an "LxL" pixel square.
+- h is a critical parameter that controls the decay of the weights measured by
+  the Euclidean distance. It is derived from an estimate of noise amplitude in
+  the source. This allows the filter coefficients to adapt for videos with
+  different noise characteristics.
+- Usually, M = 32, N = 7, and L = 5, but they can be adjusted.
+
+It is recommended that the reader refers to the code for more details.
+
+\subsubsection architecture_enc_frame_proc_tf_funcs Temporal Filter Functions
+
+The main entry point for temporal filtering is \ref av1_temporal_filter().
+This function returns 1 if temporal filtering is successful, otherwise 0.
+When temporal filtering is applied, the filtered frame will be held in
+the output_frame, which is the frame to be
+encoded in the following encoding process.
+
+Almost all temporal filter related code is in av1/encoder/temporal_filter.c
+and av1/encoder/temporal_filter.h.
+
+Inside \ref av1_temporal_filter(), the reader's attention is directed to
+\ref tf_setup_filtering_buffer() and \ref tf_do_filtering().
+
+- \ref tf_setup_filtering_buffer(): sets up the frame buffer for
+  temporal filtering, determines the number of frames to be used, and
+  calculates the noise level of each frame.
+
+- \ref tf_do_filtering(): the main function for the temporal
+  filtering algorithm. It breaks each frame into "MxM" blocks. For each
+  block a motion search \ref tf_motion_search() is applied to find
+  the motion vector from one neighboring frame. tf_build_predictor() is then
+  called to build the matching patch and \ref av1_apply_temporal_filter_c() (see
+  also optimised SIMD versions) to apply temporal filtering. The weighted
+  average over each pixel is accumulated and finally normalized in
+  \ref tf_normalize_filtered_frame() to generate the final filtered frame.
+
+- \ref av1_apply_temporal_filter_c(): the core function of our temporal
+  filtering algorithm (see also optimised SIMD versions).
+
+\subsection architecture_enc_frame_proc_film Film Grain Modelling
+
+ Add details here.
+
+\section architecture_enc_rate_ctrl Rate Control
+
+\subsection architecture_enc_rate_ctrl_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+ - \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+    - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+
+ - \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+    - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+
+ - \ref AV1EncoderConfig (Encoder configuration parameters)
+    - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+ - \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first
+   pass stats)
+
+ - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+    - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+\subsection architecture_enc_rate_ctrl_options Supported Rate Control Options
+
+Different use cases (\ref architecture_enc_use_cases) may have different
+requirements in terms of data rate control.
+
+The broad rate control strategy is selected using the <b>--end-usage</b>
+parameter on the command line, which maps onto the field
+\ref aom_codec_enc_cfg_t.rc_end_usage in \ref aom_encoder.h.
+
+The four supported options are:-
+
+- <b>VBR</b> (Variable Bitrate)
+- <b>CBR</b> (Constant Bitrate)
+- <b>CQ</b> (Constrained Quality mode ; A constrained variant of VBR)
+- <b>Fixed Q</b> (Constant quality of Q mode)
+
+The value of \ref aom_codec_enc_cfg_t.rc_end_usage is in turn copied over
+into the encoder rate control configuration data structure as
+\ref RateControlCfg.mode.
+
+In regards to the most important use cases above, Video on demand uses either
+VBR or CQ mode. CBR is the preferred rate control model for RTC and Live
+streaming and Fixed Q is only used in testing.
+
+The behaviour of each of these modes is regulated by a series of secondary
+command line rate control options but also depends somewhat on the selected
+use case, whether 2-pass coding is enabled and the selected encode speed vs
+quality trade offs (\ref AV1_COMP.speed and \ref AV1_COMP.sf).
+
+The list below gives the names of the main rate control command line
+options together with the names of the corresponding fields in the rate
+control configuration data structures.
+
+- <b>--target-bitrate</b> (\ref RateControlCfg.target_bandwidth)
+- <b>--min-q</b> (\ref RateControlCfg.best_allowed_q)
+- <b>--max-q</b> (\ref RateControlCfg.worst_allowed_q)
+- <b>--cq-level</b> (\ref RateControlCfg.cq_level)
+- <b>--undershoot-pct</b> (\ref RateControlCfg.under_shoot_pct)
+- <b>--overshoot-pct</b> (\ref RateControlCfg.over_shoot_pct)
+
+The following control aspects of vbr encoding
+
+- <b>--bias-pct</b> (\ref RateControlCfg.vbrbias)
+- <b>--minsection-pct</b> ((\ref RateControlCfg.vbrmin_section)
+- <b>--maxsection-pct</b> ((\ref RateControlCfg.vbrmax_section)
+
+The following relate to buffer and delay management in one pass low delay and
+real time coding
+
+- <b>--buf-sz</b> (\ref RateControlCfg.maximum_buffer_size_ms)
+- <b>--buf-initial-sz</b> (\ref RateControlCfg.starting_buffer_level_ms)
+- <b>--buf-optimal-sz</b> (\ref RateControlCfg.optimal_buffer_level_ms)
+
+\subsection architecture_enc_vbr Variable Bitrate (VBR) Encoding
+
+For streamed VOD content the most common rate control strategy is Variable
+Bitrate (VBR) encoding. The CQ mode mentioned above is a variant of this
+where additional quantizer and quality constraints are applied.  VBR
+encoding may in theory be used in conjunction with either 1-pass or 2-pass
+encoding.
+
+VBR encoding varies the number of bits given to each frame or group of frames
+according to the difficulty of that frame or group of frames, such that easier
+frames are allocated fewer bits and harder frames are allocated more bits. The
+intent here is to even out the quality between frames. This contrasts with
+Constant Bitrate (CBR) encoding where each frame is allocated the same number
+of bits.
+
+Whilst for any given frame or group of frames the data rate may vary, the VBR
+algorithm attempts to deliver a given average bitrate over a wider time
+interval. In standard VBR encoding, the time interval over which the data rate
+is averaged is usually the duration of the video clip.  An alternative
+approach is to target an average VBR bitrate over the entire video corpus for
+a particular video format (corpus VBR).
+
+\subsubsection architecture_enc_1pass_vbr 1 Pass VBR Encoding
+
+The command line for libaom does allow 1 Pass VBR, but this has not been
+properly optimised and behaves much like 1 pass CBR in most regards, with bits
+allocated to frames by the following functions:
+
+- \ref av1_calc_iframe_target_size_one_pass_vbr()
+- \ref av1_calc_pframe_target_size_one_pass_vbr()
+
+\subsubsection architecture_enc_2pass_vbr 2 Pass VBR Encoding
+
+The main focus here will be on 2-pass VBR encoding (and the related CQ mode)
+as these are the modes most commonly used for VOD content.
+
+2-pass encoding is selected on the command line by setting --passes=2
+(or -p 2).
+
+Generally speaking, in 2-pass encoding, an encoder will first encode a video
+using a default set of parameters and assumptions. Depending on the outcome
+of that first encode, the baseline assumptions and parameters will be adjusted
+to optimize the output during the second pass.  In essence the first pass is a
+fact finding mission to establish the complexity and variability of the video,
+in order to allow a better allocation of bits in the second pass.
+
+The libaom 2-pass algorithm is unusual in that the first pass is not a full
+encode of the video. Rather it uses a limited set of prediction and transform
+options and a fixed quantizer,  to generate statistics about each frame. No
+output bitstream is created and the per frame first pass statistics are stored
+entirely in volatile memory. This has some disadvantages when compared to a
+full first pass encode, but avoids the need for file I/O and improves speed.
+
+For two pass encoding, the function \ref av1_encode() will first be called
+for each frame in the video with the value \ref AV1EncoderConfig.pass = 1.
+This will result in calls to \ref av1_first_pass().
+
+Statistics for each frame are stored in \ref FIRSTPASS_STATS frame_stats_buf.
+
+After completion of the first pass, \ref av1_encode() will be called again for
+each frame with \ref AV1EncoderConfig.pass = 2.  The frames are then encoded in
+accordance with the statistics gathered during the first pass by calls to
+\ref encode_frame_to_data_rate() which in turn calls
+ \ref av1_get_second_pass_params().
+
+In summary the second pass code :-
+
+- Searches for scene cuts (if auto key frame detection is enabled).
+- Defines the length of and hierarchical structure to be used in each
+  ARF/GF group.
+- Allocates bits based on the relative complexity of each frame, the quality
+  of frame to frame prediction and the type of frame (e.g. key frame, ARF
+  frame, golden frame or normal leaf frame).
+- Suggests a maximum Q (quantizer value) for each ARF/GF group, based on
+  estimated complexity and recent rate control compliance
+  (\ref RATE_CONTROL.active_worst_quality)
+- Tracks adherence to the overall rate control objectives and adjusts
+  heuristics.
+
+The main two pass functions in regard to the above include:-
+
+- \ref find_next_key_frame()
+- \ref define_gf_group()
+- \ref calculate_total_gf_group_bits()
+- \ref get_twopass_worst_quality()
+- \ref av1_gop_setup_structure()
+- \ref av1_gop_bit_allocation()
+- \ref av1_twopass_postencode_update()
+
+For each frame, the two pass algorithm defines a target number of bits
+\ref RATE_CONTROL.base_frame_target,  which is then adjusted if necessary to
+reflect any undershoot or overshoot on previous frames to give
+\ref RATE_CONTROL.this_frame_target.
+
+As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also
+maintains a record of the actual Q value used to encode previous frames
+at each level in the current pyramid hierarchy
+(\ref PRIMARY_RATE_CONTROL.active_best_quality). The function
+\ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range
+for each frame.
+
+\subsubsection architecture_enc_1pass_lagged 1 Pass Lagged VBR Encoding
+
+1 pass lagged encode falls between simple 1 pass encoding and full two pass
+encoding and is used for cases where it is not possible to do a full first
+pass through the entire video clip, but where some delay is permissible. For
+example near live streaming where there is a delay of up to a few seconds. In
+this case the first pass and second pass are in effect combined such that the
+first pass starts encoding the clip and the second pass lags behind it by a
+few frames.  When using this method, full sequence level statistics are not
+available, but it is possible to collect and use frame or group of frame level
+data to help in the allocation of bits and in defining ARF/GF coding
+hierarchies.  The reader is referred to the \ref AV1_PRIMARY.lap_enabled field
+in the main compressor instance (where <b>lap</b> stands for
+<b>look ahead processing</b>). This encoding mode for the most part uses the
+same rate control pathways as two pass VBR encoding.
+
+\subsection architecture_enc_rc_loop The Main Rate Control Loop
+
+Having established a target rate for a given frame and an allowed range of Q
+values, the encoder then tries to encode the frame at a rate that is as close
+as possible to the target value, given the Q range constraints.
+
+There are two main mechanisms by which this is achieved.
+
+The first selects a frame level Q, using an adaptive estimate of the number of
+bits that will be generated when the frame is encoded at any given Q.
+Fundamentally this mechanism is common to VBR, CBR and to use cases such as
+RTC with small adjustments.
+
+As the Q value mainly adjusts the precision of the residual signal, it is not
+actually a reliable basis for accurately predicting the number of bits that
+will be generated across all clips. A well predicted clip, for example, may
+have a much smaller error residual after prediction.  The algorithm copes with
+this by adapting its predictions on the fly using a feedback loop based on how
+well it did the previous time around.
+
+The main functions responsible for the prediction of Q and the adaptation over
+time, for the two pass encoding pipeline are:
+
+- \ref rc_pick_q_and_bounds()
+    - \ref get_q()
+        - \ref av1_rc_regulate_q()
+        - \ref get_rate_correction_factor()
+        - \ref set_rate_correction_factor()
+        - \ref find_closest_qindex_by_rate()
+- \ref av1_twopass_postencode_update()
+    - \ref av1_rc_update_rate_correction_factors()
+
+A second mechanism for control comes into play if there is a large rate miss
+for the current frame (much too big or too small). This is a recode mechanism
+which allows the current frame to be re-encoded one or more times with a
+revised Q value. This obviously has significant implications for encode speed
+and in the case of RTC latency (hence it is not used for the RTC pathway).
+
+Whether or not a recode is allowed for a given frame depends on the selected
+encode speed vs quality trade off. This is set on the command line using the
+--cpu-used parameter which maps onto the \ref AV1_COMP.speed field in the main
+compressor instance data structure.
+
+The value of \ref AV1_COMP.speed, combined with the use case, is used to
+populate the speed features data structure AV1_COMP.sf. In particular
+\ref HIGH_LEVEL_SPEED_FEATURES.recode_loop determines the types of frames that
+may be recoded and \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance is a rate
+error trigger threshold.
+
+For more information the reader is directed to the following functions:
+
+- \ref encode_with_recode_loop()
+- \ref encode_without_recode()
+- \ref recode_loop_update_q()
+- \ref recode_loop_test()
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+
+\subsection architecture_enc_fixed_q Fixed Q Mode
+
+There are two main fixed Q cases:
+-# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level
+   in a given video, but these offsets are adaptive based on video content.
+-# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for
+   each pyramid level.
+
+The reader is also refered to the following functions:
+- \ref av1_rc_pick_q_and_bounds()
+- \ref rc_pick_q_and_bounds_no_stats_cbr()
+- \ref rc_pick_q_and_bounds_no_stats()
+- \ref rc_pick_q_and_bounds()
+
+\section architecture_enc_frame_groups GF/ ARF Frame Groups & Hierarchical Coding
+
+\subsection architecture_enc_frame_groups_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+    - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+
+- \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass
+stats)
+
+\subsection architecture_enc_frame_groups_groups Frame Groups
+
+To process a sequence/stream of video frames, the encoder divides the frames
+into groups and encodes them sequentially (possibly dependent on previous
+groups). In AV1 such a group is usually referred to as a golden frame group
+(GF group) or sometimes an Alt-Ref (ARF) group or a group of pictures (GOP).
+A GF group determines and stores the coding structure of the frames (for
+example, frame type, usage of the hierarchical structure, usage of overlay
+frames, etc.) and can be considered as the base unit to process the frames,
+therefore playing an important role in the encoder.
+
+The length of a specific GF group is arguably the most important aspect when
+determining a GF group. This is because most GF group level decisions are
+based on the frame characteristics, if not on the length itself directly.
+Note that the GF group is always a group of consecutive frames, which means
+the start and end of the group (so again, the length of it) determines which
+frames are included in it and hence determines the characteristics of the GF
+group. Therefore, in this document we will first discuss the GF group length
+decision in Libaom, followed by frame structure decisions when defining a GF
+group with a certain length.
+
+\subsection architecture_enc_gf_length GF / ARF Group Length Determination
+
+The basic intuition of determining the GF group length is that it is usually
+desirable to group together frames that are similar. Hence, we may choose
+longer groups when consecutive frames are very alike and shorter ones when
+they are very different.
+
+The determination of the GF group length is done in function \ref
+calculate_gf_length(). The following encoder use cases are supported:
+
+<ul>
+  <li><b>Single pass with look-ahead disabled(\ref has_no_stats_stage()):
+  </b> in this case there is no information available on the following stream
+  of frames, therefore the function will set the GF group length for the
+  current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS
+  groups) to be the maximum value allowed.</li>
+
+  <li><b>Single pass with look-ahead enabled (\ref AV1_PRIMARY.lap_enabled):</b>
+  look-ahead processing is enabled for single pass, therefore there is a
+  limited amount of information available regarding future frames. In this
+  case the function will determine the length based on \ref FIRSTPASS_STATS
+  (which is generated when processing the look-ahead buffer) for only the
+  current GF group.</li>
+
+  <li><b>Two pass:</b> the first pass in two-pass encoding collects the stats
+  and will not call the function. In the second pass, the function tries to
+  determine the GF group length of the current and the following GF groups (a
+  total number of MAX_NUM_GF_INTERVALS groups) based on the first-pass
+  statistics. Note that as we will be discussing later, such decisions may not
+  be accurate and can be changed later.</li>
+</ul>
+
+Except for the first trivial case where there is no prior knowledge of the
+following frames, the function \ref calculate_gf_length() tries to determine the
+GF group length based on the first pass statistics. The determination is divided
+into two parts:
+
+<ol>
+   <li>Baseline decision based on accumulated statistics: this part of the function
+   iterates through the firstpass statistics of the following frames and
+   accumulates the statistics with function accumulate_next_frame_stats.
+   The accumulated statistics are then used to determine whether the
+   correlation in the GF group has dropped too much in function detect_gf_cut.
+   If detect_gf_cut returns non-zero, or if we've reached the end of
+   first-pass statistics, the baseline decision is set at the current point.</li>
+
+   <li>If we are not at the end of the first-pass statistics, the next part will
+   try to refine the baseline decision. This algorithm is based on the analysis
+   of firstpass stats. It tries to cut the groups in stable regions or
+   relatively stable points. Also it tries to avoid cutting in a blending
+   region.</li>
+</ol>
+
+As mentioned, for two-pass encoding, the function \ref
+calculate_gf_length() tries to determine the length of as many as
+MAX_NUM_GF_INTERVALS groups. The decisions are stored in
+\ref PRIMARY_RATE_CONTROL.gf_intervals[]. The variables
+\ref RATE_CONTROL.intervals_till_gf_calculate_due and
+\ref PRIMARY_RATE_CONTROL.gf_intervals[] help with managing and updating the stored
+decisions. In the function \ref define_gf_group(), the corresponding
+stored length decision will be used to define the current GF group.
+
+When the maximum GF group length is larger or equal to 32, the encoder will
+enforce an extra layer to determine whether to use maximum GF length of 32
+or 16 for every GF group. In such a case, \ref calculate_gf_length() is
+first called with the original maximum length (>=32). Afterwards,
+\ref av1_tpl_setup_stats() is called to analyze the determined GF group
+and compare the reference to the last frame and the middle frame. If it is
+decided that we should use a maximum GF length of 16, the function
+\ref calculate_gf_length() is called again with the updated maximum
+length, and it only sets the length for a single GF group
+(\ref RATE_CONTROL.intervals_till_gf_calculate_due is set to 1). This process
+is shown below.
+
+\image html tplgfgroupdiagram.png "" width=40%
+
+Before encoding each frame, the encoder checks
+\ref RATE_CONTROL.frames_till_gf_update_due. If it is zero, indicating
+processing of the current GF group is done, the encoder will check whether
+\ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as
+discussed above, \ref calculate_gf_length() is called with original
+maximum length. If it is not zero, then the GF group length value stored
+in \ref PRIMARY_RATE_CONTROL.gf_intervals[\ref PRIMARY_RATE_CONTROL.cur_gf_index] is used
+(subject to change as discussed above).
+
+\subsection architecture_enc_gf_structure Defining a GF Group's Structure
+
+The function \ref define_gf_group() defines the frame structure as well
+as other GF group level parameters (e.g. bit allocation) once the length of
+the current GF group is determined.
+
+The function first iterates through the first pass statistics in the GF group to
+accumulate various stats, using accumulate_this_frame_stats() and
+accumulate_next_frame_stats(). The accumulated statistics are then used to
+determine the use of the use of ALTREF frame along with other properties of the
+GF group. The values of \ref PRIMARY_RATE_CONTROL.cur_gf_index, \ref
+RATE_CONTROL.intervals_till_gf_calculate_due and \ref
+RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.
+
+The function \ref av1_gop_setup_structure() is called at the end to determine
+the frame layers and reference maps in the GF group, where the
+construct_multi_layer_gf_structure() function sets the frame update types for
+each frame and the group structure.
+
+- If ALTREF frames are allowed for the GF group: the first frame is set to
+  KF_UPDATE, GF_UPDATE or ARF_UPDATE. The last frames of the GF group is set to
+  OVERLAY_UPDATE.  Then in set_multi_layer_params(), frame update
+  types are determined recursively in a binary tree fashion, and assigned to
+  give the final IBBB structure for the group.  - If the current branch has more
+  than 2 frames and we have not reached maximum layer depth, then the middle
+  frame is set as INTNL_ARF_UPDATE, and the left and right branches are
+  processed recursively.  - If the current branch has less than 3 frames, or we
+  have reached maximum layer depth, then every frame in the branch is set to
+  LF_UPDATE.
+
+- If ALTREF frame is not allowed for the GF group: the frames are set
+  as LF_UPDATE. This basically forms an IPPP GF group structure.
+
+As mentioned, the encoder may use Temporal dependancy modelling (TPL - see \ref
+architecture_enc_tpl) to determine whether we should use a maximum length of 32
+or 16 for the current GF group. This requires calls to \ref define_gf_group()
+but should not change other settings (since it is in essence a trial). This
+special case is indicated by the setting parameter <b>is_final_pass</b> for to
+zero.
+
+For single pass encodes where look-ahead processing is disabled
+(\ref AV1_PRIMARY.lap_enabled = 0), \ref define_gf_group_pass0() is used
+instead of \ref define_gf_group().
+
+\subsection architecture_enc_kf_groups Key Frame Groups
+
+A special constraint for GF group length is the location of the next keyframe
+(KF). The frames between two KFs are referred to as a KF group. Each KF group
+can be encoded and decoded independently. Because of this, a GF group cannot
+span beyond a KF and the location of the next KF is set as a hard boundary
+for GF group length.
+
+<ul>
+   <li>For two-pass encoding \ref RATE_CONTROL.frames_to_key controls when to
+   encode a key frame. When it is zero, the current frame is a keyframe and
+   the function \ref find_next_key_frame() is called. This in turn calls
+   \ref define_kf_interval() to work out where the next key frame should
+   be placed.</li>
+
+   <li>For single-pass with look-ahead enabled, \ref define_kf_interval()
+   is called whenever a GF group update is needed (when
+   \ref RATE_CONTROL.frames_till_gf_update_due is zero). This is because
+   generally KFs are more widely spaced and the look-ahead buffer is usually
+   not long enough.</li>
+
+   <li>For single-pass with look-ahead disabled, the KFs are placed according
+   to the command line parameter <b>--kf-max-dist</b> (The above two cases are
+   also subject to this constraint).</li>
+</ul>
+
+The function \ref define_kf_interval() tries to detect a scenecut.
+If a scenecut within kf-max-dist is detected, then it is set as the next
+keyframe. Otherwise the given maximum value is used.
+
+\section architecture_enc_tpl Temporal Dependency Modelling
+
+The temporal dependency model runs at the beginning of each GOP. It builds the
+motion trajectory within the GOP in units of 16x16 blocks. The temporal
+dependency of a 16x16 block is evaluated as the predictive coding gains it
+contributes to its trailing motion trajectory. This temporal dependency model
+reflects how important a coding block is for the coding efficiency of the
+overall GOP. It is hence used to scale the Lagrangian multiplier used in the
+rate-distortion optimization framework.
+
+\subsection architecture_enc_tpl_config Configurations
+
+The temporal dependency model and its applications are by default turned on in
+libaom encoder for the VoD use case. To disable it, use --tpl-model=0 in the
+aomenc configuration.
+
+\subsection architecture_enc_tpl_algoritms Algorithms
+
+The scheme works in the reverse frame processing order over the source frames,
+propagating information from future frames back to the current frame. For each
+frame, a propagation step is run for each MB. it operates as follows:
+
+<ul>
+   <li> Estimate the intra prediction cost in terms of sum of absolute Hadamard
+   transform difference (SATD) noted as intra_cost. It also loads the motion
+   information available from the first-pass encode and estimates the inter
+   prediction cost as inter_cost. Due to the use of hybrid inter/intra
+   prediction mode, the inter_cost value is further upper bounded by
+   intra_cost. A propagation cost variable is used to collect all the
+   information flowed back from future processing frames. It is initialized as
+   0 for all the blocks in the last processing frame in a group of pictures
+   (GOP).</li>
+
+   <li> The fraction of information from a current block to be propagated towards
+   its reference block is estimated as:
+\f[
+   propagation\_fraction = (1 - inter\_cost/intra\_cost)
+\f]
+   It reflects how much the motion compensated reference would reduce the
+   prediction error in percentage.</li>
+
+   <li> The total amount of information the current block contributes to the GOP
+   is estimated as intra_cost + propagation_cost. The information that it
+   propagates towards its reference block is captured by:
+
+\f[
+   propagation\_amount =
+   (intra\_cost + propagation\_cost) * propagation\_fraction
+\f]</li>
+
+   <li> Note that the reference block may not necessarily sit on the grid of
+   16x16 blocks. The propagation amount is hence dispensed to all the blocks
+   that overlap with the reference block. The corresponding block in the
+   reference frame accumulates its own propagation cost as it receives back
+   propagation.
+
+\f[
+   propagation\_cost = propagation\_cost +
+                       (\frac{overlap\_area}{(16*16)} * propagation\_amount)
+\f]</li>
+
+   <li> In the final encoding stage, the distortion propagation factor of a block
+   is evaluated as \f$(1 + \frac{propagation\_cost}{intra\_cost})\f$, where the second term
+   captures its impact on later frames in a GOP.</li>
+
+   <li> The Lagrangian multiplier is adapted at the 64x64 block level. For every
+   64x64 block in a frame, we have a distortion propagation factor:
+
+\f[
+  dist\_prop[i] = 1 + \frac{propagation\_cost[i]}{intra\_cost[i]}
+\f]
+
+   where i denotes the block index in the frame. We also have the frame level
+   distortion propagation factor:
+
+\f[
+  dist\_prop = 1 +
+  \frac{\sum_{i}propagation\_cost[i]}{\sum_{i}intra\_cost[i]}
+\f]
+
+   which is used to normalize the propagation factor at the 64x64 block level. The
+   Lagrangian multiplier is hence adapted as:
+
+\f[
+  &lambda;[i] = &lambda;[0] * \frac{dist\_prop}{dist\_prop[i]}
+\f]
+
+   where &lambda;0 is the multiplier associated with the frame level QP. The
+   64x64 block level QP is scaled according to the Lagrangian multiplier.
+</ul>
+
+\subsection architecture_enc_tpl_keyfun Key Functions and data structures
+
+The reader is also refered to the following functions and data structures:
+
+- \ref TplParams
+- \ref av1_tpl_setup_stats() builds the TPL model.
+- \ref setup_delta_q() Assign different quantization parameters to each super
+  block based on its TPL weight.
+
+\section architecture_enc_partitions Block Partition Search
+
+ A frame is first split into tiles in \ref encode_tiles(), with each tile
+ compressed by av1_encode_tile(). Then a tile is processed in superblock rows
+ via \ref av1_encode_sb_row() and then \ref encode_sb_row().
+
+ The partition search processes superblocks sequentially in \ref
+ encode_sb_row(). Two search modes are supported, depending upon the encoding
+ configuration, \ref encode_nonrd_sb() is for 1-pass and real-time modes,
+ while \ref encode_rd_sb() performs more exhaustive rate distortion based
+ searches.
+
+ Partition search over the recursive quad-tree space is implemented by
+ recursive calls to \ref av1_nonrd_use_partition(),
+ \ref av1_rd_use_partition(), or av1_rd_pick_partition() and returning best
+ options for sub-trees to their parent partitions.
+
+ In libaom, the partition search lays on top of the mode search (predictor,
+ transform, etc.), instead of being a separate module. The interface of mode
+ search is \ref pick_sb_modes(), which connects the partition_search with
+ \ref architecture_enc_inter_modes and \ref architecture_enc_intra_modes. To
+ make good decisions, reconstruction is also required in order to build
+ references and contexts. This is implemented by \ref encode_sb() at the
+ sub-tree level and \ref encode_b() at coding block level.
+
+ See also \ref partition_search
+
+\section architecture_enc_intra_modes Intra Mode Search
+
+AV1 also provides 71 different intra prediction modes, i.e. modes that predict
+only based upon information in the current frame with no dependency on
+previous or future frames. For key frames, where this independence from any
+other frame is a defining requirement and for other cases where intra only
+frames are required, the encoder need only considers these modes in the rate
+distortion loop.
+
+Even so, in most use cases, searching all possible intra prediction modes for
+every block and partition size is not practical and some pruning of the search
+tree is necessary.
+
+For the Rate distortion optimized case, the main top level function
+responsible for selecting the intra prediction mode for a given block is
+\ref av1_rd_pick_intra_mode_sb(). The readers attention is also drawn to the
+functions \ref hybrid_intra_mode_search() and \ref av1_nonrd_pick_intra_mode()
+which may be used where encode speed is critical. The choice between the
+rd path and the non rd or hybrid paths depends on the encoder use case and the
+\ref AV1_COMP.speed parameter. Further fine control of the speed vs quality
+trade off is provided by means of fields in \ref AV1_COMP.sf (which has type
+\ref SPEED_FEATURES).
+
+Note that some intra modes are only considered for specific use cases or
+types of video. For example the palette based prediction modes are often
+valueable for graphics or screen share content but not for natural video.
+(See \ref av1_search_palette_mode())
+
+See also \ref intra_mode_search for more details.
+
+\section architecture_enc_inter_modes Inter Prediction Mode Search
+
+For inter frames, where we also allow prediction using one or more previously
+coded frames (which may chronologically speaking be past or future frames or
+non-display reference buffers such as ARF frames), the size of the search tree
+that needs to be traversed, to select a prediction mode, is considerably more
+massive.
+
+In addition to the 71 possible intra modes we also need to consider 56 single
+frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC
+(overlapped block motion compensation)), 12768 compound inter prediction modes
+(these are modes that combine inter predictors from two reference frames) and
+36708 compound inter / intra prediction modes.
+
+As with the intra mode search, libaom supports an RD based pathway and a non
+rd pathway for speed critical use cases.  The entry points for these two cases
+are \ref av1_rd_pick_inter_mode() and \ref av1_nonrd_pick_inter_mode_sb()
+respectively.
+
+Various heuristics and predictive strategies are used to prune the search tree
+with fine control provided through the speed features parameter in the main
+compressor instance data structure \ref AV1_COMP.sf.
+
+It is worth noting, that some prediction modes incurr a much larger rate cost
+than others (ignoring for now the cost of coding the error residual). For
+example, a compound mode that requires the encoder to specify two reference
+frames and two new motion vectors will almost inevitable have a higher rate
+cost than a simple inter prediction mode that uses a predicted or 0,0 motion
+vector. As such, if we have already found a mode for the current block that
+has a low RD cost, we can skip a large number of the possible modes on the
+basis that even if the error residual is 0 the inherent rate cost of the
+mode itself will garauntee that it is not chosen.
+
+See also \ref inter_mode_search for more details.
+
+\section architecture_enc_tx_search Transform Search
+
+AV1 implements the transform stage using 4 seperable 1-d transforms (DCT,
+ADST, FLIPADST and IDTX, where FLIPADST is the reversed version of ADST
+and IDTX is the identity transform) which can be combined to give 16 2-d
+combinations.
+
+These combinations can be applied at 19 different scales from 64x64 pixels
+down to 4x4 pixels.
+
+This gives rise to a large number of possible candidate transform options
+for coding the residual error after prediction. An exhaustive rate-distortion
+based evaluation of all candidates would not be practical from a speed
+perspective in a production encoder implementation. Hence libaom addopts a
+number of strategies to prune the selection of both the transform size and
+transform type.
+
+There are a number of strategies that have been tested and implememnted in
+libaom including:
+
+- A statistics based approach that looks at the frequency with which certain
+  combinations are used in a given context and prunes out very unlikely
+  candidates. It is worth noting here that some size candidates can be pruned
+  out immediately based on the size of the prediction partition. For example it
+  does not make sense to use a transform size that is larger than the
+  prediction partition size but also a very large prediction partition size is
+  unlikely to be optimally pared with small transforms.
+
+- A Machine learning based model
+
+- A method that initially tests candidates using a fast algorithm that skips
+  entropy encoding and uses an estimated cost model to choose a reduced subset
+  for full RD analysis. This subject is covered more fully in a paper authored
+  by Bohan Li, Jingning Han, and Yaowu Xu titled: <b>Fast Transform Type
+  Selection Using Conditional Laplace Distribution Based Rate Estimation</b>
+
+<b>TODO Add link to paper when available</b>
+
+See also \ref transform_search for more details.
+
+\section architecture_post_enc_filt Post Encode Loop Filtering
+
+AV1 supports three types of post encode <b>in loop</b> filtering to improve
+the quality of the reconstructed video.
+
+- <b>Deblocking Filter</b> The first of these is a farily traditional boundary
+  deblocking filter that attempts to smooth discontinuities that may occur at
+  the boundaries between blocks. See also \ref in_loop_filter.
+
+- <b>CDEF Filter</b> The constrained directional enhancement filter (CDEF)
+  allows the codec to apply a non-linear deringing filter along certain
+  (potentially oblique) directions. A primary filter is applied along the
+  selected direction, whilst a secondary filter is applied at 45 degrees to
+  the primary direction. (See also \ref in_loop_cdef and
+  <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+- <b>Loop Restoration Filter</b> The loop restoration filter is applied after
+  any prior post filtering stages. It acts on units of either 64 x 64,
+  128 x 128, or 256 x 256 pixel blocks, refered to as loop restoration units.
+  Each unit can independently select either to bypass filtering, use a Wiener
+  filter, or use a self-guided filter. (See also \ref in_loop_restoration and
+  <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+\section architecture_entropy Entropy Coding
+
+\subsection architecture_entropy_aritmetic Arithmetic Coder
+
+VP9, used a binary arithmetic coder to encode symbols, where the propability
+of a 1 or 0 at each descision node was based on a context model that took
+into account recently coded values (for example previously coded coefficients
+in the current block). A mechanism existed to update the context model each
+frame, either explicitly in the bitstream, or implicitly at both the encoder
+and decoder based on the observed frequency of different outcomes in the
+previous frame. VP9 also supported seperate context models for different types
+of frame (e.g. inter coded frames and key frames).
+
+In contrast, AV1 uses an M-ary symbol arithmetic coder to compress the syntax
+elements, where integer \f$M\in[2, 14]\f$. This approach is based upon the entropy
+coding strategy used in the Daala video codec and allows for some bit-level
+parallelism in its implementation. AV1 also has an extended context model and
+allows for updates to the probabilities on a per symbol basis as opposed to
+the per frame strategy in VP9.
+
+To improve the performance / throughput of the arithmetic encoder, especially
+in hardware implementations, the probability model is updated and maintained
+at 15-bit precision, but the arithmetic encoder only uses the most significant
+9 bits when encoding a symbol. A more detailed discussion of the algorithm
+and design constraints can be found in
+<a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+TODO add references to key functions / files.
+
+As with VP9, a mechanism exists in AV1 to encode some elements into the
+bitstream as uncrompresed bits or literal values, without using the arithmetic
+coder. For example, some frame and sequence header values, where it is
+beneficial to be able to read the values directly.
+
+TODO add references to key functions / files.
+
+\subsection architecture_entropy_coef Transform Coefficient Coding and Optimization
+\image html coeff_coding.png "" width=70%
+
+\subsubsection architecture_entropy_coef_what Transform coefficient coding
+Transform coefficient coding is where the encoder compresses a quantized version
+of prediction residue into the bitstream.
+
+\paragraph architecture_entropy_coef_prepare Preparation - transform and quantize
+Before the entropy coding stage, the encoder decouple the pixel-to-pixel
+correlation of the prediction residue by transforming the residue from the
+spatial domain to the frequency domain. Then the encoder quantizes the transform
+coefficients to make the coefficients ready for entropy coding.
+
+\paragraph architecture_entropy_coef_coding The coding process
+The encoder uses \ref av1_write_coeffs_txb() to write the coefficients of
+a transform block into the bitstream.
+The coding process has three stages.
+1. The encoder will code transform block skip flag (txb_skip). If the skip flag is
+off, then the encoder will code the end of block position (eob) which is the scan
+index of the last non-zero coefficient plus one.
+2. Second, the encoder will code lower magnitude levels of each coefficient in
+reverse scan order.
+3. Finally, the encoder will code the sign and higher magnitude levels for each
+coefficient if they are available.
+
+Related functions:
+- \ref av1_write_coeffs_txb()
+- write_inter_txb_coeff()
+- \ref av1_write_intra_coeffs_mb()
+
+\paragraph architecture_entropy_coef_context Context information
+To improve the compression efficiency, the encoder uses several context models
+tailored for transform coefficients to capture the correlations between coding
+symbols. Most of the context models are built to capture the correlations
+between the coefficients within the same transform block. However, transform
+block skip flag (txb_skip) and the sign of dc coefficient (dc_sign) require
+context info from neighboring transform blocks.
+
+Here is how context info spread between transform blocks. Before coding a
+transform block, the encoder will use get_txb_ctx() to collect the context
+information from neighboring transform blocks. Then the context information
+will be used for coding transform block skip flag (txb_skip) and the sign of
+dc coefficient (dc_sign). After the transform block is coded, the encoder will
+extract the context info from the current block using
+\ref av1_get_txb_entropy_context(). Then encoder will store the context info
+into a byte (uint8_t) using av1_set_entropy_contexts(). The encoder will use
+the context info to code other transform blocks.
+
+Related functions:
+- \ref av1_get_txb_entropy_context()
+- av1_set_entropy_contexts()
+- get_txb_ctx()
+- \ref av1_update_intra_mb_txb_context()
+
+\subsubsection architecture_entropy_coef_rd RD optimization
+Beside the actual entropy coding, the encoder uses several utility functions
+to make optimal RD decisions.
+
+\paragraph architecture_entropy_coef_cost Entropy cost
+The encoder uses \ref av1_cost_coeffs_txb() or \ref av1_cost_coeffs_txb_laplacian()
+to estimate the entropy cost of a transform block. Note that
+\ref av1_cost_coeffs_txb() is slower but accurate whereas
+\ref av1_cost_coeffs_txb_laplacian() is faster but less accurate.
+
+Related functions:
+- \ref av1_cost_coeffs_txb()
+- \ref av1_cost_coeffs_txb_laplacian()
+- \ref av1_cost_coeffs_txb_estimate()
+
+\paragraph architecture_entropy_coef_opt Quantized level optimization
+Beside computing entropy cost, the encoder also uses \ref av1_optimize_txb()
+to adjust the coefficient’s quantized levels to achieve optimal RD trade-off.
+In \ref av1_optimize_txb(), the encoder goes through each quantized
+coefficient and lowers the quantized coefficient level by one if the action
+yields a better RD score.
+
+Related functions:
+- \ref av1_optimize_txb()
+
+All the related functions are listed in \ref coefficient_coding.
+
+*/
+
+/*!\defgroup encoder_algo Encoder Algorithm
+ *
+ * The encoder algorithm describes how a sequence is encoded, including high
+ * level decision as well as algorithm used at every encoding stage.
+ */
+
+/*!\defgroup high_level_algo High-level Algorithm
+ * \ingroup encoder_algo
+ * This module describes sequence level/frame level algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+
+/*!\defgroup speed_features Speed vs Quality Trade Off
+ * \ingroup high_level_algo
+ * This module describes the encode speed vs quality tradeoff
+ * @{
+ */
+/*! @} - end defgroup speed_features */
+
+/*!\defgroup src_frame_proc Source Frame Processing
+ * \ingroup high_level_algo
+ * This module describes algorithms in AV1 assosciated with the
+ * pre-processing of source frames. See also \ref architecture_enc_src_proc
+ *
+ * @{
+ */
+/*! @} - end defgroup src_frame_proc */
+
+/*!\defgroup rate_control Rate Control
+ * \ingroup high_level_algo
+ * This module describes rate control algorithm in AV1.
+ *  See also \ref architecture_enc_rate_ctrl
+ * @{
+ */
+/*! @} - end defgroup rate_control */
+
+/*!\defgroup tpl_modelling Temporal Dependency Modelling
+ * \ingroup high_level_algo
+ * This module includes algorithms to implement temporal dependency modelling.
+ *  See also \ref architecture_enc_tpl
+ * @{
+ */
+/*! @} - end defgroup tpl_modelling */
+
+/*!\defgroup two_pass_algo Two Pass Mode
+   \ingroup high_level_algo
+
+ In two pass mode, the input file is passed into the encoder for a quick
+ first pass, where statistics are gathered. These statistics and the input
+ file are then passed back into the encoder for a second pass. The statistics
+ help the encoder reach the desired bitrate without as much overshooting or
+ undershooting.
+
+ During the first pass, the codec will return "stats" packets that contain
+ information useful for the second pass. The caller should concatenate these
+ packets as they are received. In the second pass, the concatenated packets
+ are passed in, along with the frames to encode. During the second pass,
+ "frame" packets are returned that represent the compressed video.
+
+ A complete example can be found in `examples/twopass_encoder.c`. Pseudocode
+ is provided below to illustrate the core parts.
+
+ During the first pass, the uncompressed frames are passed in and stats
+ information is appended to a byte array.
+
+~~~~~~~~~~~~~~~{.c}
+// For simplicity, assume that there is enough memory in the stats buffer.
+// Actual code will want to use a resizable array. stats_len represents
+// the length of data already present in the buffer.
+void get_stats_data(aom_codec_ctx_t *encoder, char *stats,
+                    size_t *stats_len, bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+    *got_data = true;
+    if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+    memcpy(stats + *stats_len, pkt->data.twopass_stats.buf,
+           pkt->data.twopass_stats.sz);
+    *stats_len += pkt->data.twopass_stats.sz;
+  }
+}
+
+void first_pass(char *stats, size_t *stats_len) {
+  struct aom_codec_enc_cfg first_pass_cfg;
+  ... // Initialize the config as needed.
+  first_pass_cfg.g_pass = AOM_RC_FIRST_PASS;
+  aom_codec_ctx_t first_pass_encoder;
+  ... // Initialize the encoder.
+
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&first_pass_encoder, img, pts, duration, flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len);
+  }
+  // After all frames have been processed, call aom_codec_encode with
+  // a NULL ptr repeatedly, until no more data is returned. The NULL
+  // ptr tells the encoder that no more frames are available.
+  bool got_data;
+  do {
+    got_data = false;
+    aom_codec_encode(&first_pass_encoder, NULL, pts, duration, flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+  } while (got_data);
+
+  aom_codec_destroy(&first_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+
+ During the second pass, the uncompressed frames and the stats are
+ passed into the encoder.
+
+~~~~~~~~~~~~~~~{.c}
+// Write out each encoded frame to the file.
+void get_cx_data(aom_codec_ctx_t *encoder, FILE *file,
+                 bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+   *got_data = true;
+   if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+   fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, file);
+  }
+}
+
+void second_pass(char *stats, size_t stats_len) {
+  struct aom_codec_enc_cfg second_pass_cfg;
+  ... // Initialize the config file as needed.
+  second_pass_cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in.buf = stats;
+  cfg.rc_twopass_stats_in.sz = stats_len;
+  aom_codec_ctx_t second_pass_encoder;
+  ... // Initialize the encoder from the config.
+
+  FILE *output = fopen("output.obu", "wb");
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&second_pass_encoder, img, pts, duration, flags);
+    get_cx_data(&second_pass_encoder, output);
+  }
+  // Pass in NULL to flush the encoder.
+  bool got_data;
+  do {
+    got_data = false;
+    aom_codec_encode(&second_pass_encoder, NULL, pts, duration, flags);
+    get_cx_data(&second_pass_encoder, output, &got_data);
+  } while (got_data);
+
+  aom_codec_destroy(&second_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+ */
+
+ /*!\defgroup look_ahead_buffer The Look-Ahead Buffer
+    \ingroup high_level_algo
+
+ A program should call \ref aom_codec_encode() for each frame that needs
+ processing. These frames are internally copied and stored in a fixed-size
+ circular buffer, known as the look-ahead buffer. Other parts of the code
+ will use future frame information to inform current frame decisions;
+ examples include the first-pass algorithm, TPL model, and temporal filter.
+ Note that this buffer also keeps a reference to the last source frame.
+
+ The look-ahead buffer is defined in \ref av1/encoder/lookahead.h. It acts as an
+ opaque structure, with an interface to create and free memory associated with
+ it. It supports pushing and popping frames onto the structure in a FIFO
+ fashion. It also allows look-ahead when using the \ref av1_lookahead_peek()
+ function with a non-negative number, and look-behind when -1 is passed in (for
+ the last source frame; e.g., firstpass will use this for motion estimation).
+ The \ref av1_lookahead_depth() function returns the current number of frames
+ stored in it. Note that \ref av1_lookahead_pop() is a bit of a misnomer - it
+ only pops if either the "flush" variable is set, or the buffer is at maximum
+ capacity.
+
+ The buffer is stored in the \ref AV1_PRIMARY::lookahead field.
+ It is initialized in the first call to \ref aom_codec_encode(), in the
+ \ref av1_receive_raw_frame() sub-routine. The buffer size is defined by
+ the g_lag_in_frames parameter set in the
+ \ref aom_codec_enc_cfg_t::g_lag_in_frames struct.
+ This can be modified manually but should only be set once. On the command
+ line, the flag "--lag-in-frames" controls it. The default size is 19 for
+ non-realtime usage and 1 for realtime. Note that a maximum value of 35 is
+ enforced.
+
+ A frame will stay in the buffer as long as possible. As mentioned above,
+ the \ref av1_lookahead_pop() only removes a frame when either flush is set,
+ or the buffer is full. Note that each call to \ref aom_codec_encode() inserts
+ another frame into the buffer, and pop is called by the sub-function
+ \ref av1_encode_strategy(). The buffer is told to flush when
+ \ref aom_codec_encode() is passed a NULL image pointer. Note that the caller
+ must repeatedly call \ref aom_codec_encode() with a NULL image pointer, until
+ no more packets are available, in order to fully flush the buffer.
+
+ */
+
+/*! @} - end defgroup high_level_algo */
+
+/*!\defgroup partition_search Partition Search
+ * \ingroup encoder_algo
+ * For and overview of the partition search see \ref architecture_enc_partitions
+ * @{
+ */
+
+/*! @} - end defgroup partition_search */
+
+/*!\defgroup intra_mode_search Intra Mode Search
+ * \ingroup encoder_algo
+ * This module describes intra mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup intra_mode_search */
+
+/*!\defgroup inter_mode_search Inter Mode Search
+ * \ingroup encoder_algo
+ * This module describes inter mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup inter_mode_search */
+
+/*!\defgroup palette_mode_search Palette Mode Search
+ * \ingroup intra_mode_search
+ * This module describes palette mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup palette_mode_search */
+
+/*!\defgroup transform_search Transform Search
+ * \ingroup encoder_algo
+ * This module describes transform search algorithm in AV1.
+ * @{
+ */
+/*! @} - end defgroup transform_search */
+
+/*!\defgroup coefficient_coding Transform Coefficient Coding and Optimization
+ * \ingroup encoder_algo
+ * This module describes the algorithms of transform coefficient coding and optimization in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup coefficient_coding */
+
+/*!\defgroup in_loop_filter In-loop Filter
+ * \ingroup encoder_algo
+ * This module describes in-loop filter algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_filter */
+
+/*!\defgroup in_loop_cdef CDEF
+ * \ingroup encoder_algo
+ * This module describes the CDEF parameter search algorithm
+ * in AV1. More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup in_loop_restoration Loop Restoration
+ * \ingroup encoder_algo
+ * This module describes the loop restoration search
+ * and estimation algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup cyclic_refresh Cyclic Refresh
+ * \ingroup encoder_algo
+ * This module describes the cyclic refresh (aq-mode=3) in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup cyclic_refresh */
+
+/*!\defgroup SVC Scalable Video Coding
+ * \ingroup encoder_algo
+ * This module describes scalable video coding algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup SVC */
+/*!\defgroup variance_partition Variance Partition
+ * \ingroup encoder_algo
+ * This module describes variance partition algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup variance_partition */
+/*!\defgroup nonrd_mode_search NonRD Optimized Mode Search
+ * \ingroup encoder_algo
+ * This module describes NonRD Optimized Mode Search used in Real-Time mode.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup nonrd_mode_search */
diff --git a/media/libaom/src/doc/dev_guide/av1encoderflow.png b/media/libaom/src/doc/dev_guide/av1encoderflow.png
new file mode 100644
index 0000000000..5e69fce39c
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/av1encoderflow.png
diff --git a/media/libaom/src/doc/dev_guide/av1partitions.png b/media/libaom/src/doc/dev_guide/av1partitions.png
new file mode 100644
index 0000000000..125439f5cb
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/av1partitions.png
diff --git a/media/libaom/src/doc/dev_guide/coeff_coding.png b/media/libaom/src/doc/dev_guide/coeff_coding.png
new file mode 100644
index 0000000000..cba97dd712
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/coeff_coding.png
diff --git a/media/libaom/src/doc/dev_guide/filter_flow.png b/media/libaom/src/doc/dev_guide/filter_flow.png
new file mode 100644
index 0000000000..82849a0666
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/filter_flow.png
diff --git a/media/libaom/src/doc/dev_guide/filter_thr.png b/media/libaom/src/doc/dev_guide/filter_thr.png
new file mode 100644
index 0000000000..b833e941f6
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/filter_thr.png
diff --git a/media/libaom/src/doc/dev_guide/genericcodecflow.png b/media/libaom/src/doc/dev_guide/genericcodecflow.png
new file mode 100644
index 0000000000..65a6b2f19e
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/genericcodecflow.png
diff --git a/media/libaom/src/doc/dev_guide/gf_group.png b/media/libaom/src/doc/dev_guide/gf_group.png
new file mode 100644
index 0000000000..1cd47d2490
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/gf_group.png
diff --git a/media/libaom/src/doc/dev_guide/partition.png b/media/libaom/src/doc/dev_guide/partition.png
new file mode 100644
index 0000000000..914d6c2fd0
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/partition.png
diff --git a/media/libaom/src/doc/dev_guide/tplgfgroupdiagram.png b/media/libaom/src/doc/dev_guide/tplgfgroupdiagram.png
new file mode 100644
index 0000000000..fa5b0671c2
--- /dev/null
+++ b/media/libaom/src/doc/dev_guide/tplgfgroupdiagram.png
diff --git a/media/libaom/src/doc/img/edge_direction.svg b/media/libaom/src/doc/img/edge_direction.svg
new file mode 100644
index 0000000000..343a2b9f60
--- /dev/null
+++ b/media/libaom/src/doc/img/edge_direction.svg
@@ -0,0 +1,6319 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="9.25333in" height="8.04538in"
+		viewBox="0 0 666.24 579.267" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:0.75em}
+		.st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st7 {font-size:1em;font-style:normal}
+		.st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<v:layer v:name="Connector" v:index="0"/>
+		<g id="shape111-1" v:mID="111" v:groupContext="shape" transform="translate(18.12,-468.375)">
+			<title>Square</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape113-4" v:mID="113" v:groupContext="shape" transform="translate(36.12,-468.375)">
+			<title>Square.113</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape114-7" v:mID="114" v:groupContext="shape" transform="translate(54.12,-468.375)">
+			<title>Square.114</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape115-10" v:mID="115" v:groupContext="shape" transform="translate(72.12,-468.375)">
+			<title>Square.115</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape116-13" v:mID="116" v:groupContext="shape" transform="translate(18.12,-450.375)">
+			<title>Square.116</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape117-16" v:mID="117" v:groupContext="shape" transform="translate(36.12,-450.375)">
+			<title>Square.117</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape118-19" v:mID="118" v:groupContext="shape" transform="translate(54.12,-450.375)">
+			<title>Square.118</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape119-22" v:mID="119" v:groupContext="shape" transform="translate(72.12,-450.375)">
+			<title>Square.119</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape124-25" v:mID="124" v:groupContext="shape" transform="translate(18.12,-432.375)">
+			<title>Square.124</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape125-28" v:mID="125" v:groupContext="shape" transform="translate(36.12,-432.375)">
+			<title>Square.125</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape126-31" v:mID="126" v:groupContext="shape" transform="translate(54.12,-432.375)">
+			<title>Square.126</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape127-34" v:mID="127" v:groupContext="shape" transform="translate(72.12,-432.375)">
+			<title>Square.127</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape128-37" v:mID="128" v:groupContext="shape" transform="translate(18.12,-414.375)">
+			<title>Square.128</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape129-40" v:mID="129" v:groupContext="shape" transform="translate(36.12,-414.375)">
+			<title>Square.129</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape130-43" v:mID="130" v:groupContext="shape" transform="translate(54.12,-414.375)">
+			<title>Square.130</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape131-46" v:mID="131" v:groupContext="shape" transform="translate(72.12,-414.375)">
+			<title>Square.131</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape132-49" v:mID="132" v:groupContext="shape" transform="translate(18.12,-396.375)">
+			<title>Square.132</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape133-52" v:mID="133" v:groupContext="shape" transform="translate(36.12,-396.375)">
+			<title>Square.133</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape134-55" v:mID="134" v:groupContext="shape" transform="translate(54.12,-396.375)">
+			<title>Square.134</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape135-58" v:mID="135" v:groupContext="shape" transform="translate(72.12,-396.375)">
+			<title>Square.135</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape136-61" v:mID="136" v:groupContext="shape" transform="translate(18.12,-378.375)">
+			<title>Square.136</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape137-64" v:mID="137" v:groupContext="shape" transform="translate(36.12,-378.375)">
+			<title>Square.137</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape138-67" v:mID="138" v:groupContext="shape" transform="translate(54.12,-378.375)">
+			<title>Square.138</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape139-70" v:mID="139" v:groupContext="shape" transform="translate(72.12,-378.375)">
+			<title>Square.139</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape140-73" v:mID="140" v:groupContext="shape" transform="translate(18.12,-360.375)">
+			<title>Square.140</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape141-76" v:mID="141" v:groupContext="shape" transform="translate(36.12,-360.375)">
+			<title>Square.141</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape142-79" v:mID="142" v:groupContext="shape" transform="translate(54.12,-360.375)">
+			<title>Square.142</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape143-82" v:mID="143" v:groupContext="shape" transform="translate(72.12,-360.375)">
+			<title>Square.143</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape144-85" v:mID="144" v:groupContext="shape" transform="translate(18.12,-342.375)">
+			<title>Square.144</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape145-88" v:mID="145" v:groupContext="shape" transform="translate(36.12,-342.375)">
+			<title>Square.145</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape146-91" v:mID="146" v:groupContext="shape" transform="translate(54.12,-342.375)">
+			<title>Square.146</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape147-94" v:mID="147" v:groupContext="shape" transform="translate(72.12,-342.375)">
+			<title>Square.147</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape148-97" v:mID="148" v:groupContext="shape" transform="translate(90.12,-468.375)">
+			<title>Square.148</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape149-100" v:mID="149" v:groupContext="shape" transform="translate(108.12,-468.375)">
+			<title>Square.149</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape150-103" v:mID="150" v:groupContext="shape" transform="translate(126.12,-468.375)">
+			<title>Square.150</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape151-106" v:mID="151" v:groupContext="shape" transform="translate(144.12,-468.375)">
+			<title>Square.151</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape152-109" v:mID="152" v:groupContext="shape" transform="translate(90.12,-450.375)">
+			<title>Square.152</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape153-112" v:mID="153" v:groupContext="shape" transform="translate(108.12,-450.375)">
+			<title>Square.153</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape154-115" v:mID="154" v:groupContext="shape" transform="translate(126.12,-450.375)">
+			<title>Square.154</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape155-118" v:mID="155" v:groupContext="shape" transform="translate(144.12,-450.375)">
+			<title>Square.155</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape156-121" v:mID="156" v:groupContext="shape" transform="translate(90.12,-432.375)">
+			<title>Square.156</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape157-124" v:mID="157" v:groupContext="shape" transform="translate(108.12,-432.375)">
+			<title>Square.157</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape158-127" v:mID="158" v:groupContext="shape" transform="translate(126.12,-432.375)">
+			<title>Square.158</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape159-130" v:mID="159" v:groupContext="shape" transform="translate(144.12,-432.375)">
+			<title>Square.159</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape160-133" v:mID="160" v:groupContext="shape" transform="translate(90.12,-414.375)">
+			<title>Square.160</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape161-136" v:mID="161" v:groupContext="shape" transform="translate(108.12,-414.375)">
+			<title>Square.161</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape162-139" v:mID="162" v:groupContext="shape" transform="translate(126.12,-414.375)">
+			<title>Square.162</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape163-142" v:mID="163" v:groupContext="shape" transform="translate(144.12,-414.375)">
+			<title>Square.163</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape164-145" v:mID="164" v:groupContext="shape" transform="translate(90.12,-396.375)">
+			<title>Square.164</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape165-148" v:mID="165" v:groupContext="shape" transform="translate(108.12,-396.375)">
+			<title>Square.165</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape166-151" v:mID="166" v:groupContext="shape" transform="translate(126.12,-396.375)">
+			<title>Square.166</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape167-154" v:mID="167" v:groupContext="shape" transform="translate(144.12,-396.375)">
+			<title>Square.167</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape168-157" v:mID="168" v:groupContext="shape" transform="translate(90.12,-378.375)">
+			<title>Square.168</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape169-160" v:mID="169" v:groupContext="shape" transform="translate(108.12,-378.375)">
+			<title>Square.169</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape170-163" v:mID="170" v:groupContext="shape" transform="translate(126.12,-378.375)">
+			<title>Square.170</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape171-166" v:mID="171" v:groupContext="shape" transform="translate(144.12,-378.375)">
+			<title>Square.171</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape172-169" v:mID="172" v:groupContext="shape" transform="translate(90.12,-360.375)">
+			<title>Square.172</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape173-172" v:mID="173" v:groupContext="shape" transform="translate(108.12,-360.375)">
+			<title>Square.173</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape174-175" v:mID="174" v:groupContext="shape" transform="translate(126.12,-360.375)">
+			<title>Square.174</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape175-178" v:mID="175" v:groupContext="shape" transform="translate(144.12,-360.375)">
+			<title>Square.175</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape176-181" v:mID="176" v:groupContext="shape" transform="translate(90.12,-342.375)">
+			<title>Square.176</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape177-184" v:mID="177" v:groupContext="shape" transform="translate(108.12,-342.375)">
+			<title>Square.177</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape178-187" v:mID="178" v:groupContext="shape" transform="translate(126.12,-342.375)">
+			<title>Square.178</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape179-190" v:mID="179" v:groupContext="shape" transform="translate(144.12,-342.375)">
+			<title>Square.179</title>
+			<desc>14</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text>		</g>
+		<g id="shape180-193" v:mID="180" v:groupContext="shape" transform="translate(180.12,-468.375)">
+			<title>Square.180</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape181-196" v:mID="181" v:groupContext="shape" transform="translate(198.12,-468.375)">
+			<title>Square.181</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape182-199" v:mID="182" v:groupContext="shape" transform="translate(216.12,-468.375)">
+			<title>Square.182</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape183-202" v:mID="183" v:groupContext="shape" transform="translate(234.12,-468.375)">
+			<title>Square.183</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape184-205" v:mID="184" v:groupContext="shape" transform="translate(180.12,-450.375)">
+			<title>Square.184</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape185-208" v:mID="185" v:groupContext="shape" transform="translate(198.12,-450.375)">
+			<title>Square.185</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape186-211" v:mID="186" v:groupContext="shape" transform="translate(216.12,-450.375)">
+			<title>Square.186</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape187-214" v:mID="187" v:groupContext="shape" transform="translate(234.12,-450.375)">
+			<title>Square.187</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape188-217" v:mID="188" v:groupContext="shape" transform="translate(180.12,-432.375)">
+			<title>Square.188</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape189-220" v:mID="189" v:groupContext="shape" transform="translate(198.12,-432.375)">
+			<title>Square.189</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape190-223" v:mID="190" v:groupContext="shape" transform="translate(216.12,-432.375)">
+			<title>Square.190</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape191-226" v:mID="191" v:groupContext="shape" transform="translate(234.12,-432.375)">
+			<title>Square.191</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape192-229" v:mID="192" v:groupContext="shape" transform="translate(180.12,-414.375)">
+			<title>Square.192</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape193-232" v:mID="193" v:groupContext="shape" transform="translate(198.12,-414.375)">
+			<title>Square.193</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape194-235" v:mID="194" v:groupContext="shape" transform="translate(216.12,-414.375)">
+			<title>Square.194</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape195-238" v:mID="195" v:groupContext="shape" transform="translate(234.12,-414.375)">
+			<title>Square.195</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape196-241" v:mID="196" v:groupContext="shape" transform="translate(180.12,-396.375)">
+			<title>Square.196</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape197-244" v:mID="197" v:groupContext="shape" transform="translate(198.12,-396.375)">
+			<title>Square.197</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape198-247" v:mID="198" v:groupContext="shape" transform="translate(216.12,-396.375)">
+			<title>Square.198</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape199-250" v:mID="199" v:groupContext="shape" transform="translate(234.12,-396.375)">
+			<title>Square.199</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape200-253" v:mID="200" v:groupContext="shape" transform="translate(180.12,-378.375)">
+			<title>Square.200</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape201-256" v:mID="201" v:groupContext="shape" transform="translate(198.12,-378.375)">
+			<title>Square.201</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape202-259" v:mID="202" v:groupContext="shape" transform="translate(216.12,-378.375)">
+			<title>Square.202</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape203-262" v:mID="203" v:groupContext="shape" transform="translate(234.12,-378.375)">
+			<title>Square.203</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape204-265" v:mID="204" v:groupContext="shape" transform="translate(180.12,-360.375)">
+			<title>Square.204</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape205-268" v:mID="205" v:groupContext="shape" transform="translate(198.12,-360.375)">
+			<title>Square.205</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape206-271" v:mID="206" v:groupContext="shape" transform="translate(216.12,-360.375)">
+			<title>Square.206</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape207-274" v:mID="207" v:groupContext="shape" transform="translate(234.12,-360.375)">
+			<title>Square.207</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape208-277" v:mID="208" v:groupContext="shape" transform="translate(180.12,-342.375)">
+			<title>Square.208</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape209-280" v:mID="209" v:groupContext="shape" transform="translate(198.12,-342.375)">
+			<title>Square.209</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape210-283" v:mID="210" v:groupContext="shape" transform="translate(216.12,-342.375)">
+			<title>Square.210</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape211-286" v:mID="211" v:groupContext="shape" transform="translate(234.12,-342.375)">
+			<title>Square.211</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape212-289" v:mID="212" v:groupContext="shape" transform="translate(252.12,-468.375)">
+			<title>Square.212</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape213-292" v:mID="213" v:groupContext="shape" transform="translate(270.12,-468.375)">
+			<title>Square.213</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape214-295" v:mID="214" v:groupContext="shape" transform="translate(288.12,-468.375)">
+			<title>Square.214</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape215-298" v:mID="215" v:groupContext="shape" transform="translate(306.12,-468.375)">
+			<title>Square.215</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape216-301" v:mID="216" v:groupContext="shape" transform="translate(252.12,-450.375)">
+			<title>Square.216</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape217-304" v:mID="217" v:groupContext="shape" transform="translate(270.12,-450.375)">
+			<title>Square.217</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape218-307" v:mID="218" v:groupContext="shape" transform="translate(288.12,-450.375)">
+			<title>Square.218</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape219-310" v:mID="219" v:groupContext="shape" transform="translate(306.12,-450.375)">
+			<title>Square.219</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape220-313" v:mID="220" v:groupContext="shape" transform="translate(252.12,-432.375)">
+			<title>Square.220</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape221-316" v:mID="221" v:groupContext="shape" transform="translate(270.12,-432.375)">
+			<title>Square.221</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape222-319" v:mID="222" v:groupContext="shape" transform="translate(288.12,-432.375)">
+			<title>Square.222</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape223-322" v:mID="223" v:groupContext="shape" transform="translate(306.12,-432.375)">
+			<title>Square.223</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape224-325" v:mID="224" v:groupContext="shape" transform="translate(252.12,-414.375)">
+			<title>Square.224</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape225-328" v:mID="225" v:groupContext="shape" transform="translate(270.12,-414.375)">
+			<title>Square.225</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape226-331" v:mID="226" v:groupContext="shape" transform="translate(288.12,-414.375)">
+			<title>Square.226</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape227-334" v:mID="227" v:groupContext="shape" transform="translate(306.12,-414.375)">
+			<title>Square.227</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape228-337" v:mID="228" v:groupContext="shape" transform="translate(252.12,-396.375)">
+			<title>Square.228</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape229-340" v:mID="229" v:groupContext="shape" transform="translate(270.12,-396.375)">
+			<title>Square.229</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape230-343" v:mID="230" v:groupContext="shape" transform="translate(288.12,-396.375)">
+			<title>Square.230</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape231-346" v:mID="231" v:groupContext="shape" transform="translate(306.12,-396.375)">
+			<title>Square.231</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape232-349" v:mID="232" v:groupContext="shape" transform="translate(252.12,-378.375)">
+			<title>Square.232</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape233-352" v:mID="233" v:groupContext="shape" transform="translate(270.12,-378.375)">
+			<title>Square.233</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape234-355" v:mID="234" v:groupContext="shape" transform="translate(288.12,-378.375)">
+			<title>Square.234</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape235-358" v:mID="235" v:groupContext="shape" transform="translate(306.12,-378.375)">
+			<title>Square.235</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape236-361" v:mID="236" v:groupContext="shape" transform="translate(252.12,-360.375)">
+			<title>Square.236</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape237-364" v:mID="237" v:groupContext="shape" transform="translate(270.12,-360.375)">
+			<title>Square.237</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape238-367" v:mID="238" v:groupContext="shape" transform="translate(288.12,-360.375)">
+			<title>Square.238</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape239-370" v:mID="239" v:groupContext="shape" transform="translate(306.12,-360.375)">
+			<title>Square.239</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape240-373" v:mID="240" v:groupContext="shape" transform="translate(252.12,-342.375)">
+			<title>Square.240</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape241-376" v:mID="241" v:groupContext="shape" transform="translate(270.12,-342.375)">
+			<title>Square.241</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape242-379" v:mID="242" v:groupContext="shape" transform="translate(288.12,-342.375)">
+			<title>Square.242</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape243-382" v:mID="243" v:groupContext="shape" transform="translate(306.12,-342.375)">
+			<title>Square.243</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape244-385" v:mID="244" v:groupContext="shape" transform="translate(342.12,-468.375)">
+			<title>Square.244</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape245-388" v:mID="245" v:groupContext="shape" transform="translate(360.12,-468.375)">
+			<title>Square.245</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape246-391" v:mID="246" v:groupContext="shape" transform="translate(378.12,-468.375)">
+			<title>Square.246</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape247-394" v:mID="247" v:groupContext="shape" transform="translate(396.12,-468.375)">
+			<title>Square.247</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape248-397" v:mID="248" v:groupContext="shape" transform="translate(342.12,-450.375)">
+			<title>Square.248</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape249-400" v:mID="249" v:groupContext="shape" transform="translate(360.12,-450.375)">
+			<title>Square.249</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape250-403" v:mID="250" v:groupContext="shape" transform="translate(378.12,-450.375)">
+			<title>Square.250</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape251-406" v:mID="251" v:groupContext="shape" transform="translate(396.12,-450.375)">
+			<title>Square.251</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape252-409" v:mID="252" v:groupContext="shape" transform="translate(342.12,-432.375)">
+			<title>Square.252</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape253-412" v:mID="253" v:groupContext="shape" transform="translate(360.12,-432.375)">
+			<title>Square.253</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape254-415" v:mID="254" v:groupContext="shape" transform="translate(378.12,-432.375)">
+			<title>Square.254</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape255-418" v:mID="255" v:groupContext="shape" transform="translate(396.12,-432.375)">
+			<title>Square.255</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape256-421" v:mID="256" v:groupContext="shape" transform="translate(342.12,-414.375)">
+			<title>Square.256</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape257-424" v:mID="257" v:groupContext="shape" transform="translate(360.12,-414.375)">
+			<title>Square.257</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape258-427" v:mID="258" v:groupContext="shape" transform="translate(378.12,-414.375)">
+			<title>Square.258</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape259-430" v:mID="259" v:groupContext="shape" transform="translate(396.12,-414.375)">
+			<title>Square.259</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape260-433" v:mID="260" v:groupContext="shape" transform="translate(342.12,-396.375)">
+			<title>Square.260</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape261-436" v:mID="261" v:groupContext="shape" transform="translate(360.12,-396.375)">
+			<title>Square.261</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape262-439" v:mID="262" v:groupContext="shape" transform="translate(378.12,-396.375)">
+			<title>Square.262</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape263-442" v:mID="263" v:groupContext="shape" transform="translate(396.12,-396.375)">
+			<title>Square.263</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape264-445" v:mID="264" v:groupContext="shape" transform="translate(342.12,-378.375)">
+			<title>Square.264</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape265-448" v:mID="265" v:groupContext="shape" transform="translate(360.12,-378.375)">
+			<title>Square.265</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape266-451" v:mID="266" v:groupContext="shape" transform="translate(378.12,-378.375)">
+			<title>Square.266</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape267-454" v:mID="267" v:groupContext="shape" transform="translate(396.12,-378.375)">
+			<title>Square.267</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape268-457" v:mID="268" v:groupContext="shape" transform="translate(342.12,-360.375)">
+			<title>Square.268</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape269-460" v:mID="269" v:groupContext="shape" transform="translate(360.12,-360.375)">
+			<title>Square.269</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape270-463" v:mID="270" v:groupContext="shape" transform="translate(378.12,-360.375)">
+			<title>Square.270</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape271-466" v:mID="271" v:groupContext="shape" transform="translate(396.12,-360.375)">
+			<title>Square.271</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape272-469" v:mID="272" v:groupContext="shape" transform="translate(342.12,-342.375)">
+			<title>Square.272</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape273-472" v:mID="273" v:groupContext="shape" transform="translate(360.12,-342.375)">
+			<title>Square.273</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape274-475" v:mID="274" v:groupContext="shape" transform="translate(378.12,-342.375)">
+			<title>Square.274</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape275-478" v:mID="275" v:groupContext="shape" transform="translate(396.12,-342.375)">
+			<title>Square.275</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape276-481" v:mID="276" v:groupContext="shape" transform="translate(414.12,-468.375)">
+			<title>Square.276</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape277-484" v:mID="277" v:groupContext="shape" transform="translate(432.12,-468.375)">
+			<title>Square.277</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape278-487" v:mID="278" v:groupContext="shape" transform="translate(450.12,-468.375)">
+			<title>Square.278</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape279-490" v:mID="279" v:groupContext="shape" transform="translate(468.12,-468.375)">
+			<title>Square.279</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape280-493" v:mID="280" v:groupContext="shape" transform="translate(414.12,-450.375)">
+			<title>Square.280</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape281-496" v:mID="281" v:groupContext="shape" transform="translate(432.12,-450.375)">
+			<title>Square.281</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape282-499" v:mID="282" v:groupContext="shape" transform="translate(450.12,-450.375)">
+			<title>Square.282</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape283-502" v:mID="283" v:groupContext="shape" transform="translate(468.12,-450.375)">
+			<title>Square.283</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape284-505" v:mID="284" v:groupContext="shape" transform="translate(414.12,-432.375)">
+			<title>Square.284</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape285-508" v:mID="285" v:groupContext="shape" transform="translate(432.12,-432.375)">
+			<title>Square.285</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape286-511" v:mID="286" v:groupContext="shape" transform="translate(450.12,-432.375)">
+			<title>Square.286</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape287-514" v:mID="287" v:groupContext="shape" transform="translate(468.12,-432.375)">
+			<title>Square.287</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape288-517" v:mID="288" v:groupContext="shape" transform="translate(414.12,-414.375)">
+			<title>Square.288</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape289-520" v:mID="289" v:groupContext="shape" transform="translate(432.12,-414.375)">
+			<title>Square.289</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape290-523" v:mID="290" v:groupContext="shape" transform="translate(450.12,-414.375)">
+			<title>Square.290</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape291-526" v:mID="291" v:groupContext="shape" transform="translate(468.12,-414.375)">
+			<title>Square.291</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape292-529" v:mID="292" v:groupContext="shape" transform="translate(414.12,-396.375)">
+			<title>Square.292</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape293-532" v:mID="293" v:groupContext="shape" transform="translate(432.12,-396.375)">
+			<title>Square.293</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape294-535" v:mID="294" v:groupContext="shape" transform="translate(450.12,-396.375)">
+			<title>Square.294</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape295-538" v:mID="295" v:groupContext="shape" transform="translate(468.12,-396.375)">
+			<title>Square.295</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape296-541" v:mID="296" v:groupContext="shape" transform="translate(414.12,-378.375)">
+			<title>Square.296</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape297-544" v:mID="297" v:groupContext="shape" transform="translate(432.12,-378.375)">
+			<title>Square.297</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape298-547" v:mID="298" v:groupContext="shape" transform="translate(450.12,-378.375)">
+			<title>Square.298</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape299-550" v:mID="299" v:groupContext="shape" transform="translate(468.12,-378.375)">
+			<title>Square.299</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape300-553" v:mID="300" v:groupContext="shape" transform="translate(414.12,-360.375)">
+			<title>Square.300</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape301-556" v:mID="301" v:groupContext="shape" transform="translate(432.12,-360.375)">
+			<title>Square.301</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape302-559" v:mID="302" v:groupContext="shape" transform="translate(450.12,-360.375)">
+			<title>Square.302</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape303-562" v:mID="303" v:groupContext="shape" transform="translate(468.12,-360.375)">
+			<title>Square.303</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape304-565" v:mID="304" v:groupContext="shape" transform="translate(414.12,-342.375)">
+			<title>Square.304</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape305-568" v:mID="305" v:groupContext="shape" transform="translate(432.12,-342.375)">
+			<title>Square.305</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape306-571" v:mID="306" v:groupContext="shape" transform="translate(450.12,-342.375)">
+			<title>Square.306</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape307-574" v:mID="307" v:groupContext="shape" transform="translate(468.12,-342.375)">
+			<title>Square.307</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape308-577" v:mID="308" v:groupContext="shape" transform="translate(504.12,-468.375)">
+			<title>Square.308</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape309-580" v:mID="309" v:groupContext="shape" transform="translate(522.12,-468.375)">
+			<title>Square.309</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape310-583" v:mID="310" v:groupContext="shape" transform="translate(540.12,-468.375)">
+			<title>Square.310</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape311-586" v:mID="311" v:groupContext="shape" transform="translate(558.12,-468.375)">
+			<title>Square.311</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape312-589" v:mID="312" v:groupContext="shape" transform="translate(504.12,-450.375)">
+			<title>Square.312</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape313-592" v:mID="313" v:groupContext="shape" transform="translate(522.12,-450.375)">
+			<title>Square.313</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape314-595" v:mID="314" v:groupContext="shape" transform="translate(540.12,-450.375)">
+			<title>Square.314</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape315-598" v:mID="315" v:groupContext="shape" transform="translate(558.12,-450.375)">
+			<title>Square.315</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape316-601" v:mID="316" v:groupContext="shape" transform="translate(504.12,-432.375)">
+			<title>Square.316</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape317-604" v:mID="317" v:groupContext="shape" transform="translate(522.12,-432.375)">
+			<title>Square.317</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape318-607" v:mID="318" v:groupContext="shape" transform="translate(540.12,-432.375)">
+			<title>Square.318</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape319-610" v:mID="319" v:groupContext="shape" transform="translate(558.12,-432.375)">
+			<title>Square.319</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape320-613" v:mID="320" v:groupContext="shape" transform="translate(504.12,-414.375)">
+			<title>Square.320</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape321-616" v:mID="321" v:groupContext="shape" transform="translate(522.12,-414.375)">
+			<title>Square.321</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape322-619" v:mID="322" v:groupContext="shape" transform="translate(540.12,-414.375)">
+			<title>Square.322</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape323-622" v:mID="323" v:groupContext="shape" transform="translate(558.12,-414.375)">
+			<title>Square.323</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape324-625" v:mID="324" v:groupContext="shape" transform="translate(504.12,-396.375)">
+			<title>Square.324</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape325-628" v:mID="325" v:groupContext="shape" transform="translate(522.12,-396.375)">
+			<title>Square.325</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape326-631" v:mID="326" v:groupContext="shape" transform="translate(540.12,-396.375)">
+			<title>Square.326</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape327-634" v:mID="327" v:groupContext="shape" transform="translate(558.12,-396.375)">
+			<title>Square.327</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape328-637" v:mID="328" v:groupContext="shape" transform="translate(504.12,-378.375)">
+			<title>Square.328</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape329-640" v:mID="329" v:groupContext="shape" transform="translate(522.12,-378.375)">
+			<title>Square.329</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape330-643" v:mID="330" v:groupContext="shape" transform="translate(540.12,-378.375)">
+			<title>Square.330</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape331-646" v:mID="331" v:groupContext="shape" transform="translate(558.12,-378.375)">
+			<title>Square.331</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape332-649" v:mID="332" v:groupContext="shape" transform="translate(504.12,-360.375)">
+			<title>Square.332</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape333-652" v:mID="333" v:groupContext="shape" transform="translate(522.12,-360.375)">
+			<title>Square.333</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape334-655" v:mID="334" v:groupContext="shape" transform="translate(540.12,-360.375)">
+			<title>Square.334</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape335-658" v:mID="335" v:groupContext="shape" transform="translate(558.12,-360.375)">
+			<title>Square.335</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape336-661" v:mID="336" v:groupContext="shape" transform="translate(504.12,-342.375)">
+			<title>Square.336</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape337-664" v:mID="337" v:groupContext="shape" transform="translate(522.12,-342.375)">
+			<title>Square.337</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape338-667" v:mID="338" v:groupContext="shape" transform="translate(540.12,-342.375)">
+			<title>Square.338</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape339-670" v:mID="339" v:groupContext="shape" transform="translate(558.12,-342.375)">
+			<title>Square.339</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape340-673" v:mID="340" v:groupContext="shape" transform="translate(576.12,-468.375)">
+			<title>Square.340</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape341-676" v:mID="341" v:groupContext="shape" transform="translate(594.12,-468.375)">
+			<title>Square.341</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape342-679" v:mID="342" v:groupContext="shape" transform="translate(612.12,-468.375)">
+			<title>Square.342</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape343-682" v:mID="343" v:groupContext="shape" transform="translate(630.12,-468.375)">
+			<title>Square.343</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape344-685" v:mID="344" v:groupContext="shape" transform="translate(576.12,-450.375)">
+			<title>Square.344</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape345-688" v:mID="345" v:groupContext="shape" transform="translate(594.12,-450.375)">
+			<title>Square.345</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape346-691" v:mID="346" v:groupContext="shape" transform="translate(612.12,-450.375)">
+			<title>Square.346</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape347-694" v:mID="347" v:groupContext="shape" transform="translate(630.12,-450.375)">
+			<title>Square.347</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape348-697" v:mID="348" v:groupContext="shape" transform="translate(576.12,-432.375)">
+			<title>Square.348</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape349-700" v:mID="349" v:groupContext="shape" transform="translate(594.12,-432.375)">
+			<title>Square.349</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape350-703" v:mID="350" v:groupContext="shape" transform="translate(612.12,-432.375)">
+			<title>Square.350</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape351-706" v:mID="351" v:groupContext="shape" transform="translate(630.12,-432.375)">
+			<title>Square.351</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape352-709" v:mID="352" v:groupContext="shape" transform="translate(576.12,-414.375)">
+			<title>Square.352</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape353-712" v:mID="353" v:groupContext="shape" transform="translate(594.12,-414.375)">
+			<title>Square.353</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape354-715" v:mID="354" v:groupContext="shape" transform="translate(612.12,-414.375)">
+			<title>Square.354</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape355-718" v:mID="355" v:groupContext="shape" transform="translate(630.12,-414.375)">
+			<title>Square.355</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape356-721" v:mID="356" v:groupContext="shape" transform="translate(576.12,-396.375)">
+			<title>Square.356</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape357-724" v:mID="357" v:groupContext="shape" transform="translate(594.12,-396.375)">
+			<title>Square.357</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape358-727" v:mID="358" v:groupContext="shape" transform="translate(612.12,-396.375)">
+			<title>Square.358</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape359-730" v:mID="359" v:groupContext="shape" transform="translate(630.12,-396.375)">
+			<title>Square.359</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape360-733" v:mID="360" v:groupContext="shape" transform="translate(576.12,-378.375)">
+			<title>Square.360</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape361-736" v:mID="361" v:groupContext="shape" transform="translate(594.12,-378.375)">
+			<title>Square.361</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape362-739" v:mID="362" v:groupContext="shape" transform="translate(612.12,-378.375)">
+			<title>Square.362</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape363-742" v:mID="363" v:groupContext="shape" transform="translate(630.12,-378.375)">
+			<title>Square.363</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape364-745" v:mID="364" v:groupContext="shape" transform="translate(576.12,-360.375)">
+			<title>Square.364</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape365-748" v:mID="365" v:groupContext="shape" transform="translate(594.12,-360.375)">
+			<title>Square.365</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape366-751" v:mID="366" v:groupContext="shape" transform="translate(612.12,-360.375)">
+			<title>Square.366</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape367-754" v:mID="367" v:groupContext="shape" transform="translate(630.12,-360.375)">
+			<title>Square.367</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape368-757" v:mID="368" v:groupContext="shape" transform="translate(576.12,-342.375)">
+			<title>Square.368</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape369-760" v:mID="369" v:groupContext="shape" transform="translate(594.12,-342.375)">
+			<title>Square.369</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape370-763" v:mID="370" v:groupContext="shape" transform="translate(612.12,-342.375)">
+			<title>Square.370</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape371-766" v:mID="371" v:groupContext="shape" transform="translate(630.12,-342.375)">
+			<title>Square.371</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape372-769" v:mID="372" v:groupContext="shape" transform="translate(18.12,-180.375)">
+			<title>Square.372</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape373-772" v:mID="373" v:groupContext="shape" transform="translate(36.12,-180.375)">
+			<title>Square.373</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape374-775" v:mID="374" v:groupContext="shape" transform="translate(54.12,-180.375)">
+			<title>Square.374</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape375-778" v:mID="375" v:groupContext="shape" transform="translate(72.12,-180.375)">
+			<title>Square.375</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape376-781" v:mID="376" v:groupContext="shape" transform="translate(18.12,-162.375)">
+			<title>Square.376</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape377-784" v:mID="377" v:groupContext="shape" transform="translate(36.12,-162.375)">
+			<title>Square.377</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape378-787" v:mID="378" v:groupContext="shape" transform="translate(54.12,-162.375)">
+			<title>Square.378</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape379-790" v:mID="379" v:groupContext="shape" transform="translate(72.12,-162.375)">
+			<title>Square.379</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape380-793" v:mID="380" v:groupContext="shape" transform="translate(18.12,-144.375)">
+			<title>Square.380</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape381-796" v:mID="381" v:groupContext="shape" transform="translate(36.12,-144.375)">
+			<title>Square.381</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape382-799" v:mID="382" v:groupContext="shape" transform="translate(54.12,-144.375)">
+			<title>Square.382</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape383-802" v:mID="383" v:groupContext="shape" transform="translate(72.12,-144.375)">
+			<title>Square.383</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape384-805" v:mID="384" v:groupContext="shape" transform="translate(18.12,-126.375)">
+			<title>Square.384</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape385-808" v:mID="385" v:groupContext="shape" transform="translate(36.12,-126.375)">
+			<title>Square.385</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape386-811" v:mID="386" v:groupContext="shape" transform="translate(54.12,-126.375)">
+			<title>Square.386</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape387-814" v:mID="387" v:groupContext="shape" transform="translate(72.12,-126.375)">
+			<title>Square.387</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape388-817" v:mID="388" v:groupContext="shape" transform="translate(18.12,-108.375)">
+			<title>Square.388</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape389-820" v:mID="389" v:groupContext="shape" transform="translate(36.12,-108.375)">
+			<title>Square.389</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape390-823" v:mID="390" v:groupContext="shape" transform="translate(54.12,-108.375)">
+			<title>Square.390</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape391-826" v:mID="391" v:groupContext="shape" transform="translate(72.12,-108.375)">
+			<title>Square.391</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape392-829" v:mID="392" v:groupContext="shape" transform="translate(18.12,-90.375)">
+			<title>Square.392</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape393-832" v:mID="393" v:groupContext="shape" transform="translate(36.12,-90.375)">
+			<title>Square.393</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape394-835" v:mID="394" v:groupContext="shape" transform="translate(54.12,-90.375)">
+			<title>Square.394</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape395-838" v:mID="395" v:groupContext="shape" transform="translate(72.12,-90.375)">
+			<title>Square.395</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape396-841" v:mID="396" v:groupContext="shape" transform="translate(18.12,-72.375)">
+			<title>Square.396</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape397-844" v:mID="397" v:groupContext="shape" transform="translate(36.12,-72.375)">
+			<title>Square.397</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape398-847" v:mID="398" v:groupContext="shape" transform="translate(54.12,-72.375)">
+			<title>Square.398</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape399-850" v:mID="399" v:groupContext="shape" transform="translate(72.12,-72.375)">
+			<title>Square.399</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape400-853" v:mID="400" v:groupContext="shape" transform="translate(18.12,-54.375)">
+			<title>Square.400</title>
+			<desc>14</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text>		</g>
+		<g id="shape401-856" v:mID="401" v:groupContext="shape" transform="translate(36.12,-54.375)">
+			<title>Square.401</title>
+			<desc>13</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text>		</g>
+		<g id="shape402-859" v:mID="402" v:groupContext="shape" transform="translate(54.12,-54.375)">
+			<title>Square.402</title>
+			<desc>12</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text>		</g>
+		<g id="shape403-862" v:mID="403" v:groupContext="shape" transform="translate(72.12,-54.375)">
+			<title>Square.403</title>
+			<desc>11</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text>		</g>
+		<g id="shape404-865" v:mID="404" v:groupContext="shape" transform="translate(90.12,-180.375)">
+			<title>Square.404</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape405-868" v:mID="405" v:groupContext="shape" transform="translate(108.12,-180.375)">
+			<title>Square.405</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape406-871" v:mID="406" v:groupContext="shape" transform="translate(126.12,-180.375)">
+			<title>Square.406</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape407-874" v:mID="407" v:groupContext="shape" transform="translate(144.12,-180.375)">
+			<title>Square.407</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape408-877" v:mID="408" v:groupContext="shape" transform="translate(90.12,-162.375)">
+			<title>Square.408</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape409-880" v:mID="409" v:groupContext="shape" transform="translate(108.12,-162.375)">
+			<title>Square.409</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape410-883" v:mID="410" v:groupContext="shape" transform="translate(126.12,-162.375)">
+			<title>Square.410</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape411-886" v:mID="411" v:groupContext="shape" transform="translate(144.12,-162.375)">
+			<title>Square.411</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape412-889" v:mID="412" v:groupContext="shape" transform="translate(90.12,-144.375)">
+			<title>Square.412</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape413-892" v:mID="413" v:groupContext="shape" transform="translate(108.12,-144.375)">
+			<title>Square.413</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape414-895" v:mID="414" v:groupContext="shape" transform="translate(126.12,-144.375)">
+			<title>Square.414</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape415-898" v:mID="415" v:groupContext="shape" transform="translate(144.12,-144.375)">
+			<title>Square.415</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape416-901" v:mID="416" v:groupContext="shape" transform="translate(90.12,-126.375)">
+			<title>Square.416</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape417-904" v:mID="417" v:groupContext="shape" transform="translate(108.12,-126.375)">
+			<title>Square.417</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape418-907" v:mID="418" v:groupContext="shape" transform="translate(126.12,-126.375)">
+			<title>Square.418</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape419-910" v:mID="419" v:groupContext="shape" transform="translate(144.12,-126.375)">
+			<title>Square.419</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape420-913" v:mID="420" v:groupContext="shape" transform="translate(90.12,-108.375)">
+			<title>Square.420</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape421-916" v:mID="421" v:groupContext="shape" transform="translate(108.12,-108.375)">
+			<title>Square.421</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape422-919" v:mID="422" v:groupContext="shape" transform="translate(126.12,-108.375)">
+			<title>Square.422</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape423-922" v:mID="423" v:groupContext="shape" transform="translate(144.12,-108.375)">
+			<title>Square.423</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape424-925" v:mID="424" v:groupContext="shape" transform="translate(90.12,-90.375)">
+			<title>Square.424</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape425-928" v:mID="425" v:groupContext="shape" transform="translate(108.12,-90.375)">
+			<title>Square.425</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape426-931" v:mID="426" v:groupContext="shape" transform="translate(126.12,-90.375)">
+			<title>Square.426</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape427-934" v:mID="427" v:groupContext="shape" transform="translate(144.12,-90.375)">
+			<title>Square.427</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape428-937" v:mID="428" v:groupContext="shape" transform="translate(90.12,-72.375)">
+			<title>Square.428</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape429-940" v:mID="429" v:groupContext="shape" transform="translate(108.12,-72.375)">
+			<title>Square.429</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape430-943" v:mID="430" v:groupContext="shape" transform="translate(126.12,-72.375)">
+			<title>Square.430</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape431-946" v:mID="431" v:groupContext="shape" transform="translate(144.12,-72.375)">
+			<title>Square.431</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape432-949" v:mID="432" v:groupContext="shape" transform="translate(90.12,-54.375)">
+			<title>Square.432</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape433-952" v:mID="433" v:groupContext="shape" transform="translate(108.12,-54.375)">
+			<title>Square.433</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape434-955" v:mID="434" v:groupContext="shape" transform="translate(126.12,-54.375)">
+			<title>Square.434</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape435-958" v:mID="435" v:groupContext="shape" transform="translate(144.12,-54.375)">
+			<title>Square.435</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape436-961" v:mID="436" v:groupContext="shape" transform="translate(180.12,-180.375)">
+			<title>Square.436</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape437-964" v:mID="437" v:groupContext="shape" transform="translate(198.12,-180.375)">
+			<title>Square.437</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape438-967" v:mID="438" v:groupContext="shape" transform="translate(216.12,-180.375)">
+			<title>Square.438</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape439-970" v:mID="439" v:groupContext="shape" transform="translate(234.12,-180.375)">
+			<title>Square.439</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape440-973" v:mID="440" v:groupContext="shape" transform="translate(180.12,-162.375)">
+			<title>Square.440</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape441-976" v:mID="441" v:groupContext="shape" transform="translate(198.12,-162.375)">
+			<title>Square.441</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape442-979" v:mID="442" v:groupContext="shape" transform="translate(216.12,-162.375)">
+			<title>Square.442</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape443-982" v:mID="443" v:groupContext="shape" transform="translate(234.12,-162.375)">
+			<title>Square.443</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape444-985" v:mID="444" v:groupContext="shape" transform="translate(180.12,-144.375)">
+			<title>Square.444</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape445-988" v:mID="445" v:groupContext="shape" transform="translate(198.12,-144.375)">
+			<title>Square.445</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape446-991" v:mID="446" v:groupContext="shape" transform="translate(216.12,-144.375)">
+			<title>Square.446</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape447-994" v:mID="447" v:groupContext="shape" transform="translate(234.12,-144.375)">
+			<title>Square.447</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape448-997" v:mID="448" v:groupContext="shape" transform="translate(180.12,-126.375)">
+			<title>Square.448</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape449-1000" v:mID="449" v:groupContext="shape" transform="translate(198.12,-126.375)">
+			<title>Square.449</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape450-1003" v:mID="450" v:groupContext="shape" transform="translate(216.12,-126.375)">
+			<title>Square.450</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape451-1006" v:mID="451" v:groupContext="shape" transform="translate(234.12,-126.375)">
+			<title>Square.451</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape452-1009" v:mID="452" v:groupContext="shape" transform="translate(180.12,-108.375)">
+			<title>Square.452</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape453-1012" v:mID="453" v:groupContext="shape" transform="translate(198.12,-108.375)">
+			<title>Square.453</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape454-1015" v:mID="454" v:groupContext="shape" transform="translate(216.12,-108.375)">
+			<title>Square.454</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape455-1018" v:mID="455" v:groupContext="shape" transform="translate(234.12,-108.375)">
+			<title>Square.455</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape456-1021" v:mID="456" v:groupContext="shape" transform="translate(180.12,-90.375)">
+			<title>Square.456</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape457-1024" v:mID="457" v:groupContext="shape" transform="translate(198.12,-90.375)">
+			<title>Square.457</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape458-1027" v:mID="458" v:groupContext="shape" transform="translate(216.12,-90.375)">
+			<title>Square.458</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape459-1030" v:mID="459" v:groupContext="shape" transform="translate(234.12,-90.375)">
+			<title>Square.459</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape460-1033" v:mID="460" v:groupContext="shape" transform="translate(180.12,-72.375)">
+			<title>Square.460</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape461-1036" v:mID="461" v:groupContext="shape" transform="translate(198.12,-72.375)">
+			<title>Square.461</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape462-1039" v:mID="462" v:groupContext="shape" transform="translate(216.12,-72.375)">
+			<title>Square.462</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape463-1042" v:mID="463" v:groupContext="shape" transform="translate(234.12,-72.375)">
+			<title>Square.463</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape464-1045" v:mID="464" v:groupContext="shape" transform="translate(180.12,-54.375)">
+			<title>Square.464</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape465-1048" v:mID="465" v:groupContext="shape" transform="translate(198.12,-54.375)">
+			<title>Square.465</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape466-1051" v:mID="466" v:groupContext="shape" transform="translate(216.12,-54.375)">
+			<title>Square.466</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape467-1054" v:mID="467" v:groupContext="shape" transform="translate(234.12,-54.375)">
+			<title>Square.467</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape468-1057" v:mID="468" v:groupContext="shape" transform="translate(252.12,-180.375)">
+			<title>Square.468</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape469-1060" v:mID="469" v:groupContext="shape" transform="translate(270.12,-180.375)">
+			<title>Square.469</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape470-1063" v:mID="470" v:groupContext="shape" transform="translate(288.12,-180.375)">
+			<title>Square.470</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape471-1066" v:mID="471" v:groupContext="shape" transform="translate(306.12,-180.375)">
+			<title>Square.471</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape472-1069" v:mID="472" v:groupContext="shape" transform="translate(252.12,-162.375)">
+			<title>Square.472</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape473-1072" v:mID="473" v:groupContext="shape" transform="translate(270.12,-162.375)">
+			<title>Square.473</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape474-1075" v:mID="474" v:groupContext="shape" transform="translate(288.12,-162.375)">
+			<title>Square.474</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape475-1078" v:mID="475" v:groupContext="shape" transform="translate(306.12,-162.375)">
+			<title>Square.475</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape476-1081" v:mID="476" v:groupContext="shape" transform="translate(252.12,-144.375)">
+			<title>Square.476</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape477-1084" v:mID="477" v:groupContext="shape" transform="translate(270.12,-144.375)">
+			<title>Square.477</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape478-1087" v:mID="478" v:groupContext="shape" transform="translate(288.12,-144.375)">
+			<title>Square.478</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape479-1090" v:mID="479" v:groupContext="shape" transform="translate(306.12,-144.375)">
+			<title>Square.479</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape480-1093" v:mID="480" v:groupContext="shape" transform="translate(252.12,-126.375)">
+			<title>Square.480</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape481-1096" v:mID="481" v:groupContext="shape" transform="translate(270.12,-126.375)">
+			<title>Square.481</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape482-1099" v:mID="482" v:groupContext="shape" transform="translate(288.12,-126.375)">
+			<title>Square.482</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape483-1102" v:mID="483" v:groupContext="shape" transform="translate(306.12,-126.375)">
+			<title>Square.483</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape484-1105" v:mID="484" v:groupContext="shape" transform="translate(252.12,-108.375)">
+			<title>Square.484</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape485-1108" v:mID="485" v:groupContext="shape" transform="translate(270.12,-108.375)">
+			<title>Square.485</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape486-1111" v:mID="486" v:groupContext="shape" transform="translate(288.12,-108.375)">
+			<title>Square.486</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape487-1114" v:mID="487" v:groupContext="shape" transform="translate(306.12,-108.375)">
+			<title>Square.487</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape488-1117" v:mID="488" v:groupContext="shape" transform="translate(252.12,-90.375)">
+			<title>Square.488</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape489-1120" v:mID="489" v:groupContext="shape" transform="translate(270.12,-90.375)">
+			<title>Square.489</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape490-1123" v:mID="490" v:groupContext="shape" transform="translate(288.12,-90.375)">
+			<title>Square.490</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape491-1126" v:mID="491" v:groupContext="shape" transform="translate(306.12,-90.375)">
+			<title>Square.491</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape492-1129" v:mID="492" v:groupContext="shape" transform="translate(252.12,-72.375)">
+			<title>Square.492</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape493-1132" v:mID="493" v:groupContext="shape" transform="translate(270.12,-72.375)">
+			<title>Square.493</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape494-1135" v:mID="494" v:groupContext="shape" transform="translate(288.12,-72.375)">
+			<title>Square.494</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape495-1138" v:mID="495" v:groupContext="shape" transform="translate(306.12,-72.375)">
+			<title>Square.495</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape496-1141" v:mID="496" v:groupContext="shape" transform="translate(252.12,-54.375)">
+			<title>Square.496</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape497-1144" v:mID="497" v:groupContext="shape" transform="translate(270.12,-54.375)">
+			<title>Square.497</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape498-1147" v:mID="498" v:groupContext="shape" transform="translate(288.12,-54.375)">
+			<title>Square.498</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape499-1150" v:mID="499" v:groupContext="shape" transform="translate(306.12,-54.375)">
+			<title>Square.499</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape500-1153" v:mID="500" v:groupContext="shape" transform="translate(342.12,-180.375)">
+			<title>Square.500</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape501-1156" v:mID="501" v:groupContext="shape" transform="translate(360.12,-180.375)">
+			<title>Square.501</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape502-1159" v:mID="502" v:groupContext="shape" transform="translate(378.12,-180.375)">
+			<title>Square.502</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape503-1162" v:mID="503" v:groupContext="shape" transform="translate(396.12,-180.375)">
+			<title>Square.503</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape504-1165" v:mID="504" v:groupContext="shape" transform="translate(342.12,-162.375)">
+			<title>Square.504</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape505-1168" v:mID="505" v:groupContext="shape" transform="translate(360.12,-162.375)">
+			<title>Square.505</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape506-1171" v:mID="506" v:groupContext="shape" transform="translate(378.12,-162.375)">
+			<title>Square.506</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape507-1174" v:mID="507" v:groupContext="shape" transform="translate(396.12,-162.375)">
+			<title>Square.507</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape508-1177" v:mID="508" v:groupContext="shape" transform="translate(342.12,-144.375)">
+			<title>Square.508</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape509-1180" v:mID="509" v:groupContext="shape" transform="translate(360.12,-144.375)">
+			<title>Square.509</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape510-1183" v:mID="510" v:groupContext="shape" transform="translate(378.12,-144.375)">
+			<title>Square.510</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape511-1186" v:mID="511" v:groupContext="shape" transform="translate(396.12,-144.375)">
+			<title>Square.511</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape512-1189" v:mID="512" v:groupContext="shape" transform="translate(342.12,-126.375)">
+			<title>Square.512</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape513-1192" v:mID="513" v:groupContext="shape" transform="translate(360.12,-126.375)">
+			<title>Square.513</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape514-1195" v:mID="514" v:groupContext="shape" transform="translate(378.12,-126.375)">
+			<title>Square.514</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape515-1198" v:mID="515" v:groupContext="shape" transform="translate(396.12,-126.375)">
+			<title>Square.515</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape516-1201" v:mID="516" v:groupContext="shape" transform="translate(342.12,-108.375)">
+			<title>Square.516</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape517-1204" v:mID="517" v:groupContext="shape" transform="translate(360.12,-108.375)">
+			<title>Square.517</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape518-1207" v:mID="518" v:groupContext="shape" transform="translate(378.12,-108.375)">
+			<title>Square.518</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape519-1210" v:mID="519" v:groupContext="shape" transform="translate(396.12,-108.375)">
+			<title>Square.519</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape520-1213" v:mID="520" v:groupContext="shape" transform="translate(342.12,-90.375)">
+			<title>Square.520</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape521-1216" v:mID="521" v:groupContext="shape" transform="translate(360.12,-90.375)">
+			<title>Square.521</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape522-1219" v:mID="522" v:groupContext="shape" transform="translate(378.12,-90.375)">
+			<title>Square.522</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape523-1222" v:mID="523" v:groupContext="shape" transform="translate(396.12,-90.375)">
+			<title>Square.523</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape524-1225" v:mID="524" v:groupContext="shape" transform="translate(342.12,-72.375)">
+			<title>Square.524</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape525-1228" v:mID="525" v:groupContext="shape" transform="translate(360.12,-72.375)">
+			<title>Square.525</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape526-1231" v:mID="526" v:groupContext="shape" transform="translate(378.12,-72.375)">
+			<title>Square.526</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape527-1234" v:mID="527" v:groupContext="shape" transform="translate(396.12,-72.375)">
+			<title>Square.527</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape528-1237" v:mID="528" v:groupContext="shape" transform="translate(342.12,-54.375)">
+			<title>Square.528</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape529-1240" v:mID="529" v:groupContext="shape" transform="translate(360.12,-54.375)">
+			<title>Square.529</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape530-1243" v:mID="530" v:groupContext="shape" transform="translate(378.12,-54.375)">
+			<title>Square.530</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape531-1246" v:mID="531" v:groupContext="shape" transform="translate(396.12,-54.375)">
+			<title>Square.531</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape532-1249" v:mID="532" v:groupContext="shape" transform="translate(414.12,-180.375)">
+			<title>Square.532</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape533-1252" v:mID="533" v:groupContext="shape" transform="translate(432.12,-180.375)">
+			<title>Square.533</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape534-1255" v:mID="534" v:groupContext="shape" transform="translate(450.12,-180.375)">
+			<title>Square.534</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape535-1258" v:mID="535" v:groupContext="shape" transform="translate(468.12,-180.375)">
+			<title>Square.535</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape536-1261" v:mID="536" v:groupContext="shape" transform="translate(414.12,-162.375)">
+			<title>Square.536</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape537-1264" v:mID="537" v:groupContext="shape" transform="translate(432.12,-162.375)">
+			<title>Square.537</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape538-1267" v:mID="538" v:groupContext="shape" transform="translate(450.12,-162.375)">
+			<title>Square.538</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape539-1270" v:mID="539" v:groupContext="shape" transform="translate(468.12,-162.375)">
+			<title>Square.539</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape540-1273" v:mID="540" v:groupContext="shape" transform="translate(414.12,-144.375)">
+			<title>Square.540</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape541-1276" v:mID="541" v:groupContext="shape" transform="translate(432.12,-144.375)">
+			<title>Square.541</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape542-1279" v:mID="542" v:groupContext="shape" transform="translate(450.12,-144.375)">
+			<title>Square.542</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape543-1282" v:mID="543" v:groupContext="shape" transform="translate(468.12,-144.375)">
+			<title>Square.543</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape544-1285" v:mID="544" v:groupContext="shape" transform="translate(414.12,-126.375)">
+			<title>Square.544</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape545-1288" v:mID="545" v:groupContext="shape" transform="translate(432.12,-126.375)">
+			<title>Square.545</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape546-1291" v:mID="546" v:groupContext="shape" transform="translate(450.12,-126.375)">
+			<title>Square.546</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape547-1294" v:mID="547" v:groupContext="shape" transform="translate(468.12,-126.375)">
+			<title>Square.547</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape548-1297" v:mID="548" v:groupContext="shape" transform="translate(414.12,-108.375)">
+			<title>Square.548</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape549-1300" v:mID="549" v:groupContext="shape" transform="translate(432.12,-108.375)">
+			<title>Square.549</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape550-1303" v:mID="550" v:groupContext="shape" transform="translate(450.12,-108.375)">
+			<title>Square.550</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape551-1306" v:mID="551" v:groupContext="shape" transform="translate(468.12,-108.375)">
+			<title>Square.551</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape552-1309" v:mID="552" v:groupContext="shape" transform="translate(414.12,-90.375)">
+			<title>Square.552</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape553-1312" v:mID="553" v:groupContext="shape" transform="translate(432.12,-90.375)">
+			<title>Square.553</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape554-1315" v:mID="554" v:groupContext="shape" transform="translate(450.12,-90.375)">
+			<title>Square.554</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape555-1318" v:mID="555" v:groupContext="shape" transform="translate(468.12,-90.375)">
+			<title>Square.555</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape556-1321" v:mID="556" v:groupContext="shape" transform="translate(414.12,-72.375)">
+			<title>Square.556</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape557-1324" v:mID="557" v:groupContext="shape" transform="translate(432.12,-72.375)">
+			<title>Square.557</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape558-1327" v:mID="558" v:groupContext="shape" transform="translate(450.12,-72.375)">
+			<title>Square.558</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape559-1330" v:mID="559" v:groupContext="shape" transform="translate(468.12,-72.375)">
+			<title>Square.559</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape560-1333" v:mID="560" v:groupContext="shape" transform="translate(414.12,-54.375)">
+			<title>Square.560</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape561-1336" v:mID="561" v:groupContext="shape" transform="translate(432.12,-54.375)">
+			<title>Square.561</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape562-1339" v:mID="562" v:groupContext="shape" transform="translate(450.12,-54.375)">
+			<title>Square.562</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape563-1342" v:mID="563" v:groupContext="shape" transform="translate(468.12,-54.375)">
+			<title>Square.563</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape564-1345" v:mID="564" v:groupContext="shape" transform="translate(504.12,-180.375)">
+			<title>Square.564</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape565-1348" v:mID="565" v:groupContext="shape" transform="translate(522.12,-180.375)">
+			<title>Square.565</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape566-1351" v:mID="566" v:groupContext="shape" transform="translate(540.12,-180.375)">
+			<title>Square.566</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape567-1354" v:mID="567" v:groupContext="shape" transform="translate(558.12,-180.375)">
+			<title>Square.567</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape568-1357" v:mID="568" v:groupContext="shape" transform="translate(504.12,-162.375)">
+			<title>Square.568</title>
+			<desc>0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape569-1360" v:mID="569" v:groupContext="shape" transform="translate(522.12,-162.375)">
+			<title>Square.569</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape570-1363" v:mID="570" v:groupContext="shape" transform="translate(540.12,-162.375)">
+			<title>Square.570</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape571-1366" v:mID="571" v:groupContext="shape" transform="translate(558.12,-162.375)">
+			<title>Square.571</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape572-1369" v:mID="572" v:groupContext="shape" transform="translate(504.12,-144.375)">
+			<title>Square.572</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape573-1372" v:mID="573" v:groupContext="shape" transform="translate(522.12,-144.375)">
+			<title>Square.573</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape574-1375" v:mID="574" v:groupContext="shape" transform="translate(540.12,-144.375)">
+			<title>Square.574</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape575-1378" v:mID="575" v:groupContext="shape" transform="translate(558.12,-144.375)">
+			<title>Square.575</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape576-1381" v:mID="576" v:groupContext="shape" transform="translate(504.12,-126.375)">
+			<title>Square.576</title>
+			<desc>1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape577-1384" v:mID="577" v:groupContext="shape" transform="translate(522.12,-126.375)">
+			<title>Square.577</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape578-1387" v:mID="578" v:groupContext="shape" transform="translate(540.12,-126.375)">
+			<title>Square.578</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape579-1390" v:mID="579" v:groupContext="shape" transform="translate(558.12,-126.375)">
+			<title>Square.579</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape580-1393" v:mID="580" v:groupContext="shape" transform="translate(504.12,-108.375)">
+			<title>Square.580</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape581-1396" v:mID="581" v:groupContext="shape" transform="translate(522.12,-108.375)">
+			<title>Square.581</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape582-1399" v:mID="582" v:groupContext="shape" transform="translate(540.12,-108.375)">
+			<title>Square.582</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape583-1402" v:mID="583" v:groupContext="shape" transform="translate(558.12,-108.375)">
+			<title>Square.583</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape584-1405" v:mID="584" v:groupContext="shape" transform="translate(504.12,-90.375)">
+			<title>Square.584</title>
+			<desc>2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape585-1408" v:mID="585" v:groupContext="shape" transform="translate(522.12,-90.375)">
+			<title>Square.585</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape586-1411" v:mID="586" v:groupContext="shape" transform="translate(540.12,-90.375)">
+			<title>Square.586</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape587-1414" v:mID="587" v:groupContext="shape" transform="translate(558.12,-90.375)">
+			<title>Square.587</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape588-1417" v:mID="588" v:groupContext="shape" transform="translate(504.12,-72.375)">
+			<title>Square.588</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape589-1420" v:mID="589" v:groupContext="shape" transform="translate(522.12,-72.375)">
+			<title>Square.589</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape590-1423" v:mID="590" v:groupContext="shape" transform="translate(540.12,-72.375)">
+			<title>Square.590</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape591-1426" v:mID="591" v:groupContext="shape" transform="translate(558.12,-72.375)">
+			<title>Square.591</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape592-1429" v:mID="592" v:groupContext="shape" transform="translate(504.12,-54.375)">
+			<title>Square.592</title>
+			<desc>3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape593-1432" v:mID="593" v:groupContext="shape" transform="translate(522.12,-54.375)">
+			<title>Square.593</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape594-1435" v:mID="594" v:groupContext="shape" transform="translate(540.12,-54.375)">
+			<title>Square.594</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape595-1438" v:mID="595" v:groupContext="shape" transform="translate(558.12,-54.375)">
+			<title>Square.595</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape596-1441" v:mID="596" v:groupContext="shape" transform="translate(576.12,-180.375)">
+			<title>Square.596</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape597-1444" v:mID="597" v:groupContext="shape" transform="translate(594.12,-180.375)">
+			<title>Square.597</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape598-1447" v:mID="598" v:groupContext="shape" transform="translate(612.12,-180.375)">
+			<title>Square.598</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape599-1450" v:mID="599" v:groupContext="shape" transform="translate(630.12,-180.375)">
+			<title>Square.599</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape600-1453" v:mID="600" v:groupContext="shape" transform="translate(576.12,-162.375)">
+			<title>Square.600</title>
+			<desc>4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape601-1456" v:mID="601" v:groupContext="shape" transform="translate(594.12,-162.375)">
+			<title>Square.601</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape602-1459" v:mID="602" v:groupContext="shape" transform="translate(612.12,-162.375)">
+			<title>Square.602</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape603-1462" v:mID="603" v:groupContext="shape" transform="translate(630.12,-162.375)">
+			<title>Square.603</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape604-1465" v:mID="604" v:groupContext="shape" transform="translate(576.12,-144.375)">
+			<title>Square.604</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape605-1468" v:mID="605" v:groupContext="shape" transform="translate(594.12,-144.375)">
+			<title>Square.605</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape606-1471" v:mID="606" v:groupContext="shape" transform="translate(612.12,-144.375)">
+			<title>Square.606</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape607-1474" v:mID="607" v:groupContext="shape" transform="translate(630.12,-144.375)">
+			<title>Square.607</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape608-1477" v:mID="608" v:groupContext="shape" transform="translate(576.12,-126.375)">
+			<title>Square.608</title>
+			<desc>5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text>		</g>
+		<g id="shape609-1480" v:mID="609" v:groupContext="shape" transform="translate(594.12,-126.375)">
+			<title>Square.609</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape610-1483" v:mID="610" v:groupContext="shape" transform="translate(612.12,-126.375)">
+			<title>Square.610</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape611-1486" v:mID="611" v:groupContext="shape" transform="translate(630.12,-126.375)">
+			<title>Square.611</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape612-1489" v:mID="612" v:groupContext="shape" transform="translate(576.12,-108.375)">
+			<title>Square.612</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape613-1492" v:mID="613" v:groupContext="shape" transform="translate(594.12,-108.375)">
+			<title>Square.613</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape614-1495" v:mID="614" v:groupContext="shape" transform="translate(612.12,-108.375)">
+			<title>Square.614</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape615-1498" v:mID="615" v:groupContext="shape" transform="translate(630.12,-108.375)">
+			<title>Square.615</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape616-1501" v:mID="616" v:groupContext="shape" transform="translate(576.12,-90.375)">
+			<title>Square.616</title>
+			<desc>6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text>		</g>
+		<g id="shape617-1504" v:mID="617" v:groupContext="shape" transform="translate(594.12,-90.375)">
+			<title>Square.617</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape618-1507" v:mID="618" v:groupContext="shape" transform="translate(612.12,-90.375)">
+			<title>Square.618</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape619-1510" v:mID="619" v:groupContext="shape" transform="translate(630.12,-90.375)">
+			<title>Square.619</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape620-1513" v:mID="620" v:groupContext="shape" transform="translate(576.12,-72.375)">
+			<title>Square.620</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape621-1516" v:mID="621" v:groupContext="shape" transform="translate(594.12,-72.375)">
+			<title>Square.621</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape622-1519" v:mID="622" v:groupContext="shape" transform="translate(612.12,-72.375)">
+			<title>Square.622</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape623-1522" v:mID="623" v:groupContext="shape" transform="translate(630.12,-72.375)">
+			<title>Square.623</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape624-1525" v:mID="624" v:groupContext="shape" transform="translate(576.12,-54.375)">
+			<title>Square.624</title>
+			<desc>7</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text>		</g>
+		<g id="shape625-1528" v:mID="625" v:groupContext="shape" transform="translate(594.12,-54.375)">
+			<title>Square.625</title>
+			<desc>8</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text>		</g>
+		<g id="shape626-1531" v:mID="626" v:groupContext="shape" transform="translate(612.12,-54.375)">
+			<title>Square.626</title>
+			<desc>9</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st3"/>
+			<text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text>		</g>
+		<g id="shape627-1534" v:mID="627" v:groupContext="shape" transform="translate(630.12,-54.375)">
+			<title>Square.627</title>
+			<desc>10</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+			<rect x="0" y="561.267" width="18" height="18" class="st1"/>
+			<text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text>		</g>
+		<g id="shape630-1537" v:mID="630" v:groupContext="shape" transform="translate(472.189,-335.711) rotate(45)">
+			<title>Sheet.630</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid1">
+						<rect width="89.024" height="228.01" id="mfid2"/>
+					</clipPath>
+					<g clip-path="url(#mfid1)">
+						<mask id="mfid3">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid4" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid3)">
+									<use xlink:href="#mfid2"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid5" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid4)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid6">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid5" clip-path="url(#mfid6)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape631-1540" v:mID="631" v:groupContext="shape" transform="translate(773.187,-98.8741) rotate(75)">
+			<title>Sheet.631</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid7">
+						<rect width="89.024" height="228.01" id="mfid8"/>
+					</clipPath>
+					<g clip-path="url(#mfid7)">
+						<mask id="mfid9">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid10" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid9)">
+									<use xlink:href="#mfid8"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid11" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid10)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid12">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid11" clip-path="url(#mfid12)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape632-1543" v:mID="632" v:groupContext="shape" transform="translate(950.873,41.6775) rotate(90)">
+			<title>Sheet.632</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid13">
+						<rect width="89.024" height="228.01" id="mfid14"/>
+					</clipPath>
+					<g clip-path="url(#mfid13)">
+						<mask id="mfid15">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid16" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid15)">
+									<use xlink:href="#mfid14"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid17" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid16)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid18">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid17" clip-path="url(#mfid18)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape633-1546" v:mID="633" v:groupContext="shape" transform="translate(1104.93,181.961) rotate(105)">
+			<title>Sheet.633</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid19">
+						<rect width="89.024" height="228.01" id="mfid20"/>
+					</clipPath>
+					<g clip-path="url(#mfid19)">
+						<mask id="mfid21">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid22" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid21)">
+									<use xlink:href="#mfid20"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid23" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid22)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid24">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid23" clip-path="url(#mfid24)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape634-1549" v:mID="634" v:groupContext="shape" transform="translate(570.995,596.312) rotate(120)">
+			<title>Sheet.634</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid25">
+						<rect width="89.024" height="228.01" id="mfid26"/>
+					</clipPath>
+					<g clip-path="url(#mfid25)">
+						<mask id="mfid27">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid28" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid27)">
+									<use xlink:href="#mfid26"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid29" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid28)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid30">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid29" clip-path="url(#mfid30)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape635-1552" v:mID="635" v:groupContext="shape" transform="translate(538.497,799.539) rotate(150)">
+			<title>Sheet.635</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid31">
+						<rect width="89.024" height="228.01" id="mfid32"/>
+					</clipPath>
+					<g clip-path="url(#mfid31)">
+						<mask id="mfid33">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid34" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid33)">
+									<use xlink:href="#mfid32"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid35" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid34)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid36">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid35" clip-path="url(#mfid36)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape636-1555" v:mID="636" v:groupContext="shape" transform="translate(398.905,-202.875)">
+			<title>Sheet.636</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid37">
+						<rect width="89.024" height="228.01" id="mfid38"/>
+					</clipPath>
+					<g clip-path="url(#mfid37)">
+						<mask id="mfid39">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid40" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid39)">
+									<use xlink:href="#mfid38"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid41" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid40)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid42">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid41" clip-path="url(#mfid42)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape637-1558" v:mID="637" v:groupContext="shape" transform="translate(838.754,-138.135) rotate(30)">
+			<title>Sheet.637</title>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+			<switch>
+				<foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+						requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+					<v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+								ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+								pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+								upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+								fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+								WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+								5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+								CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+								54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+								bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+								I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+								s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+								5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+								s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+								9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+								6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+								ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+								n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+								TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+								qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+								kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+								2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+								4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+								Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+								l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+								XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+								31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+								iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+								B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+								x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+								dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+								GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+								x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+								cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+								Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+								ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+								++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+								7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+								G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+								qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+								/zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+								8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+								Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+								l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+								efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+								2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+								DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+								OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+								WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+								r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+								ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+								Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+								6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+								H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+								41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+								LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+								EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+								2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+								A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+								7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+								L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+								QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+								qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+								jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+								IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+								9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+								oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+								TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+								dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+								kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+								YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+								FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+								BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+								gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+								emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+								spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+								7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+								tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+								iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+								9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+								uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+								viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+								iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+								xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+								gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+								3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+								mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+								yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+								mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+								wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+								/dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+								e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+								Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+								a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+								4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+								wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+								WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+								W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+				</foreignObject>
+				<svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+					<clipPath id="mfid43">
+						<rect width="89.024" height="228.01" id="mfid44"/>
+					</clipPath>
+					<g clip-path="url(#mfid43)">
+						<mask id="mfid45">
+							<rect width="90" height="229" fill="white" stroke="none"/>
+						</mask>
+						<mask id="mfid46" fill="white" stroke="none">
+							<g>
+								<g mask="url(#mfid45)">
+									<use xlink:href="#mfid44"/>
+								</g>
+							</g>
+						</mask>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode  -->
+						<defs>
+							<image id="mfid47" width="90" height="229" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAADlCAYAAADTNOCiAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnWARjRyu0AAA7uSURBVHhe7dxJs11VGcZxhg79BvoJwInObGZ2A6oyBJuBOlS/gIC9jghdQmcIkJCOJvQhBAKRQCANAR3Qgw29WhazVDk4vu+967lZ+z3P2nut3Zy777v3qnoHTEjqV/96zoE0l2y194UfXvO5yy7f/vnwj/Mb4q0hb9t+5NJt156fsQd6QJZb6M3YA7wY+StX7lx89Ue7Zuy+n0X+44lPF9tPfzZj9/kY8p/eWix2yV03Y/fz6pD17pixuz+L/IeAbKF3y12v2D+esYsfQ75dQC0yoHe/vVjcMGOXPYv8+2cvIlNoQb4z3I0zdt5LIVtoWzOg75K7acaufwz5tjfXkZtmA8h6d8/Y6WeRf0eQLTSrGdB75HYI9tdm7IsvhZyCrpsNRQb0XrkdZ2bstceQbw3IgGY1A5rVvAH9zvrtnDq2Rf5tQAZ0l9kA9D3hbp4qdoz8ZYJsoVnN7EMwng1A7wt3y9SwGfItAdlCJ2dDIHNrVuT94W49OxHsJeRn1pEBzWoGNKsZ0HHNSeh3F4sDcrd5x2bIN79xEXnI2VBoRdY7KHe7V+w6ZAvNagZ06YegrVmR9Q55xLbIvwnIFrqP2UjVzKDvfU/+/ec+W3zdAzZD3kmQLTSrGdCs5g1oQW2aDUUGtAtsi/zrCHmzZwPQ94XbtVWxU8gWmtWcMxu0Zr0IOadmQN8vd8fLWwybIe8IyJ1mQyCbZmOpZrlUzYBWZL0H/raFsC3yr44vI5fMBqsZ0DFyyWywmgF9WG732LFTyCloVjOgl2rWE8yhZkORAa1351ixGfJNr68j58wGqxnQQ34IxjXH0A/K3XV+ZNgW+ZcBGdCsZkCzmktno/S7c91sAFnvob+PCJsh3xgh59QMaFZz1mwIap+zAWS9h+Xu3mzsOmQLzWoe42zENQNab49gf2MzsFPIdTV3nQ1as16EzGqumw1WM6CB/Ei4Pa+sGLuCfMU68g0Rcq+zIZCs5pzZYDUDmtXMZiOGfvQf8mOtCrsOua/ZYDV3nQ1Wc+5sAFnvsVVgW+RrArKFZjUDmtUM6KWa9QSzy2ywmtvMRgytd89Q2Az5+tfWkdvOBqs5ZzZKa+46Gxb58XD7Xu0Z2yJfTZAtNKsZ0KzmnNmIa46h45r7+O4M6FTNa9D/XCyOyPWGnUJOQefW3Hk2BLXv2WA1W2jUDGi9/V2xGfJ1ARnQrGZAs5pzZqPPD0FAs5oBzWpumg0gPxHugGL/pAW2Rb4qIAN6VbNBa9aLkFnNdbPBam4zGzH00fdbYDPk7RGyhWY158wGqxnQrOaus8FqzpmNppoBrXcwF9si/yJCnmeDQx8N0E+GO9SEHSMPcWw2WM1dZ4PVnDMb7Ofcx1Wwh0bWy5oNgSyuWS5Vc8lssJ9zX1f0Adn24QdLzQarGdBxzSWzwWoGNKs5hg4/7a33UtBjmg3dZTfQrGZAj+FD0DV07mx0/e7cNBvuoFnNgGY1b0ALapvZYDUDOkaeBPQYZsMlNKt5Mz8E9fQ/UNxAJ2dDIItrlkvV3GY23EKzmgEd11wyG6xmQLOaAY3/r+EKeqyz4Rp6LB+Ciqz/88gddO5srOK7M2p2Bc1mg9W8AS2obWaD1QxoVrNr6LHNhjtoNhu0Zr0IObdmQLOaAc1q9gstkMU1y6Vq7job+isobqBZzYCOay6ZDVYzoFnNgLY1u4cey2y4gx7jhyB+8dU9dGnNQ8yGS2hW8wa0oLaZDVYzoFnNgAay/jYCN9Bjng230LRmvQg5t2ZAs5oBzWr2Cy2QxTXLpWruczZcQ8c1l8wGqxnQrGZA19Wsv3HRHfSYZgM1u4Iuno0I2UKzmgHNagY0rVlOf9OiS+jSmoeeDXfQdDYEtc1ssJoBzWoGNKvZNfSYZsMVNK1ZL0LOrRnQrGZAs5pTs+EWurZmuVTNQ82G/s59V9BxzSWzwWoGNKsZ0Lk1u4Ye02y4hC6tGdCsZkCzmgFNkeViZFfQpTWvcjb8Qgtqm9lgNQOa1QzoVM1uocc6G/pnBd1Bj+1DcA36fUfQtTXLpWoeajaAPC1oQbXQrGZAs5oBXVyznjfosc6GP+iA3FQzoFnNgGY1A5oiy6VqdgU9xu/O8d9M4AdaUNvMBqsZ0KxmQKdqrkCHmt1Bj3E28PdsuIMuqRnQrGZAs5qzZkNwXUIv1RygWc1DzQaQpwUtqBaa1QxoVjOgi2vWi5CPeYMe62y4hE7VDGhWM6BZzYCmyHKpmt1CxzWPbTbcQjfNBqsZ0KxmQNuac2fj2AfOoMcyG0B2DR3XXDcbrGZAs5qzZkNQ2Ww85Ql6reYAzWruezYschJakP1BC2pOzYBmNWfNRkCu1Kxnkb1Cx7PBat6s2fAFHSFbaFZzzmxQZLlUzanZcA/ddjaKa9azyHJAftoTdJfZYDUD2tZc8t3ZLXTpbLCa284GkCcHnVtz59kQ1KbZcAnNZoPV3GY2LHISOiC7hG6qufNsBORKzXoWeQrQrOac2WA19zkbx6cAXTobFFnOIiehA3IF+kOH0KxmQLOaAV1cs55FlrPI7qBLZ4PVbKFRc9vvzmvIel6hWc2AZjWXzkbud2fU7A56rLPhGprVnDMbqZor0IJaMhvPeIRe9Wyg5rrZcAXNZoPVDGhWc9ZsCGrObMQ1u4XuezaAnIQOyJOBZrPBagZ0cc16FlkOyBVowZ0EdNvZQM19zoYraFYzoFnNgKY1yzXVvAEdkOtqftY79CpnA8iTgGY1A5rVDOhUzRVoQV2CDsiVmvUi5MlADzEbqDl3NlxCs5oBzWrOmg1BzZmNVM2uoPueDSAnoQNyHTSQ3UKzmgFdXLOeRZYDcgVaQCcB3XY2UPNQs3FCzi00qxnQtGa5VM1ZsyGYqZpdQq9yNoA8KWg2G6xmQKdqrkALalbNeoIJ5Bhakd1C9zkbtuY2s3HiI2fQbDZYzVmzIag5s4GaGTRqdg3dZjaAnIQOyAy6rmZX0KxmQBfXrGeR5YBcgRbMJug/e4eumw3UPOR3Z7fQrGZA05rlUjVnzYZA5szGJKD7ng0g50IrsktoVjOgUzVXoAU1q2Y9wQRyDG1nwxV0n7Nha247G0B2C81qzpoNQWXQqZoZNKvZHXSb2QByEjogM+i6mmPo5zxDF9esZ5HlgFyBFsyS2XAJXTcbqHlV351dQrPZoDXLpWpegg7IpTXb2XAL3fdsADkX2tbsDprNRqrmCrSgZtWsJ5hAjqHZbADZNXTpbAC5Flog29R8Us4NNKs5azYENWc22nx3ngx0U81J6IDMoOtqTs2GO+jimvUsshyQK9CC2XY2ZmhBLZkNIMfQObPhEpoiy6VqzpoNgewyGyc/dgTd92wAORe6rubnvUKnaq5AC2pWzXqCCeQYOms2BNkddOlsALkWWiC7zsY0oANyY81yqZq7zoZL6Kaak9ABmUEDOYYuqdkVdHbNehZZDsgVaMHso+ZpQgtqyWwka9YTzBzoF7xBV5DlUjVnzYZANtWcOxu+oQMyg26qORc6t2ZX0KmaK9CCmlWznmACOYbOmo2APCnoVM210ALZ52z4hQ7IjTXLpWruczZOeYNO1ZyEDsgMGsgxdNuafUIH5ErNehZZDsgVaMHso+YK9CdTgRbUktlI1qwnmDnQcc2uoFOzsZnfnVGzP+iAzKCbas6FLql5WtCCmlWznmACOYbOmo2AXKlZT5Bf9AbdVHMttEAONRu+oANyY81yqZr7nA3U7A7aIiehAzKDBnIM3bpmvYDsGjr1Idh2NupqbpoN39CCWjIbyZr1BLMOuqlmd9C25qzZEMimmotnQ2Bj6Je8QzfVnAtdUvO0oAU1q2Y9wQRyDJ01GwG5UrOeQXYFnaq5FlogVzEbPqEFNWc2VvHdeQP606lAB2QGDeQYunXNehZZ7vSqoC+7fPsXL9127Xv4wYY4NhuoebO+O6PmGHqIE9v/ivGXVoLdZjaSNesJZh10U82rgq4g48XY3/zp3sW+1y5U/kKT+I+wlf5+56LZEMimmotnQ1BTs6F3Jrqz/1q/c+Fe1vv3YnE+3CvhXv3P+v0l3F/lzn3yv8UPrj6cRsaz2PsFW5GH/iPGQM6FLqk5CR2QY+izehZZziIz6GxkPIt9IGDbmi00at7UX64KyJWa9SxyClpQs2rWi5DPliLjWeyDr19oPRtj/e7cVHMttOACujUyHsOeZ6MKrcjf74KMF2N/S7APCXYMnaq5Ai2oS9ABuVKznmACOYZms5FVs55FlgNyBVpQS2ajN2Q8i32vYLedDdS8qd+d5XJrTs3Gmb6R8Sz2fW+sY1egA3JjzXJda2bQTTVvQAfkxprlWM2DIeNVsH+2d3G/YNfVnIQOyHXQrOa+ZwPISeiAHEOf+XhgZDyGnaxZzyLLAbkCLZhdZgPIFWhBXYIOyJWa9SyyHJABfXpVyHgW+wHFDsgVaEEdy2yg5razsXJkPIt9+M0LtTVvQAfk0pqLZ0NQc2Yj57vzpiHjMewUdKrmXOiSmpPQAbkO2ta86ch4KWw6GwG5UrOeYAI5hmazkVWznkWWA3IFWlCXoMeGjBdjf1uwHxRsVvNW+u6syN8bEzLeEvZbF5agu9bMoJtq3oAOyI01y40WGc9iPxSwc2eD1dz3bAA5Bf3S2JHxLPbDgh3XXIEWzC6zAeQKtKAuQQfkSs16FvmjLYKMx7Cbao6h+5oN1JwzG2vIV20hZDyL/cjb69glNRfPhqDmzAZqBvSLWxUZz2I/Ktgl0CU1J6EDMoN2gYwXY38nYI/lu7MbZDyL/dg7gi2YbWvuYzbcIeMx7JyaGXSq5qzZEORTXpHxLPbjgs1q7ns2gKznHhnPYh+JsEtqrkALak7Npz6cCDIew+46G7ZmC/3C1JDxLPYTii2QxbMhqE2zMVlkvCXsdy8U1ZyEDsh6inzllJHxLPbRgJ1Vs55FlgPy8zNy9cXY3w3YtubS2ZiRE89iPxmwU7NR9915Rm54DLt0NmbkzGexjwk2kCvQgmqhT87IZc9iPxWw62o++cGM3OpVsH++jp2qeUbu+Cz20wE7hn5uRu7nWezjgj0jD/SWsN+Tr36CfMWM3P+z2DPygC/GnpEHfsCekVfwFHvrIV9yyf8BAzo3kZlAInsAAAAASUVORK5CYIIAAAA="/>
+						</defs>
+						<!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+						<g mask="url(#mfid46)">
+							<g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+								<clipPath id="mfid48">
+									<rect x="-0.5" y="-0.5" width="90" height="229"/>
+								</clipPath>
+								<use xlink:href="#mfid47" clip-path="url(#mfid48)"
+										transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+							</g>
+						</g>
+					</g>
+				</svg>
+			</switch>
+			<rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+		</g>
+		<g id="shape638-1561" v:mID="638" v:groupContext="shape" transform="translate(36.12,-306.375)">
+			<title>Sheet.638</title>
+			<desc>d = 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 0</tspan></text>		</g>
+		<g id="shape639-1565" v:mID="639" v:groupContext="shape" transform="translate(198.12,-306.375)">
+			<title>Sheet.639</title>
+			<desc>d = 1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 1</tspan></text>		</g>
+		<g id="shape640-1569" v:mID="640" v:groupContext="shape" transform="translate(360.12,-306.375)">
+			<title>Sheet.640</title>
+			<desc>d = 2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 2</tspan></text>		</g>
+		<g id="shape641-1573" v:mID="641" v:groupContext="shape" transform="translate(522.12,-306.375)">
+			<title>Sheet.641</title>
+			<desc>d = 3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 3</tspan></text>		</g>
+		<g id="shape642-1577" v:mID="642" v:groupContext="shape" transform="translate(36.12,-18.375)">
+			<title>Sheet.642</title>
+			<desc>d = 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 4</tspan></text>		</g>
+		<g id="shape643-1581" v:mID="643" v:groupContext="shape" transform="translate(198.12,-18.375)">
+			<title>Sheet.643</title>
+			<desc>d = 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 5</tspan></text>		</g>
+		<g id="shape644-1585" v:mID="644" v:groupContext="shape" transform="translate(360.12,-18.375)">
+			<title>Sheet.644</title>
+			<desc>d = 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 6</tspan></text>		</g>
+		<g id="shape645-1589" v:mID="645" v:groupContext="shape" transform="translate(522.12,-18.375)">
+			<title>Sheet.645</title>
+			<desc>d = 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="561.267" width="108" height="36"/>
+			<rect x="0" y="543.267" width="108" height="36" class="st5"/>
+			<text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_dir_search.svg b/media/libaom/src/doc/img/equ_dir_search.svg
new file mode 100644
index 0000000000..3f14e3d95c
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_dir_search.svg
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dir_search.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.17726in" height="0.950904in"
+		viewBox="0 0 516.763 68.4651" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+			<image x="0" y="36.75" width="480.013" height="31.7151" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAABGAAAABKCAYAAAD0diLqAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AADUmSURBVHhe7Z3pr2VF9b/9B3zjCxJfEBITYwwxhhiiaUKAQIB8MQoEBYLMo8woyCwoo8zIPDc2NINAg0wKCkIQZLJFQFBQ
+						hm6QSQRknvYvz/au+6uurl279lD77HP78yQn3ffcc/fZVbVqrVVrrar9mUIIIYQQQgghhBBCZEUBGCGEEEIIIYQQQojMKAAjhBBCCCGE
+						EEIIkRkFYIQQQgghhBBCCCEyowCMEEIIIYQQQgghRGYUgBFCCCGEEEIIIYTIjAIwQgghhBBCCCGEEJlRAEYIIYQQQgghhBAiMwrACCFE
+						Ap9++mlx2223FbfffvvMO/UsWrSouPvuu2d+EkIIIfLw0UcfFVdeeWWxePHimXfi3HvvvcW1115b2jYhhBDDoQCMEEIk8OCDDxY/+9nP
+						Sic3FT7L3/z1r3+deUcIIYTon+uvv75YsGBBckCFz82fP7/4/e9/P/OOEEKIIVAARgghanjzzTeLfffdt/jnP/858046Tz75ZHHAAQcU
+						77777sw7QgghRH8sXbq02HPPPYvXX3995p00Xn755WK//fYr/xVCCDEMCsAIIRrz/PPPF1tttVWx/vrrD/raaKONiocffnjmLobjt7/9
+						bfHjH/+4+Pjjj2feSeeDDz4ofvjDH5bXEEKsePz73/8uzjzzzOI///nPzDtxXnnlleLUU08t3n777Zl3RFMeeeSR0l6E7EjO1xZbbFE8
+						++yzM3cxHBdeeGH5asMpp5zS+m+FENNN0+31sk/9oACMEKIxZNm+853vFJ/5zGdmX+uss05x4403FnfeeWejF4GJs846qzj++OPLF9Ui
+						XOtzn/vcMte31+GHH94qENIWAihkCDFQbbnhhhuKgw8+eND7FkJMnvfff7848sgjG29DJIBw+umnN9ryKP4/zzzzTLHGGmssYzu++93v
+						louMkB2KvdD9BCnMRu2+++7FmmuuWXz2s59d5vr2GjqY8cYbbxTbbLNNKTNteOihh4oddtiheOutt2beEUKsKLTZXi/71B0FYIQQrWBB
+						8fWvf33W6cQZvfzyy3s70O+TTz4pM4lnn3126eza9/CdbbYCtYVoP1nNp556auad5mCscP6blocLIaab3/zmN+XivaleJFh74oknFnfd
+						ddfMO6IpnG2yyiqrzNoO/t/neScsPh5//PFy8fKVr3xl9ns22WSTQXX9E088UVaktv3OF154odhss8062TghxPTRdnu97FN3FIARQrSC
+						BQUBFzcLiINLNL1vcHTJQq6++url9wyZYSR4summmxavvfbazDv/g/LLgw46qJg3b16ZJb3uuuuKb3/728X//d//Fffdd9/Mp/4HDi7X
+						4DwYIcSKATpijz32KB577LGZd5pBZcLOO++sUu+WYDeoWDH7xIuqGKpj+oYzvjgA90tf+lJpE4fccopt3HXXXSvPGcMeYZfWW2+9Yrfd
+						diuTCi5Uvmy99dbFPffcM/OOEGJFoMv2etmnbigAI4RoDeX1nG/iOrgEIV566aWZT/QLZymg8IfMMOKU7rTTTsU777wz887/IOjywAMP
+						FJdddlnpdPMzQan777+/PHvgxRdfnPlkUQZvNtxww+UCM0KIuUvXrR1dt5aI8HZZthDlWjRQtYn+33vvvUv7OATYoCOOOCJYZUWwaccd
+						dyz+9a9/lQGaH/3oR+Wjql14nwCO/74QYu7SdXu97FM3BgnA/OMf/xjFAFFqRTnwUEZxLNBeFodjiFJSHfH3v/995icxLcRkiGALQRfX
+						wT3kkEOy7Q3lIEuUPnN5CAjA+NlFtkdxbg1t52wX1/mlHHy11VZbJthCAGaDDTaY6gxjTj0+Jh3VhRXNxsi2x7n00kuLY489duan5pCV
+						RL+McWGM7mPOogvHjr9dltf555/feFtYKs8991yx8cYbDzY3CMD89Kc/nflpWTh/bMsttyxeffXVcp5QEeQfEmwBGK4zreS2IWPRdV2Z
+						pnnbFdqIz+VXfE2CMfZ71+31Y7VP9DF9TdA5xqT9huwBmHvvvbfYdtttS+GbNLZlgsXhtDv6qdBO2tvn2RxdQA6QB+RCTAcpMuTvte/7
+						PBgfHCFK+4eYx6EAjEEGgL3zOLkG2YQvfvGLyzhrYwzA0Hd33313WYLqb6/yyanHx6ajurAi2ZgV2bbzHQRYmffMIXuEL/fBvCdIbM6p
+						v6glMM0TJAgILFy4sNQJVGjwBJ1FixYtNwfOPffcysX1JKEdnH1yxhlnZAu294XJh79dts/zYHxw7NuW9jclFoBZvHjxrG1ed911g1WY
+						Yw3AYJewT8yx2LzObUPGpOu6Mk3ztgu0jTY2PVw2F0P2OwEIgqw33XRT6Y/++c9/nv1OqsgfffTR8v/YqtD2eoN5xQHy6A38V7bZT4t9
+						IujOuVixhL/ZhUn5a1kDMHQApZix0/8RBrYwHHXUUcEFTluqrosQ2kn2Y5iUObG20gdVbUUAf/3rX5eLyD7P7ohdF3mgYiImFyI/RL2/
+						9rWvzTqkKFhfEafIEDDeZBTtWrxYYOQaY76Psv4h5jALpNAWJMCAcUAwVS8GB26y3YiMo0G/jmkLEmfRcN/cK+fYcDaAu2XKJUWPtyVV
+						vqYJa9NctjEpMpHLttOnv/jFL8ozK8hKG0P0O+1gzqA3TzvttDLLNn/+/GKttdYqnWsygTh9BGZtUesfUsh5MDfffHOpVz7/+c+X54bg
+						MHMIIjrYPy+GRTFbZt57772Zd7rx4Ycflk61G4yoevEkuu2337504NG5PjitBMJzLXz7BJnA0Xbbl3O7LGOKHAyR8UZGqrYg8R7zlEUS
+						BwUzb/3tuyarY8pkM2848+3iiy8uK16322674CLJ5n0uG5LT/k2KaZq3baBNtA2Zji2sH3744bICxE2g9QHBD/TmBRdcsEwAdoh+J+D6
+						rW99q5wzFoDZZ599irXXXrsMZGI3rb0x35b7w8Zhn/j/H//4x2LzzTdfTnf0bZ+WLl1azjdXT1e92Pp/3HHHVVY4EWDnnmM6fgi/oYps
+						ARgEjbMaGMCYoKHw6UgeO9unIbTrhhaVDDALjuuvv37mnbnJ7373uzJyGTtwjslkwt7nwabudf2sCvLAYp2snz+ZxfBce+215TixF5Q9
+						oS4pMmQw51HEXMtec2GMY1kCDBlBRhxtoK2cT+OXt4/pEF7GCWeW++ax4SzEvvzlLwcPCk3V421pIl/TxFy2MZO27W7g2K8oy9nvpt9o
+						O8ElFyvl5p7scfO2qPXv8Yorrij1AJlD1/nlGty7vwjGfnKdPoNYBvfAPeOU++fUMLbMy1122aXUEVWZW/QjwfYch6/3DXLob5clSDjt
+						WwarDuG9+uqrS7tjNpixCj2Nb2yH8JIMYC7gk1x00UXlOFXpkZw2JLf9myTTNG+bQrCMw7ZjbSNgYP7qoYce2muglG2nXDc0J3P2OwGH
+						r371q8Utt9yyjLzyf6pX0ONudTbzvcq2uNXd9A2VaARj/Iq+XPbJdBL9iJ3ywRaRAGGcaXNoVwWfIehep+Mn5a9lC8DQEBpEw2IgAESx
+						cEr6VHB33HFHsfLKK5cDFyoBJdhAlNBKh+caGFgWv3X7nJk0e+65Z/nZPp1kM1zf/OY3l8lSGmZg/eBMW2gjSoXSb9EMMxb+WKTKkAvl
+						fhgXrmevSUSW+6TqEZ3oFRZbOIboOYwUBiq0NQrZJCNhgZpJwsGgZN7JmJItYTF78sknB8coVY+3oY18TRNz1cZM2rZjp6gm42DRUFA0
+						R79z/yzCbK6HYBGMvrPsIrYVxzS0qLXDD6mmMdAROMdcxyWXg2v3wD3HzqmxwBPOe8hBRQ8efvjhvd4jOjfXWXGMx5DbZYcA2fEDK1Q6
+						ceAulSEsPmgfAZlQMIH+JoDonw0zKZhDjA1nKN16661l5c5VV1213H3ntiE57d+kyTFvxwCyzoKbhXfM76T9J5xwQrmA7zsxRlXkqquu
+						OmsLXHL1OwFI2lI1Fyyg4SYMYwEY7AP6g3mIvqRq0vdrIZd9QhfNmzev1M+xynGCbaw5qp5uR2IRH7fucdmT8NeyBGBQimSBUfw5lGIf
+						UO7LoPVZ9TEmqGogKjjWx94iFyz6MG5VWx+awOQ/4IADRtvesWJKmQU5C3OXtjJERorrmYPL/3lvWqk6Kd4yBGw9oA85wwHnNmSkMMRj
+						0YeW9ca5jZFbj49dR3VlLtqYFdW22zWrtnkAcsxWJMsuYpNwTEOOHwteSsLd36EjQvMBB7fPEm+DLZIEstAFvm7zwVHHEXaddxee/IaT
+						3tejl2mznxDoC8aPQLnZJ170+zRvMWFMQk8jIYj1/e9/v9x+hA2jiim0UML2k4jrexHVBktsMC51i6acNmQadF1X+p63Y8DaFAp8j4W+
+						+x35ZD1FoNLdDu+DHrAKTaCP3CpMH3xZgrYc4o3+D21RzGWfzOZgJ7GXVdB25ij6wk1oGATkeCJd1RZGYxL+WpYADB230korjXoC2KKK
+						hZNfejvtmOOXIyrZJ5aJr3P+UqCdCsA0x8r5/TNLusgQWQeqXlwHF8U2zU+/IhPmH6iIs0u7fKfXxzIJGN1JY3qvLqsAOfX4tOioLsxF
+						G7Oi2nbLyIccPAPbQ1WOBShsIRkKJGD7qKaxigNzEkNZWwKmOQ45RG9RcYP+r3sKBm1joesfLm5YMNp17ruQMwADVtXj2igW22N4Wkob
+						WIQQ/G+7eECuCWaMAZOlOrnMbUOmQdd1pe95O2lsMe77s2Oj736361ExEqtiw464VTno8tD2ettS+8tf/rL82WxZKACTyz5ZojAluIOt
+						4LNVNp82V9kuYxL+Wu8BGJsAdVGrMUAGuG5QphFzlhDgMWN73vtQQhhgBWCaY2XzKB4UkNFVhqw02JxbU6SxCPSYoT277bZbGSU3UOqh
+						Aw196EtkcwyBBu6Ve64z1Ln1+LToqK7MJRuzItt226YZO7SVrba/+tWvyn4ykO/Q9h7e59wl7o/PUyEYOizQHF/Xae4LO6MnxeG0+crn
+						QwtS2kB1UF+ykTsAA1ayb/aJ1zRvl6U92Kg6e+RDyT32fyxbJS0pxIKShWUVOW3INOm6LvQ9byeNVfWNPaDUd7/bnCGxVrWdki2JnAPj
+						fh//Z575gU62plPN+Ze//KX8mQANgU5/i08u++Sez5Myvy0AEzrzFagKojqo7lpD+2vRAAyDQFaAQd1rr72WO3gOBx4BogTfwJBj0GNl
+						TSxEiNZTgsWTOHA+QgLTlOeff748eZr75bp1GV5+z2dzODd9gCNAVoIDxrhPSkypIuDZ5pyvwnuMj19ZYIvqWPmmjS1POSDr5o9tG7hf
+						9vsj6FwXBRNbcNskqzO0KfQVgKEvObfG7h95Rj6RJ/p0hx12CGbJOP+DQ6CQP/6WzzJuOOTu4V7unOJkcv/MGmR44403LkuFDfoVxWHz
+						BbllvvBkCvs+fsdBYlXjyOcpscbR4rO8uCZzl3v1nd0UGarD9mZyHXsNed6Hq2eQSZ7K5X73m2++WWasGQeLsHMwGvfM3mDfEWfhQX+h
+						14j4c74F12ZeVjmvXIO/mWR5e91TT9Al/jxN0eOQS0dx3bPPPruU7SqZ52W6iznGuV/MHb6TcaHfu2YyuDan7NtYc0101R/+8IfyvriH
+						mJ4bq41Bz5guYG5w0CXzhbHkZ95HP7hzYJK23b1u6N58+u53C1bwol3IJrYmdg9ApQs2w5VDy7RRFYdfxdZFzgQI2RVkDR3V1a75mONM
+						e1ho1o1RXQAG6KPQVtY2DBGAAQ6tRJ5sbJEZHh89FK7PyhkuflCchQALKzcTjewzR0O6E/+L7VWpc47PcV5Fzsdxp8D8QK/YOPivn/zk
+						J8stqFN9lFy6zvcVsQG80A3m97g6kPlttoTf8bdm17rQpn0ufc7bvqBPkEmz59ZX6EN0pesXuDBfWEDHtlfbnKO6iX/5uQ/Mt2dsQz6P
+						T5/9bok1vpu+4RxC9FhdMNZsEXPJBVnBV6cqE/vA9kTa55PLPhEYIrbA2KdUoNUFYKx/aKubaPYZ2l+rDMAgPHQsh0/RATQOgXHhJnnf
+						Lcu1jkMhhRQLA4sTi5FAWRFVS8ki18ECh8c/EulCYaI4684XMacidgDdpLB+oqSU8mR+ZiKQsWE/Hs9xf/rpp8u+80u0zEBX7QWkr+rG
+						til2vz//+c/L+8Uoct26klgWsikl0HVgcLoGYLgGExQlbvePLKPMWGCzwMOw+sqdRRpjgCyhyFgMIvsYXowBwRR+Tun30JziXsjMoeyQ
+						a0rXTzzxxOLoo48ugwjwpz/9qfwujJPvqDA2OFkYfh5Lx/3hYLA3nO8KRXzrZCgF2kw0HoXG9/DCGRjC2aMPTjrppKiewfBhAF2lbJlu
+						dIe/IKI9jO/tt98+8049ixYtKh/9NxbMaa3Sz0adHoecOgqZ57tdmWcOcH1f5g888MBycc542++Ye4wt+6Jj7YxBNQLz9YEHHiivQVCJ
+						tuBcccgf98bWNO6hSn+O0cagh7bddtvZw9H5mXv83ve+V/Yx8+Wcc84pD7F3HdxJ2XauxRxlLzq6C/2LTontn++730PVErxwdgk0hg7/
+						A9qOk+8+Ycz60Xd6Q6CjcjylB9kl8UEbUu7D7CGvKoe4L18ChgrAILOh7bLMidxgX9Bt6MkqecUP4J5sQWALCd4LzUPaQ/+HFkshSBqx
+						UG+rI3OAbaB9sUU0pPgouXRdyFfkszwxjMf1oqcYC4KsVLpdc8015e9I8PA7vhe9gay5VbVNads+lz7nbR/Q3/iNdugrP2NTaBdV1dh2
+						/HFss29fzL9hTEKYTY/NuTbg0zKX8RGY1/iPdWeO9Nnv9BE+ENfzX8jYwoULK21IaHt9Krnsk/nmzEHmYgzaztyjrVUBGOZrij8ytL8W
+						DMAwECyMTYhDCpHPWAbFFXZrQNVeaRwR/o4Bs3Kxrvv1EPJ99913dhFp0Wvug/upwiZKarkan2EBYILd5kXGK0VYiV4ec8wxpRI36H+u
+						waKCa2AA+NldQAJjN69ii4GNrZ1HYZHDrs4OSpFJYPdripDvikGbQgGApvQRgGHSWwDDAiEofCYsfUmf8p6rMJE9C774TpsFAlGABJhC
+						c8rt99CcYlzJ8tM/Jtf8HuPqyobNO5QMfWGYMWPB4lefIAMouZBSislQE7hH7p97tlesjL8v0DPMC/rCnClfz4T0GvfF/VUp8mknJHch
+						6vQ45NJR/B1BlVSZ53cEflPmQypcC32GXjNsPjNnmDuuTqjKmDS1MbkhQEU2y9VVdo+2GDDnhxf/N+pkIodtN+fb1V0pMpyj31nArL76
+						6uV3+y/6rirjiYNLn9n906cEc+qSDtw3gfaqxUQXmFvYXcaTca3D+jyWLGEhzIKYz3aFse3qk6TCnKByzx1PbHps8dQHBK6tjeYv4V+a
+						nJju84MMVrVT51tNI64PVBXoM+p8lJy6Dv1vvqKNHdf0fUDukd9RrUEywsV+V9fOKrq0z6XPedsH6CbWc+78QwfSV1RkYF8YF372q+eZ
+						T7G2MucIOAL9jl/szrk2EHTBz7AnZZm/WedD9t3v9Ndhhx22TMLTfVHVHbKF+P7+9voUctonszcp/pvrI/qJPoPxxZ+rC+gM7a8FAzCU
+						ZNEQBJtBJZJHFNfN4lgGxVeAFtWrMp5UA5izSgYLYQll7ZvAddwInjnGddEuJgeTpK2TngsEiGCC29+0DaGgv0xh8y+KyC2hox20p2ry
+						p4xtU7gOVSLuNUxB1kV3kRM+19YIGbS7awDGZJPJaoE2k1WuT4YdhWOGgc9Z1DmUabex4Pfsvazrd2QVmXXnlDteJtehR6WZQ+0rIHMk
+						UCp+NZg5Dr6yqZOhplhQg++yV+iwyT6p0zMEAwgKhIw1f8dCu6r0eFpxdUhVNsyo0+ND6agUmWfxZJUvBu3jPvzATyosMn0n0A/k4Qgu
+						WLCgfNGmEGOzMfRL6DBp+tHukW2R5513Xrl90p2jdTKRw7ajs8hYmu6yeVsnw7n6nf54/PHHi1NPPbXcdufqtKrFGt+PvmOhdMEFFxSr
+						rbZaWTmDTMfsFeNy+umnZ9GTjBP3nLL9l/unH/l8LKtrtqaPwAAyViVnOWBsWLS645lzuyz6Cp8J/cb8YJ4g025Vl+k+f4xsDriJg7mC
+						LabqqqJNJmM+ylC6znxd/7OuPfS3tWE/sXGxYEEdXdrn0ue87QMSLe48AFuQWx8zX+l3O6PEoA1VQeWUOdcGgjpupb8FvXxf3CdXv7OO
+						oNKdCncCtabP0CW0PQTyTvVqE1uTyz6ZfuOeq2yqi/Ujn3fHwScmG8bQ/lrtIbwmTP4N2URHWaI0jTrFZeC8Es3k2n0/HYR7YGL5C0uf
+						sTnHMSyjWBfBox20J2aYDKuAsKhyX9g9IB/ISQzkBHlhzOpgocNn274IAsQCcoY5AXWTlYgxTltVO93IrNu+ujlVtfgPZckM60dXYZmR
+						4f3QQsgcB3MkjCYylArtd40B89P2tefEDbS4eqaqKgbo5zFtGekLnHic+ZQtf6l63CWHjmoq84bvsPVBql1xaWpjqKLAAW3zYj6l3peL
+						9WNdJi5VJnLa9lQZG8q2k/W04HLsu9imeuaZZy535lcVZOII8lQFO7rAPLKSbf6tCzKYnePzlj0O0WRB4TrNbV4EsPqWLc5PYX7bd2Cv
+						YkG+vrBAC1sbXf+kSvcxxwnoDXFvQ2N9UZc8beuj9K3rYr6i6arQwjfmf3QhtX0uTeYtIHch+5PyahPssLFOWU+kLLLBdJo/5/ogdQya
+						9nsb2O7Gusn0GnIdAv2CvkndXp/TPlkVCvebUl2DTeKzVvVVxVQGYGyR5isiKht431+opCoumwB+AKcPqu7ZZ+jO7oIt2usyuqmGiQlH
+						1Qb9xMTrEzOiKcbFlFWVYkiFdnetgDFMMdbJps2Bqiwii1wWu77hMPn0nVnLSoYWlMBcC42Xu+BxsykYFhyZ0N+Y4xAyaqky1ATkzT0P
+						ZohtSFDl3MbmE1VNKY4CjkidQzAmrC9Sst6petwlh46ySjRfflmE+NU2RlWFWVds3voByxhNbQzjwplBd955Z+MX+sZdqKXgBihd3RFi
+						DLbdZKyuP/uy7cyZW2+9deanMKa36zKeY8ECsamyTFaRz6JDcY6r6HNBgYw10T19QCaXRSJttfHMscDwMfnxq2ir9A3jx8HxsfMNgXvn
+						3KRpkEmjbxvikkPXxXxFS6iFdBD+Bbara4WgS5P2uTSdt/htIfuT8rJtOk1o4rekBmBs0d4kUJWCjUHIL/HpQ18iO1Q3sQ2qCtP33NM0
+						BG1NBzDmjH0M8/UYy6otVsbUBWBskebftDm/NNp3jK3z6sojzajHSobaYIo5JVpqnT0NjhOGIEVhmAIgghhzljDefCa0NaUrKB4me0qm
+						mHb1oRgY974CMMg0fV1XBcGE5nNVWURzrNyy7TZzCqr+DmzB4xsoPsfnQwfVWXAoZNRSZagpZH951Cr36u+VzoWNpZ9FtLNKfD1FX7Cv
+						OpZdp7zzG9/4RpKBGBPMsdR5marHXfrWUTGZj2UXCboQfIltl2hKE7viMnYbk1pRApO27ZAqY331O99XFRA3bLFW97mxYGcPpMiyHTyd
+						cni62Zs+qgfpd15D89xzz5U6ZagEgdl9f8Fm502FbDfzMLZwx4Zx4CtjNi0yaVhSqy8b4pJD11X5F2C6yh8Dxq2v7S8uTdrn0ue8zUGT
+						9QR9HZozLrZo7ztBA03GoI9+5/toS8wPNd9lWvxV881DQU0fe9hCir5mvVZX/T20vxYNwCBACJK/SLPMeqgxJlSxqB5/TzbFSoaYVOxl
+						7CM6h4AhaEwChDOGLUJTI5Dcp2Vk275SDuElE3PxxReXE4vTzc3YIGgYBgOFz5Nw/EhrSqTPIsCWdUEREcDoQ+hQglw7xYFC+dTdawp9
+						BWDoi6qsuw/3zudCRtqqUjAc7v7fqjnF/3nPlCQ/IyumVGxehZSSv+BBRihRZkz5GxQKisXFHAcLHuEIUPlhpMhQE/gOKmBw+od4CpJh
+						TpAvi7SP9/25Q1k7v6sz9PRfioEYE6nOLZi8VenGIXRUTObNQQ5lKnz9w3fzGFPKcVPBqecJFugCrm92xZ+32BiMdZWtaWpjcsNYsbDj
+						LBJk3PqRsWMMDfoefer2bZ1MQE7bbjLmL1ZD9NHv3DtOf9184fc489NSDWeJgVDw3cUqQuhvdDdzOYYFeFPsfh1co4/rNMHaO2SCwBZH
+						vi60hYBvuxkD9Fvdwt2C13U+zNgwnyrlvutsSG5dx1hU+YoxXeWvUfheniTTVOa6tM+lz3nbFfqUBNcWW2xR8MQoMHvuV4KFdAQ/89mY
+						ffCPXiABzSG0boV0W2J+iU8f/Y7N4RHdsWCPrTlyn7vYBzZvGEPGPQZrIwIvKfqadS1+Wmgt5DK0vxYNwJiiYJK7Z1JY1CnkGJvh8JWA
+						i68oeIQjhyPZBEBIKCfiMzg3KJdUmHgIdUq0FOFlQdgkyzsEdl+0g0lqFQ6+sUHoQoqDCU3f0c8hTMjtMyg9niLC0xoMosSUufLoYl/x
+						xTCHIiW7Zg5uH5UWfG8fARgLhISCiz5WRuorCvrzlltuKX+HQ+cqvao5ZUbB5hTjwgLQ5pA5zX603Prbovn0qe21tbb4Sofr2yOouS5/
+						g8Jx99TXyVBTCLogv0MGX8AMsmvkLDjmG2reJzOVspBiHPyxGDPIpJ37kLKvtk6P59ZRUCXzYOPqL45N5i0AwNyj3SbbqXqN63J9O4vA
+						AtauY0WfsjBFd/L/EGOyMaZvaQfOCO2wbaiuw0GfcSCfaw+gD9sOyB/9z+G6MWfIh8VKanYx1u+p32/fZ456CKsQ8fX8UDDXeKISTjgL
+						sjpcPRDzkWgvMkAfIvtV8u1iQf0+norB/HZ1dm5sLlPZOFTwBcx++3rT9Klvu5lPLNb9Q8d98F2QyzofZkzEqn5CxGzIELou5ivGdJXZ
+						NUt+oqu4V9MxKfqpa/tc+py3XbFgAfdDwgi7QUCfn11fjcU3VV7+4//5DJ+N2XY/oEPfuDa87doHqvySEFX93uT7bQ64vrsLbVqwYEFS
+						kKJvkL2m63jWgawH6ZdYEJatbDyhdv311698+qCLBaT9dZdPld/Qpi0pRAMwKASUPYNHIxnMBx54oPyZDgpFqCzSFMuumJN20UUXlR2D
+						sLkLM1N+fAeLVRatqdjkYvLWwQDbAmJMIAQYIQ45og9x7lBCLLItKoyzte222wYnlUVWqyYvBgXDwlMccCAJqPEzE9/AGcApoC9jY+lj
+						AQZbuMSwSYG8dK28wQD1EYCxdoeCiz58J9tVmA+PPvro7HsoPJTn/Pnzl3PKQ3OKMd14443Lv0FB2JibYuYzVZkWm28oLZ4S448lBwtS
+						imyP012yZEn5HTyBw66HDLHIcMe/ToaacO+995ZP/kjJovYNfUxf02bG0w4lQ0bpF5wVxoQXFUQ86apOuZrcxgzE2DDn1nf0q6jT47l1
+						VEzmTX+FnG/uhXvGfuAEEwgl82PObapew8giN3fccUc5Z2gHc2zLLbcs9RpyRKbO15s+Y7Ix5rSjmxk/dNamm25azoX9999/mXkQeiJC
+						H7YdmHP0P68m+sWunWIvYv2e+v32fbwYa1930T4C2dxPTAZyYg4/r7qMIbgLmpCPxNzC7hDQ4bV48eKZ39TDnElJXKRAu3gNAeNKkAm9
+						MHSCAOzwX/wF9ApyhUyxEEX2OBiTe2Rxgr8Ry+4byH+KDzMmbPHlB52qiNmQIXRdzFc03YF98AM43C/zD/nmcG7G1GwmpOinru1z6XPe
+						dsWt1sB+46tR5YCvZjrY5kFortIG2hLThfzOEpb0G4Eud/tKqo/gY35Jqr2v6vfU77fv43NUIiNLLvQVfv+k9FqbdbwloqvkEftF5fWq
+						q65aHHjggcu1uQqTq7qkaZXf0CUmEaP2EF4aeOihh5YTgJPnycrz2E9uxHeMDSY9g17l6KMMED6uyfPxERLXueH3RHNXXnnlYvXVV0/e
+						p2eLDCJYKKU6GAwUft9noHTF7R8W5QsXLiwX9hhqFj2Mw84777xc9New811YwLj96sLCFEeMa6HsfEHGofzBD35Q3gOR/Lr9dQbOG7KB
+						cahbyJqyRF660lcABoeTPufwuhT43nPOOWc2KMnf0u/sJa/Cn1P0P0aFSc984L2zzz571miiaJl3nJ/C53yQdYJpobHkGsiN3R9jThsZ
+						XzKh/A2Oth9FTpGhFFh8892TyhADjyrcfvvty7byot04TBglAl8oXJtnoXvEKcYJJuJOPx922GFl31Rl6bgGgQky01dddVUpCzzSGAOA
+						sSdQxoF0LOYZ7+OOO650lqqou15IJnzMADVxKGJ6nHvKqaNiMo9zzMJ+q622CgZ53XmEjue+jFS9xndy/8iGLUSZV8wvrkub2bJXt/Ae
+						m40xvY8uwI7zyG9ezA/aylxlLKvksattBw4ZRkaw0010P1kp7pvr1xHr99TvZ4GEDDD3kQHmP0Fk5i6LZRwxnHlXvoaGcUAnfeELXyir
+						90Lb7JhLyC19V/dCrgnGs8gP6cIqbMHKQqCPYNSQARjsALIwiQQB0M8EgOyx5sjmokWLSn/W9S1iATH0FVv90GnMZeQWO1fVHsbooIMO
+						KtZaa61yPnBdKgqwMdg3zmvDB2JRz3sshGP+HHKILUWnELC2v2WOXHfddUn9ih/DOISCFiHqbEhuXRfzFbFB/I4DUn1c34sxf/jhh2d+
+						8z9S9VPX9kHf87YrjCOybz4E+pV5gB3BntAu9DB9Fhpz2kBbYluAzLavtNJKZV/RZy5co83ah+sij/hZftWTT6zfU7+fABXjz9MP6Sf6
+						jK3WrMt5EaQi0VqlM3KDXktZx9P/pvvqXugrAor4lSk6xWA+Mt51Wzer/IbUtjQlGoBB4fpK15ReLArEzRFhxKh0gQ4+8cQTKxWgj1Vf
+						pCwy+D2f6/MU8rFAe2gXk7OuCqUOlB/GIiUjASgCJkpKZpPPWCS6KyjbPgIw4n/0IUMYDrIXLJbHYNzbgA7CMcdgE7RBEbOYiekYsl/I
+						NsFIIvXMCZwgCxATVODJKlwbecWoxALGZAW4HtUYoeulLFQsI9ckoNaXHg/Rp45qQ1O91oa5aGP6lAnmR+oi27J9yDCyHCO13+u+n8Ct
+						6S0CG/gXBF+YuwQo6raBDAnBYBzESWG+V5OAWoyhAjAkCNh2RHl5k4DTmMDOsii1ShlsCXM05oNRTYBdIahJdSqBc2ScRSSZXgL7Ns+Q
+						9diiEltEgsAW/wRerCKYOZZa0cL94jumjntuG5LT/qXQRD+2pe95OwZoC8EntvF1oamPEKt68knp97rvJ0j6t7/9bdafw1axQ4XgC4EG
+						fMtQQH5ouL8m6/gc4DNXJWSMFL+h77ZUBmAsc03U3RUAlBFKMvbIJxQy2yy6RlXpEK5DpC8Ei24UrwkZgse9pSwyMD5UX6RUykwjlDQS
+						Qa+L+NWBomDBGYom0+/0v/2OBSrlmLGsgYGMkGnj2n04Pow3Tug0ldyOnS4yxLwnws9rWoMvgH5AT7h7bFHmsQqvBQsWlPOGTD2ZKVuo
+						WcCEhZLpJ2SW6H9szzsBIKpeqq5X5aSxgORe2QrEdgOyR03Gsi89XkVfOqoNMb3WF3PRxvQpE1QQVW2nwCZgW8yRtQouMoJV/oCR2u+x
+						7582WLwOEbCoAr9wnXXWafWY2RAEFZijObEEwSSrM7tiCQH3nCL6jaBHlfx/+OGHZYUtfj12zPXBLIvvBh0IBPAevwtB4AWbxxxFBtyH
+						DsQCMNhAqj04f4Nrk31umpDLaUNy2786htBPfc/bMUB1BFVbXYNKdT7Cf//733KNar4c/llqADGl34fwUYagbh2fG/wIgrTnn39+NC6Q
+						4jf03ZbKAIwJk3uWB1tGEOyURz7ZwgUF3Bb+lv2QoYUO348zxqKC8iKElLIzyrbc/ZQhTLG6hmeuYYY5VoqXAsqkSpFRhk3/s8impI7x
+						IgJM35ozUAVjRIBvLi1O5hptZQhnhb9L0RNjBmVNMNetdqkLehhWYkoAxMCgYnTdDD4Lp1g1jZF6PQMDgX5Eh+Mck0khS2q6PJU+9HgV
+						femoNsT0Wh/MZRvTh0ywDQ95DmWksPeUUiO7LMqA8eJnFiUxJyq132PfP20wd8i69lFJ2gZ0CrqlzsEdEyz4SRYhA9OcICDRRcLLrXZh
+						XqYEKhk3/Hv3b7El2BRsi4HNScnq44ezPkj9W7t35jXJU+xqiu/oktuG5LR/MYbQT9M4b1OgLbSJ7Tf0Y1tiPsLjjz9eyi6ywbqY7+H7
+						kP+qbddGar/n9lGGgrlTtY4fAg5ZJoAcC3al+g19t6UyAINSpjTRSgnJoqKsEZyUaCmChePvZmybwMKNMkj/bAoDZW/7HLlH9oURCSey
+						WKdM2E9IRIwDaOcytA+l4GYkmkAVFEYxlPmwBSHGk2ooDBXKB6emKlNi4PCwpQP5mEuKfy7SVIZQXmQUmZfITy6GcFCsoovghclpLOjh
+						Yhl713Hj/24pN0qcDGRKxV7oerHgDfc5b968soKRv0F3tzmIraser6OrjmpDTK/1xVy2MV1lAh2BvWbbQwhbGGLfOcOAbT+cp4BzVLdY
+						Tun3uu+fJhgLKuRSDg/PgckC9nxaAhncJ7Yjd4KAAIN/uH3fELjg7Ae3gjI1YILPxrka7t+yJck9VNZsYF3CAUjaoldJUgB2CfvkBnhc
+						bMsGfj4VLDzRpI3PkNOG5LZ/IYbQT9M4b5tAm2hb2+BSnY9ARTFrH2zN008/Xey1115lQIYHTsRI7fchfJQhqFvH5wbdwBjhF8RI8Rty
+						tKUyAEM0+fTTTy8dHxwhDthi73OT6DTXOPLII1uVeN54441leWIVCC8LI0oWMUAYnLrII9DBHN5YN1HmChg5HI2mho3tRZw2HdtKRBCM
+						8yvof7ZRXHPNNbXyYQooxZkW4yBVhhhbFgOcU+IGCvrGZKjJeSZtMAfSdT5pF9F0AkA4q+bA43S6+22rMok4/gQvwYI5lD6SvWZfPnAd
+						c2IN/3rMM5x7rsk8uuSSS8rScoN+4fBD9DeHhmFg2vZVFz2eQlsd1YYUvdaVFcHGdJEJkjk4xlV/h5xyECPZRWwLvgfnH7nzK0Rqv9d9
+						/zTB1g78tEnZUuYsZ1pNysFuCmOOzJIsyqkD+B7sU0rgogsEN9wtPlahiW3iDAw7IDZkU/wAPgEb/GirOgN0Mz4e12IOcsYE0D6zY2B/
+						iz0ysGtms7Cb/mGgXIOtUBy2GjtgOIWcNiS3/fMZQj9N27xtA3LHeURNE08pPgKBEYIo2CdePEghJViS0u9D+ChDUbeOzwnz55hjjinO
+						OOOM6FxK9RtytCV6CG8foLyOPvropMdC54YJwmMjUdYrEpwYvttuu40imopDgGOCXIjpoU6GWDTxyF+yADkfeWffQ5AHBy8nZJQ5kMvO
+						bKHyDyePcmf+T3kon8FxZOujuz0CB9jNBiLv/B1ZQoP93QRzqG4566yzZgPIGAx/K6V/PRxuHG+uQRaUV05y6/Ex6agurEg2RrZ9xYbS
+						+x122GGZIPOYwQlHZ3Pobs5gr33PvHnzOh8EWgeBe+wA38PCjcQECVPsws033zy7Lc23KdgpEphudadtW3VtCQEk7A4JBw6fpDqNYAtP
+						qHO3WzD/sGVu4oW/xeaRpOBvc1eQ5LQhY9J1XZm2edsF2oj8jiHQtCL1+6TBLycYTCArFnyZtN+QPQAjhJj7DPEoT6o+zjzzzDJjlnJu
+						Sh+goMl0kMnbY489yqAP301W3jIUGFZKualuscAi2TLOSDJwXPk7N8vHtTngDwfajawTpMRZZvuF4V8Po8LWP5xggjoxIyOEECsy2CRs
+						EzYqZ4KALTtU9xIESdkG1BXaRaUj9oltezySluAPW4B5MpHZBd+mUL3CtgmqWgzsGDaKA98N3uO6VBKYvWNxQ1KC7UOWNeZzZJHdLV18
+						nqAM78/lSgshhGiDAjBCiE6QTcThy1WiS1CD7TkEOdh3y2tsh5Ph0HJPFoDpCg5t1d55IYQQ6eROEFDdQYCc7QjYJwIwk3i6W4y+bQpb
+						0FeUrfxCCNE3CsAIIVpDxottOfvss0/rcwgI2lAhwrYaXgR0cF55PCWHflPxYoEXXk0fVTkES5YsWaZCpSuUgWs7hRBCdAN7wrYjKgbb
+						JgioZDH7xItKRnQ0h6Xa0zhdG0Xlx9iesNWnTbGKIm2nEEKIdigAI4RoBeXW7J90Hc8hXhw+2+Qw8Nzg1FP9kvJ0uBQIamlbkRBCdMPO
+						7QrZkZwvOzdsLPRtUwhq8ZSeMbVRCCGmCQVghBCNodrFHkM+9ItHzY+J0BMmukC2NffZAUIIMZex6syQDcn54myU3AfEN6Vvm8KWWyUI
+						hBCiPQrACCEag/Plbhsa6sV3yvETQggRw982NNQLG0VQXgghhKhCARghhBBCCCGEEEKIzCgAI4QQQgghhBBCCJEZBWCEEEIIIYQQQggh
+						MqMAjBBCCCGEEEIIIURmFIARQgghhBBCCCGEyIwCMEIIIYQQQgghhBCZUQBGCCGEEEIIIYQQIjMKwAghhBBCCCGEEEJkRgEYIUQn7r77
+						7mL99dcv9t9//+LVV18tzjrrrGKjjTYq1lprreK+++4rnn/++WLfffct1ltvvWKrrbYqlixZMvOXy/PRRx8Vp556arHGGmsUV111VfHc
+						c8+Vf7v22msXO+64Y/HKK6/MfFIIIYSIU2dTXn755eLOO+8sttxyy2LNNdcsjjvuuOL999+f+evlefvtt4uDDjqotG/YvsWLFxe77LJL
+						ef3DDjus/L0QQggRQwEYIURr3njjjeK0004rgyoEXbbbbrti6dKl5e8uvfTS0kk96aSTinfffbd87brrrsW5555b/j7E/fffX9xwww3F
+						HXfcUay66qrFKaecUjrD77zzTrHTTjsVl1122cwnhRBCiDgPPfRQaVPuuuuuoE3ZfPPNi1tvvbX49NNPiyeffLK0WY888sjMXy8P18JO
+						XXTRRcVqq61WXHHFFcUnn3xSJgdIMtxzzz0znxRCCCHCKAAjhGgN2b9bbrmldFjnzZu3jONKAGaTTTYpXn/99fJnc3h5v4rLL7+8zFAS
+						pNlmm22KN998s3xfARghhBBNWbBgQfHss89W2pSTTz65DL7AE088Uay77rrlvyE+/PDD8jrYtIMPPrg45JBDygobUABGCCFEKgrACCE6
+						c+WVVxabbbZZWREDH3zwQbHffvuV2UYDJ3idddYpM5Ix3nvvvWL33Xdv9bdCCCGES6pNobrFtWNVEICh4pPPG1yH63FdIYQQIoYCMEKI
+						Tnz88cdlNvCII46YzSRaNvC2224rfwb+v+GGG5bnxMR44YUXyv35lIwbqY6xEEII4RKyKfyf9/gdmB076qijZu1YFVR6cl6MWylDcIek
+						A8kHIYQQIoYCMEKIThAUITjiZwM5lPCpp54qf7aKGMq933rrreK8886bPeiQUnD20Bt+JpHP7b333qWDywGHl1xySVkKLoQQQtQRqk7B
+						nlAVQ3UMWEUM57s89thjxU033VS+j23CRrn4CQEqYthuy/svvvhiWREqhBBCVKEAjBCiE6F985zVsvXWW5fBFnjttdeKDTbYoMw68sQJ
+						c245Q2aVVVYpD/K1rCNnxLA335xe+1v21lNF41bVCCGEEDF8mxI6EB77QtUmFTE8ye+ZZ54p3z/jjDNKG/Xggw+WP2OnqPZ0Kz45vJcn
+						AfIvZ87EDvEVQgghFIARQnSCR03zGE738ZvHHnts6bgalHcTZKFsm0eC2sGFVMjwHplInGI4/vjji/nz55f/Bz57wgknlBU1OMz2t0II
+						IUQdvk2hYoUKFhIABttmeYofwX4eL21cffXVZQBm4cKF5c9UzOy1117lk/oMbB+PtiYRsWjRototTEIIIVZsFIARQkwUHNoLL7xwNgAj
+						hBBCjAUSBe4WWyGEEKILCsAIISbKkiVLlslOCiGEEGOBba/u05KEEEKILigAI4SYGGwnovpl6dKlM+8IIYQQ4+Cll17S1lchhBC9ogCM
+						EGJihJ4wIYQQQowBnuCnR0sLIYToEwVghBBCCCGEEEIIITKjAIwQQgghhBBCCCFEZhSAEUIIIYQQQgghhMiMAjBCCCGEEEIIIYQQmVEA
+						RgghhBBCCCGEECIzCsAIIYQQQgghhBBCZEYBGCGEEEIIIYQQQoisFMX/A2IP+9+ZsJeHAAAAAElFTkSuQmCC"/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_dual_self_guided.svg b/media/libaom/src/doc/img/equ_dual_self_guided.svg
new file mode 100644
index 0000000000..c936f46f46
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_dual_self_guided.svg
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_guided.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.89143in" height="0.748518in"
+		viewBox="0 0 208.183 53.8933" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+			<image x="0" y="36.75" width="171.433" height="17.1433" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAZAAAAAoCAYAAADQUaxgAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAv7SURBVHhe7Z35yw5dGMf9A37xm5+UkiRJkiJFhJAtW9lly5Z935UQsv0gITsRRWSJ3pdEsmXf9zU72bfz9jnvnMc8Y+Z+
+						7plz7u25r09NmGee+54z5zrne53rus6ooARBEAQhASIggiAIQiJEQARBEIREiIAIgiAIiRABEQRBEBIhAiIIgiAkQgREEARBSIQIiCAI
+						gpAIERDBmq9fv6p//vlHffz40TsjuOL06dPq5s2b3r/KP2JLmeP58+f62f769cs7Y48IiGAFA33y5Mlq8+bN6vfv395ZwRUM+t69e6sT
+						J054Z8ovYkuZ5cePH2r+/Plq+fLl+u8uiC0g3ECFChVCj23btulrvn//riZNmhR6zerVq/U1hcidO3dU/fr1Q9vVunVr9ebNG33dhQsX
+						VLVq1f66plmzZurp06f6mvIARjhv3jw1Z86clAZZbDaDHbRr1y60LXXr1lW3bt3S1z169Eg1btz4r2tq1KihLl++rK+Bq1ev6s/jz0KB
+						lcSWLVtKxkv16tXV0qVL1efPn70rSiO2FI5rW0Kkhw4d6kykE69A7t69W2IcEydO1J3mh5tD6TCcjRs3RhpOIULbFi1aVNJB586d837y
+						hwcPHqgGDRqoXr16aeEpjx7VkSNHVNOmTdW9e/e8M6kpRpvZuXNnyWA+cODAX3bw9u1b1bVrV9WmTRsdrgoLL/A7q1atUp07dy5xUvKZ
+						Fy9eqEGDBpXqw2fPnqlu3bqpMWPGaHEJIrZUNi5sCXBweXZcY0tiAfn586eaNm2abkyYZ228pjNnznhnyheoOuJB+4NejfGmZs6c6dRw
+						nzx5kjfxcCYyJjQmtqAhR1GMNkMbaSttpu08AwPPDU9w4MCB6vXr197ZcMznbNq0yTtjB9/NRMKk4xI83FGjRql///3XO/OH2bNna6fq
+						/v373pn/EVtKD1e2ZJ4dIm87P1nlQE6dOqUqV66sG4Q6GojbooRhRlRewIsaMWKEbnuHDh1KPEPTkSwTXScCmTxcTSC20N+1a9dW169f
+						986kR7HZDPZgVqt4fXjOBtpKm2l7WZjPCZssk8DEMW7cuNj9VxbYJx5/UAj494wZM0qFXQxiS+nhypaAZ1elShV1+PBh70wyrATk3bt3
+						qlOnTrpBffr00ROm6xhbPrN7927d9ooVK5Z0RNyOjEO+CAiTD95LEg+mGG0GL9hMdGa1miSvYT7n0KFD3pnkZEJAEDacqrAwG+dwtPi5
+						P4QlthQPV7Zknh25Iv9KJi5WAgI0gsYQzjl//rwO3XCkSoSVF4JLykuXLsXuyDjki4Aw6eAxrly50jsTj2KzGSY1Jjgz0d2+fTuRh0xu
+						AXuzHfSQCQHBoQqGc4FY/Lp167TXHBwbYkvxcGVLZkVI4p3QeFKsBcSfzKpatWpGQjf5in9JWalSJVWrVq2MLptdC8irV690otH0H0va
+						KVOm6IkqFXjAXH/06FHvTDyK0WYIsdBeVqtU6CXxkL98+aKGDBmiPUc8SBtcCwiCNmvWLJ1XQUhM/5LEbtSokWrbtq26du2ad/UfxJbi
+						48KWgKo1VjM2eSJrAfHnAqiiyEToJp/xx2HXrFnjnc0MrgQEY2NDEYMbL+T9+/f6PMJBpQyrKNOP/Izkpz8sgbeIWIZNCOlQjDbjn+im
+						Tp2aeDMXfRGWR4iLawF5+fKlrrBC2Pbv36/at2+vmjdvrr3kvn376raHrczFluLjypaOHz+uP8OUPyfBWkBg+/bt+kb8uYBMgrdDvTjf
+						mfQYPnx4qVhsUqh4YMLlMxksmfR+XAkIqyRWG2E19/Qf/cjKCqFBIMePH6++ffvmXRFdTROHJDZDf+3atUstWLDAO1M4cO/Dhg3TbbZJ
+						hK9fv157nXj6NrgWELxY7CnMEzb5j7AS3lzY0sOHD9XIkSN1PzRp0kR79IUU8nJlS4g24p00fAjWAoJXwRLVeOEu4rOFAkZHvNVsGnQx
+						sFPhQkAoA8Z74cCTCWJi0myMZEVC+/yDkomHhGeLFi10CCwJcW2GPTUMmB49eqiGDRvqSaeQYFIlzIB9MMnR5qSJcPqf38d7tMG1gHBf
+						hK7CwPmgtDcYb8+FLWHTOERmpYIzZRMGyjYubcmMdZvxZCUg3ECrVq10J0SVl5VXEA88YTwZszGH9hvPPQmmQ/mcJAd5GFYMUXBfpp+i
+						BhoDmQFNmGTPnj164jYhLrAd9DY2Y747icGfPHmyJMEa97BZVfPMt27dqkODTHZ44rQ5WI2ULnEEhHJark16sLIOq6gKgkBMnz49UoxM
+						v2Hb/mtyYUuInN8JM/fgf5NEWeCEhdlJOseOHTsSO9iubcnMNzkREBTcn/1PtbGuvGG8AJOwwyBSbWpyBYbvN/64MEAYKNxnlLdoBAQP
+						hxg2O4T92Ax6W5uxERC+m7xPkuPx48fep8SHtvrLuv1VQ/5XTKRLHAFJBc/S1QqEVcXYsWPVhw8fvDOlMSWjwVBVLmwJ22HSvHLlinfm
+						z7l0nwXtOXbsWKitlHWQu0rqYLq2pZwJCJNmsNYaBTTJrJ49e0YaEzdNgg0VpRO4liUoccxCIdiR4E+mu6jTD8NWQIzBpEpaGgGhHXhM
+						wdiwCUcglGVVa/mxsRmDjYDkArzEYFm3PwGaJPZM/xO6YEVlg0sB4V7If0TBCh2HhL7jew25sCWeH/k/Sn7B3EPLli11IUC+kglbMvPB
+						3LlzvTPxiS0gdBRfGDa5mPIyJtKwUAqdtWzZMv3ir/79++sHwkMgDBQ0rlTg8ecqiR6118MkCvnspEvKsrAVELwfQlOpBqwRkFQeWVyP
+						zcZm/BSSgLBq8XvIBv9Eh72kGzYx8BzjPPsoXAoIk1dUnzDBE15C9A4ePOid/UO2bYn7oRzawPcyJvxilG9kypYQfvrFZk6JJSB0GB1H
+						B4ZNkMGNdcFYH0vdDRs26D+pfiC2jCFTihYVUsknjBeAiIRhu6QsC1sBwXOjUiyq6oX+ZSDhoaVapXAPDNJ06sdtbcZPoQiICa8Q/gub
+						lLB7Bi5HnPwKz4bcVVyPPQxXAmL6JKoCkUkPewqb8CFXtgR8Hq+Pj7q3fCBTtgS2e3AgbQGhcyZMmKDq1KkTObHQYXQIN5UqmYXyISA2
+						OyCzCcbFw65Xr57eNxHlqbBUN3FYm2R6FLYCAsTOGdD+QcjKECPq0qWLWrt2rVqxYoVuA20+e/as9nL8k4PxXMoSfZc2A/kuINTj84ZT
+						3obKCvvTp0/eT0rDczGv3o6zWiUsQ3iGzYR+LzoJrgQER2TAgAF6rwcOlGkLn8+KAEcEe4pqY65sibHJ6+YXLlyYl+KRaVsCVo62e4pS
+						CgjVNxgsNxc89u7d6131P9TnUwUUvA7j4C2bfrjxOCGrXLFv377QNuER+CuTWDryNtHgdRx4RDZJWD8uBAQuXryoBzxto3/YgMWOdJPI
+						RCxGjx6tf4YBB98AjNHSrrC6/0zZDOSrgBBPZ1NmsB2UHLPnwMCkT7I5eB1HzZo1db+kwoQgUyWJ08WVgOCQLFmyRIsb/1eHeQ5M9NgH
+						JdipyJUt4TAxDxnxoG/yQUiyZUv8Po6I7d61REl0G0zSavHixd4ZIV1cCYgteHqECJLEXW3IVwHJFnjprsKjrgSEcWxTNJILW6LNFO0Y
+						weAeEEHbsGAhwcqRULatM5J1AaGT8Dhs4m7FCvFQmx27LmF5TWVN3LirDcUsIIQmCFEQonHhKePtEwoqq/ItFfwuBTC2NplNW2IMdezY
+						UYfQKVPn4O/YVVSYqDxCeJF220ZHsi4gLMO7d+9eqgRWKDyYxJjMshGKxDMlDozBE6Ygh4MTQvGFbS6gUGCSJa6fyTcdxAXhIC9hO/Fm
+						05bI/YWFfYrpDRom5B7nP/CKIusCIpQf+G9KectqWHmm4A5i1CSq2VluO+BdQujKVShabCk7YD/YEfZkk/swiIAIVlB+GbYvRnCDGfB4
+						6C4GvEtu3LjhNJIgtpR5eLYUAbn6r7FFQARrKPUdPHhwUSUhswXJXiqUMh3ayRfEljIH6YN+/fo5zaOKgAiCIAiJEAERBEEQEiECIgiC
+						ICRCBEQQBEFIhAiIIAiCkAgREEEQBCERIiCCIAhCApT6DyBwOP/MSHc/AAAAAElFTkSuQmCC"/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_dual_self_para.svg b/media/libaom/src/doc/img/equ_dual_self_para.svg
new file mode 100644
index 0000000000..d294bcae25
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_dual_self_para.svg
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_para.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.99855in" height="0.813996in"
+		viewBox="0 0 143.896 58.6077" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+			<image x="0" y="36.75" width="107.146" height="21.8577" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAPoAAAAzCAYAAAC+CxVBAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAsuSURBVHhe7Z3pyw1tHMfvf8AbL5RXXkmSJC+IFBGKkKzJfqPsW1kiWbKFhJCkbKHsb+xZEonIEiL7viRk366nz/XM777n
+						zJmZc86ca44zZ65PTY8z59xnZq75fX/bdc15qpTFYql4rNAtlhRghW6xpAArdIslBVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQ
+						LZYUkGih37x5UzVu3Fg1b95cderUSXXs2FE1aNBAb/ybfa1atVL169dXly9fdv7KYkkfiRb65s2b1bZt29SfP3/06+fPn6t27dqpBQsW
+						qL9//+p9Hz58UBMmTFCPHj3Sr9PKpk2bVJ06dVTbtm21A+S/vBYnydaoUSPVu3dvPWZJ5PPnz2r9+vVq586dzp74SFqQSazQv379qtas
+						WaM+fvzo7FF6QBnYY8eOOXv+/9zChQvV+/fvnT3pAwFMnjxZ3bhxw9mj9Bgh9HPnzjl7lLpy5YqaNWuW+vHjh7MnGVy/fl0NHjxYDR06
+						VItv+/btzjvxkbQgk1ih37lzJ8tzb9myRTVr1ky/J3z69Ek7hG/fvjl70gfjsWPHjhoDhJUrV2rDxEAFPrd69WrnVfJ49+6d6ty5c+xC
+						T2KQSazQiTruyPP79281c+bMrNQTj/vlyxfnVXnDuRJh37x54+wxw969e3WqKTAe1dXVavTo0doYhWvXrqn9+/c7r8zy6tUrderUqZoI
+						GAelEnoSg0yia3Q3eM2uXbuquXPnZkSupPDr1y9tFEuXLtX/jgJiOnDgQIZ4AWG7BSZpJlHdjdd5FgrnfebMmYwSQeA9ro1rjHp9uTAt
+						dMbx8OHDWal3EoNMWQv94cOHauTIkapu3bq6nuzTp4+6evWq824mRKOGDRuqgwcPOnuSA46J1JoISz0dBYySv8fQMfgwLly4oMcTUZqE
+						7yN9DRIa1zZ27NisMsIUpoXO91RVVWX0MfwoJMjgFHB27oZoKWy2LIXOYJFuIvL79+/r13jHPXv2aDFfunTJ+WQtu3bt0u8h+KRx69Yt
+						1aZNG9/ryhciOUbTokULde/ePWevPxs2bFCtW7c22iTC2HHECGPRokXO3my4Py1btizqWoMwKXSCDPeE68G2wigkyGDLZDyHDh3S49C0
+						aVN1+/Zt5934KEuhnz59WneJvdGN+oc6yJtySuqEV01ad/379+9q6tSputsdNaV9+fKlvnaM0lsneqFeHDNmjBo0aJCuIU2A8dKB5vhs
+						8+fPd97Jhns1Z86crP6AsG7duprpvrBt69atzl/UYkro3Ae653I9ub4vSpARWy7VdGbZCf3169dq0qRJ2ni94PnwgN6IwUAxYIgdQ0oS
+						Fy9e1HOvudLDIBDZqlWr9HwuaTNRndQ8CKnPw6JuoRD9EB+GizBwJGENKLnmEydOOHvMYEroZBuMZ5MmTfT1hI1V1CBDd57vdk/HxUnZ
+						CZ2FHaTtfrAfY8ZQ3IgDoPOZJLjB3OguXbqot2/fOnsLg7QfIyNtFKGFOQ2/aaBikOi3bNkyHak5flC0FuJyzCaEThbJ+WOH/JfrCctQ
+						olyL3He+u1Q9pbISOoM2ceJE9eLFC2dPLU+fPtUGvWTJEm1c3BBSXiIJK5AYNFmllJS5YMSNyKMavKT9pM1kQqzIYhzCDB1naLIuJPph
+						6GRgfDfHR2yILggMncaVdx6/WEwInV4HAqes4Ry5njDHFSXIiHPIVWaZJKfQaYKdP39eDRs2THe/ufAOHTqo48ePG58TpcaZN2+eNlrS
+						P+m2051kMNeuXauNu1KQJk7UTOTo0aOqX79+OmUUI/cTOlG/Z8+e2gmyzFXGlNfFLM9kwQir0RAHcNx8hA7UtWQWJpaH0rDt27dvTReb
+						a4zi8J89e6aDiTQKieR+Qi82yEh9PmDAAJ2loifOO2xWqVhChY5XGz9+vO4OUvchbLzxvn37VL169TKmSY4cOVJzw6OCoZDKYCTTp0/X
+						A8fAM6gMjETzSkHqtCjTXIh74MCB6uTJk/q1TK/xfWGppkkk+knTlJKB4+cTqeSzuTrapQI7ptdBSi02VojjKgRsnO/F0S1fvlyPH9pi
+						NgSt4ZhNEyh0Dk5U5aZ5D8x7eHJO6sGDBzUpZDHemQUI1HhBKSWejwZOHNMybtzd4yhbjx498m7KYEhRohpGuXHjRj3mkuG4hV6KRUPe
+						6Aci3nxKA0l5Me5ygOyK66GxKIjQKYlMrVbkvkhJ4A1cMn5xNOh8hU69SHOFg65YscL3oJLW4JHx3izeF88eBepVDJf6xQ9pIpWLYZiA
+						MYxSp929e1cbpffvglJN04h90I129xYkJeUcwhqCIJ8tVfYRBmM1btw4LWw3hWQo+SL1ud86BjleHPfPV+hEaaJ12NygeDtuFJ6JerEY
+						EDLfE+TJZBDKwTBMEUXoRADm3Ino3rESoVdXV8e69BKboOYnqrtxCz1XOVJOQsd2yVDdD6mA2JzJ5qX0ZfzukWiqZEInSnNAPE9QhJWT
+						ogHCvHcx0RyI1GFTPlLXeBfLJJkoQpdlpoxF0Ga6pnTjLhHCNm909FIuQqfx2717d99rcG+5MpR8CbJjMiNmX3jP9LQj+ApdIkNYrSdC
+						N9E8YHHFjBkzAg1eBoHOpKkBLwe42YVEC2p/VrQF9SnEiExPW7mhATdt2jTf2Q9JSzmHXCWWCN3kwp1CwbZpwLH52TmpNSk212Nq3YFo
+						y5vxyFQrNm56IRH4Cp3B52TCpn3cqXuxYJS9evUKFLqUEvy4gDe9Mk2pm3H8TT7OC0Pk3KiNg7y9pJqFZgn5wlz5kCFDdI/AD3e0z2UX
+						8mBNrsgfJwQopo2DGm1hU5ZRYVz87g/3jvGIy8Z9hY5H4aBBafKTJ0/0HCADEBT18VjMZ44YMSJnGslFMl3n58mk+x/XtMO/RIRJJM4F
+						4mLO3FsXuxHxmJqfdoNzwcn49QYEWUfPNfGsQtgjr8VMLZoAp8TUcVhvCYdN05PzNFUy4jC8vS9snAe4vDbOfhaQoaN8bCQMX6EzCHQh
+						ObA8W8zNxaMz79e/f3919uxZPe1AKomQaRDxwwKCpCj5GDKDOHv2bB3VWfwAHI+pjlGjRqlu3boZa4aUEzxlxtNmuYyIVYGMM7MSP3/+
+						dPZmw72iFDAtIBqAu3fv1pEozIEwF8x95Pi5GoKk9vk8aRcH2DfpOv2lx48fO3uzYR3J8OHD9fWYmrKUp+Jkao1jULaiNe/YSnnD8cP6
+						ZfngK3RgMPihPU6AA+FVuHmskpMVcYgdw2IFG08duecE5T28F+uGg+A4OAkGgIhEN5eoxHeyUogH/ytpNZwbPDbd3qAuK+PG2Ls3v5kQ
+						KQH8Nu5hVFiKLMtq3Rv9ErfDkfrS+zk2+gU4KjcS+bn2Ypu4hSA/6Og9R+8YuQXm3VikVOxTf2RnpOjYObpavHixb/nA2EyZMkWvH2F8
+						+WGRqAQK3RRElrD6hoYHXftiBy+pIGacKX2ItCBNrrAAYKmFzIiMopiZlNiFTooU1myiVjNV/yQRSeXSZPSscmzfvn1ov8FSC46RrNcv
+						68uXWIUuD6f4PY0mIPJ/1ZApB6j7aHAxl8t4VTo0uCjJwpp6lkzIiIsNBLEJnXqdHwMM+zF90nXSdu9SwLQhXddKN36ujSlCrrWUtXmS
+						oQtPb6zYtfaxCZ3uOYbrbtB54WbTWAr7TFrA2THrwM9oVSoYLVOEQfPwlkxoevM/ijCxJiL2Gt2SP4gd712JQmAajamqtGdv/wordIsl
+						BVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQLZYUYIVusaQAK3SLJQVYoVssKcAK3WJJAVboFkvFo9R/WJivBGHVOxsAAAAASUVO
+						RK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_edge_direction.svg b/media/libaom/src/doc/img/equ_edge_direction.svg
new file mode 100644
index 0000000000..d36634db1b
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_edge_direction.svg
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.2307in" height="1.4152in"
+		viewBox="0 0 160.61 101.895" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+			<image x="0" y="36.75" width="123.86" height="65.1446" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAASEAAACYCAYAAACrr18SAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AABsPSURBVHhe7Z3nzxRVG4f5B/jCBxI/GBMTQ/xgCDEGgzFANGKE2CIIQRELIh1sAQyCEKkiKHaNitIjVUikWbECooCCImBB
+						RFFUxAKW8+Y6zL3v7LJ9Z3Zm9/ldyQSe2dkyM+f8zt3OmVZOCCESRCIkhEgUiZAQIlEkQkKIRJEICSESRSIkhEgUiZAQIlEkQkKIRJEI
+						CSESRSIkhEgUiZAQIlEkQjXw559/usGDB7tWrVpltk2bNgWvCiHKQSIUAfv27XMdO3Z0nTp1cl9++WWwVwhRDhKhCHj33Xdd69at3cCB
+						A90ff/wR7BVClINEKAKeeOIJ74o9+OCDwR4hRLlIhGrE4kJYQooHCVE5EqEa+fbbb12XLl1afDzo77//dv/++2/wlxDlIxGqgB9//NHN
+						mTPHB6GxfAYMGODWrFnjTjvttBYXD+JcOWfLCnbr1s398MMP7tVXX3VXXnmlvz5t2rRxN954o9u7d687duyYe+yxx/y14/h27dq5cePG
+						uZ9++in4xJP89ddf7qWXXnKXXnqp/wyO5T28l88IM3HixMz328bv4D7NmzfvlNfYVw6cB7+N38j7LrroIrdx40b333//BUeIKJEIlcnW
+						rVvdBRdc4MaPH+87A6P+iy++mOkoxIVaIt9884278MIL/TZ69Gh/TRCo8PW55JJL3GWXXeYWL16csZief/55f93o7P/880/wac4tXbrU
+						77/++uvdr7/+6jv+559/7rp37+6uuOIKd+jQoeDI//Ppp596oTr//PPd/v37/b7Dhw+7Pn36+O/evHlz2VYan3/VVVe5Z555xgsi8H7O
+						b+fOnf5vES0SoTKwRt6vXz/fMYwjR474BktHI0OWFnbv3u3at2/vO3M1GxbM+++/H3xacbA6sD543+zZs7OsBXNVee2pp57Kes1+o1ku
+						xurVq/317Nmzp7++xrp16/zn5IoW8Lnz58/375s6daoXuu+++87dcMMN/t6VC58zc+bMrN/0yy+/uGuuucZbu1u2bPH7CsH7165dmxFC
+						UR4SoRKY20ED37BhQ7D3JNbJ2Ph/1NCoP/zww1g+OypMhM466yz38ccfB3tPYq+dc845bteuXcHekxQSIcDSREjC2PGF3F7eQ4LgjDPO
+						8C7ysGHD3IoVK4JXy8PuNd/zzjvveOuJ34EFt2rVqlN+UxjcxQ4dOnjBQrhE+UiESoDwIEAXX3yxO3jwYLD3JIyMjJCjRo1yx48fD/ZG
+						B7EJvveNN94I9qQPExo6LkIRpthrxUSIzk6mccyYMd4N4vqblVYs9mYWK8eNHTu2qGjkg3vIvbTvQtBuvvlm99FHH2VZcYXAJec351pq
+						ojgSoRJMnjzZN8h8jcvqg+KKByFyxKH27NkT7Ekf1YqQCWyuCJFhxMXFJZw+fbq3ArFITLSKiRBC8dxzz/l7cscdd2RiOpVA/KlHjx4Z
+						IWJDjEqVX1ipxqJFi4I9olwkQkUw85yGOHfu3GDvSazRlRMrqBa+kwDt0aNHgz3po1oRstfCIkS8jbgblg8xnrD1UY4I8X4ylgSo+YxK
+						3TFzAxE9RJL4DrEp7j+ZuGIglohXrksqSiMRKkJYhHJHQosHMZrTYHHVRowYkYkH0KCJE5DV4Tg+h+PgzTff9GnfkSNHeitn+PDh/nMm
+						TJiQeR9/MwITZyDDs3DhQv/ecrAOayN5pVs1gekoRMjcW7umYejcxJ24jqT1CUCT+jcQLALj7N++fbt3y9iwbMphyZIl/rzvvvvuLNea
+						WBYxLSziMFhZCxYs8FYb9weR4lxyf7cojUSoBEzFoGOGRYiR0tLPFg8idsSxdAbSvNdee61P8zKysg9hwUWgA5EpQsSohRk6dKi3dGjU
+						iBHvARoznZGsUJqJQ4TyBfotdY8Icfxdd92V9Zmvv/66T8mbCCDa3B+sVYS9GOHBBhcw7HYjxtQLhUWZe4rYITzcN7Pg4ooNNjsSoRIw
+						kjKiTpo0yTc+GiwjLjUrWCrEirB+sGqoI+EYju3fv39W4ye4jKi89dZbvkMxshN0tawRQkUNEp0GV4/Xu3btmup4EJhYYi3s2LEj2HuS
+						8Gu52TGzMMJWD9cWUUYM7r//fv83go/lSGyM643l8cUXX3jBxvrkelMewesIvblwCEPv3r29EOHWclwhzIoaMmRIxvVln8WHEKbw+3HT
+						qJBn9QSwgHZLrRWrFYlQGWzbts3HBmjQdBxG2RMnTrjly5f7vxkply1b5hsuozMjP0ITho4QHuFXrlyZlc6l8RP/sQA4Ac40p3vD1kN4
+						49yxGrFwcl9j32effVbwNSwcc0cty8W1nTJlihcqOj9/I94IT27FdNjiyldNze/ld+eD/Q899JB3fzkW14x7TgU4QmjYeYetHn4bv79U
+						8FrkRyIUMVg8uTUzNlJaJ0BkEBssHxu5yQoxujK1IN/rIh1YLDCcqMCNxGrVWlLVIRGKGHO7zMUAzHYEhpEcsG6wcrCGDCwnGvKBAwcy
+						r1u6l3lLbCJ5bLAIWz0I0i233OIrvIn3lYpBiWwkQhFDnIIJnFQ6A24WcZ5wXMEyPTaVAeEhkE1wFSxoS0MnyM171bDTAfeBeJ+528TB
+						cB1x/xhsyLKJypAIxQCTXQmgkobPF1fAArr88st9hoX0LrEg3mOuF/8Sd6LehQBs2KpqaXDuzKpPEwSssVSxeCmrYLDo3Lmzr67ON8FW
+						FEciVGcs3mMBaJEfJoEi4ASIw26raD4kQnUmN94jCkMQWFXIzY9EqI7QmXDRSPXjalVSBd0SIQ2P2xpe0kM0HxIhkVo0K71lIBESqYE4
+						EFXLZAYHDRrkK57D9TiiOZEIiVTAWkBM9mXyKVBTxTSNuFYoEOlBIiQSxyaAsrSqlSkgPlhEuRNZRfMhERKJQ51NrtWDG1ZsrpdoHiRC
+						InEIQIenuthcOz3RtmUgERKJg9iErR7ECFFiHh7r+KR5jW1ROxIhkTgsCEcBJ/VALBLGKoa2LMcLL7xwygMGRHMhERKJw8Re1hBinSAm
+						/7JiABNCWaiMCaFazqS5kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohE
+						kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkDgF
+						Hru8detW9+WXXwZ7hIgPiZDIwDPhv/jiC3ffffe5Nm3auE2bNgWvCBEfEiHhdu/e7dq3b+86dOjg+vbt6y644ALXqlUriZCoCxIhkcUf
+						f/zhBg4cKBESdUMiJLKQCIl6IxESWUiERL2RCIksJEKi3kiERBYSIVFvJEIiC4mQqDcSoTx8/PHHrnv37u6SSy6p69a7d+/ECwQbUYSO
+						Hz/uJk6cmPeaxr098cQTvrhTVI9EKA/79+/P1MrY1qtXL7d+/Xr32muvVbStW7fOPfjgg27q1Kl+Gzx4sLvwwgtd69atsz7ftmeeeSb4
+						FcnQiCKECMyePTvrOrZt29YLRL57UmqbP39+5n4hbldeeaVr165d1ufbdvHFF7uDBw8Gv0RUg0SoAK+//ro744wzMo2N/7MvKqhO/uST
+						T9y0adPcOeeck/meq666yh05ciQ4qv40qjt27NgxL/B2Hdn4m/1RwX1ZvXq169OnT9YgsnTp0uCIxoY2uWTJEnf11Ve7jh07uhEjRriv
+						v/46eDU+JEIF4IYwEoYbNdYRVlLU0PFffPFFP9rSuDds2BC8Un8aOSb06aef+s4TvmdYSFG7S3ze3r17Xb9+/fx39O/fP1KxS4pFixa5
+						BQsW+PMzUb/iiivcoUOHgiPiQSJUBEa+nj17ZjXqqEfXMMSDiEUNHz7c/fXXX8He+tLIIgQrVqzIslKitmDDMFDhPvMd77//frC3Mfnl
+						l1/cHXfc4Q4fPhzscf7+cy0RpziRCJUg3+j61FNPxRaM/Oqrr3wMguB4EqRFhBBh4jOVCn4+C7Zr167uwIEDwRHR8u+///rY09ixY/13
+						p5Fff/3VrV27tujAZvMHmbz8zz//ZO0jLhYnEqESIDYEKus1ugIN5t577800hnpy9OhRd9NNN/nzJP4Rl9gWA+GhU3Pdq/l+3AfcCLtf
+						bHFasHTyQYMGuZ07dwZ70oW1Ya5poWvw/fffu8svv9zdfffdPtsIW7ZscaeddpqbN2+e/zsuJEJlwAjHDQw36jh9ZUZXzGP+rQc//vij
+						69atW9b55W5xN0TDLJlJkybVZFlgSZ599tmZ388gUq2olQMWJFtasevKVui6njhxIvMaA+DkyZMVE0oT+UZXfOikYjfNysaNG91FF11U
+						cwKgkAW7efPm4IiWBy4pJQXEzUqBiFMHRTgibiRCFUCMhIZsjTru0bWlYYmAqGJuWCZDhw7N3C+2eozsaYZAOm4X7lchuD633nqrjwnV
+						A4lQBdAxSKWHGzWBu3qMFi0B6m24nlE2/u+++85nHMP3rFZXr5HZt2+fT7QUKoolZsT1+fzzz/3fhAR+//13//+4kAhVCDcptyiOAsMf
+						fvghOEJUg2Xl2KKOreQWnmLBLly4MHi1ZUHQedSoUe7666/3SYgwCPPcuXOzpg7hlilFn0LyTesoFvATpbF0MOnuqMGCzZ3WgTXQUi1Y
+						hOass87KKgOxGNq5556bNTeOJX/feOON4Kh4qFmEMNeYU3XppZf6EYYR55577nE//fRTcERzkm90JbUuqoM5dlzHuBp8PguWimfS62kA
+						EaBNUSNGW+rRo4d79913fZZ03Lhx/sEDzDlkX63wGXzHypUrgz3O7dmzxwtQ+Pqw5YpVHNQkQqaeXDDK2AFTmnQ2Fy6JOpd6gdWTWxTX
+						kkfXWsECYg7drl27gj3Rk6/wNA0WrPWjCRMmeLHkb2KPWIYE6skYEki++eabfXyr1rmFZnWSgk8DNYmQBbnCNSSoKzcXv9OKngpBHOWl
+						l14K/ipNblC40o3MSJSTQxlFbf6QbXEWxTUzVOV26tQp9qVMsFaxAux+Yc3GWXhaDlgaI0eOzGo3WIT8PpvCw0oM/H3NNdd466gW6Hek
+						6seMGZMKQ6EmESJlzYXB8jGzFovo4YcfLjr7lpgKCo+JGTYJG5F8o2uc0zqaEQtKUzBJ4WSc5LNg45zWUQ78ntxJy1iG/DYb4GlnCNH2
+						7dv937VgxalxJAGqoSYRsrJuu5mdO3d2jz32WFmWwLfffuvduLj9zXpApiV3dI3Cd28p1FOEAEuAjKbdLzaWrfjzzz+DI5LFrkdc8Zim
+						EiFGFUQHiyZ8QykQK3VydFKKpqJ0j5Iid3Stl0sWvuaNsuWj3iIEYQs2DS5ZGFxSXNMoXK98NI0I0fGso3EiZi5yQ8spOMPcTItPGgXM
+						fqfhtPSK3Gqw2hXiFPWqt8JdnjNnjrdg01b1bktoxNU/TIQYLNNg/VUlQjSUa6+91gvOtm3bgr0nb+z48eN9qo+UXxjiQEOGDPEnz4xj
+						1lOmXqES0haYNswSUnaseghMR10tXQyzhJLOjtFnWK2A/mDz2iwInRsvJT5kMSJ+86xZs/w5sBAZwkWcldqe5cuXFxVVS8fHvURHuVQl
+						QhaQ5gJYeTcQxSean7soFzec1KIF1chQIGDElBodbjYj6fnnny8BqgE6F/HFerQJm4yctAABsdEuXbr4/kRlMoOkxavoZ4bN57KJvSwb
+						smbNGn8M140Bmpo9MtYM9MWWFSHORLypUiPA4JpNnz7dfy8eTa3WWlUixIXgRFl0yhQXl4ysEAVV4dHM0tgzZ87MHEtD4/3cgEaG87H5
+						TmmKKTQi+Qro4gArnikLaSmlMBEiw8yqhogJAskgbes58ZuJs4bbGMkQ+hkicMstt2Tmd3Esbm2xqRYUhnKtq02emDuHUBKCqLWsouqY
+						EG4YC35bUJry7ilTppzi06PUuVYPCpyWoFgt0CgYUdIWU2hEeGIFnYfJk3FdS0QH8UlT3I5zxX2iUJO+hCuGoFCgaE9lwc3aunXrKdfF
+						Ymm8xzArB6EpBEWKXOtqnxKCJYRRcfrpp7vzzjuv5sXcqhahckGpwwHHfBeuEcH1wgXDLE3apA9j8YTwVqh6HcuVRh4+lr/JeNYbfh+/
+						M66njXCPcL+Y81evuFPcmBUVnuqCJVkstka2jaxbFDMaEMUZM2bUfD1jFyE6RdjqMXORC8fi4HFPjouDNMUUCmGV62zE7ogV5IM4Alki
+						zocMX5IQmGUUj/ppI3SWZozb4V1QaGnukMVkce0KtUv6HEHpKOqPELTcxfGrIXYRokGhvIxuXCRMQVPqF154oeEeHIeIpimmkA86HW4N
+						7jGCjxCFY3K5YK2yJQ0dhw4UpavOOTdr3I57xhK2CArniQtH1rqQq0n/QzSKiVQlYECQYUskMF0JnCzmPf4tM4S5UPxwzGIetFaoY6SR
+						esUUEGhGtGpFjnViyKTQOBEfRKhQDIA6EYKeaanwZhEyilijWpGgHnE7OiFWcT2fF2dhDRamHzZsmE/N42LlxmTD0Pdwd7nGtUL7Z0Jt
+						ODteLbGLULOAmNYjpsD3YMWEJwVXCnUg1113nTeTCRragu/5nhRKXAHLLk2ZStwMhL5W16lecTu+hyRNrW5JJVg8qFgAOgzC07dvX/fO
+						O+8Ee2rj5Zdfdm+++WbwV21IhMqABsxymHHHFOx7SHsWiuGUAw3TVjGwOAEilO9JoVhAWEJpmTdlkA267bbbio7sxSBOQm1a3HE7+55i
+						7m4cINQMiLlFwfngGlIgHM5QpwmJUAloWJjymPRxxhRwofDVyU6ZgFQLyYBwIRpuAp9LcVnuk0LTEg+KEkscxBm3o128/fbbPkVdryJL
+						4+mnn/YlMaT0cV3jtMzrgUSoBHHHFCjmfP75532dCNYKYlFLbIEak9xRL1yFG07Npi0eFAVxx+3IJu7YscPdeOONmfKGZnkWfVJIhIoQ
+						RUwBi4YKU9so8sRdmjZtWmZJXBqybbUUkQHuAfGgXDcGN4/PJz5kxWWVxIMQYILFNm0gF86NJ7cyQtdqyVUL96jWuB3nSerZ7tc333zj
+						Yx8MQohbu3btsu4XW6EnV4jykAgVgEWuGE1zG1zcW62xBSrUScPmioCtghn+Diwg4i6l0uFkNxGXYktLYCEgZkzRScK94/uZ8pAr6nFv
+						xWqwRHlIhPJgMYV8jS7OLV/MplIKxXisIpnvMWurkngQx5VaWgIrjKB6vQtQEVQslXoLEFvuZG1RORKhPOS6UPXaiN0wolcLFg2WTaEY
+						DwKH0NF5sBqIB4VnaheC2BGuSKnnT/G9WAblZGyiJNeFqufW6PMf04BEqIkoFeOhw1CNjAhRPEpmpZx4EMeUsxQv1lJcqwGK5kUi1ESU
+						U/Nj6XqEqND0CNwLFsoio0YlLhXuLN1QrGYH65GANIvaYZngmlG1TQ1NFFW1onmRCDUR5cR4wun6fMdahgnhQYxsPahSGS8EilgTE2fJ
+						vvHZ1NGQTSrH5RMtF4lQk8CjlrA6SBeXiitxDNZQPnEgDR+u2DYLp5S44apRuMfjnp599lkvZsyKv/POO33sRIhCSIQaHNLn+bJCjz/+
+						eHDEqSAwvXr1OiUeZDGjsNWDhYMrVsqaIWjN97K0BNaQivdEuUiEIoSnydIJzzzzzNTO0ykGosSkyPCUD84jvGZNPogBEQsihY/7xvQT
+						VRGLcpEIRQiWBOsC05HLyTqlDavzCVs9CBJrGBNLYiVGhAV3z9Y0BrJhZMVs5j91QlhPuGGvvPJK1hNZ0gZuLFYhsSvmYiG4BOPZKDcY
+						MGBA0acJi9qRCEUMk0fDC483EggMFowt+cEcKToiQWpcONZ/AlZiZN1we0QNdUFMlTDrD3cMty4sXGmGAkysuNzyAuJaWHWs8cy5iHiQ
+						CEWIFfWxemSjQjqdzkima8KECd4q4vHeLGBlE0IRI0SIND5QGsCKftZROY6MGrVFhQon44ZpNwTgyfTZxu/Nl+EzS87KC8JgCRJzS+o8
+						WgISoQixmAqWQLOD9ZPG86Ss4JFHHnEPPPCAn3xarKzA2LVrl1/FILci3Cykej6UsSUiEYoQ3BHcFxo1pjxTI4iN4KIVm3PViLASQD2D
+						74gAywPzpFKsNVaf5NqGFz7jGs+ePbvidZ8QU5Zrya0IZ34dFiETgjU/LD4kQhGC6Y5Z//333/u6GlYHHD16dMHK5EYFd4vzQ2jrAeKC
+						a4WLxbUkRsP/cZ1YEcDWSKJIEuHPdamKwbEIGsuqhJdnxbXk3jENJlwpTiAb1zOf60Yg/vbbb09sKZNGRSIUEWa6k03BFWC0RniIDzWb
+						e0YHq2cnM9FDeHKfs4XwU6SJaKxatcpboZXAipYIDY/BYZoJIseCZawZvWzZsrwWEL+l0GRe7nW4xEGURiIUEXQCOkPbtm39yIz7UMmI
+						LEqTW7Nkwo+IICY8GhnxQEjybbNmzTpFVIhtIUClKsINkg+4Z/km85pVVU83tRmQCEUEIzAz02mc7733nm/Y9XwETEsAoQiXP1hWi46P
+						AFRjCRHbKpX9oi6Kz2ZKyr333lvwyRr8nhEjRvgEBU+joLQhzTVSaUEiFBGY4RZXoJPQWSjeo4KYznPixIngSFENuH/EWgjyG7lPE60m
+						JsTnlSou5XldDz30kP9cXMFCMR+s37vuustbZMSOcO8kQqWRCEUEsR9rnDZCU2PzwQcfuNdeey04SlSLlT/Y00Nxv6jJCj+AgP3MpSvX
+						HbJ4EJ9TaPkTCi2J89lnEgsq5LphVeGO85uUTSsfiVAE0IBZx2fFihX+bzrF4sWLfRUxJrnmUNWOxYPIhhF7I12/fv36U1YMIBlAYoAn
+						mBSqcuYYxAz3GVeMKRtYsR9++GFwxP8h/mQPDiAGxft4gGA+yxarigGHe09VOaIoC7g0EiHREGB9WAC6FAwCuEaPPvqot3JsLhgDQqVZ
+						PcQHS4j0O08x5f9YtzyB4+eff/ZTXBhkcMFZtoRANxYw38W0F9wyURyJkEg9Fg9KYjoMgobQsEbSk08+6WuHEERif1999ZW3zghqU0ZA
+						pg5Bwnri/8QJ61VL1chIhESqYQY7xYHMcOexQzNmzPBuUVrA4vroo4+Cv0Q1SISEqBKsJOKAhWJPojwkQkJUCa7Wb7/9FvwlqkUiJIRI
+						FImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJBnPsfh6LP/cPu
+						K/UAAAAASUVORK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_guided_filter.svg b/media/libaom/src/doc/img/equ_guided_filter.svg
new file mode 100644
index 0000000000..021c194d7a
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_guided_filter.svg
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_guided_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.42115in" height="0.772328in"
+		viewBox="0 0 102.323 55.6076" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+			<image x="0" y="36.75" width="65.5731" height="18.8576" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAJkAAAAsCAYAAAB2Wxp8AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAZ7SURBVHhe7ZvvK15vHMf9A57sgdoDeSTtgSRJ8URtZUWhaPPAj7C2FpOE2APlx2JCnnkg+bGhlZUUTbS2VsuoJUIYkl9h
+						MjHMj8+398e59r3dzn3f577Pdcx2X6866fy4z65znff1vj6fz3XmQwqFxSiRKSxHiUxhOUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXl
+						KJEpLEeJTGE5SmReyM+fP+nr1680NDREg4OD9OXLFz4mODo6ovHxcTo9PdWOmMMrRPb582fy9fUlHx8ft7ZXr15pd/j7OT8/p6mpKcrL
+						yyN/f39KTU2l169f0/v376m9vZ2SkpJoYGCABfbixQveZOFVTra0tESRkZEsIIxgPX78+EFlZWV8TW9vr3b072Z/f5/Ky8vJz8+PiouL
+						aXNzUzvzP3t7e3wuPT3daf94gleJ7MOHD9yBYWFhNDc3px29ihDjp0+ftCPXx+rqKs3Ozmp75oGg4FoBAQH05s0bOjs7085cZWRkhG7f
+						vk0hISE0MzOjHTWPV4msvr6eRYZOx8h1xPb2Nj18+FBqRxsFU7SsaXpjY4OSk5M5VMDUiCnTGRDkvXv3XPaPu3iNyBDYPn78mEVWUVHh
+						tMMhsszMTHaV60aWyE5OTqi0tJSfNycn51Jg74iDgwPKzs7mwSgTrxEZBBMdHc2dbh9r4RxexO7uLu/DAXJzc3/vXyeyRPbu3Tt2MEyT
+						o6Oj2lHnwL3S0tKkhwmGRIZ0NzExkRuNzv/+/bt25gLEMHiBNTU12pGbh8gwg4ODaXp6Wjt6AVJ5jHpnKTucoaenh+7evcv3wctA7LS+
+						vk6PHj3iY+gjs/GUDJHt7OzwNIkB9ezZM84YjQC36+rqYieXiUuRodOysrJoZWWFFY6Gd3d3a2cvgDPguGyblUlTUxO3Eam6rUN9+/aN
+						YmNjnWaSEBgGUHNzM78w7GPKjYqKopSUFJqYmKD5+Xm+z9OnT+nw8FD7pfvIEBkGDUSP58XA+NM4FRlGNlJfZGVAvKi2tjbeB7impKSE
+						j4vrHIFr8XJwracbnNToyBQcHx9Tfn6+7v2wBQYGcvHREZh6qqqqWFwC9AF+W1dXx+0R98df/HueIkNkqHGhLXqu/SdwKrLl5WUemRj5
+						qLVkZGTQnTt3aHJyUruC+BzcASk/ps2biKN4DOl8a2srOxCmGD3gSoWFhZeeWQwsuIWIX/AXTo4+M4NZkdkmOM6e6zoxHPiPjY1xDQUP
+						YJupwAHgBLLTXpmItuuNbIgDgnEWj9mztbVF9+/fZ+G6m4GiLII6FETgyXbr1i2uZzlCmAGutX9XtiDksb+32MxO+fYYFpmoMdmPMtFY
+						mcsQshFTm308BhBn2ceYrhCiNTs16iHTybBy4ao2BhAS4XpnojSDIZHBoeBU9pVg23hM5jKETGzjMXcdyxEQAe6HGFU2ZkVm+06MOJJ4
+						t7jeqsTNkMhETGPvBJjvMe+7WqYRoAOuO/AXUxt+68laJIL9lpYWnoKQiQrRwsngaAI4RmVlpekak1mRAZHto6TiKiYzsp5rFkMiEw1B
+						NRhVYcHw8DB39t8Qj7nKIB0hYk4E+ai1LSwsUERExBVXxxcOT548MR1oyxAZ2iCWk5AZO6O/v58F5sgoMMhevnzJfQjn9mQmMCQyOEdB
+						QQF3LupmGLX4Bgn7aOBNrY+hnR0dHdxGrMnpfX3gCogMCUNDQwO7eHV1NTsEsmxRSYfD4esFCM0sMkQGMABQx8M7snVcgf071ItXAQqz
+						MTExfI2nFQTDgT+q/M+fP+dlCmQ4RUVFXPXGP37T4jGsUAQFBXHb9LYHDx7wJz1GwEjGCMZz456dnZ0cHKMyDvGhL1DxX1xc1H5hDlki
+						AxhU+HwHLoSg/u3bt/z9GP6iiAxhQYAYiJjq9ZIEPD9qgfgGLTw8/FIpxyiGRAaLtLfJtbU1dgdP1a3QR6bIBJg+ESvCDLAaAAezdS2I
+						69evX9qePrimtrbWoy9TXIoMUwAsNS4u7tKaFpYr4AqYrz2ZpxX6YHH+Jg5aiBIhExIpd3EpMrGUZFs9RoCI+T4+Pp47RfHvgyVD1NMs
+						CfyRDoeGhvIiMBALyshesGiu+PeBkeAjCU+/MHEpMgS5jY2NHPQiJU5ISPj9Hw4U3kFfXx99/PhR23MfQ4G/QmEGJTKF5SiRKSxHiUxh
+						OUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXFEP0Ht6gu9OfTLrAAAAAASUVORK5CYII="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/equ_wiener_filter.svg b/media/libaom/src/doc/img/equ_wiener_filter.svg
new file mode 100644
index 0000000000..fcea1c8391
--- /dev/null
+++ b/media/libaom/src/doc/img/equ_wiener_filter.svg
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_wiener_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.45687in" height="0.790186in"
+		viewBox="0 0 104.895 56.8934" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+			<title>Sheet.1</title>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+			<image x="0" y="36.75" width="68.1446" height="20.1434" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+						iVBORw0KGgoAAAANSUhEUgAAAJ8AAAAvCAYAAAD90RiVAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+						ARjRyu0AAAWxSURBVHhe7ZrLK25fGMf9AyZmRkpJBjIwIFKKHMXAwG1CopBbyUAUQilCuRQDA3KdGCgpYqCUASUnRSSSW0jI/bZ+fddZ
+						y9ney7bfd2/v2s7v+dTqeNde+z1r7fV9nvVd691+jCAUQeIjlEHiI5RB4iOUQeIjlEHiI5RB4iOUQeKzKbe3t6yvr4+Nj4+Lmn8PEp/N
+						+P37N8vNzWV5eXksLCyMjYyMiCv/HiQ+m3JxccF+/fpF4iN8D4mPUAaJzwDLy8vM39+f+fn5eVTs+lBXV1dZYGCgyz4HBQWxtbU13q6n
+						p4cFBAS4bJeens6urq54O2/xVHyHh4csMTHRqS9zc3OihT7r6+vcY2rvjY+PZ1tbW6KF9ViW+fb391lMTIzugK+vr1lDQwNvMzU1JWrt
+						iXY8k5OTotYZjBVtIiIiLJ0obzPf/f09Ky4uZtHR0bxfQ0ND4op7sLMuKSlhoaGh/J6ZmRlx5XuxTHyLi4u845GRkWxnZ0fUOiMndWlp
+						SdTYE5nRw8PD2ebmpqh1pr+/n487JyeH3dzciNo/TE9Ps6SkpC9La2sre35+Fnf9wVvxHR0dsezsbFZfX8/71dTUJK64B8c5tbW1LC4u
+						7sv5sxLLxNfZ2el2ErTgoeLhfGc6twIpqpSUFHZ5eSlqP/P09MQqKyt5u+bmZvb+/i6umMdb8cE2lJWVfWRkZLSHhwdx1Zm9vT1WWlrK
+						28NufDV/VmKJ+JDqi4qKDE0CHmp+fj6PULuiFRX+xWdXnJ+fs+TkZN7OqLcyirfiwzKLRAAPFxISous/X15eWGNjI1+FZLC1tLSIq9+P
+						JeKDkGBO0XlHL4driET5AE5PT1l5eblpQ/6dnJ2dfZh3Pc8kJ9hqvwe8EZ8MGlggaW8wL+4CfWFhgQsVyUMGm9VBpIcl4tPzR/Pz89xP
+						vL6+ihr7I3e8KPjbHRMTE3zCrNjdSnZ3d1lmZib3X3im2ATAF3Z3d4sW7kEmxi8jEJ4Ur7vAQIBVVVWxk5OTj+TxHUGkhyXikynbcRLw
+						IOGZ7L6zdQTZBuPRyxqwFnLnbrXf8xZkYqwqd3d3n6yQ4+YOfYWYZ2dn+WcZbFYGkRFMi0/rj1wVLEt4KJ6CTIlJdfWdRgsm4vHxUXyj
+						MfD/1tTUuPw+d8UuwYVMjGUUaMeBei2Yj7q6uo9nI5OHr4PItPjc+b23tzc2ODiou1u0I9pNhJ7fktniq6MYXyHFpvVs8gQC4pLgTK+6
+						upptb2/zz9rk4esgMi0+vUlAuscD+Ul+T24ivvJ7cmn29VLlDvQBh8vwexLZR9gDZDSUsbExXmSGk8lDRRCZFh92g+4mYWBgwCnl2x05
+						Ych+yIKu0GYLuwQXgqawsPDTGR2CH32E94MHRLZD1kP2k8jNooogMiU+O06CGbQ+CePC+FyhXZrtElxYMh3P6GQWh/U5Pj7mPs/Rf0u/
+						p2L+TIlPOwlW+wU8CF9vODz1e95upqxGBo3jGZ0860Pp7e3lK5F2Q4FfPvALCMarIohMic9uk2AWmSmM+j29pdmXYLksKChwOqPDRg9Z
+						D33NyMjgn7VIv2dk/vDdsbGxLDU1lR+hWYHX4kMEDQ8P84Hh1wAcWv5kjI4HmbSiooK301uafQVOFbDqoM/azQaQZ314FWxlZUXU/gWe
+						EH7PyPzJgEORxzlm8Vh8eJ9NvnrjqmRlZfFXp34KGA8mx9VYtO/v4dgIE+WqnYrjJIglLS3NqS9RUVGfMhN8YFtb24efOzg4YAkJCU73
+						oWjH6wgyH+4LDg7m3hGiN4upZZf4/4HjmI6ODvHJHCQ+wiOwxOttxjyBxEcYRr79srGxIWrMQeIjDIEN2ejoKGtvb7fsPJDERxgCr2h1
+						dXV9+nXELCQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhGM/Qd0+F8Wgj3WpQAAAABJRU5E
+						rkJggg=="/>
+			<rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/inter_motion_field.svg b/media/libaom/src/doc/img/inter_motion_field.svg
new file mode 100644
index 0000000000..091ae11f35
--- /dev/null
+++ b/media/libaom/src/doc/img/inter_motion_field.svg
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_motion_field.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.60417in" height="1.72563in"
+		viewBox="0 0 403.5 124.245" xml:space="preserve" color-interpolation-filters="sRGB" class="st21">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:url(#ptrn11-12_10);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st3 {marker-start:url(#mrkr5-20);stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {fill:#923931;fill-opacity:1;stroke:#923931;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st5 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+		.st6 {fill:#923931;font-family:Arial;font-size:0.666664em}
+		.st7 {baseline-shift:-32.4939%;font-size:0.649878em}
+		.st8 {stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st9 {marker-start:url(#mrkr10-32);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st10 {fill:url(#ptrn17-38_36);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st11 {marker-end:url(#mrkr10-44);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st12 {marker-end:url(#mrkr10-56);stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st13 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st14 {marker-start:url(#mrkr5-62);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st15 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st16 {fill:#0070c0;font-family:Arial;font-size:0.666664em}
+		.st17 {marker-end:url(#mrkr10-70);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st18 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st19 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st20 {fill:#000000;font-family:Arial;font-size:0.499992em}
+		.st21 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Patterns_And_Gradients">
+		<pattern id="ptrn11-12" v:fillPattern="11" v:foreground="#002060" v:background="#ffffff" patternUnits="userSpaceOnUse"
+				width="6" height="6" viewBox="0 0 64 64">
+			<image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+					xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAgAAAAICAYAAADED76LAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAAgSURBVChTY2BQSPiPDDD4UBongCvAZRJhEwbcDf//AwDDKesZBFJo/QAAAABJRU5ErkJggg=="/>
+		</pattern>
+		<pattern id="ptrn17-38" v:fillPattern="17" v:foreground="#923931" v:foregroundOpacity="0.47" v:background="#ffffff"
+				v:backgroundOpacity="0.47" patternUnits="userSpaceOnUse" width="6" height="6" viewBox="0 0 64 64">
+			<image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+					xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAgAAAAICAYAAADED76LAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAAlSURBVChTY5hoaVj+798/MAax0fkMuCRgfMIKcEnA+APvBsNyAMXonGF0YUavAAAAAElFTkSuQmCC"/>
+		</pattern>
+	</defs>
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-20" class="st4" v:arrowType="5" v:arrowSize="1" v:setback="5.47" refX="5.47" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(3.4) "/>
+		</marker>
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-32" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.07" refX="2.07" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.4) "/>
+		</marker>
+		<marker id="mrkr10-44" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr10-56" class="st13" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr5-62" class="st15" v:arrowType="5" v:arrowSize="0" v:setback="4.63" refX="4.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-70" class="st18" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(47.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(155.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(-60.87,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.3</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(26.88,31.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(134.88,49.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.5</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<pattern id="ptrn11-12_10" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+					xlink:href="#ptrn11-12"/>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st2"/>
+		</g>
+		<g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(-81.12,13.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(56.8008,-95.4345) rotate(9.46232)">
+			<title>Sheet.7</title>
+			<desc>MVref</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.0586" cy="124.245" width="82.12" height="0"/>
+			<path d="M4.1 124.25 L4.46 124.25 L82.12 124.25" class="st3"/>
+			<rect v:rectContext="textBkgnd" x="32.0251" y="118.245" width="18.067" height="12.0287" class="st5"/>
+			<text x="32.03" y="127.25" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+						dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text>		</g>
+		<g id="shape9-24" v:mID="9" v:groupContext="shape" transform="translate(164.801,-77.4345) rotate(9.4623)">
+			<title>Sheet.9</title>
+			<path d="M0 124.25 L82.12 124.25" class="st8"/>
+		</g>
+		<g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(123.949,167.675) rotate(-170.538)">
+			<title>Sheet.12</title>
+			<path d="M1.55 124.25 L1.91 124.25 L27.37 124.25" class="st9"/>
+		</g>
+		<g id="shape13-33" v:mID="13" v:groupContext="shape" transform="translate(263.13,0.12) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+		</g>
+		<g id="shape14-35" v:mID="14" v:groupContext="shape" transform="translate(242.88,67.62) rotate(-90) scale(-1,1)">
+			<title>Parallelogram.14</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<pattern id="ptrn17-38_36" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+					xlink:href="#ptrn17-38"/>
+			<path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st10"/>
+		</g>
+		<g id="shape8-39" v:mID="8" v:groupContext="shape" transform="translate(353.801,-45.9345) rotate(9.46229)">
+			<title>Sheet.8</title>
+			<path d="M0 124.25 L25.71 124.25" class="st11"/>
+		</g>
+		<g id="shape15-45" v:mID="15" v:groupContext="shape" transform="translate(272.557,-59.475) rotate(9.46231)">
+			<title>Sheet.15</title>
+			<desc>MVref</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.1819" cy="124.245" width="82.37" height="0"/>
+			<path d="M0 124.25 L82.36 124.25" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="32.1485" y="119.445" width="18.067" height="10.0769" class="st5"/>
+			<text x="32.15" y="126.64" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+						dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text>		</g>
+		<g id="shape16-51" v:mID="16" v:groupContext="shape" transform="translate(245.314,-64.0156) rotate(9.46229)">
+			<title>Sheet.16</title>
+			<path d="M0 124.25 L25.71 124.25" class="st12"/>
+		</g>
+		<g id="shape17-57" v:mID="17" v:groupContext="shape" transform="translate(163.726,-75.3635) rotate(9.46229)">
+			<title>Sheet.17</title>
+			<desc>MV0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="41.6032" cy="124.245" width="83.21" height="0"/>
+			<path d="M3.47 124.25 L3.83 124.25 L83.21 124.25" class="st14"/>
+			<rect v:rectContext="textBkgnd" x="33.3787" y="119.445" width="16.449" height="9.59985" class="st5"/>
+			<text x="33.38" y="126.64" class="st16" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV0</text>		</g>
+		<g id="shape19-65" v:mID="19" v:groupContext="shape" transform="translate(245.326,-61.7636) rotate(9.46229)">
+			<title>Sheet.19</title>
+			<path d="M0 124.25 L25.71 124.25" class="st17"/>
+		</g>
+		<g id="shape21-71" v:mID="21" v:groupContext="shape" transform="translate(225.375,-0.375)">
+			<title>Sheet.21</title>
+			<desc>Current frame</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+			<rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+			<text x="10.74" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Current frame</text>		</g>
+		<g id="shape22-74" v:mID="22" v:groupContext="shape" transform="translate(331.125,-0.375)">
+			<title>Sheet.22</title>
+			<desc>Reference frame 1 (R1)</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+			<rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+			<text x="4.49" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 1 (R1)</text>		</g>
+		<g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(119.625,-0.375)">
+			<title>Sheet.23</title>
+			<desc>Reference frame 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+			<rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+			<text x="4.41" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 0</text>		</g>
+		<g id="shape24-80" v:mID="24" v:groupContext="shape" transform="translate(0.375,-0.375)">
+			<title>Sheet.24</title>
+			<desc>Reference frame of R1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+			<rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+			<text x="5.65" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame of R1</text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/inter_obmc.svg b/media/libaom/src/doc/img/inter_obmc.svg
new file mode 100644
index 0000000000..a69084b08e
--- /dev/null
+++ b/media/libaom/src/doc/img/inter_obmc.svg
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
+<!-- 由 Microsoft Visio 11.0, SVG Export, v1.0 生成 inter_obmc.svg 页-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.98609in"
+		height="2.98609in" viewBox="0 0 214.998 214.998" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+	<v:documentProperties v:langID="2052" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+		.st2 {fill:#000000;font-family:Times New Roman;font-size:1.16666em}
+		.st3 {fill:#8c8c8c;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+		.st4 {fill:none;fill-rule:evenodd;font-size:12;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>页-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(1.2,-1.2)">
+			<title>工作表.1</title>
+			<desc>4</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="144.132" width="70.87" height="141.732"/>
+			<rect x="0" y="73.2661" width="70.8661" height="141.732" class="st1"/>
+			<text x="31.93" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text>		</g>
+		<g id="shape2-4" v:mID="2" v:groupContext="shape" transform="translate(72.0661,-1.2)">
+			<title>工作表.2</title>
+			<desc>0</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.8661" cy="144.132" width="141.74" height="141.732"/>
+			<rect x="0" y="73.2661" width="141.732" height="141.732" class="st1"/>
+			<text x="67.37" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text>		</g>
+		<g id="shape3-7" v:mID="3" v:groupContext="shape" transform="translate(107.499,-142.932)">
+			<title>工作表.3</title>
+			<desc>2</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="179.565" width="70.87" height="70.8661"/>
+			<rect x="0" y="144.132" width="70.8661" height="70.8661" class="st1"/>
+			<text x="31.93" y="183.77" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text>		</g>
+		<g id="shape4-10" v:mID="4" v:groupContext="shape" transform="translate(178.365,-142.932)">
+			<title>工作表.4</title>
+			<desc>3</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+			<rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+			<text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text>		</g>
+		<g id="shape5-13" v:mID="5" v:groupContext="shape" transform="translate(72.0661,-142.932)">
+			<title>工作表.5</title>
+			<desc>1</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+			<rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+			<text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text>		</g>
+		<g id="shape6-16" v:mID="6" v:groupContext="shape" transform="translate(72.0661,-72.0661)">
+			<title>工作表.6</title>
+			<rect x="0" y="144.132" width="35.4331" height="70.8661" class="st3"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/inter_spatial_mvp.svg b/media/libaom/src/doc/img/inter_spatial_mvp.svg
new file mode 100644
index 0000000000..aa2e88afe8
--- /dev/null
+++ b/media/libaom/src/doc/img/inter_spatial_mvp.svg
@@ -0,0 +1,215 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_spatial_mvp.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="3.50333in" height="3.01208in"
+		viewBox="0 0 252.24 216.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st10">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st3 {marker-end:url(#mrkr5-45);marker-start:url(#mrkr10-43);stroke:#ea700d;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st4 {fill:#ea700d;fill-opacity:1;stroke:#ea700d;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st5 {marker-end:url(#mrkr5-54);marker-start:url(#mrkr10-52);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st6 {fill:#f59d56;fill-opacity:1;stroke:#f59d56;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st7 {marker-end:url(#mrkr5-54);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st8 {marker-end:url(#mrkr5-70);marker-start:url(#mrkr10-68);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+		.st9 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.3315649867374}
+		.st10 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-43" class="st4" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-45" class="st4" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+		<marker id="mrkr10-52" class="st6" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<marker id="mrkr5-54" class="st6" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+		<marker id="mrkr10-68" class="st9" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(3.016) "/>
+		</marker>
+		<marker id="mrkr5-70" class="st9" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape33-1" v:mID="33" v:groupContext="shape" transform="translate(72.12,-0.75)">
+			<title>Square.33</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="72.87" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape38-3" v:mID="38" v:groupContext="shape" transform="translate(72.12,-144.75)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape39-5" v:mID="39" v:groupContext="shape" transform="translate(108.12,-144.75)">
+			<title>Square.39</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape40-7" v:mID="40" v:groupContext="shape" transform="translate(144.12,-144.75)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape41-9" v:mID="41" v:groupContext="shape" transform="translate(180.12,-144.75)">
+			<title>Square.41</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape42-11" v:mID="42" v:groupContext="shape" transform="translate(36.12,-108.75)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape43-13" v:mID="43" v:groupContext="shape" transform="translate(36.12,-72.75)">
+			<title>Square.43</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape44-15" v:mID="44" v:groupContext="shape" transform="translate(36.12,-36.75)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape45-17" v:mID="45" v:groupContext="shape" transform="translate(36.12,-0.75)">
+			<title>Square.45</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape46-19" v:mID="46" v:groupContext="shape" transform="translate(0.12,-108.75)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape47-21" v:mID="47" v:groupContext="shape" transform="translate(0.12,-72.75)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape48-23" v:mID="48" v:groupContext="shape" transform="translate(0.120005,-36.75)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape49-25" v:mID="49" v:groupContext="shape" transform="translate(0.120005,-0.75)">
+			<title>Square.49</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape50-27" v:mID="50" v:groupContext="shape" transform="translate(72.12,-180.75)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape51-29" v:mID="51" v:groupContext="shape" transform="translate(108.12,-180.75)">
+			<title>Square.51</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape52-31" v:mID="52" v:groupContext="shape" transform="translate(144.12,-180.75)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape53-33" v:mID="53" v:groupContext="shape" transform="translate(180.12,-180.75)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape54-35" v:mID="54" v:groupContext="shape" transform="translate(36.12,-144.75)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape55-37" v:mID="55" v:groupContext="shape" transform="translate(90.12,-162.75)">
+			<title>Sheet.55</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st3"/>
+		</g>
+		<g id="shape56-46" v:mID="56" v:groupContext="shape" transform="translate(270.99,90.12) rotate(90)">
+			<title>Sheet.56</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st5"/>
+		</g>
+		<g id="shape58-55" v:mID="58" v:groupContext="shape" transform="translate(-81.3576,28.773) rotate(-38.6598)">
+			<title>Sheet.58</title>
+			<path d="M0 216.87 L223.91 216.87" class="st7"/>
+		</g>
+		<g id="shape59-60" v:mID="59" v:groupContext="shape" transform="translate(216.12,-144.75)">
+			<title>Square.59</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="180.87" width="36" height="36" class="st2"/>
+		</g>
+		<g id="shape60-62" v:mID="60" v:groupContext="shape" transform="translate(54.12,-162.75)">
+			<title>Sheet.60</title>
+			<path d="M1.74 215.13 L2 214.87 L36 180.87 L137.4 180.87" class="st8"/>
+		</g>
+		<g id="shape61-71" v:mID="61" v:groupContext="shape" transform="translate(234.99,90.12) rotate(90)">
+			<title>Sheet.61</title>
+			<path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st8"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/inter_tmvp_positions.svg b/media/libaom/src/doc/img/inter_tmvp_positions.svg
new file mode 100644
index 0000000000..87f8dfa80f
--- /dev/null
+++ b/media/libaom/src/doc/img/inter_tmvp_positions.svg
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tmvp_positions.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.00333in" height="1.51208in"
+		viewBox="0 0 144.24 108.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st4 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape31-1" v:mID="31" v:groupContext="shape" transform="translate(0.12,-0.12)">
+			<title>Square.31</title>
+			<desc>B4</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B4</text>		</g>
+		<g id="shape30-4" v:mID="30" v:groupContext="shape" transform="translate(108.12,-36.12)">
+			<title>Square.30</title>
+			<desc>B6</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B6</text>		</g>
+		<g id="shape32-7" v:mID="32" v:groupContext="shape" transform="translate(108.12,-0.12)">
+			<title>Square.32</title>
+			<desc>B5</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B5</text>		</g>
+		<g id="shape25-10" v:mID="25" v:groupContext="shape" transform="translate(36.12,-36.12)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="36.87" width="72" height="72" class="st3"/>
+		</g>
+		<g id="shape26-12" v:mID="26" v:groupContext="shape" transform="translate(36.12,-72.12)">
+			<title>Square.26</title>
+			<desc>B0</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B0</text>		</g>
+		<g id="shape27-15" v:mID="27" v:groupContext="shape" transform="translate(72.12,-72.12)">
+			<title>Square.27</title>
+			<desc>B1</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B1</text>		</g>
+		<g id="shape28-18" v:mID="28" v:groupContext="shape" transform="translate(36.12,-36.12)">
+			<title>Square.28</title>
+			<desc>B2</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B2</text>		</g>
+		<g id="shape29-21" v:mID="29" v:groupContext="shape" transform="translate(72.12,-36.12)">
+			<title>Square.29</title>
+			<desc>B3</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+			<rect x="0" y="72.87" width="36" height="36" class="st1"/>
+			<text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B3</text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/inter_tx_partition.svg b/media/libaom/src/doc/img/inter_tx_partition.svg
new file mode 100644
index 0000000000..6f853c65d3
--- /dev/null
+++ b/media/libaom/src/doc/img/inter_tx_partition.svg
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52083in" height="2.02083in"
+		viewBox="0 0 325.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {marker-end:url(#mrkr5-22);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-22" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(0.75,-0.75)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape25-3" v:mID="25" v:groupContext="shape" transform="translate(180.75,-0.75)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape26-5" v:mID="26" v:groupContext="shape" transform="translate(180.75,-72.75)">
+			<title>Sheet.26</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape27-8" v:mID="27" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+			<title>Sheet.27</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape28-11" v:mID="28" v:groupContext="shape" transform="translate(252.75,-108.75)">
+			<title>Sheet.28</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape29-14" v:mID="29" v:groupContext="shape" transform="translate(434.25,0.750007) rotate(90)">
+			<title>Sheet.29</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape30-17" v:mID="30" v:groupContext="shape" transform="translate(170.739,-101.283) rotate(-18.4349)">
+			<title>Sheet.30</title>
+			<path d="M0 145.5 L51.2 145.5" class="st4"/>
+		</g>
+		<g id="shape31-23" v:mID="31" v:groupContext="shape" transform="translate(270.75,-126.75)">
+			<title>Sheet.31</title>
+			<path d="M0 145.5 L30.28 145.5" class="st4"/>
+		</g>
+		<g id="shape32-28" v:mID="32" v:groupContext="shape" transform="translate(409.634,121.634) rotate(135)">
+			<title>Sheet.32</title>
+			<path d="M0 145.5 L45.06 145.5" class="st4"/>
+		</g>
+		<g id="shape33-33" v:mID="33" v:groupContext="shape" transform="translate(270.844,-90.8438)">
+			<title>Sheet.33</title>
+			<path d="M0 145.5 L30.18 145.5" class="st4"/>
+		</g>
+		<g id="shape34-38" v:mID="34" v:groupContext="shape" transform="translate(381.705,179.364) rotate(148.992)">
+			<title>Sheet.34</title>
+			<path d="M0 145.5 L99.28 145.5" class="st4"/>
+		</g>
+		<g id="shape35-43" v:mID="35" v:groupContext="shape" transform="translate(216.75,-36.75)">
+			<title>Sheet.35</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/intra_cfl.svg b/media/libaom/src/doc/img/intra_cfl.svg
new file mode 100644
index 0000000000..1153a2845e
--- /dev/null
+++ b/media/libaom/src/doc/img/intra_cfl.svg
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export CfL_prediction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="6.52269in" height="1.90714in"
+		viewBox="0 0 469.634 137.314" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ff00ff;fill-opacity:0;stroke:#000000;stroke-opacity:0;stroke-width:0.75}
+		.st2 {fill:#ffffff;stroke:#000000;stroke-width:0.75}
+		.st3 {fill:#000000;font-family:Calibri;font-size:0.75em}
+		.st4 {marker-end:url(#mrkr4-22);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st5 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st6 {fill:none;stroke:#000000;stroke-width:0.75}
+		.st7 {fill:#000000;font-family:Calibri;font-size:1.99999em}
+		.st8 {fill:#000000;font-family:Calibri;font-size:1.5em}
+		.st9 {fill:none;stroke:none;stroke-width:0.25}
+		.st10 {font-size:1em}
+		.st11 {fill:#000000;font-family:SimSun;font-size:0.75em}
+		.st12 {font-family:Calibri;font-size:1em}
+		.st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend4">
+			<path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr4-22" class="st5" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<v:layer v:name="Flowchart" v:index="0"/>
+		<v:layer v:name="Connector" v:index="1"/>
+		<g id="group5-1" transform="translate(111.581,-86.9232)" v:mID="5" v:groupContext="group" v:layerMember="0">
+			<v:custProps>
+				<v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+				<v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+				<v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+			</v:custProps>
+			<v:userDefs>
+				<v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<title>Tagged process</title>
+			<g id="shape6-2" v:mID="6" v:groupContext="shape" transform="translate(0.566929,0)">
+				<title>Sheet.6</title>
+				<path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+			</g>
+			<g id="shape7-4" v:mID="7" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+				<title>Sheet.7</title>
+			</g>
+			<g id="shape8-6" v:mID="8" v:groupContext="shape" v:layerMember="0">
+				<title>Sheet.8</title>
+				<desc>Sub-Sample</desc>
+				<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+				<v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+				<path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+				<text x="13.81" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Sub-Sample</text>			</g>
+		</g>
+		<g id="group9-9" transform="translate(224.967,-86.9232)" v:mID="9" v:groupContext="group" v:layerMember="0">
+			<v:custProps>
+				<v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+				<v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+				<v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+			</v:custProps>
+			<v:userDefs>
+				<v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<title>Tagged process.9</title>
+			<g id="shape10-10" v:mID="10" v:groupContext="shape" transform="translate(0.566929,0)">
+				<title>Sheet.10</title>
+				<path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+			</g>
+			<g id="shape11-12" v:mID="11" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+				<title>Sheet.11</title>
+			</g>
+			<g id="shape12-14" v:mID="12" v:groupContext="shape" v:layerMember="0">
+				<title>Sheet.12</title>
+				<desc>Average</desc>
+				<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+				<v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+				<path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+				<text x="20.48" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Average</text>			</g>
+		</g>
+		<g id="shape27-17" v:mID="27" v:groupContext="shape" transform="translate(182.447,-97.5531)">
+			<title>Sheet.27</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape28-23" v:mID="28" v:groupContext="shape" transform="translate(295.833,-97.5531)">
+			<title>Sheet.28</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape29-28" v:mID="29" v:groupContext="shape" transform="translate(341.47,-86.9232)">
+			<title>Sheet.29</title>
+			<desc>-</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="7.52" y="133.32" class="st7" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>- </text>		</g>
+		<g id="shape34-31" v:mID="34" v:groupContext="shape" v:layerMember="1" transform="translate(147.014,-101.663)">
+			<title>Dynamic connector</title>
+			<path d="M0 130.79 L0 109.53 L205.65 109.53 L205.65 122.62" class="st4"/>
+		</g>
+		<g id="shape35-36" v:mID="35" v:groupContext="shape" transform="translate(34.2657,-97.5531)">
+			<title>Sheet.35</title>
+			<path d="M0 137.31 L70.27 137.31" class="st4"/>
+		</g>
+		<g id="shape36-41" v:mID="36" v:groupContext="shape" transform="translate(341.329,-43.2697)">
+			<title>Sheet.36</title>
+			<desc>×</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>× </text>		</g>
+		<g id="shape37-44" v:mID="37" v:groupContext="shape" transform="translate(34.2657,-53.5676)">
+			<title>Sheet.37</title>
+			<path d="M0 137.31 L300.06 137.31" class="st4"/>
+		</g>
+		<g id="shape38-49" v:mID="38" v:groupContext="shape" transform="translate(489.499,50.3067) rotate(89.9693)">
+			<title>Sheet.38</title>
+			<path d="M0 137.31 L14.24 137.31" class="st4"/>
+		</g>
+		<g id="shape39-54" v:mID="39" v:groupContext="shape" transform="translate(341.329,-0.75)">
+			<title>Sheet.39</title>
+			<desc>+</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+			<ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+			<text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+  </text>		</g>
+		<g id="shape40-57" v:mID="40" v:groupContext="shape" transform="translate(34.2657,-11.9539)">
+			<title>Sheet.40</title>
+			<path d="M0 137.31 L300.02 137.31" class="st4"/>
+		</g>
+		<g id="shape41-62" v:mID="41" v:groupContext="shape" v:layerMember="1" transform="translate(345.51,-86.9234)">
+			<title>Dynamic connector.41</title>
+			<path d="M7.09 137.31 L7.09 151.53" class="st4"/>
+		</g>
+		<g id="shape74-67" v:mID="74" v:groupContext="shape" v:layerMember="1" transform="translate(345.439,-43.2697)">
+			<title>Dynamic connector.74</title>
+			<path d="M7.09 137.31 L7.09 150.4" class="st4"/>
+		</g>
+		<g id="shape75-72" v:mID="75" v:groupContext="shape" transform="translate(363.722,-11.9551)">
+			<title>Sheet.75</title>
+			<path d="M0 137.31 L35.48 137.31" class="st4"/>
+		</g>
+		<g id="shape78-77" v:mID="78" v:groupContext="shape" transform="translate(3.08465,-17.2788)">
+			<title>Sheet.78</title>
+			<desc>Chroma DC Prediction</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+			<rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+			<text x="30.02" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Chroma DC Prediction</text>		</g>
+		<g id="shape82-80" v:mID="82" v:groupContext="shape" transform="translate(0.25,-60.75)">
+			<title>Sheet.82</title>
+			<desc>Scaling parameter α</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+			<rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+			<text x="33.74" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Scaling parameter α </text>		</g>
+		<g id="shape83-83" v:mID="83" v:groupContext="shape" transform="translate(30.0138,-102.514)">
+			<title>Sheet.83</title>
+			<desc>Luma reconstructed samples</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="43.937" cy="131.314" width="87.88" height="12"/>
+			<rect x="0" y="125.314" width="87.874" height="12" class="st9"/>
+			<text x="7.25" y="128.61" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Luma reconstructed <tspan
+						x="29.03" dy="1.2em" class="st10">samples</tspan></text>		</g>
+		<g id="shape84-87" v:mID="84" v:groupContext="shape" transform="translate(398.518,-5.47437)">
+			<title>Sheet.84</title>
+			<desc>CfL Prediction</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="35.4331" cy="131.314" width="70.87" height="12"/>
+			<rect x="0" y="125.314" width="70.8661" height="12" class="st9"/>
+			<text x="10.04" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>CfL Prediction</text>		</g>
+		<g id="shape85-90" v:mID="85" v:groupContext="shape" transform="translate(354.581,-72.75)">
+			<title>Sheet.85</title>
+			<desc>“AC” contribution</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="40.3937" cy="131.314" width="80.79" height="12"/>
+			<rect x="0" y="125.314" width="80.7874" height="12" class="st9"/>
+			<text x="2.62" y="134.31" class="st11" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>“<tspan class="st12">AC</tspan>”<tspan
+						class="st12"> </tspan><tspan class="st12">contribution</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/intra_directional.svg b/media/libaom/src/doc/img/intra_directional.svg
new file mode 100644
index 0000000000..3a08007a95
--- /dev/null
+++ b/media/libaom/src/doc/img/intra_directional.svg
@@ -0,0 +1,192 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_directional.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.24969in" height="4.20313in"
+		viewBox="0 0 305.978 302.625" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {marker-start:url(#mrkr5-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.37313432835821}
+		.st4 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+		.st5 {fill:#2f4f4f;font-family:Consolas;font-size:0.791656em}
+		.st6 {font-size:1em}
+		.st7 {fill:#ffffff;stroke:none;stroke-linecap:butt}
+		.st8 {marker-end:url(#mrkr5-49);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st9 {marker-end:url(#mrkr5-65);stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st10 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st11 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st12 {fill:#000000;font-family:Calibri;font-size:0.666664em}
+		.st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-8" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.45" refX="4.45" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(2.68) "/>
+		</marker>
+		<marker id="mrkr5-49" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.69" refX="-4.69" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-2.68,-2.68) "/>
+		</marker>
+		<marker id="mrkr5-65" class="st10" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(8.98899,-0.75)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="14.625" width="288" height="288" class="st1"/>
+		</g>
+		<g id="shape5-3" v:mID="5" v:groupContext="shape" transform="translate(222.977,-200.113) rotate(45)">
+			<title>Sheet.5</title>
+			<desc>D135_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+			<path d="M6.68 302.62 L7.03 302.62 L203.65 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="78.3191" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="78.32" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">13</tspan>5_PRED</text>		</g>
+		<g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(8.98899,-144.75)">
+			<title>Sheet.6</title>
+			<desc>H_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="302.625" width="144" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L144 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st4"/>
+			<text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>H<tspan class="st6"
+						v:langID="1033">_PRED</tspan></text>		</g>
+		<g id="shape8-20" v:mID="8" v:groupContext="shape" transform="translate(367.241,-107.423) rotate(66.3706)">
+			<title>Sheet.8</title>
+			<desc>D113_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st7"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">113</tspan>_PRED</text>		</g>
+		<g id="shape9-28" v:mID="9" v:groupContext="shape" transform="translate(130.287,-182.377) rotate(23.6294)">
+			<title>Sheet.9</title>
+			<desc>D157_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">157</tspan>_PRED</text>		</g>
+		<g id="shape10-36" v:mID="10" v:groupContext="shape" transform="translate(-112.309,-56.3771) rotate(-23.6294)">
+			<title>Sheet.10</title>
+			<desc>D203_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+			<rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+			<text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">203</tspan>_PRED</text>		</g>
+		<g id="shape11-44" v:mID="11" v:groupContext="shape" transform="translate(-60.9992,-56.1132) rotate(-45)">
+			<title>Sheet.11</title>
+			<desc>D45_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+			<path d="M0 302.62 L196.61 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="80.9308" y="295.425" width="41.7854" height="14.4001" class="st7"/>
+			<text x="80.93" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D45_PRED</text>		</g>
+		<g id="shape12-52" v:mID="12" v:groupContext="shape" transform="translate(-149.636,157.875) rotate(-90)">
+			<title>Sheet.12</title>
+			<desc>V_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="302.625" width="144" height="0"/>
+			<path d="M0 302.62 L136.96 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st7"/>
+			<text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>V<tspan class="st6"
+						v:langID="1033">_PRED</tspan></text>		</g>
+		<g id="shape13-60" v:mID="13" v:groupContext="shape" transform="translate(-117.687,22.537) rotate(-63.4349)">
+			<title>Sheet.13</title>
+			<path d="M0 302.62 L155.27 302.62" class="st9"/>
+		</g>
+		<g id="shape14-66" v:mID="14" v:groupContext="shape" transform="translate(-110.772,9.50969) rotate(-60.6422)">
+			<title>Sheet.14</title>
+			<path d="M0 302.62 L159.5 302.62" class="st9"/>
+		</g>
+		<g id="shape15-71" v:mID="15" v:groupContext="shape" transform="translate(-103.636,-2.51593) rotate(-57.9946)">
+			<title>Sheet.15</title>
+			<path d="M0 302.62 L164.09 302.62" class="st9"/>
+		</g>
+		<g id="shape16-76" v:mID="16" v:groupContext="shape" transform="translate(-130.368,51.6163) rotate(-69.444)">
+			<title>Sheet.16</title>
+			<path d="M0 302.62 L148.07 302.62" class="st9"/>
+		</g>
+		<g id="shape17-81" v:mID="17" v:groupContext="shape" transform="translate(-135.861,67.6095) rotate(-72.646)">
+			<title>Sheet.17</title>
+			<path d="M0 302.62 L145.14 302.62" class="st9"/>
+		</g>
+		<g id="shape18-86" v:mID="18" v:groupContext="shape" transform="translate(-140.6,84.4777) rotate(-75.9638)">
+			<title>Sheet.18</title>
+			<path d="M0 302.62 L142.71 302.62" class="st9"/>
+		</g>
+		<g id="shape30-91" v:mID="30" v:groupContext="shape" transform="translate(-124.263,36.5772) rotate(-66.3706)">
+			<title>Sheet.30</title>
+			<desc>D67_PRED</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+			<path d="M0 302.62 L150.14 302.62" class="st8"/>
+			<rect v:rectContext="textBkgnd" x="57.6964" y="295.425" width="41.7854" height="14.4001" class="st4"/>
+			<text x="57.7" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+						v:langID="2052">67</tspan>_PRED</text>		</g>
+		<g id="shape31-99" v:mID="31" v:groupContext="shape" transform="translate(214.864,-288.75)">
+			<title>Sheet.31</title>
+			<desc>+1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+1</text>		</g>
+		<g id="shape32-102" v:mID="32" v:groupContext="shape" transform="translate(224.989,-288.75)">
+			<title>Sheet.32</title>
+			<desc>+2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+2</text>		</g>
+		<g id="shape33-105" v:mID="33" v:groupContext="shape" transform="translate(238.489,-288.75)">
+			<title>Sheet.33</title>
+			<desc>+3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+3</text>		</g>
+		<g id="shape34-108" v:mID="34" v:groupContext="shape" transform="translate(197.989,-288.75)">
+			<title>Sheet.34</title>
+			<desc>-1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-1</text>		</g>
+		<g id="shape35-111" v:mID="35" v:groupContext="shape" transform="translate(188.989,-288.75)">
+			<title>Sheet.35</title>
+			<desc>-2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-2</text>		</g>
+		<g id="shape36-114" v:mID="36" v:groupContext="shape" transform="translate(177.739,-288.75)">
+			<title>Sheet.36</title>
+			<desc>-3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+			<rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+			<text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-3</text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/intra_paeth.svg b/media/libaom/src/doc/img/intra_paeth.svg
new file mode 100644
index 0000000000..f7a831febb
--- /dev/null
+++ b/media/libaom/src/doc/img/intra_paeth.svg
@@ -0,0 +1,181 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_paeth.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.52083in" height="2.52083in"
+		viewBox="0 0 181.5 181.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+		.st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+		.st5 {font-size:1em}
+		.st6 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:2.25}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape211-1" v:mID="211" v:groupContext="shape" transform="translate(0.375,-73.125)">
+			<title>Square.211</title>
+			<desc>L</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="15.48" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>L</text>		</g>
+		<g id="shape212-4" v:mID="212" v:groupContext="shape" transform="translate(108.375,-145.125)">
+			<title>Square.212</title>
+			<desc>T</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="15.08" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>T</text>		</g>
+		<g id="shape213-7" v:mID="213" v:groupContext="shape" transform="translate(0.375007,-145.125)">
+			<title>Square.213</title>
+			<desc>TL</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+			<path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+			<text x="12.55" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>TL</text>		</g>
+		<g id="group214-10" transform="translate(36.375,-1.12501)" v:mID="214" v:groupContext="group">
+			<title>Sheet.214</title>
+			<g id="shape183-11" v:mID="183" v:groupContext="shape" transform="translate(6.86646E-06,-108)">
+				<title>Square.183</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape184-13" v:mID="184" v:groupContext="shape" transform="translate(36,-108)">
+				<title>Square.184</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape185-15" v:mID="185" v:groupContext="shape" transform="translate(72,-108)">
+				<title>Square.185</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape186-17" v:mID="186" v:groupContext="shape" transform="translate(108,-108)">
+				<title>Square.186</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape199-19" v:mID="199" v:groupContext="shape" transform="translate(1.37329E-05,-72)">
+				<title>Square.199</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape200-21" v:mID="200" v:groupContext="shape" transform="translate(36,-72)">
+				<title>Square.200</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape201-23" v:mID="201" v:groupContext="shape" transform="translate(72,-72)">
+				<title>Square.201</title>
+				<desc>Current Pixel</desc>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<v:textBlock v:margins="rect(0,0,0,0)"/>
+				<v:textRect cx="18" cy="163.5" width="36" height="36"/>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+				<text x="2.43" y="160.5" class="st4" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Current <tspan
+							x="8.47" dy="1.2em" class="st5">Pixel</tspan></text>			</g>
+			<g id="shape202-27" v:mID="202" v:groupContext="shape" transform="translate(108,-72)">
+				<title>Square.202</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape203-29" v:mID="203" v:groupContext="shape" transform="translate(0,-36)">
+				<title>Square.203</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape204-31" v:mID="204" v:groupContext="shape" transform="translate(36,-36)">
+				<title>Square.204</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape205-33" v:mID="205" v:groupContext="shape" transform="translate(72,-36)">
+				<title>Square.205</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape206-35" v:mID="206" v:groupContext="shape" transform="translate(108,-36)">
+				<title>Square.206</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape207-37" v:mID="207" v:groupContext="shape" transform="translate(6.86646E-06,0)">
+				<title>Square.207</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape208-39" v:mID="208" v:groupContext="shape" transform="translate(36,0)">
+				<title>Square.208</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape209-41" v:mID="209" v:groupContext="shape" transform="translate(72,0)">
+				<title>Square.209</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+			<g id="shape210-43" v:mID="210" v:groupContext="shape" transform="translate(108,0)">
+				<title>Square.210</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="145.5" width="36" height="36" class="st3"/>
+			</g>
+		</g>
+		<g id="shape215-45" v:mID="215" v:groupContext="shape" transform="translate(36.375,-1.125)">
+			<title>Square.215</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="37.5" width="144" height="144" class="st6"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/intra_recursive.svg b/media/libaom/src/doc/img/intra_recursive.svg
new file mode 100644
index 0000000000..adc4193169
--- /dev/null
+++ b/media/libaom/src/doc/img/intra_recursive.svg
@@ -0,0 +1,710 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_recursive.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52015in" height="4.46693in"
+		viewBox="0 0 325.45 321.619" xml:space="preserve" color-interpolation-filters="sRGB" class="st9">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {marker-end:url(#mrkr10-184);marker-start:url(#mrkr10-182);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st6 {marker-end:url(#mrkr10-235);marker-start:url(#mrkr10-233);stroke:#bf9000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st7 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.34246575342466}
+		.st8 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.29411764705882}
+		.st9 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend10">
+			<path
+					d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+					style="stroke:none"/>
+		</g>
+		<marker id="mrkr10-182" class="st4" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-184" class="st5" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+		<marker id="mrkr10-233" class="st7" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(2.92) "/>
+		</marker>
+		<marker id="mrkr10-235" class="st8" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="group149-1" transform="translate(0.12,-214.583)" v:mID="149" v:groupContext="group">
+			<title>Sheet.149</title>
+			<g id="shape142-2" v:mID="142" v:groupContext="shape" transform="translate(0,-71.2776)">
+				<title>Square.142</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape143-4" v:mID="143" v:groupContext="shape" transform="translate(36.0645,-71.2776)">
+				<title>Square.143</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape144-6" v:mID="144" v:groupContext="shape" transform="translate(72.129,-71.2776)">
+				<title>Square.144</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape145-8" v:mID="145" v:groupContext="shape" transform="translate(108.193,-71.2776)">
+				<title>Square.145</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape146-10" v:mID="146" v:groupContext="shape" transform="translate(144.258,-71.2776)">
+				<title>Square.146</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape147-12" v:mID="147" v:groupContext="shape" transform="translate(0,-35.6388)">
+				<title>Square.147</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+			<g id="shape148-14" v:mID="148" v:groupContext="shape">
+				<title>Square.148</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+			</g>
+		</g>
+		<g id="group64-16" transform="translate(36.1845,-214.583)" v:mID="64" v:groupContext="group">
+			<title>Sheet.64</title>
+			<g id="shape38-17" v:mID="38" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group63-19" v:mID="63" v:groupContext="group">
+				<title>Sheet.63</title>
+				<g id="shape46-20" v:mID="46" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape47-22" v:mID="47" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape48-24" v:mID="48" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape49-26" v:mID="49" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape50-28" v:mID="50" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape51-30" v:mID="51" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape52-32" v:mID="52" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape53-34" v:mID="53" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group65-36" transform="translate(180.442,-214.583)" v:mID="65" v:groupContext="group">
+			<title>Sheet.65</title>
+			<g id="shape66-37" v:mID="66" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group67-39" v:mID="67" v:groupContext="group">
+				<title>Sheet.67</title>
+				<g id="shape68-40" v:mID="68" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape69-42" v:mID="69" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape70-44" v:mID="70" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape71-46" v:mID="71" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape72-48" v:mID="72" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape73-50" v:mID="73" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape74-52" v:mID="74" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape75-54" v:mID="75" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group76-56" transform="translate(36.1845,-143.305)" v:mID="76" v:groupContext="group">
+			<title>Sheet.76</title>
+			<g id="shape77-57" v:mID="77" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group78-59" v:mID="78" v:groupContext="group">
+				<title>Sheet.78</title>
+				<g id="shape79-60" v:mID="79" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape80-62" v:mID="80" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape81-64" v:mID="81" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape82-66" v:mID="82" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape83-68" v:mID="83" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape84-70" v:mID="84" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape85-72" v:mID="85" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape86-74" v:mID="86" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group87-76" transform="translate(180.442,-143.305)" v:mID="87" v:groupContext="group">
+			<title>Sheet.87</title>
+			<g id="shape88-77" v:mID="88" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group89-79" v:mID="89" v:groupContext="group">
+				<title>Sheet.89</title>
+				<g id="shape90-80" v:mID="90" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape91-82" v:mID="91" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape92-84" v:mID="92" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape93-86" v:mID="93" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape94-88" v:mID="94" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape95-90" v:mID="95" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape96-92" v:mID="96" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape97-94" v:mID="97" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group98-96" transform="translate(36.1845,-72.0276)" v:mID="98" v:groupContext="group">
+			<title>Sheet.98</title>
+			<g id="shape99-97" v:mID="99" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group100-99" v:mID="100" v:groupContext="group">
+				<title>Sheet.100</title>
+				<g id="shape101-100" v:mID="101" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape102-102" v:mID="102" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape103-104" v:mID="103" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape104-106" v:mID="104" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape105-108" v:mID="105" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape106-110" v:mID="106" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape107-112" v:mID="107" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape108-114" v:mID="108" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group109-116" transform="translate(180.442,-72.0276)" v:mID="109" v:groupContext="group">
+			<title>Sheet.109</title>
+			<g id="shape110-117" v:mID="110" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group111-119" v:mID="111" v:groupContext="group">
+				<title>Sheet.111</title>
+				<g id="shape112-120" v:mID="112" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape113-122" v:mID="113" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape114-124" v:mID="114" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape115-126" v:mID="115" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape116-128" v:mID="116" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape117-130" v:mID="117" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape118-132" v:mID="118" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape119-134" v:mID="119" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group120-136" transform="translate(36.1845,-0.75)" v:mID="120" v:groupContext="group">
+			<title>Sheet.120</title>
+			<g id="shape121-137" v:mID="121" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group122-139" v:mID="122" v:groupContext="group">
+				<title>Sheet.122</title>
+				<g id="shape123-140" v:mID="123" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape124-142" v:mID="124" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape125-144" v:mID="125" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape126-146" v:mID="126" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape127-148" v:mID="127" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape128-150" v:mID="128" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape129-152" v:mID="129" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape130-154" v:mID="130" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="group131-156" transform="translate(180.442,-0.75)" v:mID="131" v:groupContext="group">
+			<title>Sheet.131</title>
+			<g id="shape132-157" v:mID="132" v:groupContext="shape">
+				<title>Rectangle</title>
+				<v:userDefs>
+					<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+				</v:userDefs>
+				<rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+			</g>
+			<g id="group133-159" v:mID="133" v:groupContext="group">
+				<title>Sheet.133</title>
+				<g id="shape134-160" v:mID="134" v:groupContext="shape" transform="translate(0,-35.6388)">
+					<title>Square.46</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape135-162" v:mID="135" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+					<title>Square.47</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape136-164" v:mID="136" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+					<title>Square.48</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape137-166" v:mID="137" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+					<title>Square.49</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape138-168" v:mID="138" v:groupContext="shape">
+					<title>Square.50</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape139-170" v:mID="139" v:groupContext="shape" transform="translate(36.0645,0)">
+					<title>Square.51</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape140-172" v:mID="140" v:groupContext="shape" transform="translate(72.129,0)">
+					<title>Square.52</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+				<g id="shape141-174" v:mID="141" v:groupContext="shape" transform="translate(108.193,0)">
+					<title>Square.53</title>
+					<v:userDefs>
+						<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+					</v:userDefs>
+					<rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+				</g>
+			</g>
+		</g>
+		<g id="shape150-176" v:mID="150" v:groupContext="shape" transform="translate(244.217,-210.826) rotate(44.6598)">
+			<title>Sheet.150</title>
+			<path d="M1.28 321.62 L1.64 321.62 L49.49 321.62" class="st3"/>
+		</g>
+		<g id="shape151-185" v:mID="151" v:groupContext="shape" transform="translate(-266.901,54.0731) rotate(-90)">
+			<title>Sheet.151</title>
+			<path d="M1.28 321.62 L1.64 321.62 L34.22 321.62" class="st3"/>
+		</g>
+		<g id="shape152-192" v:mID="152" v:groupContext="shape" transform="translate(319.501,243.543) rotate(134.544)">
+			<title>Sheet.152</title>
+			<path d="M1.28 321.62 L1.64 321.62 L48.79 321.62" class="st3"/>
+		</g>
+		<g id="shape153-199" v:mID="153" v:groupContext="shape" transform="translate(271.203,305.09) rotate(153.231)">
+			<title>Sheet.153</title>
+			<path d="M1.28 321.62 L1.64 321.62 L78.31 321.62" class="st3"/>
+		</g>
+		<g id="shape154-206" v:mID="154" v:groupContext="shape" transform="translate(264.717,322.853) rotate(161.452)">
+			<title>Sheet.154</title>
+			<path d="M1.28 321.62 L1.64 321.62 L111.68 321.62" class="st3"/>
+		</g>
+		<g id="shape155-213" v:mID="155" v:groupContext="shape" transform="translate(18.1522,-267.546)">
+			<title>Sheet.155</title>
+			<path d="M1.28 321.62 L1.64 321.62 L34.65 321.62" class="st3"/>
+		</g>
+		<g id="shape156-220" v:mID="156" v:groupContext="shape" transform="translate(-204.714,-142.665) rotate(-43.8643)">
+			<title>Sheet.156</title>
+			<path d="M1.28 321.62 L1.64 321.62 L48.8 321.62" class="st3"/>
+		</g>
+		<g id="shape157-227" v:mID="157" v:groupContext="shape" transform="translate(388.475,-68.2707) rotate(44.6598)">
+			<title>Sheet.157</title>
+			<path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+		</g>
+		<g id="shape158-236" v:mID="158" v:groupContext="shape" transform="translate(-53.2468,375.362) rotate(-116.517)">
+			<title>Sheet.158</title>
+			<path d="M1.28 321.62 L1.64 321.62 L77.74 321.62" class="st6"/>
+		</g>
+		<g id="shape159-243" v:mID="159" v:groupContext="shape" transform="translate(556.158,160.495) rotate(90)">
+			<title>Sheet.159</title>
+			<path d="M1.28 321.62 L1.64 321.62 L69.37 321.62" class="st6"/>
+		</g>
+		<g id="shape160-250" v:mID="160" v:groupContext="shape" transform="translate(557.58,305.696) rotate(116.838)">
+			<title>Sheet.160</title>
+			<path d="M1.28 321.62 L1.64 321.62 L77.97 321.62" class="st6"/>
+		</g>
+		<g id="shape161-257" v:mID="161" v:groupContext="shape" transform="translate(532.733,389.26) rotate(135.34)">
+			<title>Sheet.161</title>
+			<path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+		</g>
+		<g id="shape162-264" v:mID="162" v:groupContext="shape" transform="translate(303.283,-92.4976) rotate(25.977)">
+			<title>Sheet.162</title>
+			<path d="M1.28 321.62 L1.64 321.62 L78.32 321.62" class="st6"/>
+		</g>
+		<g id="shape163-271" v:mID="163" v:groupContext="shape" transform="translate(162.41,-89.8469)">
+			<title>Sheet.163</title>
+			<path d="M1.28 321.62 L1.64 321.62 L70.22 321.62" class="st6"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/intra_tx_partition.svg b/media/libaom/src/doc/img/intra_tx_partition.svg
new file mode 100644
index 0000000000..69575d4cd7
--- /dev/null
+++ b/media/libaom/src/doc/img/intra_tx_partition.svg
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.02083in" height="2.02083in"
+		viewBox="0 0 505.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st4 {marker-end:url(#mrkr5-36);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend5">
+			<path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr5-36" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-0.75)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape4-3" v:mID="4" v:groupContext="shape" transform="translate(180.75,-0.75)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape5-5" v:mID="5" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+			<title>Sheet.5</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape6-8" v:mID="6" v:groupContext="shape" transform="translate(180.75,-72.75)">
+			<title>Sheet.6</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(360.75,-0.75)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="1.5" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(578.25,0.75) rotate(90)">
+			<title>Sheet.8</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape9-16" v:mID="9" v:groupContext="shape" transform="translate(432,-108.5)">
+			<title>Sheet.9</title>
+			<path d="M0 145.5 L72.75 145.5" class="st3"/>
+		</g>
+		<g id="shape10-19" v:mID="10" v:groupContext="shape" transform="translate(360.75,-72.75)">
+			<title>Sheet.10</title>
+			<path d="M0 145.5 L144 145.5" class="st2"/>
+		</g>
+		<g id="shape11-22" v:mID="11" v:groupContext="shape" transform="translate(360.75,-36.75)">
+			<title>Sheet.11</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape12-25" v:mID="12" v:groupContext="shape" transform="translate(542.25,0.750007) rotate(90)">
+			<title>Sheet.12</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape13-28" v:mID="13" v:groupContext="shape" transform="translate(614.25,0.75) rotate(90)">
+			<title>Sheet.13</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(216.75,-108.75)">
+			<title>Sheet.14</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+		<g id="shape15-37" v:mID="15" v:groupContext="shape" transform="translate(391.634,139.634) rotate(135)">
+			<title>Sheet.15</title>
+			<path d="M0 145.5 L96.1 145.5" class="st4"/>
+		</g>
+		<g id="shape16-42" v:mID="16" v:groupContext="shape" transform="translate(216.75,-36.75)">
+			<title>Sheet.16</title>
+			<path d="M0 145.5 L66.28 145.5" class="st4"/>
+		</g>
+		<g id="shape17-47" v:mID="17" v:groupContext="shape" transform="translate(378.75,-126.75)">
+			<title>Sheet.17</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape18-52" v:mID="18" v:groupContext="shape" transform="translate(378.75,-90.75)">
+			<title>Sheet.18</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape19-57" v:mID="19" v:groupContext="shape" transform="translate(378.75,-54.75)">
+			<title>Sheet.19</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape20-62" v:mID="20" v:groupContext="shape" transform="translate(378.75,-18.75)">
+			<title>Sheet.20</title>
+			<path d="M0 145.5 L102.28 145.5" class="st4"/>
+		</g>
+		<g id="shape21-67" v:mID="21" v:groupContext="shape" transform="translate(532.761,156.783) rotate(161.565)">
+			<title>Sheet.21</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape22-72" v:mID="22" v:groupContext="shape" transform="translate(532.761,192.783) rotate(161.565)">
+			<title>Sheet.22</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(532.761,228.783) rotate(161.565)">
+			<title>Sheet.23</title>
+			<path d="M0 145.5 L108.12 145.5" class="st4"/>
+		</g>
+		<g id="shape36-82" v:mID="36" v:groupContext="shape" transform="translate(360.75,-108.5)">
+			<title>Sheet.36</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape37-85" v:mID="37" v:groupContext="shape" transform="translate(432.75,-36.75)">
+			<title>Sheet.37</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape38-88" v:mID="38" v:groupContext="shape" transform="translate(542.25,72.75) rotate(90)">
+			<title>Sheet.38</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+		<g id="shape39-91" v:mID="39" v:groupContext="shape" transform="translate(614.25,72.75) rotate(90)">
+			<title>Sheet.39</title>
+			<path d="M0 145.5 L72 145.5" class="st3"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/loop_restoration.svg b/media/libaom/src/doc/img/loop_restoration.svg
new file mode 100644
index 0000000000..cdeb76a871
--- /dev/null
+++ b/media/libaom/src/doc/img/loop_restoration.svg
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export loop_restoration.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.47917in" height="2.49905in"
+		viewBox="0 0 394.5 179.932" xml:space="preserve" color-interpolation-filters="sRGB" class="st11">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#bfbfbf;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {marker-end:url(#mrkr4-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st4 {stroke:#000000;stroke-dasharray:0,3.75;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {marker-end:url(#mrkr4-27);stroke:#4bacc6;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+		.st6 {fill:#4bacc6;fill-opacity:1;stroke:#4bacc6;stroke-opacity:1;stroke-width:0.28409090909091}
+		.st7 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st8 {fill:#000000;font-family:Times New Roman;font-size:1.00001em}
+		.st9 {baseline-shift:-32.4941%;font-size:0.649882em}
+		.st10 {font-size:1em}
+		.st11 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<defs id="Markers">
+		<g id="lend4">
+			<path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+		</g>
+		<marker id="mrkr4-8" class="st3" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+		<marker id="mrkr4-27" class="st6" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+				markerUnits="strokeWidth" overflow="visible">
+			<use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+		</marker>
+	</defs>
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<v:layer v:name="Connector" v:index="0"/>
+		<g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+			<title>Parallelogram</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<path d="M0 179.93 L222.58 179.93 L288.29 110.74 L65.71 110.74 L0 179.93 Z" class="st1"/>
+		</g>
+		<g id="shape28-3" v:mID="28" v:groupContext="shape" transform="translate(-95.504,15.1931) rotate(-46.4754)">
+			<title>Sheet.28</title>
+			<path d="M0 179.93 L40.67 179.93" class="st2"/>
+		</g>
+		<g id="shape29-9" v:mID="29" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+			<title>Sheet.29</title>
+			<path d="M0 179.93 L48.37 179.93" class="st2"/>
+		</g>
+		<g id="shape33-14" v:mID="33" v:groupContext="shape" transform="translate(-10.6429,-34.9507) rotate(-14.6817)">
+			<title>Sheet.33</title>
+			<path d="M0 179.93 L180.5 179.93" class="st2"/>
+		</g>
+		<g id="shape36-19" v:mID="36" v:groupContext="shape" transform="translate(36.2288,91.5749) rotate(-90)">
+			<title>Sheet.36</title>
+			<path d="M0 179.93 L57.25 179.93" class="st4"/>
+		</g>
+		<g id="shape37-22" v:mID="37" v:groupContext="shape" transform="translate(-55.1147,-16.6562) rotate(-30.0403)">
+			<title>Sheet.37</title>
+			<path d="M0 179.93 L202.28 179.93" class="st5"/>
+		</g>
+		<g id="shape38-28" v:mID="38" v:groupContext="shape" transform="translate(18.375,-33.5132)">
+			<title>Sheet.38</title>
+			<desc>X</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="5.34375" cy="174.026" width="10.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="10.6875" height="11.8125" class="st7"/>
+			<text x="4" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X</text>		</g>
+		<g id="shape43-31" v:mID="43" v:groupContext="shape" transform="translate(31.875,-69.5132)">
+			<title>Sheet.43</title>
+			<desc>X1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+			<text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">1</tspan></text>		</g>
+		<g id="shape52-35" v:mID="52" v:groupContext="shape" transform="translate(72.375,-20.0132)">
+			<title>Sheet.52</title>
+			<desc>X2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+			<rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+			<text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">2</tspan></text>		</g>
+		<g id="shape53-39" v:mID="53" v:groupContext="shape" transform="translate(205.688,-148.826)">
+			<title>Sheet.53</title>
+			<desc>Y</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="10.6875" cy="174.026" width="21.38" height="11.8125"/>
+			<rect x="0" y="168.119" width="21.375" height="11.8125" class="st7"/>
+			<text x="6.35" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Y</text>		</g>
+		<g id="shape54-42" v:mID="54" v:groupContext="shape" transform="translate(200.625,-60.1114)">
+			<title>Sheet.54</title>
+			<desc>Xr = X + α(X1 – X) + β(X2 – X)</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="87.75" cy="170.932" width="175.5" height="18"/>
+			<rect x="0" y="161.932" width="175.5" height="18" class="st7"/>
+			<text x="12.79" y="174.53" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+						class="st9" v:baseFontSize="12">r </tspan><tspan dy="0.181em" class="st10">= X + </tspan>α(X<tspan
+						dy="-0.279em" class="st9" v:baseFontSize="12">1 </tspan><tspan dy="0.181em" class="st10">–</tspan> X) + β(X<tspan
+						dy="-0.279em" class="st9" v:baseFontSize="12">2 </tspan><tspan dy="0.181em" class="st10">–</tspan> X)  </text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/partition_codingblock.svg b/media/libaom/src/doc/img/partition_codingblock.svg
new file mode 100644
index 0000000000..872692dbd7
--- /dev/null
+++ b/media/libaom/src/doc/img/partition_codingblock.svg
@@ -0,0 +1,225 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export partition_codingblock.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="8.02083in" height="8.51563in"
+		viewBox="0 0 577.5 613.125" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st2 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st3 {fill:#000000;font-family:Consolas;font-size:1.16666em}
+		.st4 {font-size:1em}
+		.st5 {stroke:#0070c0;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+		.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-468.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape6-3" v:mID="6" v:groupContext="shape" transform="translate(216.75,-468.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape15-5" v:mID="15" v:groupContext="shape" transform="translate(432.75,-468.375)">
+			<title>Square.15</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape24-7" v:mID="24" v:groupContext="shape" transform="translate(0.75,-252.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape30-9" v:mID="30" v:groupContext="shape" transform="translate(216.75,-252.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape34-11" v:mID="34" v:groupContext="shape" transform="translate(432.75,-252.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape38-13" v:mID="38" v:groupContext="shape" transform="translate(0.75,-36.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape42-15" v:mID="42" v:groupContext="shape" transform="translate(216.75,-36.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape47-17" v:mID="47" v:groupContext="shape" transform="translate(432.75,-36.375)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="469.125" width="144" height="144" class="st1"/>
+		</g>
+		<g id="shape50-19" v:mID="50" v:groupContext="shape" transform="translate(0.75,-436.875)">
+			<title>Sheet.50</title>
+			<desc>PARTITION_SPLIT</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="14.27" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_SPLIT</text>		</g>
+		<g id="shape51-22" v:mID="51" v:groupContext="shape" transform="translate(216.75,-436.875)">
+			<title>Sheet.51</title>
+			<desc>PARTITION_VERT_4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_4</text>		</g>
+		<g id="shape52-25" v:mID="52" v:groupContext="shape" transform="translate(432.75,-436.875)">
+			<title>Sheet.52</title>
+			<desc>PARTITION_HORZ_4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_4</text>		</g>
+		<g id="shape60-28" v:mID="60" v:groupContext="shape" transform="translate(0.75,-220.875)">
+			<title>Sheet.60</title>
+			<desc>PARTITION_HORZ_B</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_<tspan
+						class="st4" v:langID="2052">B</tspan></text>		</g>
+		<g id="shape61-32" v:mID="61" v:groupContext="shape" transform="translate(216.75,-220.875)">
+			<title>Sheet.61</title>
+			<desc>PARTITION_VERT_A</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_A</text>		</g>
+		<g id="shape62-35" v:mID="62" v:groupContext="shape" transform="translate(432.75,-220.875)">
+			<title>Sheet.62</title>
+			<desc>PARTITION_HORZ_A</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_A</text>		</g>
+		<g id="shape63-38" v:mID="63" v:groupContext="shape" transform="translate(0.75,-0.375)">
+			<title>Sheet.63</title>
+			<desc>PARTITION_VERT_B</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_<tspan
+						class="st4" v:langID="2052">B</tspan></text>		</g>
+		<g id="shape64-42" v:mID="64" v:groupContext="shape" transform="translate(216.75,-0.375)">
+			<title>Sheet.64</title>
+			<desc>PARTITION_HORZ</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ</text>		</g>
+		<g id="shape65-45" v:mID="65" v:groupContext="shape" transform="translate(432.75,-0.375)">
+			<title>Sheet.65</title>
+			<desc>PARTITION_VERT</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+			<rect x="0" y="586.125" width="144" height="27" class="st2"/>
+			<text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_<tspan
+						class="st4" v:langID="2052">VERT</tspan></text>		</g>
+		<g id="shape66-49" v:mID="66" v:groupContext="shape" transform="translate(685.875,0.75) rotate(90)">
+			<title>Sheet.66</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape67-52" v:mID="67" v:groupContext="shape" transform="translate(0.75,-540.375)">
+			<title>Sheet.67</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape68-55" v:mID="68" v:groupContext="shape" transform="translate(865.875,0.750007) rotate(90)">
+			<title>Sheet.68</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape69-58" v:mID="69" v:groupContext="shape" transform="translate(901.875,0.750007) rotate(90)">
+			<title>Sheet.69</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape70-61" v:mID="70" v:groupContext="shape" transform="translate(937.875,0.750007) rotate(90)">
+			<title>Sheet.70</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape71-64" v:mID="71" v:groupContext="shape" transform="translate(432.75,-504.375)">
+			<title>Sheet.71</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape72-67" v:mID="72" v:groupContext="shape" transform="translate(432.75,-540.375)">
+			<title>Sheet.72</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape73-70" v:mID="73" v:groupContext="shape" transform="translate(432.75,-576.375)">
+			<title>Sheet.73</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape74-73" v:mID="74" v:groupContext="shape" transform="translate(0.75,-324.375)">
+			<title>Sheet.74</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape75-76" v:mID="75" v:groupContext="shape" transform="translate(685.875,288.75) rotate(90)">
+			<title>Sheet.75</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape76-79" v:mID="76" v:groupContext="shape" transform="translate(901.875,216.75) rotate(90)">
+			<title>Sheet.76</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape77-82" v:mID="77" v:groupContext="shape" transform="translate(216.75,-324.375)">
+			<title>Sheet.77</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape78-85" v:mID="78" v:groupContext="shape" transform="translate(432.75,-324.375)">
+			<title>Sheet.78</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape79-88" v:mID="79" v:groupContext="shape" transform="translate(1117.88,216.75) rotate(90)">
+			<title>Sheet.79</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape80-91" v:mID="80" v:groupContext="shape" transform="translate(685.875,432.75) rotate(90)">
+			<title>Sheet.80</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape81-94" v:mID="81" v:groupContext="shape" transform="translate(72.75,-108.375)">
+			<title>Sheet.81</title>
+			<path d="M0 613.13 L72 613.13" class="st5"/>
+		</g>
+		<g id="shape82-97" v:mID="82" v:groupContext="shape" transform="translate(216.75,-108.375)">
+			<title>Sheet.82</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+		<g id="shape83-100" v:mID="83" v:groupContext="shape" transform="translate(1117.88,432.75) rotate(90)">
+			<title>Sheet.83</title>
+			<path d="M0 613.13 L144 613.13" class="st5"/>
+		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/primary_tap.svg b/media/libaom/src/doc/img/primary_tap.svg
new file mode 100644
index 0000000000..8cd2a18134
--- /dev/null
+++ b/media/libaom/src/doc/img/primary_tap.svg
@@ -0,0 +1,1589 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export primary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="6.63188in"
+		viewBox="0 0 810.24 477.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em;font-style:italic}
+		.st3 {font-size:1em;font-style:normal}
+		.st4 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-423.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-423.375)">
+			<title>Square.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-423.375)">
+			<title>Square.3</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(126.12,-423.375)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(162.12,-423.375)">
+			<title>Square.5</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(18.12,-387.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(54.12,-387.375)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape8-17" v:mID="8" v:groupContext="shape" transform="translate(90.12,-387.375)">
+			<title>Square.8</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-387.375)">
+			<title>Square.9</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape10-23" v:mID="10" v:groupContext="shape" transform="translate(162.12,-387.375)">
+			<title>Square.10</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape11-25" v:mID="11" v:groupContext="shape" transform="translate(18.12,-351.375)">
+			<title>Square.11</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(54.12,-351.375)">
+			<title>Square.12</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-351.375)">
+			<title>Square.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-351.375)">
+			<title>Square.14</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape15-33" v:mID="15" v:groupContext="shape" transform="translate(162.12,-351.375)">
+			<title>Square.15</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape16-35" v:mID="16" v:groupContext="shape" transform="translate(18.12,-315.375)">
+			<title>Square.16</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape17-37" v:mID="17" v:groupContext="shape" transform="translate(54.12,-315.375)">
+			<title>Square.17</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-315.375)">
+			<title>Square.18</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape19-43" v:mID="19" v:groupContext="shape" transform="translate(126.12,-315.375)">
+			<title>Square.19</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape20-45" v:mID="20" v:groupContext="shape" transform="translate(162.12,-315.375)">
+			<title>Square.20</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape21-47" v:mID="21" v:groupContext="shape" transform="translate(18.12,-279.375)">
+			<title>Square.21</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape22-51" v:mID="22" v:groupContext="shape" transform="translate(54.12,-279.375)">
+			<title>Square.22</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape23-53" v:mID="23" v:groupContext="shape" transform="translate(90.12,-279.375)">
+			<title>Square.23</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-279.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-279.375)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-423.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-423.375)">
+			<title>Square.31</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-423.375)">
+			<title>Square.32</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-423.375)">
+			<title>Square.33</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape34-67" v:mID="34" v:groupContext="shape" transform="translate(360.12,-423.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape35-69" v:mID="35" v:groupContext="shape" transform="translate(216.12,-387.375)">
+			<title>Square.35</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape36-71" v:mID="36" v:groupContext="shape" transform="translate(252.12,-387.375)">
+			<title>Square.36</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape37-73" v:mID="37" v:groupContext="shape" transform="translate(288.12,-387.375)">
+			<title>Square.37</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape38-75" v:mID="38" v:groupContext="shape" transform="translate(324.12,-387.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape39-77" v:mID="39" v:groupContext="shape" transform="translate(360.12,-387.375)">
+			<title>Square.39</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape40-81" v:mID="40" v:groupContext="shape" transform="translate(216.12,-351.375)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape41-83" v:mID="41" v:groupContext="shape" transform="translate(252.12,-351.375)">
+			<title>Square.41</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-351.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-351.375)">
+			<title>Square.43</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape44-93" v:mID="44" v:groupContext="shape" transform="translate(360.12,-351.375)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape45-95" v:mID="45" v:groupContext="shape" transform="translate(216.12,-315.375)">
+			<title>Square.45</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape46-99" v:mID="46" v:groupContext="shape" transform="translate(252.12,-315.375)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape47-101" v:mID="47" v:groupContext="shape" transform="translate(288.12,-315.375)">
+			<title>Square.47</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape48-103" v:mID="48" v:groupContext="shape" transform="translate(324.12,-315.375)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape49-105" v:mID="49" v:groupContext="shape" transform="translate(360.12,-315.375)">
+			<title>Square.49</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape50-107" v:mID="50" v:groupContext="shape" transform="translate(216.12,-279.375)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape51-109" v:mID="51" v:groupContext="shape" transform="translate(252.12,-279.375)">
+			<title>Square.51</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-279.375)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-279.375)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-279.375)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-423.375)">
+			<title>Square.55</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape56-119" v:mID="56" v:groupContext="shape" transform="translate(450.12,-423.375)">
+			<title>Square.56</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape57-121" v:mID="57" v:groupContext="shape" transform="translate(486.12,-423.375)">
+			<title>Square.57</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape58-123" v:mID="58" v:groupContext="shape" transform="translate(522.12,-423.375)">
+			<title>Square.58</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape59-125" v:mID="59" v:groupContext="shape" transform="translate(558.12,-423.375)">
+			<title>Square.59</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape60-127" v:mID="60" v:groupContext="shape" transform="translate(414.12,-387.375)">
+			<title>Square.60</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape61-129" v:mID="61" v:groupContext="shape" transform="translate(450.12,-387.375)">
+			<title>Square.61</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape62-131" v:mID="62" v:groupContext="shape" transform="translate(486.12,-387.375)">
+			<title>Square.62</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape63-133" v:mID="63" v:groupContext="shape" transform="translate(522.12,-387.375)">
+			<title>Square.63</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape64-135" v:mID="64" v:groupContext="shape" transform="translate(558.12,-387.375)">
+			<title>Square.64</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape65-137" v:mID="65" v:groupContext="shape" transform="translate(414.12,-351.375)">
+			<title>Square.65</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape66-141" v:mID="66" v:groupContext="shape" transform="translate(450.12,-351.375)">
+			<title>Square.66</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-351.375)">
+			<title>Square.67</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-351.375)">
+			<title>Square.68</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape69-151" v:mID="69" v:groupContext="shape" transform="translate(558.12,-351.375)">
+			<title>Square.69</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape70-155" v:mID="70" v:groupContext="shape" transform="translate(414.12,-315.375)">
+			<title>Square.70</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape71-157" v:mID="71" v:groupContext="shape" transform="translate(450.12,-315.375)">
+			<title>Square.71</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape72-159" v:mID="72" v:groupContext="shape" transform="translate(486.12,-315.375)">
+			<title>Square.72</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape73-161" v:mID="73" v:groupContext="shape" transform="translate(522.12,-315.375)">
+			<title>Square.73</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape74-163" v:mID="74" v:groupContext="shape" transform="translate(558.12,-315.375)">
+			<title>Square.74</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape75-165" v:mID="75" v:groupContext="shape" transform="translate(414.12,-279.375)">
+			<title>Square.75</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape76-167" v:mID="76" v:groupContext="shape" transform="translate(450.12,-279.375)">
+			<title>Square.76</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape77-169" v:mID="77" v:groupContext="shape" transform="translate(486.12,-279.375)">
+			<title>Square.77</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape78-171" v:mID="78" v:groupContext="shape" transform="translate(522.12,-279.375)">
+			<title>Square.78</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape79-173" v:mID="79" v:groupContext="shape" transform="translate(558.12,-279.375)">
+			<title>Square.79</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-423.375)">
+			<title>Square.80</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-423.375)">
+			<title>Square.81</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape82-179" v:mID="82" v:groupContext="shape" transform="translate(684.12,-423.375)">
+			<title>Square.82</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape83-181" v:mID="83" v:groupContext="shape" transform="translate(720.12,-423.375)">
+			<title>Square.83</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape84-183" v:mID="84" v:groupContext="shape" transform="translate(756.12,-423.375)">
+			<title>Square.84</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape85-185" v:mID="85" v:groupContext="shape" transform="translate(612.12,-387.375)">
+			<title>Square.85</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape86-189" v:mID="86" v:groupContext="shape" transform="translate(648.12,-387.375)">
+			<title>Square.86</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape87-191" v:mID="87" v:groupContext="shape" transform="translate(684.12,-387.375)">
+			<title>Square.87</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-387.375)">
+			<title>Square.88</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-387.375)">
+			<title>Square.89</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape90-197" v:mID="90" v:groupContext="shape" transform="translate(612.12,-351.375)">
+			<title>Square.90</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape91-199" v:mID="91" v:groupContext="shape" transform="translate(648.12,-351.375)">
+			<title>Square.91</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-351.375)">
+			<title>Square.92</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-351.375)">
+			<title>Square.93</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape94-209" v:mID="94" v:groupContext="shape" transform="translate(756.12,-351.375)">
+			<title>Square.94</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape95-211" v:mID="95" v:groupContext="shape" transform="translate(612.12,-315.375)">
+			<title>Square.95</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-315.375)">
+			<title>Square.96</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-315.375)">
+			<title>Square.97</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape98-217" v:mID="98" v:groupContext="shape" transform="translate(720.12,-315.375)">
+			<title>Square.98</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape99-219" v:mID="99" v:groupContext="shape" transform="translate(756.12,-315.375)">
+			<title>Square.99</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape100-223" v:mID="100" v:groupContext="shape" transform="translate(612.12,-279.375)">
+			<title>Square.100</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape101-225" v:mID="101" v:groupContext="shape" transform="translate(648.12,-279.375)">
+			<title>Square.101</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape102-227" v:mID="102" v:groupContext="shape" transform="translate(684.12,-279.375)">
+			<title>Square.102</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape103-229" v:mID="103" v:groupContext="shape" transform="translate(720.12,-279.375)">
+			<title>Square.103</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-279.375)">
+			<title>Square.104</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape115-233" v:mID="115" v:groupContext="shape" transform="translate(18.12,-189.375)">
+			<title>Square.115</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape116-237" v:mID="116" v:groupContext="shape" transform="translate(54.12,-189.375)">
+			<title>Square.116</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape117-239" v:mID="117" v:groupContext="shape" transform="translate(90.12,-189.375)">
+			<title>Square.117</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape118-241" v:mID="118" v:groupContext="shape" transform="translate(126.12,-189.375)">
+			<title>Square.118</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape119-243" v:mID="119" v:groupContext="shape" transform="translate(162.12,-189.375)">
+			<title>Square.119</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape120-245" v:mID="120" v:groupContext="shape" transform="translate(18.12,-153.375)">
+			<title>Square.120</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape121-247" v:mID="121" v:groupContext="shape" transform="translate(54.12,-153.375)">
+			<title>Square.121</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape122-251" v:mID="122" v:groupContext="shape" transform="translate(90.12,-153.375)">
+			<title>Square.122</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape123-253" v:mID="123" v:groupContext="shape" transform="translate(126.12,-153.375)">
+			<title>Square.123</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape124-255" v:mID="124" v:groupContext="shape" transform="translate(162.12,-153.375)">
+			<title>Square.124</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape125-257" v:mID="125" v:groupContext="shape" transform="translate(18.12,-117.375)">
+			<title>Square.125</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape126-259" v:mID="126" v:groupContext="shape" transform="translate(54.12,-117.375)">
+			<title>Square.126</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape127-261" v:mID="127" v:groupContext="shape" transform="translate(90.12,-117.375)">
+			<title>Square.127</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape128-263" v:mID="128" v:groupContext="shape" transform="translate(126.12,-117.375)">
+			<title>Square.128</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape129-265" v:mID="129" v:groupContext="shape" transform="translate(162.12,-117.375)">
+			<title>Square.129</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape130-267" v:mID="130" v:groupContext="shape" transform="translate(18.12,-81.375)">
+			<title>Square.130</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape131-269" v:mID="131" v:groupContext="shape" transform="translate(54.12,-81.375)">
+			<title>Square.131</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape132-271" v:mID="132" v:groupContext="shape" transform="translate(90.12,-81.3749)">
+			<title>Square.132</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape133-273" v:mID="133" v:groupContext="shape" transform="translate(126.12,-81.3749)">
+			<title>Square.133</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape134-277" v:mID="134" v:groupContext="shape" transform="translate(162.12,-81.3749)">
+			<title>Square.134</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape135-279" v:mID="135" v:groupContext="shape" transform="translate(18.12,-45.375)">
+			<title>Square.135</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape136-281" v:mID="136" v:groupContext="shape" transform="translate(54.12,-45.375)">
+			<title>Square.136</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape137-283" v:mID="137" v:groupContext="shape" transform="translate(90.12,-45.375)">
+			<title>Square.137</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape138-285" v:mID="138" v:groupContext="shape" transform="translate(126.12,-45.375)">
+			<title>Square.138</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape139-287" v:mID="139" v:groupContext="shape" transform="translate(162.12,-45.375)">
+			<title>Square.139</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape140-291" v:mID="140" v:groupContext="shape" transform="translate(216.12,-189.375)">
+			<title>Square.140</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape141-293" v:mID="141" v:groupContext="shape" transform="translate(252.12,-189.375)">
+			<title>Square.141</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape142-297" v:mID="142" v:groupContext="shape" transform="translate(288.12,-189.375)">
+			<title>Square.142</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape143-299" v:mID="143" v:groupContext="shape" transform="translate(324.12,-189.375)">
+			<title>Square.143</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape144-301" v:mID="144" v:groupContext="shape" transform="translate(360.12,-189.375)">
+			<title>Square.144</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape145-303" v:mID="145" v:groupContext="shape" transform="translate(216.12,-153.375)">
+			<title>Square.145</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape146-305" v:mID="146" v:groupContext="shape" transform="translate(252.12,-153.375)">
+			<title>Square.146</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape147-307" v:mID="147" v:groupContext="shape" transform="translate(288.12,-153.375)">
+			<title>Square.147</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape148-311" v:mID="148" v:groupContext="shape" transform="translate(324.12,-153.375)">
+			<title>Square.148</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape149-313" v:mID="149" v:groupContext="shape" transform="translate(360.12,-153.375)">
+			<title>Square.149</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape150-315" v:mID="150" v:groupContext="shape" transform="translate(216.12,-117.375)">
+			<title>Square.150</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape151-317" v:mID="151" v:groupContext="shape" transform="translate(252.12,-117.375)">
+			<title>Square.151</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape152-319" v:mID="152" v:groupContext="shape" transform="translate(288.12,-117.375)">
+			<title>Square.152</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape153-321" v:mID="153" v:groupContext="shape" transform="translate(324.12,-117.375)">
+			<title>Square.153</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape154-323" v:mID="154" v:groupContext="shape" transform="translate(360.12,-117.375)">
+			<title>Square.154</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape155-325" v:mID="155" v:groupContext="shape" transform="translate(216.12,-81.3749)">
+			<title>Square.155</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape156-327" v:mID="156" v:groupContext="shape" transform="translate(252.12,-81.3749)">
+			<title>Square.156</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape157-329" v:mID="157" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+			<title>Square.157</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape158-333" v:mID="158" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+			<title>Square.158</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape159-335" v:mID="159" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+			<title>Square.159</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape160-337" v:mID="160" v:groupContext="shape" transform="translate(216.12,-45.3749)">
+			<title>Square.160</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape161-339" v:mID="161" v:groupContext="shape" transform="translate(252.12,-45.3749)">
+			<title>Square.161</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape162-341" v:mID="162" v:groupContext="shape" transform="translate(288.12,-45.3749)">
+			<title>Square.162</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape163-343" v:mID="163" v:groupContext="shape" transform="translate(324.12,-45.3749)">
+			<title>Square.163</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape164-347" v:mID="164" v:groupContext="shape" transform="translate(360.12,-45.3749)">
+			<title>Square.164</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape165-349" v:mID="165" v:groupContext="shape" transform="translate(414.12,-189.375)">
+			<title>Square.165</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape166-351" v:mID="166" v:groupContext="shape" transform="translate(450.12,-189.375)">
+			<title>Square.166</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape167-353" v:mID="167" v:groupContext="shape" transform="translate(486.12,-189.375)">
+			<title>Square.167</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape168-357" v:mID="168" v:groupContext="shape" transform="translate(522.12,-189.375)">
+			<title>Square.168</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape169-359" v:mID="169" v:groupContext="shape" transform="translate(558.12,-189.375)">
+			<title>Square.169</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape170-361" v:mID="170" v:groupContext="shape" transform="translate(414.12,-153.375)">
+			<title>Square.170</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape171-363" v:mID="171" v:groupContext="shape" transform="translate(450.12,-153.375)">
+			<title>Square.171</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape172-365" v:mID="172" v:groupContext="shape" transform="translate(486.12,-153.375)">
+			<title>Square.172</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape173-369" v:mID="173" v:groupContext="shape" transform="translate(522.12,-153.375)">
+			<title>Square.173</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape174-371" v:mID="174" v:groupContext="shape" transform="translate(558.12,-153.375)">
+			<title>Square.174</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape175-373" v:mID="175" v:groupContext="shape" transform="translate(414.12,-117.375)">
+			<title>Square.175</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape176-375" v:mID="176" v:groupContext="shape" transform="translate(450.12,-117.375)">
+			<title>Square.176</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape177-377" v:mID="177" v:groupContext="shape" transform="translate(486.12,-117.375)">
+			<title>Square.177</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape178-379" v:mID="178" v:groupContext="shape" transform="translate(522.12,-117.375)">
+			<title>Square.178</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape179-381" v:mID="179" v:groupContext="shape" transform="translate(558.12,-117.375)">
+			<title>Square.179</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape180-383" v:mID="180" v:groupContext="shape" transform="translate(414.12,-81.3749)">
+			<title>Square.180</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape181-385" v:mID="181" v:groupContext="shape" transform="translate(450.12,-81.3749)">
+			<title>Square.181</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape182-387" v:mID="182" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+			<title>Square.182</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape183-391" v:mID="183" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+			<title>Square.183</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape184-393" v:mID="184" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+			<title>Square.184</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape185-395" v:mID="185" v:groupContext="shape" transform="translate(414.12,-45.3749)">
+			<title>Square.185</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape186-397" v:mID="186" v:groupContext="shape" transform="translate(450.12,-45.3749)">
+			<title>Square.186</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape187-399" v:mID="187" v:groupContext="shape" transform="translate(486.12,-45.3749)">
+			<title>Square.187</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape188-403" v:mID="188" v:groupContext="shape" transform="translate(522.12,-45.3749)">
+			<title>Square.188</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape189-405" v:mID="189" v:groupContext="shape" transform="translate(558.12,-45.3749)">
+			<title>Square.189</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape190-407" v:mID="190" v:groupContext="shape" transform="translate(612.12,-189.375)">
+			<title>Square.190</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape191-409" v:mID="191" v:groupContext="shape" transform="translate(648.12,-189.375)">
+			<title>Square.191</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape192-411" v:mID="192" v:groupContext="shape" transform="translate(684.12,-189.375)">
+			<title>Square.192</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape193-413" v:mID="193" v:groupContext="shape" transform="translate(720.12,-189.375)">
+			<title>Square.193</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape194-417" v:mID="194" v:groupContext="shape" transform="translate(756.12,-189.375)">
+			<title>Square.194</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape195-419" v:mID="195" v:groupContext="shape" transform="translate(612.12,-153.375)">
+			<title>Square.195</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape196-421" v:mID="196" v:groupContext="shape" transform="translate(648.12,-153.375)">
+			<title>Square.196</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape197-423" v:mID="197" v:groupContext="shape" transform="translate(684.12,-153.375)">
+			<title>Square.197</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape198-427" v:mID="198" v:groupContext="shape" transform="translate(720.12,-153.375)">
+			<title>Square.198</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape199-429" v:mID="199" v:groupContext="shape" transform="translate(756.12,-153.375)">
+			<title>Square.199</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape200-431" v:mID="200" v:groupContext="shape" transform="translate(612.12,-117.375)">
+			<title>Square.200</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape201-433" v:mID="201" v:groupContext="shape" transform="translate(648.12,-117.375)">
+			<title>Square.201</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape202-435" v:mID="202" v:groupContext="shape" transform="translate(684.12,-117.375)">
+			<title>Square.202</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st4"/>
+		</g>
+		<g id="shape203-437" v:mID="203" v:groupContext="shape" transform="translate(720.12,-117.375)">
+			<title>Square.203</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape204-439" v:mID="204" v:groupContext="shape" transform="translate(756.12,-117.375)">
+			<title>Square.204</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape205-441" v:mID="205" v:groupContext="shape" transform="translate(612.12,-81.3749)">
+			<title>Square.205</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape206-443" v:mID="206" v:groupContext="shape" transform="translate(648.12,-81.3749)">
+			<title>Square.206</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape207-445" v:mID="207" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+			<title>Square.207</title>
+			<desc>b/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape208-449" v:mID="208" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+			<title>Square.208</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape209-451" v:mID="209" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+			<title>Square.209</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape210-453" v:mID="210" v:groupContext="shape" transform="translate(612.12,-45.3749)">
+			<title>Square.210</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape211-455" v:mID="211" v:groupContext="shape" transform="translate(648.12,-45.3749)">
+			<title>Square.211</title>
+			<desc>a/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+			<text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text>		</g>
+		<g id="shape212-459" v:mID="212" v:groupContext="shape" transform="translate(684.12,-45.3749)">
+			<title>Square.212</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape213-461" v:mID="213" v:groupContext="shape" transform="translate(720.12,-45.3749)">
+			<title>Square.213</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape214-463" v:mID="214" v:groupContext="shape" transform="translate(756.12,-45.3749)">
+			<title>Square.214</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="441.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape236-465" v:mID="236" v:groupContext="shape" transform="translate(54.12,-252.375)">
+			<title>Sheet.236</title>
+			<desc>d = 0</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 0</tspan></text>		</g>
+		<g id="shape237-470" v:mID="237" v:groupContext="shape" transform="translate(252.12,-252.375)">
+			<title>Sheet.237</title>
+			<desc>d = 1</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 1</tspan></text>		</g>
+		<g id="shape238-475" v:mID="238" v:groupContext="shape" transform="translate(450.12,-252.375)">
+			<title>Sheet.238</title>
+			<desc>d = 2</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 2</tspan></text>		</g>
+		<g id="shape239-480" v:mID="239" v:groupContext="shape" transform="translate(648.12,-252.375)">
+			<title>Sheet.239</title>
+			<desc>d = 3</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 3</tspan></text>		</g>
+		<g id="shape240-485" v:mID="240" v:groupContext="shape" transform="translate(54.12,-18.375)">
+			<title>Sheet.240</title>
+			<desc>d = 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 4</tspan></text>		</g>
+		<g id="shape241-490" v:mID="241" v:groupContext="shape" transform="translate(252.12,-18.375)">
+			<title>Sheet.241</title>
+			<desc>d = 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 5</tspan></text>		</g>
+		<g id="shape242-495" v:mID="242" v:groupContext="shape" transform="translate(450.12,-18.375)">
+			<title>Sheet.242</title>
+			<desc>d = 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 6</tspan></text>		</g>
+		<g id="shape243-500" v:mID="243" v:groupContext="shape" transform="translate(648.12,-18.375)">
+			<title>Sheet.243</title>
+			<desc>d = 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="463.995" width="108" height="27"/>
+			<rect x="0" y="450.495" width="108" height="27" class="st5"/>
+			<text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+						class="st3">= 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/quant_ac.svg b/media/libaom/src/doc/img/quant_ac.svg
new file mode 100644
index 0000000000..3f589c8be6
--- /dev/null
+++ b/media/libaom/src/doc/img/quant_ac.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-2,.cls-20,.cls-26,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-20,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-26,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-21{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-21{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-21{font-size:14.04px;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0.01em;}.cls-25{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.78" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables3Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.81,9H640.53M53.81,59.65H640.53M53.81,110.18H640.53M53.81,160.82H640.53M53.81,211.46H640.53M53.81,262.1H640.53M53.81,312.74H640.53"/><path class="cls-2" d="M626.78,9V363.3M512.18,9V363.3M397.57,9V363.3M283,9V363.3M168.38,9V363.3M53.81,9V363.3"/><line class="cls-2" x1="53.81" y1="363.3" x2="640.53" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.25 57.26 363.25 59.53 363.25 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.13 82.45 363.13 84.73 363.13 87.02 363.13 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 363.01 103.09 363.01 105.38 363.01 107.66 363.01 109.94 363.01 112.22 363.01 114.5 363.01 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.89 128.29 362.89 130.57 362.89 132.85 362.89 135.13 362.89 137.41 362.89 139.69 362.89 142.09 362.89 144.38 362.77 146.66 362.77 148.94 362.77 151.22 362.77 153.5 362.77 155.78 362.77 158.06 362.77 160.34 362.77 162.62 362.77 165.01 362.77 167.29 362.77 169.57 362.77 171.85 362.65 174.13 362.65 176.41 362.65 178.69 362.65 180.97 362.65 183.25 362.65 185.53 362.65 187.94 362.65 190.22 362.65 192.5 362.65 194.78 362.65 197.06 362.65 199.34 362.54 201.62 362.54 203.9 362.54 206.18 362.54 208.46 362.54 210.85 362.54 213.13 362.54 215.41 362.54 217.69 362.54 219.97 362.54 222.25 362.54 224.53 362.42 226.81 362.42 229.09 362.42 231.38 362.42 233.78 362.42 236.06 362.42 238.34 362.42 240.62 362.42 242.9 362.42 245.18 362.42 247.46 362.42 249.74 362.42 252.01 362.3 254.29 362.3 256.69 362.3 258.98 362.3 261.25 362.3 263.54 362.3 265.81 362.3 268.1 362.3 270.38 362.3 272.65 362.3 274.94 362.3 277.21 362.18 279.62 362.18 281.89 362.18 284.18 362.18 286.45 362.18 288.74 362.18 291.01 362.06 293.3 362.06 295.57 362.06 297.86 362.06 300.13 362.06 302.42 362.06 304.81 361.94 307.1 361.94 309.38 361.94 311.65 361.94 313.94 361.94 316.21 361.94 318.5 361.81 320.77 361.81 323.06 361.81 325.33 361.81 327.74 361.81 330.01 361.81 332.3 361.69 334.57 361.69 336.86 361.69 339.13 361.57 341.42 361.57 343.69 361.57 345.98 361.57 348.25 361.45 350.65 361.45 352.94 361.45 355.21 361.45 357.5 361.33 359.77 361.33 362.06 361.33 364.33 361.33 366.62 361.21 368.89 361.21 371.18 361.21 373.57 361.21 375.86 361.1 378.13 361.1 380.42 361.1 382.69 360.98 384.98 360.98 387.25 360.98 389.54 360.86 391.81 360.86 394.1 360.74 396.5 360.74 398.77 360.74 401.06 360.62 403.33 360.62 405.62 360.62 407.89 360.5 410.18 360.5 412.45 360.38 414.74 360.38 417.01 360.25 419.42 360.25 421.69 360.25 423.98 360.13 426.25 360.13 428.54 360.01 430.81 360.01 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.54 446.89 359.54 449.18 359.42 451.45 359.42 453.74 359.3 456.01 359.18 458.3 359.18 460.57 359.06 462.86 359.06 465.25 358.94 467.54 358.81 469.81 358.81 472.1 358.69 474.38 358.57 476.65 358.45 478.94 358.45 481.21 358.33 483.5 358.21 485.77 358.1 488.06 357.98 490.45 357.98 492.74 357.86 495.01 357.74 497.3 357.62 499.57 357.5 501.86 357.38 504.13 357.25 506.42 357.13 508.69 357.01 510.98 356.89 513.38 356.77 515.65 356.65 517.93 356.54 520.22 356.42 522.5 356.3 524.77 356.18 527.05 356.06 529.34 355.94 531.62 355.81 533.89 355.57 536.29 355.45 538.58 355.33 540.86 355.21 543.13 354.98 545.41 354.86 547.7 354.74 549.98 354.5 552.25 354.38 554.53 354.25 556.82 354.01 559.22 353.89 561.5 353.65 563.77 353.54 566.05 353.3 568.34 353.06 570.62 352.94 572.89 352.69 575.17 352.45 577.46 352.33 579.74 352.1 582.13 351.86 584.41 351.62 586.7 351.38 588.98 351.13 591.25 350.89 593.53 350.65 595.82 350.42 598.1 350.18 600.38 349.94 602.65 349.69 605.05 349.45 607.34 349.21 609.62 348.86 611.89 348.62 614.17 348.38 616.46 348.01 618.74 347.77 621.01 347.42 623.29 347.18 625.58 346.81 627.98 346.45 630.25 346.21 632.53 345.86 634.82 345.5 637.1 345.13 639.38 344.79"/></g><circle class="cls-5" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-5" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-5" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-6" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98"/><circle class="cls-6" cx="71" cy="363.08" r="1.98"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-5" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-6" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-5" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-6" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-5" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-6" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-5" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-6" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-5" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-6" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-5" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-6" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-5" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-6" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-5" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-5" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-6" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-5" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-6" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-5" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-6" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-5" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-6" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-5" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-5" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-6" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-5" cx="130.52" cy="362.84" r="1.98"/><circle class="cls-6" cx="130.52" cy="362.84" r="1.98"/><circle class="cls-5" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-6" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-5" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-6" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-6" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-5" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-6" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-5" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-6" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-5" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-6" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-5" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-6" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-5" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-6" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-5" cx="158" cy="362.72" r="1.98"/><circle class="cls-6" cx="158" cy="362.72" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-6" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-5" cx="169.52" cy="362.72" r="1.98"/><circle class="cls-6" cx="169.52" cy="362.72" r="1.98"/><circle class="cls-5" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-6" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-5" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-6" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-5" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-6" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-6" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-5" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-6" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-5" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-6" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-5" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-6" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-5" cx="197" cy="362.6" r="1.98"/><circle class="cls-6" cx="197" cy="362.6" r="1.98"/><circle class="cls-5" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-6" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-5" cx="201.56" cy="362.48" r="1.98" transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-6" cx="201.56" cy="362.48" r="1.98" transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-5" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-6" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-5" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><circle class="cls-6" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><path class="cls-5" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-6" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-5" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-6" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-5" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-6" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-5" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-6" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-5" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-6" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-5" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-6" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-5" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-6" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-5" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-6" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-5" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-6" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-5" d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-6" d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-5" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><path class="cls-6" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><circle class="cls-5" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-6" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-5" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-6" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-5" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-6" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-5" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-6" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-5" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-6" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-5" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><circle class="cls-6" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><path class="cls-5" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-6" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-5" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-6" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-5" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path class="cls-6" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path class="cls-5" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-6" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-5" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-6" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-5" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-6" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-5" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-6" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-5" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-6" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-5" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-6" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-5" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-6" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-5" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-6" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-5" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-6" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-5" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-6" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-5" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><path class="cls-6" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><circle class="cls-5" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-6" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-5" cx="281.83" cy="362.12" r="1.98" transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-6" cx="281.83" cy="362.12" r="1.98" transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-5" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><circle class="cls-6" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><path class="cls-5" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-6" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-5" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-6" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-5" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-6" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-5" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-6" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-5" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-6" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-5" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-6" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-5" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-6" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-5" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-6" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-5" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-6" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-5" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-6" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-5" d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-6" d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-5" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-6" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-5" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-6" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-5" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><path class="cls-6" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><circle class="cls-5" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-6" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-5" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-6" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-5" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-6" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-5" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><circle class="cls-6" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><path class="cls-5" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-6" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-5" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-6" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-5" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-6" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-5" d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-6" d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-5" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-6" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-5" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-6" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-5" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-6" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-5" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-6" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-5" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-6" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-5" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-6" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-5" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-6" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-5" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-6" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-5" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><path class="cls-6" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><circle class="cls-5" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-6" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-5" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-6" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-5" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) rotate(-85.93)"/><circle class="cls-6" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) rotate(-85.93)"/><circle class="cls-5" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-6" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-5" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-6" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-5" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-6" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-5" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><circle class="cls-6" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><path class="cls-5" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-6" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-5" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-6" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-5" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-6" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-5" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-6" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-5" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-6" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-5" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-6" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-5" d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-6" d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-5" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-6" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-5" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-6" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-5" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><path class="cls-6" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><circle class="cls-5" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-6" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-5" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-6" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-5" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-6" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-5" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-6" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-5" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-6" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-5" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-6" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-5" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) rotate(-5.65)"/><circle class="cls-6" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) rotate(-5.65)"/><path class="cls-5" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-6" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-5" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-6" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-5" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-6" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-5" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-6" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-5" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-6" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-5" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-6" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-5" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-6" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-5" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-6" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-5" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-6" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-5" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-6" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-5" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-6" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-5" d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><path class="cls-6" d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><circle class="cls-5" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-6" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-5" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-6" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-5" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-6" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-5" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><circle class="cls-6" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><path class="cls-5" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-6" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-5" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><path class="cls-6" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><circle class="cls-5" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><circle class="cls-6" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><path class="cls-5" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><path class="cls-6" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><circle class="cls-5" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><circle class="cls-6" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><path class="cls-5" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-6" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-5" d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-6" d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-5" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-6" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-5" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><path class="cls-6" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><circle class="cls-5" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><circle class="cls-6" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><path class="cls-5" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><path class="cls-6" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><circle class="cls-5" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><circle class="cls-6" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><path class="cls-5" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><path class="cls-6" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><circle class="cls-5" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><circle class="cls-6" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><path class="cls-5" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><path class="cls-6" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><circle class="cls-5" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><circle class="cls-6" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><path class="cls-5" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><path class="cls-6" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><circle class="cls-5" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 27.67) rotate(-3.17)"/><circle class="cls-6" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 27.67) rotate(-3.17)"/><path class="cls-5" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><path class="cls-6" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><circle class="cls-5" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><circle class="cls-6" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><path class="cls-5" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><path class="cls-6" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><circle class="cls-5" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><circle class="cls-6" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><path class="cls-5" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-6" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-5" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-6" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-5" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-6" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-5" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-6" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-5" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-6" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-5" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><path class="cls-6" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><circle class="cls-5" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) rotate(-3.17)"/><path class="cls-5" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><path class="cls-6" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><circle class="cls-5" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><path class="cls-6" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><circle class="cls-5" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><circle class="cls-6" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><path class="cls-5" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><path class="cls-6" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><circle class="cls-5" cx="529.28" cy="355.88" r="1.98"/><circle class="cls-6" cx="529.28" cy="355.88" r="1.98"/><path class="cls-5" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><path class="cls-6" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><circle class="cls-5" cx="533.83" cy="355.52" r="1.98"/><circle class="cls-6" cx="533.83" cy="355.52" r="1.98"/><path class="cls-5" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><path class="cls-6" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><circle class="cls-5" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><circle class="cls-6" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><path class="cls-5" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-6" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-5" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-6" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-5" d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-6" d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-5" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-6" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-5" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-6" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-5" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-6" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-5" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-6" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-5" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><path class="cls-6" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><circle class="cls-5" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><circle class="cls-6" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><path class="cls-5" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><path class="cls-6" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><circle class="cls-5" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><circle class="cls-6" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><path class="cls-5" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><path class="cls-6" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><circle class="cls-5" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><circle class="cls-6" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><path class="cls-5" d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><path class="cls-6" d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><circle class="cls-5" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><circle class="cls-6" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><path class="cls-5" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><path class="cls-6" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><circle class="cls-5" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><circle class="cls-6" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><path class="cls-5" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-6" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-5" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-6" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-5" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-6" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-5" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-6" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-5" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-6" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-5" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-6" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-5" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-6" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-5" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><path class="cls-6" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><circle class="cls-5" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><circle class="cls-6" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><path class="cls-5" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><path class="cls-6" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><circle class="cls-5" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><circle class="cls-6" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><path class="cls-5" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><path class="cls-6" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><circle class="cls-5" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><circle class="cls-6" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><path class="cls-5" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><path class="cls-6" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><circle class="cls-5" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><circle class="cls-6" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><path class="cls-5" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><path class="cls-6" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><circle class="cls-5" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><circle class="cls-6" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><path class="cls-5" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><path class="cls-6" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><circle class="cls-5" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><circle class="cls-6" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><path class="cls-5" d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><path class="cls-6" d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><circle class="cls-5" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><circle class="cls-6" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><path class="cls-5" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-6" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-5" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-6" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-5" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-6" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-5" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><path class="cls-6" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><circle class="cls-5" cx="637.04" cy="345.08" r="1.98"/><circle class="cls-6" cx="637.04" cy="345.08" r="1.98"/><path class="cls-5" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><path class="cls-6" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.25 57.26 363.25 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.01 73.33 363.01 75.61 363.01 77.89 363.01 80.17 362.89 82.45 362.89 84.73 362.89 87.02 362.77 89.3 362.77 91.58 362.77 93.86 362.65 96.25 362.65 98.53 362.65 100.81 362.54 103.09 362.54 105.38 362.54 107.66 362.42 109.94 362.42 112.22 362.42 114.5 362.3 116.78 362.3 119.17 362.18 121.45 362.18 123.73 362.18 126.02 362.06 128.29 362.06 130.57 362.06 132.85 361.94 135.13 361.94 137.41 361.94 139.69 361.81 142.09 361.81 144.38 361.69 146.66 361.69 148.94 361.69 151.22 361.57 153.5 361.57 155.78 361.45 158.06 361.45 160.34 361.45 162.62 361.33 165.01 361.33 167.29 361.33 169.57 361.21 171.85 361.21 174.13 361.1 176.41 361.1 178.69 361.1 180.97 360.98 183.25 360.98 185.53 360.98 187.94 360.86 190.22 360.86 192.5 360.74 194.78 360.74 197.06 360.74 199.34 360.62 201.62 360.62 203.9 360.5 206.18 360.5 208.46 360.5 210.85 360.38 213.13 360.38 215.41 360.38 217.69 360.25 219.97 360.25 222.25 360.25 224.53 360.13 226.81 360.13 229.09 360.01 231.38 360.01 233.78 360.01 236.06 359.89 238.34 359.89 240.62 359.89 242.9 359.77 245.18 359.77 247.46 359.65 249.74 359.65 252.01 359.65 254.29 359.54 256.69 359.54 258.98 359.54 261.25 359.42 263.54 359.42 265.81 359.42 268.1 359.3 270.38 359.3 272.65 359.18 274.94 359.18 277.21 359.06 279.62 358.94 281.89 358.94 284.18 358.81 286.45 358.69 288.74 358.69 291.01 358.57 293.3 358.45 295.57 358.45 297.86 358.33 300.13 358.21 302.42 358.21 304.81 358.1 307.1 357.98 309.38 357.98 311.65 357.86 313.94 357.74 316.21 357.74 318.5 357.62 320.77 357.5 323.06 357.38 325.33 357.38 327.74 357.25 330.01 357.13 332.3 357.01 334.57 356.89 336.86 356.77 339.13 356.65 341.42 356.54 343.69 356.42 345.98 356.3 348.25 356.18 350.65 356.06 352.94 355.94 355.21 355.81 357.5 355.69 359.77 355.57 362.06 355.45 364.33 355.33 366.62 355.21 368.89 355.1 371.18 354.98 373.57 354.74 375.86 354.62 378.13 354.5 380.42 354.25 382.69 354.13 384.98 354.01 387.25 353.77 389.54 353.65 391.81 353.54 394.1 353.3 396.5 353.18 398.77 352.94 401.06 352.81 403.33 352.57 405.62 352.33 407.89 352.21 410.18 351.98 412.45 351.74 414.74 351.62 417.01 351.38 419.42 351.13 421.69 351.01 423.98 350.77 426.25 350.54 428.54 350.3 430.81 349.94 433.1 349.69 435.38 349.45 437.65 349.21 439.94 348.98 442.33 348.74 444.62 348.5 446.89 348.25 449.18 347.89 451.45 347.65 453.74 347.3 456.01 347.06 458.3 346.81 460.57 346.45 462.86 346.1 465.25 345.86 467.54 345.5 469.81 345.13 472.1 344.89 474.38 344.54 476.65 344.06 478.94 343.69 481.21 343.33 483.5 342.98 485.77 342.62 488.06 342.25 490.45 341.89 492.74 341.42 495.01 341.06 497.3 340.57 499.57 340.21 501.86 339.74 504.13 339.25 506.42 338.89 508.69 338.42 510.98 337.94 513.38 337.45 515.65 336.98 517.93 336.5 520.22 335.89 522.5 335.42 524.77 334.94 527.05 334.33 529.34 333.74 531.62 333.25 533.89 332.65 536.29 332.06 538.58 331.45 540.86 330.86 543.13 330.25 545.41 329.65 547.7 328.94 549.98 328.33 552.25 327.62 554.53 326.89 556.82 326.3 559.22 325.57 561.5 324.86 563.77 324.01 566.05 323.3 568.34 322.57 570.62 321.74 572.89 320.89 575.17 320.18 577.46 319.33 579.74 318.38 582.13 317.54 584.41 316.69 586.7 315.86 588.98 314.89 591.25 313.94 593.53 312.98 595.82 312.01 598.1 310.94 600.38 309.98 602.65 308.89 605.05 307.81 607.34 306.74 609.62 305.65 611.89 304.57 614.17 303.38 616.46 302.18 618.74 301.1 621.01 299.89 623.29 298.57 625.58 297.38 627.98 296.06 630.25 294.74 632.53 293.42 634.82 292.1 637.1 290.65 639.38 289.27"/></g><circle class="cls-8" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-9" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-8" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-9" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-8" cx="77.83" cy="362.96" r="1.98"/><circle class="cls-9" cx="77.83" cy="362.96" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-8" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-9" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-8" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-9" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-8" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-9" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-8" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-8" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-9" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-8" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-9" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-8" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-9" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-8" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle class="cls-9" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle class="cls-8" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-9" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-8" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-9" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-8" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-9" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-8" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-9" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-8" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-9" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-8" cx="125.95" cy="362" r="1.98"/><circle class="cls-9" cx="125.95" cy="362" r="1.98"/><circle class="cls-8" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-9" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-8" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-9" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-8" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-9" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-8" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-9" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-9" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-8" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-8" cx="146.6" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-9" cx="146.6" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-8" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-9" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-8" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-9" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-8" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-9" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-8" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-9" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-8" cx="158" cy="361.4" r="1.98"/><circle class="cls-9" cx="158" cy="361.4" r="1.98"/><circle class="cls-8" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-9" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-8" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-9" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-8" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-9" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-8" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-9" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-8" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-9" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-8" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-9" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-8" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-9" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-8" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-9" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-8" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-9" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.92" r="1.98"/><circle class="cls-9" cx="180.92" cy="360.92" r="1.98"/><circle class="cls-8" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-9" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-8" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-9" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-9" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-8" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-9" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-8" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-9" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-8" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-9" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-8" cx="197" cy="360.68" r="1.98"/><circle class="cls-9" cx="197" cy="360.68" r="1.98"/><circle class="cls-8" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-9" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-8" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-9" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-8" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-9" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-8" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><circle class="cls-9" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><path class="cls-8" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-9" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-8" d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-9" d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-8" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-9" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-8" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-9" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-8" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-9" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-8" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-9" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-8" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-9" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-8" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-9" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-8" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-9" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-8" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-9" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-8" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><path class="cls-9" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><circle class="cls-8" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-9" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-8" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-9" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-8" cx="238.28" cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-9" cx="238.28" cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-8" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-9" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-8" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-9" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-8" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><circle class="cls-9" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><path class="cls-8" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-9" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-8" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-9" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-8" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-9" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-8" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-9" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-8" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-9" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-8" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-9" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-8" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-9" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-8" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-9" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-8" d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-9" d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-8" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-9" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-8" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-9" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-8" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-9" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-8" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-9" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-8" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><path class="cls-9" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><circle class="cls-8" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-9" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-8" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-9" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-8" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><circle class="cls-9" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><path class="cls-8" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-9" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-8" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-9" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-8" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-9" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-8" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-9" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-8" d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-9" d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-8" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-9" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-8" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-9" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-8" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-9" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-8" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-9" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-8" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-9" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-8" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-9" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-8" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-9" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-8" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-9" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-8" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><path class="cls-9" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><circle class="cls-8" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-9" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-8" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-9" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-8" cx="323" cy="357.32" r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-9" cx="323" cy="357.32" r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-8" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><circle class="cls-9" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><path class="cls-8" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-9" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-8" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-9" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-8" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-9" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-8" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-9" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-8" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-9" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-8" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-9" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-8" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-9" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-8" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-9" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-8" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-9" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-8" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-9" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-8" d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-9" d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-8" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-9" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-8" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><path class="cls-9" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><circle class="cls-8" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-9" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-8" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-9" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-8" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-9" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-8" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-9" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-8" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-9" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-8" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-9" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-8" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><circle class="cls-9" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><path class="cls-8" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-9" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-8" d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-9" d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-8" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-9" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-8" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-9" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-8" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-9" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-8" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-9" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-8" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-9" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-8" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-9" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-8" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-9" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-8" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><path class="cls-9" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><circle class="cls-8" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-9" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-8" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-9" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-8" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-9" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-8" cx="403.28" cy="352.52" r="1.98"/><circle class="cls-9" cx="403.28" cy="352.52" r="1.98"/><circle class="cls-8" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-9" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-8" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-9" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-8" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><circle class="cls-9" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><path class="cls-8" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-9" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-8" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-9" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-8" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-9" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-8" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-9" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-8" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-9" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-8" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-9" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-8" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-9" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-8" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-9" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-8" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path class="cls-9" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path class="cls-8" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-9" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-8" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-9" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-8" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-9" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-8" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><path class="cls-9" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><circle class="cls-8" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-9" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-8" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-9" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-8" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-9" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-8" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><circle class="cls-9" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><path class="cls-8" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-9" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-8" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><path class="cls-9" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><circle class="cls-8" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><circle class="cls-9" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><path class="cls-8" d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><path class="cls-9" d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><circle class="cls-8" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><circle class="cls-9" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><path class="cls-8" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-9" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-8" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-9" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-8" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-9" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-8" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><path class="cls-9" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><circle class="cls-8" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><circle class="cls-9" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><path class="cls-8" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><path class="cls-9" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><circle class="cls-8" cx="476.6" cy="344" r="1.98"/><circle class="cls-9" cx="476.6" cy="344" r="1.98"/><path class="cls-8" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><path class="cls-9" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><circle class="cls-8" cx="481.16" cy="343.28" r="1.98"/><circle class="cls-9" cx="481.16" cy="343.28" r="1.98"/><path class="cls-8" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><path class="cls-9" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><circle class="cls-8" cx="485.72" cy="342.56" r="1.98"/><circle class="cls-9" cx="485.72" cy="342.56" r="1.98"/><path class="cls-8" d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><path class="cls-9" d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><circle class="cls-8" cx="490.39" cy="341.84" r="1.98"/><circle class="cls-9" cx="490.39" cy="341.84" r="1.98"/><path class="cls-8" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><path class="cls-9" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><circle class="cls-8" cx="494.95" cy="341" r="1.98"/><circle class="cls-9" cx="494.95" cy="341" r="1.98"/><path class="cls-8" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><path class="cls-9" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><circle class="cls-8" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><circle class="cls-9" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><path class="cls-8" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-9" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-8" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-9" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-8" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-9" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-8" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-9" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-8" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-9" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-8" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><path class="cls-9" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><circle class="cls-8" cx="515.6" cy="336.92" r="1.98"/><circle class="cls-9" cx="515.6" cy="336.92" r="1.98"/><path class="cls-8" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><path class="cls-9" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><circle class="cls-8" cx="520.16" cy="335.84" r="1.98"/><circle class="cls-9" cx="520.16" cy="335.84" r="1.98"/><path class="cls-8" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><path class="cls-9" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><circle class="cls-8" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><circle class="cls-9" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><path class="cls-8" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><path class="cls-9" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><circle class="cls-8" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><circle class="cls-9" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><path class="cls-8" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><path class="cls-9" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><circle class="cls-8" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><circle class="cls-9" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><path class="cls-8" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><path class="cls-9" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><circle class="cls-8" cx="538.51" cy="331.4" r="1.98"/><circle class="cls-9" cx="538.51" cy="331.4" r="1.98"/><path class="cls-8" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-9" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-8" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-9" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-8" d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-9" d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-8" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-9" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-8" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-9" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-8" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-9" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-8" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-9" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-8" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><path class="cls-9" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><circle class="cls-8" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><circle class="cls-9" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><path class="cls-8" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><path class="cls-9" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><circle class="cls-8" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><circle class="cls-9" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><path class="cls-8" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><path class="cls-9" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><circle class="cls-8" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><circle class="cls-9" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><path class="cls-8" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><path class="cls-9" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><circle class="cls-8" cx="572.83" cy="320.84" r="1.98"/><circle class="cls-9" cx="572.83" cy="320.84" r="1.98"/><path class="cls-8" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><path class="cls-9" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><circle class="cls-8" cx="577.39" cy="319.28" r="1.98"/><circle class="cls-9" cx="577.39" cy="319.28" r="1.98"/><path class="cls-8" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-9" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-8" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-9" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-8" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-9" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-8" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-9" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-8" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-9" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-8" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-9" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-8" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-9" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-8" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><path class="cls-9" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><circle class="cls-8" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><circle class="cls-9" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><path class="cls-8" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><path class="cls-9" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><circle class="cls-8" cx="602.6" cy="308.84" r="1.98"/><circle class="cls-9" cx="602.6" cy="308.84" r="1.98"/><path class="cls-8" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><path class="cls-9" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><circle class="cls-8" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><circle class="cls-9" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><path class="cls-8" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><path class="cls-9" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><circle class="cls-8" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><circle class="cls-9" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><path class="cls-8" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><path class="cls-9" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><circle class="cls-8" cx="616.39" cy="302.12" r="1.98"/><circle class="cls-9" cx="616.39" cy="302.12" r="1.98"/><path class="cls-8" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><path class="cls-9" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><circle class="cls-8" cx="620.95" cy="299.84" r="1.98"/><circle class="cls-9" cx="620.95" cy="299.84" r="1.98"/><path class="cls-8" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><path class="cls-9" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><circle class="cls-8" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><circle class="cls-9" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><path class="cls-8" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-9" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-8" d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-9" d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-8" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-9" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-8" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><path class="cls-9" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><circle class="cls-8" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><circle class="cls-9" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><path class="cls-8" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><path class="cls-9" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.25 57.26 363.13 59.53 363.13 61.81 363.01 64.09 362.89 66.38 362.89 68.66 362.77 71.06 362.65 73.33 362.54 75.61 362.42 77.89 362.3 80.17 362.18 82.45 362.06 84.73 361.94 87.02 361.69 89.3 361.57 91.58 361.45 93.86 361.33 96.25 361.1 98.53 360.98 100.81 360.74 103.09 360.62 105.38 360.5 107.66 360.25 109.94 360.13 112.22 359.89 114.5 359.77 116.78 359.54 119.17 359.42 121.45 359.18 123.73 359.06 126.02 358.81 128.29 358.69 130.57 358.45 132.85 358.33 135.13 358.1 137.41 357.98 139.69 357.74 142.09 357.5 144.38 357.38 146.66 357.13 148.94 357.01 151.22 356.77 153.5 356.65 155.78 356.42 158.06 356.18 160.34 356.06 162.62 355.81 165.01 355.69 167.29 355.45 169.57 355.33 171.85 355.1 174.13 354.86 176.41 354.74 178.69 354.5 180.97 354.38 183.25 354.13 185.53 354.01 187.94 353.77 190.22 353.65 192.5 353.42 194.78 353.18 197.06 353.06 199.34 352.81 201.62 352.69 203.9 352.45 206.18 352.33 208.46 352.1 210.85 351.98 213.13 351.74 215.41 351.62 217.69 351.38 219.97 351.25 222.25 351.01 224.53 350.89 226.81 350.65 229.09 350.54 231.38 350.3 233.78 350.18 236.06 349.94 238.34 349.81 240.62 349.57 242.9 349.45 245.18 349.21 247.46 349.1 249.74 348.86 252.01 348.74 254.29 348.5 256.69 348.38 258.98 348.25 261.25 348.01 263.54 347.89 265.81 347.65 268.1 347.54 270.38 347.3 272.65 347.18 274.94 346.81 277.21 346.45 279.62 346.21 281.89 345.86 284.18 345.5 286.45 345.13 288.74 344.77 291.01 344.54 293.3 344.18 295.57 343.81 297.86 343.45 300.13 343.21 302.42 342.86 304.81 342.5 307.1 342.13 309.38 341.89 311.65 341.54 313.94 341.18 316.21 340.81 318.5 340.57 320.77 340.21 323.06 339.86 325.33 339.5 327.74 339.25 330.01 338.89 332.3 338.42 334.57 337.94 336.86 337.45 339.13 336.98 341.42 336.5 343.69 335.89 345.98 335.42 348.25 334.94 350.65 334.45 352.94 333.98 355.21 333.5 357.5 333.01 359.77 332.54 362.06 332.06 364.33 331.57 366.62 331.1 368.89 330.62 371.18 329.89 373.57 329.3 375.86 328.57 378.13 327.98 380.42 327.25 382.69 326.65 384.98 326.06 387.25 325.33 389.54 324.74 391.81 324.01 394.1 323.42 396.5 322.69 398.77 322.1 401.06 321.25 403.33 320.54 405.62 319.69 407.89 318.86 410.18 318.01 412.45 317.3 414.74 316.45 417.01 315.62 419.42 314.77 421.69 313.94 423.98 312.98 426.25 312.01 428.54 311.06 430.81 310.1 433.1 309.13 435.38 308.18 437.65 307.21 439.94 306.13 442.33 305.18 444.62 304.1 446.89 302.89 449.18 301.81 451.45 300.74 453.74 299.54 456.01 298.45 458.3 297.25 460.57 295.94 462.86 294.74 465.25 293.42 467.54 292.1 469.81 290.77 472.1 289.45 474.38 288.01 476.65 286.57 478.94 285.13 481.21 283.69 483.5 282.25 485.77 280.69 488.06 279.13 490.45 277.45 492.74 275.89 495.01 274.21 497.3 272.65 499.57 270.86 501.86 269.06 504.13 267.25 506.42 265.45 508.69 263.77 510.98 261.74 513.38 259.81 515.65 257.89 517.93 255.97 520.22 253.81 522.5 251.78 524.77 249.62 527.05 247.57 529.34 245.29 531.62 243.01 533.89 240.74 536.29 238.46 538.58 236.06 540.86 233.53 543.13 231.13 545.41 228.62 547.7 225.97 549.98 223.34 552.25 220.69 554.53 217.94 556.82 215.18 559.22 212.18 561.5 209.29 563.77 206.41 566.05 203.29 568.34 200.18 570.62 197.18 572.89 193.94 575.17 190.69 577.46 187.22 579.74 183.85 582.13 180.5 584.41 176.9 586.7 173.29 588.98 169.57 591.25 165.85 593.53 162.01 595.82 158.06 598.1 154.09 600.38 150.01 602.65 145.81 605.05 141.62 607.34 137.18 609.62 132.85 611.89 128.29 614.17 123.73 616.46 119.06 618.74 114.38 621.01 109.58 623.29 104.66 625.58 99.61 627.98 94.45 630.25 89.3 632.53 83.89 634.82 78.38 637.1 72.86 639.38 67.25"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-12" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-11" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-12" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-11" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-12" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-11" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-12" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-11" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-12" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-12" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-11" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-12" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-11" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-12" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-11" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-12" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-11" cx="87.02" cy="361.7" r="2.52"/><circle class="cls-12" cx="87.02" cy="361.7" r="2.52"/><circle class="cls-11" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-12" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-11" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-12" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-11" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-12" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-11" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-12" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-12" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-11" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-11" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-12" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-11" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-12" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-11" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-12" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-11" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-12" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-11" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-12" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-11" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-12" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-11" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-12" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-11" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-12" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-11" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-12" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-11" cx="123.74" cy="359.06" r="2.52"/><circle class="cls-12" cx="123.74" cy="359.06" r="2.52"/><circle class="cls-11" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-12" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-11" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-12" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-11" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-12" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-11" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-12" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-11" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-12" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-11" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-12" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-11" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-12" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-11" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-12" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-11" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-12" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-11" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-12" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-11" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-12" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-11" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-12" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-11" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-12" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-11" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-12" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-11" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-12" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-11" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-12" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-11" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-12" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-11" cx="165.02" cy="355.7" r="2.52"/><circle class="cls-12" cx="165.02" cy="355.7" r="2.52"/><circle class="cls-11" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-12" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-11" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-12" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-11" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-12" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-11" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-12" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-11" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-12" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-11" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-12" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-11" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-12" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-11" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-12" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-11" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-12" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-11" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-12" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-11" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-12" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-11" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-12" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-11" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-12" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-11" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-12" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-11" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-12" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-11" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-12" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-11" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-12" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-11" cx="206.18" cy="352.34" r="2.52"/><circle class="cls-12" cx="206.18" cy="352.34" r="2.52"/><circle class="cls-11" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-12" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-11" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-12" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-11" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-12" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-11" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-12" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-11" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-12" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-11" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-12" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-11" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-12" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-11" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-12" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-11" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-12" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-11" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-12" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-11" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-12" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-11" cx="233.78" cy="350.18" r="2.52" transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-12" cx="233.78" cy="350.18" r="2.52" transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-11" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-12" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-11" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-12" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-11" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-12" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-11" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-12" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-11" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-12" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-11" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-12" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-11" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-12" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-11" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-12" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-11" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-12" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-11" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-12" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-11" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-12" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-11" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-12" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-11" cx="263.54" cy="347.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-12" cx="263.54" cy="347.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-11" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-12" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-11" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-12" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-11" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-12" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-11" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-12" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-11" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-12" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-11" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-12" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-11" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-12" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-11" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-12" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-12" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-11" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-12" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-11" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-12" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-11" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-12" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-11" cx="293.29" cy="344.18" r="2.52"/><circle class="cls-12" cx="293.29" cy="344.18" r="2.52"/><circle class="cls-11" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-12" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-11" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-12" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-11" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-12" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-11" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-12" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-11" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-12" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-11" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-12" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-11" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-12" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-11" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-12" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-11" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-12" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-11" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-12" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-11" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-12" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-11" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-12" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-11" cx="323.06" cy="339.86" r="2.52"/><circle class="cls-12" cx="323.06" cy="339.86" r="2.52"/><circle class="cls-11" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-12" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-11" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-12" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-11" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-12" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-11" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-12" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-11" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-12" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-11" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-12" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-11" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-12" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-11" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-12" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-11" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-12" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-11" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-12" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-11" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-12" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-11" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-12" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-11" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-12" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-11" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-12" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-11" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-12" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-11" cx="359.78" cy="332.54" r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-12" cx="359.78" cy="332.54" r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-11" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-12" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-11" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-12" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-11" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-12" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-11" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-12" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-11" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-12" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-11" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-12" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-11" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-12" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-11" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-12" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-11" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-12" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-11" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-12" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-11" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-12" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-11" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-12" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-11" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-12" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-11" cx="391.82" cy="324.02" r="2.52" transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-12" cx="391.82" cy="324.02" r="2.52" transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-11" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-12" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-11" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-12" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-11" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-12" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-11" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-12" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-11" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-12" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-11" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-12" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-11" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-12" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-11" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-12" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-11" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-12" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-11" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-12" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-11" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-12" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-11" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-12" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-11" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-12" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-11" cx="423.98" cy="312.98" r="2.52"/><circle class="cls-12" cx="423.98" cy="312.98" r="2.52"/><circle class="cls-11" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-12" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-11" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-12" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-11" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-12" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-11" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-12" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-11" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-12" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-11" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-12" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-11" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-12" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-11" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-12" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-11" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-12" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-11" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-12" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-11" cx="451.46" cy="300.74" r="2.52"/><circle class="cls-12" cx="451.46" cy="300.74" r="2.52"/><circle class="cls-11" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-12" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-11" cx="456.01" cy="298.46" r="2.52"/><circle class="cls-12" cx="456.01" cy="298.46" r="2.52"/><path class="cls-11" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-12" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-11" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-12" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-11" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-12" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-11" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><path class="cls-12" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><circle class="cls-11" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><circle class="cls-12" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><path class="cls-11" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><path class="cls-12" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><circle class="cls-11" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><circle class="cls-12" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><path class="cls-11" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><path class="cls-12" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><circle class="cls-11" cx="476.66" cy="286.58" r="2.52" transform="translate(157.01 741.72) rotate(-85.93)"/><circle class="cls-12" cx="476.66" cy="286.58" r="2.52" transform="translate(157.01 741.72) rotate(-85.93)"/><path class="cls-11" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><path class="cls-12" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><circle class="cls-11" cx="481.22" cy="283.7" r="2.52"/><circle class="cls-12" cx="481.22" cy="283.7" r="2.52"/><path class="cls-11" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><path class="cls-12" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><circle class="cls-11" cx="485.78" cy="280.7" r="2.52"/><circle class="cls-12" cx="485.78" cy="280.7" r="2.52"/><path class="cls-11" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><path class="cls-12" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><circle class="cls-11" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><circle class="cls-12" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><path class="cls-11" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><path class="cls-12" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><circle class="cls-11" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><circle class="cls-12" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><path class="cls-11" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-12" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-11" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-12" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-11" d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-12" d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-11" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-12" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-11" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-12" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-11" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-12" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-11" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-12" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-11" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><path class="cls-12" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><circle class="cls-11" cx="515.66" cy="257.9" r="2.52"/><circle class="cls-12" cx="515.66" cy="257.9" r="2.52"/><path class="cls-11" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><path class="cls-12" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><circle class="cls-11" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><circle class="cls-12" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><path class="cls-11" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><path class="cls-12" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><circle class="cls-11" cx="524.78" cy="249.62" r="2.52"/><circle class="cls-12" cx="524.78" cy="249.62" r="2.52"/><path class="cls-11" d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><path class="cls-12" d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><circle class="cls-11" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><circle class="cls-12" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><path class="cls-11" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><path class="cls-12" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><circle class="cls-11" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><circle class="cls-12" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><path class="cls-11" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-12" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-11" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-12" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-11" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-12" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-11" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-12" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-11" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-12" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-11" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-12" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-11" d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-12" d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-11" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><path class="cls-12" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><circle class="cls-11" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><circle class="cls-12" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><path class="cls-11" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><path class="cls-12" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><circle class="cls-11" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><circle class="cls-12" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><path class="cls-11" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><path class="cls-12" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><circle class="cls-11" cx="563.78" cy="206.42" r="2.52"/><circle class="cls-12" cx="563.78" cy="206.42" r="2.52"/><path class="cls-11" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><path class="cls-12" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><circle class="cls-11" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><circle class="cls-12" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><path class="cls-11" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><path class="cls-12" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><circle class="cls-11" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) rotate(-22.5)"/><circle class="cls-12" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) rotate(-22.5)"/><path class="cls-11" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><path class="cls-12" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><circle class="cls-11" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><circle class="cls-12" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><path class="cls-11" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-12" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-11" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-12" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-11" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-12" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-11" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-12" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-11" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-12" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-11" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><path class="cls-12" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><circle class="cls-11" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><circle class="cls-12" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><path class="cls-11" d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><path class="cls-12" d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><circle class="cls-11" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><circle class="cls-12" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><path class="cls-11" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><path class="cls-12" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><circle class="cls-11" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><circle class="cls-12" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><path class="cls-11" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><path class="cls-12" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><circle class="cls-11" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><circle class="cls-12" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><path class="cls-11" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><path class="cls-12" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><circle class="cls-11" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><circle class="cls-12" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><path class="cls-11" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><path class="cls-12" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><circle class="cls-11" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 244.97) rotate(-22.5)"/><circle class="cls-12" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 244.97) rotate(-22.5)"/><path class="cls-11" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><path class="cls-12" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><circle class="cls-11" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><circle class="cls-12" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><path class="cls-11" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-12" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-11" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-12" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-11" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-12" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-11" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><path class="cls-12" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><circle class="cls-11" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><circle class="cls-12" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><path class="cls-11" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><path class="cls-12" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><circle class="cls-11" cx="637.1" cy="72.86" r="2.52"/><circle class="cls-12" cx="637.1" cy="72.86" r="2.52"/><path class="cls-11" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><path class="cls-12" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.94 365.91)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(27.23 315.31)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 264.69)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 214.04)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 163.44)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 112.82)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 62.17)">30000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.55)">35000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.69 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.36 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.98 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.36) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.65 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan class="cls-17" x="19.54" y="0">i</tspan><tspan class="cls-18" x="23.97" y="0">n</tspan><tspan class="cls-19" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="481.68" y1="70.09" x2="500.88" y2="70.09"/><path class="cls-5" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><path class="cls-20" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 74.49)"><tspan class="cls-22">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-23" x="11.65" y="0">b</tspan><tspan class="cls-22" x="18.73" y="0">it</tspan><tspan class="cls-24" x="26.45" y="0"> </tspan><tspan class="cls-25" x="30.03" y="0">A</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="481.68" y1="90.76" x2="500.88" y2="90.76"/><path class="cls-8" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><path class="cls-26" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 95.15)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">0</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><line class="cls-10" x1="481.68" y1="111.43" x2="500.88" y2="111.43"/><path class="cls-11" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><path class="cls-33" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 115.82)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">2</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/quant_dc.svg b/media/libaom/src/doc/img/quant_dc.svg
new file mode 100644
index 0000000000..4fda1084e1
--- /dev/null
+++ b/media/libaom/src/doc/img/quant_dc.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-18,.cls-2,.cls-24,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-18,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-24,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-19,.cls-25{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-19,.cls-25{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-19{font-size:14.04px;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0.01em;}.cls-23{letter-spacing:0em;}.cls-25{font-size:14.06px;}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:-0.01em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}.cls-34{letter-spacing:0em;}.cls-35{letter-spacing:0em;}.cls-36{letter-spacing:0em;}.cls-37{letter-spacing:0em;}.cls-38{letter-spacing:0em;}.cls-39{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.77" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.8,9H640.52M53.8,79.82H640.52M53.8,150.74H640.52M53.8,221.54H640.52M53.8,292.46H640.52"/><path class="cls-2" d="M626.77,9V363.3M512.18,9V363.3M397.58,9V363.3M283,9V363.3M168.37,9V363.3M53.8,9V363.3"/><line class="cls-2" x1="53.8" y1="363.3" x2="640.52" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.01 82.45 363.01 84.73 363.01 87.02 363.01 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 362.89 103.09 362.89 105.38 362.89 107.66 362.89 109.94 362.89 112.22 362.89 114.5 362.89 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.77 128.29 362.77 130.57 362.77 132.85 362.77 135.13 362.77 137.41 362.77 139.69 362.77 142.09 362.77 144.38 362.77 146.66 362.65 148.94 362.65 151.22 362.65 153.5 362.65 155.78 362.65 158.06 362.65 160.34 362.65 162.62 362.65 165.01 362.65 167.29 362.65 169.57 362.65 171.85 362.54 174.13 362.54 176.41 362.54 178.69 362.54 180.97 362.54 183.25 362.54 185.53 362.54 187.94 362.54 190.22 362.54 192.5 362.54 194.78 362.42 197.06 362.42 199.34 362.42 201.62 362.42 203.9 362.42 206.18 362.42 208.46 362.42 210.85 362.42 213.13 362.42 215.41 362.42 217.69 362.42 219.97 362.3 222.25 362.3 224.53 362.3 226.81 362.3 229.09 362.3 231.38 362.3 233.78 362.3 236.06 362.3 238.34 362.3 240.62 362.3 242.9 362.18 245.18 362.18 247.46 362.18 249.74 362.18 252.01 362.18 254.29 362.18 256.69 362.18 258.98 362.18 261.25 362.18 263.54 362.18 265.81 362.18 268.1 362.06 270.38 362.06 272.65 362.06 274.94 362.06 277.21 362.06 279.62 362.06 281.89 361.94 284.18 361.94 286.45 361.94 288.74 361.94 291.01 361.94 293.3 361.94 295.57 361.81 297.86 361.81 300.13 361.81 302.42 361.81 304.81 361.81 307.1 361.81 309.38 361.69 311.65 361.69 313.94 361.69 316.21 361.69 318.5 361.69 320.77 361.69 323.06 361.57 325.33 361.57 327.74 361.57 330.01 361.57 332.3 361.57 334.57 361.45 336.86 361.45 339.13 361.45 341.42 361.33 343.69 361.33 345.98 361.33 348.25 361.33 350.65 361.33 352.94 361.21 355.21 361.21 357.5 361.21 359.77 361.21 362.06 361.1 364.33 361.1 366.62 361.1 368.89 361.1 371.18 360.98 373.57 360.98 375.86 360.98 378.13 360.86 380.42 360.86 382.69 360.86 384.98 360.74 387.25 360.74 389.54 360.74 391.81 360.62 394.1 360.62 396.5 360.62 398.77 360.62 401.06 360.5 403.33 360.5 405.62 360.38 407.89 360.38 410.18 360.38 412.45 360.25 414.74 360.25 417.01 360.25 419.42 360.13 421.69 360.13 423.98 360.13 426.25 360.01 428.54 360.01 430.81 359.89 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.65 446.89 359.65 449.18 359.54 451.45 359.54 453.74 359.42 456.01 359.42 458.3 359.3 460.57 359.3 462.86 359.18 465.25 359.18 467.54 359.06 469.81 359.06 472.1 358.94 474.38 358.94 476.65 358.81 478.94 358.81 481.21 358.69 483.5 358.69 485.77 358.57 488.06 358.57 490.45 358.45 492.74 358.45 495.01 358.33 497.3 358.33 499.57 358.21 501.86 358.1 504.13 358.1 506.42 357.98 508.69 357.98 510.98 357.86 513.38 357.74 515.65 357.74 517.93 357.62 520.22 357.5 522.5 357.5 524.77 357.38 527.05 357.25 529.34 357.25 531.62 357.13 533.89 357.01 536.29 357.01 538.58 356.89 540.86 356.77 543.13 356.65 545.41 356.54 547.7 356.42 549.98 356.42 552.25 356.3 554.53 356.18 556.82 356.06 559.22 355.94 561.5 355.81 563.77 355.69 566.05 355.45 568.34 355.33 570.62 355.21 572.89 355.1 575.17 354.98 577.46 354.74 579.74 354.62 582.13 354.38 584.41 354.25 586.7 354.01 588.98 353.77 591.25 353.65 593.53 353.42 595.82 353.18 598.1 352.81 600.38 352.57 602.65 352.33 605.05 351.98 607.34 351.74 609.62 351.38 611.89 351.01 614.17 350.65 616.46 350.18 618.74 349.81 621.01 349.33 623.29 348.86 625.58 348.25 627.98 347.77 630.25 347.18 632.53 346.45 634.82 345.86 637.1 345.13 639.38 344.37"/></g><circle class="cls-5" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-5" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-5" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-6" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-6" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-5" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-6" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-5" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-6" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-5" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-6" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-5" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-6" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-5" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-6" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-5" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-6" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-5" cx="93.79" cy="362.96" r="1.98"/><circle class="cls-6" cx="93.79" cy="362.96" r="1.98"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-5" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-5" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-6" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-5" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-6" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-5" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-6" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-5" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-6" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-5" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-6" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-5" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-6" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-5" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-6" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-5" cx="135.07" cy="362.72" r="1.98"/><circle class="cls-6" cx="135.07" cy="362.72" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-6" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-5" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-6" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-5" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-6" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-5" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-6" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-5" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-6" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-5" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-6" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-5" cx="158" cy="362.6" r="1.98"/><circle class="cls-6" cx="158" cy="362.6" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-5" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-6" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-5" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-6" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-5" cx="171.79" cy="362.48" r="1.98"/><circle class="cls-6" cx="171.79" cy="362.48" r="1.98"/><circle class="cls-5" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-6" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-5" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-6" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-6" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-5" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-6" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-5" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-6" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-5" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-6" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-5" cx="196.99" cy="362.36" r="1.98"/><circle class="cls-6" cx="196.99" cy="362.36" r="1.98"/><path class="cls-5" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-6" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-5" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-6" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-5" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-6" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-5" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-6" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-5" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path class="cls-6" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path class="cls-5" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-6" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-5" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-6" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-5" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-6" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-5" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-6" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-5" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><path class="cls-6" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><circle class="cls-5" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-6" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-5" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-6" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-5" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-6" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-5" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-6" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-5" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><circle class="cls-6" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><path class="cls-5" d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-6" d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-5" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-6" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-5" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-6" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-5" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-6" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-5" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-6" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-5" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-6" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-5" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-6" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-5" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-6" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-5" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-6" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-5" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-6" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-5" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-6" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-5" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><path class="cls-6" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><circle class="cls-5" cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-6" cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-5" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-6" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-5" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-6" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-5" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-6" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-5" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-6" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-5" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-6" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-5" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><circle class="cls-6" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><path class="cls-5" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-6" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-5" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-6" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-5" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-6" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-5" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-6" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-5" d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-6" d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-5" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-6" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-5" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-6" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-5" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-6" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-5" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-6" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-5" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-6" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-5" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-6" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-5" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><path class="cls-6" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><circle class="cls-5" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-6" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-5" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-6" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-5" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-6" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-5" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-6" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-5" cx="313.87" cy="361.64" r="1.98"/><circle class="cls-6" cx="313.87" cy="361.64" r="1.98"/><path class="cls-5" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-6" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-5" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-6" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-5" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-6" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-5" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-6" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-5" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-6" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-5" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-6" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-5" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-6" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-5" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-6" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-5" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-6" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-5" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-6" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-5" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-6" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-5" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-6" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-5" d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-6" d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-5" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><path class="cls-6" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><circle class="cls-5" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-6" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-5" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-6" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-5" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><circle class="cls-6" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><path class="cls-5" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-6" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-5" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-6" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-5" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-6" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-5" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-6" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-5" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-6" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-5" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-6" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-5" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-6" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-5" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-6" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-5" d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-6" d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-5" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-6" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-5" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-6" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-5" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-6" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-5" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-6" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-5" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><path class="cls-6" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><circle class="cls-5" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-6" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-5" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-6" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-5" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-6" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-5" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><circle class="cls-6" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><path class="cls-5" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-6" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-5" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path class="cls-6" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path class="cls-5" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-6" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-5" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-6" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-5" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-6" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-5" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-6" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-5" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-6" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-5" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-6" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-5" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-6" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-5" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-6" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-5" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-6" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-5" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-6" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><circle class="cls-5" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-6" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-5" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-6" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-5" cx="430.76" cy="359.84" r="1.98" transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-6" cx="430.76" cy="359.84" r="1.98" transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-5" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-6" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-5" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-6" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-5" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-6" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-5" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><circle class="cls-6" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><path class="cls-5" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-6" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-5" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-6" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-5" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-6" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-5" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-6" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-5" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-6" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-5" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-6" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-5" d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-6" d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-5" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-6" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-5" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><path class="cls-6" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><circle class="cls-5" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><circle class="cls-6" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><path class="cls-5" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><path class="cls-6" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><circle class="cls-5" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><circle class="cls-6" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><path class="cls-5" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><path class="cls-6" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><circle class="cls-5" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><circle class="cls-6" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><path class="cls-5" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><path class="cls-6" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><circle class="cls-5" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><circle class="cls-6" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><path class="cls-5" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><path class="cls-6" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><circle class="cls-5" cx="481.15" cy="358.64" r="1.98"/><circle class="cls-6" cx="481.15" cy="358.64" r="1.98"/><path class="cls-5" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><path class="cls-6" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><circle class="cls-5" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><circle class="cls-6" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><path class="cls-5" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-6" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-5" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-6" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-5" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-6" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-5" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-6" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-5" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-6" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-5" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><path class="cls-6" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><circle class="cls-5" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><circle class="cls-6" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><path class="cls-5" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><path class="cls-6" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><circle class="cls-5" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><circle class="cls-6" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><path class="cls-5" d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><path class="cls-6" d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><circle class="cls-5" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><circle class="cls-6" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><path class="cls-5" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><path class="cls-6" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><circle class="cls-5" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><path class="cls-5" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><path class="cls-6" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><circle class="cls-5" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><path class="cls-6" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><circle class="cls-5" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><circle class="cls-6" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><path class="cls-5" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><path class="cls-6" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><circle class="cls-5" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><circle class="cls-6" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><path class="cls-5" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-6" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-5" d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-6" d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-5" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-6" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-5" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><path class="cls-6" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><circle class="cls-5" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><circle class="cls-6" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><path class="cls-5" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><path class="cls-6" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><circle class="cls-5" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><circle class="cls-6" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><path class="cls-5" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><path class="cls-6" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><circle class="cls-5" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><circle class="cls-6" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><path class="cls-5" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><path class="cls-6" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><circle class="cls-5" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><circle class="cls-6" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><path class="cls-5" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><path class="cls-6" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><circle class="cls-5" cx="559.15" cy="355.88" r="1.98" transform="translate(178.31 898.33) rotate(-87.4)"/><circle class="cls-6" cx="559.15" cy="355.88" r="1.98" transform="translate(178.31 898.33) rotate(-87.4)"/><path class="cls-5" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><path class="cls-6" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><circle class="cls-5" cx="563.71" cy="355.64" r="1.98"/><circle class="cls-6" cx="563.71" cy="355.64" r="1.98"/><path class="cls-5" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><path class="cls-6" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><circle class="cls-5" cx="568.27" cy="355.28" r="1.98"/><circle class="cls-6" cx="568.27" cy="355.28" r="1.98"/><path class="cls-5" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-6" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-5" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-6" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-5" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-6" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-5" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-6" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-5" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-6" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-5" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><path class="cls-6" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><circle class="cls-5" cx="584.36" cy="354.2" r="1.98"/><circle class="cls-6" cx="584.36" cy="354.2" r="1.98"/><path class="cls-5" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><path class="cls-6" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><circle class="cls-5" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><circle class="cls-6" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><path class="cls-5" d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><path class="cls-6" d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><circle class="cls-5" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><circle class="cls-6" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><path class="cls-5" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><path class="cls-6" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><circle class="cls-5" cx="598.03" cy="352.76" r="1.98"/><circle class="cls-6" cx="598.03" cy="352.76" r="1.98"/><path class="cls-5" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><path class="cls-6" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><circle class="cls-5" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><circle class="cls-6" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><path class="cls-5" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><path class="cls-6" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><circle class="cls-5" cx="607.27" cy="351.68" r="1.98"/><circle class="cls-6" cx="607.27" cy="351.68" r="1.98"/><path class="cls-5" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-6" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-5" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-6" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-5" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-6" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-5" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-6" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-5" d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-6" d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-5" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-6" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-5" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-6" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-5" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><path class="cls-6" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><circle class="cls-5" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><circle class="cls-6" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><path class="cls-5" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><path class="cls-6" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><circle class="cls-5" cx="632.48" cy="346.4" r="1.98"/><circle class="cls-6" cx="632.48" cy="346.4" r="1.98"/><path class="cls-5" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><path class="cls-6" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><circle class="cls-5" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><circle class="cls-6" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><path class="cls-5" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><path class="cls-6" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.01 68.66 363.01 71.06 363.01 73.33 362.89 75.61 362.89 77.89 362.89 80.17 362.77 82.45 362.77 84.73 362.77 87.02 362.65 89.3 362.65 91.58 362.54 93.86 362.54 96.25 362.54 98.53 362.42 100.81 362.42 103.09 362.3 105.38 362.3 107.66 362.18 109.94 362.18 112.22 362.18 114.5 362.06 116.78 362.06 119.17 361.94 121.45 361.94 123.73 361.81 126.02 361.81 128.29 361.69 130.57 361.69 132.85 361.69 135.13 361.57 137.41 361.57 139.69 361.45 142.09 361.45 144.38 361.33 146.66 361.33 148.94 361.21 151.22 361.21 153.5 361.1 155.78 361.1 158.06 361.1 160.34 360.98 162.62 360.98 165.01 360.86 167.29 360.86 169.57 360.74 171.85 360.74 174.13 360.62 176.41 360.62 178.69 360.62 180.97 360.5 183.25 360.5 185.53 360.38 187.94 360.38 190.22 360.25 192.5 360.25 194.78 360.13 197.06 360.13 199.34 360.13 201.62 360.01 203.9 360.01 206.18 359.89 208.46 359.89 210.85 359.89 213.13 359.77 215.41 359.77 217.69 359.65 219.97 359.65 222.25 359.54 224.53 359.54 226.81 359.54 229.09 359.42 231.38 359.42 233.78 359.3 236.06 359.3 238.34 359.18 240.62 359.18 242.9 359.18 245.18 359.06 247.46 359.06 249.74 358.94 252.01 358.94 254.29 358.94 256.69 358.81 258.98 358.81 261.25 358.69 263.54 358.69 265.81 358.69 268.1 358.57 270.38 358.57 272.65 358.57 274.94 358.45 277.21 358.33 279.62 358.21 281.89 358.21 284.18 358.1 286.45 357.98 288.74 357.86 291.01 357.86 293.3 357.74 295.57 357.62 297.86 357.5 300.13 357.5 302.42 357.38 304.81 357.25 307.1 357.25 309.38 357.13 311.65 357.01 313.94 356.89 316.21 356.89 318.5 356.77 320.77 356.65 323.06 356.65 325.33 356.54 327.74 356.42 330.01 356.3 332.3 356.18 334.57 356.06 336.86 355.94 339.13 355.81 341.42 355.69 343.69 355.57 345.98 355.45 348.25 355.33 350.65 355.21 352.94 355.1 355.21 354.98 357.5 354.86 359.77 354.74 362.06 354.62 364.33 354.5 366.62 354.5 368.89 354.25 371.18 354.13 373.57 354.01 375.86 353.89 378.13 353.65 380.42 353.54 382.69 353.42 384.98 353.3 387.25 353.18 389.54 352.94 391.81 352.81 394.1 352.69 396.5 352.57 398.77 352.33 401.06 352.21 403.33 351.98 405.62 351.86 407.89 351.74 410.18 351.5 412.45 351.38 414.74 351.13 417.01 351.01 419.42 350.77 421.69 350.65 423.98 350.42 426.25 350.3 428.54 350.06 430.81 349.81 433.1 349.69 435.38 349.45 437.65 349.33 439.94 349.1 442.33 348.86 444.62 348.74 446.89 348.5 449.18 348.25 451.45 348.01 453.74 347.89 456.01 347.65 458.3 347.42 460.57 347.18 462.86 346.94 465.25 346.69 467.54 346.45 469.81 346.21 472.1 345.98 474.38 345.74 476.65 345.5 478.94 345.25 481.21 345.01 483.5 344.77 485.77 344.54 488.06 344.3 490.45 344.06 492.74 343.69 495.01 343.45 497.3 343.21 499.57 342.98 501.86 342.62 504.13 342.38 506.42 342.13 508.69 341.77 510.98 341.54 513.38 341.18 515.65 340.94 517.93 340.57 520.22 340.33 522.5 339.98 524.77 339.62 527.05 339.38 529.34 339.01 531.62 338.65 533.89 338.3 536.29 337.94 538.58 337.57 540.86 337.21 543.13 336.74 545.41 336.38 547.7 336.01 549.98 335.54 552.25 335.06 554.53 334.69 556.82 334.21 559.22 333.74 561.5 333.25 563.77 332.65 566.05 332.18 568.34 331.57 570.62 331.1 572.89 330.38 575.17 329.77 577.46 329.18 579.74 328.45 582.13 327.74 584.41 327.01 586.7 326.18 588.98 325.33 591.25 324.5 593.53 323.54 595.82 322.57 598.1 321.62 600.38 320.54 602.65 319.33 605.05 318.13 607.34 316.81 609.62 315.5 611.89 314.06 614.17 312.5 616.46 310.81 618.74 309.13 621.01 307.21 623.29 305.3 625.58 303.25 627.98 300.98 630.25 298.69 632.53 296.18 634.82 293.42 637.1 290.54 639.38 287.51"/></g><circle class="cls-8" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-9" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-8" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-9" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" cy="362.84" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.84" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-8" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-9" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-8" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-9" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-8" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-9" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-8" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-9" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-8" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-8" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-9" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-8" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-9" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-8" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-9" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-8" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-9" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-8" cx="114.43" cy="362" r="1.98"/><circle class="cls-9" cx="114.43" cy="362" r="1.98"/><circle class="cls-8" cx="116.71" cy="362" r="1.98"/><circle class="cls-9" cx="116.71" cy="362" r="1.98"/><circle class="cls-8" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-9" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-8" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-9" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-8" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-9" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-8" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-9" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-8" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-9" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-8" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-9" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-8" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-9" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-8" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-9" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-9" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-8" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-8" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-9" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-8" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-9" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-8" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-9" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-8" cx="153.43" cy="361.04" r="1.98"/><circle class="cls-9" cx="153.43" cy="361.04" r="1.98"/><circle class="cls-8" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-9" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-8" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-9" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-8" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-9" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-8" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-9" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-8" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-9" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-8" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-9" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-8" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-9" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-8" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-9" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-8" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-9" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-8" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-9" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-8" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-9" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-9" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-8" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-9" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-8" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-9" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.32" r="1.98"/><circle class="cls-9" cx="187.88" cy="360.32" r="1.98"/><circle class="cls-8" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-9" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-8" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-9" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-8" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-9" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-8" cx="196.99" cy="360.08" r="1.98"/><circle class="cls-9" cx="196.99" cy="360.08" r="1.98"/><path class="cls-8" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-9" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-8" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-9" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-8" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-9" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-8" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-9" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-8" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-9" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-8" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-9" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-8" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-9" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-8" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-9" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-8" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-9" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-8" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><path class="cls-9" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><circle class="cls-8" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-9" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-8" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-9" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-8" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-9" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-8" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-9" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-8" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><circle class="cls-9" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><path class="cls-8" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-9" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-8" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-9" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-8" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-9" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-8" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-9" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-8" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-9" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-8" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path class="cls-9" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path class="cls-8" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-9" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-8" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-9" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-8" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-9" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-8" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-9" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-8" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-9" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-8" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><path class="cls-9" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><circle class="cls-8" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-9" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-8" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-9" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-8" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-9" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-8" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-9" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-8" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-9" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-8" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-9" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-8" cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><circle class="cls-9" cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><path class="cls-8" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-9" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-8" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-9" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-8" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-9" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-8" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-9" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-8" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-9" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-8" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-9" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-8" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-9" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-8" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-9" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-8" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-9" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-8" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-9" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-8" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-9" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-8" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><path class="cls-9" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><circle class="cls-8" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-9" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-8" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-9" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-8" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-9" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-8" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-9" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-8" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><circle class="cls-9" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><path class="cls-8" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-9" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-8" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-9" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-8" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-9" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-8" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-9" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-8" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-9" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-8" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-9" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-8" d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-9" d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-8" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-9" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-8" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-9" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-8" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-9" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-8" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-9" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-8" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-9" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-8" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-9" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-8" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><path class="cls-9" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><circle class="cls-8" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-9" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-8" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-9" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-8" cx="352.87" cy="355.04" r="1.98"/><circle class="cls-9" cx="352.87" cy="355.04" r="1.98"/><path class="cls-8" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-9" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-8" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-9" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-8" d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-9" d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-8" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-9" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-8" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-9" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-8" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-9" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-8" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-9" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-8" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-9" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-8" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-9" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-8" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-9" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-8" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-9" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-8" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-9" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-8" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-9" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-8" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><path class="cls-9" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><circle class="cls-8" cx="387.2" cy="353.12" r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-9" cx="387.2" cy="353.12" r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-8" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-9" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-8" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-9" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-8" cx="394.03" cy="352.64" r="1.98"/><circle class="cls-9" cx="394.03" cy="352.64" r="1.98"/><path class="cls-8" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-9" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-8" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-9" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-8" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-9" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-8" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-9" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-8" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-9" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-8" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-9" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-8" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-9" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-8" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-9" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-8" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-9" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-8" d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-9" d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-8" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-9" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-8" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-9" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-8" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><path class="cls-9" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><circle class="cls-8" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-9" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-8" cx="428.48" cy="350" r="1.98"/><circle class="cls-9" cx="428.48" cy="350" r="1.98"/><circle class="cls-8" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-9" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-8" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-9" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-8" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-9" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-8" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-9" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-8" cx="439.87" cy="349.04" r="1.98"/><circle class="cls-9" cx="439.87" cy="349.04" r="1.98"/><path class="cls-8" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-9" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-8" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-9" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-8" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-9" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-8" d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-9" d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-8" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-9" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-8" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-9" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-8" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-9" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-8" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-9" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-8" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><path class="cls-9" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><circle class="cls-8" cx="462.8" cy="346.88" r="1.98"/><circle class="cls-9" cx="462.8" cy="346.88" r="1.98"/><path class="cls-8" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><path class="cls-9" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><circle class="cls-8" cx="467.48" cy="346.4" r="1.98"/><circle class="cls-9" cx="467.48" cy="346.4" r="1.98"/><path class="cls-8" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><path class="cls-9" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><circle class="cls-8" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><circle class="cls-9" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><path class="cls-8" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><path class="cls-9" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><circle class="cls-8" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><circle class="cls-9" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><path class="cls-8" d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><path class="cls-9" d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><circle class="cls-8" cx="481.15" cy="344.96" r="1.98"/><circle class="cls-9" cx="481.15" cy="344.96" r="1.98"/><path class="cls-8" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><path class="cls-9" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><circle class="cls-8" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><circle class="cls-9" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><path class="cls-8" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-9" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-8" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-9" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-8" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-9" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-8" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-9" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-8" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-9" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-8" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><path class="cls-9" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><circle class="cls-8" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><circle class="cls-9" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><path class="cls-8" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><path class="cls-9" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><circle class="cls-8" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) rotate(-85.93)"/><circle class="cls-9" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) rotate(-85.93)"/><path class="cls-8" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><path class="cls-9" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><circle class="cls-8" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><circle class="cls-9" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><path class="cls-8" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><path class="cls-9" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><circle class="cls-8" cx="515.59" cy="340.88" r="1.98"/><circle class="cls-9" cx="515.59" cy="340.88" r="1.98"/><path class="cls-8" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><path class="cls-9" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><circle class="cls-8" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><circle class="cls-9" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><path class="cls-8" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><path class="cls-9" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><circle class="cls-8" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><circle class="cls-9" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><path class="cls-8" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><path class="cls-9" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><circle class="cls-8" cx="529.27" cy="338.96" r="1.98"/><circle class="cls-9" cx="529.27" cy="338.96" r="1.98"/><path class="cls-8" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-9" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-8" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path class="cls-9" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path class="cls-8" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-9" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-8" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><path class="cls-9" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><circle class="cls-8" cx="540.8" cy="337.16" r="1.98"/><circle class="cls-9" cx="540.8" cy="337.16" r="1.98"/><path class="cls-8" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><path class="cls-9" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><circle class="cls-8" cx="545.36" cy="336.32" r="1.98"/><circle class="cls-9" cx="545.36" cy="336.32" r="1.98"/><path class="cls-8" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><path class="cls-9" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><circle class="cls-8" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><circle class="cls-9" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><path class="cls-8" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><path class="cls-9" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><circle class="cls-8" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><circle class="cls-9" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><path class="cls-8" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><path class="cls-9" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><circle class="cls-8" cx="559.15" cy="333.68" r="1.98"/><circle class="cls-9" cx="559.15" cy="333.68" r="1.98"/><path class="cls-8" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><path class="cls-9" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><circle class="cls-8" cx="563.71" cy="332.6" r="1.98"/><circle class="cls-9" cx="563.71" cy="332.6" r="1.98"/><path class="cls-8" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><path class="cls-9" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><circle class="cls-8" cx="568.27" cy="331.52" r="1.98"/><circle class="cls-9" cx="568.27" cy="331.52" r="1.98"/><path class="cls-8" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-9" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-8" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-9" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-8" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-9" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-8" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-9" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-8" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-9" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-8" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><path class="cls-9" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><circle class="cls-8" cx="584.36" cy="326.96" r="1.98"/><circle class="cls-9" cx="584.36" cy="326.96" r="1.98"/><path class="cls-8" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><path class="cls-9" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><circle class="cls-8" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><circle class="cls-9" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><path class="cls-8" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><path class="cls-9" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><circle class="cls-8" cx="593.48" cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><circle class="cls-9" cx="593.48" cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><path class="cls-8" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><path class="cls-9" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><circle class="cls-8" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><circle class="cls-9" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><path class="cls-8" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><path class="cls-9" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><circle class="cls-8" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><circle class="cls-9" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><path class="cls-8" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><path class="cls-9" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><circle class="cls-8" cx="607.27" cy="316.76" r="1.98"/><circle class="cls-9" cx="607.27" cy="316.76" r="1.98"/><path class="cls-8" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-9" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-8" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-9" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-8" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-9" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-8" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-9" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-8" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-9" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-8" d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-9" d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-8" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-9" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-8" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><path class="cls-9" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><circle class="cls-8" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><circle class="cls-9" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><path class="cls-8" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><path class="cls-9" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><circle class="cls-8" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><circle class="cls-9" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><path class="cls-8" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><path class="cls-9" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><circle class="cls-8" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><circle class="cls-9" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><path class="cls-8" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><path class="cls-9" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.24 57.26 363.13 59.53 363.01 61.81 362.89 64.09 362.77 66.38 362.65 68.66 362.54 71.06 362.42 73.33 362.3 75.61 362.18 77.89 362.06 80.17 361.81 82.45 361.69 84.73 361.45 87.02 361.33 89.3 361.1 91.58 360.98 93.86 360.74 96.25 360.5 98.53 360.38 100.81 360.13 103.09 359.89 105.38 359.77 107.66 359.54 109.94 359.3 112.22 359.06 114.5 358.81 116.78 358.69 119.17 358.45 121.45 358.21 123.73 357.98 126.02 357.74 128.29 357.5 130.57 357.38 132.85 357.13 135.13 356.89 137.41 356.65 139.69 356.42 142.09 356.18 144.38 355.94 146.66 355.69 148.94 355.57 151.22 355.33 153.5 355.1 155.78 354.86 158.06 354.62 160.34 354.38 162.62 354.13 165.01 353.89 167.29 353.77 169.57 353.54 171.85 353.3 174.13 353.06 176.41 352.81 178.69 352.69 180.97 352.45 183.25 352.21 185.53 351.98 187.94 351.74 190.22 351.5 192.5 351.38 194.78 351.13 197.06 350.89 199.34 350.65 201.62 350.54 203.9 350.3 206.18 350.06 208.46 349.81 210.85 349.57 213.13 349.45 215.41 349.21 217.69 348.98 219.97 348.86 222.25 348.62 224.53 348.38 226.81 348.25 229.09 348.01 231.38 347.77 233.78 347.54 236.06 347.42 238.34 347.18 240.62 346.94 242.9 346.81 245.18 346.57 247.46 346.45 249.74 346.21 252.01 345.98 254.29 345.86 256.69 345.62 258.98 345.38 261.25 345.25 263.54 345.01 265.81 344.89 268.1 344.65 270.38 344.42 272.65 344.3 274.94 343.94 277.21 343.57 279.62 343.21 281.89 342.86 284.18 342.5 286.45 342.13 288.74 341.77 291.01 341.42 293.3 341.06 295.57 340.69 297.86 340.33 300.13 339.98 302.42 339.62 304.81 339.38 307.1 339.01 309.38 338.65 311.65 338.3 313.94 337.94 316.21 337.57 318.5 337.21 320.77 336.86 323.06 336.62 325.33 336.25 327.74 335.89 330.01 335.54 332.3 335.06 334.57 334.57 336.86 334.1 339.13 333.62 341.42 333.13 343.69 332.65 345.98 332.18 348.25 331.69 350.65 331.21 352.94 330.74 355.21 330.25 357.5 329.77 359.77 329.3 362.06 328.81 364.33 328.33 366.62 327.86 368.89 327.38 371.18 326.77 373.57 326.18 375.86 325.57 378.13 324.98 380.42 324.38 382.69 323.77 384.98 323.18 387.25 322.57 389.54 321.98 391.81 321.38 394.1 320.89 396.5 320.3 398.77 319.69 401.06 318.98 403.33 318.25 405.62 317.54 407.89 316.94 410.18 316.21 412.45 315.5 414.74 314.77 417.01 314.18 419.42 313.45 421.69 312.74 423.98 312.01 426.25 311.18 428.54 310.33 430.81 309.62 433.1 308.77 435.38 308.06 437.65 307.33 439.94 306.5 442.33 305.77 444.62 304.94 446.89 304.1 449.18 303.13 451.45 302.3 453.74 301.45 456.01 300.62 458.3 299.77 460.57 298.81 462.86 297.98 465.25 297.01 467.54 296.06 469.81 295.21 472.1 294.25 474.38 293.3 476.65 292.21 478.94 291.25 481.21 290.3 483.5 289.33 485.77 288.25 488.06 287.3 490.45 286.21 492.74 285.13 495.01 284.06 497.3 282.98 499.57 281.89 501.86 280.69 504.13 279.62 506.42 278.54 508.69 277.33 510.98 276.13 513.38 274.94 515.65 273.74 517.93 272.54 520.22 271.21 522.5 269.89 524.77 268.69 527.05 267.38 529.34 265.94 531.62 264.62 533.89 263.18 536.29 261.74 538.58 260.3 540.86 258.74 543.13 257.3 545.41 255.62 547.7 253.94 549.98 252.25 552.25 250.57 554.53 248.78 556.82 246.85 559.22 244.94 561.5 242.9 563.77 240.97 566.05 238.69 568.34 236.53 570.62 234.25 572.89 231.85 575.17 229.34 577.46 226.69 579.74 223.94 582.13 221.18 584.41 218.06 586.7 214.94 588.98 211.57 591.25 208.22 593.53 204.38 595.82 200.53 598.1 196.46 600.38 192.13 602.65 187.46 605.05 182.66 607.34 177.38 609.62 171.97 611.89 166.09 614.17 159.97 616.46 153.5 618.74 146.66 621.01 139.22 623.29 131.53 625.58 123.14 627.98 114.25 630.25 104.89 632.53 94.81 634.82 83.89 637.1 72.5 639.38 60.2"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-12" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-11" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-12" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-11" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-12" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-11" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-12" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-11" cx="73.34" cy="362.3" r="2.52"/><circle class="cls-12" cx="73.34" cy="362.3" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-12" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-11" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-12" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-11" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-12" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-11" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-12" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-11" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-12" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-11" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-12" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-11" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-12" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-11" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-12" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-11" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-12" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-12" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-11" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-11" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-12" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-11" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-12" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-11" cx="107.65" cy="359.54" r="2.52"/><circle class="cls-12" cx="107.65" cy="359.54" r="2.52"/><circle class="cls-11" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-12" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-11" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-12" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-11" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-12" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-11" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-12" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-11" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-12" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-11" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-12" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-11" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-12" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-11" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-12" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-11" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-12" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-11" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-12" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-11" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-12" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-11" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-12" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-11" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-12" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-11" cx="139.69" cy="356.42" r="2.52"/><circle class="cls-12" cx="139.69" cy="356.42" r="2.52"/><circle class="cls-11" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-12" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-11" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-12" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-11" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-12" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-11" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-12" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-11" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-12" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-11" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-12" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-11" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-12" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-11" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-12" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-11" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-12" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-11" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-12" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-11" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-12" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-11" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-12" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-11" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-12" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-11" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-12" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-11" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-12" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-11" cx="176.42" cy="352.82" r="2.52"/><circle class="cls-12" cx="176.42" cy="352.82" r="2.52"/><circle class="cls-11" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-12" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-11" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-12" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-11" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-12" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-11" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-12" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-11" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-12" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-11" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-12" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-11" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-12" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-11" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-12" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-11" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-12" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-11" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-12" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-11" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-12" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-11" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-12" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-11" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-12" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-11" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-12" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-11" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) rotate(-9.22)"/><circle class="cls-12" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) rotate(-9.22)"/><circle class="cls-11" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-12" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-11" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-12" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-11" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-12" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-11" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-12" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-11" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-12" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-11" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-12" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-11" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-12" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-11" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-12" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-11" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-12" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-11" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-12" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-11" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-12" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-11" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-12" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-11" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-12" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-11" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-12" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-11" cx="245.18" cy="346.58" r="2.52" transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-12" cx="245.18" cy="346.58" r="2.52" transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-11" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-12" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-11" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-12" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-11" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-12" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-11" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-12" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-11" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-12" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-11" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-12" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-11" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-12" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-11" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-12" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-11" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-12" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-11" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-12" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-11" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-12" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-11" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-12" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-11" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-12" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-11" cx="277.22" cy="343.58" r="2.52"/><circle class="cls-12" cx="277.22" cy="343.58" r="2.52"/><circle class="cls-11" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-12" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-11" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-12" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-12" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-11" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-12" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-11" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-12" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-11" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-12" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-11" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-12" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-11" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-12" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-11" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-12" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-11" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) rotate(-84.34)"/><circle class="cls-12" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) rotate(-84.34)"/><circle class="cls-11" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-12" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-11" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-12" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-11" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-12" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-11" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-12" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-11" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-12" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-11" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-12" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-11" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-12" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-11" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-12" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-11" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-12" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-11" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-12" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-11" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-12" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-11" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-12" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-11" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-12" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-11" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-12" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-11" cx="334.58" cy="334.58" r="2.52" transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-12" cx="334.58" cy="334.58" r="2.52" transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-11" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-12" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-11" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-12" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-11" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-12" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-11" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-12" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-11" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-12" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-11" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-12" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-11" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-12" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-11" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-12" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-11" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-12" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-11" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-12" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-11" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-12" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-11" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-12" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-11" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-12" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-11" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-12" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-11" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-12" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-11" cx="371.18" cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-12" cx="371.18" cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-11" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-12" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-11" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-12" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-11" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-12" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-11" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-12" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-11" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-12" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-11" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-12" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-11" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-12" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-11" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-12" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-11" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-12" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-11" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-12" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-11" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-12" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-11" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-12" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-11" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-12" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-11" cx="403.33" cy="318.26" r="2.52"/><circle class="cls-12" cx="403.33" cy="318.26" r="2.52"/><circle class="cls-11" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-12" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-11" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-12" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-11" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-12" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-11" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-12" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-11" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-12" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-11" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-12" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-11" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-12" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-11" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-12" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-11" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-12" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-11" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-12" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-11" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-12" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-11" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-12" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-11" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-12" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-11" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-12" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-11" cx="437.66" cy="307.34" r="2.52"/><circle class="cls-12" cx="437.66" cy="307.34" r="2.52"/><circle class="cls-11" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-12" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-11" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-12" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-11" cx="446.89" cy="304.1" r="2.52"/><circle class="cls-12" cx="446.89" cy="304.1" r="2.52"/><path class="cls-11" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><path class="cls-12" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><circle class="cls-11" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><circle class="cls-12" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><path class="cls-11" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-12" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-11" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><path class="cls-12" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><circle class="cls-11" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><circle class="cls-12" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><path class="cls-11" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><path class="cls-12" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><circle class="cls-11" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><circle class="cls-12" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><path class="cls-11" d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><path class="cls-12" d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><circle class="cls-11" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><circle class="cls-12" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><path class="cls-11" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><path class="cls-12" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><circle class="cls-11" cx="472.09" cy="294.26" r="2.52"/><circle class="cls-12" cx="472.09" cy="294.26" r="2.52"/><path class="cls-11" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><path class="cls-12" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><circle class="cls-11" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><circle class="cls-12" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><path class="cls-11" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><path class="cls-12" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><circle class="cls-11" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><circle class="cls-12" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><path class="cls-11" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><path class="cls-12" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><circle class="cls-11" cx="485.77" cy="288.26" r="2.52"/><circle class="cls-12" cx="485.77" cy="288.26" r="2.52"/><path class="cls-11" d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-12" d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-11" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-12" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-11" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-12" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-11" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><path class="cls-12" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><circle class="cls-11" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><circle class="cls-12" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><path class="cls-11" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><path class="cls-12" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><circle class="cls-11" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><circle class="cls-12" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><path class="cls-11" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><path class="cls-12" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><circle class="cls-11" cx="506.42" cy="278.54" r="2.52"/><circle class="cls-12" cx="506.42" cy="278.54" r="2.52"/><path class="cls-11" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><path class="cls-12" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><circle class="cls-11" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) rotate(-5.65)"/><circle class="cls-12" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) rotate(-5.65)"/><path class="cls-11" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><path class="cls-12" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><circle class="cls-11" cx="515.65" cy="273.74" r="2.52"/><circle class="cls-12" cx="515.65" cy="273.74" r="2.52"/><path class="cls-11" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><path class="cls-12" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><circle class="cls-11" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><circle class="cls-12" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><path class="cls-11" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><path class="cls-12" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><circle class="cls-11" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><circle class="cls-12" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><path class="cls-11" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-12" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-11" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-12" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-11" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-12" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-11" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><path class="cls-12" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><circle class="cls-11" cx="536.3" cy="261.74" r="2.52"/><circle class="cls-12" cx="536.3" cy="261.74" r="2.52"/><path class="cls-11" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><path class="cls-12" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><circle class="cls-11" cx="540.86" cy="258.74" r="2.52"/><circle class="cls-12" cx="540.86" cy="258.74" r="2.52"/><path class="cls-11" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><path class="cls-12" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><circle class="cls-11" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><circle class="cls-12" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><path class="cls-11" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><path class="cls-12" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><circle class="cls-11" cx="549.98" cy="252.26" r="2.52"/><circle class="cls-12" cx="549.98" cy="252.26" r="2.52"/><path class="cls-11" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><path class="cls-12" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><circle class="cls-11" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><circle class="cls-12" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><path class="cls-11" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><path class="cls-12" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><circle class="cls-11" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><circle class="cls-12" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><path class="cls-11" d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><path class="cls-12" d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><circle class="cls-11" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><circle class="cls-12" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><path class="cls-11" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-12" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-11" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-12" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-11" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-12" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-11" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-12" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-11" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-12" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-11" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-12" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-11" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-12" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-11" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><path class="cls-12" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><circle class="cls-11" cx="584.42" cy="218.06" r="2.52" transform="translate(275.56 760) rotate(-80.78)"/><circle class="cls-12" cx="584.42" cy="218.06" r="2.52" transform="translate(275.56 760) rotate(-80.78)"/><path class="cls-11" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><path class="cls-12" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><circle class="cls-11" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><circle class="cls-12" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><path class="cls-11" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><path class="cls-12" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><circle class="cls-11" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><circle class="cls-12" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><path class="cls-11" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><path class="cls-12" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><circle class="cls-11" cx="598.09" cy="196.46" r="2.52"/><circle class="cls-12" cx="598.09" cy="196.46" r="2.52"/><path class="cls-11" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><path class="cls-12" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><circle class="cls-11" cx="602.65" cy="187.46" r="2.52"/><circle class="cls-12" cx="602.65" cy="187.46" r="2.52"/><path class="cls-11" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-12" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-11" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-12" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-11" d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-12" d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-11" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-12" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-11" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-12" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-11" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-12" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-11" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-12" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-11" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><path class="cls-12" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><circle class="cls-11" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><circle class="cls-12" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><path class="cls-11" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><path class="cls-12" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><circle class="cls-11" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><circle class="cls-12" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><path class="cls-11" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><path class="cls-12" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><circle class="cls-11" cx="632.53" cy="94.82" r="2.52" transform="translate(11.86 249.28) rotate(-22.5)"/><circle class="cls-12" cx="632.53" cy="94.82" r="2.52" transform="translate(11.86 249.28) rotate(-22.5)"/><path class="cls-11" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><path class="cls-12" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><circle class="cls-11" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><circle class="cls-12" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><path class="cls-11" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><path class="cls-12" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.93 365.91)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(27.23 295.04)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 224.17)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 153.31)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 82.43)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.56)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.7 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.35 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.99 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.35) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.64 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan x="19.54" y="0">i</tspan><tspan class="cls-16" x="23.97" y="0">n</tspan><tspan class="cls-17" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="477.71" y1="70.43" x2="496.92" y2="70.44"/><path class="cls-5" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><path class="cls-18" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 74.83)"><tspan class="cls-20">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-21" x="11.65" y="0">b</tspan><tspan class="cls-20" x="18.73" y="0">it</tspan><tspan class="cls-22" x="26.45" y="0"> </tspan><tspan class="cls-23" x="30.03" y="0">D</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="477.71" y1="91.78" x2="496.92" y2="91.78"/><path class="cls-8" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><path class="cls-24" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><g class="cls-13"><text class="cls-25" transform="translate(499.04 96.16)"><tspan class="cls-26">1</tspan><tspan class="cls-27" x="6.96" y="0">0</tspan><tspan x="14.01" y="0">-</tspan><tspan class="cls-28" x="18.69" y="0">b</tspan><tspan class="cls-29" x="25.65" y="0">i</tspan><tspan class="cls-30" x="29.5" y="0">t</tspan><tspan class="cls-31" x="33.45" y="0"> </tspan><tspan class="cls-32" x="36.97" y="0">D</tspan><tspan x="47.01" y="0">C</tspan></text></g><line class="cls-10" x1="477.71" y1="113.13" x2="496.92" y2="113.13"/><circle class="cls-11" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><circle class="cls-33" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 117.52)"><tspan class="cls-20">1</tspan><tspan class="cls-34" x="6.98" y="0">2</tspan><tspan class="cls-35" x="14.03" y="0">-</tspan><tspan class="cls-36" x="18.7" y="0">bi</tspan><tspan class="cls-37" x="29.54" y="0">t</tspan><tspan class="cls-38" x="33.48" y="0"> </tspan><tspan class="cls-39" x="36.99" y="0">D</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/scc_intrabc.svg b/media/libaom/src/doc/img/scc_intrabc.svg
new file mode 100644
index 0000000000..dfe4948861
--- /dev/null
+++ b/media/libaom/src/doc/img/scc_intrabc.svg
@@ -0,0 +1,348 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export scc_intrabc.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.4258in" height="2.9597in"
+		viewBox="0 0 390.657 213.098" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#d8d8d8;stroke:#000000;stroke-width:0.25}
+		.st2 {fill:#fec000;stroke:#000000;stroke-width:0.25}
+		.st3 {fill:#00fefe;stroke:#000000;stroke-width:0.25}
+		.st4 {fill:#ffffff;stroke:#000000;stroke-width:0.25}
+		.st5 {fill:#ffc000;stroke:#000000;stroke-width:0.25}
+		.st6 {fill:none;stroke:none;stroke-width:0.25}
+		.st7 {fill:#4672c4;font-family:Calibri;font-size:0.666664em}
+		.st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+				v:shadowOffsetY="-4.25197"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.25,-141.982)">
+			<title>Sheet.1</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape3-3" v:mID="3" v:groupContext="shape" transform="translate(28.5965,-141.982)">
+			<title>Sheet.3</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape4-5" v:mID="4" v:groupContext="shape" transform="translate(56.9429,-141.982)">
+			<title>Sheet.4</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape5-7" v:mID="5" v:groupContext="shape" transform="translate(85.2894,-141.982)">
+			<title>Sheet.5</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape6-9" v:mID="6" v:groupContext="shape" transform="translate(113.636,-141.982)">
+			<title>Sheet.6</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(141.982,-141.982)">
+			<title>Sheet.7</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(170.329,-141.982)">
+			<title>Sheet.8</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape9-15" v:mID="9" v:groupContext="shape" transform="translate(198.675,-141.982)">
+			<title>Sheet.9</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape10-17" v:mID="10" v:groupContext="shape" transform="translate(0.25,-113.636)">
+			<title>Sheet.10</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape11-19" v:mID="11" v:groupContext="shape" transform="translate(28.5965,-113.636)">
+			<title>Sheet.11</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape12-21" v:mID="12" v:groupContext="shape" transform="translate(56.9429,-113.636)">
+			<title>Sheet.12</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape13-23" v:mID="13" v:groupContext="shape" transform="translate(85.2894,-113.636)">
+			<title>Sheet.13</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape14-25" v:mID="14" v:groupContext="shape" transform="translate(113.636,-113.636)">
+			<title>Sheet.14</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape15-27" v:mID="15" v:groupContext="shape" transform="translate(141.982,-113.636)">
+			<title>Sheet.15</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape16-29" v:mID="16" v:groupContext="shape" transform="translate(170.329,-113.636)">
+			<title>Sheet.16</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape17-31" v:mID="17" v:groupContext="shape" transform="translate(198.675,-113.636)">
+			<title>Sheet.17</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape18-33" v:mID="18" v:groupContext="shape" transform="translate(0.25,-85.2894)">
+			<title>Sheet.18</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape19-35" v:mID="19" v:groupContext="shape" transform="translate(28.5965,-85.2894)">
+			<title>Sheet.19</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape20-37" v:mID="20" v:groupContext="shape" transform="translate(56.9429,-85.2894)">
+			<title>Sheet.20</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape21-39" v:mID="21" v:groupContext="shape" transform="translate(85.2894,-85.2894)">
+			<title>Sheet.21</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape22-41" v:mID="22" v:groupContext="shape" transform="translate(113.636,-85.2894)">
+			<title>Sheet.22</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape23-43" v:mID="23" v:groupContext="shape" transform="translate(141.982,-85.2894)">
+			<title>Sheet.23</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape24-45" v:mID="24" v:groupContext="shape" transform="translate(170.329,-85.2894)">
+			<title>Sheet.24</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape25-47" v:mID="25" v:groupContext="shape" transform="translate(198.675,-85.2894)">
+			<title>Sheet.25</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape26-49" v:mID="26" v:groupContext="shape" transform="translate(0.25,-56.9429)">
+			<title>Sheet.26</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape27-51" v:mID="27" v:groupContext="shape" transform="translate(28.5965,-56.9429)">
+			<title>Sheet.27</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape28-53" v:mID="28" v:groupContext="shape" transform="translate(56.9429,-56.9429)">
+			<title>Sheet.28</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape29-55" v:mID="29" v:groupContext="shape" transform="translate(85.2894,-56.9429)">
+			<title>Sheet.29</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape30-57" v:mID="30" v:groupContext="shape" transform="translate(113.636,-56.9429)">
+			<title>Sheet.30</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape31-59" v:mID="31" v:groupContext="shape" transform="translate(141.982,-56.9429)">
+			<title>Sheet.31</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape32-61" v:mID="32" v:groupContext="shape" transform="translate(170.329,-56.9429)">
+			<title>Sheet.32</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape33-63" v:mID="33" v:groupContext="shape" transform="translate(198.675,-56.9429)">
+			<title>Sheet.33</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape34-65" v:mID="34" v:groupContext="shape" transform="translate(227.022,-141.982)">
+			<title>Sheet.34</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape35-67" v:mID="35" v:groupContext="shape" transform="translate(255.368,-141.982)">
+			<title>Sheet.35</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape36-69" v:mID="36" v:groupContext="shape" transform="translate(283.715,-141.982)">
+			<title>Sheet.36</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+		</g>
+		<g id="shape37-71" v:mID="37" v:groupContext="shape" transform="translate(312.061,-141.982)">
+			<title>Sheet.37</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape38-73" v:mID="38" v:groupContext="shape" transform="translate(227.022,-113.636)">
+			<title>Sheet.38</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape39-75" v:mID="39" v:groupContext="shape" transform="translate(255.368,-113.636)">
+			<title>Sheet.39</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape40-77" v:mID="40" v:groupContext="shape" transform="translate(283.715,-113.636)">
+			<title>Sheet.40</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape41-79" v:mID="41" v:groupContext="shape" transform="translate(312.061,-113.636)">
+			<title>Sheet.41</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape42-81" v:mID="42" v:groupContext="shape" transform="translate(227.022,-85.2894)">
+			<title>Sheet.42</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape43-83" v:mID="43" v:groupContext="shape" transform="translate(255.368,-85.2894)">
+			<title>Sheet.43</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape44-85" v:mID="44" v:groupContext="shape" transform="translate(283.715,-85.2894)">
+			<title>Sheet.44</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape45-87" v:mID="45" v:groupContext="shape" transform="translate(312.061,-85.2894)">
+			<title>Sheet.45</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape46-89" v:mID="46" v:groupContext="shape" transform="translate(227.022,-56.9429)">
+			<title>Sheet.46</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape47-91" v:mID="47" v:groupContext="shape" transform="translate(255.368,-56.9429)">
+			<title>Sheet.47</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape48-93" v:mID="48" v:groupContext="shape" transform="translate(283.715,-56.9429)">
+			<title>Sheet.48</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape49-95" v:mID="49" v:groupContext="shape" transform="translate(312.061,-56.9429)">
+			<title>Sheet.49</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape50-97" v:mID="50" v:groupContext="shape" transform="translate(0.25,-28.5965)">
+			<title>Sheet.50</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape51-99" v:mID="51" v:groupContext="shape" transform="translate(28.5965,-28.5965)">
+			<title>Sheet.51</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape52-101" v:mID="52" v:groupContext="shape" transform="translate(56.9429,-28.5965)">
+			<title>Sheet.52</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+		</g>
+		<g id="shape53-103" v:mID="53" v:groupContext="shape" transform="translate(85.2894,-28.5965)">
+			<title>Sheet.53</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape54-105" v:mID="54" v:groupContext="shape" transform="translate(113.636,-28.5965)">
+			<title>Sheet.54</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape55-107" v:mID="55" v:groupContext="shape" transform="translate(141.982,-28.5965)">
+			<title>Sheet.55</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape56-109" v:mID="56" v:groupContext="shape" transform="translate(170.329,-28.5965)">
+			<title>Sheet.56</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape57-111" v:mID="57" v:groupContext="shape" transform="translate(198.675,-28.5965)">
+			<title>Sheet.57</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape58-113" v:mID="58" v:groupContext="shape" transform="translate(227.022,-28.5965)">
+			<title>Sheet.58</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape59-115" v:mID="59" v:groupContext="shape" transform="translate(255.368,-28.5965)">
+			<title>Sheet.59</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape60-117" v:mID="60" v:groupContext="shape" transform="translate(283.715,-28.5965)">
+			<title>Sheet.60</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape61-119" v:mID="61" v:groupContext="shape" transform="translate(312.061,-28.5965)">
+			<title>Sheet.61</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape62-121" v:mID="62" v:groupContext="shape" transform="translate(0.25,-0.25)">
+			<title>Sheet.62</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape63-123" v:mID="63" v:groupContext="shape" transform="translate(28.5965,-0.25)">
+			<title>Sheet.63</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape64-125" v:mID="64" v:groupContext="shape" transform="translate(56.9429,-0.25)">
+			<title>Sheet.64</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape65-127" v:mID="65" v:groupContext="shape" transform="translate(85.2894,-0.25)">
+			<title>Sheet.65</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape66-129" v:mID="66" v:groupContext="shape" transform="translate(113.636,-0.25)">
+			<title>Sheet.66</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape67-131" v:mID="67" v:groupContext="shape" transform="translate(141.982,-0.25)">
+			<title>Sheet.67</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape68-133" v:mID="68" v:groupContext="shape" transform="translate(170.329,-0.25)">
+			<title>Sheet.68</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape69-135" v:mID="69" v:groupContext="shape" transform="translate(198.675,-0.25)">
+			<title>Sheet.69</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape70-137" v:mID="70" v:groupContext="shape" transform="translate(227.022,-0.25)">
+			<title>Sheet.70</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape71-139" v:mID="71" v:groupContext="shape" transform="translate(255.368,-0.25)">
+			<title>Sheet.71</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape72-141" v:mID="72" v:groupContext="shape" transform="translate(283.715,-0.25)">
+			<title>Sheet.72</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape73-143" v:mID="73" v:groupContext="shape" transform="translate(312.061,-0.25)">
+			<title>Sheet.73</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+		</g>
+		<g id="shape74-145" v:mID="74" v:groupContext="shape" transform="translate(0.25,-184.502)">
+			<title>Sheet.74</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+		</g>
+		<g id="shape75-147" v:mID="75" v:groupContext="shape" transform="translate(255.368,-184.502)">
+			<title>Sheet.75</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+		</g>
+		<g id="shape76-149" v:mID="76" v:groupContext="shape" transform="translate(127.809,-184.502)">
+			<title>Sheet.76</title>
+			<rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+		</g>
+		<g id="shape79-151" v:mID="79" v:groupContext="shape" transform="translate(27.8091,-193.762)">
+			<title>Sheet.79</title>
+			<desc>Current processing block</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="9.78" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Current processing block</text>		</g>
+		<g id="shape80-154" v:mID="80" v:groupContext="shape" transform="translate(158.899,-192.675)">
+			<title>Sheet.80</title>
+			<desc>Allowed prediction block</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="9.68" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Allowed prediction block</text>		</g>
+		<g id="shape81-157" v:mID="81" v:groupContext="shape" transform="translate(290.407,-192.675)">
+			<title>Sheet.81</title>
+			<desc>Restricted immediate blocks</desc>
+			<v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+			<v:textRect cx="50" cy="207.098" width="100" height="12"/>
+			<rect x="0" y="201.098" width="100" height="12" class="st6"/>
+			<text x="3.92" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Restricted immediate blocks</text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/secondary_tap.svg b/media/libaom/src/doc/img/secondary_tap.svg
new file mode 100644
index 0000000000..4c6283de36
--- /dev/null
+++ b/media/libaom/src/doc/img/secondary_tap.svg
@@ -0,0 +1,857 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export secondary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+		xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="3.38188in"
+		viewBox="0 0 810.24 243.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+	<v:documentProperties v:langID="1033" v:viewMarkup="false">
+		<v:userDefs>
+			<v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+		</v:userDefs>
+	</v:documentProperties>
+
+	<style type="text/css">
+	<![CDATA[
+		.st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+		.st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+		.st4 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+		.st5 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+		.st6 {font-size:1em;font-style:normal}
+		.st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+	]]>
+	</style>
+
+	<g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+		<title>Page-1</title>
+		<v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+		<g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-189.375)">
+			<title>Square</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-189.375)">
+			<title>Square.2</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-189.375)">
+			<title>Square.3</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape4-8" v:mID="4" v:groupContext="shape" transform="translate(126.12,-189.375)">
+			<title>Square.4</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape5-10" v:mID="5" v:groupContext="shape" transform="translate(162.12,-189.375)">
+			<title>Square.5</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(18.12,-153.375)">
+			<title>Square.6</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape7-14" v:mID="7" v:groupContext="shape" transform="translate(54.12,-153.375)">
+			<title>Square.7</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape8-16" v:mID="8" v:groupContext="shape" transform="translate(90.12,-153.375)">
+			<title>Square.8</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-153.375)">
+			<title>Square.9</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape10-21" v:mID="10" v:groupContext="shape" transform="translate(162.12,-153.375)">
+			<title>Square.10</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape11-23" v:mID="11" v:groupContext="shape" transform="translate(18.12,-117.375)">
+			<title>Square.11</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape12-26" v:mID="12" v:groupContext="shape" transform="translate(54.12,-117.375)">
+			<title>Square.12</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-117.375)">
+			<title>Square.13</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-117.375)">
+			<title>Square.14</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape15-34" v:mID="15" v:groupContext="shape" transform="translate(162.12,-117.375)">
+			<title>Square.15</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape16-37" v:mID="16" v:groupContext="shape" transform="translate(18.12,-81.375)">
+			<title>Square.16</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape17-39" v:mID="17" v:groupContext="shape" transform="translate(54.12,-81.375)">
+			<title>Square.17</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-81.375)">
+			<title>Square.18</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape19-44" v:mID="19" v:groupContext="shape" transform="translate(126.12,-81.375)">
+			<title>Square.19</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape20-46" v:mID="20" v:groupContext="shape" transform="translate(162.12,-81.375)">
+			<title>Square.20</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape21-48" v:mID="21" v:groupContext="shape" transform="translate(18.12,-45.375)">
+			<title>Square.21</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape22-50" v:mID="22" v:groupContext="shape" transform="translate(54.12,-45.375)">
+			<title>Square.22</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape23-52" v:mID="23" v:groupContext="shape" transform="translate(90.12,-45.375)">
+			<title>Square.23</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-45.375)">
+			<title>Square.24</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-45.375)">
+			<title>Square.25</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-189.375)">
+			<title>Square.30</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-189.375)">
+			<title>Square.31</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-189.375)">
+			<title>Square.32</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-189.375)">
+			<title>Square.33</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape34-68" v:mID="34" v:groupContext="shape" transform="translate(360.12,-189.375)">
+			<title>Square.34</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape35-70" v:mID="35" v:groupContext="shape" transform="translate(216.12,-153.375)">
+			<title>Square.35</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape36-73" v:mID="36" v:groupContext="shape" transform="translate(252.12,-153.375)">
+			<title>Square.36</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape37-75" v:mID="37" v:groupContext="shape" transform="translate(288.12,-153.375)">
+			<title>Square.37</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape38-78" v:mID="38" v:groupContext="shape" transform="translate(324.12,-153.375)">
+			<title>Square.38</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape39-80" v:mID="39" v:groupContext="shape" transform="translate(360.12,-153.375)">
+			<title>Square.39</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape40-82" v:mID="40" v:groupContext="shape" transform="translate(216.12,-117.375)">
+			<title>Square.40</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape41-84" v:mID="41" v:groupContext="shape" transform="translate(252.12,-117.375)">
+			<title>Square.41</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-117.375)">
+			<title>Square.42</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-117.375)">
+			<title>Square.43</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape44-92" v:mID="44" v:groupContext="shape" transform="translate(360.12,-117.375)">
+			<title>Square.44</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape45-94" v:mID="45" v:groupContext="shape" transform="translate(216.12,-81.375)">
+			<title>Square.45</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape46-96" v:mID="46" v:groupContext="shape" transform="translate(252.12,-81.375)">
+			<title>Square.46</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape47-98" v:mID="47" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+			<title>Square.47</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape48-101" v:mID="48" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+			<title>Square.48</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape49-103" v:mID="49" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+			<title>Square.49</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape50-106" v:mID="50" v:groupContext="shape" transform="translate(216.12,-45.375)">
+			<title>Square.50</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape51-108" v:mID="51" v:groupContext="shape" transform="translate(252.12,-45.375)">
+			<title>Square.51</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-45.375)">
+			<title>Square.52</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-45.375)">
+			<title>Square.53</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-45.375)">
+			<title>Square.54</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-189.375)">
+			<title>Square.55</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape56-120" v:mID="56" v:groupContext="shape" transform="translate(450.12,-189.375)">
+			<title>Square.56</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape57-122" v:mID="57" v:groupContext="shape" transform="translate(486.12,-189.375)">
+			<title>Square.57</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape58-124" v:mID="58" v:groupContext="shape" transform="translate(522.12,-189.375)">
+			<title>Square.58</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape59-126" v:mID="59" v:groupContext="shape" transform="translate(558.12,-189.375)">
+			<title>Square.59</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape60-129" v:mID="60" v:groupContext="shape" transform="translate(414.12,-153.375)">
+			<title>Square.60</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape61-131" v:mID="61" v:groupContext="shape" transform="translate(450.12,-153.375)">
+			<title>Square.61</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape62-134" v:mID="62" v:groupContext="shape" transform="translate(486.12,-153.375)">
+			<title>Square.62</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape63-136" v:mID="63" v:groupContext="shape" transform="translate(522.12,-153.375)">
+			<title>Square.63</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape64-139" v:mID="64" v:groupContext="shape" transform="translate(558.12,-153.375)">
+			<title>Square.64</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape65-141" v:mID="65" v:groupContext="shape" transform="translate(414.12,-117.375)">
+			<title>Square.65</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape66-143" v:mID="66" v:groupContext="shape" transform="translate(450.12,-117.375)">
+			<title>Square.66</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-117.375)">
+			<title>Square.67</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-117.375)">
+			<title>Square.68</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape69-149" v:mID="69" v:groupContext="shape" transform="translate(558.12,-117.375)">
+			<title>Square.69</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape70-151" v:mID="70" v:groupContext="shape" transform="translate(414.12,-81.375)">
+			<title>Square.70</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape71-153" v:mID="71" v:groupContext="shape" transform="translate(450.12,-81.375)">
+			<title>Square.71</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape72-156" v:mID="72" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+			<title>Square.72</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape73-158" v:mID="73" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+			<title>Square.73</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape74-161" v:mID="74" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+			<title>Square.74</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape75-163" v:mID="75" v:groupContext="shape" transform="translate(414.12,-45.375)">
+			<title>Square.75</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape76-166" v:mID="76" v:groupContext="shape" transform="translate(450.12,-45.375)">
+			<title>Square.76</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape77-168" v:mID="77" v:groupContext="shape" transform="translate(486.12,-45.375)">
+			<title>Square.77</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape78-170" v:mID="78" v:groupContext="shape" transform="translate(522.12,-45.375)">
+			<title>Square.78</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape79-172" v:mID="79" v:groupContext="shape" transform="translate(558.12,-45.375)">
+			<title>Square.79</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-189.375)">
+			<title>Square.80</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-189.375)">
+			<title>Square.81</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape82-180" v:mID="82" v:groupContext="shape" transform="translate(684.12,-189.375)">
+			<title>Square.82</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape83-182" v:mID="83" v:groupContext="shape" transform="translate(720.12,-189.375)">
+			<title>Square.83</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape84-184" v:mID="84" v:groupContext="shape" transform="translate(756.12,-189.375)">
+			<title>Square.84</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape85-186" v:mID="85" v:groupContext="shape" transform="translate(612.12,-153.375)">
+			<title>Square.85</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape86-188" v:mID="86" v:groupContext="shape" transform="translate(648.12,-153.375)">
+			<title>Square.86</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape87-190" v:mID="87" v:groupContext="shape" transform="translate(684.12,-153.375)">
+			<title>Square.87</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-153.375)">
+			<title>Square.88</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-153.375)">
+			<title>Square.89</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape90-198" v:mID="90" v:groupContext="shape" transform="translate(612.12,-117.375)">
+			<title>Square.90</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape91-200" v:mID="91" v:groupContext="shape" transform="translate(648.12,-117.375)">
+			<title>Square.91</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-117.375)">
+			<title>Square.92</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st3"/>
+		</g>
+		<g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-117.375)">
+			<title>Square.93</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape94-208" v:mID="94" v:groupContext="shape" transform="translate(756.12,-117.375)">
+			<title>Square.94</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape95-210" v:mID="95" v:groupContext="shape" transform="translate(612.12,-81.375)">
+			<title>Square.95</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-81.375)">
+			<title>Square.96</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+			<title>Square.97</title>
+			<desc>2/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text>		</g>
+		<g id="shape98-218" v:mID="98" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+			<title>Square.98</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape99-220" v:mID="99" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+			<title>Square.99</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape100-222" v:mID="100" v:groupContext="shape" transform="translate(612.12,-45.375)">
+			<title>Square.100</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape101-224" v:mID="101" v:groupContext="shape" transform="translate(648.12,-45.375)">
+			<title>Square.101</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape102-226" v:mID="102" v:groupContext="shape" transform="translate(684.12,-45.375)">
+			<title>Square.102</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape103-228" v:mID="103" v:groupContext="shape" transform="translate(720.12,-45.375)">
+			<title>Square.103</title>
+			<desc>1/16</desc>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+			<text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text>		</g>
+		<g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-45.375)">
+			<title>Square.104</title>
+			<v:userDefs>
+				<v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+			</v:userDefs>
+			<rect x="0" y="207.495" width="36" height="36" class="st1"/>
+		</g>
+		<g id="shape236-233" v:mID="236" v:groupContext="shape" transform="translate(54.12,-18.375)">
+			<title>Sheet.236</title>
+			<desc>d = 0, 4</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 0, 4</tspan></text>		</g>
+		<g id="shape237-238" v:mID="237" v:groupContext="shape" transform="translate(252.12,-18.375)">
+			<title>Sheet.237</title>
+			<desc>d = 1, 5</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 1, 5</tspan></text>		</g>
+		<g id="shape238-243" v:mID="238" v:groupContext="shape" transform="translate(450.12,-18.375)">
+			<title>Sheet.238</title>
+			<desc>d = 2, 6</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 2, 6</tspan></text>		</g>
+		<g id="shape239-248" v:mID="239" v:groupContext="shape" transform="translate(648.12,-18.375)">
+			<title>Sheet.239</title>
+			<desc>d = 3, 7</desc>
+			<v:textBlock v:margins="rect(4,4,4,4)"/>
+			<v:textRect cx="54" cy="229.995" width="108" height="27"/>
+			<rect x="0" y="216.495" width="108" height="27" class="st4"/>
+			<text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+						class="st6">= 3, 7</tspan></text>		</g>
+	</g>
+</svg>
diff --git a/media/libaom/src/doc/img/tx_basis.svg b/media/libaom/src/doc/img/tx_basis.svg
new file mode 100644
index 0000000000..eb27b0314b
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_basis.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 381.36 266.69"><defs><style>.cls-1,.cls-21{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-15,.cls-24,.cls-5{font-size:11.04px;}.cls-5{font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{font-size:7.32px;}.cls-14,.cls-15{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0.01em;}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-13);}.cls-21{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}.cls-22{clip-path:url(#clip-path-14);}.cls-23{clip-path:url(#clip-path-15);}.cls-24,.cls-25,.cls-33{font-family:CambriaMath, Cambria Math;}.cls-25{font-size:8.04px;}.cls-26{fill-rule:evenodd;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-35);}.cls-29{clip-path:url(#clip-path-47);}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:-0.01em;}.cls-32{clip-path:url(#clip-path-98);}.cls-33{font-size:11.06px;}</style><clipPath id="clip-path" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="1.92" width="381.22" height="594.46"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="339.53" width="381.22" height="17.16"/></clipPath><clipPath id="clip-path-13" transform="translate(-1.43 -338.09)"><rect class="cls-1" width="385.18" height="598.42"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="132.72" y="370.06" width="181.68" height="53.04"/></clipPath><clipPath id="clip-path-15" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="0.96" y="0.94" width="382.08" height="595.32"/></clipPath><clipPath id="clip-path-35" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="137.4" y="411.46" width="131.16" height="27.24"/></clipPath><clipPath id="clip-path-47" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="131.52" y="451.78" width="194.28" height="41.52"/></clipPath><clipPath id="clip-path-98" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="134.4" y="566.98" width="105.84" height="18.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" x="0.01" y="0.96" width="381.34" height="18.24"/></g><g class="cls-4"><text class="cls-5" transform="translate(21.49 13.8)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan x="46.53" y="0"> </tspan><tspan class="cls-10" x="49.02" y="0">T</tspan><tspan class="cls-7" x="54.42" y="0">y</tspan><tspan class="cls-6" x="59.46" y="0">p</tspan><tspan x="65.22" y="0">e</tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(164.19 13.2)">Basis f<tspan class="cls-11" x="28.33" y="0">u</tspan><tspan class="cls-6" x="34.08" y="0">n</tspan><tspan class="cls-8" x="39.85" y="0">cti</tspan><tspan class="cls-12" x="50.67" y="0">o</tspan><tspan class="cls-13" x="56.55" y="0">n</tspan><tspan class="cls-8" x="62.31" y="0" xml:space="preserve"> T</tspan></text></g><g class="cls-4"><text class="cls-14" transform="translate(234.54 15.36)">i</text></g><g class="cls-4"><text class="cls-5" transform="translate(238.26 13.2)">(</text></g><g class="cls-4"><text class="cls-15" transform="translate(241.62 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(246.3 13.2)">), </text></g><g class="cls-4"><text class="cls-15" transform="translate(254.94 13.2)">i</text></g><g class="cls-4"><text class="cls-5" transform="translate(259.5 13.2)">, </text></g><g class="cls-4"><text class="cls-15" transform="translate(264.78 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(269.46 13.2)"> <tspan class="cls-16" x="2.5" y="0">=</tspan><tspan class="cls-17" x="8.02" y="0"> </tspan><tspan class="cls-18" x="10.51" y="0">0</tspan><tspan class="cls-17" x="16.16" y="0">, </tspan><tspan class="cls-18" x="21.41" y="0">1</tspan><tspan class="cls-17" x="27.06" y="0">, </tspan><tspan class="cls-19" x="32.31" y="0">…</tspan><tspan x="40" y="0">, N</tspan><tspan class="cls-11" x="52.38" y="0">-</tspan><tspan class="cls-17" x="55.72" y="0">1</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(43.69 67.44)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">CT-2</tspan></text><text class="cls-5" transform="translate(44.17 137.55)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-4</tspan></text><text class="cls-5" transform="translate(44.17 188.67)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-7</tspan></text><text class="cls-5" transform="translate(49.81 237.99)">IDT</text></g><g class="cls-20"><line class="cls-21" x1="113.25" y1="1.98" x2="113.25" y2="17.58"/><rect x="113.19" y="1.92" width="0.96" height="15.72"/><line class="cls-21" x1="113.25" y1="20.58" x2="113.25" y2="256.79"/><rect x="113.19" y="20.52" width="0.96" height="236.33"/><rect x="0.01" width="381.34" height="1.92"/><rect x="0.01" y="17.64" width="381.34" height="0.96"/><rect x="0.01" y="19.56" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="107.97" x2="381.29" y2="107.97"/><rect x="0.01" y="107.91" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="158.73" x2="381.29" y2="158.73"/><rect x="0.01" y="158.67" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="210.21" x2="381.29" y2="210.21"/><rect x="0.01" y="210.15" width="381.34" height="0.96"/><rect x="0.01" y="256.85" width="381.34" height="1.92"/></g><g class="cls-22"><path d="M139.1,395.72h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c0-.17.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M142.24,395.35a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(143.1 57.63)">(</text></g><g class="cls-23"><path d="M151.06,395.81a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81A.39.39,0,0,0,151,391a.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.86 57.63)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(159.42 57.63)">=</text></g><g class="cls-23"><path d="M177.09,395.81a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(178.74 59.91)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(186.06 57.63)">∙</text></g><polygon class="cls-26" points="198.35 34.22 200.03 34.22 200.03 34.26 207.86 34.26 207.86 34.98 199.58 34.98 199.58 34.93 198.95 34.93 196.54 71.02 196.04 71.02 192.6 64.69 191.58 65.23 191.37 64.85 193.31 63.82 196.06 68.91 198.35 34.22"/><rect x="199.58" y="54.06" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(200.72 49.23)">2</text></g><g class="cls-23"><path d="M208.83,395.8l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75H206l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(210.44 57.63)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(215.6 57.63)"><tspan class="cls-27">c</tspan><tspan x="4.91" y="0">os</tspan></text></g><path class="cls-26" d="M240,378.52l.25.41a12.83,12.83,0,0,0-2.94,5.56,32,32,0,0,0,0,16.15,12.9,12.9,0,0,0,2.95,5.63l-.25.4a12.67,12.67,0,0,1-3.52-5.71,27.72,27.72,0,0,1,0-16.78A12.76,12.76,0,0,1,240,378.52Zm65.56,0a12.76,12.76,0,0,1,3.52,5.66,27.72,27.72,0,0,1,0,16.78,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.79,12.79,0,0,0,2.95-5.63,29.71,29.71,0,0,0,1-8.05,29.13,29.13,0,0,0-1-8.1,12.83,12.83,0,0,0-2.94-5.56Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="239.18 54.06 271.34 54.06 303.5 54.06 303.5 54.78 271.34 54.78 239.18 54.78 239.18 54.06"/><g class="cls-23"><path d="M244.35,384.9c.06-.28.14-.63.26-1s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.83,1.83,0,0,0-.58.51l-.34-.28c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(248.36 49.23)">∙</text></g><g class="cls-23"><path d="M256.26,383.69a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(259.76 49.23)">∙</text></g><path class="cls-26" d="M270.53,379.09l.14.41a3.15,3.15,0,0,0-1.87,1.62,6.76,6.76,0,0,0-.6,3,7.06,7.06,0,0,0,.6,3.11,3.16,3.16,0,0,0,1.86,1.64l-.13.41a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.64A3.86,3.86,0,0,1,270.53,379.09Zm30.2,0a3.87,3.87,0,0,1,2.41,1.79,6.16,6.16,0,0,1,.85,3.32,6.23,6.23,0,0,1-.84,3.32,3.9,3.9,0,0,1-2.42,1.79l-.13-.41a3.14,3.14,0,0,0,1.85-1.64,7.06,7.06,0,0,0,.6-3.11,6.76,6.76,0,0,0-.6-3,3.12,3.12,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M274.69,386.46a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.61-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1A5.92,5.92,0,0,1,273,380a4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31,1a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(282.2 49.23)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(292.88 49.23)">1</text></g><g class="cls-23"><path d="M269.41,402.3a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm10.12-6.5-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,2.63,2.63,0,0,0,.13-.32c0-.12.09-.36.17-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-28"><text class="cls-5" transform="translate(136.02 91.95)">where </text></g><g class="cls-23"><path d="M175.77,430.13a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(177.42 94.23)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(185.34 91.95)">=</text></g><path class="cls-26" d="M202.24,421.8l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.1,3.1,0,0,0,1.86,1.64l-.13.42a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.93,3.93,0,0,1,202.24,421.8Zm24.92,0a3.91,3.91,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.08,3.08,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M203.92,426.41a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(208.02 91.95)">=</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.3 91.95)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(229.98 91.95)">?</text></g><polygon class="cls-26" points="247.1 75.66 253.22 75.66 253.22 76.38 247.51 76.38 247.51 76.41 246.43 76.41 244.03 100.31 243.53 100.31 240.2 94.2 239.21 94.72 239 94.34 240.87 93.35 243.58 98.36 245.82 75.69 247.1 75.69 247.1 75.66"/><rect x="247.1" y="88.38" width="6.12" height="0.72"/><g class="cls-23"><text class="cls-25" transform="translate(248 85.47)">2</text></g><g class="cls-23"><path d="M254.25,430.2l0,.19a.83.83,0,0,0-.21.07.46.46,0,0,0-.12.12,1.06,1.06,0,0,0-.1.25,3.94,3.94,0,0,0-.14.54l-.92,4.19h-.54L251,432.19c-.14-.4-.26-.8-.37-1.2h0c0,.13-.06.38-.13.76s-.15.74-.22,1.1l-.35,1.57a3.29,3.29,0,0,0-.08.61.3.3,0,0,0,.09.25.5.5,0,0,0,.29.09l0,.19h-1.37l0-.19a.51.51,0,0,0,.22-.08.39.39,0,0,0,.13-.15,1.85,1.85,0,0,0,.09-.23c0-.09.07-.26.13-.52l.67-3a2.21,2.21,0,0,0,.06-.32,1.71,1.71,0,0,0,0-.31.29.29,0,0,0-.1-.26.52.52,0,0,0-.29-.09l0-.19h1.26l1,3.06c.15.44.27.81.35,1.11h0c0-.16.07-.42.15-.79s.14-.7.2-1l.28-1.23a3.58,3.58,0,0,0,.08-.63.31.31,0,0,0-.09-.26.52.52,0,0,0-.29-.09l0-.19Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(255.08 91.95)">:</text></g><g class="cls-28"><text class="cls-24" transform="translate(259.76 91.95)">1</text></g><g class="cls-29"><path d="M138,477.48h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39,0h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M141.13,477.11a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0H142l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(141.99 139.39)">(</text></g><g class="cls-23"><path d="M150,477.57a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88A3.79,3.79,0,0,0,150,473a.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(150.75 139.39)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.31 139.39)">=</text></g><polygon class="cls-26" points="176.73 115.98 178.42 115.98 178.42 116.03 186.25 116.03 186.25 116.75 177.97 116.75 177.97 116.7 177.34 116.7 174.93 152.79 174.43 152.79 170.99 146.45 169.97 147 169.76 146.62 171.7 145.59 174.45 150.68 176.73 115.98"/><rect x="177.97" y="135.83" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(179.1 130.99)">2</text></g><g class="cls-23"><path d="M187.23,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(188.82 139.39)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(193.98 139.39)"><tspan class="cls-12">s</tspan><tspan class="cls-30" x="4.8" y="0">i</tspan><tspan x="7.91" y="0">n</tspan></text></g><path class="cls-26" d="M216.8,460.29l.26.4a12.88,12.88,0,0,0-2.95,5.57,31.83,31.83,0,0,0,0,16.14,12.84,12.84,0,0,0,2.95,5.64l-.26.4a12.75,12.75,0,0,1-3.51-5.71,25.79,25.79,0,0,1-1.3-8.38,25.53,25.53,0,0,1,1.3-8.4A12.78,12.78,0,0,1,216.8,460.29Zm100,0a12.71,12.71,0,0,1,3.52,5.66,25.8,25.8,0,0,1,1.3,8.4,26.07,26.07,0,0,1-1.3,8.38,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.93,12.93,0,0,0,2.94-5.64,31.83,31.83,0,0,0,0-16.14,13,13,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="216.01 135.83 248.93 135.83 281.85 135.83 314.77 135.83 314.77 136.55 281.85 136.55 248.93 136.55 216.01 136.55 216.01 135.83"/><g class="cls-23"><path d="M221.17,466.67c.06-.29.14-.64.26-1.05s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6H218l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3A1.58,1.58,0,0,1,219,464a2.52,2.52,0,0,1,.58-.06H224l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(225.3 130.99)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(230.46 130.99)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(235.02 130.99)">2</text></g><g class="cls-23"><path d="M243.88,465.45a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(247.38 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.06 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(264.18 130.99)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(271.16 130.99)">∙</text></g><path class="cls-26" d="M281.91,460.85l.15.42a3.06,3.06,0,0,0-1.87,1.62,6.73,6.73,0,0,0-.61,3,7,7,0,0,0,.61,3.11,3.05,3.05,0,0,0,1.85,1.64l-.13.42a3.9,3.9,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A4,4,0,0,1,281.91,460.85Zm30.08,0a3.93,3.93,0,0,1,2.42,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.05,3.05,0,0,0,1.85-1.64,6.89,6.89,0,0,0,.61-3.11,6.74,6.74,0,0,0-.6-3,3.08,3.08,0,0,0-1.88-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M286.09,468.22a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57H283v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31.95a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(293.48 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(304.16 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.18 146.83)">4</text></g><g class="cls-23"><path d="M273.51,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26H266l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M138.55,528.68h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M141.69,528.31a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(142.55 190.59)">(</text></g><g class="cls-23"><path d="M150.53,528.77a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.33 190.59)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.89 190.59)">=</text></g><polygon class="cls-26" points="177.29 167.4 178.97 167.4 178.97 167.45 212.13 167.45 212.13 168.17 178.53 168.17 178.53 168.12 177.89 168.12 175.48 204.21 174.98 204.21 171.54 197.87 170.52 198.42 170.31 198.03 172.25 197.01 175 202.09 177.29 167.4"/><rect x="178.53" y="187.01" width="33.6" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(192.25 182.19)">4</text></g><g class="cls-23"><text class="cls-24" transform="translate(178.57 198.03)">2</text></g><g class="cls-23"><path d="M193.9,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72L191,533c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(195.49 198.03)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(206.05 198.03)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(214.69 190.59)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.85 190.59)"><tspan class="cls-27">s</tspan><tspan class="cls-31" x="4.79" y="0">i</tspan><tspan x="7.8" y="0">n</tspan></text></g><path class="cls-26" d="M242.68,511.47l.25.4a12.87,12.87,0,0,0-2.94,5.57,31.83,31.83,0,0,0,0,16.14,12.93,12.93,0,0,0,2.94,5.64l-.25.4a12.67,12.67,0,0,1-3.52-5.71,26.08,26.08,0,0,1-1.3-8.39,25.69,25.69,0,0,1,1.3-8.39A12.71,12.71,0,0,1,242.68,511.47Zm94,0a12.71,12.71,0,0,1,3.52,5.66,25.43,25.43,0,0,1,1.3,8.39,25.81,25.81,0,0,1-1.3,8.39,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.84,12.84,0,0,0,2.95-5.64,29.66,29.66,0,0,0,1-8.05,29,29,0,0,0-1-8.09,12.87,12.87,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="241.89 187.01 288.27 187.01 334.65 187.01 334.65 187.73 288.27 187.73 241.89 187.73 241.89 187.01"/><g class="cls-23"><path d="M247,517.87c.06-.29.14-.64.26-1s.21-.71.29-.93l0,0H246l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(251.05 182.19)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(256.23 182.19)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(260.79 182.19)">2</text></g><g class="cls-23"><path d="M269.65,516.65a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32,0h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(273.15 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(283.83 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(289.95 182.19)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(296.91 182.19)">∙</text></g><path class="cls-26" d="M307.67,512l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,307.67,512Zm24.08,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M310,520.37a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(313.23 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(323.91 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(281.07 198.03)">2</text></g><g class="cls-23"><path d="M296.4,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74L292,531.49c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53H294c0-.22.09-.58.19-1.09s.2-1,.28-1.34l.38-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-32"><path d="M139.76,577.52h-2.27l0-.26a1.46,1.46,0,0,0,.32-.09A.48.48,0,0,0,138,577a1,1,0,0,0,.16-.36c.05-.16.12-.4.19-.73l1.15-5.29h-.68a1.18,1.18,0,0,0-.54.1,1.17,1.17,0,0,0-.4.36,5.41,5.41,0,0,0-.48.89h-.52l.4-1.82H143l-.42,1.9H142c0-.35,0-.61,0-.79a1.14,1.14,0,0,0-.11-.4.47.47,0,0,0-.18-.19,1.18,1.18,0,0,0-.39,0h-.8L139.34,576q0,.21-.06.33a2.28,2.28,0,0,0-.05.51.6.6,0,0,0,.05.26.3.3,0,0,0,.17.14,1.5,1.5,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M142.89,577.17a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,0,.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09A3,3,0,0,0,144,579l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><path class="cls-26" d="M149.39,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,149.39,569.27Zm4.88,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M151.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(160.09 239.43)">=</text></g><path class="cls-26" d="M177,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,177,569.27Zm31.28,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M178.67,573.88a3.62,3.62,0,0,0,.13-.81.39.39,0,0,0-.12-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.47a3.87,3.87,0,0,0-.12.79.63.63,0,0,0,.07.32.28.28,0,0,0,.23.1.6.6,0,0,0,.34-.14,3.64,3.64,0,0,0,.52-.5l.31.3a5.37,5.37,0,0,1-.86.76,1.44,1.44,0,0,1-.74.2.68.68,0,0,1-.56-.26,1,1,0,0,1-.21-.66,4.7,4.7,0,0,1,.15-1Zm1.74-3.64-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(182.77 239.43)">=</text></g><g class="cls-23"><text class="cls-33" transform="translate(191.05 239.43)">=</text></g><g class="cls-23"><path d="M205.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(213.49 239.43)">?</text></g><g class="cls-23"><text class="cls-33" transform="translate(219.85 239.43)">1</text></g><g class="cls-23"><text class="cls-33" transform="translate(225.97 239.43)">:</text></g><g class="cls-32"><text class="cls-33" transform="translate(230.77 239.43)">0</text></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/tx_cands_large.svg b/media/libaom/src/doc/img/tx_cands_large.svg
new file mode 100644
index 0000000000..fb4f5f49bf
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_cands_large.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 324.79 73.56"><defs><style>.cls-1,.cls-22{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-16,.cls-17,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0.01em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0.01em;}.cls-14{letter-spacing:0em;}.cls-15{clip-path:url(#clip-path-4);}.cls-16{font-family:Calibri, Calibri;}.cls-17{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{clip-path:url(#clip-path-10);}.cls-22{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="380.26" height="105.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="324.79" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(11.3 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">a</tspan><tspan x="16.33" y="0">x(</tspan><tspan class="cls-7" x="25.58" y="0">w</tspan><tspan class="cls-8" x="34.55" y="0">i</tspan><tspan class="cls-7" x="37.53" y="0">d</tspan><tspan x="43.99" y="0">t</tspan><tspan class="cls-9" x="48.15" y="0">h</tspan><tspan x="54.65" y="0">,</tspan><tspan class="cls-10" x="57.75" y="0"> </tspan><tspan class="cls-7" x="60.51" y="0">h</tspan><tspan class="cls-11" x="66.97" y="0">e</tspan><tspan class="cls-12" x="72.98" y="0">i</tspan><tspan class="cls-5" x="75.96" y="0">g</tspan><tspan class="cls-7" x="81.6" y="0">h</tspan><tspan x="88.06" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(158.09 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-13" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(261.07 16.92)"><tspan class="cls-14">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-15"><text class="cls-16" transform="translate(53.18 40.8)"><tspan class="cls-8">3</tspan><tspan x="6.12" y="0">2</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(148.13 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(235.75 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">l</tspan><tspan class="cls-18" x="36.23" y="0">y</tspan><tspan x="41.61" y="0">,</tspan><tspan class="cls-14" x="44.6" y="0" xml:space="preserve">  </tspan><tspan x="50.07" y="0">I</tspan><tspan class="cls-19" x="53.1" y="0">D</tspan><tspan x="60.49" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-16" transform="translate(53.18 64.68)"><tspan class="cls-8">6</tspan><tspan x="6.12" y="0">4</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(148.13 64.68)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(251.23 64.68)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-21"><line class="cls-22" x1="118.07" y1="1.98" x2="118.07" y2="23.82"/><rect x="118.01" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="221.17" y1="1.98" x2="221.17" y2="23.82"/><rect x="221.11" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="118.07" y1="25.86" x2="118.07" y2="71.58"/><rect x="118.01" y="25.8" width="0.96" height="45.84"/><line class="cls-22" x1="221.17" y1="25.86" x2="221.17" y2="71.58"/><rect x="221.11" y="25.8" width="0.96" height="45.84"/><rect width="324.79" height="1.92"/><rect y="23.88" width="324.79" height="1.92"/><rect y="71.64" width="324.79" height="1.92"/></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/tx_cands_small.svg b/media/libaom/src/doc/img/tx_cands_small.svg
new file mode 100644
index 0000000000..ddd9a87e53
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_cands_small.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 90.27"><defs><style>.cls-1,.cls-30{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-17,.cls-18,.cls-4{font-size:12px;}.cls-17,.cls-18,.cls-19,.cls-4{fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0.01em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0.01em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{clip-path:url(#clip-path-4);}.cls-17{font-family:Calibri, Calibri;}.cls-18{font-family:Calibri-Italic, Calibri;}.cls-18,.cls-19{font-style:italic;}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0em;}.cls-25{clip-path:url(#clip-path-8);}.cls-26{clip-path:url(#clip-path-12);}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{clip-path:url(#clip-path-17);}.cls-30{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="103.33" width="431.98" height="15.74"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="435.94" height="673.9"/></clipPath></defs><title>tx_cands_smallAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(8.66 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">i</tspan><tspan class="cls-7" x="13.42" y="0">n</tspan><tspan x="19.89" y="0">(w</tspan><tspan class="cls-8" x="32.57" y="0">i</tspan><tspan class="cls-9" x="35.59" y="0">d</tspan><tspan x="42.05" y="0">t</tspan><tspan class="cls-10" x="46.21" y="0">h</tspan><tspan x="52.71" y="0">,</tspan><tspan class="cls-11" x="55.8" y="0"> </tspan><tspan class="cls-7" x="58.56" y="0">h</tspan><tspan class="cls-12" x="65.03" y="0">e</tspan><tspan class="cls-13" x="71.03" y="0">i</tspan><tspan class="cls-14" x="74.02" y="0">g</tspan><tspan class="cls-9" x="79.66" y="0">h</tspan><tspan x="86.12" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(159.77 16.92)"><tspan class="cls-9">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-15" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(294.19 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-16"><text class="cls-17" transform="translate(52.7 40.8)">4</text></g><g class="cls-16"><text class="cls-18" transform="translate(122.57 40.8)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-16"><text class="cls-19" transform="translate(155.93 40.8)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-16"><text class="cls-18" transform="translate(290.59 40.8)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-25"><text class="cls-17" transform="translate(52.7 64.68)">8</text></g><g class="cls-25"><text class="cls-18" transform="translate(122.57 64.68)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(155.93 64.68)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-18" transform="translate(290.59 64.68)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-17" transform="translate(49.58 84.99)"><tspan class="cls-13">1</tspan><tspan x="6.12" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(142.49 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(175.85 84.99)"><tspan class="cls-20">IDT</tspan><tspan x="16.44" y="0">X</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(257.11 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(290.47 84.99)"><tspan class="cls-27">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-22" x="25.36" y="0"> </tspan><tspan class="cls-28" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-29"><line class="cls-30" x1="110.84" y1="1.98" x2="110.84" y2="23.82"/><rect x="110.78" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="231.73" y1="1.98" x2="231.73" y2="23.82"/><rect x="231.67" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="110.84" y1="25.86" x2="110.84" y2="88.29"/><rect x="110.78" y="25.81" width="0.96" height="62.54"/><line class="cls-30" x1="231.73" y1="25.86" x2="231.73" y2="88.29"/><rect x="231.67" y="25.81" width="0.96" height="62.54"/><rect width="380.5" height="1.92"/><rect y="23.88" width="380.5" height="1.92"/><rect y="88.35" width="380.5" height="1.92"/></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/tx_chroma.svg b/media/libaom/src/doc/img/tx_chroma.svg
new file mode 100644
index 0000000000..a0915e0031
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_chroma.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 244.23"><defs><style>.cls-1,.cls-41{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-4{font-size:12px;font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-19,.cls-4{fill:#333;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0.01em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-3);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0.01em;}.cls-18{clip-path:url(#clip-path-5);}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;font-style:italic;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{clip-path:url(#clip-path-8);}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:-0.01em;}.cls-25{clip-path:url(#clip-path-11);}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-14);}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{clip-path:url(#clip-path-17);}.cls-32{clip-path:url(#clip-path-20);}.cls-33{clip-path:url(#clip-path-23);}.cls-34{clip-path:url(#clip-path-26);}.cls-35{clip-path:url(#clip-path-29);}.cls-36{clip-path:url(#clip-path-32);}.cls-37{clip-path:url(#clip-path-35);}.cls-38{clip-path:url(#clip-path-38);}.cls-39{clip-path:url(#clip-path-41);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="53.52" y="17.15" width="110.3" height="30.24"/></clipPath><clipPath id="clip-path-3" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="431.98" height="30.24"/></clipPath><clipPath id="clip-path-5" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="48.35" width="431.98" height="15"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="431.98" height="15"/></clipPath><clipPath id="clip-path-11" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="431.98" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="431.98" height="15"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="431.98" height="15.02"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="431.98" height="15"/></clipPath><clipPath id="clip-path-23" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="431.98" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="431.98" height="15"/></clipPath><clipPath id="clip-path-29" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="431.98" height="15"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="431.98" height="15"/></clipPath><clipPath id="clip-path-35" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="207.98" width="431.98" height="15"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="431.98" height="17.88"/></clipPath><clipPath id="clip-path-41" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="431.98" height="15.72"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.04 -15.71)"><rect class="cls-1" width="435.94" height="567.07"/></clipPath></defs><title>tx_chromaAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(16.58 12.84)"><tspan class="cls-5">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-6" x="13.85" y="0">r</tspan><tspan class="cls-7" x="18.18" y="0">a</tspan><tspan class="cls-5" x="24.07" y="0"> </tspan><tspan x="26.81" y="0">P</tspan><tspan class="cls-8" x="33.2" y="0">r</tspan><tspan class="cls-7" x="37.47" y="0">e</tspan><tspan class="cls-5" x="43.48" y="0">d</tspan><tspan class="cls-9" x="49.94" y="0">i</tspan><tspan x="52.93" y="0">c</tspan><tspan class="cls-9" x="57.95" y="0">ti</tspan><tspan x="65.01" y="0">o</tspan><tspan class="cls-10" x="71.46" y="0">n</tspan><tspan x="77.95" y="0"> </tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(41.06 28.44)"><tspan class="cls-11">M</tspan><tspan x="10.44" y="0">o</tspan><tspan class="cls-10" x="16.89" y="0">d</tspan><tspan x="23.38" y="0">e</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(126.17 20.64)">V<tspan class="cls-11" x="7.1" y="0">e</tspan><tspan class="cls-13" x="13.09" y="0">r</tspan><tspan class="cls-6" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-10" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-6" x="45.3" y="0">r</tspan><tspan class="cls-7" x="49.64" y="0">a</tspan><tspan class="cls-5" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-9" x="66.78" y="0">f</tspan><tspan x="70.61" y="0">o</tspan><tspan class="cls-14" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(253.87 20.64)">Ho<tspan class="cls-10" x="14.02" y="0">r</tspan><tspan class="cls-9" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-10" x="26.09" y="0">o</tspan><tspan class="cls-15" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan class="cls-14" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-6" x="59.01" y="0">r</tspan><tspan class="cls-7" x="63.35" y="0">a</tspan><tspan class="cls-15" x="69.24" y="0">n</tspan><tspan x="75.7" y="0">s</tspan><tspan class="cls-16" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-17" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(9.62 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">C_P</tspan><tspan class="cls-21" x="25.8" y="0">R</tspan><tspan x="32.27" y="0">ED</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(160.37 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(294.91 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(9.62 60.24)">V_P<tspan class="cls-23" x="17.86" y="0">R</tspan><tspan x="24.35" y="0">ED</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(157.61 60.24)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(294.91 60.24)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(9.62 76.2)">H<tspan class="cls-26" x="7.82" y="0">_</tspan><tspan x="12.37" y="0">P</tspan><tspan class="cls-21" x="18.72" y="0">R</tspan><tspan x="25.19" y="0">ED</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(160.37 76.2)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(292.15 76.2)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(9.62 92.16)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">45</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(160.37 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(294.91 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(9.62 108.15)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">135</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(157.61 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(292.15 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(9.62 124.11)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">113</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(157.61 124.11)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(294.91 124.11)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(9.62 140.07)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">157</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(160.37 140.07)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(292.15 140.07)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(9.62 156.03)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">203</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(160.37 156.03)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(292.15 156.03)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(9.62 171.99)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">67</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(157.61 171.99)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(294.91 171.99)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(9.62 187.95)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">P</tspan><tspan class="cls-21" x="55.58" y="0">R</tspan><tspan x="62.05" y="0">ED</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(157.61 187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(292.15 187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(9.62 203.91)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">V_P</tspan><tspan class="cls-23" x="67.1" y="0">R</tspan><tspan x="73.58" y="0">ED</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(157.61 203.91)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(294.91 203.91)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(9.62 221.19)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">H</tspan><tspan class="cls-26" x="57.05" y="0">_</tspan><tspan x="61.6" y="0">P</tspan><tspan class="cls-21" x="67.95" y="0">R</tspan><tspan x="74.42" y="0">ED</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.37 221.19)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(292.15 221.19)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(9.62 238.95)">PAE<tspan class="cls-26" x="18.83" y="0">T</tspan><tspan x="24.61" y="0">H</tspan><tspan class="cls-26" x="32.42" y="0">_</tspan><tspan x="36.97" y="0">P</tspan><tspan class="cls-21" x="43.32" y="0">R</tspan><tspan x="49.79" y="0">ED</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(157.61 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(292.15 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-40"><line class="cls-41" x1="110.84" y1="1.98" x2="110.84" y2="31.14"/><rect x="110.78" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="231.73" y1="1.98" x2="231.73" y2="31.14"/><rect x="231.67" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="110.84" y1="33.18" x2="110.84" y2="242.25"/><rect x="110.78" y="33.13" width="0.96" height="209.18"/><line class="cls-41" x1="231.73" y1="33.18" x2="231.73" y2="242.25"/><rect x="231.67" y="33.13" width="0.96" height="209.18"/><rect width="380.5" height="1.92"/><rect y="31.2" width="380.5" height="1.92"/><rect y="242.31" width="380.5" height="1.92"/></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/tx_partition.svg b/media/libaom/src/doc/img/tx_partition.svg
new file mode 100644
index 0000000000..e0ce50c507
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_partition.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 172.61 310.73"><defs><style>.cls-1,.cls-38{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-5{font-size:11.04px;font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-4);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{clip-path:url(#clip-path-8);}.cls-18{clip-path:url(#clip-path-10);}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-12);}.cls-21{clip-path:url(#clip-path-14);}.cls-22{clip-path:url(#clip-path-16);}.cls-23{clip-path:url(#clip-path-18);}.cls-24{clip-path:url(#clip-path-20);}.cls-25{letter-spacing:0.01em;}.cls-26{clip-path:url(#clip-path-22);}.cls-27{clip-path:url(#clip-path-24);}.cls-28{clip-path:url(#clip-path-26);}.cls-29{clip-path:url(#clip-path-28);}.cls-30{clip-path:url(#clip-path-30);}.cls-31{clip-path:url(#clip-path-32);}.cls-32{clip-path:url(#clip-path-34);}.cls-33{clip-path:url(#clip-path-36);}.cls-34{clip-path:url(#clip-path-38);}.cls-35{clip-path:url(#clip-path-40);}.cls-36{clip-path:url(#clip-path-42);}.cls-37{clip-path:url(#clip-path-44);}.cls-38{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="1.92" width="172.49" height="323.09"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="17.16" width="86.9" height="29.52"/></clipPath><clipPath id="clip-path-4" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="89.78" y="17.16" width="83.66" height="29.52"/></clipPath><clipPath id="clip-path-8" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="62.88" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-10" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="77.4" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-12" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="91.92" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="106.45" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-16" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="120.99" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-18" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="135.51" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-20" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="150.03" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-22" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="164.55" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-24" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="179.07" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-26" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="193.59" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-28" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="208.11" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-30" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="222.63" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-32" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="237.15" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-34" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="251.67" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-36" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="266.19" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-38" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="280.73" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-40" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="295.25" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-42" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="309.77" width="172.49" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-1.44 -15.24)"><rect class="cls-1" width="176.45" height="327.05"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" y="1.44" width="172.61" height="30.6"/></g><g class="cls-4"><text class="cls-5" transform="translate(5.28 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(12.96 27.72)">cu<tspan class="cls-11" x="10.47" y="0">r</tspan><tspan x="14.28" y="0">rent depth</tspan></text></g><g class="cls-12"><text class="cls-5" transform="translate(91.46 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-12"><text class="cls-5" transform="translate(105.86 27.72)"><tspan class="cls-6">n</tspan><tspan x="5.77" y="0">e</tspan><tspan class="cls-13" x="11.26" y="0">x</tspan><tspan x="16.06" y="0">t</tspan><tspan class="cls-14" x="19.76" y="0"> </tspan><tspan class="cls-6" x="22.28" y="0">d</tspan><tspan x="28.05" y="0">epth</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(27.48 43.32)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(113.78 43.32)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(27.48 58.2)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(113.78 58.2)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(21.84 72.72)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(113.78 72.72)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-20"><text class="cls-5" transform="translate(21.84 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-20"><text class="cls-5" transform="translate(108.14 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(21.84 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(108.14 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(27.48 116.31)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(113.78 116.31)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(27.48 130.83)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(113.78 130.83)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(24.72 145.35)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(113.78 145.35)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(24.72 159.87)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(113.78 159.87)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(21.84 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(108.14 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(21.84 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(108.14 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(21.84 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(108.14 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(21.84 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(108.14 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(24.72 232.47)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(113.78 232.47)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(24.72 246.99)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X4</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(113.78 246.99)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(24.72 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">3</tspan><tspan x="33.68" y="0">2</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(110.9 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(24.72 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(110.9 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(21.84 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(108.14 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(21.84 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(108.14 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-37"><line class="cls-38" x1="87.44" y1="2.94" x2="87.44" y2="30.42"/><rect x="87.38" y="2.88" width="0.96" height="27.6"/><line class="cls-38" x1="87.44" y1="33.42" x2="87.44" y2="307.79"/><rect x="87.38" y="33.36" width="0.96" height="274.49"/><rect width="172.61" height="2.88"/><rect y="30.48" width="172.61" height="0.96"/><rect y="32.4" width="172.61" height="0.96"/><rect y="307.85" width="172.61" height="2.88"/></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/doc/img/tx_set.svg b/media/libaom/src/doc/img/tx_set.svg
new file mode 100644
index 0000000000..dee10d4d93
--- /dev/null
+++ b/media/libaom/src/doc/img/tx_set.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 347.4 549.8"><defs><style>.cls-1,.cls-60{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-19,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0.01em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{clip-path:url(#clip-path-4);}.cls-19{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{clip-path:url(#clip-path-10);}.cls-24{clip-path:url(#clip-path-12);}.cls-25{clip-path:url(#clip-path-14);}.cls-26{clip-path:url(#clip-path-16);}.cls-27{clip-path:url(#clip-path-18);}.cls-28{clip-path:url(#clip-path-20);}.cls-29{clip-path:url(#clip-path-22);}.cls-30{clip-path:url(#clip-path-24);}.cls-31{clip-path:url(#clip-path-26);}.cls-32{clip-path:url(#clip-path-28);}.cls-33{clip-path:url(#clip-path-30);}.cls-34{clip-path:url(#clip-path-32);}.cls-35{clip-path:url(#clip-path-34);}.cls-36{clip-path:url(#clip-path-36);}.cls-37{clip-path:url(#clip-path-38);}.cls-38{clip-path:url(#clip-path-40);}.cls-39{clip-path:url(#clip-path-42);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{clip-path:url(#clip-path-46);}.cls-42{clip-path:url(#clip-path-48);}.cls-43{clip-path:url(#clip-path-50);}.cls-44{clip-path:url(#clip-path-52);}.cls-45{clip-path:url(#clip-path-54);}.cls-46{clip-path:url(#clip-path-56);}.cls-47{clip-path:url(#clip-path-58);}.cls-48{clip-path:url(#clip-path-60);}.cls-49{clip-path:url(#clip-path-62);}.cls-50{clip-path:url(#clip-path-64);}.cls-51{clip-path:url(#clip-path-66);}.cls-52{clip-path:url(#clip-path-68);}.cls-53{clip-path:url(#clip-path-70);}.cls-54{clip-path:url(#clip-path-72);}.cls-55{letter-spacing:0.01em;}.cls-56{clip-path:url(#clip-path-73);}.cls-57{clip-path:url(#clip-path-74);}.cls-58{clip-path:url(#clip-path-75);}.cls-59{clip-path:url(#clip-path-76);}.cls-60{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="502.08" height="30.24"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="48.35" width="502.08" height="15"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="502.08" height="15"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="502.08" height="15"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="502.08" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="502.08" height="15.02"/></clipPath><clipPath id="clip-path-16" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="502.08" height="15"/></clipPath><clipPath id="clip-path-18" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="502.08" height="15"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="502.08" height="15"/></clipPath><clipPath id="clip-path-22" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="502.08" height="15"/></clipPath><clipPath id="clip-path-24" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="502.08" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="207.98" width="502.08" height="15"/></clipPath><clipPath id="clip-path-28" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="502.08" height="17.88"/></clipPath><clipPath id="clip-path-30" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="502.08" height="15.72"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="259.46" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-34" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="274.72" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-36" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="289.96" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="305.2" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-40" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="320.44" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-42" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="335.68" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="350.92" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-46" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="366.16" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-48" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="381.4" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-50" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="396.64" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-52" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="411.88" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-54" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="427.13" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-56" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="442.39" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-58" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="457.63" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-60" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="472.87" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-62" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="488.11" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-64" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="503.35" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-66" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="518.59" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-68" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="533.83" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-70" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="549.07" width="502.08" height="15"/></clipPath><clipPath id="clip-path-72" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="79.79" width="118.49" height="31.92"/></clipPath><clipPath id="clip-path-73" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="111.72" width="118.49" height="63.86"/></clipPath><clipPath id="clip-path-74" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="175.58" width="118.49" height="144.38"/></clipPath><clipPath id="clip-path-75" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="319.97" width="118.49" height="244.58"/></clipPath><clipPath id="clip-path-76" transform="translate(-53.03 -15.71)"><rect class="cls-1" width="506.04" height="567.07"/></clipPath></defs><title>tx_setAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" x="0.01" y="0.96" width="347.38" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(24.51 20.64)"><tspan class="cls-5">Tr</tspan><tspan class="cls-6" x="10.28" y="0">a</tspan><tspan class="cls-7" x="16.17" y="0">n</tspan><tspan x="22.63" y="0">s</tspan><tspan class="cls-5" x="27.42" y="0">f</tspan><tspan x="31.25" y="0">o</tspan><tspan class="cls-8" x="37.7" y="0">r</tspan><tspan class="cls-6" x="42.03" y="0">m</tspan><tspan class="cls-7" x="51.75" y="0"> </tspan><tspan x="54.49" y="0">set</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(127.26 20.64)">V<tspan class="cls-9" x="7.1" y="0">e</tspan><tspan class="cls-10" x="13.09" y="0">r</tspan><tspan class="cls-11" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-12" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-11" x="45.3" y="0">r</tspan><tspan class="cls-13" x="49.64" y="0">a</tspan><tspan class="cls-7" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-5" x="66.78" y="0">f</tspan><tspan class="cls-14" x="70.61" y="0">o</tspan><tspan class="cls-15" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(234.8 20.64)">Ho<tspan class="cls-12" x="14.02" y="0">r</tspan><tspan class="cls-5" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-12" x="26.09" y="0">o</tspan><tspan class="cls-16" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan class="cls-8" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-11" x="59.01" y="0">r</tspan><tspan class="cls-13" x="63.35" y="0">a</tspan><tspan class="cls-16" x="69.24" y="0">n</tspan><tspan x="75.7" y="0">s</tspan><tspan class="cls-17" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-15" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(37.35 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-13" x="19.54" y="0">O</tspan><tspan class="cls-9" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(162.06 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(276.44 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(46.95 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(163.62 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(278 60.12)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(160.62 76.08)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(276.68 76.08)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(163.62 92.04)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(276.44 92.04)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(157.62 108.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(272 108.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(157.62 123.99)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(275 123.99)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(160.62 139.95)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(272 139.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(162.06 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(276.44 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(160.62 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(275 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(160.62 187.83)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(272 187.83)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(160.62 203.79)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(253.04 203.79)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(158.94 221.19)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(276.44 221.19)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(157.62 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(272 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(157.62 255.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(253.04 255.03)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(138.66 270.29)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(275 270.29)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(138.66 285.53)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(272 285.53)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(139.98 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(253.04 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.62 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(275 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(160.62 331.25)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(272 331.25)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(160.62 346.49)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(253.04 346.49)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(160.62 361.73)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(276.68 361.73)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(157.62 376.97)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(275 376.97)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(157.62 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(272 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(157.62 407.45)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(253.04 407.45)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(157.62 422.72)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(276.68 422.72)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(138.66 437.96)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(275 437.96)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(138.66 453.2)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(272 453.2)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(138.66 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(253.04 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(138.66 483.68)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(276.68 483.68)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(162.3 498.92)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(275 498.92)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(162.3 514.16)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(272 514.16)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(162.3 529.4)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(253.04 529.4)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(162.3 544.88)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(276.68 544.88)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-54"><text class="cls-19" transform="translate(41.67 83.64)">1<tspan class="cls-55" x="6.08" y="0">D</tspan><tspan class="cls-5" x="13.54" y="0">D</tspan><tspan x="20.96" y="0">CT</tspan></text></g><g class="cls-56"><text class="cls-19" transform="translate(45.51 131.55)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4</tspan></text></g><g class="cls-57"><text class="cls-19" transform="translate(45.51 235.59)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9</tspan></text></g><g class="cls-58"><text class="cls-19" transform="translate(43.59 430.16)">ALL<tspan class="cls-5" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-59"><line class="cls-60" x1="118.08" y1="1.98" x2="118.08" y2="31.14"/><rect x="118.02" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="226.82" y1="1.98" x2="226.82" y2="31.14"/><rect x="226.76" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="118.08" y1="33.18" x2="118.08" y2="547.82"/><rect x="118.02" y="33.13" width="0.96" height="514.75"/><line class="cls-60" x1="226.82" y1="33.18" x2="226.82" y2="547.82"/><rect x="226.76" y="33.13" width="0.96" height="514.75"/><rect x="0.01" width="347.38" height="1.92"/><rect x="0.01" y="31.2" width="347.38" height="1.92"/><line class="cls-60" x1="0.07" y1="47.7" x2="347.33" y2="47.7"/><rect x="0.01" y="47.64" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="63.66" x2="347.33" y2="63.66"/><rect x="0.01" y="63.6" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="95.58" x2="347.33" y2="95.58"/><rect x="0.01" y="95.52" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="159.45" x2="347.33" y2="159.45"/><rect x="0.01" y="159.39" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="303.83" x2="347.33" y2="303.83"/><rect x="0.01" y="303.77" width="347.38" height="0.96"/><rect x="0.01" y="547.88" width="347.38" height="1.92"/></g></g></g></svg>
+\ No newline at end of file
diff --git a/media/libaom/src/docs.cmake b/media/libaom/src/docs.cmake
index 28ca5c0260..0825ca435d 100644
--- a/media/libaom/src/docs.cmake
+++ b/media/libaom/src/docs.cmake
@@ -20,14 +20,24 @@ set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
 set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
 set(AOM_DOXYGEN_SECTIONS "av1")
 
-set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h"
-                        "${AOM_ROOT}/aom/aom_decoder.h"
-                        "${AOM_ROOT}/aom/aom_encoder.h"
-                        "${AOM_ROOT}/aom/aom_frame_buffer.h"
-                        "${AOM_ROOT}/aom/aom_image.h"
-                        "${AOM_ROOT}/aom/aom_integer.h"
-                        "${AOM_ROOT}/keywords.dox" "${AOM_ROOT}/mainpage.dox"
-                        "${AOM_ROOT}/usage.dox")
+set(AOM_DOXYGEN_SOURCES
+    "${AOM_ROOT}/aom/aom.h"
+    "${AOM_ROOT}/aom/aom_codec.h"
+    "${AOM_ROOT}/aom/aom_decoder.h"
+    "${AOM_ROOT}/aom/aom_encoder.h"
+    "${AOM_ROOT}/aom/aom_external_partition.h"
+    "${AOM_ROOT}/aom/aom_frame_buffer.h"
+    "${AOM_ROOT}/aom/aom_image.h"
+    "${AOM_ROOT}/aom/aom_integer.h"
+    "${AOM_ROOT}/av1/common/av1_common_int.h"
+    "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+    "${AOM_ROOT}/av1/common/blockd.h"
+    "${AOM_ROOT}/av1/common/cdef.h"
+    "${AOM_ROOT}/av1/common/enums.h"
+    "${AOM_ROOT}/av1/common/restoration.h"
+    "${AOM_ROOT}/keywords.dox"
+    "${AOM_ROOT}/mainpage.dox"
+    "${AOM_ROOT}/usage.dox")
 
 if(CONFIG_AV1_DECODER)
   set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
@@ -45,7 +55,8 @@ if(CONFIG_AV1_DECODER)
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h"
-                          "${AOM_ROOT}/usage_dx.dox")
+                          "${AOM_ROOT}/usage_dx.dox"
+                          "${AOM_ROOT}/av1/decoder/decoder.h")
 
   if(CONFIG_ANALYZER)
     set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
@@ -62,6 +73,9 @@ if(CONFIG_AV1_DECODER)
     set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
                                          "Bitstream inspector.")
   endif()
+
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+                          "${AOM_ROOT}/doc/dev_guide/av1_decoder.dox")
 endif()
 
 if(CONFIG_AV1_ENCODER)
@@ -95,6 +109,52 @@ if(CONFIG_AV1_ENCODER)
 
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
                           "${AOM_ROOT}/usage_cx.dox")
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+                          "${AOM_ROOT}/doc/dev_guide/av1_encoder.dox")
+  set(AOM_DOXYGEN_SOURCES
+      ${AOM_DOXYGEN_SOURCES}
+      "${AOM_ROOT}/aom_scale/yv12config.h"
+      "${AOM_ROOT}/av1/encoder/bitstream.h"
+      "${AOM_ROOT}/av1/encoder/block.h"
+      "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+      "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+      "${AOM_ROOT}/av1/encoder/encode_strategy.h"
+      "${AOM_ROOT}/av1/encoder/encodeframe.c"
+      "${AOM_ROOT}/av1/encoder/encoder.c"
+      "${AOM_ROOT}/av1/encoder/encoder.h"
+      "${AOM_ROOT}/av1/encoder/encodetxb.h"
+      "${AOM_ROOT}/av1/encoder/firstpass.h"
+      "${AOM_ROOT}/av1/encoder/gop_structure.h"
+      "${AOM_ROOT}/av1/encoder/interp_search.c"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
+      "${AOM_ROOT}/av1/encoder/lookahead.h"
+      "${AOM_ROOT}/av1/encoder/palette.h"
+      "${AOM_ROOT}/av1/encoder/palette.c"
+      "${AOM_ROOT}/av1/encoder/partition_search.h"
+      "${AOM_ROOT}/av1/encoder/partition_search.c"
+      "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+      "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+      "${AOM_ROOT}/av1/encoder/pickcdef.h"
+      "${AOM_ROOT}/av1/encoder/picklpf.h"
+      "${AOM_ROOT}/av1/encoder/pickrst.h"
+      "${AOM_ROOT}/av1/encoder/ratectrl.c"
+      "${AOM_ROOT}/av1/encoder/ratectrl.h"
+      "${AOM_ROOT}/av1/encoder/rc_utils.h"
+      "${AOM_ROOT}/av1/encoder/rdopt.h"
+      "${AOM_ROOT}/av1/encoder/rdopt.c"
+      "${AOM_ROOT}/av1/encoder/speed_features.h"
+      "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
+      "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
+      "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+      "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+      "${AOM_ROOT}/av1/encoder/tpl_model.h"
+      "${AOM_ROOT}/av1/encoder/tx_search.h"
+      "${AOM_ROOT}/av1/encoder/txb_rdopt.h"
+      "${AOM_ROOT}/av1/encoder/var_based_part.h"
+      "${AOM_ROOT}/av1/encoder/nonrd_opt.h"
+      "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c")
 endif()
 
 if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
@@ -231,6 +291,16 @@ reference. The following utilities are included:
   get_filename_component(samples_dox ${samples_dox} NAME)
   set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox})
 
+  # There are issues to show Markdown file for old Doxygen version. Here, only
+  # enable Markdown support for 1.8.16 or newer.
+  if(${DOXYGEN_VERSION_VALUE} GREATER_EQUAL 1008016)
+    set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_md_support")
+    set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/README.md")
+    # Uncomment and add AlgorithmDescription.md in result page when it is done.
+    # set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+    # "${AOM_ROOT}/doc/AlgorithmDescription.md")
+  endif()
+
   # Generate libaom's doxyfile.
   file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n")
   file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
@@ -245,6 +315,24 @@ reference. The following utilities are included:
   write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
                                          "AOM_DOXYGEN_SECTIONS")
 
+  # Add AOMedia logo.
+  set(aom_logo "aomedia_logo_200.png")
+  configure_file(${AOM_ROOT}/${aom_logo} ${AOM_CONFIG_DIR}/${aom_logo} COPYONLY)
+  file(APPEND "${AOM_DOXYFILE}"
+       "PROJECT_LOGO = ${AOM_CONFIG_DIR}/${aom_logo}\n")
+
+  # Only set HAVE_DOT to YES if dot tool is found.
+  if(DOXYGEN_DOT_FOUND)
+    file(APPEND "${AOM_DOXYFILE}" "HAVE_DOT = YES\n")
+    file(APPEND "${AOM_DOXYFILE}" "DOT_GRAPH_MAX_NODES = 10000\n")
+  endif()
+
+  # Add image path.
+  file(APPEND "${AOM_DOXYFILE}" "IMAGE_PATH += ${AOM_ROOT}/doc/dev_guide\n")
+
+  # Allow banner style comments
+  file(APPEND "${AOM_DOXYFILE}" "JAVADOC_BANNER = YES")
+
   # Add the doxygen generation rule.
   add_custom_target(docs ALL
                     COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
diff --git a/media/libaom/src/examples/analyzer.cc b/media/libaom/src/examples/analyzer.cc
index 35988211e7..501f5024db 100644
--- a/media/libaom/src/examples/analyzer.cc
+++ b/media/libaom/src/examples/analyzer.cc
@@ -39,7 +39,6 @@ class AV1Decoder {
 
   AvxVideoReader *reader;
   const AvxVideoInfo *info;
-  const AvxInterface *decoder;
 
   insp_frame_data frame_data;
 
@@ -92,8 +91,8 @@ bool AV1Decoder::open(const wxString &path) {
     fprintf(stderr, "Unknown input codec.");
     return false;
   }
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) {
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0)) {
     fprintf(stderr, "Failed to initialize decoder.");
     return false;
   }
diff --git a/media/libaom/src/examples/aom_cx_set_ref.c b/media/libaom/src/examples/aom_cx_set_ref.c
index 2f4f6586f4..da36d9fe13 100644
--- a/media/libaom/src/examples/aom_cx_set_ref.c
+++ b/media/libaom/src/examples/aom_cx_set_ref.c
@@ -108,7 +108,7 @@ static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
     }
 
     printf(
-        "Encode/decode mismatch on frame %d at"
+        "Encode/decode mismatch on frame %u at"
         " Y[%d, %d] {%d/%d},"
         " U[%d, %d] {%d/%d},"
         " V[%d, %d] {%d/%d}",
@@ -186,7 +186,6 @@ int main(int argc, char **argv) {
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   int flags = 0;
   int allocated_raw_shift = 0;
   aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420;
@@ -229,7 +228,7 @@ int main(int argc, char **argv) {
   outfile_arg = argv[5];
   update_frame_num_arg = argv[6];
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
   update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
@@ -246,7 +245,7 @@ int main(int argc, char **argv) {
       die("Update frame number couldn't larger than limit\n");
   }
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -266,13 +265,17 @@ int main(int argc, char **argv) {
   // Allocate memory with the border so that it can be used as a reference.
   if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
                                  info.frame_height, 32, 8,
-                                 AOM_BORDER_IN_PIXELS)) {
+                                 AOM_DEC_BORDER_IN_PIXELS)) {
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+#if CONFIG_REALTIME_ONLY
+  res = aom_codec_enc_config_default(encoder, &cfg, 1);
+#else
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+#endif
   if (res) die_codec(&ecodec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -293,17 +296,17 @@ int main(int argc, char **argv) {
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading.", infile_arg);
 
-  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
-    die_codec(&ecodec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&ecodec, encoder, &cfg, flags))
+    die("Failed to initialize encoder");
 
   // Disable alt_ref.
   if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&ecodec, "Failed to set enable auto alt ref");
 
   if (test_decode) {
-    const AvxInterface *decoder = get_aom_decoder_by_name(codec_arg);
-    if (aom_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
-      die_codec(&dcodec, "Failed to initialize decoder.");
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(codec_arg);
+    if (aom_codec_dec_init(&dcodec, decoder, NULL, 0))
+      die("Failed to initialize decoder.");
   }
 
   // Encode frames.
@@ -335,6 +338,12 @@ int main(int argc, char **argv) {
         die_codec(&ecodec, "Failed to set encoder reference frame");
       printf(" <SET_REF>");
 
+#if CONFIG_REALTIME_ONLY
+      // Set cpu speed in encoder.
+      if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7))
+        die_codec(&ecodec, "Failed to set cpu speed");
+#endif
+
       // If set_reference in decoder is commented out, the enc/dec mismatch
       // would be seen.
       if (test_decode) {
@@ -358,7 +367,7 @@ int main(int argc, char **argv) {
 
   printf("\n");
   fclose(infile);
-  printf("Processed %d frames.\n", frame_out);
+  printf("Processed %u frames.\n", frame_out);
 
   if (test_decode) {
     if (!mismatch_seen)
diff --git a/media/libaom/src/examples/av1_dec_fuzzer.cc b/media/libaom/src/examples/av1_dec_fuzzer.cc
index 1cddc8cc11..9b9a0b9cb6 100644
--- a/media/libaom/src/examples/av1_dec_fuzzer.cc
+++ b/media/libaom/src/examples/av1_dec_fuzzer.cc
@@ -34,7 +34,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
     return 0;
   }
 
-  const aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
+  aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
   aom_codec_ctx_t codec;
   // Set thread count in the range [1, 64].
   const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
diff --git a/media/libaom/src/examples/av1_dec_fuzzer.dict b/media/libaom/src/examples/av1_dec_fuzzer.dict
new file mode 100644
index 0000000000..fb1638864c
--- /dev/null
+++ b/media/libaom/src/examples/av1_dec_fuzzer.dict
@@ -0,0 +1,5 @@
+# IVF Signature + version (bytes 0-5)
+kw1="DKIF\x00\x00"
+
+# AV1 codec fourCC (bytes 8-11)
+kw2="AV01"
diff --git a/media/libaom/src/examples/build_av1_dec_fuzzer.sh b/media/libaom/src/examples/build_av1_dec_fuzzer.sh
index 0dcb254dac..40355ea133 100644..100755
--- a/media/libaom/src/examples/build_av1_dec_fuzzer.sh
+++ b/media/libaom/src/examples/build_av1_dec_fuzzer.sh
@@ -33,11 +33,11 @@ if [[ $# -ne 2 ]]; then
   echo "  git clone https://aomedia.googlesource.com/aom"
   exit 2
 fi
-if [[ -z "$CC" ]]; then
+if [[ -z "${CC:-}" ]]; then
   echo "Set the CC environment variable to point to your C compiler."
   exit 2
 fi
-if [[ -z "$CXX" ]]; then
+if [[ -z "${CXX:-}" ]]; then
   echo "Set the CXX environment variable to point to your C++ compiler."
   exit 2
 fi
@@ -47,10 +47,10 @@ BUILD_DIR=$2
 # Run CMake with address sanitizer enabled and build the codec.
 # Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
 # in the transform functions. Also set memory limits.
-EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
 cd "${BUILD_DIR}"
 cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
-  -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \
+  -DFORCE_HIGHBITDEPTH_DECODING=0 \
   -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \
   -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
   -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
@@ -60,10 +60,10 @@ cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
 make -j$(nproc)
 
 # Build the av1 fuzzer
-$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
-    -fsanitize=fuzzer,address -Wl,--start-group \
+$CXX -std=c++11 -I${AOM_DIR} -I${BUILD_DIR} \
+    -g -fsanitize=fuzzer,address \
     ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
-    ${BUILD_DIR}/libaom.a -Wl,--end-group
+    ${BUILD_DIR}/libaom.a
 
 echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
 echo "Create a corpus directory, copy IVF files in there, and run:"
diff --git a/media/libaom/src/examples/decode_to_md5.c b/media/libaom/src/examples/decode_to_md5.c
index bc127b78df..07f788ff97 100644
--- a/media/libaom/src/examples/decode_to_md5.c
+++ b/media/libaom/src/examples/decode_to_md5.c
@@ -77,10 +77,8 @@ void usage_exit(void) {
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
   const AvxVideoInfo *info = NULL;
-  const AvxInterface *decoder = NULL;
 
   exec_name = argv[0];
 
@@ -94,13 +92,14 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
@@ -116,7 +115,7 @@ int main(int argc, char **argv) {
 
       get_image_md5(img, digest);
       print_md5(outfile, digest);
-      fprintf(outfile, "  img-%dx%d-%04d.i420\n", img->d_w, img->d_h,
+      fprintf(outfile, "  img-%ux%u-%04d.i420\n", img->d_w, img->d_h,
               ++frame_cnt);
     }
   }
diff --git a/media/libaom/src/examples/decode_with_drops.c b/media/libaom/src/examples/decode_with_drops.c
index 214401958a..9bec6ee2df 100644
--- a/media/libaom/src/examples/decode_with_drops.c
+++ b/media/libaom/src/examples/decode_with_drops.c
@@ -72,8 +72,6 @@ void usage_exit(void) {
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
-  const AvxInterface *decoder = NULL;
   AvxVideoReader *reader = NULL;
   const AvxVideoInfo *info = NULL;
   int n = 0;
@@ -99,13 +97,13 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
-
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
diff --git a/media/libaom/src/examples/inspect.c b/media/libaom/src/examples/inspect.c
index 526bdc16c1..8e7213ab43 100644
--- a/media/libaom/src/examples/inspect.c
+++ b/media/libaom/src/examples/inspect.c
@@ -267,7 +267,7 @@ struct parm_offset {
   char offset;
 };
 struct parm_offset parm_offsets[] = {
-  { "blockSize", offsetof(insp_mi_data, sb_type) },
+  { "blockSize", offsetof(insp_mi_data, bsize) },
   { "transformSize", offsetof(insp_mi_data, tx_size) },
   { "transformType", offsetof(insp_mi_data, tx_type) },
   { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) },
@@ -623,11 +623,15 @@ void inspect(void *pbi, void *data) {
   // We allocate enough space and hope we don't write out of bounds. Totally
   // unsafe but this speeds things up, especially when compiled to Javascript.
   char *buffer = aom_malloc(MAX_BUFFER);
+  if (!buffer) {
+    fprintf(stderr, "Error allocating inspect info buffer\n");
+    abort();
+  }
   char *buf = buffer;
   buf += put_str(buf, "{\n");
   if (layers & BLOCK_SIZE_LAYER) {
     buf += put_block_info(buf, block_size_map, "blockSize",
-                          offsetof(insp_mi_data, sb_type), 0);
+                          offsetof(insp_mi_data, bsize), 0);
   }
   if (layers & TRANSFORM_SIZE_LAYER) {
     buf += put_block_info(buf, tx_size_map, "transformSize",
@@ -755,12 +759,11 @@ int open_file(char *file) {
   reader = aom_video_reader_open(file);
   if (!reader) die("Failed to open %s for reading.", file);
   info = aom_video_reader_get_info(reader);
-  const AvxInterface *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  fprintf(stderr, "Using %s\n",
-          aom_codec_iface_name(decoder->codec_interface()));
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  fprintf(stderr, "Using %s\n", aom_codec_iface_name(decoder));
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
   ifd_init(&frame_data, info->frame_width, info->frame_height);
   ifd_init_cb();
   return EXIT_SUCCESS;
@@ -793,6 +796,7 @@ int read_frame() {
     }
 
     frame = adr.buf;
+    frame_size = end_frame - frame;
     if (frame == end_frame) have_frame = 0;
   } while (adr.show_existing);
 
diff --git a/media/libaom/src/examples/lightfield_bitstream_parsing.c b/media/libaom/src/examples/lightfield_bitstream_parsing.c
index ffcbcb9cb9..35b4ad093e 100644
--- a/media/libaom/src/examples/lightfield_bitstream_parsing.c
+++ b/media/libaom/src/examples/lightfield_bitstream_parsing.c
@@ -192,10 +192,8 @@ void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles,
 }
 
 int main(int argc, char **argv) {
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   int i;
@@ -220,12 +218,13 @@ int main(int argc, char **argv) {
 
   tile_list_file = argv[4];
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   // Decode anchor frames.
   AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
@@ -268,6 +267,8 @@ int main(int argc, char **argv) {
   unsigned char **frames =
       (unsigned char **)malloc(num_frames * sizeof(unsigned char *));
   size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t));
+  if (!(frames && frame_sizes)) die("Failed to allocate frame data.");
+
   // Seek to the first camera image.
   fseeko(infile, camera_frame_pos, SEEK_SET);
   for (int f = 0; f < num_frames; ++f) {
@@ -276,6 +277,7 @@ int main(int argc, char **argv) {
     const unsigned char *frame =
         aom_video_reader_get_frame(reader, &frame_size);
     frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char));
+    if (!frames[f]) die("Failed to allocate frame data.");
     memcpy(frames[f], frame, frame_size);
     frame_sizes[f] = frame_size;
   }
diff --git a/media/libaom/src/examples/lightfield_decoder.c b/media/libaom/src/examples/lightfield_decoder.c
index a292e9c75e..65b13efa1a 100644
--- a/media/libaom/src/examples/lightfield_decoder.c
+++ b/media/libaom/src/examples/lightfield_decoder.c
@@ -46,6 +46,12 @@
 #include "common/tools_common.h"
 #include "common/video_reader.h"
 
+enum {
+  YUV1D,  // 1D tile output for conformance test.
+  YUV,    // Tile output in YUV format.
+  NV12,   // Tile output in NV12 format.
+} UENUM1BYTE(OUTPUT_FORMAT);
+
 static const char *exec_name;
 
 void usage_exit(void) {
@@ -57,8 +63,8 @@ void usage_exit(void) {
 }
 
 // Output frame size
-const int output_frame_width = 512;
-const int output_frame_height = 512;
+static const int output_frame_width = 512;
+static const int output_frame_height = 512;
 
 static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst,
                               int dst_row_offset, int dst_col_offset) {
@@ -90,11 +96,11 @@ static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst,
   }
 }
 
-void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame,
-                 size_t frame_size, int tr, int tc, int ref_idx,
-                 aom_image_t *reference_images, aom_image_t *output,
-                 int *tile_idx, unsigned int *output_bit_depth,
-                 aom_image_t **img_ptr, int output_format) {
+static void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame,
+                        size_t frame_size, int tr, int tc, int ref_idx,
+                        aom_image_t *reference_images, aom_image_t *output,
+                        int *tile_idx, unsigned int *output_bit_depth,
+                        aom_image_t **img_ptr, int output_format) {
   AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1);
   AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1);
   AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr);
@@ -156,9 +162,7 @@ static void img_write_to_file(const aom_image_t *img, FILE *file,
 
 int main(int argc, char **argv) {
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   aom_img_fmt_t ref_fmt = 0;
@@ -189,13 +193,15 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
+  aom_codec_iface_t *decoder;
   if (info->codec_fourcc == LST_FOURCC)
     decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
   else
     die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
@@ -240,7 +246,7 @@ int main(int argc, char **argv) {
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
       char name[1024];
       snprintf(name, sizeof(name), "ref_%d.yuv", i);
-      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
       FILE *ref_file = fopen(name, "wb");
       aom_img_write(img, ref_file);
       fclose(ref_file);
@@ -264,12 +270,14 @@ int main(int argc, char **argv) {
   unsigned char **frames =
       (unsigned char **)malloc(num_frames * sizeof(unsigned char *));
   size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t));
+  if (!(frames && frame_sizes)) die("Failed to allocate frame data.");
   // Seek to the first camera image.
   fseeko(infile, camera_frame_pos, SEEK_SET);
   for (int f = 0; f < num_frames; ++f) {
     aom_video_reader_read_frame(reader);
     frame = aom_video_reader_get_frame(reader, &frame_size);
     frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char));
+    if (!frames[f]) die("Failed to allocate frame data.");
     memcpy(frames[f], frame, frame_size);
     frame_sizes[f] = frame_size;
   }
@@ -300,8 +308,11 @@ int main(int argc, char **argv) {
         // Write out the tile list.
         if (tile_list_cnt) {
           out = &output;
-          if (output_bit_depth != 0)
-            aom_shift_img(output_bit_depth, &out, &output_shifted);
+          if (output_bit_depth != 0) {
+            if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+              die("Error allocating image");
+            }
+          }
           img_write_to_file(out, outfile, output_format);
           tile_list_writes++;
         }
@@ -332,8 +343,11 @@ int main(int argc, char **argv) {
                 &output, &tile_idx, &output_bit_depth, &img, output_format);
     if (output_format == YUV1D) {
       out = img;
-      if (output_bit_depth != 0)
-        aom_shift_img(output_bit_depth, &out, &output_shifted);
+      if (output_bit_depth != 0) {
+        if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+          die("Error allocating image");
+        }
+      }
       aom_img_write(out, outfile);
     }
   }
@@ -342,8 +356,11 @@ int main(int argc, char **argv) {
     // Write out the last tile list.
     if (tile_list_writes < tile_list_cnt) {
       out = &output;
-      if (output_bit_depth != 0)
-        aom_shift_img(output_bit_depth, &out, &output_shifted);
+      if (output_bit_depth != 0) {
+        if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+          die("Error allocating image");
+        }
+      }
       img_write_to_file(out, outfile, output_format);
     }
   }
diff --git a/media/libaom/src/examples/lightfield_encoder.c b/media/libaom/src/examples/lightfield_encoder.c
index e80fe24f6b..6be210129e 100644
--- a/media/libaom/src/examples/lightfield_encoder.c
+++ b/media/libaom/src/examples/lightfield_encoder.c
@@ -81,6 +81,7 @@ static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
       stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to allocate frame stats buffer.");
       memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
       stats->sz += pkt_size;
     }
@@ -128,7 +129,7 @@ static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
 }
 
 static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
-                             const AvxInterface *encoder,
+                             aom_codec_iface_t *encoder,
                              const aom_codec_enc_cfg_t *cfg, int lf_width,
                              int lf_height, int lf_blocksize, int flags,
                              aom_image_t *raw_shift) {
@@ -140,8 +141,8 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
   aom_fixed_buf_t stats = { NULL, 0 };
   aom_image_t *frame_to_encode;
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
   if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
@@ -231,10 +232,10 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
 }
 
 static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
-                  const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg,
+                  aom_codec_iface_t *encoder, aom_codec_enc_cfg_t *cfg,
                   int lf_width, int lf_height, int lf_blocksize, int flags,
                   aom_image_t *raw_shift) {
-  AvxVideoInfo info = { encoder->fourcc,
+  AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder),
                         cfg->g_w,
                         cfg->g_h,
                         { cfg->g_timebase.num, cfg->g_timebase.den },
@@ -253,15 +254,15 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
   if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
     die_codec(&codec, "Failed to set frame parallel decoding");
   if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1))
     die_codec(&codec, "Failed to enable encoder ext_tile debug");
-  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 1))
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 3))
     die_codec(&codec, "Failed to set cpu-used");
 
   // Note: The superblock is a sequence parameter and has to be the same for 1
@@ -438,7 +439,6 @@ int main(int argc, char **argv) {
   aom_fixed_buf_t stats;
   int flags = 0;
 
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;  // kbit/s
   const char *const width_arg = argv[1];
@@ -452,7 +452,7 @@ int main(int argc, char **argv) {
 
   if (argc < 8) die("Invalid number of arguments");
 
-  encoder = get_aom_encoder_by_name("av1");
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
   if (!encoder) die("Unsupported codec.");
 
   w = (int)strtol(width_arg, NULL, 0);
@@ -478,10 +478,10 @@ int main(int argc, char **argv) {
                   32);
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
   // Configuration
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = w;
diff --git a/media/libaom/src/examples/lightfield_tile_list_decoder.c b/media/libaom/src/examples/lightfield_tile_list_decoder.c
index 3b928df2c3..5b15ae00e6 100644
--- a/media/libaom/src/examples/lightfield_tile_list_decoder.c
+++ b/media/libaom/src/examples/lightfield_tile_list_decoder.c
@@ -37,6 +37,12 @@
 #include "common/tools_common.h"
 #include "common/video_reader.h"
 
+enum {
+  YUV1D,  // 1D tile output for conformance test.
+  YUV,    // Tile output in YUV format.
+  NV12,   // Tile output in NV12 format.
+} UENUM1BYTE(OUTPUT_FORMAT);
+
 static const char *exec_name;
 
 void usage_exit(void) {
@@ -98,9 +104,7 @@ static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img,
 
 int main(int argc, char **argv) {
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
   int num_references;
   int num_tile_lists;
@@ -129,16 +133,17 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
                                     info->is_annexb)) {
-    die("Failed to set annex b status");
+    die_codec(&codec, "Failed to set annex b status");
   }
 
   // Decode anchor frames.
@@ -179,7 +184,7 @@ int main(int argc, char **argv) {
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
       char name[1024];
       snprintf(name, sizeof(name), "ref_%d.yuv", i);
-      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
       FILE *ref_file = fopen(name, "wb");
       aom_img_write(img, ref_file);
       fclose(ref_file);
diff --git a/media/libaom/src/examples/lossless_encoder.c b/media/libaom/src/examples/lossless_encoder.c
index e0253d2b34..1971b9c9df 100644
--- a/media/libaom/src/examples/lossless_encoder.c
+++ b/media/libaom/src/examples/lossless_encoder.c
@@ -57,14 +57,12 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
 
 int main(int argc, char **argv) {
   FILE *infile = NULL;
-  aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
 
   exec_name = argv[0];
@@ -75,10 +73,10 @@ int main(int argc, char **argv) {
 
   if (argc < 5) die("Invalid number of arguments");
 
-  encoder = get_aom_encoder_by_name("av1");
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(argv[1], NULL, 0);
   info.frame_height = (int)strtol(argv[2], NULL, 0);
   info.time_base.numerator = 1;
@@ -94,9 +92,10 @@ int main(int argc, char **argv) {
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -110,8 +109,8 @@ int main(int argc, char **argv) {
   if (!(infile = fopen(argv[3], "rb")))
     die("Failed to open %s for reading.", argv[3]);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
 
   if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1))
     die_codec(&codec, "Failed to use lossless mode");
diff --git a/media/libaom/src/examples/noise_model.c b/media/libaom/src/examples/noise_model.c
index d07443f9d5..1de13267fc 100644
--- a/media/libaom/src/examples/noise_model.c
+++ b/media/libaom/src/examples/noise_model.c
@@ -47,7 +47,7 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 #if CONFIG_AV1_DECODER
-#include "aom_dsp/grain_synthesis.h"
+#include "av1/decoder/grain_synthesis.h"
 #endif
 
 #include "aom_dsp/grain_table.h"
@@ -114,7 +114,7 @@ typedef struct {
   const char *debug_file;
 } noise_model_args_t;
 
-static void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) {
+static void parse_args(noise_model_args_t *noise_args, char **argv) {
   struct arg arg;
   static const arg_def_t *main_args[] = { &help,
                                           &input_arg,
@@ -129,7 +129,7 @@ static void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) {
                                           &use_i444,
                                           &debug_file_arg,
                                           NULL };
-  for (int argi = *argc + 1; *argv; argi++, argv++) {
+  for (; *argv; argv++) {
     if (arg_match(&arg, &help, argv)) {
       fprintf(stdout, "\nOptions:\n");
       arg_show_usage(stdout, main_args);
@@ -294,8 +294,9 @@ int main(int argc, char *argv[]) {
 
   memset(&info, 0, sizeof(info));
 
+  (void)argc;
   exec_name = argv[0];
-  parse_args(&args, &argc, argv + 1);
+  parse_args(&args, argv + 1);
 
   info.frame_width = args.width;
   info.frame_height = args.height;
@@ -316,7 +317,7 @@ int main(int argc, char *argv[]) {
   }
   infile = fopen(args.input, "rb");
   if (!infile) {
-    die("Failed to open input file:", args.input);
+    die("Failed to open input file: %s", args.input);
   }
   fprintf(stderr, "Bit depth: %d  stride:%d\n", args.bit_depth, raw.stride[0]);
 
@@ -329,6 +330,7 @@ int main(int argc, char *argv[]) {
   const int num_blocks_w = (info.frame_width + block_size - 1) / block_size;
   const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
   uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
+  if (!flat_blocks) die("Failed to allocate block data.");
   // Sets the random seed on the first entry in the output table
   int16_t random_seed = 7391;
   aom_noise_model_t noise_model;
diff --git a/media/libaom/src/examples/photon_noise_table.c b/media/libaom/src/examples/photon_noise_table.c
new file mode 100644
index 0000000000..d3a21a48ee
--- /dev/null
+++ b/media/libaom/src/examples/photon_noise_table.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// This tool creates a film grain table, for use in stills and videos,
+// representing the noise that one would get by shooting with a digital camera
+// at a given light level. Much of the noise in digital images is photon shot
+// noise, which is due to the characteristics of photon arrival and grows in
+// standard deviation as the square root of the expected number of photons
+// captured.
+// https://www.photonstophotos.net/Emil%20Martinec/noise.html#shotnoise
+//
+// The proxy used by this tool for the amount of light captured is the ISO value
+// such that the focal plane exposure at the time of capture would have been
+// mapped by a 35mm camera to the output lightness observed in the image. That
+// is, if one were to shoot on a 35mm camera (36×24mm sensor) at the nominal
+// exposure for that ISO setting, the resulting image should contain noise of
+// the same order of magnitude as generated by this tool.
+//
+// Example usage:
+//
+//     ./photon_noise_table --width=3840 --height=2160 --iso=25600 -o noise.tbl
+//     # Then, for example:
+//     aomenc --film-grain-table=noise.tbl ...
+//     # Or:
+//     avifenc -c aom -a film-grain-table=noise.tbl ...
+//
+// The (mostly) square-root relationship between light intensity and noise
+// amplitude holds in linear light, but AV1 streams are most often encoded
+// non-linearly, and the film grain is applied to those non-linear values.
+// Therefore, this tool must account for the non-linearity, and this is
+// controlled by the optional `--transfer-function` (or `-t`) parameter, which
+// specifies the tone response curve that will be used when encoding the actual
+// image. The default for this tool is sRGB, which is approximately similar to
+// an encoding gamma of 1/2.2 (i.e. a decoding gamma of 2.2) though not quite
+// identical.
+//
+// As alluded to above, the tool assumes that the image is taken from the
+// entirety of a 36×24mm (“35mm format”) sensor. If that assumption does not
+// hold, then a “35mm-equivalent ISO value” that can be passed to the tool can
+// be obtained by multiplying the true ISO value by the ratio of 36×24mm to the
+// area that was actually used. For formats that approximately share the same
+// aspect ratio, this is often expressed as the square of the “equivalence
+// ratio” which is the ratio of their diagonals. For example, APS-C (often
+// ~24×16mm) is said to have an equivalence ratio of 1.5 relative to the 35mm
+// format, and therefore ISO 1000 on APS-C and ISO 1000×1.5² = 2250 on 35mm
+// produce an image of the same lightness from the same amount of light spread
+// onto their respective surface areas (resulting in different focal plane
+// exposures), and those images will thus have similar amounts of noise if the
+// cameras are of similar technology. https://doi.org/10.1117/1.OE.57.11.110801
+//
+// The tool needs to know the resolution of the images to which its grain tables
+// will be applied so that it can know how the light on the sensor was shared
+// between its pixels. As a general rule, while a higher pixel count will lead
+// to more noise per pixel, when the final image is viewed at the same physical
+// size, that noise will tend to “average out” to the same amount over a given
+// area, since there will be more pixels in it which, in aggregate, will have
+// received essentially as much light. Put differently, the amount of noise
+// depends on the scale at which it is measured, and the decision for this tool
+// was to make that scale relative to the image instead of its constituent
+// samples. For more on this, see:
+//
+// https://www.photonstophotos.net/Emil%20Martinec/noise-p3.html#pixelsize
+// https://www.dpreview.com/articles/5365920428/the-effect-of-pixel-and-sensor-sizes-on-noise/2
+// https://www.dpreview.com/videos/7940373140/dpreview-tv-why-lower-resolution-sensors-are-not-better-in-low-light
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/grain_table.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+
+static const char *exec_name;
+
+static const struct arg_enum_list transfer_functions[] = {
+  { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+  { "srgb", AOM_CICP_TC_SRGB },       { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+  { "hlg", AOM_CICP_TC_HLG },         ARG_ENUM_LIST_END
+};
+
+static arg_def_t help_arg =
+    ARG_DEF("h", "help", 0, "Show the available options");
+static arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Width of the image in pixels (required)");
+static arg_def_t height_arg =
+    ARG_DEF("l", "height", 1, "Height of the image in pixels (required)");
+static arg_def_t iso_arg = ARG_DEF(
+    "i", "iso", 1, "ISO setting indicative of the light level (required)");
+static arg_def_t output_arg =
+    ARG_DEF("o", "output", 1,
+            "Output file to which to write the film grain table (required)");
+static arg_def_t transfer_function_arg =
+    ARG_DEF_ENUM("t", "transfer-function", 1,
+                 "Transfer function used by the encoded image (default = sRGB)",
+                 transfer_functions);
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s [--transfer-function=<tf>] --width=<width> "
+          "--height=<height> --iso=<iso> --output=<output.tbl>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+typedef struct {
+  float (*to_linear)(float);
+  float (*from_linear)(float);
+  // In linear output light. This would typically be 0.18 for SDR (this matches
+  // the definition of Standard Output Sensitivity from ISO 12232:2019), but in
+  // HDR, we certainly do not want to consider 18% of the maximum output a
+  // “mid-tone”, as it would be e.g. 1800 cd/m² for SMPTE ST 2084 (PQ).
+  float mid_tone;
+} transfer_function_t;
+
+static const transfer_function_t *find_transfer_function(
+    aom_transfer_characteristics_t tc);
+
+typedef struct {
+  int width;
+  int height;
+  int iso_setting;
+
+  const transfer_function_t *transfer_function;
+
+  const char *output_filename;
+} photon_noise_args_t;
+
+static void parse_args(int argc, char **argv,
+                       photon_noise_args_t *photon_noise_args) {
+  static const arg_def_t *args[] = { &help_arg,   &width_arg,
+                                     &height_arg, &iso_arg,
+                                     &output_arg, &transfer_function_arg,
+                                     NULL };
+  struct arg arg;
+  int width_set = 0, height_set = 0, iso_set = 0, output_set = 0, i;
+
+  photon_noise_args->transfer_function =
+      find_transfer_function(AOM_CICP_TC_SRGB);
+
+  for (i = 1; i < argc; i += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &help_arg, argv + i)) {
+      arg_show_usage(stdout, args);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &width_arg, argv + i)) {
+      photon_noise_args->width = arg_parse_int(&arg);
+      width_set = 1;
+    } else if (arg_match(&arg, &height_arg, argv + i)) {
+      photon_noise_args->height = arg_parse_int(&arg);
+      height_set = 1;
+    } else if (arg_match(&arg, &iso_arg, argv + i)) {
+      photon_noise_args->iso_setting = arg_parse_int(&arg);
+      iso_set = 1;
+    } else if (arg_match(&arg, &output_arg, argv + i)) {
+      photon_noise_args->output_filename = arg.val;
+      output_set = 1;
+    } else if (arg_match(&arg, &transfer_function_arg, argv + i)) {
+      const aom_transfer_characteristics_t tc = arg_parse_enum(&arg);
+      photon_noise_args->transfer_function = find_transfer_function(tc);
+    } else {
+      fatal("unrecognized argument \"%s\", see --help for available options",
+            argv[i]);
+    }
+  }
+
+  if (!width_set) {
+    fprintf(stderr, "Missing required parameter --width\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!height_set) {
+    fprintf(stderr, "Missing required parameter --height\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!iso_set) {
+    fprintf(stderr, "Missing required parameter --iso\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!output_set) {
+    fprintf(stderr, "Missing required parameter --output\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+static float maxf(float a, float b) { return a > b ? a : b; }
+static float minf(float a, float b) { return a < b ? a : b; }
+
+static float gamma22_to_linear(float g) { return powf(g, 2.2f); }
+static float gamma22_from_linear(float l) { return powf(l, 1 / 2.2f); }
+static float gamma28_to_linear(float g) { return powf(g, 2.8f); }
+static float gamma28_from_linear(float l) { return powf(l, 1 / 2.8f); }
+
+static float srgb_to_linear(float srgb) {
+  return srgb <= 0.04045f ? srgb / 12.92f
+                          : powf((srgb + 0.055f) / 1.055f, 2.4f);
+}
+static float srgb_from_linear(float linear) {
+  return linear <= 0.0031308f ? 12.92f * linear
+                              : 1.055f * powf(linear, 1 / 2.4f) - 0.055f;
+}
+
+static const float kPqM1 = 2610.f / 16384;
+static const float kPqM2 = 128 * 2523.f / 4096;
+static const float kPqC1 = 3424.f / 4096;
+static const float kPqC2 = 32 * 2413.f / 4096;
+static const float kPqC3 = 32 * 2392.f / 4096;
+static float pq_to_linear(float pq) {
+  const float pq_pow_inv_m2 = powf(pq, 1.f / kPqM2);
+  return powf(maxf(0, pq_pow_inv_m2 - kPqC1) / (kPqC2 - kPqC3 * pq_pow_inv_m2),
+              1.f / kPqM1);
+}
+static float pq_from_linear(float linear) {
+  const float linear_pow_m1 = powf(linear, kPqM1);
+  return powf((kPqC1 + kPqC2 * linear_pow_m1) / (1 + kPqC3 * linear_pow_m1),
+              kPqM2);
+}
+
+// Note: it is perhaps debatable whether “linear” for HLG should be scene light
+// or display light. Here, it is implemented in terms of display light assuming
+// a nominal peak display luminance of 1000 cd/m², hence the system γ of 1.2. To
+// make it scene light instead, the OOTF (powf(x, 1.2f)) and its inverse should
+// be removed from the functions below, and the .mid_tone should be replaced
+// with powf(26.f / 1000, 1 / 1.2f).
+static const float kHlgA = 0.17883277f;
+static const float kHlgB = 0.28466892f;
+static const float kHlgC = 0.55991073f;
+static float hlg_to_linear(float hlg) {
+  // EOTF = OOTF ∘ OETF⁻¹
+  const float linear =
+      hlg <= 0.5f ? hlg * hlg / 3 : (expf((hlg - kHlgC) / kHlgA) + kHlgB) / 12;
+  return powf(linear, 1.2f);
+}
+static float hlg_from_linear(float linear) {
+  // EOTF⁻¹ = OETF ∘ OOTF⁻¹
+  linear = powf(linear, 1.f / 1.2f);
+  return linear <= 1.f / 12 ? sqrtf(3 * linear)
+                            : kHlgA * logf(12 * linear - kHlgB) + kHlgC;
+}
+
+static const transfer_function_t *find_transfer_function(
+    aom_transfer_characteristics_t tc) {
+  static const transfer_function_t
+      kGamma22TransferFunction = { .to_linear = &gamma22_to_linear,
+                                   .from_linear = &gamma22_from_linear,
+                                   .mid_tone = 0.18f },
+      kGamma28TransferFunction = { .to_linear = &gamma28_to_linear,
+                                   .from_linear = &gamma28_from_linear,
+                                   .mid_tone = 0.18f },
+      kSRgbTransferFunction = { .to_linear = &srgb_to_linear,
+                                .from_linear = &srgb_from_linear,
+                                .mid_tone = 0.18f },
+      kPqTransferFunction = { .to_linear = &pq_to_linear,
+                              .from_linear = &pq_from_linear,
+                              // https://www.itu.int/pub/R-REP-BT.2408-4-2021
+                              // page 6 (PDF page 8)
+                              .mid_tone = 26.f / 10000 },
+      kHlgTransferFunction = { .to_linear = &hlg_to_linear,
+                               .from_linear = &hlg_from_linear,
+                               .mid_tone = 26.f / 1000 };
+
+  switch (tc) {
+    case AOM_CICP_TC_BT_470_M: return &kGamma22TransferFunction;
+    case AOM_CICP_TC_BT_470_B_G: return &kGamma28TransferFunction;
+    case AOM_CICP_TC_SRGB: return &kSRgbTransferFunction;
+    case AOM_CICP_TC_SMPTE_2084: return &kPqTransferFunction;
+    case AOM_CICP_TC_HLG: return &kHlgTransferFunction;
+
+    default: fatal("unimplemented transfer function %d", tc);
+  }
+}
+
+static void generate_photon_noise(const photon_noise_args_t *photon_noise_args,
+                                  aom_film_grain_t *film_grain) {
+  // Assumes a daylight-like spectrum.
+  // https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s
+  static const float kPhotonsPerLxSPerUm2 = 11260;
+
+  // Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into
+  // account.
+  static const float kEffectiveQuantumEfficiency = 0.20f;
+
+  // Also reasonable values for current cameras. The read noise is typically
+  // higher than this at low ISO settings but it matters less there.
+  static const float kPhotoResponseNonUniformity = 0.005f;
+  static const float kInputReferredReadNoise = 1.5f;
+
+  // Focal plane exposure for a mid-tone (typically a 18% reflectance card), in
+  // lx·s.
+  const float mid_tone_exposure = 10.f / photon_noise_args->iso_setting;
+
+  // In microns. Assumes a 35mm sensor (36mm × 24mm).
+  const float pixel_area_um2 = (36000 * 24000.f) / (photon_noise_args->width *
+                                                    photon_noise_args->height);
+
+  const float mid_tone_electrons_per_pixel = kEffectiveQuantumEfficiency *
+                                             kPhotonsPerLxSPerUm2 *
+                                             mid_tone_exposure * pixel_area_um2;
+  const float max_electrons_per_pixel =
+      mid_tone_electrons_per_pixel /
+      photon_noise_args->transfer_function->mid_tone;
+
+  int i;
+
+  film_grain->num_y_points = 14;
+  for (i = 0; i < film_grain->num_y_points; ++i) {
+    float x = i / (film_grain->num_y_points - 1.f);
+    const float linear = photon_noise_args->transfer_function->to_linear(x);
+    const float electrons_per_pixel = max_electrons_per_pixel * linear;
+    // Quadrature sum of the relevant sources of noise, in electrons rms. Photon
+    // shot noise is sqrt(electrons) so we can skip the square root and the
+    // squaring.
+    // https://en.wikipedia.org/wiki/Addition_in_quadrature
+    // https://doi.org/10.1117/3.725073
+    const float noise_in_electrons =
+        sqrtf(kInputReferredReadNoise * kInputReferredReadNoise +
+              electrons_per_pixel +
+              (kPhotoResponseNonUniformity * kPhotoResponseNonUniformity *
+               electrons_per_pixel * electrons_per_pixel));
+    const float linear_noise = noise_in_electrons / max_electrons_per_pixel;
+    const float linear_range_start = maxf(0.f, linear - 2 * linear_noise);
+    const float linear_range_end = minf(1.f, linear + 2 * linear_noise);
+    const float tf_slope =
+        (photon_noise_args->transfer_function->from_linear(linear_range_end) -
+         photon_noise_args->transfer_function->from_linear(
+             linear_range_start)) /
+        (linear_range_end - linear_range_start);
+    float encoded_noise = linear_noise * tf_slope;
+
+    x = roundf(255 * x);
+    encoded_noise = minf(255.f, roundf(255 * 7.88f * encoded_noise));
+
+    film_grain->scaling_points_y[i][0] = (int)x;
+    film_grain->scaling_points_y[i][1] = (int)encoded_noise;
+  }
+
+  film_grain->apply_grain = 1;
+  film_grain->update_parameters = 1;
+  film_grain->num_cb_points = 0;
+  film_grain->num_cr_points = 0;
+  film_grain->scaling_shift = 8;
+  film_grain->ar_coeff_lag = 0;
+  film_grain->ar_coeffs_cb[0] = 0;
+  film_grain->ar_coeffs_cr[0] = 0;
+  film_grain->ar_coeff_shift = 6;
+  film_grain->cb_mult = 0;
+  film_grain->cb_luma_mult = 0;
+  film_grain->cb_offset = 0;
+  film_grain->cr_mult = 0;
+  film_grain->cr_luma_mult = 0;
+  film_grain->cr_offset = 0;
+  film_grain->overlap_flag = 1;
+  film_grain->random_seed = 7391;
+  film_grain->chroma_scaling_from_luma = 0;
+}
+
+int main(int argc, char **argv) {
+  photon_noise_args_t photon_noise_args;
+  aom_film_grain_table_t film_grain_table;
+  aom_film_grain_t film_grain;
+  struct aom_internal_error_info error_info;
+  memset(&photon_noise_args, 0, sizeof(photon_noise_args));
+  memset(&film_grain_table, 0, sizeof(film_grain_table));
+  memset(&film_grain, 0, sizeof(film_grain));
+  memset(&error_info, 0, sizeof(error_info));
+
+  exec_name = argv[0];
+  parse_args(argc, argv, &photon_noise_args);
+
+  generate_photon_noise(&photon_noise_args, &film_grain);
+  aom_film_grain_table_append(&film_grain_table, 0, 9223372036854775807ull,
+                              &film_grain);
+  if (aom_film_grain_table_write(&film_grain_table,
+                                 photon_noise_args.output_filename,
+                                 &error_info) != AOM_CODEC_OK) {
+    aom_film_grain_table_free(&film_grain_table);
+    fprintf(stderr, "Failed to write film grain table");
+    if (error_info.has_detail) {
+      fprintf(stderr, ": %s", error_info.detail);
+    }
+    fprintf(stderr, "\n");
+    return EXIT_FAILURE;
+  }
+  aom_film_grain_table_free(&film_grain_table);
+
+  return EXIT_SUCCESS;
+}
diff --git a/media/libaom/src/examples/resize_util.c b/media/libaom/src/examples/resize_util.c
index 5692c2062c..45a1db2028 100644
--- a/media/libaom/src/examples/resize_util.c
+++ b/media/libaom/src/examples/resize_util.c
@@ -53,6 +53,7 @@ int main(int argc, char *argv[]) {
   uint8_t *inbuf_v, *outbuf_v;
   int f, frames;
   int width, height, target_width, target_height;
+  int failed = 0;
 
   exec_name = argv[0];
 
@@ -102,6 +103,11 @@ int main(int argc, char *argv[]) {
 
   inbuf = (uint8_t *)malloc(width * height * 3 / 2);
   outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  if (!(inbuf && outbuf)) {
+    printf("Failed to allocate buffers.\n");
+    failed = 1;
+    goto Error;
+  }
   inbuf_u = inbuf + width * height;
   inbuf_v = inbuf_u + width * height / 4;
   outbuf_u = outbuf + target_width * target_height;
@@ -116,10 +122,11 @@ int main(int argc, char *argv[]) {
     f++;
   }
   printf("%d frames processed\n", f);
+Error:
   fclose(fpin);
   fclose(fpout);
 
   free(inbuf);
   free(outbuf);
-  return 0;
+  return failed;
 }
diff --git a/media/libaom/src/examples/scalable_decoder.c b/media/libaom/src/examples/scalable_decoder.c
index c229242238..00fe820fd5 100644
--- a/media/libaom/src/examples/scalable_decoder.c
+++ b/media/libaom/src/examples/scalable_decoder.c
@@ -93,8 +93,6 @@ int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile[MAX_LAYERS];
   char filename[80];
-  aom_codec_ctx_t codec;
-  const AvxInterface *decoder = NULL;
   FILE *inputfile = NULL;
   uint8_t *buf = NULL;
   size_t bytes_in_buffer = 0;
@@ -114,11 +112,12 @@ int main(int argc, char **argv) {
   obu_ctx.avx_ctx->file = inputfile;
   obu_ctx.avx_ctx->filename = argv[1];
 
-  decoder = get_aom_decoder_by_index(0);
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  aom_codec_iface_t *decoder = get_aom_decoder_by_index(0);
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) {
     die_codec(&codec, "Failed to set output_all_layers control.");
@@ -128,7 +127,7 @@ int main(int argc, char **argv) {
   const size_t ret = fread(tmpbuf, 1, 32, inputfile);
   if (ret != 32) die_codec(&codec, "Input is not a valid obu file");
   si.is_annexb = 0;
-  if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) {
+  if (aom_codec_peek_stream_info(decoder, tmpbuf, 32, &si)) {
     die_codec(&codec, "Input is not a valid obu file");
   }
   fseek(inputfile, -32, SEEK_CUR);
@@ -143,7 +142,7 @@ int main(int argc, char **argv) {
 
   // open any enhancement layer output yuv files
   for (i = 1; i < si.number_spatial_layers; i++) {
-    snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i);
+    snprintf(filename, sizeof(filename), "out_lyr%u.yuv", i);
     if (!(outfile[i] = fopen(filename, "wb")))
       die("Failed to open output for writing.");
   }
diff --git a/media/libaom/src/examples/scalable_encoder.c b/media/libaom/src/examples/scalable_encoder.c
index 7af03e29f5..5bfd1840b2 100644
--- a/media/libaom/src/examples/scalable_encoder.c
+++ b/media/libaom/src/examples/scalable_encoder.c
@@ -120,13 +120,11 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
 int main(int argc, char **argv) {
   FILE *infile0 = NULL;
   FILE *infile1 = NULL;
-  aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
   aom_image_t raw0, raw1;
   aom_codec_err_t res;
   AvxVideoInfo info;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;
   int keyframe_interval = 0;
@@ -157,10 +155,10 @@ int main(int argc, char **argv) {
   outfile_arg = argv[6];
   max_frames = (int)strtol(argv[7], NULL, 0);
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -184,9 +182,10 @@ int main(int argc, char **argv) {
   keyframe_interval = 100;
   if (keyframe_interval < 0) die("Invalid keyframe interval value.");
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -207,8 +206,8 @@ int main(int argc, char **argv) {
   if (!(infile1 = fopen(infile1_arg, "rb")))
     die("Failed to open %s for reading.", infile0_arg);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
   if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8))
     die_codec(&codec, "Failed to set cpu to 8");
 
diff --git a/media/libaom/src/examples/set_maps.c b/media/libaom/src/examples/set_maps.c
index 9aeb96e437..bcb28a063c 100644
--- a/media/libaom/src/examples/set_maps.c
+++ b/media/libaom/src/examples/set_maps.c
@@ -69,6 +69,7 @@ static void set_active_map(const aom_codec_enc_cfg_t *cfg,
   map.cols = (cfg->g_w + 15) / 16;
 
   map.active_map = (uint8_t *)malloc(map.rows * map.cols);
+  if (!map.active_map) die("Failed to allocate active map");
   for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2;
 
   if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
@@ -121,26 +122,33 @@ int main(int argc, char **argv) {
   aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   int frame_count = 0;
-  const int limit = 15;
+  const int limit = 10;
   aom_image_t raw;
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 2;  // TODO(dkovalev) add command line argument
   const double bits_per_pixel_per_frame = 0.067;
 
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
+
   exec_name = argv[0];
   if (argc != 6) die("Invalid number of arguments");
 
   memset(&info, 0, sizeof(info));
 
-  encoder = get_aom_encoder_by_name(argv[1]);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(argv[1]);
   if (encoder == NULL) {
     die("Unsupported codec.");
   }
   assert(encoder != NULL);
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(argv[2], NULL, 0);
   info.frame_height = (int)strtol(argv[3], NULL, 0);
   info.time_base.numerator = 1;
@@ -156,9 +164,9 @@ int main(int argc, char **argv) {
     die("Failed to allocate image.");
   }
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -175,8 +183,11 @@ int main(int argc, char **argv) {
   if (!(infile = fopen(argv[4], "rb")))
     die("Failed to open %s for reading.", argv[4]);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(&raw, infile) && frame_count < limit) {
@@ -184,7 +195,7 @@ int main(int argc, char **argv) {
 
     if (frame_count == 5) {
       set_active_map(&cfg, &codec);
-    } else if (frame_count == 11) {
+    } else if (frame_count == 9) {
       unset_active_map(&cfg, &codec);
     }
 
diff --git a/media/libaom/src/examples/simple_decoder.c b/media/libaom/src/examples/simple_decoder.c
index d098d1e0b9..b6891dcbba 100644
--- a/media/libaom/src/examples/simple_decoder.c
+++ b/media/libaom/src/examples/simple_decoder.c
@@ -92,9 +92,7 @@ void usage_exit(void) {
 int main(int argc, char **argv) {
   int frame_cnt = 0;
   FILE *outfile = NULL;
-  aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
-  const AvxInterface *decoder = NULL;
   const AvxVideoInfo *info = NULL;
 
   exec_name = argv[0];
@@ -109,13 +107,14 @@ int main(int argc, char **argv) {
 
   info = aom_video_reader_get_info(reader);
 
-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
 
-  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(decoder));
 
-  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
 
   while (aom_video_reader_read_frame(reader)) {
     aom_codec_iter_t iter = NULL;
diff --git a/media/libaom/src/examples/simple_encoder.c b/media/libaom/src/examples/simple_encoder.c
index 01a37cf0c9..c026706555 100644
--- a/media/libaom/src/examples/simple_encoder.c
+++ b/media/libaom/src/examples/simple_encoder.c
@@ -100,6 +100,7 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
 
@@ -151,7 +152,6 @@ int main(int argc, char **argv) {
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
-  const AvxInterface *encoder = NULL;
   const int fps = 30;
   const int bitrate = 200;
   int keyframe_interval = 0;
@@ -163,6 +163,13 @@ int main(int argc, char **argv) {
   const char *infile_arg = NULL;
   const char *outfile_arg = NULL;
   const char *keyframe_interval_arg = NULL;
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
 
   exec_name = argv[0];
 
@@ -180,10 +187,10 @@ int main(int argc, char **argv) {
   keyframe_interval_arg = argv[6];
   max_frames = (int)strtol(argv[8], NULL, 0);
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
   info.frame_width = (int)strtol(width_arg, NULL, 0);
   info.frame_height = (int)strtol(height_arg, NULL, 0);
   info.time_base.numerator = 1;
@@ -202,9 +209,9 @@ int main(int argc, char **argv) {
   keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
   if (keyframe_interval < 0) die("Invalid keyframe interval value.");
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -220,8 +227,11 @@ int main(int argc, char **argv) {
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading.", infile_arg);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(&raw, infile)) {
diff --git a/media/libaom/src/examples/svc_encoder_rtc.c b/media/libaom/src/examples/svc_encoder_rtc.c
index 1316c6c1eb..d3f649f2ea 100644
--- a/media/libaom/src/examples/svc_encoder_rtc.c
+++ b/media/libaom/src/examples/svc_encoder_rtc.c
@@ -20,67 +20,110 @@
 #include "aom/aom_encoder.h"
 #include "aom/aomcx.h"
 #include "av1/common/enums.h"
+#include "av1/encoder/encoder.h"
+#include "common/args.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
+#include "examples/encoder_util.h"
 #include "aom_ports/aom_timer.h"
 
-#define zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+#define OPTION_BUFFER_SIZE 1024
 
-static const char *exec_name;
-
-void usage_exit(void) { exit(EXIT_FAILURE); }
-
-static int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3 };
-static int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1, 2, 3, 3, 3, 3 };
-static int mode_to_num_layers[10] = { 1, 2, 3, 3, 2, 2, 3, 9, 9, 9 };
+typedef struct {
+  const char *output_filename;
+  char options[OPTION_BUFFER_SIZE];
+  struct AvxInputContext input_ctx;
+  int speed;
+  int aq_mode;
+  int layering_mode;
+  int output_obu;
+} AppInput;
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t frames_arg =
+    ARG_DEF("f", "frames", 1, "Number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "Number of threads to use");
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Source height");
+static const arg_def_t timebase_arg =
+    ARG_DEF("t", "timebase", 1, "Timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+    "b", "target-bitrate", 1, "Encoding bitrate, in kilobits per second");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "Number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "Number of temporal SVC layers");
+static const arg_def_t layering_mode_arg =
+    ARG_DEF("lm", "layering-mode", 1, "Temporal layering scheme.");
+static const arg_def_t kf_dist_arg =
+    ARG_DEF("k", "kf-dist", 1, "Number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+    ARG_DEF("r", "scale-factors", 1, "Scale factors (lowest to highest layer)");
+static const arg_def_t min_q_arg =
+    ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+    ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "Speed configuration");
+static const arg_def_t aqmode_arg =
+    ARG_DEF("aq", "aqmode", 1, "AQ mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1,
+            "Bitrates[spatial_layer * num_temporal_layer + temporal_layer]");
+static const arg_def_t dropframe_thresh_arg =
+    ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t error_resilient_arg =
+    ARG_DEF(NULL, "error-resilient", 1, "Error resilient flag");
+static const arg_def_t output_obu_arg =
+    ARG_DEF(NULL, "output-obu", 1,
+            "Write OBUs when set to 1. Otherwise write IVF files.");
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
+};
 
-// For rate control encoding stats.
-struct RateControlMetrics {
-  // Number of input frames per layer.
-  int layer_input_frames[AOM_MAX_TS_LAYERS];
-  // Number of encoded non-key frames per layer.
-  int layer_enc_frames[AOM_MAX_TS_LAYERS];
-  // Framerate per layer layer (cumulative).
-  double layer_framerate[AOM_MAX_TS_LAYERS];
-  // Target average frame size per layer (per-frame-bandwidth per layer).
-  double layer_pfb[AOM_MAX_LAYERS];
-  // Actual average frame size per layer.
-  double layer_avg_frame_size[AOM_MAX_LAYERS];
-  // Average rate mismatch per layer (|target - actual| / target).
-  double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
-  // Actual encoding bitrate per layer (cumulative across temporal layers).
-  double layer_encoding_bitrate[AOM_MAX_LAYERS];
-  // Average of the short-time encoder actual bitrate.
-  // TODO(marpan): Should we add these short-time stats for each layer?
-  double avg_st_encoding_bitrate;
-  // Variance of the short-time encoder actual bitrate.
-  double variance_st_encoding_bitrate;
-  // Window (number of frames) for computing short-timee encoding bitrate.
-  int window_size;
-  // Number of window measurements.
-  int window_count;
-  int layer_target_bitrate[AOM_MAX_LAYERS];
+static const arg_def_t bitdepth_arg = ARG_DEF_ENUM(
+    "d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ", bitdepth_enum);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static const arg_def_t *svc_args[] = {
+  &frames_arg,          &outputfile,     &width_arg,
+  &height_arg,          &timebase_arg,   &bitrate_arg,
+  &spatial_layers_arg,  &kf_dist_arg,    &scale_factors_arg,
+  &min_q_arg,           &max_q_arg,      &temporal_layers_arg,
+  &layering_mode_arg,   &threads_arg,    &aqmode_arg,
+#if CONFIG_AV1_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  &speed_arg,           &bitrates_arg,   &dropframe_thresh_arg,
+  &error_resilient_arg, &output_obu_arg, NULL
 };
 
-static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
-  FILE *f = input_ctx->file;
-  y4m_input *y4m = &input_ctx->y4m;
-  int shortread = 0;
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
-  if (input_ctx->file_type == FILE_TYPE_Y4M) {
-    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
-  } else {
-    shortread = read_yuv_frame(input_ctx, img);
-  }
+static const char *exec_name;
 
-  return !shortread;
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <options> input_filename -o output_filename\n",
+          exec_name);
+  fprintf(stderr, "Options:\n");
+  arg_show_usage(stderr, svc_args);
+  exit(EXIT_FAILURE);
 }
 
 static int file_is_y4m(const char detect[4]) {
-  if (memcmp(detect, "YUV4", 4) == 0) {
-    return 1;
-  }
-  return 0;
+  return memcmp(detect, "YUV4", 4) == 0;
 }
 
 static int fourcc_is_ivf(const char detect[4]) {
@@ -90,10 +133,10 @@ static int fourcc_is_ivf(const char detect[4]) {
   return 0;
 }
 
-static void close_input_file(struct AvxInputContext *input) {
-  fclose(input->file);
-  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
-}
+static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX,
+                                                         1 };
+
+static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 };
 
 static void open_input_file(struct AvxInputContext *input,
                             aom_chroma_sample_position_t csp) {
@@ -143,6 +186,279 @@ static void open_input_file(struct AvxInputContext *input,
   }
 }
 
+static aom_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
+                                      int *value0, int *value1) {
+  if (type == SCALE_FACTOR) {
+    *value0 = (int)strtol(input, &input, 10);
+    if (*input++ != '/') return AOM_CODEC_INVALID_PARAM;
+    *value1 = (int)strtol(input, &input, 10);
+
+    if (*value0 < option_min_values[SCALE_FACTOR] ||
+        *value1 < option_min_values[SCALE_FACTOR] ||
+        *value0 > option_max_values[SCALE_FACTOR] ||
+        *value1 > option_max_values[SCALE_FACTOR] ||
+        *value0 > *value1)  // num shouldn't be greater than den
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    *value0 = atoi(input);
+    if (*value0 < option_min_values[type] || *value0 > option_max_values[type])
+      return AOM_CODEC_INVALID_PARAM;
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_layer_options_from_string(
+    aom_svc_params_t *svc_params, LAYER_OPTION_TYPE type, const char *input,
+    int *option0, int *option1) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+  char *input_string;
+  char *token;
+  const char *delim = ",";
+  int num_layers = svc_params->number_spatial_layers;
+  int i = 0;
+
+  if (type == BITRATE)
+    num_layers =
+        svc_params->number_spatial_layers * svc_params->number_temporal_layers;
+
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return AOM_CODEC_INVALID_PARAM;
+
+  input_string = malloc(strlen(input));
+  if (!input_string) die("Failed to allocate input string.");
+  memcpy(input_string, input, strlen(input));
+  if (input_string == NULL) return AOM_CODEC_MEM_ERROR;
+  token = strtok(input_string, delim);  // NOLINT
+  for (i = 0; i < num_layers; ++i) {
+    if (token != NULL) {
+      res = extract_option(type, token, option0 + i, option1 + i);
+      if (res != AOM_CODEC_OK) break;
+      token = strtok(NULL, delim);  // NOLINT
+    } else {
+      break;
+    }
+  }
+  if (res == AOM_CODEC_OK && i != num_layers) {
+    res = AOM_CODEC_INVALID_PARAM;
+  }
+  free(input_string);
+  return res;
+}
+
+static void parse_command_line(int argc, const char **argv_,
+                               AppInput *app_input,
+                               aom_svc_params_t *svc_params,
+                               aom_codec_enc_cfg_t *enc_cfg) {
+  struct arg arg;
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
+  char string_options[1024] = { 0 };
+
+  // Default settings
+  svc_params->number_spatial_layers = 1;
+  svc_params->number_temporal_layers = 1;
+  app_input->layering_mode = 0;
+  app_input->output_obu = 0;
+  enc_cfg->g_threads = 1;
+  enc_cfg->rc_end_usage = AOM_CBR;
+
+  // process command line options
+  argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    exit(EXIT_FAILURE);
+  }
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &outputfile, argi)) {
+      app_input->output_filename = arg.val;
+    } else if (arg_match(&arg, &width_arg, argi)) {
+      enc_cfg->g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &height_arg, argi)) {
+      enc_cfg->g_h = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &timebase_arg, argi)) {
+      enc_cfg->g_timebase = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &bitrate_arg, argi)) {
+      enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
+      svc_params->number_spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_params->number_temporal_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      app_input->speed = arg_parse_uint(&arg);
+      if (app_input->speed > 10) {
+        aom_tools_warn("Mapping speed %d to speed 10.\n", app_input->speed);
+      }
+    } else if (arg_match(&arg, &aqmode_arg, argi)) {
+      app_input->aq_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      enc_cfg->g_threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &layering_mode_arg, argi)) {
+      app_input->layering_mode = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &kf_dist_arg, argi)) {
+      enc_cfg->kf_min_dist = arg_parse_uint(&arg);
+      enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
+    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
+      parse_layer_options_from_string(svc_params, SCALE_FACTOR, arg.val,
+                                      svc_params->scaling_factor_num,
+                                      svc_params->scaling_factor_den);
+    } else if (arg_match(&arg, &min_q_arg, argi)) {
+      enc_cfg->rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_q_arg, argi)) {
+      enc_cfg->rc_max_quantizer = arg_parse_uint(&arg);
+#if CONFIG_AV1_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
+      switch (enc_cfg->g_bit_depth) {
+        case AOM_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case AOM_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 2;
+          break;
+        case AOM_BITS_12:
+          enc_cfg->g_input_bit_depth = 12;
+          enc_cfg->g_profile = 2;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+          break;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
+      enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &error_resilient_arg, argi)) {
+      enc_cfg->g_error_resilient = arg_parse_uint(&arg);
+      if (enc_cfg->g_error_resilient != 0 && enc_cfg->g_error_resilient != 1)
+        die("Invalid value for error resilient (0, 1): %d.",
+            enc_cfg->g_error_resilient);
+    } else if (arg_match(&arg, &output_obu_arg, argi)) {
+      app_input->output_obu = arg_parse_uint(&arg);
+      if (app_input->output_obu != 0 && app_input->output_obu != 1)
+        die("Invalid value for obu output flag (0, 1): %d.",
+            app_input->output_obu);
+    } else {
+      ++argj;
+    }
+  }
+
+  // Total bitrate needs to be parsed after the number of layers.
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &bitrates_arg, argi)) {
+      parse_layer_options_from_string(svc_params, BITRATE, arg.val,
+                                      svc_params->layer_target_bitrate, NULL);
+    } else {
+      ++argj;
+    }
+  }
+
+  // There will be a space in front of the string options
+  if (strlen(string_options) > 0)
+    strncpy(app_input->options, string_options, OPTION_BUFFER_SIZE);
+
+  // Check for unrecognized options
+  for (argi = argv; *argi; ++argi)
+    if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+      die("Error: Unrecognized option %s\n", *argi);
+
+  if (argv[0] == NULL) {
+    usage_exit();
+  }
+
+  app_input->input_ctx.filename = argv[0];
+  free(argv);
+
+  open_input_file(&app_input->input_ctx, 0);
+  if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
+    enc_cfg->g_w = app_input->input_ctx.width;
+    enc_cfg->g_h = app_input->input_ctx.height;
+  }
+
+  if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
+      enc_cfg->g_h % 2)
+    die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
+
+  printf(
+      "Codec %s\n"
+      "layers: %d\n"
+      "width %u, height: %u\n"
+      "num: %d, den: %d, bitrate: %u\n"
+      "gop size: %u\n",
+      aom_codec_iface_name(aom_codec_av1_cx()),
+      svc_params->number_spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
+      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
+      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
+}
+
+static unsigned int mode_to_num_temporal_layers[11] = { 1, 2, 3, 3, 2, 1,
+                                                        1, 3, 3, 3, 3 };
+static unsigned int mode_to_num_spatial_layers[11] = { 1, 1, 1, 1, 1, 2,
+                                                       3, 2, 3, 3, 3 };
+
+// For rate control encoding stats.
+struct RateControlMetrics {
+  // Number of input frames per layer.
+  int layer_input_frames[AOM_MAX_TS_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[AOM_MAX_TS_LAYERS];
+  // Framerate per layer layer (cumulative).
+  double layer_framerate[AOM_MAX_TS_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[AOM_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[AOM_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative across temporal layers).
+  double layer_encoding_bitrate[AOM_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-timee encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+  int layer_target_bitrate[AOM_MAX_LAYERS];
+};
+
+// Reference frames used in this example encoder.
+enum {
+  SVC_LAST_FRAME = 0,
+  SVC_LAST2_FRAME,
+  SVC_LAST3_FRAME,
+  SVC_GOLDEN_FRAME,
+  SVC_BWDREF_FRAME,
+  SVC_ALTREF2_FRAME,
+  SVC_ALTREF_FRAME
+};
+
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
+  int shortread = 0;
+
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+  } else {
+    shortread = read_yuv_frame(input_ctx, img);
+  }
+
+  return !shortread;
+}
+
+static void close_input_file(struct AvxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+
 // Note: these rate control metrics assume only 1 key frame in the
 // sequence (i.e., first frame only). So for temporal pattern# 7
 // (which has key frame for every frame on base layer), the metrics
@@ -200,7 +516,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
   int tot_num_frames = 0;
   double perc_fluctuation = 0.0;
   printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
-  printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers);
+  printf("Rate control layer stats for %u layer(s):\n\n", ts_number_layers);
   for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
     tot_num_frames = 0;
     for (unsigned int tl = 0; tl < ts_number_layers; ++tl) {
@@ -216,7 +532,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
           rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl];
       rc->layer_avg_rate_mismatch[i] =
           100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl];
-      printf("For layer#: %d %d \n", sl, tl);
+      printf("For layer#: %u %u \n", sl, tl);
       printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i],
              rc->layer_encoding_bitrate[i]);
       printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i],
@@ -245,38 +561,39 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
 }
 
 // Layer pattern configuration.
-static int set_layer_pattern(int layering_mode, int superframe_cnt,
-                             aom_svc_layer_id_t *layer_id,
-                             aom_svc_ref_frame_config_t *ref_frame_config,
-                             int *use_svc_control, int spatial_layer_id,
-                             int is_key_frame, int ksvc_mode) {
+static void set_layer_pattern(
+    int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id,
+    aom_svc_ref_frame_config_t *ref_frame_config,
+    aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control,
+    int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) {
   int i;
-  int shift = (layering_mode == 7) ? 2 : 0;
+  int enable_longterm_temporal_ref = 1;
+  int shift = (layering_mode == 8) ? 2 : 0;
   *use_svc_control = 1;
   layer_id->spatial_layer_id = spatial_layer_id;
-  // Set the referende map buffer idx for the 7 references:
+  int lag_index = 0;
+  int base_count = superframe_cnt >> 2;
+  ref_frame_comp_pred->use_comp_pred[0] = 0;  // GOLDEN_LAST
+  ref_frame_comp_pred->use_comp_pred[1] = 0;  // LAST2_LAST
+  ref_frame_comp_pred->use_comp_pred[2] = 0;  // ALTREF_LAST
+  // Set the reference map buffer idx for the 7 references:
   // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
   // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i;
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0;
   for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
-  // Note for this layered patterns only use LAST and GF for prediction in
-  // non-rd mode (speed >= 7).
-  int layer_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
-                    AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
-                    AOM_EFLAG_NO_REF_ARF2;
+
   if (ksvc_mode) {
-    // Same pattern as case 8.
-    layering_mode = 8;
-    if (!is_key_frame)
-      // No inter-layer prediction on inter-frames.
-      layer_flags |= AOM_EFLAG_NO_REF_GF;
+    // Same pattern as case 9, but the reference strucutre will be constrained
+    // below.
+    layering_mode = 9;
   }
   switch (layering_mode) {
     case 0:
-      // 1-layer: update LAST on every frame, reference LAST and GF.
+      // 1-layer: update LAST on every frame, reference LAST.
       layer_id->temporal_layer_id = 0;
       ref_frame_config->refresh[0] = 1;
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       break;
     case 1:
       // 2-temporal layer.
@@ -284,12 +601,13 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
       //  0    2    4
       if (superframe_cnt % 2 == 0) {
         layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
+        // Update LAST on layer 0, reference LAST.
         ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else {
         layer_id->temporal_layer_id = 1;
         // No updates on layer 1, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       }
       break;
     case 2:
@@ -300,28 +618,80 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
       if (superframe_cnt % 4 == 0) {
         // Base layer.
         layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
+        // Update LAST on layer 0, reference LAST.
         ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 2) % 4 == 0) {
         layer_id->temporal_layer_id = 1;
         // Middle layer (TL1): update LAST2, only reference LAST (TL0).
         ref_frame_config->refresh[1] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 3) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // Second top layer: no updates, only reference LAST.
         // Set buffer idx for LAST to slot 1, since that was the slot
         // updated in previous frame. So LAST is TL1 frame.
-        ref_frame_config->ref_idx[0] = 1;
-        ref_frame_config->ref_idx[1] = 0;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       }
       break;
     case 3:
+      // 3 TL, same as above, except allow for predicting
+      // off 2 more references (GOLDEN and ALTREF), with
+      // GOLDEN updated periodically, and ALTREF lagging from
+      // LAST from ~4 frames. Both GOLDEN and ALTREF
+      // can only be updated on base temporal layer.
+
+      // Keep golden fixed at slot 3.
+      ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+      // Cyclically refresh slots 5, 6, 7, for lag altref.
+      lag_index = 5;
+      if (base_count > 0) {
+        lag_index = 5 + (base_count % 3);
+        if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3);
+      }
+      // Set the altref slot to lag_index.
+      ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
+      if (superframe_cnt % 4 == 0) {
+        // Base layer.
+        layer_id->temporal_layer_id = 0;
+        // Update LAST on layer 0, reference LAST.
+        ref_frame_config->refresh[0] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        // Refresh GOLDEN every x ~10 base layer frames.
+        if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+        // Refresh lag_index slot, needed for lagging altref.
+        ref_frame_config->refresh[lag_index] = 1;
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // First top layer: no updates, only reference LAST (TL0).
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        layer_id->temporal_layer_id = 1;
+        // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+        ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        layer_id->temporal_layer_id = 2;
+        // Second top layer: no updates, only reference LAST.
+        // Set buffer idx for LAST to slot 1, since that was the slot
+        // updated in previous frame. So LAST is TL1 frame.
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      }
+      // Every frame can reference GOLDEN AND ALTREF.
+      ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+      ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+      // Allow for compound prediction using LAST and ALTREF.
+      if (speed >= 7) ref_frame_comp_pred->use_comp_pred[2] = 1;
+      break;
+    case 4:
       // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
       // only reference GF (not LAST). Other frames only reference LAST.
       //   1    3   5    7
@@ -332,37 +702,20 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
         layer_id->temporal_layer_id = 0;
         // Update LAST on layer 0, only reference LAST.
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 2) % 4 == 0) {
         layer_id->temporal_layer_id = 1;
         // Middle layer (TL1): update GF, only reference LAST (TL0).
         ref_frame_config->refresh[3] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if ((superframe_cnt - 3) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // Second top layer: no updates, only reference GF.
-        layer_flags |= AOM_EFLAG_NO_REF_LAST;
-      }
-      break;
-    case 4:
-      // 2-temporal layer with the old update flags, not with the new
-      // SVC control.
-      *use_svc_control = 0;
-      //    1    3    5
-      //  0    2    4
-      if (superframe_cnt % 2 == 0) {
-        layer_id->temporal_layer_id = 0;
-        // Update LAST on layer 0, reference LAST and GF.
-        layer_flags |= AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
-      } else {
-        layer_id->temporal_layer_id = 1;
-        // No updates on layer 1, only reference LAST (TL0).
-        layer_flags |= AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
-                       AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       }
       break;
     case 5:
@@ -371,13 +724,15 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
       if (layer_id->spatial_layer_id == 0) {
         // Reference LAST, update LAST.
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 1) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
         // and GOLDEN to slot 0. Update slot 1 (LAST).
-        ref_frame_config->ref_idx[0] = 1;
-        ref_frame_config->ref_idx[3] = 0;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 0;
         ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       }
       break;
     case 6:
@@ -392,40 +747,129 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 0;
         ref_frame_config->refresh[0] = 1;
-        layer_flags |= AOM_EFLAG_NO_REF_GF;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 1) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
         // and GOLDEN (and all other refs) to slot 0.
         // Update slot 1 (LAST).
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 0;
-        ref_frame_config->ref_idx[0] = 1;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
         ref_frame_config->refresh[1] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       } else if (layer_id->spatial_layer_id == 2) {
         // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
         // and GOLDEN (and all other refs) to slot 1.
         // Update slot 2 (LAST).
         for (i = 0; i < INTER_REFS_PER_FRAME; i++)
           ref_frame_config->ref_idx[i] = 1;
-        ref_frame_config->ref_idx[0] = 2;
+        ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
         ref_frame_config->refresh[2] = 1;
+        ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+        // For 3 spatial layer case: allow for top spatial layer to use
+        // additional temporal reference. Update every 10 frames.
+        if (enable_longterm_temporal_ref) {
+          ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+          ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+          if (base_count % 10 == 0)
+            ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+        }
       }
       break;
     case 7:
+      // 2 spatial and 3 temporal layer.
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+      if (superframe_cnt % 4 == 0) {
+        // Base temporal layer
+        layer_id->temporal_layer_id = 0;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST, update LAST
+          // Set all buffer_idx to 0
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->refresh[0] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->refresh[1] = 1;
+        }
+      } else if ((superframe_cnt - 1) % 4 == 0) {
+        // First top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 3.
+          // No update.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+        }
+      } else if ((superframe_cnt - 2) % 4 == 0) {
+        // Middle temporal enhancement layer.
+        layer_id->temporal_layer_id = 1;
+        if (layer_id->spatial_layer_id == 0) {
+          // Reference LAST.
+          // Set all buffer_idx to 0.
+          // Set GOLDEN to slot 5 and update slot 5.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift;
+          ref_frame_config->refresh[5 - shift] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+          // GOLDEN (and all other refs) to slot 5.
+          // Set LAST3 to slot 6 and update slot 6.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift;
+          ref_frame_config->refresh[6 - shift] = 1;
+        }
+      } else if ((superframe_cnt - 3) % 4 == 0) {
+        // Second top temporal enhancement layer.
+        layer_id->temporal_layer_id = 2;
+        if (layer_id->spatial_layer_id == 0) {
+          // Set LAST to slot 5 and reference LAST.
+          // Set GOLDEN to slot 3 and update slot 3.
+          // Set all other buffer_idx to 0.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+          ref_frame_config->refresh[3] = 1;
+        } else if (layer_id->spatial_layer_id == 1) {
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+          // GOLDEN to slot 3. No update.
+          for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+            ref_frame_config->ref_idx[i] = 0;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+        }
+      }
+      break;
+    case 8:
       // 3 spatial and 3 temporal layer.
-      // Same as case 8 but overalap in the buffer slot updates.
+      // Same as case 9 but overalap in the buffer slot updates.
       // (shift = 2). The slots 3 and 4 updated by first TL2 are
       // reused for update in TL1 superframe.
       // Note for this case, frame order hint must be disabled for
       // lower resolutios (operating points > 0) to be decoedable.
-    case 8:
+    case 9:
       // 3 spatial and 3 temporal layer.
       // No overlap in buffer updates between TL2 and TL1.
       // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7.
       // Set the references via the svc_ref_frame_config control.
-      layer_flags = 0;
       // Always reference LAST.
-      ref_frame_config->reference[0] = 1;
+      ref_frame_config->reference[SVC_LAST_FRAME] = 1;
       if (superframe_cnt % 4 == 0) {
         // Base temporal layer.
         layer_id->temporal_layer_id = 0;
@@ -441,7 +885,7 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Update slot 1 (LAST).
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 1;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
           ref_frame_config->refresh[1] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
@@ -449,7 +893,7 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Update slot 2 (LAST).
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 1;
-          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
           ref_frame_config->refresh[2] = 1;
         }
       } else if ((superframe_cnt - 1) % 4 == 0) {
@@ -461,7 +905,7 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Set all other buffer_idx to slot 0.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
@@ -469,8 +913,8 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Set LAST2 to slot 4 and Update slot 4.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 3;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
@@ -478,7 +922,7 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // No update.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 4;
-          ref_frame_config->ref_idx[0] = 2;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
         }
       } else if ((superframe_cnt - 2) % 4 == 0) {
         // Middle temporal enhancement layer.
@@ -489,25 +933,25 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Set GOLDEN to slot 5 and update slot 5.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift;
           ref_frame_config->refresh[5 - shift] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
           // GOLDEN (and all other refs) to slot 5.
-          // Set LAST2 to slot 6 and update slot 6.
+          // Set LAST3 to slot 6 and update slot 6.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 5 - shift;
-          ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[2] = 6 - shift;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+          ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift;
           ref_frame_config->refresh[6 - shift] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
           // GOLDEN (and all other refs) to slot 6.
-          // Set LAST2 to slot 6 and update slot 7.
+          // Set LAST3 to slot 7 and update slot 7.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 6 - shift;
-          ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->ref_idx[2] = 7 - shift;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+          ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 7 - shift;
           ref_frame_config->refresh[7 - shift] = 1;
         }
       } else if ((superframe_cnt - 3) % 4 == 0) {
@@ -519,69 +963,157 @@ static int set_layer_pattern(int layering_mode, int superframe_cnt,
           // Set all other buffer_idx to 0.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 5 - shift;
-          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
           // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 6 - shift;
-          ref_frame_config->ref_idx[3] = 3;
-          ref_frame_config->ref_idx[1] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+          ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
           // GOLDEN to slot 4. No update.
           for (i = 0; i < INTER_REFS_PER_FRAME; i++)
             ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 7 - shift;
-          ref_frame_config->ref_idx[3] = 4;
+          ref_frame_config->ref_idx[SVC_LAST_FRAME] = 7 - shift;
+          ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4;
+        }
+      }
+      if (layer_id->spatial_layer_id > 0) {
+        // Always reference GOLDEN (inter-layer prediction).
+        ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+        if (ksvc_mode) {
+          // KSVC: only keep the inter-layer reference (GOLDEN) for
+          // superframes whose base is key.
+          if (!is_key_frame) ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0;
+        }
+        if (is_key_frame && layer_id->spatial_layer_id > 1) {
+          // On superframes whose base is key: remove LAST to avoid prediction
+          // off layer two levels below.
+          ref_frame_config->reference[SVC_LAST_FRAME] = 0;
         }
       }
-      if (layer_id->spatial_layer_id > 0)
-        ref_frame_config->reference[3] = 1;  // Reference GOLDEN.
+      // For 3 spatial layer case 8 (where there is free buffer slot):
+      // allow for top spatial layer to use additional temporal reference.
+      // Additional reference is only updated on base temporal layer, every
+      // 10 TL0 frames here.
+      if (enable_longterm_temporal_ref && layer_id->spatial_layer_id == 2 &&
+          layering_mode == 8) {
+        ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+        ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+        if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+          ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+      }
       break;
     default: assert(0); die("Error: Unsupported temporal layering mode!\n");
   }
-  return layer_flags;
 }
 
-int main(int argc, char **argv) {
+#if CONFIG_AV1_DECODER
+static void test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
+                        const int frames_out, int *mismatch_seen) {
+  aom_image_t enc_img, dec_img;
+
+  if (*mismatch_seen) return;
+
+  /* Get the internal reference frame */
+  AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
+  AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+      (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t enc_hbd_img;
+      aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+                    enc_img.d_w, enc_img.d_h, 16);
+      aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+      enc_img = enc_hbd_img;
+    }
+    if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t dec_hbd_img;
+      aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+                    dec_img.d_w, dec_img.d_h, 16);
+      aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+      dec_img = dec_hbd_img;
+    }
+  }
+#endif
+
+  if (!aom_compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+#else
+    aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
+    decoder->err = 1;
+    printf(
+        "Encode/decode mismatch on frame %d at"
+        " Y[%d, %d] {%d/%d},"
+        " U[%d, %d] {%d/%d},"
+        " V[%d, %d] {%d/%d}",
+        frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1],
+        v[2], v[3]);
+    *mismatch_seen = frames_out;
+  }
+
+  aom_img_free(&enc_img);
+  aom_img_free(&dec_img);
+}
+#endif  // CONFIG_AV1_DECODER
+
+int main(int argc, const char **argv) {
+  AppInput app_input;
   AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
-  aom_codec_ctx_t codec;
+  FILE *obu_files[AOM_MAX_LAYERS] = { NULL };
+  AvxVideoWriter *total_layer_file = NULL;
+  FILE *total_layer_obu_file = NULL;
   aom_codec_enc_cfg_t cfg;
   int frame_cnt = 0;
   aom_image_t raw;
-  aom_codec_err_t res;
-  unsigned int width;
-  unsigned int height;
-  uint32_t error_resilient = 0;
-  int speed;
   int frame_avail;
   int got_data = 0;
   int flags = 0;
   unsigned i;
   int pts = 0;             // PTS starts at 0.
   int frame_duration = 1;  // 1 timebase tick per frame.
-  int layering_mode = 0;
   aom_svc_layer_id_t layer_id;
   aom_svc_params_t svc_params;
   aom_svc_ref_frame_config_t ref_frame_config;
-  const AvxInterface *encoder = NULL;
-  struct AvxInputContext input_ctx;
+  aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred;
+
+#if CONFIG_INTERNAL_STATS
+  FILE *stats_file = fopen("opsnr.stt", "a");
+  if (stats_file == NULL) {
+    die("Cannot open opsnr.stt\n");
+  }
+#endif
+#if CONFIG_AV1_DECODER
+  int mismatch_seen = 0;
+  aom_codec_ctx_t decoder;
+#endif
+
   struct RateControlMetrics rc;
   int64_t cx_time = 0;
-  const int min_args_base = 13;
-  const int min_args = min_args_base;
+  int64_t cx_time_sl[3];  // max number of spatial layers.
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
   int use_svc_control = 1;
+  int set_err_resil_frame = 0;
   zero(rc.layer_target_bitrate);
   memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
-  memset(&input_ctx, 0, sizeof(input_ctx));
+  memset(&app_input, 0, sizeof(AppInput));
   memset(&svc_params, 0, sizeof(svc_params));
 
   // Flag to test dynamic scaling of source frames for single
@@ -589,81 +1121,74 @@ int main(int argc, char **argv) {
   const int test_dynamic_scaling_single_layer = 0;
 
   /* Setup default input stream settings */
-  input_ctx.framerate.numerator = 30;
-  input_ctx.framerate.denominator = 1;
-  input_ctx.only_i420 = 1;
-  input_ctx.bit_depth = 0;
-  unsigned int ts_number_layers = 1;
-  unsigned int ss_number_layers = 1;
+  app_input.input_ctx.framerate.numerator = 30;
+  app_input.input_ctx.framerate.denominator = 1;
+  app_input.input_ctx.only_i420 = 1;
+  app_input.input_ctx.bit_depth = 0;
+  app_input.speed = 7;
   exec_name = argv[0];
-  // Check usage and arguments.
-  if (argc < min_args) {
-    die("Usage: %s <infile> <outfile> <codec_type(av1)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
-        "<error_resilient> <threads> <mode> "
-        "<Rate_0> ... <Rate_nlayers-1>\n",
-        argv[0]);
+
+  // start with default encoder configuration
+  aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
+                                                     AOM_USAGE_REALTIME);
+  if (res) {
+    die("Failed to get config: %s\n", aom_codec_err_to_string(res));
   }
 
-  encoder = get_aom_encoder_by_name(argv[3]);
+  // Real time parameters.
+  cfg.g_usage = AOM_USAGE_REALTIME;
 
-  width = (unsigned int)strtoul(argv[4], NULL, 0);
-  height = (unsigned int)strtoul(argv[5], NULL, 0);
-  if (width < 16 || width % 2 || height < 16 || height % 2) {
-    die("Invalid resolution: %d x %d", width, height);
-  }
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+  cfg.rc_resize_mode = 0;  // Set to RESIZE_DYNAMIC for dynamic resize.
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
 
-  layering_mode = (int)strtol(argv[12], NULL, 0);
-  if (layering_mode < 0 || layering_mode > 13) {
-    die("Invalid layering mode (0..12) %s", argv[12]);
-  }
+  parse_command_line(argc, argv, &app_input, &svc_params, &cfg);
 
-  if (argc != min_args + mode_to_num_layers[layering_mode]) {
-    die("Invalid number of arguments");
-  }
+  unsigned int ts_number_layers = svc_params.number_temporal_layers;
+  unsigned int ss_number_layers = svc_params.number_spatial_layers;
 
-  ts_number_layers = mode_to_num_temporal_layers[layering_mode];
-  ss_number_layers = mode_to_num_spatial_layers[layering_mode];
+  unsigned int width = cfg.g_w;
+  unsigned int height = cfg.g_h;
 
-  input_ctx.filename = argv[1];
-  open_input_file(&input_ctx, 0);
+  if (app_input.layering_mode >= 0) {
+    if (ts_number_layers !=
+            mode_to_num_temporal_layers[app_input.layering_mode] ||
+        ss_number_layers !=
+            mode_to_num_spatial_layers[app_input.layering_mode]) {
+      die("Number of layers doesn't match layering mode.");
+    }
+  }
 
   // Y4M reader has its own allocation.
-  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
     if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
-      die("Failed to allocate image", width, height);
+      die("Failed to allocate image (%dx%d)", width, height);
     }
   }
 
-  // Populate encoder configuration.
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
-  if (res) {
-    printf("Failed to get config: %s\n", aom_codec_err_to_string(res));
-    return EXIT_FAILURE;
-  }
-
-  // Update the default configuration with our settings.
-  cfg.g_w = width;
-  cfg.g_h = height;
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
 
-  // Timebase format e.g. 30fps: numerator=1, demoninator = 30.
-  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
-  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
+  memcpy(&rc.layer_target_bitrate[0], &svc_params.layer_target_bitrate[0],
+         sizeof(svc_params.layer_target_bitrate));
 
-  speed = (int)strtol(argv[8], NULL, 0);
-  if (speed < 0 || speed > 8) {
-    die("Invalid speed setting: must be positive");
+  unsigned int total_rate = 0;
+  for (i = 0; i < ss_number_layers; i++) {
+    total_rate +=
+        svc_params
+            .layer_target_bitrate[i * ts_number_layers + ts_number_layers - 1];
   }
-
-  for (i = min_args_base;
-       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
-    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
-    svc_params.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
+  if (total_rate != cfg.rc_target_bitrate) {
+    die("Incorrect total target bitrate");
   }
 
-  cfg.rc_target_bitrate =
-      svc_params.layer_target_bitrate[ss_number_layers * ts_number_layers - 1];
-
   svc_params.framerate_factor[0] = 1;
   if (ts_number_layers == 2) {
     svc_params.framerate_factor[0] = 2;
@@ -674,78 +1199,80 @@ int main(int argc, char **argv) {
     svc_params.framerate_factor[2] = 1;
   }
 
-  // Real time parameters.
-  cfg.g_usage = AOM_USAGE_REALTIME;
-
-  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
-  cfg.rc_end_usage = AOM_CBR;
-  cfg.rc_min_quantizer = 2;
-  cfg.rc_max_quantizer = 52;
-  cfg.rc_undershoot_pct = 50;
-  cfg.rc_overshoot_pct = 50;
-  cfg.rc_buf_initial_sz = 600;
-  cfg.rc_buf_optimal_sz = 600;
-  cfg.rc_buf_sz = 1000;
-
-  // Use 1 thread as default.
-  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
-
-  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
-  if (error_resilient != 0 && error_resilient != 1) {
-    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) {
+    // Override these settings with the info from Y4M file.
+    cfg.g_w = app_input.input_ctx.width;
+    cfg.g_h = app_input.input_ctx.height;
+    // g_timebase is the reciprocal of frame rate.
+    cfg.g_timebase.num = app_input.input_ctx.framerate.denominator;
+    cfg.g_timebase.den = app_input.input_ctx.framerate.numerator;
   }
-  // Enable error resilient mode.
-  cfg.g_error_resilient = error_resilient;
-  cfg.g_lag_in_frames = 0;
-  cfg.kf_mode = AOM_KF_AUTO;
-
-  // Disable automatic keyframe placement.
-  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
-
   framerate = cfg.g_timebase.den / cfg.g_timebase.num;
   set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
 
-  if (input_ctx.file_type == FILE_TYPE_Y4M) {
-    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
-      die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
-    }
-    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
-        input_ctx.framerate.denominator != cfg.g_timebase.num) {
-      die("Incorrect framerate: numerator %d denominator %d",
-          cfg.g_timebase.num, cfg.g_timebase.den);
-    }
-  }
-
+  AvxVideoInfo info;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = cfg.g_w;
+  info.frame_height = cfg.g_h;
+  info.time_base.numerator = cfg.g_timebase.num;
+  info.time_base.denominator = cfg.g_timebase.den;
   // Open an output file for each stream.
   for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
     for (unsigned tl = 0; tl < ts_number_layers; ++tl) {
       i = sl * ts_number_layers + tl;
       char file_name[PATH_MAX];
-      AvxVideoInfo info;
-      info.codec_fourcc = encoder->fourcc;
-      info.frame_width = cfg.g_w;
-      info.frame_height = cfg.g_h;
-      info.time_base.numerator = cfg.g_timebase.num;
-      info.time_base.denominator = cfg.g_timebase.den;
-
-      snprintf(file_name, sizeof(file_name), "%s_%d.av1", argv[2], i);
-      outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
-      if (!outfile[i]) die("Failed to open %s for writing", file_name);
-      assert(outfile[i] != NULL);
+      snprintf(file_name, sizeof(file_name), "%s_%u.av1",
+               app_input.output_filename, i);
+      if (app_input.output_obu) {
+        obu_files[i] = fopen(file_name, "wb");
+        if (!obu_files[i]) die("Failed to open %s for writing", file_name);
+      } else {
+        outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
+        if (!outfile[i]) die("Failed to open %s for writing", file_name);
+      }
     }
   }
+  if (app_input.output_obu) {
+    total_layer_obu_file = fopen(app_input.output_filename, "wb");
+    if (!total_layer_obu_file)
+      die("Failed to open %s for writing", app_input.output_filename);
+  } else {
+    total_layer_file =
+        aom_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+    if (!total_layer_file)
+      die("Failed to open %s for writing", app_input.output_filename);
+  }
 
   // Initialize codec.
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  aom_codec_ctx_t codec;
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+#if CONFIG_AV1_DECODER
+  if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) {
+    die("Failed to initialize decoder");
+  }
+#endif
 
-  aom_codec_control(&codec, AOME_SET_CPUUSED, speed);
-  aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);
+  aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0);
   aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
-  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 2);
+  aom_codec_control(&codec, AV1E_SET_LOOPFILTER_CONTROL, 2);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_WARPED_MOTION, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
   aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+  aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1);
+  aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS,
+                    cfg.g_threads ? get_msb(cfg.g_threads) : 0);
+  if (cfg.g_threads > 1) aom_codec_control(&codec, AV1E_SET_ROW_MT, 1);
 
   svc_params.number_spatial_layers = ss_number_layers;
   svc_params.number_temporal_layers = ts_number_layers;
@@ -766,8 +1293,8 @@ int main(int argc, char **argv) {
     svc_params.scaling_factor_num[1] = 1;
     svc_params.scaling_factor_den[1] = 2;
   }
-
   aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+  // TODO(aomedia:3032): Configure KSVC in fixed mode.
 
   // This controls the maximum target size of the key frame.
   // For generating smaller key frames, use a smaller max_intra_size_pct
@@ -778,26 +1305,58 @@ int main(int argc, char **argv) {
                       max_intra_size_pct);
   }
 
+  for (unsigned int slx = 0; slx < ss_number_layers; slx++) cx_time_sl[slx] = 0;
   frame_avail = 1;
   while (frame_avail || got_data) {
     struct aom_usec_timer timer;
-    frame_avail = read_frame(&input_ctx, &raw);
-    int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0;
+    frame_avail = read_frame(&(app_input.input_ctx), &raw);
     // Loop over spatial layers.
     for (unsigned int slx = 0; slx < ss_number_layers; slx++) {
       aom_codec_iter_t iter = NULL;
       const aom_codec_cx_pkt_t *pkt;
       int layer = 0;
+      // Flag for superframe whose base is key.
+      int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0;
+      // For flexible mode:
+      if (app_input.layering_mode >= 0) {
+        // Set the reference/update flags, layer_id, and reference_map
+        // buffer index.
+        set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
+                          &ref_frame_config, &ref_frame_comp_pred,
+                          &use_svc_control, slx, is_key_frame,
+                          (app_input.layering_mode == 10), app_input.speed);
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+        if (use_svc_control) {
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                            &ref_frame_config);
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                            &ref_frame_comp_pred);
+        }
+      } else {
+        // Only up to 3 temporal layers supported in fixed mode.
+        // Only need to set spatial and temporal layer_id: reference
+        // prediction, refresh, and buffer_idx are set internally.
+        layer_id.spatial_layer_id = slx;
+        layer_id.temporal_layer_id = 0;
+        if (ts_number_layers == 2) {
+          layer_id.temporal_layer_id = (frame_cnt % 2) != 0;
+        } else if (ts_number_layers == 3) {
+          if (frame_cnt % 2 != 0)
+            layer_id.temporal_layer_id = 2;
+          else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0))
+            layer_id.temporal_layer_id = 1;
+        }
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+      }
 
-      // Set the reference/update flags, layer_id, and reference_map
-      // buffer index.
-      flags = set_layer_pattern(layering_mode, frame_cnt, &layer_id,
-                                &ref_frame_config, &use_svc_control, slx,
-                                is_key_frame, (layering_mode == 9));
-      aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
-      if (use_svc_control)
-        aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
-                          &ref_frame_config);
+      if (set_err_resil_frame) {
+        // Set error_resilient per frame: off/0 for base layer and
+        // on/1 for enhancement layer frames.
+        int err_resil_mode =
+            (layer_id.spatial_layer_id > 0 || layer_id.temporal_layer_id > 0);
+        aom_codec_control(&codec, AV1E_SET_ERROR_RESILIENT_MODE,
+                          err_resil_mode);
+      }
 
       layer = slx * ts_number_layers + layer_id.temporal_layer_id;
       if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
@@ -820,6 +1379,7 @@ int main(int argc, char **argv) {
         die_codec(&codec, "Failed to encode frame");
       aom_usec_timer_mark(&timer);
       cx_time += aom_usec_timer_elapsed(&timer);
+      cx_time_sl[slx] += aom_usec_timer_elapsed(&timer);
 
       got_data = 0;
       while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
@@ -831,22 +1391,36 @@ int main(int argc, char **argv) {
               for (unsigned tl = layer_id.temporal_layer_id;
                    tl < ts_number_layers; ++tl) {
                 unsigned int j = sl * ts_number_layers + tl;
-                aom_video_writer_write_frame(outfile[j], pkt->data.frame.buf,
-                                             pkt->data.frame.sz, pts);
+                if (app_input.output_obu) {
+                  fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                         obu_files[j]);
+                } else {
+                  aom_video_writer_write_frame(outfile[j], pkt->data.frame.buf,
+                                               pkt->data.frame.sz, pts);
+                }
                 if (sl == (unsigned int)layer_id.spatial_layer_id)
                   rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz;
-                // Keep count of rate control stats per layer (for non-key).
-                if (tl == (unsigned int)layer_id.temporal_layer_id &&
-                    sl == (unsigned int)layer_id.spatial_layer_id &&
-                    !(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
-                  rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
-                  rc.layer_avg_rate_mismatch[j] +=
-                      fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
-                      rc.layer_pfb[j];
-                  if (slx == 0) ++rc.layer_enc_frames[tl];
-                }
               }
             }
+            // Write everything into the top layer.
+            if (app_input.output_obu) {
+              fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                     total_layer_obu_file);
+            } else {
+              aom_video_writer_write_frame(total_layer_file,
+                                           pkt->data.frame.buf,
+                                           pkt->data.frame.sz, pts);
+            }
+            // Keep count of rate control stats per layer (for non-key).
+            if (!(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+              unsigned int j = layer_id.spatial_layer_id * ts_number_layers +
+                               layer_id.temporal_layer_id;
+              rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[j] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
+                  rc.layer_pfb[j];
+              if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
+            }
 
             // Update for short-time encoding bitrate states, for moving window
             // of size rc->window, shifted by rc->window / 2.
@@ -878,15 +1452,32 @@ int main(int argc, char **argv) {
                 sum_bitrate2 = 0.0;
               }
             }
+
+#if CONFIG_AV1_DECODER
+            if (aom_codec_decode(&decoder, pkt->data.frame.buf,
+                                 (unsigned int)pkt->data.frame.sz, NULL))
+              die_codec(&decoder, "Failed to decode frame.");
+#endif
+
             break;
           default: break;
         }
       }
+#if CONFIG_AV1_DECODER
+      // Don't look for mismatch on top spatial and top temporal layers as they
+      // are non reference frames.
+      if ((ss_number_layers > 1 || ts_number_layers > 1) &&
+          !(layer_id.temporal_layer_id > 0 &&
+            layer_id.temporal_layer_id == (int)ts_number_layers - 1)) {
+        test_decode(&codec, &decoder, frame_cnt, &mismatch_seen);
+      }
+#endif
     }  // loop over spatial layers
     ++frame_cnt;
     pts += frame_duration;
   }
-  close_input_file(&input_ctx);
+
+  close_input_file(&(app_input.input_ctx));
   printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
                                 ts_number_layers);
   printf("\n");
@@ -894,13 +1485,31 @@ int main(int argc, char **argv) {
          frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
          1000000 * (double)frame_cnt / (double)cx_time);
 
+  if (ss_number_layers > 1) {
+    printf("Per spatial layer: \n");
+    for (unsigned int slx = 0; slx < ss_number_layers; slx++)
+      printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n",
+             frame_cnt, (float)cx_time_sl[slx] / (double)(frame_cnt * 1000),
+             1000000 * (double)frame_cnt / (double)cx_time_sl[slx]);
+  }
+
   if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
 
+#if CONFIG_INTERNAL_STATS
+  if (mismatch_seen) {
+    fprintf(stats_file, "First mismatch occurred in frame %d\n", mismatch_seen);
+  } else {
+    fprintf(stats_file, "No mismatch detected in recon buffers\n");
+  }
+  fclose(stats_file);
+#endif
+
   // Try to rewrite the output file headers with the actual frame count.
   for (i = 0; i < ss_number_layers * ts_number_layers; ++i)
     aom_video_writer_close(outfile[i]);
+  aom_video_writer_close(total_layer_file);
 
-  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
     aom_img_free(&raw);
   }
   return EXIT_SUCCESS;
diff --git a/media/libaom/src/examples/twopass_encoder.c b/media/libaom/src/examples/twopass_encoder.c
index a03bc6cc2a..388f68bd4d 100644
--- a/media/libaom/src/examples/twopass_encoder.c
+++ b/media/libaom/src/examples/twopass_encoder.c
@@ -52,6 +52,7 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
 #include "common/tools_common.h"
 #include "common/video_writer.h"
 
@@ -82,6 +83,7 @@ static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
       stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to allocate frame stats buffer.");
       memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
       stats->sz += pkt_size;
     }
@@ -117,14 +119,14 @@ static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
 }
 
 static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
-                             const AvxInterface *encoder,
+                             aom_codec_iface_t *encoder,
                              const aom_codec_enc_cfg_t *cfg, int limit) {
   aom_codec_ctx_t codec;
   int frame_count = 0;
   aom_fixed_buf_t stats = { NULL, 0 };
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, 0))
+    die("Failed to initialize encoder");
 
   // Calculate frame statistics.
   while (aom_img_read(raw, infile) && frame_count < limit) {
@@ -143,9 +145,9 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
 }
 
 static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
-                  const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
+                  aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg,
                   int limit) {
-  AvxVideoInfo info = { encoder->fourcc,
+  AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder),
                         cfg->g_w,
                         cfg->g_h,
                         { cfg->g_timebase.num, cfg->g_timebase.den },
@@ -157,8 +159,11 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_enc_init(&codec, encoder, cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+    die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
   while (aom_img_read(raw, infile) && frame_count < limit) {
@@ -188,7 +193,6 @@ int main(int argc, char **argv) {
   aom_codec_err_t res;
   aom_fixed_buf_t stats;
 
-  const AvxInterface *encoder = NULL;
   const int fps = 30;       // TODO(dkovalev) add command line argument
   const int bitrate = 200;  // kbit/s TODO(dkovalev) add command line argument
   const char *const codec_arg = argv[1];
@@ -205,7 +209,7 @@ int main(int argc, char **argv) {
 
   if (limit == 0) limit = 100;
 
-  encoder = get_aom_encoder_by_name(codec_arg);
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
   w = (int)strtol(width_arg, NULL, 0);
@@ -215,12 +219,12 @@ int main(int argc, char **argv) {
     die("Invalid frame size: %dx%d", w, h);
 
   if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
-    die("Failed to allocate image", w, h);
+    die("Failed to allocate image (%dx%d)", w, h);
 
-  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+  printf("Using %s\n", aom_codec_iface_name(encoder));
 
   // Configuration
-  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = w;
diff --git a/media/libaom/src/libs.doxy_template b/media/libaom/src/libs.doxy_template
index c522e21d3a..6e042ac931 100644
--- a/media/libaom/src/libs.doxy_template
+++ b/media/libaom/src/libs.doxy_template
@@ -1,4 +1,4 @@
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+## Copyright (c) 2020, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
 ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -8,92 +8,140 @@
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 
-# Doxyfile 1.5.4
+# Doxyfile 1.8.16
 
 # This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project
+# doxygen (www.doxygen.org) for a project.
 #
-# All text after a hash (#) is considered a comment and will be ignored
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
 # The format is:
-#       TAG = value [value, ...]
-# For lists items can also be appended using:
-#       TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ")
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
 
 #---------------------------------------------------------------------------
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file that
-# follow. The default is UTF-8 which is also the encoding used for all text before
-# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into
-# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of
-# possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
 
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
-# by quotes) that should identify the project.
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "AOMedia AV1 Codec"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_NAME           = "AOMedia Codec SDK"
+PROJECT_BRIEF          =
 
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
-# base path where the generated documentation will be put.
-# If a relative path is entered, it will be relative to the location
-# where doxygen was started. If left blank the current directory will be used.
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
 
 OUTPUT_DIRECTORY       = docs
 
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
-# 4096 sub-directories (in 2 levels) under the output directory of each output
-# format and will distribute the generated files over these directories.
-# Enabling this option can be useful when feeding doxygen a huge amount of
-# source files, where putting all generated files in the same directory would
-# otherwise cause performance problems for the file system.
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
 
 CREATE_SUBDIRS         = NO
 
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
 # information to generate all constant output in the proper language.
-# The default language is English, other supported languages are:
-# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
-# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian,
-# Italian, Japanese, Japanese-en (Japanese with English messages), Korean,
-# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian,
-# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
 
 OUTPUT_LANGUAGE        = English
 
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
-# include brief member descriptions after the members that are listed in
-# the file and class documentation (similar to java_doc).
-# Set to NO to disable this.
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
 
 BRIEF_MEMBER_DESC      = YES
 
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
-# the brief description of a member or function before the detailed description.
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
 # brief descriptions will be completely suppressed.
+# The default value is: YES.
 
 REPEAT_BRIEF           = YES
 
-# This tag implements a quasi-intelligent brief description abbreviator
-# that is used to form the text in various listings. Each string
-# in this list, if found as the leading text of the brief description, will be
-# stripped from the text and the result after processing the whole list, is
-# used as the annotated text. Otherwise, the brief description is used as-is.
-# If left blank, the following values are used ("$name" is automatically
-# replaced with the name of the entity): "The $name class" "The $name widget"
-# "The $name file" "is" "provides" "specifies" "contains"
-# "represents" "a" "an" "the"
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
 
 ABBREVIATE_BRIEF       =
 
 # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# Doxygen will generate a detailed section even if there is only a brief
+# doxygen will generate a detailed section even if there is only a brief
 # description.
+# The default value is: NO.
 
 ALWAYS_DETAILED_SEC    = NO
 
@@ -101,873 +149,1895 @@ ALWAYS_DETAILED_SEC    = NO
 # inherited members of a class in the documentation of that class as if those
 # members were ordinary class members. Constructors, destructors and assignment
 # operators of the base classes will not be shown.
+# The default value is: NO.
 
 INLINE_INHERITED_MEMB  = NO
 
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
-# path before files name in the file list and in the header files. If set
-# to NO the shortest path that makes the file name unique will be used.
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
 
 FULL_PATH_NAMES        = YES
 
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
-# can be used to strip a user-defined part of the path. Stripping is
-# only done if one of the specified strings matches the left-hand part of
-# the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the
-# path to strip.
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
 
 STRIP_FROM_PATH        =
 
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
-# the path mentioned in the documentation of a class, which tells
-# the reader which header file to include in order to use a class.
-# If left blank only the name of the header file containing the class
-# definition is used. Otherwise one should specify the include paths that
-# are normally passed to the compiler using the -I flag.
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
 
 STRIP_FROM_INC_PATH    =
 
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
-# (but less readable) file names. This can be useful is your file systems
-# doesn't support long names like on DOS, Mac, or CD-ROM.
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
 
 SHORT_NAMES            = NO
 
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
-# will interpret the first line (until the first dot) of a java_doc-style
-# comment as the brief description. If set to NO, the java_doc
-# comments will behave just like regular Qt-style comments
-# (thus requiring an explicit @brief command for a brief description.)
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
 
 JAVADOC_AUTOBRIEF      = NO
 
-# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
-# interpret the first line (until the first dot) of a Qt-style
-# comment as the brief description. If set to NO, the comments
-# will behave just like regular Qt-style comments (thus requiring
-# an explicit \brief command for a brief description.)
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
 
 QT_AUTOBRIEF           = NO
 
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
-# treat a multi-line C++ special comment block (i.e. a block of //! or ///
-# comments) as a brief description. This used to be the default behaviour.
-# The new default is to treat a multi-line C++ comment block as a detailed
-# description. Set this tag to YES if you prefer the old behaviour instead.
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
 
 MULTILINE_CPP_IS_BRIEF = NO
 
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
-# member inherits the documentation from any documented member that it
-# re-implements.
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
 
 INHERIT_DOCS           = YES
 
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
-# a new page for each member. If set to NO, the documentation of a member will
-# be part of the file/class/namespace that contains it.
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
 
 SEPARATE_MEMBER_PAGES  = NO
 
-# The TAB_SIZE tag can be used to set the number of spaces in a tab.
-# Doxygen uses this value to replace tabs by spaces in code fragments.
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
 
 TAB_SIZE               = 4
 
-# This tag can be used to specify a number of aliases that acts
-# as commands in the documentation. An alias has the form "name=value".
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to
-# put the command \sideeffect (or @sideeffect) in the documentation, which
-# will result in a user-defined paragraph with heading "Side Effects:".
-# You can put \n's in the value part of an alias to insert newlines.
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES                =
 
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
-# sources only. Doxygen will then generate output that is more tailored for C.
-# For instance, some of the names that are used will be different. The list
-# of all members will be omitted, etc.
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
 
 OPTIMIZE_OUTPUT_FOR_C  = YES
 
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
-# sources only. Doxygen will then generate output that is more tailored for Java.
-# For instance, namespaces will be presented as packages, qualified scopes
-# will look different, etc.
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
 
 OPTIMIZE_OUTPUT_JAVA   = NO
 
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
-# include (a tag file for) the STL sources as input, then you should
-# set this tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
-# func(std::string) {}). This also make the inheritance and collaboration
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
 # diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
 
 BUILTIN_STL_SUPPORT    = NO
 
 # If you use Microsoft's C++/CLI language, you should set this option to YES to
 # enable parsing support.
+# The default value is: NO.
 
 CPP_CLI_SUPPORT        = NO
 
-# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
-# Doxygen will parse them like normal C++ but will assume all classes use public
-# instead of private inheritance when no explicit protection keyword is present.
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
 
 SIP_SUPPORT            = NO
 
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
 # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
+# tag is set to YES then doxygen will reuse the documentation of the first
 # member in the group (if any) for the other members of the group. By default
 # all members of a group must be documented explicitly.
+# The default value is: NO.
 
 DISTRIBUTE_GROUP_DOC   = NO
 
-# Set the SUBGROUPING tag to YES (the defqault) to allow class member groups of
-# the same type (for instance a group of public functions) to be put as a
-# subgroup of that type (e.g. under the Public Functions section). Set it to
-# NO to prevent subgrouping. Alternatively, this can be done per class using
-# the \nosubgrouping command.
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
 
 SUBGROUPING            = YES
 
-# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
-# documented as struct with the name of the typedef. So
-# typedef struct type_s {} type_t, will appear in the documentation as a struct
-# with name type_t. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named type_s. This can typically
-# be useful for C code where the coding convention is that all structs are
-# typedef'ed and only the typedef is referenced never the struct's name.
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
 
 TYPEDEF_HIDES_STRUCT   = NO
 
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
 
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available.
-# Private class members and static file members will be hidden unless
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
 
 EXTRACT_ALL            = NO
 
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
-# will be included in the documentation.
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
 
 EXTRACT_PRIVATE        = NO
 
-# If the EXTRACT_STATIC tag is set to YES all static members of a file
-# will be included in the documentation.
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
 
-EXTRACT_STATIC         = NO
+EXTRACT_PACKAGE        = NO
 
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
-# defined locally in source files will be included in the documentation.
-# If set to NO only classes defined in header files are included.
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
 
 EXTRACT_LOCAL_CLASSES  = YES
 
-# This flag is only useful for Objective-C code. When set to YES local
-# methods, which are defined in the implementation section but not in
-# the interface are included in the documentation.
-# If set to NO (the default) only methods in the interface are included.
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
 
 EXTRACT_LOCAL_METHODS  = NO
 
-# If this flag is set to YES, the members of anonymous namespaces will be extracted
-# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
-# where file will be replaced with the base name of the file that contains the anonymous
-# namespace. By default anonymous namespace are hidden.
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
 
 EXTRACT_ANON_NSPACES   = NO
 
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
-# undocumented members of documented classes, files or namespaces.
-# If set to NO (the default) these members will be included in the
-# various overviews, but no documentation section is generated.
-# This option has no effect if EXTRACT_ALL is enabled.
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
 
 HIDE_UNDOC_MEMBERS     = NO
 
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy.
-# If set to NO (the default) these classes will be included in the various
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
 
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
-# friend (class|struct|union) declarations.
-# If set to NO (the default) these declarations will be included in the
-# documentation.
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
 
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
-# documentation blocks found inside the body of a function.
-# If set to NO (the default) these blocks will be appended to the
-# function's detailed documentation block.
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
 
 HIDE_IN_BODY_DOCS      = NO
 
-# The INTERNAL_DOCS tag determines if documentation
-# that is typed after a \internal command is included. If the tag is set
-# to NO (the default) then the documentation will be excluded.
-# Set it to YES to include the internal documentation.
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
-# file names in lower-case letters. If set to YES upper-case letters are also
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
 # allowed. This is useful if you have classes or files whose names only differ
 # in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# (including Cygwin) ands Mac users are advised to set this option to NO.
+# The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
 
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
-# will show members with their full class and namespace scopes in the
-# documentation. If set to YES the scope will be hidden.
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
 
 HIDE_SCOPE_NAMES       = NO
 
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
-# will put a list of the files that are included by a file in the documentation
-# of that file.
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
 
 SHOW_INCLUDE_FILES     = YES
 
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
-# is inserted in the documentation for inline members.
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
 
 INLINE_INFO            = YES
 
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
-# will sort the (detailed) documentation of file and class members
-# alphabetically by member name. If set to NO the members will appear in
-# declaration order.
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
 
 SORT_MEMBER_DOCS       = NO
 
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
-# brief documentation of file, namespace and class members alphabetically
-# by member name. If set to NO (the default) the members will appear in
-# declaration order.
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
 
 SORT_BRIEF_DOCS        = NO
 
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
-# sorted by fully-qualified names, including namespaces. If set to
-# NO (the default), the class list will be sorted only by class name,
-# not including the namespace part.
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
 # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the
-# alphabetical list.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
 
 SORT_BY_SCOPE_NAME     = NO
 
-# The GENERATE_TODOLIST tag can be used to enable (YES) or
-# disable (NO) the todo list. This list is created by putting \todo
-# commands in the documentation.
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
 
 GENERATE_TODOLIST      = YES
 
-# The GENERATE_TESTLIST tag can be used to enable (YES) or
-# disable (NO) the test list. This list is created by putting \test
-# commands in the documentation.
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
 
 GENERATE_TESTLIST      = YES
 
-# The GENERATE_BUGLIST tag can be used to enable (YES) or
-# disable (NO) the bug list. This list is created by putting \bug
-# commands in the documentation.
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
 
 GENERATE_BUGLIST       = YES
 
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
-# disable (NO) the deprecated list. This list is created by putting
-# \deprecated commands in the documentation.
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
 
 GENERATE_DEPRECATEDLIST= YES
 
-# The ENABLED_SECTIONS tag can be used to enable conditional
-# documentation sections, marked by \if sectionname ... \endif.
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
 
 ENABLED_SECTIONS       =
 
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
-# the initial value of a variable or define consists of for it to appear in
-# the documentation. If the initializer consists of more lines than specified
-# here it will be hidden. Use a value of 0 to hide initializers completely.
-# The appearance of the initializer of individual variables and defines in the
-# documentation can be controlled using \showinitializer or \hideinitializer
-# command in the documentation regardless of this setting.
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
 
 MAX_INITIALIZER_LINES  = 30
 
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
-# at the bottom of the documentation of classes and structs. If set to YES the
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
 # list will mention the files that were used to generate the documentation.
+# The default value is: YES.
 
 SHOW_USED_FILES        = YES
 
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from the
-# version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
 
 FILE_VERSION_FILTER    =
 
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
 #---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
+# Configuration options related to warning and progress messages
 #---------------------------------------------------------------------------
 
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
 
 QUIET                  = YES
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
 
 WARNINGS               = YES
 
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
 
 WARN_IF_UNDOCUMENTED   = YES
 
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
 
 WARN_IF_DOC_ERROR      = YES
 
-# This WARN_NO_PARAMDOC option can be abled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
 
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce. The string should contain the $file, $line, and $text
-# tags, which will be replaced by the file and line number from which the
-# warning originated and the warning text. Optionally the format may contain
-# $version, which will be replaced by the version of the file (if it could
-# be obtained via FILE_VERSION_FILTER)
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
 
 WARN_FORMAT            = "$file:$line: $text"
 
-# The WARN_LOGFILE tag can be used to specify a file to which warning
-# and error messages should be written. If left blank the output is written
-# to stderr.
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
 
 WARN_LOGFILE           =
 
 #---------------------------------------------------------------------------
-# configuration options related to the input files
+# Configuration options related to the input files
 #---------------------------------------------------------------------------
 
-# The INPUT tag can be used to specify the files and/or directories that contain
-# documented source files. You may enter file names like "myfile.cpp" or
-# directories like "/usr/src/myproject". Separate the files or directories
-# with spaces.
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
 
-INPUT =
+INPUT                  =
 
-# This tag can be used to specify the character encoding of the source files that
-# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
-# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
-# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
 # If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank the following patterns are tested:
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
-# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          =
 
-# The RECURSIVE tag can be used to turn specify whether or not subdirectories
-# should be searched for input files as well. Possible values are YES and NO.
-# If left blank NO is used.
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
 
 RECURSIVE              = NO
 
-# The EXCLUDE tag can be used to specify files and/or directories that should
+# The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a
 # subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
 
 EXCLUDE                =
 
-# The EXCLUDE_SYMLINKS tag can be used select whether or not files or
-# directories that are symbolic links (a Unix filesystem feature) are excluded
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
 # from the input.
+# The default value is: NO.
 
 EXCLUDE_SYMLINKS       = NO
 
 # If the value of the INPUT tag contains directories, you can use the
 # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories. Note that the wildcards are matched
-# against the file with absolute path, so to exclude all test directories
-# for example use the pattern */test/*
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
 
 EXCLUDE_PATTERNS       =
 
 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the output.
-# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
-# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
 
 EXCLUDE_SYMBOLS        =
 
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
 
 EXAMPLE_PATH           =
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
 
 EXAMPLE_PATTERNS       =
 
 # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
 
 EXAMPLE_RECURSIVE      = NO
 
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain image that are included in the documentation (see
-# the \image command).
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
 
 IMAGE_PATH             =
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.  If FILTER_PATTERNS is specified, this tag will be
-# ignored.
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
 # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.  Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.  The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
-# is applied to all files.
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
 
 FILTER_SOURCE_FILES    = NO
 
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
 #---------------------------------------------------------------------------
-# configuration options related to source browsing
+# Configuration options related to source browsing
 #---------------------------------------------------------------------------
 
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
-# then you must also enable this option. If you don't then doxygen will produce
-# a warning and turn it on anyway
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
 
 SOURCE_BROWSER         = NO
 
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
 
 INLINE_SOURCES         = NO
 
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments. Normal C and C++ comments will always remain visible.
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
 
 STRIP_CODE_COMMENTS    = YES
 
-# If the REFERENCED_BY_RELATION tag is set to YES (the default)
-# then for each documented function all documented
-# functions referencing it will be listed.
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
 
 REFERENCED_BY_RELATION = YES
 
-# If the REFERENCES_RELATION tag is set to YES (the default)
-# then for each documented function all documented entities
-# called/used by that function will be listed.
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
 
 REFERENCES_RELATION    = YES
 
-# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
-# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
-# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
-# link to the source code.  Otherwise they will link to the documentstion.
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
 
 REFERENCES_LINK_SOURCE = YES
 
-# If the USE_HTAGS tag is set to YES then the references to source code
-# will point to the HTML generated by the htags(1) tool instead of doxygen
-# built-in source browser. The htags tool is part of GNU's global source
-# tagging system (see http://www.gnu.org/software/global/global.html). You
-# will need version 4.8.6 or higher.
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
 
 USE_HTAGS              = NO
 
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
-# will generate a verbatim copy of the header file for each class for
-# which an include is specified. Set to NO to disable this.
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
 
 VERBATIM_HEADERS       = YES
 
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
+# Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
 
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
-# of all compounds will be generated. Enable this if the project
-# contains a lot of classes, structs, unions or interfaces.
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
 
 ALPHABETICAL_INDEX     = NO
 
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all
-# classes will be put under the same header in the alphabetical index.
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
-# should be ignored while generating the index headers.
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
 IGNORE_PREFIX          =
 
 #---------------------------------------------------------------------------
-# configuration options related to the HTML output
+# Configuration options related to the HTML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
-# generate HTML output.
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
 
 GENERATE_HTML          = YES
 
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `html' will be used as the default path.
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_OUTPUT            = html
 
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
-# doxygen will generate files with .html extension.
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_FILE_EXTENSION    = .html
 
-# The HTML_HEADER tag can be used to specify a personal HTML header for
-# each generated HTML page. If it is left blank doxygen will generate a
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
 # standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_HEADER            =
 
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard footer.
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_FOOTER            =
 
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
-# style sheet that is used by each HTML page. It can be used to
-# fine-tune the look of the HTML output. If the tag is left blank doxygen
-# will generate a default style sheet. Note that doxygen will try to copy
-# the style sheet file to the HTML output directory, so don't put your own
-# stylesheet in the HTML output directory as well, or it will be erased!
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_STYLESHEET        =
 
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files
-# will be generated that can be used as input for tools like the
-# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
-# of the generated HTML documentation.
-
-GENERATE_HTMLHELP      = NO
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via Javascript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have Javascript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
 
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
-# page has loaded. For this to work a browser that supports
-# java_script and DHTML is required (for instance Mozilla 1.0+, Firefox
-# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_DYNAMIC_SECTIONS  = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
-# be used to specify the file name of the resulting .chm file. You
-# can add a path in front of the file if the result should not be
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
 # written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 CHM_FILE               =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
-# be used to specify the location (absolute path including file name) of
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
-# the HTML help compiler on the generated index.hhp.
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 HHC_LOCATION           =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
-# controls if a separate .chi index file is generated (YES) or that
-# it should be included in the master .chm file (NO).
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 GENERATE_CHI           = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
-# controls whether a binary table of contents is generated (YES) or a
-# normal table of contents (NO) in the .chm file.
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 BINARY_TOC             = NO
 
-# The TOC_EXPAND flag can be set to YES to add extra items for group members
-# to the contents of the HTML help documentation and to the tree view.
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 TOC_EXPAND             = NO
 
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
-# top of each HTML page. The value NO (the default) enables the index and
-# the value YES disables it.
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 DISABLE_INDEX          = NO
 
-# This tag can be used to set the number of enum values (range [1..20])
-# that doxygen will group on one line in the generated HTML documentation.
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
 ENUM_VALUES_PER_LINE   = 4
 
-# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
-# generated containing a tree-like index structure (just like the one that
-# is generated for HTML Help). For this to work a browser that supports
-# java_script, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
-# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
-# probably better off using the HTML help feature.
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
-GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
 
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
-# used to set the initial width (in pixels) of the frame in which the tree
-# is shown.
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
 
-TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
 
 #---------------------------------------------------------------------------
-# configuration options related to the la_te_x output
+# Configuration options related to the LaTeX output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
-# generate Latex output.
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
 
 GENERATE_LATEX         = YES
 
-# The LATEX_OUTPUT tag is used to specify where the la_te_x docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `latex' will be used as the default path.
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_OUTPUT           = latex
 
-# The LATEX_CMD_NAME tag can be used to specify the la_te_x command name to be
-# invoked. If left blank `latex' will be used as the default command name.
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
-# generate index for la_te_x. If left blank `makeindex' will be used as the
-# default command name.
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
-# la_te_x documents. This may be useful for small projects and may help to
-# save some trees in general.
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 COMPACT_LATEX          = YES
 
-# The PAPER_TYPE tag can be used to set the paper type that is used
-# by the printer. Possible values are: a4, a4wide, letter, legal and
-# executive. If left blank a4wide will be used.
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 PAPER_TYPE             = letter
 
-# The EXTRA_PACKAGES tag can be to specify one or more names of la_te_x
-# packages that should be included in the la_te_x output.
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal la_te_x header for
-# the generated latex document. The header should contain everything until
-# the first chapter. If it is left blank doxygen will generate a
-# standard header. Notice: only use this tag if you know what you are doing!
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
 
-# If the PDF_HYPERLINKS tag is set to YES, the la_te_x that is generated
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will
-# contain links (just like the HTML output) instead of page references
-# This makes the output suitable for online browsing using a pdf viewer.
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
-# plain latex in the generated Makefile. Set this option to YES to get a
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
 # higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 USE_PDFLATEX           = YES
 
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
-# command to the generated la_te_x files. This will instruct la_te_x to keep
-# running if errors occur, instead of asking the user for help.
-# This option is also used when generating formulas in HTML.
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BATCHMODE        = NO
 
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not
-# include the index chapters (such as File Index, Compound Index, etc.)
-# in the output.
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HIDE_INDICES     = NO
 
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
-# configuration options related to the RTF output
+# Configuration options related to the RTF output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
-# The RTF output is optimized for Word 97 and may not look very pretty with
-# other RTF readers or editors.
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
 
 GENERATE_RTF           = NO
 
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `rtf' will be used as the default path.
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_OUTPUT             = rtf
 
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
-# RTF documents. This may be useful for small projects and may help to
-# save some trees in general.
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 COMPACT_RTF            = NO
 
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
-# will contain hyperlink fields. The RTF file will
-# contain links (just like the HTML output) instead of page references.
-# This makes the output suitable for online browsing using WORD or other
-# programs which support those fields.
-# Note: wordpad (write) and others do not support links.
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_HYPERLINKS         = NO
 
 # Load stylesheet definitions from file. Syntax is similar to doxygen's
-# config file, i.e. a series of assignments. You only have to provide
+# configuration file, i.e. a series of assignments. You only have to provide
 # replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_STYLESHEET_FILE    =
 
-# Set optional variables used in the generation of an rtf document.
-# Syntax is similar to doxygen's config file.
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
 #---------------------------------------------------------------------------
-# configuration options related to the man page output
+# Configuration options related to the man page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
-# generate man pages
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
 
 GENERATE_MAN           = NO
 
-# The MAN_OUTPUT tag is used to specify where the man pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `man' will be used as the default path.
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_OUTPUT             = man
 
-# The MAN_EXTENSION tag determines the extension that is added to
-# the generated man pages (default is the subroutine's section .3)
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_EXTENSION          = .3
 
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
-# then it will generate one additional man file for each entity
-# documented in the real man page(s). These additional files
-# only source the real man page, but without them the man command
-# would be unable to find the correct page. The default is NO.
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
 
 MAN_LINKS              = YES
 
 #---------------------------------------------------------------------------
-# configuration options for the auto_gen Definitions output
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
-# generate an auto_gen Definitions (see autogen.sf.net) file
-# that captures the structure of the code including all
-# documentation. Note that this feature is still experimental
-# and incomplete at the moment.
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
 
 #---------------------------------------------------------------------------
-# configuration options related to the Perl module output
+# Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will
-# generate a Perl module file that captures the structure of
-# the code including all documentation. Note that this
-# feature is still experimental and incomplete at the
-# moment.
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
 
 GENERATE_PERLMOD       = NO
 
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
-# the necessary Makefile rules, Perl scripts and la_te_x code to be able
-# to generate PDF and DVI output from the Perl module output.
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_LATEX          = NO
 
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
-# nicely formatted so it can be parsed by a human reader.  This is useful
-# if you want to understand what is going on.  On the other hand, if this
-# tag is set to NO the size of the Perl module output will be much smaller
-# and Perl will parse it just the same.
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_PRETTY         = YES
 
-# The names of the make variables in the generated doxyrules.make file
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
-# This is useful so different doxyrules.make files included by the same
-# Makefile don't overwrite each other's variables.
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
 PERLMOD_MAKEVAR_PREFIX =
 
@@ -975,278 +2045,438 @@ PERLMOD_MAKEVAR_PREFIX =
 # Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
-# evaluate all C-preprocessor directives found in the sources and include
-# files.
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
 
 ENABLE_PREPROCESSING   = YES
 
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
-# names in the source code. If set to NO (the default) only conditional
-# compilation will be performed. Macro expansion can be done in a controlled
-# way by setting EXPAND_ONLY_PREDEF to YES.
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 MACRO_EXPANSION        = YES
 
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
-# then the macro expansion is limited to the macros specified with the
-# PREDEFINED and EXPAND_AS_DEFINED tags.
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 EXPAND_ONLY_PREDEF     = NO
 
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
-# in the INCLUDE_PATH (see below) will be search if a #include is found.
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 SEARCH_INCLUDES        = YES
 
 # The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by
-# the preprocessor.
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
 INCLUDE_PATH           =
 
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will
-# be used.
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 INCLUDE_FILE_PATTERNS  = *.h
 
-# The PREDEFINED tag can be used to specify one or more macro names that
-# are defined before the preprocessor is started (similar to the -D option of
-# gcc). The argument of the tag is a list of macros of the form: name
-# or name=definition (no spaces). If the definition and the = are
-# omitted =1 is assumed. To prevent a macro definition from being
-# undefined via #undef or recursively expanded use the := operator
-# instead of the = operator.
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+#
+# In builds where CONFIG_REALTIME_ONLY is set some functions are #ifdefed out
+# which causes reference failures. Hence for doxygen we set it to 0 here.
 
-PREDEFINED             =
+PREDEFINED             = CONFIG_REALTIME_ONLY=0
 
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
-# this tag can be used to specify a list of macro names that should be expanded.
-# The macro definition that is found in the sources will be used.
-# Use the PREDEFINED tag if you want to use a different macro definition.
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 EXPAND_AS_DEFINED      =
 
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
-# doxygen's preprocessor will remove all function-like macros that are alone
-# on a line, have an all uppercase name, and do not end with a semicolon. Such
-# function macros are typically used for boiler-plate code, and will confuse
-# the parser if not removed.
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 SKIP_FUNCTION_MACROS   = YES
 
 #---------------------------------------------------------------------------
-# Configuration::additions related to external references
+# Configuration options related to external references
 #---------------------------------------------------------------------------
 
-# The TAGFILES option can be used to specify one or more tagfiles.
-# Optionally an initial location of the external documentation
-# can be added for each tagfile. The format of a tag file without
-# this location is as follows:
-#   TAGFILES = file1 file2 ...
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
 # Adding location for the tag files is done as follows:
-#   TAGFILES = file1=loc1 "file2 = loc2" ...
-# where "loc1" and "loc2" can be relative or absolute paths or
-# URLs. If a location is present for each tag, the installdox tool
-# does not have to be run to correct the links.
-# Note that each tag file must have a unique name
-# (where the name does NOT include the path)
-# If a tag file is not located in the directory in which doxygen
-# is run, you must also specify the path to the tagfile here.
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
 
 TAGFILES               =
 
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create
-# a tag file that is based on the input files it reads.
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
 
 GENERATE_TAGFILE       =
 
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed
-# in the class index. If set to NO only the inherited external classes
-# will be listed.
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
 
 ALLEXTERNALS           = NO
 
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will
-# be listed.
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
 
 EXTERNAL_GROUPS        = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
 
-PERL_PATH              = /usr/bin/perl
+EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
 # powerful graphs.
+# The default value is: YES.
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
 
-MSCGEN_PATH            =
+DIA_PATH               =
 
-# If set to YES, the inheritance and collaboration graphs will hide
-# inheritance and usage relations if the target is undocumented
-# or is not a class.
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
 
 HIDE_UNDOC_RELATIONS   = YES
 
 # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz, a graph visualization
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section
-# have no effect if this option is set to NO (the default)
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
 
 HAVE_DOT               = NO
 
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CLASS_GRAPH            = YES
 
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect implementation dependencies (inheritance, containment, and
-# class references variables) of the class with other documented classes.
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 COLLABORATION_GRAPH    = YES
 
-# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for groups, showing the direct groups dependencies
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GROUP_GRAPHS           = YES
 
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
 # collaboration diagrams in a style similar to the OMG's Unified Modeling
 # Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 UML_LOOK               = NO
 
-# If set to YES, the inheritance and collaboration graphs will show the
-# relations between templates and their instances.
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 TEMPLATE_RELATIONS     = NO
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
-# tags are set to YES then doxygen will generate a graph for each documented
-# file showing the direct and indirect include dependencies of the file with
-# other documented files.
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 INCLUDE_GRAPH          = YES
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
-# documented header file showing the documented files that directly or
-# indirectly include this file.
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 INCLUDED_BY_GRAPH      = YES
 
-# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
-# generate a call dependency graph for every global function or class method.
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CALL_GRAPH             = NO
 
-# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
-# generate a caller dependency graph for every global function or class method.
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 CALLER_GRAPH           = NO
 
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
-# will graphical hierarchy of all classes instead of a textual one.
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GRAPHICAL_HIERARCHY    = YES
 
-# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
-# then doxygen will show the dependencies a directory has on other directories
-# in a graphical way. The dependency relations are determined by the #include
-# relations between the files in the directories.
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DIRECTORY_GRAPH        = YES
 
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. Possible values are png, jpg, or gif
-# If left blank png will be used.
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_IMAGE_FORMAT       = png
 
-# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
 # found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_PATH               =
 
 # The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the
-# \dotfile command).
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOTFILE_DIRS           =
 
-# The MAX_DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
-# nodes that will be shown in the graph. If the number of nodes in a graph
-# becomes larger than this value, doxygen will truncate the graph, which is
-# visualized by representing a node as a red box. Note that doxygen if the number
-# of direct children of the root node in a graph is already larger than
-# MAX_DOT_GRAPH_NOTES then the graph will not be shown at all. Also note
-# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_GRAPH_MAX_NODES    = 50
 
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
-# graphs generated by dot. A depth value of 3 means that only nodes reachable
-# from the root by following a path via at most 3 edges will be shown. Nodes
-# that lay further from the root node will be omitted. Note that setting this
-# option to 1 or 2 may greatly reduce the computation time needed for large
-# code bases. Also note that the size of a graph can be further restricted by
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
 # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
 # Warning: Depending on the platform used, enabling this option may lead to
 # badly anti-aliased labels on the edges of a graph (i.e. they become hard to
 # read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_TRANSPARENT        = YES
+DOT_TRANSPARENT        = NO
 
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10)
-# support this, this feature is disabled by default.
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_MULTI_TARGETS      = NO
 
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
-# generate a legend page explaining the meaning of the various boxes and
-# arrows in the dot generated graphs.
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
 
diff --git a/media/libaom/src/mainpage.dox b/media/libaom/src/mainpage.dox
index 03a299ae1a..10924acbcf 100644
--- a/media/libaom/src/mainpage.dox
+++ b/media/libaom/src/mainpage.dox
@@ -1,28 +1,31 @@
-/*!\mainpage AMedia Codec SDK
+/*!\mainpage AOMedia AV1 Codec
 
-  \section main_contents Page Contents
-  - \ref main_intro
-  - \ref main_startpoints
-  - \ref main_support
+  \tableofcontents
 
-  \section main_intro Introduction
-  Welcome to the AMedia Codec SDK. This SDK allows you to integrate your
+  \section aom_sdk AOMedia Codec SDK
+
+  \subsection main_intro Introduction
+  Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
   applications with the AOM and AV1 video codecs.
 
   This distribution of the AOMedia Codec SDK includes the following support:
 
-  \if aom_encoder
+  \if av1_encoder
   - \ref aom_encoder
   \endif
-  \if aom_decoder
+  \if av1_decoder
   - \ref aom_decoder
   \endif
 
 
-  \section main_startpoints Starting Points
+  \subsection main_startpoints Starting Points
   - Consult the \ref changelog for a complete list of improvements in this
     release.
+  \if av1_md_support
+  - [README](\ref LREADME) contains instructions on compiling the sample applications.
+  \else
   - \ref readme contains instructions on compiling the sample applications.
+  \endif
   - Read the \ref usage "usage" for a narrative on codec usage.
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
@@ -33,20 +36,33 @@
   \if decoder
   - \ref decoder reference
   \endif
+  <br>
+
+  \section av1_guide AV1 Developer's Guide
+
+  \if av1_encoder
+  - \ref encoder_guide
+  \endif
+
+  \if av1_decoder
+  - \ref decoder_guide
+  \endif
+  <br>
 
   \section main_support Support Options & FAQ
-  The AOMedia project is an open source project supported by its community. For
-  questions about this SDK, please mail the apps-devel@webmproject.org list.
-  To contribute, see http://www.webmproject.org/code/contribute and mail
-  codec-devel@webmproject.org.
+  The AOMedia project is an open source project supported by its community.
+  For questions about this SDK or for help, please visit http://aomedia.org/
+  and email the aomediacodec@jointdevelopment.kavi.com list.
 */
 
 /*!\page changelog CHANGELOG
    \verbinclude CHANGELOG
 */
 
+\ifnot av1_md_support
 /*!\page readme README.md
    \include README.md
 */
+\endif
 
 /*!\defgroup codecs Supported Codecs */
diff --git a/media/libaom/src/stats/aomstats.c b/media/libaom/src/stats/aomstats.c
index 4a15adf02c..a006ec030f 100644
--- a/media/libaom/src/stats/aomstats.c
+++ b/media/libaom/src/stats/aomstats.c
@@ -11,10 +11,12 @@
 
 #include "stats/aomstats.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "aom_dsp/aom_dsp_common.h"
 #include "common/tools_common.h"
 
 int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
@@ -42,7 +44,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
     stats->buf.buf = malloc(stats->buf_alloc_sz);
 
     if (!stats->buf.buf)
-      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+      fatal("Failed to allocate first-pass stats buffer (%u bytes)",
             (unsigned int)stats->buf_alloc_sz);
 
     nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
@@ -83,24 +85,28 @@ void stats_close(stats_io_t *stats, int last_pass) {
 void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
   if (stats->file) {
     (void)fwrite(pkt, 1, len, stats->file);
-  } else {
-    if (stats->buf.sz + len > stats->buf_alloc_sz) {
-      size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
-      char *new_ptr = realloc(stats->buf.buf, new_sz);
-
-      if (new_ptr) {
-        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
-        stats->buf.buf = new_ptr;
-        stats->buf_alloc_sz = new_sz;
-      } else {
-        fatal("Failed to realloc firstpass stats buffer.");
-      }
+    return;
+  }
+  assert(stats->buf.sz <= stats->buf_alloc_sz);
+  assert(0 < stats->buf_alloc_sz);
+  if (stats->buf.sz + len > stats->buf_alloc_sz) {
+    // Grow by a factor of 1.5 each time, for amortized constant time.
+    // Also make sure there is enough room for the data.
+    size_t new_sz = AOMMAX((3 * stats->buf_alloc_sz) / 2, stats->buf.sz + len);
+    char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+    if (new_ptr) {
+      stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+      stats->buf.buf = new_ptr;
+      stats->buf_alloc_sz = new_sz;
+    } else {
+      fatal("Failed to realloc firstpass stats buffer.");
     }
-
-    memcpy(stats->buf_ptr, pkt, len);
-    stats->buf.sz += len;
-    stats->buf_ptr += len;
   }
+
+  memcpy(stats->buf_ptr, pkt, len);
+  stats->buf.sz += len;
+  stats->buf_ptr += len;
 }
 
 aom_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; }
diff --git a/media/libaom/src/stats/rate_hist.c b/media/libaom/src/stats/rate_hist.c
index 71eb78b720..ae76fda102 100644
--- a/media/libaom/src/stats/rate_hist.c
+++ b/media/libaom/src/stats/rate_hist.c
@@ -38,7 +38,13 @@ struct rate_hist {
 struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
                                       const aom_rational_t *fps) {
   int i;
-  struct rate_hist *hist = malloc(sizeof(*hist));
+  struct rate_hist *hist = calloc(1, sizeof(*hist));
+
+  if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 ||
+      fps->den == 0) {
+    destroy_rate_histogram(hist);
+    return NULL;
+  }
 
   // Determine the number of samples in the buffer. Use the file's framerate
   // to determine the number of frames in rc_buf_sz milliseconds, with an
@@ -81,7 +87,11 @@ void update_rate_histogram(struct rate_hist *hist,
                       (uint64_t)cfg->g_timebase.num /
                       (uint64_t)cfg->g_timebase.den;
 
-  int idx = hist->frames++ % hist->samples;
+  int idx;
+
+  if (hist == NULL || cfg == NULL || pkt == NULL) return;
+
+  idx = hist->frames++ % hist->samples;
   hist->pts[idx] = now;
   hist->sz[idx] = (int)pkt->data.frame.sz;
 
@@ -117,9 +127,14 @@ void update_rate_histogram(struct rate_hist *hist,
 static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
                               int *num_buckets) {
   int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
-  int buckets = *num_buckets;
+  int buckets;
   int i;
 
+  assert(bucket != NULL);
+  assert(num_buckets != NULL);
+
+  buckets = *num_buckets;
+
   /* Find the extrema for this list of buckets */
   big_bucket = small_bucket = 0;
   for (i = 0; i < buckets; i++) {
@@ -179,38 +194,42 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
 
 static void show_histogram(const struct hist_bucket *bucket, int buckets,
                            int total, int scale) {
-  const char *pat1, *pat2;
+  int width1, width2;
   int i;
 
+  if (!buckets) return;
+  assert(bucket != NULL);
+  assert(buckets > 0);
+
   switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
     case 1:
     case 2:
-      pat1 = "%4d %2s: ";
-      pat2 = "%4d-%2d: ";
+      width1 = 4;
+      width2 = 2;
       break;
     case 3:
-      pat1 = "%5d %3s: ";
-      pat2 = "%5d-%3d: ";
+      width1 = 5;
+      width2 = 3;
       break;
     case 4:
-      pat1 = "%6d %4s: ";
-      pat2 = "%6d-%4d: ";
+      width1 = 6;
+      width2 = 4;
       break;
     case 5:
-      pat1 = "%7d %5s: ";
-      pat2 = "%7d-%5d: ";
+      width1 = 7;
+      width2 = 5;
       break;
     case 6:
-      pat1 = "%8d %6s: ";
-      pat2 = "%8d-%6d: ";
+      width1 = 8;
+      width2 = 6;
       break;
     case 7:
-      pat1 = "%9d %7s: ";
-      pat2 = "%9d-%7d: ";
+      width1 = 9;
+      width2 = 7;
       break;
     default:
-      pat1 = "%12d %10s: ";
-      pat2 = "%12d-%10d: ";
+      width1 = 12;
+      width2 = 10;
       break;
   }
 
@@ -225,9 +244,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets,
     assert(len <= HIST_BAR_MAX);
 
     if (bucket[i].low == bucket[i].high)
-      fprintf(stderr, pat1, bucket[i].low, "");
+      fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, "");
     else
-      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+      fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2,
+              bucket[i].high);
 
     for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
     fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
@@ -260,6 +280,8 @@ void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
   int i, scale;
   int buckets = 0;
 
+  if (hist == NULL || cfg == NULL) return;
+
   for (i = 0; i < RATE_BINS; i++) {
     if (hist->bucket[i].low == INT_MAX) continue;
     hist->bucket[buckets++] = hist->bucket[i];
diff --git a/media/libaom/src/test/active_map_test.cc b/media/libaom/src/test/active_map_test.cc
index 0f8a7329e0..2bbc3b64fb 100644
--- a/media/libaom/src/test/active_map_test.cc
+++ b/media/libaom/src/test/active_map_test.cc
@@ -30,8 +30,7 @@ class ActiveMapTest
   virtual ~ActiveMapTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
@@ -39,6 +38,9 @@ class ActiveMapTest
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
     } else if (video->frame() == 3) {
       aom_active_map_t map = aom_active_map_t();
       /* clang-format off */
@@ -88,16 +90,8 @@ class ActiveMapTest
 
 TEST_P(ActiveMapTest, Test) { DoTest(); }
 
-class ActiveMapTestLarge : public ActiveMapTest {};
-
-TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
-
-AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(0, 5));
-
-AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 9));
+AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 9));
 
 }  // namespace
diff --git a/media/libaom/src/test/altref_test.cc b/media/libaom/src/test/altref_test.cc
index 43df39fb68..002a206967 100644
--- a/media/libaom/src/test/altref_test.cc
+++ b/media/libaom/src/test/altref_test.cc
@@ -15,84 +15,201 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 namespace {
+typedef struct {
+  const unsigned int min_kf_dist;
+  const unsigned int max_kf_dist;
+  const unsigned int min_gf_interval;
+  const unsigned int max_gf_interval;
+  const unsigned int lag_in_frames;
+  libaom_test::TestMode encoding_mode;
+} AltRefTestParams;
 
-class AltRefForcedKeyTestLarge
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+static const AltRefTestParams TestParams[] = {
+  { 0, 10, 4, 8, 10, ::libaom_test::kOnePassGood },
+  { 0, 30, 8, 12, 16, ::libaom_test::kOnePassGood },
+  { 30, 30, 12, 16, 25, ::libaom_test::kOnePassGood },
+  { 0, 60, 12, 20, 25, ::libaom_test::kOnePassGood },
+  { 60, 60, 16, 28, 30, ::libaom_test::kOnePassGood },
+  { 0, 100, 16, 32, 35, ::libaom_test::kOnePassGood },
+  { 0, 10, 4, 8, 10, ::libaom_test::kTwoPassGood },
+  { 0, 30, 8, 12, 16, ::libaom_test::kTwoPassGood },
+  { 30, 30, 12, 16, 25, ::libaom_test::kTwoPassGood },
+  { 0, 60, 16, 24, 25, ::libaom_test::kTwoPassGood },
+  { 60, 60, 20, 28, 30, ::libaom_test::kTwoPassGood },
+  { 0, 100, 24, 32, 35, ::libaom_test::kTwoPassGood },
+};
+
+std::ostream &operator<<(std::ostream &os, const AltRefTestParams &test_arg) {
+  return os << "AltRefTestParams { min_kf_dist:" << test_arg.min_kf_dist
+            << " max_kf_dist:" << test_arg.max_kf_dist
+            << " min_gf_interval:" << test_arg.min_gf_interval
+            << " max_gf_interval:" << test_arg.max_gf_interval
+            << " lag_in_frames:" << test_arg.lag_in_frames
+            << " encoding_mode:" << test_arg.encoding_mode << " }";
+}
+
+// This class is used to check the presence of altref frame.
+class AltRefFramePresenceTestLarge
+    : public ::libaom_test::CodecTestWith2Params<AltRefTestParams, aom_rc_mode>,
       public ::libaom_test::EncoderTest {
  protected:
-  AltRefForcedKeyTestLarge()
-      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
-        cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  AltRefFramePresenceTestLarge()
+      : EncoderTest(GET_PARAM(0)), altref_test_params_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    is_arf_frame_present_ = 0;
+  }
+  virtual ~AltRefFramePresenceTestLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-    cfg_.rc_end_usage = AOM_VBR;
-    cfg_.g_threads = 0;
+    InitializeConfig(altref_test_params_.encoding_mode);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = altref_test_params_.min_kf_dist;
+    cfg_.kf_max_dist = altref_test_params_.max_kf_dist;
+    cfg_.g_lag_in_frames = altref_test_params_.lag_in_frames;
   }
 
+  virtual bool DoDecode() const { return 1; }
+
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_CPUUSED, 5);
       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-#if CONFIG_AV1_ENCODER
-      // override test default for tile columns if necessary.
-      if (GET_PARAM(0) == &libaom_test::kAV1) {
-        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
-      }
-#endif
+      encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+                       altref_test_params_.min_gf_interval);
+      encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+                       altref_test_params_.max_gf_interval);
     }
-    frame_flags_ =
-        (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
   }
 
-  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
-    if (frame_num_ == forced_kf_frame_num_) {
-      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY,
-                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY))
-          << "Frame #" << frame_num_ << " isn't a keyframe!";
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (is_arf_frame_present_ != 1 && AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_ALTREF_PRESENT,
+                                    &is_arf_frame_present_);
     }
-    ++frame_num_;
+    return AOM_CODEC_OK == res_dec;
   }
 
-  ::libaom_test::TestMode encoding_mode_;
-  int cpu_used_;
-  unsigned int forced_kf_frame_num_;
-  unsigned int frame_num_;
+  const AltRefTestParams altref_test_params_;
+  int is_arf_frame_present_;
+  aom_rc_mode rc_end_usage_;
 };
 
-TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
-  const aom_rational timebase = { 1, 30 };
-  const int lag_values[] = { 3, 15, 25, -1 };
+TEST_P(AltRefFramePresenceTestLarge, AltRefFrameEncodePresenceTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 100);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_arf_frame_present_, 1);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AltRefFramePresenceTestLarge,
+                           ::testing::ValuesIn(TestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+typedef struct {
+  const ::libaom_test::TestMode encoding_mode;
+  const unsigned int min_gf_interval;
+  const unsigned int max_gf_interval;
+} gfIntervalParam;
 
-  forced_kf_frame_num_ = 1;
-  for (int i = 0; lag_values[i] != -1; ++i) {
+const gfIntervalParam gfTestParams[] = {
+  // single pass
+  { ::libaom_test::kOnePassGood, 0, 6 },
+  { ::libaom_test::kOnePassGood, 0, 8 },
+  { ::libaom_test::kOnePassGood, 5, 10 },
+  { ::libaom_test::kOnePassGood, 8, 16 },
+  { ::libaom_test::kOnePassGood, 16, 16 },
+
+  // two pass
+  { ::libaom_test::kTwoPassGood, 0, 6 },
+  { ::libaom_test::kTwoPassGood, 0, 8 },
+  { ::libaom_test::kTwoPassGood, 5, 10 },
+  { ::libaom_test::kTwoPassGood, 8, 16 },
+  { ::libaom_test::kTwoPassGood, 16, 32 },
+  { ::libaom_test::kTwoPassGood, 20, 32 },
+};
+
+// This class is used to test if the gf interval bounds configured by the user
+// are respected by the encoder.
+class GoldenFrameIntervalTestLarge
+    : public ::libaom_test::CodecTestWith2Params<gfIntervalParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  GoldenFrameIntervalTestLarge()
+      : EncoderTest(GET_PARAM(0)), gf_interval_param_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    baseline_gf_interval_ = -1;
+    limit_ = 60;
     frame_num_ = 0;
-    cfg_.g_lag_in_frames = lag_values[i];
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0, 30);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   }
-}
+  virtual ~GoldenFrameIntervalTestLarge() {}
 
-TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
-  const aom_rational timebase = { 1, 30 };
-  const int lag_values[] = { 3, 15, 25, -1 };
+  virtual void SetUp() {
+    InitializeConfig(gf_interval_param_.encoding_mode);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    // kf_min_dist is equal to kf_max_dist to make sure that there are no scene
+    // cuts due to which the min_gf_interval may not be respected.
+    cfg_.kf_min_dist = limit_;
+    cfg_.kf_max_dist = limit_;
+    cfg_.g_limit = limit_;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+  }
 
-  for (int i = 0; lag_values[i] != -1; ++i) {
-    frame_num_ = 0;
-    forced_kf_frame_num_ = lag_values[i] - 1;
-    cfg_.g_lag_in_frames = lag_values[i];
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0, 30);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+                       gf_interval_param_.min_gf_interval);
+      encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+                       gf_interval_param_.max_gf_interval);
+    }
+    if (frame_num_ > 0) {
+      encoder->Control(AV1E_GET_BASELINE_GF_INTERVAL, &baseline_gf_interval_);
+      ASSERT_LE(baseline_gf_interval_,
+                (int)gf_interval_param_.max_gf_interval + 1);
+      if ((frame_num_ + (int)gf_interval_param_.min_gf_interval) <= limit_) {
+        ASSERT_GE(baseline_gf_interval_,
+                  (int)gf_interval_param_.min_gf_interval);
+      }
+    }
   }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    (void)pkt;
+    ++frame_num_;
+  }
+
+  const gfIntervalParam gf_interval_param_;
+  int baseline_gf_interval_;
+  int limit_;
+  int frame_num_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(GoldenFrameIntervalTestLarge, GoldenFrameIntervalTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, limit_);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood),
-                          ::testing::Values(2, 5));
+AV1_INSTANTIATE_TEST_SUITE(GoldenFrameIntervalTestLarge,
+                           ::testing::ValuesIn(gfTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ, AOM_CBR));
 
 }  // namespace
diff --git a/media/libaom/src/test/aom_image_test.cc b/media/libaom/src/test/aom_image_test.cc
new file mode 100644
index 0000000000..6ee0058342
--- /dev/null
+++ b/media/libaom/src/test/aom_image_test.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(AomImageTest, AomImgWrapInvalidAlign) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  aom_image_t img;
+  // Set img_data and img_data_owner to junk values. aom_img_wrap() should
+  // not read these values on failure.
+  img.img_data = (unsigned char *)"";
+  img.img_data_owner = 1;
+
+  aom_img_fmt_t format = AOM_IMG_FMT_I444;
+  // 'align' must be a power of 2 but is not. This causes the aom_img_wrap()
+  // call to fail. The test verifies we do not read the junk values in 'img'.
+  unsigned int align = 31;
+  EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr);
+}
+
+TEST(AomImageTest, AomImgSetRectOverflow) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  aom_image_t img;
+  aom_img_fmt_t format = AOM_IMG_FMT_I444;
+  unsigned int align = 32;
+  EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), &img);
+
+  EXPECT_EQ(aom_img_set_rect(&img, 0, 0, kWidth, kHeight, 0), 0);
+  // This would result in overflow because -1 is cast to UINT_MAX.
+  EXPECT_NE(aom_img_set_rect(&img, -1, -1, kWidth, kHeight, 0), 0);
+}
+
+TEST(AomImageTest, AomImgAllocNv12) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+
+  aom_image_t img;
+  aom_img_fmt_t format = AOM_IMG_FMT_NV12;
+  unsigned int align = 32;
+  EXPECT_NE(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr);
+  EXPECT_EQ(img.stride[AOM_PLANE_U], img.stride[AOM_PLANE_Y]);
+  EXPECT_EQ(img.stride[AOM_PLANE_V], 0);
+  EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr);
+  aom_img_free(&img);
+}
diff --git a/media/libaom/src/test/aom_mem_test.cc b/media/libaom/src/test/aom_mem_test.cc
new file mode 100644
index 0000000000..849ba64435
--- /dev/null
+++ b/media/libaom/src/test/aom_mem_test.cc
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include <cstdio>
+#include <cstddef>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(AomMemTest, Overflow) {
+  // Allocations are aligned > 1 so SIZE_MAX should always fail.
+  ASSERT_EQ(aom_malloc(SIZE_MAX), nullptr);
+  ASSERT_EQ(aom_calloc(1, SIZE_MAX), nullptr);
+  ASSERT_EQ(aom_calloc(32, SIZE_MAX / 32), nullptr);
+  ASSERT_EQ(aom_calloc(SIZE_MAX, SIZE_MAX), nullptr);
+  ASSERT_EQ(aom_memalign(1, SIZE_MAX), nullptr);
+  ASSERT_EQ(aom_memalign(64, SIZE_MAX), nullptr);
+  ASSERT_EQ(aom_memalign(64, SIZE_MAX - 64), nullptr);
+  ASSERT_EQ(aom_memalign(64, SIZE_MAX - 64 - sizeof(size_t) + 2), nullptr);
+}
+
+TEST(AomMemTest, NullParams) {
+  ASSERT_EQ(aom_memset16(nullptr, 0, 0), nullptr);
+  aom_free(nullptr);
+}
diff --git a/media/libaom/src/test/aomcx_set_ref.sh b/media/libaom/src/test/aomcx_set_ref.sh
index f51b73c58e..237e2f319c 100644..100755
--- a/media/libaom/src/test/aomcx_set_ref.sh
+++ b/media/libaom/src/test/aomcx_set_ref.sh
@@ -41,7 +41,7 @@ aom_set_ref() {
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      "${ref_frame_num}" "${limit}" ${devnull}
+      "${ref_frame_num}" "${limit}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libaom/src/test/aomdec.sh b/media/libaom/src/test/aomdec.sh
index 927142287c..eb1649a03b 100644..100755
--- a/media/libaom/src/test/aomdec.sh
+++ b/media/libaom/src/test/aomdec.sh
@@ -14,6 +14,9 @@
 ##
 . $(dirname $0)/tools_common.sh
 
+AV1_MONOCHROME_B10="${LIBAOM_TEST_DATA_PATH}/av1-1-b10-24-monochrome.ivf"
+AV1_MONOCHROME_B8="${LIBAOM_TEST_DATA_PATH}/av1-1-b8-24-monochrome.ivf"
+
 # Environment check: Make sure input is available.
 aomdec_verify_environment() {
   if [ "$(av1_encode_available)" != "yes" ] ; then
@@ -26,6 +29,9 @@ aomdec_verify_environment() {
       return 1
     fi
   fi
+  if [ ! -e "${AV1_MONOCHROME_B10}" ] || [ ! -e "${AV1_MONOCHROME_B8}" ]; then
+    elog "Libaom test data must exist before running this test script."
+  fi
   if [ -z "$(aom_tool_path aomdec)" ]; then
     elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
     return 1
@@ -67,7 +73,7 @@ aomdec_av1_ivf() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     aomdec "${AV1_IVF_FILE}" --summary --noblit
   fi
@@ -77,29 +83,39 @@ aomdec_av1_ivf_error_resilient() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="av1.error-resilient.ivf"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1
+      encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 || return 1
     fi
     aomdec "${file}" --summary --noblit
   fi
 }
 
-aomdec_av1_ivf_multithread() {
+ivf_multithread() {
+  local row_mt="$1"
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     for threads in 2 3 4 5 6 7 8; do
-      aomdec "${file}" --summary --noblit --threads=$threads
+      aomdec "${file}" --summary --noblit --threads=$threads --row-mt=$row_mt \
+        || return 1
     done
   fi
 }
 
+aomdec_av1_ivf_multithread() {
+  ivf_multithread 0  # --row-mt=0
+}
+
+aomdec_av1_ivf_multithread_row_mt() {
+  ivf_multithread 1  # --row-mt=1
+}
+
 aomdec_aom_ivf_pipe_input() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit
   fi
@@ -109,7 +125,7 @@ aomdec_av1_obu_annexb() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_OBU_ANNEXB_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --obu --annexb=1
+      encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 || return 1
     fi
     aomdec "${file}" --summary --noblit --annexb
   fi
@@ -119,7 +135,7 @@ aomdec_av1_obu_section5() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
     local file="${AV1_OBU_SEC5_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}" --obu
+      encode_yuv_raw_input_av1 "${file}" --obu || return 1
     fi
     aomdec "${file}" --summary --noblit
   fi
@@ -130,18 +146,52 @@ aomdec_av1_webm() {
      [ "$(webm_io_available)" = "yes" ]; then
     local file="${AV1_WEBM_FILE}"
     if [ ! -e "${file}" ]; then
-      encode_yuv_raw_input_av1 "${file}"
+      encode_yuv_raw_input_av1 "${file}" || return 1
     fi
     aomdec "${AV1_WEBM_FILE}" --summary --noblit
   fi
 }
 
+aomdec_av1_monochrome_yuv() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local input="$1"
+    local basename="$(basename "${input}")"
+    local output="${basename}-%wx%h-%4.i420"
+    local md5file="${AOM_TEST_OUTPUT_DIR}/${basename}.md5"
+    local decoder="$(aom_tool_path aomdec)"
+    # Note aomdec() is not used to avoid ${devnull} which may also redirect
+    # stdout.
+    eval "${AOM_TEST_PREFIX}" "${decoder}" --md5 --i420 \
+      -o "${output}" "${input}" ">" "${md5file}" 2>&1 || return 1
+    diff "${1}.md5" "${md5file}"
+  fi
+}
+
+aomdec_av1_monochrome_yuv_8bit() {
+  aomdec_av1_monochrome_yuv "${AV1_MONOCHROME_B8}"
+}
+
+aomdec_av1_monochrome_yuv_10bit() {
+  aomdec_av1_monochrome_yuv "${AV1_MONOCHROME_B10}"
+}
+
 aomdec_tests="aomdec_av1_ivf
-              aomdec_av1_ivf_error_resilient
               aomdec_av1_ivf_multithread
+              aomdec_av1_ivf_multithread_row_mt
               aomdec_aom_ivf_pipe_input
-              aomdec_av1_obu_annexb
-              aomdec_av1_obu_section5
-              aomdec_av1_webm"
+              aomdec_av1_monochrome_yuv_8bit"
+
+if [ ! "$(realtime_only_build)" = "yes" ]; then
+  aomdec_tests="${aomdec_tests}
+                aomdec_av1_ivf_error_resilient
+                aomdec_av1_obu_annexb
+                aomdec_av1_obu_section5
+                aomdec_av1_webm"
+fi
+
+if [ "$(highbitdepth_available)" = "yes" ]; then
+  aomdec_tests="${aomdec_tests}
+                aomdec_av1_monochrome_yuv_10bit"
+fi
 
 run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/media/libaom/src/test/aomenc.sh b/media/libaom/src/test/aomenc.sh
index b030397a30..ed98313300 100644..100755
--- a/media/libaom/src/test/aomenc.sh
+++ b/media/libaom/src/test/aomenc.sh
@@ -89,7 +89,44 @@ aomenc_av1_ivf() {
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --ivf \
-      --output="${output}"
+      --output="${output}" || return 1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+aomenc_av1_ivf_rt() {
+  if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+    local output="${AV1_IVF_FILE}"
+    if [ -e "${AV1_IVF_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+    fi
+    aomenc $(yuv_raw_input) \
+      $(aomenc_encode_test_rt_params) \
+      --ivf \
+      --output="${output}" || return 1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+aomenc_av1_ivf_use_16bit_internal() {
+  if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+    local output="${AV1_IVF_FILE}"
+    if [ -e "${AV1_IVF_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test_16bit.ivf"
+    fi
+    aomenc $(yuv_raw_input) \
+      $(aomenc_encode_test_fast_params) \
+      --ivf \
+      --use-16bit-internal \
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -108,7 +145,7 @@ aomenc_av1_obu_annexb() {
       $(aomenc_encode_test_fast_params) \
       --obu \
       --annexb=1 \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -126,7 +163,7 @@ aomenc_av1_obu_section5() {
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --obu \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -144,7 +181,7 @@ aomenc_av1_webm() {
     fi
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -160,7 +197,7 @@ aomenc_av1_webm_1pass() {
     aomenc $(yuv_raw_input) \
       $(aomenc_encode_test_fast_params) \
       --passes=1 \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -176,7 +213,7 @@ aomenc_av1_ivf_lossless() {
       $(aomenc_encode_test_fast_params) \
       --ivf \
       --output="${output}" \
-      --lossless=1
+      --lossless=1 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -193,7 +230,7 @@ aomenc_av1_ivf_minq0_maxq0() {
       --ivf \
       --output="${output}" \
       --min-q=0 \
-      --max-q=0
+      --max-q=0 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -212,7 +249,7 @@ aomenc_av1_webm_lag5_frames10() {
       $(aomenc_encode_test_fast_params) \
       --limit=${lag_total_frames} \
       --lag-in-frames=${lag_frames} \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -228,7 +265,7 @@ aomenc_av1_webm_non_square_par() {
     local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
     aomenc $(y4m_input_non_square_par) \
       $(aomenc_encode_test_fast_params) \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -245,7 +282,7 @@ aomenc_av1_webm_cdf_update_mode() {
       aomenc $(yuv_raw_input) \
         $(aomenc_encode_test_fast_params) \
         --cdf-update-mode=${mode} \
-        --output="${output}"
+        --output="${output}" || return 1
 
       if [ ! -e "${output}" ]; then
         elog "Output file does not exist."
@@ -255,15 +292,21 @@ aomenc_av1_webm_cdf_update_mode() {
   fi
 }
 
-aomenc_tests="aomenc_av1_ivf
-              aomenc_av1_obu_annexb
-              aomenc_av1_obu_section5
-              aomenc_av1_webm
-              aomenc_av1_webm_1pass
-              aomenc_av1_ivf_lossless
-              aomenc_av1_ivf_minq0_maxq0
-              aomenc_av1_webm_lag5_frames10
-              aomenc_av1_webm_non_square_par
-              aomenc_av1_webm_cdf_update_mode"
+if [ "$(realtime_only_build)" = "yes" ]; then
+  aomenc_tests="aomenc_av1_ivf_rt"
+else
+  aomenc_tests="aomenc_av1_ivf
+                aomenc_av1_ivf_rt
+                aomenc_av1_obu_annexb
+                aomenc_av1_obu_section5
+                aomenc_av1_webm
+                aomenc_av1_webm_1pass
+                aomenc_av1_ivf_lossless
+                aomenc_av1_ivf_minq0_maxq0
+                aomenc_av1_ivf_use_16bit_internal
+                aomenc_av1_webm_lag5_frames10
+                aomenc_av1_webm_non_square_par
+                aomenc_av1_webm_cdf_update_mode"
+fi
 
 run_tests aomenc_verify_environment "${aomenc_tests}"
diff --git a/media/libaom/src/test/aq_segment_test.cc b/media/libaom/src/test/aq_segment_test.cc
index 83bfdb6701..b4a8b612bf 100644
--- a/media/libaom/src/test/aq_segment_test.cc
+++ b/media/libaom/src/test/aq_segment_test.cc
@@ -19,6 +19,13 @@
 
 namespace {
 
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
 class AqSegmentTest
     : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
                                                  int>,
@@ -28,8 +35,7 @@ class AqSegmentTest
   virtual ~AqSegmentTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }
@@ -41,6 +47,11 @@ class AqSegmentTest
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
       encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+      if (mode_ == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      }
     }
   }
 
@@ -70,10 +81,7 @@ class AqSegmentTest
 // 3-cyclic_refresh_aq) encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
 
-class AqSegmentTestLarge : public AqSegmentTest {};
-
-TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
-
+#if !CONFIG_REALTIME_ONLY
 // Validate that this delta q mode
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
@@ -85,13 +93,18 @@ TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
+#endif
+
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, ::testing::ValuesIn(kTestModeParams),
+                           ::testing::Range(5, 9), ::testing::Range(0, 4));
+
+#if !CONFIG_REALTIME_ONLY
+class AqSegmentTestLarge : public AqSegmentTest {};
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
 
-AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(5, 9), ::testing::Range(0, 4));
-AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(3, 5), ::testing::Range(0, 4));
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood),
+                           ::testing::Range(3, 5), ::testing::Range(0, 4));
+#endif
 }  // namespace
diff --git a/media/libaom/src/test/arf_freq_test.cc b/media/libaom/src/test/arf_freq_test.cc
index 0780cd712d..63ccdfc261 100644
--- a/media/libaom/src/test/arf_freq_test.cc
+++ b/media/libaom/src/test/arf_freq_test.cc
@@ -56,9 +56,13 @@ const TestVideoParam kTestVectors[] = {
 };
 
 const TestEncodeParam kEncodeVectors[] = {
-  { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 },
-  { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 },
-  { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 },
+#if CONFIG_REALTIME_ONLY
+  { ::libaom_test::kRealTime, 5 },
+#else
+  { ::libaom_test::kRealTime, 5 },    { ::libaom_test::kOnePassGood, 2 },
+  { ::libaom_test::kOnePassGood, 5 }, { ::libaom_test::kTwoPassGood, 1 },
+  { ::libaom_test::kTwoPassGood, 2 }, { ::libaom_test::kTwoPassGood, 5 },
+#endif
 };
 
 const int kMinArfVectors[] = {
@@ -67,14 +71,6 @@ const int kMinArfVectors[] = {
   0, 4, 8, 12, 15
 };
 
-int is_extension_y4m(const char *filename) {
-  const char *dot = strrchr(filename, '.');
-  if (!dot || dot == filename)
-    return 0;
-  else
-    return !strcmp(dot, ".y4m");
-}
-
 class ArfFreqTestLarge
     : public ::libaom_test::CodecTestWith3Params<TestVideoParam,
                                                  TestEncodeParam, int>,
@@ -87,14 +83,10 @@ class ArfFreqTestLarge
   virtual ~ArfFreqTestLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(test_encode_param_.mode);
+    InitializeConfig(test_encode_param_.mode);
     if (test_encode_param_.mode != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 25;
-      cfg_.rc_end_usage = AOM_VBR;
     } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
       cfg_.rc_buf_sz = 1000;
       cfg_.rc_buf_initial_sz = 500;
       cfg_.rc_buf_optimal_sz = 600;
@@ -198,6 +190,7 @@ TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
         test_video_param_.framerate_num, test_video_param_.framerate_den, 0,
         kFrames));
   }
+  ASSERT_NE(video, nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
   const int min_run = GetMinVisibleRun();
diff --git a/media/libaom/src/test/av1_convolve_2d_test.cc b/media/libaom/src/test/av1_convolve_2d_test.cc
deleted file mode 100644
index 50a58f06de..0000000000
--- a/media/libaom/src/test/av1_convolve_2d_test.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/av1_convolve_2d_test_util.h"
-
-using libaom_test::ACMRandom;
-using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
-using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
-#if CONFIG_AV1_HIGHBITDEPTH
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
-#endif
-using std::make_tuple;
-using std::tuple;
-
-namespace {
-
-TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-
-INSTANTIATE_TEST_SUITE_P(
-    C_COPY, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    C_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0));
-INSTANTIATE_TEST_SUITE_P(
-    C_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
-INSTANTIATE_TEST_SUITE_P(
-    C, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1));
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_sse2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1));
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(
-    AVX2_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON_X, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON_Y, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1Convolve2DSrTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1));
-
-INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1Convolve2DSrTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_convolve_2d_copy_sr_neon, 0, 0));
-#endif  // HAVE_NEON
-
-TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_c, 0, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    C_X, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(
-    C_Y, AV1JntConvolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_sse2, 1, 1));
-
-INSTANTIATE_TEST_SUITE_P(SSE2_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_sse2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(SSE2_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_sse2, 0, 1));
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_ssse3, 1, 1));
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_avx2, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_avx2, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_avx2, 1, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSSE3
-#endif  // HAVE_SSE2
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
-
-INSTANTIATE_TEST_SUITE_P(NEON, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_2d_neon, 1, 1));
-INSTANTIATE_TEST_SUITE_P(NEON_X, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_x_neon, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(NEON_Y, AV1JntConvolve2DTest,
-                         libaom_test::AV1Convolve2D::BuildParams(
-                             av1_dist_wtd_convolve_y_neon, 0, 1));
-#endif  // HAVE_NEON
-
-#if CONFIG_AV1_HIGHBITDEPTH
-TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
-TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(1));
-}
-
-INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_c, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_c, 0, 0));
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_sr_ssse3, 1, 1));
-INSTANTIATE_TEST_SUITE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_ssse3, 1, 0));
-INSTANTIATE_TEST_SUITE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_ssse3, 0, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_sr_avx2, 1, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_x_sr_avx2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_y_sr_avx2, 0, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSSE3
-#endif  // HAVE_SSE2
-TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) {
-  RunCheckOutput(GET_PARAM(1));
-}
-
-TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(1));
-}
-
-INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_c, 1, 0));
-
-INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_c, 0, 1));
-
-INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0,
-                             0));
-INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
-INSTANTIATE_TEST_SUITE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
-INSTANTIATE_TEST_SUITE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
-INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
-INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
-                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                             av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
-#endif  // HAVE_AVX2
-#endif  // HAVE_SSE4_1
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-}  // namespace
diff --git a/media/libaom/src/test/av1_convolve_2d_test_util.cc b/media/libaom/src/test/av1_convolve_2d_test_util.cc
deleted file mode 100644
index 6f103d3f65..0000000000
--- a/media/libaom/src/test/av1_convolve_2d_test_util.cc
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "test/av1_convolve_2d_test_util.h"
-
-#include "aom_ports/aom_timer.h"
-#include "av1/common/common_data.h"
-#include "av1/common/convolve.h"
-
-using std::make_tuple;
-using std::tuple;
-
-namespace libaom_test {
-
-const int kMaxSize = 128 + 32;  // padding
-namespace AV1Convolve2D {
-
-::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter, int has_subx, int has_suby) {
-  return ::testing::Combine(::testing::Values(filter),
-                            ::testing::Values(has_subx),
-                            ::testing::Values(has_suby),
-                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-
-AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {}
-void AV1Convolve2DSrTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-  int hfilter, vfilter, subx, suby;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = static_cast<uint8_t>(rnd_.Rand31());
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
-           ++vfilter) {
-        const InterpFilterParams *filter_params_x =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                         out_w);
-        const InterpFilterParams *filter_params_y =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                         out_h);
-        for (int do_average = 0; do_average < 1; ++do_average) {
-          ConvolveParams conv_params1 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-          ConvolveParams conv_params2 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-
-          const int subx_range = has_subx ? 16 : 1;
-          const int suby_range = has_suby ? 16 : 1;
-          for (subx = 0; subx < subx_range; ++subx) {
-            for (suby = 0; suby < suby_range; ++suby) {
-              // Choose random locations within the source block
-              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-              av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output,
-                                   MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                                   filter_params_y, subx, suby, &conv_params1);
-              test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                        filter_params_y, subx, suby, &conv_params2);
-
-              if (memcmp(output, output2, sizeof(output))) {
-                for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-
-  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
-  int subx = 0, suby = 0;
-
-  const int do_average = 0;
-  ConvolveParams conv_params2 =
-      get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    const int num_loops = 1000000000 / (out_w + out_h);
-
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                     out_w);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                     out_h);
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                filter_params_y, subx, suby, &conv_params2);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-           out_h, 1000.0 * elapsed_time / num_loops);
-  }
-}
-
-AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {}
-void AV1JntConvolve2DTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-  int hfilter, vfilter, subx, suby;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output1[i] = output2[i] = rnd_.Rand16();
-    output8_1[i] = output8_2[i] = rnd_.Rand8();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-    for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      const InterpFilterParams *filter_params_x =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                       out_w);
-      const InterpFilterParams *filter_params_y =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                       out_h);
-      for (int do_average = 0; do_average <= 1; ++do_average) {
-        ConvolveParams conv_params1 =
-            get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8);
-        ConvolveParams conv_params2 =
-            get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
-
-        // Test special case where dist_wtd_comp_avg is not used
-        conv_params1.use_dist_wtd_comp_avg = 0;
-        conv_params2.use_dist_wtd_comp_avg = 0;
-
-        const int subx_range = has_subx ? 16 : 1;
-        const int suby_range = has_suby ? 16 : 1;
-        for (subx = 0; subx < subx_range; ++subx) {
-          for (suby = 0; suby < suby_range; ++suby) {
-            // Choose random locations within the source block
-            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                       output8_1, MAX_SB_SIZE, out_w, out_h,
-                                       filter_params_x, filter_params_y, subx,
-                                       suby, &conv_params1);
-            test_impl(input + offset_r * w + offset_c, w, output8_2,
-                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                      filter_params_y, subx, suby, &conv_params2);
-
-            for (int i = 0; i < out_h; ++i) {
-              for (int j = 0; j < out_w; ++j) {
-                int idx = i * MAX_SB_SIZE + j;
-                ASSERT_EQ(output1[idx], output2[idx])
-                    << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n"
-                    << out_w << "x" << out_h << " Pixel mismatch at index "
-                    << idx << " = (" << i << ", " << j
-                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
-              }
-            }
-
-            if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
-              for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                  int idx = i * MAX_SB_SIZE + j;
-                  ASSERT_EQ(output8_1[idx], output8_2[idx])
-                      << out_w << "x" << out_h << " Pixel mismatch at index "
-                      << idx << " = (" << i << ", " << j
-                      << "), sub pixel offset = (" << suby << ", " << subx
-                      << ")";
-                }
-              }
-            }
-          }
-        }
-
-        // Test different combination of fwd and bck offset weights
-        for (int k = 0; k < 2; ++k) {
-          for (int l = 0; l < 4; ++l) {
-            conv_params1.use_dist_wtd_comp_avg = 1;
-            conv_params2.use_dist_wtd_comp_avg = 1;
-            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
-            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
-
-            for (subx = 0; subx < subx_range; ++subx) {
-              for (suby = 0; suby < suby_range; ++suby) {
-                // Choose random locations within the source block
-                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
-                                           output8_1, MAX_SB_SIZE, out_w, out_h,
-                                           filter_params_x, filter_params_y,
-                                           subx, suby, &conv_params1);
-                test_impl(input + offset_r * w + offset_c, w, output8_2,
-                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                          filter_params_y, subx, suby, &conv_params2);
-
-                for (int i = 0; i < out_h; ++i) {
-                  for (int j = 0; j < out_w; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output1[idx], output2[idx])
-                        << "Mismatch at unit tests for "
-                           "av1_dist_wtd_convolve_2d\n"
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-                if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
-                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                      int idx = i * MAX_SB_SIZE + j;
-                      ASSERT_EQ(output8_1[idx], output8_2[idx])
-                          << out_w << "x" << out_h
-                          << " Pixel mismatch at index " << idx << " = (" << i
-                          << ", " << j << "), sub pixel offset = (" << suby
-                          << ", " << subx << ")";
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int has_subx = GET_PARAM(1);
-  const int has_suby = GET_PARAM(2);
-  const int block_idx = GET_PARAM(3);
-
-  int subx = 0, suby = 0;
-  uint8_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]);
-  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output[i] = rnd_.Rand16();
-    output8[i] = rnd_.Rand8();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  const int num_loops = 1000000000 / (out_w + out_h);
-  const int do_average = 0;
-
-  const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                   out_w);
-  const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                   out_h);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
-
-  conv_params.use_dist_wtd_comp_avg = 0;
-
-  // Choose random locations within the source block
-  const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-  const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w,
-              out_h, filter_params_x, filter_params_y, subx, suby,
-              &conv_params);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-         out_h, 1000.0 * elapsed_time / num_loops);
-}
-}  // namespace AV1Convolve2D
-
-#if CONFIG_AV1_HIGHBITDEPTH
-namespace AV1HighbdConvolve2D {
-::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter, int has_subx, int has_suby) {
-  return ::testing::Combine(
-      ::testing::Range(8, 13, 2), ::testing::Values(filter),
-      ::testing::Values(has_subx), ::testing::Values(has_suby),
-      ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
-}
-
-AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {}
-void AV1HighbdConvolve2DSrTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HighbdConvolve2DSrTest::RunSpeedTest(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-
-  hfilter = EIGHTTAP_REGULAR;
-  vfilter = EIGHTTAP_REGULAR;
-  int do_average = 0;
-
-  const int offset_r = 3;
-  const int offset_c = 3;
-  subx = 0;
-  suby = 0;
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    const int num_loops = 1000000000 / (out_w + out_h);
-
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                     out_w);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                     out_h);
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
-                out_h, filter_params_x, filter_params_y, subx, suby,
-                &conv_params, bd);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
-           out_h, 1000.0 * elapsed_time / num_loops);
-  }
-}
-
-void AV1HighbdConvolve2DSrTest::RunCheckOutput(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i)
-    output[i] = output2[i] = static_cast<int16_t>(rnd_.Rand31());
-
-  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
-  const int num_sizes =
-      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
-                                                                           : 1;
-  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
-    const int out_w = block_size_wide[block_idx] >> shift;
-    const int out_h = block_size_high[block_idx] >> shift;
-    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
-           ++vfilter) {
-        const InterpFilterParams *filter_params_x =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                         out_w);
-        const InterpFilterParams *filter_params_y =
-            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                         out_h);
-        for (int do_average = 0; do_average < 1; ++do_average) {
-          ConvolveParams conv_params1 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-          ConvolveParams conv_params2 =
-              get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
-
-          const int subx_range = has_subx ? 16 : 1;
-          const int suby_range = has_suby ? 16 : 1;
-          for (subx = 0; subx < subx_range; ++subx) {
-            for (suby = 0; suby < suby_range; ++suby) {
-              // Choose random locations within the source block
-              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-              av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w,
-                                          output, MAX_SB_SIZE, out_w, out_h,
-                                          filter_params_x, filter_params_y,
-                                          subx, suby, &conv_params1, bd);
-              test_impl(input + offset_r * w + offset_c, w, output2,
-                        MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                        filter_params_y, subx, suby, &conv_params2, bd);
-
-              if (memcmp(output, output2, sizeof(output))) {
-                for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {}
-void AV1HighbdJntConvolve2DTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HighbdJntConvolve2DTest::RunSpeedTest(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16();
-  hfilter = EIGHTTAP_REGULAR;
-  vfilter = EIGHTTAP_REGULAR;
-  int do_average = 0;
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-
-  const InterpFilterParams *filter_params_x =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                   out_w);
-  const InterpFilterParams *filter_params_y =
-      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                   out_h);
-
-  ConvolveParams conv_params =
-      get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
-
-  // Test special case where dist_wtd_comp_avg is not used
-  conv_params.use_dist_wtd_comp_avg = 0;
-
-  subx = 0;
-  suby = 0;
-  // Choose random locations within the source block
-  const int offset_r = 3;
-  const int offset_c = 3;
-
-  const int num_loops = 1000000000 / (out_w + out_h);
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w,
-              out_h, filter_params_x, filter_params_y, subx, suby, &conv_params,
-              bd);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h,
-         1000.0 * elapsed_time / num_loops);
-}
-
-void AV1HighbdJntConvolve2DTest::RunCheckOutput(
-    highbd_convolve_2d_func test_impl) {
-  const int w = kMaxSize, h = kMaxSize;
-  const int bd = GET_PARAM(0);
-  const int has_subx = GET_PARAM(2);
-  const int has_suby = GET_PARAM(3);
-  const int block_idx = GET_PARAM(4);
-  int hfilter, vfilter, subx, suby;
-  uint16_t input[kMaxSize * kMaxSize];
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]);
-
-  for (int i = 0; i < h; ++i)
-    for (int j = 0; j < w; ++j)
-      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    output1[i] = output2[i] = rnd_.Rand16();
-    output16_1[i] = output16_2[i] = rnd_.Rand16();
-  }
-
-  const int out_w = block_size_wide[block_idx];
-  const int out_h = block_size_high[block_idx];
-  for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
-    for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
-      const InterpFilterParams *filter_params_x =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
-                                                       out_w);
-      const InterpFilterParams *filter_params_y =
-          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
-                                                       out_h);
-      for (int do_average = 0; do_average <= 1; ++do_average) {
-        ConvolveParams conv_params1 = get_conv_params_no_round(
-            do_average, 0, output1, MAX_SB_SIZE, 1, bd);
-        ConvolveParams conv_params2 = get_conv_params_no_round(
-            do_average, 0, output2, MAX_SB_SIZE, 1, bd);
-
-        // Test special case where dist_wtd_comp_avg is not used
-        conv_params1.use_dist_wtd_comp_avg = 0;
-        conv_params2.use_dist_wtd_comp_avg = 0;
-
-        const int subx_range = has_subx ? 16 : 1;
-        const int suby_range = has_suby ? 16 : 1;
-        for (subx = 0; subx < subx_range; ++subx) {
-          for (suby = 0; suby < suby_range; ++suby) {
-            // Choose random locations within the source block
-            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_highbd_dist_wtd_convolve_2d_c(
-                input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
-                out_w, out_h, filter_params_x, filter_params_y, subx, suby,
-                &conv_params1, bd);
-            test_impl(input + offset_r * w + offset_c, w, output16_2,
-                      MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                      filter_params_y, subx, suby, &conv_params2, bd);
-
-            for (int i = 0; i < out_h; ++i) {
-              for (int j = 0; j < out_w; ++j) {
-                int idx = i * MAX_SB_SIZE + j;
-                ASSERT_EQ(output1[idx], output2[idx])
-                    << out_w << "x" << out_h << " Pixel mismatch at index "
-                    << idx << " = (" << i << ", " << j
-                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
-              }
-            }
-
-            if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
-              for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                  int idx = i * MAX_SB_SIZE + j;
-                  ASSERT_EQ(output16_1[idx], output16_2[idx])
-                      << out_w << "x" << out_h << " Pixel mismatch at index "
-                      << idx << " = (" << i << ", " << j
-                      << "), sub pixel offset = (" << suby << ", " << subx
-                      << ")";
-                }
-              }
-            }
-          }
-        }
-
-        // Test different combination of fwd and bck offset weights
-        for (int k = 0; k < 2; ++k) {
-          for (int l = 0; l < 4; ++l) {
-            conv_params1.use_dist_wtd_comp_avg = 1;
-            conv_params2.use_dist_wtd_comp_avg = 1;
-            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
-            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
-            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
-
-            const int subx_range = has_subx ? 16 : 1;
-            const int suby_range = has_suby ? 16 : 1;
-            for (subx = 0; subx < subx_range; ++subx) {
-              for (suby = 0; suby < suby_range; ++suby) {
-                // Choose random locations within the source block
-                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-                av1_highbd_dist_wtd_convolve_2d_c(
-                    input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
-                    out_w, out_h, filter_params_x, filter_params_y, subx, suby,
-                    &conv_params1, bd);
-                test_impl(input + offset_r * w + offset_c, w, output16_2,
-                          MAX_SB_SIZE, out_w, out_h, filter_params_x,
-                          filter_params_y, subx, suby, &conv_params2, bd);
-
-                for (int i = 0; i < out_h; ++i) {
-                  for (int j = 0; j < out_w; ++j) {
-                    int idx = i * MAX_SB_SIZE + j;
-                    ASSERT_EQ(output1[idx], output2[idx])
-                        << out_w << "x" << out_h << " Pixel mismatch at index "
-                        << idx << " = (" << i << ", " << j
-                        << "), sub pixel offset = (" << suby << ", " << subx
-                        << ")";
-                  }
-                }
-
-                if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
-                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
-                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
-                      int idx = i * MAX_SB_SIZE + j;
-                      ASSERT_EQ(output16_1[idx], output16_2[idx])
-                          << out_w << "x" << out_h
-                          << " Pixel mismatch at index " << idx << " = (" << i
-                          << ", " << j << "), sub pixel offset = (" << suby
-                          << ", " << subx << ")";
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-}  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-}  // namespace libaom_test
diff --git a/media/libaom/src/test/av1_convolve_2d_test_util.h b/media/libaom/src/test/av1_convolve_2d_test_util.h
deleted file mode 100644
index 3c19cfed32..0000000000
--- a/media/libaom/src/test/av1_convolve_2d_test_util.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
-#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
-
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-
-namespace libaom_test {
-
-namespace AV1Convolve2D {
-
-typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride, int w, int h,
-                                 const InterpFilterParams *filter_params_x,
-                                 const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_qn, const int subpel_y_qn,
-                                 ConvolveParams *conv_params);
-
-typedef std::tuple<convolve_2d_func, int, int, BLOCK_SIZE> Convolve2DParam;
-
-::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter, int subx_exist, int suby_exist);
-
-class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
- public:
-  virtual ~AV1Convolve2DSrTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(convolve_2d_func test_impl);
-  void RunSpeedTest(convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-
-class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
- public:
-  virtual ~AV1JntConvolve2DTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(convolve_2d_func test_impl);
-  void RunSpeedTest(convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-}  // namespace AV1Convolve2D
-
-#if CONFIG_AV1_HIGHBITDEPTH
-namespace AV1HighbdConvolve2D {
-typedef void (*highbd_convolve_2d_func)(
-    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
-
-typedef std::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
-    HighbdConvolve2DParam;
-
-::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter, int subx_exist, int suby_exist);
-
-class AV1HighbdConvolve2DSrTest
-    : public ::testing::TestWithParam<HighbdConvolve2DParam> {
- public:
-  virtual ~AV1HighbdConvolve2DSrTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(highbd_convolve_2d_func test_impl);
-  void RunSpeedTest(highbd_convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-
-class AV1HighbdJntConvolve2DTest
-    : public ::testing::TestWithParam<HighbdConvolve2DParam> {
- public:
-  virtual ~AV1HighbdJntConvolve2DTest();
-  virtual void SetUp();
-
-  virtual void TearDown();
-
- protected:
-  void RunCheckOutput(highbd_convolve_2d_func test_impl);
-  void RunSpeedTest(highbd_convolve_2d_func test_impl);
-
-  libaom_test::ACMRandom rnd_;
-};
-}  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-}  // namespace libaom_test
-
-#endif  // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
diff --git a/media/libaom/src/test/av1_convolve_scale_test.cc b/media/libaom/src/test/av1_convolve_scale_test.cc
index ffd0bab333..6b08e7a4d3 100644
--- a/media/libaom/src/test/av1_convolve_scale_test.cc
+++ b/media/libaom/src/test/av1_convolve_scale_test.cc
@@ -18,7 +18,6 @@
 
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -104,7 +103,6 @@ void TestFilter::set(NTaps ntaps, bool backwards) {
   params_.filter_ptr = &coeffs_[0];
   params_.taps = n;
   // These are ignored by the functions being tested. Set them to whatever.
-  params_.subpel_shifts = SUBPEL_SHIFTS;
   params_.interp_filter = EIGHTTAP_REGULAR;
 }
 
@@ -259,7 +257,7 @@ class ConvolveScaleTestBase : public ::testing::Test {
  public:
   ConvolveScaleTestBase() : image_(NULL) {}
   virtual ~ConvolveScaleTestBase() { delete image_; }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
@@ -284,6 +282,7 @@ class ConvolveScaleTestBase : public ::testing::Test {
 
     delete image_;
     image_ = new TestImage<SrcPixel>(width_, height_, bd_);
+    ASSERT_NE(image_, nullptr);
   }
 
   void SetConvParamOffset(int i, int j, int is_compound, int do_average,
@@ -294,8 +293,8 @@ class ConvolveScaleTestBase : public ::testing::Test {
       convolve_params_.do_average = do_average;
     } else {
       convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
-      convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
-      convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
+      convolve_params_.fwd_offset = quant_dist_lookup_table[j][i];
+      convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i];
       convolve_params_.is_compound = is_compound;
       convolve_params_.do_average = do_average;
     }
diff --git a/media/libaom/src/test/av1_convolve_test.cc b/media/libaom/src/test/av1_convolve_test.cc
new file mode 100644
index 0000000000..9ebe932702
--- /dev/null
+++ b/media/libaom/src/test/av1_convolve_test.cc
@@ -0,0 +1,1805 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+#include <set>
+#include <vector>
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "aom_ports/aom_timer.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// TODO(any): Remove following INTERP_FILTERS_ALL define, so that 12-tap filter
+// is tested once 12-tap filter SIMD is done.
+#undef INTERP_FILTERS_ALL
+#define INTERP_FILTERS_ALL 4
+
+// All single reference convolve tests are parameterized on block size,
+// bit-depth, and function to test.
+//
+// Note that parameterizing on these variables (and not other parameters) is
+// a conscious decision - Jenkins needs some degree of parallelization to run
+// the tests within the time limit, but if the number of parameters increases
+// too much, the gtest framework does not handle it well (increased overhead per
+// test, huge amount of output to stdout, etc.).
+//
+// Also note that the test suites must be named with the architecture, e.g.,
+// C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests
+// that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86
+// binaries) and will disable tests using a filter like
+// --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the
+// testing infrastructure will not selectively filter them properly.
+class BlockSize {
+ public:
+  BlockSize(int w, int h) : width_(w), height_(h) {}
+
+  int Width() const { return width_; }
+  int Height() const { return height_; }
+
+  bool operator<(const BlockSize &other) const {
+    if (Width() == other.Width()) {
+      return Height() < other.Height();
+    }
+    return Width() < other.Width();
+  }
+
+  bool operator==(const BlockSize &other) const {
+    return Width() == other.Width() && Height() == other.Height();
+  }
+
+ private:
+  int width_;
+  int height_;
+};
+
+// Block size / bit depth / test function used to parameterize the tests.
+template <typename T>
+class TestParam {
+ public:
+  TestParam(const BlockSize &block, int bd, T test_func)
+      : block_(block), bd_(bd), test_func_(test_func) {}
+
+  const BlockSize &Block() const { return block_; }
+  int BitDepth() const { return bd_; }
+  T TestFunction() const { return test_func_; }
+
+  bool operator==(const TestParam &other) const {
+    return Block() == other.Block() && BitDepth() == other.BitDepth() &&
+           TestFunction() == other.TestFunction();
+  }
+
+ private:
+  BlockSize block_;
+  int bd_;
+  T test_func_;
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
+  return os << "TestParam { width:" << test_arg.Block().Width()
+            << " height:" << test_arg.Block().Height()
+            << " bd:" << test_arg.BitDepth() << " }";
+}
+
+// Generate the list of all block widths / heights that need to be tested,
+// includes chroma and luma sizes, for the given bit-depths. The test
+// function is the same for all generated parameters.
+template <typename T>
+std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths,
+                                        T test_func) {
+  std::set<BlockSize> sizes;
+  for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) {
+    const int w = block_size_wide[b];
+    const int h = block_size_high[b];
+    sizes.insert(BlockSize(w, h));
+    // Add in smaller chroma sizes as well.
+    if (w == 4 || h == 4) {
+      sizes.insert(BlockSize(w / 2, h / 2));
+    }
+  }
+  std::vector<TestParam<T>> result;
+  for (const BlockSize &block : sizes) {
+    for (int bd : bit_depths) {
+      result.push_back(TestParam<T>(block, bd, test_func));
+    }
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<TestParam<T>> GetLowbdTestParams(T test_func) {
+  return GetTestParams({ 8 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetLowbdTestParams(test_func));
+}
+
+// Test the test-parameters generators work as expected.
+class AV1ConvolveParametersTest : public ::testing::Test {};
+
+TEST_F(AV1ConvolveParametersTest, GetLowbdTestParams) {
+  auto v = GetLowbdTestParams(av1_convolve_x_sr_c);
+  ASSERT_EQ(27U, v.size());
+  for (const auto &p : v) {
+    ASSERT_EQ(8, p.BitDepth());
+    // Needed (instead of ASSERT_EQ(...) since gtest does not
+    // have built in printing for arbitrary functions, which
+    // causes a compilation error.
+    bool same_fn = av1_convolve_x_sr_c == p.TestFunction();
+    ASSERT_TRUE(same_fn);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+template <typename T>
+std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
+  return GetTestParams({ 10, 12 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetHighbdTestParams(test_func));
+}
+
+TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
+  auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
+  ASSERT_EQ(54U, v.size());
+  int num_10 = 0;
+  int num_12 = 0;
+  for (const auto &p : v) {
+    ASSERT_TRUE(p.BitDepth() == 10 || p.BitDepth() == 12);
+    bool same_fn = av1_highbd_convolve_x_sr_c == p.TestFunction();
+    ASSERT_TRUE(same_fn);
+    if (p.BitDepth() == 10) {
+      ++num_10;
+    } else {
+      ++num_12;
+    }
+  }
+  ASSERT_EQ(num_10, num_12);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// AV1ConvolveTest is the base class that all convolve tests should derive from.
+// It provides storage/methods for generating randomized buffers for both
+// low bit-depth and high bit-depth, and setup/teardown methods for clearing
+// system state. Implementors can get the bit-depth / block-size /
+// test function by calling GetParam().
+template <typename T>
+class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
+ public:
+  ~AV1ConvolveTest() override { TearDown(); }
+
+  void SetUp() override {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
+
+  void TearDown() override {}
+
+  // Randomizes the 8-bit input buffer and returns a pointer to it. Note that
+  // the pointer is safe to use with an 8-tap filter. The stride can range
+  // from width to (width + kPadding). Also note that the pointer is to the
+  // same memory location.
+  static constexpr int kInputPadding = 12;
+
+  // Get a pointer to a buffer with stride == width. Note that we must have
+  // the test param passed in explicitly -- the gtest framework does not
+  // support calling GetParam() within a templatized class.
+  // Note that FirstRandomInput8 always returns the same pointer -- if two
+  // inputs are needed, also use SecondRandomInput8.
+  const uint8_t *FirstRandomInput8(const TestParam<T> &param) {
+    // Note we can't call GetParam() directly -- gtest does not support
+    // this for parameterized types.
+    return RandomInput8(input8_1_, param);
+  }
+
+  const uint8_t *SecondRandomInput8(const TestParam<T> &param) {
+    return RandomInput8(input8_2_, param);
+  }
+
+  // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some
+  // of the instrinsics assume that the stride is also a multiple of 32.
+  // To satisfy these constraints and also remain simple, output buffer strides
+  // are assumed MAX_SB_SIZE.
+  static constexpr int kOutputStride = MAX_SB_SIZE;
+
+  // Check that two 8-bit output buffers are identical.
+  void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
+                            int height) {
+    ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
+    for (int j = 0; j < height; ++j) {
+      if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
+        p1 += kOutputStride;
+        p2 += kOutputStride;
+        continue;
+      }
+      for (int i = 0; i < width; ++i) {
+        ASSERT_EQ(p1[i], p2[i])
+            << width << "x" << height << " Pixel mismatch at (" << i << ", "
+            << j << ")";
+      }
+    }
+  }
+
+  // Check that two 16-bit output buffers are identical.
+  void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
+                            int height) {
+    ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
+    for (int j = 0; j < height; ++j) {
+      if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
+        p1 += kOutputStride;
+        p2 += kOutputStride;
+        continue;
+      }
+      for (int i = 0; i < width; ++i) {
+        ASSERT_EQ(p1[i], p2[i])
+            << width << "x" << height << " Pixel mismatch at (" << i << ", "
+            << j << ")";
+      }
+    }
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // Note that the randomized values are capped by bit-depth.
+  const uint16_t *FirstRandomInput16(const TestParam<T> &param) {
+    return RandomInput16(input16_1_, param);
+  }
+
+  const uint16_t *SecondRandomInput16(const TestParam<T> &param) {
+    return RandomInput16(input16_2_, param);
+  }
+#endif
+
+ private:
+  const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> &param) {
+    EXPECT_EQ(8, param.BitDepth());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+    const int padded_width = param.Block().Width() + kInputPadding;
+    const int padded_height = param.Block().Height() + kInputPadding;
+    Randomize(p, padded_width * padded_height);
+    return p + (kInputPadding / 2) * padded_width + kInputPadding / 2;
+  }
+
+  void Randomize(uint8_t *p, int size) {
+    for (int i = 0; i < size; ++i) {
+      p[i] = rnd_.Rand8();
+    }
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint16_t *RandomInput16(uint16_t *p, const TestParam<T> &param) {
+    // Check that this is only called with high bit-depths.
+    EXPECT_TRUE(param.BitDepth() == 10 || param.BitDepth() == 12);
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+    EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+    const int padded_width = param.Block().Width() + kInputPadding;
+    const int padded_height = param.Block().Height() + kInputPadding;
+    Randomize(p, padded_width * padded_height, param.BitDepth());
+    return p + (kInputPadding / 2) * padded_width + kInputPadding / 2;
+  }
+
+  void Randomize(uint16_t *p, int size, int bit_depth) {
+    for (int i = 0; i < size; ++i) {
+      p[i] = rnd_.Rand16() & ((1 << bit_depth) - 1);
+    }
+  }
+#endif
+
+  static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding;
+
+  libaom_test::ACMRandom rnd_;
+  // Statically allocate all the memory that is needed for the tests. Note
+  // that we cannot allocate output memory here. It must use DECLARE_ALIGNED,
+  // which is a C99 feature and interacts badly with C++ member variables.
+  uint8_t input8_1_[kInputStride * kInputStride];
+  uint8_t input8_2_[kInputStride * kInputStride];
+#if CONFIG_AV1_HIGHBITDEPTH
+  uint16_t input16_1_[kInputStride * kInputStride];
+  uint16_t input16_2_[kInputStride * kInputStride];
+#endif
+};
+
+////////////////////////////////////////////////////////
+// Single reference convolve-x functions (low bit-depth)
+////////////////////////////////////////////////////////
+typedef void (*convolve_x_func)(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const int subpel_x_qn,
+                                ConvolveParams *conv_params);
+
+class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_x, f);
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_x, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    av1_convolve_x_sr_c(input, width, reference, kOutputStride, width, height,
+                        filter_params_x, sub_x, &conv_params1);
+
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    test_func(input, width, test, kOutputStride, width, height, filter_params_x,
+              sub_x, &conv_params2);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_convolve_x_sr_c(input, width, reference, kOutputStride, width, height,
+                          filter_params_x, 0, &conv_params1);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, 0, &conv_params2);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveXTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXTest,
+                         BuildLowbdParams(av1_convolve_x_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_x_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd);
+
+class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_x, f);
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_x, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    av1_highbd_convolve_x_sr(input, width, reference, kOutputStride, width,
+                             height, filter_params_x, sub_x, &conv_params1,
+                             bit_depth);
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, sub_x, &conv_params2, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width,
+                                 height, filter_params_x, 0, &conv_params1,
+                                 bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    highbd_convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, 0, &conv_params2, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////////////
+// Single reference convolve-y functions (low bit-depth)
+////////////////////////////////////////////////////////
+typedef void (*convolve_y_func)(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_y_qn);
+
+class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+  void RunTest() {
+    for (int sub_y = 0; sub_y < 16; ++sub_y) {
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_y, f);
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+                        filter_params_y, sub_y);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_y, sub_y);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+                          filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest,
+                         BuildLowbdParams(av1_convolve_y_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_y_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd);
+
+class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+  void RunTest() {
+    for (int sub_y = 0; sub_y < 16; ++sub_y) {
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_y, f);
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
+ private:
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    av1_highbd_convolve_y_sr(input, width, reference, kOutputStride, width,
+                             height, filter_params_y, sub_y, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_y, sub_y, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, width);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width,
+                                 height, filter_params_y, 0, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    highbd_convolve_y_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, 0, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (low bit-depth)
+//////////////////////////////////////////////////////////////
+typedef void (*convolve_copy_func)(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h);
+
+class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
+ public:
+  void RunTest() {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    aom_convolve_copy(input, width, reference, kOutputStride, width, height);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+// Note that even though these are AOM convolve functions, we are using the
+// newer AV1 test framework.
+TEST_P(AV1ConvolveCopyTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_neon));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(MSA, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_msa));
+#endif
+
+#if HAVE_DSPR2
+INSTANTIATE_TEST_SUITE_P(DSPR2, AV1ConvolveCopyTest,
+                         BuildLowbdParams(aom_convolve_copy_dspr2));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (high bit-depth)
+///////////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_copy_func)(const uint16_t *src,
+                                          ptrdiff_t src_stride, uint16_t *dst,
+                                          ptrdiff_t dst_stride, int w, int h);
+
+class AV1ConvolveCopyHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_copy_func> {
+ public:
+  void RunTest() {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_highbd_convolve_copy(input, width, reference, kOutputStride, width,
+                             height);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+};
+
+TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
+                         BuildHighbdParams(aom_highbd_convolve_copy_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////////////
+// Single reference convolve-2D functions (low bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
+                                 const int subpel_x_qn, const int subpel_y_qn,
+                                 ConvolveParams *conv_params);
+
+class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+          for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+            if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+                ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+              continue;
+            TestConvolve(static_cast<InterpFilter>(h_f),
+                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
+          }
+        }
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+        if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+            ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+          continue;
+        TestConvolveSpeed(static_cast<InterpFilter>(h_f),
+                          static_cast<InterpFilter>(v_f), 10000);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width, height,
+                         filter_params_x, filter_params_y, sub_x, sub_y,
+                         &conv_params1);
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, filter_params_y, sub_x, sub_y,
+                              &conv_params2);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+                         int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+                           height, filter_params_x, filter_params_y, 0, 0,
+                           &conv_params1);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_x, filter_params_y, 0, 0,
+                                &conv_params2);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DTest,
+                         BuildLowbdParams(av1_convolve_2d_sr_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////
+// Single reference convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////////////
+
+typedef void (*highbd_convolve_2d_func)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+class AV1Convolve2DHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+          for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+            if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+                ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+              continue;
+            TestConvolve(static_cast<InterpFilter>(h_f),
+                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
+          }
+        }
+      }
+    }
+  }
+
+ public:
+  void SpeedTest() {
+    for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+        if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+            ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+          continue;
+        TestConvolveSpeed(static_cast<InterpFilter>(h_f),
+                          static_cast<InterpFilter>(v_f), 10000);
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    av1_highbd_convolve_2d_sr(input, width, reference, kOutputStride, width,
+                              height, filter_params_x, filter_params_y, sub_x,
+                              sub_y, &conv_params1, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, filter_params_y, sub_x, sub_y,
+                              &conv_params2, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+                         int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+                                  height, filter_params_x, filter_params_y, 0,
+                                  0, &conv_params1, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_x, filter_params_y, 0, 0,
+                                &conv_params2, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
+                         BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////
+// Compound Convolve Tests
+//////////////////////////
+
+// The compound functions do not work for chroma block sizes. Provide
+// a function to generate test parameters for just luma block sizes.
+template <typename T>
+std::vector<TestParam<T>> GetLumaTestParams(
+    std::initializer_list<int> bit_depths, T test_func) {
+  std::set<BlockSize> sizes;
+  for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) {
+    const int w = block_size_wide[b];
+    const int h = block_size_high[b];
+    sizes.insert(BlockSize(w, h));
+  }
+  std::vector<TestParam<T>> result;
+  for (int bit_depth : bit_depths) {
+    for (const auto &block : sizes) {
+      result.push_back(TestParam<T>(block, bit_depth, test_func));
+    }
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<TestParam<T>> GetLowbdLumaTestParams(T test_func) {
+  return GetLumaTestParams({ 8 }, test_func);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdLumaParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetLowbdLumaTestParams(test_func));
+}
+
+TEST_F(AV1ConvolveParametersTest, GetLowbdLumaTestParams) {
+  auto v = GetLowbdLumaTestParams(av1_dist_wtd_convolve_x_c);
+  ASSERT_EQ(22U, v.size());
+  for (const auto &e : v) {
+    ASSERT_EQ(8, e.BitDepth());
+    bool same_fn = av1_dist_wtd_convolve_x_c == e.TestFunction();
+    ASSERT_TRUE(same_fn);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+template <typename T>
+std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
+  return GetLumaTestParams({ 10, 12 }, test_func);
+}
+
+TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) {
+  auto v = GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c);
+  ASSERT_EQ(44U, v.size());
+  int num_10 = 0;
+  int num_12 = 0;
+  for (const auto &e : v) {
+    ASSERT_TRUE(10 == e.BitDepth() || 12 == e.BitDepth());
+    bool same_fn = av1_highbd_dist_wtd_convolve_x_c == e.TestFunction();
+    ASSERT_TRUE(same_fn);
+    if (e.BitDepth() == 10) {
+      ++num_10;
+    } else {
+      ++num_12;
+    }
+  }
+  ASSERT_EQ(num_10, num_12);
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams(
+    T test_func) {
+  return ::testing::ValuesIn(GetHighbdLumaTestParams(test_func));
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Compound cases also need to test different frame offsets and weightings.
+class CompoundParam {
+ public:
+  CompoundParam(bool use_dist_wtd_comp_avg, int fwd_offset, int bck_offset)
+      : use_dist_wtd_comp_avg_(use_dist_wtd_comp_avg), fwd_offset_(fwd_offset),
+        bck_offset_(bck_offset) {}
+
+  bool UseDistWtdCompAvg() const { return use_dist_wtd_comp_avg_; }
+  int FwdOffset() const { return fwd_offset_; }
+  int BckOffset() const { return bck_offset_; }
+
+ private:
+  bool use_dist_wtd_comp_avg_;
+  int fwd_offset_;
+  int bck_offset_;
+};
+
+std::vector<CompoundParam> GetCompoundParams() {
+  std::vector<CompoundParam> result;
+  result.push_back(CompoundParam(false, 0, 0));
+  for (int k = 0; k < 2; ++k) {
+    for (int l = 0; l < 4; ++l) {
+      result.push_back(CompoundParam(true, quant_dist_lookup_table[l][k],
+                                     quant_dist_lookup_table[l][1 - k]));
+    }
+  }
+  return result;
+}
+
+TEST_F(AV1ConvolveParametersTest, GetCompoundParams) {
+  auto v = GetCompoundParams();
+  ASSERT_EQ(9U, v.size());
+  ASSERT_FALSE(v[0].UseDistWtdCompAvg());
+  for (size_t i = 1; i < v.size(); ++i) {
+    ASSERT_TRUE(v[i].UseDistWtdCompAvg());
+  }
+}
+
+////////////////////////////////////////////////
+// Compound convolve-x functions (low bit-depth)
+////////////////////////////////////////////////
+
+ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
+                                 int width, int bit_depth,
+                                 const CompoundParam &compound) {
+  ConvolveParams conv_params =
+      get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth);
+  conv_params.use_dist_wtd_comp_avg = compound.UseDistWtdCompAvg();
+  conv_params.fwd_offset = compound.FwdOffset();
+  conv_params.bck_offset = compound.BckOffset();
+  return conv_params;
+}
+
+class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+        for (const auto &c : compound_params) {
+          TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+        }
+      }
+    }
+  }
+
+ protected:
+  virtual const InterpFilterParams *FilterParams(InterpFilter f,
+                                                 const BlockSize &block) const {
+    return av1_get_interp_filter_params_with_block_size(f, block.Width());
+  }
+
+  virtual convolve_x_func ReferenceFunc() const {
+    return av1_dist_wtd_convolve_x;
+  }
+
+ private:
+  void TestConvolve(const int sub_pix, const InterpFilter filter,
+                    const CompoundParam &compound) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+             compound, sub_pix, filter);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, sub_pix, filter);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(convolve_x_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, CONV_BUF_TYPE *conv_buf,
+                const CompoundParam &compound, const int sub_pix,
+                const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params =
+        FilterParams(filter, GetParam().Block());
+
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params);
+  }
+};
+
+TEST_P(AV1ConvolveXCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////
+class AV1ConvolveXHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+        for (const auto &c : compound_params) {
+          TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+        }
+      }
+    }
+  }
+
+ protected:
+  virtual const InterpFilterParams *FilterParams(InterpFilter f,
+                                                 const BlockSize &block) const {
+    return av1_get_interp_filter_params_with_block_size(f, block.Width());
+  }
+
+  virtual highbd_convolve_x_func ReferenceFunc() const {
+    return av1_highbd_dist_wtd_convolve_x;
+  }
+
+ private:
+  void TestConvolve(const int sub_pix, const InterpFilter filter,
+                    const CompoundParam &compound) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+             compound, sub_pix, filter);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, sub_pix, filter);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1,
+                const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf,
+                const CompoundParam &compound, const int sub_pix,
+                const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params =
+        FilterParams(filter, GetParam().Block());
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params, bit_depth);
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+              sub_pix, &conv_params, bit_depth);
+  }
+};
+
+TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1ConvolveXHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////
+// Compound convolve-y functions (low bit-depth)
+////////////////////////////////////////////////
+
+// Note that the X and Y convolve functions have the same type signature and
+// logic; they only differentiate the filter parameters and reference function.
+class AV1ConvolveYCompoundTest : public AV1ConvolveXCompoundTest {
+ protected:
+  const InterpFilterParams *FilterParams(
+      InterpFilter f, const BlockSize &block) const override {
+    return av1_get_interp_filter_params_with_block_size(f, block.Height());
+  }
+
+  convolve_x_func ReferenceFunc() const override {
+    return av1_dist_wtd_convolve_y;
+  }
+};
+
+TEST_P(AV1ConvolveYCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////
+
+// Again, the X and Y convolve functions have the same type signature and logic.
+class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {
+  highbd_convolve_x_func ReferenceFunc() const override {
+    return av1_highbd_dist_wtd_convolve_y;
+  }
+  const InterpFilterParams *FilterParams(
+      InterpFilter f, const BlockSize &block) const override {
+    return av1_get_interp_filter_params_with_block_size(f, block.Height());
+  }
+};
+
+TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1ConvolveYHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (low bit-depth)
+//////////////////////////////////////////////////////
+typedef void (*compound_conv_2d_copy_func)(const uint8_t *src, int src_stride,
+                                           uint8_t *dst, int dst_stride, int w,
+                                           int h, ConvolveParams *conv_params);
+
+class AV1Convolve2DCopyCompoundTest
+    : public AV1ConvolveTest<compound_conv_2d_copy_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (const auto &compound : compound_params) {
+      TestConvolve(compound);
+    }
+  }
+
+ private:
+  void TestConvolve(const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_dist_wtd_convolve_2d_copy, input1, input2, reference,
+             reference_conv_buf, compound);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(compound_conv_2d_copy_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, &conv_params);
+  }
+};
+
+TEST_P(AV1Convolve2DCopyCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCopyCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1Convolve2DCopyCompoundTest,
+    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (high bit-depth)
+///////////////////////////////////////////////////////
+typedef void (*highbd_compound_conv_2d_copy_func)(const uint16_t *src,
+                                                  int src_stride, uint16_t *dst,
+                                                  int dst_stride, int w, int h,
+                                                  ConvolveParams *conv_params,
+                                                  int bd);
+
+class AV1Convolve2DCopyHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (const auto &compound : compound_params) {
+      TestConvolve(compound);
+    }
+  }
+
+ private:
+  void TestConvolve(const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_highbd_dist_wtd_convolve_2d_copy, input1, input2, reference,
+             reference_conv_buf, compound);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void Convolve(highbd_compound_conv_2d_copy_func test_func,
+                const uint16_t *src1, const uint16_t *src2, uint16_t *dst,
+                uint16_t *conv_buf, const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const int bit_depth = GetParam().BitDepth();
+
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, &conv_params,
+              bit_depth);
+
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, &conv_params,
+              bit_depth);
+  }
+};
+
+TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DCopyHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////
+// Compound convolve-2d functions (low bit-depth)
+/////////////////////////////////////////////////
+
+class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+        for (int sub_x = 0; sub_x < 16; ++sub_x) {
+          for (int sub_y = 0; sub_y < 16; ++sub_y) {
+            for (const auto &compound : compound_params) {
+              TestConvolve(static_cast<InterpFilter>(h_f),
+                           static_cast<InterpFilter>(v_f), sub_x, sub_y,
+                           compound);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y,
+                    const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const uint8_t *input1 = FirstRandomInput8(GetParam());
+    const uint8_t *input2 = SecondRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_dist_wtd_convolve_2d, input1, input2, reference,
+             reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, h_f, v_f, sub_x, sub_y);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(convolve_2d_func test_func, const uint8_t *src1,
+                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound, const InterpFilter h_f,
+                const InterpFilter v_f, const int sub_x, const int sub_y) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params);
+
+    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params);
+  }
+};
+
+TEST_P(AV1Convolve2DCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DCompoundTest,
+                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////
+// Compound convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdCompoundTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {
+    auto compound_params = GetCompoundParams();
+    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+        for (int sub_x = 0; sub_x < 16; ++sub_x) {
+          for (int sub_y = 0; sub_y < 16; ++sub_y) {
+            for (const auto &compound : compound_params) {
+              TestConvolve(static_cast<InterpFilter>(h_f),
+                           static_cast<InterpFilter>(v_f), sub_x, sub_y,
+                           compound);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y,
+                    const CompoundParam &compound) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+    const uint16_t *input1 = FirstRandomInput16(GetParam());
+    const uint16_t *input2 = SecondRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+    Convolve(av1_highbd_dist_wtd_convolve_2d, input1, input2, reference,
+             reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+             compound, h_f, v_f, sub_x, sub_y);
+
+    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+ private:
+  void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1,
+                const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf,
+                const CompoundParam &compound, const InterpFilter h_f,
+                const InterpFilter v_f, const int sub_x, const int sub_y) {
+    const BlockSize &block = GetParam().Block();
+    const int width = block.Width();
+    const int height = block.Height();
+
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const int bit_depth = GetParam().BitDepth();
+    ConvolveParams conv_params =
+        GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+
+    conv_params =
+        GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+    test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+              filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1Convolve2DHighbdCompoundTest,
+    BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2));
+#endif
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace
diff --git a/media/libaom/src/test/av1_encoder_parms_get_to_decoder.cc b/media/libaom/src/test/av1_encoder_parms_get_to_decoder.cc
index 76b82f58f7..e81ad87e70 100644
--- a/media/libaom/src/test/av1_encoder_parms_get_to_decoder.cc
+++ b/media/libaom/src/test/av1_encoder_parms_get_to_decoder.cc
@@ -88,8 +88,7 @@ class AVxEncoderParmsGetToDecoder
   virtual ~AVxEncoderParmsGetToDecoder() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
+    InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 25;
     test_video_ = kAV1ParamPassingTestVector;
     cfg_.rc_target_bitrate = test_video_.bitrate;
@@ -98,6 +97,7 @@ class AVxEncoderParmsGetToDecoder
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 3);
       encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
       encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS,
                        encode_parms.transfer_characteristics);
@@ -150,11 +150,11 @@ TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) {
 
   std::unique_ptr<libaom_test::VideoSource> video(
       new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video, nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder,
-                          ::testing::ValuesIn(kAV1EncodeParameterSet));
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderParmsGetToDecoder,
+                           ::testing::ValuesIn(kAV1EncodeParameterSet));
 }  // namespace
diff --git a/media/libaom/src/test/av1_ext_tile_test.cc b/media/libaom/src/test/av1_ext_tile_test.cc
index 424d2f065f..c4f9cdc249 100644
--- a/media/libaom/src/test/av1_ext_tile_test.cc
+++ b/media/libaom/src/test/av1_ext_tile_test.cc
@@ -64,8 +64,7 @@ class AV1ExtTileTest
   }
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
 
     cfg_.g_lag_in_frames = 0;
     cfg_.rc_end_usage = AOM_VBR;
@@ -199,7 +198,7 @@ class AV1ExtTileTest
 
 TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); }
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     // Now only test 2-pass mode.
     AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
     ::testing::Range(1, 4));
@@ -208,7 +207,7 @@ class AV1ExtTileTestLarge : public AV1ExtTileTest {};
 
 TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); }
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     // Now only test 2-pass mode.
     AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood),
     ::testing::Range(0, 1));
diff --git a/media/libaom/src/test/av1_external_partition_test.cc b/media/libaom/src/test/av1_external_partition_test.cc
new file mode 100644
index 0000000000..41fc96c052
--- /dev/null
+++ b/media/libaom/src/test/av1_external_partition_test.cc
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fstream>
+#include <new>
+#include <sstream>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+#if CONFIG_AV1_ENCODER
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+constexpr int kFrameNum = 8;
+constexpr int kVersion = 1;
+
+typedef struct TestData {
+  int version = kVersion;
+} TestData;
+
+typedef struct ToyModel {
+  TestData *data;
+  aom_ext_part_config_t config;
+  aom_ext_part_funcs_t funcs;
+  int mi_row;
+  int mi_col;
+  int frame_width;
+  int frame_height;
+  BLOCK_SIZE block_size;
+} ToyModel;
+
+// Note:
+// if CONFIG_PARTITION_SEARCH_ORDER = 0, we test APIs designed for the baseline
+// encoder's DFS partition search workflow.
+// if CONFIG_PARTITION_SEARCH_ORDER = 1, we test APIs designed for the new
+// ML model's partition search workflow.
+#if CONFIG_PARTITION_SEARCH_ORDER
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *received_data = reinterpret_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+  ToyModel *toy_model = new (std::nothrow) ToyModel;
+  if (toy_model == nullptr) {
+    EXPECT_NE(toy_model, nullptr);
+    return AOM_EXT_PART_ERROR;
+  }
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_features(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  toy_model->mi_row = part_features->mi_row;
+  toy_model->mi_col = part_features->mi_col;
+  toy_model->frame_width = part_features->frame_width;
+  toy_model->frame_height = part_features->frame_height;
+  toy_model->block_size = static_cast<BLOCK_SIZE>(part_features->block_size);
+  return AOM_EXT_PART_OK;
+}
+
+// The model provide the whole decision tree to the encoder.
+aom_ext_part_status_t ext_part_get_partition_decision_whole_tree(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  // A toy model that always asks the encoder to encode with
+  // 4x4 blocks (the smallest).
+  ext_part_decision->is_final_decision = 1;
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  const int is_last_sb_col =
+      toy_model->mi_col * 4 + 64 > toy_model->frame_width;
+  const int is_last_sb_row =
+      toy_model->mi_row * 4 + 64 > toy_model->frame_height;
+  if (is_last_sb_row && is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first one will further split)
+    // 16x16: 4 nodes
+    // 8x8:   4 * 4 nodes
+    // 4x4:   4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 + 4 * 4 + 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first one will split, the other three are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks inside the top-left 32x32 block.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_row) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first two will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first two will split, the other two are out
+    // of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the top-left and bottom-left will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the top-left and bottom-left will split, the other two are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else {
+    // 64x64: 1 node
+    // 32x32: 4 nodes
+    // 16x16: 4 * 4 nodes
+    // 8x8:   4 * 4 * 4 nodes
+    // 4x4:   4 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 * 4 + 4 * 4 * 4 + 4 * 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    for (int i = 0; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  }
+
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_get_partition_decision_recursive(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  ext_part_decision->current_decision = PARTITION_NONE;
+  ext_part_decision->is_final_decision = 1;
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  const int is_last_sb_col =
+      toy_model->mi_col * 4 + 64 > toy_model->frame_width;
+  const int is_last_sb_row =
+      toy_model->mi_row * 4 + 64 > toy_model->frame_height;
+  if (is_last_sb_row && is_last_sb_col) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else if (is_last_sb_row) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else if (is_last_sb_col) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else {
+    ext_part_decision->current_decision = PARTITION_NONE;
+  }
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_partition_stats(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats) {
+  (void)ext_part_model;
+  (void)ext_part_stats;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_delete_model(
+    aom_ext_part_model_t ext_part_model) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  EXPECT_EQ(toy_model->data->version, kVersion);
+  delete toy_model;
+  return AOM_EXT_PART_OK;
+}
+
+class ExternalPartitionTestAPI
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ExternalPartitionTestAPI()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+  virtual ~ExternalPartitionTestAPI() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 4;
+    cfg_.rc_target_bitrate = 400;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual bool DoDecode() const { return false; }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  void SetExternalPartition(bool use_external_partition) {
+    use_external_partition_ = use_external_partition;
+  }
+
+  void SetPartitionControlMode(int mode) { partition_control_mode_ = mode; }
+
+  void SetDecisionMode(aom_ext_part_decision_mode_t mode) {
+    decision_mode_ = mode;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      if (decision_mode_ == AOM_EXT_PART_WHOLE_TREE) {
+        aom_ext_part_funcs_t ext_part_funcs;
+        ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+        ext_part_funcs.decision_mode = AOM_EXT_PART_WHOLE_TREE;
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_whole_tree;
+        ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+        ext_part_funcs.delete_model = ext_part_delete_model;
+
+        encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        if (use_external_partition_) {
+          encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+        }
+        if (partition_control_mode_ == -1) {
+          encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 128);
+          encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        } else {
+          switch (partition_control_mode_) {
+            case 1:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 64);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 64);
+              break;
+            case 2:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 4);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+              break;
+            default: assert(0 && "Invalid partition control mode."); break;
+          }
+        }
+      } else if (decision_mode_ == AOM_EXT_PART_RECURSIVE) {
+        aom_ext_part_funcs_t ext_part_funcs;
+        ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+        ext_part_funcs.decision_mode = AOM_EXT_PART_RECURSIVE;
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_recursive;
+        ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+        ext_part_funcs.delete_model = ext_part_delete_model;
+
+        encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        if (use_external_partition_) {
+          encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+        }
+        if (partition_control_mode_ == -1) {
+          encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 128);
+          encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        } else {
+          switch (partition_control_mode_) {
+            case 1:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 64);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 64);
+              break;
+            case 2:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 4);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+              break;
+            default: assert(0 && "Invalid partition control mode."); break;
+          }
+        }
+      } else {
+        assert(0 && "Invalid decision mode.");
+      }
+    }
+  }
+
+ private:
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  double psnr_;
+  unsigned int nframes_;
+  bool use_external_partition_ = false;
+  TestData test_data_;
+  int partition_control_mode_ = -1;
+  aom_ext_part_decision_mode_t decision_mode_;
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is a normal encoding run with restricted partition types,
+// i.e., we use control flags to force the encoder to encode with the
+// 4x4 block size.
+// The second run is to get partition decisions from a toy model that we
+// built, which will asks the encoder to encode with the 4x4 blocks.
+// We expect the encoding results are the same.
+TEST_P(ExternalPartitionTestAPI, WholePartitionTree4x4Block) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  SetPartitionControlMode(2);
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  SetPartitionControlMode(2);
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+TEST_P(ExternalPartitionTestAPI, RecursivePartition) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  SetPartitionControlMode(1);
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  SetPartitionControlMode(1);
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  const double psnr_thresh = 0.02;
+  EXPECT_NEAR(psnr, psnr2, psnr_thresh);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestAPI,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(4));  // cpu_used
+
+#else   // !CONFIG_PARTITION_SEARCH_ORDER
+// Feature files written during encoding, as defined in partition_strategy.c.
+std::string feature_file_names[] = {
+  "feature_before_partition_none",
+  "feature_before_partition_none_prune_rect",
+  "feature_after_partition_none_prune",
+  "feature_after_partition_none_terminate",
+  "feature_after_partition_split_terminate",
+  "feature_after_partition_split_prune_rect",
+  "feature_after_partition_rect",
+  "feature_after_partition_ab",
+};
+
+// Files written here in the test, where the feature data is received
+// from the API.
+std::string test_feature_file_names[] = {
+  "test_feature_before_partition_none",
+  "test_feature_before_partition_none_prune_rect",
+  "test_feature_after_partition_none_prune",
+  "test_feature_after_partition_none_terminate",
+  "test_feature_after_partition_split_terminate",
+  "test_feature_after_partition_split_prune_rect",
+  "test_feature_after_partition_rect",
+  "test_feature_after_partition_ab",
+};
+
+static void write_features_to_file(const float *features,
+                                   const int feature_size, const int id) {
+  if (!WRITE_FEATURE_TO_FILE) return;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s",
+           test_feature_file_names[id].c_str());
+  FILE *pfile = fopen(filename, "a");
+  ASSERT_NE(pfile, nullptr);
+  for (int i = 0; i < feature_size; ++i) {
+    fprintf(pfile, "%.6f", features[i]);
+    if (i < feature_size - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *received_data = reinterpret_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+  ToyModel *toy_model = new (std::nothrow) ToyModel;
+  if (toy_model == nullptr) {
+    EXPECT_NE(toy_model, nullptr);
+    return AOM_EXT_PART_ERROR;
+  }
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_create_model_test(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  (void)priv;
+  (void)ext_part_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  // Return status indicates it's a encoder test. It lets the encoder
+  // set a flag and write partition features to text files.
+  return AOM_EXT_PART_TEST;
+}
+
+aom_ext_part_status_t ext_part_send_features(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  (void)ext_part_model;
+  (void)part_features;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_features_test(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  (void)ext_part_model;
+  if (part_features->id == AOM_EXT_PART_FEATURE_BEFORE_NONE) {
+    write_features_to_file(part_features->before_part_none.f,
+                           AOM_EXT_PART_SIZE_DIRECT_SPLIT, 0);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2) {
+    write_features_to_file(part_features->before_part_none.f_part2,
+                           AOM_EXT_PART_SIZE_PRUNE_PART, 1);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_NONE) {
+    write_features_to_file(part_features->after_part_none.f,
+                           AOM_EXT_PART_SIZE_PRUNE_NONE, 2);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_NONE_PART2) {
+    write_features_to_file(part_features->after_part_none.f_terminate,
+                           AOM_EXT_PART_SIZE_TERM_NONE, 3);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_SPLIT) {
+    write_features_to_file(part_features->after_part_split.f_terminate,
+                           AOM_EXT_PART_SIZE_TERM_SPLIT, 4);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2) {
+    write_features_to_file(part_features->after_part_split.f_prune_rect,
+                           AOM_EXT_PART_SIZE_PRUNE_RECT, 5);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_RECT) {
+    write_features_to_file(part_features->after_part_rect.f,
+                           AOM_EXT_PART_SIZE_PRUNE_AB, 6);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_AB) {
+    write_features_to_file(part_features->after_part_ab.f,
+                           AOM_EXT_PART_SIZE_PRUNE_4_WAY, 7);
+  }
+  return AOM_EXT_PART_TEST;
+}
+
+aom_ext_part_status_t ext_part_get_partition_decision(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  (void)ext_part_model;
+  (void)ext_part_decision;
+  // Return an invalid decision such that the encoder doesn't take any
+  // partition decision from the ml model.
+  return AOM_EXT_PART_ERROR;
+}
+
+aom_ext_part_status_t ext_part_send_partition_stats(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats) {
+  (void)ext_part_model;
+  (void)ext_part_stats;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_delete_model(
+    aom_ext_part_model_t ext_part_model) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  EXPECT_EQ(toy_model->data->version, kVersion);
+  delete toy_model;
+  return AOM_EXT_PART_OK;
+}
+
+class ExternalPartitionTestDfsAPI
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ExternalPartitionTestDfsAPI()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+  virtual ~ExternalPartitionTestDfsAPI() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 4;
+    cfg_.rc_target_bitrate = 400;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual bool DoDecode() const { return false; }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  void SetExternalPartition(bool use_external_partition) {
+    use_external_partition_ = use_external_partition;
+  }
+
+  void SetTestSendFeatures(int test_send_features) {
+    test_send_features_ = test_send_features;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      aom_ext_part_funcs_t ext_part_funcs;
+      ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+      if (use_external_partition_) {
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+      }
+      if (test_send_features_ == 1) {
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features_test;
+      } else if (test_send_features_ == 0) {
+        ext_part_funcs.create_model = ext_part_create_model_test;
+        ext_part_funcs.send_features = ext_part_send_features;
+      }
+      ext_part_funcs.get_partition_decision = ext_part_get_partition_decision;
+      ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+      ext_part_funcs.delete_model = ext_part_delete_model;
+
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      if (use_external_partition_) {
+        encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+      }
+    }
+  }
+
+ private:
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  double psnr_;
+  unsigned int nframes_;
+  bool use_external_partition_ = false;
+  int test_send_features_ = -1;
+  TestData test_data_;
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is the baseline without external partition.
+// The second run is to get partition decisions from the toy model we defined.
+// Here, we let the partition decision return invalid for all stages.
+// In this case, the external partition doesn't alter the original encoder
+// behavior. So we expect the same encoding results.
+TEST_P(ExternalPartitionTestDfsAPI, EncodeMatch) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+// Encode twice to compare generated feature files.
+// The first run let the encoder write partition features to file.
+// The second run calls send partition features function to send features to
+// the external model, and we write them to file.
+// The generated files should match each other.
+TEST_P(ExternalPartitionTestDfsAPI, SendFeatures) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(true);
+  SetTestSendFeatures(0);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  SetExternalPartition(true);
+  SetTestSendFeatures(1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  if (!WRITE_FEATURE_TO_FILE) return;
+
+  // Compare feature files by reading them into strings.
+  for (int i = 0; i < 8; ++i) {
+    std::ifstream base_file(feature_file_names[i]);
+    ASSERT_TRUE(base_file.good());
+    std::stringstream base_stream;
+    base_stream << base_file.rdbuf();
+    std::string base_string = base_stream.str();
+
+    std::ifstream test_file(test_feature_file_names[i]);
+    ASSERT_TRUE(test_file.good());
+    std::stringstream test_stream;
+    test_stream << test_file.rdbuf();
+    std::string test_string = test_stream.str();
+
+    EXPECT_STREQ(base_string.c_str(), test_string.c_str());
+  }
+
+  // Remove files.
+  std::string command("rm -f feature_* test_feature_*");
+  system(command.c_str());
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestDfsAPI,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(4));  // cpu_used
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
+
+}  // namespace
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_ENCODER
diff --git a/media/libaom/src/test/av1_fwd_txfm1d_test.cc b/media/libaom/src/test/av1_fwd_txfm1d_test.cc
index abc46ed5a8..5247d7616c 100644
--- a/media/libaom/src/test/av1_fwd_txfm1d_test.cc
+++ b/media/libaom/src/test/av1_fwd_txfm1d_test.cc
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <memory>
+#include <new>
+
 #include "av1/encoder/av1_fwd_txfm1d.h"
 #include "test/av1_txfm_test.h"
 
@@ -65,10 +68,14 @@ TEST(av1_fwd_txfm1d, accuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int si = 0; si < txfm_size_num; ++si) {
     int txfm_size = txfm_size_ls[si];
-    int32_t *input = new int32_t[txfm_size];
-    int32_t *output = new int32_t[txfm_size];
-    double *ref_input = new double[txfm_size];
-    double *ref_output = new double[txfm_size];
+    std::unique_ptr<int32_t[]> input(new (std::nothrow) int32_t[txfm_size]);
+    std::unique_ptr<int32_t[]> output(new (std::nothrow) int32_t[txfm_size]);
+    std::unique_ptr<double[]> ref_input(new (std::nothrow) double[txfm_size]);
+    std::unique_ptr<double[]> ref_output(new (std::nothrow) double[txfm_size]);
+    ASSERT_NE(input, nullptr);
+    ASSERT_NE(output, nullptr);
+    ASSERT_NE(ref_input, nullptr);
+    ASSERT_NE(ref_output, nullptr);
 
     for (int ti = 0; ti < txfm_type_num; ++ti) {
       TYPE_TXFM txfm_type = txfm_type_ls[ti];
@@ -83,8 +90,9 @@ TEST(av1_fwd_txfm1d, accuracy) {
             ref_input[ni] = static_cast<double>(input[ni]);
           }
 
-          fwd_txfm_func(input, output, cos_bit, range_bit);
-          reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+          fwd_txfm_func(input.get(), output.get(), cos_bit, range_bit);
+          reference_hybrid_1d(ref_input.get(), ref_output.get(), txfm_size,
+                              txfm_type);
 
           for (int ni = 0; ni < txfm_size; ++ni) {
             ASSERT_LE(
@@ -95,11 +103,6 @@ TEST(av1_fwd_txfm1d, accuracy) {
         }
       }
     }
-
-    delete[] input;
-    delete[] output;
-    delete[] ref_input;
-    delete[] ref_output;
   }
 }
 }  // namespace
diff --git a/media/libaom/src/test/av1_fwd_txfm2d_test.cc b/media/libaom/src/test/av1_fwd_txfm2d_test.cc
index dd60665769..8496937529 100644
--- a/media/libaom/src/test/av1_fwd_txfm2d_test.cc
+++ b/media/libaom/src/test/av1_fwd_txfm2d_test.cc
@@ -55,12 +55,16 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
     txfm2d_size_ = tx_width_ * tx_height_;
     input_ = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(input_[0]) * txfm2d_size_));
+    ASSERT_NE(input_, nullptr);
     output_ = reinterpret_cast<int32_t *>(
         aom_memalign(16, sizeof(output_[0]) * txfm2d_size_));
+    ASSERT_NE(output_, nullptr);
     ref_input_ = reinterpret_cast<double *>(
         aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_));
+    ASSERT_NE(ref_input_, nullptr);
     ref_output_ = reinterpret_cast<double *>(
         aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_));
+    ASSERT_NE(ref_output_, nullptr);
   }
 
   void RunFwdAccuracyCheck() {
@@ -354,6 +358,7 @@ void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
 typedef std::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
 
 class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FwdTxfm2dTest);
 
 TEST_P(AV1FwdTxfm2dTest, match) {
   AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
@@ -361,6 +366,78 @@ TEST_P(AV1FwdTxfm2dTest, match) {
 TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
   AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
 }
+TEST(AV1FwdTxfm2dTest, DCTScaleTest) {
+  BitDepthInfo bd_info;
+  bd_info.bit_depth = 8;
+  bd_info.use_highbitdepth_buf = 0;
+  DECLARE_ALIGNED(32, int16_t, src_diff[1024]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[1024]);
+
+  const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
+  const int stride_list[4] = { 4, 8, 16, 32 };
+  const int ref_scale_list[4] = { 64, 64, 64, 16 };
+
+  for (int i = 0; i < 4; i++) {
+    TX_SIZE tx_size = tx_size_list[i];
+    int stride = stride_list[i];
+    int array_size = stride * stride;
+
+    for (int i = 0; i < array_size; i++) {
+      src_diff[i] = 8;
+      coeff[i] = 0;
+    }
+
+    av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, stride,
+                   coeff);
+
+    double input_sse = 0;
+    double output_sse = 0;
+    for (int i = 0; i < array_size; i++) {
+      input_sse += pow(src_diff[i], 2);
+      output_sse += pow(coeff[i], 2);
+    }
+
+    double scale = output_sse / input_sse;
+
+    EXPECT_NEAR(scale, ref_scale_list[i], 5);
+  }
+}
+TEST(AV1FwdTxfm2dTest, HadamardScaleTest) {
+  BitDepthInfo bd_info;
+  bd_info.bit_depth = 8;
+  bd_info.use_highbitdepth_buf = 0;
+  DECLARE_ALIGNED(32, int16_t, src_diff[1024]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[1024]);
+
+  const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
+  const int stride_list[4] = { 4, 8, 16, 32 };
+  const int ref_scale_list[4] = { 1, 64, 64, 16 };
+
+  for (int i = 0; i < 4; i++) {
+    TX_SIZE tx_size = tx_size_list[i];
+    int stride = stride_list[i];
+    int array_size = stride * stride;
+
+    for (int i = 0; i < array_size; i++) {
+      src_diff[i] = 8;
+      coeff[i] = 0;
+    }
+
+    av1_quick_txfm(/*use_hadamard=*/1, tx_size, bd_info, src_diff, stride,
+                   coeff);
+
+    double input_sse = 0;
+    double output_sse = 0;
+    for (int i = 0; i < array_size; i++) {
+      input_sse += pow(src_diff[i], 2);
+      output_sse += pow(coeff[i], 2);
+    }
+
+    double scale = output_sse / input_sse;
+
+    EXPECT_NEAR(scale, ref_scale_list[i], 5);
+  }
+}
 using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;
@@ -418,6 +495,20 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AV1FwdTxfm2dTest,
                                  Values(av1_lowbd_fwd_txfm_avx2)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+static TX_SIZE fwd_txfm_for_neon[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
+                                       TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
+                                       TX_16X8,  TX_16X32, TX_32X16, TX_32X64,
+                                       TX_64X32, TX_4X16,  TX_16X4,  TX_8X32,
+                                       TX_32X8,  TX_16X64, TX_64X16 };
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1FwdTxfm2dTest,
+                         Combine(ValuesIn(fwd_txfm_for_neon),
+                                 Values(av1_lowbd_fwd_txfm_neon)));
+
+#endif  // HAVE_NEON
+
 typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
                                      int diff_stride, TxfmParam *txfm_param);
 
@@ -548,6 +639,7 @@ typedef std::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
 
 class AV1HighbdFwdTxfm2dTest
     : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdFwdTxfm2dTest);
 
 TEST_P(AV1HighbdFwdTxfm2dTest, match) {
   AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
@@ -564,8 +656,10 @@ using ::testing::ValuesIn;
 #if HAVE_SSE4_1
 static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = {
   TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
-  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
-  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
+  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32,
+#if !CONFIG_REALTIME_ONLY
+  TX_4X16, TX_16X4, TX_8X32,  TX_32X8,  TX_16X64, TX_64X16,
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
@@ -580,4 +674,17 @@ INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdFwdTxfm2dTest,
                          Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
                                  Values(av1_highbd_fwd_txfm)));
 #endif  // HAVE_AVX2
+
+#if HAVE_NEON
+static TX_SIZE Highbd_fwd_txfm_for_neon[] = {
+  TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
+  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdFwdTxfm2dTest,
+                         Combine(ValuesIn(Highbd_fwd_txfm_for_neon),
+                                 Values(av1_highbd_fwd_txfm)));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/media/libaom/src/test/av1_highbd_iht_test.cc b/media/libaom/src/test/av1_highbd_iht_test.cc
index 8fea500db9..07c6036f14 100644
--- a/media/libaom/src/test/av1_highbd_iht_test.cc
+++ b/media/libaom/src/test/av1_highbd_iht_test.cc
@@ -17,7 +17,6 @@
 
 #include "test/acm_random.h"
 #include "test/av1_txfm_test.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/enums.h"
@@ -76,6 +75,7 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
 
     input_ = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(input_[0]) * num_coeffs_));
+    ASSERT_NE(input_, nullptr);
 
     // Note:
     // Inverse transform input buffer is 32-byte aligned
@@ -83,10 +83,13 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
     // void alloc_mode_context().
     coeffs_ = reinterpret_cast<int32_t *>(
         aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+    ASSERT_NE(coeffs_, nullptr);
     output_ = reinterpret_cast<uint16_t *>(
         aom_memalign(32, sizeof(output_[0]) * num_coeffs_));
+    ASSERT_NE(output_, nullptr);
     output_ref_ = reinterpret_cast<uint16_t *>(
         aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+    ASSERT_NE(output_ref_, nullptr);
   }
 
   virtual void TearDown() {
@@ -94,7 +97,6 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
     aom_free(coeffs_);
     aom_free(output_);
     aom_free(output_ref_);
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -129,6 +131,7 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
   uint16_t *output_;
   uint16_t *output_ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvHTNxN);
 
 void AV1HighbdInvHTNxN::RunBitexactCheck() {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -145,7 +148,7 @@ void AV1HighbdInvHTNxN::RunBitexactCheck() {
 
     txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
     inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_));
 
     for (int j = 0; j < num_coeffs_; ++j) {
@@ -204,10 +207,16 @@ class AV1HighbdInvTxfm2d
  private:
   HighbdInvTxfm2dFunc target_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvTxfm2d);
 
 void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
                                              int run_times, int bit_depth_,
                                              int gt_int16) {
+#if CONFIG_REALTIME_ONLY
+  if (tx_size_ >= TX_4X16) {
+    return;
+  }
+#endif
   FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
   TxfmParam txfm_param;
   const int BLK_WIDTH = 64;
@@ -359,4 +368,10 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvTxfm2d,
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
                          ::testing::Values(av1_highbd_inv_txfm_add_avx2));
 #endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_neon));
+#endif
+
 }  // namespace
diff --git a/media/libaom/src/test/av1_horz_only_frame_superres_test.cc b/media/libaom/src/test/av1_horz_only_frame_superres_test.cc
index 115fc84c0f..d15924de2b 100644
--- a/media/libaom/src/test/av1_horz_only_frame_superres_test.cc
+++ b/media/libaom/src/test/av1_horz_only_frame_superres_test.cc
@@ -20,7 +20,6 @@
 #include "av1/common/convolve.h"
 #include "av1/common/resize.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -164,7 +163,7 @@ class ConvolveHorizRSTestBase : public ::testing::Test {
  public:
   ConvolveHorizRSTestBase() : image_(NULL) {}
   virtual ~ConvolveHorizRSTestBase() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
@@ -194,6 +193,7 @@ class ConvolveHorizRSTestBase : public ::testing::Test {
 
         image_ =
             new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+        ASSERT_NE(image_, nullptr);
 
         Prep(&rnd);
         RunOne(true);
@@ -213,6 +213,7 @@ class ConvolveHorizRSTestBase : public ::testing::Test {
     int x0 = RS_SCALE_SUBPEL_MASK >> 1;
 
     image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+    ASSERT_NE(image_, nullptr);
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     Prep(&rnd);
diff --git a/media/libaom/src/test/av1_inv_txfm2d_test.cc b/media/libaom/src/test/av1_inv_txfm2d_test.cc
index eacdf85d43..d14acfe541 100644
--- a/media/libaom/src/test/av1_inv_txfm2d_test.cc
+++ b/media/libaom/src/test/av1_inv_txfm2d_test.cc
@@ -272,6 +272,7 @@ class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
  private:
   LbdInvTxfm2dFunc target_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1LbdInvTxfm2d);
 
 void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
                                           int run_times, int gt_int16) {
diff --git a/media/libaom/src/test/av1_k_means_test.cc b/media/libaom/src/test/av1_k_means_test.cc
new file mode 100644
index 0000000000..c77f501c75
--- /dev/null
+++ b/media/libaom/src/test/av1_k_means_test.cc
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "av1/encoder/palette.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1Kmeans {
+typedef void (*av1_calc_indices_dim1_func)(const int *data,
+                                           const int *centroids,
+                                           uint8_t *indices, int n, int k);
+typedef void (*av1_calc_indices_dim2_func)(const int *data,
+                                           const int *centroids,
+                                           uint8_t *indices, int n, int k);
+
+typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
+    av1_calc_indices_dim1Param;
+
+typedef std::tuple<av1_calc_indices_dim2_func, BLOCK_SIZE>
+    av1_calc_indices_dim2Param;
+
+class AV1KmeansTest1
+    : public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
+ public:
+  ~AV1KmeansTest1();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+                      int centroids);
+  void RunSpeedTest(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+                    int centroids);
+  bool CheckResult(int n) {
+    for (int idx = 0; idx < n; ++idx) {
+      if (indices1_[idx] != indices2_[idx]) {
+        printf("%d ", idx);
+        printf("%d != %d ", indices1_[idx], indices2_[idx]);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  int data_[4096];
+  int centroids_[8];
+  uint8_t indices1_[4096];
+  uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
+
+AV1KmeansTest1::~AV1KmeansTest1() {}
+
+void AV1KmeansTest1::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  for (int i = 0; i < 4096; ++i) {
+    data_[i] = (int)rnd_.Rand8() << 4;
+  }
+  for (int i = 0; i < 8; i++) {
+    centroids_[i] = (int)rnd_.Rand8() << 4;
+  }
+}
+
+void AV1KmeansTest1::TearDown() {}
+
+void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  av1_calc_indices_dim1_c(data_, centroids_, indices1_, n, k);
+  test_impl(data_, centroids_, indices2_, n, k);
+
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+void AV1KmeansTest1::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
+                                  BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  const int num_loops = 1000000000 / n;
+
+  av1_calc_indices_dim1_func funcs[2] = { av1_calc_indices_dim1_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    av1_calc_indices_dim1_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(data_, centroids_, indices1_, n, k);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_calc_indices_dim1 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1KmeansTest1, CheckOutput) {
+  // centroids = 2..8
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 4);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 5);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 6);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 7);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+TEST_P(AV1KmeansTest1, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 5);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 6);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 7);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+class AV1KmeansTest2
+    : public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
+ public:
+  ~AV1KmeansTest2();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                      int centroids);
+  void RunSpeedTest(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                    int centroids);
+  bool CheckResult(int n) {
+    bool flag = true;
+    for (int idx = 0; idx < n; ++idx) {
+      if (indices1_[idx] != indices2_[idx]) {
+        printf("%d ", idx);
+        printf("%d != %d ", indices1_[idx], indices2_[idx]);
+        flag = false;
+      }
+    }
+    if (flag == false) {
+      return false;
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  int data_[4096 * 2];
+  int centroids_[8 * 2];
+  uint8_t indices1_[4096];
+  uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
+
+AV1KmeansTest2::~AV1KmeansTest2() {}
+
+void AV1KmeansTest2::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  for (int i = 0; i < 4096 * 2; ++i) {
+    data_[i] = (int)rnd_.Rand8();
+  }
+  for (int i = 0; i < 8 * 2; i++) {
+    centroids_[i] = (int)rnd_.Rand8();
+  }
+}
+
+void AV1KmeansTest2::TearDown() {}
+
+void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  av1_calc_indices_dim2_c(data_, centroids_, indices1_, n, k);
+  test_impl(data_, centroids_, indices2_, n, k);
+
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+void AV1KmeansTest2::RunSpeedTest(av1_calc_indices_dim2_func test_impl,
+                                  BLOCK_SIZE bsize, int k) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int n = w * h;
+  const int num_loops = 1000000000 / n;
+
+  av1_calc_indices_dim2_func funcs[2] = { av1_calc_indices_dim2_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    av1_calc_indices_dim2_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(data_, centroids_, indices1_, n, k);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_calc_indices_dim2 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1KmeansTest2, CheckOutput) {
+  // centroids = 2..8
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 2);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 3);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 4);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 5);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 6);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 7);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+TEST_P(AV1KmeansTest2, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 2);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 3);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 4);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 5);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 6);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 7);
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 8);
+}
+
+#if HAVE_AVX2 || HAVE_SSE2
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,
+                                       BLOCK_16X8,  BLOCK_16X16, BLOCK_16X32,
+                                       BLOCK_32X8,  BLOCK_32X16, BLOCK_32X32,
+                                       BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+                                       BLOCK_16X64, BLOCK_64X16 };
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1KmeansTest1,
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1KmeansTest2,
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1KmeansTest1,
+    ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_sse2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+// TODO(any): Disable av1_calc_indices_dim2 sse2 SIMD and its unit test due to
+// c/SIMD mismatch. Re-enable it after mismatch is fixed.
+// INSTANTIATE_TEST_SUITE_P(
+//    SSE2, AV1KmeansTest2,
+//    ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
+//                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace AV1Kmeans
diff --git a/media/libaom/src/test/av1_key_value_api_test.cc b/media/libaom/src/test/av1_key_value_api_test.cc
new file mode 100644
index 0000000000..a5734f6beb
--- /dev/null
+++ b/media/libaom/src/test/av1_key_value_api_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstring>
+#include <tuple>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+typedef std::tuple<const char *, const char *> KeyValParam;
+
+class BaseKeyValAPI : public testing::Test {
+ public:
+  void SetUp() override {
+#if CONFIG_AV1_ENCODER
+    aom_codec_iface_t *iface_cx = aom_codec_av1_cx();
+    aom_codec_enc_cfg_t enc_cfg;
+#if CONFIG_REALTIME_ONLY
+    const int usage = 1;
+#else
+    const int usage = 0;
+#endif
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_enc_config_default(iface_cx, &enc_cfg, usage));
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, usage));
+#endif
+#if CONFIG_AV1_DECODER
+    aom_codec_iface_t *iface_dx = aom_codec_av1_dx();
+    aom_codec_dec_cfg_t dec_cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec_, iface_dx, &dec_cfg, 0));
+#endif
+  }
+
+  void TearDown() override {
+#if CONFIG_AV1_ENCODER
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc_));
+#endif
+#if CONFIG_AV1_DECODER
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec_));
+#endif
+  }
+
+ protected:
+#if CONFIG_AV1_ENCODER
+  aom_codec_ctx_t enc_;
+#endif
+#if CONFIG_AV1_DECODER
+  aom_codec_ctx_t dec_;
+#endif
+};
+
+// Tests on encoder options.
+// Need to add ones for the decoder in the future if it is also supported in the
+// key & value API.
+#if CONFIG_AV1_ENCODER
+class EncValidTest : public BaseKeyValAPI,
+                     public testing::WithParamInterface<KeyValParam> {};
+class EncInvalidTest : public BaseKeyValAPI,
+                       public testing::WithParamInterface<KeyValParam> {};
+
+TEST_P(EncValidTest, Valid) {
+  const char *key = std::get<0>(GetParam());
+  const char *val = std::get<1>(GetParam());
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_set_option(&enc_, key, val));
+}
+
+TEST_P(EncInvalidTest, NullArg) {
+  const char *key = std::get<0>(GetParam());
+  const char *val = std::get<1>(GetParam());
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(nullptr, key, val));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, nullptr, val));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, key, nullptr));
+}
+
+TEST_P(EncInvalidTest, InvalidParam) {
+  const char *key = std::get<0>(GetParam());
+  const char *val = std::get<1>(GetParam());
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, key, val));
+  ASSERT_NE(aom_codec_error_detail(&enc_), nullptr);
+  EXPECT_GT(strlen(aom_codec_error_detail(&enc_)), 0u);
+}
+
+// No test for ratio / list for now since the API does not support any of the
+// parameters of these type.
+// The string type typically involves reading a path/file, which brings
+// potential fails.
+const KeyValParam enc_valid_params[] = {
+  std::make_tuple("auto-intra-tools-off", "1"),  // uint
+  std::make_tuple("min-gf-interval", "10"),      // uint
+  std::make_tuple("min-partition-size", "4"),    // int
+  std::make_tuple("tune", "psnr"),               // enum
+};
+
+const KeyValParam enc_invalid_params[] = {
+  // no match
+  std::make_tuple("a-b-c", "10"),
+  // uint
+  std::make_tuple("min-gf-interval", "-1"),
+  std::make_tuple("min-gf-interval", "1.1"),
+  std::make_tuple("min-gf-interval", "abc"),
+  // int
+  std::make_tuple("min-partition-size", "1.1"),
+  std::make_tuple("min-partition-size", "abc"),
+  // enum
+  std::make_tuple("tune", "PsnR1"),
+  // out of range
+  std::make_tuple("cq-level", "1000"),
+};
+
+INSTANTIATE_TEST_SUITE_P(KeyValAPI, EncValidTest,
+                         testing::ValuesIn(enc_valid_params));
+
+INSTANTIATE_TEST_SUITE_P(KeyValAPI, EncInvalidTest,
+                         testing::ValuesIn(enc_invalid_params));
+#endif  // CONFIG_AV1_ENCODER
+
+}  // namespace
diff --git a/media/libaom/src/test/av1_nn_predict_test.cc b/media/libaom/src/test/av1_nn_predict_test.cc
index c03cba8c52..7a3067dade 100644
--- a/media/libaom/src/test/av1_nn_predict_test.cc
+++ b/media/libaom/src/test/av1_nn_predict_test.cc
@@ -22,7 +22,6 @@
 #include "test/util.h"
 #include "test/register_state_check.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 
 namespace {
 typedef void (*NnPredict_Func)(const float *const input_nodes,
@@ -70,9 +69,9 @@ class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
   float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
   float *weights_buf = nullptr, *bias_buf = nullptr;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
 
 void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
-  libaom_test::ClearSystemState();
   float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
@@ -119,7 +118,6 @@ void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
 
     av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
     target_func_(inputs, &nn_config, 0, outputs_test);
-    libaom_test::ClearSystemState();
 
     for (int node = 0; node < shape->num_outputs; node++) {
       if (outputs_ref[node] < epsilon) {
@@ -139,7 +137,6 @@ void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
 
 void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
                                           const int run_times) {
-  libaom_test::ClearSystemState();
   float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
@@ -166,7 +163,6 @@ void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
     target_func_(inputs, &nn_config, 0, outputs_test);
   }
   aom_usec_timer_mark(&timer);
-  libaom_test::ClearSystemState();
   const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
 
   printf("%d", shape->num_inputs);
@@ -209,9 +205,14 @@ TEST_P(NnPredictTest, DISABLED_Speed) {
   RunNnPredictSpeedTest_all(shapes, sizeof(shapes) / sizeof(*shapes), 10000000);
 }
 
-#if HAVE_SSE3
+#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
 INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
                          ::testing::Values(av1_nn_predict_sse3));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_neon));
+#endif
+
 }  // namespace
diff --git a/media/libaom/src/test/av1_quantize_test.cc b/media/libaom/src/test/av1_quantize_test.cc
index 39a3c33d81..ce1311d4b8 100644
--- a/media/libaom/src/test/av1_quantize_test.cc
+++ b/media/libaom/src/test/av1_quantize_test.cc
@@ -16,9 +16,9 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "av1/common/scan.h"
+#include "av1/encoder/av1_quantize.h"
 
 namespace {
 
@@ -70,7 +70,7 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
     QuantizeFpFunc quanFunc = params_.qFunc;
     QuantizeFpFunc quanFuncRef = params_.qFuncRef;
 
-    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+    const SCAN_ORDER scanOrder = av1_scan_orders[txSize][DCT_DCT];
     for (int i = 0; i < numTests; i++) {
       int err_count = 0;
       ref_eob = eob = UINT16_MAX;
@@ -97,7 +97,7 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
@@ -142,7 +142,7 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
     int log_scale = (txSize == TX_32X32);
     QuantizeFpFunc quanFunc = params_.qFunc;
     QuantizeFpFunc quanFuncRef = params_.qFuncRef;
-    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+    const SCAN_ORDER scanOrder = av1_scan_orders[txSize][DCT_DCT];
 
     for (int i = 0; i < numTests; i++) {
       ref_eob = eob = UINT16_MAX;
@@ -174,7 +174,7 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
@@ -185,7 +185,7 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
 
   virtual void SetUp() { params_ = GetParam(); }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   virtual ~AV1QuantizeTest() {}
 
@@ -202,10 +202,37 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
 
   QuantizeFuncParams params_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1QuantizeTest);
 
 TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
 TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
+TEST(AV1QuantizeTest, QuantizeFpNoQmatrix) {
+  // Here we use a uniform quantizer as an example
+  const int16_t dequant_ptr[2] = { 78, 93 };  // quantize step
+  const int16_t round_ptr[2] = { 39, 46 };    // round ~= dequant / 2
+
+  // quant ~= 2^16 / dequant. This is a 16-bit fixed point representation of the
+  // inverse of quantize step.
+  const int16_t quant_ptr[2] = { 840, 704 };
+  int log_scale = 0;
+  int coeff_count = 4;
+  const tran_low_t coeff_ptr[4] = { -449, 624, -14, 24 };
+  const tran_low_t ref_qcoeff_ptr[4] = { -6, 7, 0, 0 };
+  const tran_low_t ref_dqcoeff_ptr[4] = { -468, 651, 0, 0 };
+  const int16_t scan[4] = { 0, 1, 2, 3 };
+  tran_low_t qcoeff_ptr[4];
+  tran_low_t dqcoeff_ptr[4];
+  int eob = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+                                       log_scale, scan, coeff_count, coeff_ptr,
+                                       qcoeff_ptr, dqcoeff_ptr);
+  EXPECT_EQ(eob, 2);
+  for (int i = 0; i < coeff_count; ++i) {
+    EXPECT_EQ(qcoeff_ptr[i], ref_qcoeff_ptr[i]);
+    EXPECT_EQ(dqcoeff_ptr[i], ref_dqcoeff_ptr[i]);
+  }
+}
+
 #if HAVE_SSE4_1
 const QuantizeFuncParams qfps[4] = {
   QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
diff --git a/media/libaom/src/test/av1_round_shift_array_test.cc b/media/libaom/src/test/av1_round_shift_array_test.cc
index 993fa9f19a..facb84b550 100644
--- a/media/libaom/src/test/av1_round_shift_array_test.cc
+++ b/media/libaom/src/test/av1_round_shift_array_test.cc
@@ -20,7 +20,6 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -43,7 +42,7 @@ class AV1CompRoundShiftTest
   ~AV1CompRoundShiftTest();
 
   void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
+  void TearDown() {}
 
  protected:
   void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
@@ -53,8 +52,9 @@ class AV1CompRoundShiftTest
 
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompRoundShiftTest);
 
-AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() {}
 
 void AV1CompRoundShiftTest::RunCheckOutput(
     comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
@@ -127,4 +127,4 @@ INSTANTIATE_TEST_SUITE_P(
                        ::testing::ValuesIn(kValidBitCheck)));
 #endif
 
-};  // namespace AV1CompRoundShift
+}  // namespace AV1CompRoundShift
diff --git a/media/libaom/src/test/av1_softmax_test.cc b/media/libaom/src/test/av1_softmax_test.cc
new file mode 100644
index 0000000000..60c7b6f816
--- /dev/null
+++ b/media/libaom/src/test/av1_softmax_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <new>
+#include <tuple>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+using FastSoftmaxFn = void (*)(const float *const input, float *output);
+using FastSoftmaxTestParams = std::tuple<const FastSoftmaxFn, int>;
+
+// Error thresholds for functional equivalence
+constexpr float kRelEpsilon = 5e-2f;
+constexpr float kAbsEpsilon = 5e-3f;
+
+class FastSoftmaxTest : public ::testing::TestWithParam<FastSoftmaxTestParams> {
+ public:
+  FastSoftmaxTest() : target_fn_(GET_PARAM(0)), num_classes_(GET_PARAM(1)) {}
+  virtual void SetUp() {
+    ref_buf_.reset(new (std::nothrow) float[num_classes_]());
+    ASSERT_NE(ref_buf_, nullptr);
+    dst_buf_.reset(new (std::nothrow) float[num_classes_]());
+    ASSERT_NE(dst_buf_, nullptr);
+    input_.reset(new (std::nothrow) float[num_classes_]());
+    ASSERT_NE(input_, nullptr);
+  }
+  void RunSoftmaxTest();
+  void RunSoftmaxSpeedTest(const int run_times);
+  void FillInputBuf();
+
+ private:
+  const FastSoftmaxFn target_fn_;
+  const int num_classes_;
+  std::unique_ptr<float[]> ref_buf_, dst_buf_, input_;
+  libaom_test::ACMRandom rng_;
+};
+
+void FastSoftmaxTest::FillInputBuf() {
+  for (int idx = 0; idx < num_classes_; idx++) {
+    input_[idx] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 30);
+  }
+}
+
+void FastSoftmaxTest::RunSoftmaxTest() {
+  av1_nn_softmax(input_.get(), ref_buf_.get(), num_classes_);
+  target_fn_(input_.get(), dst_buf_.get());
+
+  for (int idx = 0; idx < num_classes_; idx++) {
+    if (ref_buf_[idx] < kAbsEpsilon) {
+      ASSERT_LE(dst_buf_[idx], kAbsEpsilon)
+          << "Reference output was near-zero, test output was not" << std::endl;
+    } else {
+      const float error = dst_buf_[idx] - ref_buf_[idx];
+      const float relative_error = fabsf(error / ref_buf_[idx]);
+      ASSERT_LE(relative_error, kRelEpsilon)
+          << "Excessive relative error between reference and test output"
+          << std::endl;
+      ASSERT_LE(error, kAbsEpsilon)
+          << "Excessive absolute error between reference and test output"
+          << std::endl;
+    }
+  }
+}
+
+void FastSoftmaxTest::RunSoftmaxSpeedTest(const int run_times) {
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int idx = 0; idx < run_times; idx++) {
+    target_fn_(input_.get(), dst_buf_.get());
+  }
+  aom_usec_timer_mark(&timer);
+  const int64_t time = aom_usec_timer_elapsed(&timer);
+  std::cout << "Test with " << num_classes_ << " classes took " << time
+            << " us." << std::endl;
+}
+
+TEST_P(FastSoftmaxTest, RandomValues) {
+  FillInputBuf();
+  RunSoftmaxTest();
+}
+
+TEST_P(FastSoftmaxTest, DISABLED_Speed) {
+  constexpr int kNumTimes = 1000000;
+  RunSoftmaxSpeedTest(kNumTimes);
+}
+
+void AnchorSoftmax16Fn(const float *input, float *output) {
+  av1_nn_softmax(input, output, 16);
+}
+
+const FastSoftmaxTestParams kArrayParams_c[] = {
+  FastSoftmaxTestParams(AnchorSoftmax16Fn, 16),
+  FastSoftmaxTestParams(av1_nn_fast_softmax_16_c, 16)
+};
+INSTANTIATE_TEST_SUITE_P(C, FastSoftmaxTest,
+                         ::testing::ValuesIn(kArrayParams_c));
+
+#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+INSTANTIATE_TEST_SUITE_P(
+    SSE3, FastSoftmaxTest,
+    ::testing::Values(FastSoftmaxTestParams(av1_nn_fast_softmax_16_sse3, 16)));
+#endif
+}  // namespace
diff --git a/media/libaom/src/test/av1_temporal_denoiser_test.cc b/media/libaom/src/test/av1_temporal_denoiser_test.cc
new file mode 100644
index 0000000000..571fd926a4
--- /dev/null
+++ b/media/libaom/src/test/av1_temporal_denoiser_test.cc
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 128 * 128;
+
+typedef int (*Av1DenoiserFilterFunc)(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_avg, int mc_avg_stride,
+                                     uint8_t *avg, int avg_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude);
+typedef std::tuple<Av1DenoiserFilterFunc, BLOCK_SIZE> AV1DenoiserTestParam;
+
+class AV1DenoiserTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<AV1DenoiserTestParam> {
+ public:
+  virtual ~AV1DenoiserTest() {}
+
+  virtual void SetUp() { bs_ = GET_PARAM(1); }
+
+  virtual void TearDown() {}
+
+ protected:
+  BLOCK_SIZE bs_;
+};
+
+TEST_P(AV1DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+
+  // Allocate the space for input and output,
+  // where sig_block is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_sse2 is the denoised result from SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_random =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      temp =
+          sig_block[j] + ((rnd.Rand8() % 2 == 0) ? -1 : 1) * (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    API_REGISTER_STATE_CHECK(
+        av1_denoiser_filter_c(sig_block, 128, mc_avg_block, 128, avg_block_c,
+                              128, 0, bs_, motion_magnitude_random));
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 128, mc_avg_block, 128,
+                                          avg_block_sse2, 128, 0, bs_,
+                                          motion_magnitude_random));
+
+    // Test bitexactness.
+    for (int h = 0; h < block_size_high[bs_]; ++h) {
+      for (int w = 0; w < block_size_wide[bs_]; ++w) {
+        EXPECT_EQ(avg_block_c[h * 128 + w], avg_block_sse2[h * 128 + w]);
+      }
+    }
+  }
+}
+
+using std::make_tuple;
+
+// Test for all block size.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1DenoiserTest,
+    ::testing::Values(make_tuple(&av1_denoiser_filter_sse2, BLOCK_8X8),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_8X16),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X8),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X16),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X32),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X16),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X32),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X64),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X32),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X64),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_128X64),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X128),
+                      make_tuple(&av1_denoiser_filter_sse2, BLOCK_128X128)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1DenoiserTest,
+    ::testing::Values(make_tuple(&av1_denoiser_filter_neon, BLOCK_8X8),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_8X16),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_16X8),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_16X16),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_16X32),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_32X16),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_32X32),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_32X64),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_64X32),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_64X64),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_128X64),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_64X128),
+                      make_tuple(&av1_denoiser_filter_neon, BLOCK_128X128)));
+#endif
+}  // namespace
diff --git a/media/libaom/src/test/av1_txfm_test.cc b/media/libaom/src/test/av1_txfm_test.cc
index aedd45d133..f741e7cae5 100644
--- a/media/libaom/src/test/av1_txfm_test.cc
+++ b/media/libaom/src/test/av1_txfm_test.cc
@@ -9,9 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <stdio.h>
 #include "test/av1_txfm_test.h"
 
+#include <stdio.h>
+
+#include <memory>
+#include <new>
+
 namespace libaom_test {
 
 int get_txfm1d_size(TX_SIZE tx_size) { return tx_size_wide[tx_size]; }
@@ -237,9 +241,15 @@ void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
 
-  double *const temp_in = new double[AOMMAX(tx_width, tx_height)];
-  double *const temp_out = new double[AOMMAX(tx_width, tx_height)];
-  double *const out_interm = new double[tx_width * tx_height];
+  std::unique_ptr<double[]> temp_in(
+      new (std::nothrow) double[AOMMAX(tx_width, tx_height)]);
+  std::unique_ptr<double[]> temp_out(
+      new (std::nothrow) double[AOMMAX(tx_width, tx_height)]);
+  std::unique_ptr<double[]> out_interm(
+      new (std::nothrow) double[tx_width * tx_height]);
+  ASSERT_NE(temp_in, nullptr);
+  ASSERT_NE(temp_out, nullptr);
+  ASSERT_NE(out_interm, nullptr);
   const int stride = tx_width;
 
   // Transform columns.
@@ -247,7 +257,7 @@ void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
     for (int r = 0; r < tx_height; ++r) {
       temp_in[r] = in[r * stride + c];
     }
-    reference_hybrid_1d(temp_in, temp_out, tx_height, type0);
+    reference_hybrid_1d(temp_in.get(), temp_out.get(), tx_height, type0);
     for (int r = 0; r < tx_height; ++r) {
       out_interm[r * stride + c] = temp_out[r];
     }
@@ -255,14 +265,10 @@ void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
 
   // Transform rows.
   for (int r = 0; r < tx_height; ++r) {
-    reference_hybrid_1d(out_interm + r * stride, out + r * stride, tx_width,
-                        type1);
+    reference_hybrid_1d(out_interm.get() + r * stride, out + r * stride,
+                        tx_width, type1);
   }
 
-  delete[] temp_in;
-  delete[] temp_out;
-  delete[] out_interm;
-
   // These transforms use an approximate 2D DCT transform, by only keeping the
   // top-left quarter of the coefficients, and repacking them in the first
   // quarter indices.
diff --git a/media/libaom/src/test/av1_txfm_test.h b/media/libaom/src/test/av1_txfm_test.h
index 5a56d28f1a..13a7e8a076 100644
--- a/media/libaom/src/test/av1_txfm_test.h
+++ b/media/libaom/src/test/av1_txfm_test.h
@@ -97,7 +97,7 @@ static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
 }
 
 #if CONFIG_AV1_ENCODER
-
+#if !CONFIG_REALTIME_ONLY
 static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
   av1_fwd_txfm2d_4x4_c,   av1_fwd_txfm2d_8x8_c,   av1_fwd_txfm2d_16x16_c,
   av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
@@ -107,6 +107,29 @@ static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
   av1_fwd_txfm2d_8x32_c,  av1_fwd_txfm2d_32x8_c,  av1_fwd_txfm2d_16x64_c,
   av1_fwd_txfm2d_64x16_c,
 };
+#else
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_fwd_txfm2d_4x4_c,
+  av1_fwd_txfm2d_8x8_c,
+  av1_fwd_txfm2d_16x16_c,
+  av1_fwd_txfm2d_32x32_c,
+  av1_fwd_txfm2d_64x64_c,
+  av1_fwd_txfm2d_4x8_c,
+  av1_fwd_txfm2d_8x4_c,
+  av1_fwd_txfm2d_8x16_c,
+  av1_fwd_txfm2d_16x8_c,
+  av1_fwd_txfm2d_16x32_c,
+  av1_fwd_txfm2d_32x16_c,
+  av1_fwd_txfm2d_32x64_c,
+  av1_fwd_txfm2d_64x32_c,
+  nullptr,
+  av1_fwd_txfm2d_16x4_c,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+};
+#endif
 #endif
 
 static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
diff --git a/media/libaom/src/test/av1_wedge_utils_test.cc b/media/libaom/src/test/av1_wedge_utils_test.cc
index f9dc838ff4..a51ce12b2c 100644
--- a/media/libaom/src/test/av1_wedge_utils_test.cc
+++ b/media/libaom/src/test/av1_wedge_utils_test.cc
@@ -164,6 +164,7 @@ class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest<FSSE> {
  protected:
   static const int kIterations = 10000;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSSEOptTest);
 
 TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
@@ -181,7 +182,7 @@ TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
 
     const uint64_t ref_res = params_.ref_func(r1, d, m, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -211,7 +212,7 @@ TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
 
     const uint64_t ref_res = params_.ref_func(r1, d, m, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -230,6 +231,7 @@ class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
   static const int kIterations = 10000;
   static const int kMaxSize = 8196;  // Size limited by SIMD implementation.
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSignOptTest);
 
 TEST_P(WedgeUtilsSignOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
@@ -257,7 +259,7 @@ TEST_P(WedgeUtilsSignOptTest, RandomValues) {
 
     const int ref_res = params_.ref_func(ds, m, N, limit);
     int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -312,7 +314,7 @@ TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
 
     const int ref_res = params_.ref_func(ds, m, N, limit);
     int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -329,6 +331,7 @@ class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest<FDS> {
  protected:
   static const int kIterations = 10000;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsDeltaSquaresOptTest);
 
 TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
   DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
@@ -348,7 +351,7 @@ TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
     memset(&d_tst, INT16_MAX, sizeof(d_tst));
 
     params_.ref_func(d_ref, a, b, N);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+    API_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
 
     for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]);
   }
diff --git a/media/libaom/src/test/avg_test.cc b/media/libaom/src/test/avg_test.cc
index 1742aec5fc..ca24c9b641 100644
--- a/media/libaom/src/test/avg_test.cc
+++ b/media/libaom/src/test/avg_test.cc
@@ -9,14 +9,18 @@
  */
 
 #include <stdlib.h>
+#include <ostream>
+#include <string>
 #include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
+#include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -27,14 +31,13 @@ using libaom_test::ACMRandom;
 template <typename Pixel>
 class AverageTestBase : public ::testing::Test {
  public:
-  AverageTestBase(int width, int height)
+  AverageTestBase(int width, int height, int bit_depth = 8)
       : width_(width), height_(height), source_data_(NULL), source_stride_(0),
-        bit_depth_(8) {}
+        bit_depth_(bit_depth) {}
 
   virtual void TearDown() {
     aom_free(source_data_);
     source_data_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -43,9 +46,19 @@ class AverageTestBase : public ::testing::Test {
   static const int kDataBlockSize = 64 * 128;
 
   virtual void SetUp() {
+    const testing::TestInfo *const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    // Skip the speed test for C code as the baseline uses the same function.
+    if (std::string(test_info->test_suite_name()).find("C/") == 0 &&
+        std::string(test_info->name()).find("DISABLED_Speed") !=
+            std::string::npos) {
+      GTEST_SKIP();
+    }
+
     source_data_ = static_cast<Pixel *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
+    ASSERT_NE(source_data_, nullptr);
+    memset(source_data_, 0, kDataBlockSize * sizeof(source_data_[0]));
     source_stride_ = (width_ + 31) & ~31;
     bit_depth_ = 8;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -60,6 +73,20 @@ class AverageTestBase : public ::testing::Test {
     return (average + 32) >> 6;
   }
 
+  static void ReferenceAverage8x8_quad(const uint8_t *source, int pitch,
+                                       int x16_idx, int y16_idx, int *avg) {
+    for (int k = 0; k < 4; k++) {
+      int average = 0;
+      int x8_idx = x16_idx + ((k & 1) << 3);
+      int y8_idx = y16_idx + ((k >> 1) << 3);
+      for (int h = 0; h < 8; ++h) {
+        for (int w = 0; w < 8; ++w)
+          average += source[(h + y8_idx) * pitch + w + x8_idx];
+      }
+      avg[k] = (average + 32) >> 6;
+    }
+  }
+
   static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 4; ++h) {
@@ -89,53 +116,233 @@ class AverageTestBase : public ::testing::Test {
 };
 typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);
 
-// Arguments: width, height, pitch, block size, avg function.
-typedef std::tuple<int, int, int, int, AverageFunction> AvgFunc;
+// Arguments: width, height, bit_depth, buffer start offset, block size, avg
+// function.
+typedef std::tuple<int, int, int, int, int, AverageFunction> AvgFunc;
 
-class AverageTest : public AverageTestBase<uint8_t>,
+template <typename Pixel>
+class AverageTest : public AverageTestBase<Pixel>,
                     public ::testing::WithParamInterface<AvgFunc> {
  public:
-  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  AverageTest()
+      : AverageTestBase<Pixel>(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2)) {}
 
  protected:
+  using AverageTestBase<Pixel>::source_data_;
+  using AverageTestBase<Pixel>::source_stride_;
+  using AverageTestBase<Pixel>::ReferenceAverage8x8;
+  using AverageTestBase<Pixel>::ReferenceAverage4x4;
+  using AverageTestBase<Pixel>::FillConstant;
+  using AverageTestBase<Pixel>::FillRandom;
+
   void CheckAverages() {
-    const int block_size = GET_PARAM(3);
+    const int block_size = GET_PARAM(4);
     unsigned int expected = 0;
+
+    // The reference frame, but not the source frame, may be unaligned for
+    // certain types of searches.
+    const Pixel *const src = source_data_ + GET_PARAM(3);
     if (block_size == 8) {
-      expected =
-          ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_);
+      expected = ReferenceAverage8x8(src, source_stride_);
     } else if (block_size == 4) {
-      expected =
-          ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_);
+      expected = ReferenceAverage4x4(src, source_stride_);
     }
 
+    aom_usec_timer timer;
     unsigned int actual;
-    ASM_REGISTER_STATE_CHECK(
-        actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_));
+    if (sizeof(Pixel) == 2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      AverageFunction avg_c =
+          (block_size == 8) ? aom_highbd_avg_8x8_c : aom_highbd_avg_4x4_c;
+      // To avoid differences in optimization with the local Reference*()
+      // functions the C implementation is used as a baseline.
+      aom_usec_timer_start(&timer);
+      avg_c(CONVERT_TO_BYTEPTR(src), source_stride_);
+      aom_usec_timer_mark(&timer);
+      ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+      AverageFunction avg_opt = GET_PARAM(5);
+      API_REGISTER_STATE_CHECK(
+          aom_usec_timer_start(&timer);
+          actual = avg_opt(CONVERT_TO_BYTEPTR(src), source_stride_);
+          aom_usec_timer_mark(&timer));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    } else {
+      ASSERT_EQ(sizeof(Pixel), 1u);
+
+      AverageFunction avg_c = (block_size == 8) ? aom_avg_8x8_c : aom_avg_4x4_c;
+      aom_usec_timer_start(&timer);
+      avg_c(reinterpret_cast<const uint8_t *>(src), source_stride_);
+      aom_usec_timer_mark(&timer);
+      ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+      AverageFunction avg_opt = GET_PARAM(5);
+      API_REGISTER_STATE_CHECK(
+          aom_usec_timer_start(&timer);
+          actual =
+              avg_opt(reinterpret_cast<const uint8_t *>(src), source_stride_);
+          aom_usec_timer_mark(&timer));
+    }
+    opt_elapsed_time_ += aom_usec_timer_elapsed(&timer);
 
     EXPECT_EQ(expected, actual);
   }
+
+  void TestConstantValue(Pixel value) {
+    FillConstant(value);
+    CheckAverages();
+  }
+
+  void TestRandom(int iterations = 1000) {
+    for (int i = 0; i < iterations; i++) {
+      FillRandom();
+      CheckAverages();
+    }
+  }
+
+  void PrintTimingStats() const {
+    printf(
+        "block_size = %d \t ref_time = %d \t simd_time = %d \t Gain = %4.2f\n",
+        GET_PARAM(4), static_cast<int>(ref_elapsed_time_),
+        static_cast<int>(opt_elapsed_time_),
+        (static_cast<float>(ref_elapsed_time_) /
+         static_cast<float>(opt_elapsed_time_)));
+  }
+
+  int64_t ref_elapsed_time_ = 0;
+  int64_t opt_elapsed_time_ = 0;
 };
 
-TEST_P(AverageTest, MinValue) {
-  FillConstant(0);
-  CheckAverages();
-}
+typedef void (*AverageFunction_8x8_quad)(const uint8_t *s, int pitch, int x_idx,
+                                         int y_idx, int *avg);
 
-TEST_P(AverageTest, MaxValue) {
-  FillConstant(255);
-  CheckAverages();
-}
+// Arguments: width, height, bit_depth, buffer start offset, block size, avg
+// function.
+typedef std::tuple<int, int, int, int, int, AverageFunction_8x8_quad>
+    AvgFunc_8x8_quad;
+
+template <typename Pixel>
+class AverageTest_8x8_quad
+    : public AverageTestBase<Pixel>,
+      public ::testing::WithParamInterface<AvgFunc_8x8_quad> {
+ public:
+  AverageTest_8x8_quad()
+      : AverageTestBase<Pixel>(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2)) {}
+
+ protected:
+  using AverageTestBase<Pixel>::source_data_;
+  using AverageTestBase<Pixel>::source_stride_;
+  using AverageTestBase<Pixel>::ReferenceAverage8x8_quad;
+  using AverageTestBase<Pixel>::FillConstant;
+  using AverageTestBase<Pixel>::FillRandom;
+
+  void CheckAverages(int iterations) {
+    ASSERT_EQ(sizeof(Pixel), 1u);
+    const int block_size = GET_PARAM(4);
+    (void)block_size;
+    int expected[4] = { 0 };
+    int x16_idx = 0;
+    int y16_idx = 0;
 
-TEST_P(AverageTest, Random) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  for (int i = 0; i < 1000; i++) {
+    // The reference frame, but not the source frame, may be unaligned for
+    // certain types of searches.
+    const Pixel *const src = source_data_ + GET_PARAM(3);
+    ReferenceAverage8x8_quad(src, source_stride_, x16_idx, y16_idx, expected);
+
+    aom_usec_timer timer;
+    int expected_c[4] = { 0 };
+    int actual[4] = { 0 };
+    AverageFunction_8x8_quad avg_c = aom_avg_8x8_quad_c;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < iterations; i++) {
+      avg_c(reinterpret_cast<const uint8_t *>(src), source_stride_, x16_idx,
+            y16_idx, expected_c);
+    }
+    aom_usec_timer_mark(&timer);
+    ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+    AverageFunction_8x8_quad avg_opt = GET_PARAM(5);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < iterations; i++) {
+      avg_opt(reinterpret_cast<const uint8_t *>(src), source_stride_, x16_idx,
+              y16_idx, actual);
+    }
+    aom_usec_timer_mark(&timer);
+    opt_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+    for (int k = 0; k < 4; k++) {
+      EXPECT_EQ(expected[k], actual[k]);
+      EXPECT_EQ(expected_c[k], actual[k]);
+    }
+
+    // Print scaling information only when Speed test is called.
+    if (iterations > 1) {
+      printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f\n",
+             static_cast<int>(ref_elapsed_time_),
+             static_cast<int>(opt_elapsed_time_),
+             (static_cast<float>(ref_elapsed_time_) /
+              static_cast<float>(opt_elapsed_time_)));
+    }
+  }
+
+  void TestConstantValue(Pixel value) {
+    FillConstant(value);
+    CheckAverages(1);
+  }
+
+  void TestRandom() {
     FillRandom();
-    CheckAverages();
+    CheckAverages(1);
+  }
+
+  void TestSpeed() {
+    FillRandom();
+    CheckAverages(1000000);
   }
+
+  int64_t ref_elapsed_time_ = 0;
+  int64_t opt_elapsed_time_ = 0;
+};
+
+using AverageTest8bpp = AverageTest<uint8_t>;
+
+TEST_P(AverageTest8bpp, MinValue) { TestConstantValue(0); }
+
+TEST_P(AverageTest8bpp, MaxValue) { TestConstantValue(255); }
+
+TEST_P(AverageTest8bpp, Random) { TestRandom(); }
+
+TEST_P(AverageTest8bpp, DISABLED_Speed) {
+  TestRandom(1000000);
+  PrintTimingStats();
 }
 
+using AvgTest8bpp_avg_8x8_quad = AverageTest_8x8_quad<uint8_t>;
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, MinValue) { TestConstantValue(0); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, MaxValue) { TestConstantValue(255); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, Random) { TestRandom(); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, DISABLED_Speed) { TestSpeed(); }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+using AverageTestHbd = AverageTest<uint16_t>;
+
+TEST_P(AverageTestHbd, MinValue) { TestConstantValue(0); }
+
+TEST_P(AverageTestHbd, MaxValue10bit) { TestConstantValue(1023); }
+TEST_P(AverageTestHbd, MaxValue12bit) { TestConstantValue(4095); }
+
+TEST_P(AverageTestHbd, Random) { TestRandom(); }
+
+TEST_P(AverageTestHbd, DISABLED_Speed) {
+  TestRandom(1000000);
+  PrintTimingStats();
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
 typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
                               const int ref_stride, const int height);
 
@@ -155,12 +362,14 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   virtual void SetUp() {
     source_data_ = static_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
+    ASSERT_NE(source_data_, nullptr);
 
     hbuf_asm_ = static_cast<int16_t *>(
         aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    ASSERT_NE(hbuf_asm_, nullptr);
     hbuf_c_ = static_cast<int16_t *>(
         aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+    ASSERT_NE(hbuf_c_, nullptr);
   }
 
   virtual void TearDown() {
@@ -173,10 +382,40 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }
 
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch\n";
+  }
+
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Height = %d number of iteration is %d \n", height_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      c_func_(hbuf_c_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      asm_func_(hbuf_asm_, source_data_, 0, height_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch";
+        << "Output mismatch\n";
   }
 
  private:
@@ -185,6 +424,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
 typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
 
@@ -201,10 +441,38 @@ class IntProColTest : public AverageTestBase<uint8_t>,
 
  protected:
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
     EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
   }
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Width = %d number of iteration is %d \n", width_, numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      sum_c_ = c_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      sum_asm_ = asm_func_(source_data_, width_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+  }
 
  private:
   IntProColFunc asm_func_;
@@ -212,6 +480,7 @@ class IntProColTest : public AverageTestBase<uint8_t>,
   int16_t sum_asm_;
   int16_t sum_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
 
 TEST_P(IntProRowTest, MinValue) {
   FillConstant(0);
@@ -228,6 +497,11 @@ TEST_P(IntProRowTest, Random) {
   RunComparison();
 }
 
+TEST_P(IntProRowTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+
 TEST_P(IntProColTest, MinValue) {
   FillConstant(0);
   RunComparison();
@@ -243,22 +517,188 @@ TEST_P(IntProColTest, Random) {
   RunComparison();
 }
 
+TEST_P(IntProColTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+class VectorVarTestBase : public ::testing::Test {
+ public:
+  explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
+  VectorVarTestBase() {}
+  ~VectorVarTestBase() {}
+
+ protected:
+  static const int kDataAlignment = 16;
+
+  virtual void SetUp() {
+    width = 4 << m_bwl;
+
+    ref_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
+    ASSERT_NE(ref_vector, nullptr);
+    src_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
+    ASSERT_NE(src_vector, nullptr);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() {
+    aom_free(ref_vector);
+    ref_vector = NULL;
+    aom_free(src_vector);
+    src_vector = NULL;
+  }
+
+  void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] = fill_constant_ref;
+      src_vector[i] = fill_constant_src;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] =
+          rnd_.Rand16() % max_range;  // acc. aom_vector_var_c brief.
+      src_vector[i] = rnd_.Rand16() % max_range;
+    }
+  }
+
+  int width;
+  int m_bwl;
+  int16_t *ref_vector;
+  int16_t *src_vector;
+  ACMRandom rnd_;
+
+  static const int max_range = 510;
+  static const int num_random_cmp = 50;
+};
+
+typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
+                             const int bwl);
+
+typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
+
+class VectorVarTest : public VectorVarTestBase,
+                      public ::testing::WithParamInterface<VecVarFunc> {
+ public:
+  VectorVarTest()
+      : VectorVarTestBase(GET_PARAM(0)), c_func(GET_PARAM(1)),
+        simd_func(GET_PARAM(2)) {}
+
+ protected:
+  int calcVarC() { return c_func(ref_vector, src_vector, m_bwl); }
+  int calcVarSIMD() { return simd_func(ref_vector, src_vector, m_bwl); }
+
+  VectorVarFunc c_func;
+  VectorVarFunc simd_func;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VectorVarTest);
+
+TEST_P(VectorVarTest, MaxVar) {
+  FillConstant(0, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, MaxVarRev) {
+  FillConstant(max_range, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff) {
+  FillConstant(0, 0);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, ZeroDiff2) {
+  FillConstant(max_range, max_range);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Constant) {
+  FillConstant(30, 90);
+  int c_var = calcVarC();
+  int simd_var = calcVarSIMD();
+  ASSERT_EQ(c_var, simd_var);
+}
+TEST_P(VectorVarTest, Random) {
+  for (size_t i = 0; i < num_random_cmp; i++) {
+    FillRandom();
+    int c_var = calcVarC();
+    int simd_var = calcVarSIMD();
+    ASSERT_EQ(c_var, simd_var);
+  }
+}
+TEST_P(VectorVarTest, DISABLED_Speed) {
+  FillRandom();
+  const int numIter = 50000;
+  printf("Width = %d number of iteration is %d \n", width, numIter);
+
+  int sum_c_var = 0;
+  int c_var = 0;
+
+  aom_usec_timer c_timer_;
+  aom_usec_timer_start(&c_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    c_var = calcVarC();
+    sum_c_var += c_var;
+  }
+  aom_usec_timer_mark(&c_timer_);
+
+  int simd_var = 0;
+  int sum_simd_var = 0;
+  aom_usec_timer simd_timer_;
+  aom_usec_timer_start(&simd_timer_);
+  for (size_t i = 0; i < numIter; i++) {
+    simd_var = calcVarSIMD();
+    sum_simd_var += simd_var;
+  }
+  aom_usec_timer_mark(&simd_timer_);
+
+  const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+  const int simd_sum_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+  printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+         simd_sum_time,
+         (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+  EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
+  EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
+}
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
-    C, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 1, 8, &aom_avg_8x8_c),
-                      make_tuple(16, 16, 1, 4, &aom_avg_4x4_c)));
+    C, AverageTest8bpp,
+    ::testing::Values(make_tuple(16, 16, 8, 1, 8, &aom_avg_8x8_c),
+                      make_tuple(16, 16, 8, 1, 4, &aom_avg_4x4_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvgTest8bpp_avg_8x8_quad,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_c),
+                      make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_c),
+                      make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_c)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
-    SSE2, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_sse2),
-                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_sse2),
-                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_sse2),
-                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
-                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
-                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+    SSE2, AverageTest8bpp,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 8, 5, 8, &aom_avg_8x8_sse2),
+                      make_tuple(32, 32, 8, 15, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 8, 0, 4, &aom_avg_4x4_sse2),
+                      make_tuple(16, 16, 8, 5, 4, &aom_avg_4x4_sse2),
+                      make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgTest8bpp_avg_8x8_quad,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_sse2),
+                      make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_sse2),
+                      make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_sse2)));
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProRowTest,
@@ -277,15 +717,328 @@ INSTANTIATE_TEST_SUITE_P(
                                  &aom_int_pro_col_c)));
 #endif
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AvgTest8bpp_avg_8x8_quad,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
+                      make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
+                      make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTest8bpp,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 8, &aom_avg_8x8_neon),
+                      make_tuple(16, 16, 8, 5, 8, &aom_avg_8x8_neon),
+                      make_tuple(32, 32, 8, 15, 8, &aom_avg_8x8_neon),
+                      make_tuple(16, 16, 8, 0, 4, &aom_avg_4x4_neon),
+                      make_tuple(16, 16, 8, 5, 4, &aom_avg_4x4_neon),
+                      make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_neon,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_neon,
+                                 &aom_int_pro_col_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvgTest8bpp_avg_8x8_quad,
+    ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_neon),
+                      make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_neon),
+                      make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_neon)));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    C, AverageTestHbd,
+    ::testing::Values(make_tuple(16, 16, 10, 1, 8, &aom_highbd_avg_8x8_c),
+                      make_tuple(16, 16, 10, 1, 4, &aom_highbd_avg_4x4_c),
+                      make_tuple(16, 16, 12, 1, 8, &aom_highbd_avg_8x8_c),
+                      make_tuple(16, 16, 12, 1, 4, &aom_highbd_avg_4x4_c)));
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
-    NEON, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_neon),
-                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_neon),
-                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_neon),
-                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
-                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
-                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
+    NEON, AverageTestHbd,
+    ::testing::Values(make_tuple(16, 16, 10, 0, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(16, 16, 10, 5, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(32, 32, 10, 15, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(16, 16, 12, 0, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(16, 16, 12, 5, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(32, 32, 12, 15, 4, &aom_highbd_avg_4x4_neon)));
+#endif  // HAVE_NEON
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
+
+template <typename SatdFuncType>
+struct SatdTestParam {
+  SatdTestParam(int s, SatdFuncType f1, SatdFuncType f2)
+      : satd_size(s), func_ref(f1), func_simd(f2) {}
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const SatdTestParam<SatdFuncType> &param) {
+    return os << "satd_size: " << param.satd_size;
+  }
+  int satd_size;
+  SatdFuncType func_ref;
+  SatdFuncType func_simd;
+};
+
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+  explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+    satd_size_ = func_param.satd_size;
+    satd_func_ref_ = func_param.func_ref;
+    satd_func_simd_ = func_param.func_simd;
+  }
+  virtual void SetUp() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, sizeof(*src_) * satd_size_));
+    ASSERT_NE(src_, nullptr);
+  }
+  virtual void TearDown() { aom_free(src_); }
+  void FillConstant(const CoeffType val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) {
+      src_[i] = static_cast<int16_t>(rnd_.Rand16());
+    }
+  }
+  void Check(int expected) {
+    int total_ref;
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    EXPECT_EQ(expected, total_ref);
+
+    int total_simd;
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    EXPECT_EQ(expected, total_simd);
+  }
+  void RunComparison() {
+    int total_ref;
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+
+    int total_simd;
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+
+    EXPECT_EQ(total_ref, total_simd);
+  }
+  void RunSpeedTest() {
+    const int numIter = 500000;
+    printf("size = %d number of iteration is %d \n", satd_size_, numIter);
+
+    int total_ref;
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      total_ref = satd_func_ref_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    int total_simd;
+    aom_usec_timer simd_timer_;
+    aom_usec_timer_start(&simd_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      total_simd = satd_func_simd_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&simd_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int simd_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+    EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
+  }
+  int satd_size_;
+
+ private:
+  CoeffType *src_;
+  SatdFuncType satd_func_ref_;
+  SatdFuncType satd_func_simd_;
+  ACMRandom rnd_;
+};
+
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
+ public:
+  SatdTest() : SatdTestBase(GetParam()) {}
+};
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
+
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_neon)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VectorVarTest,
+    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
+                      make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_sse2)));
+#endif
+
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {
+ public:
+  SatdLpTest() : SatdTestBase(GetParam()) {}
+};
+
+TEST_P(SatdLpTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdLpTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdLpTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdLpTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdLpTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following c test to avoid gtest uninitialized warning.
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
 #endif
 
 }  // namespace
diff --git a/media/libaom/src/test/best_encode.sh b/media/libaom/src/test/best_encode.sh
index fe31a01cb9..d29fdaed52 100644..100755
--- a/media/libaom/src/test/best_encode.sh
+++ b/media/libaom/src/test/best_encode.sh
@@ -29,7 +29,7 @@ if [[ -e $f.fpf ]]; then
     -p 2 \
     --pass=2 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -48,8 +48,7 @@ if [[ -e $f.fpf ]]; then
     --maxsection-pct=800 \
     --psnr \
     --arnr-maxframes=7 \
-    --arnr-strength=3 \
-    --arnr-type=3
+    --arnr-strength=3
 else
   # No first-pass file found, do 2-pass encode
   aomenc \
@@ -58,7 +57,7 @@ else
     -p 2 \
     --pass=1 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -79,7 +78,7 @@ else
     -p 2 \
     --pass=2 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -98,6 +97,5 @@ else
     --maxsection-pct=800 \
     --psnr \
     --arnr-maxframes=7 \
-    --arnr-strength=3 \
-    --arnr-type=3
+    --arnr-strength=3
 fi
diff --git a/media/libaom/src/test/blend_a64_mask_1d_test.cc b/media/libaom/src/test/blend_a64_mask_1d_test.cc
index 1b6350c793..9a9598704a 100644
--- a/media/libaom/src/test/blend_a64_mask_1d_test.cc
+++ b/media/libaom/src/test/blend_a64_mask_1d_test.cc
@@ -125,7 +125,7 @@ class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
     params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
                      src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
                      w_, h_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+    API_REGISTER_STATE_CHECK(params_.tst_func(
         dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
         src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_));
   }
@@ -232,7 +232,7 @@ class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
                      CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
                      CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
                      mask_, w_, h_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+    API_REGISTER_STATE_CHECK(params_.tst_func(
         CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
         CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
         CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_,
diff --git a/media/libaom/src/test/blend_a64_mask_test.cc b/media/libaom/src/test/blend_a64_mask_test.cc
index 5c2c291fde..fc45664a97 100644
--- a/media/libaom/src/test/blend_a64_mask_test.cc
+++ b/media/libaom/src/test/blend_a64_mask_test.cc
@@ -190,6 +190,7 @@ class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
 
 TEST_P(BlendA64MaskTest8B, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
@@ -304,6 +305,7 @@ class BlendA64MaskTest8B_d16
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
 
 TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
@@ -404,6 +406,7 @@ class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
 
   int bit_depth_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTestHBD);
 
 TEST_P(BlendA64MaskTestHBD, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
diff --git a/media/libaom/src/test/blockd_test.cc b/media/libaom/src/test/block_test.cc
index 17e6968630..74deee3f54 100644
--- a/media/libaom/src/test/blockd_test.cc
+++ b/media/libaom/src/test/block_test.cc
@@ -9,8 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom/aom_codec.h"
 #include "av1/common/blockd.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
 
 // Verify the optimized implementation of get_partition_subsize() produces the
 // same results as the Partition_Subsize lookup table in the spec.
@@ -120,3 +125,85 @@ TEST(BlockdTest, GetPartitionSubsize) {
     }
   }
 }
+
+#if CONFIG_AV1_DECODER && CONFIG_AV1_ENCODER
+namespace {
+// This class is used to validate if sb_size configured is respected
+// in the bitstream
+class SuperBlockSizeTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, aom_superblock_size_t, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SuperBlockSizeTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        superblock_size_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    sb_size_violated_ = false;
+  }
+  virtual ~SuperBlockSizeTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, superblock_size_);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec &&
+        superblock_size_ != AOM_SUPERBLOCK_SIZE_DYNAMIC) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_superblock_size_t sb_size;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SB_SIZE, &sb_size);
+      if (superblock_size_ != sb_size) {
+        sb_size_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  aom_superblock_size_t superblock_size_;
+  bool sb_size_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(SuperBlockSizeTestLarge, SuperBlockSizeTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(sb_size_violated_, false)
+      << "Failed for SB size " << superblock_size_;
+}
+
+const ::libaom_test::TestMode kTestModes[] = {
+#if CONFIG_REALTIME_ONLY
+  ::libaom_test::kRealTime
+#else
+  ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+  ::libaom_test::kTwoPassGood
+#endif
+};
+
+AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge,
+                           ::testing::ValuesIn(kTestModes),
+                           ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64,
+                                             AOM_SUPERBLOCK_SIZE_128X128),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+}  // namespace
+#endif
diff --git a/media/libaom/src/test/borders_test.cc b/media/libaom/src/test/borders_test.cc
index 31eacab12e..bf9cc8b1ae 100644
--- a/media/libaom/src/test/borders_test.cc
+++ b/media/libaom/src/test/borders_test.cc
@@ -26,10 +26,7 @@ class BordersTestLarge
   BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
   virtual ~BordersTestLarge() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-  }
+  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
@@ -80,6 +77,6 @@ TEST_P(BordersTestLarge, TestLowBitrate) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(BordersTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood));
 }  // namespace
diff --git a/media/libaom/src/test/cdef_test.cc b/media/libaom/src/test/cdef_test.cc
index a2ec1e31e7..3f971be43e 100644
--- a/media/libaom/src/test/cdef_test.cc
+++ b/media/libaom/src/test/cdef_test.cc
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <array>
 #include <cstdlib>
+#include <iostream>
 #include <string>
 #include <tuple>
 
@@ -21,7 +23,6 @@
 #include "aom_ports/aom_timer.h"
 #include "av1/common/cdef_block.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -29,8 +30,10 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-typedef std::tuple<cdef_filter_block_func, cdef_filter_block_func, BLOCK_SIZE,
-                   int, int>
+using CdefFilterBlockFunctions = std::array<cdef_filter_block_func, 4>;
+
+typedef std::tuple<CdefFilterBlockFunctions, CdefFilterBlockFunctions,
+                   BLOCK_SIZE, int, int>
     cdef_dir_param_t;
 
 class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
@@ -44,20 +47,30 @@ class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
     depth = GET_PARAM(4);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   int bsize;
   int boundary;
   int depth;
-  cdef_filter_block_func cdef;
-  cdef_filter_block_func ref_cdef;
+  CdefFilterBlockFunctions cdef;
+  CdefFilterBlockFunctions ref_cdef;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFBlockTest);
+
+typedef CDEFBlockTest CDEFBlockHighbdTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFBlockHighbdTest);
 
 typedef CDEFBlockTest CDEFSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedTest);
 
-void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
-               cdef_filter_block_func ref_cdef, int boundary, int depth) {
+typedef CDEFBlockTest CDEFSpeedHighbdTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedHighbdTest);
+
+int64_t test_cdef(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+                  CdefFilterBlockFunctions ref_cdef, int boundary, int depth) {
+  aom_usec_timer ref_timer;
+  int64_t ref_elapsed_time = 0;
   const int size = 8;
   const int ysize = size + 2 * CDEF_VBORDER;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -73,6 +86,10 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
       errpridamping = 0, errsecdamping = 0;
   unsigned int pos = 0;
 
+  const int block_width =
+      ((bsize == BLOCK_8X8) || (bsize == BLOCK_8X4)) ? 8 : 4;
+  const int block_height =
+      ((bsize == BLOCK_8X8) || (bsize == BLOCK_4X8)) ? 8 : 4;
   const unsigned int max_pos = size * size >> static_cast<int>(depth == 8);
   for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8;
        pridamping++) {
@@ -114,19 +131,26 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
                 for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error;
                      secstrength += 1 << (depth - 8)) {
                   if (secstrength == 3 << (depth - 8)) continue;
-                  ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
-                           s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                           pristrength, secstrength, dir, pridamping,
-                           secdamping, bsize, depth - 8);
+
+                  const int strength_index =
+                      (secstrength == 0) | ((pristrength == 0) << 1);
+
+                  aom_usec_timer_start(&ref_timer);
+                  ref_cdef[strength_index](
+                      ref_d, size,
+                      s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                      pristrength, secstrength, dir, pridamping, secdamping,
+                      depth - 8, block_width, block_height);
+                  aom_usec_timer_mark(&ref_timer);
+                  ref_elapsed_time += aom_usec_timer_elapsed(&ref_timer);
                   // If cdef and ref_cdef are the same, we're just testing
                   // speed
-                  if (cdef != ref_cdef)
-                    ASM_REGISTER_STATE_CHECK(
-                        cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
-                             s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                             pristrength, secstrength, dir, pridamping,
-                             secdamping, bsize, depth - 8));
-                  if (ref_cdef != cdef) {
+                  if (cdef[0] != ref_cdef[0])
+                    API_REGISTER_STATE_CHECK(cdef[strength_index](
+                        d, size, s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                        pristrength, secstrength, dir, pridamping, secdamping,
+                        depth - 8, block_width, block_height));
+                  if (ref_cdef[0] != cdef[0]) {
                     for (pos = 0; pos < max_pos && !error; pos++) {
                       error = ref_d[pos] != d[pos];
                       errdepth = depth;
@@ -160,22 +184,21 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
                       << "size: " << bsize << std::endl
                       << "boundary: " << errboundary << std::endl
                       << std::endl;
+
+  return ref_elapsed_time;
 }
 
-void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
-                     cdef_filter_block_func ref_cdef, int boundary, int depth) {
-  aom_usec_timer ref_timer;
-  aom_usec_timer timer;
+void test_cdef_speed(int bsize, int iterations, CdefFilterBlockFunctions cdef,
+                     CdefFilterBlockFunctions ref_cdef, int boundary,
+                     int depth) {
+  int64_t ref_elapsed_time =
+      test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth);
 
-  aom_usec_timer_start(&ref_timer);
-  test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth);
-  aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+  int64_t elapsed_time =
+      test_cdef(bsize, iterations, cdef, cdef, boundary, depth);
 
-  aom_usec_timer_start(&timer);
-  test_cdef(bsize, iterations, cdef, cdef, boundary, depth);
-  aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+  std::cout << "C time: " << ref_elapsed_time << " us" << std::endl
+            << "SIMD time: " << elapsed_time << " us" << std::endl;
 
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
@@ -196,14 +219,16 @@ class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
     ref_finddir = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   find_dir_t finddir;
   find_dir_t ref_finddir;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirTest);
 
 typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirSpeedTest);
 
 void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
                                  int coeff_shift),
@@ -229,7 +254,7 @@ void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
           for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
             ref_res = ref_finddir(s, size, &ref_var, depth - 8);
           if (finddir != ref_finddir)
-            ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+            API_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
           if (ref_finddir != finddir) {
             if (res != ref_res || var != ref_var) error = 1;
             errdepth = depth;
@@ -257,12 +282,12 @@ void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
   aom_usec_timer_start(&ref_timer);
   test_finddir(ref_finddir, ref_finddir);
   aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+  int64_t ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
 
   aom_usec_timer_start(&timer);
   test_finddir(finddir, finddir);
   aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+  int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
 
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
@@ -270,14 +295,132 @@ void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
       << "SIMD time: " << elapsed_time << " us" << std::endl;
 }
 
+typedef void (*find_dir_dual_t)(const uint16_t *img1, const uint16_t *img2,
+                                int stride, int32_t *var1, int32_t *var2,
+                                int coeff_shift, int *out1, int *out2);
+
+typedef std::tuple<find_dir_dual_t, find_dir_dual_t> find_dir_dual_param_t;
+
+class CDEFFindDirDualTest
+    : public ::testing::TestWithParam<find_dir_dual_param_t> {
+ public:
+  virtual ~CDEFFindDirDualTest() {}
+  virtual void SetUp() {
+    finddir = GET_PARAM(0);
+    ref_finddir = GET_PARAM(1);
+  }
+
+  virtual void TearDown() {}
+
+ protected:
+  find_dir_dual_t finddir;
+  find_dir_dual_t ref_finddir;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirDualTest);
+
+typedef CDEFFindDirDualTest CDEFFindDirDualSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirDualSpeedTest);
+
+void test_finddir_dual(
+    void (*finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                    int32_t *var1, int32_t *var2, int coeff_shift, int *out1,
+                    int *out2),
+    void (*ref_finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                        int32_t *var1, int32_t *var2, int coeff_shift,
+                        int *out1, int *out2)) {
+  const int size_wd = 16;
+  const int size_ht = 8;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, s[size_ht * size_wd]);
+
+  int error = 0, errdepth = 0;
+  int32_t ref_var[2] = { 0 };
+  int ref_dir[2] = { 0 };
+  int32_t var[2] = { 0 };
+  int dir[2] = { 0 };
+
+  for (int depth = 8; depth <= 12 && !error; depth += 2) {
+    for (int count = 0; count < 512 && !error; count++) {
+      for (int level = 0; level < (1 << depth) && !error;
+           level += 1 << (depth - 8)) {
+        for (int bits = 1; bits <= depth && !error; bits++) {
+          for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+            s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                         (1 << depth) - 1);
+          for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+            ref_finddir(s, s + 8, size_wd, &ref_var[0], &ref_var[1], depth - 8,
+                        &ref_dir[0], &ref_dir[1]);
+          if (finddir != ref_finddir)
+            API_REGISTER_STATE_CHECK(finddir(s, s + 8, size_wd, &var[0],
+                                             &var[1], depth - 8, &dir[0],
+                                             &dir[1]));
+          if (ref_finddir != finddir) {
+            for (int j = 0; j < 2; j++) {
+              if (ref_dir[j] != dir[j] || ref_var[j] != var[j]) error = 1;
+            }
+            errdepth = depth;
+          }
+        }
+      }
+    }
+  }
+
+  for (int j = 0; j < 2; j++) {
+    EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+                        << std::endl
+                        << "direction: " << dir[j] << " : " << ref_dir[j]
+                        << std::endl
+                        << "variance: " << var[j] << " : " << ref_var[j]
+                        << std::endl
+                        << "depth: " << errdepth << std::endl
+                        << std::endl;
+  }
+}
+
+void test_finddir_dual_speed(
+    void (*finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                    int32_t *var1, int32_t *var2, int coeff_shift, int *out1,
+                    int *out2),
+    void (*ref_finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                        int32_t *var1, int32_t *var2, int coeff_shift,
+                        int *out1, int *out2)) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  test_finddir_dual(ref_finddir, ref_finddir);
+  aom_usec_timer_mark(&ref_timer);
+  const double ref_elapsed_time =
+      static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&timer);
+  test_finddir_dual(finddir, finddir);
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf(
+      "ref_time=%lf \t simd_time=%lf \t "
+      "gain=%lf \n",
+      ref_elapsed_time, elapsed_time, ref_elapsed_time / elapsed_time);
+}
+
 TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
   test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
 }
 
+TEST_P(CDEFBlockHighbdTest, TestSIMDHighbdNoMismatch) {
+  test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
+}
+
 TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
   test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
 }
 
+TEST_P(CDEFSpeedHighbdTest, DISABLED_TestSpeed) {
+  test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
+}
+
 TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
   test_finddir(finddir, ref_finddir);
 }
@@ -286,141 +429,309 @@ TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
   test_finddir_speed(finddir, ref_finddir);
 }
 
+TEST_P(CDEFFindDirDualTest, TestSIMDNoMismatch) {
+  test_finddir_dual(finddir, ref_finddir);
+}
+
+TEST_P(CDEFFindDirDualSpeedTest, DISABLED_TestSpeed) {
+  test_finddir_dual_speed(finddir, ref_finddir);
+}
+
 using std::make_tuple;
 
-// VS compiling for 32 bit targets does not support vector types in
-// structs as arguments, which makes the v256 type of the intrinsics
-// hard to support, so optimizations for this target are disabled.
-#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+#if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
+  { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
+    &cdef_filter_8_3_c }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncC[] = {
+  { &cdef_filter_16_0_c, &cdef_filter_16_0_c, &cdef_filter_16_0_c,
+    &cdef_filter_16_0_c }
+};
+#endif
+
 #if HAVE_SSE2
+static const CdefFilterBlockFunctions kCdefFilterFuncSse2[] = {
+  { &cdef_filter_8_0_sse2, &cdef_filter_8_1_sse2, &cdef_filter_8_2_sse2,
+    &cdef_filter_8_3_sse2 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse2[] = {
+  { &cdef_filter_16_0_sse2, &cdef_filter_16_1_sse2, &cdef_filter_16_2_sse2,
+    &cdef_filter_16_3_sse2 }
+};
+
 INSTANTIATE_TEST_SUITE_P(
     SSE2, CDEFBlockTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
 INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_sse2,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
+                                                      &cdef_find_dir_dual_c)));
 #endif
+
 #if HAVE_SSSE3
+static const CdefFilterBlockFunctions kCdefFilterFuncSsse3[] = {
+  { &cdef_filter_8_0_ssse3, &cdef_filter_8_1_ssse3, &cdef_filter_8_2_ssse3,
+    &cdef_filter_8_3_ssse3 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSsse3[] = {
+  { &cdef_filter_16_0_ssse3, &cdef_filter_16_1_ssse3, &cdef_filter_16_2_ssse3,
+    &cdef_filter_16_3_ssse3 }
+};
+
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, CDEFBlockTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
 INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_SSE4_1
+static const CdefFilterBlockFunctions kCdefFilterFuncSse4_1[] = {
+  { &cdef_filter_8_0_sse4_1, &cdef_filter_8_1_sse4_1, &cdef_filter_8_2_sse4_1,
+    &cdef_filter_8_3_sse4_1 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse4_1[] = {
+  { &cdef_filter_16_0_sse4_1, &cdef_filter_16_1_sse4_1,
+    &cdef_filter_16_2_sse4_1, &cdef_filter_16_3_sse4_1 }
+};
+
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, CDEFBlockTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
 INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFFindDirDualTest,
+    ::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
+                                 &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_AVX2
+static const CdefFilterBlockFunctions kCdefFilterFuncAvx2[] = {
+  { &cdef_filter_8_0_avx2, &cdef_filter_8_1_avx2, &cdef_filter_8_2_avx2,
+    &cdef_filter_8_3_avx2 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncAvx2[] = {
+  { &cdef_filter_16_0_avx2, &cdef_filter_16_1_avx2, &cdef_filter_16_2_avx2,
+    &cdef_filter_16_3_avx2 }
+};
+
 INSTANTIATE_TEST_SUITE_P(
     AVX2, CDEFBlockTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
 INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_avx2,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_NEON
+static const CdefFilterBlockFunctions kCdefFilterFuncNeon[] = {
+  { &cdef_filter_8_0_neon, &cdef_filter_8_1_neon, &cdef_filter_8_2_neon,
+    &cdef_filter_8_3_neon }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncNeon[] = {
+  { &cdef_filter_16_0_neon, &cdef_filter_16_1_neon, &cdef_filter_16_2_neon,
+    &cdef_filter_16_3_neon }
+};
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, CDEFBlockTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
 INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_neon,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 // Test speed for all supported architectures
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, CDEFSpeedTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(10)));
 INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_sse2,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, CDEFSpeedTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(10)));
 INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, CDEFSpeedTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(10)));
 INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirSpeedTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFFindDirDualSpeedTest,
+    ::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
+                                 &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, CDEFSpeedTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(10)));
 INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirSpeedTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_avx2,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, CDEFSpeedTest,
-    ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
-                       ::testing::Values(&cdef_filter_block_c),
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
                        ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
                                          BLOCK_8X8),
-                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+                       ::testing::Range(0, 16), ::testing::Values(10)));
 INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirSpeedTest,
                          ::testing::Values(make_tuple(&cdef_find_dir_neon,
                                                       &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
+                                                      &cdef_find_dir_dual_c)));
 #endif
 
-#endif  // defined(_WIN64) || !defined(_MSC_VER)
 }  // namespace
diff --git a/media/libaom/src/test/cfl_test.cc b/media/libaom/src/test/cfl_test.cc
index d2973159c8..98cc9ab838 100644
--- a/media/libaom/src/test/cfl_test.cc
+++ b/media/libaom/src/test/cfl_test.cc
@@ -125,28 +125,33 @@ class CFLTestWithData : public CFLTest {
 template <typename I>
 class CFLTestWithAlignedData : public CFLTest {
  public:
-  CFLTestWithAlignedData() {
+  ~CFLTestWithAlignedData() {
+    aom_free(chroma_pels_ref);
+    aom_free(sub_luma_pels_ref);
+    aom_free(chroma_pels);
+    aom_free(sub_luma_pels);
+  }
+
+ protected:
+  void init() {
     chroma_pels_ref =
         reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    ASSERT_NE(chroma_pels_ref, nullptr);
     chroma_pels =
         reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    ASSERT_NE(chroma_pels, nullptr);
     sub_luma_pels_ref = reinterpret_cast<int16_t *>(
         aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    ASSERT_NE(sub_luma_pels_ref, nullptr);
     sub_luma_pels = reinterpret_cast<int16_t *>(
         aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    ASSERT_NE(sub_luma_pels, nullptr);
     memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE);
     memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE);
     memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
     memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
   }
-  ~CFLTestWithAlignedData() {
-    aom_free(chroma_pels_ref);
-    aom_free(sub_luma_pels_ref);
-    aom_free(chroma_pels);
-    aom_free(sub_luma_pels);
-  }
 
- protected:
   I *chroma_pels_ref;
   I *chroma_pels;
   int16_t *sub_luma_pels_ref;
@@ -183,6 +188,7 @@ class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
   cfl_subtract_average_fn sub_avg;
   cfl_subtract_average_fn sub_avg_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubAvgTest);
 
 TEST_P(CFLSubAvgTest, SubAvgTest) {
   for (int it = 0; it < NUM_ITERATIONS; it++) {
@@ -286,6 +292,7 @@ class CFLSubsampleLBDTest
     fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleLBDTest);
 
 TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
   subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
@@ -329,6 +336,7 @@ class CFLSubsampleHBDTest
     fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size);
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleHBDTest);
 
 TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) {
   subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
@@ -363,6 +371,7 @@ class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
  public:
   virtual void SetUp() {
     CFLTest::init(std::get<0>(this->GetParam()));
+    CFLTestWithAlignedData::init();
     predict = std::get<1>(this->GetParam())(tx_size);
     predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
   }
@@ -372,6 +381,7 @@ class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
   cfl_predict_lbd_fn predict;
   cfl_predict_lbd_fn predict_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictTest);
 
 TEST_P(CFLPredictTest, PredictTest) {
   for (int it = 0; it < NUM_ITERATIONS; it++) {
@@ -410,6 +420,7 @@ class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
  public:
   virtual void SetUp() {
     CFLTest::init(std::get<0>(this->GetParam()));
+    CFLTestWithAlignedData::init();
     predict = std::get<1>(this->GetParam())(tx_size);
     predict_ref = cfl_get_predict_hbd_fn_c(tx_size);
   }
@@ -419,6 +430,7 @@ class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
   cfl_predict_hbd_fn predict;
   cfl_predict_hbd_fn predict_ref;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictHBDTest);
 
 TEST_P(CFLPredictHBDTest, PredictHBDTest) {
   int bd = 12;
diff --git a/media/libaom/src/test/clear_system_state.h b/media/libaom/src/test/clear_system_state.h
deleted file mode 100644
index d38ff5dd51..0000000000
--- a/media/libaom/src/test/clear_system_state.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_
-#define AOM_TEST_CLEAR_SYSTEM_STATE_H_
-
-#include "config/aom_config.h"
-
-#if ARCH_X86 || ARCH_X86_64
-#include "aom_ports/x86.h"
-#endif
-
-namespace libaom_test {
-
-// Reset system to a known state. This function should be used for all non-API
-// test cases.
-inline void ClearSystemState() {
-#if ARCH_X86 || ARCH_X86_64
-  aom_reset_mmx_state();
-#endif
-}
-
-}  // namespace libaom_test
-#endif  // AOM_TEST_CLEAR_SYSTEM_STATE_H_
diff --git a/media/libaom/src/test/cnn_test.cc b/media/libaom/src/test/cnn_test.cc
index 4410493d30..0b92197447 100644
--- a/media/libaom/src/test/cnn_test.cc
+++ b/media/libaom/src/test/cnn_test.cc
@@ -17,7 +17,12 @@
 
 #include "config/av1_rtcd.h"
 
+#include "aom_ports/aom_timer.h"
 #include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/util.h"
 
 #define SQR(x) ((x) * (x))
 
@@ -28,6 +33,9 @@
 #define MSE_FLOAT_TOL 1E-6
 #define MSE_INT_TOL 0
 
+// CNN convolve pixelwise error threshold for functional equivalence.
+#define CNN_CONVOLVE_PIXELWISE_FLOAT_TOL 1E-3f
+
 namespace {
 
 class CNNTest : public ::testing::Test {
@@ -45,6 +53,7 @@ class CNNTest : public ::testing::Test {
 
     float *output_ =
         (float *)aom_malloc(sizeof(*output_) * out_size * out_channels);
+    ASSERT_NE(output_, nullptr);
     float *output[CNN_MAX_CHANNELS] = { nullptr };
     for (int channel = 0; channel < out_channels; ++channel) {
       output[channel] = output_ + (channel * out_size);
@@ -73,11 +82,14 @@ class CNNTest : public ::testing::Test {
     int *out_widths = (int *)aom_calloc(num_outputs, sizeof(*out_widths));
     int *out_heights = (int *)aom_calloc(num_outputs, sizeof(*out_heights));
     int *not_used = (int *)aom_calloc(num_outputs, sizeof(*not_used));
+    ASSERT_NE(out_widths, nullptr);
+    ASSERT_NE(out_heights, nullptr);
+    ASSERT_NE(not_used, nullptr);
 
     av1_find_cnn_output_size(image_width, image_height, cnn_config, out_widths,
                              out_heights, not_used);
-    av1_cnn_predict(input, image_width, image_height, in_stride, cnn_config,
-                    thread_data, output);
+    ASSERT_TRUE(av1_cnn_predict(input, image_width, image_height, in_stride,
+                                cnn_config, thread_data, output));
 
     int channel_offset = 0;
     for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
@@ -2477,6 +2489,7 @@ TEST_F(CNNTest, TestMultiOutput) {
   float *const output_ = (float *)aom_malloc(
       sizeof(*output_) *
       (output_sizes[0] + output_sizes[1] + output_sizes[2] + output_sizes[3]));
+  ASSERT_NE(output_, nullptr);
   float *output[CNN_MAX_CHANNELS] = { nullptr };
   int ch_ite = 0;
   float *output_ite = output_;
@@ -2494,3 +2507,148 @@ TEST_F(CNNTest, TestMultiOutput) {
 
   aom_free(output_);
 }
+
+namespace {
+
+typedef void (*CNNConvolveNoMaxpoolPaddingValidFunc)(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step);
+
+typedef libaom_test::FuncParam<CNNConvolveNoMaxpoolPaddingValidFunc>
+    CNNConvolveTestFuncs;
+
+class CNNConvolveTest : public ::testing::TestWithParam<CNNConvolveTestFuncs> {
+ protected:
+  virtual void SetUp() { params_ = GetParam(); }
+
+  void RunCNNConvolveSetup(int run_times) {
+    int in_width = 65;
+    int in_height = 65;
+
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+      int out_width = 0, out_height = 0;
+      int in_size = in_width * in_height;
+      // Get current layer output width and height.
+      av1_find_cnn_layer_output_size(in_height, in_width,
+                                     &cnn_config->layer_config[layer],
+                                     &out_width, &out_height);
+
+      int out_size = out_width * out_height;
+      float *input[20], *output_ref[20], *output_mod[20];
+
+      float *input_data =
+          (float *)aom_malloc(sizeof(*input_data) * in_size *
+                              cnn_config->layer_config[layer].in_channels);
+      float *temp_ptr = input_data;
+      ASSERT_NE(temp_ptr, nullptr);
+      for (int i = 0; i < cnn_config->layer_config[layer].in_channels; ++i) {
+        input[i] = temp_ptr;
+        for (int j = 0; j < in_size; j++) {
+          *(temp_ptr++) = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+        }
+      }
+
+      float *out_data_ref = (float *)aom_calloc(
+          sizeof(*out_data_ref),
+          out_size * cnn_config->layer_config[layer].out_channels);
+      ASSERT_NE(out_data_ref, nullptr);
+      float *out_data_mod = (float *)aom_calloc(
+          sizeof(*out_data_mod),
+          out_size * cnn_config->layer_config[layer].out_channels);
+      ASSERT_NE(out_data_mod, nullptr);
+      float *temp_ptr1 = out_data_ref;
+      float *temp_ptr2 = out_data_mod;
+      for (int i = 0; i < cnn_config->layer_config[layer].out_channels; ++i) {
+        output_ref[i] = temp_ptr1;
+        output_mod[i] = temp_ptr2;
+        temp_ptr1 += out_size;
+        temp_ptr2 += out_size;
+      }
+
+      RunCNNConvolveTest(input, in_width, in_height, out_size,
+                         &cnn_config->layer_config[layer], 0, 1, run_times,
+                         layer, output_ref, output_mod, out_width);
+
+      // Set current layer output width and height as next layer input width and
+      // height.
+      in_width = out_width;
+      in_height = out_height;
+
+      aom_free(input_data);
+      aom_free(out_data_ref);
+      aom_free(out_data_mod);
+    }
+  }
+
+  void RunCNNConvolveTest(float **input, int in_width, int in_height,
+                          int out_size, const CNN_LAYER_CONFIG *layer_config,
+                          int start_idx, int step, int run_times, int layer,
+                          float **output_ref, float **output_mod,
+                          int out_stride) {
+    const int cstep = layer_config->in_channels * layer_config->out_channels;
+    const int channel_step = AOMMAX(step, 1);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func((const float **)input, in_width, in_height, in_width,
+                       layer_config, output_ref, out_stride, start_idx, cstep,
+                       channel_step);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.tst_func((const float **)input, in_width, in_height, in_width,
+                       layer_config, output_mod, out_stride, start_idx, cstep,
+                       channel_step);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    if (run_times > 1) {
+      printf("layer : %d \n", layer);
+      printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+    } else {
+      for (int channel = 0; channel < layer_config->out_channels; ++channel) {
+        const float *buf_ref = output_ref[channel];
+        const float *buf_mod = output_mod[channel];
+
+        for (int i = 0; i < out_size; ++i) {
+          if (buf_ref[i] < CNN_CONVOLVE_PIXELWISE_FLOAT_TOL) {
+            ASSERT_LE(buf_ref[i], CNN_CONVOLVE_PIXELWISE_FLOAT_TOL)
+                << "Reference output was near-zero, test output was not ("
+                << buf_mod[i] << ")";
+          } else {
+            const float error = buf_ref[i] - buf_mod[i];
+            const float relative_error = fabsf(error / buf_ref[i]);
+            ASSERT_LE(relative_error, CNN_CONVOLVE_PIXELWISE_FLOAT_TOL)
+                << " channel " << channel << " pixel " << i << ": "
+                << buf_ref[i] << "/" << buf_mod[i] << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  CNNConvolveTestFuncs params_;
+  libaom_test::ACMRandom rng_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CNNConvolveTest);
+
+TEST_P(CNNConvolveTest, CheckOutput) { RunCNNConvolveSetup(1); }
+
+TEST_P(CNNConvolveTest, DISABLED_Speed) { RunCNNConvolveSetup(100000); }
+
+#if HAVE_AVX2 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+INSTANTIATE_TEST_SUITE_P(AVX2, CNNConvolveTest,
+                         ::testing::Values(CNNConvolveTestFuncs(
+                             &av1_cnn_convolve_no_maxpool_padding_valid_c,
+                             &av1_cnn_convolve_no_maxpool_padding_valid_avx2)));
+#endif
+
+}  // namespace
diff --git a/media/libaom/src/test/codec_factory.h b/media/libaom/src/test/codec_factory.h
index 801b8948fe..5ceb70bf14 100644
--- a/media/libaom/src/test/codec_factory.h
+++ b/media/libaom/src/test/codec_factory.h
@@ -161,7 +161,7 @@ class AV1CodecFactory : public CodecFactory {
 
 const libaom_test::AV1CodecFactory kAV1;
 
-#define AV1_INSTANTIATE_TEST_CASE(test, ...)                                \
+#define AV1_INSTANTIATE_TEST_SUITE(test, ...)                               \
   INSTANTIATE_TEST_SUITE_P(                                                 \
       AV1, test,                                                            \
       ::testing::Combine(                                                   \
diff --git a/media/libaom/src/test/coding_path_sync.cc b/media/libaom/src/test/coding_path_sync.cc
index 4c613dc03b..0eaa9dad8d 100644
--- a/media/libaom/src/test/coding_path_sync.cc
+++ b/media/libaom/src/test/coding_path_sync.cc
@@ -31,7 +31,11 @@ class CompressedSource {
     aom_codec_iface_t *algo = aom_codec_av1_cx();
 
     aom_codec_enc_cfg_t cfg;
+#if CONFIG_REALTIME_ONLY
+    aom_codec_enc_config_default(algo, &cfg, 1);
+#else
     aom_codec_enc_config_default(algo, &cfg, 0);
+#endif
 
     // force the quantizer, to reduce the sensitivity on encoding choices.
     // e.g, we don't want this test to break when the rate control is modified.
diff --git a/media/libaom/src/test/comp_avg_pred_test.cc b/media/libaom/src/test/comp_avg_pred_test.cc
index ac625a79d3..4218ac316d 100644
--- a/media/libaom/src/test/comp_avg_pred_test.cc
+++ b/media/libaom/src/test/comp_avg_pred_test.cc
@@ -13,10 +13,15 @@
 
 using libaom_test::ACMRandom;
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
 #if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(
+    AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest);
 #endif
 using std::make_tuple;
 using std::tuple;
diff --git a/media/libaom/src/test/comp_avg_pred_test.h b/media/libaom/src/test/comp_avg_pred_test.h
index 7f73312c4e..312f3d49e1 100644
--- a/media/libaom/src/test/comp_avg_pred_test.h
+++ b/media/libaom/src/test/comp_avg_pred_test.h
@@ -15,11 +15,11 @@
 #include <tuple>
 
 #include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "av1/common/common_data.h"
 #include "aom_ports/aom_timer.h"
@@ -92,7 +92,6 @@ class AV1DISTWTDCOMPAVGTest
  public:
   ~AV1DISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -117,8 +116,8 @@ class AV1DISTWTDCOMPAVGTest
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -160,8 +159,8 @@ class AV1DISTWTDCOMPAVGTest
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
@@ -196,7 +195,6 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest
  public:
   ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
@@ -226,10 +224,9 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              dist_wtd_comp_params.fwd_offset =
-                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
               dist_wtd_comp_params.bck_offset =
-                  quant_dist_lookup_table[ii][jj][1];
+                  quant_dist_lookup_table[jj][1 - ii];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -282,8 +279,8 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
@@ -326,8 +323,6 @@ class AV1HighBDDISTWTDCOMPAVGTest
   ~AV1HighBDDISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-  void TearDown() { libaom_test::ClearSystemState(); }
-
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
@@ -351,8 +346,8 @@ class AV1HighBDDISTWTDCOMPAVGTest
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -398,8 +393,8 @@ class AV1HighBDDISTWTDCOMPAVGTest
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
@@ -436,7 +431,6 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
  public:
   ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
@@ -466,10 +460,9 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              dist_wtd_comp_params.fwd_offset =
-                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
               dist_wtd_comp_params.bck_offset =
-                  quant_dist_lookup_table[ii][jj][1];
+                  quant_dist_lookup_table[jj][1 - ii];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -524,8 +517,8 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
     const int num_loops = 1000000000 / (in_w + in_h);
diff --git a/media/libaom/src/test/comp_mask_variance_test.cc b/media/libaom/src/test/comp_mask_variance_test.cc
index b666306a30..4c2cba435d 100644
--- a/media/libaom/src/test/comp_mask_variance_test.cc
+++ b/media/libaom/src/test/comp_mask_variance_test.cc
@@ -23,8 +23,8 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -35,10 +35,12 @@ typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask);
 
-#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AV2
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
 const BLOCK_SIZE kValidBlockSize[] = {
-  BLOCK_8X8,   BLOCK_8X16, BLOCK_8X32,  BLOCK_16X8,  BLOCK_16X16,
-  BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+  BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,   BLOCK_16X8,   BLOCK_16X16,
+  BLOCK_16X32, BLOCK_32X8,  BLOCK_32X16,  BLOCK_32X32,  BLOCK_32X64,
+  BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
+  BLOCK_16X64, BLOCK_64X16
 };
 #endif
 typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
@@ -75,16 +77,21 @@ class AV1CompMaskVarianceTest
   uint8_t *ref_buffer_;
   uint8_t *ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompMaskVarianceTest);
 
-AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() {}
 
 void AV1CompMaskVarianceTest::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   av1_init_wedge_masks();
   comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ASSERT_NE(comp_pred1_, nullptr);
   comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ASSERT_NE(comp_pred2_, nullptr);
   pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ASSERT_NE(pred_, nullptr);
   ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+  ASSERT_NE(ref_buffer_, nullptr);
   ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
   for (int i = 0; i < MAX_SB_SQUARE; ++i) {
     pred_[i] = rnd_.Rand8();
@@ -99,7 +106,6 @@ void AV1CompMaskVarianceTest::TearDown() {
   aom_free(comp_pred2_);
   aom_free(pred_);
   aom_free(ref_buffer_);
-  libaom_test::ClearSystemState();
 }
 
 void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
@@ -182,7 +188,7 @@ class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
                     int havSub);
 };
 
-AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; }
+AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() {}
 
 void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
                                                BLOCK_SIZE bsize, int inv) {
@@ -318,8 +324,9 @@ class AV1HighbdCompMaskVarianceTest
   uint16_t *ref_buffer_;
   uint16_t *ref_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompMaskVarianceTest);
 
-AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; }
+AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() {}
 
 void AV1HighbdCompMaskVarianceTest::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -327,11 +334,15 @@ void AV1HighbdCompMaskVarianceTest::SetUp() {
 
   comp_pred1_ =
       (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+  ASSERT_NE(comp_pred1_, nullptr);
   comp_pred2_ =
       (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+  ASSERT_NE(comp_pred2_, nullptr);
   pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+  ASSERT_NE(pred_, nullptr);
   ref_buffer_ = (uint16_t *)aom_memalign(
       16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_));
+  ASSERT_NE(ref_buffer_, nullptr);
   ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
 }
 
@@ -340,7 +351,6 @@ void AV1HighbdCompMaskVarianceTest::TearDown() {
   aom_free(comp_pred2_);
   aom_free(pred_);
   aom_free(ref_buffer_);
-  libaom_test::ClearSystemState();
 }
 
 void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
@@ -451,7 +461,7 @@ class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest {
                     int havSub);
 };
 
-AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; }
+AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() {}
 
 void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput(
     highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
diff --git a/media/libaom/src/test/convolve_round_test.cc b/media/libaom/src/test/convolve_round_test.cc
index 4f17b54728..05807441c1 100644
--- a/media/libaom/src/test/convolve_round_test.cc
+++ b/media/libaom/src/test/convolve_round_test.cc
@@ -17,7 +17,6 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -71,10 +70,13 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
     const size_t block_size = 128 * 128;
     src_ = reinterpret_cast<int32_t *>(
         aom_memalign(16, block_size * sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
     dst_ref_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, block_size * sizeof(*dst_ref_)));
+    ASSERT_NE(dst_ref_, nullptr);
     dst_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, block_size * sizeof(*dst_)));
+    ASSERT_NE(dst_, nullptr);
   }
 
   virtual void TearDown() {
@@ -114,7 +116,7 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
       GenerateBufferWithRandom(src_, src_stride, bits, w, h);
 
       func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           func_(src_, src_stride, dst, dst_stride, w, h, bits));
 
       if (data_path_ == LOWBITDEPTH_TEST) {
diff --git a/media/libaom/src/test/convolve_test.cc b/media/libaom/src/test/convolve_test.cc
index 0b1eea16a5..d5e3750509 100644
--- a/media/libaom/src/test/convolve_test.cc
+++ b/media/libaom/src/test/convolve_test.cc
@@ -24,7 +24,6 @@
 #include "aom_ports/mem.h"
 #include "av1/common/filter.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -39,10 +38,9 @@ typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              int w, int h);
 
 struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
-      : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
+  ConvolveFunctions(ConvolveFunc h8, ConvolveFunc v8, int bd)
+      : h8_(h8), v8_(v8), use_highbd_(bd) {}
 
-  ConvolveFunc copy_;
   ConvolveFunc h8_;
   ConvolveFunc v8_;
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
@@ -269,31 +267,39 @@ void highbd_filter_average_block2d_8_c(
 
 class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  public:
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t *>(
                  aom_memalign(kDataAlignment, kInputBufferSize + 1)) +
              1;
+    ASSERT_NE(input_, nullptr);
     ref8_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kOutputStride * kMaxDimension));
+    ASSERT_NE(ref8_, nullptr);
     output_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kOutputBufferSize));
+    ASSERT_NE(output_, nullptr);
     output_ref_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kOutputBufferSize));
+    ASSERT_NE(output_ref_, nullptr);
     input16_ = reinterpret_cast<uint16_t *>(aom_memalign(
                    kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
                1;
+    ASSERT_NE(input16_, nullptr);
     ref16_ = reinterpret_cast<uint16_t *>(aom_memalign(
         kDataAlignment, kOutputStride * kMaxDimension * sizeof(uint16_t)));
+    ASSERT_NE(ref16_, nullptr);
     output16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    ASSERT_NE(output16_, nullptr);
     output16_ref_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    ASSERT_NE(output16_ref_, nullptr);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     aom_free(input_ - 1);
     input_ = NULL;
     aom_free(ref8_);
@@ -479,22 +485,6 @@ uint16_t *ConvolveTest::output16_ref_ = NULL;
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
 
-TEST_P(ConvolveTest, Copy) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-
-  ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride,
-                                       NULL, 0, NULL, 0, Width(), Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
 const int kNumFilterBanks = SWITCHABLE_FILTERS;
 const int kNumFilters = 16;
 
@@ -553,17 +543,15 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
           if (filter_x && filter_y)
             continue;
           else if (filter_y)
-            ASM_REGISTER_STATE_CHECK(
+            API_REGISTER_STATE_CHECK(
                 UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
                           16, filters[filter_y], 16, Width(), Height()));
           else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+            API_REGISTER_STATE_CHECK(UUT_->h8_(
                 in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                 kInvalidFilter, 16, Width(), Height()));
           else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_(
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                kInvalidFilter, 0, Width(), Height()));
+            continue;
 
           CheckGuardBlocks();
 
@@ -637,17 +625,15 @@ TEST_P(ConvolveTest, FilterExtremes) {
               if (filter_x && filter_y)
                 continue;
               else if (filter_y)
-                ASM_REGISTER_STATE_CHECK(UUT_->v8_(
+                API_REGISTER_STATE_CHECK(UUT_->v8_(
                     in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
                     filters[filter_y], 16, Width(), Height()));
               else if (filter_x)
-                ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                API_REGISTER_STATE_CHECK(UUT_->h8_(
                     in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                     kInvalidFilter, 16, Width(), Height()));
               else
-                ASM_REGISTER_STATE_CHECK(UUT_->copy_(
-                    in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                    kInvalidFilter, 0, Width(), Height()));
+                continue;
 
               for (int y = 0; y < Height(); ++y)
                 for (int x = 0; x < Width(); ++x)
@@ -664,26 +650,6 @@ TEST_P(ConvolveTest, FilterExtremes) {
   }
 }
 
-TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
-  const uint8_t *const in = input();
-  uint8_t *const out = output();
-  const int kNumTests = 5000000;
-  const int width = Width();
-  const int height = Height();
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&timer);
-  for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
-                height);
-  }
-  aom_usec_timer_mark(&timer);
-
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
-         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
-}
-
 TEST_P(ConvolveTest, DISABLED_Speed) {
   uint8_t *const in = input();
   uint8_t *const out = output();
@@ -729,11 +695,11 @@ TEST_P(ConvolveTest, DISABLED_Speed) {
         for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
           if (filter_x && filter_y) continue;
           if (filter_y)
-            ASM_REGISTER_STATE_CHECK(
+            API_REGISTER_STATE_CHECK(
                 UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
                           16, filters[filter_y], 16, Width(), Height()));
           else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+            API_REGISTER_STATE_CHECK(UUT_->h8_(
                 in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                 kInvalidFilter, 16, Width(), Height()));
         }
@@ -762,9 +728,6 @@ using std::make_tuple;
                       filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
   }
 #if HAVE_SSE2 && ARCH_X86_64
-WRAP(convolve_copy_sse2, 8)
-WRAP(convolve_copy_sse2, 10)
-WRAP(convolve_copy_sse2, 12)
 WRAP(convolve8_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
 WRAP(convolve8_horiz_sse2, 10)
@@ -773,26 +736,20 @@ WRAP(convolve8_horiz_sse2, 12)
 WRAP(convolve8_vert_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
-WRAP(convolve_copy_c, 8)
 WRAP(convolve8_horiz_c, 8)
 WRAP(convolve8_vert_c, 8)
-WRAP(convolve_copy_c, 10)
 WRAP(convolve8_horiz_c, 10)
 WRAP(convolve8_vert_c, 10)
-WRAP(convolve_copy_c, 12)
 WRAP(convolve8_horiz_c, 12)
 WRAP(convolve8_vert_c, 12)
 
 #if HAVE_AVX2
-WRAP(convolve_copy_avx2, 8)
 WRAP(convolve8_horiz_avx2, 8)
 WRAP(convolve8_vert_avx2, 8)
 
-WRAP(convolve_copy_avx2, 10)
 WRAP(convolve8_horiz_avx2, 10)
 WRAP(convolve8_vert_avx2, 10)
 
-WRAP(convolve_copy_avx2, 12)
 WRAP(convolve8_horiz_avx2, 12)
 WRAP(convolve8_vert_avx2, 12)
 #endif  // HAVE_AVX2
@@ -801,21 +758,18 @@ WRAP(convolve8_vert_avx2, 12)
 #undef WRAP
 
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_c(wrap_convolve_copy_c_8,
-                                         wrap_convolve8_horiz_c_8,
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve8_horiz_c_8,
                                          wrap_convolve8_vert_c_8, 8);
-const ConvolveFunctions wrap_convolve10_c(wrap_convolve_copy_c_10,
-                                          wrap_convolve8_horiz_c_10,
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve8_horiz_c_10,
                                           wrap_convolve8_vert_c_10, 10);
-const ConvolveFunctions wrap_convolve12_c(wrap_convolve_copy_c_12,
-                                          wrap_convolve8_horiz_c_12,
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve8_horiz_c_12,
                                           wrap_convolve8_vert_c_12, 12);
 const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
                                            ALL_SIZES(wrap_convolve10_c),
                                            ALL_SIZES(wrap_convolve12_c) };
 #else
-const ConvolveFunctions convolve8_c(aom_convolve_copy_c, aom_convolve8_horiz_c,
-                                    aom_convolve8_vert_c, 0);
+const ConvolveFunctions convolve8_c(aom_convolve8_horiz_c, aom_convolve8_vert_c,
+                                    0);
 const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
 #endif
 
@@ -824,21 +778,17 @@ INSTANTIATE_TEST_SUITE_P(C, ConvolveTest,
 
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve_copy_sse2_8,
-                                            wrap_convolve8_horiz_sse2_8,
+const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve8_horiz_sse2_8,
                                             wrap_convolve8_vert_sse2_8, 8);
-const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve_copy_sse2_10,
-                                             wrap_convolve8_horiz_sse2_10,
+const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve8_horiz_sse2_10,
                                              wrap_convolve8_vert_sse2_10, 10);
-const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve_copy_sse2_12,
-                                             wrap_convolve8_horiz_sse2_12,
+const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve8_horiz_sse2_12,
                                              wrap_convolve8_vert_sse2_12, 12);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(wrap_convolve8_sse2),
                                               ALL_SIZES(wrap_convolve10_sse2),
                                               ALL_SIZES(wrap_convolve12_sse2) };
 #else
-const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_sse2,
+const ConvolveFunctions convolve8_sse2(aom_convolve8_horiz_sse2,
                                        aom_convolve8_vert_sse2, 0);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
 #endif
@@ -847,8 +797,7 @@ INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest,
 #endif
 
 #if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
-                                        aom_convolve8_horiz_ssse3,
+const ConvolveFunctions convolve8_ssse3(aom_convolve8_horiz_ssse3,
                                         aom_convolve8_vert_ssse3, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
@@ -858,22 +807,18 @@ INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
 
 #if HAVE_AVX2
 #if CONFIG_AV1_HIGHBITDEPTH
-const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
-                                            wrap_convolve8_horiz_avx2_8,
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve8_horiz_avx2_8,
                                             wrap_convolve8_vert_avx2_8, 8);
-const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
-                                             wrap_convolve8_horiz_avx2_10,
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve8_horiz_avx2_10,
                                              wrap_convolve8_vert_avx2_10, 10);
-const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
-                                             wrap_convolve8_horiz_avx2_12,
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve8_horiz_avx2_12,
                                              wrap_convolve8_vert_avx2_12, 12);
 const ConvolveParam kArray_Convolve8_avx2[] = {
   ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
   ALL_SIZES_64(wrap_convolve12_avx2)
 };
 #else
-const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
-                                       aom_convolve8_horiz_avx2,
+const ConvolveFunctions convolve8_avx2(aom_convolve8_horiz_avx2,
                                        aom_convolve8_vert_avx2, 0);
 const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
 #endif
diff --git a/media/libaom/src/test/corner_match_test.cc b/media/libaom/src/test/corner_match_test.cc
index c685dca80c..e59cc27660 100644
--- a/media/libaom/src/test/corner_match_test.cc
+++ b/media/libaom/src/test/corner_match_test.cc
@@ -8,6 +8,8 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <memory>
+#include <new>
 #include <tuple>
 
 #include "config/av1_rtcd.h"
@@ -15,7 +17,6 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
 #include "av1/encoder/corner_match.h"
@@ -47,13 +48,14 @@ class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
 
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CornerMatchTest);
 
 AV1CornerMatchTest::~AV1CornerMatchTest() {}
 void AV1CornerMatchTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
   target_func = GET_PARAM(1);
 }
-void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1CornerMatchTest::TearDown() {}
 
 void AV1CornerMatchTest::RunCheckOutput(int run_times) {
   const int w = 128, h = 128;
@@ -61,8 +63,10 @@ void AV1CornerMatchTest::RunCheckOutput(int run_times) {
   int i, j;
   aom_usec_timer ref_timer, test_timer;
 
-  uint8_t *input1 = new uint8_t[w * h];
-  uint8_t *input2 = new uint8_t[w * h];
+  std::unique_ptr<uint8_t[]> input1(new (std::nothrow) uint8_t[w * h]);
+  std::unique_ptr<uint8_t[]> input2(new (std::nothrow) uint8_t[w * h]);
+  ASSERT_NE(input1, nullptr);
+  ASSERT_NE(input2, nullptr);
 
   // Test the two extreme cases:
   // i) Random data, should have correlation close to 0
@@ -89,14 +93,16 @@ void AV1CornerMatchTest::RunCheckOutput(int run_times) {
     int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
     int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
 
-    double res_c =
-        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
-    double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
+    double res_c = av1_compute_cross_correlation_c(input1.get(), w, x1, y1,
+                                                   input2.get(), w, x2, y2);
+    double res_simd =
+        target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (j = 0; j < run_times; j++) {
-        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+        av1_compute_cross_correlation_c(input1.get(), w, x1, y1, input2.get(),
+                                        w, x2, y2);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -104,7 +110,7 @@ void AV1CornerMatchTest::RunCheckOutput(int run_times) {
 
       aom_usec_timer_start(&test_timer);
       for (j = 0; j < run_times; j++) {
-        target_func(input1, w, x1, y1, input2, w, x2, y2);
+        target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =
@@ -119,8 +125,6 @@ void AV1CornerMatchTest::RunCheckOutput(int run_times) {
       ASSERT_EQ(res_simd, res_c);
     }
   }
-  delete[] input1;
-  delete[] input2;
 }
 
 TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
diff --git a/media/libaom/src/test/cpu_speed_test.cc b/media/libaom/src/test/cpu_speed_test.cc
index 2a164974b0..5396becf4a 100644
--- a/media/libaom/src/test/cpu_speed_test.cc
+++ b/media/libaom/src/test/cpu_speed_test.cc
@@ -31,14 +31,9 @@ class CpuSpeedTest
   virtual ~CpuSpeedTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 25;
-      cfg_.rc_end_usage = AOM_VBR;
-    } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
     }
   }
 
@@ -169,12 +164,12 @@ TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
 TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
 TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
 
-AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(1, 3));
-AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Range(0, 1));
 }  // namespace
diff --git a/media/libaom/src/test/cpu_used_firstpass_test.cc b/media/libaom/src/test/cpu_used_firstpass_test.cc
new file mode 100644
index 0000000000..c53db6eab0
--- /dev/null
+++ b/media/libaom/src/test/cpu_used_firstpass_test.cc
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const double kPsnrDiffThreshold = 0.1;
+
+// Params: first pass cpu used, second pass cpu used
+class CpuUsedFirstpassTest
+    : public ::libaom_test::CodecTestWith2Params<int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  CpuUsedFirstpassTest()
+      : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(2)) {}
+  virtual ~CpuUsedFirstpassTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.g_lag_in_frames = 19;
+    cfg_.g_threads = 0;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void BeginPassHook(unsigned int pass) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+
+    if (pass == 0)
+      cpu_used_ = first_pass_cpu_used_;
+    else
+      cpu_used_ = second_pass_cpu_used_;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; }
+
+  void DoTest() {
+    libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+                                       cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                       0, 30);
+    double ref_psnr;
+    double psnr_diff;
+
+    first_pass_cpu_used_ = second_pass_cpu_used_;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));  // same preset case ref_psnr
+    ref_psnr = GetAveragePsnr();
+
+    first_pass_cpu_used_ = GET_PARAM(1);
+    if (first_pass_cpu_used_ == second_pass_cpu_used_) return;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    psnr_diff = abs(ref_psnr - GetAveragePsnr());
+    EXPECT_LT(psnr_diff, GetPsnrDiffThreshold())
+        << "first pass cpu used = " << first_pass_cpu_used_
+        << ", second pass cpu used = " << second_pass_cpu_used_;
+  }
+
+  int cpu_used_;
+  int first_pass_cpu_used_;
+  int second_pass_cpu_used_;
+  unsigned int nframes_;
+  double psnr_;
+};
+
+TEST_P(CpuUsedFirstpassTest, FirstPassTest) { DoTest(); }
+
+class CpuUsedFirstpassTestLarge : public CpuUsedFirstpassTest {};
+
+TEST_P(CpuUsedFirstpassTestLarge, FirstPassTest) { DoTest(); }
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+static const int kSecondPassCpuUsedLarge[] = { 2, 4 };
+static const int kSecondPassCpuUsed[] = { 6 };
+#else
+static const int kSecondPassCpuUsedLarge[] = { 2 };
+static const int kSecondPassCpuUsed[] = { 4, 6 };
+#endif
+#else
+static const int kSecondPassCpuUsedLarge[] = { 2 };
+static const int kSecondPassCpuUsed[] = { 4, 6 };
+#endif
+
+AV1_INSTANTIATE_TEST_SUITE(
+    CpuUsedFirstpassTestLarge, ::testing::Values(2, 4, 6),
+    ::testing::ValuesIn(kSecondPassCpuUsedLarge));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(
+    CpuUsedFirstpassTest, ::testing::Values(2, 4, 6),
+    ::testing::ValuesIn(kSecondPassCpuUsed));  // cpu_used
+
+}  // namespace
diff --git a/media/libaom/src/test/datarate_test.cc b/media/libaom/src/test/datarate_test.cc
index 053c055716..ee4f8c023e 100644
--- a/media/libaom/src/test/datarate_test.cc
+++ b/media/libaom/src/test/datarate_test.cc
@@ -38,8 +38,7 @@ class DatarateTestLarge
   virtual ~DatarateTestLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
 
@@ -58,7 +57,9 @@ class DatarateTestLarge
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+    // FIXME(jingning): Lower this test threshold after vbr mode can render
+    // sufficiently accurate bit rate.
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.45)
         << " The datarate for the file is greater than target by too much!";
   }
 
@@ -80,6 +81,56 @@ class DatarateTestLarge
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
         << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.16)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargetingMultiThreadCBRTest() {
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_threads = 4;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    tile_column_ = 2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " " << effective_datarate_;
+  }
+
+  virtual void ErrorResilienceOnSceneCuts() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    cfg_.rc_target_bitrate = 500;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
     ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
         << " The datarate for the file is greater than target by too much!";
   }
@@ -108,6 +159,31 @@ class DatarateTestLarge
         << " The datarate for the file is greater than target by too much!";
   }
 
+  virtual void CBRPeriodicKeyFrameOnSceneCuts() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // Periodic keyframe
+    cfg_.kf_max_dist = 30;
+    cfg_.kf_min_dist = 30;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    cfg_.rc_target_bitrate = 500;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
   virtual void BasicRateTargetingAQModeOnOffCBRTest() {
     if (GET_PARAM(4) > 0) return;
     cfg_.rc_buf_initial_sz = 500;
@@ -125,8 +201,7 @@ class DatarateTestLarge
 
     ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
                                          30, 1, 0, 310);
-    const int bitrate_array[1] = { 60 };
-    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    cfg_.rc_target_bitrate = 60;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
@@ -178,8 +253,7 @@ class DatarateTestFrameDropLarge
   virtual ~DatarateTestFrameDropLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
 
@@ -240,11 +314,26 @@ TEST_P(DatarateTestLarge, BasicRateTargetingCBR) {
   BasicRateTargetingCBRTest();
 }
 
+// Check basic rate targeting for CBR, with 4 threads
+TEST_P(DatarateTestLarge, BasicRateTargetingMultiThreadCBR) {
+  BasicRateTargetingMultiThreadCBRTest();
+}
+
 // Check basic rate targeting for periodic key frame.
 TEST_P(DatarateTestLarge, PeriodicKeyFrameCBR) {
   BasicRateTargetingCBRPeriodicKeyFrameTest();
 }
 
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBROnSceneCuts) {
+  CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestLarge, ErrorResilienceOnSceneCuts) {
+  ErrorResilienceOnSceneCuts();
+}
+
 // Check basic rate targeting for CBR.
 TEST_P(DatarateTestLarge, BasicRateTargeting444CBR) {
   BasicRateTargeting444CBRTest();
@@ -281,8 +370,7 @@ class DatarateTestSpeedChangeRealtime
   virtual ~DatarateTestSpeedChangeRealtime() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
 
@@ -310,7 +398,7 @@ class DatarateTestSpeedChangeRealtime
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.20)
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.21)
         << " The datarate for the file is greater than target by too much!";
   }
 };
@@ -325,11 +413,26 @@ TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
   BasicRateTargetingCBRTest();
 }
 
+// Check basic rate targeting for CBR, with 4 threads
+TEST_P(DatarateTestRealtime, BasicRateTargetingMultiThreadCBR) {
+  BasicRateTargetingMultiThreadCBRTest();
+}
+
 // Check basic rate targeting for periodic key frame.
 TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBR) {
   BasicRateTargetingCBRPeriodicKeyFrameTest();
 }
 
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBROnSceneCuts) {
+  CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestRealtime, ErrorResilienceOnSceneCuts) {
+  ErrorResilienceOnSceneCuts();
+}
+
 // Check basic rate targeting for CBR.
 TEST_P(DatarateTestRealtime, BasicRateTargeting444CBR) {
   BasicRateTargeting444CBRTest();
@@ -347,27 +450,27 @@ TEST_P(DatarateTestSpeedChangeRealtime, ChangingSpeedTest) {
   ChangingSpeedTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 7), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 7), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropLarge,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 7), ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(5, 7), ::testing::Values(0, 3));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 11), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9), ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 11), ::testing::Values(0, 3));
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestSpeedChangeRealtime,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Values(0, 3));
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSpeedChangeRealtime,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(0, 3));
 
 }  // namespace
 }  // namespace datarate_test
diff --git a/media/libaom/src/test/datarate_test.h b/media/libaom/src/test/datarate_test.h
index 3c15731195..1b0d515efa 100644
--- a/media/libaom/src/test/datarate_test.h
+++ b/media/libaom/src/test/datarate_test.h
@@ -30,11 +30,6 @@ class DatarateTest : public ::libaom_test::EncoderTest {
  protected:
   virtual ~DatarateTest() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    ResetModel();
-  }
-
   virtual void ResetModel() {
     last_pts_ = 0;
     bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
@@ -47,6 +42,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
     bits_total_ = 0;
     denoiser_offon_test_ = 0;
     denoiser_offon_period_ = -1;
+    tile_column_ = 0;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -54,14 +50,20 @@ class DatarateTest : public ::libaom_test::EncoderTest {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
-      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_column_);
+      encoder->Control(AV1E_SET_ROW_MT, 1);
       if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
         encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
         encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
         encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
         encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
       }
     }
 
@@ -151,6 +153,7 @@ class DatarateTest : public ::libaom_test::EncoderTest {
   int denoiser_offon_period_;
   unsigned int aq_mode_;
   bool speed_change_test_;
+  int tile_column_;
 };
 
 }  // namespace
diff --git a/media/libaom/src/test/decode_api_test.cc b/media/libaom/src/test/decode_api_test.cc
index 910640df75..f3cf7cc945 100644
--- a/media/libaom/src/test/decode_api_test.cc
+++ b/media/libaom/src/test/decode_api_test.cc
@@ -13,18 +13,12 @@
 
 #include "config/aom_config.h"
 
-#include "test/util.h"
 #include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
 
 namespace {
 
 TEST(DecodeAPI, InvalidParams) {
-  static const aom_codec_iface_t *kCodecs[] = {
-#if CONFIG_AV1_DECODER
-    aom_codec_av1_dx(),
-#endif
-  };
   uint8_t buf[1] = { 0 };
   aom_codec_ctx_t dec;
 
@@ -38,18 +32,26 @@ TEST(DecodeAPI, InvalidParams) {
             aom_codec_decode(NULL, NULL, sizeof(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
+  EXPECT_TRUE(aom_codec_error_detail(NULL) == NULL);
 
-  for (const aom_codec_iface_t *iface : kCodecs) {
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_dec_init(NULL, iface, NULL, 0));
+  aom_codec_iface_t *iface = aom_codec_av1_dx();
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(NULL, iface, NULL, 0));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_decode(&dec, NULL, sizeof(buf), NULL));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_decode(&dec, NULL, sizeof(buf), NULL));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL));
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
+}
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
-  }
+TEST(DecodeAPI, InvalidControlId) {
+  aom_codec_iface_t *iface = aom_codec_av1_dx();
+  aom_codec_ctx_t dec;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&dec, -1, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&dec, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
 }
 
 }  // namespace
diff --git a/media/libaom/src/test/decode_multithreaded_test.cc b/media/libaom/src/test/decode_multithreaded_test.cc
index 92253ede81..5a13f75d09 100644
--- a/media/libaom/src/test/decode_multithreaded_test.cc
+++ b/media/libaom/src/test/decode_multithreaded_test.cc
@@ -69,10 +69,7 @@ class AV1DecodeMultiThreadedTest
       delete multi_thread_dec_[i];
   }
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(libaom_test::kTwoPassGood);
-  }
+  virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
                                   libaom_test::Encoder *encoder) {
@@ -111,7 +108,7 @@ class AV1DecodeMultiThreadedTest
     cfg_.rc_end_usage = AOM_VBR;
 
     libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
-                                       timebase.den, timebase.num, 0, 5);
+                                       timebase.den, timebase.num, 0, 2);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
     const char *md5_single_thread_str = md5_single_thread_.Get();
@@ -157,14 +154,14 @@ TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) {
 }
 
 // TODO(ranjit): More tests have to be added using pre-generated MD5.
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
-                          ::testing::Values(1, 2), ::testing::Values(1),
-                          ::testing::Values(3), ::testing::Values(0, 1));
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge,
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(1, 4), ::testing::Values(0),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
+                           ::testing::Values(1, 2), ::testing::Values(1),
+                           ::testing::Values(3), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTestLarge,
+                           ::testing::Values(0, 1, 2, 6),
+                           ::testing::Values(0, 1, 2, 6),
+                           ::testing::Values(1, 4), ::testing::Values(0),
+                           ::testing::Values(0, 1));
 
 class AV1DecodeMultiThreadedLSTestLarge
     : public AV1DecodeMultiThreadedTestLarge {};
@@ -177,9 +174,9 @@ TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) {
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge,
-                          ::testing::Values(6), ::testing::Values(6),
-                          ::testing::Values(1), ::testing::Values(0, 3),
-                          ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedLSTestLarge,
+                           ::testing::Values(6), ::testing::Values(6),
+                           ::testing::Values(1), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
 }  // namespace
diff --git a/media/libaom/src/test/decode_perf_test.cc b/media/libaom/src/test/decode_perf_test.cc
index 691337cd6f..7c52cf26bf 100644
--- a/media/libaom/src/test/decode_perf_test.cc
+++ b/media/libaom/src/test/decode_perf_test.cc
@@ -135,7 +135,7 @@ class AV1NewEncodeDecodePerfTest
     const std::string data_path(env ? env : ".");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
     outfile_ = fopen(path_to_source.c_str(), "wb");
-    ASSERT_TRUE(outfile_ != NULL);
+    ASSERT_NE(outfile_, nullptr);
   }
 
   virtual void EndPassHook() {
@@ -242,6 +242,6 @@ TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) {
   printf("}\n");
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1NewEncodeDecodePerfTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+AV1_INSTANTIATE_TEST_SUITE(AV1NewEncodeDecodePerfTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood));
 }  // namespace
diff --git a/media/libaom/src/test/decode_scalability_test.cc b/media/libaom/src/test/decode_scalability_test.cc
new file mode 100644
index 0000000000..c04d58b09e
--- /dev/null
+++ b/media/libaom/src/test/decode_scalability_test.cc
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+struct ObuExtensionHeader {
+  int temporal_id;
+  int spatial_id;
+};
+
+struct DecodeParam {
+  const char *filename;
+  const ObuExtensionHeader *headers;
+  size_t num_headers;
+};
+
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  return os << "file: " << dp.filename;
+}
+
+class DecodeScalabilityTest
+    : public ::libaom_test::DecoderTest,
+      public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  DecodeScalabilityTest()
+      : DecoderTest(GET_PARAM(0)), headers_(GET_PARAM(1).headers),
+        num_headers_(GET_PARAM(1).num_headers) {}
+
+  ~DecodeScalabilityTest() override {}
+
+  void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
+    if (video.frame_number() == 0)
+      decoder->Control(AV1D_SET_OUTPUT_ALL_LAYERS, 1);
+  }
+
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int /*frame_number*/) override {
+    const ObuExtensionHeader &header = headers_[header_index_];
+    EXPECT_EQ(img.temporal_id, header.temporal_id);
+    EXPECT_EQ(img.spatial_id, header.spatial_id);
+    header_index_ = (header_index_ + 1) % num_headers_;
+  }
+
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    aom_codec_dec_cfg_t cfg = { 1, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+    libaom_test::IVFVideoSource decode_video(input.filename);
+    decode_video.Init();
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+  }
+
+ private:
+  const ObuExtensionHeader *const headers_;
+  const size_t num_headers_;
+  size_t header_index_ = 0;
+};
+
+TEST_P(DecodeScalabilityTest, ObuExtensionHeader) { RunTest(); }
+
+// For all test files, we have:
+//   operatingPoint = 0
+//   OperatingPointIdc = operating_point_idc[ 0 ]
+
+// av1-1-b8-01-size-16x16.ivf:
+//   operating_points_cnt_minus_1 = 0
+//   operating_point_idc[ 0 ] = 0x0
+const ObuExtensionHeader kSize16x16Headers[1] = { { 0, 0 } };
+
+// av1-1-b8-22-svc-L1T2.ivf:
+//   operating_points_cnt_minus_1 = 1
+//   operating_point_idc[ 0 ] = 0x103
+//   operating_point_idc[ 1 ] = 0x101
+const ObuExtensionHeader kL1T2Headers[2] = { { 0, 0 }, { 1, 0 } };
+
+// av1-1-b8-22-svc-L2T1.ivf:
+//   operating_points_cnt_minus_1 = 1
+//   operating_point_idc[ 0 ] = 0x301
+//   operating_point_idc[ 1 ] = 0x101
+const ObuExtensionHeader kL2T1Headers[2] = { { 0, 0 }, { 0, 1 } };
+
+// av1-1-b8-22-svc-L2T2.ivf:
+//   operating_points_cnt_minus_1 = 3
+//   operating_point_idc[ 0 ] = 0x303
+//   operating_point_idc[ 1 ] = 0x301
+//   operating_point_idc[ 2 ] = 0x103
+//   operating_point_idc[ 3 ] = 0x101
+const ObuExtensionHeader kL2T2Headers[4] = {
+  { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 1 }
+};
+
+const DecodeParam kAV1DecodeScalabilityTests[] = {
+  // { filename, headers, num_headers }
+  { "av1-1-b8-01-size-16x16.ivf", kSize16x16Headers, 1 },
+  { "av1-1-b8-22-svc-L1T2.ivf", kL1T2Headers, 2 },
+  { "av1-1-b8-22-svc-L2T1.ivf", kL2T1Headers, 2 },
+  { "av1-1-b8-22-svc-L2T2.ivf", kL2T2Headers, 4 },
+};
+
+AV1_INSTANTIATE_TEST_SUITE(DecodeScalabilityTest,
+                           ::testing::ValuesIn(kAV1DecodeScalabilityTests));
+
+}  // namespace
diff --git a/media/libaom/src/test/decode_test_driver.cc b/media/libaom/src/test/decode_test_driver.cc
index 70de0cff69..246fc82098 100644
--- a/media/libaom/src/test/decode_test_driver.cc
+++ b/media/libaom/src/test/decode_test_driver.cc
@@ -56,7 +56,7 @@ void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/,
 void DecoderTest::RunLoop(CompressedVideoSource *video,
                           const aom_codec_dec_cfg_t &dec_cfg) {
   Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_);
-  ASSERT_TRUE(decoder != NULL);
+  ASSERT_NE(decoder, nullptr);
   bool end_of_file = false;
   bool peeked_stream = false;
 
diff --git a/media/libaom/src/test/decode_to_md5.sh b/media/libaom/src/test/decode_to_md5.sh
index 2edd1cb52b..214755f216 100644..100755
--- a/media/libaom/src/test/decode_to_md5.sh
+++ b/media/libaom/src/test/decode_to_md5.sh
@@ -39,7 +39,7 @@ decode_to_md5() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 
@@ -65,7 +65,7 @@ DISABLED_decode_to_md5_av1() {
   if [ "$(av1_decode_available)" = "yes" ]; then
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     decode_to_md5 "${file}" "av1" "${expected_md5}"
   fi
diff --git a/media/libaom/src/test/decode_with_drops.sh b/media/libaom/src/test/decode_with_drops.sh
index 155ee92077..1fc13ced35 100644..100755
--- a/media/libaom/src/test/decode_with_drops.sh
+++ b/media/libaom/src/test/decode_with_drops.sh
@@ -39,7 +39,7 @@ decode_with_drops() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      "${drop_mode}" ${devnull}
+      "${drop_mode}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
@@ -52,13 +52,13 @@ DISABLED_decode_with_drops_av1() {
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
-      encode_yuv_raw_input_av1 "${file}" --ivf
+      encode_yuv_raw_input_av1 "${file}" --ivf || return 1
     fi
     # Drop frames 3 and 4.
-    decode_with_drops "${file}" "av1" "3-4"
+    decode_with_drops "${file}" "av1" "3-4" || return 1
 
     # Test pattern mode: Drop 3 of every 4 frames.
-    decode_with_drops "${file}" "av1" "3/4"
+    decode_with_drops "${file}" "av1" "3/4" || return 1
   fi
 }
 
diff --git a/media/libaom/src/test/divu_small_test.cc b/media/libaom/src/test/divu_small_test.cc
index f4d0846cf4..496fbc1f8e 100644
--- a/media/libaom/src/test/divu_small_test.cc
+++ b/media/libaom/src/test/divu_small_test.cc
@@ -14,7 +14,7 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 using libaom_test::ACMRandom;
 
diff --git a/media/libaom/src/test/dr_prediction_test.cc b/media/libaom/src/test/dr_prediction_test.cc
index e8865c02a3..cf2e90c6b9 100644
--- a/media/libaom/src/test/dr_prediction_test.cc
+++ b/media/libaom/src/test/dr_prediction_test.cc
@@ -22,7 +22,6 @@
 #include "av1/common/pred_common.h"
 #include "av1/common/reconintra.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -199,7 +198,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     if (params_.tst_fn) {
       aom_usec_timer_start(&timer);
       for (int k = 0; k < kNumTests; ++k) {
-        ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+        API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
                                                 above_, left_, upsample_above_,
                                                 upsample_left_, dx_, dy_, bd_));
       }
@@ -274,6 +273,25 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     }
   }
 
+  void RundrPredTest(const int speed) {
+    if (params_.tst_fn == NULL) return;
+    const int angles[] = { 3, 45, 87 };
+    const int start_angle = speed ? 0 : start_angle_;
+    const int stop_angle = speed ? 3 : stop_angle_;
+    for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+      for (int i = start_angle; i < stop_angle; ++i) {
+        const int angle = speed ? angles[i] + start_angle_ : i;
+        dx_ = av1_get_dx(angle);
+        dy_ = av1_get_dy(angle);
+        if (speed) {
+          printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+                 enable_upsample_, angle);
+        }
+        if (dx_ && dy_) RunTest(speed, false, angle);
+      }
+    }
+  }
+
   Pixel dst_ref_data_[kDstSize];
   Pixel dst_tst_data_[kDstSize];
 
@@ -364,6 +382,25 @@ INSTANTIATE_TEST_SUITE_P(
                                NULL, AOM_BITS_12, kZ3Start)));
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+TEST_P(LowbdDrPredTest, OperationCheck) { RundrPredTest(0); }
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) { RundrPredTest(1); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, LowbdDrPredTest,
+    ::testing::Values(
+        DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+                           &z1_wrapper<av1_dr_prediction_z1_sse4_1>, AOM_BITS_8,
+                           kZ1Start),
+        DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+                           &z2_wrapper<av1_dr_prediction_z2_sse4_1>, AOM_BITS_8,
+                           kZ2Start),
+        DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+                           &z3_wrapper<av1_dr_prediction_z3_sse4_1>, AOM_BITS_8,
+                           kZ3Start)));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, LowbdDrPredTest,
@@ -377,32 +414,6 @@ INSTANTIATE_TEST_SUITE_P(
                                          &z3_wrapper<av1_dr_prediction_z3_avx2>,
                                          AOM_BITS_8, kZ3Start)));
 
-TEST_P(LowbdDrPredTest, DISABLED_Speed) {
-  const int angles[] = { 3, 45, 87 };
-  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
-    for (int i = 0; i < 3; ++i) {
-      const int angle = angles[i] + start_angle_;
-      dx_ = av1_get_dx(angle);
-      dy_ = av1_get_dy(angle);
-      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
-             enable_upsample_, angle);
-      if (dx_ && dy_) RunTest(true, false, angle);
-    }
-  }
-}
-
-TEST_P(LowbdDrPredTest, OperationCheck) {
-  if (params_.tst_fn == NULL) return;
-  // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 };
-  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
-    for (int angle = start_angle_; angle < stop_angle_; ++angle) {
-      dx_ = av1_get_dx(angle);
-      dy_ = av1_get_dy(angle);
-      if (dx_ && dy_) RunTest(false, false, angle);
-    }
-  }
-}
-
 #if CONFIG_AV1_HIGHBITDEPTH
 INSTANTIATE_TEST_SUITE_P(
     AVX2, HighbdDrPredTest,
@@ -471,4 +482,19 @@ TEST_P(HighbdDrPredTest, OperationCheck) {
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, LowbdDrPredTest,
+    ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+                                         &z1_wrapper<av1_dr_prediction_z1_neon>,
+                                         AOM_BITS_8, kZ1Start),
+                      DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+                                         &z2_wrapper<av1_dr_prediction_z2_neon>,
+                                         AOM_BITS_8, kZ2Start),
+                      DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+                                         &z3_wrapper<av1_dr_prediction_z3_neon>,
+                                         AOM_BITS_8, kZ3Start)));
+
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/media/libaom/src/test/dump_obu.sh b/media/libaom/src/test/dump_obu.sh
index da44dd7e67..933db64a6a 100644..100755
--- a/media/libaom/src/test/dump_obu.sh
+++ b/media/libaom/src/test/dump_obu.sh
@@ -45,14 +45,21 @@ aomenc_available() {
 encode_test_file() {
   if [ "$(aomenc_available)" = "yes" ]; then
     local encoder="$(aom_tool_path aomenc)"
-
-    eval "${encoder}" \
-      $(aomenc_encode_test_fast_params) \
-      $(yuv_raw_input) \
-      --ivf \
-      --output=${dump_obu_test_file} \
-      ${devnull}
-
+    if [ "$(realtime_only_build)" = "yes" ]; then
+      eval "${encoder}" \
+        $(aomenc_encode_test_rt_params) \
+        $(yuv_raw_input) \
+        --ivf \
+        --output=${dump_obu_test_file} \
+        ${devnull} || return 1
+    else
+      eval "${encoder}" \
+        $(aomenc_encode_test_fast_params) \
+        $(yuv_raw_input) \
+        --ivf \
+        --output=${dump_obu_test_file} \
+        ${devnull} || return 1
+    fi
     if [ ! -e "${dump_obu_test_file}" ]; then
       elog "dump_obu test input encode failed."
       return 1
@@ -61,7 +68,7 @@ encode_test_file() {
 }
 
 dump_obu() {
-  encode_test_file
+  encode_test_file || return 1
   eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull}
 }
 
diff --git a/media/libaom/src/test/ec_test.cc b/media/libaom/src/test/ec_test.cc
index 853abcbc5a..c4b88e35dc 100644
--- a/media/libaom/src/test/ec_test.cc
+++ b/media/libaom/src/test/ec_test.cc
@@ -12,6 +12,8 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include <cstdlib>
+#include <memory>
+#include <new>
 
 #include "aom_dsp/entenc.h"
 #include "aom_dsp/entdec.h"
@@ -37,18 +39,18 @@ TEST(EC_TEST, random_ec_test) {
   od_ec_enc_init(&enc, 1);
   /*Test compatibility between multiple different encode/decode routines.*/
   for (i = 0; i < 409600; i++) {
-    unsigned *fz;
-    unsigned *fts;
-    unsigned *data;
-    unsigned *tell;
-    unsigned *enc_method;
     int j;
     sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U);
-    fz = (unsigned *)malloc(sz * sizeof(*fz));
-    fts = (unsigned *)malloc(sz * sizeof(*fts));
-    data = (unsigned *)malloc(sz * sizeof(*data));
-    tell = (unsigned *)malloc((sz + 1) * sizeof(*tell));
-    enc_method = (unsigned *)malloc(sz * sizeof(*enc_method));
+    std::unique_ptr<unsigned[]> fz(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(fz, nullptr);
+    std::unique_ptr<unsigned[]> fts(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(fts, nullptr);
+    std::unique_ptr<unsigned[]> data(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(data, nullptr);
+    std::unique_ptr<unsigned[]> tell(new (std::nothrow) unsigned[sz + 1]);
+    ASSERT_NE(tell, nullptr);
+    std::unique_ptr<unsigned[]> enc_method(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(enc_method, nullptr);
     od_ec_enc_reset(&enc);
     tell[0] = od_ec_enc_tell_frac(&enc);
     for (j = 0; j < sz; j++) {
@@ -124,11 +126,6 @@ TEST(EC_TEST, random_ec_test) {
           << " instead of " << tell[j + 1] << " (Random seed: " << seed
           << ").\n";
     }
-    free(enc_method);
-    free(tell);
-    free(data);
-    free(fts);
-    free(fz);
   }
   od_ec_enc_reset(&enc);
   if (CDF_SHIFT == 0) {
diff --git a/media/libaom/src/test/edge_detect_test.cc b/media/libaom/src/test/edge_detect_test.cc
deleted file mode 100644
index 33fbbc0bb0..0000000000
--- a/media/libaom/src/test/edge_detect_test.cc
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdbool.h>
-#include <memory>
-#include <tuple>
-#include "aom_mem/aom_mem.h"
-#include "av1/encoder/rdopt.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace {
-
-using std::get;
-using std::tuple;
-
-static int get_pix(uint8_t *buf, int i, bool high_bd) {
-  if (high_bd) {
-    return *CONVERT_TO_SHORTPTR(buf + i);
-  } else {
-    return buf[i];
-  }
-}
-
-/** Get the (i, j) value from the input; if i or j is outside of the width
- * or height, the nearest pixel value is returned.
- */
-static int get_nearest_pix(const int *buf, int w, int h, int i, int j) {
-  int offset = AOMMAX(AOMMIN(i, w - 1), 0) + w * AOMMAX(AOMMIN(j, h - 1), 0);
-  return buf[offset];
-}
-
-/** Given the image data, creates a new image with padded values, so an
- * 8-tap filter can be convolved. The padded value is the same as the closest
- * value in the image. Returns a pointer to the start of the image in the
- * padded data. Must be freed with free_pad_8tap. The output will be either
- * 8-bit or 16-bit, depending on the high bit-depth (high_bd) field.
- */
-static uint8_t *pad_8tap_convolve(const int *data, int w, int h, bool high_bd) {
-  // SIMD optimizations require the width to be a multiple of 8 and the height
-  // to be multiples of 4.
-  assert(w % 8 == 0);
-  assert(h % 4 == 0);
-  // For an 8-tap filter, we need to pad with 3 lines on top and on the left,
-  // and 4 lines on the right and bottom, for 7 extra lines.
-  const int pad_w = w + 7;
-  const int pad_h = h + 7;
-
-  uint8_t *dst;
-  if (high_bd) {
-    dst =
-        CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * pad_w * pad_h));
-  } else {
-    dst = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * pad_w * pad_h);
-  }
-  if (dst == nullptr) {
-    EXPECT_NE(dst, nullptr);
-    return nullptr;
-  }
-
-  for (int j = 0; j < pad_h; ++j) {
-    for (int i = 0; i < pad_w; ++i) {
-      const int v = get_nearest_pix(data, w, h, i - 3, j - 3);
-      if (high_bd) {
-        *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v;
-      } else {
-        dst[i + j * pad_w] = static_cast<uint8_t>(v);
-      }
-    }
-  }
-  return dst + (w + 7) * 3 + 3;
-}
-
-static int stride_8tap(int width) { return width + 7; }
-
-static void free_pad_8tap(uint8_t *padded, int width, bool high_bd) {
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(padded - (width + 7) * 3 - 3));
-  } else {
-    aom_free(padded - (width + 7) * 3 - 3);
-  }
-}
-
-struct Pad8TapConvolveDeleter {
-  Pad8TapConvolveDeleter(const int width, const bool high_bd)
-      : width(width), high_bd(high_bd) {}
-  void operator()(uint8_t *p) {
-    if (p != nullptr) {
-      free_pad_8tap(p, width, high_bd);
-    }
-  }
-  const int width;
-  const bool high_bd;
-};
-
-static uint8_t *malloc_bd(int num_entries, bool high_bd) {
-  const int bytes_per_entry = high_bd ? sizeof(uint16_t) : sizeof(uint8_t);
-
-  uint8_t *buf = (uint8_t *)aom_memalign(32, bytes_per_entry * num_entries);
-  if (high_bd) {
-    return CONVERT_TO_BYTEPTR(buf);
-  } else {
-    return buf;
-  }
-}
-
-static void free_bd(uint8_t *p, bool high_bd) {
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(p));
-  } else {
-    aom_free(p);
-  }
-}
-
-struct MallocBdDeleter {
-  explicit MallocBdDeleter(const bool high_bd) : high_bd(high_bd) {}
-  void operator()(uint8_t *p) { free_bd(p, high_bd); }
-  const bool high_bd;
-};
-
-class EdgeDetectBrightnessTest :
-    // Parameters are (brightness, width, height, high bit depth representation,
-    // bit depth).
-    public ::testing::TestWithParam<tuple<int, int, int, bool, int> > {
- protected:
-  void SetUp() override {
-    // Allocate a (width by height) array of luma values in orig_.
-    // padded_ will be filled by the pad() call, which adds a border around
-    // the orig_. The output_ array has enough space for the computation.
-    const int brightness = GET_PARAM(0);
-    const int width = GET_PARAM(1);
-    const int height = GET_PARAM(2);
-    const bool high_bd = GET_PARAM(3);
-
-    // Create the padded image of uniform brightness.
-    std::unique_ptr<int[]> orig(new int[width * height]);
-    ASSERT_NE(orig, nullptr);
-    for (int i = 0; i < width * height; ++i) {
-      orig[i] = brightness;
-    }
-    input_ = pad_8tap_convolve(orig.get(), width, height, high_bd);
-    ASSERT_NE(input_, nullptr);
-    output_ = malloc_bd(width * height, high_bd);
-    ASSERT_NE(output_, nullptr);
-  }
-
-  void TearDown() override {
-    const int width = GET_PARAM(1);
-    const bool high_bd = GET_PARAM(3);
-    free_pad_8tap(input_, width, high_bd);
-    free_bd(output_, high_bd);
-  }
-
-  // Skip the tests where brightness exceeds the bit-depth; we run into this
-  // issue because of gtest's limitation on valid combinations of test
-  // parameters. Also skip the tests where bit depth is greater than 8, but
-  // high bit depth representation is not set.
-  bool should_skip() const {
-    const int brightness = GET_PARAM(0);
-    const int bd = GET_PARAM(4);
-    if (brightness >= (1 << bd)) {
-      return true;
-    }
-    const bool high_bd = GET_PARAM(3);
-    if (bd > 8 && !high_bd) {
-      return true;
-    }
-    return false;
-  }
-
-  uint8_t *input_;
-  uint8_t *output_;
-};
-
-TEST_P(EdgeDetectBrightnessTest, BlurUniformBrightness) {
-  // Some combination of parameters are non-sensical, due to limitations
-  // of the testing framework. Ignore these.
-  if (should_skip()) {
-    return;
-  }
-
-  // For varying levels of brightness, the algorithm should
-  // produce the same output.
-  const int brightness = GET_PARAM(0);
-  const int width = GET_PARAM(1);
-  const int height = GET_PARAM(2);
-  const bool high_bd = GET_PARAM(3);
-  const int bd = GET_PARAM(4);
-
-  av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd,
-                    bd);
-  for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(brightness, get_pix(output_, i, high_bd));
-  }
-}
-
-// No edges on a uniformly bright image.
-TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) {
-  if (should_skip()) {
-    return;
-  }
-  const int width = GET_PARAM(1);
-  const int height = GET_PARAM(2);
-  const bool high_bd = GET_PARAM(3);
-  const int bd = GET_PARAM(4);
-
-  ASSERT_EQ(
-      0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd)
-             .magnitude);
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
-                         ::testing::Combine(
-                             // Brightness
-                             ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255,
-                                               256, 511, 512, 1023, 1024, 2048,
-                                               4095),
-                             // Width
-                             ::testing::Values(8, 16, 32),
-                             // Height
-                             ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Bool(),
-                             // Bit depth
-                             ::testing::Values(8, 10, 12)));
-#else
-INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
-                         ::testing::Combine(
-                             // Brightness
-                             ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255,
-                                               256, 511, 512, 1023, 1024, 2048,
-                                               4095),
-                             // Width
-                             ::testing::Values(8, 16, 32),
-                             // Height
-                             ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Values(false),
-                             // Bit depth
-                             ::testing::Values(8)));
-#endif
-
-class EdgeDetectImageTest :
-    // Parameters are (width, height, high bit depth representation, bit depth).
-    public ::testing::TestWithParam<tuple<int, int, bool, int> > {
- protected:
-  // Skip the tests where bit depth is greater than 8, but high bit depth
-  // representation is not set (limitation of testing framework).
-  bool should_skip() const {
-    const bool high_bd = GET_PARAM(2);
-    const int bd = GET_PARAM(3);
-    return bd > 8 && !high_bd;
-  }
-};
-
-// Generate images with black on one side and white on the other.
-TEST_P(EdgeDetectImageTest, BlackWhite) {
-  // Some combination of parameters are non-sensical, due to limitations
-  // of the testing framework. Ignore these.
-  if (should_skip()) {
-    return;
-  }
-
-  const int width = GET_PARAM(0);
-  const int height = GET_PARAM(1);
-  const bool high_bd = GET_PARAM(2);
-  const int bd = GET_PARAM(3);
-
-  const int white = (1 << bd) - 1;
-  std::unique_ptr<int[]> orig(new int[width * height]);
-  for (int j = 0; j < height; ++j) {
-    for (int i = 0; i < width; ++i) {
-      if (i < width / 2) {
-        orig[i + j * width] = 0;
-      } else {
-        orig[i + j * width] = white;
-      }
-    }
-  }
-
-  std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
-      pad_8tap_convolve(orig.get(), width, height, high_bd),
-      Pad8TapConvolveDeleter(width, high_bd));
-  ASSERT_NE(padded, nullptr);
-  // Value should be between 556 and 560.
-  ASSERT_LE(556, av1_edge_exists(padded.get(), stride_8tap(width), width,
-                                 height, high_bd, bd)
-                     .magnitude);
-  ASSERT_GE(560, av1_edge_exists(padded.get(), stride_8tap(width), width,
-                                 height, high_bd, bd)
-                     .magnitude);
-}
-
-// Hardcoded blur tests.
-static const int luma[32] = { 241, 147, 7,   90,  184, 103, 28,  186,
-                              2,   248, 49,  242, 114, 146, 127, 22,
-                              121, 228, 167, 108, 158, 174, 41,  168,
-                              214, 99,  184, 109, 114, 247, 117, 119 };
-static const uint8_t expected[] = { 161, 138, 119, 118, 123, 118, 113, 122,
-                                    143, 140, 134, 133, 134, 126, 116, 114,
-                                    147, 149, 145, 142, 143, 138, 126, 118,
-                                    164, 156, 148, 144, 148, 148, 138, 126 };
-
-static void hardcoded_blur_test_aux(const bool high_bd) {
-  const int w = 8;
-  const int h = 4;
-  for (int bd = 8; bd <= 12; bd += 2) {
-    // Skip the tests where bit depth is greater than 8, but high bit depth
-    // representation is not set.
-    if (bd > 8 && !high_bd) {
-      break;
-    }
-    std::unique_ptr<uint8_t[], MallocBdDeleter> output(
-        malloc_bd(w * h, high_bd), MallocBdDeleter(high_bd));
-    ASSERT_NE(output, nullptr);
-    std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
-        pad_8tap_convolve(luma, w, h, high_bd),
-        Pad8TapConvolveDeleter(w, high_bd));
-    ASSERT_NE(padded, nullptr);
-    av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), high_bd,
-                      bd);
-    for (int i = 0; i < w * h; ++i) {
-      ASSERT_EQ(expected[i], get_pix(output.get(), i, high_bd));
-    }
-
-    // If we multiply the inputs by a constant factor, the output should not
-    // vary more than 0.5 * factor.
-    for (int c = 2; c < (1 << (bd - 8)); ++c) {
-      int scaled_luma[32];
-      for (int i = 0; i < 32; ++i) {
-        scaled_luma[i] = luma[i] * c;
-      }
-      padded.reset(pad_8tap_convolve(scaled_luma, w, h, high_bd));
-      ASSERT_NE(padded, nullptr);
-      av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(),
-                        high_bd, bd);
-      for (int i = 0; i < w * h; ++i) {
-        ASSERT_GE(c / 2,
-                  abs(expected[i] * c - get_pix(output.get(), i, high_bd)));
-      }
-    }
-  }
-}
-
-TEST(EdgeDetectImageTest, HardcodedBlurTest) {
-  hardcoded_blur_test_aux(false);
-#if CONFIG_AV1_HIGHBITDEPTH
-  hardcoded_blur_test_aux(true);
-#endif
-}
-
-TEST(EdgeDetectImageTest, SobelTest) {
-  // Randomly generated 3x3. Compute Sobel for middle value.
-  const uint8_t buf[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
-  const int stride = 3;
-  bool high_bd = false;
-  sobel_xy result = av1_sobel(buf, stride, 1, 1, high_bd);
-  ASSERT_EQ(234, result.x);
-  ASSERT_EQ(140, result.y);
-
-#if CONFIG_AV1_HIGHBITDEPTH
-  // Verify it works for 8-bit values in a high bit-depth buffer.
-  const uint16_t buf8_16[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
-  high_bd = true;
-  result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd);
-  ASSERT_EQ(234, result.x);
-  ASSERT_EQ(140, result.y);
-
-  // Verify it works for high bit-depth values as well.
-  const uint16_t buf16[9] = { 241, 147, 7, 90, 184, 2003, 1028, 186, 2 };
-  result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd);
-  ASSERT_EQ(-2566, result.x);
-  ASSERT_EQ(-860, result.y);
-#endif
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest,
-                         ::testing::Combine(
-                             // Width
-                             ::testing::Values(8, 16, 32),
-                             // Height
-                             ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Bool(),
-                             // Bit depth
-                             ::testing::Values(8, 10, 12)));
-#else
-INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest,
-                         ::testing::Combine(
-                             // Width
-                             ::testing::Values(8, 16, 32),
-                             // Height
-                             ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Values(false),
-                             // Bit depth
-                             ::testing::Values(8)));
-#endif
-}  // namespace
diff --git a/media/libaom/src/test/encode_api_test.cc b/media/libaom/src/test/encode_api_test.cc
index 25bdb5c3f3..70b0612ced 100644
--- a/media/libaom/src/test/encode_api_test.cc
+++ b/media/libaom/src/test/encode_api_test.cc
@@ -9,22 +9,24 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <cstdlib>
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "config/aom_config.h"
 
-#include "test/util.h"
 #include "aom/aomcx.h"
 #include "aom/aom_encoder.h"
 
 namespace {
 
-TEST(EncodeAPI, InvalidParams) {
-  static const aom_codec_iface_t *kCodecs[] = {
-#if CONFIG_AV1_ENCODER
-    aom_codec_av1_cx(),
+#if CONFIG_REALTIME_ONLY
+const int kUsage = 1;
+#else
+const int kUsage = 0;
 #endif
-  };
+
+TEST(EncodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   aom_image_t img;
   aom_codec_ctx_t enc;
@@ -43,31 +45,61 @@ TEST(EncodeAPI, InvalidParams) {
             aom_codec_enc_config_default(NULL, &cfg, 0));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
 
-  for (const aom_codec_iface_t *iface : kCodecs) {
-    SCOPED_TRACE(aom_codec_iface_name(iface));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(NULL, iface, NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_init(&enc, iface, NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_config_default(iface, &cfg, 2));
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  SCOPED_TRACE(aom_codec_iface_name(iface));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_config_default(iface, &cfg, 3));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
+  EXPECT_TRUE(glob_headers->buf != NULL);
+  if (glob_headers) {
+    free(glob_headers->buf);
+    free(glob_headers);
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
 
-    EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
+TEST(EncodeAPI, InvalidControlId) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
 
-    aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
-    EXPECT_TRUE(glob_headers->buf != NULL);
-    if (glob_headers) {
-      free(glob_headers->buf);
-      free(glob_headers);
-    }
+#if !CONFIG_REALTIME_ONLY
+TEST(EncodeAPI, AllIntraMode) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  // Set g_lag_in_frames to a nonzero value. This should cause
+  // aom_codec_enc_init() to fail.
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+  cfg.g_lag_in_frames = 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
 
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
-  }
+  // Set kf_max_dist to a nonzero value. This should cause aom_codec_enc_init()
+  // to fail.
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+  cfg.kf_max_dist = 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
 }
+#endif
 
 }  // namespace
diff --git a/media/libaom/src/test/encode_perf_test.cc b/media/libaom/src/test/encode_perf_test.cc
index 390a6e0e62..b626acd043 100644
--- a/media/libaom/src/test/encode_perf_test.cc
+++ b/media/libaom/src/test/encode_perf_test.cc
@@ -179,6 +179,6 @@ TEST_P(AV1EncodePerfTest, PerfTest) {
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(AV1EncodePerfTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(AV1EncodePerfTest,
+                           ::testing::Values(::libaom_test::kRealTime));
 }  // namespace
diff --git a/media/libaom/src/test/encode_small_width_height_test.cc b/media/libaom/src/test/encode_small_width_height_test.cc
new file mode 100644
index 0000000000..fa0d16a100
--- /dev/null
+++ b/media/libaom/src/test/encode_small_width_height_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/2777.
+//
+// Encode images with a small width (<= two AV1 superblocks) or a small height
+// (<= one AV1 superblock) with multiple threads. aom_codec_encode() should
+// not crash.
+
+#include <memory>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Dummy buffer of zero samples.
+constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 };
+#if CONFIG_REALTIME_ONLY
+const int kUsage = 1;
+#else
+const int kUsage = 0;
+#endif
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+  // The image has only one tile and the tile is two AV1 superblocks wide.
+  // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
+  constexpr int kWidth = 128;
+  constexpr int kHeight = 512;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#if !CONFIG_REALTIME_ONLY
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+  // The image has only one tile and the tile is two AV1 superblocks wide.
+  // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
+  constexpr int kWidth = 256;
+  constexpr int kHeight = 512;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+#endif
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+  // The image has only one tile and the tile is one AV1 superblock tall.
+  // For speed >= 1, superblock size is 64x64 (see av1_select_sb_size()).
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 64;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#if !CONFIG_REALTIME_ONLY
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+  // The image has only one tile and the tile is one AV1 superblock tall.
+  // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 128;
+
+  aom_image_t img;
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+#endif
+
+// A reproducer test for aomedia:3113. The test should complete without any
+// memory errors.
+TEST(EncodeSmallWidthHeight, 1x1) {
+  constexpr int kWidth = 1;
+  constexpr int kHeight = 1;
+
+  // This test cannot use aom_img_alloc() or aom_img_wrap() because they call
+  // align_image_dimension() to align img.w and img.h to the next even number
+  // (2). In this test it is important to set img.w and img.h to 1. Therefore we
+  // set up img manually.
+  aom_image_t img;
+  memset(&img, 0, sizeof(img));
+  img.fmt = AOM_IMG_FMT_I420;
+  img.bit_depth = 8;
+  img.w = kWidth;
+  img.h = kHeight;
+  img.d_w = kWidth;
+  img.d_h = kHeight;
+  img.x_chroma_shift = 1;
+  img.y_chroma_shift = 1;
+  img.bps = 12;
+  int y_stride = kWidth;
+  int uv_stride = (kWidth + 1) >> 1;
+  int y_height = kHeight;
+  int uv_height = (kHeight + 1) >> 1;
+  img.stride[AOM_PLANE_Y] = y_stride;
+  img.stride[AOM_PLANE_U] = img.stride[AOM_PLANE_V] = uv_stride;
+  std::unique_ptr<unsigned char[]> y_plane(
+      new unsigned char[y_height * y_stride]());
+  ASSERT_NE(y_plane, nullptr);
+  std::unique_ptr<unsigned char[]> u_plane(
+      new unsigned char[uv_height * uv_stride]());
+  ASSERT_NE(u_plane, nullptr);
+  std::unique_ptr<unsigned char[]> v_plane(
+      new unsigned char[uv_height * uv_stride]());
+  ASSERT_NE(v_plane, nullptr);
+  img.planes[AOM_PLANE_Y] = y_plane.get();
+  img.planes[AOM_PLANE_U] = u_plane.get();
+  img.planes[AOM_PLANE_V] = v_plane.get();
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+}  // namespace
diff --git a/media/libaom/src/test/encode_test_driver.cc b/media/libaom/src/test/encode_test_driver.cc
index 01f8d501a2..96714f483b 100644
--- a/media/libaom/src/test/encode_test_driver.cc
+++ b/media/libaom/src/test/encode_test_driver.cc
@@ -83,27 +83,20 @@ void Encoder::Flush() {
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
 }
 
-void EncoderTest::InitializeConfig() {
-  const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);
-  ASSERT_EQ(AOM_CODEC_OK, res);
-}
-
-void EncoderTest::SetMode(TestMode mode) {
+void EncoderTest::InitializeConfig(TestMode mode) {
+  int usage = AOM_USAGE_GOOD_QUALITY;
   switch (mode) {
     case kOnePassGood:
     case kTwoPassGood: break;
-    case kRealTime: {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.g_usage = AOM_USAGE_REALTIME;
-      break;
-    }
+    case kRealTime: usage = AOM_USAGE_REALTIME; break;
+    case kAllIntra: usage = AOM_USAGE_ALL_INTRA; break;
     default: ASSERT_TRUE(false) << "Unexpected mode " << mode;
   }
   mode_ = mode;
-  if (mode == kTwoPassGood)
-    passes_ = 2;
-  else
-    passes_ = 1;
+  passes_ = (mode == kTwoPassGood) ? 2 : 1;
+
+  const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, usage);
+  ASSERT_EQ(AOM_CODEC_OK, res);
 }
 
 static bool compare_plane(const uint8_t *const buf1, int stride1,
@@ -180,9 +173,6 @@ void EncoderTest::MismatchHook(const aom_image_t *img_enc,
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
-  aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
-  dec_cfg.allow_lowbitdepth = 1;
-
   stats_.Reset();
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
@@ -199,7 +189,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
     BeginPassHook(pass);
     std::unique_ptr<Encoder> encoder(
         codec_->CreateEncoder(cfg_, init_flags_, &stats_));
-    ASSERT_TRUE(encoder.get() != NULL);
+    ASSERT_NE(encoder, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(video->Begin());
     encoder->InitEncoder(video);
@@ -209,10 +199,11 @@ void EncoderTest::RunLoop(VideoSource *video) {
     }
 
     ASSERT_FALSE(::testing::Test::HasFatalFailure());
-
+#if CONFIG_AV1_DECODER
+    aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+    dec_cfg.allow_lowbitdepth = 1;
     std::unique_ptr<Decoder> decoder(
         codec_->CreateDecoder(dec_cfg, 0 /* flags */));
-#if CONFIG_AV1_DECODER
     if (decoder->IsAV1()) {
       // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
       // frame is decoded.
@@ -233,17 +224,20 @@ void EncoderTest::RunLoop(VideoSource *video) {
         PreEncodeFrameHook(video);
         PreEncodeFrameHook(video, encoder.get());
         encoder->EncodeFrame(video, frame_flags_);
-
+        PostEncodeFrameHook(encoder.get());
         CxDataIterator iter = encoder->GetCxData();
-
         bool has_cxdata = false;
+
+#if CONFIG_AV1_DECODER
         bool has_dxdata = false;
+#endif
         while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
           pkt = MutateEncoderOutputHook(pkt);
           again = true;
           switch (pkt->kind) {
-            case AOM_CODEC_CX_FRAME_PKT:
+            case AOM_CODEC_CX_FRAME_PKT:  //
               has_cxdata = true;
+#if CONFIG_AV1_DECODER
               if (decoder.get() != NULL && DoDecode()) {
                 aom_codec_err_t res_dec;
                 if (DoDecodeInvisible()) {
@@ -260,6 +254,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
                 has_dxdata = true;
               }
+#endif
               ASSERT_GE(pkt->data.frame.pts, last_pts_);
               if (sl == number_spatial_layers_) last_pts_ = pkt->data.frame.pts;
               FramePktHook(pkt);
@@ -267,22 +262,31 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
             case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
 
+            case AOM_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+
             default: break;
           }
         }
-
-        if (has_dxdata && has_cxdata) {
+        if (has_cxdata) {
           const aom_image_t *img_enc = encoder->GetPreviewFrame();
-          DxDataIterator dec_iter = decoder->GetDxData();
-          const aom_image_t *img_dec = dec_iter.Next();
-          if (img_enc && img_dec) {
-            const bool res =
-                compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
-            if (!res) {  // Mismatch
-              MismatchHook(img_enc, img_dec);
+          if (img_enc) {
+            CalculateFrameLevelSSIM(video->img(), img_enc, cfg_.g_bit_depth,
+                                    cfg_.g_input_bit_depth);
+          }
+#if CONFIG_AV1_DECODER
+          if (has_dxdata) {
+            DxDataIterator dec_iter = decoder->GetDxData();
+            const aom_image_t *img_dec = dec_iter.Next();
+            if (img_enc && img_dec) {
+              const bool res =
+                  compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
+              if (!res) {  // Mismatch
+                MismatchHook(img_enc, img_dec);
+              }
             }
+            if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
           }
-          if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
+#endif
         }
         if (!Continue()) break;
       }  // Loop over spatial layers
diff --git a/media/libaom/src/test/encode_test_driver.h b/media/libaom/src/test/encode_test_driver.h
index 6319a52202..84ca64c600 100644
--- a/media/libaom/src/test/encode_test_driver.h
+++ b/media/libaom/src/test/encode_test_driver.h
@@ -28,7 +28,7 @@ namespace libaom_test {
 class CodecFactory;
 class VideoSource;
 
-enum TestMode { kRealTime, kOnePassGood, kTwoPassGood };
+enum TestMode { kRealTime, kOnePassGood, kTwoPassGood, kAllIntra };
 #define ALL_TEST_MODES                                                     \
   ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood, \
                     ::libaom_test::kTwoPassGood)
@@ -129,11 +129,21 @@ class Encoder {
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct aom_svc_ref_frame_comp_pred *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
   void Control(int ctrl_id, struct aom_svc_params *arg) {
     const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct aom_ext_part_funcs *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
 #if CONFIG_AV1_ENCODER
   void Control(int ctrl_id, aom_active_map_t *arg) {
     const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
@@ -141,6 +151,11 @@ class Encoder {
   }
 #endif
 
+  void SetOption(const char *name, const char *value) {
+    const aom_codec_err_t res = aom_codec_set_option(&encoder_, name, value);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
   void Config(const aom_codec_enc_cfg_t *cfg) {
     const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
@@ -186,11 +201,9 @@ class EncoderTest {
 
   virtual ~EncoderTest() {}
 
-  // Initialize the cfg_ member with the default configuration.
-  void InitializeConfig();
-
-  // Map the TestMode enum to the passes_ variables.
-  void SetMode(TestMode mode);
+  // Initialize the cfg_ member with the default configuration for the
+  // TestMode enum and maps the TestMode enum to the passes_ variable.
+  void InitializeConfig(TestMode mode);
 
   // Set encoder flag.
   void set_init_flags(aom_codec_flags_t flag) { init_flags_ = flag; }
@@ -209,12 +222,23 @@ class EncoderTest {
   virtual void PreEncodeFrameHook(VideoSource * /*video*/,
                                   Encoder * /*encoder*/) {}
 
+  virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
+
   // Hook to be called on every compressed data packet.
   virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
 
   // Hook to be called on every PSNR packet.
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
 
+  // Hook to be called on every first pass stats packet.
+  virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+  // Calculates SSIM at frame level.
+  virtual void CalculateFrameLevelSSIM(const aom_image_t * /*img_src*/,
+                                       const aom_image_t * /*img_enc*/,
+                                       aom_bit_depth_t /*bit_depth*/,
+                                       unsigned int /*input_bit_depth*/) {}
+
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
     return !(::testing::Test::HasFatalFailure() || abort_);
diff --git a/media/libaom/src/test/encodemb_test.cc b/media/libaom/src/test/encodemb_test.cc
new file mode 100644
index 0000000000..4c725c7dea
--- /dev/null
+++ b/media/libaom/src/test/encodemb_test.cc
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+// Reorders 'qcoeff_lexico', which is in lexicographic order (row by row), into
+// scan order (zigzag) in 'qcoeff_scan'.
+void ToScanOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_lexico,
+                 tran_low_t *qcoeff_scan) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  for (int i = 0; i < max_eob; ++i) {
+    qcoeff_scan[i] = qcoeff_lexico[scan_order->scan[i]];
+  }
+}
+
+// Reorders 'qcoeff_scan', which is in scan order (zigzag), into lexicographic
+// order (row by row) in 'qcoeff_lexico'.
+void ToLexicoOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_scan,
+                   tran_low_t *qcoeff_lexico) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  for (int i = 0; i < max_eob; ++i) {
+    qcoeff_lexico[scan_order->scan[i]] = qcoeff_scan[i];
+  }
+}
+
+// Runs coefficient dropout on 'qcoeff_scan'.
+void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before,
+             int dropout_num_after, tran_low_t *qcoeff_scan) {
+  tran_low_t qcoeff[MAX_TX_SQUARE];
+  // qcoeff_scan is assumed to be in scan order, since tests are easier to
+  // understand this way, but av1_dropout_qcoeff expects coeffs in lexico order
+  // so we convert to lexico then back to scan afterwards.
+  ToLexicoOrder(tx_size, tx_type, qcoeff_scan, qcoeff);
+
+  const int max_eob = av1_get_max_eob(tx_size);
+  const int kDequantFactor = 10;
+  tran_low_t dqcoeff[MAX_TX_SQUARE];
+  for (int i = 0; i < max_eob; ++i) {
+    dqcoeff[i] = qcoeff[i] * kDequantFactor;
+  }
+
+  uint16_t eob = max_eob;
+  while (eob > 0 && qcoeff_scan[eob - 1] == 0) --eob;
+
+  MACROBLOCK mb;
+  const int kPlane = 0;
+  const int kBlock = 0;
+  memset(&mb, 0, sizeof(mb));
+  uint16_t eobs[] = { eob };
+  mb.plane[kPlane].eobs = eobs;
+  mb.plane[kPlane].qcoeff = qcoeff;
+  mb.plane[kPlane].dqcoeff = dqcoeff;
+  uint8_t txb_entropy_ctx[1];
+  mb.plane[kPlane].txb_entropy_ctx = txb_entropy_ctx;
+
+  av1_dropout_qcoeff_num(&mb, kPlane, kBlock, tx_size, tx_type,
+                         dropout_num_before, dropout_num_after);
+
+  ToScanOrder(tx_size, tx_type, qcoeff, qcoeff_scan);
+
+  // Check updated eob value is valid.
+  uint16_t new_eob = max_eob;
+  while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob;
+  EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]);
+
+  // Check qqcoeff is still valid.
+  for (int i = 0; i < max_eob; ++i) {
+    EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]);
+  }
+}
+
+void ExpectArrayEq(tran_low_t *actual, std::vector<tran_low_t> expected) {
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(expected[i], actual[i]) << "Arrays differ at index " << i;
+  }
+}
+
+static constexpr TX_TYPE kTxType = DCT_DCT;
+
+TEST(DropoutTest, KeepsLargeCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Large isolated coeffs should be preserved.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 0, 0, 42, 0,    // should be kept
+                               0, 0, 0, 0, 0, 0, 0,  0,    //
+                               0, 0, 0, 0, 0, 0, 0,  -30,  // should be kept
+                               0, 0, 0, 0, 0, 0, 0,  0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 42, 0,    //
+                               0, 0, 0, 0, 0, 0, 0,  0,    //
+                               0, 0, 0, 0, 0, 0, 0,  -30,  //
+                               0, 0, 0, 0, 0, 0, 0,  0 });
+}
+
+TEST(DropoutTest, RemovesSmallIsolatedCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small isolated coeffs should be removed.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1,  0, 0, 0,  // should be removed
+                               0, 0, 0, 0, 0,  0, 0, 0,  //
+                               0, 0, 0, 0, -2, 0, 0, 0,  // should be removed
+                               0, 0, 0, 0, 0,  0, 0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsSmallCoeffsAmongLargeOnes) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small coeffs that are not isolated (not enough zeros before/after should be
+  // kept).
+  tran_low_t qcoeff_scan[] = {
+    1, 0,  0, 0,  -5, 0, 0, -1,  // should be kept
+    0, 0,  0, 10, 0,  0, 2, 0,   // should be kept
+    0, 0,  0, 0,  0,  0, 0, 0,   //
+    0, -2, 0, 0,  0,  0, 0, 0    // should be removed
+  };                             // should be removed
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 1, 0, 0, 0,  -5, 0, 0, -1,  //
+                               0, 0, 0, 10, 0,  0, 2, 0,   //
+                               0, 0, 0, 0,  0,  0, 0, 0,   //
+                               0, 0, 0, 0,  0,  0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsSmallCoeffsCloseToStartOrEnd) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small coeffs that are too close to the beginning or end of the block
+  // should also be kept (not enough zeroes before/after).
+  tran_low_t qcoeff_scan[] = { 0, 0, -1, 0,  0, 0, 0,  0,  // should be kept
+                               0, 0, 0,  10, 0, 0, 0,  0,  // should be kept
+                               0, 0, 0,  2,  0, 0, 0,  0,  // should be removed
+                               0, 0, 0,  0,  0, 0, -1, 0 };  // should be kept
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, -1, 0,  0, 0, 0,  0,  //
+                               0, 0, 0,  10, 0, 0, 0,  0,  //
+                               0, 0, 0,  0,  0, 0, 0,  0,  //
+                               0, 0, 0,  0,  0, 0, -1, 0 });
+}
+
+TEST(DropoutTest, RemovesSmallClusterOfCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small clusters (<= kDropoutContinuityMax) of small coeffs should be
+  // removed.
+  tran_low_t qcoeff_scan_two[] = {
+    0, 0, 0, 0, 1, 0, 0, -1,  // should be removed
+    0, 0, 0, 0, 0, 0, 0, 0,   //
+    0, 0, 0, 0, 0, 0, 1, 0,   // should be removed
+    0, 0, 0, 0, 0, 0, 0, 0
+  };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after,
+          qcoeff_scan_two);
+  ExpectArrayEq(qcoeff_scan_two, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsLargeClusterOfCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Large clusters (> kDropoutContinuityMax) of small coeffs should be kept.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0,  1, -1,  // should be kept
+                               0, 0, 0, 0, 0, 0,  0, 0,   //
+                               0, 0, 0, 0, 0, -2, 0, 0,   // should be removed
+                               0, 0, 0, 0, 0, 0,  0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 1, 0, 1, -1,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,   //
+                               0, 0, 0, 0, 0, 0, 0, 0,   //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, NumBeforeLargerThanNumAfter) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 2;
+  // The second coeff (-2) doesn't seem to meet the dropout_num_before
+  // criteria. But since the first coeff (1) will be dropped, it will meet
+  // the criteria and should be dropped too.
+  tran_low_t qcoeff_scan[] = { 0,  0, 0, 0, 1, 0, 0, 0,  // should be removed
+                               -2, 0, 0, 0, 0, 0, 0, 0,  // should be removed
+                               0,  0, 0, 0, 0, 0, 0, 0,  //
+                               0,  0, 0, 0, 0, 0, 0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+// More complex test combining other test cases.
+TEST(DropoutTest, ComplexTest) {
+  const TX_SIZE tx_size = TX_8X8;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 2;
+  tran_low_t qcoeff_scan[] = { 1, 12, 0,  0,   0, 0, 1,  0,   //
+                               0, 0,  0,  -12, 0, 0, 0,  1,   //
+                               0, 0,  -2, 0,   1, 0, 0,  1,   //
+                               0, 0,  0,  0,   5, 0, -1, 0,   //
+                               0, 0,  0,  1,   0, 0, 0,  -1,  //
+                               0, 0,  0,  0,   2, 0, 0,  0,   //
+                               0, 1,  0,  0,   0, 5, 0,  0,   //
+                               0, 0,  1,  1,   0, 0, 0,  -2 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 1, 12, 0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  -12, 0, 0, 0,  1,  //
+                               0, 0,  -2, 0,   1, 0, 0,  1,  //
+                               0, 0,  0,  0,   5, 0, -1, 0,  //
+                               0, 0,  0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  0,   0, 5, 0,  0,  //
+                               0, 0,  0,  0,   0, 0, 0,  -2 });
+}
+
+}  // namespace
diff --git a/media/libaom/src/test/encodetxb_test.cc b/media/libaom/src/test/encodetxb_test.cc
index 385d3f1a8b..ee09ea6aff 100644
--- a/media/libaom/src/test/encodetxb_test.cc
+++ b/media/libaom/src/test/encodetxb_test.cc
@@ -26,7 +26,6 @@
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -48,16 +47,15 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
   virtual void SetUp() {
     coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
         aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
-    ASSERT_TRUE(coeff_contexts_ref_ != NULL);
+    ASSERT_NE(coeff_contexts_ref_, nullptr);
     coeff_contexts_ = reinterpret_cast<int8_t *>(
         aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE));
-    ASSERT_TRUE(coeff_contexts_ != NULL);
+    ASSERT_NE(coeff_contexts_, nullptr);
   }
 
   virtual void TearDown() {
     aom_free(coeff_contexts_ref_);
     aom_free(coeff_contexts_);
-    libaom_test::ClearSystemState();
   }
 
   void GetNzMapContextsRun() {
@@ -100,6 +98,7 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
   void SpeedTestGetNzMapContextsRun() {
     const int kNumTests = 2000000000;
     aom_usec_timer timer;
+    aom_usec_timer timer_ref;
 
     printf("Note: Only test the largest possible eob case!\n");
     for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
@@ -117,6 +116,16 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
       levels_ = set_levels(levels_buf_, width);
       InitDataWithEob(scan, bwl, eob);
 
+      aom_usec_timer_start(&timer_ref);
+      for (int i = 0; i < numTests; ++i) {
+        av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                  tx_class, coeff_contexts_ref_);
+      }
+      aom_usec_timer_mark(&timer_ref);
+
+      levels_ = set_levels(levels_buf_, width);
+      InitDataWithEob(scan, bwl, eob);
+
       aom_usec_timer_start(&timer);
       for (int i = 0; i < numTests; ++i) {
         get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
@@ -124,9 +133,14 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
       }
       aom_usec_timer_mark(&timer);
 
+      const int elapsed_time_ref =
+          static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
       const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
-             elapsed_time / 1000.0);
+
+      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms ref %7.1f ms gain %4.2f\n",
+             real_width, real_height, elapsed_time / 1000.0,
+             elapsed_time_ref / 1000.0,
+             (elapsed_time_ref * 1.0) / (elapsed_time * 1.0));
     }
   }
 
@@ -170,6 +184,7 @@ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
   int8_t *coeff_contexts_ref_;
   int8_t *coeff_contexts_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbTest);
 
 TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
 
@@ -182,6 +197,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, EncodeTxbTest,
                          ::testing::Values(av1_get_nz_map_contexts_sse2));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, EncodeTxbTest,
+                         ::testing::Values(av1_get_nz_map_contexts_neon));
+#endif
+
 typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
                                          const int width, const int height,
                                          uint8_t *const levels);
@@ -192,9 +212,10 @@ class EncodeTxbInitLevelTest
     : public ::testing::TestWithParam<TxbInitLevelParam> {
  public:
   virtual ~EncodeTxbInitLevelTest() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
 
 void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
                                      int tx_size, int is_speed) {
@@ -260,4 +281,10 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_neon),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
 }  // namespace
diff --git a/media/libaom/src/test/end_to_end_test.cc b/media/libaom/src/test/end_to_end_psnr_test.cc
index 162a7c7437..e6ab0ff46b 100644
--- a/media/libaom/src/test/end_to_end_test.cc
+++ b/media/libaom/src/test/end_to_end_psnr_test.cc
@@ -27,23 +27,14 @@ const unsigned int kHeight = 90;
 const unsigned int kFramerate = 50;
 const unsigned int kFrames = 10;
 const int kBitrate = 500;
-// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
-const double kPsnrThreshold[][5] = {
-// Note:
-// AV1 HBD average PSNR is slightly lower than AV1.
-// We make two cases here to enable the testing and
-// guard picture quality.
-#if CONFIG_AV1_ENCODER
-  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
-  { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
-  { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
-#else
-  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
-  { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
-  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
-#endif  // CONFIG_AV1_ENCODER
+const unsigned int kCqLevel = 18;
+// List of psnr thresholds for speed settings 0-8 and 4 encoding modes
+const double kPsnrThreshold[][4] = {
+  { 34.9, 44.4, 39.5, 41.9 }, { 34.9, 44.4, 39.5, 41.9 },
+  { 34.9, 44.4, 39.4, 41.9 }, { 34.9, 44.4, 39.1, 41.8 },
+  { 34.9, 44.4, 39.1, 41.8 }, { 34.9, 44.29, 38.5, 41.8 },
+  { 34.9, 44.3, 38.5, 41.3 }, { 34.9, 44.3, 38.5, 40.8 },
+  { 34.9, 44.3, 38.5, 40.8 }
 };
 
 typedef struct {
@@ -85,14 +76,6 @@ const libaom_test::TestMode kEncodingModeVectors[] = {
 // Speed settings tested
 const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
 
-int is_extension_y4m(const char *filename) {
-  const char *dot = strrchr(filename, '.');
-  if (!dot || dot == filename)
-    return 0;
-  else
-    return !strcmp(dot, ".y4m");
-}
-
 class EndToEndTest
     : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
                                                  TestVideoParam, int>,
@@ -106,14 +89,11 @@ class EndToEndTest
   virtual ~EndToEndTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-    if (encoding_mode_ != ::libaom_test::kRealTime) {
+    InitializeConfig(encoding_mode_);
+    if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+        encoding_mode_ == ::libaom_test::kTwoPassGood) {
       cfg_.g_lag_in_frames = 5;
-      cfg_.rc_end_usage = AOM_VBR;
-    } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
+    } else if (encoding_mode_ == ::libaom_test::kRealTime) {
       cfg_.rc_buf_sz = 1000;
       cfg_.rc_buf_initial_sz = 500;
       cfg_.rc_buf_optimal_sz = 600;
@@ -141,10 +121,13 @@ class EndToEndTest
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
       else
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
+      if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+          encoding_mode_ == ::libaom_test::kTwoPassGood) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
       }
     }
   }
@@ -176,7 +159,7 @@ class EndToEndTest
           test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
           kFramerate, 1, 0, kFrames));
     }
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
@@ -195,17 +178,35 @@ class EndToEndTest
 
 class EndToEndTestLarge : public EndToEndTest {};
 
+class EndToEndAllIntraTestLarge : public EndToEndTest {};
+
+class EndToEndAllIntraTest : public EndToEndTest {};
+
 TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
 
 TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
-                          ::testing::ValuesIn(kEncodingModeVectors),
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+TEST_P(EndToEndAllIntraTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndAllIntraTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(kTestVectors[2]),  // 444
+                           ::testing::Values(3));               // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
 
-AV1_INSTANTIATE_TEST_CASE(EndToEndTest,
-                          ::testing::Values(kEncodingModeVectors[0]),
-                          ::testing::Values(kTestVectors[2]),  // 444
-                          ::testing::Values(kCpuUsedVectors[2]));
+AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
 }  // namespace
diff --git a/media/libaom/src/test/end_to_end_qmpsnr_test.cc b/media/libaom/src/test/end_to_end_qmpsnr_test.cc
new file mode 100644
index 0000000000..de183adc5f
--- /dev/null
+++ b/media/libaom/src/test/end_to_end_qmpsnr_test.cc
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const unsigned int kCqLevel = 18;
+// List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+                                  83.0, 82.3, 81.1, 81.1 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// This class is used to check adherence to given ssim value, while using the
+// "dist-metric=qm-psnr" option.
+class EndToEndQMPSNRTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndQMPSNRTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+        ssim_(0.0) {}
+
+  ~EndToEndQMPSNRTest() override {}
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  void BeginPassHook(unsigned int) override {
+    nframes_ = 0;
+    ssim_ = 0.0;
+  }
+
+  void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+                               const aom_image_t *img_enc,
+                               aom_bit_depth_t bit_depth,
+                               unsigned int input_bit_depth) override {
+    double frame_ssim;
+    double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+    int crop_widths[PLANE_TYPES];
+    int crop_heights[PLANE_TYPES];
+    crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+    crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+    // Width of UV planes calculated based on chroma_shift values.
+    crop_widths[PLANE_TYPE_UV] =
+        img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w;
+    crop_heights[PLANE_TYPE_UV] =
+        img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h;
+    nframes_++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    uint8_t is_hbd = bit_depth > AOM_BITS_8;
+    if (is_hbd) {
+      // HBD ssim calculation.
+      uint8_t shift = bit_depth - input_bit_depth;
+      for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+        const int is_uv = i > AOM_PLANE_Y;
+        plane_ssim[i] = aom_highbd_ssim2(
+            CONVERT_TO_BYTEPTR(img_src->planes[i]),
+            CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+            img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+            crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+      }
+      frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                   .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+      // Accumulate to find sequence level ssim value.
+      ssim_ += frame_ssim;
+      return;
+    }
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // LBD ssim calculation.
+    for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+      const int is_uv = i > AOM_PLANE_Y;
+      plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+                                img_src->stride[is_uv], img_enc->stride[is_uv],
+                                crop_widths[is_uv], crop_heights[is_uv]);
+    }
+    frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                 .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+    // Accumulate to find sequence level ssim value.
+    ssim_ += frame_ssim;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      encoder->SetOption("dist-metric", "qm-psnr");
+    }
+  }
+
+  double GetAverageSsim() const {
+    if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+    return 0.0;
+  }
+
+  double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                        kFrames));
+    ASSERT_NE(video, nullptr);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double ssim = GetAverageSsim();
+    EXPECT_GT(ssim, GetSsimThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const TestVideoParam test_video_param_;
+  const int cpu_used_;
+  unsigned int nframes_;
+  double ssim_;
+};
+
+class EndToEndQMPSNRTestLarge : public EndToEndQMPSNRTest {};
+
+TEST_P(EndToEndQMPSNRTestLarge, EndtoEndQMPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndQMPSNRTest, EndtoEndQMPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
+}  // namespace
diff --git a/media/libaom/src/test/end_to_end_ssim_test.cc b/media/libaom/src/test/end_to_end_ssim_test.cc
new file mode 100644
index 0000000000..2e40c9486b
--- /dev/null
+++ b/media/libaom/src/test/end_to_end_ssim_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const unsigned int kCqLevel = 18;
+// List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+                                  83.0, 82.3, 81.1, 81.1 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// This class is used to check adherence to given ssim value.
+class EndToEndSSIMTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndSSIMTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+        ssim_(0.0) {}
+
+  ~EndToEndSSIMTest() override {}
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  void BeginPassHook(unsigned int) override {
+    nframes_ = 0;
+    ssim_ = 0.0;
+  }
+
+  void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+                               const aom_image_t *img_enc,
+                               aom_bit_depth_t bit_depth,
+                               unsigned int input_bit_depth) override {
+    double frame_ssim;
+    double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+    int crop_widths[PLANE_TYPES];
+    int crop_heights[PLANE_TYPES];
+    crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+    crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+    // Width of UV planes calculated based on chroma_shift values.
+    crop_widths[PLANE_TYPE_UV] =
+        img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w;
+    crop_heights[PLANE_TYPE_UV] =
+        img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h;
+    nframes_++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    uint8_t is_hbd = bit_depth > AOM_BITS_8;
+    if (is_hbd) {
+      // HBD ssim calculation.
+      uint8_t shift = bit_depth - input_bit_depth;
+      for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+        const int is_uv = i > AOM_PLANE_Y;
+        plane_ssim[i] = aom_highbd_ssim2(
+            CONVERT_TO_BYTEPTR(img_src->planes[i]),
+            CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+            img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+            crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+      }
+      frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                   .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+      // Accumulate to find sequence level ssim value.
+      ssim_ += frame_ssim;
+      return;
+    }
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // LBD ssim calculation.
+    for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+      const int is_uv = i > AOM_PLANE_Y;
+      plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+                                img_src->stride[is_uv], img_enc->stride[is_uv],
+                                crop_widths[is_uv], crop_heights[is_uv]);
+    }
+    frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                 .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+    // Accumulate to find sequence level ssim value.
+    ssim_ += frame_ssim;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+    }
+  }
+
+  double GetAverageSsim() const {
+    if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+    return 0.0;
+  }
+
+  double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                        kFrames));
+    ASSERT_NE(video, nullptr);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double ssim = GetAverageSsim();
+    EXPECT_GT(ssim, GetSsimThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const TestVideoParam test_video_param_;
+  const int cpu_used_;
+  unsigned int nframes_;
+  double ssim_;
+};
+
+class EndToEndSSIMTestLarge : public EndToEndSSIMTest {};
+
+TEST_P(EndToEndSSIMTestLarge, EndtoEndSSIMTest) { DoTest(); }
+
+TEST_P(EndToEndSSIMTest, EndtoEndSSIMTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
+}  // namespace
diff --git a/media/libaom/src/test/error_block_test.cc b/media/libaom/src/test/error_block_test.cc
index 462661e61a..e4befd5f8b 100644
--- a/media/libaom/src/test/error_block_test.cc
+++ b/media/libaom/src/test/error_block_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/entropy.h"
@@ -32,16 +31,20 @@ using libaom_test::ACMRandom;
 namespace {
 const int kNumIterations = 1000;
 
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+using ErrorBlockFunc = int64_t (*)(const tran_low_t *coeff,
+                                   const tran_low_t *dqcoeff,
+                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
-                                       const tran_low_t *dqcoeff,
-                                       intptr_t block_size, int64_t *ssz);
+using ErrorBlockFunc8Bits = int64_t (*)(const tran_low_t *coeff,
+                                        const tran_low_t *dqcoeff,
+                                        intptr_t block_size, int64_t *ssz);
 
-typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
-    ErrorBlockParam;
+using ErrorBlockLpFunc = int64_t (*)(const int16_t *coeff,
+                                     const int16_t *dqcoeff,
+                                     intptr_t block_size);
+
+using ErrorBlockParam =
+    std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>;
 
 template <ErrorBlockFunc8Bits fn>
 int64_t BlockError8BitWrapper(const tran_low_t *coeff,
@@ -51,6 +54,15 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff,
   return fn(coeff, dqcoeff, block_size, ssz);
 }
 
+template <ErrorBlockLpFunc fn>
+int64_t BlockErrorLpWrapper(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  *ssz = -1;
+  return fn(reinterpret_cast<const int16_t *>(coeff),
+            reinterpret_cast<const int16_t *>(dqcoeff), block_size);
+}
+
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
   virtual ~ErrorBlockTest() {}
@@ -60,13 +72,14 @@ class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   aom_bit_depth_t bit_depth_;
   ErrorBlockFunc error_block_op_;
   ErrorBlockFunc ref_error_block_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ErrorBlockTest);
 
 TEST_P(ErrorBlockTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -98,7 +111,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
     }
     ref_ret =
         ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
     err_count += (ref_ret != ret) | (ref_ssz != ssz);
     if (err_count && !err_count_total) {
@@ -156,7 +169,7 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
     }
     ref_ret =
         ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
     err_count += (ref_ret != ret) | (ref_ssz != ssz);
     if (err_count && !err_count_total) {
@@ -246,7 +259,9 @@ const ErrorBlockParam kErrorBlockTestParamsSse2[] = {
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sse2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
@@ -264,7 +279,9 @@ const ErrorBlockParam kErrorBlockTestParamsAvx2[] = {
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_avx2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
@@ -280,10 +297,14 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_MSA
 
 #if (HAVE_NEON)
-INSTANTIATE_TEST_SUITE_P(
-    NEON, ErrorBlockTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
-                                 &BlockError8BitWrapper<av1_block_error_c>,
-                                 AOM_BITS_8)));
+const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+  make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsNeon));
 #endif  // HAVE_NEON
 }  // namespace
diff --git a/media/libaom/src/test/error_resilience_test.cc b/media/libaom/src/test/error_resilience_test.cc
index 1d52bb24a3..1ef72c88a8 100644
--- a/media/libaom/src/test/error_resilience_test.cc
+++ b/media/libaom/src/test/error_resilience_test.cc
@@ -27,12 +27,13 @@ const int kMaxSFrames = 12;
 const int kCpuUsed = 1;
 
 class ErrorResilienceTestLarge
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
  protected:
   ErrorResilienceTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0), mismatch_psnr_(0.0),
-        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
+        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0),
+        enable_altref_(GET_PARAM(2)) {
     Reset();
   }
 
@@ -57,10 +58,7 @@ class ErrorResilienceTestLarge
     init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-  }
+  virtual void SetUp() { InitializeConfig(encoding_mode_); }
 
   virtual void BeginPassHook(unsigned int /*pass*/) {
     psnr_ = 0.0;
@@ -77,7 +75,10 @@ class ErrorResilienceTestLarge
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
                                   libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
+    }
     frame_flags_ &=
         ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
           AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
@@ -320,6 +321,7 @@ class ErrorResilienceTestLarge
   unsigned int s_frames_[kMaxSFrames];
   libaom_test::TestMode encoding_mode_;
   int allow_mismatch_;
+  int enable_altref_;
 };
 
 TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
@@ -356,6 +358,10 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
 // if we lose (i.e., drop before decoding) a set of droppable
 // frames (i.e., frames that don't update any reference buffers).
 TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+  if (GET_PARAM(1) == ::libaom_test::kOnePassGood && GET_PARAM(2) == 1) {
+    fprintf(stderr, "Skipping test case #1 because of bug aomedia:3002\n");
+    return;
+  }
   SetupEncoder(500, 10);
   libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                      cfg_.g_timebase.den, cfg_.g_timebase.num,
@@ -455,5 +461,6 @@ TEST_P(ErrorResilienceTestLarge, SFrameTest) {
   EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
 }
 
-AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
+AV1_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES,
+                           ::testing::Values(0, 1));
 }  // namespace
diff --git a/media/libaom/src/test/ethread_test.cc b/media/libaom/src/test/ethread_test.cc
index 306cc2f3a8..0074ff31b5 100644
--- a/media/libaom/src/test/ethread_test.cc
+++ b/media/libaom/src/test/ethread_test.cc
@@ -16,9 +16,192 @@
 #include "test/encode_test_driver.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
+#include "av1/encoder/firstpass.h"
 
 namespace {
+const unsigned int kCqLevel = 18;
+
+#if !CONFIG_REALTIME_ONLY
+const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS);
+class AVxFirstPassEncoderThreadTest
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AVxFirstPassEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+        tile_rows_(GET_PARAM(3)), tile_cols_(GET_PARAM(4)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+
+    row_mt_ = 1;
+    firstpass_stats_.buf = NULL;
+    firstpass_stats_.sz = 0;
+  }
+  virtual ~AVxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+    abort_ = false;
+  }
+
+  virtual void EndPassHook() {
+    // For first pass stats test, only run first pass encoder.
+    if (cfg_.g_pass == AOM_RC_FIRST_PASS) abort_ = true;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                                  ::libaom_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      // Encode in 2-pass mode.
+      SetTileSize(encoder);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+
+  virtual void StatsPktHook(const aom_codec_cx_pkt_t *pkt) {
+    const uint8_t *const pkt_buf =
+        reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
+    const size_t pkt_size = pkt->data.twopass_stats.sz;
+
+    // First pass stats size equals sizeof(FIRSTPASS_STATS)
+    EXPECT_EQ(pkt_size, kFirstPassStatsSz)
+        << "Error: First pass stats size doesn't equal kFirstPassStatsSz";
+
+    firstpass_stats_.buf =
+        realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    ASSERT_NE(firstpass_stats_.buf, nullptr);
+    memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
+           pkt_size);
+    firstpass_stats_.sz += pkt_size;
+  }
+
+  bool encoder_initialized_;
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int tile_rows_;
+  int tile_cols_;
+  int row_mt_;
+  aom_fixed_buf_t firstpass_stats_;
+};
+
+static void compare_fp_stats_md5(aom_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libaom_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+}
+
+TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  aom_fixed_buf_t firstpass_stats;
+  size_t single_run_sz;
+
+  cfg_.rc_target_bitrate = 1000;
+
+  // 5 encodes will be run:
+  // 1. row_mt_=0 and threads=1
+  // 2. row_mt_=1 and threads=1
+  // 3. row_mt_=1 and threads=2
+  // 4. row_mt_=1 and threads=4
+  // 5. row_mt_=1 and threads=8
+
+  // 4 comparisons will be made:
+  // 1. Between run 1 and run 2.
+  // 2. Between run 2 and run 3.
+  // 3. Between run 3 and run 4.
+  // 4. Between run 4 and run 5.
+
+  // Test row_mt_: 0 vs 1 at single thread case(threads = 1)
+  cfg_.g_threads = 1;
+
+  row_mt_ = 0;
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  firstpass_stats.buf = firstpass_stats_.buf;
+  firstpass_stats.sz = firstpass_stats_.sz;
+  single_run_sz = firstpass_stats_.sz / 2;
+
+  // Compare to check if using or not using row-mt are bit exact.
+  // Comparison 1 (between row_mt_=0 and row_mt_=1).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  // Test single thread vs multiple threads
+  row_mt_ = 1;
+
+  cfg_.g_threads = 2;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 2nd and 3rd run.
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz);
+
+  // Compare to check if single-thread and multi-thread stats are bit exact.
+  // Comparison 2 (between threads=1 and threads=2).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 3rd and 4th run
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 2);
+
+  // Comparison 3 (between threads=2 and threads=4).
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // offset to the 4th and 5th run.
+  firstpass_stats.buf = reinterpret_cast<void *>(
+      reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 3);
+
+  // Comparison 4 (between threads=4 and threads=8).
+  compare_fp_stats_md5(&firstpass_stats);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
 class AVxEncoderThreadTest
     : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
                                                  int, int, int>,
@@ -47,17 +230,14 @@ class AVxEncoderThreadTest
   virtual ~AVxEncoderThreadTest() { delete decoder_; }
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
 
-    if (encoding_mode_ != ::libaom_test::kRealTime) {
-      cfg_.g_lag_in_frames = 5;
-      cfg_.rc_end_usage = AOM_VBR;
+    if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+        encoding_mode_ == ::libaom_test::kTwoPassGood) {
+      cfg_.g_lag_in_frames = 6;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
-    } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
+    } else if (encoding_mode_ == ::libaom_test::kRealTime) {
       cfg_.g_error_resilient = 1;
     }
     cfg_.rc_max_quantizer = 56;
@@ -74,14 +254,22 @@ class AVxEncoderThreadTest
       SetTileSize(encoder);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_ROW_MT, row_mt_);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
+      if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+          encoding_mode_ == ::libaom_test::kTwoPassGood) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
         encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
-      } else {
+        encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4);
+      } else if (encoding_mode_ == ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
         encoder->Control(AV1E_SET_AQ_MODE, 3);
+        encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 3);
+        encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 3);
+      } else {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
       }
       encoder_initialized_ = true;
     }
@@ -152,6 +340,9 @@ class AVxEncoderThreadTest
       ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
       ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
       ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+
+      DoTestMaxThreads(&video, single_thr_size_enc, single_thr_md5_enc,
+                       single_thr_md5_dec);
     } else if (row_mt_ == 1) {
       // Encode using multiple threads row-mt enabled.
       cfg_.g_threads = 2;
@@ -200,9 +391,36 @@ class AVxEncoderThreadTest
       ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
       ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
       ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+      DoTestMaxThreads(&video, multi_thr2_row_mt_size_enc,
+                       multi_thr2_row_mt_md5_enc, multi_thr2_row_mt_md5_dec);
     }
   }
 
+  virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+                                const std::vector<size_t> ref_size_enc,
+                                const std::vector<std::string> ref_md5_enc,
+                                const std::vector<std::string> ref_md5_dec) {
+    // This value should be kept the same as MAX_NUM_THREADS
+    // in aom_thread.h
+    cfg_.g_threads = 64;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+    std::vector<size_t> multi_thr_max_row_mt_size_enc;
+    std::vector<std::string> multi_thr_max_row_mt_md5_enc;
+    std::vector<std::string> multi_thr_max_row_mt_md5_dec;
+    multi_thr_max_row_mt_size_enc = size_enc_;
+    multi_thr_max_row_mt_md5_enc = md5_enc_;
+    multi_thr_max_row_mt_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(ref_size_enc, multi_thr_max_row_mt_size_enc);
+    ASSERT_EQ(ref_md5_enc, multi_thr_max_row_mt_md5_enc);
+    ASSERT_EQ(ref_md5_dec, multi_thr_max_row_mt_md5_dec);
+  }
+
   bool encoder_initialized_;
   ::libaom_test::TestMode encoding_mode_;
   int set_cpu_used_;
@@ -215,12 +433,26 @@ class AVxEncoderThreadTest
   std::vector<std::string> md5_dec_;
 };
 
-TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) {
   cfg_.large_scale_tile = 0;
   decoder_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
 }
 
+// For real time mode, test speed 5, 6, 7, 8, 9, 10.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(5, 6, 7, 8, 9, 10),
+                           ::testing::Values(0, 2), ::testing::Values(0, 2),
+                           ::testing::Values(0, 1));
+
+#if !CONFIG_REALTIME_ONLY
+
+// The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the
+// Valgrind long tests. Exclude it; the smaller tests are still run.
+#if !AOM_VALGRIND_BUILD
 class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
 
 TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
@@ -229,27 +461,82 @@ TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
   DoTest();
 }
 
-// For AV1, only test speed 0 to 3.
-// Here test cpu_used 2 and 3
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::Range(2, 4), ::testing::Values(0, 2),
-                          ::testing::Values(0, 1), ::testing::Values(0, 1));
-
-// Test cpu_used 0 and 1.
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 2), ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1, 2, 6),
-                          ::testing::Values(0, 1));
+// Test cpu_used 0, 1, 3 and 5.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(0, 1, 3, 5),
+                           ::testing::Values(1, 6), ::testing::Values(1, 6),
+                           ::testing::Values(0, 1));
+#endif  // !AOM_VALGRIND_BUILD
+
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadAllIntraTest : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadAllIntraTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadAllIntraTestLarge : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+// first pass stats test
+AV1_INSTANTIATE_TEST_SUITE(AVxFirstPassEncoderThreadTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Range(0, 6, 2), ::testing::Range(0, 2),
+                           ::testing::Range(1, 3));
+
+// For AV1, test speed 0, 1, 2, 3, 5.
+// Only test cpu_used 2 here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(2), ::testing::Values(0, 2),
+                           ::testing::Values(0, 2), ::testing::Values(0, 1));
+
+// For all intra mode, test speed 0, 2, 4, 6, 8.
+// Only test cpu_used 6 here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(6), ::testing::Values(0, 2),
+                           ::testing::Values(0, 2), ::testing::Values(0, 1));
+
+// Test cpu_used 0, 2, 4 and 8.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(0, 2, 4, 8),
+                           ::testing::Values(1, 6), ::testing::Values(1, 6),
+                           ::testing::Values(0, 1));
+#endif  // !CONFIG_REALTIME_ONLY
 
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
     encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
     encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
   }
+
+  virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+                                const std::vector<size_t> ref_size_enc,
+                                const std::vector<std::string> ref_md5_enc,
+                                const std::vector<std::string> ref_md5_dec) {
+    (void)video;
+    (void)ref_size_enc;
+    (void)ref_md5_enc;
+    (void)ref_md5_dec;
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AVxEncoderThreadLSTest);
 
 TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
   cfg_.large_scale_tile = 1;
@@ -258,6 +545,10 @@ TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
   DoTest();
 }
 
+// AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of
+// the Valgrind long tests. Since we already run AVxEncoderThreadLSTest,
+// skip this one for Valgrind.
+#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
 class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
 
 TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
@@ -267,9 +558,10 @@ TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 4), ::testing::Values(0, 6),
-                          ::testing::Values(0, 6), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(1, 3), ::testing::Values(0, 6),
+                           ::testing::Values(0, 6), ::testing::Values(1));
+#endif  // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
 }  // namespace
diff --git a/media/libaom/src/test/examples.sh b/media/libaom/src/test/examples.sh
index 2cdb89dd08..87d8c2b03e 100644..100755
--- a/media/libaom/src/test/examples.sh
+++ b/media/libaom/src/test/examples.sh
@@ -17,6 +17,10 @@ example_tests=$(ls -r $(dirname $0)/*.sh)
 # List of script names to exclude.
 exclude_list="best_encode examples run_encodes tools_common"
 
+if [ "$(realtime_only_build)" = "yes" ]; then
+  exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test"
+fi
+
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
   example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
diff --git a/media/libaom/src/test/external_frame_buffer_test.cc b/media/libaom/src/test/external_frame_buffer_test.cc
index 1d726a4f1c..84bf584e6a 100644
--- a/media/libaom/src/test/external_frame_buffer_test.cc
+++ b/media/libaom/src/test/external_frame_buffer_test.cc
@@ -51,7 +51,10 @@ class ExternalFrameBufferList {
 
     num_buffers_ = num_buffers;
     ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
-    EXPECT_TRUE(ext_fb_list_ != NULL);
+    if (ext_fb_list_ == nullptr) {
+      EXPECT_NE(ext_fb_list_, nullptr);
+      return false;
+    }
     memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
     return true;
   }
@@ -61,13 +64,16 @@ class ExternalFrameBufferList {
   // frame buffer is in use by libaom. Finally sets |fb| to point to the
   // external frame buffer. Returns < 0 on an error.
   int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
     if (ext_fb_list_[idx].size < min_size) {
       delete[] ext_fb_list_[idx].data;
       ext_fb_list_[idx].data = new uint8_t[min_size];
+      if (ext_fb_list_[idx].data == nullptr) {
+        EXPECT_NE(ext_fb_list_[idx].data, nullptr);
+      }
       memset(ext_fb_list_[idx].data, 0, min_size);
       ext_fb_list_[idx].size = min_size;
     }
@@ -81,7 +87,7 @@ class ExternalFrameBufferList {
   // Test function that will not allocate any data for the frame buffer.
   // Returns < 0 on an error.
   int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
@@ -99,13 +105,13 @@ class ExternalFrameBufferList {
   // Returns < 0 on an error.
   int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
     if (fb == NULL) {
-      EXPECT_TRUE(fb != NULL);
+      EXPECT_NE(fb, nullptr);
       return -1;
     }
     ExternalFrameBuffer *const ext_fb =
         reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
     if (ext_fb == NULL) {
-      EXPECT_TRUE(ext_fb != NULL);
+      EXPECT_NE(ext_fb, nullptr);
       return -1;
     }
     EXPECT_EQ(1, ext_fb->in_use);
@@ -141,7 +147,7 @@ class ExternalFrameBufferList {
   // Sets |fb| to an external frame buffer. idx is the index into the frame
   // buffer list.
   void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
-    ASSERT_TRUE(fb != NULL);
+    ASSERT_NE(fb, nullptr);
     fb->data = ext_fb_list_[idx].data;
     fb->size = ext_fb_list_[idx].size;
     ASSERT_EQ(0, ext_fb_list_[idx].in_use);
@@ -226,13 +232,13 @@ class ExternalFrameBufferMD5Test
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
   virtual void DecompressedFrameHook(const aom_image_t &img,
                                      const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -310,14 +316,14 @@ class ExternalFrameBufferTest : public ::testing::Test {
 
   virtual void SetUp() {
     video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
   }
 
   virtual void TearDown() {
@@ -378,14 +384,14 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
  protected:
   virtual void SetUp() {
     video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
   }
 
   virtual void CheckFrameBufferRelease() {
@@ -424,7 +430,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video, nullptr);
   video->Init();
 
   // Construct md5 file name.
@@ -532,7 +538,7 @@ TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
 }
 #endif  // CONFIG_WEBM_IO
 
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     ExternalFrameBufferMD5Test,
     ::testing::ValuesIn(libaom_test::kAV1TestVectors,
                         libaom_test::kAV1TestVectors +
diff --git a/media/libaom/src/test/fdct4x4_test.cc b/media/libaom/src/test/fdct4x4_test.cc
index 6600f2c466..046d8107b3 100644
--- a/media/libaom/src/test/fdct4x4_test.cc
+++ b/media/libaom/src/test/fdct4x4_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/transform_test_base.h"
 #include "test/util.h"
@@ -72,7 +71,7 @@ class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
     TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
     TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
@@ -89,10 +88,12 @@ class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
 };
 
 using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTTranLow);
 TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
 
 using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTInt16);
 TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
 
diff --git a/media/libaom/src/test/fft_test.cc b/media/libaom/src/test/fft_test.cc
index d23aa012c4..7fce0f8c4e 100644
--- a/media/libaom/src/test/fft_test.cc
+++ b/media/libaom/src/test/fft_test.cc
@@ -92,6 +92,9 @@ class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
     input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
     temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
     output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+    ASSERT_NE(input_, nullptr);
+    ASSERT_NE(temp_, nullptr);
+    ASSERT_NE(output_, nullptr);
     memset(input_, 0, sizeof(*input_) * n * n);
     memset(temp_, 0, sizeof(*temp_) * n * n);
     memset(output_, 0, sizeof(*output_) * n * n * 2);
@@ -126,12 +129,14 @@ TEST_P(FFT2DTest, Correct) {
 TEST_P(FFT2DTest, Benchmark) {
   int n = GetParam().n;
   float sum = 0;
-  for (int i = 0; i < 1000 * (64 - n); ++i) {
+  const int num_trials = 1000 * (64 - n);
+  for (int i = 0; i < num_trials; ++i) {
     input_[i % (n * n)] = 1;
     GetParam().fft(&input_[0], &temp_[0], &output_[0]);
     sum += output_[0];
     input_[i % (n * n)] = 0;
   }
+  EXPECT_NEAR(sum, num_trials, 1e-3);
 }
 
 INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
@@ -177,6 +182,9 @@ class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
     input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
     temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
     output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+    ASSERT_NE(input_, nullptr);
+    ASSERT_NE(temp_, nullptr);
+    ASSERT_NE(output_, nullptr);
     memset(input_, 0, sizeof(*input_) * n * n * 2);
     memset(temp_, 0, sizeof(*temp_) * n * n * 2);
     memset(output_, 0, sizeof(*output_) * n * n);
@@ -216,17 +224,19 @@ TEST_P(IFFT2DTest, Correctness) {
       expected[y * n + x] = 0;
     }
   }
-};
+}
 
 TEST_P(IFFT2DTest, Benchmark) {
   int n = GetParam().n;
   float sum = 0;
-  for (int i = 0; i < 1000 * (64 - n); ++i) {
+  const int num_trials = 1000 * (64 - n);
+  for (int i = 0; i < num_trials; ++i) {
     input_[i % (n * n)] = 1;
     GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
     sum += output_[0];
     input_[i % (n * n)] = 0;
   }
+  EXPECT_GE(sum, num_trials / 2);
 }
 INSTANTIATE_TEST_SUITE_P(
     C, IFFT2DTest,
diff --git a/media/libaom/src/test/film_grain_table_test.cc b/media/libaom/src/test/film_grain_table_test.cc
index 524d67d7bc..31fb908ffa 100644
--- a/media/libaom/src/test/film_grain_table_test.cc
+++ b/media/libaom/src/test/film_grain_table_test.cc
@@ -101,6 +101,20 @@ TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {
   aom_film_grain_table_free(&table);
 }
 
+TEST(FilmGrainTableTest, AddSingleSegmentRemoveBiggerSegment) {
+  aom_film_grain_table_t table;
+  aom_film_grain_t grain;
+
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain));
+
+  EXPECT_EQ(0, table.head);
+  EXPECT_EQ(0, table.tail);
+  aom_film_grain_table_free(&table);
+}
+
 TEST(FilmGrainTableTest, SplitSingleSegment) {
   aom_film_grain_table_t table;
   aom_film_grain_t grain;
diff --git a/media/libaom/src/test/filterintra_test.cc b/media/libaom/src/test/filterintra_test.cc
index 284353c69c..c54bec5e59 100644
--- a/media/libaom/src/test/filterintra_test.cc
+++ b/media/libaom/src/test/filterintra_test.cc
@@ -16,7 +16,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/enums.h"
@@ -53,13 +52,15 @@ class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
     alloc_ = new uint8_t[2 * MaxTxSize + 1];
     predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
     pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+    ASSERT_NE(alloc_, nullptr);
+    ASSERT_NE(predRef_, nullptr);
+    ASSERT_NE(pred_, nullptr);
   }
 
   virtual void TearDown() {
     delete[] alloc_;
     delete[] predRef_;
     delete[] pred_;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -71,12 +72,44 @@ class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
     while (tstIndex < MaxTestNum) {
       PrepareBuffer();
       predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
       DiffPred(tstIndex);
       tstIndex += 1;
     }
   }
+  void RunSpeedTest() const {
+    int stride = tx_size_wide[txSize_];
+    uint8_t *left = alloc_;
+    uint8_t *above = alloc_ + MaxTxSize;
+    const int numIter = 5000;
+
+    PrepareBuffer();
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < numIter; i++) {
+      predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+    }
+    aom_usec_timer_mark(&ref_timer);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < numIter; i++) {
+      predFunc_(pred_, stride, txSize_, &above[1], left, mode_);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int ref_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    const int sum_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \t mode =  %d \n",
+           ref_sum_time, sum_time,
+           (static_cast<float>(ref_sum_time) / static_cast<float>(sum_time)),
+           static_cast<int>(mode_));
+
+    DiffPred(0);
+  }
 
  private:
   void PrepareBuffer() const {
@@ -110,8 +143,10 @@ class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
 
 TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
 
-using std::make_tuple;
+TEST_P(AV1FilterIntraPredTest, DISABLED_Speed) { RunSpeedTest(); }
 
+using ::testing::make_tuple;
+#if HAVE_SSE4_1
 const PredFuncMode kPredFuncMdArray[] = {
   make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
              FILTER_DC_PRED),
@@ -133,4 +168,30 @@ INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1FilterIntraPredTest,
     ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
                        ::testing::ValuesIn(kTxSize)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+const PredFuncMode kPredFuncMdArrayNEON[] = {
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_DC_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_V_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_H_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_D157_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+             FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSizeNEON[] = { TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_4X8,
+                                TX_8X4,  TX_8X16, TX_16X8,  TX_16X32, TX_32X16,
+                                TX_4X16, TX_16X4, TX_8X32,  TX_32X8 };
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1FilterIntraPredTest,
+    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArrayNEON),
+                       ::testing::ValuesIn(kTxSizeNEON)));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/media/libaom/src/test/firstpass_test.cc b/media/libaom/src/test/firstpass_test.cc
new file mode 100644
index 0000000000..f7d8f2e056
--- /dev/null
+++ b/media/libaom/src/test/firstpass_test.cc
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stddef.h>
+
+#include "av1/common/common.h"
+#include "av1/encoder/firstpass.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(FirstpassTest, FirstpassInfoInitWithExtBuf) {
+  FIRSTPASS_INFO firstpass_info;
+  FIRSTPASS_STATS ext_stats_buf[10];
+  const int ref_stats_size = 10;
+  for (int i = 0; i < ref_stats_size; ++i) {
+    av1_zero(ext_stats_buf[i]);
+    ext_stats_buf[i].frame = i;
+  }
+  aom_codec_err_t ret =
+      av1_firstpass_info_init(&firstpass_info, ext_stats_buf, 10);
+  EXPECT_EQ(firstpass_info.stats_count, ref_stats_size);
+  EXPECT_EQ(firstpass_info.future_stats_count + firstpass_info.past_stats_count,
+            firstpass_info.stats_count);
+  EXPECT_EQ(firstpass_info.cur_index, 0);
+  EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoInitWithStaticBuf) {
+  FIRSTPASS_INFO firstpass_info;
+  aom_codec_err_t ret = av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.stats_count, 0);
+  EXPECT_EQ(firstpass_info.cur_index, 0);
+  EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoPushPop) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = i;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  const int pop_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < pop_count; ++i) {
+    const FIRSTPASS_STATS *stats = av1_firstpass_info_peek(&firstpass_info, 0);
+    aom_codec_err_t ret =
+        av1_firstpass_info_move_cur_index_and_pop(&firstpass_info);
+    EXPECT_NE(stats, nullptr);
+    EXPECT_EQ(stats->frame, i);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count,
+            FIRSTPASS_INFO_STATIC_BUF_SIZE - pop_count);
+
+  const int push_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < push_count; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+  EXPECT_EQ(firstpass_info.stats_count, firstpass_info.stats_buf_size);
+  // Push the stats when the queue is full.
+  FIRSTPASS_STATS stats;
+  av1_zero(stats);
+  aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+  EXPECT_EQ(ret, AOM_CODEC_ERROR);
+}
+
+TEST(FirstpassTest, FirstpassInfoTotalStats) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.total_stats.frame, 0);
+  for (int i = 0; i < 10; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.count = 1;
+    av1_firstpass_info_push(&firstpass_info, &stats);
+  }
+  EXPECT_EQ(firstpass_info.total_stats.count, 10);
+}
+
+TEST(FirstpassTest, FirstpassInfoMoveCurr) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  int frame_cnt = 0;
+  EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = frame_cnt;
+    ++frame_cnt;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.cur_index, firstpass_info.start_index);
+  aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+  // We cannot pop when cur_index == start_index
+  EXPECT_EQ(ret, AOM_CODEC_ERROR);
+  int ref_frame_cnt = 0;
+  const int move_count = FIRSTPASS_INFO_STATIC_BUF_SIZE * 2 / 3;
+  for (int i = 0; i < move_count; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&firstpass_info, 0);
+    EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+    ++ref_frame_cnt;
+    av1_firstpass_info_move_cur_index(&firstpass_info);
+  }
+  EXPECT_EQ(firstpass_info.future_stats_count,
+            FIRSTPASS_INFO_STATIC_BUF_SIZE - move_count);
+  EXPECT_EQ(firstpass_info.past_stats_count, move_count);
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+  const int test_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < test_count; ++i) {
+    aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+
+  // Pop #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = frame_cnt;
+    ++frame_cnt;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+
+  // peek and move #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&firstpass_info, 0);
+    EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+    ++ref_frame_cnt;
+    av1_firstpass_info_move_cur_index(&firstpass_info);
+  }
+
+  // pop #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+}
+
+}  // namespace
diff --git a/media/libaom/src/test/frame_error_test.cc b/media/libaom/src/test/frame_error_test.cc
index 6d74a68f2a..c355efc8ac 100644
--- a/media/libaom/src/test/frame_error_test.cc
+++ b/media/libaom/src/test/frame_error_test.cc
@@ -20,7 +20,6 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -44,7 +43,7 @@ class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
   virtual void SetUp() {
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RandomValues(frame_error_func test_impl, int width, int height);
@@ -52,6 +51,7 @@ class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
   void RunSpeedTest(frame_error_func test_impl, int width, int height);
   libaom_test::ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
 
 void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
                                      int height) {
@@ -61,8 +61,8 @@ void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
   uint8_t *const ref =
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
+  ASSERT_NE(dst, nullptr);
+  ASSERT_NE(ref, nullptr);
   for (int i = 0; i < max_blk_size; ++i) {
     dst[i] = rnd_.Rand8();
     ref[i] = rnd_.Rand8();
@@ -83,8 +83,8 @@ void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
   uint8_t *const ref =
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
+  ASSERT_NE(dst, nullptr);
+  ASSERT_NE(ref, nullptr);
   for (int r = 0; r < 2; r++) {
     if (r == 0) {
       memset(dst, 0, max_blk_size);
@@ -111,8 +111,8 @@ void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
   uint8_t *const ref =
       static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
+  ASSERT_NE(dst, nullptr);
+  ASSERT_NE(ref, nullptr);
   for (int i = 0; i < max_blk_size; ++i) {
     dst[i] = ref[i] = rnd_.Rand8();
   }
diff --git a/media/libaom/src/test/frame_parallel_enc_test.cc b/media/libaom/src/test/frame_parallel_enc_test.cc
new file mode 100644
index 0000000000..cb14d39f74
--- /dev/null
+++ b/media/libaom/src/test/frame_parallel_enc_test.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+#if (CONFIG_FRAME_PARALLEL_ENCODE && CONFIG_FPMT_TEST && !CONFIG_REALTIME_ONLY)
+class AVxFrameParallelThreadEncodeTest
+    : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AVxFrameParallelThreadEncodeTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        set_cpu_used_(GET_PARAM(1)), tile_cols_(GET_PARAM(2)),
+        tile_rows_(GET_PARAM(3)) {
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+  }
+  virtual ~AVxFrameParallelThreadEncodeTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.g_threads = 16;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                                  ::libaom_test::Encoder *encoder) {
+    if (encoder_initialized_) return;
+    SetTileSize(encoder);
+    encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+    encoder->Control(AV1E_SET_FP_MT, 1);
+    encoder->Control(AV1E_SET_FP_MT_UNIT_TEST, enable_actual_parallel_encode_);
+    encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+    encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+
+    encoder_initialized_ = true;
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest(::libaom_test::VideoSource *input_video) {
+    /* This is the actual parallel encode of frames using multiple cpis.
+     * The parallel frames are independently encoded.
+     * Threads are distributed among the parallel frames whereas non-parallel
+     * frames use all the threads. Example: for 8 threads, in case of 4 frames
+     * in a parallel encode set, each frame gets 2 threads. In case of 3 frames
+     * in a parallel encode set, threads are distributed as 2, 3 ,3.
+     */
+    enable_actual_parallel_encode_ = 1;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(input_video));
+    std::vector<size_t> enc_stream_fpmt_size;
+    std::vector<std::string> enc_stream_fpmt;
+    std::vector<std::string> dec_stream_fpmt;
+    enc_stream_fpmt_size = size_enc_;
+    enc_stream_fpmt = md5_enc_;
+    dec_stream_fpmt = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    /* This is the simulation of parallel encode of frames using single cpi.
+     * In simulation, it should be ensured to have no dependency across frames
+     * (similar to parallel encode).
+     * Each frame uses all the threads configured.
+     */
+    enable_actual_parallel_encode_ = 0;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(input_video));
+    std::vector<size_t> enc_stream_sim_size;
+    std::vector<std::string> enc_stream_sim;
+    std::vector<std::string> dec_stream_sim;
+    enc_stream_sim_size = size_enc_;
+    enc_stream_sim = md5_enc_;
+    dec_stream_sim = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(enc_stream_sim_size, enc_stream_fpmt_size);
+    ASSERT_EQ(enc_stream_sim, enc_stream_fpmt);
+    ASSERT_EQ(dec_stream_sim, dec_stream_fpmt);
+  }
+
+  bool encoder_initialized_;
+  int set_cpu_used_;
+  int tile_cols_;
+  int tile_rows_;
+  int enable_actual_parallel_encode_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+class AVxFrameParallelThreadEncodeHDResTestLarge
+    : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeHDResTestLarge,
+       FrameParallelThreadEncodeTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 500;
+  DoTest(&video);
+}
+
+class AVxFrameParallelThreadEncodeLowResTestLarge
+    : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeLowResTestLarge,
+       FrameParallelThreadEncodeTest) {
+  ::libaom_test::YUVVideoSource video("hantro_collage_w352h288.yuv",
+                                      AOM_IMG_FMT_I420, 352, 288, 30, 1, 0, 60);
+  cfg_.rc_target_bitrate = 200;
+  DoTest(&video);
+}
+
+class AVxFrameParallelThreadEncodeLowResTest
+    : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeLowResTest, FrameParallelThreadEncodeTest) {
+  ::libaom_test::YUVVideoSource video("hantro_collage_w352h288.yuv",
+                                      AOM_IMG_FMT_I420, 352, 288, 30, 1, 0, 60);
+  cfg_.rc_target_bitrate = 200;
+  DoTest(&video);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeHDResTestLarge,
+                           ::testing::Values(2, 3, 4, 5, 6),
+                           ::testing::Values(0, 1, 2), ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeLowResTestLarge,
+                           ::testing::Values(2, 3), ::testing::Values(0, 1, 2),
+                           ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeLowResTest,
+                           ::testing::Values(4, 5, 6), ::testing::Values(1),
+                           ::testing::Values(0));
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE &&
+        // CONFIG_FPMT_TEST && !CONFIG_REALTIME_ONLY
+
+}  // namespace
diff --git a/media/libaom/src/test/frame_size_tests.cc b/media/libaom/src/test/frame_size_tests.cc
index 1546012a30..2365a20c24 100644
--- a/media/libaom/src/test/frame_size_tests.cc
+++ b/media/libaom/src/test/frame_size_tests.cc
@@ -12,6 +12,7 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/video_source.h"
+#include "test/util.h"
 
 namespace {
 
@@ -22,10 +23,7 @@ class AV1FrameSizeTests : public ::testing::Test,
       : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {}
   virtual ~AV1FrameSizeTests() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kRealTime);
-  }
+  virtual void SetUp() { InitializeConfig(::libaom_test::kRealTime); }
 
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
                                   libaom_test::Decoder *decoder) {
@@ -74,5 +72,64 @@ TEST_F(AV1FrameSizeTests, OneByOneVideo) {
   expected_res_ = AOM_CODEC_OK;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
-#undef ONE_BY_ONE_VIDEO_NAME
+
+#if !CONFIG_REALTIME_ONLY
+typedef struct {
+  unsigned int width;
+  unsigned int height;
+} FrameSizeParam;
+
+const FrameSizeParam FrameSizeTestParams[] = { { 96, 96 }, { 176, 144 } };
+
+// This unit test is used to validate the allocated size of compressed data
+// (ctx->cx_data) buffer, by feeding pseudo random input to the encoder in
+// lossless encoding mode.
+//
+// If compressed data buffer is not large enough, the av1_get_compressed_data()
+// call in av1/av1_cx_iface.c will overflow the buffer.
+class AV1LosslessFrameSizeTests
+    : public ::libaom_test::CodecTestWith2Params<FrameSizeParam,
+                                                 ::libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1LosslessFrameSizeTests()
+      : EncoderTest(GET_PARAM(0)), frame_size_param_(GET_PARAM(1)),
+        encoding_mode_(GET_PARAM(2)) {}
+  virtual ~AV1LosslessFrameSizeTests() {}
+
+  virtual void SetUp() { InitializeConfig(encoding_mode_); }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
+    return !::testing::Test::HasFailure();
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 6);
+      encoder->Control(AV1E_SET_LOSSLESS, 1);
+    }
+  }
+
+  const FrameSizeParam frame_size_param_;
+  const ::libaom_test::TestMode encoding_mode_;
+  int expected_res_;
+};
+
+TEST_P(AV1LosslessFrameSizeTests, LosslessEncode) {
+  ::libaom_test::RandomVideoSource video;
+
+  video.SetSize(frame_size_param_.width, frame_size_param_.height);
+  video.set_limit(10);
+  expected_res_ = AOM_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AV1LosslessFrameSizeTests,
+                           ::testing::ValuesIn(FrameSizeTestParams),
+                           testing::Values(::libaom_test::kAllIntra));
+#endif  // !CONFIG_REALTIME_ONLY
+
 }  // namespace
diff --git a/media/libaom/src/test/function_equivalence_test.h b/media/libaom/src/test/function_equivalence_test.h
index a299c48d43..a7116b1ced 100644
--- a/media/libaom/src/test/function_equivalence_test.h
+++ b/media/libaom/src/test/function_equivalence_test.h
@@ -16,7 +16,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 
 using libaom_test::ACMRandom;
@@ -60,7 +59,7 @@ class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
 
   virtual void SetUp() { params_ = this->GetParam(); }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   ACMRandom rng_;
diff --git a/media/libaom/src/test/fwd_kf_test.cc b/media/libaom/src/test/fwd_kf_test.cc
deleted file mode 100644
index 50c2f36d83..0000000000
--- a/media/libaom/src/test/fwd_kf_test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <ostream>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-typedef struct {
-  const int max_kf_dist;
-  const double psnr_thresh;
-} FwdKfTestParam;
-
-const FwdKfTestParam kTestParams[] = {
-  { 4, 33.4 },  { 6, 32.9 },  { 8, 32.6 },
-  { 12, 32.4 }, { 16, 32.3 }, { 18, 32.1 }
-};
-
-std::ostream &operator<<(std::ostream &os, const FwdKfTestParam &test_arg) {
-  return os << "FwdKfTestParam { max_kf_dist:" << test_arg.max_kf_dist
-            << " psnr_thresh:" << test_arg.psnr_thresh << " }";
-}
-
-class ForwardKeyTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
-                                                 FwdKfTestParam>,
-      public ::libaom_test::EncoderTest {
- protected:
-  ForwardKeyTest()
-      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
-        kf_max_dist_param_(GET_PARAM(2)) {}
-  virtual ~ForwardKeyTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-    const aom_rational timebase = { 1, 30 };
-    cfg_.g_timebase = timebase;
-    cpu_used_ = 2;
-    kf_max_dist_ = kf_max_dist_param_.max_kf_dist;
-    psnr_threshold_ = kf_max_dist_param_.psnr_thresh;
-    cfg_.rc_end_usage = AOM_VBR;
-    cfg_.rc_target_bitrate = 200;
-    cfg_.g_lag_in_frames = 10;
-    cfg_.fwd_kf_enabled = 1;
-    cfg_.kf_max_dist = kf_max_dist_;
-    cfg_.g_threads = 0;
-    init_flags_ = AOM_CODEC_USE_PSNR;
-  }
-
-  virtual void BeginPassHook(unsigned int) {
-    psnr_ = 0.0;
-    nframes_ = 0;
-  }
-
-  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
-    psnr_ += pkt->data.psnr.psnr[0];
-    nframes_++;
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
-        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
-        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
-      }
-    }
-  }
-
-  double GetAveragePsnr() const {
-    if (nframes_) return psnr_ / nframes_;
-    return 0.0;
-  }
-
-  double GetPsnrThreshold() { return psnr_threshold_; }
-
-  ::libaom_test::TestMode encoding_mode_;
-  const FwdKfTestParam kf_max_dist_param_;
-  double psnr_threshold_;
-  int kf_max_dist_;
-  int cpu_used_;
-  int nframes_;
-  double psnr_;
-};
-
-TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) {
-  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
-                                     0, 20);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  // TODO(sarahparker) Add functionality to assert the minimum number of
-  // keyframes were placed.
-  EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
-      << "kf max dist = " << kf_max_dist_;
-}
-
-AV1_INSTANTIATE_TEST_CASE(ForwardKeyTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::ValuesIn(kTestParams));
-}  // namespace
diff --git a/media/libaom/src/test/fwht4x4_test.cc b/media/libaom/src/test/fwht4x4_test.cc
index d2f77b8d47..f39722b9e7 100644
--- a/media/libaom/src/test/fwht4x4_test.cc
+++ b/media/libaom/src/test/fwht4x4_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/transform_test_base.h"
 #include "test/util.h"
@@ -37,7 +36,7 @@ typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
 
 using libaom_test::FhtFunc;
 
-typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int, FdctFunc>
     Dct4x4Param;
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -45,14 +44,26 @@ void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
   av1_fwht4x4_c(in, out, stride);
 }
 
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+void iwht4x4_10_c(const tran_low_t *in, uint8_t *out, int stride) {
   av1_highbd_iwht4x4_16_add_c(in, out, stride, 10);
 }
 
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+void iwht4x4_12_c(const tran_low_t *in, uint8_t *out, int stride) {
   av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
 }
 
+#if HAVE_SSE4_1
+
+void iwht4x4_10_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 10);
+}
+
+void iwht4x4_12_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 12);
+}
+
+#endif
+
 class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
                     public ::testing::TestWithParam<Dct4x4Param> {
  public:
@@ -67,8 +78,9 @@ class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = GET_PARAM(4);
+    fwd_txfm_c_ = GET_PARAM(5);
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
@@ -77,9 +89,92 @@ class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
   void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
+  void RunSpeedTest() {
+    if (!fwd_txfm_c_) {
+      GTEST_SKIP();
+    } else {
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      const int count_test_block = 10;
+      const int numIter = 5000;
+
+      int c_sum_time = 0;
+      int simd_sum_time = 0;
+
+      int stride = 96;
+
+      int16_t *input_block = reinterpret_cast<int16_t *>(
+          aom_memalign(16, sizeof(int16_t) * stride * height_));
+      ASSERT_NE(input_block, nullptr);
+      tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+      ASSERT_NE(output_ref_block, nullptr);
+      tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+      ASSERT_NE(output_block, nullptr);
+
+      for (int i = 0; i < count_test_block; ++i) {
+        int j, k;
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int in_idx = j * stride + k;
+            int out_idx = j * pitch_ + k;
+            input_block[in_idx] =
+                (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+            if (bit_depth_ == AOM_BITS_8) {
+              output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+            } else {
+              output_block[out_idx] = output_ref_block[out_idx] =
+                  rnd.Rand16() & mask_;
+            }
+          }
+        }
+
+        aom_usec_timer c_timer_;
+        aom_usec_timer_start(&c_timer_);
+        for (int i = 0; i < numIter; i++) {
+          API_REGISTER_STATE_CHECK(
+              fwd_txfm_c_(input_block, output_ref_block, stride));
+        }
+        aom_usec_timer_mark(&c_timer_);
+
+        aom_usec_timer simd_timer_;
+        aom_usec_timer_start(&simd_timer_);
+
+        for (int i = 0; i < numIter; i++) {
+          API_REGISTER_STATE_CHECK(
+              fwd_txfm_(input_block, output_block, stride));
+        }
+        aom_usec_timer_mark(&simd_timer_);
+
+        c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+        simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+        // The minimum quant value is 4.
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int out_idx = j * pitch_ + k;
+            ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+                << "Error: not bit-exact result at index: " << out_idx
+                << " at test block: " << i;
+          }
+        }
+      }
+
+      printf(
+          "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+          simd_sum_time,
+          (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+      aom_free(input_block);
+      aom_free(output_ref_block);
+      aom_free(output_block);
+    }
+  }
 
   FdctFunc fwd_txfm_;
   IdctFunc inv_txfm_;
+
+  FdctFunc fwd_txfm_c_;  // C version of forward transform for speed test.
 };
 
 TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
@@ -89,12 +184,43 @@ TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
 
 TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+TEST_P(Trans4x4WHT, DISABLED_Speed) { RunSpeedTest(); }
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
     C, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
-                                 AOM_BITS_10, 16),
-                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
-                                 AOM_BITS_12, 16)));
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10_c, DCT_DCT,
+                                 AOM_BITS_10, 16, static_cast<FdctFunc>(NULL)),
+                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12_c, DCT_DCT,
+                                 AOM_BITS_12, 16,
+                                 static_cast<FdctFunc>(NULL))));
+
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_sse4_1, &iwht4x4_10_sse4_1,
+                                 DCT_DCT, AOM_BITS_10, 16,
+                                 static_cast<FdctFunc>(NULL)),
+                      make_tuple(&av1_highbd_fwht4x4_sse4_1, &iwht4x4_12_sse4_1,
+                                 DCT_DCT, AOM_BITS_12, 16,
+                                 static_cast<FdctFunc>(NULL))));
+
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_10_c,
+                                 DCT_DCT, AOM_BITS_10, 16,
+                                 &av1_highbd_fwht4x4_c),
+                      make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_12_c,
+                                 DCT_DCT, AOM_BITS_12, 16,
+                                 &av1_highbd_fwht4x4_c)));
+
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/media/libaom/src/test/gf_pyr_height_test.cc b/media/libaom/src/test/gf_pyr_height_test.cc
index b1ade67a6b..a2d1a8fe42 100644
--- a/media/libaom/src/test/gf_pyr_height_test.cc
+++ b/media/libaom/src/test/gf_pyr_height_test.cc
@@ -25,32 +25,32 @@ static const struct GFPyrHeightTestParam {
   double psnr_thresh;
 } kTestParams[] = {
   // gf_min_pyr_height = 0
-  { 0, 0, 33.40 },
-  { 0, 1, 34.00 },
+  { 0, 0, 32.30 },
+  { 0, 1, 33.90 },
   { 0, 2, 34.00 },
   { 0, 3, 34.20 },
   { 0, 4, 34.30 },
-  { 0, 5, 34.40 },
+  { 0, 5, 34.35 },
   // gf_min_pyr_height = 1
-  { 1, 1, 34.00 },
+  { 1, 1, 33.90 },
   { 1, 2, 34.00 },
   { 1, 3, 34.20 },
   { 1, 4, 34.30 },
-  { 1, 5, 34.40 },
+  { 1, 5, 34.35 },
   // gf_min_pyr_height = 2
   { 2, 2, 34.00 },
   { 2, 3, 34.20 },
   { 2, 4, 34.30 },
-  { 2, 5, 34.40 },
+  { 2, 5, 34.35 },
   // gf_min_pyr_height = 3
   { 3, 3, 34.20 },
   { 3, 4, 34.30 },
-  { 3, 5, 34.40 },
+  { 3, 5, 34.35 },
   // gf_min_pyr_height = 4
   { 4, 4, 34.30 },
-  { 4, 5, 34.40 },
+  { 4, 5, 34.35 },
   // gf_min_pyr_height = 5
-  { 5, 5, 34.40 },
+  { 5, 5, 34.35 },
 };
 
 // Compiler may decide to add some padding to the struct above for alignment,
@@ -82,8 +82,7 @@ class GFPyrHeightTest
   virtual ~GFPyrHeightTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
     const aom_rational timebase = { 1, 30 };
     cfg_.g_timebase = timebase;
     cpu_used_ = 4;
@@ -150,7 +149,7 @@ TEST_P(GFPyrHeightTest, EncodeAndVerifyPSNR) {
       << "GF Max Pyramid Height = " << gf_max_pyr_height_;
 }
 
-AV1_INSTANTIATE_TEST_CASE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
-                          ::testing::Values(AOM_Q, AOM_VBR),
-                          ::testing::ValuesIn(kTestParams));
+AV1_INSTANTIATE_TEST_SUITE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
+                           ::testing::Values(AOM_Q, AOM_VBR),
+                           ::testing::ValuesIn(kTestParams));
 }  // namespace
diff --git a/media/libaom/src/test/gviz_api.py b/media/libaom/src/test/gviz_api.py
index d3a443dabf..d3a443dabf 100644..100755
--- a/media/libaom/src/test/gviz_api.py
+++ b/media/libaom/src/test/gviz_api.py
diff --git a/media/libaom/src/test/hadamard_test.cc b/media/libaom/src/test/hadamard_test.cc
index 7903259e7d..0d020264ed 100644
--- a/media/libaom/src/test/hadamard_test.cc
+++ b/media/libaom/src/test/hadamard_test.cc
@@ -16,7 +16,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -24,16 +23,50 @@ namespace {
 
 using libaom_test::ACMRandom;
 
-typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
-                             tran_low_t *b);
+using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                              tran_low_t *b);
+// Low precision version of Hadamard Transform
+using HadamardLPFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                                int16_t *b);
+// Low precision version of Hadamard Transform 8x8 - Dual
+using HadamardLP8x8DualFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                                       int16_t *b);
+
+template <typename OutputType>
+void Hadamard4x4(const OutputType *a, OutputType *out) {
+  OutputType b[8];
+  for (int i = 0; i < 4; i += 2) {
+    b[i + 0] = (a[i * 4] + a[(i + 1) * 4]) >> 1;
+    b[i + 1] = (a[i * 4] - a[(i + 1) * 4]) >> 1;
+  }
+
+  out[0] = b[0] + b[2];
+  out[1] = b[1] + b[3];
+  out[2] = b[0] - b[2];
+  out[3] = b[1] - b[3];
+}
+
+template <typename OutputType>
+void ReferenceHadamard4x4(const int16_t *a, int a_stride, OutputType *b) {
+  OutputType input[16];
+  OutputType buf[16];
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      input[i * 4 + j] = static_cast<OutputType>(a[i * a_stride + j]);
+    }
+  }
+  for (int i = 0; i < 4; ++i) Hadamard4x4(input + i, buf + i * 4);
+  for (int i = 0; i < 4; ++i) Hadamard4x4(buf + i, b + i * 4);
+}
 
-void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
-  tran_low_t b[8];
+template <typename OutputType>
+void HadamardLoop(const OutputType *a, OutputType *out) {
+  OutputType b[8];
   for (int i = 0; i < 8; i += 2) {
     b[i + 0] = a[i * 8] + a[(i + 1) * 8];
     b[i + 1] = a[i * 8] - a[(i + 1) * 8];
   }
-  tran_low_t c[8];
+  OutputType c[8];
   for (int i = 0; i < 8; i += 4) {
     c[i + 0] = b[i + 0] + b[i + 2];
     c[i + 1] = b[i + 1] + b[i + 3];
@@ -50,19 +83,29 @@ void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
   out[5] = c[3] - c[7];
 }
 
-void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
-  tran_low_t input[64];
-  tran_low_t buf[64];
+template <typename OutputType>
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, OutputType *b) {
+  OutputType input[64];
+  OutputType buf[64];
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
-      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+      input[i * 8 + j] = static_cast<OutputType>(a[i * a_stride + j]);
     }
   }
   for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
   for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
 }
 
-void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard8x8Dual(const int16_t *a, int a_stride, OutputType *b) {
+  /* The source is a 8x16 block. The destination is rearranged to 8x16.
+   * Input is 9 bit. */
+  ReferenceHadamard8x8(a, a_stride, b);
+  ReferenceHadamard8x8(a + 8, a_stride, b + 64);
+}
+
+template <typename OutputType>
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b) {
   /* The source is a 16x16 block. The destination is rearranged to 8x32.
    * Input is 9 bit. */
   ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@@ -73,16 +116,16 @@ void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
   /* Overlay the 8x8 blocks and combine. */
   for (int i = 0; i < 64; ++i) {
     /* 8x8 steps the range up to 15 bits. */
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[64];
-    const tran_low_t a2 = b[128];
-    const tran_low_t a3 = b[192];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[64];
+    const OutputType a2 = b[128];
+    const OutputType a3 = b[192];
 
     /* Prevent the result from escaping int16_t. */
-    const tran_low_t b0 = (a0 + a1) >> 1;
-    const tran_low_t b1 = (a0 - a1) >> 1;
-    const tran_low_t b2 = (a2 + a3) >> 1;
-    const tran_low_t b3 = (a2 - a3) >> 1;
+    const OutputType b0 = (a0 + a1) >> 1;
+    const OutputType b1 = (a0 - a1) >> 1;
+    const OutputType b2 = (a2 + a3) >> 1;
+    const OutputType b3 = (a2 - a3) >> 1;
 
     /* Store a 16 bit value. */
     b[0] = b0 + b2;
@@ -94,22 +137,23 @@ void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
   }
 }
 
-void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b) {
   ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
   ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
   ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
   ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
 
   for (int i = 0; i < 256; ++i) {
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[256];
-    const tran_low_t a2 = b[512];
-    const tran_low_t a3 = b[768];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[256];
+    const OutputType a2 = b[512];
+    const OutputType a3 = b[768];
 
-    const tran_low_t b0 = (a0 + a1) >> 2;
-    const tran_low_t b1 = (a0 - a1) >> 2;
-    const tran_low_t b2 = (a2 + a3) >> 2;
-    const tran_low_t b3 = (a2 - a3) >> 2;
+    const OutputType b0 = (a0 + a1) >> 2;
+    const OutputType b1 = (a0 - a1) >> 2;
+    const OutputType b2 = (a2 + a3) >> 2;
+    const OutputType b3 = (a2 - a3) >> 2;
 
     b[0] = b0 + b2;
     b[256] = b1 + b3;
@@ -120,51 +164,67 @@ void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
   }
 }
 
-struct HadamardFuncWithSize {
-  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
-  HadamardFunc func;
-  int block_size;
+template <typename OutputType>
+void ReferenceHadamard(const int16_t *a, int a_stride, OutputType *b, int bw,
+                       int bh) {
+  if (bw == 32 && bh == 32) {
+    ReferenceHadamard32x32(a, a_stride, b);
+  } else if (bw == 16 && bh == 16) {
+    ReferenceHadamard16x16(a, a_stride, b);
+  } else if (bw == 8 && bh == 8) {
+    ReferenceHadamard8x8(a, a_stride, b);
+  } else if (bw == 4 && bh == 4) {
+    ReferenceHadamard4x4(a, a_stride, b);
+  } else if (bw == 8 && bh == 16) {
+    ReferenceHadamard8x8Dual(a, a_stride, b);
+  } else {
+    GTEST_FAIL() << "Invalid Hadamard transform size " << bw << bh << std::endl;
+  }
+}
+
+template <typename HadamardFuncType>
+struct FuncWithSize {
+  FuncWithSize(HadamardFuncType f, int bw, int bh)
+      : func(f), block_width(bw), block_height(bh) {}
+  HadamardFuncType func;
+  int block_width;
+  int block_height;
 };
 
-std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
-  return os << "block size: " << hfs.block_size;
-}
+using HadamardFuncWithSize = FuncWithSize<HadamardFunc>;
+using HadamardLPFuncWithSize = FuncWithSize<HadamardLPFunc>;
+using HadamardLP8x8DualFuncWithSize = FuncWithSize<HadamardLP8x8DualFunc>;
 
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
+template <typename OutputType, typename HadamardFuncType>
+class HadamardTestBase
+    : public ::testing::TestWithParam<FuncWithSize<HadamardFuncType>> {
  public:
-  virtual void SetUp() {
-    h_func_ = GetParam().func;
-    bwh_ = GetParam().block_size;
-    block_size_ = bwh_ * bwh_;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
+  explicit HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param) {
+    h_func_ = func_param.func;
+    bw_ = func_param.block_width;
+    bh_ = func_param.block_height;
   }
 
-  virtual int16_t Rand() = 0;
+  virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
-                         int bwh) {
-    if (bwh == 32)
-      ReferenceHadamard32x32(a, a_stride, b);
-    else if (bwh == 16)
-      ReferenceHadamard16x16(a, a_stride, b);
-    else
-      ReferenceHadamard8x8(a, a_stride, b);
-  }
+  virtual int16_t Rand() = 0;
 
   void CompareReferenceRandom() {
     const int kMaxBlockSize = 32 * 32;
+    const int block_size_ = bw_ * bh_;
+
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     memset(b, 0, sizeof(b));
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     memset(b_ref, 0, sizeof(b_ref));
 
     for (int i = 0; i < block_size_; ++i) a[i] = Rand();
 
-    ReferenceHadamard(a, bwh_, b_ref, bwh_);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+    ReferenceHadamard(a, bw_, b_ref, bw_, bh_);
+    API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
 
     // The order of the output is not important. Sort before checking.
     std::sort(b, b + block_size_);
@@ -174,18 +234,20 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
 
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
+    const int block_size_ = bw_ * bh_;
+
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     for (int i = 8; i < 64; i += 8) {
       memset(b, 0, sizeof(b));
       memset(b_ref, 0, sizeof(b_ref));
 
-      ReferenceHadamard(a, i, b_ref, bwh_);
-      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+      ReferenceHadamard(a, i, b_ref, bw_, bh_);
+      API_REGISTER_STATE_CHECK(h_func_(a, i, b));
 
       // The order of the output is not important. Sort before checking.
       std::sort(b, b + block_size_);
@@ -197,32 +259,32 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
   void SpeedTest(int times) {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, output[kMaxBlockSize]);
     memset(input, 1, sizeof(input));
     memset(output, 0, sizeof(output));
 
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < times; ++i) {
-      h_func_(input, bwh_, output);
+      h_func_(input, bw_, output);
     }
     aom_usec_timer_mark(&timer);
 
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
-           elapsed_time);
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bw_, bh_, times, elapsed_time);
   }
 
   ACMRandom rnd_;
 
  private:
-  int bwh_;
-  int block_size_;
-  HadamardFunc h_func_;
+  HadamardFuncType h_func_;
+  int bw_;
+  int bh_;
 };
 
-class HadamardLowbdTest : public HadamardTestBase {
+class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
  public:
+  HadamardLowbdTest() : HadamardTestBase(GetParam()) {}
   virtual int16_t Rand() { return rnd_.Rand9Signed(); }
 };
 
@@ -230,32 +292,117 @@ TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
 TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
 
+TEST_P(HadamardLowbdTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
 INSTANTIATE_TEST_SUITE_P(
     C, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32)));
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_c, 4, 4),
+                      HadamardFuncWithSize(&aom_hadamard_8x8_c, 8, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32, 32)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32)));
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_sse2, 4, 4),
+                      HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32, 32)));
 #endif  // HAVE_SSE2
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32)));
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32, 32)));
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16, 16)));
+#endif  // HAVE_NEON
+
+// Tests for low precision
+class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
+ public:
+  HadamardLowbdLPTest() : HadamardTestBase(GetParam()) {}
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdLPTest, CompareReferenceRandom) {
+  CompareReferenceRandom();
+}
+
+TEST_P(HadamardLowbdLPTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdLPTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_c, 8, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_c, 16,
+                                             16)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_sse2, 16,
+                                             16)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, HadamardLowbdLPTest,
+                         ::testing::Values(HadamardLPFuncWithSize(
+                             &aom_hadamard_lp_16x16_avx2, 16, 16)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_neon, 8, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_neon, 16,
+                                             16)));
+#endif  // HAVE_NEON
+
+// Tests for 8x8 dual low precision
+class HadamardLowbdLP8x8DualTest
+    : public HadamardTestBase<int16_t, HadamardLP8x8DualFunc> {
+ public:
+  HadamardLowbdLP8x8DualTest() : HadamardTestBase(GetParam()) {}
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdLP8x8DualTest, CompareReferenceRandom) {
+  CompareReferenceRandom();
+}
+
+TEST_P(HadamardLowbdLP8x8DualTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdLP8x8DualTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+INSTANTIATE_TEST_SUITE_P(C, HadamardLowbdLP8x8DualTest,
+                         ::testing::Values(HadamardLP8x8DualFuncWithSize(
+                             &aom_hadamard_8x8_dual_c, 8, 16)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, HadamardLowbdLP8x8DualTest,
+                         ::testing::Values(HadamardLP8x8DualFuncWithSize(
+                             &aom_hadamard_8x8_dual_sse2, 8, 16)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, HadamardLowbdLP8x8DualTest,
+                         ::testing::Values(HadamardLP8x8DualFuncWithSize(
+                             &aom_hadamard_8x8_dual_avx2, 8, 16)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, HadamardLowbdLP8x8DualTest,
+                         ::testing::Values(HadamardLP8x8DualFuncWithSize(
+                             &aom_hadamard_8x8_dual_neon, 8, 16)));
 #endif  // HAVE_NEON
 
 }  // namespace
diff --git a/media/libaom/src/test/hash_test.cc b/media/libaom/src/test/hash_test.cc
index eb964ac5f6..5ce0fbb3dc 100644
--- a/media/libaom/src/test/hash_test.cc
+++ b/media/libaom/src/test/hash_test.cc
@@ -49,7 +49,7 @@ class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
   size_t length_;
 };
 
-AV1Crc32cHashTest::~AV1Crc32cHashTest() { ; }
+AV1Crc32cHashTest::~AV1Crc32cHashTest() {}
 
 void AV1Crc32cHashTest::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
@@ -58,7 +58,7 @@ void AV1Crc32cHashTest::SetUp() {
   bsize_ = GET_PARAM(1);
   length_ = bsize_ * bsize_ * sizeof(uint16_t);
   buffer_ = new uint8_t[length_];
-  ASSERT_TRUE(buffer_ != NULL);
+  ASSERT_NE(buffer_, nullptr);
   for (size_t i = 0; i < length_; ++i) {
     buffer_[i] = rnd_.Rand8();
   }
diff --git a/media/libaom/src/test/hbd_metrics_test.cc b/media/libaom/src/test/hbd_metrics_test.cc
index 5b03beee7d..6c9fe55443 100644
--- a/media/libaom/src/test/hbd_metrics_test.cc
+++ b/media/libaom/src/test/hbd_metrics_test.cc
@@ -80,15 +80,15 @@ double compute_fastssim(const YV12_BUFFER_CONFIG *source,
 double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
                            uint32_t bd) {
-  double ssim, weight;
-  ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
-  return 100 * pow(ssim / weight, 8.0);
+  double ssim[2], weight[2];
+  aom_highbd_calc_ssim(source, dest, weight, bd, in_bd, ssim);
+  return 100 * pow(ssim[0] / weight[0], 8.0);
 }
 
 double compute_aomssim(const YV12_BUFFER_CONFIG *source,
                        const YV12_BUFFER_CONFIG *dest) {
   double ssim, weight;
-  ssim = aom_calc_ssim(source, dest, &weight);
+  aom_lowbd_calc_ssim(source, dest, &weight, &ssim);
   return 100 * pow(ssim / weight, 8.0);
 }
 
@@ -112,10 +112,10 @@ class HBDMetricsTestBase {
     memset(&hbd_src, 0, sizeof(hbd_src));
     memset(&hbd_dst, 0, sizeof(hbd_dst));
 
-    aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
-    aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
-    aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
-    aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
+    aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, 0);
+    aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, 0);
+    aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, 0);
+    aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, 0);
 
     memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
     while (i < lbd_src.buffer_alloc_sz) {
diff --git a/media/libaom/src/test/hiprec_convolve_test.cc b/media/libaom/src/test/hiprec_convolve_test.cc
index 59d28e8830..3e93a06b52 100644
--- a/media/libaom/src/test/hiprec_convolve_test.cc
+++ b/media/libaom/src/test/hiprec_convolve_test.cc
@@ -17,8 +17,10 @@
 using libaom_test::ACMRandom;
 #if CONFIG_AV1_HIGHBITDEPTH
 using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdHiprecConvolveTest);
 #endif
 using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HiprecConvolveTest);
 using std::make_tuple;
 using std::tuple;
 
diff --git a/media/libaom/src/test/hiprec_convolve_test_util.cc b/media/libaom/src/test/hiprec_convolve_test_util.cc
index 956af7fc89..e2496b3236 100644
--- a/media/libaom/src/test/hiprec_convolve_test_util.cc
+++ b/media/libaom/src/test/hiprec_convolve_test_util.cc
@@ -11,6 +11,9 @@
 
 #include "test/hiprec_convolve_test_util.h"
 
+#include <memory>
+#include <new>
+
 #include "av1/common/restoration.h"
 
 using std::make_tuple;
@@ -85,7 +88,7 @@ void AV1HiprecConvolveTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1HiprecConvolveTest::TearDown() {}
 
 void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   const int w = 128, h = 128;
@@ -94,15 +97,18 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   int i, j, k, m;
   const ConvolveParams conv_params = get_conv_params_wiener(8);
 
-  uint8_t *input_ = new uint8_t[h * w];
-  uint8_t *input = input_;
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *input = input_.get();
 
   // The AVX2 convolve functions always write rows with widths that are
   // multiples of 16. So to avoid a buffer overflow, we may need to pad
   // rows to a multiple of 16.
   int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output2, nullptr);
 
   // Generate random filter kernels
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
@@ -116,11 +122,11 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
       // Choose random locations within the source block
       int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
       int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
-                                    out_w, hkernel, 16, vkernel, 16, out_w,
-                                    out_h, &conv_params);
-      test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
-                vkernel, 16, out_w, out_h, &conv_params);
+      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w,
+                                    output.get(), out_w, hkernel, 16, vkernel,
+                                    16, out_w, out_h, &conv_params);
+      test_impl(input + offset_r * w + offset_c, w, output2.get(), out_w,
+                hkernel, 16, vkernel, 16, out_w, out_h, &conv_params);
 
       for (j = 0; j < out_w * out_h; ++j)
         ASSERT_EQ(output[j], output2[j])
@@ -128,9 +134,6 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
             << (j / out_w) << ") on iteration " << i;
     }
   }
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
 }
 
 void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
@@ -140,15 +143,18 @@ void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
   int i, j, k;
   const ConvolveParams conv_params = get_conv_params_wiener(8);
 
-  uint8_t *input_ = new uint8_t[h * w];
-  uint8_t *input = input_;
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *input = input_.get();
 
   // The AVX2 convolve functions always write rows with widths that are
   // multiples of 16. So to avoid a buffer overflow, we may need to pad
   // rows to a multiple of 16.
   int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output2, nullptr);
 
   // Generate random filter kernels
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
@@ -164,7 +170,7 @@ void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
   for (i = 0; i < num_iters; ++i) {
     for (j = 3; j < h - out_h - 4; j++) {
       for (k = 3; k < w - out_w - 4; k++) {
-        av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
+        av1_wiener_convolve_add_src_c(input + j * w + k, w, output.get(), out_w,
                                       hkernel, 16, vkernel, 16, out_w, out_h,
                                       &conv_params);
       }
@@ -178,8 +184,8 @@ void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
   for (i = 0; i < num_iters; ++i) {
     for (j = 3; j < h - out_h - 4; j++) {
       for (k = 3; k < w - out_w - 4; k++) {
-        test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
-                  16, out_w, out_h, &conv_params);
+        test_impl(input + j * w + k, w, output2.get(), out_w, hkernel, 16,
+                  vkernel, 16, out_w, out_h, &conv_params);
       }
     }
   }
@@ -193,10 +199,6 @@ void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
       << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
       << "C time: " << ref_time << " us\n"
       << "SIMD time: " << tst_time << " us\n";
-
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
 }
 }  // namespace AV1HiprecConvolve
 
@@ -220,9 +222,7 @@ void AV1HighbdHiprecConvolveTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdHiprecConvolveTest::TearDown() {
-  libaom_test::ClearSystemState();
-}
+void AV1HighbdHiprecConvolveTest::TearDown() {}
 
 void AV1HighbdHiprecConvolveTest::RunCheckOutput(
     highbd_hiprec_convolve_func test_impl) {
@@ -233,14 +233,17 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
   int i, j;
   const ConvolveParams conv_params = get_conv_params_wiener(bd);
 
-  uint16_t *input = new uint16_t[h * w];
+  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
+  ASSERT_NE(input, nullptr);
 
   // The AVX2 convolve functions always write rows with widths that are
   // multiples of 16. So to avoid a buffer overflow, we may need to pad
   // rows to a multiple of 16.
   int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint16_t *output = new uint16_t[output_n];
-  uint16_t *output2 = new uint16_t[output_n];
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output2, nullptr);
 
   // Generate random filter kernels
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
@@ -249,9 +252,9 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
   for (i = 0; i < h; ++i)
     for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
 
-  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
-  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
-  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
+  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
+  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
   for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
     generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
     for (i = 0; i < num_iters; ++i) {
@@ -270,9 +273,6 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
             << (j / out_w) << ") on iteration " << i;
     }
   }
-  delete[] input;
-  delete[] output;
-  delete[] output2;
 }
 
 void AV1HighbdHiprecConvolveTest::RunSpeedTest(
@@ -284,14 +284,17 @@ void AV1HighbdHiprecConvolveTest::RunSpeedTest(
   int i, j, k;
   const ConvolveParams conv_params = get_conv_params_wiener(bd);
 
-  uint16_t *input = new uint16_t[h * w];
+  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
+  ASSERT_NE(input, nullptr);
 
   // The AVX2 convolve functions always write rows with widths that are
   // multiples of 16. So to avoid a buffer overflow, we may need to pad
   // rows to a multiple of 16.
   int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint16_t *output = new uint16_t[output_n];
-  uint16_t *output2 = new uint16_t[output_n];
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output2, nullptr);
 
   // Generate random filter kernels
   DECLARE_ALIGNED(16, InterpKernel, hkernel);
@@ -302,9 +305,9 @@ void AV1HighbdHiprecConvolveTest::RunSpeedTest(
   for (i = 0; i < h; ++i)
     for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
 
-  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
-  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
-  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input.get());
+  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output.get());
+  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
 
   aom_usec_timer ref_timer;
   aom_usec_timer_start(&ref_timer);
@@ -340,10 +343,6 @@ void AV1HighbdHiprecConvolveTest::RunSpeedTest(
       << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
       << "C time: " << ref_time << " us\n"
       << "SIMD time: " << tst_time << " us\n";
-
-  delete[] input;
-  delete[] output;
-  delete[] output2;
 }
 }  // namespace AV1HighbdHiprecConvolve
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/test/hiprec_convolve_test_util.h b/media/libaom/src/test/hiprec_convolve_test_util.h
index 6b6da4ee81..e064ba64a9 100644
--- a/media/libaom/src/test/hiprec_convolve_test_util.h
+++ b/media/libaom/src/test/hiprec_convolve_test_util.h
@@ -18,7 +18,6 @@
 
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
diff --git a/media/libaom/src/test/horver_correlation_test.cc b/media/libaom/src/test/horver_correlation_test.cc
index ccb8eddd0a..d1fd578448 100644
--- a/media/libaom/src/test/horver_correlation_test.cc
+++ b/media/libaom/src/test/horver_correlation_test.cc
@@ -48,6 +48,7 @@ class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
   ACMRandom rng_;
   int16_t *data_buf_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HorverTest);
 
 void HorverTest::RunHorverTest(void) {
   for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
@@ -140,6 +141,11 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(av1_get_horver_correlation_full_sse4_1));
 #endif  // HAVE_SSE4_1
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HorverTest, ::testing::Values(av1_get_horver_correlation_full_neon));
+#endif  // HAVE_NEON
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
diff --git a/media/libaom/src/test/horz_superres_test.cc b/media/libaom/src/test/horz_superres_test.cc
index 938b0b15aa..12e14dc55b 100644
--- a/media/libaom/src/test/horz_superres_test.cc
+++ b/media/libaom/src/test/horz_superres_test.cc
@@ -39,7 +39,8 @@ typedef struct {
   unsigned int profile;
   unsigned int limit;
   unsigned int screen_content;
-  double psnr_threshold;
+  double psnr_threshold;   // used by modes other than AOM_SUPERRES_AUTO
+  double psnr_threshold2;  // used by AOM_SUPERRES_AUTO
 } TestVideoParam;
 
 std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
@@ -51,18 +52,21 @@ std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
 }
 
 const TestVideoParam kTestVideoVectors[] = {
-  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.3,
+    45.0 },
 #if CONFIG_AV1_HIGHBITDEPTH
-  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 },
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0,
+    48.0 },
 #endif
-  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 23.0, 56.0 },
   // Image coding (single frame).
-  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
+  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0,
+    49.0 },
 };
 
 // Modes with extra params have their own tests.
-const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
-                                                      SUPERRES_AUTO };
+const aom_superres_mode kSuperresModesWithoutParams[] = { AOM_SUPERRES_RANDOM,
+                                                          AOM_SUPERRES_AUTO };
 
 // Superres denominators and superres kf denominators to be tested
 typedef tuple<int, int> SuperresDenominatorPair;
@@ -84,7 +88,8 @@ const SuperresQThresholdPair kSuperresQThresholds[] = {
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_, superres_mode_>
-typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              aom_superres_mode>
     HorzSuperresTestParam;
 
 class HorzSuperresEndToEndTest
@@ -98,8 +103,7 @@ class HorzSuperresEndToEndTest
   virtual ~HorzSuperresEndToEndTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
+    InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_Q;
     cfg_.rc_target_bitrate = kBitrate;
@@ -154,19 +158,20 @@ class HorzSuperresEndToEndTest
     std::unique_ptr<libaom_test::VideoSource> video;
     video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 test_video_param_.limit));
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr_thresh = (superres_mode_ == AOM_SUPERRES_AUTO)
+                                   ? test_video_param_.psnr_threshold2
+                                   : test_video_param_.psnr_threshold;
     const double psnr = GetAveragePsnr();
-    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
-        << "superres_mode_ = " << superres_mode_;
+    EXPECT_GT(psnr, psnr_thresh);
 
-    EXPECT_EQ(test_video_param_.limit, frame_count_)
-        << "superres_mode_ = " << superres_mode_;
+    EXPECT_EQ(test_video_param_.limit, frame_count_);
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
 
  private:
   double psnr_;
@@ -175,9 +180,9 @@ class HorzSuperresEndToEndTest
 
 TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresModesWithoutParams));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresModesWithoutParams));
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
@@ -192,7 +197,7 @@ class HorzSuperresFixedEndToEndTest
  protected:
   HorzSuperresFixedEndToEndTest()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
-        superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+        superres_mode_(AOM_SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
     SuperresDenominatorPair denoms = GET_PARAM(2);
     superres_denom_ = std::get<0>(denoms);
     superres_kf_denom_ = std::get<1>(denoms);
@@ -201,8 +206,7 @@ class HorzSuperresFixedEndToEndTest
   virtual ~HorzSuperresFixedEndToEndTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
+    InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_VBR;
     cfg_.rc_target_bitrate = kBitrate;
@@ -259,7 +263,7 @@ class HorzSuperresFixedEndToEndTest
     std::unique_ptr<libaom_test::VideoSource> video;
     video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 test_video_param_.limit));
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
@@ -275,7 +279,7 @@ class HorzSuperresFixedEndToEndTest
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
   int superres_denom_;
   int superres_kf_denom_;
 
@@ -286,9 +290,9 @@ class HorzSuperresFixedEndToEndTest
 
 TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresDenominators));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresFixedEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresDenominators));
 
 // Test parameter list:
 //  <[needed for EncoderTest], test_video_param_,
@@ -303,7 +307,7 @@ class HorzSuperresQThreshEndToEndTest
  protected:
   HorzSuperresQThreshEndToEndTest()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
-        superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+        superres_mode_(AOM_SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
     SuperresQThresholdPair qthresholds = GET_PARAM(2);
     superres_qthresh_ = std::get<0>(qthresholds);
     superres_kf_qthresh_ = std::get<1>(qthresholds);
@@ -312,8 +316,7 @@ class HorzSuperresQThreshEndToEndTest
   virtual ~HorzSuperresQThreshEndToEndTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
+    InitializeConfig(::libaom_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_VBR;
     cfg_.rc_target_bitrate = kBitrate;
@@ -370,7 +373,7 @@ class HorzSuperresQThreshEndToEndTest
     std::unique_ptr<libaom_test::VideoSource> video;
     video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 test_video_param_.limit));
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
@@ -386,7 +389,7 @@ class HorzSuperresQThreshEndToEndTest
   }
 
   TestVideoParam test_video_param_;
-  SUPERRES_MODE superres_mode_;
+  aom_superres_mode superres_mode_;
   int superres_qthresh_;
   int superres_kf_qthresh_;
 
@@ -399,8 +402,8 @@ TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
-                          ::testing::ValuesIn(kTestVideoVectors),
-                          ::testing::ValuesIn(kSuperresQThresholds));
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresQThreshEndToEndTest,
+                           ::testing::ValuesIn(kTestVideoVectors),
+                           ::testing::ValuesIn(kSuperresQThresholds));
 
 }  // namespace
diff --git a/media/libaom/src/test/intra_edge_test.cc b/media/libaom/src/test/intra_edge_test.cc
index f7702c952b..84e712d1d1 100644
--- a/media/libaom/src/test/intra_edge_test.cc
+++ b/media/libaom/src/test/intra_edge_test.cc
@@ -73,7 +73,7 @@ class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
  protected:
   void Execute(uint8_t *edge_tst) {
     params_.ref_func(edge_ref_, size_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
   }
 };
 
@@ -117,7 +117,7 @@ class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
  protected:
   void Execute(uint16_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
   }
   int bit_depth_;
 };
@@ -202,7 +202,7 @@ class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
  protected:
   void Execute(uint8_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, strength_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
   }
 };
 
@@ -240,7 +240,7 @@ class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
  protected:
   void Execute(uint16_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, strength_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
   }
   int bit_depth_;
 };
@@ -284,7 +284,7 @@ TEST_P(UpsampleTest8B, DISABLED_Speed) {
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
   }
 }
 
@@ -298,7 +298,7 @@ TEST_P(UpsampleTestHB, DISABLED_Speed) {
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
   }
 }
 
@@ -311,7 +311,7 @@ TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
     // iterate over filter strengths (1,2,3)
     strength_ = (strength_ == 3) ? 1 : strength_ + 1;
   }
@@ -328,7 +328,7 @@ TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
     // iterate over filter strengths (1,2,3)
     strength_ = (strength_ == 3) ? 1 : strength_ + 1;
   }
diff --git a/media/libaom/src/test/intrabc_test.cc b/media/libaom/src/test/intrabc_test.cc
index b57eb6fab5..2c60596ab8 100644
--- a/media/libaom/src/test/intrabc_test.cc
+++ b/media/libaom/src/test/intrabc_test.cc
@@ -153,8 +153,10 @@ TEST(IntrabcTest, DvValidation) {
   xd.plane[2].subsampling_x = 1;
   xd.plane[2].subsampling_y = 1;
 
+  SequenceHeader seq_params = {};
   AV1_COMMON cm;
   memset(&cm, 0, sizeof(cm));
+  cm.seq_params = &seq_params;
 
   for (const DvTestCase &dv_case : kDvCases) {
     const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
diff --git a/media/libaom/src/test/intrapred_test.cc b/media/libaom/src/test/intrapred_test.cc
index 779cf9a5de..356fd835ab 100644
--- a/media/libaom/src/test/intrapred_test.cc
+++ b/media/libaom/src/test/intrapred_test.cc
@@ -17,7 +17,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/blockd.h"
@@ -97,6 +96,63 @@ class AV1IntraPredTest
     }
     ASSERT_EQ(0, error_count);
   }
+  void RunSpeedTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+                    Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    int error_count = 0;
+    const int numIter = 100;
+
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_width * 2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_height; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+
+      aom_usec_timer c_timer_;
+      aom_usec_timer_start(&c_timer_);
+
+      PredictRefSpeedTest(numIter);
+
+      aom_usec_timer_mark(&c_timer_);
+
+      aom_usec_timer simd_timer_;
+      aom_usec_timer_start(&simd_timer_);
+
+      PredictFncSpeedTest(numIter);
+
+      aom_usec_timer_mark(&simd_timer_);
+
+      c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+      simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+      CheckPrediction(i, &error_count);
+    }
+
+    printf(
+        "blockWxH = %d x %d c_time = %d \t simd_time = %d \t Gain = %4.2f \n",
+        block_width, block_height, c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+    ASSERT_EQ(0, error_count);
+  }
 
  protected:
   virtual void SetUp() {
@@ -107,6 +163,9 @@ class AV1IntraPredTest
 
   virtual void Predict() = 0;
 
+  virtual void PredictRefSpeedTest(int num) = 0;
+  virtual void PredictFncSpeedTest(int num) = 0;
+
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
     const int block_width = params_.block_width;
@@ -139,20 +198,45 @@ class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
   void Predict() {
     const int bit_depth = params_.bit_depth;
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
+  void PredictRefSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdIntraPredTest);
+
 #endif
 
 class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
  protected:
   void Predict() {
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_));
   }
+  void PredictRefSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_);
+    }
+  }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(LowbdIntraPredTest);
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // Suppress an unitialized warning. Once there are implementations to test then
@@ -167,21 +251,39 @@ TEST_P(HighbdIntraPredTest, Bitexact) {
   av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
+
+TEST_P(HighbdIntraPredTest, DISABLED_Speed) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint16_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunSpeedTest(left_col, above_data, dst, ref_dst);
+}
 #endif
 
-// Same issue as above but for arm.
-#if !HAVE_NEON
 TEST_P(LowbdIntraPredTest, Bitexact) {
-  // max block size is 32
-  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
-  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
   av1_zero(left_col);
   av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
-#endif  // !HAVE_NEON
+TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunSpeedTest(left_col, above_data, dst, ref_dst);
+}
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
@@ -229,6 +331,57 @@ INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
 
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
+  lowbd_entry(smooth, 4, 4, neon),     lowbd_entry(smooth, 4, 8, neon),
+  lowbd_entry(smooth, 4, 16, neon),    lowbd_entry(smooth, 8, 4, neon),
+  lowbd_entry(smooth, 8, 8, neon),     lowbd_entry(smooth, 8, 16, neon),
+  lowbd_entry(smooth, 8, 32, neon),    lowbd_entry(smooth, 16, 4, neon),
+  lowbd_entry(smooth, 16, 8, neon),    lowbd_entry(smooth, 16, 16, neon),
+  lowbd_entry(smooth, 16, 32, neon),   lowbd_entry(smooth, 16, 64, neon),
+  lowbd_entry(smooth, 32, 8, neon),    lowbd_entry(smooth, 32, 16, neon),
+  lowbd_entry(smooth, 32, 32, neon),   lowbd_entry(smooth, 32, 64, neon),
+  lowbd_entry(smooth, 64, 16, neon),   lowbd_entry(smooth, 64, 32, neon),
+  lowbd_entry(smooth, 64, 64, neon),
+
+  lowbd_entry(smooth_v, 4, 4, neon),   lowbd_entry(smooth_v, 4, 8, neon),
+  lowbd_entry(smooth_v, 4, 16, neon),  lowbd_entry(smooth_v, 8, 4, neon),
+  lowbd_entry(smooth_v, 8, 8, neon),   lowbd_entry(smooth_v, 8, 16, neon),
+  lowbd_entry(smooth_v, 8, 32, neon),  lowbd_entry(smooth_v, 16, 4, neon),
+  lowbd_entry(smooth_v, 16, 8, neon),  lowbd_entry(smooth_v, 16, 16, neon),
+  lowbd_entry(smooth_v, 16, 32, neon), lowbd_entry(smooth_v, 16, 64, neon),
+  lowbd_entry(smooth_v, 32, 8, neon),  lowbd_entry(smooth_v, 32, 16, neon),
+  lowbd_entry(smooth_v, 32, 32, neon), lowbd_entry(smooth_v, 32, 64, neon),
+  lowbd_entry(smooth_v, 64, 16, neon), lowbd_entry(smooth_v, 64, 32, neon),
+  lowbd_entry(smooth_v, 64, 64, neon),
+
+  lowbd_entry(smooth_h, 4, 4, neon),   lowbd_entry(smooth_h, 4, 8, neon),
+  lowbd_entry(smooth_h, 4, 16, neon),  lowbd_entry(smooth_h, 8, 4, neon),
+  lowbd_entry(smooth_h, 8, 8, neon),   lowbd_entry(smooth_h, 8, 16, neon),
+  lowbd_entry(smooth_h, 8, 32, neon),  lowbd_entry(smooth_h, 16, 4, neon),
+  lowbd_entry(smooth_h, 16, 8, neon),  lowbd_entry(smooth_h, 16, 16, neon),
+  lowbd_entry(smooth_h, 16, 32, neon), lowbd_entry(smooth_h, 16, 64, neon),
+  lowbd_entry(smooth_h, 32, 8, neon),  lowbd_entry(smooth_h, 32, 16, neon),
+  lowbd_entry(smooth_h, 32, 32, neon), lowbd_entry(smooth_h, 32, 64, neon),
+  lowbd_entry(smooth_h, 64, 16, neon), lowbd_entry(smooth_h, 64, 32, neon),
+  lowbd_entry(smooth_h, 64, 64, neon),
+
+  lowbd_entry(paeth, 4, 4, neon),      lowbd_entry(paeth, 4, 8, neon),
+  lowbd_entry(paeth, 4, 16, neon),     lowbd_entry(paeth, 8, 4, neon),
+  lowbd_entry(paeth, 8, 8, neon),      lowbd_entry(paeth, 8, 16, neon),
+  lowbd_entry(paeth, 8, 32, neon),     lowbd_entry(paeth, 16, 4, neon),
+  lowbd_entry(paeth, 16, 8, neon),     lowbd_entry(paeth, 16, 16, neon),
+  lowbd_entry(paeth, 16, 32, neon),    lowbd_entry(paeth, 16, 64, neon),
+  lowbd_entry(paeth, 32, 8, neon),     lowbd_entry(paeth, 32, 16, neon),
+  lowbd_entry(paeth, 32, 32, neon),    lowbd_entry(paeth, 32, 64, neon),
+  lowbd_entry(paeth, 64, 16, neon),    lowbd_entry(paeth, 64, 32, neon),
+  lowbd_entry(paeth, 64, 64, neon),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
+#endif  // HAVE_NEON
+
 #if HAVE_SSSE3
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
   lowbd_intrapred(paeth, ssse3),
@@ -260,9 +413,111 @@ INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
 #if CONFIG_AV1_HIGHBITDEPTH
 #if HAVE_NEON
 const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
-  highbd_entry(dc, 4, 4, neon, 8),   highbd_entry(dc, 8, 8, neon, 8),
-  highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
+  highbd_entry(dc, 4, 4, neon, 8),
+  highbd_entry(dc, 8, 8, neon, 8),
+  highbd_entry(dc, 16, 16, neon, 8),
+  highbd_entry(dc, 32, 32, neon, 8),
   highbd_entry(dc, 64, 64, neon, 8),
+
+  highbd_entry(v, 4, 4, neon, 12),
+  highbd_entry(v, 4, 8, neon, 12),
+  highbd_entry(v, 4, 16, neon, 12),
+  highbd_entry(v, 8, 4, neon, 12),
+  highbd_entry(v, 8, 8, neon, 12),
+  highbd_entry(v, 8, 16, neon, 12),
+  highbd_entry(v, 8, 32, neon, 12),
+  highbd_entry(v, 16, 4, neon, 12),
+  highbd_entry(v, 16, 8, neon, 12),
+  highbd_entry(v, 16, 16, neon, 12),
+  highbd_entry(v, 16, 32, neon, 12),
+  highbd_entry(v, 16, 64, neon, 12),
+  highbd_entry(v, 32, 8, neon, 12),
+  highbd_entry(v, 32, 16, neon, 12),
+  highbd_entry(v, 32, 32, neon, 12),
+  highbd_entry(v, 32, 64, neon, 12),
+  highbd_entry(v, 64, 16, neon, 12),
+  highbd_entry(v, 64, 32, neon, 12),
+  highbd_entry(v, 64, 64, neon, 12),
+
+  highbd_entry(paeth, 4, 4, neon, 12),
+  highbd_entry(paeth, 4, 8, neon, 12),
+  highbd_entry(paeth, 4, 16, neon, 12),
+  highbd_entry(paeth, 8, 4, neon, 12),
+  highbd_entry(paeth, 8, 8, neon, 12),
+  highbd_entry(paeth, 8, 16, neon, 12),
+  highbd_entry(paeth, 8, 32, neon, 12),
+  highbd_entry(paeth, 16, 4, neon, 12),
+  highbd_entry(paeth, 16, 8, neon, 12),
+  highbd_entry(paeth, 16, 16, neon, 12),
+  highbd_entry(paeth, 16, 32, neon, 12),
+  highbd_entry(paeth, 16, 64, neon, 12),
+  highbd_entry(paeth, 32, 8, neon, 12),
+  highbd_entry(paeth, 32, 16, neon, 12),
+  highbd_entry(paeth, 32, 32, neon, 12),
+  highbd_entry(paeth, 32, 64, neon, 12),
+  highbd_entry(paeth, 64, 16, neon, 12),
+  highbd_entry(paeth, 64, 32, neon, 12),
+  highbd_entry(paeth, 64, 64, neon, 12),
+
+  highbd_entry(smooth, 4, 4, neon, 12),
+  highbd_entry(smooth, 4, 8, neon, 12),
+  highbd_entry(smooth, 4, 16, neon, 12),
+  highbd_entry(smooth, 8, 4, neon, 12),
+  highbd_entry(smooth, 8, 8, neon, 12),
+  highbd_entry(smooth, 8, 16, neon, 12),
+  highbd_entry(smooth, 8, 32, neon, 12),
+  highbd_entry(smooth, 16, 4, neon, 12),
+  highbd_entry(smooth, 16, 8, neon, 12),
+  highbd_entry(smooth, 16, 16, neon, 12),
+  highbd_entry(smooth, 16, 32, neon, 12),
+  highbd_entry(smooth, 16, 64, neon, 12),
+  highbd_entry(smooth, 32, 8, neon, 12),
+  highbd_entry(smooth, 32, 16, neon, 12),
+  highbd_entry(smooth, 32, 32, neon, 12),
+  highbd_entry(smooth, 32, 64, neon, 12),
+  highbd_entry(smooth, 64, 16, neon, 12),
+  highbd_entry(smooth, 64, 32, neon, 12),
+  highbd_entry(smooth, 64, 64, neon, 12),
+
+  highbd_entry(smooth_v, 4, 4, neon, 12),
+  highbd_entry(smooth_v, 4, 8, neon, 12),
+  highbd_entry(smooth_v, 4, 16, neon, 12),
+  highbd_entry(smooth_v, 8, 4, neon, 12),
+  highbd_entry(smooth_v, 8, 8, neon, 12),
+  highbd_entry(smooth_v, 8, 16, neon, 12),
+  highbd_entry(smooth_v, 8, 32, neon, 12),
+  highbd_entry(smooth_v, 16, 4, neon, 12),
+  highbd_entry(smooth_v, 16, 8, neon, 12),
+  highbd_entry(smooth_v, 16, 16, neon, 12),
+  highbd_entry(smooth_v, 16, 32, neon, 12),
+  highbd_entry(smooth_v, 16, 64, neon, 12),
+  highbd_entry(smooth_v, 32, 8, neon, 12),
+  highbd_entry(smooth_v, 32, 16, neon, 12),
+  highbd_entry(smooth_v, 32, 32, neon, 12),
+  highbd_entry(smooth_v, 32, 64, neon, 12),
+  highbd_entry(smooth_v, 64, 16, neon, 12),
+  highbd_entry(smooth_v, 64, 32, neon, 12),
+  highbd_entry(smooth_v, 64, 64, neon, 12),
+
+  highbd_entry(smooth_h, 4, 4, neon, 12),
+  highbd_entry(smooth_h, 4, 8, neon, 12),
+  highbd_entry(smooth_h, 4, 16, neon, 12),
+  highbd_entry(smooth_h, 8, 4, neon, 12),
+  highbd_entry(smooth_h, 8, 8, neon, 12),
+  highbd_entry(smooth_h, 8, 16, neon, 12),
+  highbd_entry(smooth_h, 8, 32, neon, 12),
+  highbd_entry(smooth_h, 16, 4, neon, 12),
+  highbd_entry(smooth_h, 16, 8, neon, 12),
+  highbd_entry(smooth_h, 16, 16, neon, 12),
+  highbd_entry(smooth_h, 16, 32, neon, 12),
+  highbd_entry(smooth_h, 16, 64, neon, 12),
+  highbd_entry(smooth_h, 32, 8, neon, 12),
+  highbd_entry(smooth_h, 32, 16, neon, 12),
+  highbd_entry(smooth_h, 32, 32, neon, 12),
+  highbd_entry(smooth_h, 32, 64, neon, 12),
+  highbd_entry(smooth_h, 64, 16, neon, 12),
+  highbd_entry(smooth_h, 64, 32, neon, 12),
+  highbd_entry(smooth_h, 64, 64, neon, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/media/libaom/src/test/invalid_file_test.cc b/media/libaom/src/test/invalid_file_test.cc
index dd0956d0c7..c84c8c9c50 100644
--- a/media/libaom/src/test/invalid_file_test.cc
+++ b/media/libaom/src/test/invalid_file_test.cc
@@ -51,7 +51,7 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
 
   void OpenResFile(const std::string &res_file_name) {
     res_file_ = libaom_test::OpenTestDataFile(res_file_name);
-    ASSERT_TRUE(res_file_ != NULL)
+    ASSERT_NE(res_file_, nullptr)
         << "Result file open failed. Filename: " << res_file_name;
   }
 
@@ -64,7 +64,7 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
       const aom_codec_err_t res_dec,
       const libaom_test::CompressedVideoSource &video,
       libaom_test::Decoder *decoder) {
-    EXPECT_TRUE(res_file_ != NULL);
+    EXPECT_NE(res_file_, nullptr);
     int expected_res_dec = -1;
 
     // Read integer result.
@@ -103,8 +103,7 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
     const DecodeParam input = GET_PARAM(1);
     aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
     cfg.threads = input.threads;
-    const std::string filename = input.filename;
-    libaom_test::IVFVideoSource decode_video(filename);
+    libaom_test::IVFVideoSource decode_video(input.filename);
     decode_video.Init();
 
     // The result file holds a list of expected integer results, one for each
@@ -133,10 +132,11 @@ const DecodeParam kAV1InvalidFileTests[] = {
   { 1, "invalid-google-142530197-1.ivf", NULL },
   { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
   { 1, "invalid-oss-fuzz-9720.ivf", NULL },
-  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.4" },
   { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
   { 4, "invalid-oss-fuzz-15363.ivf", NULL },
-  { 1, "invalid-oss-fuzz-16437.ivf", NULL },
+  { 1, "invalid-oss-fuzz-16437.ivf", "invalid-oss-fuzz-16437.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-24706.ivf", NULL },
 #if CONFIG_AV1_HIGHBITDEPTH
   // These test vectors contain 10-bit or 12-bit video.
   { 1, "invalid-oss-fuzz-9288.ivf", NULL },
@@ -150,10 +150,11 @@ const DecodeParam kAV1InvalidFileTests[] = {
   { 1, "invalid-oss-fuzz-10779.ivf", NULL },
   { 1, "invalid-oss-fuzz-11477.ivf", NULL },
   { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-33030.ivf", NULL },
 #endif
 };
 
-AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
-                          ::testing::ValuesIn(kAV1InvalidFileTests));
+AV1_INSTANTIATE_TEST_SUITE(InvalidFileTest,
+                           ::testing::ValuesIn(kAV1InvalidFileTests));
 
 }  // namespace
diff --git a/media/libaom/src/test/ivf_video_source.h b/media/libaom/src/test/ivf_video_source.h
index ff2841445e..f7efd6757e 100644
--- a/media/libaom/src/test/ivf_video_source.h
+++ b/media/libaom/src/test/ivf_video_source.h
@@ -45,14 +45,13 @@ class IVFVideoSource : public CompressedVideoSource {
   virtual void Init() {
     // Allocate a buffer for read in the compressed video frame.
     compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_ != NULL)
-        << "Allocate frame buffer failed";
+    ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
     ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
   }
 
   virtual void Begin() {
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     // Read file header
@@ -73,7 +72,7 @@ class IVFVideoSource : public CompressedVideoSource {
   }
 
   void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     uint8_t frame_hdr[kIvfFrameHdrSize];
     // Check frame header and read a frame from input_file.
     if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
diff --git a/media/libaom/src/test/kf_test.cc b/media/libaom/src/test/kf_test.cc
new file mode 100644
index 0000000000..0cef8db04d
--- /dev/null
+++ b/media/libaom/src/test/kf_test.cc
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+#define NUM_LAG_VALUES 3
+
+namespace {
+typedef struct {
+  const unsigned int min_kf_dist;
+  const unsigned int max_kf_dist;
+} kfIntervalParam;
+
+const kfIntervalParam kfTestParams[] = {
+  { 1, 1 }, { 0, 10 }, { 10, 10 }, { 0, 30 }, { 30, 30 }
+};
+
+std::ostream &operator<<(std::ostream &os, const kfIntervalParam &test_arg) {
+  return os << "kfIntervalParam { min_kf_dist:" << test_arg.min_kf_dist
+            << " max_kf_dist:" << test_arg.max_kf_dist << " }";
+}
+
+// This class is used to test the presence of forward key frame.
+class KeyFrameIntervalTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 kfIntervalParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  KeyFrameIntervalTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        kf_dist_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+    kf_dist_ = -1;
+    is_kf_interval_violated_ = false;
+  }
+  virtual ~KeyFrameIntervalTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = end_usage_check_;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = kf_dist_param_.min_kf_dist;
+    cfg_.kf_max_dist = kf_dist_param_.max_kf_dist;
+    cfg_.g_lag_in_frames = 19;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      int frame_flags = 0;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+                                    &frame_flags);
+      if (kf_dist_ != -1) {
+        kf_dist_++;
+        if (kf_dist_ > (int)kf_dist_param_.max_kf_dist) {
+          is_kf_interval_violated_ = true;
+        }
+      }
+      if ((frame_flags & AOM_FRAME_IS_KEY) ==
+          static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+        if (kf_dist_ != -1 && kf_dist_ < (int)kf_dist_param_.min_kf_dist) {
+          is_kf_interval_violated_ = true;
+        }
+        kf_dist_ = 0;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const kfIntervalParam kf_dist_param_;
+  int kf_dist_;
+  bool is_kf_interval_violated_;
+  aom_rc_mode end_usage_check_;
+};
+
+// Because valgrind builds take a very long time to run, use a lower
+// resolution video for valgrind runs.
+const char *TestFileName() {
+#if AOM_VALGRIND_BUILD
+  return "hantro_collage_w176h144.yuv";
+#else
+  return "hantro_collage_w352h288.yuv";
+#endif  // AOM_VALGRIND_BUILD
+}
+
+int TestFileWidth() {
+#if AOM_VALGRIND_BUILD
+  return 176;
+#else
+  return 352;
+#endif  // AOM_VALGRIND_BUILD
+}
+
+int TestFileHeight() {
+#if AOM_VALGRIND_BUILD
+  return 144;
+#else
+  return 288;
+#endif  // AOM_VALGRIND_BUILD
+}
+
+TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) {
+  libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+                                     TestFileHeight(), cfg_.g_timebase.den,
+                                     cfg_.g_timebase.num, 0, 75);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_;
+}
+
+// This class tests for presence and placement of application forced key frames.
+class ForcedKeyTestLarge
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        auto_alt_ref_(GET_PARAM(2)), fwd_kf_enabled_(GET_PARAM(3)),
+        cpu_used_(GET_PARAM(4)), rc_end_usage_(GET_PARAM(5)) {
+    forced_kf_frame_num_ = 1;
+    frame_num_ = 0;
+    is_kf_placement_violated_ = false;
+  }
+  virtual ~ForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 0;
+    cfg_.kf_max_dist = 30;
+    cfg_.kf_min_dist = 0;
+    cfg_.fwd_kf_enabled = fwd_kf_enabled_;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, auto_alt_ref_);
+#if CONFIG_AV1_ENCODER
+      // override test default for tile columns if necessary.
+      if (GET_PARAM(0) == &libaom_test::kAV1) {
+        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        ((int)video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      if ((int)frame_num_ == forced_kf_frame_num_) {
+        aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+        int frame_flags = 0;
+        AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+                                      &frame_flags);
+        if ((frame_flags & AOM_FRAME_IS_KEY) !=
+            static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY)) {
+          is_kf_placement_violated_ = true;
+        }
+      }
+      ++frame_num_;
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  void Frame1IsKey();
+  void ForcedFrameIsKey();
+  void ForcedFrameIsKeyCornerCases();
+
+  ::libaom_test::TestMode encoding_mode_;
+  int auto_alt_ref_;
+  int fwd_kf_enabled_;
+  int cpu_used_;
+  aom_rc_mode rc_end_usage_;
+  int forced_kf_frame_num_;
+  unsigned int frame_num_;
+  bool is_kf_placement_violated_;
+};
+
+void ForcedKeyTestLarge::Frame1IsKey() {
+  const aom_rational timebase = { 1, 30 };
+  // 1st element of this 2D array is for good encoding mode and 2nd element
+  // is for RT encoding mode.
+  const int lag_values[2][NUM_LAG_VALUES] = { { 3, 15, 25 }, { 0, -1, -1 } };
+  int is_realtime = (encoding_mode_ == ::libaom_test::kRealTime);
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; i < NUM_LAG_VALUES; ++i) {
+    if (lag_values[is_realtime][i] == -1) continue;
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[is_realtime][i];
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+// This class checks the presence and placement of application
+// forced key frames.
+void ForcedKeyTestLarge::ForcedFrameIsKey() {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+
+    // Two pass and single pass CBR are currently segfaulting for the case when
+    // forced kf is placed after lag in frames.
+    // TODO(anyone): Enable(uncomment) below test once above bug is fixed.
+    //    frame_num_ = 0;
+    //    forced_kf_frame_num_ = lag_values[i] + 1;
+    //    cfg_.g_lag_in_frames = lag_values[i];
+    //    is_kf_placement_violated_ = false;
+    //    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    //    ASSERT_EQ(is_kf_placement_violated_, false)
+    //    << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+void ForcedKeyTestLarge::ForcedFrameIsKeyCornerCases() {
+  const aom_rational timebase = { 1, 30 };
+  const int kf_offsets[] = { -2, -1, 1, 2, 0 };
+  cfg_.g_lag_in_frames = 35;
+  if (encoding_mode_ == ::libaom_test::kRealTime) cfg_.g_lag_in_frames = 0;
+
+  for (int i = 0; kf_offsets[i] != 0; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = (int)cfg_.kf_max_dist + kf_offsets[i];
+    forced_kf_frame_num_ = forced_kf_frame_num_ > 0 ? forced_kf_frame_num_ : 1;
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(KeyFrameIntervalTestLarge,
+                           testing::Values(::libaom_test::kOnePassGood,
+                                           ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kfTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+TEST_P(ForcedKeyTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) { ForcedFrameIsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) {
+  ForcedFrameIsKeyCornerCases();
+}
+
+class ForcedKeyRTTestLarge : public ForcedKeyTestLarge {};
+
+TEST_P(ForcedKeyRTTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyRTTestLarge, ForcedFrameIsKeyCornerCases) {
+  ForcedFrameIsKeyCornerCases();
+}
+// TODO(anyone): Add CBR to list of rc_modes once forced kf placement after
+// lag in frames bug is fixed.
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(0, 1), ::testing::Values(0, 1),
+                           ::testing::Values(2, 5),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ));
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyRTTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(0), ::testing::Values(0),
+                           ::testing::Values(7, 9),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR));
+}  // namespace
diff --git a/media/libaom/src/test/level_test.cc b/media/libaom/src/test/level_test.cc
index a9613c5f76..a3298280dd 100644
--- a/media/libaom/src/test/level_test.cc
+++ b/media/libaom/src/test/level_test.cc
@@ -42,14 +42,10 @@ class LevelTest
   virtual ~LevelTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 5;
-      cfg_.rc_end_usage = AOM_VBR;
     } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
       cfg_.rc_buf_sz = 1000;
       cfg_.rc_buf_initial_sz = 500;
       cfg_.rc_buf_optimal_sz = 600;
@@ -80,7 +76,7 @@ class LevelTest
 };
 
 TEST_P(LevelTest, TestTargetLevelApi) {
-  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  static aom_codec_iface_t *codec = aom_codec_av1_cx();
   aom_codec_ctx_t enc;
   aom_codec_enc_cfg_t cfg;
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
@@ -108,7 +104,7 @@ TEST_P(LevelTest, TestTargetLevelApi) {
 TEST_P(LevelTest, TestTargetLevel19) {
   std::unique_ptr<libaom_test::VideoSource> video;
   video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video, nullptr);
   // Level index 19 corresponding to level 6.3.
   target_level_ = 19;
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
@@ -121,6 +117,7 @@ TEST_P(LevelTest, TestLevelMonitoringLowBitrate) {
                                        30, 1, 0, 40);
     target_level_ = kLevelKeepStats;
     cfg_.rc_target_bitrate = 1000;
+    cfg_.g_limit = 40;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_EQ(level_[0], 0);
   }
@@ -133,8 +130,9 @@ TEST_P(LevelTest, TestLevelMonitoringHighBitrate) {
                                        30, 1, 0, 40);
     target_level_ = kLevelKeepStats;
     cfg_.rc_target_bitrate = 4000;
+    cfg_.g_limit = 40;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_EQ(level_[0], 1);
+    ASSERT_EQ(level_[0], 4);
   }
 }
 
@@ -151,7 +149,19 @@ TEST_P(LevelTest, TestTargetLevel0) {
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(LevelTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+TEST_P(LevelTest, TestTargetLevelRecode) {
+  if (cpu_used_ == 4 && encoding_mode_ == ::libaom_test::kTwoPassGood) {
+    libaom_test::I420VideoSource video("rand_noise_w1280h720.yuv", 1280, 720,
+                                       25, 1, 0, 10);
+    const int target_level = 0005;
+    target_level_ = target_level;
+    cfg_.rc_target_bitrate = 5000;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(LevelTest,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::ValuesIn(kCpuUsedVectors));
 }  // namespace
diff --git a/media/libaom/src/test/lightfield_test.sh b/media/libaom/src/test/lightfield_test.sh
index 3de88af877..cf1ea73a84 100644..100755
--- a/media/libaom/src/test/lightfield_test.sh
+++ b/media/libaom/src/test/lightfield_test.sh
@@ -42,7 +42,7 @@ lightfield_test() {
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \
       "${yuv_file}" "${lf_file}" "${lf_width}" \
-      "${lf_height}" "${lf_blocksize}" ${devnull}
+      "${lf_height}" "${lf_blocksize}" ${devnull} || return 1
 
   [ -e "${lf_file}" ] || return 1
 
@@ -73,7 +73,7 @@ lightfield_test() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
-      "${num_references}" "${tl_text_file}" ${devnull}
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
 
   [ -e "${tl_file}" ] || return 1
 
@@ -86,7 +86,7 @@ lightfield_test() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
-      "${num_references}" "${num_tile_lists}" ${devnull}
+      "${num_references}" "${num_tile_lists}" ${devnull} || return 1
 
   [ -e "${tl_outfile}" ] || return 1
 
@@ -99,7 +99,7 @@ lightfield_test() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
-      "${num_references}" "${tl_text_file}" ${devnull}
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
 
   [ -e "${tl_reffile}" ] || return 1
 
diff --git a/media/libaom/src/test/loopfilter_control_test.cc b/media/libaom/src/test/loopfilter_control_test.cc
new file mode 100644
index 0000000000..5f0134045b
--- /dev/null
+++ b/media/libaom/src/test/loopfilter_control_test.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+
+// List of psnr thresholds for LF settings 0-3
+// keys: video, LF control, aq mode.
+std::unordered_map<std::string,
+                   std::unordered_map<int, std::unordered_map<int, double>>>
+    kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
+                         { { 0, { { 0, 35.0 }, { 3, 35.8 } } },
+                           { 1, { { 0, 35.1 }, { 3, 35.9 } } },
+                           { 2, { { 0, 35.1 }, { 3, 36.1 } } },
+                           { 3, { { 0, 35.1 }, { 3, 36.1 } } } } },
+                       { "paris_352_288_30.y4m",
+                         { { 0, { { 0, 35.40 }, { 3, 36.0 } } },
+                           { 1, { { 0, 35.50 }, { 3, 36.0 } } },
+                           { 2, { { 0, 35.50 }, { 3, 36.0 } } },
+                           { 3, { { 0, 35.50 }, { 3, 36.0 } } } } },
+                       { "niklas_1280_720_30.y4m",
+                         { { 0, { { 0, 33.20 }, { 3, 32.90 } } },
+                           { 1, { { 0, 33.57 }, { 3, 33.22 } } },
+                           { 2, { { 0, 33.57 }, { 3, 33.22 } } },
+                           { 3, { { 0, 33.45 }, { 3, 33.10 } } } } } };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Params: test video, lf_control, aq mode, threads, tile columns.
+class LFControlEndToEndTest
+    : public ::libaom_test::CodecTestWith5Params<TestVideoParam, int,
+                                                 unsigned int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LFControlEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        lf_control_(GET_PARAM(2)), psnr_(0.0), nframes_(0),
+        aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
+        tile_columns_(GET_PARAM(5)) {}
+
+  virtual ~LFControlEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig(::libaom_test::kRealTime);
+
+    cfg_.g_threads = threads_;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.kf_max_dist = 9999;
+    cfg_.kf_min_dist = 9999;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
+      encoder->Control(AOME_SET_CPUUSED, 10);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_ROW_MT, 1);
+      encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+      encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_LOOPFILTER_CONTROL, lf_control_);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[test_video_param_.filename][lf_control_][aq_mode_];
+  }
+
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+    ASSERT_NE(video, nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "loopfilter control = " << lf_control_ << " aq mode = " << aq_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  int lf_control_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  unsigned int aq_mode_;
+  int threads_;
+  int tile_columns_;
+};
+
+class LFControlEndToEndTestThreaded : public LFControlEndToEndTest {};
+
+TEST_P(LFControlEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(LFControlEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
+
+TEST(LFControlGetterTest, NullptrInput) {
+  int *lf_level = nullptr;
+  aom_codec_ctx_t encoder;
+  aom_codec_enc_cfg_t cfg;
+  aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 1);
+  EXPECT_EQ(aom_codec_enc_init(&encoder, aom_codec_av1_cx(), &cfg, 0),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&encoder, AOME_GET_LOOPFILTER_LEVEL, lf_level),
+            AOM_CODEC_INVALID_PARAM);
+  EXPECT_EQ(aom_codec_destroy(&encoder), AOM_CODEC_OK);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(LFControlEndToEndTest,
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(0, 4),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Values(1), ::testing::Values(1));
+
+AV1_INSTANTIATE_TEST_SUITE(LFControlEndToEndTestThreaded,
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(0, 4),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Range(2, 5), ::testing::Range(2, 5));
+}  // namespace
diff --git a/media/libaom/src/test/lossless_test.cc b/media/libaom/src/test/lossless_test.cc
index 71ae5e72be..c14bc06e5e 100644
--- a/media/libaom/src/test/lossless_test.cc
+++ b/media/libaom/src/test/lossless_test.cc
@@ -24,18 +24,20 @@ namespace {
 const int kMaxPsnr = 100;
 
 class LosslessTestLarge
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 aom_rc_mode, int>,
       public ::libaom_test::EncoderTest {
  protected:
   LosslessTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
-        encoding_mode_(GET_PARAM(1)) {}
+        encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)) {}
 
   virtual ~LosslessTestLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
+    cfg_.rc_end_usage = rc_end_usage_;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -46,6 +48,7 @@ class LosslessTestLarge
       if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
         encoder->Control(AV1E_SET_LOSSLESS, 1);
       }
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
     }
   }
 
@@ -60,10 +63,26 @@ class LosslessTestLarge
 
   double GetMinPsnr() const { return psnr_; }
 
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+                                    &base_qindex_);
+      EXPECT_EQ(base_qindex_, 0)
+          << "Error: Base_qindex is non zero for lossless coding";
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
  private:
   double psnr_;
   unsigned int nframes_;
   libaom_test::TestMode encoding_mode_;
+  aom_rc_mode rc_end_usage_;
+  int cpu_used_;
+  int base_qindex_;
 };
 
 TEST_P(LosslessTestLarge, TestLossLessEncoding) {
@@ -120,7 +139,33 @@ TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood,
-                                            ::libaom_test::kTwoPassGood));
+class LosslessAllIntraTestLarge : public LosslessTestLarge {};
+
+TEST_P(LosslessAllIntraTestLarge, TestLossLessEncodingCtrl) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  // Intentionally set Q > 0, to make sure control can be used to activate
+  // lossless
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ),
+                           ::testing::Values(0));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(LosslessAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(AOM_Q),
+                           ::testing::Values(6, 9));  // cpu_used
 }  // namespace
diff --git a/media/libaom/src/test/lpf_test.cc b/media/libaom/src/test/lpf_test.cc
index e8eeceb7c3..6960fd3e69 100644
--- a/media/libaom/src/test/lpf_test.cc
+++ b/media/libaom/src/test/lpf_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/av1_loopfilter.h"
@@ -136,7 +135,7 @@ class LoopTestParam : public ::testing::TestWithParam<params_t> {
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   int bit_depth_;
@@ -161,55 +160,64 @@ void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
 void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
   (void)bd;
   op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
-};
+}
 
 #if CONFIG_AV1_HIGHBITDEPTH
 typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
 typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
     Loop8Test9Param_hbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
 #endif
 typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_lbd);
 typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_lbd);
 
 #define OPCHECK(a, b)                                                          \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  const int32_t p = kNumCoeffs / 32;                                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i);                      \
-    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
-                ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
-                                         thresh, bit_depth_, loopfilter_op_)); \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
+  do {                                                                         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+    const int count_test_block = number_of_iterations;                         \
+    const int32_t p = kNumCoeffs / 32;                                         \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
+    DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                  \
+    int err_count_total = 0;                                                   \
+    int first_failure = -1;                                                    \
+    for (int i = 0; i < count_test_block; ++i) {                               \
+      int err_count = 0;                                                       \
+      uint8_t tmp = GetOuterThresh(&rnd);                                      \
+      DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t,                                       \
+                      limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i);                    \
+      call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,     \
+                  ref_loopfilter_op_);                                         \
+      API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,    \
+                                           thresh, bit_depth_,                 \
+                                           loopfilter_op_));                   \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        err_count += ref_s[j] != s[j];                                         \
+      }                                                                        \
+      if (err_count && !err_count_total) {                                     \
+        first_failure = i;                                                     \
+      }                                                                        \
+      err_count_total += err_count;                                            \
     }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
+    EXPECT_EQ(0, err_count_total)                                              \
+        << "Error: Loop8Test6Param, C output doesn't match SIMD "              \
+           "loopfilter output. "                                               \
+        << "First failed at test case " << first_failure;                      \
+  } while (false)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
@@ -217,47 +225,52 @@ TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
 TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
 
 #define VALCHECK(a, b)                                                         \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    int32_t p = kNumCoeffs / 32;                                               \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      s[j] = rnd.Rand16() & mask_;                                             \
-      ref_s[j] = s[j];                                                         \
-    }                                                                          \
-    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
-                ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
-                                         thresh, bit_depth_, loopfilter_op_)); \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
+  do {                                                                         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+    const int count_test_block = number_of_iterations;                         \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
+    DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                  \
+    int err_count_total = 0;                                                   \
+    int first_failure = -1;                                                    \
+    for (int i = 0; i < count_test_block; ++i) {                               \
+      int err_count = 0;                                                       \
+      uint8_t tmp = GetOuterThresh(&rnd);                                      \
+      DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t,                                       \
+                      limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      int32_t p = kNumCoeffs / 32;                                             \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        s[j] = rnd.Rand16() & mask_;                                           \
+        ref_s[j] = s[j];                                                       \
+      }                                                                        \
+      call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,     \
+                  ref_loopfilter_op_);                                         \
+      API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,    \
+                                           thresh, bit_depth_,                 \
+                                           loopfilter_op_));                   \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        err_count += ref_s[j] != s[j];                                         \
+      }                                                                        \
+      if (err_count && !err_count_total) {                                     \
+        first_failure = i;                                                     \
+      }                                                                        \
+      err_count_total += err_count;                                            \
     }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
+    EXPECT_EQ(0, err_count_total)                                              \
+        << "Error: Loop8Test6Param, C output doesn't match SIMD "              \
+           "loopfilter output. "                                               \
+        << "First failed at test case " << first_failure;                      \
+  } while (false)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
@@ -265,29 +278,32 @@ TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
 TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
 
 #define SPEEDCHECK(a, b)                                                      \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                              \
-  const int count_test_block = kSpeedTestNum;                                 \
-  const int32_t bd = bit_depth_;                                              \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                       \
-  uint8_t tmp = GetOuterThresh(&rnd);                                         \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-  tmp = GetInnerThresh(&rnd);                                                 \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,      \
-                                 tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };    \
-  tmp = GetHevThresh(&rnd);                                                   \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-  int32_t p = kNumCoeffs / 32;                                                \
-  for (int j = 0; j < kNumCoeffs; ++j) {                                      \
-    s[j] = rnd.Rand16() & mask_;                                              \
-  }                                                                           \
-  for (int i = 0; i < count_test_block; ++i) {                                \
-    call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
-  }
+  do {                                                                        \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                            \
+    const int count_test_block = kSpeedTestNum;                               \
+    const int32_t bd = bit_depth_;                                            \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                     \
+    uint8_t tmp = GetOuterThresh(&rnd);                                       \
+    DECLARE_ALIGNED(16, const uint8_t,                                        \
+                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                               \
+    DECLARE_ALIGNED(16, const uint8_t,                                        \
+                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetHevThresh(&rnd);                                                 \
+    DECLARE_ALIGNED(16, const uint8_t,                                        \
+                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    int32_t p = kNumCoeffs / 32;                                              \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                    \
+      s[j] = rnd.Rand16() & mask_;                                            \
+    }                                                                         \
+    for (int i = 0; i < count_test_block; ++i) {                              \
+      call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd,                \
+                  loopfilter_op_);                                            \
+    }                                                                         \
+  } while (false)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
@@ -295,58 +311,66 @@ TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
 TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
 
 #define OPCHECKd(a, b)                                                         \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetOuterThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    int32_t p = kNumCoeffs / 32;                                               \
-    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;               \
-    InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i);                       \
-    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
-        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
+  do {                                                                         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+    const int count_test_block = number_of_iterations;                         \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
+    DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                  \
+    int err_count_total = 0;                                                   \
+    int first_failure = -1;                                                    \
+    for (int i = 0; i < count_test_block; ++i) {                               \
+      int err_count = 0;                                                       \
+      uint8_t tmp = GetOuterThresh(&rnd);                                      \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetOuterThresh(&rnd);                                              \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      int32_t p = kNumCoeffs / 32;                                             \
+      const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;             \
+      InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i);                     \
+      call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+                      limit1, thresh1, bit_depth_, ref_loopfilter_op_);        \
+      API_REGISTER_STATE_CHECK(                                                \
+          call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+                          limit1, thresh1, bit_depth_, loopfilter_op_));       \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        err_count += ref_s[j] != s[j];                                         \
+      }                                                                        \
+      if (err_count && !err_count_total) {                                     \
+        first_failure = i;                                                     \
+      }                                                                        \
+      err_count_total += err_count;                                            \
     }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
+    EXPECT_EQ(0, err_count_total)                                              \
+        << "Error: Loop8Test9Param, C output doesn't match SIMD "              \
+           "loopfilter output. "                                               \
+        << "First failed at test case " << first_failure;                      \
+  } while (false)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
@@ -354,14 +378,79 @@ TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
 TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
 
 #define VALCHECKd(a, b)                                                        \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
+  do {                                                                         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+    const int count_test_block = number_of_iterations;                         \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
+    DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                  \
+    int err_count_total = 0;                                                   \
+    int first_failure = -1;                                                    \
+    for (int i = 0; i < count_test_block; ++i) {                               \
+      int err_count = 0;                                                       \
+      uint8_t tmp = GetOuterThresh(&rnd);                                      \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetOuterThresh(&rnd);                                              \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      tmp = GetInnerThresh(&rnd);                                              \
+      DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp,   \
+                                                         tmp, tmp, tmp, tmp }; \
+      tmp = GetHevThresh(&rnd);                                                \
+      DECLARE_ALIGNED(                                                         \
+          16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                              tmp, tmp, tmp, tmp };            \
+      int32_t p = kNumCoeffs / 32;                                             \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        s[j] = rnd.Rand16() & mask_;                                           \
+        ref_s[j] = s[j];                                                       \
+      }                                                                        \
+      call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+                      limit1, thresh1, bit_depth_, ref_loopfilter_op_);        \
+      API_REGISTER_STATE_CHECK(                                                \
+          call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+                          limit1, thresh1, bit_depth_, loopfilter_op_));       \
+      for (int j = 0; j < kNumCoeffs; ++j) {                                   \
+        err_count += ref_s[j] != s[j];                                         \
+      }                                                                        \
+      if (err_count && !err_count_total) {                                     \
+        first_failure = i;                                                     \
+      }                                                                        \
+      err_count_total += err_count;                                            \
+    }                                                                          \
+    EXPECT_EQ(0, err_count_total)                                              \
+        << "Error: Loop8Test9Param, C output doesn't match SIMD "              \
+           "loopfilter output. "                                               \
+        << "First failed at test case " << first_failure;                      \
+  } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
+
+#define SPEEDCHECKd(a, b)                                                      \
+  do {                                                                         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+    const int count_test_block = kSpeedTestNum;                                \
+    DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
     uint8_t tmp = GetOuterThresh(&rnd);                                        \
     DECLARE_ALIGNED(16, const uint8_t,                                         \
                     blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
@@ -389,67 +478,12 @@ TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
     int32_t p = kNumCoeffs / 32;                                               \
     for (int j = 0; j < kNumCoeffs; ++j) {                                     \
       s[j] = rnd.Rand16() & mask_;                                             \
-      ref_s[j] = s[j];                                                         \
-    }                                                                          \
-    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
-        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
     }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
+    for (int i = 0; i < count_test_block; ++i) {                               \
+      call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,     \
+                      limit1, thresh1, bit_depth_, loopfilter_op_);            \
     }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
-#endif
-TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
-
-#define SPEEDCHECKd(a, b)                                                    \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
-  const int count_test_block = kSpeedTestNum;                                \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
-  uint8_t tmp = GetOuterThresh(&rnd);                                        \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetInnerThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-  tmp = GetHevThresh(&rnd);                                                  \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetOuterThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetInnerThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-  tmp = GetHevThresh(&rnd);                                                  \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  int32_t p = kNumCoeffs / 32;                                               \
-  for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-    s[j] = rnd.Rand16() & mask_;                                             \
-  }                                                                          \
-  for (int i = 0; i < count_test_block; ++i) {                               \
-    call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,     \
-                    limit1, thresh1, bit_depth_, loopfilter_op_);            \
-  }
+  } while (false)
 
 #if CONFIG_AV1_HIGHBITDEPTH
 TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
@@ -516,6 +550,15 @@ const loop_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
   make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
   make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
+  make_tuple(&aom_lpf_horizontal_4_quad_sse2, &aom_lpf_horizontal_4_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_4_quad_sse2, &aom_lpf_vertical_4_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_quad_sse2, &aom_lpf_horizontal_6_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_6_quad_sse2, &aom_lpf_vertical_6_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_quad_sse2, &aom_lpf_horizontal_8_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_8_quad_sse2, &aom_lpf_vertical_8_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_quad_sse2, &aom_lpf_horizontal_14_quad_c,
+             8),
+  make_tuple(&aom_lpf_vertical_14_quad_sse2, &aom_lpf_vertical_14_quad_c, 8)
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd,
@@ -538,6 +581,18 @@ INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd,
 
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+const loop_param_t kLoop8Test6Avx2[] = {
+  make_tuple(&aom_lpf_horizontal_6_quad_avx2, &aom_lpf_horizontal_6_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_quad_avx2, &aom_lpf_horizontal_8_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_quad_avx2, &aom_lpf_horizontal_14_quad_c,
+             8),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test6Param_lbd,
+                         ::testing::ValuesIn(kLoop8Test6Avx2));
+#endif
+
 #if HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH
 const hbddual_loop_param_t kHbdLoop8Test9[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
@@ -604,11 +659,136 @@ const loop_param_t kLoop8Test6[] = {
   make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
   make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
   make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
-  make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
+  make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8),
+  make_tuple(&aom_lpf_horizontal_4_quad_neon, &aom_lpf_horizontal_4_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_4_quad_neon, &aom_lpf_vertical_4_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_quad_neon, &aom_lpf_horizontal_6_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_6_quad_neon, &aom_lpf_vertical_6_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_quad_neon, &aom_lpf_horizontal_8_quad_c, 8),
+  make_tuple(&aom_lpf_vertical_8_quad_neon, &aom_lpf_vertical_8_quad_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_quad_neon, &aom_lpf_horizontal_14_quad_c,
+             8),
+  make_tuple(&aom_lpf_vertical_14_quad_neon, &aom_lpf_vertical_14_quad_c, 8)
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd,
                          ::testing::ValuesIn(kLoop8Test6));
+
+const dual_loop_param_t kLoop8Test9[] = {
+  make_tuple(&aom_lpf_horizontal_4_dual_neon, &aom_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_dual_neon, &aom_lpf_horizontal_6_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_dual_neon, &aom_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_dual_neon, &aom_lpf_horizontal_14_dual_c,
+             8),
+  make_tuple(&aom_lpf_vertical_4_dual_neon, &aom_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_6_dual_neon, &aom_lpf_vertical_6_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_8_dual_neon, &aom_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_14_dual_neon, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test9Param_lbd,
+                         ::testing::ValuesIn(kLoop8Test9));
+#if CONFIG_AV1_HIGHBITDEPTH
+const hbdloop_param_t kHbdLoop8Test6[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+             12),
+  make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+             12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+             12),
+  make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+             &aom_highbd_lpf_horizontal_14_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+             &aom_highbd_lpf_horizontal_14_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+             &aom_highbd_lpf_horizontal_14_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+             8),
+  make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+             10),
+  make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+             12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_hbd,
+                         ::testing::ValuesIn(kHbdLoop8Test6));
+
+const hbddual_loop_param_t kHbdLoop8Test9[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+             &aom_highbd_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+             &aom_highbd_lpf_horizontal_6_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+             &aom_highbd_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+             &aom_highbd_lpf_horizontal_14_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+             &aom_highbd_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+             &aom_highbd_lpf_vertical_6_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+             &aom_highbd_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+             &aom_highbd_lpf_vertical_14_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+             &aom_highbd_lpf_horizontal_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+             &aom_highbd_lpf_horizontal_6_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+             &aom_highbd_lpf_horizontal_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+             &aom_highbd_lpf_horizontal_14_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+             &aom_highbd_lpf_vertical_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+             &aom_highbd_lpf_vertical_6_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+             &aom_highbd_lpf_vertical_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+             &aom_highbd_lpf_vertical_14_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+             &aom_highbd_lpf_horizontal_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+             &aom_highbd_lpf_horizontal_6_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+             &aom_highbd_lpf_horizontal_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+             &aom_highbd_lpf_horizontal_14_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+             &aom_highbd_lpf_vertical_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+             &aom_highbd_lpf_vertical_6_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+             &aom_highbd_lpf_vertical_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+             &aom_highbd_lpf_vertical_14_dual_c, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test9Param_hbd,
+                         ::testing::ValuesIn(kHbdLoop8Test9));
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
 #if HAVE_AVX2 && CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/test/masked_sad_test.cc b/media/libaom/src/test/masked_sad_test.cc
index aa4dd83410..91f7982dba 100644
--- a/media/libaom/src/test/masked_sad_test.cc
+++ b/media/libaom/src/test/masked_sad_test.cc
@@ -15,7 +15,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -59,7 +58,7 @@ class MaskedSADTestBase : public ::testing::Test {
                        int msk_stride, int inv_mask, unsigned sads[],
                        int times) = 0;
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void runMaskedSADTest(int run_times);
 };
 
@@ -86,6 +85,7 @@ class MaskedSADTest : public MaskedSADTestBase,
   MaskedSADFunc maskedSAD_op_;
   MaskedSADFunc ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest);
 
 class MaskedSADx4Test : public MaskedSADTestBase,
                         public ::testing::WithParamInterface<MaskedSADx4Param> {
@@ -109,6 +109,7 @@ class MaskedSADx4Test : public MaskedSADTestBase,
   MaskedSADx4Func maskedSAD_op_;
   MaskedSADx4Func ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADx4Test);
 
 void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr[], int ref_stride,
@@ -131,7 +132,7 @@ void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride,
                             second_pred, msk, msk_stride, invert_mask);
   } else {
     for (int repeat = 0; repeat < times; ++repeat) {
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
                                   second_pred, msk, msk_stride, invert_mask));
     }
@@ -155,7 +156,7 @@ void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride,
                               int msk_stride, int invert_mask, unsigned sads[],
                               int times) {
   if (times == 1) {
-    ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+    API_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
                                            ref_stride, second_pred, msk,
                                            msk_stride, invert_mask, sads));
   } else {
@@ -252,13 +253,15 @@ class HighbdMaskedSADTest
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void runHighbdMaskedSADTest(int run_times);
 
  protected:
   HighbdMaskedSADFunc maskedSAD_op_;
   HighbdMaskedSADFunc ref_maskedSAD_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSADTest);
+
 void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {
   unsigned int ref_ret = 0, ret = 1;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -295,7 +298,7 @@ void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {
       const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
       aom_usec_timer_start(&timer);
       if (run_times == 1) {
-        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+        API_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
                                                      ref8_ptr, ref_stride,
                                                      second_pred8_ptr, msk_ptr,
                                                      msk_stride, invert_mask));
@@ -348,12 +351,14 @@ const MaskedSADParam msad_test[] = {
   make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
   make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
   make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
   make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
   make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
   make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
   make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
   make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
@@ -375,12 +380,14 @@ const MaskedSADx4Param msadx4_test[] = {
   make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c),
   make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c),
   make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c),
   make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c),
   make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c),
   make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c),
   make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c),
   make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test,
@@ -407,12 +414,14 @@ const HighbdMaskedSADParam hbd_msad_test[] = {
              &aom_highbd_masked_sad128x64_c),
   make_tuple(&aom_highbd_masked_sad128x128_ssse3,
              &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c),
   make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c),
   make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c),
   make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c),
   make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c),
   make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest,
@@ -438,12 +447,14 @@ const MaskedSADParam msad_avx2_test[] = {
   make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
   make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
   make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
   make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
   make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
   make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
   make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
   make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest,
@@ -477,6 +488,7 @@ const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
              &aom_highbd_masked_sad128x64_ssse3),
   make_tuple(&aom_highbd_masked_sad128x128_avx2,
              &aom_highbd_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3),
   make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3),
   make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3),
@@ -485,6 +497,7 @@ const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
              &aom_highbd_masked_sad16x64_ssse3),
   make_tuple(&aom_highbd_masked_sad64x16_avx2,
              &aom_highbd_masked_sad64x16_ssse3)
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest,
diff --git a/media/libaom/src/test/masked_variance_test.cc b/media/libaom/src/test/masked_variance_test.cc
index bf814cea2b..4a4cb1a39b 100644
--- a/media/libaom/src/test/masked_variance_test.cc
+++ b/media/libaom/src/test/masked_variance_test.cc
@@ -16,7 +16,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -50,12 +49,13 @@ class MaskedSubPixelVarianceTest
     ref_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
   MaskedSubPixelVarianceFunc ref_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSubPixelVarianceTest);
 
 TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
   unsigned int ref_ret, opt_ret;
@@ -93,7 +93,7 @@ TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
           ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
                               ref_stride, second_pred_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
                                   ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
                                   msk_stride, invert_mask, &opt_sse));
@@ -146,7 +146,7 @@ TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
           ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
                               ref_stride, second_pred_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
                                   ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
                                   msk_stride, invert_mask, &opt_sse));
@@ -186,13 +186,14 @@ class HighbdMaskedSubPixelVarianceTest
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
   MaskedSubPixelVarianceFunc ref_func_;
   aom_bit_depth_t bit_depth_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSubPixelVarianceTest);
 
 TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
   unsigned int ref_ret, opt_ret;
@@ -228,7 +229,7 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
           ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
                               ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
                                   ref8_ptr, ref_stride, second_pred8_ptr,
                                   msk_ptr, msk_stride, invert_mask, &opt_sse));
@@ -289,7 +290,7 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
           ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
                               ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
                                   ref8_ptr, ref_stride, second_pred8_ptr,
                                   msk_ptr, msk_stride, invert_mask, &opt_sse));
@@ -352,7 +353,7 @@ const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
              &aom_masked_sub_pixel_variance4x8_c),
   make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
              &aom_masked_sub_pixel_variance4x4_c),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3,
              &aom_masked_sub_pixel_variance64x16_c),
   make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3,
@@ -365,6 +366,7 @@ const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
              &aom_masked_sub_pixel_variance16x4_c),
   make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3,
              &aom_masked_sub_pixel_variance4x16_c),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
@@ -468,7 +470,7 @@ const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
              &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3,
@@ -505,6 +507,7 @@ const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
              &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
diff --git a/media/libaom/src/test/metadata_test.cc b/media/libaom/src/test/metadata_test.cc
index 79e08a7a57..7143294999 100644
--- a/media/libaom/src/test/metadata_test.cc
+++ b/media/libaom/src/test/metadata_test.cc
@@ -34,8 +34,6 @@ const size_t kMetadataPayloadSizeCll = 4;
 const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
                                                                0x03 };
 
-#if CONFIG_AV1_ENCODER
-
 const size_t kMetadataObuSizeT35 = 28;
 const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
   0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
@@ -60,10 +58,7 @@ class MetadataEncodeTest
 
   virtual ~MetadataEncodeTest() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-  }
+  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video) {
     aom_image_t *current_frame = video->img();
@@ -193,10 +188,9 @@ TEST_P(MetadataEncodeTest, TestMetadataEncoding) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(MetadataEncodeTest,
-                          ::testing::Values(::libaom_test::kOnePassGood));
+AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest,
+                           ::testing::Values(::libaom_test::kOnePassGood));
 
-#endif  // CONFIG_AV1_ENCODER
 }  // namespace
 
 TEST(MetadataTest, MetadataAllocation) {
@@ -294,7 +288,7 @@ TEST(MetadataTest, GetMetadataFromImage) {
   EXPECT_TRUE(aom_img_get_metadata(&image, 10u) == NULL);
 
   const aom_metadata_t *metadata = aom_img_get_metadata(&image, 0);
-  ASSERT_TRUE(metadata != NULL);
+  ASSERT_NE(metadata, nullptr);
   ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
   EXPECT_EQ(
       memcmp(kMetadataPayloadT35, metadata->payload, kMetadataPayloadSizeT35),
@@ -326,7 +320,7 @@ TEST(MetadataTest, ReadMetadatasFromImage) {
   ASSERT_EQ(number_metadata, 3u);
   for (size_t i = 0; i < number_metadata; ++i) {
     const aom_metadata_t *metadata = aom_img_get_metadata(&image, i);
-    ASSERT_TRUE(metadata != NULL);
+    ASSERT_NE(metadata, nullptr);
     ASSERT_EQ(metadata->type, types[i]);
     ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
     EXPECT_EQ(
diff --git a/media/libaom/src/test/mock_ratectrl_qmode.h b/media/libaom/src/test/mock_ratectrl_qmode.h
new file mode 100644
index 0000000000..f0f2e97af8
--- /dev/null
+++ b/media/libaom/src/test/mock_ratectrl_qmode.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_MOCK_RATECTRL_QMODE_H_
+#define AOM_TEST_MOCK_RATECTRL_QMODE_H_
+
+#include "av1/ratectrl_qmode_interface.h"
+#include "third_party/googletest/src/googlemock/include/gmock/gmock.h"
+
+namespace aom {
+
+class MockRateControlQMode : public AV1RateControlQModeInterface {
+ public:
+  MOCK_METHOD(void, SetRcParam, (const RateControlParam &rc_param), (override));
+  MOCK_METHOD(GopStructList, DetermineGopInfo,
+              (const FirstpassInfo &firstpass_info), (override));
+  MOCK_METHOD(GopEncodeInfo, GetGopEncodeInfo,
+              (const GopStruct &gop_struct, const TplGopStats &tpl_gop_stats,
+               const RefFrameTable &ref_frame_table_snapshot_init),
+              (override));
+};
+
+}  // namespace aom
+
+#endif  // AOM_TEST_MOCK_RATECTRL_QMODE_H_
diff --git a/media/libaom/src/test/monochrome_test.cc b/media/libaom/src/test/monochrome_test.cc
index ebccba5842..a71cc9b3df 100644
--- a/media/libaom/src/test/monochrome_test.cc
+++ b/media/libaom/src/test/monochrome_test.cc
@@ -20,17 +20,43 @@
 
 namespace {
 
+const unsigned int kCqLevel = 18;
+const double kMaxPsnr = 100.0;
+
+// kPsnrThreshold represents the psnr threshold used to validate the quality of
+// the first frame. The indices, 0 and 1 correspond to non-allintra and allintra
+// encoding modes.
+const double kPsnrThreshold[2] = { 29.0, 41.5 };
+
+// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first
+// frame. The indices, 0 and 1 correspond to non-allintra and allintra encoding
+// modes.
+const double kPsnrFluctuation[2] = { 2.5, 0.3 };
+
 class MonochromeTest
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
       public ::libaom_test::EncoderTest {
  protected:
-  MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+  MonochromeTest()
+      : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)),
+        frame0_psnr_y_(0.0) {}
 
   virtual ~MonochromeTest() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3));
+      if (mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      }
+      if (lossless_) {
+        encoder->Control(AV1E_SET_LOSSLESS, 1);
+      }
+    }
   }
 
   virtual void DecompressedFrameHook(const aom_image_t &img,
@@ -71,15 +97,23 @@ class MonochromeTest
   }
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Check average PSNR value is >= 100 db in case of lossless encoding.
+    if (lossless_) {
+      EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
+      return;
+    }
+    const bool is_allintra = (mode_ == ::libaom_test::kAllIntra);
     // Check that the initial Y PSNR value is 'high enough', and check that
     // subsequent Y PSNR values are 'close' to this initial value.
-    if (frame0_psnr_y_ == 0.) {
+    if (frame0_psnr_y_ == 0.0) {
       frame0_psnr_y_ = pkt->data.psnr.psnr[1];
-      EXPECT_GT(frame0_psnr_y_, 29.);
+      EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[is_allintra]);
     }
-    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_,
+                kPsnrFluctuation[is_allintra]);
   }
 
+  int lossless_;
   std::vector<int> chroma_value_list_;
   double frame0_psnr_y_;
 };
@@ -90,9 +124,6 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) {
 
   init_flags_ = AOM_CODEC_USE_PSNR;
 
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
-
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 600;
   cfg_.rc_buf_sz = 1000;
@@ -101,13 +132,10 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) {
   cfg_.rc_undershoot_pct = 50;
   cfg_.rc_overshoot_pct = 50;
   cfg_.rc_end_usage = AOM_CBR;
-  cfg_.kf_mode = AOM_KF_AUTO;
   cfg_.g_lag_in_frames = 1;
   cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
   // Enable dropped frames.
   cfg_.rc_dropframe_thresh = 1;
-  // Disable error_resilience mode.
-  cfg_.g_error_resilient = 0;
   // Run at low bitrate.
   cfg_.rc_target_bitrate = 40;
   // Set monochrome encoding flag
@@ -124,7 +152,33 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) {
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+class MonochromeAllIntraTest : public MonochromeTest {};
+
+TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames
+  std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+  int initial_chroma_value = *iter;
+  for (; iter != chroma_value_list_.end(); ++iter) {
+    // Check that all decoded frames have the same constant chroma planes.
+    EXPECT_EQ(*iter, initial_chroma_value);
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(MonochromeTest,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(0),   // lossless
+                           ::testing::Values(0));  // cpu_used
 
+AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(0, 1),   // lossless
+                           ::testing::Values(6, 9));  // cpu_used
 }  // namespace
diff --git a/media/libaom/src/test/motion_vector_test.cc b/media/libaom/src/test/motion_vector_test.cc
index 2636c39aa2..bf10edefa9 100644
--- a/media/libaom/src/test/motion_vector_test.cc
+++ b/media/libaom/src/test/motion_vector_test.cc
@@ -46,14 +46,10 @@ class MotionVectorTestLarge
   virtual ~MotionVectorTestLarge() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
+    InitializeConfig(encoding_mode_);
     if (encoding_mode_ != ::libaom_test::kRealTime) {
       cfg_.g_lag_in_frames = 3;
-      cfg_.rc_end_usage = AOM_VBR;
     } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
       cfg_.rc_buf_sz = 1000;
       cfg_.rc_buf_initial_sz = 500;
       cfg_.rc_buf_optimal_sz = 600;
@@ -96,12 +92,12 @@ TEST_P(MotionVectorTestLarge, OverallTest) {
   video.reset(new libaom_test::YUVVideoSource(
       "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, width, height, 30, 1, 0, 3));
 
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video, nullptr);
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
-AV1_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
-                          ::testing::ValuesIn(kEncodingModeVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors),
-                          ::testing::ValuesIn(kMVTestModes));
+AV1_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors),
+                           ::testing::ValuesIn(kMVTestModes));
 }  // namespace
diff --git a/media/libaom/src/test/noise_model_test.cc b/media/libaom/src/test/noise_model_test.cc
index 5b61236f0b..e9cf9e2f7e 100644
--- a/media/libaom/src/test/noise_model_test.cc
+++ b/media/libaom/src/test/noise_model_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <limits.h>
 #include <math.h>
 #include <algorithm>
 #include <vector>
@@ -77,8 +78,16 @@ std::vector<float> get_noise_psd(double *noise, int width, int height,
   float *block =
       (float *)aom_memalign(32, block_size * block_size * sizeof(block));
   std::vector<float> psd(block_size * block_size);
+  if (block == nullptr) {
+    EXPECT_NE(block, nullptr);
+    return psd;
+  }
   int num_blocks = 0;
   struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  if (tx == nullptr) {
+    EXPECT_NE(tx, nullptr);
+    return psd;
+  }
   for (int y = 0; y <= height - block_size; y += block_size / 2) {
     for (int x = 0; x <= width - block_size; x += block_size / 2) {
       for (int yy = 0; yy < block_size; ++yy) {
@@ -145,7 +154,7 @@ TEST(NoiseStrengthSolver, GetCenters256Bins) {
 TEST(NoiseStrengthSolver, ObserveIdentity) {
   const int num_bins = 256;
   aom_noise_strength_solver_t solver;
-  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+  ASSERT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
 
   // We have to add a big more strength to constraints at the boundary to
   // overcome any regularization.
@@ -212,6 +221,12 @@ TEST(NoiseStrengthSolver, SimplifiesCurve) {
   aom_noise_strength_solver_free(&solver);
 }
 
+TEST(NoiseStrengthLut, LutInitNegativeOrZeroSize) {
+  aom_noise_strength_lut_t lut;
+  ASSERT_FALSE(aom_noise_strength_lut_init(&lut, -1));
+  ASSERT_FALSE(aom_noise_strength_lut_init(&lut, 0));
+}
+
 TEST(NoiseStrengthLut, LutEvalSinglePoint) {
   aom_noise_strength_lut_t lut;
   ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
@@ -323,6 +338,22 @@ TEST(NoiseModel, InitFailsWithInvalidShape) {
   aom_noise_model_free(&model);
 }
 
+TEST(NoiseModel, InitFailsWithInvalidBitdepth) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  for (int i = 0; i <= 32; ++i) {
+    params.bit_depth = i;
+    if (i == 8 || i == 10 || i == 12) {
+      EXPECT_TRUE(aom_noise_model_init(&model, params)) << "bit_depth: " << i;
+      aom_noise_model_free(&model);
+    } else {
+      EXPECT_FALSE(aom_noise_model_init(&model, params)) << "bit_depth: " << i;
+    }
+  }
+  params.bit_depth = INT_MAX;
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+}
+
 // A container template class to hold a data type and extra arguments.
 // All of these args are bundled into one struct so that we can use
 // parameterized tests on combinations of supported data types
@@ -1153,7 +1184,7 @@ TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
 template <typename T>
 class WienerDenoiseTest : public ::testing::Test, public T {
  public:
-  static void SetUpTestCase() { aom_dsp_rtcd(); }
+  static void SetUpTestSuite() { aom_dsp_rtcd(); }
 
  protected:
   void SetUp() {
diff --git a/media/libaom/src/test/obmc_sad_test.cc b/media/libaom/src/test/obmc_sad_test.cc
index 6b4382cd7f..9b70366440 100644
--- a/media/libaom/src/test/obmc_sad_test.cc
+++ b/media/libaom/src/test/obmc_sad_test.cc
@@ -37,6 +37,7 @@ typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadTest);
 
 TEST_P(ObmcSadTest, RandomValues) {
   DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
@@ -54,7 +55,7 @@ TEST_P(ObmcSadTest, RandomValues) {
 
     const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
+    API_REGISTER_STATE_CHECK(tst_res =
                                  params_.tst_func(pre, pre_stride, wsrc, mask));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -77,7 +78,7 @@ TEST_P(ObmcSadTest, ExtremeValues) {
 
     const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
+    API_REGISTER_STATE_CHECK(tst_res =
                                  params_.tst_func(pre, pre_stride, wsrc, mask));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -152,6 +153,7 @@ INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadHBDTest);
 
 TEST_P(ObmcSadHBDTest, RandomValues) {
   DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
@@ -170,7 +172,7 @@ TEST_P(ObmcSadHBDTest, RandomValues) {
     const unsigned int ref_res =
         params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res =
             params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
 
@@ -195,7 +197,7 @@ TEST_P(ObmcSadHBDTest, ExtremeValues) {
     const unsigned int ref_res =
         params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res =
             params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
 
diff --git a/media/libaom/src/test/obmc_variance_test.cc b/media/libaom/src/test/obmc_variance_test.cc
index fc281d70b9..03b38f706a 100644
--- a/media/libaom/src/test/obmc_variance_test.cc
+++ b/media/libaom/src/test/obmc_variance_test.cc
@@ -40,6 +40,7 @@ typedef libaom_test::FuncParam<ObmcVarF> TestFuncs;
 ////////////////////////////////////////////////////////////////////////////////
 
 class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceTest);
 
 TEST_P(ObmcVarianceTest, RandomValues) {
   DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
@@ -59,7 +60,7 @@ TEST_P(ObmcVarianceTest, RandomValues) {
     const unsigned int ref_res =
         params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -85,7 +86,7 @@ TEST_P(ObmcVarianceTest, ExtremeValues) {
     const unsigned int ref_res =
         params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -197,6 +198,7 @@ INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest,
 ////////////////////////////////////////////////////////////////////////////////
 #if CONFIG_AV1_HIGHBITDEPTH
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceHBDTest);
 
 TEST_P(ObmcVarianceHBDTest, RandomValues) {
   DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
@@ -217,7 +219,7 @@ TEST_P(ObmcVarianceHBDTest, RandomValues) {
     const unsigned int ref_res = params_.ref_func(
         CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
                                                         pre_stride, wsrc, mask,
                                                         &tst_sse));
 
@@ -244,7 +246,7 @@ TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
     const unsigned int ref_res = params_.ref_func(
         CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
                                                         pre_stride, wsrc, mask,
                                                         &tst_sse));
 
diff --git a/media/libaom/src/test/pickrst_test.cc b/media/libaom/src/test/pickrst_test.cc
index 9a2c5bcd40..131e1dd5c7 100644
--- a/media/libaom/src/test/pickrst_test.cc
+++ b/media/libaom/src/test/pickrst_test.cc
@@ -75,6 +75,7 @@ class PixelProjErrorTest
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjErrorTest);
 
 void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) {
   int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
@@ -188,6 +189,12 @@ INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest,
                          ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, PixelProjErrorTest,
+                         ::testing::Values(av1_lowbd_pixel_proj_error_neon));
+#endif  // HAVE_NEON
+
 }  // namespace pickrst_test_lowbd
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -240,6 +247,7 @@ class PixelProjHighbdErrorTest
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjHighbdErrorTest);
 
 void PixelProjHighbdErrorTest::RunPixelProjErrorTest(int32_t run_times) {
   int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
@@ -356,6 +364,7 @@ INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjHighbdErrorTest,
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_highbd
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 ////////////////////////////////////////////////////////////////////////////////
 // Get_proj_subspace_Test
@@ -409,6 +418,7 @@ class GetProjSubspaceTest
   int32_t *flt0_;
   int32_t *flt1_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTest);
 
 void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
   int h_end = run_times != 1
@@ -524,6 +534,12 @@ TEST_P(GetProjSubspaceTest, ExtremeValues) {
 
 TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); }
 
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, GetProjSubspaceTest,
+                         ::testing::Values(av1_calc_proj_params_sse4_1));
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
 
 INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
@@ -531,4 +547,187 @@ INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
 #endif  // HAVE_AVX2
 
 }  // namespace get_proj_subspace_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace get_proj_subspace_test_hbd {
+static const int kIterations = 100;
+
+typedef void (*set_get_proj_subspace_hbd)(const uint8_t *src8, int width,
+                                          int height, int src_stride,
+                                          const uint8_t *dat8, int dat_stride,
+                                          int32_t *flt0, int flt0_stride,
+                                          int32_t *flt1, int flt1_stride,
+                                          int64_t H[2][2], int64_t C[2],
+                                          const sgr_params_type *params);
+
+typedef std::tuple<const set_get_proj_subspace_hbd> GetProjSubspaceHBDTestParam;
+
+class GetProjSubspaceTestHBD
+    : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
+ public:
+  virtual void SetUp() {
+    target_func_ = GET_PARAM(0);
+    src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTestHBD(int32_t run_times);
+  void RunGetProjSubspaceTestHBD_ExtremeValues();
+
+ private:
+  set_get_proj_subspace_hbd target_func_;
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_;
+  uint16_t *dgd_;
+  int32_t *flt0_;
+  int32_t *flt1_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTestHBD);
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
+  int h_end = run_times != 1
+                  ? 128
+                  : ((rng_.Rand16() % MAX_DATA_BLOCK) &
+                     2147483640);  // We test for widths divisible by 8.
+  int v_end =
+      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = run_times == 1 ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand16() % 4095;
+      src_[i] = rng_.Rand16() % 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_high_bd_c(src, v_end, h_end, src_stride, dgd,
+                                     dgd_stride, flt0_, flt0_stride, flt1_,
+                                     flt1_stride, H_ref, C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_,
+                   flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
+  const int h_start = 0;
+  int h_end = MAX_DATA_BLOCK;
+  const int v_start = 0;
+  int v_end = MAX_DATA_BLOCK;
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = kIterations;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = 1;
+    params.r[1] = 1;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    av1_calc_proj_params_high_bd_c(
+        src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+        flt0_, flt0_stride, flt1_, flt1_stride, H_ref, C_ref, &params);
+
+    target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
+                 dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
+                 C_test, &params);
+
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTestHBD, RandomValues) { RunGetProjSubspaceTestHBD(1); }
+
+TEST_P(GetProjSubspaceTestHBD, ExtremeValues) {
+  RunGetProjSubspaceTestHBD_ExtremeValues();
+}
+
+TEST_P(GetProjSubspaceTestHBD, DISABLED_Speed) {
+  RunGetProjSubspaceTestHBD(200000);
+}
+
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, GetProjSubspaceTestHBD,
+    ::testing::Values(av1_calc_proj_params_high_bd_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
+                         ::testing::Values(av1_calc_proj_params_high_bd_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace get_proj_subspace_test_hbd
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/test/qm_test.cc b/media/libaom/src/test/qm_test.cc
deleted file mode 100644
index d1dfbb849b..0000000000
--- a/media/libaom/src/test/qm_test.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "config/aom_config.h"
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-class QMTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
-      public ::libaom_test::EncoderTest {
- protected:
-  QMTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~QMTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(AV1E_SET_ENABLE_QM, 1);
-      encoder->Control(AV1E_SET_QM_MIN, qm_min_);
-      encoder->Control(AV1E_SET_QM_MAX, qm_max_);
-
-      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
-    }
-  }
-
-  void DoTest(int qm_min, int qm_max) {
-    qm_min_ = qm_min;
-    qm_max_ = qm_max;
-    cfg_.kf_max_dist = 12;
-    cfg_.rc_min_quantizer = 8;
-    cfg_.rc_max_quantizer = 56;
-    cfg_.rc_end_usage = AOM_CBR;
-    cfg_.g_lag_in_frames = 6;
-    cfg_.rc_buf_initial_sz = 500;
-    cfg_.rc_buf_optimal_sz = 500;
-    cfg_.rc_buf_sz = 1000;
-    cfg_.rc_target_bitrate = 300;
-    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
-                                         288, 30, 1, 0, 15);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  }
-
-  int set_cpu_used_;
-  int qm_min_;
-  int qm_max_;
-};
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
-
-// encodes and decodes without a mismatch.
-TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
-
-AV1_INSTANTIATE_TEST_CASE(QMTest,
-                          ::testing::Values(::libaom_test::kRealTime,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(5, 9));
-}  // namespace
diff --git a/media/libaom/src/test/quant_test.cc b/media/libaom/src/test/quant_test.cc
new file mode 100644
index 0000000000..a042af13eb
--- /dev/null
+++ b/media/libaom/src/test/quant_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "av1/encoder/av1_quantize.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const ::libaom_test::TestMode kTestMode[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
+class QMTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QMTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~QMTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_ENABLE_QM, 1);
+      encoder->Control(AV1E_SET_QM_MIN, qm_min_);
+      encoder->Control(AV1E_SET_QM_MAX, qm_max_);
+
+      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+      if (mode_ == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      }
+    }
+  }
+
+  void DoTest(int qm_min, int qm_max) {
+    qm_min_ = qm_min;
+    qm_max_ = qm_max;
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int set_cpu_used_;
+  int qm_min_;
+  int qm_max_;
+};
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
+
+AV1_INSTANTIATE_TEST_SUITE(QMTest, ::testing::ValuesIn(kTestMode),
+                           ::testing::Range(5, 9));
+
+#if !CONFIG_REALTIME_ONLY
+typedef struct {
+  const unsigned int min_q;
+  const unsigned int max_q;
+} QuantParam;
+
+const QuantParam QuantTestParams[] = {
+  { 0, 10 }, { 0, 60 }, { 20, 35 }, { 35, 50 }, { 50, 63 }
+};
+
+std::ostream &operator<<(std::ostream &os, const QuantParam &test_arg) {
+  return os << "QuantParam { min_q:" << test_arg.min_q
+            << " max_q:" << test_arg.max_q << " }";
+}
+
+/*
+ * This class is used to test whether base_qindex is within min
+ * and max quantizer range configured by user.
+ */
+class QuantizerBoundsCheckTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 QuantParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QuantizerBoundsCheckTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        quant_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    quant_bound_violated_ = false;
+  }
+  virtual ~QuantizerBoundsCheckTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.rc_min_quantizer = quant_param_.min_q;
+    cfg_.rc_max_quantizer = quant_param_.max_q;
+    cfg_.g_lag_in_frames = 35;
+    if (rc_end_usage_ != AOM_Q) {
+      cfg_.rc_target_bitrate = 400;
+    }
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+                                    &base_qindex_);
+      min_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_min_quantizer);
+      max_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_max_quantizer);
+      if ((base_qindex_ < min_bound_qindex_ ||
+           base_qindex_ > max_bound_qindex_) &&
+          quant_bound_violated_ == false) {
+        quant_bound_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const QuantParam quant_param_;
+  int base_qindex_;
+  int min_bound_qindex_;
+  int max_bound_qindex_;
+  bool quant_bound_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(QuantizerBoundsCheckTestLarge, QuantizerBoundsCheckEncodeTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 50);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(quant_bound_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(QuantizerBoundsCheckTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(QuantTestParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+#endif  // !CONFIG_REALTIME_ONLY
+}  // namespace
diff --git a/media/libaom/src/test/quantize_func_test.cc b/media/libaom/src/test/quantize_func_test.cc
index b40b38d5a2..d9876a7039 100644
--- a/media/libaom/src/test/quantize_func_test.cc
+++ b/media/libaom/src/test/quantize_func_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <algorithm>
 #include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -22,7 +23,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/common/scan.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -36,9 +36,18 @@ using libaom_test::ACMRandom;
       tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \
       const int16_t *scan, const int16_t *iscan
 
+#define LP_QUANTIZE_PARAM_LIST                                             \
+  const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,   \
+      const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, \
+      const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,  \
+      const int16_t *iscan
+
+typedef void (*LPQuantizeFunc)(LP_QUANTIZE_PARAM_LIST);
 typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
 typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
 
+#undef LP_QUANTIZE_PARAM_LIST
+
 #define HBD_QUAN_FUNC                                                      \
   fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
      qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale)
@@ -68,8 +77,10 @@ void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
 enum QuantType { TYPE_B, TYPE_DC, TYPE_FP };
 
 using std::tuple;
-typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
-    QuantizeParam;
+
+template <typename FuncType>
+using QuantizeParam =
+    tuple<FuncType, FuncType, TX_SIZE, QuantType, aom_bit_depth_t>;
 
 typedef struct {
   QUANTS quant;
@@ -78,19 +89,26 @@ typedef struct {
 
 const int kTestNum = 1000;
 
-class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+#define GET_TEMPLATE_PARAM(k) std::get<k>(this->GetParam())
+
+template <typename CoeffType, typename FuncType>
+class QuantizeTestBase
+    : public ::testing::TestWithParam<QuantizeParam<FuncType>> {
  protected:
-  QuantizeTest()
-      : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)),
-        type_(GET_PARAM(3)), bd_(GET_PARAM(4)) {}
+  QuantizeTestBase()
+      : quant_ref_(GET_TEMPLATE_PARAM(0)), quant_(GET_TEMPLATE_PARAM(1)),
+        tx_size_(GET_TEMPLATE_PARAM(2)), type_(GET_TEMPLATE_PARAM(3)),
+        bd_(GET_TEMPLATE_PARAM(4)) {}
 
-  virtual ~QuantizeTest() {}
+  virtual ~QuantizeTestBase() {}
 
   virtual void SetUp() {
     qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
+    ASSERT_NE(qtab_, nullptr);
     const int n_coeffs = coeff_num();
-    coeff_ = reinterpret_cast<tran_low_t *>(
-        aom_memalign(32, 6 * n_coeffs * sizeof(tran_low_t)));
+    coeff_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, 6 * n_coeffs * sizeof(CoeffType)));
+    ASSERT_NE(coeff_, nullptr);
     InitQuantizer();
   }
 
@@ -99,22 +117,30 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     qtab_ = NULL;
     aom_free(coeff_);
     coeff_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
   void InitQuantizer() {
     av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
   }
 
+  virtual void RunQuantizeFunc(
+      const CoeffType *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+      const int16_t *round_ptr, const int16_t *quant_ptr,
+      const int16_t *quant_shift_ptr, CoeffType *qcoeff_ptr,
+      CoeffType *qcoeff_ref_ptr, CoeffType *dqcoeff_ptr,
+      CoeffType *dqcoeff_ref_ptr, const int16_t *dequant_ptr,
+      uint16_t *eob_ref_ptr, uint16_t *eob_ptr, const int16_t *scan,
+      const int16_t *iscan) = 0;
+
   void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
-    tran_low_t *coeff_ptr = coeff_;
+    CoeffType *coeff_ptr = coeff_;
     const intptr_t n_coeffs = coeff_num();
 
-    tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
-    tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+    CoeffType *qcoeff_ref = coeff_ptr + n_coeffs;
+    CoeffType *dqcoeff_ref = qcoeff_ref + n_coeffs;
 
-    tran_low_t *qcoeff = dqcoeff_ref + n_coeffs;
-    tran_low_t *dqcoeff = qcoeff + n_coeffs;
+    CoeffType *qcoeff = dqcoeff_ref + n_coeffs;
+    CoeffType *dqcoeff = qcoeff + n_coeffs;
     uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
 
     // Testing uses 2-D DCT scan order table
@@ -141,13 +167,9 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
 
       memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
 
-      quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
-                 qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan,
-                 sc->iscan);
-
-      ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant,
-                                      quant_shift, qcoeff, dqcoeff, dequant,
-                                      &eob[1], sc->scan, sc->iscan));
+      RunQuantizeFunc(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
+                      qcoeff, qcoeff_ref, dqcoeff, dqcoeff_ref, dequant,
+                      &eob[0], &eob[1], sc->scan, sc->iscan);
 
       for (int j = 0; j < n_coeffs; ++j) {
         ASSERT_EQ(qcoeff_ref[j], qcoeff[j])
@@ -166,8 +188,8 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     }
   }
 
-  void CompareResults(const tran_low_t *buf_ref, const tran_low_t *buf,
-                      int size, const char *text, int q, int number) {
+  void CompareResults(const CoeffType *buf_ref, const CoeffType *buf, int size,
+                      const char *text, int q, int number) {
     int i;
     for (i = 0; i < size; ++i) {
       ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number
@@ -177,7 +199,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
 
   int coeff_num() const { return av1_get_max_eob(tx_size_); }
 
-  void FillCoeff(tran_low_t c) {
+  void FillCoeff(CoeffType c) {
     const int n_coeffs = coeff_num();
     for (int i = 0; i < n_coeffs; ++i) {
       coeff_[i] = c;
@@ -187,8 +209,11 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
   void FillCoeffRandom() {
     const int n_coeffs = coeff_num();
     FillCoeffZero();
-    int num = rnd_.Rand16() % n_coeffs;
-    for (int i = 0; i < num; ++i) {
+    const int num = rnd_.Rand16() % n_coeffs;
+    // Randomize the first non zero coeff position.
+    const int start = rnd_.Rand16() % n_coeffs;
+    const int end = std::min(start + num, n_coeffs);
+    for (int i = start; i < end; ++i) {
       coeff_[i] = GetRandomCoeff();
     }
   }
@@ -203,7 +228,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
   void FillCoeffZero() { FillCoeff(0); }
 
   void FillCoeffConstant() {
-    tran_low_t c = GetRandomCoeff();
+    CoeffType c = GetRandomCoeff();
     FillCoeff(c);
   }
 
@@ -220,47 +245,88 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     coeff_[0] = -8191;
   }
 
-  tran_low_t GetRandomCoeff() {
-    tran_low_t coeff;
+  CoeffType GetRandomCoeff() {
+    CoeffType coeff;
     if (bd_ == AOM_BITS_8) {
       coeff =
           clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX);
     } else {
-      tran_low_t min = -(1 << (7 + bd_));
-      tran_low_t max = -min - 1;
-      coeff = clamp(static_cast<tran_low_t>(rnd_.Rand31()), min, max);
+      CoeffType min = -(1 << (7 + bd_));
+      CoeffType max = -min - 1;
+      coeff = clamp(static_cast<CoeffType>(rnd_.Rand31()), min, max);
     }
     return coeff;
   }
 
   ACMRandom rnd_;
   QuanTable *qtab_;
-  tran_low_t *coeff_;
-  QuantizeFunc quant_ref_;
-  QuantizeFunc quant_;
+  CoeffType *coeff_;
+  FuncType quant_ref_;
+  FuncType quant_;
   TX_SIZE tx_size_;
   QuantType type_;
   aom_bit_depth_t bd_;
 };
 
-TEST_P(QuantizeTest, ZeroInput) {
+class FullPrecisionQuantizeTest
+    : public QuantizeTestBase<tran_low_t, QuantizeFunc> {
+  void RunQuantizeFunc(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *qcoeff_ref_ptr,
+                       tran_low_t *dqcoeff_ptr, tran_low_t *dqcoeff_ref_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ref_ptr,
+                       uint16_t *eob_ptr, const int16_t *scan,
+                       const int16_t *iscan) override {
+    quant_ref_(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+               quant_shift_ptr, qcoeff_ref_ptr, dqcoeff_ref_ptr, dequant_ptr,
+               eob_ref_ptr, scan, iscan);
+
+    API_REGISTER_STATE_CHECK(quant_(
+        coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+        qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan));
+  }
+};
+
+class LowPrecisionQuantizeTest
+    : public QuantizeTestBase<int16_t, LPQuantizeFunc> {
+  void RunQuantizeFunc(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t * /*zbin_ptr*/, const int16_t *round_ptr,
+                       const int16_t *quant_ptr,
+                       const int16_t * /*quant_shift_ptr*/, int16_t *qcoeff_ptr,
+                       int16_t *qcoeff_ref_ptr, int16_t *dqcoeff_ptr,
+                       int16_t *dqcoeff_ref_ptr, const int16_t *dequant_ptr,
+                       uint16_t *eob_ref_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) override {
+    quant_ref_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ref_ptr,
+               dqcoeff_ref_ptr, dequant_ptr, eob_ref_ptr, scan, iscan);
+
+    API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+                                    qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                    eob_ptr, scan, iscan));
+  }
+};
+
+TEST_P(FullPrecisionQuantizeTest, ZeroInput) {
   FillCoeffZero();
   QuantizeRun(false);
 }
 
-TEST_P(QuantizeTest, LargeNegativeInput) {
+TEST_P(FullPrecisionQuantizeTest, LargeNegativeInput) {
   FillDcLargeNegative();
   QuantizeRun(false, 0, 1);
 }
 
-TEST_P(QuantizeTest, DcOnlyInput) {
+TEST_P(FullPrecisionQuantizeTest, DcOnlyInput) {
   FillDcOnly();
   QuantizeRun(false, 0, 1);
 }
 
-TEST_P(QuantizeTest, RandomInput) { QuantizeRun(true, 0, kTestNum); }
+TEST_P(FullPrecisionQuantizeTest, RandomInput) {
+  QuantizeRun(true, 0, kTestNum);
+}
 
-TEST_P(QuantizeTest, MultipleQ) {
+TEST_P(FullPrecisionQuantizeTest, MultipleQ) {
   for (int q = 0; q < QINDEX_RANGE; ++q) {
     QuantizeRun(true, q, kTestNum);
   }
@@ -268,12 +334,12 @@ TEST_P(QuantizeTest, MultipleQ) {
 
 // Force the coeff to be half the value of the dequant.  This exposes a
 // mismatch found in av1_quantize_fp_sse2().
-TEST_P(QuantizeTest, CoeffHalfDequant) {
+TEST_P(FullPrecisionQuantizeTest, CoeffHalfDequant) {
   FillCoeff(16);
   QuantizeRun(false, 25, 1);
 }
 
-TEST_P(QuantizeTest, DISABLED_Speed) {
+TEST_P(FullPrecisionQuantizeTest, DISABLED_Speed) {
   tran_low_t *coeff_ptr = coeff_;
   const intptr_t n_coeffs = coeff_num();
 
@@ -320,15 +386,111 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
     const int simd_elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
-    printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
-           simd_elapsed_time, (elapsed_time / simd_elapsed_time));
+    printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time,
+           simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time));
+  }
+}
+
+// TODO(crbug.com/aomedia/2796)
+TEST_P(LowPrecisionQuantizeTest, ZeroInput) {
+  FillCoeffZero();
+  QuantizeRun(false);
+}
+
+TEST_P(LowPrecisionQuantizeTest, LargeNegativeInput) {
+  FillDcLargeNegative();
+  QuantizeRun(false, 0, 1);
+}
+
+TEST_P(LowPrecisionQuantizeTest, DcOnlyInput) {
+  FillDcOnly();
+  QuantizeRun(false, 0, 1);
+}
+
+TEST_P(LowPrecisionQuantizeTest, RandomInput) {
+  QuantizeRun(true, 0, kTestNum);
+}
+
+TEST_P(LowPrecisionQuantizeTest, MultipleQ) {
+  for (int q = 0; q < QINDEX_RANGE; ++q) {
+    QuantizeRun(true, q, kTestNum);
+  }
+}
+
+// Force the coeff to be half the value of the dequant.  This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(LowPrecisionQuantizeTest, CoeffHalfDequant) {
+  FillCoeff(16);
+  QuantizeRun(false, 25, 1);
+}
+
+TEST_P(LowPrecisionQuantizeTest, DISABLED_Speed) {
+  int16_t *coeff_ptr = coeff_;
+  const intptr_t n_coeffs = coeff_num();
+
+  int16_t *qcoeff_ref = coeff_ptr + n_coeffs;
+  int16_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+  int16_t *qcoeff = dqcoeff_ref + n_coeffs;
+  int16_t *dqcoeff = qcoeff + n_coeffs;
+  uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+  // Testing uses 2-D DCT scan order table
+  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+  // Testing uses luminance quantization table
+  const int q = 22;
+  const int16_t *round_fp = qtab_->quant.y_round_fp[q];
+  const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
+  const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+  const int kNumTests = 5000000;
+  aom_usec_timer timer, simd_timer;
+  int rows = tx_size_high[tx_size_];
+  int cols = tx_size_wide[tx_size_];
+  rows = AOMMIN(32, rows);
+  cols = AOMMIN(32, cols);
+  for (int cnt = 0; cnt <= rows; cnt++) {
+    FillCoeffRandomRows(cnt * cols);
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_ref_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff,
+                 dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&timer);
+
+    aom_usec_timer_start(&simd_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff, dequant,
+             eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&simd_timer);
+
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    const int simd_elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+    printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time,
+           simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time));
   }
 }
 
 using std::make_tuple;
 
 #if HAVE_AVX2
-const QuantizeParam kQParamArrayAvx2[] = {
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArrayAvx2[] = {
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowPrecisionQuantizeTest,
+                         ::testing::ValuesIn(kLPQParamArrayAvx2));
+
+const QuantizeParam<QuantizeFunc> kQParamArrayAvx2[] = {
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
              static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
@@ -408,12 +570,25 @@ const QuantizeParam kQParamArrayAvx2[] = {
              static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArrayAvx2));
 #endif  // HAVE_AVX2
 
 #if HAVE_SSE2
-const QuantizeParam kQParamArraySSE2[] = {
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArraySSE2[] = {
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowPrecisionQuantizeTest,
+                         ::testing::ValuesIn(kLPQParamArraySSE2));
+
+const QuantizeParam<QuantizeFunc> kQParamArraySSE2[] = {
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
              static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
@@ -499,12 +674,25 @@ const QuantizeParam kQParamArraySSE2[] = {
              static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)
 };
 
-INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArraySSE2));
 #endif
 
 #if HAVE_NEON
-const QuantizeParam kQParamArrayNEON[] = {
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArrayNEON[] = {
+  make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+  make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, LowPrecisionQuantizeTest,
+                         ::testing::ValuesIn(kLPQParamArrayNEON));
+
+const QuantizeParam<QuantizeFunc> kQParamArrayNEON[] = {
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
              static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
@@ -514,16 +702,44 @@ const QuantizeParam kQParamArrayNEON[] = {
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
              static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
   make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
-             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8)
+             static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+  make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_c, &aom_quantize_b_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_neon>,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12),
+  make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_neon>,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12),
+  make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+             &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_neon>,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#endif
 };
 
-INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest,
+INSTANTIATE_TEST_SUITE_P(NEON, FullPrecisionQuantizeTest,
                          ::testing::ValuesIn(kQParamArrayNEON));
 #endif
 
 #if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_SUITE_P(
-    SSSE3, QuantizeTest,
+    SSSE3, FullPrecisionQuantizeTest,
     ::testing::Values(
         make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
                    static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
@@ -534,14 +750,15 @@ INSTANTIATE_TEST_SUITE_P(
 
 #endif  // HAVE_SSSE3 && ARCH_X86_64
 
-#if HAVE_AVX && ARCH_X86_64
+#if HAVE_AVX
 INSTANTIATE_TEST_SUITE_P(
-    AVX, QuantizeTest,
+    AVX, FullPrecisionQuantizeTest,
     ::testing::Values(
         make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
                    static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
         make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx,
                    static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8)));
 
-#endif  // HAVE_AVX && ARCH_X86_64
+#endif  // HAVE_AVX
+
 }  // namespace
diff --git a/media/libaom/src/test/ratectrl_qmode_test.cc b/media/libaom/src/test/ratectrl_qmode_test.cc
new file mode 100644
index 0000000000..8292b55231
--- /dev/null
+++ b/media/libaom/src/test/ratectrl_qmode_test.cc
@@ -0,0 +1,798 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <array>
+#include <algorithm>
+#include <cerrno>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "av1/ratectrl_qmode.h"
+#include "av1/reference_manager.h"
+#include "test/mock_ratectrl_qmode.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Reads a whitespace-delimited string from stream, and parses it as a double.
+// Returns an empty string if the entire string was successfully parsed as a
+// double, or an error messaage if not.
+std::string ReadDouble(std::istream &stream, double *value) {
+  std::string word;
+  stream >> word;
+  if (word.empty()) {
+    return "Unexpectedly reached end of input";
+  }
+  char *end;
+  *value = std::strtod(word.c_str(), &end);
+  if (*end != '\0') {
+    return "Unexpected characters found: " + word;
+  }
+  return "";
+}
+
+void ReadFirstpassInfo(const std::string &filename,
+                       aom::FirstpassInfo *firstpass_info) {
+  // These golden files are generated by the following command line:
+  // ./aomenc --width=352 --height=288 --fps=30/1 --limit=250 --codec=av1
+  // --cpu-used=3 --end-usage=q --cq-level=36 --threads=0 --profile=0
+  // --lag-in-frames=35 --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2
+  // --kf-max-dist=160 --kf-min-dist=0 --drop-frame=0
+  // --static-thresh=0 --minsection-pct=0 --maxsection-pct=2000
+  // --arnr-maxframes=7
+  // --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100
+  // --frame-parallel=0
+  // --tile-columns=0 -o output.webm hantro_collage_w352h288.yuv
+  // First pass stats are written out in av1_get_second_pass_params right after
+  // calculate_gf_length.
+  std::string path = libaom_test::GetDataPath() + "/" + filename;
+  std::ifstream firstpass_stats_file(path);
+  ASSERT_TRUE(firstpass_stats_file.good())
+      << "Error opening " << path << ": " << std::strerror(errno);
+  firstpass_info->num_mbs_16x16 = (352 / 16 + 1) * (288 / 16 + 1);
+  std::string newline;
+  while (std::getline(firstpass_stats_file, newline)) {
+    std::istringstream iss(newline);
+    FIRSTPASS_STATS firstpass_stats_input = {};
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.frame), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.weight), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.intra_error), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.frame_avg_wavelet_energy),
+              "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.coded_error), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.sr_coded_error), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.pcnt_inter), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.pcnt_motion), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.pcnt_second_ref), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.pcnt_neutral), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.intra_skip_pct), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.inactive_zone_rows), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.inactive_zone_cols), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.MVr), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.mvr_abs), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.MVc), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.mvc_abs), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.MVrv), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.MVcv), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.mv_in_out_count), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.new_mv_count), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.duration), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.count), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.raw_error_stdev), "");
+    iss >> firstpass_stats_input.is_flash;
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.noise_var), "");
+    ASSERT_EQ(ReadDouble(iss, &firstpass_stats_input.cor_coeff), "");
+    ASSERT_TRUE(iss.eof()) << "Too many fields on line "
+                           << firstpass_info->stats_list.size() + 1 << "\n"
+                           << newline;
+    firstpass_info->stats_list.push_back(firstpass_stats_input);
+  }
+}
+
+}  // namespace
+
+namespace aom {
+
+using ::testing::ElementsAre;
+using ::testing::Field;
+using ::testing::Return;
+
+constexpr double kErrorEpsilon = 0.000001;
+
+void TestGopDisplayOrder(const GopStruct &gop_struct) {
+  // Test whether show frames' order indices are sequential
+  int expected_order_idx = 0;
+  int expected_show_frame_count = 0;
+  for (const auto &gop_frame : gop_struct.gop_frame_list) {
+    if (gop_frame.is_show_frame) {
+      EXPECT_EQ(gop_frame.order_idx, expected_order_idx);
+      expected_order_idx++;
+      expected_show_frame_count++;
+    }
+  }
+  EXPECT_EQ(gop_struct.show_frame_count, expected_show_frame_count);
+}
+
+void TestGopGlobalOrderIdx(const GopStruct &gop_struct,
+                           int global_order_idx_offset) {
+  // Test whether show frames' global order indices are sequential
+  EXPECT_EQ(gop_struct.global_order_idx_offset, global_order_idx_offset);
+  int expected_global_order_idx = global_order_idx_offset;
+  for (const auto &gop_frame : gop_struct.gop_frame_list) {
+    if (gop_frame.is_show_frame) {
+      EXPECT_EQ(gop_frame.global_order_idx, expected_global_order_idx);
+      expected_global_order_idx++;
+    }
+  }
+}
+
+void TestGopGlobalCodingIdx(const GopStruct &gop_struct,
+                            int global_coding_idx_offset) {
+  EXPECT_EQ(gop_struct.global_coding_idx_offset, global_coding_idx_offset);
+  for (const auto &gop_frame : gop_struct.gop_frame_list) {
+    EXPECT_EQ(gop_frame.global_coding_idx,
+              global_coding_idx_offset + gop_frame.coding_idx);
+  }
+}
+
+void TestColocatedShowFrame(const GopStruct &gop_struct) {
+  // Test whether each non show frame has a colocated show frame
+  int gop_size = static_cast<int>(gop_struct.gop_frame_list.size());
+  for (int gop_idx = 0; gop_idx < gop_size; ++gop_idx) {
+    auto &gop_frame = gop_struct.gop_frame_list[gop_idx];
+    if (gop_frame.is_show_frame == 0) {
+      bool found_colocated_ref_frame = false;
+      for (int i = gop_idx + 1; i < gop_size; ++i) {
+        auto &next_gop_frame = gop_struct.gop_frame_list[i];
+        if (gop_frame.order_idx == next_gop_frame.order_idx) {
+          found_colocated_ref_frame = true;
+          EXPECT_EQ(gop_frame.update_ref_idx, next_gop_frame.colocated_ref_idx);
+          EXPECT_TRUE(next_gop_frame.is_show_frame);
+        }
+        if (gop_frame.update_ref_idx == next_gop_frame.update_ref_idx) {
+          break;
+        }
+      }
+      EXPECT_TRUE(found_colocated_ref_frame);
+    }
+  }
+}
+
+void TestLayerDepth(const GopStruct &gop_struct, int max_layer_depth) {
+  int gop_size = static_cast<int>(gop_struct.gop_frame_list.size());
+  for (int gop_idx = 0; gop_idx < gop_size; ++gop_idx) {
+    const auto &gop_frame = gop_struct.gop_frame_list[gop_idx];
+    if (gop_frame.is_key_frame) {
+      EXPECT_EQ(gop_frame.layer_depth, 0);
+    }
+
+    if (gop_frame.is_arf_frame) {
+      EXPECT_LT(gop_frame.layer_depth, max_layer_depth);
+    }
+
+    if (!gop_frame.is_key_frame && !gop_frame.is_arf_frame) {
+      EXPECT_EQ(gop_frame.layer_depth, max_layer_depth);
+    }
+  }
+}
+
+void TestArfInterval(const GopStruct &gop_struct) {
+  std::vector<int> arf_order_idx_list;
+  for (const auto &gop_frame : gop_struct.gop_frame_list) {
+    if (gop_frame.is_arf_frame) {
+      arf_order_idx_list.push_back(gop_frame.order_idx);
+    }
+  }
+  std::sort(arf_order_idx_list.begin(), arf_order_idx_list.end());
+  int arf_count = static_cast<int>(arf_order_idx_list.size());
+  for (int i = 1; i < arf_count; ++i) {
+    int arf_interval = arf_order_idx_list[i] - arf_order_idx_list[i - 1];
+    EXPECT_GE(arf_interval, kMinArfInterval);
+  }
+}
+
+TEST(RateControlQModeTest, ConstructGopARF) {
+  int show_frame_count = 16;
+  const bool has_key_frame = false;
+  const int global_coding_idx_offset = 5;
+  const int global_order_idx_offset = 20;
+  RefFrameManager ref_frame_manager(kRefFrameTableSize);
+  GopStruct gop_struct =
+      ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
+                   global_coding_idx_offset, global_order_idx_offset);
+  EXPECT_EQ(gop_struct.show_frame_count, show_frame_count);
+  TestGopDisplayOrder(gop_struct);
+  TestGopGlobalOrderIdx(gop_struct, global_order_idx_offset);
+  TestGopGlobalCodingIdx(gop_struct, global_coding_idx_offset);
+  TestColocatedShowFrame(gop_struct);
+  const int max_layer_depth =
+      ref_frame_manager.ForwardMaxSize() + kLayerDepthOffset;
+  TestLayerDepth(gop_struct, max_layer_depth);
+  TestArfInterval(gop_struct);
+}
+
+TEST(RateControlQModeTest, ConstructGopKey) {
+  const int show_frame_count = 16;
+  const int has_key_frame = 1;
+  const int global_coding_idx_offset = 10;
+  const int global_order_idx_offset = 8;
+  RefFrameManager ref_frame_manager(kRefFrameTableSize);
+  GopStruct gop_struct =
+      ConstructGop(&ref_frame_manager, show_frame_count, has_key_frame,
+                   global_coding_idx_offset, global_order_idx_offset);
+  EXPECT_EQ(gop_struct.show_frame_count, show_frame_count);
+  TestGopDisplayOrder(gop_struct);
+  TestGopGlobalOrderIdx(gop_struct, global_order_idx_offset);
+  TestGopGlobalCodingIdx(gop_struct, global_coding_idx_offset);
+  TestColocatedShowFrame(gop_struct);
+  const int max_layer_depth =
+      ref_frame_manager.ForwardMaxSize() + kLayerDepthOffset;
+  TestLayerDepth(gop_struct, max_layer_depth);
+  TestArfInterval(gop_struct);
+}
+
+static TplBlockStats CreateToyTplBlockStats(int h, int w, int r, int c,
+                                            int intra_cost, int inter_cost) {
+  TplBlockStats tpl_block_stats = {};
+  tpl_block_stats.height = h;
+  tpl_block_stats.width = w;
+  tpl_block_stats.row = r;
+  tpl_block_stats.col = c;
+  tpl_block_stats.intra_cost = intra_cost;
+  tpl_block_stats.inter_cost = inter_cost;
+  tpl_block_stats.ref_frame_index = { -1, -1 };
+  return tpl_block_stats;
+}
+
+static TplFrameStats CreateToyTplFrameStatsWithDiffSizes(int min_block_size,
+                                                         int max_block_size) {
+  TplFrameStats frame_stats;
+  const int max_h = max_block_size;
+  const int max_w = max_h;
+  const int count = max_block_size / min_block_size;
+  frame_stats.min_block_size = min_block_size;
+  frame_stats.frame_height = max_h * count;
+  frame_stats.frame_width = max_w * count;
+  for (int i = 0; i < count; ++i) {
+    for (int j = 0; j < count; ++j) {
+      int h = max_h >> i;
+      int w = max_w >> j;
+      for (int u = 0; u * h < max_h; ++u) {
+        for (int v = 0; v * w < max_w; ++v) {
+          int r = max_h * i + h * u;
+          int c = max_w * j + w * v;
+          int intra_cost = std::rand() % 16;
+          TplBlockStats block_stats =
+              CreateToyTplBlockStats(h, w, r, c, intra_cost, 0);
+          frame_stats.block_stats_list.push_back(block_stats);
+        }
+      }
+    }
+  }
+  return frame_stats;
+}
+
+static void AugmentTplFrameStatsWithRefFrames(
+    TplFrameStats *tpl_frame_stats,
+    const std::array<int, kBlockRefCount> &ref_frame_index) {
+  for (auto &block_stats : tpl_frame_stats->block_stats_list) {
+    block_stats.ref_frame_index = ref_frame_index;
+  }
+}
+static void AugmentTplFrameStatsWithMotionVector(
+    TplFrameStats *tpl_frame_stats,
+    const std::array<MotionVector, kBlockRefCount> &mv) {
+  for (auto &block_stats : tpl_frame_stats->block_stats_list) {
+    block_stats.mv = mv;
+  }
+}
+
+static RefFrameTable CreateToyRefFrameTable(int frame_count) {
+  RefFrameTable ref_frame_table;
+  const int ref_frame_table_size = static_cast<int>(ref_frame_table.size());
+  EXPECT_LE(frame_count, ref_frame_table_size);
+  for (int i = 0; i < frame_count; ++i) {
+    ref_frame_table[i] =
+        GopFrameBasic(0, 0, i, i, 0, GopFrameType::kRegularLeaf);
+  }
+  for (int i = frame_count; i < ref_frame_table_size; ++i) {
+    ref_frame_table[i] = GopFrameInvalid();
+  }
+  return ref_frame_table;
+}
+
+static MotionVector CreateFullpelMv(int row, int col) {
+  return { row, col, 0 };
+}
+
+double TplFrameStatsAccumulateIntraCost(const TplFrameStats &frame_stats) {
+  double sum = 0;
+  for (auto &block_stats : frame_stats.block_stats_list) {
+    sum += block_stats.intra_cost;
+  }
+  return sum;
+}
+
+TEST(RateControlQModeTest, CreateTplFrameDepStats) {
+  TplFrameStats frame_stats = CreateToyTplFrameStatsWithDiffSizes(8, 16);
+  TplFrameDepStats frame_dep_stats =
+      CreateTplFrameDepStatsWithoutPropagation(frame_stats);
+  EXPECT_EQ(frame_stats.min_block_size, frame_dep_stats.unit_size);
+  const int unit_rows = static_cast<int>(frame_dep_stats.unit_stats.size());
+  const int unit_cols = static_cast<int>(frame_dep_stats.unit_stats[0].size());
+  EXPECT_EQ(frame_stats.frame_height, unit_rows * frame_dep_stats.unit_size);
+  EXPECT_EQ(frame_stats.frame_width, unit_cols * frame_dep_stats.unit_size);
+  const double intra_cost_sum =
+      TplFrameDepStatsAccumulateIntraCost(frame_dep_stats);
+
+  const double expected_intra_cost_sum =
+      TplFrameStatsAccumulateIntraCost(frame_stats);
+  EXPECT_NEAR(intra_cost_sum, expected_intra_cost_sum, kErrorEpsilon);
+}
+
+TEST(RateControlQModeTest, GetBlockOverlapArea) {
+  const int size = 8;
+  const int r0 = 8;
+  const int c0 = 9;
+  std::vector<int> r1 = { 8, 10, 16, 10, 8, 100 };
+  std::vector<int> c1 = { 9, 12, 17, 5, 100, 9 };
+  std::vector<int> ref_overlap = { 64, 30, 0, 24, 0, 0 };
+  for (int i = 0; i < static_cast<int>(r1.size()); ++i) {
+    const int overlap0 = GetBlockOverlapArea(r0, c0, r1[i], c1[i], size);
+    const int overlap1 = GetBlockOverlapArea(r1[i], c1[i], r0, c0, size);
+    EXPECT_EQ(overlap0, ref_overlap[i]);
+    EXPECT_EQ(overlap1, ref_overlap[i]);
+  }
+}
+
+TEST(RateControlQModeTest, TplBlockStatsToDepStats) {
+  const int intra_cost = 100;
+  const int inter_cost = 120;
+  const int unit_count = 2;
+  TplBlockStats block_stats =
+      CreateToyTplBlockStats(8, 4, 0, 0, intra_cost, inter_cost);
+  TplUnitDepStats unit_stats = TplBlockStatsToDepStats(block_stats, unit_count);
+  double expected_intra_cost = intra_cost * 1.0 / unit_count;
+  EXPECT_NEAR(unit_stats.intra_cost, expected_intra_cost, kErrorEpsilon);
+  // When inter_cost >= intra_cost in block_stats, in unit_stats,
+  // the inter_cost will be modified so that it's upper-bounded by intra_cost.
+  EXPECT_LE(unit_stats.inter_cost, unit_stats.intra_cost);
+}
+
+TEST(RateControlQModeTest, TplFrameDepStatsPropagateSingleZeroMotion) {
+  // cur frame with coding_idx 1 use ref frame with coding_idx 0
+  const std::array<int, kBlockRefCount> ref_frame_index = { 0, -1 };
+  TplFrameStats frame_stats = CreateToyTplFrameStatsWithDiffSizes(8, 16);
+  AugmentTplFrameStatsWithRefFrames(&frame_stats, ref_frame_index);
+
+  TplGopDepStats gop_dep_stats;
+  const int frame_count = 2;
+  // ref frame with coding_idx 0
+  TplFrameDepStats frame_dep_stats0 =
+      CreateTplFrameDepStats(frame_stats.frame_height, frame_stats.frame_width,
+                             frame_stats.min_block_size);
+  gop_dep_stats.frame_dep_stats_list.push_back(frame_dep_stats0);
+
+  // cur frame with coding_idx 1
+  const TplFrameDepStats frame_dep_stats1 =
+      CreateTplFrameDepStatsWithoutPropagation(frame_stats);
+  gop_dep_stats.frame_dep_stats_list.push_back(frame_dep_stats1);
+
+  const RefFrameTable ref_frame_table = CreateToyRefFrameTable(frame_count);
+  TplFrameDepStatsPropagate(/*coding_idx=*/1, ref_frame_table, &gop_dep_stats);
+
+  // cur frame with coding_idx 1
+  const double expected_propagation_sum =
+      TplFrameStatsAccumulateIntraCost(frame_stats);
+
+  // ref frame with coding_idx 0
+  const double propagation_sum =
+      TplFrameDepStatsAccumulate(gop_dep_stats.frame_dep_stats_list[0]);
+
+  // The propagation_sum between coding_idx 0 and coding_idx 1 should be equal
+  // because every block in cur frame has zero motion, use ref frame with
+  // coding_idx 0 for prediction, and ref frame itself is empty.
+  EXPECT_NEAR(propagation_sum, expected_propagation_sum, kErrorEpsilon);
+}
+
+TEST(RateControlQModeTest, TplFrameDepStatsPropagateCompoundZeroMotion) {
+  // cur frame with coding_idx 2 use two ref frames with coding_idx 0 and 1
+  const std::array<int, kBlockRefCount> ref_frame_index = { 0, 1 };
+  TplFrameStats frame_stats = CreateToyTplFrameStatsWithDiffSizes(8, 16);
+  AugmentTplFrameStatsWithRefFrames(&frame_stats, ref_frame_index);
+
+  TplGopDepStats gop_dep_stats;
+  const int frame_count = 3;
+  // ref frame with coding_idx 0
+  const TplFrameDepStats frame_dep_stats0 =
+      CreateTplFrameDepStats(frame_stats.frame_height, frame_stats.frame_width,
+                             frame_stats.min_block_size);
+  gop_dep_stats.frame_dep_stats_list.push_back(frame_dep_stats0);
+
+  // ref frame with coding_idx 1
+  const TplFrameDepStats frame_dep_stats1 =
+      CreateTplFrameDepStats(frame_stats.frame_height, frame_stats.frame_width,
+                             frame_stats.min_block_size);
+  gop_dep_stats.frame_dep_stats_list.push_back(frame_dep_stats1);
+
+  // cur frame with coding_idx 2
+  const TplFrameDepStats frame_dep_stats2 =
+      CreateTplFrameDepStatsWithoutPropagation(frame_stats);
+  gop_dep_stats.frame_dep_stats_list.push_back(frame_dep_stats2);
+
+  const RefFrameTable ref_frame_table = CreateToyRefFrameTable(frame_count);
+  TplFrameDepStatsPropagate(/*coding_idx=*/2, ref_frame_table, &gop_dep_stats);
+
+  // cur frame with coding_idx 1
+  const double expected_ref_sum = TplFrameStatsAccumulateIntraCost(frame_stats);
+
+  // ref frame with coding_idx 0
+  const double cost_sum0 =
+      TplFrameDepStatsAccumulate(gop_dep_stats.frame_dep_stats_list[0]);
+  EXPECT_NEAR(cost_sum0, expected_ref_sum * 0.5, kErrorEpsilon);
+
+  // ref frame with coding_idx 1
+  const double cost_sum1 =
+      TplFrameDepStatsAccumulate(gop_dep_stats.frame_dep_stats_list[1]);
+  EXPECT_NEAR(cost_sum1, expected_ref_sum * 0.5, kErrorEpsilon);
+}
+
+TEST(RateControlQModeTest, TplFrameDepStatsPropagateSingleWithMotion) {
+  // cur frame with coding_idx 1 use ref frame with coding_idx 0
+  const std::array<int, kBlockRefCount> ref_frame_index = { 0, -1 };
+  const int min_block_size = 8;
+  TplFrameStats frame_stats =
+      CreateToyTplFrameStatsWithDiffSizes(min_block_size, min_block_size * 2);
+  AugmentTplFrameStatsWithRefFrames(&frame_stats, ref_frame_index);
+
+  const int mv_row = min_block_size / 2;
+  const int mv_col = min_block_size / 4;
+  const double r_ratio = 1.0 / 2;
+  const double c_ratio = 1.0 / 4;
+  std::array<MotionVector, kBlockRefCount> mv;
+  mv[0] = CreateFullpelMv(mv_row, mv_col);
+  mv[1] = CreateFullpelMv(0, 0);
+  AugmentTplFrameStatsWithMotionVector(&frame_stats, mv);
+
+  TplGopDepStats gop_dep_stats;
+  const int frame_count = 2;
+  // ref frame with coding_idx 0
+  gop_dep_stats.frame_dep_stats_list.push_back(
+      CreateTplFrameDepStats(frame_stats.frame_height, frame_stats.frame_width,
+                             frame_stats.min_block_size));
+
+  // cur frame with coding_idx 1
+  gop_dep_stats.frame_dep_stats_list.push_back(
+      CreateTplFrameDepStatsWithoutPropagation(frame_stats));
+
+  const RefFrameTable ref_frame_table = CreateToyRefFrameTable(frame_count);
+  TplFrameDepStatsPropagate(/*coding_idx=*/1, ref_frame_table, &gop_dep_stats);
+
+  const auto &dep_stats0 = gop_dep_stats.frame_dep_stats_list[0];
+  const auto &dep_stats1 = gop_dep_stats.frame_dep_stats_list[1];
+  const int unit_rows = static_cast<int>(dep_stats0.unit_stats.size());
+  const int unit_cols = static_cast<int>(dep_stats0.unit_stats[0].size());
+  for (int r = 0; r < unit_rows; ++r) {
+    for (int c = 0; c < unit_cols; ++c) {
+      double ref_value = 0;
+      ref_value += (1 - r_ratio) * (1 - c_ratio) *
+                   dep_stats1.unit_stats[r][c].intra_cost;
+      if (r - 1 >= 0) {
+        ref_value += r_ratio * (1 - c_ratio) *
+                     dep_stats1.unit_stats[r - 1][c].intra_cost;
+      }
+      if (c - 1 >= 0) {
+        ref_value += (1 - r_ratio) * c_ratio *
+                     dep_stats1.unit_stats[r][c - 1].intra_cost;
+      }
+      if (r - 1 >= 0 && c - 1 >= 0) {
+        ref_value +=
+            r_ratio * c_ratio * dep_stats1.unit_stats[r - 1][c - 1].intra_cost;
+      }
+      EXPECT_NEAR(dep_stats0.unit_stats[r][c].propagation_cost, ref_value,
+                  kErrorEpsilon);
+    }
+  }
+}
+
+TEST(RateControlQModeTest, ComputeTplGopDepStats) {
+  TplGopStats tpl_gop_stats;
+  std::vector<RefFrameTable> ref_frame_table_list;
+  for (int i = 0; i < 3; i++) {
+    // Use the previous frame as reference
+    const std::array<int, kBlockRefCount> ref_frame_index = { i - 1, -1 };
+    int min_block_size = 8;
+    TplFrameStats frame_stats =
+        CreateToyTplFrameStatsWithDiffSizes(min_block_size, min_block_size * 2);
+    AugmentTplFrameStatsWithRefFrames(&frame_stats, ref_frame_index);
+    tpl_gop_stats.frame_stats_list.push_back(frame_stats);
+
+    ref_frame_table_list.push_back(CreateToyRefFrameTable(i));
+  }
+  const TplGopDepStats &gop_dep_stats =
+      ComputeTplGopDepStats(tpl_gop_stats, ref_frame_table_list);
+
+  double expected_sum = 0;
+  for (int i = 2; i >= 0; i--) {
+    // Due to the linear propagation with zero motion, we can accumulate the
+    // frame_stats intra_cost and use it as expected sum for dependency stats
+    expected_sum +=
+        TplFrameStatsAccumulateIntraCost(tpl_gop_stats.frame_stats_list[i]);
+    const double sum =
+        TplFrameDepStatsAccumulate(gop_dep_stats.frame_dep_stats_list[i]);
+    EXPECT_NEAR(sum, expected_sum, kErrorEpsilon);
+    break;
+  }
+}
+
+TEST(RefFrameManagerTest, GetRefFrameCount) {
+  const std::vector<int> order_idx_list = { 0, 4, 2, 1, 2, 3, 4 };
+  const std::vector<GopFrameType> type_list = {
+    GopFrameType::kRegularKey,      GopFrameType::kRegularArf,
+    GopFrameType::kIntermediateArf, GopFrameType::kRegularLeaf,
+    GopFrameType::kShowExisting,    GopFrameType::kRegularLeaf,
+    GopFrameType::kOverlay
+  };
+  RefFrameManager ref_manager(kRefFrameTableSize);
+  int coding_idx = 0;
+  const int first_leaf_idx = 3;
+  EXPECT_EQ(type_list[first_leaf_idx], GopFrameType::kRegularLeaf);
+  // update reference frame until we see the first kRegularLeaf frame
+  for (; coding_idx <= first_leaf_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+  EXPECT_EQ(ref_manager.GetRefFrameCount(), 4);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 2);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 1);
+  EXPECT_EQ(ref_manager.CurGlobalOrderIdx(), 1);
+
+  // update reference frame until we see the first kShowExisting frame
+  const int first_show_existing_idx = 4;
+  EXPECT_EQ(type_list[first_show_existing_idx], GopFrameType::kShowExisting);
+  for (; coding_idx <= first_show_existing_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+  EXPECT_EQ(ref_manager.GetRefFrameCount(), 4);
+  EXPECT_EQ(ref_manager.CurGlobalOrderIdx(), 2);
+  // After the first kShowExisting, the kIntermediateArf should be moved from
+  // kForward to kLast due to the cur_global_order_idx_ update
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 2);
+
+  const int second_leaf_idx = 5;
+  EXPECT_EQ(type_list[second_leaf_idx], GopFrameType::kRegularLeaf);
+  for (; coding_idx <= second_leaf_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+  EXPECT_EQ(ref_manager.GetRefFrameCount(), 5);
+  EXPECT_EQ(ref_manager.CurGlobalOrderIdx(), 3);
+  // An additional kRegularLeaf frame is added into kLast
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+
+  const int first_overlay_idx = 6;
+  EXPECT_EQ(type_list[first_overlay_idx], GopFrameType::kOverlay);
+  for (; coding_idx <= first_overlay_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+
+  EXPECT_EQ(ref_manager.GetRefFrameCount(), 5);
+  EXPECT_EQ(ref_manager.CurGlobalOrderIdx(), 4);
+  // After the kOverlay, the kRegularArf should be moved from
+  // kForward to kBackward due to the cur_global_order_idx_ update
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 0);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+}
+
+void TestRefFrameManagerPriority(const RefFrameManager &ref_manager,
+                                 RefUpdateType type) {
+  int ref_count = ref_manager.GetRefFrameCountByType(type);
+  int prev_global_order_idx = ref_manager.CurGlobalOrderIdx();
+  // The lower the priority is, the closer the gop_frame.global_order_idx should
+  // be with cur_global_order_idx_
+  for (int priority = 0; priority < ref_count; ++priority) {
+    GopFrame gop_frame = ref_manager.GetRefFrameByPriority(type, priority);
+    EXPECT_EQ(gop_frame.is_valid, true);
+    if (type == RefUpdateType::kForward) {
+      EXPECT_GE(gop_frame.global_order_idx, prev_global_order_idx);
+    } else {
+      EXPECT_LE(gop_frame.global_order_idx, prev_global_order_idx);
+    }
+    prev_global_order_idx = gop_frame.global_order_idx;
+  }
+  GopFrame gop_frame =
+      ref_manager.GetRefFrameByPriority(RefUpdateType::kForward, ref_count);
+  EXPECT_EQ(gop_frame.is_valid, false);
+}
+
+TEST(RefFrameManagerTest, GetRefFrameByPriority) {
+  const std::vector<int> order_idx_list = { 0, 4, 2, 1, 2, 3, 4 };
+  const std::vector<GopFrameType> type_list = {
+    GopFrameType::kRegularKey,      GopFrameType::kRegularArf,
+    GopFrameType::kIntermediateArf, GopFrameType::kRegularLeaf,
+    GopFrameType::kShowExisting,    GopFrameType::kRegularLeaf,
+    GopFrameType::kOverlay
+  };
+  RefFrameManager ref_manager(kRefFrameTableSize);
+  int coding_idx = 0;
+  const int first_leaf_idx = 3;
+  EXPECT_EQ(type_list[first_leaf_idx], GopFrameType::kRegularLeaf);
+  // update reference frame until we see the first kRegularLeaf frame
+  for (; coding_idx <= first_leaf_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 2);
+  TestRefFrameManagerPriority(ref_manager, RefUpdateType::kForward);
+
+  const int first_overlay_idx = 6;
+  EXPECT_EQ(type_list[first_overlay_idx], GopFrameType::kOverlay);
+  for (; coding_idx <= first_overlay_idx; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 2);
+  TestRefFrameManagerPriority(ref_manager, RefUpdateType::kBackward);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 3);
+  TestRefFrameManagerPriority(ref_manager, RefUpdateType::kLast);
+}
+
+TEST(RefFrameManagerTest, GetRefFrameListByPriority) {
+  const std::vector<int> order_idx_list = { 0, 4, 2, 1 };
+  const int frame_count = static_cast<int>(order_idx_list.size());
+  const std::vector<GopFrameType> type_list = { GopFrameType::kRegularKey,
+                                                GopFrameType::kRegularArf,
+                                                GopFrameType::kIntermediateArf,
+                                                GopFrameType::kRegularLeaf };
+  RefFrameManager ref_manager(kRefFrameTableSize);
+  for (int coding_idx = 0; coding_idx < frame_count; ++coding_idx) {
+    GopFrame gop_frame = GopFrameBasic(
+        0, 0, coding_idx, order_idx_list[coding_idx], 0, type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+  EXPECT_EQ(ref_manager.GetRefFrameCount(), frame_count);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kForward), 2);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kBackward), 1);
+  EXPECT_EQ(ref_manager.GetRefFrameCountByType(RefUpdateType::kLast), 1);
+  std::vector<ReferenceFrame> ref_frame_list =
+      ref_manager.GetRefFrameListByPriority();
+  EXPECT_EQ(ref_frame_list.size(), order_idx_list.size());
+  std::vector<int> expected_global_order_idx = { 2, 0, 1, 4 };
+  std::vector<ReferenceName> expected_names = { ReferenceName::kBwdrefFrame,
+                                                ReferenceName::kGoldenFrame,
+                                                ReferenceName::kLastFrame,
+                                                ReferenceName::kAltref2Frame };
+  for (size_t i = 0; i < ref_frame_list.size(); ++i) {
+    ReferenceFrame &ref_frame = ref_frame_list[i];
+    GopFrame gop_frame = ref_manager.GetRefFrameByIndex(ref_frame.index);
+    EXPECT_EQ(gop_frame.global_order_idx, expected_global_order_idx[i]);
+    EXPECT_EQ(ref_frame.name, expected_names[i]);
+  }
+}
+
+TEST(RefFrameManagerTest, GetPrimaryRefFrame) {
+  const std::vector<int> order_idx_list = { 0, 4, 2, 1 };
+  const int frame_count = static_cast<int>(order_idx_list.size());
+  const std::vector<GopFrameType> type_list = { GopFrameType::kRegularKey,
+                                                GopFrameType::kRegularArf,
+                                                GopFrameType::kIntermediateArf,
+                                                GopFrameType::kRegularLeaf };
+  const std::vector<int> layer_depth_list = { 0, 2, 4, 6 };
+  RefFrameManager ref_manager(kRefFrameTableSize);
+  for (int coding_idx = 0; coding_idx < frame_count; ++coding_idx) {
+    GopFrame gop_frame =
+        GopFrameBasic(0, 0, coding_idx, order_idx_list[coding_idx],
+                      layer_depth_list[coding_idx], type_list[coding_idx]);
+    ref_manager.UpdateRefFrameTable(&gop_frame);
+  }
+
+  for (int i = 0; i < frame_count; ++i) {
+    // Test frame that share the same layer depth with a reference frame
+    int layer_depth = layer_depth_list[i];
+    // Set different frame type
+    GopFrameType type = type_list[(i + 1) % frame_count];
+    GopFrame gop_frame = GopFrameBasic(0, 0, 0, 0, layer_depth, type);
+    ReferenceFrame ref_frame = ref_manager.GetPrimaryRefFrame(gop_frame);
+    GopFrame primary_ref_frame =
+        ref_manager.GetRefFrameByIndex(ref_frame.index);
+    // The GetPrimaryRefFrame should find the ref_frame with matched layer depth
+    // because it's our first priority
+    EXPECT_EQ(primary_ref_frame.layer_depth, gop_frame.layer_depth);
+  }
+
+  const std::vector<int> mid_layer_depth_list = { 1, 3, 5 };
+  for (int i = 0; i < 3; ++i) {
+    // Test frame that share the same frame type with a reference frame
+    GopFrameType type = type_list[i];
+    // Let the frame layer_depth sit in the middle of two reference frames
+    int layer_depth = mid_layer_depth_list[i];
+    GopFrame gop_frame = GopFrameBasic(0, 0, 0, 0, layer_depth, type);
+    ReferenceFrame ref_frame = ref_manager.GetPrimaryRefFrame(gop_frame);
+    GopFrame primary_ref_frame =
+        ref_manager.GetRefFrameByIndex(ref_frame.index);
+    // The GetPrimaryRefFrame should find the ref_frame with matched frame type
+    // Here we use coding_idx to confirm that.
+    EXPECT_EQ(primary_ref_frame.coding_idx, i);
+  }
+}
+
+TEST(RateControlQModeTest, TestKeyframeDetection) {
+  FirstpassInfo firstpass_info;
+  const std::string kFirstpassStatsFile = "firstpass_stats";
+  ASSERT_NO_FATAL_FAILURE(
+      ReadFirstpassInfo(kFirstpassStatsFile, &firstpass_info));
+  EXPECT_THAT(GetKeyFrameList(firstpass_info),
+              ElementsAre(0, 30, 60, 90, 120, 150, 180, 210, 240));
+}
+
+TEST(RateControlQModeTest, DISABLED_TestGopIntervals) {
+  FirstpassInfo firstpass_info;
+  ASSERT_NO_FATAL_FAILURE(
+      ReadFirstpassInfo("firstpass_stats", &firstpass_info));
+  AV1RateControlQMode rc;
+  RateControlParam rc_param;
+  rc_param.frame_height = 288;
+  rc_param.frame_width = 352;
+  rc_param.max_gop_show_frame_count = 32;
+  rc_param.min_gop_show_frame_count = 4;
+  rc.SetRcParam(rc_param);
+  GopStructList gop_list = rc.DetermineGopInfo(firstpass_info);
+  std::vector<int> gop_interval_list;
+  std::transform(gop_list.begin(), gop_list.end(),
+                 std::back_inserter(gop_interval_list),
+                 [](GopStruct const &x) { return x.show_frame_count; });
+  EXPECT_THAT(gop_interval_list,
+              ElementsAre(21, 9, 30, 30, 30, 21, 9, 30, 12, 16, 2, 30, 10));
+}
+
+// MockRateControlQMode is provided for the use of clients of libaom, but it's
+// not expected that it will be used in any real libaom tests.
+// This simple "toy" test exists solely to verify the integration of gmock into
+// the aom build.
+TEST(RateControlQModeTest, TestMock) {
+  MockRateControlQMode mock_rc;
+  EXPECT_CALL(mock_rc,
+              DetermineGopInfo(Field(&FirstpassInfo::num_mbs_16x16, 1000)))
+      .WillOnce(Return(GopStructList{ { 6, 0, 0, {} }, { 4, 0, 0, {} } }));
+  FirstpassInfo firstpass_info = {};
+  firstpass_info.num_mbs_16x16 = 1000;
+  EXPECT_THAT(mock_rc.DetermineGopInfo(firstpass_info),
+              ElementsAre(Field(&GopStruct::show_frame_count, 6),
+                          Field(&GopStruct::show_frame_count, 4)));
+}
+
+}  // namespace aom
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  std::srand(0);
+  return RUN_ALL_TESTS();
+}
diff --git a/media/libaom/src/test/ratectrl_rtc_test.cc b/media/libaom/src/test/ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..13b444c46d
--- /dev/null
+++ b/media/libaom/src/test/ratectrl_rtc_test.cc
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/ratectrl_rtc.h"
+
+#include <memory>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+constexpr size_t kNumFrames = 250;
+
+constexpr int kTemporalId[4] = { 0, 2, 1, 2 };
+
+// Parameter: aq mode: 0 and 3
+class RcInterfaceTest : public ::libaom_test::EncoderTest,
+                        public ::libaom_test::CodecTestWithParam<int> {
+ public:
+  RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        encoder_exit_(false), layer_frame_cnt_(0) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+    memset(&layer_id_, 0, sizeof(layer_id_));
+  }
+
+  ~RcInterfaceTest() override {}
+
+ protected:
+  void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
+
+  int GetNumSpatialLayers() override { return rc_cfg_.ss_number_layers; }
+
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    const int use_svc =
+        rc_cfg_.ss_number_layers > 1 || rc_cfg_.ts_number_layers > 1;
+    encoder->Control(AV1E_SET_RTC_EXTERNAL_RC, 1);
+    if (video->frame() == 0 && layer_frame_cnt_ == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 7);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+    }
+    // SVC specific settings
+    if (use_svc) {
+      frame_params_.spatial_layer_id =
+          layer_frame_cnt_ % rc_cfg_.ss_number_layers;
+      frame_params_.temporal_layer_id = kTemporalId[video->frame() % 4];
+      layer_id_.spatial_layer_id = frame_params_.spatial_layer_id;
+      layer_id_.temporal_layer_id = frame_params_.temporal_layer_id;
+      encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+    }
+    frame_params_.frame_type = layer_frame_cnt_ % key_interval_ == 0
+                                   ? aom::kKeyFrame
+                                   : aom::kInterFrame;
+    if (!use_svc && frame_params_.frame_type == aom::kInterFrame) {
+      // Disable golden frame update.
+      frame_flags_ |= AOM_EFLAG_NO_UPD_GF;
+      frame_flags_ |= AOM_EFLAG_NO_UPD_ARF;
+    }
+    encoder_exit_ = video->frame() == kNumFrames;
+  }
+
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    layer_frame_cnt_++;
+    int qp;
+    encoder->Control(AOME_GET_LAST_QUANTIZER, &qp);
+    rc_api_->ComputeQP(frame_params_);
+    ASSERT_EQ(rc_api_->GetQP(), qp);
+  }
+
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    if (layer_id_.spatial_layer_id == 0)
+      rc_api_->PostEncodeUpdate(pkt->data.frame.sz - 2);
+    else
+      rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
+    (void)img1;
+    (void)img2;
+  }
+
+  void RunOneLayer() {
+    SetConfig();
+    rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0,
+                                        kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvc() {
+    SetConfigSvc();
+    rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0,
+                                        kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig() {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 52;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1000;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 1;
+    rc_cfg_.ts_number_layers = 1;
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 1;
+    rc_cfg_.layer_target_bitrate[0] = 1000;
+    rc_cfg_.max_quantizers[0] = 52;
+    rc_cfg_.min_quantizers[0] = 2;
+    rc_cfg_.aq_mode = aq_mode_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  void SetConfigSvc() {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 52;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1000;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 3;
+    rc_cfg_.ts_number_layers = 3;
+    rc_cfg_.aq_mode = aq_mode_;
+
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 4;
+    rc_cfg_.scaling_factor_num[1] = 2;
+    rc_cfg_.scaling_factor_den[1] = 4;
+    rc_cfg_.scaling_factor_num[2] = 4;
+    rc_cfg_.scaling_factor_den[2] = 4;
+
+    rc_cfg_.ts_rate_decimator[0] = 4;
+    rc_cfg_.ts_rate_decimator[1] = 2;
+    rc_cfg_.ts_rate_decimator[2] = 1;
+
+    rc_cfg_.layer_target_bitrate[0] = 100;
+    rc_cfg_.layer_target_bitrate[1] = 140;
+    rc_cfg_.layer_target_bitrate[2] = 200;
+    rc_cfg_.layer_target_bitrate[3] = 250;
+    rc_cfg_.layer_target_bitrate[4] = 350;
+    rc_cfg_.layer_target_bitrate[5] = 500;
+    rc_cfg_.layer_target_bitrate[6] = 450;
+    rc_cfg_.layer_target_bitrate[7] = 630;
+    rc_cfg_.layer_target_bitrate[8] = 900;
+
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
+      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
+        const int i = sl * rc_cfg_.ts_number_layers + tl;
+        rc_cfg_.max_quantizers[i] = 56;
+        rc_cfg_.min_quantizers[i] = 2;
+      }
+    }
+
+    // Encoder settings for ground truth.
+    svc_params_.number_spatial_layers = 3;
+    svc_params_.number_temporal_layers = 3;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    svc_params_.scaling_factor_num[0] = 72;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 144;
+    svc_params_.scaling_factor_den[1] = 288;
+    svc_params_.scaling_factor_num[2] = 288;
+    svc_params_.scaling_factor_den[2] = 288;
+    for (int i = 0; i < AOM_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 56;
+      svc_params_.min_quantizers[i] = 2;
+    }
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    // 3 temporal layers
+    svc_params_.framerate_factor[0] = 4;
+    svc_params_.framerate_factor[1] = 2;
+    svc_params_.framerate_factor[2] = 1;
+
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.g_threads = 1;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_undershoot_pct = 50;
+
+    svc_params_.layer_target_bitrate[0] = 100;
+    svc_params_.layer_target_bitrate[1] = 140;
+    svc_params_.layer_target_bitrate[2] = 200;
+    svc_params_.layer_target_bitrate[3] = 250;
+    svc_params_.layer_target_bitrate[4] = 350;
+    svc_params_.layer_target_bitrate[5] = 500;
+    svc_params_.layer_target_bitrate[6] = 450;
+    svc_params_.layer_target_bitrate[7] = 630;
+    svc_params_.layer_target_bitrate[8] = 900;
+  }
+
+  std::unique_ptr<aom::AV1RateControlRTC> rc_api_;
+  aom::AV1RateControlRtcConfig rc_cfg_;
+  int aq_mode_;
+  int key_interval_;
+  aom::AV1FrameParamsRTC frame_params_;
+  bool encoder_exit_;
+  aom_svc_params_t svc_params_;
+  aom_svc_layer_id_t layer_id_;
+  int layer_frame_cnt_;
+};
+
+TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(RcInterfaceTest, Svc) { RunSvc(); }
+
+AV1_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3));
+
+}  // namespace
diff --git a/media/libaom/src/test/ratectrl_test.cc b/media/libaom/src/test/ratectrl_test.cc
new file mode 100644
index 0000000000..d951b1197f
--- /dev/null
+++ b/media/libaom/src/test/ratectrl_test.cc
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RatectrlTest, QModeGetQIndexTest) {
+  int base_q_index = 36;
+  int gf_update_type = INTNL_ARF_UPDATE;
+  int gf_pyramid_level = 1;
+  int arf_q = 100;
+  int q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                       gf_pyramid_level, arf_q);
+  EXPECT_EQ(q_index, arf_q);
+
+  gf_update_type = INTNL_ARF_UPDATE;
+  gf_pyramid_level = 3;
+  q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                   gf_pyramid_level, arf_q);
+  EXPECT_LT(q_index, arf_q);
+
+  gf_update_type = LF_UPDATE;
+  q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                   gf_pyramid_level, arf_q);
+  EXPECT_EQ(q_index, base_q_index);
+}
+}  // namespace
diff --git a/media/libaom/src/test/rd_test.cc b/media/libaom/src/test/rd_test.cc
new file mode 100644
index 0000000000..0c481fcbb6
--- /dev/null
+++ b/media/libaom/src/test/rd_test.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <vector>
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/rd.h"
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RdTest, GetDeltaqOffsetValueTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 4;
+  int q_index = 29;
+  int dc_q_step =
+      av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+  EXPECT_EQ(dc_q_step, 32);
+
+  int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+  EXPECT_EQ(ref_new_dc_q_step, 16);
+
+  int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+  int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+                                       static_cast<aom_bit_depth_t>(bit_depth));
+
+  EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step);
+}
+
+TEST(RdTest, GetDeltaqOffsetValueTest2) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 1.0 / 4.0;
+  int q_index = 29;
+  int dc_q_step =
+      av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+  EXPECT_EQ(dc_q_step, 32);
+
+  int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+  EXPECT_EQ(ref_new_dc_q_step, 64);
+
+  int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+  int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+                                       static_cast<aom_bit_depth_t>(bit_depth));
+
+  EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step);
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 0.000000001;
+  std::vector<int> q_index_ls = { 254, 255 };
+  for (auto q_index : q_index_ls) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(q_index + delta_q, 255);
+  }
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest2) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 100;
+  std::vector<int> q_index_ls = { 1, 0 };
+  for (auto q_index : q_index_ls) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(q_index + delta_q, 0);
+  }
+}
+
+TEST(RdTest, GetDeltaqOffsetUnitaryTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 1;
+  for (int q_index = 0; q_index < 255; ++q_index) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(delta_q, 0);
+  }
+}
+
+}  // namespace
diff --git a/media/libaom/src/test/reconinter_test.cc b/media/libaom/src/test/reconinter_test.cc
index 51bec0eab7..ec97db7807 100644
--- a/media/libaom/src/test/reconinter_test.cc
+++ b/media/libaom/src/test/reconinter_test.cc
@@ -21,7 +21,6 @@
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -51,7 +50,7 @@ class BuildCompDiffwtdMaskTest
  public:
   virtual ~BuildCompDiffwtdMaskTest() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
                const DIFFWTD_MASK_TYPE type);
 
@@ -79,7 +78,7 @@ class BuildCompDiffwtdMaskD16Test
     : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
  public:
   ~BuildCompDiffwtdMaskD16Test() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
@@ -88,6 +87,7 @@ class BuildCompDiffwtdMaskD16Test
                     DIFFWTD_MASK_TYPE mask_type);
   libaom_test::ACMRandom rnd_;
 };  // class BuildCompDiffwtdMaskD16Test
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
 
 void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
     buildcompdiffwtdmaskd16_func test_impl) {
diff --git a/media/libaom/src/test/register_state_check.h b/media/libaom/src/test/register_state_check.h
index d404621dd7..3c244c265c 100644
--- a/media/libaom/src/test/register_state_check.h
+++ b/media/libaom/src/test/register_state_check.h
@@ -18,18 +18,13 @@
 
 #include "aom/aom_integer.h"
 
-// ASM_REGISTER_STATE_CHECK(asm_function)
-//   Minimally validates the environment pre & post function execution. This
-//   variant should be used with assembly functions which are not expected to
-//   fully restore the system state. See platform implementations of
-//   RegisterStateCheck for details.
-//
-// API_REGISTER_STATE_CHECK(api_function)
-//   Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any
-//   additional checks to ensure the environment is in a consistent state pre &
-//   post function execution. This variant should be used with API functions.
-//   See platform implementations of RegisterStateCheckXXX for details.
-//
+// API_REGISTER_STATE_CHECK(function)
+//   Validates the environment pre & post function execution to ensure the
+//   environment is in a consistent state. This should be used with API
+//   function sand assembly functions which are not expected to fully restore
+//   the system state.
+//   See platform implementations of RegisterStateCheck and
+//   RegisterStateCheckMMX for details.
 
 #if defined(_WIN64) && ARCH_X86_64
 
@@ -56,7 +51,7 @@ class RegisterStateCheck {
  private:
   static bool StoreRegisters(CONTEXT *const context) {
     const HANDLE this_thread = GetCurrentThread();
-    EXPECT_TRUE(this_thread != NULL);
+    EXPECT_NE(this_thread, nullptr);
     context->ContextFlags = CONTEXT_FLOATING_POINT;
     const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
     EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
@@ -81,13 +76,6 @@ class RegisterStateCheck {
   bool initialized_;
   CONTEXT pre_context_;
 };
-
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libaom_test::RegisterStateCheck reg_check; \
-    statement;                                 \
-  } while (false)
-
 }  // namespace libaom_test
 
 #else
@@ -95,15 +83,11 @@ class RegisterStateCheck {
 namespace libaom_test {
 
 class RegisterStateCheck {};
-#define ASM_REGISTER_STATE_CHECK(statement) statement
-
 }  // namespace libaom_test
 
 #endif  // _WIN64 && ARCH_X86_64
 
-#if ARCH_X86 || ARCH_X86_64
-#if defined(__GNUC__)
-
+#if (ARCH_X86 || ARCH_X86_64) && defined(__GNUC__)
 namespace libaom_test {
 
 // Checks the FPU tag word pre/post execution to ensure emms has been called.
@@ -129,20 +113,23 @@ class RegisterStateCheckMMX {
 
   uint16_t pre_fpu_env_[14];
 };
+}  // namespace libaom_test
 
-#define API_REGISTER_STATE_CHECK(statement)       \
-  do {                                            \
-    libaom_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);          \
-  } while (false)
+#else
+namespace libaom_test {
 
+class RegisterStateCheckMMX {};
 }  // namespace libaom_test
 
-#endif  // __GNUC__
-#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // (ARCH_X86 || ARCH_X86_64) && defined(__GNUC__)
 
-#ifndef API_REGISTER_STATE_CHECK
-#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
-#endif
+#define API_REGISTER_STATE_CHECK(statement)           \
+  do {                                                \
+    libaom_test::RegisterStateCheck reg_check;        \
+    libaom_test::RegisterStateCheckMMX reg_check_mmx; \
+    statement;                                        \
+    (void)reg_check_mmx;                              \
+    (void)reg_check;                                  \
+  } while (false)
 
 #endif  // AOM_TEST_REGISTER_STATE_CHECK_H_
diff --git a/media/libaom/src/test/resize_test.cc b/media/libaom/src/test/resize_test.cc
index bcf6794d07..141cdc94fa 100644
--- a/media/libaom/src/test/resize_test.cc
+++ b/media/libaom/src/test/resize_test.cc
@@ -13,12 +13,14 @@
 #include <vector>
 #include "aom_dsp/aom_dsp_common.h"
 #include "common/tools_common.h"
+#include "av1/encoder/encoder.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 
 // Enable(1) or Disable(0) writing of the compressed bitstream.
 #define WRITE_COMPRESSED_STREAM 0
@@ -199,9 +201,17 @@ class ResizeTest
 
   virtual ~ResizeTest() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      if (GET_PARAM(1) == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_AQ_MODE, 3);
+        encoder->Control(AOME_SET_CPUUSED, 5);
+        encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      }
+    }
   }
 
   virtual void DecompressedFrameHook(const aom_image_t &img,
@@ -239,6 +249,7 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 const unsigned int kStepDownFrame = 3;
 const unsigned int kStepUpFrame = 6;
 
@@ -298,7 +309,7 @@ class ResizeInternalTestLarge : public ResizeTest {
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 4.1);
   }
 
 #if WRITE_COMPRESSED_STREAM
@@ -363,22 +374,50 @@ TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
+AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood));
+#endif
+
 class ResizeRealtimeTest
     : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
  protected:
-  ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
+  ResizeRealtimeTest()
+      : EncoderTest(GET_PARAM(0)), set_scale_mode_(false),
+        set_scale_mode2_(false) {}
   virtual ~ResizeRealtimeTest() {}
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
                                   libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_AQ_MODE, 3);
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
     }
+    if (set_scale_mode_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else if (video->frame() > 40)
+        mode = { AOME_NORMAL, AOME_NORMAL };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    } else if (set_scale_mode2_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else if (video->frame() > 40)
+        mode = { AOME_THREEFOUR, AOME_THREEFOUR };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    }
 
-    if (change_bitrate_ && video->frame() == 120) {
+    if (change_bitrate_ && video->frame() == frame_change_bitrate_) {
       change_bitrate_ = false;
       cfg_.rc_target_bitrate = 500;
       encoder->Config(&cfg_);
@@ -386,8 +425,7 @@ class ResizeRealtimeTest
   }
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
@@ -426,22 +464,135 @@ class ResizeRealtimeTest
     // the width and height of the frame are swapped
     cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
         AOMMAX(kInitialWidth, kInitialHeight);
+    if (set_scale_mode_ || set_scale_mode2_) {
+      cfg_.rc_dropframe_thresh = 0;
+      cfg_.g_forced_max_frame_width = 1280;
+      cfg_.g_forced_max_frame_height = 1280;
+    }
   }
 
   std::vector<FrameInfo> frame_info_list_;
   int set_cpu_used_;
   bool change_bitrate_;
+  unsigned int frame_change_bitrate_;
   double mismatch_psnr_;
   int mismatch_nframes_;
+  bool set_scale_mode_;
+  bool set_scale_mode2_;
 };
 
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to originsal.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 1280 >> 1;
+    unsigned int expected_h = 720 >> 1;
+    if (frame > 40) {
+      expected_w = 1280;
+      expected_h = 720;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 1280 >> 2;
+      expected_h = 720 >> 2;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to originsal.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1QVGA) {
+  ::libaom_test::I420VideoSource video("desktop1.320_180.yuv", 320, 180, 30, 1,
+                                       0, 80);
+  cfg_.g_w = 320;
+  cfg_.g_h = 180;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 320 >> 1;
+    unsigned int expected_h = 180 >> 1;
+    if (frame > 40) {
+      expected_w = 320;
+      expected_h = 180;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 320 >> 2;
+      expected_h = 180 >> 2;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/4, then 1/2, and then up to 3/4.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode2) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = true;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const auto frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w = 1280 >> 2;
+    unsigned int expected_h = 720 >> 2;
+    if (frame > 40) {
+      expected_w = (3 * 1280) >> 2;
+      expected_h = (3 * 720) >> 2;
+    } else if (frame > 20 && frame <= 40) {
+      expected_w = 1280 >> 1;
+      expected_h = 720 >> 1;
+    }
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
   video.flag_codec_ = 1;
-  DefaultConfig();
   change_bitrate_ = false;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
+  DefaultConfig();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // Check we decoded the same number of frames as we attempted to encode
@@ -465,35 +616,44 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
-TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) {
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-  DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
   change_bitrate_ = false;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
-  int resize_count = 0;
+  int resize_down_count = 0;
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     if (info->w != last_w || info->h != last_h) {
       // Verify that resize down occurs.
-      ASSERT_LT(info->w, last_w);
-      ASSERT_LT(info->h, last_h);
+      if (info->w < last_w && info->h < last_h) {
+        resize_down_count++;
+      }
       last_w = info->w;
       last_h = info->h;
-      resize_count++;
     }
   }
 
 #if CONFIG_AV1_DECODER
-  // Verify that we get 1 resize down event in this test.
-  ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+  // Verify that we get at lease 1 resize down event in this test.
+  ASSERT_GE(resize_down_count, 1) << "Resizing should occur.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
   printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
@@ -501,47 +661,60 @@ TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) {
 }
 
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
-// Start at low target bitrate, raise the bitrate in the middle of the clip,
-// scaling-up should occur after bitrate changed.
-TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) {
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 359);
-  DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
+// Start at low target bitrate, raise the bitrate in the middle of the clip
+// (at frame# = frame_change_bitrate_), scaling-up should occur after bitrate
+// is increased.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
   change_bitrate_ = true;
+  frame_change_bitrate_ = 120;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
+  DefaultConfig();
   // Disable dropped frames.
   cfg_.rc_dropframe_thresh = 0;
   // Starting bitrate low.
-  cfg_.rc_target_bitrate = 80;
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
-  int resize_count = 0;
+  unsigned int frame_number = 0;
+  int resize_down_count = 0;
+  int resize_up_count = 0;
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     if (info->w != last_w || info->h != last_h) {
-      resize_count++;
-      if (resize_count == 1) {
-        // Verify that resize down occurs.
+      if (frame_number < frame_change_bitrate_) {
+        // Verify that resize down occurs, before bitrate is increased.
         ASSERT_LT(info->w, last_w);
         ASSERT_LT(info->h, last_h);
-      } else if (resize_count == 2) {
-        // Verify that resize up occurs.
+        resize_down_count++;
+      } else {
+        // Verify that resize up occurs, after bitrate is increased.
         ASSERT_GT(info->w, last_w);
         ASSERT_GT(info->h, last_h);
+        resize_up_count++;
       }
       last_w = info->w;
       last_h = info->h;
     }
+    frame_number++;
   }
 
 #if CONFIG_AV1_DECODER
-  // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+  // Verify that we get at least 2 resize events in this test.
+  ASSERT_GE(resize_up_count, 1) << "Resizing up should occur at lease once.";
+  ASSERT_GE(resize_down_count, 1)
+      << "Resizing down should occur at lease once.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
   printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
@@ -632,13 +805,68 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
   }
 }
 
-AV1_INSTANTIATE_TEST_CASE(ResizeTest,
-                          ::testing::Values(::libaom_test::kRealTime));
-AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge,
-                          ::testing::Values(::libaom_test::kOnePassGood));
-AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 9));
-AV1_INSTANTIATE_TEST_CASE(ResizeCspTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+#if !CONFIG_REALTIME_ONLY
+// This class is used to check if there are any fatal
+// failures while encoding with resize-mode > 0
+class ResizeModeTestLarge
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ResizeModeTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        resize_mode_(GET_PARAM(2)), resize_denominator_(GET_PARAM(3)),
+        resize_kf_denominator_(GET_PARAM(4)), cpu_used_(GET_PARAM(5)) {}
+  virtual ~ResizeModeTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.rc_resize_mode = resize_mode_;
+    cfg_.rc_resize_denominator = resize_denominator_;
+    cfg_.rc_resize_kf_denominator = resize_kf_denominator_;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int resize_mode_;
+  int resize_denominator_;
+  int resize_kf_denominator_;
+  int cpu_used_;
+};
+
+TEST_P(ResizeModeTestLarge, ResizeModeTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge);
+AV1_INSTANTIATE_TEST_SUITE(ResizeModeTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(1, 2), ::testing::Values(8, 12),
+                           ::testing::Values(10, 14), ::testing::Values(3, 6));
+#endif  // !CONFIG_REALTIME_ONLY
+
+AV1_INSTANTIATE_TEST_SUITE(ResizeTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(6, 10));
+AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+
 }  // namespace
diff --git a/media/libaom/src/test/rt_end_to_end_test.cc b/media/libaom/src/test/rt_end_to_end_test.cc
index f14d124741..a6f39c13b4 100644
--- a/media/libaom/src/test/rt_end_to_end_test.cc
+++ b/media/libaom/src/test/rt_end_to_end_test.cc
@@ -32,20 +32,33 @@ const int kBitrate = 500;
 std::unordered_map<std::string,
                    std::unordered_map<int, std::unordered_map<int, double>>>
     kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
-                         { { 5, { { 0, 35.4 }, { 3, 36.4 } } },
+                         { { 5, { { 0, 35.4 }, { 3, 36.3 } } },
                            { 6, { { 0, 35.3 }, { 3, 36.2 } } },
                            { 7, { { 0, 34.9 }, { 3, 35.8 } } },
-                           { 8, { { 0, 35.0 }, { 3, 35.8 } } } } },
+                           { 8, { { 0, 35.0 }, { 3, 35.8 } } },
+                           { 9, { { 0, 34.9 }, { 3, 35.5 } } },
+                           { 10, { { 0, 34.7 }, { 3, 35.3 } } } } },
                        { "paris_352_288_30.y4m",
                          { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
-                           { 6, { { 0, 36.1 }, { 3, 36.6 } } },
+                           { 6, { { 0, 36.1 }, { 3, 36.5 } } },
                            { 7, { { 0, 35.5 }, { 3, 36.0 } } },
-                           { 8, { { 0, 36.0 }, { 3, 36.5 } } } } },
+                           { 8, { { 0, 36.0 }, { 3, 36.5 } } },
+                           { 9, { { 0, 35.5 }, { 3, 36.0 } } },
+                           { 10, { { 0, 35.3 }, { 3, 35.9 } } } } },
                        { "niklas_1280_720_30.y4m",
-                         { { 5, { { 0, 34.6 }, { 3, 34.6 } } },
+                         { { 5, { { 0, 34.4 }, { 3, 34.30 } } },
                            { 6, { { 0, 34.2 }, { 3, 34.2 } } },
-                           { 7, { { 0, 33.7 }, { 3, 33.6 } } },
-                           { 8, { { 0, 33.6 }, { 3, 33.4 } } } } } };
+                           { 7, { { 0, 33.5 }, { 3, 33.5 } } },
+                           { 8, { { 0, 33.48 }, { 3, 33.48 } } },
+                           { 9, { { 0, 33.4 }, { 3, 33.4 } } },
+                           { 10, { { 0, 33.2 }, { 3, 33.2 } } } } },
+                       { "hantro_collage_w352h288_nv12.yuv",
+                         { { 5, { { 0, 34.4 }, { 3, 34.30 } } },
+                           { 6, { { 0, 34.2 }, { 3, 34.2 } } },
+                           { 7, { { 0, 33.6 }, { 3, 33.6 } } },
+                           { 8, { { 0, 33.48 }, { 3, 33.48 } } },
+                           { 9, { { 0, 33.4 }, { 3, 33.4 } } },
+                           { 10, { { 0, 33.2 }, { 3, 33.2 } } } } } };
 
 typedef struct {
   const char *filename;
@@ -66,6 +79,7 @@ const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
   { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
   { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "hantro_collage_w352h288_nv12.yuv", 8, AOM_IMG_FMT_NV12, AOM_BITS_8, 0 },
 };
 
 // Params: test video, speed, aq mode, threads, tile columns.
@@ -83,14 +97,14 @@ class RTEndToEndTest
   virtual ~RTEndToEndTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kRealTime);
+    InitializeConfig(::libaom_test::kRealTime);
 
-    cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_threads = threads_;
     cfg_.rc_buf_sz = 1000;
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
+    cfg_.kf_max_dist = 9999;
+    cfg_.kf_min_dist = 9999;
   }
 
   virtual void BeginPassHook(unsigned int) {
@@ -106,12 +120,23 @@ class RTEndToEndTest
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
       encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
       encoder->Control(AV1E_SET_ROW_MT, 1);
+      encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+      encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
     }
   }
 
@@ -134,9 +159,14 @@ class RTEndToEndTest
     if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
     std::unique_ptr<libaom_test::VideoSource> video;
-    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
-                                                kFrames));
-    ASSERT_TRUE(video.get() != NULL);
+    if (is_extension_y4m(test_video_param_.filename))
+      video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                  kFrames));
+    else
+      video.reset(new libaom_test::YUVVideoSource(test_video_param_.filename,
+                                                  test_video_param_.fmt, 352,
+                                                  288, 30, 1, 0, kFrames));
+    ASSERT_NE(video, nullptr);
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
     const double psnr = GetAveragePsnr();
@@ -161,14 +191,14 @@ TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
 TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
-                          ::testing::Range(5, 9),
-                          ::testing::Values<unsigned int>(0, 3),
-                          ::testing::Values(1), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(5, 11),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Values(1), ::testing::Values(1));
 
-AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestThreaded,
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::Range(5, 9),
-                          ::testing::Values<unsigned int>(0, 3),
-                          ::testing::Range(2, 5), ::testing::Range(2, 5));
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTestThreaded,
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Range(5, 11),
+                           ::testing::Values<unsigned int>(0, 3),
+                           ::testing::Range(2, 5), ::testing::Range(2, 5));
 }  // namespace
diff --git a/media/libaom/src/test/run_encodes.sh b/media/libaom/src/test/run_encodes.sh
index 2096d8b158..2096d8b158 100644..100755
--- a/media/libaom/src/test/run_encodes.sh
+++ b/media/libaom/src/test/run_encodes.sh
diff --git a/media/libaom/src/test/sad_test.cc b/media/libaom/src/test/sad_test.cc
index 0bdbf37452..0d0ee88ea6 100644
--- a/media/libaom/src/test/sad_test.cc
+++ b/media/libaom/src/test/sad_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "aom/aom_codec.h"
@@ -31,6 +30,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride);
 typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
+typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride);
+typedef std::tuple<int, int, SadSkipMxNFunc, int> SadSkipMxNParam;
+
 typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred);
@@ -60,6 +63,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
                              uint32_t *sad_array);
 typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
+typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_ptr[], int ref_stride,
+                                 uint32_t *sad_array);
+typedef std::tuple<int, int, SadSkipMxNx4Func, int> SadSkipMxNx4Param;
+
 typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_ptr[], int ref_stride,
                                 const uint8_t *second_pred,
@@ -74,30 +82,40 @@ class SADTestBase : public ::testing::Test {
   SADTestBase(int width, int height, int bit_depth)
       : width_(width), height_(height), bd_(bit_depth) {}
 
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     source_data8_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize));
+    ASSERT_NE(source_data8_, nullptr);
     reference_data8_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kDataBufferSize));
+    ASSERT_NE(reference_data8_, nullptr);
     second_pred8_ =
         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(second_pred8_, nullptr);
     comp_pred8_ =
         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(comp_pred8_, nullptr);
     comp_pred8_test_ =
         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    ASSERT_NE(comp_pred8_test_, nullptr);
     source_data16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
+    ASSERT_NE(source_data16_, nullptr);
     reference_data16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+    ASSERT_NE(reference_data16_, nullptr);
     second_pred16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(second_pred16_, nullptr);
     comp_pred16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_, nullptr);
     comp_pred16_test_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_test_, nullptr);
   }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     aom_free(source_data8_);
     source_data8_ = NULL;
     aom_free(reference_data8_);
@@ -120,7 +138,7 @@ class SADTestBase : public ::testing::Test {
     comp_pred16_test_ = NULL;
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   // Handle up to 4 128x128 blocks, with stride up to 256
@@ -182,6 +200,31 @@ class SADTestBase : public ::testing::Test {
     return sad;
   }
 
+  // Sum of Absolute Differences Skip rows. Given two blocks,
+  // calculate the absolute  difference between two pixels in the same
+  // relative location every other row; accumulate and double the result at the
+  // end.
+  unsigned int ReferenceSADSkip(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    for (int h = 0; h < height_; h += 2) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+        }
+      }
+    }
+    return sad * 2;
+  }
+
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
@@ -329,7 +372,7 @@ class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
         source_data_, source_stride_, references, reference_stride_, results));
   }
 
@@ -343,8 +386,53 @@ class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
+};
+
+class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                      public SADTestBase {
+ public:
+  SADSkipx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSADSkip(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
 };
 
+#if !CONFIG_REALTIME_ONLY
 class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
                      public SADTestBase {
  public:
@@ -355,7 +443,7 @@ class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
                                           references, reference_stride_,
                                           second_pred_, results));
   }
@@ -380,6 +468,7 @@ class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
     }
   }
 };
+#endif  // !CONFIG_REALTIME_ONLY
 
 class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
                 public SADTestBase {
@@ -391,7 +480,7 @@ class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
     return ret;
   }
@@ -412,6 +501,37 @@ class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
   }
 };
 
+class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
+                    public SADTestBase {
+ public:
+  SADSkipTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADSkip(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void SpeedSAD() {
+    int test_count = 20000000;
+    while (test_count > 0) {
+      SAD(0);
+      test_count -= 1;
+    }
+  }
+};
+
 class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
                    public SADTestBase {
  public:
@@ -422,7 +542,7 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 second_pred_));
     return ret;
@@ -447,7 +567,7 @@ class DistWtdCompAvgTest
   void dist_wtd_comp_avg(int block_idx) {
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
                                           height_, reference, reference_stride_,
                                           &jcp_param_));
   }
@@ -455,8 +575,8 @@ class DistWtdCompAvgTest
   void CheckCompAvg() {
     for (int j = 0; j < 2; ++j) {
       for (int i = 0; i < 4; ++i) {
-        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
-        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
 
         ReferenceDistWtdCompAvg(0);
         dist_wtd_comp_avg(0);
@@ -480,7 +600,7 @@ class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 GET_PARAM(0), GET_PARAM(1)));
     return ret;
@@ -501,6 +621,7 @@ class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest);
 
 class DistWtdSADavgTest
     : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
@@ -513,7 +634,7 @@ class DistWtdSADavgTest
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 second_pred_, &jcp_param_));
     return ret;
@@ -522,8 +643,8 @@ class DistWtdSADavgTest
   void CheckSAD() {
     for (int j = 0; j < 2; ++j) {
       for (int i = 0; i < 4; ++i) {
-        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
-        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
 
         const unsigned int reference_sad = ReferenceDistWtdSADavg(0);
         const unsigned int exp_sad = dist_wtd_SAD_avg(0);
@@ -595,9 +716,61 @@ TEST_P(SADTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
-#define SPEED_TEST (0)
-#if SPEED_TEST
-TEST_P(SADTest, Speed) {
+TEST_P(SADTest, DISABLED_Speed) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  SpeedSAD();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 2000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, DISABLED_Speed) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -605,7 +778,6 @@ TEST_P(SADTest, Speed) {
   SpeedSAD();
   source_stride_ = tmp_stride;
 }
-#endif
 
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -854,10 +1026,101 @@ TEST_P(SADx4Test, SrcAlignedByWidth) {
   source_data_ = tmp_source_data;
 }
 
+TEST_P(SADx4Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+
+// SADSkipx4
+TEST_P(SADSkipx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 1000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(GetReference(0), reference_stride_);
+    FillRandom(GetReference(1), reference_stride_);
+    FillRandom(GetReference(2), reference_stride_);
+    FillRandom(GetReference(3), reference_stride_);
+    CheckSADs();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+
 using std::make_tuple;
 
-#if SPEED_TEST
-TEST_P(SADx4AvgTest, Speed) {
+#if !CONFIG_REALTIME_ONLY
+TEST_P(SADx4AvgTest, DISABLED_Speed) {
   int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -869,7 +1132,6 @@ TEST_P(SADx4AvgTest, Speed) {
   SpeedSAD();
   reference_stride_ = tmp_stride;
 }
-#endif
 
 TEST_P(SADx4AvgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -918,6 +1180,7 @@ TEST_P(SADx4AvgTest, UnalignedRef) {
   CheckSADs();
   reference_stride_ = tmp_stride;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 //------------------------------------------------------------------------------
 // C functions
@@ -988,6 +1251,7 @@ const SadMxNParam c_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_c, -1),
   make_tuple(16, 64, &aom_sad16x64_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1018,9 +1282,112 @@ const SadMxNParam c_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
+const SadSkipMxNParam skip_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_c, -1),
+  make_tuple(8, 4, &aom_sad_skip_8x4_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_c, -1),
+  make_tuple(4, 4, &aom_sad_skip_4x4_c, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_c, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_c, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 8),
+#endif
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 10),
+#endif
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
+
 const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
@@ -1088,6 +1455,7 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
 #endif  // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_avg_c, -1),
   make_tuple(16, 64, &aom_sad16x64_avg_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1118,6 +1486,7 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
@@ -1140,12 +1509,14 @@ const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
   make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
@@ -1169,12 +1540,14 @@ const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1),
   make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1),
   make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1),
   make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1),
   make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1),
   make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest,
@@ -1247,6 +1620,7 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_c, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_c, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1277,9 +1651,113 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
+const SadMxNx4Param skip_x4d_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_c, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_c, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 8),
+#endif
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 10),
+#endif
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_c_tests));
+
+#if !CONFIG_REALTIME_ONLY
 const SadMxNx4AvgParam x4d_avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1),
@@ -1305,11 +1783,13 @@ const SadMxNx4AvgParam x4d_avg_c_tests[] = {
   make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1),
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests));
+#endif  // !CONFIG_REALTIME_ONLY
 
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_NEON
 const SadMxNParam neon_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128_neon, -1),
   make_tuple(64, 64, &aom_sad64x64_neon, -1),
   make_tuple(32, 32, &aom_sad32x32_neon, -1),
   make_tuple(16, 16, &aom_sad16x16_neon, -1),
@@ -1326,6 +1806,57 @@ const SadMxNx4Param x4d_neon_tests[] = {
   make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+const SadSkipMxNParam skip_neon_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_neon, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_neon, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_neon, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_neon, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_neon, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_neon, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_neon, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_neon, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_neon, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_neon, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_tests));
+
+const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
@@ -1389,6 +1920,7 @@ const SadMxNParam sse2_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1419,9 +1951,94 @@ const SadMxNParam sse2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
+const SadSkipMxNParam skip_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_sse2, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_sse2, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 8),
+#endif
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 10),
+#endif
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+                         ::testing::ValuesIn(skip_sse2_tests));
+
 const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
@@ -1480,6 +2097,7 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1510,6 +2128,7 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
 #endif
+#endif  // !CONFIG_REALTIME_ONLY
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
@@ -1571,6 +2190,7 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
 #endif
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1),
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1601,9 +2221,93 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_sse2, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_sse2, -1),
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 8),
+#endif
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 10),
+#endif
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 12),
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_sse2_tests));
+
+#if !CONFIG_REALTIME_ONLY
 const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1),
@@ -1630,6 +2334,7 @@ const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest,
                          ::testing::ValuesIn(x4d_avg_sse2_tests));
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -1658,13 +2363,14 @@ const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = {
   make_tuple(32, 8, &aom_sad32xh_sse2, -1),
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(16, 64, &aom_sad16xh_sse2, -1),
   make_tuple(64, 16, &aom_sad64xh_sse2, -1),
   make_tuple(8, 32, &aom_sad8xh_sse2, -1),
   make_tuple(32, 8, &aom_sad32xh_sse2, -1),
   make_tuple(4, 16, &aom_sad4xh_sse2, -1),
   make_tuple(16, 4, &aom_sad16xh_sse2, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest,
                          ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
@@ -1694,13 +2400,14 @@ const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
   make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
   make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
@@ -1723,13 +2430,14 @@ const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = {
   make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
   make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
   make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
-
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1),
   make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1),
   make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1),
   make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1),
   make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1),
   make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest,
                          ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
@@ -1784,6 +2492,7 @@ const SadMxNParam avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10),
   make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12),
@@ -1797,9 +2506,66 @@ const SadMxNParam avx2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
+const SadSkipMxNParam skip_avx2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 12),
+#endif
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx2_tests));
+
 const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
@@ -1844,6 +2610,7 @@ const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12),
@@ -1857,20 +2624,96 @@ const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 8),
+
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 10),
+
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 12),
+#endif
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_avx2_tests));
+
 const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
-  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
   make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
   make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
   make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
-  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
   make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
+  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
+#endif
+
 #if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
@@ -1906,6 +2749,7 @@ const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
 
+#if !CONFIG_REALTIME_ONLY
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12),
@@ -1919,6 +2763,7 @@ const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12),
 #endif
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
diff --git a/media/libaom/src/test/sb_multipass_test.cc b/media/libaom/src/test/sb_multipass_test.cc
index 0ca76ab85b..8ddc0026a9 100644
--- a/media/libaom/src/test/sb_multipass_test.cc
+++ b/media/libaom/src/test/sb_multipass_test.cc
@@ -45,8 +45,7 @@ class AV1SBMultipassTest
   virtual ~AV1SBMultipassTest() { delete decoder_; }
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
+    InitializeConfig(::libaom_test::kTwoPassGood);
 
     cfg_.g_lag_in_frames = 5;
     cfg_.rc_end_usage = AOM_VBR;
@@ -147,7 +146,7 @@ class AV1SBMultipassTest
 
 TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
 
-AV1_INSTANTIATE_TEST_CASE(AV1SBMultipassTest, ::testing::Range(0, 6),
-                          ::testing::Bool());
+AV1_INSTANTIATE_TEST_SUITE(AV1SBMultipassTest, ::testing::Range(4, 6),
+                           ::testing::Bool());
 
 }  // namespace
diff --git a/media/libaom/src/test/scalability_test.cc b/media/libaom/src/test/scalability_test.cc
index b399188617..24dbef5903 100644
--- a/media/libaom/src/test/scalability_test.cc
+++ b/media/libaom/src/test/scalability_test.cc
@@ -29,8 +29,7 @@ class ScalabilityTest
   virtual ~ScalabilityTest() {}
 
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     num_spatial_layers_ = 2;
   }
 
@@ -75,7 +74,7 @@ TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); }
 
 TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); }
 
-AV1_INSTANTIATE_TEST_CASE(ScalabilityTest,
-                          ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(ScalabilityTest,
+                           ::testing::Values(::libaom_test::kRealTime));
 
 }  // namespace
diff --git a/media/libaom/src/test/screen_content_test.cc b/media/libaom/src/test/screen_content_test.cc
new file mode 100644
index 0000000000..acdee6b230
--- /dev/null
+++ b/media/libaom/src/test/screen_content_test.cc
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+namespace {
+// This class is used to validate if screen_content_tools are turned on
+// appropriately.
+class ScreenContentToolsTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ScreenContentToolsTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        rc_end_usage_(GET_PARAM(2)) {
+    is_screen_content_violated_ = true;
+    tune_content_ = AOM_CONTENT_DEFAULT;
+  }
+  virtual ~ScreenContentToolsTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.g_profile = 0;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_screen_content_tools_info sc_info;
+
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+                                    &sc_info);
+      if (sc_info.allow_screen_content_tools == 1) {
+        is_screen_content_violated_ = false;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  bool is_screen_content_violated_;
+  int tune_content_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(ScreenContentToolsTestLarge, ScreenContentToolsTest) {
+  // force screen content tools on
+  ::libaom_test::Y4mVideoSource video_nonsc("park_joy_90p_8_444.y4m", 0, 1);
+  cfg_.g_profile = 1;
+  tune_content_ = AOM_CONTENT_SCREEN;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_nonsc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed for tune_content_ = AOM_CONTENT_SCREEN";
+
+  // Don't force screen content, however as the input is screen content
+  // allow_screen_content_tools should still be turned on
+  ::libaom_test::Y4mVideoSource video_sc("desktop_credits.y4m", 0, 1);
+  cfg_.g_profile = 1;
+  is_screen_content_violated_ = true;
+  tune_content_ = AOM_CONTENT_DEFAULT;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed detection of screen content";
+
+  // TODO(anyone): Enable below test once low resolution screen content
+  // detection issues are fixed.
+  // low resolution test
+  //  ::libaom_test::Y4mVideoSource video_sc("screendata.y4m", 0, 1);
+  //  cfg_.g_profile = 0;
+  //  is_screen_content_violated_ = true;
+  //  tune_content_ = AOM_CONTENT_DEFAULT;
+  //  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  //  ASSERT_EQ(is_screen_content_violated_, false)
+  //      << "Failed detection of screen content(lowres)";
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ScreenContentToolsTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q));
+
+class ScreenContentToolsMultiThreadTestLarge
+    : public ScreenContentToolsTestLarge {};
+
+TEST_P(ScreenContentToolsMultiThreadTestLarge, ScreenContentToolsTest) {
+  // TODO(aomedia:3278): This test is known to have data races. Do not run the
+  // test under ThreadSanitizer.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+  GTEST_SKIP()
+      << "Skipping the test under ThreadSanitizer. See bug aomedia:3278.";
+#endif
+#elif defined(__SANITIZE_THREAD__)
+  GTEST_SKIP()
+      << "Skipping the test under ThreadSanitizer. See bug aomedia:3278.";
+#endif
+  // Don't force screen content, however as the input is screen content
+  // allow_screen_content_tools should still be turned on even with
+  // multi-threaded encoding.
+  ::libaom_test::Y4mVideoSource video_sc("desktop_credits.y4m", 0, 10);
+  cfg_.g_profile = 1;
+  cfg_.g_threads = 4;
+  is_screen_content_violated_ = true;
+  tune_content_ = AOM_CONTENT_DEFAULT;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed detection of screen content";
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ScreenContentToolsMultiThreadTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(AOM_Q));
+}  // namespace
diff --git a/media/libaom/src/test/selfguided_filter_test.cc b/media/libaom/src/test/selfguided_filter_test.cc
index d65cce58a4..a8461b5966 100644
--- a/media/libaom/src/test/selfguided_filter_test.cc
+++ b/media/libaom/src/test/selfguided_filter_test.cc
@@ -17,7 +17,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -45,7 +44,7 @@ class AV1SelfguidedFilterTest
   virtual ~AV1SelfguidedFilterTest() {}
   virtual void SetUp() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunSpeedTest() {
@@ -58,9 +57,12 @@ class AV1SelfguidedFilterTest
 
     uint8_t *input_ =
         (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t));
+    ASSERT_NE(input_, nullptr);
     uint8_t *output_ = (uint8_t *)aom_memalign(
         32, out_stride * (height + 32) * sizeof(uint8_t));
+    ASSERT_NE(output_, nullptr);
     int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
     uint8_t *input = input_ + stride * 16 + 16;
     uint8_t *output = output_ + out_stride * 16 + 16;
 
@@ -139,11 +141,15 @@ class AV1SelfguidedFilterTest
 
     uint8_t *input_ =
         (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t));
+    ASSERT_NE(input_, nullptr);
     uint8_t *output_ = (uint8_t *)aom_memalign(
         32, out_stride * (max_h + 32) * sizeof(uint8_t));
+    ASSERT_NE(output_, nullptr);
     uint8_t *output2_ = (uint8_t *)aom_memalign(
         32, out_stride * (max_h + 32) * sizeof(uint8_t));
+    ASSERT_NE(output2_, nullptr);
     int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
 
     uint8_t *input = input_ + stride * 16 + 16;
     uint8_t *output = output_ + out_stride * 16 + 16;
@@ -197,6 +203,7 @@ class AV1SelfguidedFilterTest
  private:
   SgrFunc tst_fun_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SelfguidedFilterTest);
 
 TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
@@ -230,7 +237,7 @@ class AV1HighbdSelfguidedFilterTest
   virtual ~AV1HighbdSelfguidedFilterTest() {}
   virtual void SetUp() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunSpeedTest() {
@@ -245,9 +252,12 @@ class AV1HighbdSelfguidedFilterTest
 
     uint16_t *input_ =
         (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t));
+    ASSERT_NE(input_, nullptr);
     uint16_t *output_ = (uint16_t *)aom_memalign(
         32, out_stride * (height + 32) * sizeof(uint16_t));
+    ASSERT_NE(output_, nullptr);
     int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
     uint16_t *input = input_ + stride * 16 + 16;
     uint16_t *output = output_ + out_stride * 16 + 16;
 
@@ -330,11 +340,15 @@ class AV1HighbdSelfguidedFilterTest
 
     uint16_t *input_ =
         (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(input_, nullptr);
     uint16_t *output_ = (uint16_t *)aom_memalign(
         32, out_stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(output_, nullptr);
     uint16_t *output2_ = (uint16_t *)aom_memalign(
         32, out_stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(output2_, nullptr);
     int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
 
     uint16_t *input = input_ + stride * 16 + 16;
     uint16_t *output = output_ + out_stride * 16 + 16;
@@ -388,6 +402,7 @@ class AV1HighbdSelfguidedFilterTest
  private:
   SgrFunc tst_fun_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdSelfguidedFilterTest);
 
 TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
diff --git a/media/libaom/src/test/set_maps.sh b/media/libaom/src/test/set_maps.sh
index 4f59b06d69..b79357a2b8 100644..100755
--- a/media/libaom/src/test/set_maps.sh
+++ b/media/libaom/src/test/set_maps.sh
@@ -36,7 +36,7 @@ set_maps() {
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libaom/src/test/sharpness_test.cc b/media/libaom/src/test/sharpness_test.cc
new file mode 100644
index 0000000000..49c5804d25
--- /dev/null
+++ b/media/libaom/src/test/sharpness_test.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+const unsigned int kCqLevel = 18;
+
+// List of psnr thresholds for different test combinations
+// keys: test-mode, cpu-used, sharpness.
+const std::unordered_map<
+    int, std::unordered_map<int, std::unordered_map<int, double>>>
+    kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood),
+                         { { 2, { { 2, 37.6 }, { 5, 37.6 } } },
+                           { 4, { { 2, 37.5 }, { 5, 37.5 } } },
+                           { 6, { { 2, 37.5 }, { 5, 37.5 } } } } },
+                       { static_cast<int>(::libaom_test::kAllIntra),
+                         { { 3, { { 2, 42.2 }, { 5, 42.2 } } },
+                           { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } },
+                           { 9, { { 2, 41.0 }, { 5, 41.0 } } } } } };
+
+// This class is used to test sharpness parameter configured through control
+// call using AOME_SET_SHARPNESS for different encoder configurations.
+class SharpnessTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SharpnessTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0),
+        nframes_(0) {}
+
+  ~SharpnessTest() override {}
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+      cfg_.rc_target_bitrate = kBitrate;
+      cfg_.g_lag_in_frames = 5;
+    }
+  }
+
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_SHARPNESS, sharpness_level_);
+      if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold.at(encoding_mode_).at(cpu_used_).at(sharpness_level_);
+  }
+
+  void DoTest() {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource("paris_352_288_30.y4m", 0, kFrames));
+    ASSERT_NE(video, nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_
+        << ", sharpness level = " << sharpness_level_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const int cpu_used_;
+  const int sharpness_level_;
+  double psnr_;
+  unsigned int nframes_;
+};
+
+class SharpnessTestLarge : public SharpnessTest {};
+
+class SharpnessAllIntraTest : public SharpnessTest {};
+
+class SharpnessAllIntraTestLarge : public SharpnessTest {};
+
+TEST_P(SharpnessTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTest, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(2, 4, 6),  // cpu_used
+                           ::testing::Values(2, 5));    // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(6),   // cpu_used
+                           ::testing::Values(4));  // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(3, 6, 9),  // cpu_used
+                           ::testing::Values(2, 5));    // sharpness level
+}  // namespace
diff --git a/media/libaom/src/test/simd_cmp_impl.h b/media/libaom/src/test/simd_cmp_impl.h
index d3eb33619b..ab8f579ace 100644
--- a/media/libaom/src/test/simd_cmp_impl.h
+++ b/media/libaom/src/test/simd_cmp_impl.h
@@ -15,9 +15,11 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "aom_dsp/aom_simd.h"
+// Inlining not forced for the compiler due to some tests calling
+// SIMD_INLINE functions via function pointers
 #undef SIMD_INLINE
-#define SIMD_INLINE static  // Don't enforce inlining
+#define SIMD_INLINE static inline
+#include "aom_dsp/aom_simd.h"
 #include "aom_dsp/simd/v256_intrinsics_c.h"
 
 // Machine tuned code goes into this file. This file is included from
diff --git a/media/libaom/src/test/simd_impl.h b/media/libaom/src/test/simd_impl.h
index 61fda009f1..8535e37cd4 100644
--- a/media/libaom/src/test/simd_impl.h
+++ b/media/libaom/src/test/simd_impl.h
@@ -13,7 +13,6 @@
 
 #define SIMD_CHECK 1
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "aom_dsp/aom_simd_inline.h"
 #include "aom_dsp/simd/v256_intrinsics_c.h"
@@ -30,7 +29,7 @@ class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
     name = std::get<2>(this->GetParam());
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   uint32_t mask, maskwidth;
diff --git a/media/libaom/src/test/simple_decoder.sh b/media/libaom/src/test/simple_decoder.sh
index 5f39ad206e..9b1aea1ed5 100644..100755
--- a/media/libaom/src/test/simple_decoder.sh
+++ b/media/libaom/src/test/simple_decoder.sh
@@ -36,7 +36,7 @@ simple_decoder() {
   fi
 
   eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libaom/src/test/simple_encoder.sh b/media/libaom/src/test/simple_encoder.sh
index 5cd6b46a10..dfb1a1b546 100644..100755
--- a/media/libaom/src/test/simple_encoder.sh
+++ b/media/libaom/src/test/simple_encoder.sh
@@ -36,7 +36,7 @@ simple_encoder() {
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libaom/src/test/sse_sum_test.cc b/media/libaom/src/test/sse_sum_test.cc
new file mode 100644
index 0000000000..e7c32e6278
--- /dev/null
+++ b/media/libaom/src/test/sse_sum_test.cc
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+namespace {
+const int kNumIterations = 10000;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int src_stride, int width,
+                              int height, int *sum);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSSETest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+  virtual ~SumSSETest() {}
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+    ASSERT_NE(src_, nullptr);
+  }
+
+  virtual void TearDown() { aom_free(src_); }
+  void RunTest(int isRandom);
+  void RunSpeedTest();
+
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    const int val = rnd_(2) ? limit - 1 : -(limit - 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = val;
+      }
+    }
+  }
+
+ protected:
+  TestFuncs params_;
+  int16_t *src_;
+  ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSSETest);
+
+void SumSSETest::RunTest(int isRandom) {
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(31) + 1);   // Up to 128x128
+    const int height = 4 * (rnd_(31) + 1);  // Up to 128x128
+    int stride = 4 << rnd_(7);              // Up to 256 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+    int sum_ref = 0, sum_tst = 0;
+    const uint64_t sse_ref =
+        params_.ref_func(src_, stride, width, height, &sum_ref);
+    const uint64_t sse_tst =
+        params_.tst_func(src_, stride, width, height, &sum_tst);
+
+    EXPECT_EQ(sse_ref, sse_tst)
+        << "Error: SumSSETest [" << width << "x" << height
+        << "] C SSE does not match optimized output.";
+    EXPECT_EQ(sum_ref, sum_tst)
+        << "Error: SumSSETest [" << width << "x" << height
+        << "] C Sum does not match optimized output.";
+  }
+}
+
+void SumSSETest::RunSpeedTest() {
+  for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+    const int width = block_size_wide[block];   // Up to 128x128
+    const int height = block_size_high[block];  // Up to 128x128
+    int stride = 4 << rnd_(7);                  // Up to 256 stride
+    while (stride < width) {                    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    GenExtremeData(width, height, stride);
+    const int num_loops = 1000000000 / (width + height);
+    int sum_ref = 0, sum_tst = 0;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      params_.ref_func(src_, stride, width, height, &sum_ref);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+    for (int i = 0; i < num_loops; ++i)
+      params_.tst_func(src_, stride, width, height, &sum_tst);
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
+           1000.0 * elapsed_time1 / num_loops);
+  }
+}
+
+TEST_P(SumSSETest, OperationCheck) {
+  RunTest(1);  // GenRandomData
+}
+
+TEST_P(SumSSETest, ExtremeValues) {
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(SumSSETest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, SumSSETest,
+                         ::testing::Values(TestFuncs(
+                             &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_sse2)));
+
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SumSSETest,
+                         ::testing::Values(TestFuncs(
+                             &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_avx2)));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/media/libaom/src/test/still_picture_test.cc b/media/libaom/src/test/still_picture_test.cc
new file mode 100644
index 0000000000..e2eef94f9c
--- /dev/null
+++ b/media/libaom/src/test/still_picture_test.cc
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// This class is used to test the presence of still picture feature.
+class StillPicturePresenceTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  StillPicturePresenceTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        enable_full_header_(GET_PARAM(2)) {
+    still_picture_coding_violated_ = false;
+  }
+  virtual ~StillPicturePresenceTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_threads = 1;
+    cfg_.full_still_picture_hdr = enable_full_header_;
+    cfg_.g_limit = 1;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AV1E_SET_FORCE_VIDEO_MODE, 0);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_STILL_PICTURE,
+                                    &still_pic_info_);
+      if (still_pic_info_.is_still_picture != 1) {
+        still_picture_coding_violated_ = true;
+      }
+      if (still_pic_info_.is_reduced_still_picture_hdr == enable_full_header_) {
+        /* If full_still_picture_header is enabled in encoder config but
+         * bitstream contains reduced_still_picture_header set, then set
+         * still_picture_coding_violated_ to true.
+         * Similarly, if full_still_picture_header is disabled in encoder config
+         * but bitstream contains reduced_still_picture_header not set, then set
+         * still_picture_coding_violated_ to true.
+         */
+        still_picture_coding_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  bool still_picture_coding_violated_;
+  int enable_full_header_;
+  aom_still_picture_info still_pic_info_;
+  aom_rc_mode end_usage_check_;
+};
+
+TEST_P(StillPicturePresenceTest, StillPictureEncodePresenceTest) {
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(still_picture_coding_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(StillPicturePresenceTest,
+                           ::testing::Values(::libaom_test::kOnePassGood,
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(1, 0));
+}  // namespace
diff --git a/media/libaom/src/test/subtract_test.cc b/media/libaom/src/test/subtract_test.cc
index 4001e8b7ab..59dd218319 100644
--- a/media/libaom/src/test/subtract_test.cc
+++ b/media/libaom/src/test/subtract_test.cc
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <cstdint>
 #include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -17,7 +18,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/blockd.h"
@@ -33,7 +33,7 @@ namespace {
 
 class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
  public:
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 };
 
 using libaom_test::ACMRandom;
@@ -48,10 +48,13 @@ TEST_P(AV1SubtractBlockTest, SimpleSubtract) {
     const int block_height = block_size_high[bsize];
     int16_t *diff = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(*diff) * block_width * block_height * 2));
+    ASSERT_NE(diff, nullptr);
     uint8_t *pred = reinterpret_cast<uint8_t *>(
         aom_memalign(16, block_width * block_height * 2));
+    ASSERT_NE(pred, nullptr);
     uint8_t *src = reinterpret_cast<uint8_t *>(
         aom_memalign(16, block_width * block_height * 2));
+    ASSERT_NE(src, nullptr);
 
     for (int n = 0; n < 100; n++) {
       for (int r = 0; r < block_height; ++r) {
@@ -110,22 +113,23 @@ INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest,
 typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                                 ptrdiff_t diff_stride, const uint8_t *src_ptr,
                                 ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                                ptrdiff_t pred_stride, int bd);
+                                ptrdiff_t pred_stride);
 
 using std::get;
 using std::make_tuple;
 using std::tuple;
 
-// <width, height, bit_dpeth, subtract>
-typedef tuple<int, int, int, HBDSubtractFunc> Params;
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+typedef tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc> Params;
 
 class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
  public:
   virtual void SetUp() {
-    block_width_ = GET_PARAM(0);
-    block_height_ = GET_PARAM(1);
-    bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(2));
-    func_ = GET_PARAM(3);
+    block_width_ = block_size_wide[GET_PARAM(0)];
+    block_height_ = block_size_high[GET_PARAM(0)];
+    bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(1));
+    func_ = GET_PARAM(2);
+    ref_func_ = GET_PARAM(3);
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
 
@@ -133,10 +137,13 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
     const size_t max_block_size = max_width * max_width;
     src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t))));
+    ASSERT_NE(src_, nullptr);
     pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t))));
+    ASSERT_NE(pred_, nullptr);
     diff_ = reinterpret_cast<int16_t *>(
         aom_memalign(16, max_block_size * sizeof(int16_t)));
+    ASSERT_NE(diff_, nullptr);
   }
 
   virtual void TearDown() {
@@ -155,10 +162,12 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
   int block_width_;
   aom_bit_depth_t bit_depth_;
   HBDSubtractFunc func_;
+  HBDSubtractFunc ref_func_;
   uint8_t *src_;
   uint8_t *pred_;
   int16_t *diff_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HBDSubtractBlockTest);
 
 void AV1HBDSubtractBlockTest::CheckResult() {
   const int test_num = 100;
@@ -174,7 +183,7 @@ void AV1HBDSubtractBlockTest::CheckResult() {
     }
 
     func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
-          pred_, block_width_, bit_depth_);
+          pred_, block_width_);
 
     for (int r = 0; r < block_height_; ++r) {
       for (int c = 0; c < block_width_; ++c) {
@@ -196,57 +205,78 @@ void AV1HBDSubtractBlockTest::RunForSpeed() {
   const int mask = (1 << bit_depth_) - 1;
   int i, j;
 
+  if (ref_func_ == func_) GTEST_SKIP();
+
+  for (j = 0; j < max_block_size; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (i = 0; i < test_num; ++i) {
+    ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+              block_width_, pred_, block_width_);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
+
   for (j = 0; j < max_block_size; ++j) {
     CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
     CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
   }
 
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
   for (i = 0; i < test_num; ++i) {
     func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
-          pred_, block_width_, bit_depth_);
+          pred_, block_width_);
   }
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+
+  printf(
+      "[%dx%d]: "
+      "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+      " \t "
+      "gain=%f \n",
+      block_width_, block_height_, ref_elapsed_time, elapsed_time,
+      static_cast<double>(ref_elapsed_time) /
+          static_cast<double>(elapsed_time));
 }
 
 TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
 
-#if HAVE_SSE2
-const Params kAV1HBDSubtractBlock_sse2[] = {
-  make_tuple(4, 4, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(4, 4, 12, &aom_highbd_subtract_block_c),
-  make_tuple(4, 8, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(4, 8, 12, &aom_highbd_subtract_block_c),
-  make_tuple(8, 4, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(8, 4, 12, &aom_highbd_subtract_block_c),
-  make_tuple(8, 8, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(8, 8, 12, &aom_highbd_subtract_block_c),
-  make_tuple(8, 16, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(8, 16, 12, &aom_highbd_subtract_block_c),
-  make_tuple(16, 8, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(16, 8, 12, &aom_highbd_subtract_block_c),
-  make_tuple(16, 16, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(16, 16, 12, &aom_highbd_subtract_block_c),
-  make_tuple(16, 32, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(16, 32, 12, &aom_highbd_subtract_block_c),
-  make_tuple(32, 16, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(32, 16, 12, &aom_highbd_subtract_block_c),
-  make_tuple(32, 32, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(32, 32, 12, &aom_highbd_subtract_block_c),
-  make_tuple(32, 64, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(32, 64, 12, &aom_highbd_subtract_block_c),
-  make_tuple(64, 32, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(64, 32, 12, &aom_highbd_subtract_block_c),
-  make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(64, 64, 12, &aom_highbd_subtract_block_c),
-  make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(64, 128, 12, &aom_highbd_subtract_block_c),
-  make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(128, 64, 12, &aom_highbd_subtract_block_c),
-  make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2),
-  make_tuple(128, 128, 12, &aom_highbd_subtract_block_c)
-};
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4,    BLOCK_4X8,    BLOCK_8X4,
+                                       BLOCK_8X8,    BLOCK_8X16,   BLOCK_16X8,
+                                       BLOCK_16X16,  BLOCK_16X32,  BLOCK_32X16,
+                                       BLOCK_32X32,  BLOCK_32X64,  BLOCK_64X32,
+                                       BLOCK_64X64,  BLOCK_64X128, BLOCK_128X64,
+                                       BLOCK_128X128 };
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AV1HBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&aom_highbd_subtract_block_c),
+                       ::testing::Values(&aom_highbd_subtract_block_c)));
 
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1HBDSubtractBlockTest,
-                         ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2));
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1HBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&aom_highbd_subtract_block_sse2),
+                       ::testing::Values(&aom_highbd_subtract_block_c)));
 #endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1HBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&aom_highbd_subtract_block_neon),
+                       ::testing::Values(&aom_highbd_subtract_block_c)));
+#endif
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 }  // namespace
diff --git a/media/libaom/src/test/sum_squares_test.cc b/media/libaom/src/test/sum_squares_test.cc
index 8845466b86..5c049a59c3 100644
--- a/media/libaom/src/test/sum_squares_test.cc
+++ b/media/libaom/src/test/sum_squares_test.cc
@@ -21,7 +21,6 @@
 
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "test/function_equivalence_test.h"
@@ -49,13 +48,10 @@ class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
     params_ = this->GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -85,6 +81,7 @@ class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
   int16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
 
 void SumSquaresTest::RunTest(int isRandom) {
   int failed = 0;
@@ -102,7 +99,7 @@ void SumSquaresTest::RunTest(int isRandom) {
     }
     const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(res_tst =
+    API_REGISTER_STATE_CHECK(res_tst =
                                  params_.tst_func(src_, stride, width, height));
 
     if (!failed) {
@@ -165,6 +162,15 @@ INSTANTIATE_TEST_SUITE_P(
 
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SumSquaresTest,
+    ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+                                &aom_sum_squares_2d_i16_neon)));
+
+#endif  // HAVE_NEON
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, SumSquaresTest,
@@ -184,6 +190,7 @@ class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
   static const int kIterations = 1000;
   static const int kMaxSize = 256;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquares1DTest);
 
 TEST_P(SumSquares1DTest, RandomValues) {
   DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
@@ -197,7 +204,7 @@ TEST_P(SumSquares1DTest, RandomValues) {
 
     const uint64_t ref_res = params_.ref_func(src, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -218,7 +225,7 @@ TEST_P(SumSquares1DTest, ExtremeValues) {
 
     const uint64_t ref_res = params_.ref_func(src, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -252,12 +259,11 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
     ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
   }
 
   virtual void TearDown() {
-    libaom_test::ClearSystemState();
     aom_free(src_);
     aom_free(ref_);
   }
@@ -303,6 +309,7 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
   uint8_t *ref_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
 
 void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
   int failed = 0;
@@ -438,13 +445,10 @@ class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
     width_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom, int width, int height, int run_times);
 
   void GenRandomData(int width, int height, int stride) {
@@ -472,6 +476,7 @@ class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {
   int16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
 
 void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) {
   aom_usec_timer ref_timer, test_timer;
@@ -583,13 +588,10 @@ class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint8_t *>(
         aom_memalign(16, 512 * 512 * sizeof(uint8_t)));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -619,6 +621,7 @@ class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
   uint8_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
 
 void Lowbd2dVarTest::RunTest(int isRandom) {
   int failed = 0;
@@ -637,7 +640,7 @@ void Lowbd2dVarTest::RunTest(int isRandom) {
 
     const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(res_tst =
+    API_REGISTER_STATE_CHECK(res_tst =
                                  params_.tst_func(src_, stride, width, height));
 
     if (!failed) {
@@ -713,13 +716,10 @@ class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, 512 * 512 * sizeof(uint16_t)));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -749,6 +749,7 @@ class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {
   uint16_t *src_;
   ACMRandom rnd_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
 
 void Highbd2dVarTest::RunTest(int isRandom) {
   int failed = 0;
@@ -768,7 +769,7 @@ void Highbd2dVarTest::RunTest(int isRandom) {
     const uint64_t res_ref =
         params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         res_tst =
             params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
 
diff --git a/media/libaom/src/test/superframe_test.cc b/media/libaom/src/test/superframe_test.cc
deleted file mode 100644
index 024a18b978..0000000000
--- a/media/libaom/src/test/superframe_test.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <climits>
-#include <tuple>
-#include <vector>
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-
-const int kTestMode = 0;
-const int kTileCols = 1;
-const int kTileRows = 2;
-
-typedef std::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
-
-class SuperframeTest
-    : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
-      public ::libaom_test::EncoderTest {
- protected:
-  SuperframeTest() : EncoderTest(GET_PARAM(0)), last_sf_pts_(0) {}
-  virtual ~SuperframeTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    const SuperframeTestParam input = GET_PARAM(1);
-    const libaom_test::TestMode mode = std::get<kTestMode>(input);
-    SetMode(mode);
-    sf_count_ = 0;
-    sf_count_max_ = INT_MAX;
-    n_tile_cols_ = std::get<kTileCols>(input);
-    n_tile_rows_ = std::get<kTileRows>(input);
-  }
-
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(AOME_SET_CPUUSED, 2);
-      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
-      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
-    }
-  }
-
-  virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const aom_codec_cx_pkt_t *pkt) {
-    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return pkt;
-
-    const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
-    const uint8_t marker = buffer[0];
-    const int frames = (marker & 0x7) + 1;
-    const int mag = ((marker >> 3) & 3) + 1;
-    const unsigned int index_sz = 2 + mag * (frames - 1);
-    if ((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
-        buffer[index_sz - 1] == marker) {
-      // frame is a superframe. strip off the index.
-      modified_buf_.resize(pkt->data.frame.sz - index_sz);
-      memcpy(&modified_buf_[0], (uint8_t *)pkt->data.frame.buf + index_sz,
-             pkt->data.frame.sz - index_sz);
-      modified_pkt_ = *pkt;
-      modified_pkt_.data.frame.buf = &modified_buf_[0];
-      modified_pkt_.data.frame.sz -= index_sz;
-
-      sf_count_++;
-      last_sf_pts_ = pkt->data.frame.pts;
-      return &modified_pkt_;
-    }
-
-    // Make sure we do a few frames after the last SF
-    abort_ |=
-        sf_count_ > sf_count_max_ && pkt->data.frame.pts - last_sf_pts_ >= 5;
-    return pkt;
-  }
-
-  int sf_count_;
-  int sf_count_max_;
-  aom_codec_cx_pkt_t modified_pkt_;
-  std::vector<uint8_t> modified_buf_;
-  aom_codec_pts_t last_sf_pts_;
-
- private:
-  int n_tile_cols_;
-  int n_tile_rows_;
-};
-
-TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
-  sf_count_max_ = 0;  // early exit on successful test.
-  cfg_.g_lag_in_frames = 25;
-  cfg_.large_scale_tile = 1;
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 40);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
-  //       frames besides ALTREF_FRAME.
-  EXPECT_GE(sf_count_, 1);
-}
-
-}  // namespace
diff --git a/media/libaom/src/test/svc_datarate_test.cc b/media/libaom/src/test/svc_datarate_test.cc
index 28e517ba10..e0b3860d03 100644
--- a/media/libaom/src/test/svc_datarate_test.cc
+++ b/media/libaom/src/test/svc_datarate_test.cc
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <vector>
 #include "config/aom_config.h"
-
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/datarate_test.h"
@@ -20,10 +20,20 @@
 #include "test/y4m_video_source.h"
 #include "aom/aom_codec.h"
 #include "av1/common/enums.h"
+#include "av1/encoder/encoder.h"
 
 namespace datarate_test {
 namespace {
 
+struct FrameInfo {
+  FrameInfo(aom_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+      : pts(_pts), w(_w), h(_h) {}
+
+  aom_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+};
+
 class DatarateTestSVC
     : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
                                                  unsigned int, int>,
@@ -36,11 +46,18 @@ class DatarateTestSVC
 
  protected:
   virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
+    InitializeConfig(GET_PARAM(1));
     ResetModel();
   }
 
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+    ++decoded_nframes_;
+  }
+
+  std::vector<FrameInfo> frame_info_list_;
+
   virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
 
   virtual void ResetModel() {
@@ -56,18 +73,51 @@ class DatarateTestSVC
     memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
     memset(&svc_params_, 0, sizeof(aom_svc_params_t));
     memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+    memset(&ref_frame_comp_pred_, 0, sizeof(aom_svc_ref_frame_comp_pred_t));
+    drop_frames_ = 0;
+    for (int i = 0; i < 1000; i++) drop_frames_list_[i] = 1000;
+    decoded_nframes_ = 0;
+    mismatch_nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    set_frame_level_er_ = 0;
+    multi_ref_ = 0;
+    use_fixed_mode_svc_ = 0;
+    comp_pred_ = 0;
+    dynamic_enable_disable_mode_ = 0;
+    intra_only_ = 0;
+    frame_to_start_decoding_ = 0;
+    layer_to_decode_ = 0;
+    frame_sync_ = 0;
+    current_video_frame_ = 0;
+    screen_mode_ = 0;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
                                   ::libaom_test::Encoder *encoder) {
     int spatial_layer_id = 0;
-    if (video->frame() == 0) {
+    current_video_frame_ = video->frame();
+    // video->frame() is called every superframe, so we should condition
+    // this on layer_frame_cnt_ = 0, so we only do this once on the very
+    // first frame.
+    if (video->frame() == 0 && layer_frame_cnt_ == 0) {
       initialize_svc(number_temporal_layers_, number_spatial_layers_,
                      &svc_params_);
+      if (dynamic_enable_disable_mode_ == 1) {
+        svc_params_.layer_target_bitrate[2] = 0;
+        cfg_.rc_target_bitrate -= target_layer_bitrate_[2];
+      }
       encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      // TODO(aomedia:3032): Configure KSVC in fixed mode.
       encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0);
       encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
       encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+      if (cfg_.g_threads > 1) {
+        encoder->Control(AV1E_SET_TILE_COLUMNS, cfg_.g_threads >> 1);
+        encoder->Control(AV1E_SET_ROW_MT, 1);
+      }
+      if (screen_mode_) {
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      }
     }
     if (number_spatial_layers_ == 2) {
       spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
@@ -78,10 +128,66 @@ class DatarateTestSVC
     }
     // Set the reference/update flags, layer_id, and reference_map
     // buffer index.
-    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
-                                     &ref_frame_config_, spatial_layer_id);
+    frame_flags_ =
+        set_layer_pattern(video->frame(), &layer_id_, &ref_frame_config_,
+                          &ref_frame_comp_pred_, spatial_layer_id, multi_ref_,
+                          comp_pred_, (video->frame() % cfg_.kf_max_dist) == 0);
+    if (intra_only_ == 1 && frame_sync_ > 0) {
+      // Set an Intra-only frame on SL0 at frame_sync_.
+      // In order to allow decoding to start on SL0 in mid-sequence we need to
+      // set and refresh all the slots used on SL0 stream, which is 0 and 3
+      // for this test pattern. The other slots (1, 2, 4, 5) are used for the
+      // SL > 0 layers and these slotes are not refreshed on frame_sync_, so
+      // temporal prediction for the top layers can continue.
+      if (spatial_layer_id == 0 && video->frame() == frame_sync_) {
+        ref_frame_config_.ref_idx[0] = 0;
+        ref_frame_config_.ref_idx[3] = 3;
+        ref_frame_config_.refresh[0] = 1;
+        ref_frame_config_.refresh[3] = 1;
+        for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+          ref_frame_config_.reference[i] = 0;
+      }
+    }
+    if (intra_only_ && video->frame() == 50 && spatial_layer_id == 1) {
+      // Force an intra_only frame here, for SL1.
+      for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+        ref_frame_config_.reference[i] = 0;
+    }
     encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
-    encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    // The SET_SVC_REF_FRAME_CONFIG and AV1E_SET_SVC_REF_FRAME_COMP_PRED api is
+    // for the flexible SVC mode (i.e., use_fixed_mode_svc == 0).
+    if (!use_fixed_mode_svc_) {
+      encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+      encoder->Control(AV1E_SET_SVC_REF_FRAME_COMP_PRED, &ref_frame_comp_pred_);
+    }
+    if (set_frame_level_er_) {
+      int mode =
+          (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0);
+      encoder->Control(AV1E_SET_ERROR_RESILIENT_MODE, mode);
+    }
+    if (dynamic_enable_disable_mode_ == 1) {
+      if (layer_frame_cnt_ == 300 && spatial_layer_id == 0) {
+        // Enable: set top spatial layer bitrate back to non-zero.
+        svc_params_.layer_target_bitrate[2] = target_layer_bitrate_[2];
+        cfg_.rc_target_bitrate += target_layer_bitrate_[2];
+        encoder->Config(&cfg_);
+        encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      }
+    } else if (dynamic_enable_disable_mode_ == 2) {
+      if (layer_frame_cnt_ == 300 && spatial_layer_id == 0) {
+        // Disable top spatial layer mid-stream.
+        svc_params_.layer_target_bitrate[2] = 0;
+        cfg_.rc_target_bitrate -= target_layer_bitrate_[2];
+        encoder->Config(&cfg_);
+        encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      } else if (layer_frame_cnt_ == 600 && spatial_layer_id == 0) {
+        // Enable top spatial layer mid-stream.
+        svc_params_.layer_target_bitrate[2] = target_layer_bitrate_[2];
+        cfg_.rc_target_bitrate += target_layer_bitrate_[2];
+        encoder->Config(&cfg_);
+        encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      }
+    }
     layer_frame_cnt_++;
     DatarateTest::PreEncodeFrameHook(video, encoder);
   }
@@ -107,12 +213,45 @@ class DatarateTestSVC
     }
   }
 
+  virtual bool DoDecode() const {
+    if (drop_frames_ > 0) {
+      for (unsigned int i = 0; i < drop_frames_; ++i) {
+        if (drop_frames_list_[i] == (unsigned int)superframe_cnt_) {
+          std::cout << "             Skipping decoding frame: "
+                    << drop_frames_list_[i] << "\n";
+          return 0;
+        }
+      }
+    } else if (intra_only_ == 1) {
+      // Only start decoding at frames_to_start_decoding_.
+      if (current_video_frame_ < frame_to_start_decoding_) return 0;
+      // Only decode base layer for 3SL, for layer_to_decode_ = 0.
+      if (layer_to_decode_ == 0 && frame_sync_ > 0 &&
+          (layer_frame_cnt_ - 1) % 3 != 0)
+        return 0;
+    }
+    return 1;
+  }
+
+  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
   // Layer pattern configuration.
-  virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
-                                aom_svc_ref_frame_config_t *ref_frame_config,
-                                int spatial_layer) {
+  virtual int set_layer_pattern(
+      int frame_cnt, aom_svc_layer_id_t *layer_id,
+      aom_svc_ref_frame_config_t *ref_frame_config,
+      aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,
+      int multi_ref, int comp_pred, int is_key_frame) {
+    int lag_index = 0;
+    int base_count = frame_cnt >> 2;
     layer_id->spatial_layer_id = spatial_layer;
-    // Set the referende map buffer idx for the 7 references:
+    // Set the reference map buffer idx for the 7 references:
     // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
     // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
     for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
@@ -120,6 +259,11 @@ class DatarateTestSVC
       ref_frame_config->reference[i] = 0;
     }
     for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+    if (comp_pred) {
+      ref_frame_comp_pred->use_comp_pred[0] = 1;  // GOLDEN_LAST
+      ref_frame_comp_pred->use_comp_pred[1] = 1;  // LAST2_LAST
+      ref_frame_comp_pred->use_comp_pred[2] = 1;  // ALTREF_LAST
+    }
     // Set layer_flags to 0 when using ref_frame_config->reference.
     int layer_flags = 0;
     // Always reference LAST.
@@ -129,12 +273,26 @@ class DatarateTestSVC
       //   1    3   5    7
       //     2        6
       // 0        4        8
+      if (multi_ref) {
+        // Keep golden fixed at slot 3.
+        ref_frame_config->ref_idx[3] = 3;
+        // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
+        lag_index = 4 + (base_count % 4);
+        // Set the altref slot to lag_index.
+        ref_frame_config->ref_idx[6] = lag_index;
+      }
       if (frame_cnt % 4 == 0) {
         // Base layer.
         layer_id->temporal_layer_id = 0;
         // Update LAST on layer 0, reference LAST and GF.
         ref_frame_config->refresh[0] = 1;
         ref_frame_config->reference[3] = 1;
+        if (multi_ref) {
+          // Refresh GOLDEN every x ~10 base layer frames.
+          if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+          // Refresh lag_index slot, needed for lagging altref.
+          ref_frame_config->refresh[lag_index] = 1;
+        }
       } else if ((frame_cnt - 1) % 4 == 0) {
         layer_id->temporal_layer_id = 2;
         // First top layer: no updates, only reference LAST (TL0).
@@ -150,6 +308,11 @@ class DatarateTestSVC
         ref_frame_config->ref_idx[0] = 1;
         ref_frame_config->ref_idx[1] = 0;
       }
+      if (multi_ref) {
+        // Every frame can reference GOLDEN AND ALTREF.
+        ref_frame_config->reference[3] = 1;
+        ref_frame_config->reference[6] = 1;
+      }
     } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) {
       layer_id->temporal_layer_id = 0;
       if (layer_id->spatial_layer_id == 0) {
@@ -191,11 +354,18 @@ class DatarateTestSVC
         for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
         ref_frame_config->ref_idx[0] = 2;
         ref_frame_config->refresh[2] = 1;
+        if (multi_ref) {
+          ref_frame_config->ref_idx[6] = 7;
+          ref_frame_config->reference[6] = 1;
+          if (base_count % 10 == 0) ref_frame_config->refresh[7] = 1;
+        }
       }
       // Reference GOLDEN.
       if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
     } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
       // 3 spatial and 3 temporal layer.
+      // Overlap in the buffer slot updates: the slots 3 and 4 updated by
+      // first TL2 are reused for update in TL1 superframe.
       if (superframe_cnt_ % 4 == 0) {
         // Base temporal layer.
         layer_id->temporal_layer_id = 0;
@@ -250,56 +420,72 @@ class DatarateTestSVC
         if (layer_id->spatial_layer_id == 0) {
           // Reference LAST.
           // Set all buffer_idx to 0.
-          // Set GOLDEN to slot 5 and update slot 5.
+          // Set GOLDEN to slot 3 and update slot 3.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[3] = 5;
-          ref_frame_config->refresh[5] = 1;
+          ref_frame_config->ref_idx[3] = 3;
+          ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
-          // GOLDEN (and all other refs) to slot 5.
-          // Set LAST2 to slot 6 and update slot 6.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 5;
+          // GOLDEN (and all other refs) to slot 3.
+          // Set LAST2 to slot 4 and update slot 4.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
           ref_frame_config->ref_idx[0] = 1;
-          ref_frame_config->ref_idx[2] = 6;
-          ref_frame_config->refresh[6] = 1;
+          ref_frame_config->ref_idx[2] = 4;
+          ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
           // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
-          // GOLDEN (and all other refs) to slot 6.
-          // Set LAST2 to slot 6 and update slot 7.
-          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 6;
+          // GOLDEN (and all other refs) to slot 4.
+          // Set LAST2 to slot 5 and update slot 5.
+          for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
           ref_frame_config->ref_idx[0] = 2;
-          ref_frame_config->ref_idx[2] = 7;
-          ref_frame_config->refresh[7] = 1;
+          ref_frame_config->ref_idx[2] = 5;
+          ref_frame_config->refresh[5] = 1;
         }
       } else if ((superframe_cnt_ - 3) % 4 == 0) {
         // Second top temporal enhancement layer.
         layer_id->temporal_layer_id = 2;
         if (layer_id->spatial_layer_id == 0) {
-          // Set LAST to slot 5 and reference LAST.
+          // Set LAST to slot 3 and reference LAST.
           // Set GOLDEN to slot 3 and update slot 3.
           // Set all other buffer_idx to 0.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 5;
+          ref_frame_config->ref_idx[0] = 3;
           ref_frame_config->ref_idx[3] = 3;
           ref_frame_config->refresh[3] = 1;
         } else if (layer_id->spatial_layer_id == 1) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
           // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 6;
+          ref_frame_config->ref_idx[0] = 4;
           ref_frame_config->ref_idx[3] = 3;
           ref_frame_config->ref_idx[1] = 4;
           ref_frame_config->refresh[4] = 1;
         } else if (layer_id->spatial_layer_id == 2) {
-          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+          // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
           // GOLDEN to slot 4. No update.
           for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
-          ref_frame_config->ref_idx[0] = 7;
+          ref_frame_config->ref_idx[0] = 5;
           ref_frame_config->ref_idx[3] = 4;
         }
       }
-      // Reference GOLDEN.
-      if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+      if (layer_id->spatial_layer_id > 0) {
+        // Always reference GOLDEN (inter-layer prediction).
+        ref_frame_config->reference[3] = 1;
+        if (is_key_frame && layer_id->spatial_layer_id > 0) {
+          // On superframes whose base is key: remove LAST since GOLDEN
+          // is used as reference.
+          ref_frame_config->reference[0] = 0;
+        }
+      }
+      // Allow for top spatial layer to use additional temporal reference.
+      // Additional reference is only updated on base temporal layer, every
+      // 10 TL0 frames here.
+      if (multi_ref && layer_id->spatial_layer_id == 2) {
+        ref_frame_config->ref_idx[6] = 7;
+        ref_frame_config->reference[6] = 1;
+        if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+          ref_frame_config->refresh[7] = 1;
+      }
     }
     return layer_flags;
   }
@@ -363,11 +549,100 @@ class DatarateTestSVC
     target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Top temporal layers are non_reference, so exlcude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 150 = #frames(300) - #TL2_frames(150).
+    EXPECT_EQ((int)GetMismatchFrames(), 150);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLScreenTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 60);
+
+    const int bitrate_array[2] = { 800, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    screen_mode_ = 1;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.5)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Top temporal layers are non_reference, so exlcude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 30 = #frames(60) - #TL2_frames(30).
+    EXPECT_EQ((int)GetMismatchFrames(), 30);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLResizeTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    cfg_.g_w = 640;
+    cfg_.g_h = 480;
+    const int bitrate_array[2] = { 80, 90 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
           << " The datarate for the file is greater than target by too much!";
     }
+    unsigned int last_w = cfg_.g_w;
+    unsigned int last_h = cfg_.g_h;
+    int resize_down_count = 0;
+    for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+         info != frame_info_list_.end(); ++info) {
+      if (info->w != last_w || info->h != last_h) {
+        // Verify that resize down occurs.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+        last_w = info->w;
+        last_h = info->h;
+        resize_down_count++;
+      }
+    }
+    // Must be at least one resize down.
+    ASSERT_GE(resize_down_count, 1);
   }
 
   virtual void BasicRateTargetingSVC1TL2SLTest() {
@@ -379,13 +654,141 @@ class DatarateTestSVC
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 300, 600 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 2;
+    target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4;
+    target_layer_bitrate_[1] = 2 * cfg_.rc_target_bitrate / 4;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    intra_only_ = 1;
+    frame_sync_ = 20;
+    frame_to_start_decoding_ = frame_sync_;
+    layer_to_decode_ = 0;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Only check datarate on SL0 - this is layer that is decoded starting at
+    // frame_to_start_decoding_.
+    for (int i = 0; i < number_temporal_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Only base spatial layer is decoded and there are no non-referenece
+    // frames on S0, so #mismatch must be 0.
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    intra_only_ = 1;
+    frame_sync_ = 20;
+    frame_to_start_decoding_ = 0;
+    layer_to_decode_ = 3;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // All 3 spatial layers are decoded, starting at frame 0, so there are
+    // and there 300/2 = 150 non-reference frames, so mismatch is 150.
+    EXPECT_EQ((int)GetMismatchFrames(), 150);
+  }
+
+  virtual void BasicRateTargetingSVC1TL2SLIntraOnlyTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
     const int bitrate_array[2] = { 300, 600 };
     cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
     ResetModel();
+    intra_only_ = 1;
     number_temporal_layers_ = 1;
     number_spatial_layers_ = 2;
     target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4;
@@ -408,13 +811,44 @@ class DatarateTestSVC
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 3;
+    target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC1TL3SLMultiRefTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
     const int bitrate_array[2] = { 500, 1000 };
     cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
     ResetModel();
+    multi_ref_ = 1;
     number_temporal_layers_ = 1;
     number_spatial_layers_ = 3;
     target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
@@ -438,7 +872,7 @@ class DatarateTestSVC
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
                                          288, 30, 1, 0, 300);
@@ -464,7 +898,7 @@ class DatarateTestSVC
     target_layer_bitrate_[8] = bitrate_sl2;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
-      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50)
           << " The datarate for the file is lower than target by too much!";
       ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
           << " The datarate for the file is greater than target by too much!";
@@ -480,7 +914,7 @@ class DatarateTestSVC
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
 
     ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
     const int bitrate_array[2] = { 600, 1200 };
@@ -507,7 +941,175 @@ class DatarateTestSVC
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingFixedModeSVC3TL3SLHDTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    use_fixed_mode_svc_ = 1;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLHDMultiThread2Test() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_threads = 2;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLHDMultiThread4Test() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_threads = 4;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLHDMultiRefTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
           << " The datarate for the file is greater than target by too much!";
     }
   }
@@ -521,7 +1123,7 @@ class DatarateTestSVC
     cfg_.rc_max_quantizer = 63;
     cfg_.rc_end_usage = AOM_CBR;
     cfg_.g_lag_in_frames = 0;
-    cfg_.g_error_resilient = 1;
+    cfg_.g_error_resilient = 0;
     cfg_.kf_mode = AOM_KF_AUTO;
     cfg_.kf_min_dist = cfg_.kf_max_dist = 100;
 
@@ -549,13 +1151,501 @@ class DatarateTestSVC
     target_layer_bitrate_[8] = bitrate_sl2;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
-      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.75)
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.55)
           << " The datarate for the file is lower than target by too much!";
       ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
           << " The datarate for the file is greater than target by too much!";
     }
   }
 
+  virtual void BasicRateTargeting444SVC3TL3SLTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = 1;
+
+    ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient can set to off/0, since for SVC the context update
+    // is done per-layer.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropAllEnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient can set to off/0, since for SVC the context update
+    // is done per-layer.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropTL2EnhTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // error_resilient for sequence can be off/0, since dropped frames (TL2)
+    // are non-reference frames.
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Drop TL2: #frames(300) - (#TL0 + #TL1).
+    drop_frames_ = 300 - 300 / 2;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 2 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Set error_resilience at frame level, with codec control,
+    // on/1 for enahancement layers and off/0 for base layer frames.
+    set_frame_level_er_ = 1;
+
+    // Drop TL1 and TL2: #frames(300) - #TL0.
+    drop_frames_ = 300 - 300 / 4;
+    int n = 0;
+    for (int i = 0; i < 300; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+      }
+    }
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), 0);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropSetEnhFrameERTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Set error_resilience at frame level, with codec control,
+    // on/1 for enahancement layers and off/0 for base layer frames.
+    set_frame_level_er_ = 1;
+
+    // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+    // frame 101, and end at second T2 at frame 199. Frame 200 is TL0,
+    // so we can continue decoding without mismatch (since LAST is the
+    // only reference and error_resil = 1 on TL1/TL2 frames).
+    int n = 0;
+    int num_nonref = 300 / 2;
+    for (int i = 101; i < 200; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+        if (i % 2 != 0) num_nonref -= 1;
+      }
+    }
+    drop_frames_ = n;
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLDropSetEnhER0Test() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+
+    // Set error_resilience off.
+    cfg_.g_error_resilient = 0;
+
+    // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+    // frame 101, and end at second T2 at frame 199. Frame 200 is TL0,
+    // so we can continue decoding without mismatch (since LAST is the
+    // only reference and error_resil = 1 on TL1/TL2 frames).
+    int n = 0;
+    int num_nonref = 300 / 2;
+    for (int i = 101; i < 200; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+        if (i % 2 != 0) num_nonref -= 1;
+      }
+    }
+    drop_frames_ = n;
+    number_temporal_layers_ = 3;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+  }
+
+  virtual void BasicRateTargetingSVC3TL3SLDropSetEnhER0Test() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 200, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // Set error_resilience off.
+    cfg_.g_error_resilient = 0;
+    // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+    // frame 101, and end at second T2 at frame 199. Frame 200 is TL0,
+    // so we can continue decoding without mismatch (since LAST is the
+    // only reference and error_resil = 1 on TL1/TL2 frames).
+    // Drop here means drop whole superframe.
+    int n = 0;
+    int num_nonref = 300 / 2;
+    for (int i = 101; i < 200; i++) {
+      if (i % 4 != 0) {
+        drop_frames_list_[n] = i;
+        n++;
+        if (i % 2 != 0) num_nonref -= 1;
+      }
+    }
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    multi_ref_ = 1;
+    drop_frames_ = n * number_spatial_layers_;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
+          << " The datarate for the file is greater than target by too much!";
+    }
+    // Test that no mismatches have been found.
+    std::cout << "          Decoded frames: " << GetDecodedFrames() << "\n";
+    std::cout << "          Mismatch frames: " << GetMismatchFrames() << "\n";
+    EXPECT_EQ(300 * number_spatial_layers_ - GetDecodedFrames(), drop_frames_);
+    EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+  }
+
+  virtual void BasicRateTargetingSVC3TL1SLMultiRefCompoundTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    cfg_.g_w = 640;
+    cfg_.g_h = 480;
+    const int bitrate_array[2] = { 400, 800 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    comp_pred_ = 1;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC1TL3SLDynEnablTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 3;
+    target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+    dynamic_enable_disable_mode_ = 1;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // No need to check RC on top layer which is disabled part of the time.
+    for (int i = 0; i < number_spatial_layers_ - 1; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
+  virtual void BasicRateTargetingSVC1TL3SLDynDisEnablTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 300);
+    const int bitrate_array[2] = { 500, 1000 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 3;
+    target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+    dynamic_enable_disable_mode_ = 2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // No need to check RC on top layer which is disabled part of the time.
+    for (int i = 0; i < number_spatial_layers_ - 1; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
   int layer_frame_cnt_;
   int superframe_cnt_;
   int number_temporal_layers_;
@@ -564,8 +1654,25 @@ class DatarateTestSVC
   int target_layer_bitrate_[AOM_MAX_LAYERS];
   aom_svc_params_t svc_params_;
   aom_svc_ref_frame_config_t ref_frame_config_;
+  aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred_;
   aom_svc_layer_id_t layer_id_;
   double effective_datarate_tl[AOM_MAX_LAYERS];
+  unsigned int drop_frames_;
+  unsigned int drop_frames_list_[1000];
+  unsigned int mismatch_nframes_;
+  unsigned int decoded_nframes_;
+  double mismatch_psnr_;
+  int set_frame_level_er_;
+  int multi_ref_;
+  int use_fixed_mode_svc_;
+  int comp_pred_;
+  int dynamic_enable_disable_mode_;
+  int intra_only_;
+  unsigned int frame_to_start_decoding_;
+  unsigned int layer_to_decode_;
+  unsigned int frame_sync_;
+  unsigned int current_video_frame_;
+  int screen_mode_;
 };
 
 // Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
@@ -573,16 +1680,55 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SL) {
   BasicRateTargetingSVC3TL1SLTest();
 }
 
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial
+// for screen mode.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLScreen) {
+  BasicRateTargetingSVC3TL1SLScreenTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial,
+// with dynamic resize on. Encode at very low bitrate and check that
+// there is at least one resize (down) event.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLResize) {
+  BasicRateTargetingSVC3TL1SLResizeTest();
+}
+
 // Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) {
   BasicRateTargetingSVC1TL2SLTest();
 }
 
+// Check basic rate targeting for CBR, for 3 spatial layers, 3 temporal,
+// with Intra-only frame inserted in the stream. Verify that we can start
+// decoding the SL0 stream at the intra_only frame in mid-sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq) {
+  BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq();
+}
+
+// Check basic rate targeting for CBR, for 3spatial layers, 3 temporal,
+// with Intra-only frame inserted in the stream. Verify that we can
+// decode all frames and layers with no mismatch.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll) {
+  BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll();
+}
+
+// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal,
+// with Intra-only frame inserted in the stream.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SLIntraOnly) {
+  BasicRateTargetingSVC1TL2SLIntraOnlyTest();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SL) {
   BasicRateTargetingSVC1TL3SLTest();
 }
 
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLMultiRef) {
+  BasicRateTargetingSVC1TL3SLMultiRefTest();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) {
   BasicRateTargetingSVC3TL3SLTest();
@@ -594,16 +1740,128 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) {
 }
 
 // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for fixed mode SVC.
+TEST_P(DatarateTestSVC, BasicRateTargetingFixedModeSVC3TL3SLHD) {
+  BasicRateTargetingFixedModeSVC3TL3SLHDTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 2 threads, 2 tile_columns, row-mt enabled.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread2) {
+  BasicRateTargetingSVC3TL3SLHDMultiThread2Test();
+}
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 4 threads, 4 tile_columns, row-mt enabled.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread4) {
+  BasicRateTargetingSVC3TL3SLHDMultiThread4Test();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiRef) {
+  BasicRateTargetingSVC3TL3SLHDMultiRefTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
 // for auto key frame mode with short key frame period.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) {
   BasicRateTargetingSVC3TL3SLKfTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(DatarateTestSVC,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(7, 9),
-                          ::testing::Range<unsigned int>(0, 4),
-                          ::testing::Values(0, 1));
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 4:4:4 input.
+TEST_P(DatarateTestSVC, BasicRateTargeting444SVC3TL3SL) {
+  BasicRateTargeting444SVC3TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0. This test used the pattern with multiple
+// references (last, golden, and altref), updated on base layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefDropAllEnh) {
+  BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnh) {
+  BasicRateTargetingSVC3TL1SLDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of the TL2 enhancement layer, which are non-reference
+// (droppble) frames. For the base layer (TL0) and TL1 to still be decodeable
+// (with no mismatch), the error_resilient_flag may be off (set to 0),
+// since TL2 are non-reference frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropTL2Enh) {
+  BasicRateTargetingSVC3TL1SLDropTL2EnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Test that the
+// error_resilient flag can be set at frame level, with on/1 on
+// enhancement layers and off/0 on base layer.
+// This allows for successful decoding after dropping enhancement layer frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnhFrameER) {
+  BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping set of enhancement layers (TL 1 and TL2) in middle of sequence.
+// Test that the error_resilient flag can be set at frame level, with on/1 on
+// enhancement layers and off/0 on base layer.
+// This allows for successful decoding after dropping a set enhancement layer
+// frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropSetEnhFrameER) {
+  BasicRateTargetingSVC3TL1SLDropSetEnhFrameERTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping set of enhancement layers (TL 1 and TL2) in middle of sequence.
+// Test that the error_resilient flag can be 0/off for all frames.
+// This allows for successful decoding after dropping a set enhancement layer
+// frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropSetEnhER0) {
+  BasicRateTargetingSVC3TL1SLDropSetEnhER0Test();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 3 spatial layers,
+// with dropping set of enhancement layers (superframe TL 1 and TL2) in middle
+// of sequence. Test that the error_resilient flag can be 0/off for all frames.
+// This allows for successful decoding after dropping a set enhancement layer
+// frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLDropSetEnhER0) {
+  BasicRateTargetingSVC3TL3SLDropSetEnhER0Test();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with compound prediction on, for pattern with two additional refereces
+// (golden and altref), both updated on base TLO frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefCompound) {
+  BasicRateTargetingSVC3TL1SLMultiRefCompoundTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with the top spatial layer starting disabled (0 bitrate) and then
+// dynamically enabled after x frames with nonzero bitrate.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLDynEnabl) {
+  BasicRateTargetingSVC1TL3SLDynEnablTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with the top spatial layer dynamically disabled snd enabled during the
+// middle of the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLDynDisEnabl) {
+  BasicRateTargetingSVC1TL3SLDynDisEnablTest();
+}
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(7, 11), ::testing::Values(0, 3),
+                           ::testing::Values(0, 1));
 
 }  // namespace
 }  // namespace datarate_test
diff --git a/media/libaom/src/test/temporal_filter_planewise_test.cc b/media/libaom/src/test/temporal_filter_planewise_test.cc
deleted file mode 100644
index c3f3e9e050..0000000000
--- a/media/libaom/src/test/temporal_filter_planewise_test.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "test/function_equivalence_test.h"
-
-using libaom_test::ACMRandom;
-using libaom_test::FunctionEquivalenceTest;
-using ::testing::Combine;
-using ::testing::Range;
-using ::testing::Values;
-using ::testing::ValuesIn;
-
-#if !CONFIG_REALTIME_ONLY
-namespace {
-
-typedef void (*TemporalFilterPlanewiseFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_level, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
-typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
-    TemporalFilterPlanewiseFuncParam;
-
-typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
-    TemporalFilterPlanewiseWithParam;
-
-class TemporalFilterPlanewiseTest
-    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
- public:
-  virtual ~TemporalFilterPlanewiseTest() {}
-  virtual void SetUp() {
-    params_ = GET_PARAM(0);
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-
-    ASSERT_TRUE(src1_ != NULL);
-    ASSERT_TRUE(src2_ != NULL);
-  }
-
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src1_);
-    aom_free(src2_);
-  }
-  void RunTest(int isRandom, int width, int height, int run_times);
-
-  void GenRandomData(int width, int height, int stride, int stride2) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        src1_[ii * stride + jj] = rnd_.Rand8();
-        src2_[ii * stride2 + jj] = rnd_.Rand8();
-      }
-    }
-  }
-
-  void GenExtremeData(int width, int height, int stride, uint8_t *data,
-                      int stride2, uint8_t *data2, uint8_t val) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        data[ii * stride + jj] = val;
-        data2[ii * stride2 + jj] = (255 - val);
-      }
-    }
-  }
-
- protected:
-  TemporalFilterPlanewiseFuncParam params_;
-  uint8_t *src1_;
-  uint8_t *src2_;
-  ACMRandom rnd_;
-};
-
-void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
-                                          int run_times) {
-  aom_usec_timer ref_timer, test_timer;
-  for (int k = 0; k < 3; k++) {
-    const int stride = width;
-    const int stride2 = width;
-    if (isRandom) {
-      GenRandomData(width, height, stride, stride2);
-    } else {
-      const int msb = 8;  // Up to 8 bit input
-      const int limit = (1 << msb) - 1;
-      if (k == 0) {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
-      } else {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
-      }
-    }
-    double sigma[1] = { 2.1002103677063437 };
-    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
-    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
-    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
-    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
-    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
-    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
-
-    assert(width == 32 && height == 32);
-    const BLOCK_SIZE block_size = BLOCK_32X32;
-    const int use_subblock = 0;
-    const int block_mse = 20;
-    const int subblock_mses[4] = { 15, 16, 17, 18 };
-    const int q_factor = 12;
-    const int mb_row = 0;
-    const int mb_col = 0;
-    const int num_planes = 1;
-    YV12_BUFFER_CONFIG *ref_frame =
-        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-    ref_frame->heights[0] = height;
-    ref_frame->strides[0] = stride;
-    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
-    ref_frame->buffer_alloc = src;
-    ref_frame->buffers[0] = ref_frame->buffer_alloc;
-    ref_frame->flags = 0;  // Only support low bit-depth test.
-    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
-
-    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-    mbd->plane[0].subsampling_y = 0;
-    mbd->plane[0].subsampling_x = 0;
-    mbd->bd = 8;
-
-    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_ref, count_ref);
-    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_mod, count_mod);
-
-    if (run_times > 1) {
-      aom_usec_timer_start(&ref_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_ref, count_ref);
-      }
-      aom_usec_timer_mark(&ref_timer);
-      const int elapsed_time_c =
-          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
-
-      aom_usec_timer_start(&test_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_mod, count_mod);
-      }
-      aom_usec_timer_mark(&test_timer);
-      const int elapsed_time_simd =
-          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
-
-      printf(
-          "c_time=%d \t simd_time=%d \t "
-          "gain=%f\t width=%d\t height=%d \n",
-          elapsed_time_c, elapsed_time_simd,
-          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
-          height);
-
-    } else {
-      for (int i = 0, l = 0; i < height; i++) {
-        for (int j = 0; j < width; j++, l++) {
-          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C accumulator does not match optimized accumulator.";
-          EXPECT_EQ(count_ref[l], count_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C count does not match optimized count.";
-        }
-      }
-    }
-
-    free(ref_frame);
-    free(mbd);
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, OperationCheck) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 1);  // GenRandomData
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(0, height, height, 1);
-  }
-}
-
-TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 100000);
-  }
-}
-
-#if HAVE_AVX2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_avx2)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_avx2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_AVX2
-
-#if HAVE_SSE2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_sse2)
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_SSE2
-
-}  // namespace
-#endif
diff --git a/media/libaom/src/test/temporal_filter_test.cc b/media/libaom/src/test/temporal_filter_test.cc
new file mode 100644
index 0000000000..bf61f02cf7
--- /dev/null
+++ b/media/libaom/src/test/temporal_filter_test.cc
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+typedef enum {
+  I400,  // Monochrome
+  I420,  // 4:2:0
+  I422,  // 4:2:2
+  I444,  // 4:4:4
+} ColorFormat;
+static const char *color_fmt_str[] = { "I400", "I420", "I422", "I444" };
+typedef void (*TemporalFilterFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strenght,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterFunc> TemporalFilterFuncParam;
+
+typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
+
+class TemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithParam> {
+ public:
+  virtual ~TemporalFilterTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+    src2_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+
+    ASSERT_NE(src1_, nullptr);
+    ASSERT_NE(src2_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int run_times, ColorFormat color_fmt);
+
+  void GenRandomData(int width, int height, int stride, int stride2,
+                     int num_planes, int subsampling_x, int subsampling_y) {
+    uint8_t *src1p = src1_;
+    uint8_t *src2p = src2_;
+    for (int plane = 0; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = rnd_.Rand8();
+          src2p[jj] = rnd_.Rand8();
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, int stride2,
+                      int num_planes, int subsampling_x, int subsampling_y,
+                      uint8_t val) {
+    uint8_t *src1p = src1_;
+    uint8_t *src2p = src2_;
+    for (int plane = 0; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = val;
+          src2p[jj] = (255 - val);
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+ protected:
+  TemporalFilterFuncParam params_;
+  uint8_t *src1_;
+  uint8_t *src2_;
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(TemporalFilterTest);
+
+void TemporalFilterTest::RunTest(int isRandom, int run_times,
+                                 ColorFormat color_fmt) {
+  aom_usec_timer ref_timer, test_timer;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int width = block_size_wide[block_size];
+  const int height = block_size_high[block_size];
+  int num_planes = MAX_MB_PLANE;
+  int subsampling_x = 0;
+  int subsampling_y = 0;
+  if (color_fmt == I420) {
+    subsampling_x = 1;
+    subsampling_y = 1;
+  } else if (color_fmt == I422) {
+    subsampling_x = 1;
+    subsampling_y = 0;
+  } else if (color_fmt == I400) {
+    num_planes = 1;
+  }
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2, num_planes, subsampling_x,
+                    subsampling_y);
+    } else {
+      const int msb = 8;  // Up to 8 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, stride2, num_planes,
+                       subsampling_x, subsampling_y, limit);
+      } else {
+        GenExtremeData(width, height, stride, stride2, num_planes,
+                       subsampling_x, subsampling_y, 0);
+      }
+    }
+    double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+                                   2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
+    const int filter_strength = 5;
+    const int mb_row = 0;
+    const int mb_col = 0;
+    std::unique_ptr<YV12_BUFFER_CONFIG> ref_frame(new (std::nothrow)
+                                                      YV12_BUFFER_CONFIG);
+    ASSERT_NE(ref_frame, nullptr);
+    ref_frame->y_crop_height = 360;
+    ref_frame->y_crop_width = 540;
+    ref_frame->heights[PLANE_TYPE_Y] = height;
+    ref_frame->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+    ref_frame->strides[PLANE_TYPE_Y] = stride;
+    ref_frame->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+    ref_frame->buffer_alloc = src;
+    ref_frame->flags = 0;  // Only support low bit-depth test.
+    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+
+    std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
+    ASSERT_NE(mbd, nullptr);
+    mbd->bd = 8;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_height = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      ref_frame->buffers[plane] =
+          ref_frame->buffer_alloc + plane * plane_stride * plane_height;
+      mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+      mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+    }
+
+    params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                     num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, src2_, accumulator_ref, count_ref);
+    params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                     num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, src2_, accumulator_mod, count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                         num_planes, sigma, subblock_mvs, subblock_mses,
+                         q_factor, filter_strength, src2_, accumulator_ref,
+                         count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                         num_planes, sigma, subblock_mvs, subblock_mses,
+                         q_factor, filter_strength, src2_, accumulator_mod,
+                         count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height, color_fmt_str[color_fmt]);
+
+    } else {
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " count does not match optimized count.";
+        }
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, OperationCheck) {
+  RunTest(1, 1, I400);
+  RunTest(1, 1, I420);
+  RunTest(1, 1, I422);
+  RunTest(1, 1, I444);
+}
+
+TEST_P(TemporalFilterTest, ExtremeValues) {
+  RunTest(0, 1, I400);
+  RunTest(0, 1, I420);
+  RunTest(0, 1, I422);
+  RunTest(0, 1, I444);
+}
+
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  RunTest(1, 100000, I400);
+  RunTest(1, 100000, I420);
+  RunTest(1, 100000, I422);
+  RunTest(1, 100000, I444);
+}
+
+#if HAVE_AVX2
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_avx2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+#if CONFIG_AV1_HIGHBITDEPTH
+
+typedef void (*HBDTemporalFilterFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strenght,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<HBDTemporalFilterFunc>
+    HBDTemporalFilterFuncParam;
+
+typedef std::tuple<HBDTemporalFilterFuncParam, int> HBDTemporalFilterWithParam;
+
+class HBDTemporalFilterTest
+    : public ::testing::TestWithParam<HBDTemporalFilterWithParam> {
+ public:
+  virtual ~HBDTemporalFilterTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+    src2_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+
+    ASSERT_NE(src1_, nullptr);
+    ASSERT_NE(src2_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int run_times, int bd, ColorFormat color_fmt);
+
+  void GenRandomData(int width, int height, int stride, int stride2, int bd,
+                     int subsampling_x, int subsampling_y, int num_planes) {
+    uint16_t *src1p = src1_;
+    uint16_t *src2p = src2_;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      const uint16_t max_val = (1 << bd) - 1;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = rnd_.Rand16() & max_val;
+          src2p[jj] = rnd_.Rand16() & max_val;
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, int stride2, int bd,
+                      int subsampling_x, int subsampling_y, int num_planes,
+                      uint16_t val) {
+    uint16_t *src1p = src1_;
+    uint16_t *src2p = src2_;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_w = plane ? width >> subsampling_x : width;
+      int plane_h = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+      uint16_t max_val = (1 << bd) - 1;
+      for (int ii = 0; ii < plane_h; ii++) {
+        for (int jj = 0; jj < plane_w; jj++) {
+          src1p[jj] = val;
+          src2p[jj] = (max_val - val);
+        }
+        src1p += plane_stride;
+        src2p += plane_stride2;
+      }
+    }
+  }
+
+ protected:
+  HBDTemporalFilterFuncParam params_;
+  uint16_t *src1_;
+  uint16_t *src2_;
+  ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDTemporalFilterTest);
+
+void HBDTemporalFilterTest::RunTest(int isRandom, int run_times, int BD,
+                                    ColorFormat color_fmt) {
+  aom_usec_timer ref_timer, test_timer;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int width = block_size_wide[block_size];
+  const int height = block_size_high[block_size];
+  int num_planes = MAX_MB_PLANE;
+  int subsampling_x = 0;
+  int subsampling_y = 0;
+  if (color_fmt == I420) {
+    subsampling_x = 1;
+    subsampling_y = 1;
+  } else if (color_fmt == I422) {
+    subsampling_x = 1;
+    subsampling_y = 0;
+  } else if (color_fmt == I400) {
+    num_planes = 1;
+  }
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2, BD, subsampling_x,
+                    subsampling_y, num_planes);
+    } else {
+      const int msb = BD;
+      const uint16_t limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+                       subsampling_y, num_planes, limit);
+      } else {
+        GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+                       subsampling_y, num_planes, 0);
+      }
+    }
+    double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+                                   2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
+    const int filter_strength = 5;
+    const int mb_row = 0;
+    const int mb_col = 0;
+    std::unique_ptr<YV12_BUFFER_CONFIG> ref_frame(new (std::nothrow)
+                                                      YV12_BUFFER_CONFIG);
+    ASSERT_NE(ref_frame, nullptr);
+    ref_frame->y_crop_height = 360;
+    ref_frame->y_crop_width = 540;
+    ref_frame->heights[PLANE_TYPE_Y] = height;
+    ref_frame->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+    ref_frame->strides[PLANE_TYPE_Y] = stride;
+    ref_frame->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+    DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]);
+    ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src);
+    ref_frame->flags = YV12_FLAG_HIGHBITDEPTH;  // Only Hihgbd bit-depth test.
+    memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t));
+
+    std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
+    ASSERT_NE(mbd, nullptr);
+    mbd->bd = BD;
+    for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+      int plane_height = plane ? height >> subsampling_y : height;
+      int plane_stride = plane ? stride >> subsampling_x : stride;
+      ref_frame->buffers[plane] =
+          ref_frame->buffer_alloc + plane * plane_stride * plane_height;
+      mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+      mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+    }
+
+    params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                     num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                     accumulator_ref, count_ref);
+    params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                     num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
+                     filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                     accumulator_mod, count_mod);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                         num_planes, sigma, subblock_mvs, subblock_mses,
+                         q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                         accumulator_ref, count_ref);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
+                         num_planes, sigma, subblock_mvs, subblock_mses,
+                         q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
+                         accumulator_mod, count_mod);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+          elapsed_time_c, elapsed_time_simd,
+          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+          height, color_fmt_str[color_fmt]);
+
+    } else {
+      for (int i = 0, l = 0; i < height; i++) {
+        for (int j = 0; j < width; j++, l++) {
+          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C accumulator does not match optimized accumulator.";
+          EXPECT_EQ(count_ref[l], count_mod[l])
+              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+              << "] " << color_fmt_str[color_fmt]
+              << " C count does not match optimized count.";
+        }
+      }
+    }
+  }
+}
+
+TEST_P(HBDTemporalFilterTest, OperationCheck) {
+  RunTest(1, 1, 10, I400);
+  RunTest(1, 1, 10, I420);
+  RunTest(1, 1, 10, I422);
+  RunTest(1, 1, 10, I444);
+}
+
+TEST_P(HBDTemporalFilterTest, ExtremeValues) {
+  RunTest(0, 1, 10, I400);
+  RunTest(0, 1, 10, I420);
+  RunTest(0, 1, 10, I422);
+  RunTest(0, 1, 10, I444);
+}
+
+TEST_P(HBDTemporalFilterTest, DISABLED_Speed) {
+  RunTest(1, 100000, 10, I400);
+  RunTest(1, 100000, 10, I420);
+  RunTest(1, 100000, 10, I422);
+  RunTest(1, 100000, 10, I444);
+}
+#if HAVE_SSE2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_avx2)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_avx2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
+#endif
diff --git a/media/libaom/src/test/temporal_filter_yuv_test.cc b/media/libaom/src/test/temporal_filter_yuv_test.cc
deleted file mode 100644
index dc17aaaf7f..0000000000
--- a/media/libaom/src/test/temporal_filter_yuv_test.cc
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <ostream>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-
-namespace {
-
-using ::libaom_test::ACMRandom;
-
-const int MAX_WIDTH = 32;
-const int MAX_HEIGHT = 32;
-
-typedef void (*TemporalFilterYUVFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-
-struct TemporalFilterWithBd {
-  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
-      : temporal_filter(func), bd(bitdepth) {}
-
-  TemporalFilterYUVFunc temporal_filter;
-  int bd;
-};
-
-std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
-  return os << "Bitdepth: " << tf.bd;
-}
-
-int GetFilterWeight(unsigned int row, unsigned int col,
-                    unsigned int block_height, unsigned int block_width,
-                    const int *const blk_fw, int use_32x32) {
-  if (use_32x32) {
-    return blk_fw[0];
-  }
-
-  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
-}
-
-template <typename PixelType>
-int GetModIndex(int sum_dist, int index, int rounding, int strength,
-                int filter_weight) {
-  int mod = sum_dist * 3 / index;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Lowbitdepth version
-template <>
-int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
-                         int filter_weight) {
-  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
-                                  39322, 32768, 28087, 24576, 21846,
-                                  19661, 17874, 0,     15124 };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Highbitdepth version
-template <>
-int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
-                          int filter_weight) {
-  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
-                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
-                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
-                             0U,          991146300U };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              int val) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      pixel_array[col] = val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              ACMRandom *rnd, int low_val, int high_val) {
-  EXPECT_LE(low_val, high_val);
-
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      const int val =
-          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
-      pixel_array[col] = low_val + val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename ValueType>
-bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
-                     int height, int stride_1, int stride_2) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      if (arr_1[col] != arr_2[col]) {
-        return false;
-      }
-    }
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-  return true;
-}
-
-template <typename ValueType>
-void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
-                    int height, int stride_1, int stride_2) {
-  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
-
-  printf("Array 1:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_1[col]);
-      } else {
-        printf("%4d", arr_1[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-
-  printf("Array 2:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_2[col]);
-      } else {
-        printf("%4d", arr_2[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-  printf("Difference:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      printf("%4d", arr_1[col] - arr_2[col]);
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-}
-
-template <typename PixelType>
-void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
-                          const PixelType *u_src, const PixelType *v_src,
-                          const PixelType *u_pre, const PixelType *v_pre,
-                          unsigned int block_width, unsigned int block_height,
-                          int ss_x, int ss_y, int strength,
-                          const int *const blk_fw, int use_32x32,
-                          uint32_t *y_accum, uint16_t *y_count,
-                          uint32_t *u_accum, uint16_t *u_count,
-                          uint32_t *v_accum, uint16_t *v_count) {
-  const int uv_block_width = block_width >> ss_x,
-            uv_block_height = block_height >> ss_y;
-  const int y_src_stride = block_width, y_pre_stride = block_width;
-  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
-  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
-  const int y_count_stride = block_width, u_count_stride = uv_block_width,
-            v_count_stride = uv_block_width;
-  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
-            v_accum_stride = uv_block_width;
-
-  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  const int rounding = (1 << strength) >> 1;
-
-  // Get the square diffs
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int diff =
-          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
-      y_dif[row * y_diff_stride + col] = diff * diff;
-    }
-  }
-
-  for (int row = 0; row < (int)uv_block_height; row++) {
-    for (int col = 0; col < (int)uv_block_width; col++) {
-      const int u_diff =
-          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
-      const int v_diff =
-          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
-      u_dif[row * uv_diff_stride + col] = u_diff * u_diff;
-      v_dif[row * uv_diff_stride + col] = v_diff * v_diff;
-    }
-  }
-
-  // Apply the filter to luma
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int uv_row = row >> ss_y;
-      const int uv_col = col >> ss_x;
-      const int filter_weight = GetFilterWeight(row, col, block_height,
-                                                block_width, blk_fw, use_32x32);
-
-      // First we get the modifier for the current y pixel
-      const int y_pixel = y_pre[row * y_pre_stride + col];
-      int y_num_used = 0;
-      int y_mod = 0;
-
-      // Sum the neighboring 3x3 y pixels
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = row + row_step;
-          const int sub_col = col + col_step;
-
-          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
-              sub_col < (int)block_width) {
-            y_mod += y_dif[sub_row * y_diff_stride + sub_col];
-            y_num_used++;
-          }
-        }
-      }
-
-      // Sum the corresponding uv pixels to the current y modifier
-      // Note we are rounding down instead of rounding to the nearest pixel.
-      y_mod += u_dif[uv_row * uv_diff_stride + uv_col];
-      y_mod += v_dif[uv_row * uv_diff_stride + uv_col];
-
-      y_num_used += 2;
-
-      // Set the modifier
-      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      y_count[row * y_count_stride + col] += y_mod;
-      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
-    }
-  }
-
-  // Apply the filter to chroma
-  for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
-    for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
-      const int y_row = uv_row << ss_y;
-      const int y_col = uv_col << ss_x;
-      const int filter_weight = GetFilterWeight(
-          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
-
-      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
-      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
-
-      int uv_num_used = 0;
-      int u_mod = 0, v_mod = 0;
-
-      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = uv_row + row_step;
-          const int sub_col = uv_col + col_step;
-
-          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
-              sub_col < uv_block_width) {
-            u_mod += u_dif[sub_row * uv_diff_stride + sub_col];
-            v_mod += v_dif[sub_row * uv_diff_stride + sub_col];
-            uv_num_used++;
-          }
-        }
-      }
-
-      // Sum all the luma pixels associated with the current luma pixel
-      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
-        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
-          const int sub_row = y_row + row_step;
-          const int sub_col = y_col + col_step;
-          const int y_diff = y_dif[sub_row * y_diff_stride + sub_col];
-
-          u_mod += y_diff;
-          v_mod += y_diff;
-          uv_num_used++;
-        }
-      }
-
-      // Set the modifier
-      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      u_count[uv_row * u_count_stride + uv_col] += u_mod;
-      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
-      v_count[uv_row * v_count_stride + uv_col] += v_mod;
-      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
-    }
-  }
-}
-
-class TemporalFilterYUVTest
-    : public ::testing::TestWithParam<TemporalFilterWithBd> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam().temporal_filter;
-    bd_ = GetParam().bd;
-    use_highbd_ = (bd_ != 8);
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    saturate_test_ = 0;
-    num_repeats_ = 10;
-
-    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
-  }
-
- protected:
-  template <typename PixelType>
-  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
-                            int filter_strength, int use_32x32,
-                            const int *filter_weight);
-  template <typename PixelType>
-  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
-                              int filter_strength, int use_32x32,
-                              const int *filter_weight);
-  template <typename PixelType>
-  void ApplyTestFilter(const PixelType *y_src, int y_src_stride,
-                       const PixelType *y_pre, int y_pre_stride,
-                       const PixelType *u_src, const PixelType *v_src,
-                       int uv_src_stride, const PixelType *u_pre,
-                       const PixelType *v_pre, int uv_pre_stride,
-                       unsigned int block_width, unsigned int block_height,
-                       int ss_x, int ss_y, int strength, const int *blk_fw,
-                       int use_32x32, uint32_t *y_accum, uint16_t *y_count,
-                       uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum,
-                       uint16_t *v_count);
-
-  TemporalFilterYUVFunc filter_func_;
-  ACMRandom rnd_;
-  int saturate_test_;
-  int num_repeats_;
-  int use_highbd_;
-  int bd_;
-};
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint8_t>(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint8_t, src[alloc_size]);
-  ref_frame->buffer_alloc = src;
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint16_t>(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]);
-  ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16);
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-  const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16);
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::CompareTestWithParam(int width, int height,
-                                                 int ss_x, int ss_y,
-                                                 int filter_strength,
-                                                 int use_32x32,
-                                                 const int *filter_weight) {
-  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-  const int y_stride = width, uv_stride = uv_width;
-
-  DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    if (saturate_test_) {
-      const int max_val = (1 << bd_) - 1;
-      SetArray(y_src, width, height, y_stride, max_val);
-      SetArray(y_pre, width, height, y_stride, 0);
-      SetArray(u_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, 0);
-      SetArray(v_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, 0);
-    } else {
-      const int max_val = 7 << (bd_ - 8);
-      SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-    }
-
-    ApplyReferenceFilter<PixelType>(
-        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
-        filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref,
-        u_accum_ref, u_count_ref, v_accum_ref, v_count_ref);
-
-    ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride,
-                    u_pre, v_pre, uv_stride, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum_tst,
-                    y_count_tst, u_accum_tst, u_count_tst, v_accum_tst,
-                    v_count_tst);
-
-    EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-
-    if (HasFailure()) {
-      if (use_32x32) {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
-               filter_strength, *filter_weight);
-      } else {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
-               ss_y, filter_strength, filter_weight[0], filter_weight[1],
-               filter_weight[2], filter_weight[3]);
-      }
-
-      PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-
-      return;
-    }
-  }
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height,
-                                                   int ss_x, int ss_y,
-                                                   int filter_strength,
-                                                   int use_32x32,
-                                                   const int *filter_weight) {
-  PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH,
-                    u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum, y_count,
-                    u_accum, u_count, v_accum, v_count);
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use32x32) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           &filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          &filter_weight);
-          }
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use16x16) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 0;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
-        // Set up the filter
-        int filter_weight[4];
-        int filter_idx_cp = filter_idx;
-        for (int idx = 0; idx < 4; idx++) {
-          filter_weight[idx] = filter_idx_cp % 3;
-          filter_idx_cp /= 3;
-        }
-
-        // Test each parameter
-        for (int filter_strength = 0; filter_strength <= 6;
-             filter_strength += 2) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          filter_weight);
-          }
-
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, SaturationTest) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-  const int filter_weight = 1;
-  saturate_test_ = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        if (use_highbd_) {
-          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                         adjusted_strength, use_32x32,
-                                         &filter_weight);
-        } else {
-          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                        filter_strength, use_32x32,
-                                        &filter_weight);
-        }
-
-        ASSERT_FALSE(HasFailure());
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, DISABLED_Speed) {
-  const int width = 32, height = 32;
-  num_repeats_ = 1000;
-
-  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
-    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
-    for (int ss_x = 0; ss_x <= 1; ss_x++) {
-      for (int ss_y = 0; ss_y <= 1; ss_y++) {
-        for (int filter_idx = 0; filter_idx < num_filter_weights;
-             filter_idx++) {
-          // Set up the filter
-          int filter_weight[4];
-          int filter_idx_cp = filter_idx;
-          for (int idx = 0; idx < 4; idx++) {
-            filter_weight[idx] = filter_idx_cp % 3;
-            filter_idx_cp /= 3;
-          }
-
-          // Test each parameter
-          for (int filter_strength = 0; filter_strength <= 6;
-               filter_strength += 2) {
-            aom_usec_timer timer;
-            aom_usec_timer_start(&timer);
-
-            if (use_highbd_) {
-              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                               filter_strength, use_32x32,
-                                               filter_weight);
-            } else {
-              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                              filter_strength, use_32x32,
-                                              filter_weight);
-            }
-
-            aom_usec_timer_mark(&timer);
-            const int elapsed_time =
-                static_cast<int>(aom_usec_timer_elapsed(&timer));
-
-            printf(
-                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
-                "%d, Strength: %d, Time: %5d\n",
-                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
-                elapsed_time);
-          }
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    C, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12)));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12)));
-#endif  // HAVE_SSE4_1
-
-}  // namespace
diff --git a/media/libaom/src/test/test-data.sha1 b/media/libaom/src/test/test-data.sha1
index 383ae79c16..c8587c5e63 100644
--- a/media/libaom/src/test/test-data.sha1
+++ b/media/libaom/src/test/test-data.sha1
@@ -1,3 +1,4 @@
+a0edab4ab4054127474074d967a33616ccdccc76 *hantro_collage_w176h144.yuv
 d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
@@ -15,33 +16,32 @@ c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-high
 91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
 b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf
-9655e6275888547ecd1f14e20e08ce4891372e76 *invalid-oss-fuzz-10389.ivf.res
-e5fe0e8984c42d53d4ff734c3fbfd57d5c5c25cf *invalid-oss-fuzz-10389.ivf.res.2
+f4ce175af1d871ed1603c8936f6b78e968f93c85 *invalid-oss-fuzz-10389.ivf.res.4
 11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res
 cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
 758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res
 88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf
-1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res
 64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2
 0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res
 7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf
 15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res
 1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf
-1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-11479.ivf.res
 64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2
 b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf
-7c44ac1723c14d98bcb888fbf118c959511519ba *invalid-oss-fuzz-11523.ivf.res
 3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2
 cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res
 5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf
-09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-oss-fuzz-16437.ivf.res
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-16437.ivf.res.2
+e821070cea8eb687be102a1a118e0341c2e9df69 *invalid-oss-fuzz-24706.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-24706.ivf.res
+c0c32af28c5c6672d14e76d197894723e8a07b07 *invalid-oss-fuzz-33030.ivf
+fb38337e7d6203618fcfce4bc2dc17d5a4f00638 *invalid-oss-fuzz-33030.ivf.res
 ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf
 67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res
 c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
-d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2
 f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
 b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
@@ -65,6 +65,7 @@ eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv
 89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv
 33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv
 8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv
+9ec21aa2c4a8a9d46d5403ea20c93b0ff5ad74a1 *rand_noise_w1280h720.yuv
 70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv
 8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv
 edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv
@@ -557,3 +558,10 @@ f8724ed96272ddbc35776908f2df7cb9955766a9 *paris_352_288_30.y4m
 c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5
 2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf
 83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5
+644e05c6bc0418a72b86427aa01e8b4ecea85e03 *desktop1.320_180.yuv
+ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
+a17584012187cd886b64f8cb0f35bfd8d762f9dc *av1-1-b8-24-monochrome.ivf
+e71cd9a07f928c527c900daddd071ae60337426d *av1-1-b8-24-monochrome.ivf.md5
+03a8d002594ccc51932332002bb6f9837ef46d0f *av1-1-b10-24-monochrome.ivf
+e24aa6951afd7b2bb53eb1a73e25a19e7b189f82 *av1-1-b10-24-monochrome.ivf.md5
+df0c9481104aa8c81f9e3b61b6d147a331ad3e35 *firstpass_stats
diff --git a/media/libaom/src/test/test.cmake b/media/libaom/src/test/test.cmake
index d4d3b298dc..f44620f3fd 100644
--- a/media/libaom/src/test/test.cmake
+++ b/media/libaom/src/test/test.cmake
@@ -13,29 +13,28 @@ if(AOM_TEST_TEST_CMAKE_)
 endif() # AOM_TEST_TEST_CMAKE_
 set(AOM_TEST_TEST_CMAKE_ 1)
 
-include(FindPythonInterp)
 include(ProcessorCount)
 
 include("${AOM_ROOT}/test/test_data_util.cmake")
 
 set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
+set(AOM_IDE_TEST_FOLDER "test")
+set(AOM_IDE_TESTDATA_FOLDER "testdata")
 
 list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
             "${AOM_ROOT}/test/test_libaom.cc")
 
 list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
             "${AOM_ROOT}/test/acm_random.h"
+            "${AOM_ROOT}/test/aom_image_test.cc"
             "${AOM_ROOT}/test/aom_integer_test.cc"
             "${AOM_ROOT}/test/av1_config_test.cc"
-            "${AOM_ROOT}/test/blockd_test.cc"
-            "${AOM_ROOT}/test/clear_system_state.h"
+            "${AOM_ROOT}/test/av1_key_value_api_test.cc"
+            "${AOM_ROOT}/test/block_test.cc"
             "${AOM_ROOT}/test/codec_factory.h"
-            "${AOM_ROOT}/test/decode_test_driver.cc"
-            "${AOM_ROOT}/test/decode_test_driver.h"
             "${AOM_ROOT}/test/function_equivalence_test.h"
             "${AOM_ROOT}/test/log2_test.cc"
             "${AOM_ROOT}/test/md5_helper.h"
-            "${AOM_ROOT}/test/metadata_test.cc"
             "${AOM_ROOT}/test/register_state_check.h"
             "${AOM_ROOT}/test/test_vectors.cc"
             "${AOM_ROOT}/test/test_vectors.h"
@@ -43,12 +42,8 @@ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
             "${AOM_ROOT}/test/util.h"
             "${AOM_ROOT}/test/video_source.h")
 
-if(CONFIG_INTERNAL_STATS)
-  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-              "${AOM_ROOT}/test/hbd_metrics_test.cc")
-endif()
-
 list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
+            "${AOM_ROOT}/test/decode_scalability_test.cc"
             "${AOM_ROOT}/test/external_frame_buffer_test.cc"
             "${AOM_ROOT}/test/invalid_file_test.cc"
             "${AOM_ROOT}/test/test_vector_test.cc"
@@ -56,43 +51,72 @@ list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
 
 list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
             "${AOM_ROOT}/test/active_map_test.cc"
-            "${AOM_ROOT}/test/altref_test.cc"
             "${AOM_ROOT}/test/aq_segment_test.cc"
+            "${AOM_ROOT}/test/av1_external_partition_test.cc"
             "${AOM_ROOT}/test/borders_test.cc"
             "${AOM_ROOT}/test/cpu_speed_test.cc"
+            "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
             "${AOM_ROOT}/test/datarate_test.cc"
             "${AOM_ROOT}/test/datarate_test.h"
             "${AOM_ROOT}/test/svc_datarate_test.cc"
             "${AOM_ROOT}/test/encode_api_test.cc"
+            "${AOM_ROOT}/test/encode_small_width_height_test.cc"
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
-            "${AOM_ROOT}/test/end_to_end_test.cc"
-            "${AOM_ROOT}/test/fwd_kf_test.cc"
+            "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
             "${AOM_ROOT}/test/gf_pyr_height_test.cc"
             "${AOM_ROOT}/test/rt_end_to_end_test.cc"
-            "${AOM_ROOT}/test/error_resilience_test.cc"
+            "${AOM_ROOT}/test/loopfilter_control_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
             "${AOM_ROOT}/test/horz_superres_test.cc"
             "${AOM_ROOT}/test/i420_video_source.h"
             "${AOM_ROOT}/test/level_test.cc"
-            "${AOM_ROOT}/test/lossless_test.cc"
+            "${AOM_ROOT}/test/metadata_test.cc"
             "${AOM_ROOT}/test/monochrome_test.cc"
-            "${AOM_ROOT}/test/qm_test.cc"
             "${AOM_ROOT}/test/resize_test.cc"
             "${AOM_ROOT}/test/scalability_test.cc"
+            "${AOM_ROOT}/test/sharpness_test.cc"
             "${AOM_ROOT}/test/y4m_test.cc"
             "${AOM_ROOT}/test/y4m_video_source.h"
             "${AOM_ROOT}/test/yuv_video_source.h"
             "${AOM_ROOT}/test/time_stamp_test.cc")
 
-list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
 list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
 list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
 list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
             "${AOM_ROOT}/test/test_intra_pred_speed.cc")
 
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/decode_test_driver.cc"
+              "${AOM_ROOT}/test/decode_test_driver.h")
+endif()
+
+if(CONFIG_INTERNAL_STATS AND CONFIG_AV1_HIGHBITDEPTH)
+  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/hbd_metrics_test.cc")
+endif()
+
+list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
+
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                   "${AOM_ROOT}/test/av1_external_partition_test.cc"
+                   "${AOM_ROOT}/test/borders_test.cc"
+                   "${AOM_ROOT}/test/cpu_speed_test.cc"
+                   "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
+                   "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+                   "${AOM_ROOT}/test/gf_pyr_height_test.cc"
+                   "${AOM_ROOT}/test/horz_superres_test.cc"
+                   "${AOM_ROOT}/test/level_test.cc"
+                   "${AOM_ROOT}/test/metadata_test.cc"
+                   "${AOM_ROOT}/test/monochrome_test.cc"
+                   "${AOM_ROOT}/test/sharpness_test.cc")
+endif()
+
 if(NOT BUILD_SHARED_LIBS)
   list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/aom_mem_test.cc"
               "${AOM_ROOT}/test/av1_common_int_test.cc"
               "${AOM_ROOT}/test/cdef_test.cc"
               "${AOM_ROOT}/test/cfl_test.cc"
@@ -108,91 +132,28 @@ if(NOT BUILD_SHARED_LIBS)
               "${AOM_ROOT}/test/simd_cmp_impl.h"
               "${AOM_ROOT}/test/simd_impl.h")
 
-  if(CONFIG_ACCOUNTING)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/accounting_test.cc")
-  endif()
-
-  if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
-                "${AOM_ROOT}/test/av1_ext_tile_test.cc"
-                "${AOM_ROOT}/test/binary_codes_test.cc"
-                "${AOM_ROOT}/test/boolcoder_test.cc"
-                "${AOM_ROOT}/test/cnn_test.cc"
-                "${AOM_ROOT}/test/coding_path_sync.cc"
-                "${AOM_ROOT}/test/decode_multithreaded_test.cc"
-                "${AOM_ROOT}/test/divu_small_test.cc"
-                "${AOM_ROOT}/test/dr_prediction_test.cc"
-                "${AOM_ROOT}/test/ec_test.cc"
-                "${AOM_ROOT}/test/ethread_test.cc"
-                "${AOM_ROOT}/test/film_grain_table_test.cc"
-                "${AOM_ROOT}/test/sb_multipass_test.cc"
-                "${AOM_ROOT}/test/segment_binarization_sync.cc"
-                "${AOM_ROOT}/test/superframe_test.cc"
-                "${AOM_ROOT}/test/tile_independence_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_planewise_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
-    if(CONFIG_REALTIME_ONLY)
-      list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
-                       "${AOM_ROOT}/test/cnn_test.cc"
-                       "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
-    endif()
-    if(NOT CONFIG_AV1_HIGHBITDEPTH)
-      list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
-                       "${AOM_ROOT}/test/coding_path_sync.cc")
-    endif()
-  endif()
-
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
               "${AOM_ROOT}/test/simd_cmp_neon.cc")
-  if(HAVE_NEON)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_neon_test.cc")
-  endif()
 
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
               "${AOM_ROOT}/test/simd_cmp_sse2.cc")
-  if(HAVE_SSE2)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_sse2_test.cc")
-  endif()
 
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
               "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
-  if(HAVE_SSSE3)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_ssse3_test.cc")
-  endif()
-
-  if(HAVE_SSE4)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_sse4_test.cc")
-  endif()
-
-  if(HAVE_SSE4_1)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/filterintra_test.cc")
-  endif()
 
   list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
               "${AOM_ROOT}/test/simd_cmp_avx2.cc")
-  if(HAVE_AVX2)
-    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-                "${AOM_ROOT}/test/simd_avx2_test.cc")
-  endif()
 
   list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
               "${AOM_ROOT}/test/arf_freq_test.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
-              "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
+              "${AOM_ROOT}/test/av1_convolve_test.cc"
               "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
               "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
               "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_nn_predict_test.cc"
               "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+              "${AOM_ROOT}/test/av1_softmax_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.h"
               "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
@@ -202,10 +163,13 @@ if(NOT BUILD_SHARED_LIBS)
               "${AOM_ROOT}/test/comp_avg_pred_test.cc"
               "${AOM_ROOT}/test/comp_avg_pred_test.h"
               "${AOM_ROOT}/test/comp_mask_variance_test.cc"
-              "${AOM_ROOT}/test/edge_detect_test.cc"
+              "${AOM_ROOT}/test/encodemb_test.cc"
               "${AOM_ROOT}/test/encodetxb_test.cc"
+              "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
+              "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
               "${AOM_ROOT}/test/error_block_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
+              "${AOM_ROOT}/test/firstpass_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
               "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/hadamard_test.cc"
@@ -217,32 +181,133 @@ if(NOT BUILD_SHARED_LIBS)
               "${AOM_ROOT}/test/obmc_sad_test.cc"
               "${AOM_ROOT}/test/obmc_variance_test.cc"
               "${AOM_ROOT}/test/pickrst_test.cc"
-              "${AOM_ROOT}/test/quantize_func_test.cc"
               "${AOM_ROOT}/test/sad_test.cc"
               "${AOM_ROOT}/test/subtract_test.cc"
               "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sum_squares_test.cc"
+              "${AOM_ROOT}/test/sse_sum_test.cc"
               "${AOM_ROOT}/test/variance_test.cc"
               "${AOM_ROOT}/test/wiener_test.cc"
               "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/warp_filter_test.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.cc"
-              "${AOM_ROOT}/test/warp_filter_test_util.h")
+              "${AOM_ROOT}/test/warp_filter_test_util.h"
+              "${AOM_ROOT}/test/webmenc_test.cc"
+              "${AOM_ROOT}/test/av1_k_means_test.cc")
 
   list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
-              "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
-              "${AOM_ROOT}/test/av1_quantize_test.cc"
               "${AOM_ROOT}/test/corner_match_test.cc"
               "${AOM_ROOT}/test/simd_cmp_sse4.cc")
 
-  if(NOT CONFIG_AV1_HIGHBITDEPTH)
-    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
-                     "${AOM_ROOT}/test/av1_quantize_test.cc")
+  if(CONFIG_ACCOUNTING)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/accounting_test.cc")
   endif()
 
-  if(NOT (HAVE_SSE2 OR HAVE_NEON))
-    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
-                     "${AOM_ROOT}/test/quantize_func_test.cc")
+  if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/altref_test.cc"
+                "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+                "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+                "${AOM_ROOT}/test/binary_codes_test.cc"
+                "${AOM_ROOT}/test/boolcoder_test.cc"
+                "${AOM_ROOT}/test/cnn_test.cc"
+                "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+                "${AOM_ROOT}/test/divu_small_test.cc"
+                "${AOM_ROOT}/test/dr_prediction_test.cc"
+                "${AOM_ROOT}/test/ec_test.cc"
+                "${AOM_ROOT}/test/error_resilience_test.cc"
+                "${AOM_ROOT}/test/ethread_test.cc"
+                "${AOM_ROOT}/test/film_grain_table_test.cc"
+                "${AOM_ROOT}/test/kf_test.cc"
+                "${AOM_ROOT}/test/lossless_test.cc"
+                "${AOM_ROOT}/test/quant_test.cc"
+                "${AOM_ROOT}/test/ratectrl_test.cc"
+                "${AOM_ROOT}/test/rd_test.cc"
+                "${AOM_ROOT}/test/sb_multipass_test.cc"
+                "${AOM_ROOT}/test/screen_content_test.cc"
+                "${AOM_ROOT}/test/segment_binarization_sync.cc"
+                "${AOM_ROOT}/test/still_picture_test.cc"
+                "${AOM_ROOT}/test/temporal_filter_test.cc"
+                "${AOM_ROOT}/test/tile_config_test.cc"
+                "${AOM_ROOT}/test/tile_independence_test.cc"
+                "${AOM_ROOT}/test/tpl_model_test.cc")
+    if(CONFIG_AV1_HIGHBITDEPTH)
+      list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                  "${AOM_ROOT}/test/coding_path_sync.cc")
+    endif()
+    if(CONFIG_REALTIME_ONLY)
+      list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
+                       "${AOM_ROOT}/test/altref_test.cc"
+                       "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+                       "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+                       "${AOM_ROOT}/test/cnn_test.cc"
+                       "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+                       "${AOM_ROOT}/test/error_resilience_test.cc"
+                       "${AOM_ROOT}/test/kf_test.cc"
+                       "${AOM_ROOT}/test/lossless_test.cc"
+                       "${AOM_ROOT}/test/sb_multipass_test.cc"
+                       "${AOM_ROOT}/test/selfguided_filter_test.cc"
+                       "${AOM_ROOT}/test/screen_content_test.cc"
+                       "${AOM_ROOT}/test/still_picture_test.cc"
+                       "${AOM_ROOT}/test/tile_independence_test.cc"
+                       "${AOM_ROOT}/test/tpl_model_test.cc")
+    endif()
+  endif()
+
+  if(HAVE_NEON)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_neon_test.cc")
+  endif()
+
+  if(CONFIG_FRAME_PARALLEL_ENCODE
+     AND CONFIG_FPMT_TEST
+     AND (NOT CONFIG_REALTIME_ONLY))
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/frame_parallel_enc_test.cc")
+  endif()
+
+  if(HAVE_SSE2)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_sse2_test.cc")
+  endif()
+
+  if(HAVE_SSSE3)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_ssse3_test.cc")
+  endif()
+
+  if(HAVE_SSE4)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_sse4_test.cc")
+  endif()
+
+  if(HAVE_SSE4_1 OR HAVE_NEON)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/filterintra_test.cc")
+
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+                "${AOM_ROOT}/test/av1_highbd_iht_test.cc")
+  endif()
+
+  if(HAVE_AVX2)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_avx2_test.cc")
+  endif()
+
+  if(CONFIG_AV1_TEMPORAL_DENOISING AND (HAVE_SSE2 OR HAVE_NEON))
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+                "${AOM_ROOT}/test/av1_temporal_denoiser_test.cc")
+  endif()
+
+  if(CONFIG_AV1_HIGHBITDEPTH)
+    list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+                "${AOM_ROOT}/test/av1_quantize_test.cc")
+  endif()
+
+  if(HAVE_SSE2 OR HAVE_NEON)
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+                "${AOM_ROOT}/test/quantize_func_test.cc")
   endif()
 
   if(HAVE_SSE4_1)
@@ -257,35 +322,74 @@ if(NOT BUILD_SHARED_LIBS)
     list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
   endif()
 
+  if(CONFIG_REALTIME_ONLY)
+    list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
+                     "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
+                     "${AOM_ROOT}/test/firstpass_test.cc"
+                     "${AOM_ROOT}/test/frame_error_test.cc"
+                     "${AOM_ROOT}/test/motion_vector_test.cc"
+                     "${AOM_ROOT}/test/obmc_sad_test.cc"
+                     "${AOM_ROOT}/test/obmc_variance_test.cc"
+                     "${AOM_ROOT}/test/pickrst_test.cc"
+                     "${AOM_ROOT}/test/warp_filter_test.cc"
+                     "${AOM_ROOT}/test/warp_filter_test_util.cc"
+                     "${AOM_ROOT}/test/warp_filter_test_util.h"
+                     "${AOM_ROOT}/test/wiener_test.cc")
+  endif()
 endif()
 
-if(ENABLE_TESTS)
-  find_package(PythonInterp)
-  if(NOT PYTHONINTERP_FOUND)
-    message(
-      FATAL_ERROR "--- Unit tests require Python, rerun cmake with "
-                  "-DENABLE_TESTS=0 to avoid this error, or install Python and "
-                  "make sure it's in your PATH.")
-  endif()
+if(CONFIG_AV1_ENCODER AND ENABLE_TESTS)
+  list(APPEND AOM_RC_INTERFACE_SOURCES
+              "${AOM_ROOT}/test/encode_test_driver.cc"
+              "${AOM_ROOT}/test/encode_test_driver.h"
+              "${AOM_ROOT}/test/decode_test_driver.cc"
+              "${AOM_ROOT}/test/decode_test_driver.h"
+              "${AOM_ROOT}/test/codec_factory.h"
+              "${AOM_ROOT}/test/test_aom_rc_interface.cc"
+              "${AOM_ROOT}/test/ratectrl_rtc_test.cc"
+              "${AOM_ROOT}/common/y4minput.c"
+              "${AOM_ROOT}/common/y4minput.h"
+              "${AOM_ROOT}/test/y4m_video_source.h"
+              "${AOM_ROOT}/test/yuv_video_source.h")
+
+  list(APPEND AV1_RC_QMODE_SOURCES "${AOM_ROOT}/test/mock_ratectrl_qmode.h"
+              "${AOM_ROOT}/test/ratectrl_qmode_test.cc")
+endif()
 
+if(ENABLE_TESTS)
   if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
     set(CMAKE_MACOSX_RPATH 1)
   endif()
 
-  include_directories(
-    "${AOM_ROOT}/third_party/googletest/src/googletest/include")
-
-  include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
   add_library(
     aom_gtest STATIC
     "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
-  if(MSVC OR WIN32)
-    target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
-  elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
-    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1)
-  else()
-    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0)
+  set_property(TARGET aom_gtest PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+  target_include_directories(
+    aom_gtest
+    PUBLIC "${AOM_ROOT}/third_party/googletest/src/googletest/include"
+    PRIVATE "${AOM_ROOT}/third_party/googletest/src/googletest")
+
+  # The definition of GTEST_HAS_PTHREAD must be public, since it's checked by
+  # interface headers, not just by the implementation.
+  if(NOT (MSVC OR WIN32))
+    if(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
+      target_compile_definitions(aom_gtest PUBLIC GTEST_HAS_PTHREAD=1)
+    else()
+      target_compile_definitions(aom_gtest PUBLIC GTEST_HAS_PTHREAD=0)
+    endif()
   endif()
+
+  add_library(
+    aom_gmock STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googlemock/src/gmock-all.cc")
+  set_property(TARGET aom_gmock PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+  target_include_directories(
+    aom_gmock
+    PUBLIC "${AOM_ROOT}/third_party/googletest/src/googlemock/include"
+    PRIVATE "${AOM_ROOT}/third_party/googletest/src/googlemock")
+  target_link_libraries(aom_gmock ${AOM_LIB_LINK_TYPE} aom_gtest)
 endif()
 
 # Setup testdata download targets, test build targets, and test run targets. The
@@ -298,21 +402,28 @@ function(setup_aom_test_targets)
   # list into separate object library targets, and then linking them into
   # test_libaom.
   add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+  set_property(TARGET test_aom_common PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
   add_dependencies(test_aom_common aom)
+  target_link_libraries(test_aom_common ${AOM_LIB_LINK_TYPE} aom_gtest)
 
   if(CONFIG_AV1_DECODER)
     add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+    set_property(TARGET test_aom_decoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
     add_dependencies(test_aom_decoder aom)
+    target_link_libraries(test_aom_decoder ${AOM_LIB_LINK_TYPE} aom_gtest)
   endif()
 
   if(CONFIG_AV1_ENCODER)
     add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+    set_property(TARGET test_aom_encoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
     add_dependencies(test_aom_encoder aom)
+    target_link_libraries(test_aom_encoder ${AOM_LIB_LINK_TYPE} aom_gtest)
   endif()
 
   add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
                              $<TARGET_OBJECTS:aom_common_app_util>
                              $<TARGET_OBJECTS:test_aom_common>)
+  set_property(TARGET test_libaom PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
   list(APPEND AOM_APP_TARGETS test_libaom)
 
   if(CONFIG_AV1_DECODER)
@@ -336,6 +447,8 @@ function(setup_aom_test_targets)
       add_executable(test_intra_pred_speed
                      ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
                      $<TARGET_OBJECTS:aom_common_app_util>)
+      set_property(TARGET test_intra_pred_speed
+                   PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
       target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
                             aom_gtest)
       list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
@@ -394,12 +507,15 @@ function(setup_aom_test_targets)
                 -DAOM_TEST_FILE="${test_file}"
                 -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
                 "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      set_property(TARGET testdata_${test_index}
+                   PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
       list(APPEND testdata_targets testdata_${test_index})
     endforeach()
 
     # Create a custom build target for running each test data download target.
     add_custom_target(testdata)
     add_dependencies(testdata ${testdata_targets})
+    set_property(TARGET testdata PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
 
     # Skip creation of test run targets when generating for Visual Studio and
     # Xcode unless the user explicitly requests IDE test hosting. This is done
@@ -425,9 +541,11 @@ function(setup_aom_test_targets)
                                   -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
                                   "${AOM_ROOT}/test/test_runner.cmake"
                           DEPENDS testdata test_libaom)
+        set_property(TARGET ${test_name} PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
         list(APPEND test_targets ${test_name})
       endforeach()
       add_custom_target(runtests)
+      set_property(TARGET runtests PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
       add_dependencies(runtests ${test_targets})
     endif()
   endif()
@@ -467,5 +585,30 @@ function(setup_aom_test_targets)
     endforeach()
   endforeach()
 
+  # Set up test for rc interface
+  if(CONFIG_AV1_RC_RTC
+     AND CONFIG_AV1_ENCODER
+     AND ENABLE_TESTS
+     AND CONFIG_WEBM_IO
+     AND NOT BUILD_SHARED_LIBS)
+    add_executable(test_aom_rc_interface ${AOM_RC_INTERFACE_SOURCES})
+    target_link_libraries(test_aom_rc_interface ${AOM_LIB_LINK_TYPE} aom
+                          aom_av1_rc aom_gtest webm)
+    set_property(TARGET test_aom_rc_interface
+                 PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+    list(APPEND AOM_APP_TARGETS test_aom_rc_interface)
+  endif()
+
+  if(CONFIG_AV1_ENCODER
+     AND ENABLE_TESTS
+     AND NOT BUILD_SHARED_LIBS
+     AND NOT CONFIG_REALTIME_ONLY)
+    add_executable(test_av1_rc_qmode ${AV1_RC_QMODE_SOURCES})
+    target_link_libraries(test_av1_rc_qmode ${AOM_LIB_LINK_TYPE} av1_rc_qmode
+                          aom_gtest aom_gmock)
+    set_property(TARGET test_av1_rc_qmode
+                 PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+    list(APPEND AOM_APP_TARGETS test_av1_rc_qmode)
+  endif()
   set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE)
 endfunction()
diff --git a/media/libaom/src/av1/common/cdef_block_sse2.c b/media/libaom/src/test/test_aom_rc_interface.cc
index 73f115d17c..0182b62ec8 100644
--- a/media/libaom/src/av1/common/cdef_block_sse2.c
+++ b/media/libaom/src/test/test_aom_rc_interface.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse2
-#include "av1/common/cdef_block_simd.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/media/libaom/src/test/test_data_util.cmake b/media/libaom/src/test/test_data_util.cmake
index 050600e133..b40de5b723 100644
--- a/media/libaom/src/test/test_data_util.cmake
+++ b/media/libaom/src/test/test_data_util.cmake
@@ -10,7 +10,10 @@
 #
 
 list(APPEND AOM_TEST_DATA_FILE_NAMES
+            "desktop1.320_180.yuv"
+            "hantro_collage_w176h144.yuv"
             "hantro_collage_w352h288.yuv"
+            "hantro_collage_w352h288_nv12.yuv"
             "hantro_odd.yuv"
             "paris_352_288_30.y4m"
             "park_joy_90p_10_420.y4m"
@@ -27,12 +30,14 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
             "park_joy_90p_8_444.y4m"
             "pixel_capture_w320h240.yuv"
             "desktop_credits.y4m"
+            "rand_noise_w1280h720.yuv"
             "niklas_1280_720_30.y4m"
             "rush_hour_444.y4m"
             "screendata.y4m"
             "niklas_640_480_30.yuv"
             "vase10x10.yuv"
-            "vase10x10_tiles.txt")
+            "vase10x10_tiles.txt"
+            "firstpass_stats")
 
 if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
   list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
@@ -298,6 +303,8 @@ if(CONFIG_AV1_DECODER)
               "av1-1-b10-00-quantizer-63.ivf.md5"
               "av1-1-b10-23-film_grain-50.ivf"
               "av1-1-b10-23-film_grain-50.ivf.md5"
+              "av1-1-b10-24-monochrome.ivf"
+              "av1-1-b10-24-monochrome.ivf.md5"
               "av1-1-b8-01-size-16x16.ivf"
               "av1-1-b8-01-size-16x16.ivf.md5"
               "av1-1-b8-01-size-16x18.ivf"
@@ -518,6 +525,8 @@ if(CONFIG_AV1_DECODER)
               "av1-1-b8-22-svc-L2T2.ivf.md5"
               "av1-1-b8-23-film_grain-50.ivf"
               "av1-1-b8-23-film_grain-50.ivf.md5"
+              "av1-1-b8-24-monochrome.ivf"
+              "av1-1-b8-24-monochrome.ivf.md5"
               "invalid-bug-1814.ivf"
               "invalid-bug-1814.ivf.res"
               "invalid-chromium-906381.ivf"
@@ -533,33 +542,32 @@ if(CONFIG_AV1_DECODER)
               "invalid-oss-fuzz-10227.ivf"
               "invalid-oss-fuzz-10227.ivf.res"
               "invalid-oss-fuzz-10389.ivf"
-              "invalid-oss-fuzz-10389.ivf.res"
-              "invalid-oss-fuzz-10389.ivf.res.2"
+              "invalid-oss-fuzz-10389.ivf.res.4"
               "invalid-oss-fuzz-10555.ivf"
               "invalid-oss-fuzz-10555.ivf.res"
               "invalid-oss-fuzz-10705.ivf"
               "invalid-oss-fuzz-10705.ivf.res"
               "invalid-oss-fuzz-10723.ivf"
-              "invalid-oss-fuzz-10723.ivf.res"
               "invalid-oss-fuzz-10723.ivf.res.2"
               "invalid-oss-fuzz-10779.ivf"
               "invalid-oss-fuzz-10779.ivf.res"
               "invalid-oss-fuzz-11477.ivf"
               "invalid-oss-fuzz-11477.ivf.res"
               "invalid-oss-fuzz-11479.ivf"
-              "invalid-oss-fuzz-11479.ivf.res"
               "invalid-oss-fuzz-11479.ivf.res.2"
               "invalid-oss-fuzz-11523.ivf"
-              "invalid-oss-fuzz-11523.ivf.res"
               "invalid-oss-fuzz-11523.ivf.res.2"
               "invalid-oss-fuzz-15363.ivf"
               "invalid-oss-fuzz-15363.ivf.res"
               "invalid-oss-fuzz-16437.ivf"
-              "invalid-oss-fuzz-16437.ivf.res"
+              "invalid-oss-fuzz-16437.ivf.res.2"
+              "invalid-oss-fuzz-24706.ivf"
+              "invalid-oss-fuzz-24706.ivf.res"
+              "invalid-oss-fuzz-33030.ivf"
+              "invalid-oss-fuzz-33030.ivf.res"
               "invalid-oss-fuzz-9288.ivf"
               "invalid-oss-fuzz-9288.ivf.res"
               "invalid-oss-fuzz-9463.ivf"
-              "invalid-oss-fuzz-9463.ivf.res"
               "invalid-oss-fuzz-9463.ivf.res.2"
               "invalid-oss-fuzz-9482.ivf"
               "invalid-oss-fuzz-9482.ivf.res"
diff --git a/media/libaom/src/test/test_intra_pred_speed.cc b/media/libaom/src/test/test_intra_pred_speed.cc
index 25c50d022a..80fba3b5f2 100644
--- a/media/libaom/src/test/test_intra_pred_speed.cc
+++ b/media/libaom/src/test/test_intra_pred_speed.cc
@@ -19,7 +19,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -135,7 +134,6 @@ void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
       pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
                     intra_pred_test_mem.above, intra_pred_test_mem.left);
     }
-    libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
@@ -413,21 +411,17 @@ static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
 // -----------------------------------------------------------------------------
 // 4x4, 4x8, 4x16
 
-INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c,
-                aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
-                aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
-                aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c,
-                aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c,
-                aom_smooth_h_predictor_4x4_c)
-
-INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c,
-                aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
-                aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
-                aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c,
-                aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c,
-                aom_smooth_h_predictor_4x8_c)
-
-INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
+INTRA_PRED_TEST(C, TX_4X4, aom_dc_predictor_4x4_c, aom_dc_left_predictor_4x4_c,
+                aom_dc_top_predictor_4x4_c, aom_dc_128_predictor_4x4_c,
+                aom_v_predictor_4x4_c, aom_h_predictor_4x4_c,
+                aom_paeth_predictor_4x4_c, aom_smooth_predictor_4x4_c,
+                aom_smooth_v_predictor_4x4_c, aom_smooth_h_predictor_4x4_c)
+INTRA_PRED_TEST(C, TX_4X8, aom_dc_predictor_4x8_c, aom_dc_left_predictor_4x8_c,
+                aom_dc_top_predictor_4x8_c, aom_dc_128_predictor_4x8_c,
+                aom_v_predictor_4x8_c, aom_h_predictor_4x8_c,
+                aom_paeth_predictor_4x8_c, aom_smooth_predictor_4x8_c,
+                aom_smooth_v_predictor_4x8_c, aom_smooth_h_predictor_4x8_c)
+INTRA_PRED_TEST(C, TX_4X16, aom_dc_predictor_4x16_c,
                 aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
                 aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
                 aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
@@ -435,30 +429,30 @@ INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
                 aom_smooth_h_predictor_4x16_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2,
+INTRA_PRED_TEST(SSE2, TX_4X4, aom_dc_predictor_4x4_sse2,
                 aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
                 aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
                 aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2,
+INTRA_PRED_TEST(SSE2, TX_4X8, aom_dc_predictor_4x8_sse2,
                 aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
                 aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
                 aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2,
+INTRA_PRED_TEST(SSE2, TX_4X16, aom_dc_predictor_4x16_sse2,
                 aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
                 aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
                 aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
                 aom_smooth_v_predictor_4x4_ssse3,
                 aom_smooth_h_predictor_4x4_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
                 aom_smooth_v_predictor_4x8_ssse3,
                 aom_smooth_h_predictor_4x8_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3,
                 aom_smooth_v_predictor_4x16_ssse3,
                 aom_smooth_h_predictor_4x16_ssse3)
@@ -473,7 +467,17 @@ INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL,
 INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
                 aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
                 aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
-                aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL)
+                aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
+                aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
+                aom_smooth_h_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
+                aom_smooth_v_predictor_4x8_neon,
+                aom_smooth_h_predictor_4x8_neon)
+INTRA_PRED_TEST(NEON, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
+                aom_smooth_v_predictor_4x16_neon,
+                aom_smooth_h_predictor_4x16_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -486,28 +490,24 @@ INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa,
 // -----------------------------------------------------------------------------
 // 8x8, 8x4, 8x16, 8x32
 
-INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c,
-                aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
-                aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
-                aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c,
-                aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c,
-                aom_smooth_h_predictor_8x8_c)
-
-INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c,
-                aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
-                aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
-                aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c,
-                aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c,
-                aom_smooth_h_predictor_8x4_c)
-
-INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c,
+INTRA_PRED_TEST(C, TX_8X8, aom_dc_predictor_8x8_c, aom_dc_left_predictor_8x8_c,
+                aom_dc_top_predictor_8x8_c, aom_dc_128_predictor_8x8_c,
+                aom_v_predictor_8x8_c, aom_h_predictor_8x8_c,
+                aom_paeth_predictor_8x8_c, aom_smooth_predictor_8x8_c,
+                aom_smooth_v_predictor_8x8_c, aom_smooth_h_predictor_8x8_c)
+
+INTRA_PRED_TEST(C, TX_8X4, aom_dc_predictor_8x4_c, aom_dc_left_predictor_8x4_c,
+                aom_dc_top_predictor_8x4_c, aom_dc_128_predictor_8x4_c,
+                aom_v_predictor_8x4_c, aom_h_predictor_8x4_c,
+                aom_paeth_predictor_8x4_c, aom_smooth_predictor_8x4_c,
+                aom_smooth_v_predictor_8x4_c, aom_smooth_h_predictor_8x4_c)
+INTRA_PRED_TEST(C, TX_8X16, aom_dc_predictor_8x16_c,
                 aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
                 aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
                 aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c,
                 aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
                 aom_smooth_h_predictor_8x16_c)
-
-INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
+INTRA_PRED_TEST(C, TX_8X32, aom_dc_predictor_8x32_c,
                 aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
                 aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
                 aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
@@ -515,38 +515,38 @@ INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
                 aom_smooth_h_predictor_8x32_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2,
+INTRA_PRED_TEST(SSE2, TX_8X8, aom_dc_predictor_8x8_sse2,
                 aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
                 aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
                 aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2,
+INTRA_PRED_TEST(SSE2, TX_8X4, aom_dc_predictor_8x4_sse2,
                 aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
                 aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
                 aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2,
+INTRA_PRED_TEST(SSE2, TX_8X16, aom_dc_predictor_8x16_sse2,
                 aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
                 aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
                 aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
+INTRA_PRED_TEST(SSE2, TX_8X32, aom_dc_predictor_8x32_sse2,
                 aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
                 aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
                 aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3,
                 aom_smooth_v_predictor_8x8_ssse3,
                 aom_smooth_h_predictor_8x8_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3,
                 aom_smooth_v_predictor_8x4_ssse3,
                 aom_smooth_h_predictor_8x4_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
                 aom_smooth_v_predictor_8x16_ssse3,
                 aom_smooth_h_predictor_8x16_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
                 aom_smooth_v_predictor_8x32_ssse3,
                 aom_smooth_h_predictor_8x32_ssse3)
@@ -561,7 +561,21 @@ INTRA_PRED_TEST(DSPR2, TX_8X8, aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL,
 INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
                 aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
                 aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
-                aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL)
+                aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
+                aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
+                aom_smooth_h_predictor_8x8_neon)
+INTRA_PRED_TEST(NEON, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
+                aom_smooth_v_predictor_8x4_neon,
+                aom_smooth_h_predictor_8x4_neon)
+INTRA_PRED_TEST(NEON, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
+                aom_smooth_v_predictor_8x16_neon,
+                aom_smooth_h_predictor_8x16_neon)
+INTRA_PRED_TEST(NEON, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
+                aom_smooth_v_predictor_8x32_neon,
+                aom_smooth_h_predictor_8x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -574,35 +588,31 @@ INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa,
 // -----------------------------------------------------------------------------
 // 16x16, 16x8, 16x32, 16x4, 16x64
 
-INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c,
+INTRA_PRED_TEST(C, TX_16X16, aom_dc_predictor_16x16_c,
                 aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
                 aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
                 aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c,
                 aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c,
                 aom_smooth_h_predictor_16x16_c)
-
-INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c,
+INTRA_PRED_TEST(C, TX_16X8, aom_dc_predictor_16x8_c,
                 aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
                 aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
                 aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c,
                 aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c,
                 aom_smooth_h_predictor_16x8_c)
-
-INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c,
+INTRA_PRED_TEST(C, TX_16X32, aom_dc_predictor_16x32_c,
                 aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
                 aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
                 aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c,
                 aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
                 aom_smooth_h_predictor_16x32_c)
-
-INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c,
+INTRA_PRED_TEST(C, TX_16X4, aom_dc_predictor_16x4_c,
                 aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
                 aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
                 aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c,
                 aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c,
                 aom_smooth_h_predictor_16x4_c)
-
-INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c,
+INTRA_PRED_TEST(C, TX_16X64, aom_dc_predictor_16x64_c,
                 aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c,
                 aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c,
                 aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
@@ -610,65 +620,65 @@ INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c,
                 aom_smooth_h_predictor_16x64_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2,
+INTRA_PRED_TEST(SSE2, TX_16X16, aom_dc_predictor_16x16_sse2,
                 aom_dc_left_predictor_16x16_sse2,
                 aom_dc_top_predictor_16x16_sse2,
                 aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
                 aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2,
+INTRA_PRED_TEST(SSE2, TX_16X8, aom_dc_predictor_16x8_sse2,
                 aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2,
                 aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2,
                 aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2,
+INTRA_PRED_TEST(SSE2, TX_16X32, aom_dc_predictor_16x32_sse2,
                 aom_dc_left_predictor_16x32_sse2,
                 aom_dc_top_predictor_16x32_sse2,
                 aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
                 aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
+INTRA_PRED_TEST(SSE2, TX_16X64, aom_dc_predictor_16x64_sse2,
                 aom_dc_left_predictor_16x64_sse2,
                 aom_dc_top_predictor_16x64_sse2,
                 aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
                 aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
+INTRA_PRED_TEST(SSE2, TX_16X4, aom_dc_predictor_16x4_sse2,
                 aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
                 aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
                 aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x16_ssse3,
                 aom_smooth_predictor_16x16_ssse3,
                 aom_smooth_v_predictor_16x16_ssse3,
                 aom_smooth_h_predictor_16x16_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
                 aom_smooth_v_predictor_16x8_ssse3,
                 aom_smooth_h_predictor_16x8_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_ssse3,
                 aom_smooth_predictor_16x32_ssse3,
                 aom_smooth_v_predictor_16x32_ssse3,
                 aom_smooth_h_predictor_16x32_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x64_ssse3,
                 aom_smooth_predictor_16x64_ssse3,
                 aom_smooth_v_predictor_16x64_ssse3,
                 aom_smooth_h_predictor_16x64_ssse3)
-INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3,
                 aom_smooth_v_predictor_16x4_ssse3,
                 aom_smooth_h_predictor_16x4_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
@@ -682,7 +692,26 @@ INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
                 aom_dc_left_predictor_16x16_neon,
                 aom_dc_top_predictor_16x16_neon,
                 aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
-                aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL)
+                aom_h_predictor_16x16_neon, aom_paeth_predictor_16x16_neon,
+                aom_smooth_predictor_16x16_neon,
+                aom_smooth_v_predictor_16x16_neon,
+                aom_smooth_h_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
+                aom_smooth_v_predictor_16x8_neon,
+                aom_smooth_h_predictor_16x8_neon)
+INTRA_PRED_TEST(NEON, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
+                aom_smooth_v_predictor_16x32_neon,
+                aom_smooth_h_predictor_16x32_neon)
+INTRA_PRED_TEST(NEON, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
+                aom_smooth_v_predictor_16x4_neon,
+                aom_smooth_h_predictor_16x4_neon)
+INTRA_PRED_TEST(NEON, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
+                aom_smooth_v_predictor_16x64_neon,
+                aom_smooth_h_predictor_16x64_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -695,28 +724,25 @@ INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa,
 // -----------------------------------------------------------------------------
 // 32x32, 32x16, 32x64, 32x8
 
-INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c,
+INTRA_PRED_TEST(C, TX_32X32, aom_dc_predictor_32x32_c,
                 aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
                 aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
                 aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c,
                 aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c,
                 aom_smooth_h_predictor_32x32_c)
-
-INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c,
+INTRA_PRED_TEST(C, TX_32X16, aom_dc_predictor_32x16_c,
                 aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
                 aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
                 aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c,
                 aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c,
                 aom_smooth_h_predictor_32x16_c)
-
-INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c,
+INTRA_PRED_TEST(C, TX_32X64, aom_dc_predictor_32x64_c,
                 aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c,
                 aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c,
                 aom_h_predictor_32x64_c, aom_paeth_predictor_32x64_c,
                 aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
                 aom_smooth_h_predictor_32x64_c)
-
-INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
+INTRA_PRED_TEST(C, TX_32X8, aom_dc_predictor_32x8_c,
                 aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
                 aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
                 aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
@@ -724,62 +750,62 @@ INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
                 aom_smooth_h_predictor_32x8_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2,
+INTRA_PRED_TEST(SSE2, TX_32X32, aom_dc_predictor_32x32_sse2,
                 aom_dc_left_predictor_32x32_sse2,
                 aom_dc_top_predictor_32x32_sse2,
                 aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
                 aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2,
+INTRA_PRED_TEST(SSE2, TX_32X16, aom_dc_predictor_32x16_sse2,
                 aom_dc_left_predictor_32x16_sse2,
                 aom_dc_top_predictor_32x16_sse2,
                 aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
                 aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
+INTRA_PRED_TEST(SSE2, TX_32X64, aom_dc_predictor_32x64_sse2,
                 aom_dc_left_predictor_32x64_sse2,
                 aom_dc_top_predictor_32x64_sse2,
                 aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
                 aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2,
+INTRA_PRED_TEST(SSE2, TX_32X8, aom_dc_predictor_32x8_sse2,
                 aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
                 aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
                 aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x32_ssse3,
                 aom_smooth_predictor_32x32_ssse3,
                 aom_smooth_v_predictor_32x32_ssse3,
                 aom_smooth_h_predictor_32x32_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x16_ssse3,
                 aom_smooth_predictor_32x16_ssse3,
                 aom_smooth_v_predictor_32x16_ssse3,
                 aom_smooth_h_predictor_32x16_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x64_ssse3,
                 aom_smooth_predictor_32x64_ssse3,
                 aom_smooth_v_predictor_32x64_ssse3,
                 aom_smooth_h_predictor_32x64_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3,
                 aom_smooth_v_predictor_32x8_ssse3,
                 aom_smooth_h_predictor_32x8_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2,
+INTRA_PRED_TEST(AVX2, TX_32X32, aom_dc_predictor_32x32_avx2,
                 aom_dc_left_predictor_32x32_avx2,
                 aom_dc_top_predictor_32x32_avx2,
                 aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
                 aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2,
                 NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2,
+INTRA_PRED_TEST(AVX2, TX_32X16, aom_dc_predictor_32x16_avx2,
                 aom_dc_left_predictor_32x16_avx2,
                 aom_dc_top_predictor_32x16_avx2,
                 aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
                 NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
+INTRA_PRED_TEST(AVX2, TX_32X64, aom_dc_predictor_32x64_avx2,
                 aom_dc_left_predictor_32x64_avx2,
                 aom_dc_top_predictor_32x64_avx2,
                 aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
@@ -791,7 +817,22 @@ INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon,
                 aom_dc_left_predictor_32x32_neon,
                 aom_dc_top_predictor_32x32_neon,
                 aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
-                aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL)
+                aom_h_predictor_32x32_neon, aom_paeth_predictor_32x32_neon,
+                aom_smooth_predictor_32x32_neon,
+                aom_smooth_v_predictor_32x32_neon,
+                aom_smooth_h_predictor_32x32_neon)
+INTRA_PRED_TEST(NEON, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
+                aom_smooth_v_predictor_32x16_neon,
+                aom_smooth_h_predictor_32x16_neon)
+INTRA_PRED_TEST(NEON, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
+                aom_smooth_v_predictor_32x64_neon,
+                aom_smooth_h_predictor_32x64_neon)
+INTRA_PRED_TEST(NEON, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
+                aom_smooth_v_predictor_32x8_neon,
+                aom_smooth_h_predictor_32x8_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -804,21 +845,19 @@ INTRA_PRED_TEST(MSA, TX_32X32, aom_dc_predictor_32x32_msa,
 // -----------------------------------------------------------------------------
 // 64x64, 64x32, 64x16
 
-INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c,
+INTRA_PRED_TEST(C, TX_64X64, aom_dc_predictor_64x64_c,
                 aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c,
                 aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c,
                 aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c,
                 aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c,
                 aom_smooth_h_predictor_64x64_c)
-
-INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c,
+INTRA_PRED_TEST(C, TX_64X32, aom_dc_predictor_64x32_c,
                 aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c,
                 aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c,
                 aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c,
                 aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
                 aom_smooth_h_predictor_64x32_c)
-
-INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
+INTRA_PRED_TEST(C, TX_64X16, aom_dc_predictor_64x16_c,
                 aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
                 aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
                 aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
@@ -826,17 +865,17 @@ INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
                 aom_smooth_h_predictor_64x16_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
+INTRA_PRED_TEST(SSE2, TX_64X64, aom_dc_predictor_64x64_sse2,
                 aom_dc_left_predictor_64x64_sse2,
                 aom_dc_top_predictor_64x64_sse2,
                 aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
                 aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
+INTRA_PRED_TEST(SSE2, TX_64X32, aom_dc_predictor_64x32_sse2,
                 aom_dc_left_predictor_64x32_sse2,
                 aom_dc_top_predictor_64x32_sse2,
                 aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
                 aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
+INTRA_PRED_TEST(SSE2, TX_64X16, aom_dc_predictor_64x16_sse2,
                 aom_dc_left_predictor_64x16_sse2,
                 aom_dc_top_predictor_64x16_sse2,
                 aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
@@ -844,17 +883,17 @@ INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
 #endif
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x64_ssse3,
                 aom_smooth_predictor_64x64_ssse3,
                 aom_smooth_v_predictor_64x64_ssse3,
                 aom_smooth_h_predictor_64x64_ssse3)
-INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x32_ssse3,
                 aom_smooth_predictor_64x32_ssse3,
                 aom_smooth_v_predictor_64x32_ssse3,
                 aom_smooth_h_predictor_64x32_ssse3)
-INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x16_ssse3,
                 aom_smooth_predictor_64x16_ssse3,
                 aom_smooth_v_predictor_64x16_ssse3,
@@ -862,23 +901,38 @@ INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
 #endif
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
+INTRA_PRED_TEST(AVX2, TX_64X64, aom_dc_predictor_64x64_avx2,
                 aom_dc_left_predictor_64x64_avx2,
                 aom_dc_top_predictor_64x64_avx2,
                 aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
                 NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
+INTRA_PRED_TEST(AVX2, TX_64X32, aom_dc_predictor_64x32_avx2,
                 aom_dc_left_predictor_64x32_avx2,
                 aom_dc_top_predictor_64x32_avx2,
                 aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
                 NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
+INTRA_PRED_TEST(AVX2, TX_64X16, aom_dc_predictor_64x16_avx2,
                 aom_dc_left_predictor_64x16_avx2,
                 aom_dc_top_predictor_64x16_avx2,
                 aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
                 NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif
 
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
+                aom_smooth_v_predictor_64x64_neon,
+                aom_smooth_h_predictor_64x64_neon)
+INTRA_PRED_TEST(NEON, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
+                aom_smooth_v_predictor_64x32_neon,
+                aom_smooth_h_predictor_64x32_neon)
+INTRA_PRED_TEST(NEON, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
+                aom_smooth_v_predictor_64x16_neon,
+                aom_smooth_h_predictor_64x16_neon)
+#endif  // HAVE_NEON
+
 #if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
 // High Bitdepth
@@ -911,7 +965,6 @@ void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
       pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
                     intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
     }
-    libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
@@ -1187,7 +1240,7 @@ static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
 // 4x4, 4x8, 4x16
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TX_4X4, aom_highbd_dc_predictor_4x4_c,
+    C, TX_4X4, aom_highbd_dc_predictor_4x4_c,
     aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c,
     aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c,
     aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
@@ -1195,66 +1248,81 @@ HIGHBD_INTRA_PRED_TEST(
     aom_highbd_smooth_h_predictor_4x4_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TX_4X8, aom_highbd_dc_predictor_4x8_c,
+    C, TX_4X8, aom_highbd_dc_predictor_4x8_c,
     aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
     aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
     aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
     aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c,
     aom_highbd_smooth_h_predictor_4x8_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c,
+    C, TX_4X16, aom_highbd_dc_predictor_4x16_c,
     aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c,
     aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c,
     aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c,
     aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c,
     aom_highbd_smooth_h_predictor_4x16_c)
-
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
                        aom_highbd_dc_left_predictor_4x4_sse2,
                        aom_highbd_dc_top_predictor_4x4_sse2,
                        aom_highbd_dc_128_predictor_4x4_sse2,
                        aom_highbd_v_predictor_4x4_sse2,
                        aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
 
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2,
                        aom_highbd_dc_left_predictor_4x8_sse2,
                        aom_highbd_dc_top_predictor_4x8_sse2,
                        aom_highbd_dc_128_predictor_4x8_sse2,
                        aom_highbd_v_predictor_4x8_sse2,
                        aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
 #endif
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, NULL,
+                       NULL, NULL, aom_highbd_v_predictor_4x4_neon, NULL,
+                       aom_highbd_paeth_predictor_4x4_neon,
+                       aom_highbd_smooth_predictor_4x4_neon,
+                       aom_highbd_smooth_v_predictor_4x4_neon,
+                       aom_highbd_smooth_h_predictor_4x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_4x8_neon, NULL,
+                       aom_highbd_paeth_predictor_4x8_neon,
+                       aom_highbd_smooth_predictor_4x8_neon,
+                       aom_highbd_smooth_v_predictor_4x8_neon,
+                       aom_highbd_smooth_h_predictor_4x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_4x16_neon, NULL,
+                       aom_highbd_paeth_predictor_4x16_neon,
+                       aom_highbd_smooth_predictor_4x16_neon,
+                       aom_highbd_smooth_v_predictor_4x16_neon,
+                       aom_highbd_smooth_h_predictor_4x16_neon)
+#endif  // HAVE_NEON
 
 // -----------------------------------------------------------------------------
 // 8x8, 8x4, 8x16, 8x32
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TX_8X8, aom_highbd_dc_predictor_8x8_c,
+    C, TX_8X8, aom_highbd_dc_predictor_8x8_c,
     aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
     aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
     aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
     aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c,
     aom_highbd_smooth_h_predictor_8x8_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TX_8X4, aom_highbd_dc_predictor_8x4_c,
+    C, TX_8X4, aom_highbd_dc_predictor_8x4_c,
     aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
     aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
     aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
     aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c,
     aom_highbd_smooth_h_predictor_8x4_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TX_8X16, aom_highbd_dc_predictor_8x16_c,
+    C, TX_8X16, aom_highbd_dc_predictor_8x16_c,
     aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
     aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
     aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
     aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c,
     aom_highbd_smooth_h_predictor_8x16_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c,
+    C, TX_8X32, aom_highbd_dc_predictor_8x32_c,
     aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c,
     aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c,
     aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c,
@@ -1262,19 +1330,19 @@ HIGHBD_INTRA_PRED_TEST(
     aom_highbd_smooth_h_predictor_8x32_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
                        aom_highbd_dc_left_predictor_8x8_sse2,
                        aom_highbd_dc_top_predictor_8x8_sse2,
                        aom_highbd_dc_128_predictor_8x8_sse2,
                        aom_highbd_v_predictor_8x8_sse2,
                        aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2,
                        aom_highbd_dc_left_predictor_8x4_sse2,
                        aom_highbd_dc_top_predictor_8x4_sse2,
                        aom_highbd_dc_128_predictor_8x4_sse2,
                        aom_highbd_v_predictor_8x4_sse2,
                        aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_highbd_dc_predictor_8x16_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X16, aom_highbd_dc_predictor_8x16_sse2,
                        aom_highbd_dc_left_predictor_8x16_sse2,
                        aom_highbd_dc_top_predictor_8x16_sse2,
                        aom_highbd_dc_128_predictor_8x16_sse2,
@@ -1287,43 +1355,66 @@ HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                        NULL, NULL, NULL)
 #endif
 
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, NULL,
+                       NULL, NULL, aom_highbd_v_predictor_8x8_neon, NULL,
+                       aom_highbd_paeth_predictor_8x8_neon,
+                       aom_highbd_smooth_predictor_8x8_neon,
+                       aom_highbd_smooth_v_predictor_8x8_neon,
+                       aom_highbd_smooth_h_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_8x4_neon, NULL,
+                       aom_highbd_paeth_predictor_8x4_neon,
+                       aom_highbd_smooth_predictor_8x4_neon,
+                       aom_highbd_smooth_v_predictor_8x4_neon,
+                       aom_highbd_smooth_h_predictor_8x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_8x16_neon, NULL,
+                       aom_highbd_paeth_predictor_8x16_neon,
+                       aom_highbd_smooth_predictor_8x16_neon,
+                       aom_highbd_smooth_v_predictor_8x16_neon,
+                       aom_highbd_smooth_h_predictor_8x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_8x32_neon, NULL,
+                       aom_highbd_paeth_predictor_8x32_neon,
+                       aom_highbd_smooth_predictor_8x32_neon,
+                       aom_highbd_smooth_v_predictor_8x32_neon,
+                       aom_highbd_smooth_h_predictor_8x32_neon)
+#endif  // HAVE_NEON
+
 // -----------------------------------------------------------------------------
 // 16x16, 16x8, 16x32, 16x4, 16x64
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TX_16X16, aom_highbd_dc_predictor_16x16_c,
+    C, TX_16X16, aom_highbd_dc_predictor_16x16_c,
     aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c,
     aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c,
     aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c,
     aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c,
     aom_highbd_smooth_h_predictor_16x16_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TX_16X8, aom_highbd_dc_predictor_16x8_c,
+    C, TX_16X8, aom_highbd_dc_predictor_16x8_c,
     aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
     aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
     aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
     aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c,
     aom_highbd_smooth_h_predictor_16x8_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TX_16X32, aom_highbd_dc_predictor_16x32_c,
+    C, TX_16X32, aom_highbd_dc_predictor_16x32_c,
     aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c,
     aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c,
     aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c,
     aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c,
     aom_highbd_smooth_h_predictor_16x32_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c,
+    C, TX_16X4, aom_highbd_dc_predictor_16x4_c,
     aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c,
     aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c,
     aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c,
     aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c,
     aom_highbd_smooth_h_predictor_16x4_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_5, TX_16X64, aom_highbd_dc_predictor_16x64_c,
+    C, TX_16X64, aom_highbd_dc_predictor_16x64_c,
     aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c,
     aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c,
     aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c,
@@ -1331,20 +1422,20 @@ HIGHBD_INTRA_PRED_TEST(
     aom_highbd_smooth_h_predictor_16x64_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
                        aom_highbd_dc_left_predictor_16x16_sse2,
                        aom_highbd_dc_top_predictor_16x16_sse2,
                        aom_highbd_dc_128_predictor_16x16_sse2,
                        aom_highbd_v_predictor_16x16_sse2,
                        aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
                        NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2,
                        aom_highbd_dc_left_predictor_16x8_sse2,
                        aom_highbd_dc_top_predictor_16x8_sse2,
                        aom_highbd_dc_128_predictor_16x8_sse2,
                        aom_highbd_v_predictor_16x8_sse2,
                        aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
                        aom_highbd_dc_left_predictor_16x32_sse2,
                        aom_highbd_dc_top_predictor_16x32_sse2,
                        aom_highbd_dc_128_predictor_16x32_sse2,
@@ -1354,50 +1445,80 @@ HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
 #endif
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
                        NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_AVX2
-HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
 
-HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
 
-HIGHBD_INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
 #endif
 
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon, NULL,
+                       NULL, NULL, aom_highbd_v_predictor_16x16_neon, NULL,
+                       aom_highbd_paeth_predictor_16x16_neon,
+                       aom_highbd_smooth_predictor_16x16_neon,
+                       aom_highbd_smooth_v_predictor_16x16_neon,
+                       aom_highbd_smooth_h_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_16x8_neon, NULL,
+                       aom_highbd_paeth_predictor_16x8_neon,
+                       aom_highbd_smooth_predictor_16x8_neon,
+                       aom_highbd_smooth_v_predictor_16x8_neon,
+                       aom_highbd_smooth_h_predictor_16x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_16x32_neon, NULL,
+                       aom_highbd_paeth_predictor_16x32_neon,
+                       aom_highbd_smooth_predictor_16x32_neon,
+                       aom_highbd_smooth_v_predictor_16x32_neon,
+                       aom_highbd_smooth_h_predictor_16x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_16x4_neon, NULL,
+                       aom_highbd_paeth_predictor_16x4_neon,
+                       aom_highbd_smooth_predictor_16x4_neon,
+                       aom_highbd_smooth_v_predictor_16x4_neon,
+                       aom_highbd_smooth_h_predictor_16x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_16x64_neon, NULL,
+                       aom_highbd_paeth_predictor_16x64_neon,
+                       aom_highbd_smooth_predictor_16x64_neon,
+                       aom_highbd_smooth_v_predictor_16x64_neon,
+                       aom_highbd_smooth_h_predictor_16x64_neon)
+#endif  // HAVE_NEON
+
 // -----------------------------------------------------------------------------
 // 32x32, 32x16, 32x64, 32x8
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TX_32X32, aom_highbd_dc_predictor_32x32_c,
+    C, TX_32X32, aom_highbd_dc_predictor_32x32_c,
     aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c,
     aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c,
     aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c,
     aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c,
     aom_highbd_smooth_h_predictor_32x32_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TX_32X16, aom_highbd_dc_predictor_32x16_c,
+    C, TX_32X16, aom_highbd_dc_predictor_32x16_c,
     aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c,
     aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c,
     aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c,
     aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c,
     aom_highbd_smooth_h_predictor_32x16_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TX_32X64, aom_highbd_dc_predictor_32x64_c,
+    C, TX_32X64, aom_highbd_dc_predictor_32x64_c,
     aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c,
     aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c,
     aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c,
     aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c,
     aom_highbd_smooth_h_predictor_32x64_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c,
+    C, TX_32X8, aom_highbd_dc_predictor_32x8_c,
     aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c,
     aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c,
     aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c,
@@ -1405,14 +1526,14 @@ HIGHBD_INTRA_PRED_TEST(
     aom_highbd_smooth_h_predictor_32x8_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
                        aom_highbd_dc_left_predictor_32x32_sse2,
                        aom_highbd_dc_top_predictor_32x32_sse2,
                        aom_highbd_dc_128_predictor_32x32_sse2,
                        aom_highbd_v_predictor_32x32_sse2,
                        aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
                        NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
                        aom_highbd_dc_left_predictor_32x16_sse2,
                        aom_highbd_dc_top_predictor_32x16_sse2,
                        aom_highbd_dc_128_predictor_32x16_sse2,
@@ -1422,45 +1543,91 @@ HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
 #endif
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
                        NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_AVX2
-HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
 
-HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL)
 #endif
 
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon, NULL,
+                       NULL, NULL, aom_highbd_v_predictor_32x32_neon, NULL,
+                       aom_highbd_paeth_predictor_32x32_neon,
+                       aom_highbd_smooth_predictor_32x32_neon,
+                       aom_highbd_smooth_v_predictor_32x32_neon,
+                       aom_highbd_smooth_h_predictor_32x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_32x16_neon, NULL,
+                       aom_highbd_paeth_predictor_32x16_neon,
+                       aom_highbd_smooth_predictor_32x16_neon,
+                       aom_highbd_smooth_v_predictor_32x16_neon,
+                       aom_highbd_smooth_h_predictor_32x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_32x64_neon, NULL,
+                       aom_highbd_paeth_predictor_32x64_neon,
+                       aom_highbd_smooth_predictor_32x64_neon,
+                       aom_highbd_smooth_v_predictor_32x64_neon,
+                       aom_highbd_smooth_h_predictor_32x64_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_32x8_neon, NULL,
+                       aom_highbd_paeth_predictor_32x8_neon,
+                       aom_highbd_smooth_predictor_32x8_neon,
+                       aom_highbd_smooth_v_predictor_32x8_neon,
+                       aom_highbd_smooth_h_predictor_32x8_neon)
+#endif  // HAVE_NEON
+
 // -----------------------------------------------------------------------------
 // 64x64, 64x32, 64x16
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TX_64X64, aom_highbd_dc_predictor_64x64_c,
+    C, TX_64X64, aom_highbd_dc_predictor_64x64_c,
     aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c,
     aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c,
     aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c,
     aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c,
     aom_highbd_smooth_h_predictor_64x64_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TX_64X32, aom_highbd_dc_predictor_64x32_c,
+    C, TX_64X32, aom_highbd_dc_predictor_64x32_c,
     aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
     aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
     aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
     aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
     aom_highbd_smooth_h_predictor_64x32_c)
-
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+    C, TX_64X16, aom_highbd_dc_predictor_64x16_c,
     aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
     aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
     aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
     aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
     aom_highbd_smooth_h_predictor_64x16_c)
 
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon, NULL,
+                       NULL, NULL, aom_highbd_v_predictor_64x64_neon, NULL,
+                       aom_highbd_paeth_predictor_64x64_neon,
+                       aom_highbd_smooth_predictor_64x64_neon,
+                       aom_highbd_smooth_v_predictor_64x64_neon,
+                       aom_highbd_smooth_h_predictor_64x64_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_64x32_neon, NULL,
+                       aom_highbd_paeth_predictor_64x32_neon,
+                       aom_highbd_smooth_predictor_64x32_neon,
+                       aom_highbd_smooth_v_predictor_64x32_neon,
+                       aom_highbd_smooth_h_predictor_64x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, NULL, NULL, NULL, NULL,
+                       aom_highbd_v_predictor_64x16_neon, NULL,
+                       aom_highbd_paeth_predictor_64x16_neon,
+                       aom_highbd_smooth_predictor_64x16_neon,
+                       aom_highbd_smooth_v_predictor_64x16_neon,
+                       aom_highbd_smooth_h_predictor_64x16_neon)
+#endif  // HAVE_NEON
+
 // -----------------------------------------------------------------------------
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
diff --git a/media/libaom/src/test/test_vector_test.cc b/media/libaom/src/test/test_vector_test.cc
index eab92b685a..9fa2c2cb9a 100644
--- a/media/libaom/src/test/test_vector_test.cc
+++ b/media/libaom/src/test/test_vector_test.cc
@@ -47,7 +47,7 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
@@ -59,7 +59,7 @@ class TestVectorTest : public ::libaom_test::DecoderTest,
 
   virtual void DecompressedFrameHook(const aom_image_t &img,
                                      const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -131,7 +131,7 @@ TEST_P(TestVectorTest, MD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video, nullptr);
   video->Init();
 
   // Construct md5 file name.
@@ -148,7 +148,7 @@ TEST_P(TestVectorTest, MD5Match) {
 }
 
 #if CONFIG_AV1_DECODER
-AV1_INSTANTIATE_TEST_CASE(
+AV1_INSTANTIATE_TEST_SUITE(
     TestVectorTest,
     ::testing::Combine(::testing::Values(1),  // Single thread.
                        ::testing::ValuesIn(libaom_test::kAV1TestVectors,
diff --git a/media/libaom/src/test/test_vectors.cc b/media/libaom/src/test/test_vectors.cc
index 991667a089..c38461e056 100644
--- a/media/libaom/src/test/test_vectors.cc
+++ b/media/libaom/src/test/test_vectors.cc
@@ -146,6 +146,7 @@ const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
                                         "av1-1-b10-00-quantizer-62.ivf",
                                         "av1-1-b10-00-quantizer-63.ivf",
                                         "av1-1-b10-23-film_grain-50.ivf",
+                                        "av1-1-b10-24-monochrome.ivf",
 #endif  // CONFIG_AV1_HIGHBITDEPTH
                                         "av1-1-b8-01-size-16x16.ivf",
                                         "av1-1-b8-01-size-16x18.ivf",
@@ -256,7 +257,8 @@ const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
                                         "av1-1-b8-22-svc-L1T2.ivf",
                                         "av1-1-b8-22-svc-L2T1.ivf",
                                         "av1-1-b8-22-svc-L2T2.ivf",
-                                        "av1-1-b8-23-film_grain-50.ivf" };
+                                        "av1-1-b8-23-film_grain-50.ivf",
+                                        "av1-1-b8-24-monochrome.ivf" };
 const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
 #endif  // CONFIG_AV1_DECODER
 
diff --git a/media/libaom/src/test/tile_config_test.cc b/media/libaom/src/test/tile_config_test.cc
new file mode 100644
index 0000000000..517d54bd94
--- /dev/null
+++ b/media/libaom/src/test/tile_config_test.cc
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+typedef struct {
+  // Superblock size
+  const unsigned int sb_size;
+  // log2(number of tile rows)
+  const unsigned int tile_rows;
+  // log2(number of tile columns)
+  const unsigned int tile_cols;
+} uniformTileConfigParam;
+
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+      ::libaom_test::kTwoPassGood };
+#endif
+
+static const uniformTileConfigParam uniformTileConfigParams[] = {
+  { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 },
+  { 128, 3, 2 }, { 64, 0, 0 },  { 64, 0, 2 },  { 64, 2, 0 },  { 64, 1, 2 },
+  { 64, 2, 2 },  { 64, 3, 3 },  { 64, 4, 4 }
+};
+
+typedef struct {
+  // Superblock size
+  const unsigned int sb_size;
+  // number of tile widths
+  const unsigned int tile_width_count;
+  // list of tile widths
+  int tile_widths[AOM_MAX_TILE_COLS];
+  // number of tile heights
+  const unsigned int tile_height_count;
+  // list of tile heights
+  int tile_heights[AOM_MAX_TILE_ROWS];
+} nonUniformTileConfigParam;
+
+const nonUniformTileConfigParam nonUniformTileConfigParams[] = {
+  { 64, 1, { 3 }, 1, { 3 } },          { 64, 2, { 1, 2 }, 2, { 1, 2 } },
+  { 64, 3, { 2, 3, 4 }, 2, { 2, 3 } }, { 128, 1, { 3 }, 1, { 3 } },
+  { 128, 2, { 1, 2 }, 2, { 1, 2 } },   { 128, 3, { 2, 3, 4 }, 2, { 2, 3 } },
+};
+
+// Find smallest k>=0 such that (blk_size << k) >= target
+static INLINE int tile_log2(int blk_size, int target) {
+  int k;
+  for (k = 0; (blk_size << k) < target; k++) {
+  }
+  return k;
+}
+
+// This class is used to validate tile configuration for uniform spacing.
+class UniformTileConfigTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, uniformTileConfigParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  UniformTileConfigTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_config_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+    tile_config_violated_ = false;
+    max_tile_cols_log2_ = tile_log2(1, AOM_MAX_TILE_COLS);
+    max_tile_rows_log2_ = tile_log2(1, AOM_MAX_TILE_ROWS);
+  }
+  virtual ~UniformTileConfigTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = end_usage_check_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 19;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_config_param_.tile_cols);
+      encoder->Control(AV1E_SET_TILE_ROWS, tile_config_param_.tile_rows);
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+                       tile_config_param_.sb_size == 64
+                           ? AOM_SUPERBLOCK_SIZE_64X64
+                           : AOM_SUPERBLOCK_SIZE_128X128);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_tile_info tile_info;
+      int config_tile_columns = AOMMIN(1 << (int)tile_config_param_.tile_cols,
+                                       1 << max_tile_cols_log2_);
+      int config_tile_rows = AOMMIN(1 << (int)tile_config_param_.tile_rows,
+                                    1 << max_tile_rows_log2_);
+
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+      if (tile_info.tile_columns != config_tile_columns ||
+          tile_info.tile_rows != config_tile_rows) {
+        tile_config_violated_ = true;
+      }
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const uniformTileConfigParam tile_config_param_;
+  int max_tile_cols_log2_;
+  int max_tile_rows_log2_;
+  bool tile_config_violated_;
+  aom_rc_mode end_usage_check_;
+};
+
+// This class is used to validate tile configuration for non uniform spacing.
+class NonUniformTileConfigTestLarge
+    : public ::libaom_test::CodecTestWith3Params<
+          libaom_test::TestMode, nonUniformTileConfigParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  NonUniformTileConfigTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_config_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+    tile_config_violated_ = false;
+  }
+  virtual ~NonUniformTileConfigTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.tile_width_count = tile_config_param_.tile_width_count;
+    memcpy(cfg_.tile_widths, tile_config_param_.tile_widths,
+           sizeof(tile_config_param_.tile_widths[0]) *
+               tile_config_param_.tile_width_count);
+    cfg_.tile_height_count = tile_config_param_.tile_height_count;
+    memcpy(cfg_.tile_heights, tile_config_param_.tile_heights,
+           sizeof(tile_config_param_.tile_heights[0]) *
+               tile_config_param_.tile_height_count);
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+                       tile_config_param_.sb_size == 64
+                           ? AOM_SUPERBLOCK_SIZE_64X64
+                           : AOM_SUPERBLOCK_SIZE_128X128);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      aom_tile_info tile_info;
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+
+      // check validity of tile cols
+      int tile_col_idx, tile_col = 0;
+      for (tile_col_idx = 0; tile_col_idx < tile_info.tile_columns - 1;
+           tile_col_idx++) {
+        if (tile_config_param_.tile_widths[tile_col] !=
+            tile_info.tile_widths[tile_col_idx])
+          tile_config_violated_ = true;
+        tile_col = (tile_col + 1) % (int)tile_config_param_.tile_width_count;
+      }
+      // last column may not be able to accommodate config, but if it is
+      // greater than what is configured, there is a violation.
+      if (tile_config_param_.tile_widths[tile_col] <
+          tile_info.tile_widths[tile_col_idx])
+        tile_config_violated_ = true;
+
+      // check validity of tile rows
+      int tile_row_idx, tile_row = 0;
+      for (tile_row_idx = 0; tile_row_idx < tile_info.tile_rows - 1;
+           tile_row_idx++) {
+        if (tile_config_param_.tile_heights[tile_row] !=
+            tile_info.tile_heights[tile_row_idx])
+          tile_config_violated_ = true;
+        tile_row = (tile_row + 1) % (int)tile_config_param_.tile_height_count;
+      }
+      // last row may not be able to accommodate config, but if it is
+      // greater than what is configured, there is a violation.
+      if (tile_config_param_.tile_heights[tile_row] <
+          tile_info.tile_heights[tile_row_idx])
+        tile_config_violated_ = true;
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const nonUniformTileConfigParam tile_config_param_;
+  bool tile_config_violated_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  int max_tiles_cols = video.img()->w / (int)tile_config_param_.sb_size;
+  int max_tiles_rows = video.img()->h / (int)tile_config_param_.sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(max_tiles_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(max_tiles_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTestLowRes) {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  int max_tiles_cols = video.img()->w / (int)tile_config_param_.sb_size;
+  int max_tiles_rows = video.img()->h / (int)tile_config_param_.sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(max_tiles_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(max_tiles_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+TEST_P(NonUniformTileConfigTestLarge, NonUniformTileConfigTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge,
+                           ::testing::ValuesIn(kTestModeParams),
+                           ::testing::ValuesIn(uniformTileConfigParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge,
+                           ::testing::ValuesIn(kTestModeParams),
+                           ::testing::ValuesIn(nonUniformTileConfigParams),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+typedef struct {
+  // Number of tile groups to set.
+  const int num_tg;
+  // Number of tile rows to set
+  const int num_tile_rows;
+  // Number of tile columns to set
+  const int num_tile_cols;
+} TileGroupConfigParams;
+
+static const TileGroupConfigParams tileGroupTestParams[] = {
+  { 5, 4, 4 }, { 3, 3, 3 }, { 5, 3, 3 }, { 7, 5, 5 }, { 7, 3, 3 }, { 7, 4, 4 }
+};
+
+std::ostream &operator<<(std::ostream &os,
+                         const TileGroupConfigParams &test_arg) {
+  return os << "TileGroupConfigParams { num_tg:" << test_arg.num_tg
+            << " num_tile_rows:" << test_arg.num_tile_rows
+            << " num_tile_cols:" << test_arg.num_tile_cols << " }";
+}
+
+// This class is used to test number of tile groups present in header.
+class TileGroupTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 TileGroupConfigParams>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  TileGroupTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        tile_group_config_params_(GET_PARAM(2)) {
+    tile_group_config_violated_ = false;
+  }
+  virtual ~TileGroupTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_threads = 1;
+  }
+
+  virtual bool DoDecode() const { return 1; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AV1E_SET_NUM_TG, tile_group_config_params_.num_tg);
+      encoder->Control(AV1E_SET_TILE_COLUMNS,
+                       tile_group_config_params_.num_tile_cols);
+      encoder->Control(AV1E_SET_TILE_ROWS,
+                       tile_group_config_params_.num_tile_rows);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  libaom_test::Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_tile_info tile_info;
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                                    &show_existing_frame_);
+      if (tile_info.num_tile_groups != tile_group_config_params_.num_tg &&
+          !show_existing_frame_)
+        tile_group_config_violated_ = true;
+      EXPECT_EQ(tile_group_config_violated_, false);
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  int show_existing_frame_;
+  bool tile_group_config_violated_;
+  aom_rc_mode end_usage_check_;
+  ::libaom_test::TestMode encoding_mode_;
+  const TileGroupConfigParams tile_group_config_params_;
+};
+
+TEST_P(TileGroupTestLarge, TileGroupCountTest) {
+  libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge,
+                           ::testing::ValuesIn(kTestModeParams),
+                           ::testing::ValuesIn(tileGroupTestParams));
+}  // namespace
diff --git a/media/libaom/src/test/tile_independence_test.cc b/media/libaom/src/test/tile_independence_test.cc
index 4f7c4a475e..888c3abc99 100644
--- a/media/libaom/src/test/tile_independence_test.cc
+++ b/media/libaom/src/test/tile_independence_test.cc
@@ -52,10 +52,7 @@ class TileIndependenceTest
     delete inv_dec_;
   }
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(libaom_test::kTwoPassGood);
-  }
+  virtual void SetUp() { InitializeConfig(libaom_test::kTwoPassGood); }
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
                                   libaom_test::Encoder *encoder) {
@@ -139,10 +136,10 @@ TEST_P(TileIndependenceTestLarge, MD5Match) {
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Values(0, 1),
+                           ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+                           ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
 
 class TileIndependenceLSTest : public TileIndependenceTest {};
 
@@ -166,8 +163,8 @@ TEST_P(TileIndependenceLSTestLarge, MD5Match) {
   DoTest();
 }
 
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6),
-                          ::testing::Values(6), ::testing::Values(1));
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6),
-                          ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTest, ::testing::Values(6),
+                           ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTestLarge, ::testing::Values(6),
+                           ::testing::Values(6), ::testing::Values(1));
 }  // namespace
diff --git a/media/libaom/src/test/time_stamp_test.cc b/media/libaom/src/test/time_stamp_test.cc
index 679e4da292..baa0dc06db 100644
--- a/media/libaom/src/test/time_stamp_test.cc
+++ b/media/libaom/src/test/time_stamp_test.cc
@@ -74,10 +74,7 @@ class TimestampTest
   TimestampTest() : EncoderTest(GET_PARAM(0)) {}
   virtual ~TimestampTest() {}
 
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-  }
+  virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
 };
 
 // Tests encoding in millisecond timebase.
@@ -98,8 +95,13 @@ TEST_P(TimestampTest, TestAv1Rollover) {
   video.set_starting_pts(922337170351ll);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
-
-AV1_INSTANTIATE_TEST_CASE(TimestampTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood));
+#if CONFIG_REALTIME_ONLY
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+#else
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libaom_test::kRealTime,
+                                             ::libaom_test::kTwoPassGood));
+#endif
 
 }  // namespace
diff --git a/media/libaom/src/test/tools_common.sh b/media/libaom/src/test/tools_common.sh
index c087106060..f2d180297e 100644..100755
--- a/media/libaom/src/test/tools_common.sh
+++ b/media/libaom/src/test/tools_common.sh
@@ -196,18 +196,55 @@ av1_encode_available() {
 
 # Echoes "fast" encode params for use with aomenc.
 aomenc_encode_test_fast_params() {
-  echo "--cpu-used=1
+  echo "--cpu-used=2
         --limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
         --lag-in-frames=0
         --test-decode=fatal"
 }
 
+# Echoes realtime encode params for use with aomenc.
+aomenc_encode_test_rt_params() {
+  echo "--limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+        --test-decode=fatal
+        --enable-tpl-model=0
+        --deltaq-mode=0
+        --enable-order-hint=0
+        --profile=0
+        --static-thresh=0
+        --end-usage=cbr
+        --cpu-used=7
+        --passes=1
+        --usage=1
+        --lag-in-frames=0
+        --aq-mode=3
+        --enable-obmc=0
+        --enable-warped-motion=0
+        --enable-ref-frame-mvs=0
+        --enable-cdef=1
+        --enable-order-hint=0
+        --coeff-cost-upd-freq=3
+        --mode-cost-upd-freq=3
+        --mv-cost-upd-freq=3"
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_HIGHBITDEPTH.
+highbitdepth_available() {
+  [ "$(aom_config_option_enabled CONFIG_AV1_HIGHBITDEPTH)" = "yes" ] && echo yes
+}
+
 # Echoes yes to stdout when aom_config_option_enabled() reports yes for
 # CONFIG_WEBM_IO.
 webm_io_available() {
   [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
 }
 
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_REALTIME_ONLY.
+realtime_only_build() {
+  [ "$(aom_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ] && echo yes
+}
+
 # Filters strings from $1 using the filter specified by $2. Filter behavior
 # depends on the presence of $3. When $3 is present, strings that match the
 # filter are excluded. When $3 is omitted, strings matching the filter are
diff --git a/media/libaom/src/test/tpl_model_test.cc b/media/libaom/src/test/tpl_model_test.cc
new file mode 100644
index 0000000000..e41077f69f
--- /dev/null
+++ b/media/libaom/src/test/tpl_model_test.cc
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+#if CONFIG_BITRATE_ACCURACY
+constexpr double epsilon = 0.0000001;
+#endif
+
+double laplace_prob(double q_step, double b, double zero_bin_ratio,
+                    int qcoeff) {
+  int abs_qcoeff = abs(qcoeff);
+  double z0 = fmax(exp(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  if (abs_qcoeff == 0) {
+    double p0 = 1 - z0;
+    return p0;
+  } else {
+    assert(abs_qcoeff > 0);
+    double z = fmax(exp(-q_step / b), TPL_EPSILON);
+    double p = z0 / 2 * (1 - z) * pow(z, abs_qcoeff - 1);
+    return p;
+  }
+}
+TEST(TplModelTest, ExponentialEntropyBoundaryTest1) {
+  double b = 0;
+  double q_step = 1;
+  double entropy = av1_exponential_entropy(q_step, b);
+  EXPECT_NEAR(entropy, 0, 0.00001);
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest1) {
+  // Check the consistency between av1_estimate_coeff_entropy() and
+  // laplace_prob()
+  double b = 1;
+  double q_step = 1;
+  double zero_bin_ratio = 2;
+  for (int qcoeff = -256; qcoeff < 256; ++qcoeff) {
+    double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff);
+    double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff);
+    double ref_rate = -log2(prob);
+    EXPECT_DOUBLE_EQ(rate, ref_rate);
+  }
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest2) {
+  // Check the consistency between av1_estimate_coeff_entropy(), laplace_prob()
+  // and av1_laplace_entropy()
+  double b = 1;
+  double q_step = 1;
+  double zero_bin_ratio = 2;
+  double est_expected_rate = 0;
+  for (int qcoeff = -20; qcoeff < 20; ++qcoeff) {
+    double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff);
+    double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff);
+    est_expected_rate += prob * rate;
+  }
+  double expected_rate = av1_laplace_entropy(q_step, b, zero_bin_ratio);
+  EXPECT_NEAR(expected_rate, est_expected_rate, 0.001);
+}
+
+TEST(TplModelTest, InitTplStats1) {
+  // We use heap allocation instead of stack allocation here to avoid
+  // -Wstack-usage warning.
+  std::unique_ptr<TplParams> tpl_data(new (std::nothrow) TplParams);
+  ASSERT_NE(tpl_data, nullptr);
+  av1_zero(*tpl_data);
+  tpl_data->ready = 1;
+  EXPECT_EQ(sizeof(tpl_data->tpl_stats_buffer),
+            MAX_LENGTH_TPL_FRAME_STATS * sizeof(tpl_data->tpl_stats_buffer[0]));
+  for (int i = 0; i < MAX_LENGTH_TPL_FRAME_STATS; ++i) {
+    // Set it to a random non-zero number
+    tpl_data->tpl_stats_buffer[i].is_valid = i + 1;
+  }
+  av1_init_tpl_stats(tpl_data.get());
+  EXPECT_EQ(tpl_data->ready, 0);
+  for (int i = 0; i < MAX_LENGTH_TPL_FRAME_STATS; ++i) {
+    EXPECT_EQ(tpl_data->tpl_stats_buffer[i].is_valid, 0);
+  }
+}
+
+TEST(TplModelTest, DeltaRateCostZeroFlow) {
+  // When srcrf_dist equal to recrf_dist, av1_delta_rate_cost should return 0
+  int64_t srcrf_dist = 256;
+  int64_t recrf_dist = 256;
+  int64_t delta_rate = 512;
+  int pixel_num = 256;
+  int64_t rate_cost =
+      av1_delta_rate_cost(delta_rate, recrf_dist, srcrf_dist, pixel_num);
+  EXPECT_EQ(rate_cost, 0);
+}
+
+// a reference function of av1_delta_rate_cost() with delta_rate using bit as
+// basic unit
+double ref_delta_rate_cost(int64_t delta_rate, double src_rec_ratio,
+                           int pixel_count) {
+  assert(src_rec_ratio <= 1 && src_rec_ratio >= 0);
+  double bits_per_pixel = (double)delta_rate / pixel_count;
+  double p = pow(2, bits_per_pixel);
+  double flow_rate_per_pixel =
+      sqrt(p * p / (src_rec_ratio * p * p + (1 - src_rec_ratio)));
+  double rate_cost = pixel_count * log2(flow_rate_per_pixel);
+  return rate_cost;
+}
+
+TEST(TplModelTest, DeltaRateCostReference) {
+  const int64_t scale = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT;
+  std::vector<int64_t> srcrf_dist_arr = { 256, 257, 312 };
+  std::vector<int64_t> recrf_dist_arr = { 512, 288, 620 };
+  std::vector<int64_t> delta_rate_arr = { 10, 278, 100 };
+  for (size_t t = 0; t < srcrf_dist_arr.size(); ++t) {
+    int64_t srcrf_dist = srcrf_dist_arr[t];
+    int64_t recrf_dist = recrf_dist_arr[t];
+    int64_t delta_rate = delta_rate_arr[t];
+    int64_t scaled_delta_rate = delta_rate << scale;
+    int pixel_count = 256;
+    int64_t rate_cost = av1_delta_rate_cost(scaled_delta_rate, recrf_dist,
+                                            srcrf_dist, pixel_count);
+    rate_cost >>= scale;
+    double src_rec_ratio = (double)srcrf_dist / recrf_dist;
+    double ref_rate_cost =
+        ref_delta_rate_cost(delta_rate, src_rec_ratio, pixel_count);
+    EXPECT_NEAR((double)rate_cost, ref_rate_cost, 1);
+  }
+}
+
+TEST(TplModelTest, GetOverlapAreaHasOverlap) {
+  // The block a's area is [10, 17) x [18, 24).
+  // The block b's area is [8, 15) x [17, 23).
+  // The overlapping area between block a and block b is [10, 15) x [18, 23).
+  // Therefore, the size of the area is (15 - 10) * (23 - 18) = 25.
+  int row_a = 10;
+  int col_a = 18;
+  int row_b = 8;
+  int col_b = 17;
+  int height = 7;
+  int width = 6;
+  int overlap_area =
+      av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height);
+  EXPECT_EQ(overlap_area, 25);
+}
+
+TEST(TplModelTest, GetOverlapAreaNoOverlap) {
+  // The block a's area is [10, 14) x [18, 22).
+  // The block b's area is [5, 9) x [5, 9).
+  // Threre is no overlapping area between block a and block b.
+  // Therefore, the return value should be zero.
+  int row_a = 10;
+  int col_a = 18;
+  int row_b = 5;
+  int col_b = 5;
+  int height = 4;
+  int width = 4;
+  int overlap_area =
+      av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height);
+  EXPECT_EQ(overlap_area, 0);
+}
+
+TEST(TplModelTest, GetQIndexFromQstepRatio) {
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+  // When qstep_ratio is 1, the output q_index should be equal to leaf_qindex.
+  double qstep_ratio = 1.0;
+  for (int leaf_qindex = 1; leaf_qindex <= 255; ++leaf_qindex) {
+    const int q_index =
+        av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+    EXPECT_EQ(q_index, leaf_qindex);
+  }
+
+  // When qstep_ratio is very low, the output q_index should be 1.
+  qstep_ratio = 0.0001;
+  for (int leaf_qindex = 1; leaf_qindex <= 255; ++leaf_qindex) {
+    const int q_index =
+        av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+    EXPECT_EQ(q_index, 0);
+  }
+}
+
+TEST(TplModelTest, TxfmStatsInitTest) {
+  TplTxfmStats tpl_txfm_stats;
+  av1_init_tpl_txfm_stats(&tpl_txfm_stats);
+  EXPECT_EQ(tpl_txfm_stats.coeff_num, 256);
+  EXPECT_EQ(tpl_txfm_stats.txfm_block_count, 0);
+  for (int i = 0; i < tpl_txfm_stats.coeff_num; ++i) {
+    EXPECT_DOUBLE_EQ(tpl_txfm_stats.abs_coeff_sum[i], 0);
+  }
+}
+
+TEST(TplModelTest, TxfmStatsAccumulateTest) {
+  TplTxfmStats sub_stats;
+  av1_init_tpl_txfm_stats(&sub_stats);
+  sub_stats.txfm_block_count = 17;
+  for (int i = 0; i < sub_stats.coeff_num; ++i) {
+    sub_stats.abs_coeff_sum[i] = i;
+  }
+
+  TplTxfmStats accumulated_stats;
+  av1_init_tpl_txfm_stats(&accumulated_stats);
+  accumulated_stats.txfm_block_count = 13;
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    accumulated_stats.abs_coeff_sum[i] = 5 * i;
+  }
+
+  av1_accumulate_tpl_txfm_stats(&sub_stats, &accumulated_stats);
+  EXPECT_DOUBLE_EQ(accumulated_stats.txfm_block_count, 30);
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    EXPECT_DOUBLE_EQ(accumulated_stats.abs_coeff_sum[i], 6 * i);
+  }
+}
+
+TEST(TplModelTest, TxfmStatsRecordTest) {
+  TplTxfmStats stats1;
+  TplTxfmStats stats2;
+  av1_init_tpl_txfm_stats(&stats1);
+  av1_init_tpl_txfm_stats(&stats2);
+
+  tran_low_t coeff[256];
+  for (int i = 0; i < 256; ++i) {
+    coeff[i] = i;
+  }
+  av1_record_tpl_txfm_block(&stats1, coeff);
+  EXPECT_EQ(stats1.txfm_block_count, 1);
+
+  // we record the same transform block twice for testing purpose
+  av1_record_tpl_txfm_block(&stats2, coeff);
+  av1_record_tpl_txfm_block(&stats2, coeff);
+  EXPECT_EQ(stats2.txfm_block_count, 2);
+
+  EXPECT_EQ(stats1.coeff_num, 256);
+  EXPECT_EQ(stats2.coeff_num, 256);
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_DOUBLE_EQ(stats2.abs_coeff_sum[i], 2 * stats1.abs_coeff_sum[i]);
+  }
+}
+
+TEST(TplModelTest, ComputeMVDifferenceTest) {
+  TplDepFrame tpl_frame_small;
+  tpl_frame_small.is_valid = true;
+  tpl_frame_small.mi_rows = 4;
+  tpl_frame_small.mi_cols = 4;
+  tpl_frame_small.stride = 1;
+  uint8_t right_shift_small = 1;
+  int step_small = 1 << right_shift_small;
+
+  // Test values for motion vectors.
+  int mv_vals_small[4] = { 1, 2, 3, 4 };
+  int index = 0;
+
+  // 4x4 blocks means we need to allocate a 4 size array.
+  // According to av1_tpl_ptr_pos:
+  // (row >> right_shift) * stride + (col >> right_shift)
+  // (4 >> 1) * 1 + (4 >> 1) = 4
+  TplDepStats stats_buf_small[4];
+  tpl_frame_small.tpl_stats_ptr = stats_buf_small;
+
+  for (int row = 0; row < tpl_frame_small.mi_rows; row += step_small) {
+    for (int col = 0; col < tpl_frame_small.mi_cols; col += step_small) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals_small[index];
+      mv.as_mv.col = mv_vals_small[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame_small.tpl_stats_ptr[av1_tpl_ptr_pos(
+          row, col, tpl_frame_small.stride, right_shift_small)] = tpl_stats;
+    }
+  }
+
+  int_mv result_mv =
+      av1_compute_mv_difference(&tpl_frame_small, 1, 1, step_small,
+                                tpl_frame_small.stride, right_shift_small);
+
+  // Expect the result to be exactly equal to 1 because this is the difference
+  // between neighboring motion vectors in this instance.
+  EXPECT_EQ(result_mv.as_mv.row, 1);
+  EXPECT_EQ(result_mv.as_mv.col, 1);
+}
+
+TEST(TplModelTest, ComputeMVBitsTest) {
+  TplDepFrame tpl_frame;
+  tpl_frame.is_valid = true;
+  tpl_frame.mi_rows = 16;
+  tpl_frame.mi_cols = 16;
+  tpl_frame.stride = 24;
+  uint8_t right_shift = 2;
+  int step = 1 << right_shift;
+  // Test values for motion vectors.
+  int mv_vals_ordered[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
+                              9, 10, 11, 12, 13, 14, 15, 16 };
+  int mv_vals[16] = { 1, 16, 2, 15, 3, 14, 4, 13, 5, 12, 6, 11, 7, 10, 8, 9 };
+  int index = 0;
+
+  // 16x16 blocks means we need to allocate a 100 size array.
+  // According to av1_tpl_ptr_pos:
+  // (row >> right_shift) * stride + (col >> right_shift)
+  // (16 >> 2) * 24 + (16 >> 2) = 100
+  TplDepStats stats_buf[100];
+  tpl_frame.tpl_stats_ptr = stats_buf;
+
+  for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals_ordered[index];
+      mv.as_mv.col = mv_vals_ordered[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+                                              right_shift)] = tpl_stats;
+    }
+  }
+
+  double result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+  // Expect the result to be low because the motion vectors are ordered.
+  // The estimation algorithm takes this into account and reduces the cost.
+  EXPECT_NEAR(result, 20, 5);
+
+  index = 0;
+  for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals[index];
+      mv.as_mv.col = mv_vals[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+                                              right_shift)] = tpl_stats;
+    }
+  }
+
+  result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+  // Expect the result to be higher because the vectors are not ordered.
+  // Neighboring vectors will have different values, increasing the cost.
+  EXPECT_NEAR(result, 70, 5);
+}
+#if CONFIG_BITRATE_ACCURACY
+
+TEST(TplModelTest, VbrRcInfoSetGopBitBudget) {
+  VBR_RATECTRL_INFO vbr_rc_info;
+  const double total_bit_budget = 2000;
+  const int show_frame_count = 8;
+  const int gop_show_frame_count = 4;
+  av1_vbr_rc_init(&vbr_rc_info, total_bit_budget, show_frame_count);
+  av1_vbr_rc_set_gop_bit_budget(&vbr_rc_info, gop_show_frame_count);
+  EXPECT_NEAR(vbr_rc_info.gop_bit_budget, 1000, epsilon);
+}
+
+void init_toy_gf_group(GF_GROUP *gf_group) {
+  av1_zero(*gf_group);
+  gf_group->size = 4;
+  const FRAME_UPDATE_TYPE update_type[4] = { KF_UPDATE, ARF_UPDATE,
+                                             INTNL_ARF_UPDATE, LF_UPDATE };
+  for (int i = 0; i < gf_group->size; ++i) {
+    gf_group->update_type[i] = update_type[i];
+  }
+}
+
+void init_toy_vbr_rc_info(VBR_RATECTRL_INFO *vbr_rc_info, int gop_size) {
+  int total_bit_budget = 2000;
+  int show_frame_count = 8;
+  av1_vbr_rc_init(vbr_rc_info, total_bit_budget, show_frame_count);
+
+  for (int i = 0; i < gop_size; ++i) {
+    vbr_rc_info->qstep_ratio_list[i] = 1;
+  }
+}
+
+void init_toy_tpl_txfm_stats(std::vector<TplTxfmStats> *stats_list) {
+  for (size_t i = 0; i < stats_list->size(); i++) {
+    TplTxfmStats *txfm_stats = &stats_list->at(i);
+    av1_init_tpl_txfm_stats(txfm_stats);
+    txfm_stats->txfm_block_count = 8;
+    for (int j = 0; j < txfm_stats->coeff_num; j++) {
+      txfm_stats->abs_coeff_sum[j] = 1000 + j;
+    }
+    av1_tpl_txfm_stats_update_abs_coeff_mean(txfm_stats);
+  }
+}
+
+/*
+ * Helper method to brute-force search for the closest q_index
+ * that achieves the specified bit budget.
+ */
+int find_gop_q_iterative(double bit_budget, aom_bit_depth_t bit_depth,
+                         const double *update_type_scale_factors,
+                         int frame_count,
+                         const FRAME_UPDATE_TYPE *update_type_list,
+                         const double *qstep_ratio_list,
+                         const TplTxfmStats *stats_list, int *q_index_list,
+                         double *estimated_bitrate_byframe) {
+  int best_q = 255;
+  double curr_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      best_q, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+  double min_bits_diff = fabs(curr_estimate - bit_budget);
+  // Start at q = 254 because we already have an estimate for q = 255.
+  for (int q = 254; q >= 0; q--) {
+    double curr_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+        q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+        qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+    double bits_diff = fabs(curr_estimate - bit_budget);
+    if (bits_diff <= min_bits_diff) {
+      min_bits_diff = bits_diff;
+      best_q = q;
+    }
+  }
+  return best_q;
+}
+
+TEST(TplModelTest, EstimateFrameRateTest) {
+  GF_GROUP gf_group;
+  init_toy_gf_group(&gf_group);
+
+  VBR_RATECTRL_INFO vbr_rc_info;
+  init_toy_vbr_rc_info(&vbr_rc_info, gf_group.size);
+
+  std::vector<TplTxfmStats> stats_list(gf_group.size);
+  init_toy_tpl_txfm_stats(&stats_list);
+
+  std::vector<double> est_bitrate_list(gf_group.size);
+  init_toy_tpl_txfm_stats(&stats_list);
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+
+  const int q = 125;
+
+  // Case1: all scale factors are 0
+  double scale_factors[FRAME_UPDATE_TYPES] = { 0 };
+  double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  EXPECT_NEAR(estimate, 0, epsilon);
+
+  // Case2: all scale factors are 1
+  for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+    scale_factors[i] = 1;
+  }
+  estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  double ref_estimate = 0;
+  for (int i = 0; i < gf_group.size; i++) {
+    ref_estimate += est_bitrate_list[i];
+  }
+  EXPECT_NEAR(estimate, ref_estimate, epsilon);
+
+  // Case3: Key frame scale factor is 0 and others are 1
+  for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+    if (i == KF_UPDATE) {
+      scale_factors[i] = 0;
+    } else {
+      scale_factors[i] = 1;
+    }
+  }
+  estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  ref_estimate = 0;
+  for (int i = 0; i < gf_group.size; i++) {
+    if (gf_group.update_type[i] != KF_UPDATE) {
+      ref_estimate += est_bitrate_list[i];
+    }
+  }
+  EXPECT_NEAR(estimate, ref_estimate, epsilon);
+}
+
+TEST(TplModelTest, VbrRcInfoEstimateBaseQTest) {
+  GF_GROUP gf_group;
+  init_toy_gf_group(&gf_group);
+
+  VBR_RATECTRL_INFO vbr_rc_info;
+  init_toy_vbr_rc_info(&vbr_rc_info, gf_group.size);
+
+  std::vector<TplTxfmStats> stats_list(gf_group.size);
+  init_toy_tpl_txfm_stats(&stats_list);
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+
+  // Test multiple bit budgets.
+  const std::vector<double> bit_budgets = { 0,     2470,  19200,  30750,
+                                            41315, 65017, DBL_MAX };
+
+  for (double bit_budget : bit_budgets) {
+    // Binary search method to find the optimal q.
+    const int base_q = av1_vbr_rc_info_estimate_base_q(
+        bit_budget, bit_depth, vbr_rc_info.scale_factors, gf_group.size,
+        gf_group.update_type, vbr_rc_info.qstep_ratio_list, stats_list.data(),
+        vbr_rc_info.q_index_list, NULL);
+    const int ref_base_q = find_gop_q_iterative(
+        bit_budget, bit_depth, vbr_rc_info.scale_factors, gf_group.size,
+        gf_group.update_type, vbr_rc_info.qstep_ratio_list, stats_list.data(),
+        vbr_rc_info.q_index_list, NULL);
+    if (bit_budget == 0) {
+      EXPECT_EQ(base_q, 255);
+    } else if (bit_budget == DBL_MAX) {
+      EXPECT_EQ(base_q, 0);
+    }
+    EXPECT_EQ(base_q, ref_base_q);
+  }
+}
+#endif  // CONFIG_BITRATE_ACCURACY
+
+}  // namespace
diff --git a/media/libaom/src/test/transform_test_base.h b/media/libaom/src/test/transform_test_base.h
index 68f5cc74d2..260f4ffef8 100644
--- a/media/libaom/src/test/transform_test_base.h
+++ b/media/libaom/src/test/transform_test_base.h
@@ -55,16 +55,22 @@ class TransformTestBase {
 
     int16_t *test_input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+    ASSERT_NE(test_input_block, nullptr);
     OutType *test_temp_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(test_temp_block[0]) * num_coeffs_));
+    ASSERT_NE(test_temp_block, nullptr);
     uint8_t *dst = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    ASSERT_NE(dst, nullptr);
     uint8_t *src = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    ASSERT_NE(src, nullptr);
     uint16_t *dst16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    ASSERT_NE(dst16, nullptr);
     uint16_t *src16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    ASSERT_NE(src16, nullptr);
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
@@ -80,12 +86,12 @@ class TransformTestBase {
         }
       }
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+        API_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
       } else {
-        ASM_REGISTER_STATE_CHECK(
+        API_REGISTER_STATE_CHECK(
             RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
       }
 
@@ -126,10 +132,13 @@ class TransformTestBase {
 
     int16_t *input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * stride * height_));
+    ASSERT_NE(input_block, nullptr);
     OutType *output_ref_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+    ASSERT_NE(output_ref_block, nullptr);
     OutType *output_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+    ASSERT_NE(output_block, nullptr);
 
     for (int i = 0; i < count_test_block; ++i) {
       int j, k;
@@ -148,7 +157,7 @@ class TransformTestBase {
       }
 
       fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
+      API_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
 
       // The minimum quant value is 4.
       for (j = 0; j < height_; ++j) {
@@ -175,12 +184,16 @@ class TransformTestBase {
 
     int16_t *input_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+    ASSERT_NE(input_block, nullptr);
     OutType *trans_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(trans_block[0]) * num_coeffs_));
+    ASSERT_NE(trans_block, nullptr);
     uint8_t *output_block = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * stride * height_));
+    ASSERT_NE(output_block, nullptr);
     uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * stride * height_));
+    ASSERT_NE(output_ref_block, nullptr);
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-mask_, mask_].
@@ -198,7 +211,7 @@ class TransformTestBase {
       fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_);
 
       inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
+      API_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
 
       for (j = 0; j < height_; ++j) {
         for (k = 0; k < pitch_; ++k) {
@@ -221,10 +234,13 @@ class TransformTestBase {
 
     int16_t *input_extreme_block = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+    ASSERT_NE(input_extreme_block, nullptr);
     OutType *output_ref_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+    ASSERT_NE(output_ref_block, nullptr);
     OutType *output_block = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+    ASSERT_NE(output_block, nullptr);
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-mask_, mask_].
@@ -238,7 +254,7 @@ class TransformTestBase {
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           RunFwdTxfm(input_extreme_block, output_block, pitch_));
 
       int row_length = FindRowLength();
@@ -263,17 +279,23 @@ class TransformTestBase {
 
     int16_t *in = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+    ASSERT_NE(in, nullptr);
     OutType *coeff = reinterpret_cast<OutType *>(
         aom_memalign(16, sizeof(coeff[0]) * num_coeffs_));
+    ASSERT_NE(coeff, nullptr);
     uint8_t *dst = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    ASSERT_NE(dst, nullptr);
     uint8_t *src = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    ASSERT_NE(src, nullptr);
 
     uint16_t *dst16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    ASSERT_NE(dst16, nullptr);
     uint16_t *src16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    ASSERT_NE(src16, nullptr);
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-mask_, mask_].
@@ -292,9 +314,9 @@ class TransformTestBase {
       fwd_txfm_ref(in, coeff, pitch_, &txfm_param_);
 
       if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+        API_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
       } else {
-        ASM_REGISTER_STATE_CHECK(
+        API_REGISTER_STATE_CHECK(
             RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
       }
 
diff --git a/media/libaom/src/test/twopass_encoder.sh b/media/libaom/src/test/twopass_encoder.sh
index cca44ced8a..44e7327b8f 100644..100755
--- a/media/libaom/src/test/twopass_encoder.sh
+++ b/media/libaom/src/test/twopass_encoder.sh
@@ -38,7 +38,7 @@ twopass_encoder() {
 
   eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" "${limit}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libaom/src/test/util.h b/media/libaom/src/test/util.h
index aa4b106e45..29df709c4f 100644
--- a/media/libaom/src/test/util.h
+++ b/media/libaom/src/test/util.h
@@ -12,16 +12,23 @@
 #ifndef AOM_TEST_UTIL_H_
 #define AOM_TEST_UTIL_H_
 
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <string.h>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "aom/aom_integer.h"
 #include "aom/aom_image.h"
 #include "aom_ports/aom_timer.h"
 
 // Macros
 #define GET_PARAM(k) std::get<k>(GetParam())
 
+inline int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename) return 0;
+
+  return !strcmp(dot, ".y4m");
+}
+
 inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
   assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
          (img1->d_h == img2->d_h));
diff --git a/media/libaom/src/test/variance_test.cc b/media/libaom/src/test/variance_test.cc
index 1458ece287..62d510d152 100644
--- a/media/libaom/src/test/variance_test.cc
+++ b/media/libaom/src/test/variance_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
@@ -30,9 +29,14 @@
 
 namespace {
 
+typedef uint64_t (*MseWxH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int w, int h);
 typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         unsigned int *sse);
+typedef void (*GetSseSum8x8QuadFunc)(const uint8_t *a, int a_stride,
+                                     const uint8_t *b, int b_stride,
+                                     unsigned int *sse, int *sum);
 typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *b, int b_stride,
@@ -49,10 +53,13 @@ typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, uint32_t *sse, const uint8_t *second_pred,
     const DIST_WTD_COMP_PARAMS *jcp_param);
+
+#if !CONFIG_REALTIME_ONLY
 typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
                                       int xoffset, int yoffset,
                                       const int32_t *wsrc, const int32_t *mask,
                                       unsigned int *sse);
+#endif
 
 using libaom_test::ACMRandom;
 
@@ -275,6 +282,7 @@ static uint32_t dist_wtd_subpel_avg_variance_ref(
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
+#if !CONFIG_REALTIME_ONLY
 static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
                                          int xoff, int yoff,
                                          const int32_t *wsrc,
@@ -324,6 +332,7 @@ static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
   *sse_ptr = static_cast<uint32_t>(sse);
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -331,7 +340,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() { libaom_test::ClearSystemState(); }
+  virtual ~SumOfSquaresTest() {}
 
  protected:
   void ConstTest();
@@ -348,7 +357,7 @@ void SumOfSquaresTest::ConstTest() {
     for (int i = 0; i < 256; ++i) {
       mem[i] = v;
     }
-    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    API_REGISTER_STATE_CHECK(res = func_(mem));
     EXPECT_EQ(256u * (v * v), res);
   }
 }
@@ -362,7 +371,7 @@ void SumOfSquaresTest::RefTest() {
 
     const unsigned int expected = mb_ss_ref(mem);
     unsigned int res;
-    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    API_REGISTER_STATE_CHECK(res = func_(mem));
     EXPECT_EQ(expected, res);
   }
 }
@@ -407,6 +416,105 @@ std::ostream &operator<<(std::ostream &os, const TestParams<Func> &p) {
 
 // Main class for testing a function type
 template <typename FunctionType>
+class MseWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(src_)));
+    dst_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(16, block_size() * sizeof(dst_)));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(dst_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = NULL;
+    dst_ = NULL;
+  }
+
+ protected:
+  void RefMatchTestMse();
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  uint8_t *dst_;
+  uint16_t *src_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+};
+
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::SpeedTest() {
+  aom_usec_timer ref_timer, test_timer;
+  double elapsed_time_c = 0;
+  double elapsed_time_simd = 0;
+  int run_time = 10000000;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+
+  for (int k = 0; k < block_size(); ++k) {
+    dst_[k] = rnd_.Rand8();
+    src_[k] = rnd_.Rand8();
+  }
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < run_time; i++) {
+    aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&test_timer);
+  for (int i = 0; i < run_time; i++) {
+    params_.func(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
+         elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::RefMatchTestMse() {
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+
+  for (int i = 0; i < 10; i++) {
+    for (int k = 0; k < block_size(); ++k) {
+      dst_[k] = rnd_.Rand8();
+      src_[k] = rnd_.Rand8();
+    }
+    API_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h));
+    API_REGISTER_STATE_CHECK(
+        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// Main class for testing a function type
+template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
@@ -418,8 +526,10 @@ class MainTestClass
         use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
     src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size() * unit));
     ref_ = new uint8_t[block_size() * unit];
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
+    memset(src_, 0, block_size() * sizeof(src_[0]));
+    memset(ref_, 0, block_size() * sizeof(ref_[0]));
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
       src_ = CONVERT_TO_BYTEPTR(src_);
@@ -438,7 +548,6 @@ class MainTestClass
     delete[] ref_;
     src_ = NULL;
     ref_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -454,6 +563,12 @@ class MainTestClass
   void OneQuarterTest();
   void SpeedTest();
 
+  // SSE&SUM tests
+  void RefTestSseSum();
+  void MinTestSseSum();
+  void MaxTestSseSum();
+  void SseSum_SpeedTest();
+
   // MSE/SSE tests
   void RefTestMse();
   void RefTestSse();
@@ -495,7 +610,7 @@ void MainTestClass<VarianceFunctionType>::ZeroTest() {
         for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift();
       }
       unsigned int sse, var;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var = params_.func(src_, width(), ref_, width(), &sse));
       EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
     }
@@ -516,7 +631,7 @@ void MainTestClass<VarianceFunctionType>::RefTest() {
     }
     unsigned int sse1, sse2, var1, var2;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         var1 = params_.func(src_, stride, ref_, stride, &sse1));
     var2 =
         variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
@@ -545,7 +660,7 @@ void MainTestClass<VarianceFunctionType>::RefStrideTest() {
     unsigned int sse1, sse2;
     unsigned int var1, var2;
 
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1));
     var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height,
                         src_stride, ref_stride, &sse2, use_high_bit_depth(),
@@ -568,7 +683,7 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
     aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
   }
   unsigned int sse, var, expected;
-  ASM_REGISTER_STATE_CHECK(
+  API_REGISTER_STATE_CHECK(
       var = params_.func(src_, width(), ref_, width(), &sse));
   expected = block_size() * 255 * 255 / 4;
   EXPECT_EQ(expected, var);
@@ -595,11 +710,150 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
   }
 
   aom_usec_timer_mark(&timer);
-  const double elapsed_time =
-      static_cast<double>(aom_usec_timer_elapsed(&timer));
-  printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("Variance %dx%d : %d us\n", width(), height(), elapsed_time);
+}
+
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::RefTestSseSum() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size(); ++j) {
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
+    }
+    unsigned int sse1[256] = { 0 };
+    unsigned int sse2[256] = { 0 };
+    int sum1[256] = { 0 };
+    int sum2[256] = { 0 };
+    const int stride = width();
+    int k = 0;
+
+    for (int i = 0; i < height(); i += 8) {
+      for (int j = 0; j < width(); j += 32) {
+        API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
+                                              ref_ + stride * i + j, stride,
+                                              &sse1[k], &sum1[k]));
+        aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
+                                   ref_ + stride * i + j, stride, &sse2[k],
+                                   &sum2[k]);
+        k += 4;
+      }
+    }
+
+    for (int p = 0; p < 256; p++) {
+      EXPECT_EQ(sse1[p], sse2[p]);
+      EXPECT_EQ(sum1[p], sum2[p]);
+    }
+  }
 }
 
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::MinTestSseSum() {
+  memset(src_, 0, block_size());
+  memset(ref_, 255, block_size());
+  unsigned int sse1[256] = { 0 };
+  unsigned int sse2[256] = { 0 };
+  int sum1[256] = { 0 };
+  int sum2[256] = { 0 };
+  const int stride = width();
+  int k = 0;
+
+  for (int i = 0; i < height(); i += 8) {
+    for (int j = 0; j < width(); j += 32) {
+      API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
+                                            ref_ + stride * i + j, stride,
+                                            &sse1[k], &sum1[k]));
+      aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
+                                 ref_ + stride * i + j, stride, &sse2[k],
+                                 &sum2[k]);
+      k += 4;
+    }
+  }
+
+  for (int p = 0; p < 256; p++) {
+    EXPECT_EQ(sse1[p], sse2[p]);
+    EXPECT_EQ(sum1[p], sum2[p]);
+  }
+}
+
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::MaxTestSseSum() {
+  memset(src_, 255, block_size());
+  memset(ref_, 0, block_size());
+  unsigned int sse1[256] = { 0 };
+  unsigned int sse2[256] = { 0 };
+  int sum1[256] = { 0 };
+  int sum2[256] = { 0 };
+  const int stride = width();
+  int k = 0;
+
+  for (int i = 0; i < height(); i += 8) {
+    for (int j = 0; j < width(); j += 32) {
+      API_REGISTER_STATE_CHECK(params_.func(src_ + stride * i + j, stride,
+                                            ref_ + stride * i + j, stride,
+                                            &sse1[k], &sum1[k]));
+      aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
+                                 ref_ + stride * i + j, stride, &sse2[k],
+                                 &sum2[k]);
+      k += 4;
+    }
+  }
+
+  for (int p = 0; p < 256; p++) {
+    EXPECT_EQ(sse1[p], sse2[p]);
+    EXPECT_EQ(sum1[p], sum2[p]);
+  }
+}
+
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::SseSum_SpeedTest() {
+  const int loop_count = 1000000000 / block_size();
+  for (int j = 0; j < block_size(); ++j) {
+    src_[j] = rnd_.Rand8();
+    ref_[j] = rnd_.Rand8();
+  }
+
+  unsigned int sse1[4] = { 0 };
+  unsigned int sse2[4] = { 0 };
+  int sum1[4] = { 0 };
+  int sum2[4] = { 0 };
+  const int stride = width();
+  const int k = 0;
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int r = 0; r < loop_count; ++r) {
+    for (int i = 0; i < height(); i += 8) {
+      for (int j = 0; j < width(); j += 32) {
+        aom_get_sse_sum_8x8_quad_c(src_ + stride * i + j, stride,
+                                   ref_ + stride * i + j, stride, &sse2[k],
+                                   &sum2[k]);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_ref =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  aom_usec_timer_start(&timer);
+  for (int r = 0; r < loop_count; ++r) {
+    for (int i = 0; i < height(); i += 8) {
+      for (int j = 0; j < width(); j += 32) {
+        params_.func(src_ + stride * i + j, stride, ref_ + stride * i + j,
+                     stride, &sse1[k], &sum1[k]);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_simd =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf(
+      "aom_getvar_8x8_quad for block=%dx%d : ref_time=%lf \t simd_time=%lf \t "
+      "gain=%lf \n",
+      width(), height(), elapsed_time_ref, elapsed_time_simd,
+      elapsed_time_ref / elapsed_time_simd);
+}
 ////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
@@ -612,7 +866,7 @@ void MainTestClass<FunctionType>::RefTestMse() {
     }
     unsigned int sse1, sse2;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
+    API_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
                  stride, &sse2, false, AOM_BITS_8);
     EXPECT_EQ(sse1, sse2);
@@ -629,7 +883,7 @@ void MainTestClass<FunctionType>::RefTestSse() {
     unsigned int sse2;
     unsigned int var1;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
+    API_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
                  stride, &sse2, false, AOM_BITS_8);
     EXPECT_EQ(var1, sse2);
@@ -641,7 +895,7 @@ void MainTestClass<FunctionType>::MaxTestMse() {
   memset(src_, 255, block_size());
   memset(ref_, 0, block_size());
   unsigned int sse;
-  ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
+  API_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
   const unsigned int expected = block_size() * 255 * 255;
   EXPECT_EQ(expected, sse);
 }
@@ -651,7 +905,7 @@ void MainTestClass<FunctionType>::MaxTestSse() {
   memset(src_, 255, block_size());
   memset(ref_, 0, block_size());
   unsigned int var;
-  ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
+  API_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
   const unsigned int expected = block_size() * 255 * 255;
   EXPECT_EQ(expected, var);
 }
@@ -683,9 +937,9 @@ class SubpelVarianceTest
       ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
           32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
     }
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(sec_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(sec_, nullptr);
+    ASSERT_NE(ref_, nullptr);
   }
 
   virtual void TearDown() {
@@ -698,7 +952,6 @@ class SubpelVarianceTest
       aom_free(CONVERT_TO_SHORTPTR(ref_));
       aom_free(CONVERT_TO_SHORTPTR(sec_));
     }
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -743,7 +996,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
       const unsigned int var2 = subpel_variance_ref(
           ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
@@ -776,7 +1029,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
       const unsigned int var2 = subpel_variance_ref(
           ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
@@ -861,7 +1114,7 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
       }
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+      API_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
                                                    src_, width(), &sse1, sec_));
       var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
                                      params_.log2height, x, y, &sse2,
@@ -897,9 +1150,9 @@ void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
         for (int y0 = 0; y0 < 4; ++y0) {
           uint32_t sse1, sse2;
           uint32_t var1, var2;
-          jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
-          jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
-          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+          jcp_param_.fwd_offset = quant_dist_lookup_table[y0][x0];
+          jcp_param_.bck_offset = quant_dist_lookup_table[y0][1 - x0];
+          API_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
                                                        src_, width(), &sse1,
                                                        sec_, &jcp_param_));
           var2 = dist_wtd_subpel_avg_variance_ref(
@@ -915,6 +1168,8 @@ void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if !CONFIG_REALTIME_ONLY
+
 static const int kMaskMax = 64;
 
 typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
@@ -938,9 +1193,9 @@ class ObmcVarianceTest
         aom_memalign(32, block_size() * sizeof(uint32_t)));
     mask_ = reinterpret_cast<int32_t *>(
         aom_memalign(32, block_size() * sizeof(uint32_t)));
-    ASSERT_TRUE(pre_ != NULL);
-    ASSERT_TRUE(wsrc_ != NULL);
-    ASSERT_TRUE(mask_ != NULL);
+    ASSERT_NE(pre_, nullptr);
+    ASSERT_NE(wsrc_, nullptr);
+    ASSERT_NE(mask_, nullptr);
   }
 
   virtual void TearDown() {
@@ -951,7 +1206,6 @@ class ObmcVarianceTest
     }
     aom_free(wsrc_);
     aom_free(mask_);
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -991,7 +1245,7 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
 
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
       var2 = obmc_subpel_variance_ref(
           pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
@@ -1028,7 +1282,7 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
 
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
       var2 = obmc_subpel_variance_ref(
           pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
@@ -1060,7 +1314,7 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
   for (int i = 0; i < run_time; ++i) {
     int x = rnd_(8);
     int y = rnd_(8);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
   }
   aom_usec_timer_mark(&timer);
@@ -1070,17 +1324,26 @@ void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
          params_.bit_depth, elapsed_time);
 }
 
+#endif  // !CONFIG_REALTIME_ONLY
+
+typedef MseWxHTestClass<MseWxH16bitFunc> MseWxHTest;
 typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
+typedef MainTestClass<GetSseSum8x8QuadFunc> GetSseSum8x8QuadTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
 typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
     AvxDistWtdSubpelAvgVarianceTest;
+#if !CONFIG_REALTIME_ONLY
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
+#endif
+typedef TestParams<MseWxH16bitFunc> MseWxHParams;
 
 TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
 TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(MseWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
 TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
 TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
@@ -1088,6 +1351,10 @@ TEST_P(AvxVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(GetSseSum8x8QuadTest, RefMseSum) { RefTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, MinSseSum) { MinTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, MaxMseSum) { MaxTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, DISABLED_Speed) { SseSum_SpeedTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
@@ -1095,9 +1362,18 @@ TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+#if !CONFIG_REALTIME_ONLY
 TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(
+    C, MseWxHTest,
+    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_c, 8),
+                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_c, 8),
+                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_c, 8),
+                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_c, 8)));
 
 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_c));
@@ -1115,132 +1391,153 @@ INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
                                            MseParams(3, 3, &aom_mse8x8_c)));
 
 typedef TestParams<VarianceMxNFunc> VarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
-                      VarianceParams(7, 6, &aom_variance128x64_c),
-                      VarianceParams(6, 7, &aom_variance64x128_c),
-                      VarianceParams(6, 6, &aom_variance64x64_c),
-                      VarianceParams(6, 5, &aom_variance64x32_c),
-                      VarianceParams(5, 6, &aom_variance32x64_c),
-                      VarianceParams(5, 5, &aom_variance32x32_c),
-                      VarianceParams(5, 4, &aom_variance32x16_c),
-                      VarianceParams(4, 5, &aom_variance16x32_c),
-                      VarianceParams(4, 4, &aom_variance16x16_c),
-                      VarianceParams(4, 3, &aom_variance16x8_c),
-                      VarianceParams(3, 4, &aom_variance8x16_c),
-                      VarianceParams(3, 3, &aom_variance8x8_c),
-                      VarianceParams(3, 2, &aom_variance8x4_c),
-                      VarianceParams(2, 3, &aom_variance4x8_c),
-                      VarianceParams(2, 2, &aom_variance4x4_c),
-
-                      VarianceParams(6, 4, &aom_variance64x16_c),
-                      VarianceParams(4, 6, &aom_variance16x64_c),
-                      VarianceParams(5, 3, &aom_variance32x8_c),
-                      VarianceParams(3, 5, &aom_variance8x32_c),
-                      VarianceParams(4, 2, &aom_variance16x4_c),
-                      VarianceParams(2, 4, &aom_variance4x16_c)));
+const VarianceParams kArrayVariance_c[] = {
+  VarianceParams(7, 7, &aom_variance128x128_c),
+  VarianceParams(7, 6, &aom_variance128x64_c),
+  VarianceParams(6, 7, &aom_variance64x128_c),
+  VarianceParams(6, 6, &aom_variance64x64_c),
+  VarianceParams(6, 5, &aom_variance64x32_c),
+  VarianceParams(5, 6, &aom_variance32x64_c),
+  VarianceParams(5, 5, &aom_variance32x32_c),
+  VarianceParams(5, 4, &aom_variance32x16_c),
+  VarianceParams(4, 5, &aom_variance16x32_c),
+  VarianceParams(4, 4, &aom_variance16x16_c),
+  VarianceParams(4, 3, &aom_variance16x8_c),
+  VarianceParams(3, 4, &aom_variance8x16_c),
+  VarianceParams(3, 3, &aom_variance8x8_c),
+  VarianceParams(3, 2, &aom_variance8x4_c),
+  VarianceParams(2, 3, &aom_variance4x8_c),
+  VarianceParams(2, 2, &aom_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_c),
+  VarianceParams(4, 6, &aom_variance16x64_c),
+  VarianceParams(5, 3, &aom_variance32x8_c),
+  VarianceParams(3, 5, &aom_variance8x32_c),
+  VarianceParams(4, 2, &aom_variance16x4_c),
+  VarianceParams(2, 4, &aom_variance4x16_c),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_c));
+
+typedef TestParams<GetSseSum8x8QuadFunc> GetSseSumParams;
+const GetSseSumParams kArrayGetSseSum8x8Quad_c[] = {
+  GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_c, 0),
+  GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_c, 0),
+  GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_c, 0),
+  GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_c, 0)
+};
+INSTANTIATE_TEST_SUITE_P(C, GetSseSum8x8QuadTest,
+                         ::testing::ValuesIn(kArrayGetSseSum8x8Quad_c));
 
 typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
+const SubpelVarianceParams kArraySubpelVariance_c[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_c));
 
 typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_c[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_c));
 
 typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(DistWtdSubpelAvgVarianceParams(
-                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
-
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
-                          0)));
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_c[] = {
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(4, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 4,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+  DistWtdSubpelAvgVarianceParams(5, 3,
+                                 &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+  DistWtdSubpelAvgVarianceParams(3, 5,
+                                 &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+  DistWtdSubpelAvgVarianceParams(4, 2,
+                                 &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+  DistWtdSubpelAvgVarianceParams(2, 4,
+                                 &aom_dist_wtd_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxDistWtdSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_c));
 
+#if !CONFIG_REALTIME_ONLY
 INSTANTIATE_TEST_SUITE_P(
     C, AvxObmcSubpelVarianceTest,
     ::testing::Values(
@@ -1268,14 +1565,124 @@ INSTANTIATE_TEST_SUITE_P(
         ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
         ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
         ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+#endif
 
 #if CONFIG_AV1_HIGHBITDEPTH
+typedef uint64_t (*MseHBDWxH16bitFunc)(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h);
+
+template <typename FunctionType>
+class MseHBDWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(src_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(dst_)));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(dst_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = NULL;
+    dst_ = NULL;
+  }
+
+ protected:
+  void RefMatchTestMse();
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  uint16_t *dst_;
+  uint16_t *src_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+  int height() const { return params_.height; }
+  int mask() const { return params_.mask; }
+};
+
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::SpeedTest() {
+  aom_usec_timer ref_timer, test_timer;
+  double elapsed_time_c = 0;
+  double elapsed_time_simd = 0;
+  int run_time = 10000000;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+  for (int k = 0; k < block_size(); ++k) {
+    dst_[k] = rnd_.Rand16() & mask();
+    src_[k] = rnd_.Rand16() & mask();
+  }
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < run_time; i++) {
+    aom_mse_wxh_16bit_highbd_c(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&test_timer);
+  for (int i = 0; i < run_time; i++) {
+    params_.func(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
+         elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::RefMatchTestMse() {
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+  int w = width();
+  int h = height();
+  int dstride = d_stride();
+  int sstride = s_stride();
+  for (int i = 0; i < 10; i++) {
+    for (int k = 0; k < block_size(); ++k) {
+      dst_[k] = rnd_.Rand16() & mask();
+      src_[k] = rnd_.Rand16() & mask();
+    }
+    API_REGISTER_STATE_CHECK(mse_ref = aom_mse_wxh_16bit_highbd_c(
+                                 dst_, dstride, src_, sstride, w, h));
+    API_REGISTER_STATE_CHECK(
+        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+typedef TestParams<MseHBDWxH16bitFunc> MseHBDWxHParams;
+typedef MseHBDWxHTestClass<MseHBDWxH16bitFunc> MseHBDWxHTest;
 typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDMseTest);
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+#if !CONFIG_REALTIME_ONLY
 typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+#endif
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDObmcSubpelVarianceTest);
 
+TEST_P(MseHBDWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseHBDWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
 TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
 TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
@@ -1288,6 +1695,13 @@ TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
+INSTANTIATE_TEST_SUITE_P(
+    C, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_c, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_c, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_c, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_c, 10)));
+
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_SUITE_P(
     C, AvxHBDMseTest,
@@ -1354,7 +1768,7 @@ const VarianceParams kArrayHBDVariance_c[] = {
   VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
   VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
   VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8),
-
+#if !CONFIG_REALTIME_ONLY
   VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12),
   VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12),
   VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12),
@@ -1373,6 +1787,7 @@ const VarianceParams kArrayHBDVariance_c[] = {
   VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8),
   VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8),
   VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_c));
@@ -1435,7 +1850,7 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
   SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
   SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
   SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
-
+#if !CONFIG_REALTIME_ONLY
   SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8),
   SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8),
   SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8),
@@ -1454,6 +1869,7 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
   SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12),
   SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12),
   SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
@@ -1535,6 +1951,7 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
   SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
   SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12),
 
+#if !CONFIG_REALTIME_ONLY
   SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8),
   SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8),
   SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8),
@@ -1565,10 +1982,12 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
                           12),
   SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c,
                           12),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 
+#if !CONFIG_REALTIME_ONLY
 const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
   ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
                            8),
@@ -1687,9 +2106,17 @@ const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
 };
 INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, MseWxHTest,
+    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_sse2, 8),
+                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_sse2, 8)));
+
 INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_sse2));
 
@@ -1699,90 +2126,112 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest,
                                            MseParams(3, 4, &aom_mse8x16_sse2),
                                            MseParams(3, 3, &aom_mse8x8_sse2)));
 
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
-                      VarianceParams(7, 6, &aom_variance128x64_sse2),
-                      VarianceParams(6, 7, &aom_variance64x128_sse2),
-                      VarianceParams(6, 6, &aom_variance64x64_sse2),
-                      VarianceParams(6, 5, &aom_variance64x32_sse2),
-                      VarianceParams(6, 4, &aom_variance64x16_sse2),
-                      VarianceParams(5, 6, &aom_variance32x64_sse2),
-                      VarianceParams(5, 5, &aom_variance32x32_sse2),
-                      VarianceParams(5, 4, &aom_variance32x16_sse2),
-                      VarianceParams(5, 3, &aom_variance32x8_sse2),
-                      VarianceParams(4, 6, &aom_variance16x64_sse2),
-                      VarianceParams(4, 5, &aom_variance16x32_sse2),
-                      VarianceParams(4, 4, &aom_variance16x16_sse2),
-                      VarianceParams(4, 3, &aom_variance16x8_sse2),
-                      VarianceParams(4, 2, &aom_variance16x4_sse2),
-                      VarianceParams(3, 5, &aom_variance8x32_sse2),
-                      VarianceParams(3, 4, &aom_variance8x16_sse2),
-                      VarianceParams(3, 3, &aom_variance8x8_sse2),
-                      VarianceParams(3, 2, &aom_variance8x4_sse2),
-                      VarianceParams(2, 4, &aom_variance4x16_sse2),
-                      VarianceParams(2, 3, &aom_variance4x8_sse2),
-                      VarianceParams(2, 2, &aom_variance4x4_sse2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2,
-                                0)));
+const VarianceParams kArrayVariance_sse2[] = {
+  VarianceParams(7, 7, &aom_variance128x128_sse2),
+  VarianceParams(7, 6, &aom_variance128x64_sse2),
+  VarianceParams(6, 7, &aom_variance64x128_sse2),
+  VarianceParams(6, 6, &aom_variance64x64_sse2),
+  VarianceParams(6, 5, &aom_variance64x32_sse2),
+  VarianceParams(5, 6, &aom_variance32x64_sse2),
+  VarianceParams(5, 5, &aom_variance32x32_sse2),
+  VarianceParams(5, 4, &aom_variance32x16_sse2),
+  VarianceParams(4, 5, &aom_variance16x32_sse2),
+  VarianceParams(4, 4, &aom_variance16x16_sse2),
+  VarianceParams(4, 3, &aom_variance16x8_sse2),
+  VarianceParams(3, 4, &aom_variance8x16_sse2),
+  VarianceParams(3, 3, &aom_variance8x8_sse2),
+  VarianceParams(3, 2, &aom_variance8x4_sse2),
+  VarianceParams(2, 3, &aom_variance4x8_sse2),
+  VarianceParams(2, 2, &aom_variance4x4_sse2),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_sse2),
+  VarianceParams(5, 3, &aom_variance32x8_sse2),
+  VarianceParams(4, 6, &aom_variance16x64_sse2),
+  VarianceParams(4, 2, &aom_variance16x4_sse2),
+  VarianceParams(3, 5, &aom_variance8x32_sse2),
+  VarianceParams(2, 4, &aom_variance4x16_sse2),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_sse2));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_sse2[] = {
+  GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_sse2, 0),
+  GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_sse2, 0),
+  GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_sse2, 0),
+  GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_sse2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum8x8QuadTest,
+                         ::testing::ValuesIn(kArrayGetSseSum8x8Quad_sse2));
+
+const SubpelVarianceParams kArraySubpelVariance_sse2[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_sse2));
+
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_sse2[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_sse2));
 
 #if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2,
+                                      10)));
+#endif  // HAVE_SSE2
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxSubpelVarianceTest,
@@ -1865,7 +2314,7 @@ const VarianceParams kArrayHBDVariance_sse2[] = {
   VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
   VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
   VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8),
-
+#if !CONFIG_REALTIME_ONLY
   VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12),
   VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12),
   VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12),
@@ -1882,14 +2331,23 @@ const VarianceParams kArrayHBDVariance_sse2[] = {
   VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8),
   VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8),
   VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8),
-  // VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
-  // VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
+// VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
+// VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_sse2));
 
 #if HAVE_AVX2
 
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MseHBDWxHTest,
+    ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+                      MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_avx2,
+                                      10)));
+
 const VarianceParams kArrayHBDVariance_avx2[] = {
   VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
   VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
@@ -1908,6 +2366,25 @@ const VarianceParams kArrayHBDVariance_avx2[] = {
 
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_avx2));
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = {
+  SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10),
+  SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10),
+  SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10),
+  SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_avx2, 10),
+  SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_avx2, 10),
+  SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_avx2, 10),
+  SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_avx2, 10),
+  SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_avx2, 10),
+  SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10),
+  SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10),
+  SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10),
+  SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10),
+  SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDSubpelVariance_avx2));
 #endif  // HAVE_AVX2
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
@@ -1953,7 +2430,7 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
   SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8),
   SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
   SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8),
-
+#if !CONFIG_REALTIME_ONLY
   SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12),
   SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12),
   SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12),
@@ -1971,7 +2448,8 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
   SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8),
   SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8),
   SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8),
-  // SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
+// SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
@@ -2044,6 +2522,7 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = {
   SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2,
                           8),
 
+#if !CONFIG_REALTIME_ONLY
   SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2,
                           12),
   SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2,
@@ -2078,8 +2557,9 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = {
                           8),
   SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2,
                           8),
-  // SubpelAvgVarianceParams(2, 4,
-  // &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
+// SubpelAvgVarianceParams(2, 4,
+// &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
@@ -2088,124 +2568,119 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3,
-                                0)));
-
+const SubpelVarianceParams kArraySubpelVariance_ssse3[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_ssse3));
+
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_ssse3[] = {
+  SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3, 0),
+  SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3, 0),
+  SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3, 0),
+  SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3, 0),
+  SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3, 0),
+  SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3, 0),
+  SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3, 0),
+  SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3, 0),
+  SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
+  SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
+  SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
+  SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3, 0),
+  SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3, 0),
+  SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
+  SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
+  SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
+  SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelAvgVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelAvgVariance_ssse3));
+
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_ssse3[] = {
+  DistWtdSubpelAvgVarianceParams(
+      7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY
+  DistWtdSubpelAvgVarianceParams(
+      6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
+  DistWtdSubpelAvgVarianceParams(
+      2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif
+};
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(
-        DistWtdSubpelAvgVarianceParams(
-            7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        DistWtdSubpelAvgVarianceParams(
-            6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0)));
+    ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_ssse3));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
+#if !CONFIG_REALTIME_ONLY
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxObmcSubpelVarianceTest,
     ::testing::Values(
@@ -2241,7 +2716,6 @@ INSTANTIATE_TEST_SUITE_P(
                                  0),
         ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
                                  0),
-
         ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1,
                                  0),
         ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1,
@@ -2254,47 +2728,73 @@ INSTANTIATE_TEST_SUITE_P(
                                  0),
         ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1,
                                  0)));
+#endif
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MseWxHTest,
+    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_avx2, 8),
+                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_avx2, 8)));
+
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest,
                          ::testing::Values(MseParams(4, 4,
                                                      &aom_mse16x16_avx2)));
 
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
-                      VarianceParams(7, 6, &aom_variance128x64_avx2),
-                      VarianceParams(6, 7, &aom_variance64x128_avx2),
-                      VarianceParams(6, 6, &aom_variance64x64_avx2),
-                      VarianceParams(6, 5, &aom_variance64x32_avx2),
-                      VarianceParams(6, 4, &aom_variance64x16_avx2),
-                      VarianceParams(5, 6, &aom_variance32x64_avx2),
-                      VarianceParams(5, 5, &aom_variance32x32_avx2),
-                      VarianceParams(5, 4, &aom_variance32x16_avx2),
-                      VarianceParams(5, 3, &aom_variance32x8_avx2),
-                      VarianceParams(4, 6, &aom_variance16x64_avx2),
-                      VarianceParams(4, 5, &aom_variance16x32_avx2),
-                      VarianceParams(4, 4, &aom_variance16x16_avx2),
-                      VarianceParams(4, 3, &aom_variance16x8_avx2),
-                      VarianceParams(4, 2, &aom_variance16x4_avx2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
+const VarianceParams kArrayVariance_avx2[] = {
+  VarianceParams(7, 7, &aom_variance128x128_avx2),
+  VarianceParams(7, 6, &aom_variance128x64_avx2),
+  VarianceParams(6, 7, &aom_variance64x128_avx2),
+  VarianceParams(6, 6, &aom_variance64x64_avx2),
+  VarianceParams(6, 5, &aom_variance64x32_avx2),
+  VarianceParams(5, 6, &aom_variance32x64_avx2),
+  VarianceParams(5, 5, &aom_variance32x32_avx2),
+  VarianceParams(5, 4, &aom_variance32x16_avx2),
+  VarianceParams(4, 5, &aom_variance16x32_avx2),
+  VarianceParams(4, 4, &aom_variance16x16_avx2),
+  VarianceParams(4, 3, &aom_variance16x8_avx2),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_variance64x16_avx2),
+  VarianceParams(4, 6, &aom_variance16x64_avx2),
+  VarianceParams(5, 3, &aom_variance32x8_avx2),
+  VarianceParams(4, 2, &aom_variance16x4_avx2),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_avx2));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_avx2[] = {
+  GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_avx2, 0),
+  GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_avx2, 0),
+  GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_avx2, 0),
+  GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_avx2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum8x8QuadTest,
+                         ::testing::ValuesIn(kArrayGetSseSum8x8Quad_avx2));
+
+const SubpelVarianceParams kArraySubpelVariance_avx2[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_avx2));
 
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxSubpelAvgVarianceTest,
@@ -2326,21 +2826,92 @@ INSTANTIATE_TEST_SUITE_P(
     NEON, AvxVarianceTest,
     ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
                       VarianceParams(6, 6, &aom_variance64x64_neon),
+                      VarianceParams(7, 6, &aom_variance128x64_neon),
+                      VarianceParams(6, 7, &aom_variance64x128_neon),
+                      VarianceParams(6, 6, &aom_variance64x64_neon),
                       VarianceParams(6, 5, &aom_variance64x32_neon),
                       VarianceParams(5, 6, &aom_variance32x64_neon),
                       VarianceParams(5, 5, &aom_variance32x32_neon),
+                      VarianceParams(5, 4, &aom_variance32x16_neon),
+                      VarianceParams(4, 5, &aom_variance16x32_neon),
                       VarianceParams(4, 4, &aom_variance16x16_neon),
                       VarianceParams(4, 3, &aom_variance16x8_neon),
                       VarianceParams(3, 4, &aom_variance8x16_neon),
-                      VarianceParams(3, 3, &aom_variance8x8_neon)));
+                      VarianceParams(3, 3, &aom_variance8x8_neon),
+                      VarianceParams(3, 2, &aom_variance8x4_neon),
+                      VarianceParams(2, 3, &aom_variance4x8_neon),
+                      VarianceParams(2, 2, &aom_variance4x4_neon)));
+
+const SubpelVarianceParams kArraySubpelVariance_neon[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon, 0),
+  SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelVarianceTest,
+                         ::testing::ValuesIn(kArraySubpelVariance_neon));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_neon[] = {
+  GetSseSumParams(7, 7, &aom_get_sse_sum_8x8_quad_neon, 0),
+  GetSseSumParams(6, 6, &aom_get_sse_sum_8x8_quad_neon, 0),
+  GetSseSumParams(5, 5, &aom_get_sse_sum_8x8_quad_neon, 0),
+  GetSseSumParams(5, 4, &aom_get_sse_sum_8x8_quad_neon, 0)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum8x8QuadTest,
+                         ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const VarianceParams kArrayHBDVariance_neon[] = {
+  VarianceParams(7, 7, &aom_highbd_10_variance128x128_neon, 10),
+  VarianceParams(7, 6, &aom_highbd_10_variance128x64_neon, 10),
+  VarianceParams(6, 7, &aom_highbd_10_variance64x128_neon, 10),
+  VarianceParams(6, 6, &aom_highbd_10_variance64x64_neon, 10),
+  VarianceParams(6, 5, &aom_highbd_10_variance64x32_neon, 10),
+  VarianceParams(5, 6, &aom_highbd_10_variance32x64_neon, 10),
+  VarianceParams(5, 5, &aom_highbd_10_variance32x32_neon, 10),
+  VarianceParams(5, 4, &aom_highbd_10_variance32x16_neon, 10),
+  VarianceParams(4, 5, &aom_highbd_10_variance16x32_neon, 10),
+  VarianceParams(4, 4, &aom_highbd_10_variance16x16_neon, 10),
+  VarianceParams(4, 3, &aom_highbd_10_variance16x8_neon, 10),
+  VarianceParams(3, 4, &aom_highbd_10_variance8x16_neon, 10),
+  VarianceParams(3, 3, &aom_highbd_10_variance8x8_neon, 10),
+  VarianceParams(3, 2, &aom_highbd_10_variance8x4_neon, 10),
+  VarianceParams(2, 3, &aom_highbd_10_variance4x8_neon, 10),
+  VarianceParams(2, 2, &aom_highbd_10_variance4x4_neon, 10),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(6, 4, &aom_highbd_10_variance64x16_neon, 10),
+  VarianceParams(4, 6, &aom_highbd_10_variance16x64_neon, 10),
+  VarianceParams(5, 3, &aom_highbd_10_variance32x8_neon, 10),
+  VarianceParams(3, 5, &aom_highbd_10_variance8x32_neon, 10),
+  VarianceParams(4, 2, &aom_highbd_10_variance16x4_neon, 10),
+  VarianceParams(2, 4, &aom_highbd_10_variance4x16_neon, 10),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDVarianceTest,
+                         ::testing::ValuesIn(kArrayHBDVariance_neon));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0)));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/media/libaom/src/test/video_source.h b/media/libaom/src/test/video_source.h
index 3c1c5e559e..e0e9c2c99b 100644
--- a/media/libaom/src/test/video_source.h
+++ b/media/libaom/src/test/video_source.h
@@ -193,6 +193,7 @@ class DummyVideoSource : public VideoSource {
   void ReallocImage() {
     aom_img_free(img_);
     img_ = aom_img_alloc(NULL, format_, width_, height_, 32);
+    ASSERT_NE(img_, nullptr);
     raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
   }
 
diff --git a/media/libaom/src/test/visual_metrics.py b/media/libaom/src/test/visual_metrics.py
index 9055feb334..9055feb334 100644..100755
--- a/media/libaom/src/test/visual_metrics.py
+++ b/media/libaom/src/test/visual_metrics.py
diff --git a/media/libaom/src/test/warp_filter_test.cc b/media/libaom/src/test/warp_filter_test.cc
index c5e87f0859..1d9dd45470 100644
--- a/media/libaom/src/test/warp_filter_test.cc
+++ b/media/libaom/src/test/warp_filter_test.cc
@@ -56,6 +56,13 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest,
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1HighbdWarpFilterTest,
+    libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
diff --git a/media/libaom/src/test/warp_filter_test_util.cc b/media/libaom/src/test/warp_filter_test_util.cc
index bcb0c18592..b4376d8fd9 100644
--- a/media/libaom/src/test/warp_filter_test_util.cc
+++ b/media/libaom/src/test/warp_filter_test_util.cc
@@ -8,6 +8,9 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <memory>
+#include <new>
+
 #include "aom_ports/aom_timer.h"
 #include "test/warp_filter_test_util.h"
 
@@ -97,9 +100,9 @@ namespace AV1WarpFilter {
 ::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
     warp_affine_func filter) {
   WarpTestParam params[] = {
-    make_tuple(4, 4, 50000, filter),  make_tuple(8, 8, 50000, filter),
-    make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
-    make_tuple(32, 8, 10000, filter),
+    make_tuple(4, 4, 5000, filter),  make_tuple(8, 8, 5000, filter),
+    make_tuple(64, 64, 100, filter), make_tuple(4, 16, 2000, filter),
+    make_tuple(32, 8, 1000, filter),
   };
   return ::testing::Combine(::testing::ValuesIn(params),
                             ::testing::Values(0, 1), ::testing::Values(0, 1),
@@ -109,7 +112,7 @@ namespace AV1WarpFilter {
 AV1WarpFilterTest::~AV1WarpFilterTest() {}
 void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1WarpFilterTest::TearDown() {}
 
 void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   const int w = 128, h = 128;
@@ -124,17 +127,21 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   int sub_x, sub_y;
   const int bd = 8;
 
-  uint8_t *input_ = new uint8_t[h * stride];
-  uint8_t *input = input_ + border;
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * stride]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *input = input_.get() + border;
 
   // The warp functions always write rows with widths that are multiples of 8.
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
   int output_n = ((out_w + 7) & ~7) * out_h;
-  uint8_t *output = new uint8_t[output_n];
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+  std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dsta, nullptr);
   generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                         is_alpha_zero, is_beta_zero, is_gamma_zero,
                         is_delta_zero);
@@ -150,24 +157,21 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   sub_y = 0;
   int do_average = 0;
 
-  conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+  conv_params =
+      get_conv_params_no_round(do_average, 0, dsta.get(), out_w, 1, bd);
   conv_params.use_dist_wtd_comp_avg = 0;
 
   const int num_loops = 1000000000 / (out_w + out_h);
   aom_usec_timer timer;
   aom_usec_timer_start(&timer);
   for (int i = 0; i < num_loops; ++i)
-    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
-              sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
+    test_impl(mat, input, w, h, stride, output.get(), 32, 32, out_w, out_h,
+              out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
 
   aom_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
   printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
          1000.0 * elapsed_time / num_loops);
-
-  delete[] input_;
-  delete[] output;
-  delete[] dsta;
 }
 
 void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
@@ -187,15 +191,22 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
   // The warp functions always write rows with widths that are multiples of 8.
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
   int output_n = ((out_w + 7) & ~7) * out_h;
-  uint8_t *input_ = new uint8_t[h * stride];
-  uint8_t *input = input_ + border;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * stride]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *input = input_.get() + border;
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output2, nullptr);
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
-  CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+  std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dsta, nullptr);
+  std::unique_ptr<CONV_BUF_TYPE[]> dstb(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dstb, nullptr);
   for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
 
   for (i = 0; i < num_iters; ++i) {
@@ -217,8 +228,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
           for (int jj = 0; jj < 5; ++jj) {
             for (int do_average = 0; do_average <= 1; ++do_average) {
               if (use_no_round) {
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+                conv_params = get_conv_params_no_round(
+                    do_average, 0, dsta.get(), out_w, 1, bd);
               } else {
                 conv_params = get_conv_params(0, 0, bd);
               }
@@ -226,26 +237,26 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
-              av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
-                                out_h, out_w, sub_x, sub_y, &conv_params, alpha,
-                                beta, gamma, delta);
+              av1_warp_affine_c(mat, input, w, h, stride, output.get(), 32, 32,
+                                out_w, out_h, out_w, sub_x, sub_y, &conv_params,
+                                alpha, beta, gamma, delta);
               if (use_no_round) {
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
+                conv_params = get_conv_params_no_round(
+                    do_average, 0, dstb.get(), out_w, 1, bd);
               }
               if (jj >= 4) {
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
-              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                        out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
-                        delta);
+              test_impl(mat, input, w, h, stride, output2.get(), 32, 32, out_w,
+                        out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta,
+                        gamma, delta);
               if (use_no_round) {
                 for (j = 0; j < out_w * out_h; ++j)
                   ASSERT_EQ(dsta[j], dstb[j])
@@ -269,11 +280,6 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
         }
       }
   }
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
-  delete[] dsta;
-  delete[] dstb;
 }
 }  // namespace AV1WarpFilter
 
@@ -301,7 +307,7 @@ void AV1HighbdWarpFilterTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1HighbdWarpFilterTest::TearDown() {}
 
 void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   const int w = 128, h = 128;
@@ -320,13 +326,17 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   // The warp functions always write rows with widths that are multiples of 8.
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
   int output_n = ((out_w + 7) & ~7) * out_h;
-  uint16_t *input_ = new uint16_t[h * stride];
-  uint16_t *input = input_ + border;
-  uint16_t *output = new uint16_t[output_n];
+  std::unique_ptr<uint16_t[]> input_(new (std::nothrow) uint16_t[h * stride]);
+  ASSERT_NE(input_, nullptr);
+  uint16_t *input = input_.get() + border;
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+  std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dsta, nullptr);
 
   generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
                         is_alpha_zero, is_beta_zero, is_gamma_zero,
@@ -345,24 +355,21 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   sub_y = 0;
   int do_average = 0;
   conv_params.use_dist_wtd_comp_avg = 0;
-  conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+  conv_params =
+      get_conv_params_no_round(do_average, 0, dsta.get(), out_w, 1, bd);
 
   const int num_loops = 1000000000 / (out_w + out_h);
   aom_usec_timer timer;
   aom_usec_timer_start(&timer);
 
   for (int i = 0; i < num_loops; ++i)
-    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
-              sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta);
+    test_impl(mat, input, w, h, stride, output.get(), 32, 32, out_w, out_h,
+              out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta);
 
   aom_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
   printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
          1000.0 * elapsed_time / num_loops);
-
-  delete[] input_;
-  delete[] output;
-  delete[] dsta;
 }
 
 void AV1HighbdWarpFilterTest::RunCheckOutput(
@@ -384,15 +391,22 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
   // The warp functions always write rows with widths that are multiples of 8.
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
   int output_n = ((out_w + 7) & ~7) * out_h;
-  uint16_t *input_ = new uint16_t[h * stride];
-  uint16_t *input = input_ + border;
-  uint16_t *output = new uint16_t[output_n];
-  uint16_t *output2 = new uint16_t[output_n];
+  std::unique_ptr<uint16_t[]> input_(new (std::nothrow) uint16_t[h * stride]);
+  ASSERT_NE(input_, nullptr);
+  uint16_t *input = input_.get() + border;
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output2, nullptr);
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
-  CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+  std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dsta, nullptr);
+  std::unique_ptr<CONV_BUF_TYPE[]> dstb(new (std::nothrow)
+                                            CONV_BUF_TYPE[output_n]);
+  ASSERT_NE(dstb, nullptr);
   for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
 
   for (i = 0; i < num_iters; ++i) {
@@ -415,8 +429,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
           for (int jj = 0; jj < 5; ++jj) {
             for (int do_average = 0; do_average <= 1; ++do_average) {
               if (use_no_round) {
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+                conv_params = get_conv_params_no_round(
+                    do_average, 0, dsta.get(), out_w, 1, bd);
               } else {
                 conv_params = get_conv_params(0, 0, bd);
               }
@@ -424,29 +438,30 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
 
-              av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
-                                       out_w, out_h, out_w, sub_x, sub_y, bd,
-                                       &conv_params, alpha, beta, gamma, delta);
+              av1_highbd_warp_affine_c(mat, input, w, h, stride, output.get(),
+                                       32, 32, out_w, out_h, out_w, sub_x,
+                                       sub_y, bd, &conv_params, alpha, beta,
+                                       gamma, delta);
               if (use_no_round) {
                 // TODO(angiebird): Change this to test_impl once we have SIMD
                 // implementation
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
+                conv_params = get_conv_params_no_round(
+                    do_average, 0, dstb.get(), out_w, 1, bd);
               }
               if (jj >= 4) {
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
-              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                        out_w, sub_x, sub_y, bd, &conv_params, alpha, beta,
-                        gamma, delta);
+              test_impl(mat, input, w, h, stride, output2.get(), 32, 32, out_w,
+                        out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha,
+                        beta, gamma, delta);
 
               if (use_no_round) {
                 for (j = 0; j < out_w * out_h; ++j)
@@ -471,12 +486,6 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
         }
       }
   }
-
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
-  delete[] dsta;
-  delete[] dstb;
 }
 }  // namespace AV1HighbdWarpFilter
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/test/warp_filter_test_util.h b/media/libaom/src/test/warp_filter_test_util.h
index 66a6e244be..583f312822 100644
--- a/media/libaom/src/test/warp_filter_test_util.h
+++ b/media/libaom/src/test/warp_filter_test_util.h
@@ -20,7 +20,6 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
 #include "av1/common/mv.h"
diff --git a/media/libaom/src/test/webm_video_source.h b/media/libaom/src/test/webm_video_source.h
index bb3d117355..61c64cc7d5 100644
--- a/media/libaom/src/test/webm_video_source.h
+++ b/media/libaom/src/test/webm_video_source.h
@@ -37,11 +37,16 @@ class WebMVideoSource : public CompressedVideoSource {
     delete webm_ctx_;
   }
 
-  virtual void Init() {}
+  virtual void Init() {
+    ASSERT_NE(aom_ctx_, nullptr);
+    ASSERT_NE(webm_ctx_, nullptr);
+  }
 
   virtual void Begin() {
+    ASSERT_NE(aom_ctx_, nullptr);
+    ASSERT_NE(webm_ctx_, nullptr);
     aom_ctx_->file = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(aom_ctx_->file != NULL)
+    ASSERT_NE(aom_ctx_->file, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
@@ -55,7 +60,9 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   void FillFrame() {
-    ASSERT_TRUE(aom_ctx_->file != NULL);
+    ASSERT_NE(aom_ctx_, nullptr);
+    ASSERT_NE(webm_ctx_, nullptr);
+    ASSERT_NE(aom_ctx_->file, nullptr);
     const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
@@ -64,7 +71,9 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   void SeekToNextKeyFrame() {
-    ASSERT_TRUE(aom_ctx_->file != NULL);
+    ASSERT_NE(aom_ctx_, nullptr);
+    ASSERT_NE(webm_ctx_, nullptr);
+    ASSERT_NE(aom_ctx_->file, nullptr);
     do {
       const int status =
           webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
diff --git a/media/libaom/src/test/webmenc_test.cc b/media/libaom/src/test/webmenc_test.cc
new file mode 100644
index 0000000000..acd795f2ec
--- /dev/null
+++ b/media/libaom/src/test/webmenc_test.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "common/webmenc.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+#if CONFIG_WEBM_IO
+
+class WebmencTest : public ::testing::Test {};
+
+// All of these variations on output should be identical.
+TEST(WebmencTest, ExtractEncoderSettingsOutput1) {
+  const char *argv[] = { "aomenc", "-o", "output", "input",
+                         "--target-bitrate=300" };
+  int argc = 5;
+  const std::string expected("version:1.2.3 --target-bitrate=300");
+  char *result = extract_encoder_settings("1.2.3", argv, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput2) {
+  const char *argv[] = { "aomenc", "--output", "bar", "foo", "--cpu-used=3" };
+  int argc = 5;
+  const std::string expected("version:abc --cpu-used=3");
+  char *result = extract_encoder_settings("abc", argv, argc, "foo");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput3) {
+  const char *argv[] = { "aomenc", "--cq-level=63", "--end-usage=q",
+                         "--output=foo", "baz" };
+  int argc = 5;
+  const std::string expected("version:23 --cq-level=63 --end-usage=q");
+  char *result = extract_encoder_settings("23", argv, argc, "baz");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsInput) {
+  // Check that input filename is filtered regardless of position.
+  const char *argv[] = { "aomenc", "-o", "out", "input", "-p", "2" };
+  int argc = 6;
+  const char version[] = "1.0.0";
+  const std::string expected("version:1.0.0 -p 2");
+  char *result = extract_encoder_settings(version, argv, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+
+  const char *argv2[] = { "aomenc", "input", "-o", "out", "-p", "2" };
+  result = extract_encoder_settings(version, argv2, argc, "input");
+  ASSERT_EQ(expected, std::string(result));
+  free(result);
+}
+
+#endif  // CONFIG_WEBM_IO
+}  // namespace
diff --git a/media/libaom/src/test/wiener_test.cc b/media/libaom/src/test/wiener_test.cc
index 81839fd56f..69df5ea915 100644
--- a/media/libaom/src/test/wiener_test.cc
+++ b/media/libaom/src/test/wiener_test.cc
@@ -31,16 +31,21 @@
 // 8-bit-depth tests
 namespace wiener_lowbd {
 
+// C implementation of the algorithm implmented by the SIMD code.
+// This is a little more efficient than the version in av1_compute_stats_c().
 static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
                                     const uint8_t *src, int h_start, int h_end,
                                     int v_start, int v_end, int dgd_stride,
-                                    int src_stride, int64_t *M, int64_t *H) {
+                                    int src_stride, int64_t *M, int64_t *H,
+                                    int use_downsampled_wiener_stats) {
   ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
   int i, j, k, l, m, n;
   const int pixel_count = (h_end - h_start) * (v_end - v_start);
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin = (wiener_win >> 1);
   uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
 
   std::vector<std::vector<int64_t> > M_int(wiener_win,
                                            std::vector<int64_t>(wiener_win, 0));
@@ -51,21 +56,41 @@ static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
   int32_t sumX = 0;
   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
 
-  for (i = v_start; i < v_end; i++) {
-    for (j = h_start; j < h_end; j += 2) {
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
+  for (i = v_start; i < v_end; i = i + downsample_factor) {
+    if (use_downsampled_wiener_stats &&
+        (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+      downsample_factor = v_end - i;
+    }
+    int32_t sumX_row_i32 = 0;
+    std::vector<std::vector<int32_t> > sumY_row(
+        wiener_win, std::vector<int32_t>(wiener_win, 0));
+    std::vector<std::vector<int32_t> > M_row_i32(
+        wiener_win, std::vector<int32_t>(wiener_win, 0));
+    std::vector<std::vector<int32_t> > H_row_i32(
+        wiener_win * wiener_win, std::vector<int32_t>(wiener_win * 8, 0));
+    const int h_end_even = h_end & ~1;
+    const int has_odd_pixel = h_end & 1;
+    for (j = h_start; j < h_end_even; j += 2) {
       const uint8_t X1 = src[i * src_stride + j];
       const uint8_t X2 = src[i * src_stride + j + 1];
-      sumX += X1 + X2;
+      sumX_row_i32 += X1 + X2;
 
       const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
       for (k = 0; k < wiener_win; k++) {
         for (l = 0; l < wiener_win; l++) {
           const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
-          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          int32_t *H_int_temp = &H_row_i32[(l * wiener_win + k)][0];
           const uint8_t D1 = dgd_ijkl[0];
           const uint8_t D2 = dgd_ijkl[1];
-          sumY[k][l] += D1 + D2;
-          M_int[l][k] += D1 * X1 + D2 * X2;
+          sumY_row[k][l] += D1 + D2;
+          M_row_i32[l][k] += D1 * X1 + D2 * X2;
           for (m = 0; m < wiener_win; m++) {
             for (n = 0; n < wiener_win; n++) {
               H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
@@ -75,6 +100,42 @@ static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
         }
       }
     }
+    // If the width is odd, add in the final pixel
+    if (has_odd_pixel) {
+      const uint8_t X1 = src[i * src_stride + j];
+      sumX_row_i32 += X1;
+
+      const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
+        for (l = 0; l < wiener_win; l++) {
+          const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int32_t *H_int_temp = &H_row_i32[(l * wiener_win + k)][0];
+          const uint8_t D1 = dgd_ijkl[0];
+          sumY_row[k][l] += D1;
+          M_row_i32[l][k] += D1 * X1;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m];
+            }
+          }
+        }
+      }
+    }
+
+    sumX += sumX_row_i32 * downsample_factor;
+    // Scale M matrix based on the downsampling factor
+    for (k = 0; k < wiener_win; ++k) {
+      for (l = 0; l < wiener_win; ++l) {
+        sumY[k][l] += sumY_row[k][l] * downsample_factor;
+        M_int[k][l] += (int64_t)M_row_i32[k][l] * downsample_factor;
+      }
+    }
+    // Scale H matrix based on the downsampling factor
+    for (k = 0; k < wiener_win * wiener_win; ++k) {
+      for (l = 0; l < wiener_win * 8; ++l) {
+        H_int[k][l] += (int64_t)H_row_i32[k][l] * downsample_factor;
+      }
+    }
   }
 
   const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
@@ -95,14 +156,16 @@ static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
 
 void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
                          int h_start, int h_end, int v_start, int v_end,
-                         int dgd_stride, int src_stride, int64_t *M,
-                         int64_t *H) {
+                         int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+                         int use_downsampled_wiener_stats) {
   if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
     compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
-                            v_end, dgd_stride, src_stride, M, H);
+                            v_end, dgd_stride, src_stride, M, H,
+                            use_downsampled_wiener_stats);
   } else {
     av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
+                        dgd_stride, src_stride, M, H,
+                        use_downsampled_wiener_stats);
   }
 }
 
@@ -110,7 +173,8 @@ static const int kIterations = 100;
 typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
                                    const uint8_t *src, int h_start, int h_end,
                                    int v_start, int v_end, int dgd_stride,
-                                   int src_stride, int64_t *M, int64_t *H);
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   int use_downsampled_wiener_stats);
 
 ////////////////////////////////////////////////////////////////////////////////
 // 8 bit
@@ -123,8 +187,10 @@ class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
   virtual void SetUp() {
     src_buf = (uint8_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    ASSERT_NE(src_buf, nullptr);
     dgd_buf = (uint8_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    ASSERT_NE(dgd_buf, nullptr);
     target_func_ = GET_PARAM(0);
   }
   virtual void TearDown() {
@@ -148,15 +214,22 @@ void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
   DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
   DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
   DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
-  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
-  int h_end =
-      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
-  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
-  int v_end =
-      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  // Note(rachelbarker):
+  // The SIMD code requires `h_start` to be even, but can otherwise
+  // deal with any values of `h_end`, `v_start`, `v_end`. We cover this
+  // entire range, even though (at the time of writing) `h_start` and `v_start`
+  // will always be multiples of 64 when called from non-test code.
+  // If in future any new requirements are added, these lines will
+  // need changing.
+  const int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+  int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  const int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+  int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
   const int dgd_stride = h_end;
   const int src_stride = MAX_DATA_BLOCK;
   const int iters = run_times == 1 ? kIterations : 2;
+  const int max_value_downsample_stats = 1;
+
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
       dgd_buf[i] = rng_.Rand8();
@@ -164,45 +237,50 @@ void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
     }
     uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
     uint8_t *src = src_buf;
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                          dgd_stride, src_stride, M_ref, H_ref);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                   dgd_stride, src_stride, M_test, H_test);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 10) {
-      printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
-             time2);
-      printf("(%3.2f)\n", time1 / time2);
-    }
-    int failed = 0;
-    for (int i = 0; i < wiener_win2; ++i) {
-      if (M_ref[i] != M_test[i]) {
-        failed = 1;
-        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, M_ref[i], M_test[i]);
-        break;
+    for (int use_downsampled_stats = 0;
+         use_downsampled_stats <= max_value_downsample_stats;
+         use_downsampled_stats++) {
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < run_times; ++i) {
+        av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                            v_end, dgd_stride, src_stride, M_ref, H_ref,
+                            use_downsampled_stats);
       }
-    }
-    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
-      if (H_ref[i] != H_test[i]) {
-        failed = 1;
-        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, H_ref[i], H_test[i]);
-        break;
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < run_times; ++i) {
+        target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                     dgd_stride, src_stride, M_test, H_test,
+                     use_downsampled_stats);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+               time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
+      int failed = 0;
+      for (int i = 0; i < wiener_win2; ++i) {
+        if (M_ref[i] != M_test[i]) {
+          failed = 1;
+          printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, M_ref[i], M_test[i]);
+          break;
+        }
+      }
+      for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+        if (H_ref[i] != H_test[i]) {
+          failed = 1;
+          printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, H_ref[i], H_test[i]);
+          break;
+        }
       }
+      ASSERT_EQ(failed, 0);
     }
-    ASSERT_EQ(failed, 0);
   }
 }
 
@@ -220,6 +298,8 @@ void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
   const int dgd_stride = h_end;
   const int src_stride = MAX_DATA_BLOCK;
   const int iters = 1;
+  const int max_value_downsample_stats = 1;
+
   for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
     for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
       dgd_buf[i] = 255;
@@ -227,31 +307,36 @@ void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
     }
     uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
     uint8_t *src = src_buf;
+    for (int use_downsampled_stats = 0;
+         use_downsampled_stats <= max_value_downsample_stats;
+         use_downsampled_stats++) {
+      av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                          dgd_stride, src_stride, M_ref, H_ref,
+                          use_downsampled_stats);
 
-    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M_ref, H_ref);
-
-    target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                 dgd_stride, src_stride, M_test, H_test);
-
-    int failed = 0;
-    for (int i = 0; i < wiener_win2; ++i) {
-      if (M_ref[i] != M_test[i]) {
-        failed = 1;
-        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, M_ref[i], M_test[i]);
-        break;
+      target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test,
+                   use_downsampled_stats);
+
+      int failed = 0;
+      for (int i = 0; i < wiener_win2; ++i) {
+        if (M_ref[i] != M_test[i]) {
+          failed = 1;
+          printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, M_ref[i], M_test[i]);
+          break;
+        }
       }
-    }
-    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
-      if (H_ref[i] != H_test[i]) {
-        failed = 1;
-        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, H_ref[i], H_test[i]);
-        break;
+      for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+        if (H_ref[i] != H_test[i]) {
+          failed = 1;
+          printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, H_ref[i], H_test[i]);
+          break;
+        }
       }
+      ASSERT_EQ(failed, 0);
     }
-    ASSERT_EQ(failed, 0);
   }
 }
 
@@ -318,8 +403,17 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8,
   int64_t sumX = 0;
   const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
 
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
+  assert(h_start % 2 == 0);
   for (i = v_start; i < v_end; i++) {
-    for (j = h_start; j < h_end; j += 2) {
+    const int h_end_even = h_end & ~1;
+    const int has_odd_pixel = h_end & 1;
+    for (j = h_start; j < h_end_even; j += 2) {
       const uint16_t X1 = src[i * src_stride + j];
       const uint16_t X2 = src[i * src_stride + j + 1];
       sumX += X1 + X2;
@@ -342,6 +436,27 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8,
         }
       }
     }
+    // If the width is odd, add in the final pixel
+    if (has_odd_pixel) {
+      const uint16_t X1 = src[i * src_stride + j];
+      sumX += X1;
+
+      const uint16_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
+        for (l = 0; l < wiener_win; l++) {
+          const uint16_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          const uint16_t D1 = dgd_ijkl[0];
+          sumY[k][l] += D1;
+          M_int[l][k] += D1 * X1;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m];
+            }
+          }
+        }
+      }
+    }
   }
 
   uint8_t bit_depth_divider = 1;
@@ -398,8 +513,10 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
   virtual void SetUp() {
     src_buf = (uint16_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    ASSERT_NE(src_buf, nullptr);
     dgd_buf = (uint16_t *)aom_memalign(
         32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    ASSERT_NE(dgd_buf, nullptr);
     target_func_ = GET_PARAM(0);
   }
   virtual void TearDown() {
@@ -427,12 +544,17 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win,
   DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
   DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
   DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
-  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
-  const int h_end =
-      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
-  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
-  const int v_end =
-      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  // Note(rachelbarker):
+  // The SIMD code requires `h_start` to be even, but can otherwise
+  // deal with any values of `h_end`, `v_start`, `v_end`. We cover this
+  // entire range, even though (at the time of writing) `h_start` and `v_start`
+  // will always be multiples of 64 when called from non-test code.
+  // If in future any new requirements are added, these lines will
+  // need changing.
+  const int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+  int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  const int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+  int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
   const int dgd_stride = h_end;
   const int src_stride = MAX_DATA_BLOCK;
   const int iters = run_times == 1 ? kIterations : 2;
diff --git a/media/libaom/src/test/y4m_test.cc b/media/libaom/src/test/y4m_test.cc
index 5d795fad9d..3e873690b8 100644
--- a/media/libaom/src/test/y4m_test.cc
+++ b/media/libaom/src/test/y4m_test.cc
@@ -40,7 +40,7 @@ const Y4mTestParam kY4mTestVectors[] = {
   { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
     "95ef5bf6218580588be24a5271bb6a7f" },
   { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
-    "f53a40fec15254ac312527339d9c686b" },
+    "e5406275b9fc6bb3436c31d4a05c1cab" },
   { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
     "284a47a47133b12884ec3a14e959a0b6" },
   { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
@@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks y4m header information
   void HeaderChecks(unsigned int bit_depth, aom_img_fmt_t fmt) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     ASSERT_EQ(y4m_.pic_w, (int)kWidth);
     ASSERT_EQ(y4m_.pic_h, (int)kHeight);
     ASSERT_EQ(img()->d_w, kWidth);
@@ -104,7 +104,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks MD5 of the raw frame data
   void Md5Check(const string &expected_md5) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     libaom_test::MD5 md5;
     for (unsigned int i = start_; i < limit_; i++) {
       md5.Add(img());
@@ -143,14 +143,15 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
 
   // Writes out a y4m file and then reads it back
   void WriteY4mAndReadBack() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     char buf[Y4M_BUFFER_SIZE] = { 0 };
     const struct AvxRational framerate = { y4m_.fps_n, y4m_.fps_d };
     tmpfile_ = new libaom_test::TempOutFile;
-    ASSERT_TRUE(tmpfile_->file() != NULL);
+    ASSERT_NE(tmpfile_, nullptr);
+    ASSERT_NE(tmpfile_->file(), nullptr);
     y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
                           img()->monochrome, img()->csp, y4m_.aom_fmt,
-                          y4m_.bit_depth);
+                          y4m_.bit_depth, AOM_CR_STUDIO_RANGE);
     fputs(buf, tmpfile_->file());
     for (unsigned int i = start_; i < limit_; i++) {
       y4m_write_frame_header(buf, sizeof(buf));
@@ -177,4 +178,107 @@ TEST_P(Y4mVideoWriteTest, WriteTest) {
 
 INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
                          ::testing::ValuesIn(kY4mTestVectors));
+
+static const char kY4MRegularHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, RegularHeader) {
+  libaom_test::TempOutFile f;
+  fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(0, fseek(f.file(), 0, 0));
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), NULL, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(y4m.color_range, AOM_CR_STUDIO_RANGE);
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
+// Testing that headers over 100 characters can be parsed.
+static const char kY4MLongHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG "
+    "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, LongHeader) {
+  libaom_test::TempOutFile tmpfile;
+  FILE *f = tmpfile.file();
+  fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f);
+  fflush(f);
+  EXPECT_EQ(fseek(f, 0, 0), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f, NULL, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(y4m.color_range, AOM_CR_STUDIO_RANGE);
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
+static const char kY4MFullRangeHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG XCOLORRANGE=FULL\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, FullRangeHeader) {
+  libaom_test::TempOutFile tmpfile;
+  FILE *f = tmpfile.file();
+  fwrite(kY4MFullRangeHeader, 1, sizeof(kY4MFullRangeHeader), f);
+  fflush(f);
+  EXPECT_EQ(fseek(f, 0, 0), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f, NULL, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  EXPECT_EQ(y4m.color_range, AOM_CR_FULL_RANGE);
+  y4m_input_close(&y4m);
+}
+
+TEST(Y4MHeaderTest, WriteStudioColorRange) {
+  char buf[128];
+  struct AvxRational framerate = { /*numerator=*/30, /*denominator=*/1 };
+  EXPECT_GE(y4m_write_file_header(
+                buf, /*len=*/128, /*width=*/4, /*height=*/5, &framerate,
+                /*monochrome=*/0, AOM_CSP_UNKNOWN, AOM_IMG_FMT_I420,
+                /*bit_depth=*/8, AOM_CR_STUDIO_RANGE),
+            0);
+  EXPECT_EQ(strcmp("YUV4MPEG2 W4 H5 F30:1 Ip C420jpeg\n", buf), 0);
+}
+
+TEST(Y4MHeaderTest, WriteFullColorRange) {
+  char buf[128];
+  struct AvxRational framerate = { /*numerator=*/30, /*denominator=*/1 };
+  EXPECT_GE(y4m_write_file_header(
+                buf, /*len=*/128, /*width=*/4, /*height=*/5, &framerate,
+                /*monochrome=*/0, AOM_CSP_UNKNOWN, AOM_IMG_FMT_I420,
+                /*bit_depth=*/8, AOM_CR_FULL_RANGE),
+            0);
+  EXPECT_EQ(strcmp("YUV4MPEG2 W4 H5 F30:1 Ip C420jpeg XCOLORRANGE=FULL\n", buf),
+            0);
+}
+
 }  // namespace
diff --git a/media/libaom/src/test/y4m_video_source.h b/media/libaom/src/test/y4m_video_source.h
index 63f74f567f..143fbc627d 100644
--- a/media/libaom/src/test/y4m_video_source.h
+++ b/media/libaom/src/test/y4m_video_source.h
@@ -36,12 +36,12 @@ class Y4mVideoSource : public VideoSource {
   virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
   }
 
   virtual void ReadSourceToStart() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     ASSERT_FALSE(
         y4m_input_open(&y4m_, input_file_, NULL, 0, AOM_CSP_UNKNOWN, 0));
     framerate_numerator_ = y4m_.fps_n;
@@ -82,7 +82,7 @@ class Y4mVideoSource : public VideoSource {
   virtual unsigned int limit() const { return limit_; }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
diff --git a/media/libaom/src/test/yuv_video_source.h b/media/libaom/src/test/yuv_video_source.h
index 774ecc0086..15ad5c2a19 100644
--- a/media/libaom/src/test/yuv_video_source.h
+++ b/media/libaom/src/test/yuv_video_source.h
@@ -44,7 +44,7 @@ class YUVVideoSource : public VideoSource {
   virtual void Begin() {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
     if (start_)
       fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
@@ -79,11 +79,12 @@ class YUVVideoSource : public VideoSource {
     if (width != width_ || height != height_ || format != format_) {
       aom_img_free(img_);
       img_ = aom_img_alloc(NULL, format, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
+      ASSERT_NE(img_, nullptr);
       width_ = width;
       height_ = height;
       format_ = format;
       switch (format) {
+        case AOM_IMG_FMT_NV12:
         case AOM_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
         case AOM_IMG_FMT_I422: raw_size_ = width * height * 2; break;
         case AOM_IMG_FMT_I444: raw_size_ = width * height * 3; break;
@@ -96,7 +97,7 @@ class YUVVideoSource : public VideoSource {
   }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
       limit_ = frame_;
diff --git a/media/libaom/src/third_party/fastfeat/fast.c b/media/libaom/src/third_party/fastfeat/fast.c
index f29ac8f725..30efde8396 100644
--- a/media/libaom/src/third_party/fastfeat/fast.c
+++ b/media/libaom/src/third_party/fastfeat/fast.c
@@ -1,3 +1,33 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  *Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+//  *Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+//  *Neither the name of the University of Cambridge nor the names of
+//   its contributors may be used to endorse or promote products derived
+//   from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 // clang-format off
 #include <stdlib.h>
 #include "fast.h"
diff --git a/media/libaom/src/third_party/fastfeat/fast.h b/media/libaom/src/third_party/fastfeat/fast.h
index a65d5a5d17..d7a9617cce 100644
--- a/media/libaom/src/third_party/fastfeat/fast.h
+++ b/media/libaom/src/third_party/fastfeat/fast.h
@@ -1,3 +1,33 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  *Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+//  *Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+//  *Neither the name of the University of Cambridge nor the names of
+//   its contributors may be used to endorse or promote products derived
+//   from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 // clang-format off
 #ifndef FAST_H
 #define FAST_H
diff --git a/media/libaom/src/third_party/fastfeat/fast_9.c b/media/libaom/src/third_party/fastfeat/fast_9.c
index 61c654c472..c0fdbe26cd 100644
--- a/media/libaom/src/third_party/fastfeat/fast_9.c
+++ b/media/libaom/src/third_party/fastfeat/fast_9.c
@@ -1,3 +1,33 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  *Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+//  *Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+//  *Neither the name of the University of Cambridge nor the names of
+//   its contributors may be used to endorse or promote products derived
+//   from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 // clang-format off
 /*This is mechanically generated code*/
 #include <stdlib.h>
diff --git a/media/libaom/src/third_party/fastfeat/nonmax.c b/media/libaom/src/third_party/fastfeat/nonmax.c
index 0dbc660cb0..2e048e5460 100644
--- a/media/libaom/src/third_party/fastfeat/nonmax.c
+++ b/media/libaom/src/third_party/fastfeat/nonmax.c
@@ -1,3 +1,33 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  *Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+//  *Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+//  *Neither the name of the University of Cambridge nor the names of
+//   its contributors may be used to endorse or promote products derived
+//   from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 // clang-format off
 #include <stdlib.h>
 #include "fast.h"
diff --git a/media/libaom/src/third_party/googletest/README.libaom b/media/libaom/src/third_party/googletest/README.libaom
index 9b8a863980..a461f36724 100644
--- a/media/libaom/src/third_party/googletest/README.libaom
+++ b/media/libaom/src/third_party/googletest/README.libaom
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest
-Version: 1.10.x
+Version: release-1.11.0
 License: BSD
 License File: LICENSE
 
@@ -12,6 +12,20 @@ failures, various options for running the tests, and XML test report
 generation.
 
 Local Modifications:
-- Replace everything in:
-  third_party/googletest/src/googletest/src/
-  third_party/googletest/src/googletest/include/
+- Remove everything but:
+  CMakeLists.txt
+  CONTRIBUTORS
+  googlemock/
+   cmake
+   CMakeLists.txt
+   include
+   README.md
+   src
+  googletest/
+   cmake
+   CMakeLists.txt
+   include
+   README.md
+   src
+  LICENSE
+  README.md
diff --git a/media/libaom/src/third_party/googletest/src/CMakeLists.txt b/media/libaom/src/third_party/googletest/src/CMakeLists.txt
new file mode 100644
index 0000000000..ea81ab1292
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+
+cmake_minimum_required(VERSION 2.8.12)
+
+if (POLICY CMP0048)
+  cmake_policy(SET CMP0048 NEW)
+endif (POLICY CMP0048)
+
+project(googletest-distribution)
+set(GOOGLETEST_VERSION 1.11.0)
+
+if (CMAKE_VERSION VERSION_GREATER "3.0.2")
+  if(NOT CYGWIN AND NOT MSYS AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL QNX)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+  endif()
+endif()
+
+enable_testing()
+
+include(CMakeDependentOption)
+include(GNUInstallDirs)
+
+#Note that googlemock target already builds googletest
+option(BUILD_GMOCK "Builds the googlemock subproject" ON)
+option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" ON)
+
+if(BUILD_GMOCK)
+  add_subdirectory( googlemock )
+else()
+  add_subdirectory( googletest )
+endif()
diff --git a/media/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS b/media/libaom/src/third_party/googletest/src/CONTRIBUTORS
index feae2fc044..76db0b40ff 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS
+++ b/media/libaom/src/third_party/googletest/src/CONTRIBUTORS
@@ -5,33 +5,59 @@
 
 Ajay Joshi <jaj@google.com>
 Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
 Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
 Chandler Carruth <chandlerc@google.com>
 Chris Prince <cprince@google.com>
 Chris Taylor <taylorc@google.com>
 Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
 Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
 Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
 Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
 Jói Sigurðsson <joi@google.com>
 Keir Mierle <mierle@gmail.com>
 Keith Ray <keith.ray@gmail.com>
 Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
 Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
 Markus Heule <markus.heule@gmail.com>
+Matthew Simmons <simmonmt@acm.org>
 Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
 Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
 Pasi Valminen <pasi.valminen@gmail.com>
 Patrick Hanna <phanna@google.com>
 Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
 Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
 Preston Jackson <preston.a.jackson@gmail.com>
 Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
 Russ Cox <rsc@google.com>
 Russ Rufer <russ@pentad.com>
 Sean Mcafee <eefacm@gmail.com>
 Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
 Tracy Bialik <tracy@pentad.com>
 Vadim Berman <vadimb@google.com>
 Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
 Zhanyong Wan <wan@google.com>
diff --git a/media/libaom/src/third_party/googletest/src/googletest/LICENSE b/media/libaom/src/third_party/googletest/src/LICENSE
index 1941a11f8c..1941a11f8c 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/LICENSE
+++ b/media/libaom/src/third_party/googletest/src/LICENSE
diff --git a/media/libaom/src/third_party/googletest/src/README.md b/media/libaom/src/third_party/googletest/src/README.md
new file mode 100644
index 0000000000..7d872a57ed
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/README.md
@@ -0,0 +1,140 @@
+# GoogleTest
+
+### Announcements
+
+#### Live at Head
+
+GoogleTest now follows the
+[Abseil Live at Head philosophy](https://abseil.io/about/philosophy#upgrade-support).
+We recommend using the latest commit in the `master` branch in your projects.
+
+#### Documentation Updates
+
+Our documentation is now live on GitHub Pages at
+https://google.github.io/googletest/. We recommend browsing the documentation on
+GitHub Pages rather than directly in the repository.
+
+#### Release 1.10.x
+
+[Release 1.10.x](https://github.com/google/googletest/releases/tag/release-1.10.0)
+is now available.
+
+#### Coming Soon
+
+*   We are planning to take a dependency on
+    [Abseil](https://github.com/abseil/abseil-cpp).
+*   More documentation improvements are planned.
+
+## Welcome to **GoogleTest**, Google's C++ test framework!
+
+This repository is a merger of the formerly separate GoogleTest and GoogleMock
+projects. These were so closely related that it makes sense to maintain and
+release them together.
+
+### Getting Started
+
+See the [GoogleTest User's Guide](https://google.github.io/googletest/) for
+documentation. We recommend starting with the
+[GoogleTest Primer](https://google.github.io/googletest/primer.html).
+
+More information about building GoogleTest can be found at
+[googletest/README.md](googletest/README.md).
+
+## Features
+
+*   An [xUnit](https://en.wikipedia.org/wiki/XUnit) test framework.
+*   Test discovery.
+*   A rich set of assertions.
+*   User-defined assertions.
+*   Death tests.
+*   Fatal and non-fatal failures.
+*   Value-parameterized tests.
+*   Type-parameterized tests.
+*   Various options for running the tests.
+*   XML test report generation.
+
+## Supported Platforms
+
+GoogleTest requires a codebase and compiler compliant with the C++11 standard or
+newer.
+
+The GoogleTest code is officially supported on the following platforms.
+Operating systems or tools not listed below are community-supported. For
+community-supported platforms, patches that do not complicate the code may be
+considered.
+
+If you notice any problems on your platform, please file an issue on the
+[GoogleTest GitHub Issue Tracker](https://github.com/google/googletest/issues).
+Pull requests containing fixes are welcome!
+
+### Operating Systems
+
+*   Linux
+*   macOS
+*   Windows
+
+### Compilers
+
+*   gcc 5.0+
+*   clang 5.0+
+*   MSVC 2015+
+
+**macOS users:** Xcode 9.3+ provides clang 5.0+.
+
+### Build Systems
+
+*   [Bazel](https://bazel.build/)
+*   [CMake](https://cmake.org/)
+
+**Note:** Bazel is the build system used by the team internally and in tests.
+CMake is supported on a best-effort basis and by the community.
+
+## Who Is Using GoogleTest?
+
+In addition to many internal projects at Google, GoogleTest is also used by the
+following notable projects:
+
+*   The [Chromium projects](http://www.chromium.org/) (behind the Chrome browser
+    and Chrome OS).
+*   The [LLVM](http://llvm.org/) compiler.
+*   [Protocol Buffers](https://github.com/google/protobuf), Google's data
+    interchange format.
+*   The [OpenCV](http://opencv.org/) computer vision library.
+
+## Related Open Source Projects
+
+[GTest Runner](https://github.com/nholthaus/gtest-runner) is a Qt5 based
+automated test-runner and Graphical User Interface with powerful features for
+Windows and Linux platforms.
+
+[GoogleTest UI](https://github.com/ospector/gtest-gbar) is a test runner that
+runs your test binary, allows you to track its progress via a progress bar, and
+displays a list of test failures. Clicking on one shows failure text. Google
+Test UI is written in C#.
+
+[GTest TAP Listener](https://github.com/kinow/gtest-tap-listener) is an event
+listener for GoogleTest that implements the
+[TAP protocol](https://en.wikipedia.org/wiki/Test_Anything_Protocol) for test
+result output. If your test runner understands TAP, you may find it useful.
+
+[gtest-parallel](https://github.com/google/gtest-parallel) is a test runner that
+runs tests from your binary in parallel to provide significant speed-up.
+
+[GoogleTest Adapter](https://marketplace.visualstudio.com/items?itemName=DavidSchuldenfrei.gtest-adapter)
+is a VS Code extension allowing to view GoogleTest in a tree view, and run/debug
+your tests.
+
+[C++ TestMate](https://github.com/matepek/vscode-catch2-test-adapter) is a VS
+Code extension allowing to view GoogleTest in a tree view, and run/debug your
+tests.
+
+[Cornichon](https://pypi.org/project/cornichon/) is a small Gherkin DSL parser
+that generates stub code for GoogleTest.
+
+## Contributing Changes
+
+Please read
+[`CONTRIBUTING.md`](https://github.com/google/googletest/blob/master/CONTRIBUTING.md)
+for details on how to contribute to this project.
+
+Happy testing!
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/CMakeLists.txt b/media/libaom/src/third_party/googletest/src/googlemock/CMakeLists.txt
new file mode 100644
index 0000000000..e7df8ec53d
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/CMakeLists.txt
@@ -0,0 +1,218 @@
+########################################################################
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+#
+# CMake build script for Google Mock.
+#
+# To run the tests for Google Mock itself on Linux, use 'make test' or
+# ctest.  You can select which tests to run using 'ctest -R regex'.
+# For more options, run 'ctest --help'.
+
+option(gmock_build_tests "Build all of Google Mock's own tests." OFF)
+
+# A directory to find Google Test sources.
+if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/gtest/CMakeLists.txt")
+  set(gtest_dir gtest)
+else()
+  set(gtest_dir ../googletest)
+endif()
+
+# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build().
+include("${gtest_dir}/cmake/hermetic_build.cmake" OPTIONAL)
+
+if (COMMAND pre_project_set_up_hermetic_build)
+  # Google Test also calls hermetic setup functions from add_subdirectory,
+  # although its changes will not affect things at the current scope.
+  pre_project_set_up_hermetic_build()
+endif()
+
+########################################################################
+#
+# Project-wide settings
+
+# Name of the project.
+#
+# CMake files in this project can refer to the root source directory
+# as ${gmock_SOURCE_DIR} and to the root binary directory as
+# ${gmock_BINARY_DIR}.
+# Language "C" is required for find_package(Threads).
+if (CMAKE_VERSION VERSION_LESS 3.0)
+  project(gmock CXX C)
+else()
+  cmake_policy(SET CMP0048 NEW)
+  project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
+endif()
+cmake_minimum_required(VERSION 2.8.12)
+
+if (COMMAND set_up_hermetic_build)
+  set_up_hermetic_build()
+endif()
+
+# Instructs CMake to process Google Test's CMakeLists.txt and add its
+# targets to the current scope.  We are placing Google Test's binary
+# directory in a subdirectory of our own as VC compilation may break
+# if they are the same (the default).
+add_subdirectory("${gtest_dir}" "${gmock_BINARY_DIR}/${gtest_dir}")
+
+
+# These commands only run if this is the main project
+if(CMAKE_PROJECT_NAME STREQUAL "gmock" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution")
+  # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
+  # make it prominent in the GUI.
+  option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
+else()
+  mark_as_advanced(gmock_build_tests)
+endif()
+
+# Although Google Test's CMakeLists.txt calls this function, the
+# changes there don't affect the current scope.  Therefore we have to
+# call it again here.
+config_compiler_and_linker()  # from ${gtest_dir}/cmake/internal_utils.cmake
+
+# Adds Google Mock's and Google Test's header directories to the search path.
+set(gmock_build_include_dirs
+  "${gmock_SOURCE_DIR}/include"
+  "${gmock_SOURCE_DIR}"
+  "${gtest_SOURCE_DIR}/include"
+  # This directory is needed to build directly from Google Test sources.
+  "${gtest_SOURCE_DIR}")
+include_directories(${gmock_build_include_dirs})
+
+########################################################################
+#
+# Defines the gmock & gmock_main libraries.  User tests should link
+# with one of them.
+
+# Google Mock libraries.  We build them using more strict warnings than what
+# are used for other targets, to ensure that Google Mock can be compiled by
+# a user aggressive about warnings.
+if (MSVC)
+  cxx_library(gmock
+              "${cxx_strict}"
+              "${gtest_dir}/src/gtest-all.cc"
+              src/gmock-all.cc)
+
+  cxx_library(gmock_main
+              "${cxx_strict}"
+              "${gtest_dir}/src/gtest-all.cc"
+              src/gmock-all.cc
+              src/gmock_main.cc)
+else()
+  cxx_library(gmock "${cxx_strict}" src/gmock-all.cc)
+  target_link_libraries(gmock PUBLIC gtest)
+  set_target_properties(gmock PROPERTIES VERSION ${GOOGLETEST_VERSION})
+  cxx_library(gmock_main "${cxx_strict}" src/gmock_main.cc)
+  target_link_libraries(gmock_main PUBLIC gmock)
+  set_target_properties(gmock_main PROPERTIES VERSION ${GOOGLETEST_VERSION})
+endif()
+# If the CMake version supports it, attach header directory information
+# to the targets for when we are part of a parent build (ie being pulled
+# in via add_subdirectory() rather than being a standalone build).
+if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+  target_include_directories(gmock SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${gmock_build_include_dirs}>"
+    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+  target_include_directories(gmock_main SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${gmock_build_include_dirs}>"
+    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+endif()
+
+########################################################################
+#
+# Install rules
+install_project(gmock gmock_main)
+
+########################################################################
+#
+# Google Mock's own tests.
+#
+# You can skip this section if you aren't interested in testing
+# Google Mock itself.
+#
+# The tests are not built by default.  To build them, set the
+# gmock_build_tests option to ON.  You can do it by running ccmake
+# or specifying the -Dgmock_build_tests=ON flag when running cmake.
+
+if (gmock_build_tests)
+  # This must be set in the root directory for the tests to be run by
+  # 'make test' or ctest.
+  enable_testing()
+
+  if (MINGW OR CYGWIN)
+    if (CMAKE_VERSION VERSION_LESS "2.8.12")
+      add_compile_options("-Wa,-mbig-obj")
+    else()
+      add_definitions("-Wa,-mbig-obj")
+    endif()
+  endif()
+
+  ############################################################
+  # C++ tests built with standard compiler flags.
+
+  cxx_test(gmock-actions_test gmock_main)
+  cxx_test(gmock-cardinalities_test gmock_main)
+  cxx_test(gmock_ex_test gmock_main)
+  cxx_test(gmock-function-mocker_test gmock_main)
+  cxx_test(gmock-internal-utils_test gmock_main)
+  cxx_test(gmock-matchers_test gmock_main)
+  cxx_test(gmock-more-actions_test gmock_main)
+  cxx_test(gmock-nice-strict_test gmock_main)
+  cxx_test(gmock-port_test gmock_main)
+  cxx_test(gmock-spec-builders_test gmock_main)
+  cxx_test(gmock_link_test gmock_main test/gmock_link2_test.cc)
+  cxx_test(gmock_test gmock_main)
+
+  if (DEFINED GTEST_HAS_PTHREAD)
+    cxx_test(gmock_stress_test gmock)
+  endif()
+
+  # gmock_all_test is commented to save time building and running tests.
+  # Uncomment if necessary.
+  # cxx_test(gmock_all_test gmock_main)
+
+  ############################################################
+  # C++ tests built with non-standard compiler flags.
+
+  if (MSVC)
+    cxx_library(gmock_main_no_exception "${cxx_no_exception}"
+      "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+    cxx_library(gmock_main_no_rtti "${cxx_no_rtti}"
+      "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+  else()
+    cxx_library(gmock_main_no_exception "${cxx_no_exception}" src/gmock_main.cc)
+    target_link_libraries(gmock_main_no_exception PUBLIC gmock)
+
+    cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" src/gmock_main.cc)
+    target_link_libraries(gmock_main_no_rtti PUBLIC gmock)
+  endif()
+  cxx_test_with_flags(gmock-more-actions_no_exception_test "${cxx_no_exception}"
+    gmock_main_no_exception test/gmock-more-actions_test.cc)
+
+  cxx_test_with_flags(gmock_no_rtti_test "${cxx_no_rtti}"
+    gmock_main_no_rtti test/gmock-spec-builders_test.cc)
+
+  cxx_shared_library(shared_gmock_main "${cxx_default}"
+    "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+  # Tests that a binary can be built with Google Mock as a shared library.  On
+  # some system configurations, it may not possible to run the binary without
+  # knowing more details about the system configurations. We do not try to run
+  # this binary. To get a more robust shared library coverage, configure with
+  # -DBUILD_SHARED_LIBS=ON.
+  cxx_executable_with_flags(shared_gmock_test_ "${cxx_default}"
+    shared_gmock_main test/gmock-spec-builders_test.cc)
+  set_target_properties(shared_gmock_test_
+    PROPERTIES
+    COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+
+  ############################################################
+  # Python tests.
+
+  cxx_executable(gmock_leak_test_ test gmock_main)
+  py_test(gmock_leak_test)
+
+  cxx_executable(gmock_output_test_ test gmock)
+  py_test(gmock_output_test)
+endif()
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/README.md b/media/libaom/src/third_party/googletest/src/googlemock/README.md
new file mode 100644
index 0000000000..ead688325d
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/README.md
@@ -0,0 +1,44 @@
+# Googletest Mocking (gMock) Framework
+
+### Overview
+
+Google's framework for writing and using C++ mock classes. It can help you
+derive better designs of your system and write better tests.
+
+It is inspired by:
+
+*   [jMock](http://www.jmock.org/)
+*   [EasyMock](http://www.easymock.org/)
+*   [Hamcrest](http://code.google.com/p/hamcrest/)
+
+It is designed with C++'s specifics in mind.
+
+gMock:
+
+-   Provides a declarative syntax for defining mocks.
+-   Can define partial (hybrid) mocks, which are a cross of real and mock
+    objects.
+-   Handles functions of arbitrary types and overloaded functions.
+-   Comes with a rich set of matchers for validating function arguments.
+-   Uses an intuitive syntax for controlling the behavior of a mock.
+-   Does automatic verification of expectations (no record-and-replay needed).
+-   Allows arbitrary (partial) ordering constraints on function calls to be
+    expressed.
+-   Lets a user extend it by defining new matchers and actions.
+-   Does not use exceptions.
+-   Is easy to learn and use.
+
+Details and examples can be found here:
+
+*   [gMock for Dummies](https://google.github.io/googletest/gmock_for_dummies.html)
+*   [Legacy gMock FAQ](https://google.github.io/googletest/gmock_faq.html)
+*   [gMock Cookbook](https://google.github.io/googletest/gmock_cook_book.html)
+*   [gMock Cheat Sheet](https://google.github.io/googletest/gmock_cheat_sheet.html)
+
+Please note that code under scripts/generator/ is from the
+[cppclean project](http://code.google.com/p/cppclean/) and under the Apache
+License, which is different from GoogleMock's license.
+
+GoogleMock is a part of
+[GoogleTest C++ testing framework](http://github.com/google/googletest/) and a
+subject to the same requirements.
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock.pc.in b/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock.pc.in
new file mode 100644
index 0000000000..23c67b5c88
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gmock
+Description: GoogleMock (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest = @PROJECT_VERSION@
+Libs: -L${libdir} -lgmock @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in b/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in
new file mode 100644
index 0000000000..66ffea7f44
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gmock_main
+Description: GoogleMock (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gmock = @PROJECT_VERSION@
+Libs: -L${libdir} -lgmock_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
new file mode 100644
index 0000000000..f2393bd3af
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
@@ -0,0 +1,1687 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// The ACTION* family of macros can be used in a namespace scope to
+// define custom actions easily.  The syntax:
+//
+//   ACTION(name) { statements; }
+//
+// will define an action with the given name that executes the
+// statements.  The value returned by the statements will be used as
+// the return value of the action.  Inside the statements, you can
+// refer to the K-th (0-based) argument of the mock function by
+// 'argK', and refer to its type by 'argK_type'.  For example:
+//
+//   ACTION(IncrementArg1) {
+//     arg1_type temp = arg1;
+//     return ++(*temp);
+//   }
+//
+// allows you to write
+//
+//   ...WillOnce(IncrementArg1());
+//
+// You can also refer to the entire argument tuple and its type by
+// 'args' and 'args_type', and refer to the mock function type and its
+// return type by 'function_type' and 'return_type'.
+//
+// Note that you don't need to specify the types of the mock function
+// arguments.  However rest assured that your code is still type-safe:
+// you'll get a compiler error if *arg1 doesn't support the ++
+// operator, or if the type of ++(*arg1) isn't compatible with the
+// mock function's return type, for example.
+//
+// Sometimes you'll want to parameterize the action.   For that you can use
+// another macro:
+//
+//   ACTION_P(name, param_name) { statements; }
+//
+// For example:
+//
+//   ACTION_P(Add, n) { return arg0 + n; }
+//
+// will allow you to write:
+//
+//   ...WillOnce(Add(5));
+//
+// Note that you don't need to provide the type of the parameter
+// either.  If you need to reference the type of a parameter named
+// 'foo', you can write 'foo_type'.  For example, in the body of
+// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type
+// of 'n'.
+//
+// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support
+// multi-parameter actions.
+//
+// For the purpose of typing, you can view
+//
+//   ACTION_Pk(Foo, p1, ..., pk) { ... }
+//
+// as shorthand for
+//
+//   template <typename p1_type, ..., typename pk_type>
+//   FooActionPk<p1_type, ..., pk_type> Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// In particular, you can provide the template type arguments
+// explicitly when invoking Foo(), as in Foo<long, bool>(5, false);
+// although usually you can rely on the compiler to infer the types
+// for you automatically.  You can assign the result of expression
+// Foo(p1, ..., pk) to a variable of type FooActionPk<p1_type, ...,
+// pk_type>.  This can be useful when composing actions.
+//
+// You can also overload actions with different numbers of parameters:
+//
+//   ACTION_P(Plus, a) { ... }
+//   ACTION_P2(Plus, a, b) { ... }
+//
+// While it's tempting to always use the ACTION* macros when defining
+// a new action, you should also consider implementing ActionInterface
+// or using MakePolymorphicAction() instead, especially if you need to
+// use the action a lot.  While these approaches require more work,
+// they give you more control on the types of the mock function
+// arguments and the action parameters, which in general leads to
+// better compiler error messages that pay off in the long run.  They
+// also allow overloading actions based on parameter types (as opposed
+// to just based on the number of parameters).
+//
+// CAVEAT:
+//
+// ACTION*() can only be used in a namespace scope as templates cannot be
+// declared inside of a local class.
+// Users can, however, define any local functors (e.g. a lambda) that
+// can be used as actions.
+//
+// MORE INFORMATION:
+//
+// To learn more about using these macros, please search for 'ACTION' on
+// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gmock/internal/gmock-pp.h"
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4100)
+#endif
+
+namespace testing {
+
+// To implement an action Foo, define:
+//   1. a class FooAction that implements the ActionInterface interface, and
+//   2. a factory function that creates an Action object from a
+//      const FooAction*.
+//
+// The two-level delegation design follows that of Matcher, providing
+// consistency for extension developers.  It also eases ownership
+// management as Action objects can now be copied like plain values.
+
+namespace internal {
+
+// BuiltInDefaultValueGetter<T, true>::Get() returns a
+// default-constructed T value.  BuiltInDefaultValueGetter<T,
+// false>::Get() crashes with an error.
+//
+// This primary template is used when kDefaultConstructible is true.
+template <typename T, bool kDefaultConstructible>
+struct BuiltInDefaultValueGetter {
+  static T Get() { return T(); }
+};
+template <typename T>
+struct BuiltInDefaultValueGetter<T, false> {
+  static T Get() {
+    Assert(false, __FILE__, __LINE__,
+           "Default action undefined for the function return type.");
+    return internal::Invalid<T>();
+    // The above statement will never be reached, but is required in
+    // order for this function to compile.
+  }
+};
+
+// BuiltInDefaultValue<T>::Get() returns the "built-in" default value
+// for type T, which is NULL when T is a raw pointer type, 0 when T is
+// a numeric type, false when T is bool, or "" when T is string or
+// std::string.  In addition, in C++11 and above, it turns a
+// default-constructed T value if T is default constructible.  For any
+// other type T, the built-in default T value is undefined, and the
+// function will abort the process.
+template <typename T>
+class BuiltInDefaultValue {
+ public:
+  // This function returns true if and only if type T has a built-in default
+  // value.
+  static bool Exists() {
+    return ::std::is_default_constructible<T>::value;
+  }
+
+  static T Get() {
+    return BuiltInDefaultValueGetter<
+        T, ::std::is_default_constructible<T>::value>::Get();
+  }
+};
+
+// This partial specialization says that we use the same built-in
+// default value for T and const T.
+template <typename T>
+class BuiltInDefaultValue<const T> {
+ public:
+  static bool Exists() { return BuiltInDefaultValue<T>::Exists(); }
+  static T Get() { return BuiltInDefaultValue<T>::Get(); }
+};
+
+// This partial specialization defines the default values for pointer
+// types.
+template <typename T>
+class BuiltInDefaultValue<T*> {
+ public:
+  static bool Exists() { return true; }
+  static T* Get() { return nullptr; }
+};
+
+// The following specializations define the default values for
+// specific types we care about.
+#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \
+  template <> \
+  class BuiltInDefaultValue<type> { \
+   public: \
+    static bool Exists() { return true; } \
+    static type Get() { return value; } \
+  }
+
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, );  // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, "");
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0');
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0');
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0');
+
+// There's no need for a default action for signed wchar_t, as that
+// type is the same as wchar_t for gcc, and invalid for MSVC.
+//
+// There's also no need for a default action for unsigned wchar_t, as
+// that type is the same as unsigned int for gcc, and invalid for
+// MSVC.
+#if GMOCK_WCHAR_T_IS_NATIVE_
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U);  // NOLINT
+#endif
+
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U);  // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0);     // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL);  // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L);     // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0);  // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0);  // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0);
+
+#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_
+
+// Simple two-arg form of std::disjunction.
+template <typename P, typename Q>
+using disjunction = typename ::std::conditional<P::value, P, Q>::type;
+
+}  // namespace internal
+
+// When an unexpected function call is encountered, Google Mock will
+// let it return a default value if the user has specified one for its
+// return type, or if the return type has a built-in default value;
+// otherwise Google Mock won't know what value to return and will have
+// to abort the process.
+//
+// The DefaultValue<T> class allows a user to specify the
+// default value for a type T that is both copyable and publicly
+// destructible (i.e. anything that can be used as a function return
+// type).  The usage is:
+//
+//   // Sets the default value for type T to be foo.
+//   DefaultValue<T>::Set(foo);
+template <typename T>
+class DefaultValue {
+ public:
+  // Sets the default value for type T; requires T to be
+  // copy-constructable and have a public destructor.
+  static void Set(T x) {
+    delete producer_;
+    producer_ = new FixedValueProducer(x);
+  }
+
+  // Provides a factory function to be called to generate the default value.
+  // This method can be used even if T is only move-constructible, but it is not
+  // limited to that case.
+  typedef T (*FactoryFunction)();
+  static void SetFactory(FactoryFunction factory) {
+    delete producer_;
+    producer_ = new FactoryValueProducer(factory);
+  }
+
+  // Unsets the default value for type T.
+  static void Clear() {
+    delete producer_;
+    producer_ = nullptr;
+  }
+
+  // Returns true if and only if the user has set the default value for type T.
+  static bool IsSet() { return producer_ != nullptr; }
+
+  // Returns true if T has a default return value set by the user or there
+  // exists a built-in default value.
+  static bool Exists() {
+    return IsSet() || internal::BuiltInDefaultValue<T>::Exists();
+  }
+
+  // Returns the default value for type T if the user has set one;
+  // otherwise returns the built-in default value. Requires that Exists()
+  // is true, which ensures that the return value is well-defined.
+  static T Get() {
+    return producer_ == nullptr ? internal::BuiltInDefaultValue<T>::Get()
+                                : producer_->Produce();
+  }
+
+ private:
+  class ValueProducer {
+   public:
+    virtual ~ValueProducer() {}
+    virtual T Produce() = 0;
+  };
+
+  class FixedValueProducer : public ValueProducer {
+   public:
+    explicit FixedValueProducer(T value) : value_(value) {}
+    T Produce() override { return value_; }
+
+   private:
+    const T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(FixedValueProducer);
+  };
+
+  class FactoryValueProducer : public ValueProducer {
+   public:
+    explicit FactoryValueProducer(FactoryFunction factory)
+        : factory_(factory) {}
+    T Produce() override { return factory_(); }
+
+   private:
+    const FactoryFunction factory_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(FactoryValueProducer);
+  };
+
+  static ValueProducer* producer_;
+};
+
+// This partial specialization allows a user to set default values for
+// reference types.
+template <typename T>
+class DefaultValue<T&> {
+ public:
+  // Sets the default value for type T&.
+  static void Set(T& x) {  // NOLINT
+    address_ = &x;
+  }
+
+  // Unsets the default value for type T&.
+  static void Clear() { address_ = nullptr; }
+
+  // Returns true if and only if the user has set the default value for type T&.
+  static bool IsSet() { return address_ != nullptr; }
+
+  // Returns true if T has a default return value set by the user or there
+  // exists a built-in default value.
+  static bool Exists() {
+    return IsSet() || internal::BuiltInDefaultValue<T&>::Exists();
+  }
+
+  // Returns the default value for type T& if the user has set one;
+  // otherwise returns the built-in default value if there is one;
+  // otherwise aborts the process.
+  static T& Get() {
+    return address_ == nullptr ? internal::BuiltInDefaultValue<T&>::Get()
+                               : *address_;
+  }
+
+ private:
+  static T* address_;
+};
+
+// This specialization allows DefaultValue<void>::Get() to
+// compile.
+template <>
+class DefaultValue<void> {
+ public:
+  static bool Exists() { return true; }
+  static void Get() {}
+};
+
+// Points to the user-set default value for type T.
+template <typename T>
+typename DefaultValue<T>::ValueProducer* DefaultValue<T>::producer_ = nullptr;
+
+// Points to the user-set default value for type T&.
+template <typename T>
+T* DefaultValue<T&>::address_ = nullptr;
+
+// Implement this interface to define an action for function type F.
+template <typename F>
+class ActionInterface {
+ public:
+  typedef typename internal::Function<F>::Result Result;
+  typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+  ActionInterface() {}
+  virtual ~ActionInterface() {}
+
+  // Performs the action.  This method is not const, as in general an
+  // action can have side effects and be stateful.  For example, a
+  // get-the-next-element-from-the-collection action will need to
+  // remember the current element.
+  virtual Result Perform(const ArgumentTuple& args) = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionInterface);
+};
+
+// An Action<F> is a copyable and IMMUTABLE (except by assignment)
+// object that represents an action to be taken when a mock function
+// of type F is called.  The implementation of Action<T> is just a
+// std::shared_ptr to const ActionInterface<T>. Don't inherit from Action!
+// You can view an object implementing ActionInterface<F> as a
+// concrete action (including its current state), and an Action<F>
+// object as a handle to it.
+template <typename F>
+class Action {
+  // Adapter class to allow constructing Action from a legacy ActionInterface.
+  // New code should create Actions from functors instead.
+  struct ActionAdapter {
+    // Adapter must be copyable to satisfy std::function requirements.
+    ::std::shared_ptr<ActionInterface<F>> impl_;
+
+    template <typename... Args>
+    typename internal::Function<F>::Result operator()(Args&&... args) {
+      return impl_->Perform(
+          ::std::forward_as_tuple(::std::forward<Args>(args)...));
+    }
+  };
+
+  template <typename G>
+  using IsCompatibleFunctor = std::is_constructible<std::function<F>, G>;
+
+ public:
+  typedef typename internal::Function<F>::Result Result;
+  typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+  // Constructs a null Action.  Needed for storing Action objects in
+  // STL containers.
+  Action() {}
+
+  // Construct an Action from a specified callable.
+  // This cannot take std::function directly, because then Action would not be
+  // directly constructible from lambda (it would require two conversions).
+  template <
+      typename G,
+      typename = typename std::enable_if<internal::disjunction<
+          IsCompatibleFunctor<G>, std::is_constructible<std::function<Result()>,
+                                                        G>>::value>::type>
+  Action(G&& fun) {  // NOLINT
+    Init(::std::forward<G>(fun), IsCompatibleFunctor<G>());
+  }
+
+  // Constructs an Action from its implementation.
+  explicit Action(ActionInterface<F>* impl)
+      : fun_(ActionAdapter{::std::shared_ptr<ActionInterface<F>>(impl)}) {}
+
+  // This constructor allows us to turn an Action<Func> object into an
+  // Action<F>, as long as F's arguments can be implicitly converted
+  // to Func's and Func's return type can be implicitly converted to F's.
+  template <typename Func>
+  explicit Action(const Action<Func>& action) : fun_(action.fun_) {}
+
+  // Returns true if and only if this is the DoDefault() action.
+  bool IsDoDefault() const { return fun_ == nullptr; }
+
+  // Performs the action.  Note that this method is const even though
+  // the corresponding method in ActionInterface is not.  The reason
+  // is that a const Action<F> means that it cannot be re-bound to
+  // another concrete action, not that the concrete action it binds to
+  // cannot change state.  (Think of the difference between a const
+  // pointer and a pointer to const.)
+  Result Perform(ArgumentTuple args) const {
+    if (IsDoDefault()) {
+      internal::IllegalDoDefault(__FILE__, __LINE__);
+    }
+    return internal::Apply(fun_, ::std::move(args));
+  }
+
+ private:
+  template <typename G>
+  friend class Action;
+
+  template <typename G>
+  void Init(G&& g, ::std::true_type) {
+    fun_ = ::std::forward<G>(g);
+  }
+
+  template <typename G>
+  void Init(G&& g, ::std::false_type) {
+    fun_ = IgnoreArgs<typename ::std::decay<G>::type>{::std::forward<G>(g)};
+  }
+
+  template <typename FunctionImpl>
+  struct IgnoreArgs {
+    template <typename... Args>
+    Result operator()(const Args&...) const {
+      return function_impl();
+    }
+
+    FunctionImpl function_impl;
+  };
+
+  // fun_ is an empty function if and only if this is the DoDefault() action.
+  ::std::function<F> fun_;
+};
+
+// The PolymorphicAction class template makes it easy to implement a
+// polymorphic action (i.e. an action that can be used in mock
+// functions of than one type, e.g. Return()).
+//
+// To define a polymorphic action, a user first provides a COPYABLE
+// implementation class that has a Perform() method template:
+//
+//   class FooAction {
+//    public:
+//     template <typename Result, typename ArgumentTuple>
+//     Result Perform(const ArgumentTuple& args) const {
+//       // Processes the arguments and returns a result, using
+//       // std::get<N>(args) to get the N-th (0-based) argument in the tuple.
+//     }
+//     ...
+//   };
+//
+// Then the user creates the polymorphic action using
+// MakePolymorphicAction(object) where object has type FooAction.  See
+// the definition of Return(void) and SetArgumentPointee<N>(value) for
+// complete examples.
+template <typename Impl>
+class PolymorphicAction {
+ public:
+  explicit PolymorphicAction(const Impl& impl) : impl_(impl) {}
+
+  template <typename F>
+  operator Action<F>() const {
+    return Action<F>(new MonomorphicImpl<F>(impl_));
+  }
+
+ private:
+  template <typename F>
+  class MonomorphicImpl : public ActionInterface<F> {
+   public:
+    typedef typename internal::Function<F>::Result Result;
+    typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+    explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
+
+    Result Perform(const ArgumentTuple& args) override {
+      return impl_.template Perform<Result>(args);
+    }
+
+   private:
+    Impl impl_;
+  };
+
+  Impl impl_;
+};
+
+// Creates an Action from its implementation and returns it.  The
+// created Action object owns the implementation.
+template <typename F>
+Action<F> MakeAction(ActionInterface<F>* impl) {
+  return Action<F>(impl);
+}
+
+// Creates a polymorphic action from its implementation.  This is
+// easier to use than the PolymorphicAction<Impl> constructor as it
+// doesn't require you to explicitly write the template argument, e.g.
+//
+//   MakePolymorphicAction(foo);
+// vs
+//   PolymorphicAction<TypeOfFoo>(foo);
+template <typename Impl>
+inline PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl) {
+  return PolymorphicAction<Impl>(impl);
+}
+
+namespace internal {
+
+// Helper struct to specialize ReturnAction to execute a move instead of a copy
+// on return. Useful for move-only types, but could be used on any type.
+template <typename T>
+struct ByMoveWrapper {
+  explicit ByMoveWrapper(T value) : payload(std::move(value)) {}
+  T payload;
+};
+
+// Implements the polymorphic Return(x) action, which can be used in
+// any function that returns the type of x, regardless of the argument
+// types.
+//
+// Note: The value passed into Return must be converted into
+// Function<F>::Result when this action is cast to Action<F> rather than
+// when that action is performed. This is important in scenarios like
+//
+// MOCK_METHOD1(Method, T(U));
+// ...
+// {
+//   Foo foo;
+//   X x(&foo);
+//   EXPECT_CALL(mock, Method(_)).WillOnce(Return(x));
+// }
+//
+// In the example above the variable x holds reference to foo which leaves
+// scope and gets destroyed.  If copying X just copies a reference to foo,
+// that copy will be left with a hanging reference.  If conversion to T
+// makes a copy of foo, the above code is safe. To support that scenario, we
+// need to make sure that the type conversion happens inside the EXPECT_CALL
+// statement, and conversion of the result of Return to Action<T(U)> is a
+// good place for that.
+//
+// The real life example of the above scenario happens when an invocation
+// of gtl::Container() is passed into Return.
+//
+template <typename R>
+class ReturnAction {
+ public:
+  // Constructs a ReturnAction object from the value to be returned.
+  // 'value' is passed by value instead of by const reference in order
+  // to allow Return("string literal") to compile.
+  explicit ReturnAction(R value) : value_(new R(std::move(value))) {}
+
+  // This template type conversion operator allows Return(x) to be
+  // used in ANY function that returns x's type.
+  template <typename F>
+  operator Action<F>() const {  // NOLINT
+    // Assert statement belongs here because this is the best place to verify
+    // conditions on F. It produces the clearest error messages
+    // in most compilers.
+    // Impl really belongs in this scope as a local class but can't
+    // because MSVC produces duplicate symbols in different translation units
+    // in this case. Until MS fixes that bug we put Impl into the class scope
+    // and put the typedef both here (for use in assert statement) and
+    // in the Impl class. But both definitions must be the same.
+    typedef typename Function<F>::Result Result;
+    GTEST_COMPILE_ASSERT_(
+        !std::is_reference<Result>::value,
+        use_ReturnRef_instead_of_Return_to_return_a_reference);
+    static_assert(!std::is_void<Result>::value,
+                  "Can't use Return() on an action expected to return `void`.");
+    return Action<F>(new Impl<R, F>(value_));
+  }
+
+ private:
+  // Implements the Return(x) action for a particular function type F.
+  template <typename R_, typename F>
+  class Impl : public ActionInterface<F> {
+   public:
+    typedef typename Function<F>::Result Result;
+    typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+
+    // The implicit cast is necessary when Result has more than one
+    // single-argument constructor (e.g. Result is std::vector<int>) and R
+    // has a type conversion operator template.  In that case, value_(value)
+    // won't compile as the compiler doesn't known which constructor of
+    // Result to call.  ImplicitCast_ forces the compiler to convert R to
+    // Result without considering explicit constructors, thus resolving the
+    // ambiguity. value_ is then initialized using its copy constructor.
+    explicit Impl(const std::shared_ptr<R>& value)
+        : value_before_cast_(*value),
+          value_(ImplicitCast_<Result>(value_before_cast_)) {}
+
+    Result Perform(const ArgumentTuple&) override { return value_; }
+
+   private:
+    GTEST_COMPILE_ASSERT_(!std::is_reference<Result>::value,
+                          Result_cannot_be_a_reference_type);
+    // We save the value before casting just in case it is being cast to a
+    // wrapper type.
+    R value_before_cast_;
+    Result value_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl);
+  };
+
+  // Partially specialize for ByMoveWrapper. This version of ReturnAction will
+  // move its contents instead.
+  template <typename R_, typename F>
+  class Impl<ByMoveWrapper<R_>, F> : public ActionInterface<F> {
+   public:
+    typedef typename Function<F>::Result Result;
+    typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+
+    explicit Impl(const std::shared_ptr<R>& wrapper)
+        : performed_(false), wrapper_(wrapper) {}
+
+    Result Perform(const ArgumentTuple&) override {
+      GTEST_CHECK_(!performed_)
+          << "A ByMove() action should only be performed once.";
+      performed_ = true;
+      return std::move(wrapper_->payload);
+    }
+
+   private:
+    bool performed_;
+    const std::shared_ptr<R> wrapper_;
+  };
+
+  const std::shared_ptr<R> value_;
+};
+
+// Implements the ReturnNull() action.
+class ReturnNullAction {
+ public:
+  // Allows ReturnNull() to be used in any pointer-returning function. In C++11
+  // this is enforced by returning nullptr, and in non-C++11 by asserting a
+  // pointer type on compile time.
+  template <typename Result, typename ArgumentTuple>
+  static Result Perform(const ArgumentTuple&) {
+    return nullptr;
+  }
+};
+
+// Implements the Return() action.
+class ReturnVoidAction {
+ public:
+  // Allows Return() to be used in any void-returning function.
+  template <typename Result, typename ArgumentTuple>
+  static void Perform(const ArgumentTuple&) {
+    static_assert(std::is_void<Result>::value, "Result should be void.");
+  }
+};
+
+// Implements the polymorphic ReturnRef(x) action, which can be used
+// in any function that returns a reference to the type of x,
+// regardless of the argument types.
+template <typename T>
+class ReturnRefAction {
+ public:
+  // Constructs a ReturnRefAction object from the reference to be returned.
+  explicit ReturnRefAction(T& ref) : ref_(ref) {}  // NOLINT
+
+  // This template type conversion operator allows ReturnRef(x) to be
+  // used in ANY function that returns a reference to x's type.
+  template <typename F>
+  operator Action<F>() const {
+    typedef typename Function<F>::Result Result;
+    // Asserts that the function return type is a reference.  This
+    // catches the user error of using ReturnRef(x) when Return(x)
+    // should be used, and generates some helpful error message.
+    GTEST_COMPILE_ASSERT_(std::is_reference<Result>::value,
+                          use_Return_instead_of_ReturnRef_to_return_a_value);
+    return Action<F>(new Impl<F>(ref_));
+  }
+
+ private:
+  // Implements the ReturnRef(x) action for a particular function type F.
+  template <typename F>
+  class Impl : public ActionInterface<F> {
+   public:
+    typedef typename Function<F>::Result Result;
+    typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+
+    explicit Impl(T& ref) : ref_(ref) {}  // NOLINT
+
+    Result Perform(const ArgumentTuple&) override { return ref_; }
+
+   private:
+    T& ref_;
+  };
+
+  T& ref_;
+};
+
+// Implements the polymorphic ReturnRefOfCopy(x) action, which can be
+// used in any function that returns a reference to the type of x,
+// regardless of the argument types.
+template <typename T>
+class ReturnRefOfCopyAction {
+ public:
+  // Constructs a ReturnRefOfCopyAction object from the reference to
+  // be returned.
+  explicit ReturnRefOfCopyAction(const T& value) : value_(value) {}  // NOLINT
+
+  // This template type conversion operator allows ReturnRefOfCopy(x) to be
+  // used in ANY function that returns a reference to x's type.
+  template <typename F>
+  operator Action<F>() const {
+    typedef typename Function<F>::Result Result;
+    // Asserts that the function return type is a reference.  This
+    // catches the user error of using ReturnRefOfCopy(x) when Return(x)
+    // should be used, and generates some helpful error message.
+    GTEST_COMPILE_ASSERT_(
+        std::is_reference<Result>::value,
+        use_Return_instead_of_ReturnRefOfCopy_to_return_a_value);
+    return Action<F>(new Impl<F>(value_));
+  }
+
+ private:
+  // Implements the ReturnRefOfCopy(x) action for a particular function type F.
+  template <typename F>
+  class Impl : public ActionInterface<F> {
+   public:
+    typedef typename Function<F>::Result Result;
+    typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+
+    explicit Impl(const T& value) : value_(value) {}  // NOLINT
+
+    Result Perform(const ArgumentTuple&) override { return value_; }
+
+   private:
+    T value_;
+  };
+
+  const T value_;
+};
+
+// Implements the polymorphic ReturnRoundRobin(v) action, which can be
+// used in any function that returns the element_type of v.
+template <typename T>
+class ReturnRoundRobinAction {
+ public:
+  explicit ReturnRoundRobinAction(std::vector<T> values) {
+    GTEST_CHECK_(!values.empty())
+        << "ReturnRoundRobin requires at least one element.";
+    state_->values = std::move(values);
+  }
+
+  template <typename... Args>
+  T operator()(Args&&...) const {
+     return state_->Next();
+  }
+
+ private:
+  struct State {
+    T Next() {
+      T ret_val = values[i++];
+      if (i == values.size()) i = 0;
+      return ret_val;
+    }
+
+    std::vector<T> values;
+    size_t i = 0;
+  };
+  std::shared_ptr<State> state_ = std::make_shared<State>();
+};
+
+// Implements the polymorphic DoDefault() action.
+class DoDefaultAction {
+ public:
+  // This template type conversion operator allows DoDefault() to be
+  // used in any function.
+  template <typename F>
+  operator Action<F>() const { return Action<F>(); }  // NOLINT
+};
+
+// Implements the Assign action to set a given pointer referent to a
+// particular value.
+template <typename T1, typename T2>
+class AssignAction {
+ public:
+  AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {}
+
+  template <typename Result, typename ArgumentTuple>
+  void Perform(const ArgumentTuple& /* args */) const {
+    *ptr_ = value_;
+  }
+
+ private:
+  T1* const ptr_;
+  const T2 value_;
+};
+
+#if !GTEST_OS_WINDOWS_MOBILE
+
+// Implements the SetErrnoAndReturn action to simulate return from
+// various system calls and libc functions.
+template <typename T>
+class SetErrnoAndReturnAction {
+ public:
+  SetErrnoAndReturnAction(int errno_value, T result)
+      : errno_(errno_value),
+        result_(result) {}
+  template <typename Result, typename ArgumentTuple>
+  Result Perform(const ArgumentTuple& /* args */) const {
+    errno = errno_;
+    return result_;
+  }
+
+ private:
+  const int errno_;
+  const T result_;
+};
+
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+
+// Implements the SetArgumentPointee<N>(x) action for any function
+// whose N-th argument (0-based) is a pointer to x's type.
+template <size_t N, typename A, typename = void>
+struct SetArgumentPointeeAction {
+  A value;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    *::std::get<N>(std::tie(args...)) = value;
+  }
+};
+
+// Implements the Invoke(object_ptr, &Class::Method) action.
+template <class Class, typename MethodPtr>
+struct InvokeMethodAction {
+  Class* const obj_ptr;
+  const MethodPtr method_ptr;
+
+  template <typename... Args>
+  auto operator()(Args&&... args) const
+      -> decltype((obj_ptr->*method_ptr)(std::forward<Args>(args)...)) {
+    return (obj_ptr->*method_ptr)(std::forward<Args>(args)...);
+  }
+};
+
+// Implements the InvokeWithoutArgs(f) action.  The template argument
+// FunctionImpl is the implementation type of f, which can be either a
+// function pointer or a functor.  InvokeWithoutArgs(f) can be used as an
+// Action<F> as long as f's type is compatible with F.
+template <typename FunctionImpl>
+struct InvokeWithoutArgsAction {
+  FunctionImpl function_impl;
+
+  // Allows InvokeWithoutArgs(f) to be used as any action whose type is
+  // compatible with f.
+  template <typename... Args>
+  auto operator()(const Args&...) -> decltype(function_impl()) {
+    return function_impl();
+  }
+};
+
+// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action.
+template <class Class, typename MethodPtr>
+struct InvokeMethodWithoutArgsAction {
+  Class* const obj_ptr;
+  const MethodPtr method_ptr;
+
+  using ReturnType =
+      decltype((std::declval<Class*>()->*std::declval<MethodPtr>())());
+
+  template <typename... Args>
+  ReturnType operator()(const Args&...) const {
+    return (obj_ptr->*method_ptr)();
+  }
+};
+
+// Implements the IgnoreResult(action) action.
+template <typename A>
+class IgnoreResultAction {
+ public:
+  explicit IgnoreResultAction(const A& action) : action_(action) {}
+
+  template <typename F>
+  operator Action<F>() const {
+    // Assert statement belongs here because this is the best place to verify
+    // conditions on F. It produces the clearest error messages
+    // in most compilers.
+    // Impl really belongs in this scope as a local class but can't
+    // because MSVC produces duplicate symbols in different translation units
+    // in this case. Until MS fixes that bug we put Impl into the class scope
+    // and put the typedef both here (for use in assert statement) and
+    // in the Impl class. But both definitions must be the same.
+    typedef typename internal::Function<F>::Result Result;
+
+    // Asserts at compile time that F returns void.
+    static_assert(std::is_void<Result>::value, "Result type should be void.");
+
+    return Action<F>(new Impl<F>(action_));
+  }
+
+ private:
+  template <typename F>
+  class Impl : public ActionInterface<F> {
+   public:
+    typedef typename internal::Function<F>::Result Result;
+    typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+    explicit Impl(const A& action) : action_(action) {}
+
+    void Perform(const ArgumentTuple& args) override {
+      // Performs the action and ignores its result.
+      action_.Perform(args);
+    }
+
+   private:
+    // Type OriginalFunction is the same as F except that its return
+    // type is IgnoredValue.
+    typedef typename internal::Function<F>::MakeResultIgnoredValue
+        OriginalFunction;
+
+    const Action<OriginalFunction> action_;
+  };
+
+  const A action_;
+};
+
+template <typename InnerAction, size_t... I>
+struct WithArgsAction {
+  InnerAction action;
+
+  // The inner action could be anything convertible to Action<X>.
+  // We use the conversion operator to detect the signature of the inner Action.
+  template <typename R, typename... Args>
+  operator Action<R(Args...)>() const {  // NOLINT
+    using TupleType = std::tuple<Args...>;
+    Action<R(typename std::tuple_element<I, TupleType>::type...)>
+        converted(action);
+
+    return [converted](Args... args) -> R {
+      return converted.Perform(std::forward_as_tuple(
+        std::get<I>(std::forward_as_tuple(std::forward<Args>(args)...))...));
+    };
+  }
+};
+
+template <typename... Actions>
+struct DoAllAction {
+ private:
+  template <typename T>
+  using NonFinalType =
+      typename std::conditional<std::is_scalar<T>::value, T, const T&>::type;
+
+  template <typename ActionT, size_t... I>
+  std::vector<ActionT> Convert(IndexSequence<I...>) const {
+    return {ActionT(std::get<I>(actions))...};
+  }
+
+ public:
+  std::tuple<Actions...> actions;
+
+  template <typename R, typename... Args>
+  operator Action<R(Args...)>() const {  // NOLINT
+    struct Op {
+      std::vector<Action<void(NonFinalType<Args>...)>> converted;
+      Action<R(Args...)> last;
+      R operator()(Args... args) const {
+        auto tuple_args = std::forward_as_tuple(std::forward<Args>(args)...);
+        for (auto& a : converted) {
+          a.Perform(tuple_args);
+        }
+        return last.Perform(std::move(tuple_args));
+      }
+    };
+    return Op{Convert<Action<void(NonFinalType<Args>...)>>(
+                  MakeIndexSequence<sizeof...(Actions) - 1>()),
+              std::get<sizeof...(Actions) - 1>(actions)};
+  }
+};
+
+template <typename T, typename... Params>
+struct ReturnNewAction {
+  T* operator()() const {
+    return internal::Apply(
+        [](const Params&... unpacked_params) {
+          return new T(unpacked_params...);
+        },
+        params);
+  }
+  std::tuple<Params...> params;
+};
+
+template <size_t k>
+struct ReturnArgAction {
+  template <typename... Args>
+  auto operator()(const Args&... args) const ->
+      typename std::tuple_element<k, std::tuple<Args...>>::type {
+    return std::get<k>(std::tie(args...));
+  }
+};
+
+template <size_t k, typename Ptr>
+struct SaveArgAction {
+  Ptr pointer;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    *pointer = std::get<k>(std::tie(args...));
+  }
+};
+
+template <size_t k, typename Ptr>
+struct SaveArgPointeeAction {
+  Ptr pointer;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    *pointer = *std::get<k>(std::tie(args...));
+  }
+};
+
+template <size_t k, typename T>
+struct SetArgRefereeAction {
+  T value;
+
+  template <typename... Args>
+  void operator()(Args&&... args) const {
+    using argk_type =
+        typename ::std::tuple_element<k, std::tuple<Args...>>::type;
+    static_assert(std::is_lvalue_reference<argk_type>::value,
+                  "Argument must be a reference type.");
+    std::get<k>(std::tie(args...)) = value;
+  }
+};
+
+template <size_t k, typename I1, typename I2>
+struct SetArrayArgumentAction {
+  I1 first;
+  I2 last;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    auto value = std::get<k>(std::tie(args...));
+    for (auto it = first; it != last; ++it, (void)++value) {
+      *value = *it;
+    }
+  }
+};
+
+template <size_t k>
+struct DeleteArgAction {
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    delete std::get<k>(std::tie(args...));
+  }
+};
+
+template <typename Ptr>
+struct ReturnPointeeAction {
+  Ptr pointer;
+  template <typename... Args>
+  auto operator()(const Args&...) const -> decltype(*pointer) {
+    return *pointer;
+  }
+};
+
+#if GTEST_HAS_EXCEPTIONS
+template <typename T>
+struct ThrowAction {
+  T exception;
+  // We use a conversion operator to adapt to any return type.
+  template <typename R, typename... Args>
+  operator Action<R(Args...)>() const {  // NOLINT
+    T copy = exception;
+    return [copy](Args...) -> R { throw copy; };
+  }
+};
+#endif  // GTEST_HAS_EXCEPTIONS
+
+}  // namespace internal
+
+// An Unused object can be implicitly constructed from ANY value.
+// This is handy when defining actions that ignore some or all of the
+// mock function arguments.  For example, given
+//
+//   MOCK_METHOD3(Foo, double(const string& label, double x, double y));
+//   MOCK_METHOD3(Bar, double(int index, double x, double y));
+//
+// instead of
+//
+//   double DistanceToOriginWithLabel(const string& label, double x, double y) {
+//     return sqrt(x*x + y*y);
+//   }
+//   double DistanceToOriginWithIndex(int index, double x, double y) {
+//     return sqrt(x*x + y*y);
+//   }
+//   ...
+//   EXPECT_CALL(mock, Foo("abc", _, _))
+//       .WillOnce(Invoke(DistanceToOriginWithLabel));
+//   EXPECT_CALL(mock, Bar(5, _, _))
+//       .WillOnce(Invoke(DistanceToOriginWithIndex));
+//
+// you could write
+//
+//   // We can declare any uninteresting argument as Unused.
+//   double DistanceToOrigin(Unused, double x, double y) {
+//     return sqrt(x*x + y*y);
+//   }
+//   ...
+//   EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin));
+//   EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin));
+typedef internal::IgnoredValue Unused;
+
+// Creates an action that does actions a1, a2, ..., sequentially in
+// each invocation. All but the last action will have a readonly view of the
+// arguments.
+template <typename... Action>
+internal::DoAllAction<typename std::decay<Action>::type...> DoAll(
+    Action&&... action) {
+  return {std::forward_as_tuple(std::forward<Action>(action)...)};
+}
+
+// WithArg<k>(an_action) creates an action that passes the k-th
+// (0-based) argument of the mock function to an_action and performs
+// it.  It adapts an action accepting one argument to one that accepts
+// multiple arguments.  For convenience, we also provide
+// WithArgs<k>(an_action) (defined below) as a synonym.
+template <size_t k, typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type, k>
+WithArg(InnerAction&& action) {
+  return {std::forward<InnerAction>(action)};
+}
+
+// WithArgs<N1, N2, ..., Nk>(an_action) creates an action that passes
+// the selected arguments of the mock function to an_action and
+// performs it.  It serves as an adaptor between actions with
+// different argument lists.
+template <size_t k, size_t... ks, typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type, k, ks...>
+WithArgs(InnerAction&& action) {
+  return {std::forward<InnerAction>(action)};
+}
+
+// WithoutArgs(inner_action) can be used in a mock function with a
+// non-empty argument list to perform inner_action, which takes no
+// argument.  In other words, it adapts an action accepting no
+// argument to one that accepts (and ignores) arguments.
+template <typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type>
+WithoutArgs(InnerAction&& action) {
+  return {std::forward<InnerAction>(action)};
+}
+
+// Creates an action that returns 'value'.  'value' is passed by value
+// instead of const reference - otherwise Return("string literal")
+// will trigger a compiler error about using array as initializer.
+template <typename R>
+internal::ReturnAction<R> Return(R value) {
+  return internal::ReturnAction<R>(std::move(value));
+}
+
+// Creates an action that returns NULL.
+inline PolymorphicAction<internal::ReturnNullAction> ReturnNull() {
+  return MakePolymorphicAction(internal::ReturnNullAction());
+}
+
+// Creates an action that returns from a void function.
+inline PolymorphicAction<internal::ReturnVoidAction> Return() {
+  return MakePolymorphicAction(internal::ReturnVoidAction());
+}
+
+// Creates an action that returns the reference to a variable.
+template <typename R>
+inline internal::ReturnRefAction<R> ReturnRef(R& x) {  // NOLINT
+  return internal::ReturnRefAction<R>(x);
+}
+
+// Prevent using ReturnRef on reference to temporary.
+template <typename R, R* = nullptr>
+internal::ReturnRefAction<R> ReturnRef(R&&) = delete;
+
+// Creates an action that returns the reference to a copy of the
+// argument.  The copy is created when the action is constructed and
+// lives as long as the action.
+template <typename R>
+inline internal::ReturnRefOfCopyAction<R> ReturnRefOfCopy(const R& x) {
+  return internal::ReturnRefOfCopyAction<R>(x);
+}
+
+// Modifies the parent action (a Return() action) to perform a move of the
+// argument instead of a copy.
+// Return(ByMove()) actions can only be executed once and will assert this
+// invariant.
+template <typename R>
+internal::ByMoveWrapper<R> ByMove(R x) {
+  return internal::ByMoveWrapper<R>(std::move(x));
+}
+
+// Creates an action that returns an element of `vals`. Calling this action will
+// repeatedly return the next value from `vals` until it reaches the end and
+// will restart from the beginning.
+template <typename T>
+internal::ReturnRoundRobinAction<T> ReturnRoundRobin(std::vector<T> vals) {
+  return internal::ReturnRoundRobinAction<T>(std::move(vals));
+}
+
+// Creates an action that returns an element of `vals`. Calling this action will
+// repeatedly return the next value from `vals` until it reaches the end and
+// will restart from the beginning.
+template <typename T>
+internal::ReturnRoundRobinAction<T> ReturnRoundRobin(
+    std::initializer_list<T> vals) {
+  return internal::ReturnRoundRobinAction<T>(std::vector<T>(vals));
+}
+
+// Creates an action that does the default action for the give mock function.
+inline internal::DoDefaultAction DoDefault() {
+  return internal::DoDefaultAction();
+}
+
+// Creates an action that sets the variable pointed by the N-th
+// (0-based) function argument to 'value'.
+template <size_t N, typename T>
+internal::SetArgumentPointeeAction<N, T> SetArgPointee(T value) {
+  return {std::move(value)};
+}
+
+// The following version is DEPRECATED.
+template <size_t N, typename T>
+internal::SetArgumentPointeeAction<N, T> SetArgumentPointee(T value) {
+  return {std::move(value)};
+}
+
+// Creates an action that sets a pointer referent to a given value.
+template <typename T1, typename T2>
+PolymorphicAction<internal::AssignAction<T1, T2> > Assign(T1* ptr, T2 val) {
+  return MakePolymorphicAction(internal::AssignAction<T1, T2>(ptr, val));
+}
+
+#if !GTEST_OS_WINDOWS_MOBILE
+
+// Creates an action that sets errno and returns the appropriate error.
+template <typename T>
+PolymorphicAction<internal::SetErrnoAndReturnAction<T> >
+SetErrnoAndReturn(int errval, T result) {
+  return MakePolymorphicAction(
+      internal::SetErrnoAndReturnAction<T>(errval, result));
+}
+
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+
+// Various overloads for Invoke().
+
+// Legacy function.
+// Actions can now be implicitly constructed from callables. No need to create
+// wrapper objects.
+// This function exists for backwards compatibility.
+template <typename FunctionImpl>
+typename std::decay<FunctionImpl>::type Invoke(FunctionImpl&& function_impl) {
+  return std::forward<FunctionImpl>(function_impl);
+}
+
+// Creates an action that invokes the given method on the given object
+// with the mock function's arguments.
+template <class Class, typename MethodPtr>
+internal::InvokeMethodAction<Class, MethodPtr> Invoke(Class* obj_ptr,
+                                                      MethodPtr method_ptr) {
+  return {obj_ptr, method_ptr};
+}
+
+// Creates an action that invokes 'function_impl' with no argument.
+template <typename FunctionImpl>
+internal::InvokeWithoutArgsAction<typename std::decay<FunctionImpl>::type>
+InvokeWithoutArgs(FunctionImpl function_impl) {
+  return {std::move(function_impl)};
+}
+
+// Creates an action that invokes the given method on the given object
+// with no argument.
+template <class Class, typename MethodPtr>
+internal::InvokeMethodWithoutArgsAction<Class, MethodPtr> InvokeWithoutArgs(
+    Class* obj_ptr, MethodPtr method_ptr) {
+  return {obj_ptr, method_ptr};
+}
+
+// Creates an action that performs an_action and throws away its
+// result.  In other words, it changes the return type of an_action to
+// void.  an_action MUST NOT return void, or the code won't compile.
+template <typename A>
+inline internal::IgnoreResultAction<A> IgnoreResult(const A& an_action) {
+  return internal::IgnoreResultAction<A>(an_action);
+}
+
+// Creates a reference wrapper for the given L-value.  If necessary,
+// you can explicitly specify the type of the reference.  For example,
+// suppose 'derived' is an object of type Derived, ByRef(derived)
+// would wrap a Derived&.  If you want to wrap a const Base& instead,
+// where Base is a base class of Derived, just write:
+//
+//   ByRef<const Base>(derived)
+//
+// N.B. ByRef is redundant with std::ref, std::cref and std::reference_wrapper.
+// However, it may still be used for consistency with ByMove().
+template <typename T>
+inline ::std::reference_wrapper<T> ByRef(T& l_value) {  // NOLINT
+  return ::std::reference_wrapper<T>(l_value);
+}
+
+// The ReturnNew<T>(a1, a2, ..., a_k) action returns a pointer to a new
+// instance of type T, constructed on the heap with constructor arguments
+// a1, a2, ..., and a_k. The caller assumes ownership of the returned value.
+template <typename T, typename... Params>
+internal::ReturnNewAction<T, typename std::decay<Params>::type...> ReturnNew(
+    Params&&... params) {
+  return {std::forward_as_tuple(std::forward<Params>(params)...)};
+}
+
+// Action ReturnArg<k>() returns the k-th argument of the mock function.
+template <size_t k>
+internal::ReturnArgAction<k> ReturnArg() {
+  return {};
+}
+
+// Action SaveArg<k>(pointer) saves the k-th (0-based) argument of the
+// mock function to *pointer.
+template <size_t k, typename Ptr>
+internal::SaveArgAction<k, Ptr> SaveArg(Ptr pointer) {
+  return {pointer};
+}
+
+// Action SaveArgPointee<k>(pointer) saves the value pointed to
+// by the k-th (0-based) argument of the mock function to *pointer.
+template <size_t k, typename Ptr>
+internal::SaveArgPointeeAction<k, Ptr> SaveArgPointee(Ptr pointer) {
+  return {pointer};
+}
+
+// Action SetArgReferee<k>(value) assigns 'value' to the variable
+// referenced by the k-th (0-based) argument of the mock function.
+template <size_t k, typename T>
+internal::SetArgRefereeAction<k, typename std::decay<T>::type> SetArgReferee(
+    T&& value) {
+  return {std::forward<T>(value)};
+}
+
+// Action SetArrayArgument<k>(first, last) copies the elements in
+// source range [first, last) to the array pointed to by the k-th
+// (0-based) argument, which can be either a pointer or an
+// iterator. The action does not take ownership of the elements in the
+// source range.
+template <size_t k, typename I1, typename I2>
+internal::SetArrayArgumentAction<k, I1, I2> SetArrayArgument(I1 first,
+                                                             I2 last) {
+  return {first, last};
+}
+
+// Action DeleteArg<k>() deletes the k-th (0-based) argument of the mock
+// function.
+template <size_t k>
+internal::DeleteArgAction<k> DeleteArg() {
+  return {};
+}
+
+// This action returns the value pointed to by 'pointer'.
+template <typename Ptr>
+internal::ReturnPointeeAction<Ptr> ReturnPointee(Ptr pointer) {
+  return {pointer};
+}
+
+// Action Throw(exception) can be used in a mock function of any type
+// to throw the given exception.  Any copyable value can be thrown.
+#if GTEST_HAS_EXCEPTIONS
+template <typename T>
+internal::ThrowAction<typename std::decay<T>::type> Throw(T&& exception) {
+  return {std::forward<T>(exception)};
+}
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace internal {
+
+// A macro from the ACTION* family (defined later in gmock-generated-actions.h)
+// defines an action that can be used in a mock function.  Typically,
+// these actions only care about a subset of the arguments of the mock
+// function.  For example, if such an action only uses the second
+// argument, it can be used in any mock function that takes >= 2
+// arguments where the type of the second argument is compatible.
+//
+// Therefore, the action implementation must be prepared to take more
+// arguments than it needs.  The ExcessiveArg type is used to
+// represent those excessive arguments.  In order to keep the compiler
+// error messages tractable, we define it in the testing namespace
+// instead of testing::internal.  However, this is an INTERNAL TYPE
+// and subject to change without notice, so a user MUST NOT USE THIS
+// TYPE DIRECTLY.
+struct ExcessiveArg {};
+
+// Builds an implementation of an Action<> for some particular signature, using
+// a class defined by an ACTION* macro.
+template <typename F, typename Impl> struct ActionImpl;
+
+template <typename Impl>
+struct ImplBase {
+  struct Holder {
+    // Allows each copy of the Action<> to get to the Impl.
+    explicit operator const Impl&() const { return *ptr; }
+    std::shared_ptr<Impl> ptr;
+  };
+  using type = typename std::conditional<std::is_constructible<Impl>::value,
+                                         Impl, Holder>::type;
+};
+
+template <typename R, typename... Args, typename Impl>
+struct ActionImpl<R(Args...), Impl> : ImplBase<Impl>::type {
+  using Base = typename ImplBase<Impl>::type;
+  using function_type = R(Args...);
+  using args_type = std::tuple<Args...>;
+
+  ActionImpl() = default;  // Only defined if appropriate for Base.
+  explicit ActionImpl(std::shared_ptr<Impl> impl) : Base{std::move(impl)} { }
+
+  R operator()(Args&&... arg) const {
+    static constexpr size_t kMaxArgs =
+        sizeof...(Args) <= 10 ? sizeof...(Args) : 10;
+    return Apply(MakeIndexSequence<kMaxArgs>{},
+                 MakeIndexSequence<10 - kMaxArgs>{},
+                 args_type{std::forward<Args>(arg)...});
+  }
+
+  template <std::size_t... arg_id, std::size_t... excess_id>
+  R Apply(IndexSequence<arg_id...>, IndexSequence<excess_id...>,
+          const args_type& args) const {
+    // Impl need not be specific to the signature of action being implemented;
+    // only the implementing function body needs to have all of the specific
+    // types instantiated.  Up to 10 of the args that are provided by the
+    // args_type get passed, followed by a dummy of unspecified type for the
+    // remainder up to 10 explicit args.
+    static constexpr ExcessiveArg kExcessArg{};
+    return static_cast<const Impl&>(*this).template gmock_PerformImpl<
+        /*function_type=*/function_type, /*return_type=*/R,
+        /*args_type=*/args_type,
+        /*argN_type=*/typename std::tuple_element<arg_id, args_type>::type...>(
+        /*args=*/args, std::get<arg_id>(args)...,
+        ((void)excess_id, kExcessArg)...);
+  }
+};
+
+// Stores a default-constructed Impl as part of the Action<>'s
+// std::function<>. The Impl should be trivial to copy.
+template <typename F, typename Impl>
+::testing::Action<F> MakeAction() {
+  return ::testing::Action<F>(ActionImpl<F, Impl>());
+}
+
+// Stores just the one given instance of Impl.
+template <typename F, typename Impl>
+::testing::Action<F> MakeAction(std::shared_ptr<Impl> impl) {
+  return ::testing::Action<F>(ActionImpl<F, Impl>(std::move(impl)));
+}
+
+#define GMOCK_INTERNAL_ARG_UNUSED(i, data, el) \
+  , const arg##i##_type& arg##i GTEST_ATTRIBUTE_UNUSED_
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_           \
+  const args_type& args GTEST_ATTRIBUTE_UNUSED_ GMOCK_PP_REPEAT( \
+      GMOCK_INTERNAL_ARG_UNUSED, , 10)
+
+#define GMOCK_INTERNAL_ARG(i, data, el) , const arg##i##_type& arg##i
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_ \
+  const args_type& args GMOCK_PP_REPEAT(GMOCK_INTERNAL_ARG, , 10)
+
+#define GMOCK_INTERNAL_TEMPLATE_ARG(i, data, el) , typename arg##i##_type
+#define GMOCK_ACTION_TEMPLATE_ARGS_NAMES_ \
+  GMOCK_PP_TAIL(GMOCK_PP_REPEAT(GMOCK_INTERNAL_TEMPLATE_ARG, , 10))
+
+#define GMOCK_INTERNAL_TYPENAME_PARAM(i, data, param) , typename param##_type
+#define GMOCK_ACTION_TYPENAME_PARAMS_(params) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPENAME_PARAM, , params))
+
+#define GMOCK_INTERNAL_TYPE_PARAM(i, data, param) , param##_type
+#define GMOCK_ACTION_TYPE_PARAMS_(params) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_PARAM, , params))
+
+#define GMOCK_INTERNAL_TYPE_GVALUE_PARAM(i, data, param) \
+  , param##_type gmock_p##i
+#define GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_GVALUE_PARAM, , params))
+
+#define GMOCK_INTERNAL_GVALUE_PARAM(i, data, param) \
+  , std::forward<param##_type>(gmock_p##i)
+#define GMOCK_ACTION_GVALUE_PARAMS_(params) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GVALUE_PARAM, , params))
+
+#define GMOCK_INTERNAL_INIT_PARAM(i, data, param) \
+  , param(::std::forward<param##_type>(gmock_p##i))
+#define GMOCK_ACTION_INIT_PARAMS_(params) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_INIT_PARAM, , params))
+
+#define GMOCK_INTERNAL_FIELD_PARAM(i, data, param) param##_type param;
+#define GMOCK_ACTION_FIELD_PARAMS_(params) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_FIELD_PARAM, , params)
+
+#define GMOCK_INTERNAL_ACTION(name, full_name, params)                        \
+  template <GMOCK_ACTION_TYPENAME_PARAMS_(params)>                            \
+  class full_name {                                                           \
+   public:                                                                    \
+    explicit full_name(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params))              \
+        : impl_(std::make_shared<gmock_Impl>(                                 \
+                GMOCK_ACTION_GVALUE_PARAMS_(params))) { }                     \
+    full_name(const full_name&) = default;                                    \
+    full_name(full_name&&) noexcept = default;                                \
+    template <typename F>                                                     \
+    operator ::testing::Action<F>() const {                                   \
+      return ::testing::internal::MakeAction<F>(impl_);                       \
+    }                                                                         \
+   private:                                                                   \
+    class gmock_Impl {                                                        \
+     public:                                                                  \
+      explicit gmock_Impl(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params))           \
+          : GMOCK_ACTION_INIT_PARAMS_(params) {}                              \
+      template <typename function_type, typename return_type,                 \
+                typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>        \
+      return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+      GMOCK_ACTION_FIELD_PARAMS_(params)                                      \
+    };                                                                        \
+    std::shared_ptr<const gmock_Impl> impl_;                                  \
+  };                                                                          \
+  template <GMOCK_ACTION_TYPENAME_PARAMS_(params)>                            \
+  inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name(                   \
+      GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) {                             \
+    return full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>(                      \
+        GMOCK_ACTION_GVALUE_PARAMS_(params));                                 \
+  }                                                                           \
+  template <GMOCK_ACTION_TYPENAME_PARAMS_(params)>                            \
+  template <typename function_type, typename return_type, typename args_type, \
+            GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>                                \
+  return_type full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>::gmock_Impl::      \
+  gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+
+}  // namespace internal
+
+// Similar to GMOCK_INTERNAL_ACTION, but no bound parameters are stored.
+#define ACTION(name)                                                          \
+  class name##Action {                                                        \
+   public:                                                                    \
+   explicit name##Action() noexcept {}                                        \
+   name##Action(const name##Action&) noexcept {}                              \
+    template <typename F>                                                     \
+    operator ::testing::Action<F>() const {                                   \
+      return ::testing::internal::MakeAction<F, gmock_Impl>();                \
+    }                                                                         \
+   private:                                                                   \
+    class gmock_Impl {                                                        \
+     public:                                                                  \
+      template <typename function_type, typename return_type,                 \
+                typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>        \
+      return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+    };                                                                        \
+  };                                                                          \
+  inline name##Action name() GTEST_MUST_USE_RESULT_;                          \
+  inline name##Action name() { return name##Action(); }                       \
+  template <typename function_type, typename return_type, typename args_type, \
+            GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>                                \
+  return_type name##Action::gmock_Impl::gmock_PerformImpl(                    \
+      GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+
+#define ACTION_P(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP, (__VA_ARGS__))
+
+#define ACTION_P2(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP2, (__VA_ARGS__))
+
+#define ACTION_P3(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP3, (__VA_ARGS__))
+
+#define ACTION_P4(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP4, (__VA_ARGS__))
+
+#define ACTION_P5(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP5, (__VA_ARGS__))
+
+#define ACTION_P6(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP6, (__VA_ARGS__))
+
+#define ACTION_P7(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP7, (__VA_ARGS__))
+
+#define ACTION_P8(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP8, (__VA_ARGS__))
+
+#define ACTION_P9(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP9, (__VA_ARGS__))
+
+#define ACTION_P10(name, ...) \
+  GMOCK_INTERNAL_ACTION(name, name##ActionP10, (__VA_ARGS__))
+
+}  // namespace testing
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
new file mode 100644
index 0000000000..fc7f803a7a
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
@@ -0,0 +1,157 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some commonly used cardinalities.  More
+// cardinalities can be defined by the user implementing the
+// CardinalityInterface interface if necessary.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
+
+#include <limits.h>
+#include <memory>
+#include <ostream>  // NOLINT
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// To implement a cardinality Foo, define:
+//   1. a class FooCardinality that implements the
+//      CardinalityInterface interface, and
+//   2. a factory function that creates a Cardinality object from a
+//      const FooCardinality*.
+//
+// The two-level delegation design follows that of Matcher, providing
+// consistency for extension developers.  It also eases ownership
+// management as Cardinality objects can now be copied like plain values.
+
+// The implementation of a cardinality.
+class CardinalityInterface {
+ public:
+  virtual ~CardinalityInterface() {}
+
+  // Conservative estimate on the lower/upper bound of the number of
+  // calls allowed.
+  virtual int ConservativeLowerBound() const { return 0; }
+  virtual int ConservativeUpperBound() const { return INT_MAX; }
+
+  // Returns true if and only if call_count calls will satisfy this
+  // cardinality.
+  virtual bool IsSatisfiedByCallCount(int call_count) const = 0;
+
+  // Returns true if and only if call_count calls will saturate this
+  // cardinality.
+  virtual bool IsSaturatedByCallCount(int call_count) const = 0;
+
+  // Describes self to an ostream.
+  virtual void DescribeTo(::std::ostream* os) const = 0;
+};
+
+// A Cardinality is a copyable and IMMUTABLE (except by assignment)
+// object that specifies how many times a mock function is expected to
+// be called.  The implementation of Cardinality is just a std::shared_ptr
+// to const CardinalityInterface. Don't inherit from Cardinality!
+class GTEST_API_ Cardinality {
+ public:
+  // Constructs a null cardinality.  Needed for storing Cardinality
+  // objects in STL containers.
+  Cardinality() {}
+
+  // Constructs a Cardinality from its implementation.
+  explicit Cardinality(const CardinalityInterface* impl) : impl_(impl) {}
+
+  // Conservative estimate on the lower/upper bound of the number of
+  // calls allowed.
+  int ConservativeLowerBound() const { return impl_->ConservativeLowerBound(); }
+  int ConservativeUpperBound() const { return impl_->ConservativeUpperBound(); }
+
+  // Returns true if and only if call_count calls will satisfy this
+  // cardinality.
+  bool IsSatisfiedByCallCount(int call_count) const {
+    return impl_->IsSatisfiedByCallCount(call_count);
+  }
+
+  // Returns true if and only if call_count calls will saturate this
+  // cardinality.
+  bool IsSaturatedByCallCount(int call_count) const {
+    return impl_->IsSaturatedByCallCount(call_count);
+  }
+
+  // Returns true if and only if call_count calls will over-saturate this
+  // cardinality, i.e. exceed the maximum number of allowed calls.
+  bool IsOverSaturatedByCallCount(int call_count) const {
+    return impl_->IsSaturatedByCallCount(call_count) &&
+        !impl_->IsSatisfiedByCallCount(call_count);
+  }
+
+  // Describes self to an ostream
+  void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); }
+
+  // Describes the given actual call count to an ostream.
+  static void DescribeActualCallCountTo(int actual_call_count,
+                                        ::std::ostream* os);
+
+ private:
+  std::shared_ptr<const CardinalityInterface> impl_;
+};
+
+// Creates a cardinality that allows at least n calls.
+GTEST_API_ Cardinality AtLeast(int n);
+
+// Creates a cardinality that allows at most n calls.
+GTEST_API_ Cardinality AtMost(int n);
+
+// Creates a cardinality that allows any number of calls.
+GTEST_API_ Cardinality AnyNumber();
+
+// Creates a cardinality that allows between min and max calls.
+GTEST_API_ Cardinality Between(int min, int max);
+
+// Creates a cardinality that allows exactly n calls.
+GTEST_API_ Cardinality Exactly(int n);
+
+// Creates a cardinality from its implementation.
+inline Cardinality MakeCardinality(const CardinalityInterface* c) {
+  return Cardinality(c);
+}
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
new file mode 100644
index 0000000000..0fc6f6f3f1
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
@@ -0,0 +1,479 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements MOCK_METHOD.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_  // NOLINT
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_  // NOLINT
+
+#include <type_traits>  // IWYU pragma: keep
+#include <utility>      // IWYU pragma: keep
+
+#include "gmock/gmock-spec-builders.h"
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-pp.h"
+
+namespace testing {
+namespace internal {
+template <typename T>
+using identity_t = T;
+
+template <typename Pattern>
+struct ThisRefAdjuster {
+  template <typename T>
+  using AdjustT = typename std::conditional<
+      std::is_const<typename std::remove_reference<Pattern>::type>::value,
+      typename std::conditional<std::is_lvalue_reference<Pattern>::value,
+                                const T&, const T&&>::type,
+      typename std::conditional<std::is_lvalue_reference<Pattern>::value, T&,
+                                T&&>::type>::type;
+
+  template <typename MockType>
+  static AdjustT<MockType> Adjust(const MockType& mock) {
+    return static_cast<AdjustT<MockType>>(const_cast<MockType&>(mock));
+  }
+};
+
+}  // namespace internal
+
+// The style guide prohibits "using" statements in a namespace scope
+// inside a header file.  However, the FunctionMocker class template
+// is meant to be defined in the ::testing namespace.  The following
+// line is just a trick for working around a bug in MSVC 8.0, which
+// cannot handle it if we define FunctionMocker in ::testing.
+using internal::FunctionMocker;
+}  // namespace testing
+
+#define MOCK_METHOD(...) \
+  GMOCK_PP_VARIADIC_CALL(GMOCK_INTERNAL_MOCK_METHOD_ARG_, __VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_1(...) \
+  GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_2(...) \
+  GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_3(_Ret, _MethodName, _Args) \
+  GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, ())
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, _Spec)     \
+  GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Args);                                   \
+  GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Spec);                                   \
+  GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE(                                      \
+      GMOCK_PP_NARG0 _Args, GMOCK_INTERNAL_SIGNATURE(_Ret, _Args));           \
+  GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec)                                     \
+  GMOCK_INTERNAL_MOCK_METHOD_IMPL(                                            \
+      GMOCK_PP_NARG0 _Args, _MethodName, GMOCK_INTERNAL_HAS_CONST(_Spec),     \
+      GMOCK_INTERNAL_HAS_OVERRIDE(_Spec), GMOCK_INTERNAL_HAS_FINAL(_Spec),    \
+      GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Spec),                                \
+      GMOCK_INTERNAL_GET_CALLTYPE(_Spec), GMOCK_INTERNAL_GET_REF_SPEC(_Spec), \
+      (GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)))
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_5(...) \
+  GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_6(...) \
+  GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_7(...) \
+  GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_WRONG_ARITY(...)                                      \
+  static_assert(                                                             \
+      false,                                                                 \
+      "MOCK_METHOD must be called with 3 or 4 arguments. _Ret, "             \
+      "_MethodName, _Args and optionally _Spec. _Args and _Spec must be "    \
+      "enclosed in parentheses. If _Ret is a type with unprotected commas, " \
+      "it must also be enclosed in parentheses.")
+
+#define GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Tuple) \
+  static_assert(                                  \
+      GMOCK_PP_IS_ENCLOSED_PARENS(_Tuple),        \
+      GMOCK_PP_STRINGIZE(_Tuple) " should be enclosed in parentheses.")
+
+#define GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE(_N, ...)                 \
+  static_assert(                                                       \
+      std::is_function<__VA_ARGS__>::value,                            \
+      "Signature must be a function type, maybe return type contains " \
+      "unprotected comma.");                                           \
+  static_assert(                                                       \
+      ::testing::tuple_size<typename ::testing::internal::Function<    \
+              __VA_ARGS__>::ArgumentTuple>::value == _N,               \
+      "This method does not take " GMOCK_PP_STRINGIZE(                 \
+          _N) " arguments. Parenthesize all types with unprotected commas.")
+
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT, ~, _Spec)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_IMPL(_N, _MethodName, _Constness,           \
+                                        _Override, _Final, _NoexceptSpec,      \
+                                        _CallType, _RefSpec, _Signature)       \
+  typename ::testing::internal::Function<GMOCK_PP_REMOVE_PARENS(               \
+      _Signature)>::Result                                                     \
+  GMOCK_INTERNAL_EXPAND(_CallType)                                             \
+      _MethodName(GMOCK_PP_REPEAT(GMOCK_INTERNAL_PARAMETER, _Signature, _N))   \
+          GMOCK_PP_IF(_Constness, const, ) _RefSpec _NoexceptSpec              \
+          GMOCK_PP_IF(_Override, override, ) GMOCK_PP_IF(_Final, final, ) {    \
+    GMOCK_MOCKER_(_N, _Constness, _MethodName)                                 \
+        .SetOwnerAndName(this, #_MethodName);                                  \
+    return GMOCK_MOCKER_(_N, _Constness, _MethodName)                          \
+        .Invoke(GMOCK_PP_REPEAT(GMOCK_INTERNAL_FORWARD_ARG, _Signature, _N));  \
+  }                                                                            \
+  ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \
+      GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_PARAMETER, _Signature, _N))       \
+      GMOCK_PP_IF(_Constness, const, ) _RefSpec {                              \
+    GMOCK_MOCKER_(_N, _Constness, _MethodName).RegisterOwner(this);            \
+    return GMOCK_MOCKER_(_N, _Constness, _MethodName)                          \
+        .With(GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_ARGUMENT, , _N));         \
+  }                                                                            \
+  ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \
+      const ::testing::internal::WithoutMatchers&,                             \
+      GMOCK_PP_IF(_Constness, const, )::testing::internal::Function<           \
+          GMOCK_PP_REMOVE_PARENS(_Signature)>*) const _RefSpec _NoexceptSpec { \
+    return ::testing::internal::ThisRefAdjuster<GMOCK_PP_IF(                   \
+        _Constness, const, ) int _RefSpec>::Adjust(*this)                      \
+        .gmock_##_MethodName(GMOCK_PP_REPEAT(                                  \
+            GMOCK_INTERNAL_A_MATCHER_ARGUMENT, _Signature, _N));               \
+  }                                                                            \
+  mutable ::testing::FunctionMocker<GMOCK_PP_REMOVE_PARENS(_Signature)>        \
+      GMOCK_MOCKER_(_N, _Constness, _MethodName)
+
+#define GMOCK_INTERNAL_EXPAND(...) __VA_ARGS__
+
+// Five Valid modifiers.
+#define GMOCK_INTERNAL_HAS_CONST(_Tuple) \
+  GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_CONST, ~, _Tuple))
+
+#define GMOCK_INTERNAL_HAS_OVERRIDE(_Tuple) \
+  GMOCK_PP_HAS_COMMA(                       \
+      GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_OVERRIDE, ~, _Tuple))
+
+#define GMOCK_INTERNAL_HAS_FINAL(_Tuple) \
+  GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_FINAL, ~, _Tuple))
+
+#define GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Tuple) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT, ~, _Tuple)
+
+#define GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT(_i, _, _elem)          \
+  GMOCK_PP_IF(                                                          \
+      GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)), \
+      _elem, )
+
+#define GMOCK_INTERNAL_GET_REF_SPEC(_Tuple) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_REF_SPEC_IF_REF, ~, _Tuple)
+
+#define GMOCK_INTERNAL_REF_SPEC_IF_REF(_i, _, _elem)                       \
+  GMOCK_PP_IF(GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)), \
+              GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), )
+
+#define GMOCK_INTERNAL_GET_CALLTYPE(_Tuple) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GET_CALLTYPE_IMPL, ~, _Tuple)
+
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem)            \
+  static_assert(                                                          \
+      (GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem)) +    \
+       GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem)) + \
+       GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem)) +    \
+       GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)) + \
+       GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)) +      \
+       GMOCK_INTERNAL_IS_CALLTYPE(_elem)) == 1,                           \
+      GMOCK_PP_STRINGIZE(                                                 \
+          _elem) " cannot be recognized as a valid specification modifier.");
+
+// Modifiers implementation.
+#define GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem) \
+  GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CONST_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_CONST_I_const ,
+
+#define GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem) \
+  GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_OVERRIDE_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_OVERRIDE_I_override ,
+
+#define GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem) \
+  GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_FINAL_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_FINAL_I_final ,
+
+#define GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem) \
+  GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_NOEXCEPT_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_NOEXCEPT_I_noexcept ,
+
+#define GMOCK_INTERNAL_DETECT_REF(_i, _, _elem) \
+  GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_REF_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_REF_I_ref ,
+
+#define GMOCK_INTERNAL_UNPACK_ref(x) x
+
+#define GMOCK_INTERNAL_GET_CALLTYPE_IMPL(_i, _, _elem)           \
+  GMOCK_PP_IF(GMOCK_INTERNAL_IS_CALLTYPE(_elem),                 \
+              GMOCK_INTERNAL_GET_VALUE_CALLTYPE, GMOCK_PP_EMPTY) \
+  (_elem)
+
+// TODO(iserna): GMOCK_INTERNAL_IS_CALLTYPE and
+// GMOCK_INTERNAL_GET_VALUE_CALLTYPE needed more expansions to work on windows
+// maybe they can be simplified somehow.
+#define GMOCK_INTERNAL_IS_CALLTYPE(_arg) \
+  GMOCK_INTERNAL_IS_CALLTYPE_I(          \
+      GMOCK_PP_CAT(GMOCK_INTERNAL_IS_CALLTYPE_HELPER_, _arg))
+#define GMOCK_INTERNAL_IS_CALLTYPE_I(_arg) GMOCK_PP_IS_ENCLOSED_PARENS(_arg)
+
+#define GMOCK_INTERNAL_GET_VALUE_CALLTYPE(_arg) \
+  GMOCK_INTERNAL_GET_VALUE_CALLTYPE_I(          \
+      GMOCK_PP_CAT(GMOCK_INTERNAL_IS_CALLTYPE_HELPER_, _arg))
+#define GMOCK_INTERNAL_GET_VALUE_CALLTYPE_I(_arg) \
+  GMOCK_PP_IDENTITY _arg
+
+#define GMOCK_INTERNAL_IS_CALLTYPE_HELPER_Calltype
+
+// Note: The use of `identity_t` here allows _Ret to represent return types that
+// would normally need to be specified in a different way. For example, a method
+// returning a function pointer must be written as
+//
+// fn_ptr_return_t (*method(method_args_t...))(fn_ptr_args_t...)
+//
+// But we only support placing the return type at the beginning. To handle this,
+// we wrap all calls in identity_t, so that a declaration will be expanded to
+//
+// identity_t<fn_ptr_return_t (*)(fn_ptr_args_t...)> method(method_args_t...)
+//
+// This allows us to work around the syntactic oddities of function/method
+// types.
+#define GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)                                 \
+  ::testing::internal::identity_t<GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_Ret), \
+                                              GMOCK_PP_REMOVE_PARENS,         \
+                                              GMOCK_PP_IDENTITY)(_Ret)>(      \
+      GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GET_TYPE, _, _Args))
+
+#define GMOCK_INTERNAL_GET_TYPE(_i, _, _elem)                          \
+  GMOCK_PP_COMMA_IF(_i)                                                \
+  GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_elem), GMOCK_PP_REMOVE_PARENS, \
+              GMOCK_PP_IDENTITY)                                       \
+  (_elem)
+
+#define GMOCK_INTERNAL_PARAMETER(_i, _Signature, _)            \
+  GMOCK_PP_COMMA_IF(_i)                                        \
+  GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \
+  gmock_a##_i
+
+#define GMOCK_INTERNAL_FORWARD_ARG(_i, _Signature, _) \
+  GMOCK_PP_COMMA_IF(_i)                               \
+  ::std::forward<GMOCK_INTERNAL_ARG_O(                \
+      _i, GMOCK_PP_REMOVE_PARENS(_Signature))>(gmock_a##_i)
+
+#define GMOCK_INTERNAL_MATCHER_PARAMETER(_i, _Signature, _)        \
+  GMOCK_PP_COMMA_IF(_i)                                            \
+  GMOCK_INTERNAL_MATCHER_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \
+  gmock_a##_i
+
+#define GMOCK_INTERNAL_MATCHER_ARGUMENT(_i, _1, _2) \
+  GMOCK_PP_COMMA_IF(_i)                             \
+  gmock_a##_i
+
+#define GMOCK_INTERNAL_A_MATCHER_ARGUMENT(_i, _Signature, _) \
+  GMOCK_PP_COMMA_IF(_i)                                      \
+  ::testing::A<GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature))>()
+
+#define GMOCK_INTERNAL_ARG_O(_i, ...) \
+  typename ::testing::internal::Function<__VA_ARGS__>::template Arg<_i>::type
+
+#define GMOCK_INTERNAL_MATCHER_O(_i, ...)                          \
+  const ::testing::Matcher<typename ::testing::internal::Function< \
+      __VA_ARGS__>::template Arg<_i>::type>&
+
+#define MOCK_METHOD0(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 0, __VA_ARGS__)
+#define MOCK_METHOD1(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 1, __VA_ARGS__)
+#define MOCK_METHOD2(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 2, __VA_ARGS__)
+#define MOCK_METHOD3(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 3, __VA_ARGS__)
+#define MOCK_METHOD4(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 4, __VA_ARGS__)
+#define MOCK_METHOD5(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 5, __VA_ARGS__)
+#define MOCK_METHOD6(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 6, __VA_ARGS__)
+#define MOCK_METHOD7(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 7, __VA_ARGS__)
+#define MOCK_METHOD8(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 8, __VA_ARGS__)
+#define MOCK_METHOD9(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 9, __VA_ARGS__)
+#define MOCK_METHOD10(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, , m, 10, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 0, __VA_ARGS__)
+#define MOCK_CONST_METHOD1(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 1, __VA_ARGS__)
+#define MOCK_CONST_METHOD2(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 2, __VA_ARGS__)
+#define MOCK_CONST_METHOD3(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 3, __VA_ARGS__)
+#define MOCK_CONST_METHOD4(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 4, __VA_ARGS__)
+#define MOCK_CONST_METHOD5(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 5, __VA_ARGS__)
+#define MOCK_CONST_METHOD6(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 6, __VA_ARGS__)
+#define MOCK_CONST_METHOD7(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 7, __VA_ARGS__)
+#define MOCK_CONST_METHOD8(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 8, __VA_ARGS__)
+#define MOCK_CONST_METHOD9(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 9, __VA_ARGS__)
+#define MOCK_CONST_METHOD10(m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, , m, 10, __VA_ARGS__)
+
+#define MOCK_METHOD0_T(m, ...) MOCK_METHOD0(m, __VA_ARGS__)
+#define MOCK_METHOD1_T(m, ...) MOCK_METHOD1(m, __VA_ARGS__)
+#define MOCK_METHOD2_T(m, ...) MOCK_METHOD2(m, __VA_ARGS__)
+#define MOCK_METHOD3_T(m, ...) MOCK_METHOD3(m, __VA_ARGS__)
+#define MOCK_METHOD4_T(m, ...) MOCK_METHOD4(m, __VA_ARGS__)
+#define MOCK_METHOD5_T(m, ...) MOCK_METHOD5(m, __VA_ARGS__)
+#define MOCK_METHOD6_T(m, ...) MOCK_METHOD6(m, __VA_ARGS__)
+#define MOCK_METHOD7_T(m, ...) MOCK_METHOD7(m, __VA_ARGS__)
+#define MOCK_METHOD8_T(m, ...) MOCK_METHOD8(m, __VA_ARGS__)
+#define MOCK_METHOD9_T(m, ...) MOCK_METHOD9(m, __VA_ARGS__)
+#define MOCK_METHOD10_T(m, ...) MOCK_METHOD10(m, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_T(m, ...) MOCK_CONST_METHOD0(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_T(m, ...) MOCK_CONST_METHOD1(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_T(m, ...) MOCK_CONST_METHOD2(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_T(m, ...) MOCK_CONST_METHOD3(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_T(m, ...) MOCK_CONST_METHOD4(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_T(m, ...) MOCK_CONST_METHOD5(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_T(m, ...) MOCK_CONST_METHOD6(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_T(m, ...) MOCK_CONST_METHOD7(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_T(m, ...) MOCK_CONST_METHOD8(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_T(m, ...) MOCK_CONST_METHOD9(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_T(m, ...) MOCK_CONST_METHOD10(m, __VA_ARGS__)
+
+#define MOCK_METHOD0_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 0, __VA_ARGS__)
+#define MOCK_METHOD1_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 1, __VA_ARGS__)
+#define MOCK_METHOD2_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 2, __VA_ARGS__)
+#define MOCK_METHOD3_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 3, __VA_ARGS__)
+#define MOCK_METHOD4_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 4, __VA_ARGS__)
+#define MOCK_METHOD5_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 5, __VA_ARGS__)
+#define MOCK_METHOD6_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 6, __VA_ARGS__)
+#define MOCK_METHOD7_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 7, __VA_ARGS__)
+#define MOCK_METHOD8_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 8, __VA_ARGS__)
+#define MOCK_METHOD9_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 9, __VA_ARGS__)
+#define MOCK_METHOD10_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 10, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 0, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 1, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 2, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 3, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 4, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 5, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 6, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 7, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 8, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 9, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, ...) \
+  GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 10, __VA_ARGS__)
+
+#define MOCK_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \
+  MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHODN(constness, ct, Method, args_num, ...) \
+  GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE(                                  \
+      args_num, ::testing::internal::identity_t<__VA_ARGS__>);            \
+  GMOCK_INTERNAL_MOCK_METHOD_IMPL(                                        \
+      args_num, Method, GMOCK_PP_NARG0(constness), 0, 0, , ct, ,          \
+      (::testing::internal::identity_t<__VA_ARGS__>))
+
+#define GMOCK_MOCKER_(arity, constness, Method) \
+  GTEST_CONCAT_TOKEN_(gmock##constness##arity##_##Method##_, __LINE__)
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
new file mode 100644
index 0000000000..86be9c176e
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
@@ -0,0 +1,5392 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// The MATCHER* family of macros can be used in a namespace scope to
+// define custom matchers easily.
+//
+// Basic Usage
+// ===========
+//
+// The syntax
+//
+//   MATCHER(name, description_string) { statements; }
+//
+// defines a matcher with the given name that executes the statements,
+// which must return a bool to indicate if the match succeeds.  Inside
+// the statements, you can refer to the value being matched by 'arg',
+// and refer to its type by 'arg_type'.
+//
+// The description string documents what the matcher does, and is used
+// to generate the failure message when the match fails.  Since a
+// MATCHER() is usually defined in a header file shared by multiple
+// C++ source files, we require the description to be a C-string
+// literal to avoid possible side effects.  It can be empty, in which
+// case we'll use the sequence of words in the matcher name as the
+// description.
+//
+// For example:
+//
+//   MATCHER(IsEven, "") { return (arg % 2) == 0; }
+//
+// allows you to write
+//
+//   // Expects mock_foo.Bar(n) to be called where n is even.
+//   EXPECT_CALL(mock_foo, Bar(IsEven()));
+//
+// or,
+//
+//   // Verifies that the value of some_expression is even.
+//   EXPECT_THAT(some_expression, IsEven());
+//
+// If the above assertion fails, it will print something like:
+//
+//   Value of: some_expression
+//   Expected: is even
+//     Actual: 7
+//
+// where the description "is even" is automatically calculated from the
+// matcher name IsEven.
+//
+// Argument Type
+// =============
+//
+// Note that the type of the value being matched (arg_type) is
+// determined by the context in which you use the matcher and is
+// supplied to you by the compiler, so you don't need to worry about
+// declaring it (nor can you).  This allows the matcher to be
+// polymorphic.  For example, IsEven() can be used to match any type
+// where the value of "(arg % 2) == 0" can be implicitly converted to
+// a bool.  In the "Bar(IsEven())" example above, if method Bar()
+// takes an int, 'arg_type' will be int; if it takes an unsigned long,
+// 'arg_type' will be unsigned long; and so on.
+//
+// Parameterizing Matchers
+// =======================
+//
+// Sometimes you'll want to parameterize the matcher.  For that you
+// can use another macro:
+//
+//   MATCHER_P(name, param_name, description_string) { statements; }
+//
+// For example:
+//
+//   MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; }
+//
+// will allow you to write:
+//
+//   EXPECT_THAT(Blah("a"), HasAbsoluteValue(n));
+//
+// which may lead to this message (assuming n is 10):
+//
+//   Value of: Blah("a")
+//   Expected: has absolute value 10
+//     Actual: -9
+//
+// Note that both the matcher description and its parameter are
+// printed, making the message human-friendly.
+//
+// In the matcher definition body, you can write 'foo_type' to
+// reference the type of a parameter named 'foo'.  For example, in the
+// body of MATCHER_P(HasAbsoluteValue, value) above, you can write
+// 'value_type' to refer to the type of 'value'.
+//
+// We also provide MATCHER_P2, MATCHER_P3, ..., up to MATCHER_P$n to
+// support multi-parameter matchers.
+//
+// Describing Parameterized Matchers
+// =================================
+//
+// The last argument to MATCHER*() is a string-typed expression.  The
+// expression can reference all of the matcher's parameters and a
+// special bool-typed variable named 'negation'.  When 'negation' is
+// false, the expression should evaluate to the matcher's description;
+// otherwise it should evaluate to the description of the negation of
+// the matcher.  For example,
+//
+//   using testing::PrintToString;
+//
+//   MATCHER_P2(InClosedRange, low, hi,
+//       std::string(negation ? "is not" : "is") + " in range [" +
+//       PrintToString(low) + ", " + PrintToString(hi) + "]") {
+//     return low <= arg && arg <= hi;
+//   }
+//   ...
+//   EXPECT_THAT(3, InClosedRange(4, 6));
+//   EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+//   Expected: is in range [4, 6]
+//   ...
+//   Expected: is not in range [2, 4]
+//
+// If you specify "" as the description, the failure message will
+// contain the sequence of words in the matcher name followed by the
+// parameter values printed as a tuple.  For example,
+//
+//   MATCHER_P2(InClosedRange, low, hi, "") { ... }
+//   ...
+//   EXPECT_THAT(3, InClosedRange(4, 6));
+//   EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+//   Expected: in closed range (4, 6)
+//   ...
+//   Expected: not (in closed range (2, 4))
+//
+// Types of Matcher Parameters
+// ===========================
+//
+// For the purpose of typing, you can view
+//
+//   MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... }
+//
+// as shorthand for
+//
+//   template <typename p1_type, ..., typename pk_type>
+//   FooMatcherPk<p1_type, ..., pk_type>
+//   Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// When you write Foo(v1, ..., vk), the compiler infers the types of
+// the parameters v1, ..., and vk for you.  If you are not happy with
+// the result of the type inference, you can specify the types by
+// explicitly instantiating the template, as in Foo<long, bool>(5,
+// false).  As said earlier, you don't get to (or need to) specify
+// 'arg_type' as that's determined by the context in which the matcher
+// is used.  You can assign the result of expression Foo(p1, ..., pk)
+// to a variable of type FooMatcherPk<p1_type, ..., pk_type>.  This
+// can be useful when composing matchers.
+//
+// While you can instantiate a matcher template with reference types,
+// passing the parameters by pointer usually makes your code more
+// readable.  If, however, you still want to pass a parameter by
+// reference, be aware that in the failure message generated by the
+// matcher you will see the value of the referenced object but not its
+// address.
+//
+// Explaining Match Results
+// ========================
+//
+// Sometimes the matcher description alone isn't enough to explain why
+// the match has failed or succeeded.  For example, when expecting a
+// long string, it can be very helpful to also print the diff between
+// the expected string and the actual one.  To achieve that, you can
+// optionally stream additional information to a special variable
+// named result_listener, whose type is a pointer to class
+// MatchResultListener:
+//
+//   MATCHER_P(EqualsLongString, str, "") {
+//     if (arg == str) return true;
+//
+//     *result_listener << "the difference: "
+///                     << DiffStrings(str, arg);
+//     return false;
+//   }
+//
+// Overloading Matchers
+// ====================
+//
+// You can overload matchers with different numbers of parameters:
+//
+//   MATCHER_P(Blah, a, description_string1) { ... }
+//   MATCHER_P2(Blah, a, b, description_string2) { ... }
+//
+// Caveats
+// =======
+//
+// When defining a new matcher, you should also consider implementing
+// MatcherInterface or using MakePolymorphicMatcher().  These
+// approaches require more work than the MATCHER* macros, but also
+// give you more control on the types of the value being matched and
+// the matcher parameters, which may leads to better compiler error
+// messages when the matcher is used wrong.  They also allow
+// overloading matchers based on parameter types (as opposed to just
+// based on the number of parameters).
+//
+// MATCHER*() can only be used in a namespace scope as templates cannot be
+// declared inside of a local class.
+//
+// More Information
+// ================
+//
+// To learn more about using these macros, please search for 'MATCHER'
+// on
+// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
+//
+// This file also implements some commonly used argument matchers.  More
+// matchers can be defined by the user implementing the
+// MatcherInterface<T> interface if necessary.
+//
+// See googletest/include/gtest/gtest-matchers.h for the definition of class
+// Matcher, class MatcherInterface, and others.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
+
+#include <algorithm>
+#include <cmath>
+#include <initializer_list>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gmock/internal/gmock-pp.h"
+#include "gtest/gtest.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GMOCK_MAYBE_5046_ 5046
+#else
+#define GMOCK_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4251 GMOCK_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+                              clients of class B */
+    /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+//   1. a class FooMatcherImpl that implements the
+//      MatcherInterface<T> interface, and
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherImpl*.
+//
+// The two-level delegation design makes it possible to allow a user
+// to write "v" instead of "Eq(v)" where a Matcher is expected, which
+// is impossible if we pass matchers by pointers.  It also eases
+// ownership management as Matcher objects can now be copied like
+// plain values.
+
+// A match result listener that stores the explanation in a string.
+class StringMatchResultListener : public MatchResultListener {
+ public:
+  StringMatchResultListener() : MatchResultListener(&ss_) {}
+
+  // Returns the explanation accumulated so far.
+  std::string str() const { return ss_.str(); }
+
+  // Clears the explanation accumulated so far.
+  void Clear() { ss_.str(""); }
+
+ private:
+  ::std::stringstream ss_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StringMatchResultListener);
+};
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// The MatcherCastImpl class template is a helper for implementing
+// MatcherCast().  We need this helper in order to partially
+// specialize the implementation of MatcherCast() (C++ allows
+// class/struct templates to be partially specialized, but not
+// function templates.).
+
+// This general version is used when MatcherCast()'s argument is a
+// polymorphic matcher (i.e. something that can be converted to a
+// Matcher but is not one yet; for example, Eq(value)) or a value (for
+// example, "hello").
+template <typename T, typename M>
+class MatcherCastImpl {
+ public:
+  static Matcher<T> Cast(const M& polymorphic_matcher_or_value) {
+    // M can be a polymorphic matcher, in which case we want to use
+    // its conversion operator to create Matcher<T>.  Or it can be a value
+    // that should be passed to the Matcher<T>'s constructor.
+    //
+    // We can't call Matcher<T>(polymorphic_matcher_or_value) when M is a
+    // polymorphic matcher because it'll be ambiguous if T has an implicit
+    // constructor from M (this usually happens when T has an implicit
+    // constructor from any type).
+    //
+    // It won't work to unconditionally implicit_cast
+    // polymorphic_matcher_or_value to Matcher<T> because it won't trigger
+    // a user-defined conversion from M to T if one exists (assuming M is
+    // a value).
+    return CastImpl(polymorphic_matcher_or_value,
+                    std::is_convertible<M, Matcher<T>>{},
+                    std::is_convertible<M, T>{});
+  }
+
+ private:
+  template <bool Ignore>
+  static Matcher<T> CastImpl(const M& polymorphic_matcher_or_value,
+                             std::true_type /* convertible_to_matcher */,
+                             std::integral_constant<bool, Ignore>) {
+    // M is implicitly convertible to Matcher<T>, which means that either
+    // M is a polymorphic matcher or Matcher<T> has an implicit constructor
+    // from M.  In both cases using the implicit conversion will produce a
+    // matcher.
+    //
+    // Even if T has an implicit constructor from M, it won't be called because
+    // creating Matcher<T> would require a chain of two user-defined conversions
+    // (first to create T from M and then to create Matcher<T> from T).
+    return polymorphic_matcher_or_value;
+  }
+
+  // M can't be implicitly converted to Matcher<T>, so M isn't a polymorphic
+  // matcher. It's a value of a type implicitly convertible to T. Use direct
+  // initialization to create a matcher.
+  static Matcher<T> CastImpl(const M& value,
+                             std::false_type /* convertible_to_matcher */,
+                             std::true_type /* convertible_to_T */) {
+    return Matcher<T>(ImplicitCast_<T>(value));
+  }
+
+  // M can't be implicitly converted to either Matcher<T> or T. Attempt to use
+  // polymorphic matcher Eq(value) in this case.
+  //
+  // Note that we first attempt to perform an implicit cast on the value and
+  // only fall back to the polymorphic Eq() matcher afterwards because the
+  // latter calls bool operator==(const Lhs& lhs, const Rhs& rhs) in the end
+  // which might be undefined even when Rhs is implicitly convertible to Lhs
+  // (e.g. std::pair<const int, int> vs. std::pair<int, int>).
+  //
+  // We don't define this method inline as we need the declaration of Eq().
+  static Matcher<T> CastImpl(const M& value,
+                             std::false_type /* convertible_to_matcher */,
+                             std::false_type /* convertible_to_T */);
+};
+
+// This more specialized version is used when MatcherCast()'s argument
+// is already a Matcher.  This only compiles when type T can be
+// statically converted to type U.
+template <typename T, typename U>
+class MatcherCastImpl<T, Matcher<U> > {
+ public:
+  static Matcher<T> Cast(const Matcher<U>& source_matcher) {
+    return Matcher<T>(new Impl(source_matcher));
+  }
+
+ private:
+  class Impl : public MatcherInterface<T> {
+   public:
+    explicit Impl(const Matcher<U>& source_matcher)
+        : source_matcher_(source_matcher) {}
+
+    // We delegate the matching logic to the source matcher.
+    bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+      using FromType = typename std::remove_cv<typename std::remove_pointer<
+          typename std::remove_reference<T>::type>::type>::type;
+      using ToType = typename std::remove_cv<typename std::remove_pointer<
+          typename std::remove_reference<U>::type>::type>::type;
+      // Do not allow implicitly converting base*/& to derived*/&.
+      static_assert(
+          // Do not trigger if only one of them is a pointer. That implies a
+          // regular conversion and not a down_cast.
+          (std::is_pointer<typename std::remove_reference<T>::type>::value !=
+           std::is_pointer<typename std::remove_reference<U>::type>::value) ||
+              std::is_same<FromType, ToType>::value ||
+              !std::is_base_of<FromType, ToType>::value,
+          "Can't implicitly convert from <base> to <derived>");
+
+      // Do the cast to `U` explicitly if necessary.
+      // Otherwise, let implicit conversions do the trick.
+      using CastType =
+          typename std::conditional<std::is_convertible<T&, const U&>::value,
+                                    T&, U>::type;
+
+      return source_matcher_.MatchAndExplain(static_cast<CastType>(x),
+                                             listener);
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+      source_matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      source_matcher_.DescribeNegationTo(os);
+    }
+
+   private:
+    const Matcher<U> source_matcher_;
+  };
+};
+
+// This even more specialized version is used for efficiently casting
+// a matcher to its own type.
+template <typename T>
+class MatcherCastImpl<T, Matcher<T> > {
+ public:
+  static Matcher<T> Cast(const Matcher<T>& matcher) { return matcher; }
+};
+
+// Template specialization for parameterless Matcher.
+template <typename Derived>
+class MatcherBaseImpl {
+ public:
+  MatcherBaseImpl() = default;
+
+  template <typename T>
+  operator ::testing::Matcher<T>() const {  // NOLINT(runtime/explicit)
+    return ::testing::Matcher<T>(new
+                                 typename Derived::template gmock_Impl<T>());
+  }
+};
+
+// Template specialization for Matcher with parameters.
+template <template <typename...> class Derived, typename... Ts>
+class MatcherBaseImpl<Derived<Ts...>> {
+ public:
+  // Mark the constructor explicit for single argument T to avoid implicit
+  // conversions.
+  template <typename E = std::enable_if<sizeof...(Ts) == 1>,
+            typename E::type* = nullptr>
+  explicit MatcherBaseImpl(Ts... params)
+      : params_(std::forward<Ts>(params)...) {}
+  template <typename E = std::enable_if<sizeof...(Ts) != 1>,
+            typename = typename E::type>
+  MatcherBaseImpl(Ts... params)  // NOLINT
+      : params_(std::forward<Ts>(params)...) {}
+
+  template <typename F>
+  operator ::testing::Matcher<F>() const {  // NOLINT(runtime/explicit)
+    return Apply<F>(MakeIndexSequence<sizeof...(Ts)>{});
+  }
+
+ private:
+  template <typename F, std::size_t... tuple_ids>
+  ::testing::Matcher<F> Apply(IndexSequence<tuple_ids...>) const {
+    return ::testing::Matcher<F>(
+        new typename Derived<Ts...>::template gmock_Impl<F>(
+            std::get<tuple_ids>(params_)...));
+  }
+
+  const std::tuple<Ts...> params_;
+};
+
+}  // namespace internal
+
+// In order to be safe and clear, casting between different matcher
+// types is done explicitly via MatcherCast<T>(m), which takes a
+// matcher m and returns a Matcher<T>.  It compiles only when T can be
+// statically converted to the argument type of m.
+template <typename T, typename M>
+inline Matcher<T> MatcherCast(const M& matcher) {
+  return internal::MatcherCastImpl<T, M>::Cast(matcher);
+}
+
+// This overload handles polymorphic matchers and values only since
+// monomorphic matchers are handled by the next one.
+template <typename T, typename M>
+inline Matcher<T> SafeMatcherCast(const M& polymorphic_matcher_or_value) {
+  return MatcherCast<T>(polymorphic_matcher_or_value);
+}
+
+// This overload handles monomorphic matchers.
+//
+// In general, if type T can be implicitly converted to type U, we can
+// safely convert a Matcher<U> to a Matcher<T> (i.e. Matcher is
+// contravariant): just keep a copy of the original Matcher<U>, convert the
+// argument from type T to U, and then pass it to the underlying Matcher<U>.
+// The only exception is when U is a reference and T is not, as the
+// underlying Matcher<U> may be interested in the argument's address, which
+// is not preserved in the conversion from T to U.
+template <typename T, typename U>
+inline Matcher<T> SafeMatcherCast(const Matcher<U>& matcher) {
+  // Enforce that T can be implicitly converted to U.
+  static_assert(std::is_convertible<const T&, const U&>::value,
+                "T must be implicitly convertible to U");
+  // Enforce that we are not converting a non-reference type T to a reference
+  // type U.
+  GTEST_COMPILE_ASSERT_(
+      std::is_reference<T>::value || !std::is_reference<U>::value,
+      cannot_convert_non_reference_arg_to_reference);
+  // In case both T and U are arithmetic types, enforce that the
+  // conversion is not lossy.
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(T) RawT;
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(U) RawU;
+  constexpr bool kTIsOther = GMOCK_KIND_OF_(RawT) == internal::kOther;
+  constexpr bool kUIsOther = GMOCK_KIND_OF_(RawU) == internal::kOther;
+  GTEST_COMPILE_ASSERT_(
+      kTIsOther || kUIsOther ||
+      (internal::LosslessArithmeticConvertible<RawT, RawU>::value),
+      conversion_of_arithmetic_types_must_be_lossless);
+  return MatcherCast<T>(matcher);
+}
+
+// A<T>() returns a matcher that matches any value of type T.
+template <typename T>
+Matcher<T> A();
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// If the explanation is not empty, prints it to the ostream.
+inline void PrintIfNotEmpty(const std::string& explanation,
+                            ::std::ostream* os) {
+  if (explanation != "" && os != nullptr) {
+    *os << ", " << explanation;
+  }
+}
+
+// Returns true if the given type name is easy to read by a human.
+// This is used to decide whether printing the type of a value might
+// be helpful.
+inline bool IsReadableTypeName(const std::string& type_name) {
+  // We consider a type name readable if it's short or doesn't contain
+  // a template or function type.
+  return (type_name.length() <= 20 ||
+          type_name.find_first_of("<(") == std::string::npos);
+}
+
+// Matches the value against the given matcher, prints the value and explains
+// the match result to the listener. Returns the match result.
+// 'listener' must not be NULL.
+// Value cannot be passed by const reference, because some matchers take a
+// non-const argument.
+template <typename Value, typename T>
+bool MatchPrintAndExplain(Value& value, const Matcher<T>& matcher,
+                          MatchResultListener* listener) {
+  if (!listener->IsInterested()) {
+    // If the listener is not interested, we do not need to construct the
+    // inner explanation.
+    return matcher.Matches(value);
+  }
+
+  StringMatchResultListener inner_listener;
+  const bool match = matcher.MatchAndExplain(value, &inner_listener);
+
+  UniversalPrint(value, listener->stream());
+#if GTEST_HAS_RTTI
+  const std::string& type_name = GetTypeName<Value>();
+  if (IsReadableTypeName(type_name))
+    *listener->stream() << " (of type " << type_name << ")";
+#endif
+  PrintIfNotEmpty(inner_listener.str(), listener->stream());
+
+  return match;
+}
+
+// An internal helper class for doing compile-time loop on a tuple's
+// fields.
+template <size_t N>
+class TuplePrefix {
+ public:
+  // TuplePrefix<N>::Matches(matcher_tuple, value_tuple) returns true
+  // if and only if the first N fields of matcher_tuple matches
+  // the first N fields of value_tuple, respectively.
+  template <typename MatcherTuple, typename ValueTuple>
+  static bool Matches(const MatcherTuple& matcher_tuple,
+                      const ValueTuple& value_tuple) {
+    return TuplePrefix<N - 1>::Matches(matcher_tuple, value_tuple) &&
+           std::get<N - 1>(matcher_tuple).Matches(std::get<N - 1>(value_tuple));
+  }
+
+  // TuplePrefix<N>::ExplainMatchFailuresTo(matchers, values, os)
+  // describes failures in matching the first N fields of matchers
+  // against the first N fields of values.  If there is no failure,
+  // nothing will be streamed to os.
+  template <typename MatcherTuple, typename ValueTuple>
+  static void ExplainMatchFailuresTo(const MatcherTuple& matchers,
+                                     const ValueTuple& values,
+                                     ::std::ostream* os) {
+    // First, describes failures in the first N - 1 fields.
+    TuplePrefix<N - 1>::ExplainMatchFailuresTo(matchers, values, os);
+
+    // Then describes the failure (if any) in the (N - 1)-th (0-based)
+    // field.
+    typename std::tuple_element<N - 1, MatcherTuple>::type matcher =
+        std::get<N - 1>(matchers);
+    typedef typename std::tuple_element<N - 1, ValueTuple>::type Value;
+    const Value& value = std::get<N - 1>(values);
+    StringMatchResultListener listener;
+    if (!matcher.MatchAndExplain(value, &listener)) {
+      *os << "  Expected arg #" << N - 1 << ": ";
+      std::get<N - 1>(matchers).DescribeTo(os);
+      *os << "\n           Actual: ";
+      // We remove the reference in type Value to prevent the
+      // universal printer from printing the address of value, which
+      // isn't interesting to the user most of the time.  The
+      // matcher's MatchAndExplain() method handles the case when
+      // the address is interesting.
+      internal::UniversalPrint(value, os);
+      PrintIfNotEmpty(listener.str(), os);
+      *os << "\n";
+    }
+  }
+};
+
+// The base case.
+template <>
+class TuplePrefix<0> {
+ public:
+  template <typename MatcherTuple, typename ValueTuple>
+  static bool Matches(const MatcherTuple& /* matcher_tuple */,
+                      const ValueTuple& /* value_tuple */) {
+    return true;
+  }
+
+  template <typename MatcherTuple, typename ValueTuple>
+  static void ExplainMatchFailuresTo(const MatcherTuple& /* matchers */,
+                                     const ValueTuple& /* values */,
+                                     ::std::ostream* /* os */) {}
+};
+
+// TupleMatches(matcher_tuple, value_tuple) returns true if and only if
+// all matchers in matcher_tuple match the corresponding fields in
+// value_tuple.  It is a compiler error if matcher_tuple and
+// value_tuple have different number of fields or incompatible field
+// types.
+template <typename MatcherTuple, typename ValueTuple>
+bool TupleMatches(const MatcherTuple& matcher_tuple,
+                  const ValueTuple& value_tuple) {
+  // Makes sure that matcher_tuple and value_tuple have the same
+  // number of fields.
+  GTEST_COMPILE_ASSERT_(std::tuple_size<MatcherTuple>::value ==
+                            std::tuple_size<ValueTuple>::value,
+                        matcher_and_value_have_different_numbers_of_fields);
+  return TuplePrefix<std::tuple_size<ValueTuple>::value>::Matches(matcher_tuple,
+                                                                  value_tuple);
+}
+
+// Describes failures in matching matchers against values.  If there
+// is no failure, nothing will be streamed to os.
+template <typename MatcherTuple, typename ValueTuple>
+void ExplainMatchFailureTupleTo(const MatcherTuple& matchers,
+                                const ValueTuple& values,
+                                ::std::ostream* os) {
+  TuplePrefix<std::tuple_size<MatcherTuple>::value>::ExplainMatchFailuresTo(
+      matchers, values, os);
+}
+
+// TransformTupleValues and its helper.
+//
+// TransformTupleValuesHelper hides the internal machinery that
+// TransformTupleValues uses to implement a tuple traversal.
+template <typename Tuple, typename Func, typename OutIter>
+class TransformTupleValuesHelper {
+ private:
+  typedef ::std::tuple_size<Tuple> TupleSize;
+
+ public:
+  // For each member of tuple 't', taken in order, evaluates '*out++ = f(t)'.
+  // Returns the final value of 'out' in case the caller needs it.
+  static OutIter Run(Func f, const Tuple& t, OutIter out) {
+    return IterateOverTuple<Tuple, TupleSize::value>()(f, t, out);
+  }
+
+ private:
+  template <typename Tup, size_t kRemainingSize>
+  struct IterateOverTuple {
+    OutIter operator() (Func f, const Tup& t, OutIter out) const {
+      *out++ = f(::std::get<TupleSize::value - kRemainingSize>(t));
+      return IterateOverTuple<Tup, kRemainingSize - 1>()(f, t, out);
+    }
+  };
+  template <typename Tup>
+  struct IterateOverTuple<Tup, 0> {
+    OutIter operator() (Func /* f */, const Tup& /* t */, OutIter out) const {
+      return out;
+    }
+  };
+};
+
+// Successively invokes 'f(element)' on each element of the tuple 't',
+// appending each result to the 'out' iterator. Returns the final value
+// of 'out'.
+template <typename Tuple, typename Func, typename OutIter>
+OutIter TransformTupleValues(Func f, const Tuple& t, OutIter out) {
+  return TransformTupleValuesHelper<Tuple, Func, OutIter>::Run(f, t, out);
+}
+
+// Implements _, a matcher that matches any value of any
+// type.  This is a polymorphic matcher, so we need a template type
+// conversion operator to make it appearing as a Matcher<T> for any
+// type T.
+class AnythingMatcher {
+ public:
+  using is_gtest_matcher = void;
+
+  template <typename T>
+  bool MatchAndExplain(const T& /* x */, std::ostream* /* listener */) const {
+    return true;
+  }
+  void DescribeTo(std::ostream* os) const { *os << "is anything"; }
+  void DescribeNegationTo(::std::ostream* os) const {
+    // This is mostly for completeness' sake, as it's not very useful
+    // to write Not(A<bool>()).  However we cannot completely rule out
+    // such a possibility, and it doesn't hurt to be prepared.
+    *os << "never matches";
+  }
+};
+
+// Implements the polymorphic IsNull() matcher, which matches any raw or smart
+// pointer that is NULL.
+class IsNullMatcher {
+ public:
+  template <typename Pointer>
+  bool MatchAndExplain(const Pointer& p,
+                       MatchResultListener* /* listener */) const {
+    return p == nullptr;
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << "is NULL"; }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "isn't NULL";
+  }
+};
+
+// Implements the polymorphic NotNull() matcher, which matches any raw or smart
+// pointer that is not NULL.
+class NotNullMatcher {
+ public:
+  template <typename Pointer>
+  bool MatchAndExplain(const Pointer& p,
+                       MatchResultListener* /* listener */) const {
+    return p != nullptr;
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << "isn't NULL"; }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is NULL";
+  }
+};
+
+// Ref(variable) matches any argument that is a reference to
+// 'variable'.  This matcher is polymorphic as it can match any
+// super type of the type of 'variable'.
+//
+// The RefMatcher template class implements Ref(variable).  It can
+// only be instantiated with a reference type.  This prevents a user
+// from mistakenly using Ref(x) to match a non-reference function
+// argument.  For example, the following will righteously cause a
+// compiler error:
+//
+//   int n;
+//   Matcher<int> m1 = Ref(n);   // This won't compile.
+//   Matcher<int&> m2 = Ref(n);  // This will compile.
+template <typename T>
+class RefMatcher;
+
+template <typename T>
+class RefMatcher<T&> {
+  // Google Mock is a generic framework and thus needs to support
+  // mocking any function types, including those that take non-const
+  // reference arguments.  Therefore the template parameter T (and
+  // Super below) can be instantiated to either a const type or a
+  // non-const type.
+ public:
+  // RefMatcher() takes a T& instead of const T&, as we want the
+  // compiler to catch using Ref(const_value) as a matcher for a
+  // non-const reference.
+  explicit RefMatcher(T& x) : object_(x) {}  // NOLINT
+
+  template <typename Super>
+  operator Matcher<Super&>() const {
+    // By passing object_ (type T&) to Impl(), which expects a Super&,
+    // we make sure that Super is a super type of T.  In particular,
+    // this catches using Ref(const_value) as a matcher for a
+    // non-const reference, as you cannot implicitly convert a const
+    // reference to a non-const reference.
+    return MakeMatcher(new Impl<Super>(object_));
+  }
+
+ private:
+  template <typename Super>
+  class Impl : public MatcherInterface<Super&> {
+   public:
+    explicit Impl(Super& x) : object_(x) {}  // NOLINT
+
+    // MatchAndExplain() takes a Super& (as opposed to const Super&)
+    // in order to match the interface MatcherInterface<Super&>.
+    bool MatchAndExplain(Super& x,
+                         MatchResultListener* listener) const override {
+      *listener << "which is located @" << static_cast<const void*>(&x);
+      return &x == &object_;
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "references the variable ";
+      UniversalPrinter<Super&>::Print(object_, os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "does not reference the variable ";
+      UniversalPrinter<Super&>::Print(object_, os);
+    }
+
+   private:
+    const Super& object_;
+  };
+
+  T& object_;
+};
+
+// Polymorphic helper functions for narrow and wide string matchers.
+inline bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
+  return String::CaseInsensitiveCStringEquals(lhs, rhs);
+}
+
+inline bool CaseInsensitiveCStringEquals(const wchar_t* lhs,
+                                         const wchar_t* rhs) {
+  return String::CaseInsensitiveWideCStringEquals(lhs, rhs);
+}
+
+// String comparison for narrow or wide strings that can have embedded NUL
+// characters.
+template <typename StringType>
+bool CaseInsensitiveStringEquals(const StringType& s1,
+                                 const StringType& s2) {
+  // Are the heads equal?
+  if (!CaseInsensitiveCStringEquals(s1.c_str(), s2.c_str())) {
+    return false;
+  }
+
+  // Skip the equal heads.
+  const typename StringType::value_type nul = 0;
+  const size_t i1 = s1.find(nul), i2 = s2.find(nul);
+
+  // Are we at the end of either s1 or s2?
+  if (i1 == StringType::npos || i2 == StringType::npos) {
+    return i1 == i2;
+  }
+
+  // Are the tails equal?
+  return CaseInsensitiveStringEquals(s1.substr(i1 + 1), s2.substr(i2 + 1));
+}
+
+// String matchers.
+
+// Implements equality-based string matchers like StrEq, StrCaseNe, and etc.
+template <typename StringType>
+class StrEqualityMatcher {
+ public:
+  StrEqualityMatcher(StringType str, bool expect_eq, bool case_sensitive)
+      : string_(std::move(str)),
+        expect_eq_(expect_eq),
+        case_sensitive_(case_sensitive) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    if (s == nullptr) {
+      return !expect_eq_;
+    }
+    return MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType s2(s);
+    const bool eq = case_sensitive_ ? s2 == string_ :
+        CaseInsensitiveStringEquals(s2, string_);
+    return expect_eq_ == eq;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    DescribeToHelper(expect_eq_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    DescribeToHelper(!expect_eq_, os);
+  }
+
+ private:
+  void DescribeToHelper(bool expect_eq, ::std::ostream* os) const {
+    *os << (expect_eq ? "is " : "isn't ");
+    *os << "equal to ";
+    if (!case_sensitive_) {
+      *os << "(ignoring case) ";
+    }
+    UniversalPrint(string_, os);
+  }
+
+  const StringType string_;
+  const bool expect_eq_;
+  const bool case_sensitive_;
+};
+
+// Implements the polymorphic HasSubstr(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class HasSubstrMatcher {
+ public:
+  explicit HasSubstrMatcher(const StringType& substring)
+      : substring_(substring) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    return StringType(s).find(substring_) != StringType::npos;
+  }
+
+  // Describes what this matcher matches.
+  void DescribeTo(::std::ostream* os) const {
+    *os << "has substring ";
+    UniversalPrint(substring_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "has no substring ";
+    UniversalPrint(substring_, os);
+  }
+
+ private:
+  const StringType substring_;
+};
+
+// Implements the polymorphic StartsWith(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class StartsWithMatcher {
+ public:
+  explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) {
+  }
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType& s2(s);
+    return s2.length() >= prefix_.length() &&
+        s2.substr(0, prefix_.length()) == prefix_;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "starts with ";
+    UniversalPrint(prefix_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't start with ";
+    UniversalPrint(prefix_, os);
+  }
+
+ private:
+  const StringType prefix_;
+};
+
+// Implements the polymorphic EndsWith(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class EndsWithMatcher {
+ public:
+  explicit EndsWithMatcher(const StringType& suffix) : suffix_(suffix) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType& s2(s);
+    return s2.length() >= suffix_.length() &&
+        s2.substr(s2.length() - suffix_.length()) == suffix_;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "ends with ";
+    UniversalPrint(suffix_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't end with ";
+    UniversalPrint(suffix_, os);
+  }
+
+ private:
+  const StringType suffix_;
+};
+
+// Implements a matcher that compares the two fields of a 2-tuple
+// using one of the ==, <=, <, etc, operators.  The two fields being
+// compared don't have to have the same type.
+//
+// The matcher defined here is polymorphic (for example, Eq() can be
+// used to match a std::tuple<int, short>, a std::tuple<const long&, double>,
+// etc).  Therefore we use a template type conversion operator in the
+// implementation.
+template <typename D, typename Op>
+class PairMatchBase {
+ public:
+  template <typename T1, typename T2>
+  operator Matcher<::std::tuple<T1, T2>>() const {
+    return Matcher<::std::tuple<T1, T2>>(new Impl<const ::std::tuple<T1, T2>&>);
+  }
+  template <typename T1, typename T2>
+  operator Matcher<const ::std::tuple<T1, T2>&>() const {
+    return MakeMatcher(new Impl<const ::std::tuple<T1, T2>&>);
+  }
+
+ private:
+  static ::std::ostream& GetDesc(::std::ostream& os) {  // NOLINT
+    return os << D::Desc();
+  }
+
+  template <typename Tuple>
+  class Impl : public MatcherInterface<Tuple> {
+   public:
+    bool MatchAndExplain(Tuple args,
+                         MatchResultListener* /* listener */) const override {
+      return Op()(::std::get<0>(args), ::std::get<1>(args));
+    }
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "are " << GetDesc;
+    }
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "aren't " << GetDesc;
+    }
+  };
+};
+
+class Eq2Matcher : public PairMatchBase<Eq2Matcher, AnyEq> {
+ public:
+  static const char* Desc() { return "an equal pair"; }
+};
+class Ne2Matcher : public PairMatchBase<Ne2Matcher, AnyNe> {
+ public:
+  static const char* Desc() { return "an unequal pair"; }
+};
+class Lt2Matcher : public PairMatchBase<Lt2Matcher, AnyLt> {
+ public:
+  static const char* Desc() { return "a pair where the first < the second"; }
+};
+class Gt2Matcher : public PairMatchBase<Gt2Matcher, AnyGt> {
+ public:
+  static const char* Desc() { return "a pair where the first > the second"; }
+};
+class Le2Matcher : public PairMatchBase<Le2Matcher, AnyLe> {
+ public:
+  static const char* Desc() { return "a pair where the first <= the second"; }
+};
+class Ge2Matcher : public PairMatchBase<Ge2Matcher, AnyGe> {
+ public:
+  static const char* Desc() { return "a pair where the first >= the second"; }
+};
+
+// Implements the Not(...) matcher for a particular argument type T.
+// We do not nest it inside the NotMatcher class template, as that
+// will prevent different instantiations of NotMatcher from sharing
+// the same NotMatcherImpl<T> class.
+template <typename T>
+class NotMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  explicit NotMatcherImpl(const Matcher<T>& matcher)
+      : matcher_(matcher) {}
+
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    return !matcher_.MatchAndExplain(x, listener);
+  }
+
+  void DescribeTo(::std::ostream* os) const override {
+    matcher_.DescribeNegationTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    matcher_.DescribeTo(os);
+  }
+
+ private:
+  const Matcher<T> matcher_;
+};
+
+// Implements the Not(m) matcher, which matches a value that doesn't
+// match matcher m.
+template <typename InnerMatcher>
+class NotMatcher {
+ public:
+  explicit NotMatcher(InnerMatcher matcher) : matcher_(matcher) {}
+
+  // This template type conversion operator allows Not(m) to be used
+  // to match any type m can match.
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new NotMatcherImpl<T>(SafeMatcherCast<T>(matcher_)));
+  }
+
+ private:
+  InnerMatcher matcher_;
+};
+
+// Implements the AllOf(m1, m2) matcher for a particular argument type
+// T. We do not nest it inside the BothOfMatcher class template, as
+// that will prevent different instantiations of BothOfMatcher from
+// sharing the same BothOfMatcherImpl<T> class.
+template <typename T>
+class AllOfMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  explicit AllOfMatcherImpl(std::vector<Matcher<T> > matchers)
+      : matchers_(std::move(matchers)) {}
+
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "(";
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      if (i != 0) *os << ") and (";
+      matchers_[i].DescribeTo(os);
+    }
+    *os << ")";
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "(";
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      if (i != 0) *os << ") or (";
+      matchers_[i].DescribeNegationTo(os);
+    }
+    *os << ")";
+  }
+
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    // If either matcher1_ or matcher2_ doesn't match x, we only need
+    // to explain why one of them fails.
+    std::string all_match_result;
+
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      StringMatchResultListener slistener;
+      if (matchers_[i].MatchAndExplain(x, &slistener)) {
+        if (all_match_result.empty()) {
+          all_match_result = slistener.str();
+        } else {
+          std::string result = slistener.str();
+          if (!result.empty()) {
+            all_match_result += ", and ";
+            all_match_result += result;
+          }
+        }
+      } else {
+        *listener << slistener.str();
+        return false;
+      }
+    }
+
+    // Otherwise we need to explain why *both* of them match.
+    *listener << all_match_result;
+    return true;
+  }
+
+ private:
+  const std::vector<Matcher<T> > matchers_;
+};
+
+// VariadicMatcher is used for the variadic implementation of
+// AllOf(m_1, m_2, ...) and AnyOf(m_1, m_2, ...).
+// CombiningMatcher<T> is used to recursively combine the provided matchers
+// (of type Args...).
+template <template <typename T> class CombiningMatcher, typename... Args>
+class VariadicMatcher {
+ public:
+  VariadicMatcher(const Args&... matchers)  // NOLINT
+      : matchers_(matchers...) {
+    static_assert(sizeof...(Args) > 0, "Must have at least one matcher.");
+  }
+
+  VariadicMatcher(const VariadicMatcher&) = default;
+  VariadicMatcher& operator=(const VariadicMatcher&) = delete;
+
+  // This template type conversion operator allows an
+  // VariadicMatcher<Matcher1, Matcher2...> object to match any type that
+  // all of the provided matchers (Matcher1, Matcher2, ...) can match.
+  template <typename T>
+  operator Matcher<T>() const {
+    std::vector<Matcher<T> > values;
+    CreateVariadicMatcher<T>(&values, std::integral_constant<size_t, 0>());
+    return Matcher<T>(new CombiningMatcher<T>(std::move(values)));
+  }
+
+ private:
+  template <typename T, size_t I>
+  void CreateVariadicMatcher(std::vector<Matcher<T> >* values,
+                             std::integral_constant<size_t, I>) const {
+    values->push_back(SafeMatcherCast<T>(std::get<I>(matchers_)));
+    CreateVariadicMatcher<T>(values, std::integral_constant<size_t, I + 1>());
+  }
+
+  template <typename T>
+  void CreateVariadicMatcher(
+      std::vector<Matcher<T> >*,
+      std::integral_constant<size_t, sizeof...(Args)>) const {}
+
+  std::tuple<Args...> matchers_;
+};
+
+template <typename... Args>
+using AllOfMatcher = VariadicMatcher<AllOfMatcherImpl, Args...>;
+
+// Implements the AnyOf(m1, m2) matcher for a particular argument type
+// T.  We do not nest it inside the AnyOfMatcher class template, as
+// that will prevent different instantiations of AnyOfMatcher from
+// sharing the same EitherOfMatcherImpl<T> class.
+template <typename T>
+class AnyOfMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  explicit AnyOfMatcherImpl(std::vector<Matcher<T> > matchers)
+      : matchers_(std::move(matchers)) {}
+
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "(";
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      if (i != 0) *os << ") or (";
+      matchers_[i].DescribeTo(os);
+    }
+    *os << ")";
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "(";
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      if (i != 0) *os << ") and (";
+      matchers_[i].DescribeNegationTo(os);
+    }
+    *os << ")";
+  }
+
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    std::string no_match_result;
+
+    // If either matcher1_ or matcher2_ matches x, we just need to
+    // explain why *one* of them matches.
+    for (size_t i = 0; i < matchers_.size(); ++i) {
+      StringMatchResultListener slistener;
+      if (matchers_[i].MatchAndExplain(x, &slistener)) {
+        *listener << slistener.str();
+        return true;
+      } else {
+        if (no_match_result.empty()) {
+          no_match_result = slistener.str();
+        } else {
+          std::string result = slistener.str();
+          if (!result.empty()) {
+            no_match_result += ", and ";
+            no_match_result += result;
+          }
+        }
+      }
+    }
+
+    // Otherwise we need to explain why *both* of them fail.
+    *listener << no_match_result;
+    return false;
+  }
+
+ private:
+  const std::vector<Matcher<T> > matchers_;
+};
+
+// AnyOfMatcher is used for the variadic implementation of AnyOf(m_1, m_2, ...).
+template <typename... Args>
+using AnyOfMatcher = VariadicMatcher<AnyOfMatcherImpl, Args...>;
+
+// Wrapper for implementation of Any/AllOfArray().
+template <template <class> class MatcherImpl, typename T>
+class SomeOfArrayMatcher {
+ public:
+  // Constructs the matcher from a sequence of element values or
+  // element matchers.
+  template <typename Iter>
+  SomeOfArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
+
+  template <typename U>
+  operator Matcher<U>() const {  // NOLINT
+    using RawU = typename std::decay<U>::type;
+    std::vector<Matcher<RawU>> matchers;
+    for (const auto& matcher : matchers_) {
+      matchers.push_back(MatcherCast<RawU>(matcher));
+    }
+    return Matcher<U>(new MatcherImpl<RawU>(std::move(matchers)));
+  }
+
+ private:
+  const ::std::vector<T> matchers_;
+};
+
+template <typename T>
+using AllOfArrayMatcher = SomeOfArrayMatcher<AllOfMatcherImpl, T>;
+
+template <typename T>
+using AnyOfArrayMatcher = SomeOfArrayMatcher<AnyOfMatcherImpl, T>;
+
+// Used for implementing Truly(pred), which turns a predicate into a
+// matcher.
+template <typename Predicate>
+class TrulyMatcher {
+ public:
+  explicit TrulyMatcher(Predicate pred) : predicate_(pred) {}
+
+  // This method template allows Truly(pred) to be used as a matcher
+  // for type T where T is the argument type of predicate 'pred'.  The
+  // argument is passed by reference as the predicate may be
+  // interested in the address of the argument.
+  template <typename T>
+  bool MatchAndExplain(T& x,  // NOLINT
+                       MatchResultListener* listener) const {
+    // Without the if-statement, MSVC sometimes warns about converting
+    // a value to bool (warning 4800).
+    //
+    // We cannot write 'return !!predicate_(x);' as that doesn't work
+    // when predicate_(x) returns a class convertible to bool but
+    // having no operator!().
+    if (predicate_(x))
+      return true;
+    *listener << "didn't satisfy the given predicate";
+    return false;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "satisfies the given predicate";
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't satisfy the given predicate";
+  }
+
+ private:
+  Predicate predicate_;
+};
+
+// Used for implementing Matches(matcher), which turns a matcher into
+// a predicate.
+template <typename M>
+class MatcherAsPredicate {
+ public:
+  explicit MatcherAsPredicate(M matcher) : matcher_(matcher) {}
+
+  // This template operator() allows Matches(m) to be used as a
+  // predicate on type T where m is a matcher on type T.
+  //
+  // The argument x is passed by reference instead of by value, as
+  // some matcher may be interested in its address (e.g. as in
+  // Matches(Ref(n))(x)).
+  template <typename T>
+  bool operator()(const T& x) const {
+    // We let matcher_ commit to a particular type here instead of
+    // when the MatcherAsPredicate object was constructed.  This
+    // allows us to write Matches(m) where m is a polymorphic matcher
+    // (e.g. Eq(5)).
+    //
+    // If we write Matcher<T>(matcher_).Matches(x) here, it won't
+    // compile when matcher_ has type Matcher<const T&>; if we write
+    // Matcher<const T&>(matcher_).Matches(x) here, it won't compile
+    // when matcher_ has type Matcher<T>; if we just write
+    // matcher_.Matches(x), it won't compile when matcher_ is
+    // polymorphic, e.g. Eq(5).
+    //
+    // MatcherCast<const T&>() is necessary for making the code work
+    // in all of the above situations.
+    return MatcherCast<const T&>(matcher_).Matches(x);
+  }
+
+ private:
+  M matcher_;
+};
+
+// For implementing ASSERT_THAT() and EXPECT_THAT().  The template
+// argument M must be a type that can be converted to a matcher.
+template <typename M>
+class PredicateFormatterFromMatcher {
+ public:
+  explicit PredicateFormatterFromMatcher(M m) : matcher_(std::move(m)) {}
+
+  // This template () operator allows a PredicateFormatterFromMatcher
+  // object to act as a predicate-formatter suitable for using with
+  // Google Test's EXPECT_PRED_FORMAT1() macro.
+  template <typename T>
+  AssertionResult operator()(const char* value_text, const T& x) const {
+    // We convert matcher_ to a Matcher<const T&> *now* instead of
+    // when the PredicateFormatterFromMatcher object was constructed,
+    // as matcher_ may be polymorphic (e.g. NotNull()) and we won't
+    // know which type to instantiate it to until we actually see the
+    // type of x here.
+    //
+    // We write SafeMatcherCast<const T&>(matcher_) instead of
+    // Matcher<const T&>(matcher_), as the latter won't compile when
+    // matcher_ has type Matcher<T> (e.g. An<int>()).
+    // We don't write MatcherCast<const T&> either, as that allows
+    // potentially unsafe downcasting of the matcher argument.
+    const Matcher<const T&> matcher = SafeMatcherCast<const T&>(matcher_);
+
+    // The expected path here is that the matcher should match (i.e. that most
+    // tests pass) so optimize for this case.
+    if (matcher.Matches(x)) {
+      return AssertionSuccess();
+    }
+
+    ::std::stringstream ss;
+    ss << "Value of: " << value_text << "\n"
+       << "Expected: ";
+    matcher.DescribeTo(&ss);
+
+    // Rerun the matcher to "PrintAndExplain" the failure.
+    StringMatchResultListener listener;
+    if (MatchPrintAndExplain(x, matcher, &listener)) {
+      ss << "\n  The matcher failed on the initial attempt; but passed when "
+            "rerun to generate the explanation.";
+    }
+    ss << "\n  Actual: " << listener.str();
+    return AssertionFailure() << ss.str();
+  }
+
+ private:
+  const M matcher_;
+};
+
+// A helper function for converting a matcher to a predicate-formatter
+// without the user needing to explicitly write the type.  This is
+// used for implementing ASSERT_THAT() and EXPECT_THAT().
+// Implementation detail: 'matcher' is received by-value to force decaying.
+template <typename M>
+inline PredicateFormatterFromMatcher<M>
+MakePredicateFormatterFromMatcher(M matcher) {
+  return PredicateFormatterFromMatcher<M>(std::move(matcher));
+}
+
+// Implements the polymorphic IsNan() matcher, which matches any floating type
+// value that is Nan.
+class IsNanMatcher {
+ public:
+  template <typename FloatType>
+  bool MatchAndExplain(const FloatType& f,
+                       MatchResultListener* /* listener */) const {
+    return (::std::isnan)(f);
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << "is NaN"; }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "isn't NaN";
+  }
+};
+
+// Implements the polymorphic floating point equality matcher, which matches
+// two float values using ULP-based approximation or, optionally, a
+// user-specified epsilon.  The template is meant to be instantiated with
+// FloatType being either float or double.
+template <typename FloatType>
+class FloatingEqMatcher {
+ public:
+  // Constructor for FloatingEqMatcher.
+  // The matcher's input will be compared with expected.  The matcher treats two
+  // NANs as equal if nan_eq_nan is true.  Otherwise, under IEEE standards,
+  // equality comparisons between NANs will always return false.  We specify a
+  // negative max_abs_error_ term to indicate that ULP-based approximation will
+  // be used for comparison.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan) :
+    expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {
+  }
+
+  // Constructor that supports a user-specified max_abs_error that will be used
+  // for comparison instead of ULP-based approximation.  The max absolute
+  // should be non-negative.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan,
+                    FloatType max_abs_error)
+      : expected_(expected),
+        nan_eq_nan_(nan_eq_nan),
+        max_abs_error_(max_abs_error) {
+    GTEST_CHECK_(max_abs_error >= 0)
+        << ", where max_abs_error is" << max_abs_error;
+  }
+
+  // Implements floating point equality matcher as a Matcher<T>.
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+   public:
+    Impl(FloatType expected, bool nan_eq_nan, FloatType max_abs_error)
+        : expected_(expected),
+          nan_eq_nan_(nan_eq_nan),
+          max_abs_error_(max_abs_error) {}
+
+    bool MatchAndExplain(T value,
+                         MatchResultListener* listener) const override {
+      const FloatingPoint<FloatType> actual(value), expected(expected_);
+
+      // Compares NaNs first, if nan_eq_nan_ is true.
+      if (actual.is_nan() || expected.is_nan()) {
+        if (actual.is_nan() && expected.is_nan()) {
+          return nan_eq_nan_;
+        }
+        // One is nan; the other is not nan.
+        return false;
+      }
+      if (HasMaxAbsError()) {
+        // We perform an equality check so that inf will match inf, regardless
+        // of error bounds.  If the result of value - expected_ would result in
+        // overflow or if either value is inf, the default result is infinity,
+        // which should only match if max_abs_error_ is also infinity.
+        if (value == expected_) {
+          return true;
+        }
+
+        const FloatType diff = value - expected_;
+        if (::std::fabs(diff) <= max_abs_error_) {
+          return true;
+        }
+
+        if (listener->IsInterested()) {
+          *listener << "which is " << diff << " from " << expected_;
+        }
+        return false;
+      } else {
+        return actual.AlmostEquals(expected);
+      }
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+      // os->precision() returns the previously set precision, which we
+      // store to restore the ostream to its original configuration
+      // after outputting.
+      const ::std::streamsize old_precision = os->precision(
+          ::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "is NaN";
+        } else {
+          *os << "never matches";
+        }
+      } else {
+        *os << "is approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error <= " << max_abs_error_ << ")";
+        }
+      }
+      os->precision(old_precision);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      // As before, get original precision.
+      const ::std::streamsize old_precision = os->precision(
+          ::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "isn't NaN";
+        } else {
+          *os << "is anything";
+        }
+      } else {
+        *os << "isn't approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error > " << max_abs_error_ << ")";
+        }
+      }
+      // Restore original precision.
+      os->precision(old_precision);
+    }
+
+   private:
+    bool HasMaxAbsError() const {
+      return max_abs_error_ >= 0;
+    }
+
+    const FloatType expected_;
+    const bool nan_eq_nan_;
+    // max_abs_error will be used for value comparison when >= 0.
+    const FloatType max_abs_error_;
+  };
+
+  // The following 3 type conversion operators allow FloatEq(expected) and
+  // NanSensitiveFloatEq(expected) to be used as a Matcher<float>, a
+  // Matcher<const float&>, or a Matcher<float&>, but nothing else.
+  operator Matcher<FloatType>() const {
+    return MakeMatcher(
+        new Impl<FloatType>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+  operator Matcher<const FloatType&>() const {
+    return MakeMatcher(
+        new Impl<const FloatType&>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+  operator Matcher<FloatType&>() const {
+    return MakeMatcher(
+        new Impl<FloatType&>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+ private:
+  const FloatType expected_;
+  const bool nan_eq_nan_;
+  // max_abs_error will be used for value comparison when >= 0.
+  const FloatType max_abs_error_;
+};
+
+// A 2-tuple ("binary") wrapper around FloatingEqMatcher:
+// FloatingEq2Matcher() matches (x, y) by matching FloatingEqMatcher(x, false)
+// against y, and FloatingEq2Matcher(e) matches FloatingEqMatcher(x, false, e)
+// against y. The former implements "Eq", the latter "Near". At present, there
+// is no version that compares NaNs as equal.
+template <typename FloatType>
+class FloatingEq2Matcher {
+ public:
+  FloatingEq2Matcher() { Init(-1, false); }
+
+  explicit FloatingEq2Matcher(bool nan_eq_nan) { Init(-1, nan_eq_nan); }
+
+  explicit FloatingEq2Matcher(FloatType max_abs_error) {
+    Init(max_abs_error, false);
+  }
+
+  FloatingEq2Matcher(FloatType max_abs_error, bool nan_eq_nan) {
+    Init(max_abs_error, nan_eq_nan);
+  }
+
+  template <typename T1, typename T2>
+  operator Matcher<::std::tuple<T1, T2>>() const {
+    return MakeMatcher(
+        new Impl<::std::tuple<T1, T2>>(max_abs_error_, nan_eq_nan_));
+  }
+  template <typename T1, typename T2>
+  operator Matcher<const ::std::tuple<T1, T2>&>() const {
+    return MakeMatcher(
+        new Impl<const ::std::tuple<T1, T2>&>(max_abs_error_, nan_eq_nan_));
+  }
+
+ private:
+  static ::std::ostream& GetDesc(::std::ostream& os) {  // NOLINT
+    return os << "an almost-equal pair";
+  }
+
+  template <typename Tuple>
+  class Impl : public MatcherInterface<Tuple> {
+   public:
+    Impl(FloatType max_abs_error, bool nan_eq_nan) :
+        max_abs_error_(max_abs_error),
+        nan_eq_nan_(nan_eq_nan) {}
+
+    bool MatchAndExplain(Tuple args,
+                         MatchResultListener* listener) const override {
+      if (max_abs_error_ == -1) {
+        FloatingEqMatcher<FloatType> fm(::std::get<0>(args), nan_eq_nan_);
+        return static_cast<Matcher<FloatType>>(fm).MatchAndExplain(
+            ::std::get<1>(args), listener);
+      } else {
+        FloatingEqMatcher<FloatType> fm(::std::get<0>(args), nan_eq_nan_,
+                                        max_abs_error_);
+        return static_cast<Matcher<FloatType>>(fm).MatchAndExplain(
+            ::std::get<1>(args), listener);
+      }
+    }
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "are " << GetDesc;
+    }
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "aren't " << GetDesc;
+    }
+
+   private:
+    FloatType max_abs_error_;
+    const bool nan_eq_nan_;
+  };
+
+  void Init(FloatType max_abs_error_val, bool nan_eq_nan_val) {
+    max_abs_error_ = max_abs_error_val;
+    nan_eq_nan_ = nan_eq_nan_val;
+  }
+  FloatType max_abs_error_;
+  bool nan_eq_nan_;
+};
+
+// Implements the Pointee(m) matcher for matching a pointer whose
+// pointee matches matcher m.  The pointer can be either raw or smart.
+template <typename InnerMatcher>
+class PointeeMatcher {
+ public:
+  explicit PointeeMatcher(const InnerMatcher& matcher) : matcher_(matcher) {}
+
+  // This type conversion operator template allows Pointee(m) to be
+  // used as a matcher for any pointer type whose pointee type is
+  // compatible with the inner matcher, where type Pointer can be
+  // either a raw pointer or a smart pointer.
+  //
+  // The reason we do this instead of relying on
+  // MakePolymorphicMatcher() is that the latter is not flexible
+  // enough for implementing the DescribeTo() method of Pointee().
+  template <typename Pointer>
+  operator Matcher<Pointer>() const {
+    return Matcher<Pointer>(new Impl<const Pointer&>(matcher_));
+  }
+
+ private:
+  // The monomorphic implementation that works for a particular pointer type.
+  template <typename Pointer>
+  class Impl : public MatcherInterface<Pointer> {
+   public:
+    using Pointee =
+        typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
+            Pointer)>::element_type;
+
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<const Pointee&>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "points to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "does not point to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    bool MatchAndExplain(Pointer pointer,
+                         MatchResultListener* listener) const override {
+      if (GetRawPointer(pointer) == nullptr) return false;
+
+      *listener << "which points to ";
+      return MatchPrintAndExplain(*pointer, matcher_, listener);
+    }
+
+   private:
+    const Matcher<const Pointee&> matcher_;
+  };
+
+  const InnerMatcher matcher_;
+};
+
+// Implements the Pointer(m) matcher
+// Implements the Pointer(m) matcher for matching a pointer that matches matcher
+// m.  The pointer can be either raw or smart, and will match `m` against the
+// raw pointer.
+template <typename InnerMatcher>
+class PointerMatcher {
+ public:
+  explicit PointerMatcher(const InnerMatcher& matcher) : matcher_(matcher) {}
+
+  // This type conversion operator template allows Pointer(m) to be
+  // used as a matcher for any pointer type whose pointer type is
+  // compatible with the inner matcher, where type PointerType can be
+  // either a raw pointer or a smart pointer.
+  //
+  // The reason we do this instead of relying on
+  // MakePolymorphicMatcher() is that the latter is not flexible
+  // enough for implementing the DescribeTo() method of Pointer().
+  template <typename PointerType>
+  operator Matcher<PointerType>() const {  // NOLINT
+    return Matcher<PointerType>(new Impl<const PointerType&>(matcher_));
+  }
+
+ private:
+  // The monomorphic implementation that works for a particular pointer type.
+  template <typename PointerType>
+  class Impl : public MatcherInterface<PointerType> {
+   public:
+    using Pointer =
+        const typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
+            PointerType)>::element_type*;
+
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<Pointer>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "is a pointer that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "is not a pointer that ";
+      matcher_.DescribeTo(os);
+    }
+
+    bool MatchAndExplain(PointerType pointer,
+                         MatchResultListener* listener) const override {
+      *listener << "which is a pointer that ";
+      Pointer p = GetRawPointer(pointer);
+      return MatchPrintAndExplain(p, matcher_, listener);
+    }
+
+   private:
+    Matcher<Pointer> matcher_;
+  };
+
+  const InnerMatcher matcher_;
+};
+
+#if GTEST_HAS_RTTI
+// Implements the WhenDynamicCastTo<T>(m) matcher that matches a pointer or
+// reference that matches inner_matcher when dynamic_cast<T> is applied.
+// The result of dynamic_cast<To> is forwarded to the inner matcher.
+// If To is a pointer and the cast fails, the inner matcher will receive NULL.
+// If To is a reference and the cast fails, this matcher returns false
+// immediately.
+template <typename To>
+class WhenDynamicCastToMatcherBase {
+ public:
+  explicit WhenDynamicCastToMatcherBase(const Matcher<To>& matcher)
+      : matcher_(matcher) {}
+
+  void DescribeTo(::std::ostream* os) const {
+    GetCastTypeDescription(os);
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    GetCastTypeDescription(os);
+    matcher_.DescribeNegationTo(os);
+  }
+
+ protected:
+  const Matcher<To> matcher_;
+
+  static std::string GetToName() {
+    return GetTypeName<To>();
+  }
+
+ private:
+  static void GetCastTypeDescription(::std::ostream* os) {
+    *os << "when dynamic_cast to " << GetToName() << ", ";
+  }
+};
+
+// Primary template.
+// To is a pointer. Cast and forward the result.
+template <typename To>
+class WhenDynamicCastToMatcher : public WhenDynamicCastToMatcherBase<To> {
+ public:
+  explicit WhenDynamicCastToMatcher(const Matcher<To>& matcher)
+      : WhenDynamicCastToMatcherBase<To>(matcher) {}
+
+  template <typename From>
+  bool MatchAndExplain(From from, MatchResultListener* listener) const {
+    To to = dynamic_cast<To>(from);
+    return MatchPrintAndExplain(to, this->matcher_, listener);
+  }
+};
+
+// Specialize for references.
+// In this case we return false if the dynamic_cast fails.
+template <typename To>
+class WhenDynamicCastToMatcher<To&> : public WhenDynamicCastToMatcherBase<To&> {
+ public:
+  explicit WhenDynamicCastToMatcher(const Matcher<To&>& matcher)
+      : WhenDynamicCastToMatcherBase<To&>(matcher) {}
+
+  template <typename From>
+  bool MatchAndExplain(From& from, MatchResultListener* listener) const {
+    // We don't want an std::bad_cast here, so do the cast with pointers.
+    To* to = dynamic_cast<To*>(&from);
+    if (to == nullptr) {
+      *listener << "which cannot be dynamic_cast to " << this->GetToName();
+      return false;
+    }
+    return MatchPrintAndExplain(*to, this->matcher_, listener);
+  }
+};
+#endif  // GTEST_HAS_RTTI
+
+// Implements the Field() matcher for matching a field (i.e. member
+// variable) of an object.
+template <typename Class, typename FieldType>
+class FieldMatcher {
+ public:
+  FieldMatcher(FieldType Class::*field,
+               const Matcher<const FieldType&>& matcher)
+      : field_(field), matcher_(matcher), whose_field_("whose given field ") {}
+
+  FieldMatcher(const std::string& field_name, FieldType Class::*field,
+               const Matcher<const FieldType&>& matcher)
+      : field_(field),
+        matcher_(matcher),
+        whose_field_("whose field `" + field_name + "` ") {}
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is an object " << whose_field_;
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is an object " << whose_field_;
+    matcher_.DescribeNegationTo(os);
+  }
+
+  template <typename T>
+  bool MatchAndExplain(const T& value, MatchResultListener* listener) const {
+    // FIXME: The dispatch on std::is_pointer was introduced as a workaround for
+    // a compiler bug, and can now be removed.
+    return MatchAndExplainImpl(
+        typename std::is_pointer<typename std::remove_const<T>::type>::type(),
+        value, listener);
+  }
+
+ private:
+  bool MatchAndExplainImpl(std::false_type /* is_not_pointer */,
+                           const Class& obj,
+                           MatchResultListener* listener) const {
+    *listener << whose_field_ << "is ";
+    return MatchPrintAndExplain(obj.*field_, matcher_, listener);
+  }
+
+  bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p,
+                           MatchResultListener* listener) const {
+    if (p == nullptr) return false;
+
+    *listener << "which points to an object ";
+    // Since *p has a field, it must be a class/struct/union type and
+    // thus cannot be a pointer.  Therefore we pass false_type() as
+    // the first argument.
+    return MatchAndExplainImpl(std::false_type(), *p, listener);
+  }
+
+  const FieldType Class::*field_;
+  const Matcher<const FieldType&> matcher_;
+
+  // Contains either "whose given field " if the name of the field is unknown
+  // or "whose field `name_of_field` " if the name is known.
+  const std::string whose_field_;
+};
+
+// Implements the Property() matcher for matching a property
+// (i.e. return value of a getter method) of an object.
+//
+// Property is a const-qualified member function of Class returning
+// PropertyType.
+template <typename Class, typename PropertyType, typename Property>
+class PropertyMatcher {
+ public:
+  typedef const PropertyType& RefToConstProperty;
+
+  PropertyMatcher(Property property, const Matcher<RefToConstProperty>& matcher)
+      : property_(property),
+        matcher_(matcher),
+        whose_property_("whose given property ") {}
+
+  PropertyMatcher(const std::string& property_name, Property property,
+                  const Matcher<RefToConstProperty>& matcher)
+      : property_(property),
+        matcher_(matcher),
+        whose_property_("whose property `" + property_name + "` ") {}
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is an object " << whose_property_;
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is an object " << whose_property_;
+    matcher_.DescribeNegationTo(os);
+  }
+
+  template <typename T>
+  bool MatchAndExplain(const T&value, MatchResultListener* listener) const {
+    return MatchAndExplainImpl(
+        typename std::is_pointer<typename std::remove_const<T>::type>::type(),
+        value, listener);
+  }
+
+ private:
+  bool MatchAndExplainImpl(std::false_type /* is_not_pointer */,
+                           const Class& obj,
+                           MatchResultListener* listener) const {
+    *listener << whose_property_ << "is ";
+    // Cannot pass the return value (for example, int) to MatchPrintAndExplain,
+    // which takes a non-const reference as argument.
+    RefToConstProperty result = (obj.*property_)();
+    return MatchPrintAndExplain(result, matcher_, listener);
+  }
+
+  bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p,
+                           MatchResultListener* listener) const {
+    if (p == nullptr) return false;
+
+    *listener << "which points to an object ";
+    // Since *p has a property method, it must be a class/struct/union
+    // type and thus cannot be a pointer.  Therefore we pass
+    // false_type() as the first argument.
+    return MatchAndExplainImpl(std::false_type(), *p, listener);
+  }
+
+  Property property_;
+  const Matcher<RefToConstProperty> matcher_;
+
+  // Contains either "whose given property " if the name of the property is
+  // unknown or "whose property `name_of_property` " if the name is known.
+  const std::string whose_property_;
+};
+
+// Type traits specifying various features of different functors for ResultOf.
+// The default template specifies features for functor objects.
+template <typename Functor>
+struct CallableTraits {
+  typedef Functor StorageType;
+
+  static void CheckIsValid(Functor /* functor */) {}
+
+  template <typename T>
+  static auto Invoke(Functor f, const T& arg) -> decltype(f(arg)) {
+    return f(arg);
+  }
+};
+
+// Specialization for function pointers.
+template <typename ArgType, typename ResType>
+struct CallableTraits<ResType(*)(ArgType)> {
+  typedef ResType ResultType;
+  typedef ResType(*StorageType)(ArgType);
+
+  static void CheckIsValid(ResType(*f)(ArgType)) {
+    GTEST_CHECK_(f != nullptr)
+        << "NULL function pointer is passed into ResultOf().";
+  }
+  template <typename T>
+  static ResType Invoke(ResType(*f)(ArgType), T arg) {
+    return (*f)(arg);
+  }
+};
+
+// Implements the ResultOf() matcher for matching a return value of a
+// unary function of an object.
+template <typename Callable, typename InnerMatcher>
+class ResultOfMatcher {
+ public:
+  ResultOfMatcher(Callable callable, InnerMatcher matcher)
+      : callable_(std::move(callable)), matcher_(std::move(matcher)) {
+    CallableTraits<Callable>::CheckIsValid(callable_);
+  }
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new Impl<const T&>(callable_, matcher_));
+  }
+
+ private:
+  typedef typename CallableTraits<Callable>::StorageType CallableStorageType;
+
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+    using ResultType = decltype(CallableTraits<Callable>::template Invoke<T>(
+        std::declval<CallableStorageType>(), std::declval<T>()));
+
+   public:
+    template <typename M>
+    Impl(const CallableStorageType& callable, const M& matcher)
+        : callable_(callable), matcher_(MatcherCast<ResultType>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "is mapped by the given callable to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "is mapped by the given callable to a value that ";
+      matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(T obj, MatchResultListener* listener) const override {
+      *listener << "which is mapped by the given callable to ";
+      // Cannot pass the return value directly to MatchPrintAndExplain, which
+      // takes a non-const reference as argument.
+      // Also, specifying template argument explicitly is needed because T could
+      // be a non-const reference (e.g. Matcher<Uncopyable&>).
+      ResultType result =
+          CallableTraits<Callable>::template Invoke<T>(callable_, obj);
+      return MatchPrintAndExplain(result, matcher_, listener);
+    }
+
+   private:
+    // Functors often define operator() as non-const method even though
+    // they are actually stateless. But we need to use them even when
+    // 'this' is a const pointer. It's the user's responsibility not to
+    // use stateful callables with ResultOf(), which doesn't guarantee
+    // how many times the callable will be invoked.
+    mutable CallableStorageType callable_;
+    const Matcher<ResultType> matcher_;
+  };  // class Impl
+
+  const CallableStorageType callable_;
+  const InnerMatcher matcher_;
+};
+
+// Implements a matcher that checks the size of an STL-style container.
+template <typename SizeMatcher>
+class SizeIsMatcher {
+ public:
+  explicit SizeIsMatcher(const SizeMatcher& size_matcher)
+       : size_matcher_(size_matcher) {
+  }
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(new Impl<const Container&>(size_matcher_));
+  }
+
+  template <typename Container>
+  class Impl : public MatcherInterface<Container> {
+   public:
+    using SizeType = decltype(std::declval<Container>().size());
+    explicit Impl(const SizeMatcher& size_matcher)
+        : size_matcher_(MatcherCast<SizeType>(size_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "size ";
+      size_matcher_.DescribeTo(os);
+    }
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "size ";
+      size_matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(Container container,
+                         MatchResultListener* listener) const override {
+      SizeType size = container.size();
+      StringMatchResultListener size_listener;
+      const bool result = size_matcher_.MatchAndExplain(size, &size_listener);
+      *listener
+          << "whose size " << size << (result ? " matches" : " doesn't match");
+      PrintIfNotEmpty(size_listener.str(), listener->stream());
+      return result;
+    }
+
+   private:
+    const Matcher<SizeType> size_matcher_;
+  };
+
+ private:
+  const SizeMatcher size_matcher_;
+};
+
+// Implements a matcher that checks the begin()..end() distance of an STL-style
+// container.
+template <typename DistanceMatcher>
+class BeginEndDistanceIsMatcher {
+ public:
+  explicit BeginEndDistanceIsMatcher(const DistanceMatcher& distance_matcher)
+      : distance_matcher_(distance_matcher) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(new Impl<const Container&>(distance_matcher_));
+  }
+
+  template <typename Container>
+  class Impl : public MatcherInterface<Container> {
+   public:
+    typedef internal::StlContainerView<
+        GTEST_REMOVE_REFERENCE_AND_CONST_(Container)> ContainerView;
+    typedef typename std::iterator_traits<
+        typename ContainerView::type::const_iterator>::difference_type
+        DistanceType;
+    explicit Impl(const DistanceMatcher& distance_matcher)
+        : distance_matcher_(MatcherCast<DistanceType>(distance_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "distance between begin() and end() ";
+      distance_matcher_.DescribeTo(os);
+    }
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "distance between begin() and end() ";
+      distance_matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(Container container,
+                         MatchResultListener* listener) const override {
+      using std::begin;
+      using std::end;
+      DistanceType distance = std::distance(begin(container), end(container));
+      StringMatchResultListener distance_listener;
+      const bool result =
+          distance_matcher_.MatchAndExplain(distance, &distance_listener);
+      *listener << "whose distance between begin() and end() " << distance
+                << (result ? " matches" : " doesn't match");
+      PrintIfNotEmpty(distance_listener.str(), listener->stream());
+      return result;
+    }
+
+   private:
+    const Matcher<DistanceType> distance_matcher_;
+  };
+
+ private:
+  const DistanceMatcher distance_matcher_;
+};
+
+// Implements an equality matcher for any STL-style container whose elements
+// support ==. This matcher is like Eq(), but its failure explanations provide
+// more detailed information that is useful when the container is used as a set.
+// The failure message reports elements that are in one of the operands but not
+// the other. The failure messages do not report duplicate or out-of-order
+// elements in the containers (which don't properly matter to sets, but can
+// occur if the containers are vectors or lists, for example).
+//
+// Uses the container's const_iterator, value_type, operator ==,
+// begin(), and end().
+template <typename Container>
+class ContainerEqMatcher {
+ public:
+  typedef internal::StlContainerView<Container> View;
+  typedef typename View::type StlContainer;
+  typedef typename View::const_reference StlContainerReference;
+
+  static_assert(!std::is_const<Container>::value,
+                "Container type must not be const");
+  static_assert(!std::is_reference<Container>::value,
+                "Container type must not be a reference");
+
+  // We make a copy of expected in case the elements in it are modified
+  // after this matcher is created.
+  explicit ContainerEqMatcher(const Container& expected)
+      : expected_(View::Copy(expected)) {}
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "equals ";
+    UniversalPrint(expected_, os);
+  }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "does not equal ";
+    UniversalPrint(expected_, os);
+  }
+
+  template <typename LhsContainer>
+  bool MatchAndExplain(const LhsContainer& lhs,
+                       MatchResultListener* listener) const {
+    typedef internal::StlContainerView<
+        typename std::remove_const<LhsContainer>::type>
+        LhsView;
+    typedef typename LhsView::type LhsStlContainer;
+    StlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+    if (lhs_stl_container == expected_)
+      return true;
+
+    ::std::ostream* const os = listener->stream();
+    if (os != nullptr) {
+      // Something is different. Check for extra values first.
+      bool printed_header = false;
+      for (typename LhsStlContainer::const_iterator it =
+               lhs_stl_container.begin();
+           it != lhs_stl_container.end(); ++it) {
+        if (internal::ArrayAwareFind(expected_.begin(), expected_.end(), *it) ==
+            expected_.end()) {
+          if (printed_header) {
+            *os << ", ";
+          } else {
+            *os << "which has these unexpected elements: ";
+            printed_header = true;
+          }
+          UniversalPrint(*it, os);
+        }
+      }
+
+      // Now check for missing values.
+      bool printed_header2 = false;
+      for (typename StlContainer::const_iterator it = expected_.begin();
+           it != expected_.end(); ++it) {
+        if (internal::ArrayAwareFind(
+                lhs_stl_container.begin(), lhs_stl_container.end(), *it) ==
+            lhs_stl_container.end()) {
+          if (printed_header2) {
+            *os << ", ";
+          } else {
+            *os << (printed_header ? ",\nand" : "which")
+                << " doesn't have these expected elements: ";
+            printed_header2 = true;
+          }
+          UniversalPrint(*it, os);
+        }
+      }
+    }
+
+    return false;
+  }
+
+ private:
+  const StlContainer expected_;
+};
+
+// A comparator functor that uses the < operator to compare two values.
+struct LessComparator {
+  template <typename T, typename U>
+  bool operator()(const T& lhs, const U& rhs) const { return lhs < rhs; }
+};
+
+// Implements WhenSortedBy(comparator, container_matcher).
+template <typename Comparator, typename ContainerMatcher>
+class WhenSortedByMatcher {
+ public:
+  WhenSortedByMatcher(const Comparator& comparator,
+                      const ContainerMatcher& matcher)
+      : comparator_(comparator), matcher_(matcher) {}
+
+  template <typename LhsContainer>
+  operator Matcher<LhsContainer>() const {
+    return MakeMatcher(new Impl<LhsContainer>(comparator_, matcher_));
+  }
+
+  template <typename LhsContainer>
+  class Impl : public MatcherInterface<LhsContainer> {
+   public:
+    typedef internal::StlContainerView<
+         GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView;
+    typedef typename LhsView::type LhsStlContainer;
+    typedef typename LhsView::const_reference LhsStlContainerReference;
+    // Transforms std::pair<const Key, Value> into std::pair<Key, Value>
+    // so that we can match associative containers.
+    typedef typename RemoveConstFromKey<
+        typename LhsStlContainer::value_type>::type LhsValue;
+
+    Impl(const Comparator& comparator, const ContainerMatcher& matcher)
+        : comparator_(comparator), matcher_(matcher) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "(when sorted) ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "(when sorted) ";
+      matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(LhsContainer lhs,
+                         MatchResultListener* listener) const override {
+      LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+      ::std::vector<LhsValue> sorted_container(lhs_stl_container.begin(),
+                                               lhs_stl_container.end());
+      ::std::sort(
+           sorted_container.begin(), sorted_container.end(), comparator_);
+
+      if (!listener->IsInterested()) {
+        // If the listener is not interested, we do not need to
+        // construct the inner explanation.
+        return matcher_.Matches(sorted_container);
+      }
+
+      *listener << "which is ";
+      UniversalPrint(sorted_container, listener->stream());
+      *listener << " when sorted";
+
+      StringMatchResultListener inner_listener;
+      const bool match = matcher_.MatchAndExplain(sorted_container,
+                                                  &inner_listener);
+      PrintIfNotEmpty(inner_listener.str(), listener->stream());
+      return match;
+    }
+
+   private:
+    const Comparator comparator_;
+    const Matcher<const ::std::vector<LhsValue>&> matcher_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl);
+  };
+
+ private:
+  const Comparator comparator_;
+  const ContainerMatcher matcher_;
+};
+
+// Implements Pointwise(tuple_matcher, rhs_container).  tuple_matcher
+// must be able to be safely cast to Matcher<std::tuple<const T1&, const
+// T2&> >, where T1 and T2 are the types of elements in the LHS
+// container and the RHS container respectively.
+template <typename TupleMatcher, typename RhsContainer>
+class PointwiseMatcher {
+  GTEST_COMPILE_ASSERT_(
+      !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(RhsContainer)>::value,
+      use_UnorderedPointwise_with_hash_tables);
+
+ public:
+  typedef internal::StlContainerView<RhsContainer> RhsView;
+  typedef typename RhsView::type RhsStlContainer;
+  typedef typename RhsStlContainer::value_type RhsValue;
+
+  static_assert(!std::is_const<RhsContainer>::value,
+                "RhsContainer type must not be const");
+  static_assert(!std::is_reference<RhsContainer>::value,
+                "RhsContainer type must not be a reference");
+
+  // Like ContainerEq, we make a copy of rhs in case the elements in
+  // it are modified after this matcher is created.
+  PointwiseMatcher(const TupleMatcher& tuple_matcher, const RhsContainer& rhs)
+      : tuple_matcher_(tuple_matcher), rhs_(RhsView::Copy(rhs)) {}
+
+  template <typename LhsContainer>
+  operator Matcher<LhsContainer>() const {
+    GTEST_COMPILE_ASSERT_(
+        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)>::value,
+        use_UnorderedPointwise_with_hash_tables);
+
+    return Matcher<LhsContainer>(
+        new Impl<const LhsContainer&>(tuple_matcher_, rhs_));
+  }
+
+  template <typename LhsContainer>
+  class Impl : public MatcherInterface<LhsContainer> {
+   public:
+    typedef internal::StlContainerView<
+         GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView;
+    typedef typename LhsView::type LhsStlContainer;
+    typedef typename LhsView::const_reference LhsStlContainerReference;
+    typedef typename LhsStlContainer::value_type LhsValue;
+    // We pass the LHS value and the RHS value to the inner matcher by
+    // reference, as they may be expensive to copy.  We must use tuple
+    // instead of pair here, as a pair cannot hold references (C++ 98,
+    // 20.2.2 [lib.pairs]).
+    typedef ::std::tuple<const LhsValue&, const RhsValue&> InnerMatcherArg;
+
+    Impl(const TupleMatcher& tuple_matcher, const RhsStlContainer& rhs)
+        // mono_tuple_matcher_ holds a monomorphic version of the tuple matcher.
+        : mono_tuple_matcher_(SafeMatcherCast<InnerMatcherArg>(tuple_matcher)),
+          rhs_(rhs) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "contains " << rhs_.size()
+          << " values, where each value and its corresponding value in ";
+      UniversalPrinter<RhsStlContainer>::Print(rhs_, os);
+      *os << " ";
+      mono_tuple_matcher_.DescribeTo(os);
+    }
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "doesn't contain exactly " << rhs_.size()
+          << " values, or contains a value x at some index i"
+          << " where x and the i-th value of ";
+      UniversalPrint(rhs_, os);
+      *os << " ";
+      mono_tuple_matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(LhsContainer lhs,
+                         MatchResultListener* listener) const override {
+      LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+      const size_t actual_size = lhs_stl_container.size();
+      if (actual_size != rhs_.size()) {
+        *listener << "which contains " << actual_size << " values";
+        return false;
+      }
+
+      typename LhsStlContainer::const_iterator left = lhs_stl_container.begin();
+      typename RhsStlContainer::const_iterator right = rhs_.begin();
+      for (size_t i = 0; i != actual_size; ++i, ++left, ++right) {
+        if (listener->IsInterested()) {
+          StringMatchResultListener inner_listener;
+          // Create InnerMatcherArg as a temporarily object to avoid it outlives
+          // *left and *right. Dereference or the conversion to `const T&` may
+          // return temp objects, e.g for vector<bool>.
+          if (!mono_tuple_matcher_.MatchAndExplain(
+                  InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left),
+                                  ImplicitCast_<const RhsValue&>(*right)),
+                  &inner_listener)) {
+            *listener << "where the value pair (";
+            UniversalPrint(*left, listener->stream());
+            *listener << ", ";
+            UniversalPrint(*right, listener->stream());
+            *listener << ") at index #" << i << " don't match";
+            PrintIfNotEmpty(inner_listener.str(), listener->stream());
+            return false;
+          }
+        } else {
+          if (!mono_tuple_matcher_.Matches(
+                  InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left),
+                                  ImplicitCast_<const RhsValue&>(*right))))
+            return false;
+        }
+      }
+
+      return true;
+    }
+
+   private:
+    const Matcher<InnerMatcherArg> mono_tuple_matcher_;
+    const RhsStlContainer rhs_;
+  };
+
+ private:
+  const TupleMatcher tuple_matcher_;
+  const RhsStlContainer rhs_;
+};
+
+// Holds the logic common to ContainsMatcherImpl and EachMatcherImpl.
+template <typename Container>
+class QuantifierMatcherImpl : public MatcherInterface<Container> {
+ public:
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+  typedef StlContainerView<RawContainer> View;
+  typedef typename View::type StlContainer;
+  typedef typename View::const_reference StlContainerReference;
+  typedef typename StlContainer::value_type Element;
+
+  template <typename InnerMatcher>
+  explicit QuantifierMatcherImpl(InnerMatcher inner_matcher)
+      : inner_matcher_(
+           testing::SafeMatcherCast<const Element&>(inner_matcher)) {}
+
+  // Checks whether:
+  // * All elements in the container match, if all_elements_should_match.
+  // * Any element in the container matches, if !all_elements_should_match.
+  bool MatchAndExplainImpl(bool all_elements_should_match,
+                           Container container,
+                           MatchResultListener* listener) const {
+    StlContainerReference stl_container = View::ConstReference(container);
+    size_t i = 0;
+    for (typename StlContainer::const_iterator it = stl_container.begin();
+         it != stl_container.end(); ++it, ++i) {
+      StringMatchResultListener inner_listener;
+      const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener);
+
+      if (matches != all_elements_should_match) {
+        *listener << "whose element #" << i
+                  << (matches ? " matches" : " doesn't match");
+        PrintIfNotEmpty(inner_listener.str(), listener->stream());
+        return !all_elements_should_match;
+      }
+    }
+    return all_elements_should_match;
+  }
+
+ protected:
+  const Matcher<const Element&> inner_matcher_;
+};
+
+// Implements Contains(element_matcher) for the given argument type Container.
+// Symmetric to EachMatcherImpl.
+template <typename Container>
+class ContainsMatcherImpl : public QuantifierMatcherImpl<Container> {
+ public:
+  template <typename InnerMatcher>
+  explicit ContainsMatcherImpl(InnerMatcher inner_matcher)
+      : QuantifierMatcherImpl<Container>(inner_matcher) {}
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "contains at least one element that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "doesn't contain any element that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    return this->MatchAndExplainImpl(false, container, listener);
+  }
+};
+
+// Implements Each(element_matcher) for the given argument type Container.
+// Symmetric to ContainsMatcherImpl.
+template <typename Container>
+class EachMatcherImpl : public QuantifierMatcherImpl<Container> {
+ public:
+  template <typename InnerMatcher>
+  explicit EachMatcherImpl(InnerMatcher inner_matcher)
+      : QuantifierMatcherImpl<Container>(inner_matcher) {}
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "only contains elements that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "contains some element that ";
+    this->inner_matcher_.DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    return this->MatchAndExplainImpl(true, container, listener);
+  }
+};
+
+// Implements polymorphic Contains(element_matcher).
+template <typename M>
+class ContainsMatcher {
+ public:
+  explicit ContainsMatcher(M m) : inner_matcher_(m) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(
+        new ContainsMatcherImpl<const Container&>(inner_matcher_));
+  }
+
+ private:
+  const M inner_matcher_;
+};
+
+// Implements polymorphic Each(element_matcher).
+template <typename M>
+class EachMatcher {
+ public:
+  explicit EachMatcher(M m) : inner_matcher_(m) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(
+        new EachMatcherImpl<const Container&>(inner_matcher_));
+  }
+
+ private:
+  const M inner_matcher_;
+};
+
+struct Rank1 {};
+struct Rank0 : Rank1 {};
+
+namespace pair_getters {
+using std::get;
+template <typename T>
+auto First(T& x, Rank1) -> decltype(get<0>(x)) {  // NOLINT
+  return get<0>(x);
+}
+template <typename T>
+auto First(T& x, Rank0) -> decltype((x.first)) {  // NOLINT
+  return x.first;
+}
+
+template <typename T>
+auto Second(T& x, Rank1) -> decltype(get<1>(x)) {  // NOLINT
+  return get<1>(x);
+}
+template <typename T>
+auto Second(T& x, Rank0) -> decltype((x.second)) {  // NOLINT
+  return x.second;
+}
+}  // namespace pair_getters
+
+// Implements Key(inner_matcher) for the given argument pair type.
+// Key(inner_matcher) matches an std::pair whose 'first' field matches
+// inner_matcher.  For example, Contains(Key(Ge(5))) can be used to match an
+// std::map that contains at least one element whose key is >= 5.
+template <typename PairType>
+class KeyMatcherImpl : public MatcherInterface<PairType> {
+ public:
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType;
+  typedef typename RawPairType::first_type KeyType;
+
+  template <typename InnerMatcher>
+  explicit KeyMatcherImpl(InnerMatcher inner_matcher)
+      : inner_matcher_(
+          testing::SafeMatcherCast<const KeyType&>(inner_matcher)) {
+  }
+
+  // Returns true if and only if 'key_value.first' (the key) matches the inner
+  // matcher.
+  bool MatchAndExplain(PairType key_value,
+                       MatchResultListener* listener) const override {
+    StringMatchResultListener inner_listener;
+    const bool match = inner_matcher_.MatchAndExplain(
+        pair_getters::First(key_value, Rank0()), &inner_listener);
+    const std::string explanation = inner_listener.str();
+    if (explanation != "") {
+      *listener << "whose first field is a value " << explanation;
+    }
+    return match;
+  }
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "has a key that ";
+    inner_matcher_.DescribeTo(os);
+  }
+
+  // Describes what the negation of this matcher does.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "doesn't have a key that ";
+    inner_matcher_.DescribeTo(os);
+  }
+
+ private:
+  const Matcher<const KeyType&> inner_matcher_;
+};
+
+// Implements polymorphic Key(matcher_for_key).
+template <typename M>
+class KeyMatcher {
+ public:
+  explicit KeyMatcher(M m) : matcher_for_key_(m) {}
+
+  template <typename PairType>
+  operator Matcher<PairType>() const {
+    return Matcher<PairType>(
+        new KeyMatcherImpl<const PairType&>(matcher_for_key_));
+  }
+
+ private:
+  const M matcher_for_key_;
+};
+
+// Implements polymorphic Address(matcher_for_address).
+template <typename InnerMatcher>
+class AddressMatcher {
+ public:
+  explicit AddressMatcher(InnerMatcher m) : matcher_(m) {}
+
+  template <typename Type>
+  operator Matcher<Type>() const {  // NOLINT
+    return Matcher<Type>(new Impl<const Type&>(matcher_));
+  }
+
+ private:
+  // The monomorphic implementation that works for a particular object type.
+  template <typename Type>
+  class Impl : public MatcherInterface<Type> {
+   public:
+    using Address = const GTEST_REMOVE_REFERENCE_AND_CONST_(Type) *;
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<Address>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "has address that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "does not have address that ";
+      matcher_.DescribeTo(os);
+    }
+
+    bool MatchAndExplain(Type object,
+                         MatchResultListener* listener) const override {
+      *listener << "which has address ";
+      Address address = std::addressof(object);
+      return MatchPrintAndExplain(address, matcher_, listener);
+    }
+
+   private:
+    const Matcher<Address> matcher_;
+  };
+  const InnerMatcher matcher_;
+};
+
+// Implements Pair(first_matcher, second_matcher) for the given argument pair
+// type with its two matchers. See Pair() function below.
+template <typename PairType>
+class PairMatcherImpl : public MatcherInterface<PairType> {
+ public:
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType;
+  typedef typename RawPairType::first_type FirstType;
+  typedef typename RawPairType::second_type SecondType;
+
+  template <typename FirstMatcher, typename SecondMatcher>
+  PairMatcherImpl(FirstMatcher first_matcher, SecondMatcher second_matcher)
+      : first_matcher_(
+            testing::SafeMatcherCast<const FirstType&>(first_matcher)),
+        second_matcher_(
+            testing::SafeMatcherCast<const SecondType&>(second_matcher)) {
+  }
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "has a first field that ";
+    first_matcher_.DescribeTo(os);
+    *os << ", and has a second field that ";
+    second_matcher_.DescribeTo(os);
+  }
+
+  // Describes what the negation of this matcher does.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "has a first field that ";
+    first_matcher_.DescribeNegationTo(os);
+    *os << ", or has a second field that ";
+    second_matcher_.DescribeNegationTo(os);
+  }
+
+  // Returns true if and only if 'a_pair.first' matches first_matcher and
+  // 'a_pair.second' matches second_matcher.
+  bool MatchAndExplain(PairType a_pair,
+                       MatchResultListener* listener) const override {
+    if (!listener->IsInterested()) {
+      // If the listener is not interested, we don't need to construct the
+      // explanation.
+      return first_matcher_.Matches(pair_getters::First(a_pair, Rank0())) &&
+             second_matcher_.Matches(pair_getters::Second(a_pair, Rank0()));
+    }
+    StringMatchResultListener first_inner_listener;
+    if (!first_matcher_.MatchAndExplain(pair_getters::First(a_pair, Rank0()),
+                                        &first_inner_listener)) {
+      *listener << "whose first field does not match";
+      PrintIfNotEmpty(first_inner_listener.str(), listener->stream());
+      return false;
+    }
+    StringMatchResultListener second_inner_listener;
+    if (!second_matcher_.MatchAndExplain(pair_getters::Second(a_pair, Rank0()),
+                                         &second_inner_listener)) {
+      *listener << "whose second field does not match";
+      PrintIfNotEmpty(second_inner_listener.str(), listener->stream());
+      return false;
+    }
+    ExplainSuccess(first_inner_listener.str(), second_inner_listener.str(),
+                   listener);
+    return true;
+  }
+
+ private:
+  void ExplainSuccess(const std::string& first_explanation,
+                      const std::string& second_explanation,
+                      MatchResultListener* listener) const {
+    *listener << "whose both fields match";
+    if (first_explanation != "") {
+      *listener << ", where the first field is a value " << first_explanation;
+    }
+    if (second_explanation != "") {
+      *listener << ", ";
+      if (first_explanation != "") {
+        *listener << "and ";
+      } else {
+        *listener << "where ";
+      }
+      *listener << "the second field is a value " << second_explanation;
+    }
+  }
+
+  const Matcher<const FirstType&> first_matcher_;
+  const Matcher<const SecondType&> second_matcher_;
+};
+
+// Implements polymorphic Pair(first_matcher, second_matcher).
+template <typename FirstMatcher, typename SecondMatcher>
+class PairMatcher {
+ public:
+  PairMatcher(FirstMatcher first_matcher, SecondMatcher second_matcher)
+      : first_matcher_(first_matcher), second_matcher_(second_matcher) {}
+
+  template <typename PairType>
+  operator Matcher<PairType> () const {
+    return Matcher<PairType>(
+        new PairMatcherImpl<const PairType&>(first_matcher_, second_matcher_));
+  }
+
+ private:
+  const FirstMatcher first_matcher_;
+  const SecondMatcher second_matcher_;
+};
+
+template <typename T, size_t... I>
+auto UnpackStructImpl(const T& t, IndexSequence<I...>, int)
+    -> decltype(std::tie(get<I>(t)...)) {
+  static_assert(std::tuple_size<T>::value == sizeof...(I),
+                "Number of arguments doesn't match the number of fields.");
+  return std::tie(get<I>(t)...);
+}
+
+#if defined(__cpp_structured_bindings) && __cpp_structured_bindings >= 201606
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<1>, char) {
+  const auto& [a] = t;
+  return std::tie(a);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<2>, char) {
+  const auto& [a, b] = t;
+  return std::tie(a, b);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<3>, char) {
+  const auto& [a, b, c] = t;
+  return std::tie(a, b, c);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<4>, char) {
+  const auto& [a, b, c, d] = t;
+  return std::tie(a, b, c, d);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<5>, char) {
+  const auto& [a, b, c, d, e] = t;
+  return std::tie(a, b, c, d, e);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<6>, char) {
+  const auto& [a, b, c, d, e, f] = t;
+  return std::tie(a, b, c, d, e, f);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<7>, char) {
+  const auto& [a, b, c, d, e, f, g] = t;
+  return std::tie(a, b, c, d, e, f, g);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<8>, char) {
+  const auto& [a, b, c, d, e, f, g, h] = t;
+  return std::tie(a, b, c, d, e, f, g, h);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<9>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<10>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<11>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<12>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k, l] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k, l);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<13>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<14>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<15>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<16>, char) {
+  const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p] = t;
+  return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+}
+#endif  // defined(__cpp_structured_bindings)
+
+template <size_t I, typename T>
+auto UnpackStruct(const T& t)
+    -> decltype((UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0)) {
+  return (UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0);
+}
+
+// Helper function to do comma folding in C++11.
+// The array ensures left-to-right order of evaluation.
+// Usage: VariadicExpand({expr...});
+template <typename T, size_t N>
+void VariadicExpand(const T (&)[N]) {}
+
+template <typename Struct, typename StructSize>
+class FieldsAreMatcherImpl;
+
+template <typename Struct, size_t... I>
+class FieldsAreMatcherImpl<Struct, IndexSequence<I...>>
+    : public MatcherInterface<Struct> {
+  using UnpackedType =
+      decltype(UnpackStruct<sizeof...(I)>(std::declval<const Struct&>()));
+  using MatchersType = std::tuple<
+      Matcher<const typename std::tuple_element<I, UnpackedType>::type&>...>;
+
+ public:
+  template <typename Inner>
+  explicit FieldsAreMatcherImpl(const Inner& matchers)
+      : matchers_(testing::SafeMatcherCast<
+                  const typename std::tuple_element<I, UnpackedType>::type&>(
+            std::get<I>(matchers))...) {}
+
+  void DescribeTo(::std::ostream* os) const override {
+    const char* separator = "";
+    VariadicExpand(
+        {(*os << separator << "has field #" << I << " that ",
+          std::get<I>(matchers_).DescribeTo(os), separator = ", and ")...});
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    const char* separator = "";
+    VariadicExpand({(*os << separator << "has field #" << I << " that ",
+                     std::get<I>(matchers_).DescribeNegationTo(os),
+                     separator = ", or ")...});
+  }
+
+  bool MatchAndExplain(Struct t, MatchResultListener* listener) const override {
+    return MatchInternal((UnpackStruct<sizeof...(I)>)(t), listener);
+  }
+
+ private:
+  bool MatchInternal(UnpackedType tuple, MatchResultListener* listener) const {
+    if (!listener->IsInterested()) {
+      // If the listener is not interested, we don't need to construct the
+      // explanation.
+      bool good = true;
+      VariadicExpand({good = good && std::get<I>(matchers_).Matches(
+                                         std::get<I>(tuple))...});
+      return good;
+    }
+
+    size_t failed_pos = ~size_t{};
+
+    std::vector<StringMatchResultListener> inner_listener(sizeof...(I));
+
+    VariadicExpand(
+        {failed_pos == ~size_t{} && !std::get<I>(matchers_).MatchAndExplain(
+                                        std::get<I>(tuple), &inner_listener[I])
+             ? failed_pos = I
+             : 0 ...});
+    if (failed_pos != ~size_t{}) {
+      *listener << "whose field #" << failed_pos << " does not match";
+      PrintIfNotEmpty(inner_listener[failed_pos].str(), listener->stream());
+      return false;
+    }
+
+    *listener << "whose all elements match";
+    const char* separator = ", where";
+    for (size_t index = 0; index < sizeof...(I); ++index) {
+      const std::string str = inner_listener[index].str();
+      if (!str.empty()) {
+        *listener << separator << " field #" << index << " is a value " << str;
+        separator = ", and";
+      }
+    }
+
+    return true;
+  }
+
+  MatchersType matchers_;
+};
+
+template <typename... Inner>
+class FieldsAreMatcher {
+ public:
+  explicit FieldsAreMatcher(Inner... inner) : matchers_(std::move(inner)...) {}
+
+  template <typename Struct>
+  operator Matcher<Struct>() const {  // NOLINT
+    return Matcher<Struct>(
+        new FieldsAreMatcherImpl<const Struct&, IndexSequenceFor<Inner...>>(
+            matchers_));
+  }
+
+ private:
+  std::tuple<Inner...> matchers_;
+};
+
+// Implements ElementsAre() and ElementsAreArray().
+template <typename Container>
+class ElementsAreMatcherImpl : public MatcherInterface<Container> {
+ public:
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+  typedef internal::StlContainerView<RawContainer> View;
+  typedef typename View::type StlContainer;
+  typedef typename View::const_reference StlContainerReference;
+  typedef typename StlContainer::value_type Element;
+
+  // Constructs the matcher from a sequence of element values or
+  // element matchers.
+  template <typename InputIter>
+  ElementsAreMatcherImpl(InputIter first, InputIter last) {
+    while (first != last) {
+      matchers_.push_back(MatcherCast<const Element&>(*first++));
+    }
+  }
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    if (count() == 0) {
+      *os << "is empty";
+    } else if (count() == 1) {
+      *os << "has 1 element that ";
+      matchers_[0].DescribeTo(os);
+    } else {
+      *os << "has " << Elements(count()) << " where\n";
+      for (size_t i = 0; i != count(); ++i) {
+        *os << "element #" << i << " ";
+        matchers_[i].DescribeTo(os);
+        if (i + 1 < count()) {
+          *os << ",\n";
+        }
+      }
+    }
+  }
+
+  // Describes what the negation of this matcher does.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    if (count() == 0) {
+      *os << "isn't empty";
+      return;
+    }
+
+    *os << "doesn't have " << Elements(count()) << ", or\n";
+    for (size_t i = 0; i != count(); ++i) {
+      *os << "element #" << i << " ";
+      matchers_[i].DescribeNegationTo(os);
+      if (i + 1 < count()) {
+        *os << ", or\n";
+      }
+    }
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    // To work with stream-like "containers", we must only walk
+    // through the elements in one pass.
+
+    const bool listener_interested = listener->IsInterested();
+
+    // explanations[i] is the explanation of the element at index i.
+    ::std::vector<std::string> explanations(count());
+    StlContainerReference stl_container = View::ConstReference(container);
+    typename StlContainer::const_iterator it = stl_container.begin();
+    size_t exam_pos = 0;
+    bool mismatch_found = false;  // Have we found a mismatched element yet?
+
+    // Go through the elements and matchers in pairs, until we reach
+    // the end of either the elements or the matchers, or until we find a
+    // mismatch.
+    for (; it != stl_container.end() && exam_pos != count(); ++it, ++exam_pos) {
+      bool match;  // Does the current element match the current matcher?
+      if (listener_interested) {
+        StringMatchResultListener s;
+        match = matchers_[exam_pos].MatchAndExplain(*it, &s);
+        explanations[exam_pos] = s.str();
+      } else {
+        match = matchers_[exam_pos].Matches(*it);
+      }
+
+      if (!match) {
+        mismatch_found = true;
+        break;
+      }
+    }
+    // If mismatch_found is true, 'exam_pos' is the index of the mismatch.
+
+    // Find how many elements the actual container has.  We avoid
+    // calling size() s.t. this code works for stream-like "containers"
+    // that don't define size().
+    size_t actual_count = exam_pos;
+    for (; it != stl_container.end(); ++it) {
+      ++actual_count;
+    }
+
+    if (actual_count != count()) {
+      // The element count doesn't match.  If the container is empty,
+      // there's no need to explain anything as Google Mock already
+      // prints the empty container.  Otherwise we just need to show
+      // how many elements there actually are.
+      if (listener_interested && (actual_count != 0)) {
+        *listener << "which has " << Elements(actual_count);
+      }
+      return false;
+    }
+
+    if (mismatch_found) {
+      // The element count matches, but the exam_pos-th element doesn't match.
+      if (listener_interested) {
+        *listener << "whose element #" << exam_pos << " doesn't match";
+        PrintIfNotEmpty(explanations[exam_pos], listener->stream());
+      }
+      return false;
+    }
+
+    // Every element matches its expectation.  We need to explain why
+    // (the obvious ones can be skipped).
+    if (listener_interested) {
+      bool reason_printed = false;
+      for (size_t i = 0; i != count(); ++i) {
+        const std::string& s = explanations[i];
+        if (!s.empty()) {
+          if (reason_printed) {
+            *listener << ",\nand ";
+          }
+          *listener << "whose element #" << i << " matches, " << s;
+          reason_printed = true;
+        }
+      }
+    }
+    return true;
+  }
+
+ private:
+  static Message Elements(size_t count) {
+    return Message() << count << (count == 1 ? " element" : " elements");
+  }
+
+  size_t count() const { return matchers_.size(); }
+
+  ::std::vector<Matcher<const Element&> > matchers_;
+};
+
+// Connectivity matrix of (elements X matchers), in element-major order.
+// Initially, there are no edges.
+// Use NextGraph() to iterate over all possible edge configurations.
+// Use Randomize() to generate a random edge configuration.
+class GTEST_API_ MatchMatrix {
+ public:
+  MatchMatrix(size_t num_elements, size_t num_matchers)
+      : num_elements_(num_elements),
+        num_matchers_(num_matchers),
+        matched_(num_elements_* num_matchers_, 0) {
+  }
+
+  size_t LhsSize() const { return num_elements_; }
+  size_t RhsSize() const { return num_matchers_; }
+  bool HasEdge(size_t ilhs, size_t irhs) const {
+    return matched_[SpaceIndex(ilhs, irhs)] == 1;
+  }
+  void SetEdge(size_t ilhs, size_t irhs, bool b) {
+    matched_[SpaceIndex(ilhs, irhs)] = b ? 1 : 0;
+  }
+
+  // Treating the connectivity matrix as a (LhsSize()*RhsSize())-bit number,
+  // adds 1 to that number; returns false if incrementing the graph left it
+  // empty.
+  bool NextGraph();
+
+  void Randomize();
+
+  std::string DebugString() const;
+
+ private:
+  size_t SpaceIndex(size_t ilhs, size_t irhs) const {
+    return ilhs * num_matchers_ + irhs;
+  }
+
+  size_t num_elements_;
+  size_t num_matchers_;
+
+  // Each element is a char interpreted as bool. They are stored as a
+  // flattened array in lhs-major order, use 'SpaceIndex()' to translate
+  // a (ilhs, irhs) matrix coordinate into an offset.
+  ::std::vector<char> matched_;
+};
+
+typedef ::std::pair<size_t, size_t> ElementMatcherPair;
+typedef ::std::vector<ElementMatcherPair> ElementMatcherPairs;
+
+// Returns a maximum bipartite matching for the specified graph 'g'.
+// The matching is represented as a vector of {element, matcher} pairs.
+GTEST_API_ ElementMatcherPairs
+FindMaxBipartiteMatching(const MatchMatrix& g);
+
+struct UnorderedMatcherRequire {
+  enum Flags {
+    Superset = 1 << 0,
+    Subset = 1 << 1,
+    ExactMatch = Superset | Subset,
+  };
+};
+
+// Untyped base class for implementing UnorderedElementsAre.  By
+// putting logic that's not specific to the element type here, we
+// reduce binary bloat and increase compilation speed.
+class GTEST_API_ UnorderedElementsAreMatcherImplBase {
+ protected:
+  explicit UnorderedElementsAreMatcherImplBase(
+      UnorderedMatcherRequire::Flags matcher_flags)
+      : match_flags_(matcher_flags) {}
+
+  // A vector of matcher describers, one for each element matcher.
+  // Does not own the describers (and thus can be used only when the
+  // element matchers are alive).
+  typedef ::std::vector<const MatcherDescriberInterface*> MatcherDescriberVec;
+
+  // Describes this UnorderedElementsAre matcher.
+  void DescribeToImpl(::std::ostream* os) const;
+
+  // Describes the negation of this UnorderedElementsAre matcher.
+  void DescribeNegationToImpl(::std::ostream* os) const;
+
+  bool VerifyMatchMatrix(const ::std::vector<std::string>& element_printouts,
+                         const MatchMatrix& matrix,
+                         MatchResultListener* listener) const;
+
+  bool FindPairing(const MatchMatrix& matrix,
+                   MatchResultListener* listener) const;
+
+  MatcherDescriberVec& matcher_describers() {
+    return matcher_describers_;
+  }
+
+  static Message Elements(size_t n) {
+    return Message() << n << " element" << (n == 1 ? "" : "s");
+  }
+
+  UnorderedMatcherRequire::Flags match_flags() const { return match_flags_; }
+
+ private:
+  UnorderedMatcherRequire::Flags match_flags_;
+  MatcherDescriberVec matcher_describers_;
+};
+
+// Implements UnorderedElementsAre, UnorderedElementsAreArray, IsSubsetOf, and
+// IsSupersetOf.
+template <typename Container>
+class UnorderedElementsAreMatcherImpl
+    : public MatcherInterface<Container>,
+      public UnorderedElementsAreMatcherImplBase {
+ public:
+  typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+  typedef internal::StlContainerView<RawContainer> View;
+  typedef typename View::type StlContainer;
+  typedef typename View::const_reference StlContainerReference;
+  typedef typename StlContainer::const_iterator StlContainerConstIterator;
+  typedef typename StlContainer::value_type Element;
+
+  template <typename InputIter>
+  UnorderedElementsAreMatcherImpl(UnorderedMatcherRequire::Flags matcher_flags,
+                                  InputIter first, InputIter last)
+      : UnorderedElementsAreMatcherImplBase(matcher_flags) {
+    for (; first != last; ++first) {
+      matchers_.push_back(MatcherCast<const Element&>(*first));
+    }
+    for (const auto& m : matchers_) {
+      matcher_describers().push_back(m.GetDescriber());
+    }
+  }
+
+  // Describes what this matcher does.
+  void DescribeTo(::std::ostream* os) const override {
+    return UnorderedElementsAreMatcherImplBase::DescribeToImpl(os);
+  }
+
+  // Describes what the negation of this matcher does.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    return UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    StlContainerReference stl_container = View::ConstReference(container);
+    ::std::vector<std::string> element_printouts;
+    MatchMatrix matrix =
+        AnalyzeElements(stl_container.begin(), stl_container.end(),
+                        &element_printouts, listener);
+
+    if (matrix.LhsSize() == 0 && matrix.RhsSize() == 0) {
+      return true;
+    }
+
+    if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+      if (matrix.LhsSize() != matrix.RhsSize()) {
+        // The element count doesn't match.  If the container is empty,
+        // there's no need to explain anything as Google Mock already
+        // prints the empty container. Otherwise we just need to show
+        // how many elements there actually are.
+        if (matrix.LhsSize() != 0 && listener->IsInterested()) {
+          *listener << "which has " << Elements(matrix.LhsSize());
+        }
+        return false;
+      }
+    }
+
+    return VerifyMatchMatrix(element_printouts, matrix, listener) &&
+           FindPairing(matrix, listener);
+  }
+
+ private:
+  template <typename ElementIter>
+  MatchMatrix AnalyzeElements(ElementIter elem_first, ElementIter elem_last,
+                              ::std::vector<std::string>* element_printouts,
+                              MatchResultListener* listener) const {
+    element_printouts->clear();
+    ::std::vector<char> did_match;
+    size_t num_elements = 0;
+    DummyMatchResultListener dummy;
+    for (; elem_first != elem_last; ++num_elements, ++elem_first) {
+      if (listener->IsInterested()) {
+        element_printouts->push_back(PrintToString(*elem_first));
+      }
+      for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) {
+        did_match.push_back(
+            matchers_[irhs].MatchAndExplain(*elem_first, &dummy));
+      }
+    }
+
+    MatchMatrix matrix(num_elements, matchers_.size());
+    ::std::vector<char>::const_iterator did_match_iter = did_match.begin();
+    for (size_t ilhs = 0; ilhs != num_elements; ++ilhs) {
+      for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) {
+        matrix.SetEdge(ilhs, irhs, *did_match_iter++ != 0);
+      }
+    }
+    return matrix;
+  }
+
+  ::std::vector<Matcher<const Element&> > matchers_;
+};
+
+// Functor for use in TransformTuple.
+// Performs MatcherCast<Target> on an input argument of any type.
+template <typename Target>
+struct CastAndAppendTransform {
+  template <typename Arg>
+  Matcher<Target> operator()(const Arg& a) const {
+    return MatcherCast<Target>(a);
+  }
+};
+
+// Implements UnorderedElementsAre.
+template <typename MatcherTuple>
+class UnorderedElementsAreMatcher {
+ public:
+  explicit UnorderedElementsAreMatcher(const MatcherTuple& args)
+      : matchers_(args) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+    typedef typename internal::StlContainerView<RawContainer>::type View;
+    typedef typename View::value_type Element;
+    typedef ::std::vector<Matcher<const Element&> > MatcherVec;
+    MatcherVec matchers;
+    matchers.reserve(::std::tuple_size<MatcherTuple>::value);
+    TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
+                         ::std::back_inserter(matchers));
+    return Matcher<Container>(
+        new UnorderedElementsAreMatcherImpl<const Container&>(
+            UnorderedMatcherRequire::ExactMatch, matchers.begin(),
+            matchers.end()));
+  }
+
+ private:
+  const MatcherTuple matchers_;
+};
+
+// Implements ElementsAre.
+template <typename MatcherTuple>
+class ElementsAreMatcher {
+ public:
+  explicit ElementsAreMatcher(const MatcherTuple& args) : matchers_(args) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    GTEST_COMPILE_ASSERT_(
+        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value ||
+            ::std::tuple_size<MatcherTuple>::value < 2,
+        use_UnorderedElementsAre_with_hash_tables);
+
+    typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+    typedef typename internal::StlContainerView<RawContainer>::type View;
+    typedef typename View::value_type Element;
+    typedef ::std::vector<Matcher<const Element&> > MatcherVec;
+    MatcherVec matchers;
+    matchers.reserve(::std::tuple_size<MatcherTuple>::value);
+    TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
+                         ::std::back_inserter(matchers));
+    return Matcher<Container>(new ElementsAreMatcherImpl<const Container&>(
+        matchers.begin(), matchers.end()));
+  }
+
+ private:
+  const MatcherTuple matchers_;
+};
+
+// Implements UnorderedElementsAreArray(), IsSubsetOf(), and IsSupersetOf().
+template <typename T>
+class UnorderedElementsAreArrayMatcher {
+ public:
+  template <typename Iter>
+  UnorderedElementsAreArrayMatcher(UnorderedMatcherRequire::Flags match_flags,
+                                   Iter first, Iter last)
+      : match_flags_(match_flags), matchers_(first, last) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(
+        new UnorderedElementsAreMatcherImpl<const Container&>(
+            match_flags_, matchers_.begin(), matchers_.end()));
+  }
+
+ private:
+  UnorderedMatcherRequire::Flags match_flags_;
+  ::std::vector<T> matchers_;
+};
+
+// Implements ElementsAreArray().
+template <typename T>
+class ElementsAreArrayMatcher {
+ public:
+  template <typename Iter>
+  ElementsAreArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    GTEST_COMPILE_ASSERT_(
+        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value,
+        use_UnorderedElementsAreArray_with_hash_tables);
+
+    return Matcher<Container>(new ElementsAreMatcherImpl<const Container&>(
+        matchers_.begin(), matchers_.end()));
+  }
+
+ private:
+  const ::std::vector<T> matchers_;
+};
+
+// Given a 2-tuple matcher tm of type Tuple2Matcher and a value second
+// of type Second, BoundSecondMatcher<Tuple2Matcher, Second>(tm,
+// second) is a polymorphic matcher that matches a value x if and only if
+// tm matches tuple (x, second).  Useful for implementing
+// UnorderedPointwise() in terms of UnorderedElementsAreArray().
+//
+// BoundSecondMatcher is copyable and assignable, as we need to put
+// instances of this class in a vector when implementing
+// UnorderedPointwise().
+template <typename Tuple2Matcher, typename Second>
+class BoundSecondMatcher {
+ public:
+  BoundSecondMatcher(const Tuple2Matcher& tm, const Second& second)
+      : tuple2_matcher_(tm), second_value_(second) {}
+
+  BoundSecondMatcher(const BoundSecondMatcher& other) = default;
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return MakeMatcher(new Impl<T>(tuple2_matcher_, second_value_));
+  }
+
+  // We have to define this for UnorderedPointwise() to compile in
+  // C++98 mode, as it puts BoundSecondMatcher instances in a vector,
+  // which requires the elements to be assignable in C++98.  The
+  // compiler cannot generate the operator= for us, as Tuple2Matcher
+  // and Second may not be assignable.
+  //
+  // However, this should never be called, so the implementation just
+  // need to assert.
+  void operator=(const BoundSecondMatcher& /*rhs*/) {
+    GTEST_LOG_(FATAL) << "BoundSecondMatcher should never be assigned.";
+  }
+
+ private:
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+   public:
+    typedef ::std::tuple<T, Second> ArgTuple;
+
+    Impl(const Tuple2Matcher& tm, const Second& second)
+        : mono_tuple2_matcher_(SafeMatcherCast<const ArgTuple&>(tm)),
+          second_value_(second) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "and ";
+      UniversalPrint(second_value_, os);
+      *os << " ";
+      mono_tuple2_matcher_.DescribeTo(os);
+    }
+
+    bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+      return mono_tuple2_matcher_.MatchAndExplain(ArgTuple(x, second_value_),
+                                                  listener);
+    }
+
+   private:
+    const Matcher<const ArgTuple&> mono_tuple2_matcher_;
+    const Second second_value_;
+  };
+
+  const Tuple2Matcher tuple2_matcher_;
+  const Second second_value_;
+};
+
+// Given a 2-tuple matcher tm and a value second,
+// MatcherBindSecond(tm, second) returns a matcher that matches a
+// value x if and only if tm matches tuple (x, second).  Useful for
+// implementing UnorderedPointwise() in terms of UnorderedElementsAreArray().
+template <typename Tuple2Matcher, typename Second>
+BoundSecondMatcher<Tuple2Matcher, Second> MatcherBindSecond(
+    const Tuple2Matcher& tm, const Second& second) {
+  return BoundSecondMatcher<Tuple2Matcher, Second>(tm, second);
+}
+
+// Returns the description for a matcher defined using the MATCHER*()
+// macro where the user-supplied description string is "", if
+// 'negation' is false; otherwise returns the description of the
+// negation of the matcher.  'param_values' contains a list of strings
+// that are the print-out of the matcher's parameters.
+GTEST_API_ std::string FormatMatcherDescription(bool negation,
+                                                const char* matcher_name,
+                                                const Strings& param_values);
+
+// Implements a matcher that checks the value of a optional<> type variable.
+template <typename ValueMatcher>
+class OptionalMatcher {
+ public:
+  explicit OptionalMatcher(const ValueMatcher& value_matcher)
+      : value_matcher_(value_matcher) {}
+
+  template <typename Optional>
+  operator Matcher<Optional>() const {
+    return Matcher<Optional>(new Impl<const Optional&>(value_matcher_));
+  }
+
+  template <typename Optional>
+  class Impl : public MatcherInterface<Optional> {
+   public:
+    typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Optional) OptionalView;
+    typedef typename OptionalView::value_type ValueType;
+    explicit Impl(const ValueMatcher& value_matcher)
+        : value_matcher_(MatcherCast<ValueType>(value_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "value ";
+      value_matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "value ";
+      value_matcher_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(Optional optional,
+                         MatchResultListener* listener) const override {
+      if (!optional) {
+        *listener << "which is not engaged";
+        return false;
+      }
+      const ValueType& value = *optional;
+      StringMatchResultListener value_listener;
+      const bool match = value_matcher_.MatchAndExplain(value, &value_listener);
+      *listener << "whose value " << PrintToString(value)
+                << (match ? " matches" : " doesn't match");
+      PrintIfNotEmpty(value_listener.str(), listener->stream());
+      return match;
+    }
+
+   private:
+    const Matcher<ValueType> value_matcher_;
+  };
+
+ private:
+  const ValueMatcher value_matcher_;
+};
+
+namespace variant_matcher {
+// Overloads to allow VariantMatcher to do proper ADL lookup.
+template <typename T>
+void holds_alternative() {}
+template <typename T>
+void get() {}
+
+// Implements a matcher that checks the value of a variant<> type variable.
+template <typename T>
+class VariantMatcher {
+ public:
+  explicit VariantMatcher(::testing::Matcher<const T&> matcher)
+      : matcher_(std::move(matcher)) {}
+
+  template <typename Variant>
+  bool MatchAndExplain(const Variant& value,
+                       ::testing::MatchResultListener* listener) const {
+    using std::get;
+    if (!listener->IsInterested()) {
+      return holds_alternative<T>(value) && matcher_.Matches(get<T>(value));
+    }
+
+    if (!holds_alternative<T>(value)) {
+      *listener << "whose value is not of type '" << GetTypeName() << "'";
+      return false;
+    }
+
+    const T& elem = get<T>(value);
+    StringMatchResultListener elem_listener;
+    const bool match = matcher_.MatchAndExplain(elem, &elem_listener);
+    *listener << "whose value " << PrintToString(elem)
+              << (match ? " matches" : " doesn't match");
+    PrintIfNotEmpty(elem_listener.str(), listener->stream());
+    return match;
+  }
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "is a variant<> with value of type '" << GetTypeName()
+        << "' and the value ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is a variant<> with value of type other than '" << GetTypeName()
+        << "' or the value ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+ private:
+  static std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(
+        return internal::GetTypeName<T>());
+#endif
+    return "the element type";
+  }
+
+  const ::testing::Matcher<const T&> matcher_;
+};
+
+}  // namespace variant_matcher
+
+namespace any_cast_matcher {
+
+// Overloads to allow AnyCastMatcher to do proper ADL lookup.
+template <typename T>
+void any_cast() {}
+
+// Implements a matcher that any_casts the value.
+template <typename T>
+class AnyCastMatcher {
+ public:
+  explicit AnyCastMatcher(const ::testing::Matcher<const T&>& matcher)
+      : matcher_(matcher) {}
+
+  template <typename AnyType>
+  bool MatchAndExplain(const AnyType& value,
+                       ::testing::MatchResultListener* listener) const {
+    if (!listener->IsInterested()) {
+      const T* ptr = any_cast<T>(&value);
+      return ptr != nullptr && matcher_.Matches(*ptr);
+    }
+
+    const T* elem = any_cast<T>(&value);
+    if (elem == nullptr) {
+      *listener << "whose value is not of type '" << GetTypeName() << "'";
+      return false;
+    }
+
+    StringMatchResultListener elem_listener;
+    const bool match = matcher_.MatchAndExplain(*elem, &elem_listener);
+    *listener << "whose value " << PrintToString(*elem)
+              << (match ? " matches" : " doesn't match");
+    PrintIfNotEmpty(elem_listener.str(), listener->stream());
+    return match;
+  }
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "is an 'any' type with value of type '" << GetTypeName()
+        << "' and the value ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is an 'any' type with value of type other than '" << GetTypeName()
+        << "' or the value ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+ private:
+  static std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(
+        return internal::GetTypeName<T>());
+#endif
+    return "the element type";
+  }
+
+  const ::testing::Matcher<const T&> matcher_;
+};
+
+}  // namespace any_cast_matcher
+
+// Implements the Args() matcher.
+template <class ArgsTuple, size_t... k>
+class ArgsMatcherImpl : public MatcherInterface<ArgsTuple> {
+ public:
+  using RawArgsTuple = typename std::decay<ArgsTuple>::type;
+  using SelectedArgs =
+      std::tuple<typename std::tuple_element<k, RawArgsTuple>::type...>;
+  using MonomorphicInnerMatcher = Matcher<const SelectedArgs&>;
+
+  template <typename InnerMatcher>
+  explicit ArgsMatcherImpl(const InnerMatcher& inner_matcher)
+      : inner_matcher_(SafeMatcherCast<const SelectedArgs&>(inner_matcher)) {}
+
+  bool MatchAndExplain(ArgsTuple args,
+                       MatchResultListener* listener) const override {
+    // Workaround spurious C4100 on MSVC<=15.7 when k is empty.
+    (void)args;
+    const SelectedArgs& selected_args =
+        std::forward_as_tuple(std::get<k>(args)...);
+    if (!listener->IsInterested()) return inner_matcher_.Matches(selected_args);
+
+    PrintIndices(listener->stream());
+    *listener << "are " << PrintToString(selected_args);
+
+    StringMatchResultListener inner_listener;
+    const bool match =
+        inner_matcher_.MatchAndExplain(selected_args, &inner_listener);
+    PrintIfNotEmpty(inner_listener.str(), listener->stream());
+    return match;
+  }
+
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "are a tuple ";
+    PrintIndices(os);
+    inner_matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "are a tuple ";
+    PrintIndices(os);
+    inner_matcher_.DescribeNegationTo(os);
+  }
+
+ private:
+  // Prints the indices of the selected fields.
+  static void PrintIndices(::std::ostream* os) {
+    *os << "whose fields (";
+    const char* sep = "";
+    // Workaround spurious C4189 on MSVC<=15.7 when k is empty.
+    (void)sep;
+    const char* dummy[] = {"", (*os << sep << "#" << k, sep = ", ")...};
+    (void)dummy;
+    *os << ") ";
+  }
+
+  MonomorphicInnerMatcher inner_matcher_;
+};
+
+template <class InnerMatcher, size_t... k>
+class ArgsMatcher {
+ public:
+  explicit ArgsMatcher(InnerMatcher inner_matcher)
+      : inner_matcher_(std::move(inner_matcher)) {}
+
+  template <typename ArgsTuple>
+  operator Matcher<ArgsTuple>() const {  // NOLINT
+    return MakeMatcher(new ArgsMatcherImpl<ArgsTuple, k...>(inner_matcher_));
+  }
+
+ private:
+  InnerMatcher inner_matcher_;
+};
+
+}  // namespace internal
+
+// ElementsAreArray(iterator_first, iterator_last)
+// ElementsAreArray(pointer, count)
+// ElementsAreArray(array)
+// ElementsAreArray(container)
+// ElementsAreArray({ e1, e2, ..., en })
+//
+// The ElementsAreArray() functions are like ElementsAre(...), except
+// that they are given a homogeneous sequence rather than taking each
+// element as a function argument. The sequence can be specified as an
+// array, a pointer and count, a vector, an initializer list, or an
+// STL iterator range. In each of these cases, the underlying sequence
+// can be either a sequence of values or a sequence of matchers.
+//
+// All forms of ElementsAreArray() make a copy of the input matcher sequence.
+
+template <typename Iter>
+inline internal::ElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+ElementsAreArray(Iter first, Iter last) {
+  typedef typename ::std::iterator_traits<Iter>::value_type T;
+  return internal::ElementsAreArrayMatcher<T>(first, last);
+}
+
+template <typename T>
+inline internal::ElementsAreArrayMatcher<T> ElementsAreArray(
+    const T* pointer, size_t count) {
+  return ElementsAreArray(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::ElementsAreArrayMatcher<T> ElementsAreArray(
+    const T (&array)[N]) {
+  return ElementsAreArray(array, N);
+}
+
+template <typename Container>
+inline internal::ElementsAreArrayMatcher<typename Container::value_type>
+ElementsAreArray(const Container& container) {
+  return ElementsAreArray(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::ElementsAreArrayMatcher<T>
+ElementsAreArray(::std::initializer_list<T> xs) {
+  return ElementsAreArray(xs.begin(), xs.end());
+}
+
+// UnorderedElementsAreArray(iterator_first, iterator_last)
+// UnorderedElementsAreArray(pointer, count)
+// UnorderedElementsAreArray(array)
+// UnorderedElementsAreArray(container)
+// UnorderedElementsAreArray({ e1, e2, ..., en })
+//
+// UnorderedElementsAreArray() verifies that a bijective mapping onto a
+// collection of matchers exists.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+UnorderedElementsAreArray(Iter first, Iter last) {
+  typedef typename ::std::iterator_traits<Iter>::value_type T;
+  return internal::UnorderedElementsAreArrayMatcher<T>(
+      internal::UnorderedMatcherRequire::ExactMatch, first, last);
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T>
+UnorderedElementsAreArray(const T* pointer, size_t count) {
+  return UnorderedElementsAreArray(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T>
+UnorderedElementsAreArray(const T (&array)[N]) {
+  return UnorderedElementsAreArray(array, N);
+}
+
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename Container::value_type>
+UnorderedElementsAreArray(const Container& container) {
+  return UnorderedElementsAreArray(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T>
+UnorderedElementsAreArray(::std::initializer_list<T> xs) {
+  return UnorderedElementsAreArray(xs.begin(), xs.end());
+}
+
+// _ is a matcher that matches anything of any type.
+//
+// This definition is fine as:
+//
+//   1. The C++ standard permits using the name _ in a namespace that
+//      is not the global namespace or ::std.
+//   2. The AnythingMatcher class has no data member or constructor,
+//      so it's OK to create global variables of this type.
+//   3. c-style has approved of using _ in this case.
+const internal::AnythingMatcher _ = {};
+// Creates a matcher that matches any value of the given type T.
+template <typename T>
+inline Matcher<T> A() {
+  return _;
+}
+
+// Creates a matcher that matches any value of the given type T.
+template <typename T>
+inline Matcher<T> An() {
+  return _;
+}
+
+template <typename T, typename M>
+Matcher<T> internal::MatcherCastImpl<T, M>::CastImpl(
+    const M& value, std::false_type /* convertible_to_matcher */,
+    std::false_type /* convertible_to_T */) {
+  return Eq(value);
+}
+
+// Creates a polymorphic matcher that matches any NULL pointer.
+inline PolymorphicMatcher<internal::IsNullMatcher > IsNull() {
+  return MakePolymorphicMatcher(internal::IsNullMatcher());
+}
+
+// Creates a polymorphic matcher that matches any non-NULL pointer.
+// This is convenient as Not(NULL) doesn't compile (the compiler
+// thinks that that expression is comparing a pointer with an integer).
+inline PolymorphicMatcher<internal::NotNullMatcher > NotNull() {
+  return MakePolymorphicMatcher(internal::NotNullMatcher());
+}
+
+// Creates a polymorphic matcher that matches any argument that
+// references variable x.
+template <typename T>
+inline internal::RefMatcher<T&> Ref(T& x) {  // NOLINT
+  return internal::RefMatcher<T&>(x);
+}
+
+// Creates a polymorphic matcher that matches any NaN floating point.
+inline PolymorphicMatcher<internal::IsNanMatcher> IsNan() {
+  return MakePolymorphicMatcher(internal::IsNanMatcher());
+}
+
+// Creates a matcher that matches any double argument approximately
+// equal to rhs, where two NANs are considered unequal.
+inline internal::FloatingEqMatcher<double> DoubleEq(double rhs) {
+  return internal::FloatingEqMatcher<double>(rhs, false);
+}
+
+// Creates a matcher that matches any double argument approximately
+// equal to rhs, including NaN values when rhs is NaN.
+inline internal::FloatingEqMatcher<double> NanSensitiveDoubleEq(double rhs) {
+  return internal::FloatingEqMatcher<double>(rhs, true);
+}
+
+// Creates a matcher that matches any double argument approximately equal to
+// rhs, up to the specified max absolute error bound, where two NANs are
+// considered unequal.  The max absolute error bound must be non-negative.
+inline internal::FloatingEqMatcher<double> DoubleNear(
+    double rhs, double max_abs_error) {
+  return internal::FloatingEqMatcher<double>(rhs, false, max_abs_error);
+}
+
+// Creates a matcher that matches any double argument approximately equal to
+// rhs, up to the specified max absolute error bound, including NaN values when
+// rhs is NaN.  The max absolute error bound must be non-negative.
+inline internal::FloatingEqMatcher<double> NanSensitiveDoubleNear(
+    double rhs, double max_abs_error) {
+  return internal::FloatingEqMatcher<double>(rhs, true, max_abs_error);
+}
+
+// Creates a matcher that matches any float argument approximately
+// equal to rhs, where two NANs are considered unequal.
+inline internal::FloatingEqMatcher<float> FloatEq(float rhs) {
+  return internal::FloatingEqMatcher<float>(rhs, false);
+}
+
+// Creates a matcher that matches any float argument approximately
+// equal to rhs, including NaN values when rhs is NaN.
+inline internal::FloatingEqMatcher<float> NanSensitiveFloatEq(float rhs) {
+  return internal::FloatingEqMatcher<float>(rhs, true);
+}
+
+// Creates a matcher that matches any float argument approximately equal to
+// rhs, up to the specified max absolute error bound, where two NANs are
+// considered unequal.  The max absolute error bound must be non-negative.
+inline internal::FloatingEqMatcher<float> FloatNear(
+    float rhs, float max_abs_error) {
+  return internal::FloatingEqMatcher<float>(rhs, false, max_abs_error);
+}
+
+// Creates a matcher that matches any float argument approximately equal to
+// rhs, up to the specified max absolute error bound, including NaN values when
+// rhs is NaN.  The max absolute error bound must be non-negative.
+inline internal::FloatingEqMatcher<float> NanSensitiveFloatNear(
+    float rhs, float max_abs_error) {
+  return internal::FloatingEqMatcher<float>(rhs, true, max_abs_error);
+}
+
+// Creates a matcher that matches a pointer (raw or smart) that points
+// to a value that matches inner_matcher.
+template <typename InnerMatcher>
+inline internal::PointeeMatcher<InnerMatcher> Pointee(
+    const InnerMatcher& inner_matcher) {
+  return internal::PointeeMatcher<InnerMatcher>(inner_matcher);
+}
+
+#if GTEST_HAS_RTTI
+// Creates a matcher that matches a pointer or reference that matches
+// inner_matcher when dynamic_cast<To> is applied.
+// The result of dynamic_cast<To> is forwarded to the inner matcher.
+// If To is a pointer and the cast fails, the inner matcher will receive NULL.
+// If To is a reference and the cast fails, this matcher returns false
+// immediately.
+template <typename To>
+inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To> >
+WhenDynamicCastTo(const Matcher<To>& inner_matcher) {
+  return MakePolymorphicMatcher(
+      internal::WhenDynamicCastToMatcher<To>(inner_matcher));
+}
+#endif  // GTEST_HAS_RTTI
+
+// Creates a matcher that matches an object whose given field matches
+// 'matcher'.  For example,
+//   Field(&Foo::number, Ge(5))
+// matches a Foo object x if and only if x.number >= 5.
+template <typename Class, typename FieldType, typename FieldMatcher>
+inline PolymorphicMatcher<
+  internal::FieldMatcher<Class, FieldType> > Field(
+    FieldType Class::*field, const FieldMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::FieldMatcher<Class, FieldType>(
+          field, MatcherCast<const FieldType&>(matcher)));
+  // The call to MatcherCast() is required for supporting inner
+  // matchers of compatible types.  For example, it allows
+  //   Field(&Foo::bar, m)
+  // to compile where bar is an int32 and m is a matcher for int64.
+}
+
+// Same as Field() but also takes the name of the field to provide better error
+// messages.
+template <typename Class, typename FieldType, typename FieldMatcher>
+inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType> > Field(
+    const std::string& field_name, FieldType Class::*field,
+    const FieldMatcher& matcher) {
+  return MakePolymorphicMatcher(internal::FieldMatcher<Class, FieldType>(
+      field_name, field, MatcherCast<const FieldType&>(matcher)));
+}
+
+// Creates a matcher that matches an object whose given property
+// matches 'matcher'.  For example,
+//   Property(&Foo::str, StartsWith("hi"))
+// matches a Foo object x if and only if x.str() starts with "hi".
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const> >
+Property(PropertyType (Class::*property)() const,
+         const PropertyMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::PropertyMatcher<Class, PropertyType,
+                                PropertyType (Class::*)() const>(
+          property, MatcherCast<const PropertyType&>(matcher)));
+  // The call to MatcherCast() is required for supporting inner
+  // matchers of compatible types.  For example, it allows
+  //   Property(&Foo::bar, m)
+  // to compile where bar() returns an int32 and m is a matcher for int64.
+}
+
+// Same as Property() above, but also takes the name of the property to provide
+// better error messages.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const> >
+Property(const std::string& property_name,
+         PropertyType (Class::*property)() const,
+         const PropertyMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::PropertyMatcher<Class, PropertyType,
+                                PropertyType (Class::*)() const>(
+          property_name, property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// The same as above but for reference-qualified member functions.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const &> >
+Property(PropertyType (Class::*property)() const &,
+         const PropertyMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::PropertyMatcher<Class, PropertyType,
+                                PropertyType (Class::*)() const&>(
+          property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Three-argument form for reference-qualified member functions.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const &> >
+Property(const std::string& property_name,
+         PropertyType (Class::*property)() const &,
+         const PropertyMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::PropertyMatcher<Class, PropertyType,
+                                PropertyType (Class::*)() const&>(
+          property_name, property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Creates a matcher that matches an object if and only if the result of
+// applying a callable to x matches 'matcher'. For example,
+//   ResultOf(f, StartsWith("hi"))
+// matches a Foo object x if and only if f(x) starts with "hi".
+// `callable` parameter can be a function, function pointer, or a functor. It is
+// required to keep no state affecting the results of the calls on it and make
+// no assumptions about how many calls will be made. Any state it keeps must be
+// protected from the concurrent access.
+template <typename Callable, typename InnerMatcher>
+internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf(
+    Callable callable, InnerMatcher matcher) {
+  return internal::ResultOfMatcher<Callable, InnerMatcher>(
+      std::move(callable), std::move(matcher));
+}
+
+// String matchers.
+
+// Matches a string equal to str.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrEq(
+    const internal::StringLike<T>& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::string>(std::string(str), true, true));
+}
+
+// Matches a string not equal to str.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrNe(
+    const internal::StringLike<T>& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::string>(std::string(str), false, true));
+}
+
+// Matches a string equal to str, ignoring case.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseEq(
+    const internal::StringLike<T>& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::string>(std::string(str), true, false));
+}
+
+// Matches a string not equal to str, ignoring case.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string> > StrCaseNe(
+    const internal::StringLike<T>& str) {
+  return MakePolymorphicMatcher(internal::StrEqualityMatcher<std::string>(
+      std::string(str), false, false));
+}
+
+// Creates a matcher that matches any string, std::string, or C string
+// that contains the given substring.
+template <typename T = std::string>
+PolymorphicMatcher<internal::HasSubstrMatcher<std::string> > HasSubstr(
+    const internal::StringLike<T>& substring) {
+  return MakePolymorphicMatcher(
+      internal::HasSubstrMatcher<std::string>(std::string(substring)));
+}
+
+// Matches a string that starts with 'prefix' (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::StartsWithMatcher<std::string> > StartsWith(
+    const internal::StringLike<T>& prefix) {
+  return MakePolymorphicMatcher(
+      internal::StartsWithMatcher<std::string>(std::string(prefix)));
+}
+
+// Matches a string that ends with 'suffix' (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::EndsWithMatcher<std::string> > EndsWith(
+    const internal::StringLike<T>& suffix) {
+  return MakePolymorphicMatcher(
+      internal::EndsWithMatcher<std::string>(std::string(suffix)));
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Wide string matchers.
+
+// Matches a string equal to str.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> > StrEq(
+    const std::wstring& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::wstring>(str, true, true));
+}
+
+// Matches a string not equal to str.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> > StrNe(
+    const std::wstring& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::wstring>(str, false, true));
+}
+
+// Matches a string equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> >
+StrCaseEq(const std::wstring& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::wstring>(str, true, false));
+}
+
+// Matches a string not equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring> >
+StrCaseNe(const std::wstring& str) {
+  return MakePolymorphicMatcher(
+      internal::StrEqualityMatcher<std::wstring>(str, false, false));
+}
+
+// Creates a matcher that matches any ::wstring, std::wstring, or C wide string
+// that contains the given substring.
+inline PolymorphicMatcher<internal::HasSubstrMatcher<std::wstring> > HasSubstr(
+    const std::wstring& substring) {
+  return MakePolymorphicMatcher(
+      internal::HasSubstrMatcher<std::wstring>(substring));
+}
+
+// Matches a string that starts with 'prefix' (case-sensitive).
+inline PolymorphicMatcher<internal::StartsWithMatcher<std::wstring> >
+StartsWith(const std::wstring& prefix) {
+  return MakePolymorphicMatcher(
+      internal::StartsWithMatcher<std::wstring>(prefix));
+}
+
+// Matches a string that ends with 'suffix' (case-sensitive).
+inline PolymorphicMatcher<internal::EndsWithMatcher<std::wstring> > EndsWith(
+    const std::wstring& suffix) {
+  return MakePolymorphicMatcher(
+      internal::EndsWithMatcher<std::wstring>(suffix));
+}
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field == the second field.
+inline internal::Eq2Matcher Eq() { return internal::Eq2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field >= the second field.
+inline internal::Ge2Matcher Ge() { return internal::Ge2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field > the second field.
+inline internal::Gt2Matcher Gt() { return internal::Gt2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field <= the second field.
+inline internal::Le2Matcher Le() { return internal::Le2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field < the second field.
+inline internal::Lt2Matcher Lt() { return internal::Lt2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field != the second field.
+inline internal::Ne2Matcher Ne() { return internal::Ne2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// FloatEq(first field) matches the second field.
+inline internal::FloatingEq2Matcher<float> FloatEq() {
+  return internal::FloatingEq2Matcher<float>();
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// DoubleEq(first field) matches the second field.
+inline internal::FloatingEq2Matcher<double> DoubleEq() {
+  return internal::FloatingEq2Matcher<double>();
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// FloatEq(first field) matches the second field with NaN equality.
+inline internal::FloatingEq2Matcher<float> NanSensitiveFloatEq() {
+  return internal::FloatingEq2Matcher<float>(true);
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// DoubleEq(first field) matches the second field with NaN equality.
+inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleEq() {
+  return internal::FloatingEq2Matcher<double>(true);
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// FloatNear(first field, max_abs_error) matches the second field.
+inline internal::FloatingEq2Matcher<float> FloatNear(float max_abs_error) {
+  return internal::FloatingEq2Matcher<float>(max_abs_error);
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// DoubleNear(first field, max_abs_error) matches the second field.
+inline internal::FloatingEq2Matcher<double> DoubleNear(double max_abs_error) {
+  return internal::FloatingEq2Matcher<double>(max_abs_error);
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// FloatNear(first field, max_abs_error) matches the second field with NaN
+// equality.
+inline internal::FloatingEq2Matcher<float> NanSensitiveFloatNear(
+    float max_abs_error) {
+  return internal::FloatingEq2Matcher<float>(max_abs_error, true);
+}
+
+// Creates a polymorphic matcher that matches a 2-tuple where
+// DoubleNear(first field, max_abs_error) matches the second field with NaN
+// equality.
+inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleNear(
+    double max_abs_error) {
+  return internal::FloatingEq2Matcher<double>(max_abs_error, true);
+}
+
+// Creates a matcher that matches any value of type T that m doesn't
+// match.
+template <typename InnerMatcher>
+inline internal::NotMatcher<InnerMatcher> Not(InnerMatcher m) {
+  return internal::NotMatcher<InnerMatcher>(m);
+}
+
+// Returns a matcher that matches anything that satisfies the given
+// predicate.  The predicate can be any unary function or functor
+// whose return type can be implicitly converted to bool.
+template <typename Predicate>
+inline PolymorphicMatcher<internal::TrulyMatcher<Predicate> >
+Truly(Predicate pred) {
+  return MakePolymorphicMatcher(internal::TrulyMatcher<Predicate>(pred));
+}
+
+// Returns a matcher that matches the container size. The container must
+// support both size() and size_type which all STL-like containers provide.
+// Note that the parameter 'size' can be a value of type size_type as well as
+// matcher. For instance:
+//   EXPECT_THAT(container, SizeIs(2));     // Checks container has 2 elements.
+//   EXPECT_THAT(container, SizeIs(Le(2));  // Checks container has at most 2.
+template <typename SizeMatcher>
+inline internal::SizeIsMatcher<SizeMatcher>
+SizeIs(const SizeMatcher& size_matcher) {
+  return internal::SizeIsMatcher<SizeMatcher>(size_matcher);
+}
+
+// Returns a matcher that matches the distance between the container's begin()
+// iterator and its end() iterator, i.e. the size of the container. This matcher
+// can be used instead of SizeIs with containers such as std::forward_list which
+// do not implement size(). The container must provide const_iterator (with
+// valid iterator_traits), begin() and end().
+template <typename DistanceMatcher>
+inline internal::BeginEndDistanceIsMatcher<DistanceMatcher>
+BeginEndDistanceIs(const DistanceMatcher& distance_matcher) {
+  return internal::BeginEndDistanceIsMatcher<DistanceMatcher>(distance_matcher);
+}
+
+// Returns a matcher that matches an equal container.
+// This matcher behaves like Eq(), but in the event of mismatch lists the
+// values that are included in one container but not the other. (Duplicate
+// values and order differences are not explained.)
+template <typename Container>
+inline PolymorphicMatcher<internal::ContainerEqMatcher<
+    typename std::remove_const<Container>::type>>
+ContainerEq(const Container& rhs) {
+  return MakePolymorphicMatcher(internal::ContainerEqMatcher<Container>(rhs));
+}
+
+// Returns a matcher that matches a container that, when sorted using
+// the given comparator, matches container_matcher.
+template <typename Comparator, typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher>
+WhenSortedBy(const Comparator& comparator,
+             const ContainerMatcher& container_matcher) {
+  return internal::WhenSortedByMatcher<Comparator, ContainerMatcher>(
+      comparator, container_matcher);
+}
+
+// Returns a matcher that matches a container that, when sorted using
+// the < operator, matches container_matcher.
+template <typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>
+WhenSorted(const ContainerMatcher& container_matcher) {
+  return
+      internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>(
+          internal::LessComparator(), container_matcher);
+}
+
+// Matches an STL-style container or a native array that contains the
+// same number of elements as in rhs, where its i-th element and rhs's
+// i-th element (as a pair) satisfy the given pair matcher, for all i.
+// TupleMatcher must be able to be safely cast to Matcher<std::tuple<const
+// T1&, const T2&> >, where T1 and T2 are the types of elements in the
+// LHS container and the RHS container respectively.
+template <typename TupleMatcher, typename Container>
+inline internal::PointwiseMatcher<TupleMatcher,
+                                  typename std::remove_const<Container>::type>
+Pointwise(const TupleMatcher& tuple_matcher, const Container& rhs) {
+  return internal::PointwiseMatcher<TupleMatcher, Container>(tuple_matcher,
+                                                             rhs);
+}
+
+
+// Supports the Pointwise(m, {a, b, c}) syntax.
+template <typename TupleMatcher, typename T>
+inline internal::PointwiseMatcher<TupleMatcher, std::vector<T> > Pointwise(
+    const TupleMatcher& tuple_matcher, std::initializer_list<T> rhs) {
+  return Pointwise(tuple_matcher, std::vector<T>(rhs));
+}
+
+
+// UnorderedPointwise(pair_matcher, rhs) matches an STL-style
+// container or a native array that contains the same number of
+// elements as in rhs, where in some permutation of the container, its
+// i-th element and rhs's i-th element (as a pair) satisfy the given
+// pair matcher, for all i.  Tuple2Matcher must be able to be safely
+// cast to Matcher<std::tuple<const T1&, const T2&> >, where T1 and T2 are
+// the types of elements in the LHS container and the RHS container
+// respectively.
+//
+// This is like Pointwise(pair_matcher, rhs), except that the element
+// order doesn't matter.
+template <typename Tuple2Matcher, typename RhsContainer>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<
+        Tuple2Matcher,
+        typename internal::StlContainerView<
+            typename std::remove_const<RhsContainer>::type>::type::value_type>>
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   const RhsContainer& rhs_container) {
+  // RhsView allows the same code to handle RhsContainer being a
+  // STL-style container and it being a native C-style array.
+  typedef typename internal::StlContainerView<RhsContainer> RhsView;
+  typedef typename RhsView::type RhsStlContainer;
+  typedef typename RhsStlContainer::value_type Second;
+  const RhsStlContainer& rhs_stl_container =
+      RhsView::ConstReference(rhs_container);
+
+  // Create a matcher for each element in rhs_container.
+  ::std::vector<internal::BoundSecondMatcher<Tuple2Matcher, Second> > matchers;
+  for (typename RhsStlContainer::const_iterator it = rhs_stl_container.begin();
+       it != rhs_stl_container.end(); ++it) {
+    matchers.push_back(
+        internal::MatcherBindSecond(tuple2_matcher, *it));
+  }
+
+  // Delegate the work to UnorderedElementsAreArray().
+  return UnorderedElementsAreArray(matchers);
+}
+
+
+// Supports the UnorderedPointwise(m, {a, b, c}) syntax.
+template <typename Tuple2Matcher, typename T>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<Tuple2Matcher, T> >
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   std::initializer_list<T> rhs) {
+  return UnorderedPointwise(tuple2_matcher, std::vector<T>(rhs));
+}
+
+
+// Matches an STL-style container or a native array that contains at
+// least one element matching the given value or matcher.
+//
+// Examples:
+//   ::std::set<int> page_ids;
+//   page_ids.insert(3);
+//   page_ids.insert(1);
+//   EXPECT_THAT(page_ids, Contains(1));
+//   EXPECT_THAT(page_ids, Contains(Gt(2)));
+//   EXPECT_THAT(page_ids, Not(Contains(4)));
+//
+//   ::std::map<int, size_t> page_lengths;
+//   page_lengths[1] = 100;
+//   EXPECT_THAT(page_lengths,
+//               Contains(::std::pair<const int, size_t>(1, 100)));
+//
+//   const char* user_ids[] = { "joe", "mike", "tom" };
+//   EXPECT_THAT(user_ids, Contains(Eq(::std::string("tom"))));
+template <typename M>
+inline internal::ContainsMatcher<M> Contains(M matcher) {
+  return internal::ContainsMatcher<M>(matcher);
+}
+
+// IsSupersetOf(iterator_first, iterator_last)
+// IsSupersetOf(pointer, count)
+// IsSupersetOf(array)
+// IsSupersetOf(container)
+// IsSupersetOf({e1, e2, ..., en})
+//
+// IsSupersetOf() verifies that a surjective partial mapping onto a collection
+// of matchers exists. In other words, a container matches
+// IsSupersetOf({e1, ..., en}) if and only if there is a permutation
+// {y1, ..., yn} of some of the container's elements where y1 matches e1,
+// ..., and yn matches en. Obviously, the size of the container must be >= n
+// in order to have a match. Examples:
+//
+// - {1, 2, 3} matches IsSupersetOf({Ge(3), Ne(0)}), as 3 matches Ge(3) and
+//   1 matches Ne(0).
+// - {1, 2} doesn't match IsSupersetOf({Eq(1), Lt(2)}), even though 1 matches
+//   both Eq(1) and Lt(2). The reason is that different matchers must be used
+//   for elements in different slots of the container.
+// - {1, 1, 2} matches IsSupersetOf({Eq(1), Lt(2)}), as (the first) 1 matches
+//   Eq(1) and (the second) 1 matches Lt(2).
+// - {1, 2, 3} matches IsSupersetOf(Gt(1), Gt(1)), as 2 matches (the first)
+//   Gt(1) and 3 matches (the second) Gt(1).
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+IsSupersetOf(Iter first, Iter last) {
+  typedef typename ::std::iterator_traits<Iter>::value_type T;
+  return internal::UnorderedElementsAreArrayMatcher<T>(
+      internal::UnorderedMatcherRequire::Superset, first, last);
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+    const T* pointer, size_t count) {
+  return IsSupersetOf(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+    const T (&array)[N]) {
+  return IsSupersetOf(array, N);
+}
+
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename Container::value_type>
+IsSupersetOf(const Container& container) {
+  return IsSupersetOf(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+    ::std::initializer_list<T> xs) {
+  return IsSupersetOf(xs.begin(), xs.end());
+}
+
+// IsSubsetOf(iterator_first, iterator_last)
+// IsSubsetOf(pointer, count)
+// IsSubsetOf(array)
+// IsSubsetOf(container)
+// IsSubsetOf({e1, e2, ..., en})
+//
+// IsSubsetOf() verifies that an injective mapping onto a collection of matchers
+// exists.  In other words, a container matches IsSubsetOf({e1, ..., en}) if and
+// only if there is a subset of matchers {m1, ..., mk} which would match the
+// container using UnorderedElementsAre.  Obviously, the size of the container
+// must be <= n in order to have a match. Examples:
+//
+// - {1} matches IsSubsetOf({Gt(0), Lt(0)}), as 1 matches Gt(0).
+// - {1, -1} matches IsSubsetOf({Lt(0), Gt(0)}), as 1 matches Gt(0) and -1
+//   matches Lt(0).
+// - {1, 2} doesn't matches IsSubsetOf({Gt(0), Lt(0)}), even though 1 and 2 both
+//   match Gt(0). The reason is that different matchers must be used for
+//   elements in different slots of the container.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+IsSubsetOf(Iter first, Iter last) {
+  typedef typename ::std::iterator_traits<Iter>::value_type T;
+  return internal::UnorderedElementsAreArrayMatcher<T>(
+      internal::UnorderedMatcherRequire::Subset, first, last);
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+    const T* pointer, size_t count) {
+  return IsSubsetOf(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+    const T (&array)[N]) {
+  return IsSubsetOf(array, N);
+}
+
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename Container::value_type>
+IsSubsetOf(const Container& container) {
+  return IsSubsetOf(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+    ::std::initializer_list<T> xs) {
+  return IsSubsetOf(xs.begin(), xs.end());
+}
+
+// Matches an STL-style container or a native array that contains only
+// elements matching the given value or matcher.
+//
+// Each(m) is semantically equivalent to Not(Contains(Not(m))). Only
+// the messages are different.
+//
+// Examples:
+//   ::std::set<int> page_ids;
+//   // Each(m) matches an empty container, regardless of what m is.
+//   EXPECT_THAT(page_ids, Each(Eq(1)));
+//   EXPECT_THAT(page_ids, Each(Eq(77)));
+//
+//   page_ids.insert(3);
+//   EXPECT_THAT(page_ids, Each(Gt(0)));
+//   EXPECT_THAT(page_ids, Not(Each(Gt(4))));
+//   page_ids.insert(1);
+//   EXPECT_THAT(page_ids, Not(Each(Lt(2))));
+//
+//   ::std::map<int, size_t> page_lengths;
+//   page_lengths[1] = 100;
+//   page_lengths[2] = 200;
+//   page_lengths[3] = 300;
+//   EXPECT_THAT(page_lengths, Not(Each(Pair(1, 100))));
+//   EXPECT_THAT(page_lengths, Each(Key(Le(3))));
+//
+//   const char* user_ids[] = { "joe", "mike", "tom" };
+//   EXPECT_THAT(user_ids, Not(Each(Eq(::std::string("tom")))));
+template <typename M>
+inline internal::EachMatcher<M> Each(M matcher) {
+  return internal::EachMatcher<M>(matcher);
+}
+
+// Key(inner_matcher) matches an std::pair whose 'first' field matches
+// inner_matcher.  For example, Contains(Key(Ge(5))) can be used to match an
+// std::map that contains at least one element whose key is >= 5.
+template <typename M>
+inline internal::KeyMatcher<M> Key(M inner_matcher) {
+  return internal::KeyMatcher<M>(inner_matcher);
+}
+
+// Pair(first_matcher, second_matcher) matches a std::pair whose 'first' field
+// matches first_matcher and whose 'second' field matches second_matcher.  For
+// example, EXPECT_THAT(map_type, ElementsAre(Pair(Ge(5), "foo"))) can be used
+// to match a std::map<int, string> that contains exactly one element whose key
+// is >= 5 and whose value equals "foo".
+template <typename FirstMatcher, typename SecondMatcher>
+inline internal::PairMatcher<FirstMatcher, SecondMatcher>
+Pair(FirstMatcher first_matcher, SecondMatcher second_matcher) {
+  return internal::PairMatcher<FirstMatcher, SecondMatcher>(
+      first_matcher, second_matcher);
+}
+
+namespace no_adl {
+// FieldsAre(matchers...) matches piecewise the fields of compatible structs.
+// These include those that support `get<I>(obj)`, and when structured bindings
+// are enabled any class that supports them.
+// In particular, `std::tuple`, `std::pair`, `std::array` and aggregate types.
+template <typename... M>
+internal::FieldsAreMatcher<typename std::decay<M>::type...> FieldsAre(
+    M&&... matchers) {
+  return internal::FieldsAreMatcher<typename std::decay<M>::type...>(
+      std::forward<M>(matchers)...);
+}
+
+// Creates a matcher that matches a pointer (raw or smart) that matches
+// inner_matcher.
+template <typename InnerMatcher>
+inline internal::PointerMatcher<InnerMatcher> Pointer(
+    const InnerMatcher& inner_matcher) {
+  return internal::PointerMatcher<InnerMatcher>(inner_matcher);
+}
+
+// Creates a matcher that matches an object that has an address that matches
+// inner_matcher.
+template <typename InnerMatcher>
+inline internal::AddressMatcher<InnerMatcher> Address(
+    const InnerMatcher& inner_matcher) {
+  return internal::AddressMatcher<InnerMatcher>(inner_matcher);
+}
+}  // namespace no_adl
+
+// Returns a predicate that is satisfied by anything that matches the
+// given matcher.
+template <typename M>
+inline internal::MatcherAsPredicate<M> Matches(M matcher) {
+  return internal::MatcherAsPredicate<M>(matcher);
+}
+
+// Returns true if and only if the value matches the matcher.
+template <typename T, typename M>
+inline bool Value(const T& value, M matcher) {
+  return testing::Matches(matcher)(value);
+}
+
+// Matches the value against the given matcher and explains the match
+// result to listener.
+template <typename T, typename M>
+inline bool ExplainMatchResult(
+    M matcher, const T& value, MatchResultListener* listener) {
+  return SafeMatcherCast<const T&>(matcher).MatchAndExplain(value, listener);
+}
+
+// Returns a string representation of the given matcher.  Useful for description
+// strings of matchers defined using MATCHER_P* macros that accept matchers as
+// their arguments.  For example:
+//
+// MATCHER_P(XAndYThat, matcher,
+//           "X that " + DescribeMatcher<int>(matcher, negation) +
+//               " and Y that " + DescribeMatcher<double>(matcher, negation)) {
+//   return ExplainMatchResult(matcher, arg.x(), result_listener) &&
+//          ExplainMatchResult(matcher, arg.y(), result_listener);
+// }
+template <typename T, typename M>
+std::string DescribeMatcher(const M& matcher, bool negation = false) {
+  ::std::stringstream ss;
+  Matcher<T> monomorphic_matcher = SafeMatcherCast<T>(matcher);
+  if (negation) {
+    monomorphic_matcher.DescribeNegationTo(&ss);
+  } else {
+    monomorphic_matcher.DescribeTo(&ss);
+  }
+  return ss.str();
+}
+
+template <typename... Args>
+internal::ElementsAreMatcher<
+    std::tuple<typename std::decay<const Args&>::type...>>
+ElementsAre(const Args&... matchers) {
+  return internal::ElementsAreMatcher<
+      std::tuple<typename std::decay<const Args&>::type...>>(
+      std::make_tuple(matchers...));
+}
+
+template <typename... Args>
+internal::UnorderedElementsAreMatcher<
+    std::tuple<typename std::decay<const Args&>::type...>>
+UnorderedElementsAre(const Args&... matchers) {
+  return internal::UnorderedElementsAreMatcher<
+      std::tuple<typename std::decay<const Args&>::type...>>(
+      std::make_tuple(matchers...));
+}
+
+// Define variadic matcher versions.
+template <typename... Args>
+internal::AllOfMatcher<typename std::decay<const Args&>::type...> AllOf(
+    const Args&... matchers) {
+  return internal::AllOfMatcher<typename std::decay<const Args&>::type...>(
+      matchers...);
+}
+
+template <typename... Args>
+internal::AnyOfMatcher<typename std::decay<const Args&>::type...> AnyOf(
+    const Args&... matchers) {
+  return internal::AnyOfMatcher<typename std::decay<const Args&>::type...>(
+      matchers...);
+}
+
+// AnyOfArray(array)
+// AnyOfArray(pointer, count)
+// AnyOfArray(container)
+// AnyOfArray({ e1, e2, ..., en })
+// AnyOfArray(iterator_first, iterator_last)
+//
+// AnyOfArray() verifies whether a given value matches any member of a
+// collection of matchers.
+//
+// AllOfArray(array)
+// AllOfArray(pointer, count)
+// AllOfArray(container)
+// AllOfArray({ e1, e2, ..., en })
+// AllOfArray(iterator_first, iterator_last)
+//
+// AllOfArray() verifies whether a given value matches all members of a
+// collection of matchers.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::AnyOfArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+AnyOfArray(Iter first, Iter last) {
+  return internal::AnyOfArrayMatcher<
+      typename ::std::iterator_traits<Iter>::value_type>(first, last);
+}
+
+template <typename Iter>
+inline internal::AllOfArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+AllOfArray(Iter first, Iter last) {
+  return internal::AllOfArrayMatcher<
+      typename ::std::iterator_traits<Iter>::value_type>(first, last);
+}
+
+template <typename T>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T* ptr, size_t count) {
+  return AnyOfArray(ptr, ptr + count);
+}
+
+template <typename T>
+inline internal::AllOfArrayMatcher<T> AllOfArray(const T* ptr, size_t count) {
+  return AllOfArray(ptr, ptr + count);
+}
+
+template <typename T, size_t N>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T (&array)[N]) {
+  return AnyOfArray(array, N);
+}
+
+template <typename T, size_t N>
+inline internal::AllOfArrayMatcher<T> AllOfArray(const T (&array)[N]) {
+  return AllOfArray(array, N);
+}
+
+template <typename Container>
+inline internal::AnyOfArrayMatcher<typename Container::value_type> AnyOfArray(
+    const Container& container) {
+  return AnyOfArray(container.begin(), container.end());
+}
+
+template <typename Container>
+inline internal::AllOfArrayMatcher<typename Container::value_type> AllOfArray(
+    const Container& container) {
+  return AllOfArray(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(
+    ::std::initializer_list<T> xs) {
+  return AnyOfArray(xs.begin(), xs.end());
+}
+
+template <typename T>
+inline internal::AllOfArrayMatcher<T> AllOfArray(
+    ::std::initializer_list<T> xs) {
+  return AllOfArray(xs.begin(), xs.end());
+}
+
+// Args<N1, N2, ..., Nk>(a_matcher) matches a tuple if the selected
+// fields of it matches a_matcher.  C++ doesn't support default
+// arguments for function templates, so we have to overload it.
+template <size_t... k, typename InnerMatcher>
+internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...> Args(
+    InnerMatcher&& matcher) {
+  return internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...>(
+      std::forward<InnerMatcher>(matcher));
+}
+
+// AllArgs(m) is a synonym of m.  This is useful in
+//
+//   EXPECT_CALL(foo, Bar(_, _)).With(AllArgs(Eq()));
+//
+// which is easier to read than
+//
+//   EXPECT_CALL(foo, Bar(_, _)).With(Eq());
+template <typename InnerMatcher>
+inline InnerMatcher AllArgs(const InnerMatcher& matcher) { return matcher; }
+
+// Returns a matcher that matches the value of an optional<> type variable.
+// The matcher implementation only uses '!arg' and requires that the optional<>
+// type has a 'value_type' member type and that '*arg' is of type 'value_type'
+// and is printable using 'PrintToString'. It is compatible with
+// std::optional/std::experimental::optional.
+// Note that to compare an optional type variable against nullopt you should
+// use Eq(nullopt) and not Eq(Optional(nullopt)). The latter implies that the
+// optional value contains an optional itself.
+template <typename ValueMatcher>
+inline internal::OptionalMatcher<ValueMatcher> Optional(
+    const ValueMatcher& value_matcher) {
+  return internal::OptionalMatcher<ValueMatcher>(value_matcher);
+}
+
+// Returns a matcher that matches the value of a absl::any type variable.
+template <typename T>
+PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T> > AnyWith(
+    const Matcher<const T&>& matcher) {
+  return MakePolymorphicMatcher(
+      internal::any_cast_matcher::AnyCastMatcher<T>(matcher));
+}
+
+// Returns a matcher that matches the value of a variant<> type variable.
+// The matcher implementation uses ADL to find the holds_alternative and get
+// functions.
+// It is compatible with std::variant.
+template <typename T>
+PolymorphicMatcher<internal::variant_matcher::VariantMatcher<T> > VariantWith(
+    const Matcher<const T&>& matcher) {
+  return MakePolymorphicMatcher(
+      internal::variant_matcher::VariantMatcher<T>(matcher));
+}
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Anything inside the `internal` namespace is internal to the implementation
+// and must not be used in user code!
+namespace internal {
+
+class WithWhatMatcherImpl {
+ public:
+  WithWhatMatcherImpl(Matcher<std::string> matcher)
+      : matcher_(std::move(matcher)) {}
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "contains .what() that ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "contains .what() that does not ";
+    matcher_.DescribeTo(os);
+  }
+
+  template <typename Err>
+  bool MatchAndExplain(const Err& err, MatchResultListener* listener) const {
+    *listener << "which contains .what() that ";
+    return matcher_.MatchAndExplain(err.what(), listener);
+  }
+
+ private:
+  const Matcher<std::string> matcher_;
+};
+
+inline PolymorphicMatcher<WithWhatMatcherImpl> WithWhat(
+    Matcher<std::string> m) {
+  return MakePolymorphicMatcher(WithWhatMatcherImpl(std::move(m)));
+}
+
+template <typename Err>
+class ExceptionMatcherImpl {
+  class NeverThrown {
+   public:
+    const char* what() const noexcept {
+      return "this exception should never be thrown";
+    }
+  };
+
+  // If the matchee raises an exception of a wrong type, we'd like to
+  // catch it and print its message and type. To do that, we add an additional
+  // catch clause:
+  //
+  //     try { ... }
+  //     catch (const Err&) { /* an expected exception */ }
+  //     catch (const std::exception&) { /* exception of a wrong type */ }
+  //
+  // However, if the `Err` itself is `std::exception`, we'd end up with two
+  // identical `catch` clauses:
+  //
+  //     try { ... }
+  //     catch (const std::exception&) { /* an expected exception */ }
+  //     catch (const std::exception&) { /* exception of a wrong type */ }
+  //
+  // This can cause a warning or an error in some compilers. To resolve
+  // the issue, we use a fake error type whenever `Err` is `std::exception`:
+  //
+  //     try { ... }
+  //     catch (const std::exception&) { /* an expected exception */ }
+  //     catch (const NeverThrown&) { /* exception of a wrong type */ }
+  using DefaultExceptionType = typename std::conditional<
+      std::is_same<typename std::remove_cv<
+                       typename std::remove_reference<Err>::type>::type,
+                   std::exception>::value,
+      const NeverThrown&, const std::exception&>::type;
+
+ public:
+  ExceptionMatcherImpl(Matcher<const Err&> matcher)
+      : matcher_(std::move(matcher)) {}
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "throws an exception which is a " << GetTypeName<Err>();
+    *os << " which ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "throws an exception which is not a " << GetTypeName<Err>();
+    *os << " which ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+  template <typename T>
+  bool MatchAndExplain(T&& x, MatchResultListener* listener) const {
+    try {
+      (void)(std::forward<T>(x)());
+    } catch (const Err& err) {
+      *listener << "throws an exception which is a " << GetTypeName<Err>();
+      *listener << " ";
+      return matcher_.MatchAndExplain(err, listener);
+    } catch (DefaultExceptionType err) {
+#if GTEST_HAS_RTTI
+      *listener << "throws an exception of type " << GetTypeName(typeid(err));
+      *listener << " ";
+#else
+      *listener << "throws an std::exception-derived type ";
+#endif
+      *listener << "with description \"" << err.what() << "\"";
+      return false;
+    } catch (...) {
+      *listener << "throws an exception of an unknown type";
+      return false;
+    }
+
+    *listener << "does not throw any exception";
+    return false;
+  }
+
+ private:
+  const Matcher<const Err&> matcher_;
+};
+
+}  // namespace internal
+
+// Throws()
+// Throws(exceptionMatcher)
+// ThrowsMessage(messageMatcher)
+//
+// This matcher accepts a callable and verifies that when invoked, it throws
+// an exception with the given type and properties.
+//
+// Examples:
+//
+//   EXPECT_THAT(
+//       []() { throw std::runtime_error("message"); },
+//       Throws<std::runtime_error>());
+//
+//   EXPECT_THAT(
+//       []() { throw std::runtime_error("message"); },
+//       ThrowsMessage<std::runtime_error>(HasSubstr("message")));
+//
+//   EXPECT_THAT(
+//       []() { throw std::runtime_error("message"); },
+//       Throws<std::runtime_error>(
+//           Property(&std::runtime_error::what, HasSubstr("message"))));
+
+template <typename Err>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws() {
+  return MakePolymorphicMatcher(
+      internal::ExceptionMatcherImpl<Err>(A<const Err&>()));
+}
+
+template <typename Err, typename ExceptionMatcher>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws(
+    const ExceptionMatcher& exception_matcher) {
+  // Using matcher cast allows users to pass a matcher of a more broad type.
+  // For example user may want to pass Matcher<std::exception>
+  // to Throws<std::runtime_error>, or Matcher<int64> to Throws<int32>.
+  return MakePolymorphicMatcher(internal::ExceptionMatcherImpl<Err>(
+      SafeMatcherCast<const Err&>(exception_matcher)));
+}
+
+template <typename Err, typename MessageMatcher>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
+    MessageMatcher&& message_matcher) {
+  static_assert(std::is_base_of<std::exception, Err>::value,
+                "expected an std::exception-derived type");
+  return Throws<Err>(internal::WithWhat(
+      MatcherCast<std::string>(std::forward<MessageMatcher>(message_matcher))));
+}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// These macros allow using matchers to check values in Google Test
+// tests.  ASSERT_THAT(value, matcher) and EXPECT_THAT(value, matcher)
+// succeed if and only if the value matches the matcher.  If the assertion
+// fails, the value and the description of the matcher will be printed.
+#define ASSERT_THAT(value, matcher) ASSERT_PRED_FORMAT1(\
+    ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+#define EXPECT_THAT(value, matcher) EXPECT_PRED_FORMAT1(\
+    ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+
+// MATCHER* macroses itself are listed below.
+#define MATCHER(name, description)                                             \
+  class name##Matcher                                                          \
+      : public ::testing::internal::MatcherBaseImpl<name##Matcher> {           \
+   public:                                                                     \
+    template <typename arg_type>                                               \
+    class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> {   \
+     public:                                                                   \
+      gmock_Impl() {}                                                          \
+      bool MatchAndExplain(                                                    \
+          const arg_type& arg,                                                 \
+          ::testing::MatchResultListener* result_listener) const override;     \
+      void DescribeTo(::std::ostream* gmock_os) const override {               \
+        *gmock_os << FormatDescription(false);                                 \
+      }                                                                        \
+      void DescribeNegationTo(::std::ostream* gmock_os) const override {       \
+        *gmock_os << FormatDescription(true);                                  \
+      }                                                                        \
+                                                                               \
+     private:                                                                  \
+      ::std::string FormatDescription(bool negation) const {                   \
+        ::std::string gmock_description = (description);                       \
+        if (!gmock_description.empty()) {                                      \
+          return gmock_description;                                            \
+        }                                                                      \
+        return ::testing::internal::FormatMatcherDescription(negation, #name,  \
+                                                             {});              \
+      }                                                                        \
+    };                                                                         \
+  };                                                                           \
+  GTEST_ATTRIBUTE_UNUSED_ inline name##Matcher name() { return {}; }           \
+  template <typename arg_type>                                                 \
+  bool name##Matcher::gmock_Impl<arg_type>::MatchAndExplain(                   \
+      const arg_type& arg,                                                     \
+      ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_) \
+      const
+
+#define MATCHER_P(name, p0, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP, description, (p0))
+#define MATCHER_P2(name, p0, p1, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP2, description, (p0, p1))
+#define MATCHER_P3(name, p0, p1, p2, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP3, description, (p0, p1, p2))
+#define MATCHER_P4(name, p0, p1, p2, p3, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP4, description, (p0, p1, p2, p3))
+#define MATCHER_P5(name, p0, p1, p2, p3, p4, description)    \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP5, description, \
+                         (p0, p1, p2, p3, p4))
+#define MATCHER_P6(name, p0, p1, p2, p3, p4, p5, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP6, description,  \
+                         (p0, p1, p2, p3, p4, p5))
+#define MATCHER_P7(name, p0, p1, p2, p3, p4, p5, p6, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP7, description,      \
+                         (p0, p1, p2, p3, p4, p5, p6))
+#define MATCHER_P8(name, p0, p1, p2, p3, p4, p5, p6, p7, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP8, description,          \
+                         (p0, p1, p2, p3, p4, p5, p6, p7))
+#define MATCHER_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP9, description,              \
+                         (p0, p1, p2, p3, p4, p5, p6, p7, p8))
+#define MATCHER_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, description) \
+  GMOCK_INTERNAL_MATCHER(name, name##MatcherP10, description,                  \
+                         (p0, p1, p2, p3, p4, p5, p6, p7, p8, p9))
+
+#define GMOCK_INTERNAL_MATCHER(name, full_name, description, args)             \
+  template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)>                      \
+  class full_name : public ::testing::internal::MatcherBaseImpl<               \
+                        full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>> { \
+   public:                                                                     \
+    using full_name::MatcherBaseImpl::MatcherBaseImpl;                         \
+    template <typename arg_type>                                               \
+    class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> {   \
+     public:                                                                   \
+      explicit gmock_Impl(GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args))          \
+          : GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) {}                       \
+      bool MatchAndExplain(                                                    \
+          const arg_type& arg,                                                 \
+          ::testing::MatchResultListener* result_listener) const override;     \
+      void DescribeTo(::std::ostream* gmock_os) const override {               \
+        *gmock_os << FormatDescription(false);                                 \
+      }                                                                        \
+      void DescribeNegationTo(::std::ostream* gmock_os) const override {       \
+        *gmock_os << FormatDescription(true);                                  \
+      }                                                                        \
+      GMOCK_INTERNAL_MATCHER_MEMBERS(args)                                     \
+                                                                               \
+     private:                                                                  \
+      ::std::string FormatDescription(bool negation) const {                   \
+        ::std::string gmock_description = (description);                       \
+        if (!gmock_description.empty()) {                                      \
+          return gmock_description;                                            \
+        }                                                                      \
+        return ::testing::internal::FormatMatcherDescription(                  \
+            negation, #name,                                                   \
+            ::testing::internal::UniversalTersePrintTupleFieldsToStrings(      \
+                ::std::tuple<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>(        \
+                    GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args))));             \
+      }                                                                        \
+    };                                                                         \
+  };                                                                           \
+  template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)>                      \
+  inline full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)> name(             \
+      GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args)) {                            \
+    return full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>(                \
+        GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args));                              \
+  }                                                                            \
+  template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)>                      \
+  template <typename arg_type>                                                 \
+  bool full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>::gmock_Impl<        \
+      arg_type>::MatchAndExplain(const arg_type& arg,                          \
+                                 ::testing::MatchResultListener*               \
+                                     result_listener GTEST_ATTRIBUTE_UNUSED_)  \
+      const
+
+#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args) \
+  GMOCK_PP_TAIL(                                     \
+      GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM, , args))
+#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM(i_unused, data_unused, arg) \
+  , typename arg##_type
+
+#define GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TYPE_PARAM, , args))
+#define GMOCK_INTERNAL_MATCHER_TYPE_PARAM(i_unused, data_unused, arg) \
+  , arg##_type
+
+#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args) \
+  GMOCK_PP_TAIL(dummy_first GMOCK_PP_FOR_EACH(     \
+      GMOCK_INTERNAL_MATCHER_FUNCTION_ARG, , args))
+#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARG(i, data_unused, arg) \
+  , arg##_type gmock_p##i
+
+#define GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_FORWARD_ARG, , args))
+#define GMOCK_INTERNAL_MATCHER_FORWARD_ARG(i, data_unused, arg) \
+  , arg(::std::forward<arg##_type>(gmock_p##i))
+
+#define GMOCK_INTERNAL_MATCHER_MEMBERS(args) \
+  GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER, , args)
+#define GMOCK_INTERNAL_MATCHER_MEMBER(i_unused, data_unused, arg) \
+  const arg##_type arg;
+
+#define GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER_USAGE, , args))
+#define GMOCK_INTERNAL_MATCHER_MEMBER_USAGE(i_unused, data_unused, arg) , arg
+
+#define GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args) \
+  GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_ARG_USAGE, , args))
+#define GMOCK_INTERNAL_MATCHER_ARG_USAGE(i, data_unused, arg_unused) \
+  , gmock_p##i
+
+// To prevent ADL on certain functions we put them on a separate namespace.
+using namespace no_adl;  // NOLINT
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
+
+// Include any custom callback matchers added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gmock/internal/custom/gmock-matchers.h"
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
new file mode 100644
index 0000000000..fd293358a2
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
@@ -0,0 +1,573 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some commonly used variadic actions.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
+
+#include <memory>
+#include <utility>
+
+#include "gmock/gmock-actions.h"
+#include "gmock/internal/gmock-port.h"
+
+// Include any custom callback actions added by the local installation.
+#include "gmock/internal/custom/gmock-generated-actions.h"
+
+// Sometimes you want to give an action explicit template parameters
+// that cannot be inferred from its value parameters.  ACTION() and
+// ACTION_P*() don't support that.  ACTION_TEMPLATE() remedies that
+// and can be viewed as an extension to ACTION() and ACTION_P*().
+//
+// The syntax:
+//
+//   ACTION_TEMPLATE(ActionName,
+//                   HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m),
+//                   AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; }
+//
+// defines an action template that takes m explicit template
+// parameters and n value parameters.  name_i is the name of the i-th
+// template parameter, and kind_i specifies whether it's a typename,
+// an integral constant, or a template.  p_i is the name of the i-th
+// value parameter.
+//
+// Example:
+//
+//   // DuplicateArg<k, T>(output) converts the k-th argument of the mock
+//   // function to type T and copies it to *output.
+//   ACTION_TEMPLATE(DuplicateArg,
+//                   HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
+//                   AND_1_VALUE_PARAMS(output)) {
+//     *output = T(::std::get<k>(args));
+//   }
+//   ...
+//     int n;
+//     EXPECT_CALL(mock, Foo(_, _))
+//         .WillOnce(DuplicateArg<1, unsigned char>(&n));
+//
+// To create an instance of an action template, write:
+//
+//   ActionName<t1, ..., t_m>(v1, ..., v_n)
+//
+// where the ts are the template arguments and the vs are the value
+// arguments.  The value argument types are inferred by the compiler.
+// If you want to explicitly specify the value argument types, you can
+// provide additional template arguments:
+//
+//   ActionName<t1, ..., t_m, u1, ..., u_k>(v1, ..., v_n)
+//
+// where u_i is the desired type of v_i.
+//
+// ACTION_TEMPLATE and ACTION/ACTION_P* can be overloaded on the
+// number of value parameters, but not on the number of template
+// parameters.  Without the restriction, the meaning of the following
+// is unclear:
+//
+//   OverloadedAction<int, bool>(x);
+//
+// Are we using a single-template-parameter action where 'bool' refers
+// to the type of x, or are we using a two-template-parameter action
+// where the compiler is asked to infer the type of x?
+//
+// Implementation notes:
+//
+// GMOCK_INTERNAL_*_HAS_m_TEMPLATE_PARAMS and
+// GMOCK_INTERNAL_*_AND_n_VALUE_PARAMS are internal macros for
+// implementing ACTION_TEMPLATE.  The main trick we use is to create
+// new macro invocations when expanding a macro.  For example, we have
+//
+//   #define ACTION_TEMPLATE(name, template_params, value_params)
+//       ... GMOCK_INTERNAL_DECL_##template_params ...
+//
+// which causes ACTION_TEMPLATE(..., HAS_1_TEMPLATE_PARAMS(typename, T), ...)
+// to expand to
+//
+//       ... GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(typename, T) ...
+//
+// Since GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS is a macro, the
+// preprocessor will continue to expand it to
+//
+//       ... typename T ...
+//
+// This technique conforms to the C++ standard and is portable.  It
+// allows us to implement action templates using O(N) code, where N is
+// the maximum number of template/value parameters supported.  Without
+// using it, we'd have to devote O(N^2) amount of code to implement all
+// combinations of m and n.
+
+// Declares the template parameters.
+#define GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(kind0, name0) kind0 name0
+#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \
+    name1) kind0 name0, kind1 name1
+#define GMOCK_INTERNAL_DECL_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2) kind0 name0, kind1 name1, kind2 name2
+#define GMOCK_INTERNAL_DECL_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3) kind0 name0, kind1 name1, kind2 name2, \
+    kind3 name3
+#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4) kind0 name0, kind1 name1, \
+    kind2 name2, kind3 name3, kind4 name4
+#define GMOCK_INTERNAL_DECL_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5) kind0 name0, \
+    kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5
+#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
+    name6) kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+    kind5 name5, kind6 name6
+#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
+    kind7, name7) kind0 name0, kind1 name1, kind2 name2, kind3 name3, \
+    kind4 name4, kind5 name5, kind6 name6, kind7 name7
+#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
+    kind7, name7, kind8, name8) kind0 name0, kind1 name1, kind2 name2, \
+    kind3 name3, kind4 name4, kind5 name5, kind6 name6, kind7 name7, \
+    kind8 name8
+#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \
+    name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
+    name6, kind7, name7, kind8, name8, kind9, name9) kind0 name0, \
+    kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5, \
+    kind6 name6, kind7 name7, kind8 name8, kind9 name9
+
+// Lists the template parameters.
+#define GMOCK_INTERNAL_LIST_HAS_1_TEMPLATE_PARAMS(kind0, name0) name0
+#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \
+    name1) name0, name1
+#define GMOCK_INTERNAL_LIST_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2) name0, name1, name2
+#define GMOCK_INTERNAL_LIST_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3) name0, name1, name2, name3
+#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4) name0, name1, name2, name3, \
+    name4
+#define GMOCK_INTERNAL_LIST_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5) name0, name1, \
+    name2, name3, name4, name5
+#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
+    name6) name0, name1, name2, name3, name4, name5, name6
+#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
+    kind7, name7) name0, name1, name2, name3, name4, name5, name6, name7
+#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+    kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \
+    kind7, name7, kind8, name8) name0, name1, name2, name3, name4, name5, \
+    name6, name7, name8
+#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \
+    name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \
+    name6, kind7, name7, kind8, name8, kind9, name9) name0, name1, name2, \
+    name3, name4, name5, name6, name7, name8, name9
+
+// Declares the types of value parameters.
+#define GMOCK_INTERNAL_DECL_TYPE_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(p0) , typename p0##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) , \
+    typename p0##_type, typename p1##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , \
+    typename p0##_type, typename p1##_type, typename p2##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \
+    typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \
+    typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type, typename p4##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \
+    typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type, typename p4##_type, typename p5##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6) , typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type, typename p4##_type, typename p5##_type, \
+    typename p6##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7) , typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type, typename p4##_type, typename p5##_type, \
+    typename p6##_type, typename p7##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7, p8) , typename p0##_type, typename p1##_type, typename p2##_type, \
+    typename p3##_type, typename p4##_type, typename p5##_type, \
+    typename p6##_type, typename p7##_type, typename p8##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7, p8, p9) , typename p0##_type, typename p1##_type, \
+    typename p2##_type, typename p3##_type, typename p4##_type, \
+    typename p5##_type, typename p6##_type, typename p7##_type, \
+    typename p8##_type, typename p9##_type
+
+// Initializes the value parameters.
+#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS()\
+    ()
+#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0)\
+    (p0##_type gmock_p0) : p0(::std::move(gmock_p0))
+#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1)\
+    (p0##_type gmock_p0, p1##_type gmock_p1) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1))
+#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, \
+        p2##_type gmock_p2) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2))
+#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3))
+#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4))
+#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4, \
+        p5##_type gmock_p5) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+        p5(::std::move(gmock_p5))
+#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+        p6##_type gmock_p6) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+        p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6))
+#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+        p6##_type gmock_p6, p7##_type gmock_p7) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+        p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
+        p7(::std::move(gmock_p7))
+#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+        p6##_type gmock_p6, p7##_type gmock_p7, \
+        p8##_type gmock_p8) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+        p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
+        p7(::std::move(gmock_p7)), p8(::std::move(gmock_p8))
+#define GMOCK_INTERNAL_INIT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8, p9)\
+    (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+        p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+        p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \
+        p9##_type gmock_p9) : p0(::std::move(gmock_p0)), \
+        p1(::std::move(gmock_p1)), p2(::std::move(gmock_p2)), \
+        p3(::std::move(gmock_p3)), p4(::std::move(gmock_p4)), \
+        p5(::std::move(gmock_p5)), p6(::std::move(gmock_p6)), \
+        p7(::std::move(gmock_p7)), p8(::std::move(gmock_p8)), \
+        p9(::std::move(gmock_p9))
+
+// Defines the copy constructor
+#define GMOCK_INTERNAL_DEFN_COPY_AND_0_VALUE_PARAMS() \
+    {}  // Avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82134
+#define GMOCK_INTERNAL_DEFN_COPY_AND_1_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_2_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_3_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_4_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_5_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_6_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_7_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_8_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_9_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_10_VALUE_PARAMS(...) = default;
+
+// Declares the fields for storing the value parameters.
+#define GMOCK_INTERNAL_DEFN_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DEFN_AND_1_VALUE_PARAMS(p0) p0##_type p0;
+#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0; \
+    p1##_type p1;
+#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0; \
+    p1##_type p1; p2##_type p2;
+#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0; \
+    p1##_type p1; p2##_type p2; p3##_type p3;
+#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \
+    p4) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4;
+#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \
+    p5) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
+    p5##_type p5;
+#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
+    p5##_type p5; p6##_type p6;
+#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \
+    p5##_type p5; p6##_type p6; p7##_type p7;
+#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \
+    p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8;
+#define GMOCK_INTERNAL_DEFN_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8, p9) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \
+    p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8; \
+    p9##_type p9;
+
+// Lists the value parameters.
+#define GMOCK_INTERNAL_LIST_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_LIST_AND_1_VALUE_PARAMS(p0) p0
+#define GMOCK_INTERNAL_LIST_AND_2_VALUE_PARAMS(p0, p1) p0, p1
+#define GMOCK_INTERNAL_LIST_AND_3_VALUE_PARAMS(p0, p1, p2) p0, p1, p2
+#define GMOCK_INTERNAL_LIST_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0, p1, p2, p3
+#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) p0, p1, \
+    p2, p3, p4
+#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) p0, \
+    p1, p2, p3, p4, p5
+#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6) p0, p1, p2, p3, p4, p5, p6
+#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7) p0, p1, p2, p3, p4, p5, p6, p7
+#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8) p0, p1, p2, p3, p4, p5, p6, p7, p8
+#define GMOCK_INTERNAL_LIST_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8, p9) p0, p1, p2, p3, p4, p5, p6, p7, p8, p9
+
+// Lists the value parameter types.
+#define GMOCK_INTERNAL_LIST_TYPE_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_LIST_TYPE_AND_1_VALUE_PARAMS(p0) , p0##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) , p0##_type, \
+    p1##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , p0##_type, \
+    p1##_type, p2##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \
+    p0##_type, p1##_type, p2##_type, p3##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \
+    p0##_type, p1##_type, p2##_type, p3##_type, p4##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \
+    p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+    p6##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
+    p5##_type, p6##_type, p7##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7, p8) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
+    p5##_type, p6##_type, p7##_type, p8##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6, p7, p8, p9) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \
+    p5##_type, p6##_type, p7##_type, p8##_type, p9##_type
+
+// Declares the value parameters.
+#define GMOCK_INTERNAL_DECL_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DECL_AND_1_VALUE_PARAMS(p0) p0##_type p0
+#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0, \
+    p1##_type p1
+#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0, \
+    p1##_type p1, p2##_type p2
+#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0, \
+    p1##_type p1, p2##_type p2, p3##_type p3
+#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \
+    p4) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4
+#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \
+    p5) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+    p5##_type p5
+#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+    p6) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+    p5##_type p5, p6##_type p6
+#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+    p5##_type p5, p6##_type p6, p7##_type p7
+#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \
+    p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8
+#define GMOCK_INTERNAL_DECL_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8, p9) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \
+    p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, \
+    p9##_type p9
+
+// The suffix of the class template implementing the action template.
+#define GMOCK_INTERNAL_COUNT_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_COUNT_AND_1_VALUE_PARAMS(p0) P
+#define GMOCK_INTERNAL_COUNT_AND_2_VALUE_PARAMS(p0, p1) P2
+#define GMOCK_INTERNAL_COUNT_AND_3_VALUE_PARAMS(p0, p1, p2) P3
+#define GMOCK_INTERNAL_COUNT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) P4
+#define GMOCK_INTERNAL_COUNT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) P5
+#define GMOCK_INTERNAL_COUNT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) P6
+#define GMOCK_INTERNAL_COUNT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) P7
+#define GMOCK_INTERNAL_COUNT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7) P8
+#define GMOCK_INTERNAL_COUNT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8) P9
+#define GMOCK_INTERNAL_COUNT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+    p7, p8, p9) P10
+
+// The name of the class template implementing the action template.
+#define GMOCK_ACTION_CLASS_(name, value_params)\
+    GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params)
+
+#define ACTION_TEMPLATE(name, template_params, value_params)                   \
+  template <GMOCK_INTERNAL_DECL_##template_params                              \
+            GMOCK_INTERNAL_DECL_TYPE_##value_params>                           \
+  class GMOCK_ACTION_CLASS_(name, value_params) {                              \
+   public:                                                                     \
+    explicit GMOCK_ACTION_CLASS_(name, value_params)(                          \
+        GMOCK_INTERNAL_DECL_##value_params)                                    \
+        GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params),    \
+                    = default; ,                                               \
+                    : impl_(std::make_shared<gmock_Impl>(                      \
+                                GMOCK_INTERNAL_LIST_##value_params)) { })      \
+    GMOCK_ACTION_CLASS_(name, value_params)(                                   \
+        const GMOCK_ACTION_CLASS_(name, value_params)&) noexcept               \
+        GMOCK_INTERNAL_DEFN_COPY_##value_params                                \
+    GMOCK_ACTION_CLASS_(name, value_params)(                                   \
+        GMOCK_ACTION_CLASS_(name, value_params)&&) noexcept                    \
+        GMOCK_INTERNAL_DEFN_COPY_##value_params                                \
+    template <typename F>                                                      \
+    operator ::testing::Action<F>() const {                                    \
+      return GMOCK_PP_IF(                                                      \
+          GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params),              \
+                      (::testing::internal::MakeAction<F, gmock_Impl>()),      \
+                      (::testing::internal::MakeAction<F>(impl_)));            \
+    }                                                                          \
+   private:                                                                    \
+    class gmock_Impl {                                                         \
+     public:                                                                   \
+      explicit gmock_Impl GMOCK_INTERNAL_INIT_##value_params {}                \
+      template <typename function_type, typename return_type,                  \
+                typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>         \
+      return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const;  \
+      GMOCK_INTERNAL_DEFN_##value_params                                       \
+    };                                                                         \
+    GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params),        \
+                , std::shared_ptr<const gmock_Impl> impl_;)                    \
+  };                                                                           \
+  template <GMOCK_INTERNAL_DECL_##template_params                              \
+            GMOCK_INTERNAL_DECL_TYPE_##value_params>                           \
+  GMOCK_ACTION_CLASS_(name, value_params)<                                     \
+      GMOCK_INTERNAL_LIST_##template_params                                    \
+      GMOCK_INTERNAL_LIST_TYPE_##value_params> name(                           \
+          GMOCK_INTERNAL_DECL_##value_params) GTEST_MUST_USE_RESULT_;          \
+  template <GMOCK_INTERNAL_DECL_##template_params                              \
+            GMOCK_INTERNAL_DECL_TYPE_##value_params>                           \
+  inline GMOCK_ACTION_CLASS_(name, value_params)<                              \
+      GMOCK_INTERNAL_LIST_##template_params                                    \
+      GMOCK_INTERNAL_LIST_TYPE_##value_params> name(                           \
+          GMOCK_INTERNAL_DECL_##value_params) {                                \
+    return GMOCK_ACTION_CLASS_(name, value_params)<                            \
+        GMOCK_INTERNAL_LIST_##template_params                                  \
+        GMOCK_INTERNAL_LIST_TYPE_##value_params>(                              \
+            GMOCK_INTERNAL_LIST_##value_params);                               \
+  }                                                                            \
+  template <GMOCK_INTERNAL_DECL_##template_params                              \
+            GMOCK_INTERNAL_DECL_TYPE_##value_params>                           \
+  template <typename function_type, typename return_type, typename args_type,  \
+            GMOCK_ACTION_TEMPLATE_ARGS_NAMES_>                                 \
+  return_type GMOCK_ACTION_CLASS_(name, value_params)<                         \
+      GMOCK_INTERNAL_LIST_##template_params                                    \
+      GMOCK_INTERNAL_LIST_TYPE_##value_params>::gmock_Impl::gmock_PerformImpl( \
+          GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+
+namespace testing {
+
+// The ACTION*() macros trigger warning C4100 (unreferenced formal
+// parameter) in MSVC with -W4.  Unfortunately they cannot be fixed in
+// the macro definition, as the warnings are generated when the macro
+// is expanded and macro expansion cannot contain #pragma.  Therefore
+// we suppress them here.
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4100)
+#endif
+
+namespace internal {
+
+// internal::InvokeArgument - a helper for InvokeArgument action.
+// The basic overloads are provided here for generic functors.
+// Overloads for other custom-callables are provided in the
+// internal/custom/gmock-generated-actions.h header.
+template <typename F, typename... Args>
+auto InvokeArgument(F f, Args... args) -> decltype(f(args...)) {
+  return f(args...);
+}
+
+template <std::size_t index, typename... Params>
+struct InvokeArgumentAction {
+  template <typename... Args>
+  auto operator()(Args&&... args) const -> decltype(internal::InvokeArgument(
+      std::get<index>(std::forward_as_tuple(std::forward<Args>(args)...)),
+      std::declval<const Params&>()...)) {
+    internal::FlatTuple<Args&&...> args_tuple(FlatTupleConstructTag{},
+                                              std::forward<Args>(args)...);
+    return params.Apply([&](const Params&... unpacked_params) {
+      auto&& callable = args_tuple.template Get<index>();
+      return internal::InvokeArgument(
+          std::forward<decltype(callable)>(callable), unpacked_params...);
+    });
+  }
+
+  internal::FlatTuple<Params...> params;
+};
+
+}  // namespace internal
+
+// The InvokeArgument<N>(a1, a2, ..., a_k) action invokes the N-th
+// (0-based) argument, which must be a k-ary callable, of the mock
+// function, with arguments a1, a2, ..., a_k.
+//
+// Notes:
+//
+//   1. The arguments are passed by value by default.  If you need to
+//   pass an argument by reference, wrap it inside std::ref().  For
+//   example,
+//
+//     InvokeArgument<1>(5, string("Hello"), std::ref(foo))
+//
+//   passes 5 and string("Hello") by value, and passes foo by
+//   reference.
+//
+//   2. If the callable takes an argument by reference but std::ref() is
+//   not used, it will receive the reference to a copy of the value,
+//   instead of the original value.  For example, when the 0-th
+//   argument of the mock function takes a const string&, the action
+//
+//     InvokeArgument<0>(string("Hello"))
+//
+//   makes a copy of the temporary string("Hello") object and passes a
+//   reference of the copy, instead of the original temporary object,
+//   to the callable.  This makes it easy for a user to define an
+//   InvokeArgument action from temporary values and have it performed
+//   later.
+template <std::size_t index, typename... Params>
+internal::InvokeArgumentAction<index, typename std::decay<Params>::type...>
+InvokeArgument(Params&&... params) {
+  return {internal::FlatTuple<typename std::decay<Params>::type...>(
+      internal::FlatTupleConstructTag{}, std::forward<Params>(params)...)};
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+}  // namespace testing
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
new file mode 100644
index 0000000000..dfc77e359c
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
@@ -0,0 +1,92 @@
+// Copyright 2013, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some matchers that depend on gmock-matchers.h.
+//
+// Note that tests are implemented in gmock-matchers_test.cc rather than
+// gmock-more-matchers-test.cc.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
+
+#include "gmock/gmock-matchers.h"
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal
+// parameter) for MSVC
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4100)
+#if (_MSC_VER == 1900)
+// and silence C4800 (C4800: 'int *const ': forcing value
+// to bool 'true' or 'false') for MSVC 14
+# pragma warning(disable:4800)
+  #endif
+#endif
+
+// Defines a matcher that matches an empty container. The container must
+// support both size() and empty(), which all STL-like containers provide.
+MATCHER(IsEmpty, negation ? "isn't empty" : "is empty") {
+  if (arg.empty()) {
+    return true;
+  }
+  *result_listener << "whose size is " << arg.size();
+  return false;
+}
+
+// Define a matcher that matches a value that evaluates in boolean
+// context to true.  Useful for types that define "explicit operator
+// bool" operators and so can't be compared for equality with true
+// and false.
+MATCHER(IsTrue, negation ? "is false" : "is true") {
+  return static_cast<bool>(arg);
+}
+
+// Define a matcher that matches a value that evaluates in boolean
+// context to false.  Useful for types that define "explicit operator
+// bool" operators and so can't be compared for equality with true
+// and false.
+MATCHER(IsFalse, negation ? "is true" : "is false") {
+  return !static_cast<bool>(arg);
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+
+}  // namespace testing
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
new file mode 100644
index 0000000000..b03b770c75
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
@@ -0,0 +1,261 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Implements class templates NiceMock, NaggyMock, and StrictMock.
+//
+// Given a mock class MockFoo that is created using Google Mock,
+// NiceMock<MockFoo> is a subclass of MockFoo that allows
+// uninteresting calls (i.e. calls to mock methods that have no
+// EXPECT_CALL specs), NaggyMock<MockFoo> is a subclass of MockFoo
+// that prints a warning when an uninteresting call occurs, and
+// StrictMock<MockFoo> is a subclass of MockFoo that treats all
+// uninteresting calls as errors.
+//
+// Currently a mock is naggy by default, so MockFoo and
+// NaggyMock<MockFoo> behave like the same.  However, we will soon
+// switch the default behavior of mocks to be nice, as that in general
+// leads to more maintainable tests.  When that happens, MockFoo will
+// stop behaving like NaggyMock<MockFoo> and start behaving like
+// NiceMock<MockFoo>.
+//
+// NiceMock, NaggyMock, and StrictMock "inherit" the constructors of
+// their respective base class.  Therefore you can write
+// NiceMock<MockFoo>(5, "a") to construct a nice mock where MockFoo
+// has a constructor that accepts (int, const char*), for example.
+//
+// A known limitation is that NiceMock<MockFoo>, NaggyMock<MockFoo>,
+// and StrictMock<MockFoo> only works for mock methods defined using
+// the MOCK_METHOD* family of macros DIRECTLY in the MockFoo class.
+// If a mock method is defined in a base class of MockFoo, the "nice"
+// or "strict" modifier may not affect it, depending on the compiler.
+// In particular, nesting NiceMock, NaggyMock, and StrictMock is NOT
+// supported.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
+
+#include <type_traits>
+
+#include "gmock/gmock-spec-builders.h"
+#include "gmock/internal/gmock-port.h"
+
+namespace testing {
+template <class MockClass>
+class NiceMock;
+template <class MockClass>
+class NaggyMock;
+template <class MockClass>
+class StrictMock;
+
+namespace internal {
+template <typename T>
+std::true_type StrictnessModifierProbe(const NiceMock<T>&);
+template <typename T>
+std::true_type StrictnessModifierProbe(const NaggyMock<T>&);
+template <typename T>
+std::true_type StrictnessModifierProbe(const StrictMock<T>&);
+std::false_type StrictnessModifierProbe(...);
+
+template <typename T>
+constexpr bool HasStrictnessModifier() {
+  return decltype(StrictnessModifierProbe(std::declval<const T&>()))::value;
+}
+
+// Base classes that register and deregister with testing::Mock to alter the
+// default behavior around uninteresting calls. Inheriting from one of these
+// classes first and then MockClass ensures the MockClass constructor is run
+// after registration, and that the MockClass destructor runs before
+// deregistration. This guarantees that MockClass's constructor and destructor
+// run with the same level of strictness as its instance methods.
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW && \
+    (defined(_MSC_VER) || defined(__clang__))
+// We need to mark these classes with this declspec to ensure that
+// the empty base class optimization is performed.
+#define GTEST_INTERNAL_EMPTY_BASE_CLASS __declspec(empty_bases)
+#else
+#define GTEST_INTERNAL_EMPTY_BASE_CLASS
+#endif
+
+template <typename Base>
+class NiceMockImpl {
+ public:
+  NiceMockImpl() { ::testing::Mock::AllowUninterestingCalls(this); }
+
+  ~NiceMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+};
+
+template <typename Base>
+class NaggyMockImpl {
+ public:
+  NaggyMockImpl() { ::testing::Mock::WarnUninterestingCalls(this); }
+
+  ~NaggyMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+};
+
+template <typename Base>
+class StrictMockImpl {
+ public:
+  StrictMockImpl() { ::testing::Mock::FailUninterestingCalls(this); }
+
+  ~StrictMockImpl() { ::testing::Mock::UnregisterCallReaction(this); }
+};
+
+}  // namespace internal
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS NiceMock
+    : private internal::NiceMockImpl<MockClass>,
+      public MockClass {
+ public:
+  static_assert(!internal::HasStrictnessModifier<MockClass>(),
+                "Can't apply NiceMock to a class hierarchy that already has a "
+                "strictness modifier. See "
+                "https://google.github.io/googletest/"
+                "gmock_cook_book.html#NiceStrictNaggy");
+  NiceMock() : MockClass() {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  // Ideally, we would inherit base class's constructors through a using
+  // declaration, which would preserve their visibility. However, many existing
+  // tests rely on the fact that current implementation reexports protected
+  // constructors as public. These tests would need to be cleaned up first.
+
+  // Single argument constructor is special-cased so that it can be
+  // made explicit.
+  template <typename A>
+  explicit NiceMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  template <typename TArg1, typename TArg2, typename... An>
+  NiceMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+      : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+                  std::forward<An>(args)...) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(NiceMock);
+};
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS NaggyMock
+    : private internal::NaggyMockImpl<MockClass>,
+      public MockClass {
+  static_assert(!internal::HasStrictnessModifier<MockClass>(),
+                "Can't apply NaggyMock to a class hierarchy that already has a "
+                "strictness modifier. See "
+                "https://google.github.io/googletest/"
+                "gmock_cook_book.html#NiceStrictNaggy");
+
+ public:
+  NaggyMock() : MockClass() {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  // Ideally, we would inherit base class's constructors through a using
+  // declaration, which would preserve their visibility. However, many existing
+  // tests rely on the fact that current implementation reexports protected
+  // constructors as public. These tests would need to be cleaned up first.
+
+  // Single argument constructor is special-cased so that it can be
+  // made explicit.
+  template <typename A>
+  explicit NaggyMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  template <typename TArg1, typename TArg2, typename... An>
+  NaggyMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+      : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+                  std::forward<An>(args)...) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(NaggyMock);
+};
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS StrictMock
+    : private internal::StrictMockImpl<MockClass>,
+      public MockClass {
+ public:
+  static_assert(
+      !internal::HasStrictnessModifier<MockClass>(),
+      "Can't apply StrictMock to a class hierarchy that already has a "
+      "strictness modifier. See "
+      "https://google.github.io/googletest/"
+      "gmock_cook_book.html#NiceStrictNaggy");
+  StrictMock() : MockClass() {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  // Ideally, we would inherit base class's constructors through a using
+  // declaration, which would preserve their visibility. However, many existing
+  // tests rely on the fact that current implementation reexports protected
+  // constructors as public. These tests would need to be cleaned up first.
+
+  // Single argument constructor is special-cased so that it can be
+  // made explicit.
+  template <typename A>
+  explicit StrictMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+  template <typename TArg1, typename TArg2, typename... An>
+  StrictMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+      : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+                  std::forward<An>(args)...) {
+    static_assert(sizeof(*this) == sizeof(MockClass),
+                  "The impl subclass shouldn't introduce any padding");
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StrictMock);
+};
+
+#undef GTEST_INTERNAL_EMPTY_BASE_CLASS
+
+}  // namespace testing
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
new file mode 100644
index 0000000000..41323c1cc0
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
@@ -0,0 +1,2038 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements the ON_CALL() and EXPECT_CALL() macros.
+//
+// A user can use the ON_CALL() macro to specify the default action of
+// a mock method.  The syntax is:
+//
+//   ON_CALL(mock_object, Method(argument-matchers))
+//       .With(multi-argument-matcher)
+//       .WillByDefault(action);
+//
+//  where the .With() clause is optional.
+//
+// A user can use the EXPECT_CALL() macro to specify an expectation on
+// a mock method.  The syntax is:
+//
+//   EXPECT_CALL(mock_object, Method(argument-matchers))
+//       .With(multi-argument-matchers)
+//       .Times(cardinality)
+//       .InSequence(sequences)
+//       .After(expectations)
+//       .WillOnce(action)
+//       .WillRepeatedly(action)
+//       .RetiresOnSaturation();
+//
+// where all clauses are optional, and .InSequence()/.After()/
+// .WillOnce() can appear any number of times.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include "gmock/gmock-actions.h"
+#include "gmock/gmock-cardinalities.h"
+#include "gmock/gmock-matchers.h"
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>  // NOLINT
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// An abstract handle of an expectation.
+class Expectation;
+
+// A set of expectation handles.
+class ExpectationSet;
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// Implements a mock function.
+template <typename F> class FunctionMocker;
+
+// Base class for expectations.
+class ExpectationBase;
+
+// Implements an expectation.
+template <typename F> class TypedExpectation;
+
+// Helper class for testing the Expectation class template.
+class ExpectationTester;
+
+// Helper classes for implementing NiceMock, StrictMock, and NaggyMock.
+template <typename MockClass>
+class NiceMockImpl;
+template <typename MockClass>
+class StrictMockImpl;
+template <typename MockClass>
+class NaggyMockImpl;
+
+// Protects the mock object registry (in class Mock), all function
+// mockers, and all expectations.
+//
+// The reason we don't use more fine-grained protection is: when a
+// mock function Foo() is called, it needs to consult its expectations
+// to see which one should be picked.  If another thread is allowed to
+// call a mock function (either Foo() or a different one) at the same
+// time, it could affect the "retired" attributes of Foo()'s
+// expectations when InSequence() is used, and thus affect which
+// expectation gets picked.  Therefore, we sequence all mock function
+// calls to ensure the integrity of the mock objects' states.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_gmock_mutex);
+
+// Untyped base class for ActionResultHolder<R>.
+class UntypedActionResultHolderBase;
+
+// Abstract base class of FunctionMocker.  This is the
+// type-agnostic part of the function mocker interface.  Its pure
+// virtual methods are implemented by FunctionMocker.
+class GTEST_API_ UntypedFunctionMockerBase {
+ public:
+  UntypedFunctionMockerBase();
+  virtual ~UntypedFunctionMockerBase();
+
+  // Verifies that all expectations on this mock function have been
+  // satisfied.  Reports one or more Google Test non-fatal failures
+  // and returns false if not.
+  bool VerifyAndClearExpectationsLocked()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // Clears the ON_CALL()s set on this mock function.
+  virtual void ClearDefaultActionsLocked()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) = 0;
+
+  // In all of the following Untyped* functions, it's the caller's
+  // responsibility to guarantee the correctness of the arguments'
+  // types.
+
+  // Performs the default action with the given arguments and returns
+  // the action's result.  The call description string will be used in
+  // the error message to describe the call in the case the default
+  // action fails.
+  // L = *
+  virtual UntypedActionResultHolderBase* UntypedPerformDefaultAction(
+      void* untyped_args, const std::string& call_description) const = 0;
+
+  // Performs the given action with the given arguments and returns
+  // the action's result.
+  // L = *
+  virtual UntypedActionResultHolderBase* UntypedPerformAction(
+      const void* untyped_action, void* untyped_args) const = 0;
+
+  // Writes a message that the call is uninteresting (i.e. neither
+  // explicitly expected nor explicitly unexpected) to the given
+  // ostream.
+  virtual void UntypedDescribeUninterestingCall(
+      const void* untyped_args,
+      ::std::ostream* os) const
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+  // Returns the expectation that matches the given function arguments
+  // (or NULL is there's no match); when a match is found,
+  // untyped_action is set to point to the action that should be
+  // performed (or NULL if the action is "do default"), and
+  // is_excessive is modified to indicate whether the call exceeds the
+  // expected number.
+  virtual const ExpectationBase* UntypedFindMatchingExpectation(
+      const void* untyped_args,
+      const void** untyped_action, bool* is_excessive,
+      ::std::ostream* what, ::std::ostream* why)
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+  // Prints the given function arguments to the ostream.
+  virtual void UntypedPrintArgs(const void* untyped_args,
+                                ::std::ostream* os) const = 0;
+
+  // Sets the mock object this mock method belongs to, and registers
+  // this information in the global mock registry.  Will be called
+  // whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
+  // method.
+  void RegisterOwner(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+  // Sets the mock object this mock method belongs to, and sets the
+  // name of the mock function.  Will be called upon each invocation
+  // of this mock function.
+  void SetOwnerAndName(const void* mock_obj, const char* name)
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+  // Returns the mock object this mock method belongs to.  Must be
+  // called after RegisterOwner() or SetOwnerAndName() has been
+  // called.
+  const void* MockObject() const
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+  // Returns the name of this mock method.  Must be called after
+  // SetOwnerAndName() has been called.
+  const char* Name() const
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+  // Returns the result of invoking this mock function with the given
+  // arguments.  This function can be safely called from multiple
+  // threads concurrently.  The caller is responsible for deleting the
+  // result.
+  UntypedActionResultHolderBase* UntypedInvokeWith(void* untyped_args)
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+ protected:
+  typedef std::vector<const void*> UntypedOnCallSpecs;
+
+  using UntypedExpectations = std::vector<std::shared_ptr<ExpectationBase>>;
+
+  // Returns an Expectation object that references and co-owns exp,
+  // which must be an expectation on this mock function.
+  Expectation GetHandleOf(ExpectationBase* exp);
+
+  // Address of the mock object this mock method belongs to.  Only
+  // valid after this mock method has been called or
+  // ON_CALL/EXPECT_CALL has been invoked on it.
+  const void* mock_obj_;  // Protected by g_gmock_mutex.
+
+  // Name of the function being mocked.  Only valid after this mock
+  // method has been called.
+  const char* name_;  // Protected by g_gmock_mutex.
+
+  // All default action specs for this function mocker.
+  UntypedOnCallSpecs untyped_on_call_specs_;
+
+  // All expectations for this function mocker.
+  //
+  // It's undefined behavior to interleave expectations (EXPECT_CALLs
+  // or ON_CALLs) and mock function calls.  Also, the order of
+  // expectations is important.  Therefore it's a logic race condition
+  // to read/write untyped_expectations_ concurrently.  In order for
+  // tools like tsan to catch concurrent read/write accesses to
+  // untyped_expectations, we deliberately leave accesses to it
+  // unprotected.
+  UntypedExpectations untyped_expectations_;
+};  // class UntypedFunctionMockerBase
+
+// Untyped base class for OnCallSpec<F>.
+class UntypedOnCallSpecBase {
+ public:
+  // The arguments are the location of the ON_CALL() statement.
+  UntypedOnCallSpecBase(const char* a_file, int a_line)
+      : file_(a_file), line_(a_line), last_clause_(kNone) {}
+
+  // Where in the source file was the default action spec defined?
+  const char* file() const { return file_; }
+  int line() const { return line_; }
+
+ protected:
+  // Gives each clause in the ON_CALL() statement a name.
+  enum Clause {
+    // Do not change the order of the enum members!  The run-time
+    // syntax checking relies on it.
+    kNone,
+    kWith,
+    kWillByDefault
+  };
+
+  // Asserts that the ON_CALL() statement has a certain property.
+  void AssertSpecProperty(bool property,
+                          const std::string& failure_message) const {
+    Assert(property, file_, line_, failure_message);
+  }
+
+  // Expects that the ON_CALL() statement has a certain property.
+  void ExpectSpecProperty(bool property,
+                          const std::string& failure_message) const {
+    Expect(property, file_, line_, failure_message);
+  }
+
+  const char* file_;
+  int line_;
+
+  // The last clause in the ON_CALL() statement as seen so far.
+  // Initially kNone and changes as the statement is parsed.
+  Clause last_clause_;
+};  // class UntypedOnCallSpecBase
+
+// This template class implements an ON_CALL spec.
+template <typename F>
+class OnCallSpec : public UntypedOnCallSpecBase {
+ public:
+  typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+  typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+
+  // Constructs an OnCallSpec object from the information inside
+  // the parenthesis of an ON_CALL() statement.
+  OnCallSpec(const char* a_file, int a_line,
+             const ArgumentMatcherTuple& matchers)
+      : UntypedOnCallSpecBase(a_file, a_line),
+        matchers_(matchers),
+        // By default, extra_matcher_ should match anything.  However,
+        // we cannot initialize it with _ as that causes ambiguity between
+        // Matcher's copy and move constructor for some argument types.
+        extra_matcher_(A<const ArgumentTuple&>()) {}
+
+  // Implements the .With() clause.
+  OnCallSpec& With(const Matcher<const ArgumentTuple&>& m) {
+    // Makes sure this is called at most once.
+    ExpectSpecProperty(last_clause_ < kWith,
+                       ".With() cannot appear "
+                       "more than once in an ON_CALL().");
+    last_clause_ = kWith;
+
+    extra_matcher_ = m;
+    return *this;
+  }
+
+  // Implements the .WillByDefault() clause.
+  OnCallSpec& WillByDefault(const Action<F>& action) {
+    ExpectSpecProperty(last_clause_ < kWillByDefault,
+                       ".WillByDefault() must appear "
+                       "exactly once in an ON_CALL().");
+    last_clause_ = kWillByDefault;
+
+    ExpectSpecProperty(!action.IsDoDefault(),
+                       "DoDefault() cannot be used in ON_CALL().");
+    action_ = action;
+    return *this;
+  }
+
+  // Returns true if and only if the given arguments match the matchers.
+  bool Matches(const ArgumentTuple& args) const {
+    return TupleMatches(matchers_, args) && extra_matcher_.Matches(args);
+  }
+
+  // Returns the action specified by the user.
+  const Action<F>& GetAction() const {
+    AssertSpecProperty(last_clause_ == kWillByDefault,
+                       ".WillByDefault() must appear exactly "
+                       "once in an ON_CALL().");
+    return action_;
+  }
+
+ private:
+  // The information in statement
+  //
+  //   ON_CALL(mock_object, Method(matchers))
+  //       .With(multi-argument-matcher)
+  //       .WillByDefault(action);
+  //
+  // is recorded in the data members like this:
+  //
+  //   source file that contains the statement => file_
+  //   line number of the statement            => line_
+  //   matchers                                => matchers_
+  //   multi-argument-matcher                  => extra_matcher_
+  //   action                                  => action_
+  ArgumentMatcherTuple matchers_;
+  Matcher<const ArgumentTuple&> extra_matcher_;
+  Action<F> action_;
+};  // class OnCallSpec
+
+// Possible reactions on uninteresting calls.
+enum CallReaction {
+  kAllow,
+  kWarn,
+  kFail,
+};
+
+}  // namespace internal
+
+// Utilities for manipulating mock objects.
+class GTEST_API_ Mock {
+ public:
+  // The following public methods can be called concurrently.
+
+  // Tells Google Mock to ignore mock_obj when checking for leaked
+  // mock objects.
+  static void AllowLeak(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Verifies and clears all expectations on the given mock object.
+  // If the expectations aren't satisfied, generates one or more
+  // Google Test non-fatal failures and returns false.
+  static bool VerifyAndClearExpectations(void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Verifies all expectations on the given mock object and clears its
+  // default actions and expectations.  Returns true if and only if the
+  // verification was successful.
+  static bool VerifyAndClear(void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Returns whether the mock was created as a naggy mock (default)
+  static bool IsNaggy(void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+  // Returns whether the mock was created as a nice mock
+  static bool IsNice(void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+  // Returns whether the mock was created as a strict mock
+  static bool IsStrict(void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ private:
+  friend class internal::UntypedFunctionMockerBase;
+
+  // Needed for a function mocker to register itself (so that we know
+  // how to clear a mock object).
+  template <typename F>
+  friend class internal::FunctionMocker;
+
+  template <typename MockClass>
+  friend class internal::NiceMockImpl;
+  template <typename MockClass>
+  friend class internal::NaggyMockImpl;
+  template <typename MockClass>
+  friend class internal::StrictMockImpl;
+
+  // Tells Google Mock to allow uninteresting calls on the given mock
+  // object.
+  static void AllowUninterestingCalls(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Tells Google Mock to warn the user about uninteresting calls on
+  // the given mock object.
+  static void WarnUninterestingCalls(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Tells Google Mock to fail uninteresting calls on the given mock
+  // object.
+  static void FailUninterestingCalls(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Tells Google Mock the given mock object is being destroyed and
+  // its entry in the call-reaction table should be removed.
+  static void UnregisterCallReaction(const void* mock_obj)
+      GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Returns the reaction Google Mock will have on uninteresting calls
+  // made on the given mock object.
+  static internal::CallReaction GetReactionOnUninterestingCalls(
+      const void* mock_obj)
+          GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Verifies that all expectations on the given mock object have been
+  // satisfied.  Reports one or more Google Test non-fatal failures
+  // and returns false if not.
+  static bool VerifyAndClearExpectationsLocked(void* mock_obj)
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+
+  // Clears all ON_CALL()s set on the given mock object.
+  static void ClearDefaultActionsLocked(void* mock_obj)
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+
+  // Registers a mock object and a mock method it owns.
+  static void Register(
+      const void* mock_obj,
+      internal::UntypedFunctionMockerBase* mocker)
+          GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Tells Google Mock where in the source code mock_obj is used in an
+  // ON_CALL or EXPECT_CALL.  In case mock_obj is leaked, this
+  // information helps the user identify which object it is.
+  static void RegisterUseByOnCallOrExpectCall(
+      const void* mock_obj, const char* file, int line)
+          GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+  // Unregisters a mock method; removes the owning mock object from
+  // the registry when the last mock method associated with it has
+  // been unregistered.  This is called only in the destructor of
+  // FunctionMocker.
+  static void UnregisterLocked(internal::UntypedFunctionMockerBase* mocker)
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+};  // class Mock
+
+// An abstract handle of an expectation.  Useful in the .After()
+// clause of EXPECT_CALL() for setting the (partial) order of
+// expectations.  The syntax:
+//
+//   Expectation e1 = EXPECT_CALL(...)...;
+//   EXPECT_CALL(...).After(e1)...;
+//
+// sets two expectations where the latter can only be matched after
+// the former has been satisfied.
+//
+// Notes:
+//   - This class is copyable and has value semantics.
+//   - Constness is shallow: a const Expectation object itself cannot
+//     be modified, but the mutable methods of the ExpectationBase
+//     object it references can be called via expectation_base().
+
+class GTEST_API_ Expectation {
+ public:
+  // Constructs a null object that doesn't reference any expectation.
+  Expectation();
+  Expectation(Expectation&&) = default;
+  Expectation(const Expectation&) = default;
+  Expectation& operator=(Expectation&&) = default;
+  Expectation& operator=(const Expectation&) = default;
+  ~Expectation();
+
+  // This single-argument ctor must not be explicit, in order to support the
+  //   Expectation e = EXPECT_CALL(...);
+  // syntax.
+  //
+  // A TypedExpectation object stores its pre-requisites as
+  // Expectation objects, and needs to call the non-const Retire()
+  // method on the ExpectationBase objects they reference.  Therefore
+  // Expectation must receive a *non-const* reference to the
+  // ExpectationBase object.
+  Expectation(internal::ExpectationBase& exp);  // NOLINT
+
+  // The compiler-generated copy ctor and operator= work exactly as
+  // intended, so we don't need to define our own.
+
+  // Returns true if and only if rhs references the same expectation as this
+  // object does.
+  bool operator==(const Expectation& rhs) const {
+    return expectation_base_ == rhs.expectation_base_;
+  }
+
+  bool operator!=(const Expectation& rhs) const { return !(*this == rhs); }
+
+ private:
+  friend class ExpectationSet;
+  friend class Sequence;
+  friend class ::testing::internal::ExpectationBase;
+  friend class ::testing::internal::UntypedFunctionMockerBase;
+
+  template <typename F>
+  friend class ::testing::internal::FunctionMocker;
+
+  template <typename F>
+  friend class ::testing::internal::TypedExpectation;
+
+  // This comparator is needed for putting Expectation objects into a set.
+  class Less {
+   public:
+    bool operator()(const Expectation& lhs, const Expectation& rhs) const {
+      return lhs.expectation_base_.get() < rhs.expectation_base_.get();
+    }
+  };
+
+  typedef ::std::set<Expectation, Less> Set;
+
+  Expectation(
+      const std::shared_ptr<internal::ExpectationBase>& expectation_base);
+
+  // Returns the expectation this object references.
+  const std::shared_ptr<internal::ExpectationBase>& expectation_base() const {
+    return expectation_base_;
+  }
+
+  // A shared_ptr that co-owns the expectation this handle references.
+  std::shared_ptr<internal::ExpectationBase> expectation_base_;
+};
+
+// A set of expectation handles.  Useful in the .After() clause of
+// EXPECT_CALL() for setting the (partial) order of expectations.  The
+// syntax:
+//
+//   ExpectationSet es;
+//   es += EXPECT_CALL(...)...;
+//   es += EXPECT_CALL(...)...;
+//   EXPECT_CALL(...).After(es)...;
+//
+// sets three expectations where the last one can only be matched
+// after the first two have both been satisfied.
+//
+// This class is copyable and has value semantics.
+class ExpectationSet {
+ public:
+  // A bidirectional iterator that can read a const element in the set.
+  typedef Expectation::Set::const_iterator const_iterator;
+
+  // An object stored in the set.  This is an alias of Expectation.
+  typedef Expectation::Set::value_type value_type;
+
+  // Constructs an empty set.
+  ExpectationSet() {}
+
+  // This single-argument ctor must not be explicit, in order to support the
+  //   ExpectationSet es = EXPECT_CALL(...);
+  // syntax.
+  ExpectationSet(internal::ExpectationBase& exp) {  // NOLINT
+    *this += Expectation(exp);
+  }
+
+  // This single-argument ctor implements implicit conversion from
+  // Expectation and thus must not be explicit.  This allows either an
+  // Expectation or an ExpectationSet to be used in .After().
+  ExpectationSet(const Expectation& e) {  // NOLINT
+    *this += e;
+  }
+
+  // The compiler-generator ctor and operator= works exactly as
+  // intended, so we don't need to define our own.
+
+  // Returns true if and only if rhs contains the same set of Expectation
+  // objects as this does.
+  bool operator==(const ExpectationSet& rhs) const {
+    return expectations_ == rhs.expectations_;
+  }
+
+  bool operator!=(const ExpectationSet& rhs) const { return !(*this == rhs); }
+
+  // Implements the syntax
+  //   expectation_set += EXPECT_CALL(...);
+  ExpectationSet& operator+=(const Expectation& e) {
+    expectations_.insert(e);
+    return *this;
+  }
+
+  int size() const { return static_cast<int>(expectations_.size()); }
+
+  const_iterator begin() const { return expectations_.begin(); }
+  const_iterator end() const { return expectations_.end(); }
+
+ private:
+  Expectation::Set expectations_;
+};
+
+
+// Sequence objects are used by a user to specify the relative order
+// in which the expectations should match.  They are copyable (we rely
+// on the compiler-defined copy constructor and assignment operator).
+class GTEST_API_ Sequence {
+ public:
+  // Constructs an empty sequence.
+  Sequence() : last_expectation_(new Expectation) {}
+
+  // Adds an expectation to this sequence.  The caller must ensure
+  // that no other thread is accessing this Sequence object.
+  void AddExpectation(const Expectation& expectation) const;
+
+ private:
+  // The last expectation in this sequence.
+  std::shared_ptr<Expectation> last_expectation_;
+};  // class Sequence
+
+// An object of this type causes all EXPECT_CALL() statements
+// encountered in its scope to be put in an anonymous sequence.  The
+// work is done in the constructor and destructor.  You should only
+// create an InSequence object on the stack.
+//
+// The sole purpose for this class is to support easy definition of
+// sequential expectations, e.g.
+//
+//   {
+//     InSequence dummy;  // The name of the object doesn't matter.
+//
+//     // The following expectations must match in the order they appear.
+//     EXPECT_CALL(a, Bar())...;
+//     EXPECT_CALL(a, Baz())...;
+//     ...
+//     EXPECT_CALL(b, Xyz())...;
+//   }
+//
+// You can create InSequence objects in multiple threads, as long as
+// they are used to affect different mock objects.  The idea is that
+// each thread can create and set up its own mocks as if it's the only
+// thread.  However, for clarity of your tests we recommend you to set
+// up mocks in the main thread unless you have a good reason not to do
+// so.
+class GTEST_API_ InSequence {
+ public:
+  InSequence();
+  ~InSequence();
+ private:
+  bool sequence_created_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InSequence);  // NOLINT
+} GTEST_ATTRIBUTE_UNUSED_;
+
+namespace internal {
+
+// Points to the implicit sequence introduced by a living InSequence
+// object (if any) in the current thread or NULL.
+GTEST_API_ extern ThreadLocal<Sequence*> g_gmock_implicit_sequence;
+
+// Base class for implementing expectations.
+//
+// There are two reasons for having a type-agnostic base class for
+// Expectation:
+//
+//   1. We need to store collections of expectations of different
+//   types (e.g. all pre-requisites of a particular expectation, all
+//   expectations in a sequence).  Therefore these expectation objects
+//   must share a common base class.
+//
+//   2. We can avoid binary code bloat by moving methods not depending
+//   on the template argument of Expectation to the base class.
+//
+// This class is internal and mustn't be used by user code directly.
+class GTEST_API_ ExpectationBase {
+ public:
+  // source_text is the EXPECT_CALL(...) source that created this Expectation.
+  ExpectationBase(const char* file, int line, const std::string& source_text);
+
+  virtual ~ExpectationBase();
+
+  // Where in the source file was the expectation spec defined?
+  const char* file() const { return file_; }
+  int line() const { return line_; }
+  const char* source_text() const { return source_text_.c_str(); }
+  // Returns the cardinality specified in the expectation spec.
+  const Cardinality& cardinality() const { return cardinality_; }
+
+  // Describes the source file location of this expectation.
+  void DescribeLocationTo(::std::ostream* os) const {
+    *os << FormatFileLocation(file(), line()) << " ";
+  }
+
+  // Describes how many times a function call matching this
+  // expectation has occurred.
+  void DescribeCallCountTo(::std::ostream* os) const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // If this mock method has an extra matcher (i.e. .With(matcher)),
+  // describes it to the ostream.
+  virtual void MaybeDescribeExtraMatcherTo(::std::ostream* os) = 0;
+
+ protected:
+  friend class ::testing::Expectation;
+  friend class UntypedFunctionMockerBase;
+
+  enum Clause {
+    // Don't change the order of the enum members!
+    kNone,
+    kWith,
+    kTimes,
+    kInSequence,
+    kAfter,
+    kWillOnce,
+    kWillRepeatedly,
+    kRetiresOnSaturation
+  };
+
+  typedef std::vector<const void*> UntypedActions;
+
+  // Returns an Expectation object that references and co-owns this
+  // expectation.
+  virtual Expectation GetHandle() = 0;
+
+  // Asserts that the EXPECT_CALL() statement has the given property.
+  void AssertSpecProperty(bool property,
+                          const std::string& failure_message) const {
+    Assert(property, file_, line_, failure_message);
+  }
+
+  // Expects that the EXPECT_CALL() statement has the given property.
+  void ExpectSpecProperty(bool property,
+                          const std::string& failure_message) const {
+    Expect(property, file_, line_, failure_message);
+  }
+
+  // Explicitly specifies the cardinality of this expectation.  Used
+  // by the subclasses to implement the .Times() clause.
+  void SpecifyCardinality(const Cardinality& cardinality);
+
+  // Returns true if and only if the user specified the cardinality
+  // explicitly using a .Times().
+  bool cardinality_specified() const { return cardinality_specified_; }
+
+  // Sets the cardinality of this expectation spec.
+  void set_cardinality(const Cardinality& a_cardinality) {
+    cardinality_ = a_cardinality;
+  }
+
+  // The following group of methods should only be called after the
+  // EXPECT_CALL() statement, and only when g_gmock_mutex is held by
+  // the current thread.
+
+  // Retires all pre-requisites of this expectation.
+  void RetireAllPreRequisites()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // Returns true if and only if this expectation is retired.
+  bool is_retired() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return retired_;
+  }
+
+  // Retires this expectation.
+  void Retire()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    retired_ = true;
+  }
+
+  // Returns true if and only if this expectation is satisfied.
+  bool IsSatisfied() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return cardinality().IsSatisfiedByCallCount(call_count_);
+  }
+
+  // Returns true if and only if this expectation is saturated.
+  bool IsSaturated() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return cardinality().IsSaturatedByCallCount(call_count_);
+  }
+
+  // Returns true if and only if this expectation is over-saturated.
+  bool IsOverSaturated() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return cardinality().IsOverSaturatedByCallCount(call_count_);
+  }
+
+  // Returns true if and only if all pre-requisites of this expectation are
+  // satisfied.
+  bool AllPrerequisitesAreSatisfied() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // Adds unsatisfied pre-requisites of this expectation to 'result'.
+  void FindUnsatisfiedPrerequisites(ExpectationSet* result) const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // Returns the number this expectation has been invoked.
+  int call_count() const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return call_count_;
+  }
+
+  // Increments the number this expectation has been invoked.
+  void IncrementCallCount()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    call_count_++;
+  }
+
+  // Checks the action count (i.e. the number of WillOnce() and
+  // WillRepeatedly() clauses) against the cardinality if this hasn't
+  // been done before.  Prints a warning if there are too many or too
+  // few actions.
+  void CheckActionCountIfNotDone() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  friend class ::testing::Sequence;
+  friend class ::testing::internal::ExpectationTester;
+
+  template <typename Function>
+  friend class TypedExpectation;
+
+  // Implements the .Times() clause.
+  void UntypedTimes(const Cardinality& a_cardinality);
+
+  // This group of fields are part of the spec and won't change after
+  // an EXPECT_CALL() statement finishes.
+  const char* file_;          // The file that contains the expectation.
+  int line_;                  // The line number of the expectation.
+  const std::string source_text_;  // The EXPECT_CALL(...) source text.
+  // True if and only if the cardinality is specified explicitly.
+  bool cardinality_specified_;
+  Cardinality cardinality_;            // The cardinality of the expectation.
+  // The immediate pre-requisites (i.e. expectations that must be
+  // satisfied before this expectation can be matched) of this
+  // expectation.  We use std::shared_ptr in the set because we want an
+  // Expectation object to be co-owned by its FunctionMocker and its
+  // successors.  This allows multiple mock objects to be deleted at
+  // different times.
+  ExpectationSet immediate_prerequisites_;
+
+  // This group of fields are the current state of the expectation,
+  // and can change as the mock function is called.
+  int call_count_;  // How many times this expectation has been invoked.
+  bool retired_;    // True if and only if this expectation has retired.
+  UntypedActions untyped_actions_;
+  bool extra_matcher_specified_;
+  bool repeated_action_specified_;  // True if a WillRepeatedly() was specified.
+  bool retires_on_saturation_;
+  Clause last_clause_;
+  mutable bool action_count_checked_;  // Under mutex_.
+  mutable Mutex mutex_;  // Protects action_count_checked_.
+};  // class ExpectationBase
+
+// Impements an expectation for the given function type.
+template <typename F>
+class TypedExpectation : public ExpectationBase {
+ public:
+  typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+  typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+  typedef typename Function<F>::Result Result;
+
+  TypedExpectation(FunctionMocker<F>* owner, const char* a_file, int a_line,
+                   const std::string& a_source_text,
+                   const ArgumentMatcherTuple& m)
+      : ExpectationBase(a_file, a_line, a_source_text),
+        owner_(owner),
+        matchers_(m),
+        // By default, extra_matcher_ should match anything.  However,
+        // we cannot initialize it with _ as that causes ambiguity between
+        // Matcher's copy and move constructor for some argument types.
+        extra_matcher_(A<const ArgumentTuple&>()),
+        repeated_action_(DoDefault()) {}
+
+  ~TypedExpectation() override {
+    // Check the validity of the action count if it hasn't been done
+    // yet (for example, if the expectation was never used).
+    CheckActionCountIfNotDone();
+    for (UntypedActions::const_iterator it = untyped_actions_.begin();
+         it != untyped_actions_.end(); ++it) {
+      delete static_cast<const Action<F>*>(*it);
+    }
+  }
+
+  // Implements the .With() clause.
+  TypedExpectation& With(const Matcher<const ArgumentTuple&>& m) {
+    if (last_clause_ == kWith) {
+      ExpectSpecProperty(false,
+                         ".With() cannot appear "
+                         "more than once in an EXPECT_CALL().");
+    } else {
+      ExpectSpecProperty(last_clause_ < kWith,
+                         ".With() must be the first "
+                         "clause in an EXPECT_CALL().");
+    }
+    last_clause_ = kWith;
+
+    extra_matcher_ = m;
+    extra_matcher_specified_ = true;
+    return *this;
+  }
+
+  // Implements the .Times() clause.
+  TypedExpectation& Times(const Cardinality& a_cardinality) {
+    ExpectationBase::UntypedTimes(a_cardinality);
+    return *this;
+  }
+
+  // Implements the .Times() clause.
+  TypedExpectation& Times(int n) {
+    return Times(Exactly(n));
+  }
+
+  // Implements the .InSequence() clause.
+  TypedExpectation& InSequence(const Sequence& s) {
+    ExpectSpecProperty(last_clause_ <= kInSequence,
+                       ".InSequence() cannot appear after .After(),"
+                       " .WillOnce(), .WillRepeatedly(), or "
+                       ".RetiresOnSaturation().");
+    last_clause_ = kInSequence;
+
+    s.AddExpectation(GetHandle());
+    return *this;
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2) {
+    return InSequence(s1).InSequence(s2);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3) {
+    return InSequence(s1, s2).InSequence(s3);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3, const Sequence& s4) {
+    return InSequence(s1, s2, s3).InSequence(s4);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3, const Sequence& s4,
+                               const Sequence& s5) {
+    return InSequence(s1, s2, s3, s4).InSequence(s5);
+  }
+
+  // Implements that .After() clause.
+  TypedExpectation& After(const ExpectationSet& s) {
+    ExpectSpecProperty(last_clause_ <= kAfter,
+                       ".After() cannot appear after .WillOnce(),"
+                       " .WillRepeatedly(), or "
+                       ".RetiresOnSaturation().");
+    last_clause_ = kAfter;
+
+    for (ExpectationSet::const_iterator it = s.begin(); it != s.end(); ++it) {
+      immediate_prerequisites_ += *it;
+    }
+    return *this;
+  }
+  TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2) {
+    return After(s1).After(s2);
+  }
+  TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+                          const ExpectationSet& s3) {
+    return After(s1, s2).After(s3);
+  }
+  TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+                          const ExpectationSet& s3, const ExpectationSet& s4) {
+    return After(s1, s2, s3).After(s4);
+  }
+  TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+                          const ExpectationSet& s3, const ExpectationSet& s4,
+                          const ExpectationSet& s5) {
+    return After(s1, s2, s3, s4).After(s5);
+  }
+
+  // Implements the .WillOnce() clause.
+  TypedExpectation& WillOnce(const Action<F>& action) {
+    ExpectSpecProperty(last_clause_ <= kWillOnce,
+                       ".WillOnce() cannot appear after "
+                       ".WillRepeatedly() or .RetiresOnSaturation().");
+    last_clause_ = kWillOnce;
+
+    untyped_actions_.push_back(new Action<F>(action));
+    if (!cardinality_specified()) {
+      set_cardinality(Exactly(static_cast<int>(untyped_actions_.size())));
+    }
+    return *this;
+  }
+
+  // Implements the .WillRepeatedly() clause.
+  TypedExpectation& WillRepeatedly(const Action<F>& action) {
+    if (last_clause_ == kWillRepeatedly) {
+      ExpectSpecProperty(false,
+                         ".WillRepeatedly() cannot appear "
+                         "more than once in an EXPECT_CALL().");
+    } else {
+      ExpectSpecProperty(last_clause_ < kWillRepeatedly,
+                         ".WillRepeatedly() cannot appear "
+                         "after .RetiresOnSaturation().");
+    }
+    last_clause_ = kWillRepeatedly;
+    repeated_action_specified_ = true;
+
+    repeated_action_ = action;
+    if (!cardinality_specified()) {
+      set_cardinality(AtLeast(static_cast<int>(untyped_actions_.size())));
+    }
+
+    // Now that no more action clauses can be specified, we check
+    // whether their count makes sense.
+    CheckActionCountIfNotDone();
+    return *this;
+  }
+
+  // Implements the .RetiresOnSaturation() clause.
+  TypedExpectation& RetiresOnSaturation() {
+    ExpectSpecProperty(last_clause_ < kRetiresOnSaturation,
+                       ".RetiresOnSaturation() cannot appear "
+                       "more than once.");
+    last_clause_ = kRetiresOnSaturation;
+    retires_on_saturation_ = true;
+
+    // Now that no more action clauses can be specified, we check
+    // whether their count makes sense.
+    CheckActionCountIfNotDone();
+    return *this;
+  }
+
+  // Returns the matchers for the arguments as specified inside the
+  // EXPECT_CALL() macro.
+  const ArgumentMatcherTuple& matchers() const {
+    return matchers_;
+  }
+
+  // Returns the matcher specified by the .With() clause.
+  const Matcher<const ArgumentTuple&>& extra_matcher() const {
+    return extra_matcher_;
+  }
+
+  // Returns the action specified by the .WillRepeatedly() clause.
+  const Action<F>& repeated_action() const { return repeated_action_; }
+
+  // If this mock method has an extra matcher (i.e. .With(matcher)),
+  // describes it to the ostream.
+  void MaybeDescribeExtraMatcherTo(::std::ostream* os) override {
+    if (extra_matcher_specified_) {
+      *os << "    Expected args: ";
+      extra_matcher_.DescribeTo(os);
+      *os << "\n";
+    }
+  }
+
+ private:
+  template <typename Function>
+  friend class FunctionMocker;
+
+  // Returns an Expectation object that references and co-owns this
+  // expectation.
+  Expectation GetHandle() override { return owner_->GetHandleOf(this); }
+
+  // The following methods will be called only after the EXPECT_CALL()
+  // statement finishes and when the current thread holds
+  // g_gmock_mutex.
+
+  // Returns true if and only if this expectation matches the given arguments.
+  bool Matches(const ArgumentTuple& args) const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    return TupleMatches(matchers_, args) && extra_matcher_.Matches(args);
+  }
+
+  // Returns true if and only if this expectation should handle the given
+  // arguments.
+  bool ShouldHandleArguments(const ArgumentTuple& args) const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+
+    // In case the action count wasn't checked when the expectation
+    // was defined (e.g. if this expectation has no WillRepeatedly()
+    // or RetiresOnSaturation() clause), we check it when the
+    // expectation is used for the first time.
+    CheckActionCountIfNotDone();
+    return !is_retired() && AllPrerequisitesAreSatisfied() && Matches(args);
+  }
+
+  // Describes the result of matching the arguments against this
+  // expectation to the given ostream.
+  void ExplainMatchResultTo(
+      const ArgumentTuple& args,
+      ::std::ostream* os) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+
+    if (is_retired()) {
+      *os << "         Expected: the expectation is active\n"
+          << "           Actual: it is retired\n";
+    } else if (!Matches(args)) {
+      if (!TupleMatches(matchers_, args)) {
+        ExplainMatchFailureTupleTo(matchers_, args, os);
+      }
+      StringMatchResultListener listener;
+      if (!extra_matcher_.MatchAndExplain(args, &listener)) {
+        *os << "    Expected args: ";
+        extra_matcher_.DescribeTo(os);
+        *os << "\n           Actual: don't match";
+
+        internal::PrintIfNotEmpty(listener.str(), os);
+        *os << "\n";
+      }
+    } else if (!AllPrerequisitesAreSatisfied()) {
+      *os << "         Expected: all pre-requisites are satisfied\n"
+          << "           Actual: the following immediate pre-requisites "
+          << "are not satisfied:\n";
+      ExpectationSet unsatisfied_prereqs;
+      FindUnsatisfiedPrerequisites(&unsatisfied_prereqs);
+      int i = 0;
+      for (ExpectationSet::const_iterator it = unsatisfied_prereqs.begin();
+           it != unsatisfied_prereqs.end(); ++it) {
+        it->expectation_base()->DescribeLocationTo(os);
+        *os << "pre-requisite #" << i++ << "\n";
+      }
+      *os << "                   (end of pre-requisites)\n";
+    } else {
+      // This line is here just for completeness' sake.  It will never
+      // be executed as currently the ExplainMatchResultTo() function
+      // is called only when the mock function call does NOT match the
+      // expectation.
+      *os << "The call matches the expectation.\n";
+    }
+  }
+
+  // Returns the action that should be taken for the current invocation.
+  const Action<F>& GetCurrentAction(const FunctionMocker<F>* mocker,
+                                    const ArgumentTuple& args) const
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    const int count = call_count();
+    Assert(count >= 1, __FILE__, __LINE__,
+           "call_count() is <= 0 when GetCurrentAction() is "
+           "called - this should never happen.");
+
+    const int action_count = static_cast<int>(untyped_actions_.size());
+    if (action_count > 0 && !repeated_action_specified_ &&
+        count > action_count) {
+      // If there is at least one WillOnce() and no WillRepeatedly(),
+      // we warn the user when the WillOnce() clauses ran out.
+      ::std::stringstream ss;
+      DescribeLocationTo(&ss);
+      ss << "Actions ran out in " << source_text() << "...\n"
+         << "Called " << count << " times, but only "
+         << action_count << " WillOnce()"
+         << (action_count == 1 ? " is" : "s are") << " specified - ";
+      mocker->DescribeDefaultActionTo(args, &ss);
+      Log(kWarning, ss.str(), 1);
+    }
+
+    return count <= action_count
+               ? *static_cast<const Action<F>*>(
+                     untyped_actions_[static_cast<size_t>(count - 1)])
+               : repeated_action();
+  }
+
+  // Given the arguments of a mock function call, if the call will
+  // over-saturate this expectation, returns the default action;
+  // otherwise, returns the next action in this expectation.  Also
+  // describes *what* happened to 'what', and explains *why* Google
+  // Mock does it to 'why'.  This method is not const as it calls
+  // IncrementCallCount().  A return value of NULL means the default
+  // action.
+  const Action<F>* GetActionForArguments(const FunctionMocker<F>* mocker,
+                                         const ArgumentTuple& args,
+                                         ::std::ostream* what,
+                                         ::std::ostream* why)
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    if (IsSaturated()) {
+      // We have an excessive call.
+      IncrementCallCount();
+      *what << "Mock function called more times than expected - ";
+      mocker->DescribeDefaultActionTo(args, what);
+      DescribeCallCountTo(why);
+
+      return nullptr;
+    }
+
+    IncrementCallCount();
+    RetireAllPreRequisites();
+
+    if (retires_on_saturation_ && IsSaturated()) {
+      Retire();
+    }
+
+    // Must be done after IncrementCount()!
+    *what << "Mock function call matches " << source_text() <<"...\n";
+    return &(GetCurrentAction(mocker, args));
+  }
+
+  // All the fields below won't change once the EXPECT_CALL()
+  // statement finishes.
+  FunctionMocker<F>* const owner_;
+  ArgumentMatcherTuple matchers_;
+  Matcher<const ArgumentTuple&> extra_matcher_;
+  Action<F> repeated_action_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TypedExpectation);
+};  // class TypedExpectation
+
+// A MockSpec object is used by ON_CALL() or EXPECT_CALL() for
+// specifying the default behavior of, or expectation on, a mock
+// function.
+
+// Note: class MockSpec really belongs to the ::testing namespace.
+// However if we define it in ::testing, MSVC will complain when
+// classes in ::testing::internal declare it as a friend class
+// template.  To workaround this compiler bug, we define MockSpec in
+// ::testing::internal and import it into ::testing.
+
+// Logs a message including file and line number information.
+GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity,
+                                const char* file, int line,
+                                const std::string& message);
+
+template <typename F>
+class MockSpec {
+ public:
+  typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+  typedef typename internal::Function<F>::ArgumentMatcherTuple
+      ArgumentMatcherTuple;
+
+  // Constructs a MockSpec object, given the function mocker object
+  // that the spec is associated with.
+  MockSpec(internal::FunctionMocker<F>* function_mocker,
+           const ArgumentMatcherTuple& matchers)
+      : function_mocker_(function_mocker), matchers_(matchers) {}
+
+  // Adds a new default action spec to the function mocker and returns
+  // the newly created spec.
+  internal::OnCallSpec<F>& InternalDefaultActionSetAt(
+      const char* file, int line, const char* obj, const char* call) {
+    LogWithLocation(internal::kInfo, file, line,
+                    std::string("ON_CALL(") + obj + ", " + call + ") invoked");
+    return function_mocker_->AddNewOnCallSpec(file, line, matchers_);
+  }
+
+  // Adds a new expectation spec to the function mocker and returns
+  // the newly created spec.
+  internal::TypedExpectation<F>& InternalExpectedAt(
+      const char* file, int line, const char* obj, const char* call) {
+    const std::string source_text(std::string("EXPECT_CALL(") + obj + ", " +
+                                  call + ")");
+    LogWithLocation(internal::kInfo, file, line, source_text + " invoked");
+    return function_mocker_->AddNewExpectation(
+        file, line, source_text, matchers_);
+  }
+
+  // This operator overload is used to swallow the superfluous parameter list
+  // introduced by the ON/EXPECT_CALL macros. See the macro comments for more
+  // explanation.
+  MockSpec<F>& operator()(const internal::WithoutMatchers&, void* const) {
+    return *this;
+  }
+
+ private:
+  template <typename Function>
+  friend class internal::FunctionMocker;
+
+  // The function mocker that owns this spec.
+  internal::FunctionMocker<F>* const function_mocker_;
+  // The argument matchers specified in the spec.
+  ArgumentMatcherTuple matchers_;
+};  // class MockSpec
+
+// Wrapper type for generically holding an ordinary value or lvalue reference.
+// If T is not a reference type, it must be copyable or movable.
+// ReferenceOrValueWrapper<T> is movable, and will also be copyable unless
+// T is a move-only value type (which means that it will always be copyable
+// if the current platform does not support move semantics).
+//
+// The primary template defines handling for values, but function header
+// comments describe the contract for the whole template (including
+// specializations).
+template <typename T>
+class ReferenceOrValueWrapper {
+ public:
+  // Constructs a wrapper from the given value/reference.
+  explicit ReferenceOrValueWrapper(T value)
+      : value_(std::move(value)) {
+  }
+
+  // Unwraps and returns the underlying value/reference, exactly as
+  // originally passed. The behavior of calling this more than once on
+  // the same object is unspecified.
+  T Unwrap() { return std::move(value_); }
+
+  // Provides nondestructive access to the underlying value/reference.
+  // Always returns a const reference (more precisely,
+  // const std::add_lvalue_reference<T>::type). The behavior of calling this
+  // after calling Unwrap on the same object is unspecified.
+  const T& Peek() const {
+    return value_;
+  }
+
+ private:
+  T value_;
+};
+
+// Specialization for lvalue reference types. See primary template
+// for documentation.
+template <typename T>
+class ReferenceOrValueWrapper<T&> {
+ public:
+  // Workaround for debatable pass-by-reference lint warning (c-library-team
+  // policy precludes NOLINT in this context)
+  typedef T& reference;
+  explicit ReferenceOrValueWrapper(reference ref)
+      : value_ptr_(&ref) {}
+  T& Unwrap() { return *value_ptr_; }
+  const T& Peek() const { return *value_ptr_; }
+
+ private:
+  T* value_ptr_;
+};
+
+// C++ treats the void type specially.  For example, you cannot define
+// a void-typed variable or pass a void value to a function.
+// ActionResultHolder<T> holds a value of type T, where T must be a
+// copyable type or void (T doesn't need to be default-constructable).
+// It hides the syntactic difference between void and other types, and
+// is used to unify the code for invoking both void-returning and
+// non-void-returning mock functions.
+
+// Untyped base class for ActionResultHolder<T>.
+class UntypedActionResultHolderBase {
+ public:
+  virtual ~UntypedActionResultHolderBase() {}
+
+  // Prints the held value as an action's result to os.
+  virtual void PrintAsActionResult(::std::ostream* os) const = 0;
+};
+
+// This generic definition is used when T is not void.
+template <typename T>
+class ActionResultHolder : public UntypedActionResultHolderBase {
+ public:
+  // Returns the held value. Must not be called more than once.
+  T Unwrap() {
+    return result_.Unwrap();
+  }
+
+  // Prints the held value as an action's result to os.
+  void PrintAsActionResult(::std::ostream* os) const override {
+    *os << "\n          Returns: ";
+    // T may be a reference type, so we don't use UniversalPrint().
+    UniversalPrinter<T>::Print(result_.Peek(), os);
+  }
+
+  // Performs the given mock function's default action and returns the
+  // result in a new-ed ActionResultHolder.
+  template <typename F>
+  static ActionResultHolder* PerformDefaultAction(
+      const FunctionMocker<F>* func_mocker,
+      typename Function<F>::ArgumentTuple&& args,
+      const std::string& call_description) {
+    return new ActionResultHolder(Wrapper(func_mocker->PerformDefaultAction(
+        std::move(args), call_description)));
+  }
+
+  // Performs the given action and returns the result in a new-ed
+  // ActionResultHolder.
+  template <typename F>
+  static ActionResultHolder* PerformAction(
+      const Action<F>& action, typename Function<F>::ArgumentTuple&& args) {
+    return new ActionResultHolder(
+        Wrapper(action.Perform(std::move(args))));
+  }
+
+ private:
+  typedef ReferenceOrValueWrapper<T> Wrapper;
+
+  explicit ActionResultHolder(Wrapper result)
+      : result_(std::move(result)) {
+  }
+
+  Wrapper result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
+};
+
+// Specialization for T = void.
+template <>
+class ActionResultHolder<void> : public UntypedActionResultHolderBase {
+ public:
+  void Unwrap() { }
+
+  void PrintAsActionResult(::std::ostream* /* os */) const override {}
+
+  // Performs the given mock function's default action and returns ownership
+  // of an empty ActionResultHolder*.
+  template <typename F>
+  static ActionResultHolder* PerformDefaultAction(
+      const FunctionMocker<F>* func_mocker,
+      typename Function<F>::ArgumentTuple&& args,
+      const std::string& call_description) {
+    func_mocker->PerformDefaultAction(std::move(args), call_description);
+    return new ActionResultHolder;
+  }
+
+  // Performs the given action and returns ownership of an empty
+  // ActionResultHolder*.
+  template <typename F>
+  static ActionResultHolder* PerformAction(
+      const Action<F>& action, typename Function<F>::ArgumentTuple&& args) {
+    action.Perform(std::move(args));
+    return new ActionResultHolder;
+  }
+
+ private:
+  ActionResultHolder() {}
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
+};
+
+template <typename F>
+class FunctionMocker;
+
+template <typename R, typename... Args>
+class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
+  using F = R(Args...);
+
+ public:
+  using Result = R;
+  using ArgumentTuple = std::tuple<Args...>;
+  using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>;
+
+  FunctionMocker() {}
+
+  // There is no generally useful and implementable semantics of
+  // copying a mock object, so copying a mock is usually a user error.
+  // Thus we disallow copying function mockers.  If the user really
+  // wants to copy a mock object, they should implement their own copy
+  // operation, for example:
+  //
+  //   class MockFoo : public Foo {
+  //    public:
+  //     // Defines a copy constructor explicitly.
+  //     MockFoo(const MockFoo& src) {}
+  //     ...
+  //   };
+  FunctionMocker(const FunctionMocker&) = delete;
+  FunctionMocker& operator=(const FunctionMocker&) = delete;
+
+  // The destructor verifies that all expectations on this mock
+  // function have been satisfied.  If not, it will report Google Test
+  // non-fatal failures for the violations.
+  ~FunctionMocker() override GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    MutexLock l(&g_gmock_mutex);
+    VerifyAndClearExpectationsLocked();
+    Mock::UnregisterLocked(this);
+    ClearDefaultActionsLocked();
+  }
+
+  // Returns the ON_CALL spec that matches this mock function with the
+  // given arguments; returns NULL if no matching ON_CALL is found.
+  // L = *
+  const OnCallSpec<F>* FindOnCallSpec(
+      const ArgumentTuple& args) const {
+    for (UntypedOnCallSpecs::const_reverse_iterator it
+             = untyped_on_call_specs_.rbegin();
+         it != untyped_on_call_specs_.rend(); ++it) {
+      const OnCallSpec<F>* spec = static_cast<const OnCallSpec<F>*>(*it);
+      if (spec->Matches(args))
+        return spec;
+    }
+
+    return nullptr;
+  }
+
+  // Performs the default action of this mock function on the given
+  // arguments and returns the result. Asserts (or throws if
+  // exceptions are enabled) with a helpful call descrption if there
+  // is no valid return value. This method doesn't depend on the
+  // mutable state of this object, and thus can be called concurrently
+  // without locking.
+  // L = *
+  Result PerformDefaultAction(ArgumentTuple&& args,
+                              const std::string& call_description) const {
+    const OnCallSpec<F>* const spec =
+        this->FindOnCallSpec(args);
+    if (spec != nullptr) {
+      return spec->GetAction().Perform(std::move(args));
+    }
+    const std::string message =
+        call_description +
+        "\n    The mock function has no default action "
+        "set, and its return type has no default value set.";
+#if GTEST_HAS_EXCEPTIONS
+    if (!DefaultValue<Result>::Exists()) {
+      throw std::runtime_error(message);
+    }
+#else
+    Assert(DefaultValue<Result>::Exists(), "", -1, message);
+#endif
+    return DefaultValue<Result>::Get();
+  }
+
+  // Performs the default action with the given arguments and returns
+  // the action's result.  The call description string will be used in
+  // the error message to describe the call in the case the default
+  // action fails.  The caller is responsible for deleting the result.
+  // L = *
+  UntypedActionResultHolderBase* UntypedPerformDefaultAction(
+      void* untyped_args,  // must point to an ArgumentTuple
+      const std::string& call_description) const override {
+    ArgumentTuple* args = static_cast<ArgumentTuple*>(untyped_args);
+    return ResultHolder::PerformDefaultAction(this, std::move(*args),
+                                              call_description);
+  }
+
+  // Performs the given action with the given arguments and returns
+  // the action's result.  The caller is responsible for deleting the
+  // result.
+  // L = *
+  UntypedActionResultHolderBase* UntypedPerformAction(
+      const void* untyped_action, void* untyped_args) const override {
+    // Make a copy of the action before performing it, in case the
+    // action deletes the mock object (and thus deletes itself).
+    const Action<F> action = *static_cast<const Action<F>*>(untyped_action);
+    ArgumentTuple* args = static_cast<ArgumentTuple*>(untyped_args);
+    return ResultHolder::PerformAction(action, std::move(*args));
+  }
+
+  // Implements UntypedFunctionMockerBase::ClearDefaultActionsLocked():
+  // clears the ON_CALL()s set on this mock function.
+  void ClearDefaultActionsLocked() override
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+
+    // Deleting our default actions may trigger other mock objects to be
+    // deleted, for example if an action contains a reference counted smart
+    // pointer to that mock object, and that is the last reference. So if we
+    // delete our actions within the context of the global mutex we may deadlock
+    // when this method is called again. Instead, make a copy of the set of
+    // actions to delete, clear our set within the mutex, and then delete the
+    // actions outside of the mutex.
+    UntypedOnCallSpecs specs_to_delete;
+    untyped_on_call_specs_.swap(specs_to_delete);
+
+    g_gmock_mutex.Unlock();
+    for (UntypedOnCallSpecs::const_iterator it =
+             specs_to_delete.begin();
+         it != specs_to_delete.end(); ++it) {
+      delete static_cast<const OnCallSpec<F>*>(*it);
+    }
+
+    // Lock the mutex again, since the caller expects it to be locked when we
+    // return.
+    g_gmock_mutex.Lock();
+  }
+
+  // Returns the result of invoking this mock function with the given
+  // arguments.  This function can be safely called from multiple
+  // threads concurrently.
+  Result Invoke(Args... args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    ArgumentTuple tuple(std::forward<Args>(args)...);
+    std::unique_ptr<ResultHolder> holder(DownCast_<ResultHolder*>(
+        this->UntypedInvokeWith(static_cast<void*>(&tuple))));
+    return holder->Unwrap();
+  }
+
+  MockSpec<F> With(Matcher<Args>... m) {
+    return MockSpec<F>(this, ::std::make_tuple(std::move(m)...));
+  }
+
+ protected:
+  template <typename Function>
+  friend class MockSpec;
+
+  typedef ActionResultHolder<Result> ResultHolder;
+
+  // Adds and returns a default action spec for this mock function.
+  OnCallSpec<F>& AddNewOnCallSpec(
+      const char* file, int line,
+      const ArgumentMatcherTuple& m)
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
+    OnCallSpec<F>* const on_call_spec = new OnCallSpec<F>(file, line, m);
+    untyped_on_call_specs_.push_back(on_call_spec);
+    return *on_call_spec;
+  }
+
+  // Adds and returns an expectation spec for this mock function.
+  TypedExpectation<F>& AddNewExpectation(const char* file, int line,
+                                         const std::string& source_text,
+                                         const ArgumentMatcherTuple& m)
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
+    TypedExpectation<F>* const expectation =
+        new TypedExpectation<F>(this, file, line, source_text, m);
+    const std::shared_ptr<ExpectationBase> untyped_expectation(expectation);
+    // See the definition of untyped_expectations_ for why access to
+    // it is unprotected here.
+    untyped_expectations_.push_back(untyped_expectation);
+
+    // Adds this expectation into the implicit sequence if there is one.
+    Sequence* const implicit_sequence = g_gmock_implicit_sequence.get();
+    if (implicit_sequence != nullptr) {
+      implicit_sequence->AddExpectation(Expectation(untyped_expectation));
+    }
+
+    return *expectation;
+  }
+
+ private:
+  template <typename Func> friend class TypedExpectation;
+
+  // Some utilities needed for implementing UntypedInvokeWith().
+
+  // Describes what default action will be performed for the given
+  // arguments.
+  // L = *
+  void DescribeDefaultActionTo(const ArgumentTuple& args,
+                               ::std::ostream* os) const {
+    const OnCallSpec<F>* const spec = FindOnCallSpec(args);
+
+    if (spec == nullptr) {
+      *os << (std::is_void<Result>::value ? "returning directly.\n"
+                                          : "returning default value.\n");
+    } else {
+      *os << "taking default action specified at:\n"
+          << FormatFileLocation(spec->file(), spec->line()) << "\n";
+    }
+  }
+
+  // Writes a message that the call is uninteresting (i.e. neither
+  // explicitly expected nor explicitly unexpected) to the given
+  // ostream.
+  void UntypedDescribeUninterestingCall(const void* untyped_args,
+                                        ::std::ostream* os) const override
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    *os << "Uninteresting mock function call - ";
+    DescribeDefaultActionTo(args, os);
+    *os << "    Function call: " << Name();
+    UniversalPrint(args, os);
+  }
+
+  // Returns the expectation that matches the given function arguments
+  // (or NULL is there's no match); when a match is found,
+  // untyped_action is set to point to the action that should be
+  // performed (or NULL if the action is "do default"), and
+  // is_excessive is modified to indicate whether the call exceeds the
+  // expected number.
+  //
+  // Critical section: We must find the matching expectation and the
+  // corresponding action that needs to be taken in an ATOMIC
+  // transaction.  Otherwise another thread may call this mock
+  // method in the middle and mess up the state.
+  //
+  // However, performing the action has to be left out of the critical
+  // section.  The reason is that we have no control on what the
+  // action does (it can invoke an arbitrary user function or even a
+  // mock function) and excessive locking could cause a dead lock.
+  const ExpectationBase* UntypedFindMatchingExpectation(
+      const void* untyped_args, const void** untyped_action, bool* is_excessive,
+      ::std::ostream* what, ::std::ostream* why) override
+      GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    MutexLock l(&g_gmock_mutex);
+    TypedExpectation<F>* exp = this->FindMatchingExpectationLocked(args);
+    if (exp == nullptr) {  // A match wasn't found.
+      this->FormatUnexpectedCallMessageLocked(args, what, why);
+      return nullptr;
+    }
+
+    // This line must be done before calling GetActionForArguments(),
+    // which will increment the call count for *exp and thus affect
+    // its saturation status.
+    *is_excessive = exp->IsSaturated();
+    const Action<F>* action = exp->GetActionForArguments(this, args, what, why);
+    if (action != nullptr && action->IsDoDefault())
+      action = nullptr;  // Normalize "do default" to NULL.
+    *untyped_action = action;
+    return exp;
+  }
+
+  // Prints the given function arguments to the ostream.
+  void UntypedPrintArgs(const void* untyped_args,
+                        ::std::ostream* os) const override {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    UniversalPrint(args, os);
+  }
+
+  // Returns the expectation that matches the arguments, or NULL if no
+  // expectation matches them.
+  TypedExpectation<F>* FindMatchingExpectationLocked(
+      const ArgumentTuple& args) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    // See the definition of untyped_expectations_ for why access to
+    // it is unprotected here.
+    for (typename UntypedExpectations::const_reverse_iterator it =
+             untyped_expectations_.rbegin();
+         it != untyped_expectations_.rend(); ++it) {
+      TypedExpectation<F>* const exp =
+          static_cast<TypedExpectation<F>*>(it->get());
+      if (exp->ShouldHandleArguments(args)) {
+        return exp;
+      }
+    }
+    return nullptr;
+  }
+
+  // Returns a message that the arguments don't match any expectation.
+  void FormatUnexpectedCallMessageLocked(
+      const ArgumentTuple& args,
+      ::std::ostream* os,
+      ::std::ostream* why) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    *os << "\nUnexpected mock function call - ";
+    DescribeDefaultActionTo(args, os);
+    PrintTriedExpectationsLocked(args, why);
+  }
+
+  // Prints a list of expectations that have been tried against the
+  // current mock function call.
+  void PrintTriedExpectationsLocked(
+      const ArgumentTuple& args,
+      ::std::ostream* why) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    const size_t count = untyped_expectations_.size();
+    *why << "Google Mock tried the following " << count << " "
+         << (count == 1 ? "expectation, but it didn't match" :
+             "expectations, but none matched")
+         << ":\n";
+    for (size_t i = 0; i < count; i++) {
+      TypedExpectation<F>* const expectation =
+          static_cast<TypedExpectation<F>*>(untyped_expectations_[i].get());
+      *why << "\n";
+      expectation->DescribeLocationTo(why);
+      if (count > 1) {
+        *why << "tried expectation #" << i << ": ";
+      }
+      *why << expectation->source_text() << "...\n";
+      expectation->ExplainMatchResultTo(args, why);
+      expectation->DescribeCallCountTo(why);
+    }
+  }
+};  // class FunctionMocker
+
+// Reports an uninteresting call (whose description is in msg) in the
+// manner specified by 'reaction'.
+void ReportUninterestingCall(CallReaction reaction, const std::string& msg);
+
+}  // namespace internal
+
+namespace internal {
+
+template <typename F>
+class MockFunction;
+
+template <typename R, typename... Args>
+class MockFunction<R(Args...)> {
+ public:
+  MockFunction(const MockFunction&) = delete;
+  MockFunction& operator=(const MockFunction&) = delete;
+
+  std::function<R(Args...)> AsStdFunction() {
+    return [this](Args... args) -> R {
+      return this->Call(std::forward<Args>(args)...);
+    };
+  }
+
+  // Implementation detail: the expansion of the MOCK_METHOD macro.
+  R Call(Args... args) {
+    mock_.SetOwnerAndName(this, "Call");
+    return mock_.Invoke(std::forward<Args>(args)...);
+  }
+
+  MockSpec<R(Args...)> gmock_Call(Matcher<Args>... m) {
+    mock_.RegisterOwner(this);
+    return mock_.With(std::move(m)...);
+  }
+
+  MockSpec<R(Args...)> gmock_Call(const WithoutMatchers&, R (*)(Args...)) {
+    return this->gmock_Call(::testing::A<Args>()...);
+  }
+
+ protected:
+  MockFunction() = default;
+  ~MockFunction() = default;
+
+ private:
+  FunctionMocker<R(Args...)> mock_;
+};
+
+/*
+The SignatureOf<F> struct is a meta-function returning function signature
+corresponding to the provided F argument.
+
+It makes use of MockFunction easier by allowing it to accept more F arguments
+than just function signatures.
+
+Specializations provided here cover a signature type itself and any template
+that can be parameterized with a signature, including std::function and
+boost::function.
+*/
+
+template <typename F, typename = void>
+struct SignatureOf;
+
+template <typename R, typename... Args>
+struct SignatureOf<R(Args...)> {
+  using type = R(Args...);
+};
+
+template <template <typename> class C, typename F>
+struct SignatureOf<C<F>,
+                   typename std::enable_if<std::is_function<F>::value>::type>
+    : SignatureOf<F> {};
+
+template <typename F>
+using SignatureOfT = typename SignatureOf<F>::type;
+
+}  // namespace internal
+
+// A MockFunction<F> type has one mock method whose type is
+// internal::SignatureOfT<F>.  It is useful when you just want your
+// test code to emit some messages and have Google Mock verify the
+// right messages are sent (and perhaps at the right times).  For
+// example, if you are exercising code:
+//
+//   Foo(1);
+//   Foo(2);
+//   Foo(3);
+//
+// and want to verify that Foo(1) and Foo(3) both invoke
+// mock.Bar("a"), but Foo(2) doesn't invoke anything, you can write:
+//
+// TEST(FooTest, InvokesBarCorrectly) {
+//   MyMock mock;
+//   MockFunction<void(string check_point_name)> check;
+//   {
+//     InSequence s;
+//
+//     EXPECT_CALL(mock, Bar("a"));
+//     EXPECT_CALL(check, Call("1"));
+//     EXPECT_CALL(check, Call("2"));
+//     EXPECT_CALL(mock, Bar("a"));
+//   }
+//   Foo(1);
+//   check.Call("1");
+//   Foo(2);
+//   check.Call("2");
+//   Foo(3);
+// }
+//
+// The expectation spec says that the first Bar("a") must happen
+// before check point "1", the second Bar("a") must happen after check
+// point "2", and nothing should happen between the two check
+// points. The explicit check points make it easy to tell which
+// Bar("a") is called by which call to Foo().
+//
+// MockFunction<F> can also be used to exercise code that accepts
+// std::function<internal::SignatureOfT<F>> callbacks. To do so, use
+// AsStdFunction() method to create std::function proxy forwarding to
+// original object's Call. Example:
+//
+// TEST(FooTest, RunsCallbackWithBarArgument) {
+//   MockFunction<int(string)> callback;
+//   EXPECT_CALL(callback, Call("bar")).WillOnce(Return(1));
+//   Foo(callback.AsStdFunction());
+// }
+//
+// The internal::SignatureOfT<F> indirection allows to use other types
+// than just function signature type. This is typically useful when
+// providing a mock for a predefined std::function type. Example:
+//
+// using FilterPredicate = std::function<bool(string)>;
+// void MyFilterAlgorithm(FilterPredicate predicate);
+//
+// TEST(FooTest, FilterPredicateAlwaysAccepts) {
+//   MockFunction<FilterPredicate> predicateMock;
+//   EXPECT_CALL(predicateMock, Call(_)).WillRepeatedly(Return(true));
+//   MyFilterAlgorithm(predicateMock.AsStdFunction());
+// }
+template <typename F>
+class MockFunction : public internal::MockFunction<internal::SignatureOfT<F>> {
+  using Base = internal::MockFunction<internal::SignatureOfT<F>>;
+
+ public:
+  using Base::Base;
+};
+
+// The style guide prohibits "using" statements in a namespace scope
+// inside a header file.  However, the MockSpec class template is
+// meant to be defined in the ::testing namespace.  The following line
+// is just a trick for working around a bug in MSVC 8.0, which cannot
+// handle it if we define MockSpec in ::testing.
+using internal::MockSpec;
+
+// Const(x) is a convenient function for obtaining a const reference
+// to x.  This is useful for setting expectations on an overloaded
+// const mock method, e.g.
+//
+//   class MockFoo : public FooInterface {
+//    public:
+//     MOCK_METHOD0(Bar, int());
+//     MOCK_CONST_METHOD0(Bar, int&());
+//   };
+//
+//   MockFoo foo;
+//   // Expects a call to non-const MockFoo::Bar().
+//   EXPECT_CALL(foo, Bar());
+//   // Expects a call to const MockFoo::Bar().
+//   EXPECT_CALL(Const(foo), Bar());
+template <typename T>
+inline const T& Const(const T& x) { return x; }
+
+// Constructs an Expectation object that references and co-owns exp.
+inline Expectation::Expectation(internal::ExpectationBase& exp)  // NOLINT
+    : expectation_base_(exp.GetHandle().expectation_base()) {}
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Implementation for ON_CALL and EXPECT_CALL macros. A separate macro is
+// required to avoid compile errors when the name of the method used in call is
+// a result of macro expansion. See CompilesWithMethodNameExpandedFromMacro
+// tests in internal/gmock-spec-builders_test.cc for more details.
+//
+// This macro supports statements both with and without parameter matchers. If
+// the parameter list is omitted, gMock will accept any parameters, which allows
+// tests to be written that don't need to encode the number of method
+// parameter. This technique may only be used for non-overloaded methods.
+//
+//   // These are the same:
+//   ON_CALL(mock, NoArgsMethod()).WillByDefault(...);
+//   ON_CALL(mock, NoArgsMethod).WillByDefault(...);
+//
+//   // As are these:
+//   ON_CALL(mock, TwoArgsMethod(_, _)).WillByDefault(...);
+//   ON_CALL(mock, TwoArgsMethod).WillByDefault(...);
+//
+//   // Can also specify args if you want, of course:
+//   ON_CALL(mock, TwoArgsMethod(_, 45)).WillByDefault(...);
+//
+//   // Overloads work as long as you specify parameters:
+//   ON_CALL(mock, OverloadedMethod(_)).WillByDefault(...);
+//   ON_CALL(mock, OverloadedMethod(_, _)).WillByDefault(...);
+//
+//   // Oops! Which overload did you want?
+//   ON_CALL(mock, OverloadedMethod).WillByDefault(...);
+//     => ERROR: call to member function 'gmock_OverloadedMethod' is ambiguous
+//
+// How this works: The mock class uses two overloads of the gmock_Method
+// expectation setter method plus an operator() overload on the MockSpec object.
+// In the matcher list form, the macro expands to:
+//
+//   // This statement:
+//   ON_CALL(mock, TwoArgsMethod(_, 45))...
+//
+//   // ...expands to:
+//   mock.gmock_TwoArgsMethod(_, 45)(WithoutMatchers(), nullptr)...
+//   |-------------v---------------||------------v-------------|
+//       invokes first overload        swallowed by operator()
+//
+//   // ...which is essentially:
+//   mock.gmock_TwoArgsMethod(_, 45)...
+//
+// Whereas the form without a matcher list:
+//
+//   // This statement:
+//   ON_CALL(mock, TwoArgsMethod)...
+//
+//   // ...expands to:
+//   mock.gmock_TwoArgsMethod(WithoutMatchers(), nullptr)...
+//   |-----------------------v--------------------------|
+//                 invokes second overload
+//
+//   // ...which is essentially:
+//   mock.gmock_TwoArgsMethod(_, _)...
+//
+// The WithoutMatchers() argument is used to disambiguate overloads and to
+// block the caller from accidentally invoking the second overload directly. The
+// second argument is an internal type derived from the method signature. The
+// failure to disambiguate two overloads of this method in the ON_CALL statement
+// is how we block callers from setting expectations on overloaded methods.
+#define GMOCK_ON_CALL_IMPL_(mock_expr, Setter, call)                    \
+  ((mock_expr).gmock_##call)(::testing::internal::GetWithoutMatchers(), \
+                             nullptr)                                   \
+      .Setter(__FILE__, __LINE__, #mock_expr, #call)
+
+#define ON_CALL(obj, call) \
+  GMOCK_ON_CALL_IMPL_(obj, InternalDefaultActionSetAt, call)
+
+#define EXPECT_CALL(obj, call) \
+  GMOCK_ON_CALL_IMPL_(obj, InternalExpectedAt, call)
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock.h
new file mode 100644
index 0000000000..12469bc466
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/gmock.h
@@ -0,0 +1,98 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This is the main header file a user should include.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
+
+// This file implements the following syntax:
+//
+//   ON_CALL(mock_object, Method(...))
+//     .With(...) ?
+//     .WillByDefault(...);
+//
+// where With() is optional and WillByDefault() must appear exactly
+// once.
+//
+//   EXPECT_CALL(mock_object, Method(...))
+//     .With(...) ?
+//     .Times(...) ?
+//     .InSequence(...) *
+//     .WillOnce(...) *
+//     .WillRepeatedly(...) ?
+//     .RetiresOnSaturation() ? ;
+//
+// where all clauses are optional and WillOnce() can be repeated.
+
+#include "gmock/gmock-actions.h"
+#include "gmock/gmock-cardinalities.h"
+#include "gmock/gmock-function-mocker.h"
+#include "gmock/gmock-matchers.h"
+#include "gmock/gmock-more-actions.h"
+#include "gmock/gmock-more-matchers.h"
+#include "gmock/gmock-nice-strict.h"
+#include "gmock/internal/gmock-internal-utils.h"
+
+namespace testing {
+
+// Declares Google Mock flags that we want a user to use programmatically.
+GMOCK_DECLARE_bool_(catch_leaked_mocks);
+GMOCK_DECLARE_string_(verbose);
+GMOCK_DECLARE_int32_(default_mock_behavior);
+
+// Initializes Google Mock.  This must be called before running the
+// tests.  In particular, it parses the command line for the flags
+// that Google Mock recognizes.  Whenever a Google Mock flag is seen,
+// it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags, if that hasn't
+// been done.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleMock();
+
+}  // namespace testing
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
new file mode 100644
index 0000000000..f6c93f616d
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
@@ -0,0 +1,16 @@
+# Customization Points
+
+The custom directory is an injection point for custom user configurations.
+
+## Header `gmock-port.h`
+
+The following macros can be defined:
+
+### Flag related macros:
+
+*   `GMOCK_DECLARE_bool_(name)`
+*   `GMOCK_DECLARE_int32_(name)`
+*   `GMOCK_DECLARE_string_(name)`
+*   `GMOCK_DEFINE_bool_(name, default_val, doc)`
+*   `GMOCK_DEFINE_int32_(name, default_val, doc)`
+*   `GMOCK_DEFINE_string_(name, default_val, doc)`
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
new file mode 100644
index 0000000000..63f899962e
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
@@ -0,0 +1,6 @@
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
new file mode 100644
index 0000000000..638429488e
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
@@ -0,0 +1,36 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
new file mode 100644
index 0000000000..14378692ae
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
@@ -0,0 +1,39 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
new file mode 100644
index 0000000000..317544a7da
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
@@ -0,0 +1,459 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file defines some utilities useful for implementing Google
+// Mock.  They are subject to change without notice, so please DO NOT
+// USE THEM IN USER CODE.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
+
+#include <stdio.h>
+#include <ostream>  // NOLINT
+#include <string>
+#include <type_traits>
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+
+template <typename>
+class Matcher;
+
+namespace internal {
+
+// Silence MSVC C4100 (unreferenced formal parameter) and
+// C4805('==': unsafe mix of type 'const int' and type 'const bool')
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4100)
+# pragma warning(disable:4805)
+#endif
+
+// Joins a vector of strings as if they are fields of a tuple; returns
+// the joined string.
+GTEST_API_ std::string JoinAsTuple(const Strings& fields);
+
+// Converts an identifier name to a space-separated list of lower-case
+// words.  Each maximum substring of the form [A-Za-z][a-z]*|\d+ is
+// treated as one word.  For example, both "FooBar123" and
+// "foo_bar_123" are converted to "foo bar 123".
+GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name);
+
+// GetRawPointer(p) returns the raw pointer underlying p when p is a
+// smart pointer, or returns p itself when p is already a raw pointer.
+// The following default implementation is for the smart pointer case.
+template <typename Pointer>
+inline const typename Pointer::element_type* GetRawPointer(const Pointer& p) {
+  return p.get();
+}
+// This overloaded version is for the raw pointer case.
+template <typename Element>
+inline Element* GetRawPointer(Element* p) { return p; }
+
+// MSVC treats wchar_t as a native type usually, but treats it as the
+// same as unsigned short when the compiler option /Zc:wchar_t- is
+// specified.  It defines _NATIVE_WCHAR_T_DEFINED symbol when wchar_t
+// is a native type.
+#if defined(_MSC_VER) && !defined(_NATIVE_WCHAR_T_DEFINED)
+// wchar_t is a typedef.
+#else
+# define GMOCK_WCHAR_T_IS_NATIVE_ 1
+#endif
+
+// In what follows, we use the term "kind" to indicate whether a type
+// is bool, an integer type (excluding bool), a floating-point type,
+// or none of them.  This categorization is useful for determining
+// when a matcher argument type can be safely converted to another
+// type in the implementation of SafeMatcherCast.
+enum TypeKind {
+  kBool, kInteger, kFloatingPoint, kOther
+};
+
+// KindOf<T>::value is the kind of type T.
+template <typename T> struct KindOf {
+  enum { value = kOther };  // The default kind.
+};
+
+// This macro declares that the kind of 'type' is 'kind'.
+#define GMOCK_DECLARE_KIND_(type, kind) \
+  template <> struct KindOf<type> { enum { value = kind }; }
+
+GMOCK_DECLARE_KIND_(bool, kBool);
+
+// All standard integer types.
+GMOCK_DECLARE_KIND_(char, kInteger);
+GMOCK_DECLARE_KIND_(signed char, kInteger);
+GMOCK_DECLARE_KIND_(unsigned char, kInteger);
+GMOCK_DECLARE_KIND_(short, kInteger);  // NOLINT
+GMOCK_DECLARE_KIND_(unsigned short, kInteger);  // NOLINT
+GMOCK_DECLARE_KIND_(int, kInteger);
+GMOCK_DECLARE_KIND_(unsigned int, kInteger);
+GMOCK_DECLARE_KIND_(long, kInteger);  // NOLINT
+GMOCK_DECLARE_KIND_(unsigned long, kInteger);  // NOLINT
+GMOCK_DECLARE_KIND_(long long, kInteger);  // NOLINT
+GMOCK_DECLARE_KIND_(unsigned long long, kInteger);  // NOLINT
+
+#if GMOCK_WCHAR_T_IS_NATIVE_
+GMOCK_DECLARE_KIND_(wchar_t, kInteger);
+#endif
+
+// All standard floating-point types.
+GMOCK_DECLARE_KIND_(float, kFloatingPoint);
+GMOCK_DECLARE_KIND_(double, kFloatingPoint);
+GMOCK_DECLARE_KIND_(long double, kFloatingPoint);
+
+#undef GMOCK_DECLARE_KIND_
+
+// Evaluates to the kind of 'type'.
+#define GMOCK_KIND_OF_(type) \
+  static_cast< ::testing::internal::TypeKind>( \
+      ::testing::internal::KindOf<type>::value)
+
+// LosslessArithmeticConvertibleImpl<kFromKind, From, kToKind, To>::value
+// is true if and only if arithmetic type From can be losslessly converted to
+// arithmetic type To.
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. has no CV modifier, is not a pointer, and is not a
+// reference) built-in arithmetic types, kFromKind is the kind of
+// From, and kToKind is the kind of To; the value is
+// implementation-defined when the above pre-condition is violated.
+template <TypeKind kFromKind, typename From, TypeKind kToKind, typename To>
+using LosslessArithmeticConvertibleImpl = std::integral_constant<
+    bool,
+    // clang-format off
+      // Converting from bool is always lossless
+      (kFromKind == kBool) ? true
+      // Converting between any other type kinds will be lossy if the type
+      // kinds are not the same.
+    : (kFromKind != kToKind) ? false
+    : (kFromKind == kInteger &&
+       // Converting between integers of different widths is allowed so long
+       // as the conversion does not go from signed to unsigned.
+      (((sizeof(From) < sizeof(To)) &&
+        !(std::is_signed<From>::value && !std::is_signed<To>::value)) ||
+       // Converting between integers of the same width only requires the
+       // two types to have the same signedness.
+       ((sizeof(From) == sizeof(To)) &&
+        (std::is_signed<From>::value == std::is_signed<To>::value)))
+       ) ? true
+      // Floating point conversions are lossless if and only if `To` is at least
+      // as wide as `From`.
+    : (kFromKind == kFloatingPoint && (sizeof(From) <= sizeof(To))) ? true
+    : false
+    // clang-format on
+    >;
+
+// LosslessArithmeticConvertible<From, To>::value is true if and only if
+// arithmetic type From can be losslessly converted to arithmetic type To.
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. has no CV modifier, is not a pointer, and is not a
+// reference) built-in arithmetic types; the value is
+// implementation-defined when the above pre-condition is violated.
+template <typename From, typename To>
+using LosslessArithmeticConvertible =
+    LosslessArithmeticConvertibleImpl<GMOCK_KIND_OF_(From), From,
+                                      GMOCK_KIND_OF_(To), To>;
+
+// This interface knows how to report a Google Mock failure (either
+// non-fatal or fatal).
+class FailureReporterInterface {
+ public:
+  // The type of a failure (either non-fatal or fatal).
+  enum FailureType {
+    kNonfatal, kFatal
+  };
+
+  virtual ~FailureReporterInterface() {}
+
+  // Reports a failure that occurred at the given source file location.
+  virtual void ReportFailure(FailureType type, const char* file, int line,
+                             const std::string& message) = 0;
+};
+
+// Returns the failure reporter used by Google Mock.
+GTEST_API_ FailureReporterInterface* GetFailureReporter();
+
+// Asserts that condition is true; aborts the process with the given
+// message if condition is false.  We cannot use LOG(FATAL) or CHECK()
+// as Google Mock might be used to mock the log sink itself.  We
+// inline this function to prevent it from showing up in the stack
+// trace.
+inline void Assert(bool condition, const char* file, int line,
+                   const std::string& msg) {
+  if (!condition) {
+    GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal,
+                                        file, line, msg);
+  }
+}
+inline void Assert(bool condition, const char* file, int line) {
+  Assert(condition, file, line, "Assertion failed.");
+}
+
+// Verifies that condition is true; generates a non-fatal failure if
+// condition is false.
+inline void Expect(bool condition, const char* file, int line,
+                   const std::string& msg) {
+  if (!condition) {
+    GetFailureReporter()->ReportFailure(FailureReporterInterface::kNonfatal,
+                                        file, line, msg);
+  }
+}
+inline void Expect(bool condition, const char* file, int line) {
+  Expect(condition, file, line, "Expectation failed.");
+}
+
+// Severity level of a log.
+enum LogSeverity {
+  kInfo = 0,
+  kWarning = 1
+};
+
+// Valid values for the --gmock_verbose flag.
+
+// All logs (informational and warnings) are printed.
+const char kInfoVerbosity[] = "info";
+// Only warnings are printed.
+const char kWarningVerbosity[] = "warning";
+// No logs are printed.
+const char kErrorVerbosity[] = "error";
+
+// Returns true if and only if a log with the given severity is visible
+// according to the --gmock_verbose flag.
+GTEST_API_ bool LogIsVisible(LogSeverity severity);
+
+// Prints the given message to stdout if and only if 'severity' >= the level
+// specified by the --gmock_verbose flag.  If stack_frames_to_skip >=
+// 0, also prints the stack trace excluding the top
+// stack_frames_to_skip frames.  In opt mode, any positive
+// stack_frames_to_skip is treated as 0, since we don't know which
+// function calls will be inlined by the compiler and need to be
+// conservative.
+GTEST_API_ void Log(LogSeverity severity, const std::string& message,
+                    int stack_frames_to_skip);
+
+// A marker class that is used to resolve parameterless expectations to the
+// correct overload. This must not be instantiable, to prevent client code from
+// accidentally resolving to the overload; for example:
+//
+//    ON_CALL(mock, Method({}, nullptr))...
+//
+class WithoutMatchers {
+ private:
+  WithoutMatchers() {}
+  friend GTEST_API_ WithoutMatchers GetWithoutMatchers();
+};
+
+// Internal use only: access the singleton instance of WithoutMatchers.
+GTEST_API_ WithoutMatchers GetWithoutMatchers();
+
+// Disable MSVC warnings for infinite recursion, since in this case the
+// the recursion is unreachable.
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4717)
+#endif
+
+// Invalid<T>() is usable as an expression of type T, but will terminate
+// the program with an assertion failure if actually run.  This is useful
+// when a value of type T is needed for compilation, but the statement
+// will not really be executed (or we don't care if the statement
+// crashes).
+template <typename T>
+inline T Invalid() {
+  Assert(false, "", -1, "Internal error: attempt to return invalid value");
+  // This statement is unreachable, and would never terminate even if it
+  // could be reached. It is provided only to placate compiler warnings
+  // about missing return statements.
+  return Invalid<T>();
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+// Given a raw type (i.e. having no top-level reference or const
+// modifier) RawContainer that's either an STL-style container or a
+// native array, class StlContainerView<RawContainer> has the
+// following members:
+//
+//   - type is a type that provides an STL-style container view to
+//     (i.e. implements the STL container concept for) RawContainer;
+//   - const_reference is a type that provides a reference to a const
+//     RawContainer;
+//   - ConstReference(raw_container) returns a const reference to an STL-style
+//     container view to raw_container, which is a RawContainer.
+//   - Copy(raw_container) returns an STL-style container view of a
+//     copy of raw_container, which is a RawContainer.
+//
+// This generic version is used when RawContainer itself is already an
+// STL-style container.
+template <class RawContainer>
+class StlContainerView {
+ public:
+  typedef RawContainer type;
+  typedef const type& const_reference;
+
+  static const_reference ConstReference(const RawContainer& container) {
+    static_assert(!std::is_const<RawContainer>::value,
+                  "RawContainer type must not be const");
+    return container;
+  }
+  static type Copy(const RawContainer& container) { return container; }
+};
+
+// This specialization is used when RawContainer is a native array type.
+template <typename Element, size_t N>
+class StlContainerView<Element[N]> {
+ public:
+  typedef typename std::remove_const<Element>::type RawElement;
+  typedef internal::NativeArray<RawElement> type;
+  // NativeArray<T> can represent a native array either by value or by
+  // reference (selected by a constructor argument), so 'const type'
+  // can be used to reference a const native array.  We cannot
+  // 'typedef const type& const_reference' here, as that would mean
+  // ConstReference() has to return a reference to a local variable.
+  typedef const type const_reference;
+
+  static const_reference ConstReference(const Element (&array)[N]) {
+    static_assert(std::is_same<Element, RawElement>::value,
+                  "Element type must not be const");
+    return type(array, N, RelationToSourceReference());
+  }
+  static type Copy(const Element (&array)[N]) {
+    return type(array, N, RelationToSourceCopy());
+  }
+};
+
+// This specialization is used when RawContainer is a native array
+// represented as a (pointer, size) tuple.
+template <typename ElementPointer, typename Size>
+class StlContainerView< ::std::tuple<ElementPointer, Size> > {
+ public:
+  typedef typename std::remove_const<
+      typename std::pointer_traits<ElementPointer>::element_type>::type
+      RawElement;
+  typedef internal::NativeArray<RawElement> type;
+  typedef const type const_reference;
+
+  static const_reference ConstReference(
+      const ::std::tuple<ElementPointer, Size>& array) {
+    return type(std::get<0>(array), std::get<1>(array),
+                RelationToSourceReference());
+  }
+  static type Copy(const ::std::tuple<ElementPointer, Size>& array) {
+    return type(std::get<0>(array), std::get<1>(array), RelationToSourceCopy());
+  }
+};
+
+// The following specialization prevents the user from instantiating
+// StlContainer with a reference type.
+template <typename T> class StlContainerView<T&>;
+
+// A type transform to remove constness from the first part of a pair.
+// Pairs like that are used as the value_type of associative containers,
+// and this transform produces a similar but assignable pair.
+template <typename T>
+struct RemoveConstFromKey {
+  typedef T type;
+};
+
+// Partially specialized to remove constness from std::pair<const K, V>.
+template <typename K, typename V>
+struct RemoveConstFromKey<std::pair<const K, V> > {
+  typedef std::pair<K, V> type;
+};
+
+// Emit an assertion failure due to incorrect DoDefault() usage. Out-of-lined to
+// reduce code size.
+GTEST_API_ void IllegalDoDefault(const char* file, int line);
+
+template <typename F, typename Tuple, size_t... Idx>
+auto ApplyImpl(F&& f, Tuple&& args, IndexSequence<Idx...>) -> decltype(
+    std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...)) {
+  return std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...);
+}
+
+// Apply the function to a tuple of arguments.
+template <typename F, typename Tuple>
+auto Apply(F&& f, Tuple&& args) -> decltype(
+    ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args),
+              MakeIndexSequence<std::tuple_size<
+                  typename std::remove_reference<Tuple>::type>::value>())) {
+  return ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args),
+                   MakeIndexSequence<std::tuple_size<
+                       typename std::remove_reference<Tuple>::type>::value>());
+}
+
+// Template struct Function<F>, where F must be a function type, contains
+// the following typedefs:
+//
+//   Result:               the function's return type.
+//   Arg<N>:               the type of the N-th argument, where N starts with 0.
+//   ArgumentTuple:        the tuple type consisting of all parameters of F.
+//   ArgumentMatcherTuple: the tuple type consisting of Matchers for all
+//                         parameters of F.
+//   MakeResultVoid:       the function type obtained by substituting void
+//                         for the return type of F.
+//   MakeResultIgnoredValue:
+//                         the function type obtained by substituting Something
+//                         for the return type of F.
+template <typename T>
+struct Function;
+
+template <typename R, typename... Args>
+struct Function<R(Args...)> {
+  using Result = R;
+  static constexpr size_t ArgumentCount = sizeof...(Args);
+  template <size_t I>
+  using Arg = ElemFromList<I, Args...>;
+  using ArgumentTuple = std::tuple<Args...>;
+  using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>;
+  using MakeResultVoid = void(Args...);
+  using MakeResultIgnoredValue = IgnoredValue(Args...);
+};
+
+template <typename R, typename... Args>
+constexpr size_t Function<R(Args...)>::ArgumentCount;
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
new file mode 100644
index 0000000000..367a44d366
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
@@ -0,0 +1,87 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Low-level types and utilities for porting Google Mock to various
+// platforms.  All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice.  Code
+// outside Google Mock MUST NOT USE THEM DIRECTLY.  Macros that don't
+// end with _ are part of Google Mock's public API and can be used by
+// code outside Google Mock.
+
+// GOOGLETEST_CM0002 DO NOT DELETE
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
+
+#include <assert.h>
+#include <stdlib.h>
+#include <cstdint>
+#include <iostream>
+
+// Most of the utilities needed for porting Google Mock are also
+// required for Google Test and are defined in gtest-port.h.
+//
+// Note to maintainers: to reduce code duplication, prefer adding
+// portability utilities to Google Test's gtest-port.h instead of
+// here, as Google Mock depends on Google Test.  Only add a utility
+// here if it's truly specific to Google Mock.
+
+#include "gtest/internal/gtest-port.h"
+#include "gmock/internal/custom/gmock-port.h"
+
+// For MS Visual C++, check the compiler version. At least VS 2015 is
+// required to compile Google Mock.
+#if defined(_MSC_VER) && _MSC_VER < 1900
+# error "At least Visual C++ 2015 (14.0) is required to compile Google Mock."
+#endif
+
+// Macro for referencing flags.  This is public as we want the user to
+// use this syntax to reference Google Mock flags.
+#define GMOCK_FLAG(name) FLAGS_gmock_##name
+
+#if !defined(GMOCK_DECLARE_bool_)
+
+// Macros for declaring flags.
+# define GMOCK_DECLARE_bool_(name) extern GTEST_API_ bool GMOCK_FLAG(name)
+# define GMOCK_DECLARE_int32_(name) extern GTEST_API_ int32_t GMOCK_FLAG(name)
+# define GMOCK_DECLARE_string_(name) \
+    extern GTEST_API_ ::std::string GMOCK_FLAG(name)
+
+// Macros for defining flags.
+# define GMOCK_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GMOCK_FLAG(name) = (default_val)
+# define GMOCK_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ int32_t GMOCK_FLAG(name) = (default_val)
+# define GMOCK_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val)
+
+#endif  // !defined(GMOCK_DECLARE_bool_)
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
new file mode 100644
index 0000000000..94d61c09c8
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
@@ -0,0 +1,279 @@
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
+
+// Expands and concatenates the arguments. Constructed macros reevaluate.
+#define GMOCK_PP_CAT(_1, _2) GMOCK_PP_INTERNAL_CAT(_1, _2)
+
+// Expands and stringifies the only argument.
+#define GMOCK_PP_STRINGIZE(...) GMOCK_PP_INTERNAL_STRINGIZE(__VA_ARGS__)
+
+// Returns empty. Given a variadic number of arguments.
+#define GMOCK_PP_EMPTY(...)
+
+// Returns a comma. Given a variadic number of arguments.
+#define GMOCK_PP_COMMA(...) ,
+
+// Returns the only argument.
+#define GMOCK_PP_IDENTITY(_1) _1
+
+// Evaluates to the number of arguments after expansion.
+//
+//   #define PAIR x, y
+//
+//   GMOCK_PP_NARG() => 1
+//   GMOCK_PP_NARG(x) => 1
+//   GMOCK_PP_NARG(x, y) => 2
+//   GMOCK_PP_NARG(PAIR) => 2
+//
+// Requires: the number of arguments after expansion is at most 15.
+#define GMOCK_PP_NARG(...) \
+  GMOCK_PP_INTERNAL_16TH(  \
+      (__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+
+// Returns 1 if the expansion of arguments has an unprotected comma. Otherwise
+// returns 0. Requires no more than 15 unprotected commas.
+#define GMOCK_PP_HAS_COMMA(...) \
+  GMOCK_PP_INTERNAL_16TH(       \
+      (__VA_ARGS__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0))
+
+// Returns the first argument.
+#define GMOCK_PP_HEAD(...) GMOCK_PP_INTERNAL_HEAD((__VA_ARGS__, unusedArg))
+
+// Returns the tail. A variadic list of all arguments minus the first. Requires
+// at least one argument.
+#define GMOCK_PP_TAIL(...) GMOCK_PP_INTERNAL_TAIL((__VA_ARGS__))
+
+// Calls CAT(_Macro, NARG(__VA_ARGS__))(__VA_ARGS__)
+#define GMOCK_PP_VARIADIC_CALL(_Macro, ...) \
+  GMOCK_PP_IDENTITY(                        \
+      GMOCK_PP_CAT(_Macro, GMOCK_PP_NARG(__VA_ARGS__))(__VA_ARGS__))
+
+// If the arguments after expansion have no tokens, evaluates to `1`. Otherwise
+// evaluates to `0`.
+//
+// Requires: * the number of arguments after expansion is at most 15.
+//           * If the argument is a macro, it must be able to be called with one
+//             argument.
+//
+// Implementation details:
+//
+// There is one case when it generates a compile error: if the argument is macro
+// that cannot be called with one argument.
+//
+//   #define M(a, b)  // it doesn't matter what it expands to
+//
+//   // Expected: expands to `0`.
+//   // Actual: compile error.
+//   GMOCK_PP_IS_EMPTY(M)
+//
+// There are 4 cases tested:
+//
+// * __VA_ARGS__ possible expansion has no unparen'd commas. Expected 0.
+// * __VA_ARGS__ possible expansion is not enclosed in parenthesis. Expected 0.
+// * __VA_ARGS__ possible expansion is not a macro that ()-evaluates to a comma.
+//   Expected 0
+// * __VA_ARGS__ is empty, or has unparen'd commas, or is enclosed in
+//   parenthesis, or is a macro that ()-evaluates to comma. Expected 1.
+//
+// We trigger detection on '0001', i.e. on empty.
+#define GMOCK_PP_IS_EMPTY(...)                                               \
+  GMOCK_PP_INTERNAL_IS_EMPTY(GMOCK_PP_HAS_COMMA(__VA_ARGS__),                \
+                             GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__), \
+                             GMOCK_PP_HAS_COMMA(__VA_ARGS__()),              \
+                             GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__()))
+
+// Evaluates to _Then if _Cond is 1 and _Else if _Cond is 0.
+#define GMOCK_PP_IF(_Cond, _Then, _Else) \
+  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IF_, _Cond)(_Then, _Else)
+
+// Similar to GMOCK_PP_IF but takes _Then and _Else in parentheses.
+//
+// GMOCK_PP_GENERIC_IF(1, (a, b, c), (d, e, f)) => a, b, c
+// GMOCK_PP_GENERIC_IF(0, (a, b, c), (d, e, f)) => d, e, f
+//
+#define GMOCK_PP_GENERIC_IF(_Cond, _Then, _Else) \
+  GMOCK_PP_REMOVE_PARENS(GMOCK_PP_IF(_Cond, _Then, _Else))
+
+// Evaluates to the number of arguments after expansion. Identifies 'empty' as
+// 0.
+//
+//   #define PAIR x, y
+//
+//   GMOCK_PP_NARG0() => 0
+//   GMOCK_PP_NARG0(x) => 1
+//   GMOCK_PP_NARG0(x, y) => 2
+//   GMOCK_PP_NARG0(PAIR) => 2
+//
+// Requires: * the number of arguments after expansion is at most 15.
+//           * If the argument is a macro, it must be able to be called with one
+//             argument.
+#define GMOCK_PP_NARG0(...) \
+  GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(__VA_ARGS__), 0, GMOCK_PP_NARG(__VA_ARGS__))
+
+// Expands to 1 if the first argument starts with something in parentheses,
+// otherwise to 0.
+#define GMOCK_PP_IS_BEGIN_PARENS(...)                              \
+  GMOCK_PP_HEAD(GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_, \
+                             GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C __VA_ARGS__))
+
+// Expands to 1 is there is only one argument and it is enclosed in parentheses.
+#define GMOCK_PP_IS_ENCLOSED_PARENS(...)             \
+  GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(__VA_ARGS__), \
+              GMOCK_PP_IS_EMPTY(GMOCK_PP_EMPTY __VA_ARGS__), 0)
+
+// Remove the parens, requires GMOCK_PP_IS_ENCLOSED_PARENS(args) => 1.
+#define GMOCK_PP_REMOVE_PARENS(...) GMOCK_PP_INTERNAL_REMOVE_PARENS __VA_ARGS__
+
+// Expands to _Macro(0, _Data, e1) _Macro(1, _Data, e2) ... _Macro(K -1, _Data,
+// eK) as many of GMOCK_INTERNAL_NARG0 _Tuple.
+// Requires: * |_Macro| can be called with 3 arguments.
+//           * |_Tuple| expansion has no more than 15 elements.
+#define GMOCK_PP_FOR_EACH(_Macro, _Data, _Tuple)                        \
+  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, GMOCK_PP_NARG0 _Tuple) \
+  (0, _Macro, _Data, _Tuple)
+
+// Expands to _Macro(0, _Data, ) _Macro(1, _Data, ) ... _Macro(K - 1, _Data, )
+// Empty if _K = 0.
+// Requires: * |_Macro| can be called with 3 arguments.
+//           * |_K| literal between 0 and 15
+#define GMOCK_PP_REPEAT(_Macro, _Data, _N)           \
+  GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, _N) \
+  (0, _Macro, _Data, GMOCK_PP_INTENRAL_EMPTY_TUPLE)
+
+// Increments the argument, requires the argument to be between 0 and 15.
+#define GMOCK_PP_INC(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_INC_, _i)
+
+// Returns comma if _i != 0. Requires _i to be between 0 and 15.
+#define GMOCK_PP_COMMA_IF(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_COMMA_IF_, _i)
+
+// Internal details follow. Do not use any of these symbols outside of this
+// file or we will break your code.
+#define GMOCK_PP_INTENRAL_EMPTY_TUPLE (, , , , , , , , , , , , , , , )
+#define GMOCK_PP_INTERNAL_CAT(_1, _2) _1##_2
+#define GMOCK_PP_INTERNAL_STRINGIZE(...) #__VA_ARGS__
+#define GMOCK_PP_INTERNAL_CAT_5(_1, _2, _3, _4, _5) _1##_2##_3##_4##_5
+#define GMOCK_PP_INTERNAL_IS_EMPTY(_1, _2, _3, _4)                             \
+  GMOCK_PP_HAS_COMMA(GMOCK_PP_INTERNAL_CAT_5(GMOCK_PP_INTERNAL_IS_EMPTY_CASE_, \
+                                             _1, _2, _3, _4))
+#define GMOCK_PP_INTERNAL_IS_EMPTY_CASE_0001 ,
+#define GMOCK_PP_INTERNAL_IF_1(_Then, _Else) _Then
+#define GMOCK_PP_INTERNAL_IF_0(_Then, _Else) _Else
+
+// Because of MSVC treating a token with a comma in it as a single token when
+// passed to another macro, we need to force it to evaluate it as multiple
+// tokens. We do that by using a "IDENTITY(MACRO PARENTHESIZED_ARGS)" macro. We
+// define one per possible macro that relies on this behavior. Note "_Args" must
+// be parenthesized.
+#define GMOCK_PP_INTERNAL_INTERNAL_16TH(_1, _2, _3, _4, _5, _6, _7, _8, _9, \
+                                        _10, _11, _12, _13, _14, _15, _16,  \
+                                        ...)                                \
+  _16
+#define GMOCK_PP_INTERNAL_16TH(_Args) \
+  GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_16TH _Args)
+#define GMOCK_PP_INTERNAL_INTERNAL_HEAD(_1, ...) _1
+#define GMOCK_PP_INTERNAL_HEAD(_Args) \
+  GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_HEAD _Args)
+#define GMOCK_PP_INTERNAL_INTERNAL_TAIL(_1, ...) __VA_ARGS__
+#define GMOCK_PP_INTERNAL_TAIL(_Args) \
+  GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_TAIL _Args)
+
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C(...) 1 _
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_1 1,
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C \
+  0,
+#define GMOCK_PP_INTERNAL_REMOVE_PARENS(...) __VA_ARGS__
+#define GMOCK_PP_INTERNAL_INC_0 1
+#define GMOCK_PP_INTERNAL_INC_1 2
+#define GMOCK_PP_INTERNAL_INC_2 3
+#define GMOCK_PP_INTERNAL_INC_3 4
+#define GMOCK_PP_INTERNAL_INC_4 5
+#define GMOCK_PP_INTERNAL_INC_5 6
+#define GMOCK_PP_INTERNAL_INC_6 7
+#define GMOCK_PP_INTERNAL_INC_7 8
+#define GMOCK_PP_INTERNAL_INC_8 9
+#define GMOCK_PP_INTERNAL_INC_9 10
+#define GMOCK_PP_INTERNAL_INC_10 11
+#define GMOCK_PP_INTERNAL_INC_11 12
+#define GMOCK_PP_INTERNAL_INC_12 13
+#define GMOCK_PP_INTERNAL_INC_13 14
+#define GMOCK_PP_INTERNAL_INC_14 15
+#define GMOCK_PP_INTERNAL_INC_15 16
+#define GMOCK_PP_INTERNAL_COMMA_IF_0
+#define GMOCK_PP_INTERNAL_COMMA_IF_1 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_2 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_3 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_4 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_5 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_6 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_7 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_8 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_9 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_10 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_11 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_12 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_13 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_14 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_15 ,
+#define GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, _element) \
+  _Macro(_i, _Data, _element)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_0(_i, _Macro, _Data, _Tuple)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(_i, _Macro, _Data, _Tuple) \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(_i, _Macro, _Data, _Tuple)    \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(GMOCK_PP_INC(_i), _Macro, _Data,    \
+                                    (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(GMOCK_PP_INC(_i), _Macro, _Data,   \
+                                     (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(GMOCK_PP_INC(_i), _Macro, _Data,   \
+                                     (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(GMOCK_PP_INC(_i), _Macro, _Data,   \
+                                     (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(GMOCK_PP_INC(_i), _Macro, _Data,   \
+                                     (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_15(_i, _Macro, _Data, _Tuple)   \
+  GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+  GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(GMOCK_PP_INC(_i), _Macro, _Data,   \
+                                     (GMOCK_PP_TAIL _Tuple))
+
+#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-all.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-all.cc
new file mode 100644
index 0000000000..e43c9b7b4c
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-all.cc
@@ -0,0 +1,46 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Google C++ Mocking Framework (Google Mock)
+//
+// This file #includes all Google Mock implementation .cc files.  The
+// purpose is to allow a user to build Google Mock by compiling this
+// file alone.
+
+// This line ensures that gmock.h can be compiled on its own, even
+// when it's fused.
+#include "gmock/gmock.h"
+
+// The following lines pull in the real gmock *.cc files.
+#include "src/gmock-cardinalities.cc"
+#include "src/gmock-internal-utils.cc"
+#include "src/gmock-matchers.cc"
+#include "src/gmock-spec-builders.cc"
+#include "src/gmock.cc"
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
new file mode 100644
index 0000000000..7463f43832
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
@@ -0,0 +1,155 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements cardinalities.
+
+#include "gmock/gmock-cardinalities.h"
+
+#include <limits.h>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+
+namespace {
+
+// Implements the Between(m, n) cardinality.
+class BetweenCardinalityImpl : public CardinalityInterface {
+ public:
+  BetweenCardinalityImpl(int min, int max)
+      : min_(min >= 0 ? min : 0),
+        max_(max >= min_ ? max : min_) {
+    std::stringstream ss;
+    if (min < 0) {
+      ss << "The invocation lower bound must be >= 0, "
+         << "but is actually " << min << ".";
+      internal::Expect(false, __FILE__, __LINE__, ss.str());
+    } else if (max < 0) {
+      ss << "The invocation upper bound must be >= 0, "
+         << "but is actually " << max << ".";
+      internal::Expect(false, __FILE__, __LINE__, ss.str());
+    } else if (min > max) {
+      ss << "The invocation upper bound (" << max
+         << ") must be >= the invocation lower bound (" << min
+         << ").";
+      internal::Expect(false, __FILE__, __LINE__, ss.str());
+    }
+  }
+
+  // Conservative estimate on the lower/upper bound of the number of
+  // calls allowed.
+  int ConservativeLowerBound() const override { return min_; }
+  int ConservativeUpperBound() const override { return max_; }
+
+  bool IsSatisfiedByCallCount(int call_count) const override {
+    return min_ <= call_count && call_count <= max_;
+  }
+
+  bool IsSaturatedByCallCount(int call_count) const override {
+    return call_count >= max_;
+  }
+
+  void DescribeTo(::std::ostream* os) const override;
+
+ private:
+  const int min_;
+  const int max_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(BetweenCardinalityImpl);
+};
+
+// Formats "n times" in a human-friendly way.
+inline std::string FormatTimes(int n) {
+  if (n == 1) {
+    return "once";
+  } else if (n == 2) {
+    return "twice";
+  } else {
+    std::stringstream ss;
+    ss << n << " times";
+    return ss.str();
+  }
+}
+
+// Describes the Between(m, n) cardinality in human-friendly text.
+void BetweenCardinalityImpl::DescribeTo(::std::ostream* os) const {
+  if (min_ == 0) {
+    if (max_ == 0) {
+      *os << "never called";
+    } else if (max_ == INT_MAX) {
+      *os << "called any number of times";
+    } else {
+      *os << "called at most " << FormatTimes(max_);
+    }
+  } else if (min_ == max_) {
+    *os << "called " << FormatTimes(min_);
+  } else if (max_ == INT_MAX) {
+    *os << "called at least " << FormatTimes(min_);
+  } else {
+    // 0 < min_ < max_ < INT_MAX
+    *os << "called between " << min_ << " and " << max_ << " times";
+  }
+}
+
+}  // Unnamed namespace
+
+// Describes the given call count to an ostream.
+void Cardinality::DescribeActualCallCountTo(int actual_call_count,
+                                            ::std::ostream* os) {
+  if (actual_call_count > 0) {
+    *os << "called " << FormatTimes(actual_call_count);
+  } else {
+    *os << "never called";
+  }
+}
+
+// Creates a cardinality that allows at least n calls.
+GTEST_API_ Cardinality AtLeast(int n) { return Between(n, INT_MAX); }
+
+// Creates a cardinality that allows at most n calls.
+GTEST_API_ Cardinality AtMost(int n) { return Between(0, n); }
+
+// Creates a cardinality that allows any number of calls.
+GTEST_API_ Cardinality AnyNumber() { return AtLeast(0); }
+
+// Creates a cardinality that allows between min and max calls.
+GTEST_API_ Cardinality Between(int min, int max) {
+  return Cardinality(new BetweenCardinalityImpl(min, max));
+}
+
+// Creates a cardinality that allows exactly n calls.
+GTEST_API_ Cardinality Exactly(int n) { return Between(n, n); }
+
+}  // namespace testing
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
new file mode 100644
index 0000000000..e5b547981d
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
@@ -0,0 +1,200 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file defines some utilities useful for implementing Google
+// Mock.  They are subject to change without notice, so please DO NOT
+// USE THEM IN USER CODE.
+
+#include "gmock/internal/gmock-internal-utils.h"
+
+#include <ctype.h>
+#include <ostream>  // NOLINT
+#include <string>
+#include "gmock/gmock.h"
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+// Joins a vector of strings as if they are fields of a tuple; returns
+// the joined string.
+GTEST_API_ std::string JoinAsTuple(const Strings& fields) {
+  switch (fields.size()) {
+    case 0:
+      return "";
+    case 1:
+      return fields[0];
+    default:
+      std::string result = "(" + fields[0];
+      for (size_t i = 1; i < fields.size(); i++) {
+        result += ", ";
+        result += fields[i];
+      }
+      result += ")";
+      return result;
+  }
+}
+
+// Converts an identifier name to a space-separated list of lower-case
+// words.  Each maximum substring of the form [A-Za-z][a-z]*|\d+ is
+// treated as one word.  For example, both "FooBar123" and
+// "foo_bar_123" are converted to "foo bar 123".
+GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name) {
+  std::string result;
+  char prev_char = '\0';
+  for (const char* p = id_name; *p != '\0'; prev_char = *(p++)) {
+    // We don't care about the current locale as the input is
+    // guaranteed to be a valid C++ identifier name.
+    const bool starts_new_word = IsUpper(*p) ||
+        (!IsAlpha(prev_char) && IsLower(*p)) ||
+        (!IsDigit(prev_char) && IsDigit(*p));
+
+    if (IsAlNum(*p)) {
+      if (starts_new_word && result != "")
+        result += ' ';
+      result += ToLower(*p);
+    }
+  }
+  return result;
+}
+
+// This class reports Google Mock failures as Google Test failures.  A
+// user can define another class in a similar fashion if they intend to
+// use Google Mock with a testing framework other than Google Test.
+class GoogleTestFailureReporter : public FailureReporterInterface {
+ public:
+  void ReportFailure(FailureType type, const char* file, int line,
+                     const std::string& message) override {
+    AssertHelper(type == kFatal ?
+                 TestPartResult::kFatalFailure :
+                 TestPartResult::kNonFatalFailure,
+                 file,
+                 line,
+                 message.c_str()) = Message();
+    if (type == kFatal) {
+      posix::Abort();
+    }
+  }
+};
+
+// Returns the global failure reporter.  Will create a
+// GoogleTestFailureReporter and return it the first time called.
+GTEST_API_ FailureReporterInterface* GetFailureReporter() {
+  // Points to the global failure reporter used by Google Mock.  gcc
+  // guarantees that the following use of failure_reporter is
+  // thread-safe.  We may need to add additional synchronization to
+  // protect failure_reporter if we port Google Mock to other
+  // compilers.
+  static FailureReporterInterface* const failure_reporter =
+      new GoogleTestFailureReporter();
+  return failure_reporter;
+}
+
+// Protects global resources (stdout in particular) used by Log().
+static GTEST_DEFINE_STATIC_MUTEX_(g_log_mutex);
+
+// Returns true if and only if a log with the given severity is visible
+// according to the --gmock_verbose flag.
+GTEST_API_ bool LogIsVisible(LogSeverity severity) {
+  if (GMOCK_FLAG(verbose) == kInfoVerbosity) {
+    // Always show the log if --gmock_verbose=info.
+    return true;
+  } else if (GMOCK_FLAG(verbose) == kErrorVerbosity) {
+    // Always hide it if --gmock_verbose=error.
+    return false;
+  } else {
+    // If --gmock_verbose is neither "info" nor "error", we treat it
+    // as "warning" (its default value).
+    return severity == kWarning;
+  }
+}
+
+// Prints the given message to stdout if and only if 'severity' >= the level
+// specified by the --gmock_verbose flag.  If stack_frames_to_skip >=
+// 0, also prints the stack trace excluding the top
+// stack_frames_to_skip frames.  In opt mode, any positive
+// stack_frames_to_skip is treated as 0, since we don't know which
+// function calls will be inlined by the compiler and need to be
+// conservative.
+GTEST_API_ void Log(LogSeverity severity, const std::string& message,
+                    int stack_frames_to_skip) {
+  if (!LogIsVisible(severity))
+    return;
+
+  // Ensures that logs from different threads don't interleave.
+  MutexLock l(&g_log_mutex);
+
+  if (severity == kWarning) {
+    // Prints a GMOCK WARNING marker to make the warnings easily searchable.
+    std::cout << "\nGMOCK WARNING:";
+  }
+  // Pre-pends a new-line to message if it doesn't start with one.
+  if (message.empty() || message[0] != '\n') {
+    std::cout << "\n";
+  }
+  std::cout << message;
+  if (stack_frames_to_skip >= 0) {
+#ifdef NDEBUG
+    // In opt mode, we have to be conservative and skip no stack frame.
+    const int actual_to_skip = 0;
+#else
+    // In dbg mode, we can do what the caller tell us to do (plus one
+    // for skipping this function's stack frame).
+    const int actual_to_skip = stack_frames_to_skip + 1;
+#endif  // NDEBUG
+
+    // Appends a new-line to message if it doesn't end with one.
+    if (!message.empty() && *message.rbegin() != '\n') {
+      std::cout << "\n";
+    }
+    std::cout << "Stack trace:\n"
+         << ::testing::internal::GetCurrentOsStackTraceExceptTop(
+             ::testing::UnitTest::GetInstance(), actual_to_skip);
+  }
+  std::cout << ::std::flush;
+}
+
+GTEST_API_ WithoutMatchers GetWithoutMatchers() { return WithoutMatchers(); }
+
+GTEST_API_ void IllegalDoDefault(const char* file, int line) {
+  internal::Assert(
+      false, file, line,
+      "You are using DoDefault() inside a composite action like "
+      "DoAll() or WithArgs().  This is not supported for technical "
+      "reasons.  Please instead spell out the default action, or "
+      "assign the default action to an Action variable and use "
+      "the variable in various places.");
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-matchers.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-matchers.cc
new file mode 100644
index 0000000000..dded437add
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-matchers.cc
@@ -0,0 +1,459 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements Matcher<const string&>, Matcher<string>, and
+// utilities for defining matchers.
+
+#include "gmock/gmock-matchers.h"
+
+#include <string.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace testing {
+namespace internal {
+
+// Returns the description for a matcher defined using the MATCHER*()
+// macro where the user-supplied description string is "", if
+// 'negation' is false; otherwise returns the description of the
+// negation of the matcher.  'param_values' contains a list of strings
+// that are the print-out of the matcher's parameters.
+GTEST_API_ std::string FormatMatcherDescription(bool negation,
+                                                const char* matcher_name,
+                                                const Strings& param_values) {
+  std::string result = ConvertIdentifierNameToWords(matcher_name);
+  if (param_values.size() >= 1) result += " " + JoinAsTuple(param_values);
+  return negation ? "not (" + result + ")" : result;
+}
+
+// FindMaxBipartiteMatching and its helper class.
+//
+// Uses the well-known Ford-Fulkerson max flow method to find a maximum
+// bipartite matching. Flow is considered to be from left to right.
+// There is an implicit source node that is connected to all of the left
+// nodes, and an implicit sink node that is connected to all of the
+// right nodes. All edges have unit capacity.
+//
+// Neither the flow graph nor the residual flow graph are represented
+// explicitly. Instead, they are implied by the information in 'graph' and
+// a vector<int> called 'left_' whose elements are initialized to the
+// value kUnused. This represents the initial state of the algorithm,
+// where the flow graph is empty, and the residual flow graph has the
+// following edges:
+//   - An edge from source to each left_ node
+//   - An edge from each right_ node to sink
+//   - An edge from each left_ node to each right_ node, if the
+//     corresponding edge exists in 'graph'.
+//
+// When the TryAugment() method adds a flow, it sets left_[l] = r for some
+// nodes l and r. This induces the following changes:
+//   - The edges (source, l), (l, r), and (r, sink) are added to the
+//     flow graph.
+//   - The same three edges are removed from the residual flow graph.
+//   - The reverse edges (l, source), (r, l), and (sink, r) are added
+//     to the residual flow graph, which is a directional graph
+//     representing unused flow capacity.
+//
+// When the method augments a flow (moving left_[l] from some r1 to some
+// other r2), this can be thought of as "undoing" the above steps with
+// respect to r1 and "redoing" them with respect to r2.
+//
+// It bears repeating that the flow graph and residual flow graph are
+// never represented explicitly, but can be derived by looking at the
+// information in 'graph' and in left_.
+//
+// As an optimization, there is a second vector<int> called right_ which
+// does not provide any new information. Instead, it enables more
+// efficient queries about edges entering or leaving the right-side nodes
+// of the flow or residual flow graphs. The following invariants are
+// maintained:
+//
+// left[l] == kUnused or right[left[l]] == l
+// right[r] == kUnused or left[right[r]] == r
+//
+// . [ source ]                                        .
+// .   |||                                             .
+// .   |||                                             .
+// .   ||\--> left[0]=1  ---\    right[0]=-1 ----\     .
+// .   ||                   |                    |     .
+// .   |\---> left[1]=-1    \--> right[1]=0  ---\|     .
+// .   |                                        ||     .
+// .   \----> left[2]=2  ------> right[2]=2  --\||     .
+// .                                           |||     .
+// .         elements           matchers       vvv     .
+// .                                         [ sink ]  .
+//
+// See Also:
+//   [1] Cormen, et al (2001). "Section 26.2: The Ford-Fulkerson method".
+//       "Introduction to Algorithms (Second ed.)", pp. 651-664.
+//   [2] "Ford-Fulkerson algorithm", Wikipedia,
+//       'http://en.wikipedia.org/wiki/Ford%E2%80%93Fulkerson_algorithm'
+class MaxBipartiteMatchState {
+ public:
+  explicit MaxBipartiteMatchState(const MatchMatrix& graph)
+      : graph_(&graph),
+        left_(graph_->LhsSize(), kUnused),
+        right_(graph_->RhsSize(), kUnused) {}
+
+  // Returns the edges of a maximal match, each in the form {left, right}.
+  ElementMatcherPairs Compute() {
+    // 'seen' is used for path finding { 0: unseen, 1: seen }.
+    ::std::vector<char> seen;
+    // Searches the residual flow graph for a path from each left node to
+    // the sink in the residual flow graph, and if one is found, add flow
+    // to the graph. It's okay to search through the left nodes once. The
+    // edge from the implicit source node to each previously-visited left
+    // node will have flow if that left node has any path to the sink
+    // whatsoever. Subsequent augmentations can only add flow to the
+    // network, and cannot take away that previous flow unit from the source.
+    // Since the source-to-left edge can only carry one flow unit (or,
+    // each element can be matched to only one matcher), there is no need
+    // to visit the left nodes more than once looking for augmented paths.
+    // The flow is known to be possible or impossible by looking at the
+    // node once.
+    for (size_t ilhs = 0; ilhs < graph_->LhsSize(); ++ilhs) {
+      // Reset the path-marking vector and try to find a path from
+      // source to sink starting at the left_[ilhs] node.
+      GTEST_CHECK_(left_[ilhs] == kUnused)
+          << "ilhs: " << ilhs << ", left_[ilhs]: " << left_[ilhs];
+      // 'seen' initialized to 'graph_->RhsSize()' copies of 0.
+      seen.assign(graph_->RhsSize(), 0);
+      TryAugment(ilhs, &seen);
+    }
+    ElementMatcherPairs result;
+    for (size_t ilhs = 0; ilhs < left_.size(); ++ilhs) {
+      size_t irhs = left_[ilhs];
+      if (irhs == kUnused) continue;
+      result.push_back(ElementMatcherPair(ilhs, irhs));
+    }
+    return result;
+  }
+
+ private:
+  static const size_t kUnused = static_cast<size_t>(-1);
+
+  // Perform a depth-first search from left node ilhs to the sink.  If a
+  // path is found, flow is added to the network by linking the left and
+  // right vector elements corresponding each segment of the path.
+  // Returns true if a path to sink was found, which means that a unit of
+  // flow was added to the network. The 'seen' vector elements correspond
+  // to right nodes and are marked to eliminate cycles from the search.
+  //
+  // Left nodes will only be explored at most once because they
+  // are accessible from at most one right node in the residual flow
+  // graph.
+  //
+  // Note that left_[ilhs] is the only element of left_ that TryAugment will
+  // potentially transition from kUnused to another value. Any other
+  // left_ element holding kUnused before TryAugment will be holding it
+  // when TryAugment returns.
+  //
+  bool TryAugment(size_t ilhs, ::std::vector<char>* seen) {
+    for (size_t irhs = 0; irhs < graph_->RhsSize(); ++irhs) {
+      if ((*seen)[irhs]) continue;
+      if (!graph_->HasEdge(ilhs, irhs)) continue;
+      // There's an available edge from ilhs to irhs.
+      (*seen)[irhs] = 1;
+      // Next a search is performed to determine whether
+      // this edge is a dead end or leads to the sink.
+      //
+      // right_[irhs] == kUnused means that there is residual flow from
+      // right node irhs to the sink, so we can use that to finish this
+      // flow path and return success.
+      //
+      // Otherwise there is residual flow to some ilhs. We push flow
+      // along that path and call ourselves recursively to see if this
+      // ultimately leads to sink.
+      if (right_[irhs] == kUnused || TryAugment(right_[irhs], seen)) {
+        // Add flow from left_[ilhs] to right_[irhs].
+        left_[ilhs] = irhs;
+        right_[irhs] = ilhs;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  const MatchMatrix* graph_;  // not owned
+  // Each element of the left_ vector represents a left hand side node
+  // (i.e. an element) and each element of right_ is a right hand side
+  // node (i.e. a matcher). The values in the left_ vector indicate
+  // outflow from that node to a node on the right_ side. The values
+  // in the right_ indicate inflow, and specify which left_ node is
+  // feeding that right_ node, if any. For example, left_[3] == 1 means
+  // there's a flow from element #3 to matcher #1. Such a flow would also
+  // be redundantly represented in the right_ vector as right_[1] == 3.
+  // Elements of left_ and right_ are either kUnused or mutually
+  // referent. Mutually referent means that left_[right_[i]] = i and
+  // right_[left_[i]] = i.
+  ::std::vector<size_t> left_;
+  ::std::vector<size_t> right_;
+};
+
+const size_t MaxBipartiteMatchState::kUnused;
+
+GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g) {
+  return MaxBipartiteMatchState(g).Compute();
+}
+
+static void LogElementMatcherPairVec(const ElementMatcherPairs& pairs,
+                                     ::std::ostream* stream) {
+  typedef ElementMatcherPairs::const_iterator Iter;
+  ::std::ostream& os = *stream;
+  os << "{";
+  const char* sep = "";
+  for (Iter it = pairs.begin(); it != pairs.end(); ++it) {
+    os << sep << "\n  ("
+       << "element #" << it->first << ", "
+       << "matcher #" << it->second << ")";
+    sep = ",";
+  }
+  os << "\n}";
+}
+
+bool MatchMatrix::NextGraph() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      char& b = matched_[SpaceIndex(ilhs, irhs)];
+      if (!b) {
+        b = 1;
+        return true;
+      }
+      b = 0;
+    }
+  }
+  return false;
+}
+
+void MatchMatrix::Randomize() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      char& b = matched_[SpaceIndex(ilhs, irhs)];
+      b = static_cast<char>(rand() & 1);  // NOLINT
+    }
+  }
+}
+
+std::string MatchMatrix::DebugString() const {
+  ::std::stringstream ss;
+  const char* sep = "";
+  for (size_t i = 0; i < LhsSize(); ++i) {
+    ss << sep;
+    for (size_t j = 0; j < RhsSize(); ++j) {
+      ss << HasEdge(i, j);
+    }
+    sep = ";";
+  }
+  return ss.str();
+}
+
+void UnorderedElementsAreMatcherImplBase::DescribeToImpl(
+    ::std::ostream* os) const {
+  switch (match_flags()) {
+    case UnorderedMatcherRequire::ExactMatch:
+      if (matcher_describers_.empty()) {
+        *os << "is empty";
+        return;
+      }
+      if (matcher_describers_.size() == 1) {
+        *os << "has " << Elements(1) << " and that element ";
+        matcher_describers_[0]->DescribeTo(os);
+        return;
+      }
+      *os << "has " << Elements(matcher_describers_.size())
+          << " and there exists some permutation of elements such that:\n";
+      break;
+    case UnorderedMatcherRequire::Superset:
+      *os << "a surjection from elements to requirements exists such that:\n";
+      break;
+    case UnorderedMatcherRequire::Subset:
+      *os << "an injection from elements to requirements exists such that:\n";
+      break;
+  }
+
+  const char* sep = "";
+  for (size_t i = 0; i != matcher_describers_.size(); ++i) {
+    *os << sep;
+    if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+      *os << " - element #" << i << " ";
+    } else {
+      *os << " - an element ";
+    }
+    matcher_describers_[i]->DescribeTo(os);
+    if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+      sep = ", and\n";
+    } else {
+      sep = "\n";
+    }
+  }
+}
+
+void UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(
+    ::std::ostream* os) const {
+  switch (match_flags()) {
+    case UnorderedMatcherRequire::ExactMatch:
+      if (matcher_describers_.empty()) {
+        *os << "isn't empty";
+        return;
+      }
+      if (matcher_describers_.size() == 1) {
+        *os << "doesn't have " << Elements(1) << ", or has " << Elements(1)
+            << " that ";
+        matcher_describers_[0]->DescribeNegationTo(os);
+        return;
+      }
+      *os << "doesn't have " << Elements(matcher_describers_.size())
+          << ", or there exists no permutation of elements such that:\n";
+      break;
+    case UnorderedMatcherRequire::Superset:
+      *os << "no surjection from elements to requirements exists such that:\n";
+      break;
+    case UnorderedMatcherRequire::Subset:
+      *os << "no injection from elements to requirements exists such that:\n";
+      break;
+  }
+  const char* sep = "";
+  for (size_t i = 0; i != matcher_describers_.size(); ++i) {
+    *os << sep;
+    if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+      *os << " - element #" << i << " ";
+    } else {
+      *os << " - an element ";
+    }
+    matcher_describers_[i]->DescribeTo(os);
+    if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+      sep = ", and\n";
+    } else {
+      sep = "\n";
+    }
+  }
+}
+
+// Checks that all matchers match at least one element, and that all
+// elements match at least one matcher. This enables faster matching
+// and better error reporting.
+// Returns false, writing an explanation to 'listener', if and only
+// if the success criteria are not met.
+bool UnorderedElementsAreMatcherImplBase::VerifyMatchMatrix(
+    const ::std::vector<std::string>& element_printouts,
+    const MatchMatrix& matrix, MatchResultListener* listener) const {
+  bool result = true;
+  ::std::vector<char> element_matched(matrix.LhsSize(), 0);
+  ::std::vector<char> matcher_matched(matrix.RhsSize(), 0);
+
+  for (size_t ilhs = 0; ilhs < matrix.LhsSize(); ilhs++) {
+    for (size_t irhs = 0; irhs < matrix.RhsSize(); irhs++) {
+      char matched = matrix.HasEdge(ilhs, irhs);
+      element_matched[ilhs] |= matched;
+      matcher_matched[irhs] |= matched;
+    }
+  }
+
+  if (match_flags() & UnorderedMatcherRequire::Superset) {
+    const char* sep =
+        "where the following matchers don't match any elements:\n";
+    for (size_t mi = 0; mi < matcher_matched.size(); ++mi) {
+      if (matcher_matched[mi]) continue;
+      result = false;
+      if (listener->IsInterested()) {
+        *listener << sep << "matcher #" << mi << ": ";
+        matcher_describers_[mi]->DescribeTo(listener->stream());
+        sep = ",\n";
+      }
+    }
+  }
+
+  if (match_flags() & UnorderedMatcherRequire::Subset) {
+    const char* sep =
+        "where the following elements don't match any matchers:\n";
+    const char* outer_sep = "";
+    if (!result) {
+      outer_sep = "\nand ";
+    }
+    for (size_t ei = 0; ei < element_matched.size(); ++ei) {
+      if (element_matched[ei]) continue;
+      result = false;
+      if (listener->IsInterested()) {
+        *listener << outer_sep << sep << "element #" << ei << ": "
+                  << element_printouts[ei];
+        sep = ",\n";
+        outer_sep = "";
+      }
+    }
+  }
+  return result;
+}
+
+bool UnorderedElementsAreMatcherImplBase::FindPairing(
+    const MatchMatrix& matrix, MatchResultListener* listener) const {
+  ElementMatcherPairs matches = FindMaxBipartiteMatching(matrix);
+
+  size_t max_flow = matches.size();
+  if ((match_flags() & UnorderedMatcherRequire::Superset) &&
+      max_flow < matrix.RhsSize()) {
+    if (listener->IsInterested()) {
+      *listener << "where no permutation of the elements can satisfy all "
+                   "matchers, and the closest match is "
+                << max_flow << " of " << matrix.RhsSize()
+                << " matchers with the pairings:\n";
+      LogElementMatcherPairVec(matches, listener->stream());
+    }
+    return false;
+  }
+  if ((match_flags() & UnorderedMatcherRequire::Subset) &&
+      max_flow < matrix.LhsSize()) {
+    if (listener->IsInterested()) {
+      *listener
+          << "where not all elements can be matched, and the closest match is "
+          << max_flow << " of " << matrix.RhsSize()
+          << " matchers with the pairings:\n";
+      LogElementMatcherPairVec(matches, listener->stream());
+    }
+    return false;
+  }
+
+  if (matches.size() > 1) {
+    if (listener->IsInterested()) {
+      const char* sep = "where:\n";
+      for (size_t mi = 0; mi < matches.size(); ++mi) {
+        *listener << sep << " - element #" << matches[mi].first
+                  << " is matched by matcher #" << matches[mi].second;
+        sep = ",\n";
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
new file mode 100644
index 0000000000..c7266a3704
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
@@ -0,0 +1,908 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements the spec builder syntax (ON_CALL and
+// EXPECT_CALL).
+
+#include "gmock/gmock-spec-builders.h"
+
+#include <stdlib.h>
+
+#include <iostream>  // NOLINT
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_CYGWIN || GTEST_OS_LINUX || GTEST_OS_MAC
+# include <unistd.h>  // NOLINT
+#endif
+
+// Silence C4800 (C4800: 'int *const ': forcing value
+// to bool 'true' or 'false') for MSVC 15
+#ifdef _MSC_VER
+#if _MSC_VER == 1900
+#  pragma warning(push)
+#  pragma warning(disable:4800)
+#endif
+#endif
+
+namespace testing {
+namespace internal {
+
+// Protects the mock object registry (in class Mock), all function
+// mockers, and all expectations.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_gmock_mutex);
+
+// Logs a message including file and line number information.
+GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity,
+                                const char* file, int line,
+                                const std::string& message) {
+  ::std::ostringstream s;
+  s << internal::FormatFileLocation(file, line) << " " << message
+    << ::std::endl;
+  Log(severity, s.str(), 0);
+}
+
+// Constructs an ExpectationBase object.
+ExpectationBase::ExpectationBase(const char* a_file, int a_line,
+                                 const std::string& a_source_text)
+    : file_(a_file),
+      line_(a_line),
+      source_text_(a_source_text),
+      cardinality_specified_(false),
+      cardinality_(Exactly(1)),
+      call_count_(0),
+      retired_(false),
+      extra_matcher_specified_(false),
+      repeated_action_specified_(false),
+      retires_on_saturation_(false),
+      last_clause_(kNone),
+      action_count_checked_(false) {}
+
+// Destructs an ExpectationBase object.
+ExpectationBase::~ExpectationBase() {}
+
+// Explicitly specifies the cardinality of this expectation.  Used by
+// the subclasses to implement the .Times() clause.
+void ExpectationBase::SpecifyCardinality(const Cardinality& a_cardinality) {
+  cardinality_specified_ = true;
+  cardinality_ = a_cardinality;
+}
+
+// Retires all pre-requisites of this expectation.
+void ExpectationBase::RetireAllPreRequisites()
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  if (is_retired()) {
+    // We can take this short-cut as we never retire an expectation
+    // until we have retired all its pre-requisites.
+    return;
+  }
+
+  ::std::vector<ExpectationBase*> expectations(1, this);
+  while (!expectations.empty()) {
+    ExpectationBase* exp = expectations.back();
+    expectations.pop_back();
+
+    for (ExpectationSet::const_iterator it =
+             exp->immediate_prerequisites_.begin();
+         it != exp->immediate_prerequisites_.end(); ++it) {
+      ExpectationBase* next = it->expectation_base().get();
+      if (!next->is_retired()) {
+        next->Retire();
+        expectations.push_back(next);
+      }
+    }
+  }
+}
+
+// Returns true if and only if all pre-requisites of this expectation
+// have been satisfied.
+bool ExpectationBase::AllPrerequisitesAreSatisfied() const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+  ::std::vector<const ExpectationBase*> expectations(1, this);
+  while (!expectations.empty()) {
+    const ExpectationBase* exp = expectations.back();
+    expectations.pop_back();
+
+    for (ExpectationSet::const_iterator it =
+             exp->immediate_prerequisites_.begin();
+         it != exp->immediate_prerequisites_.end(); ++it) {
+      const ExpectationBase* next = it->expectation_base().get();
+      if (!next->IsSatisfied()) return false;
+      expectations.push_back(next);
+    }
+  }
+  return true;
+}
+
+// Adds unsatisfied pre-requisites of this expectation to 'result'.
+void ExpectationBase::FindUnsatisfiedPrerequisites(ExpectationSet* result) const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+  ::std::vector<const ExpectationBase*> expectations(1, this);
+  while (!expectations.empty()) {
+    const ExpectationBase* exp = expectations.back();
+    expectations.pop_back();
+
+    for (ExpectationSet::const_iterator it =
+             exp->immediate_prerequisites_.begin();
+         it != exp->immediate_prerequisites_.end(); ++it) {
+      const ExpectationBase* next = it->expectation_base().get();
+
+      if (next->IsSatisfied()) {
+        // If *it is satisfied and has a call count of 0, some of its
+        // pre-requisites may not be satisfied yet.
+        if (next->call_count_ == 0) {
+          expectations.push_back(next);
+        }
+      } else {
+        // Now that we know next is unsatisfied, we are not so interested
+        // in whether its pre-requisites are satisfied.  Therefore we
+        // don't iterate into it here.
+        *result += *it;
+      }
+    }
+  }
+}
+
+// Describes how many times a function call matching this
+// expectation has occurred.
+void ExpectationBase::DescribeCallCountTo(::std::ostream* os) const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+
+  // Describes how many times the function is expected to be called.
+  *os << "         Expected: to be ";
+  cardinality().DescribeTo(os);
+  *os << "\n           Actual: ";
+  Cardinality::DescribeActualCallCountTo(call_count(), os);
+
+  // Describes the state of the expectation (e.g. is it satisfied?
+  // is it active?).
+  *os << " - " << (IsOverSaturated() ? "over-saturated" :
+                   IsSaturated() ? "saturated" :
+                   IsSatisfied() ? "satisfied" : "unsatisfied")
+      << " and "
+      << (is_retired() ? "retired" : "active");
+}
+
+// Checks the action count (i.e. the number of WillOnce() and
+// WillRepeatedly() clauses) against the cardinality if this hasn't
+// been done before.  Prints a warning if there are too many or too
+// few actions.
+void ExpectationBase::CheckActionCountIfNotDone() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  bool should_check = false;
+  {
+    MutexLock l(&mutex_);
+    if (!action_count_checked_) {
+      action_count_checked_ = true;
+      should_check = true;
+    }
+  }
+
+  if (should_check) {
+    if (!cardinality_specified_) {
+      // The cardinality was inferred - no need to check the action
+      // count against it.
+      return;
+    }
+
+    // The cardinality was explicitly specified.
+    const int action_count = static_cast<int>(untyped_actions_.size());
+    const int upper_bound = cardinality().ConservativeUpperBound();
+    const int lower_bound = cardinality().ConservativeLowerBound();
+    bool too_many;  // True if there are too many actions, or false
+    // if there are too few.
+    if (action_count > upper_bound ||
+        (action_count == upper_bound && repeated_action_specified_)) {
+      too_many = true;
+    } else if (0 < action_count && action_count < lower_bound &&
+               !repeated_action_specified_) {
+      too_many = false;
+    } else {
+      return;
+    }
+
+    ::std::stringstream ss;
+    DescribeLocationTo(&ss);
+    ss << "Too " << (too_many ? "many" : "few")
+       << " actions specified in " << source_text() << "...\n"
+       << "Expected to be ";
+    cardinality().DescribeTo(&ss);
+    ss << ", but has " << (too_many ? "" : "only ")
+       << action_count << " WillOnce()"
+       << (action_count == 1 ? "" : "s");
+    if (repeated_action_specified_) {
+      ss << " and a WillRepeatedly()";
+    }
+    ss << ".";
+    Log(kWarning, ss.str(), -1);  // -1 means "don't print stack trace".
+  }
+}
+
+// Implements the .Times() clause.
+void ExpectationBase::UntypedTimes(const Cardinality& a_cardinality) {
+  if (last_clause_ == kTimes) {
+    ExpectSpecProperty(false,
+                       ".Times() cannot appear "
+                       "more than once in an EXPECT_CALL().");
+  } else {
+    ExpectSpecProperty(last_clause_ < kTimes,
+                       ".Times() cannot appear after "
+                       ".InSequence(), .WillOnce(), .WillRepeatedly(), "
+                       "or .RetiresOnSaturation().");
+  }
+  last_clause_ = kTimes;
+
+  SpecifyCardinality(a_cardinality);
+}
+
+// Points to the implicit sequence introduced by a living InSequence
+// object (if any) in the current thread or NULL.
+GTEST_API_ ThreadLocal<Sequence*> g_gmock_implicit_sequence;
+
+// Reports an uninteresting call (whose description is in msg) in the
+// manner specified by 'reaction'.
+void ReportUninterestingCall(CallReaction reaction, const std::string& msg) {
+  // Include a stack trace only if --gmock_verbose=info is specified.
+  const int stack_frames_to_skip =
+      GMOCK_FLAG(verbose) == kInfoVerbosity ? 3 : -1;
+  switch (reaction) {
+    case kAllow:
+      Log(kInfo, msg, stack_frames_to_skip);
+      break;
+    case kWarn:
+      Log(kWarning,
+          msg +
+              "\nNOTE: You can safely ignore the above warning unless this "
+              "call should not happen.  Do not suppress it by blindly adding "
+              "an EXPECT_CALL() if you don't mean to enforce the call.  "
+              "See "
+              "https://github.com/google/googletest/blob/master/docs/"
+              "gmock_cook_book.md#"
+              "knowing-when-to-expect for details.\n",
+          stack_frames_to_skip);
+      break;
+    default:  // FAIL
+      Expect(false, nullptr, -1, msg);
+  }
+}
+
+UntypedFunctionMockerBase::UntypedFunctionMockerBase()
+    : mock_obj_(nullptr), name_("") {}
+
+UntypedFunctionMockerBase::~UntypedFunctionMockerBase() {}
+
+// Sets the mock object this mock method belongs to, and registers
+// this information in the global mock registry.  Will be called
+// whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
+// method.
+void UntypedFunctionMockerBase::RegisterOwner(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  {
+    MutexLock l(&g_gmock_mutex);
+    mock_obj_ = mock_obj;
+  }
+  Mock::Register(mock_obj, this);
+}
+
+// Sets the mock object this mock method belongs to, and sets the name
+// of the mock function.  Will be called upon each invocation of this
+// mock function.
+void UntypedFunctionMockerBase::SetOwnerAndName(const void* mock_obj,
+                                                const char* name)
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  // We protect name_ under g_gmock_mutex in case this mock function
+  // is called from two threads concurrently.
+  MutexLock l(&g_gmock_mutex);
+  mock_obj_ = mock_obj;
+  name_ = name;
+}
+
+// Returns the name of the function being mocked.  Must be called
+// after RegisterOwner() or SetOwnerAndName() has been called.
+const void* UntypedFunctionMockerBase::MockObject() const
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  const void* mock_obj;
+  {
+    // We protect mock_obj_ under g_gmock_mutex in case this mock
+    // function is called from two threads concurrently.
+    MutexLock l(&g_gmock_mutex);
+    Assert(mock_obj_ != nullptr, __FILE__, __LINE__,
+           "MockObject() must not be called before RegisterOwner() or "
+           "SetOwnerAndName() has been called.");
+    mock_obj = mock_obj_;
+  }
+  return mock_obj;
+}
+
+// Returns the name of this mock method.  Must be called after
+// SetOwnerAndName() has been called.
+const char* UntypedFunctionMockerBase::Name() const
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  const char* name;
+  {
+    // We protect name_ under g_gmock_mutex in case this mock
+    // function is called from two threads concurrently.
+    MutexLock l(&g_gmock_mutex);
+    Assert(name_ != nullptr, __FILE__, __LINE__,
+           "Name() must not be called before SetOwnerAndName() has "
+           "been called.");
+    name = name_;
+  }
+  return name;
+}
+
+// Calculates the result of invoking this mock function with the given
+// arguments, prints it, and returns it.  The caller is responsible
+// for deleting the result.
+UntypedActionResultHolderBase* UntypedFunctionMockerBase::UntypedInvokeWith(
+    void* const untyped_args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  // See the definition of untyped_expectations_ for why access to it
+  // is unprotected here.
+  if (untyped_expectations_.size() == 0) {
+    // No expectation is set on this mock method - we have an
+    // uninteresting call.
+
+    // We must get Google Mock's reaction on uninteresting calls
+    // made on this mock object BEFORE performing the action,
+    // because the action may DELETE the mock object and make the
+    // following expression meaningless.
+    const CallReaction reaction =
+        Mock::GetReactionOnUninterestingCalls(MockObject());
+
+    // True if and only if we need to print this call's arguments and return
+    // value.  This definition must be kept in sync with
+    // the behavior of ReportUninterestingCall().
+    const bool need_to_report_uninteresting_call =
+        // If the user allows this uninteresting call, we print it
+        // only when they want informational messages.
+        reaction == kAllow ? LogIsVisible(kInfo) :
+                           // If the user wants this to be a warning, we print
+                           // it only when they want to see warnings.
+            reaction == kWarn
+                ? LogIsVisible(kWarning)
+                :
+                // Otherwise, the user wants this to be an error, and we
+                // should always print detailed information in the error.
+                true;
+
+    if (!need_to_report_uninteresting_call) {
+      // Perform the action without printing the call information.
+      return this->UntypedPerformDefaultAction(
+          untyped_args, "Function call: " + std::string(Name()));
+    }
+
+    // Warns about the uninteresting call.
+    ::std::stringstream ss;
+    this->UntypedDescribeUninterestingCall(untyped_args, &ss);
+
+    // Calculates the function result.
+    UntypedActionResultHolderBase* const result =
+        this->UntypedPerformDefaultAction(untyped_args, ss.str());
+
+    // Prints the function result.
+    if (result != nullptr) result->PrintAsActionResult(&ss);
+
+    ReportUninterestingCall(reaction, ss.str());
+    return result;
+  }
+
+  bool is_excessive = false;
+  ::std::stringstream ss;
+  ::std::stringstream why;
+  ::std::stringstream loc;
+  const void* untyped_action = nullptr;
+
+  // The UntypedFindMatchingExpectation() function acquires and
+  // releases g_gmock_mutex.
+
+  const ExpectationBase* const untyped_expectation =
+      this->UntypedFindMatchingExpectation(untyped_args, &untyped_action,
+                                           &is_excessive, &ss, &why);
+  const bool found = untyped_expectation != nullptr;
+
+  // True if and only if we need to print the call's arguments
+  // and return value.
+  // This definition must be kept in sync with the uses of Expect()
+  // and Log() in this function.
+  const bool need_to_report_call =
+      !found || is_excessive || LogIsVisible(kInfo);
+  if (!need_to_report_call) {
+    // Perform the action without printing the call information.
+    return untyped_action == nullptr
+               ? this->UntypedPerformDefaultAction(untyped_args, "")
+               : this->UntypedPerformAction(untyped_action, untyped_args);
+  }
+
+  ss << "    Function call: " << Name();
+  this->UntypedPrintArgs(untyped_args, &ss);
+
+  // In case the action deletes a piece of the expectation, we
+  // generate the message beforehand.
+  if (found && !is_excessive) {
+    untyped_expectation->DescribeLocationTo(&loc);
+  }
+
+  UntypedActionResultHolderBase* result = nullptr;
+
+  auto perform_action = [&] {
+    return untyped_action == nullptr
+               ? this->UntypedPerformDefaultAction(untyped_args, ss.str())
+               : this->UntypedPerformAction(untyped_action, untyped_args);
+  };
+  auto handle_failures = [&] {
+    ss << "\n" << why.str();
+
+    if (!found) {
+      // No expectation matches this call - reports a failure.
+      Expect(false, nullptr, -1, ss.str());
+    } else if (is_excessive) {
+      // We had an upper-bound violation and the failure message is in ss.
+      Expect(false, untyped_expectation->file(), untyped_expectation->line(),
+             ss.str());
+    } else {
+      // We had an expected call and the matching expectation is
+      // described in ss.
+      Log(kInfo, loc.str() + ss.str(), 2);
+    }
+  };
+#if GTEST_HAS_EXCEPTIONS
+  try {
+    result = perform_action();
+  } catch (...) {
+    handle_failures();
+    throw;
+  }
+#else
+  result = perform_action();
+#endif
+
+  if (result != nullptr) result->PrintAsActionResult(&ss);
+  handle_failures();
+  return result;
+}
+
+// Returns an Expectation object that references and co-owns exp,
+// which must be an expectation on this mock function.
+Expectation UntypedFunctionMockerBase::GetHandleOf(ExpectationBase* exp) {
+  // See the definition of untyped_expectations_ for why access to it
+  // is unprotected here.
+  for (UntypedExpectations::const_iterator it =
+           untyped_expectations_.begin();
+       it != untyped_expectations_.end(); ++it) {
+    if (it->get() == exp) {
+      return Expectation(*it);
+    }
+  }
+
+  Assert(false, __FILE__, __LINE__, "Cannot find expectation.");
+  return Expectation();
+  // The above statement is just to make the code compile, and will
+  // never be executed.
+}
+
+// Verifies that all expectations on this mock function have been
+// satisfied.  Reports one or more Google Test non-fatal failures
+// and returns false if not.
+bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked()
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+  bool expectations_met = true;
+  for (UntypedExpectations::const_iterator it =
+           untyped_expectations_.begin();
+       it != untyped_expectations_.end(); ++it) {
+    ExpectationBase* const untyped_expectation = it->get();
+    if (untyped_expectation->IsOverSaturated()) {
+      // There was an upper-bound violation.  Since the error was
+      // already reported when it occurred, there is no need to do
+      // anything here.
+      expectations_met = false;
+    } else if (!untyped_expectation->IsSatisfied()) {
+      expectations_met = false;
+      ::std::stringstream ss;
+      ss  << "Actual function call count doesn't match "
+          << untyped_expectation->source_text() << "...\n";
+      // No need to show the source file location of the expectation
+      // in the description, as the Expect() call that follows already
+      // takes care of it.
+      untyped_expectation->MaybeDescribeExtraMatcherTo(&ss);
+      untyped_expectation->DescribeCallCountTo(&ss);
+      Expect(false, untyped_expectation->file(),
+             untyped_expectation->line(), ss.str());
+    }
+  }
+
+  // Deleting our expectations may trigger other mock objects to be deleted, for
+  // example if an action contains a reference counted smart pointer to that
+  // mock object, and that is the last reference. So if we delete our
+  // expectations within the context of the global mutex we may deadlock when
+  // this method is called again. Instead, make a copy of the set of
+  // expectations to delete, clear our set within the mutex, and then clear the
+  // copied set outside of it.
+  UntypedExpectations expectations_to_delete;
+  untyped_expectations_.swap(expectations_to_delete);
+
+  g_gmock_mutex.Unlock();
+  expectations_to_delete.clear();
+  g_gmock_mutex.Lock();
+
+  return expectations_met;
+}
+
+CallReaction intToCallReaction(int mock_behavior) {
+  if (mock_behavior >= kAllow && mock_behavior <= kFail) {
+    return static_cast<internal::CallReaction>(mock_behavior);
+  }
+  return kWarn;
+}
+
+}  // namespace internal
+
+// Class Mock.
+
+namespace {
+
+typedef std::set<internal::UntypedFunctionMockerBase*> FunctionMockers;
+
+// The current state of a mock object.  Such information is needed for
+// detecting leaked mock objects and explicitly verifying a mock's
+// expectations.
+struct MockObjectState {
+  MockObjectState()
+      : first_used_file(nullptr), first_used_line(-1), leakable(false) {}
+
+  // Where in the source file an ON_CALL or EXPECT_CALL is first
+  // invoked on this mock object.
+  const char* first_used_file;
+  int first_used_line;
+  ::std::string first_used_test_suite;
+  ::std::string first_used_test;
+  bool leakable;  // true if and only if it's OK to leak the object.
+  FunctionMockers function_mockers;  // All registered methods of the object.
+};
+
+// A global registry holding the state of all mock objects that are
+// alive.  A mock object is added to this registry the first time
+// Mock::AllowLeak(), ON_CALL(), or EXPECT_CALL() is called on it.  It
+// is removed from the registry in the mock object's destructor.
+class MockObjectRegistry {
+ public:
+  // Maps a mock object (identified by its address) to its state.
+  typedef std::map<const void*, MockObjectState> StateMap;
+
+  // This destructor will be called when a program exits, after all
+  // tests in it have been run.  By then, there should be no mock
+  // object alive.  Therefore we report any living object as test
+  // failure, unless the user explicitly asked us to ignore it.
+  ~MockObjectRegistry() {
+    if (!GMOCK_FLAG(catch_leaked_mocks))
+      return;
+
+    int leaked_count = 0;
+    for (StateMap::const_iterator it = states_.begin(); it != states_.end();
+         ++it) {
+      if (it->second.leakable)  // The user said it's fine to leak this object.
+        continue;
+
+      // FIXME: Print the type of the leaked object.
+      // This can help the user identify the leaked object.
+      std::cout << "\n";
+      const MockObjectState& state = it->second;
+      std::cout << internal::FormatFileLocation(state.first_used_file,
+                                                state.first_used_line);
+      std::cout << " ERROR: this mock object";
+      if (state.first_used_test != "") {
+        std::cout << " (used in test " << state.first_used_test_suite << "."
+                  << state.first_used_test << ")";
+      }
+      std::cout << " should be deleted but never is. Its address is @"
+           << it->first << ".";
+      leaked_count++;
+    }
+    if (leaked_count > 0) {
+      std::cout << "\nERROR: " << leaked_count << " leaked mock "
+                << (leaked_count == 1 ? "object" : "objects")
+                << " found at program exit. Expectations on a mock object are "
+                   "verified when the object is destructed. Leaking a mock "
+                   "means that its expectations aren't verified, which is "
+                   "usually a test bug. If you really intend to leak a mock, "
+                   "you can suppress this error using "
+                   "testing::Mock::AllowLeak(mock_object), or you may use a "
+                   "fake or stub instead of a mock.\n";
+      std::cout.flush();
+      ::std::cerr.flush();
+      // RUN_ALL_TESTS() has already returned when this destructor is
+      // called.  Therefore we cannot use the normal Google Test
+      // failure reporting mechanism.
+      _exit(1);  // We cannot call exit() as it is not reentrant and
+                 // may already have been called.
+    }
+  }
+
+  StateMap& states() { return states_; }
+
+ private:
+  StateMap states_;
+};
+
+// Protected by g_gmock_mutex.
+MockObjectRegistry g_mock_object_registry;
+
+// Maps a mock object to the reaction Google Mock should have when an
+// uninteresting method is called.  Protected by g_gmock_mutex.
+std::map<const void*, internal::CallReaction> g_uninteresting_call_reaction;
+
+// Sets the reaction Google Mock should have when an uninteresting
+// method of the given mock object is called.
+void SetReactionOnUninterestingCalls(const void* mock_obj,
+                                     internal::CallReaction reaction)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  g_uninteresting_call_reaction[mock_obj] = reaction;
+}
+
+}  // namespace
+
+// Tells Google Mock to allow uninteresting calls on the given mock
+// object.
+void Mock::AllowUninterestingCalls(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  SetReactionOnUninterestingCalls(mock_obj, internal::kAllow);
+}
+
+// Tells Google Mock to warn the user about uninteresting calls on the
+// given mock object.
+void Mock::WarnUninterestingCalls(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  SetReactionOnUninterestingCalls(mock_obj, internal::kWarn);
+}
+
+// Tells Google Mock to fail uninteresting calls on the given mock
+// object.
+void Mock::FailUninterestingCalls(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  SetReactionOnUninterestingCalls(mock_obj, internal::kFail);
+}
+
+// Tells Google Mock the given mock object is being destroyed and its
+// entry in the call-reaction table should be removed.
+void Mock::UnregisterCallReaction(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  g_uninteresting_call_reaction.erase(mock_obj);
+}
+
+// Returns the reaction Google Mock will have on uninteresting calls
+// made on the given mock object.
+internal::CallReaction Mock::GetReactionOnUninterestingCalls(
+    const void* mock_obj)
+        GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  return (g_uninteresting_call_reaction.count(mock_obj) == 0) ?
+      internal::intToCallReaction(GMOCK_FLAG(default_mock_behavior)) :
+      g_uninteresting_call_reaction[mock_obj];
+}
+
+// Tells Google Mock to ignore mock_obj when checking for leaked mock
+// objects.
+void Mock::AllowLeak(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  g_mock_object_registry.states()[mock_obj].leakable = true;
+}
+
+// Verifies and clears all expectations on the given mock object.  If
+// the expectations aren't satisfied, generates one or more Google
+// Test non-fatal failures and returns false.
+bool Mock::VerifyAndClearExpectations(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  return VerifyAndClearExpectationsLocked(mock_obj);
+}
+
+// Verifies all expectations on the given mock object and clears its
+// default actions and expectations.  Returns true if and only if the
+// verification was successful.
+bool Mock::VerifyAndClear(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  ClearDefaultActionsLocked(mock_obj);
+  return VerifyAndClearExpectationsLocked(mock_obj);
+}
+
+// Verifies and clears all expectations on the given mock object.  If
+// the expectations aren't satisfied, generates one or more Google
+// Test non-fatal failures and returns false.
+bool Mock::VerifyAndClearExpectationsLocked(void* mock_obj)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+  if (g_mock_object_registry.states().count(mock_obj) == 0) {
+    // No EXPECT_CALL() was set on the given mock object.
+    return true;
+  }
+
+  // Verifies and clears the expectations on each mock method in the
+  // given mock object.
+  bool expectations_met = true;
+  FunctionMockers& mockers =
+      g_mock_object_registry.states()[mock_obj].function_mockers;
+  for (FunctionMockers::const_iterator it = mockers.begin();
+       it != mockers.end(); ++it) {
+    if (!(*it)->VerifyAndClearExpectationsLocked()) {
+      expectations_met = false;
+    }
+  }
+
+  // We don't clear the content of mockers, as they may still be
+  // needed by ClearDefaultActionsLocked().
+  return expectations_met;
+}
+
+bool Mock::IsNaggy(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kWarn;
+}
+bool Mock::IsNice(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kAllow;
+}
+bool Mock::IsStrict(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kFail;
+}
+
+// Registers a mock object and a mock method it owns.
+void Mock::Register(const void* mock_obj,
+                    internal::UntypedFunctionMockerBase* mocker)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  g_mock_object_registry.states()[mock_obj].function_mockers.insert(mocker);
+}
+
+// Tells Google Mock where in the source code mock_obj is used in an
+// ON_CALL or EXPECT_CALL.  In case mock_obj is leaked, this
+// information helps the user identify which object it is.
+void Mock::RegisterUseByOnCallOrExpectCall(const void* mock_obj,
+                                           const char* file, int line)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock l(&internal::g_gmock_mutex);
+  MockObjectState& state = g_mock_object_registry.states()[mock_obj];
+  if (state.first_used_file == nullptr) {
+    state.first_used_file = file;
+    state.first_used_line = line;
+    const TestInfo* const test_info =
+        UnitTest::GetInstance()->current_test_info();
+    if (test_info != nullptr) {
+      state.first_used_test_suite = test_info->test_suite_name();
+      state.first_used_test = test_info->name();
+    }
+  }
+}
+
+// Unregisters a mock method; removes the owning mock object from the
+// registry when the last mock method associated with it has been
+// unregistered.  This is called only in the destructor of
+// FunctionMockerBase.
+void Mock::UnregisterLocked(internal::UntypedFunctionMockerBase* mocker)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+  for (MockObjectRegistry::StateMap::iterator it =
+           g_mock_object_registry.states().begin();
+       it != g_mock_object_registry.states().end(); ++it) {
+    FunctionMockers& mockers = it->second.function_mockers;
+    if (mockers.erase(mocker) > 0) {
+      // mocker was in mockers and has been just removed.
+      if (mockers.empty()) {
+        g_mock_object_registry.states().erase(it);
+      }
+      return;
+    }
+  }
+}
+
+// Clears all ON_CALL()s set on the given mock object.
+void Mock::ClearDefaultActionsLocked(void* mock_obj)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+
+  if (g_mock_object_registry.states().count(mock_obj) == 0) {
+    // No ON_CALL() was set on the given mock object.
+    return;
+  }
+
+  // Clears the default actions for each mock method in the given mock
+  // object.
+  FunctionMockers& mockers =
+      g_mock_object_registry.states()[mock_obj].function_mockers;
+  for (FunctionMockers::const_iterator it = mockers.begin();
+       it != mockers.end(); ++it) {
+    (*it)->ClearDefaultActionsLocked();
+  }
+
+  // We don't clear the content of mockers, as they may still be
+  // needed by VerifyAndClearExpectationsLocked().
+}
+
+Expectation::Expectation() {}
+
+Expectation::Expectation(
+    const std::shared_ptr<internal::ExpectationBase>& an_expectation_base)
+    : expectation_base_(an_expectation_base) {}
+
+Expectation::~Expectation() {}
+
+// Adds an expectation to a sequence.
+void Sequence::AddExpectation(const Expectation& expectation) const {
+  if (*last_expectation_ != expectation) {
+    if (last_expectation_->expectation_base() != nullptr) {
+      expectation.expectation_base()->immediate_prerequisites_
+          += *last_expectation_;
+    }
+    *last_expectation_ = expectation;
+  }
+}
+
+// Creates the implicit sequence if there isn't one.
+InSequence::InSequence() {
+  if (internal::g_gmock_implicit_sequence.get() == nullptr) {
+    internal::g_gmock_implicit_sequence.set(new Sequence);
+    sequence_created_ = true;
+  } else {
+    sequence_created_ = false;
+  }
+}
+
+// Deletes the implicit sequence if it was created by the constructor
+// of this object.
+InSequence::~InSequence() {
+  if (sequence_created_) {
+    delete internal::g_gmock_implicit_sequence.get();
+    internal::g_gmock_implicit_sequence.set(nullptr);
+  }
+}
+
+}  // namespace testing
+
+#ifdef _MSC_VER
+#if _MSC_VER == 1900
+#  pragma warning(pop)
+#endif
+#endif
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock.cc
new file mode 100644
index 0000000000..7bcdb0ba2d
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock.cc
@@ -0,0 +1,213 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#include "gmock/gmock.h"
+#include "gmock/internal/gmock-port.h"
+
+namespace testing {
+
+GMOCK_DEFINE_bool_(catch_leaked_mocks, true,
+                   "true if and only if Google Mock should report leaked "
+                   "mock objects as failures.");
+
+GMOCK_DEFINE_string_(verbose, internal::kWarningVerbosity,
+                     "Controls how verbose Google Mock's output is."
+                     "  Valid values:\n"
+                     "  info    - prints all messages.\n"
+                     "  warning - prints warnings and errors.\n"
+                     "  error   - prints errors only.");
+
+GMOCK_DEFINE_int32_(default_mock_behavior, 1,
+                    "Controls the default behavior of mocks."
+                    "  Valid values:\n"
+                    "  0 - by default, mocks act as NiceMocks.\n"
+                    "  1 - by default, mocks act as NaggyMocks.\n"
+                    "  2 - by default, mocks act as StrictMocks.");
+
+namespace internal {
+
+// Parses a string as a command line flag.  The string should have the
+// format "--gmock_flag=value".  When def_optional is true, the
+// "=value" part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+static const char* ParseGoogleMockFlagValue(const char* str,
+                                            const char* flag,
+                                            bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // The flag must start with "--gmock_".
+  const std::string flag_str = std::string("--gmock_") + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a Google Mock bool flag, in the form of
+// "--gmock_flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+static bool ParseGoogleMockBoolFlag(const char* str, const char* flag,
+                                    bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseGoogleMockFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for a Google Mock string flag, in the form of
+// "--gmock_flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+template <typename String>
+static bool ParseGoogleMockStringFlag(const char* str, const char* flag,
+                                      String* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseGoogleMockFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+static bool ParseGoogleMockIntFlag(const char* str, const char* flag,
+                                   int32_t* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseGoogleMockFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
+}
+
+// The internal implementation of InitGoogleMock().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleMockImpl(int* argc, CharType** argv) {
+  // Makes sure Google Test is initialized.  InitGoogleTest() is
+  // idempotent, so it's fine if the user has already called it.
+  InitGoogleTest(argc, argv);
+  if (*argc <= 0) return;
+
+  for (int i = 1; i != *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    // Do we see a Google Mock flag?
+    if (ParseGoogleMockBoolFlag(arg, "catch_leaked_mocks",
+                                &GMOCK_FLAG(catch_leaked_mocks)) ||
+        ParseGoogleMockStringFlag(arg, "verbose", &GMOCK_FLAG(verbose)) ||
+        ParseGoogleMockIntFlag(arg, "default_mock_behavior",
+                               &GMOCK_FLAG(default_mock_behavior))) {
+      // Yes.  Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+}
+
+}  // namespace internal
+
+// Initializes Google Mock.  This must be called before running the
+// tests.  In particular, it parses a command line for the flags that
+// Google Mock recognizes.  Whenever a Google Mock flag is seen, it is
+// removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags, if that hasn't
+// been done.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv) {
+  internal::InitGoogleMockImpl(argc, argv);
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv) {
+  internal::InitGoogleMockImpl(argc, argv);
+}
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleMock() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
+  int argc = 1;
+  const auto arg0 = "dummy";
+  char* argv0 = const_cast<char*>(arg0);
+  char** argv = &argv0;
+
+  internal::InitGoogleMockImpl(&argc, argv);
+}
+
+}  // namespace testing
diff --git a/media/libaom/src/third_party/googletest/src/googlemock/src/gmock_main.cc b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock_main.cc
new file mode 100644
index 0000000000..18c500f663
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googlemock/src/gmock_main.cc
@@ -0,0 +1,72 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#include <iostream>
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() {
+  // Since Google Mock depends on Google Test, InitGoogleMock() is
+  // also responsible for initializing Google Test.  Therefore there's
+  // no need for calling testing::InitGoogleTest() separately.
+  testing::InitGoogleMock();
+}
+void loop() { RUN_ALL_TESTS(); }
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
+// MS C++ compiler/linker has a bug on Windows (not on Windows CE), which
+// causes a link error when _tmain is defined in a static library and UNICODE
+// is enabled. For this reason instead of _tmain, main function is used on
+// Windows. See the following link to track the current status of this bug:
+// https://web.archive.org/web/20170912203238/connect.microsoft.com/VisualStudio/feedback/details/394464/wmain-link-error-in-the-static-library
+// // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE
+# include <tchar.h>  // NOLINT
+
+GTEST_API_ int _tmain(int argc, TCHAR** argv) {
+#else
+GTEST_API_ int main(int argc, char** argv) {
+#endif  // GTEST_OS_WINDOWS_MOBILE
+  std::cout << "Running main() from gmock_main.cc\n";
+  // Since Google Mock depends on Google Test, InitGoogleMock() is
+  // also responsible for initializing Google Test.  Therefore there's
+  // no need for calling testing::InitGoogleTest() separately.
+  testing::InitGoogleMock(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif
diff --git a/media/libaom/src/third_party/googletest/src/googletest/CHANGES b/media/libaom/src/third_party/googletest/src/googletest/CHANGES
deleted file mode 100644
index 0552132421..0000000000
--- a/media/libaom/src/third_party/googletest/src/googletest/CHANGES
+++ /dev/null
@@ -1,157 +0,0 @@
-Changes for 1.7.0:
-
-* New feature: death tests are supported on OpenBSD and in iOS
-  simulator now.
-* New feature: Google Test now implements a protocol to allow
-  a test runner to detect that a test program has exited
-  prematurely and report it as a failure (before it would be
-  falsely reported as a success if the exit code is 0).
-* New feature: Test::RecordProperty() can now be used outside of the
-  lifespan of a test method, in which case it will be attributed to
-  the current test case or the test program in the XML report.
-* New feature (potentially breaking): --gtest_list_tests now prints
-  the type parameters and value parameters for each test.
-* Improvement: char pointers and char arrays are now escaped properly
-  in failure messages.
-* Improvement: failure summary in XML reports now includes file and
-  line information.
-* Improvement: the <testsuites> XML element now has a timestamp attribute.
-* Improvement: When --gtest_filter is specified, XML report now doesn't
-  contain information about tests that are filtered out.
-* Fixed the bug where long --gtest_filter flag values are truncated in
-  death tests.
-* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
-  function instead of a macro in order to work better with Clang.
-* Compatibility fixes with C++ 11 and various platforms.
-* Bug/warning fixes.
-
-Changes for 1.6.0:
-
-* New feature: ADD_FAILURE_AT() for reporting a test failure at the
-  given source location -- useful for writing testing utilities.
-* New feature: the universal value printer is moved from Google Mock
-  to Google Test.
-* New feature: type parameters and value parameters are reported in
-  the XML report now.
-* A gtest_disable_pthreads CMake option.
-* Colored output works in GNU Screen sessions now.
-* Parameters of value-parameterized tests are now printed in the
-  textual output.
-* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
-  now correctly reported.
-* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
-  ostream.
-* More complete handling of exceptions.
-* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
-  name is already used by another library.
-* --gtest_catch_exceptions is now true by default, allowing a test
-  program to continue after an exception is thrown.
-* Value-parameterized test fixtures can now derive from Test and
-  WithParamInterface<T> separately, easing conversion of legacy tests.
-* Death test messages are clearly marked to make them more
-  distinguishable from other messages.
-* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
-  PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
-  IBM XL C++ (Visual Age C++), and C++0x.
-* Bug fixes and implementation clean-ups.
-* Potentially incompatible changes: disables the harmful 'make install'
-  command in autotools.
-
-Changes for 1.5.0:
-
- * New feature: assertions can be safely called in multiple threads
-   where the pthreads library is available.
- * New feature: predicates used inside EXPECT_TRUE() and friends
-   can now generate custom failure messages.
- * New feature: Google Test can now be compiled as a DLL.
- * New feature: fused source files are included.
- * New feature: prints help when encountering unrecognized Google Test flags.
- * Experimental feature: CMake build script (requires CMake 2.6.4+).
- * Experimental feature: the Pump script for meta programming.
- * double values streamed to an assertion are printed with enough precision
-   to differentiate any two different values.
- * Google Test now works on Solaris and AIX.
- * Build and test script improvements.
- * Bug fixes and implementation clean-ups.
-
- Potentially breaking changes:
-
- * Stopped supporting VC++ 7.1 with exceptions disabled.
- * Dropped support for 'make install'.
-
-Changes for 1.4.0:
-
- * New feature: the event listener API
- * New feature: test shuffling
- * New feature: the XML report format is closer to junitreport and can
-   be parsed by Hudson now.
- * New feature: when a test runs under Visual Studio, its failures are
-   integrated in the IDE.
- * New feature: /MD(d) versions of VC++ projects.
- * New feature: elapsed time for the tests is printed by default.
- * New feature: comes with a TR1 tuple implementation such that Boost
-   is no longer needed for Combine().
- * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
- * New feature: the Xcode project can now produce static gtest
-   libraries in addition to a framework.
- * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
-   Symbian, gcc, and C++Builder.
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.3.0:
-
- * New feature: death tests on Windows, Cygwin, and Mac.
- * New feature: ability to use Google Test assertions in other testing
-   frameworks.
- * New feature: ability to run disabled test via
-   --gtest_also_run_disabled_tests.
- * New feature: the --help flag for printing the usage.
- * New feature: access to Google Test flag values in user code.
- * New feature: a script that packs Google Test into one .h and one
-   .cc file for easy deployment.
- * New feature: support for distributing test functions to multiple
-   machines (requires support from the test runner).
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.2.1:
-
- * Compatibility fixes for Linux IA-64 and IBM z/OS.
- * Added support for using Boost and other TR1 implementations.
- * Changes to the build scripts to support upcoming release of Google C++
-   Mocking Framework.
- * Added Makefile to the distribution package.
- * Improved build instructions in README.
-
-Changes for 1.2.0:
-
- * New feature: value-parameterized tests.
- * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
-   macros.
- * Changed the XML report format to match JUnit/Ant's.
- * Added tests to the Xcode project.
- * Added scons/SConscript for building with SCons.
- * Added src/gtest-all.cc for building Google Test from a single file.
- * Fixed compatibility with Solaris and z/OS.
- * Enabled running Python tests on systems with python 2.3 installed,
-   e.g. Mac OS X 10.4.
- * Bug fixes.
-
-Changes for 1.1.0:
-
- * New feature: type-parameterized tests.
- * New feature: exception assertions.
- * New feature: printing elapsed time of tests.
- * Improved the robustness of death tests.
- * Added an Xcode project and samples.
- * Adjusted the output format on Windows to be understandable by Visual Studio.
- * Minor bug fixes.
-
-Changes for 1.0.1:
-
- * Added project files for Visual Studio 7.1.
- * Fixed issues with compiling on Mac OS X.
- * Fixed issues with compiling on Cygwin.
-
-Changes for 1.0.0:
-
- * Initial Open Source release of Google Test
diff --git a/media/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt b/media/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt
index 9ee79408c2..abdd98b79a 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt
+++ b/media/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt
@@ -1,4 +1,7 @@
 ########################################################################
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+#
 # CMake build script for Google Test.
 #
 # To run the tests for Google Test itself on Linux, use 'make test' or
@@ -40,13 +43,17 @@ endif()
 # as ${gtest_SOURCE_DIR} and to the root binary directory as
 # ${gtest_BINARY_DIR}.
 # Language "C" is required for find_package(Threads).
+
+# Project version:
+
 if (CMAKE_VERSION VERSION_LESS 3.0)
   project(gtest CXX C)
+  set(PROJECT_VERSION ${GOOGLETEST_VERSION})
 else()
   cmake_policy(SET CMP0048 NEW)
   project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
 endif()
-cmake_minimum_required(VERSION 2.6.4)
+cmake_minimum_required(VERSION 2.8.12)
 
 if (POLICY CMP0063) # Visibility
   cmake_policy(SET CMP0063 NEW)
@@ -85,15 +92,18 @@ include(cmake/internal_utils.cmake)
 
 config_compiler_and_linker()  # Defined in internal_utils.cmake.
 
+# Needed to set the namespace for both the export targets and the
+# alias libraries
+set(cmake_package_name GTest CACHE INTERNAL "")
+
 # Create the CMake package file descriptors.
 if (INSTALL_GTEST)
   include(CMakePackageConfigHelpers)
-  set(cmake_package_name GTest)
   set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "")
   set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "")
   set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}")
   set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake")
-  write_basic_package_version_file(${version_file} COMPATIBILITY AnyNewerVersion)
+  write_basic_package_version_file(${version_file} VERSION ${GOOGLETEST_VERSION} COMPATIBILITY AnyNewerVersion)
   install(EXPORT ${targets_export_name}
     NAMESPACE ${cmake_package_name}::
     DESTINATION ${cmake_files_install_dir})
@@ -110,18 +120,6 @@ set(gtest_build_include_dirs
   "${gtest_SOURCE_DIR}")
 include_directories(${gtest_build_include_dirs})
 
-# Summary of tuple support for Microsoft Visual Studio:
-# Compiler    version(MS)  version(cmake)  Support
-# ----------  -----------  --------------  -----------------------------
-# <= VS 2010  <= 10        <= 1600         Use Google Tests's own tuple.
-# VS 2012     11           1700            std::tr1::tuple + _VARIADIC_MAX=10
-# VS 2013     12           1800            std::tr1::tuple
-# VS 2015     14           1900            std::tuple
-# VS 2017     15           >= 1910         std::tuple
-if (MSVC AND MSVC_VERSION EQUAL 1700)
-  add_definitions(/D _VARIADIC_MAX=10)
-endif()
-
 ########################################################################
 #
 # Defines the gtest & gtest_main libraries.  User tests should link
@@ -131,7 +129,9 @@ endif()
 # are used for other targets, to ensure that gtest can be compiled by a user
 # aggressive about warnings.
 cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
+set_target_properties(gtest PROPERTIES VERSION ${GOOGLETEST_VERSION})
 cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
+set_target_properties(gtest_main PROPERTIES VERSION ${GOOGLETEST_VERSION})
 # If the CMake version supports it, attach header directory information
 # to the targets for when we are part of a parent build (ie being pulled
 # in via add_subdirectory() rather than being a standalone build).
@@ -193,7 +193,6 @@ if (gtest_build_tests)
   cxx_test(googletest-death-test-test gtest_main)
   cxx_test(gtest_environment_test gtest)
   cxx_test(googletest-filepath-test gtest_main)
-  cxx_test(googletest-linked-ptr-test gtest_main)
   cxx_test(googletest-listener-test gtest_main)
   cxx_test(gtest_main_unittest gtest_main)
   cxx_test(googletest-message-test gtest_main)
@@ -217,6 +216,8 @@ if (gtest_build_tests)
     test/gtest-typed-test2_test.cc)
   cxx_test(gtest_unittest gtest_main)
   cxx_test(gtest-unittest-api_test gtest)
+  cxx_test(gtest_skip_in_environment_setup_test gtest_main)
+  cxx_test(gtest_skip_test gtest_main)
 
   ############################################################
   # C++ tests built with non-standard compiler flags.
@@ -250,27 +251,15 @@ if (gtest_build_tests)
                         PROPERTIES
                         COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
 
-  if (NOT MSVC OR MSVC_VERSION LESS 1600)  # 1600 is Visual Studio 2010.
-    # Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that
-    # conflict with our own definitions. Therefore using our own tuple does not
-    # work on those compilers.
-    cxx_library(gtest_main_use_own_tuple "${cxx_use_own_tuple}"
-      src/gtest-all.cc src/gtest_main.cc)
-
-    cxx_test_with_flags(googletest-tuple-test "${cxx_use_own_tuple}"
-      gtest_main_use_own_tuple test/googletest-tuple-test.cc)
-
-    cxx_test_with_flags(gtest_use_own_tuple_test "${cxx_use_own_tuple}"
-      gtest_main_use_own_tuple
-      test/googletest-param-test-test.cc test/googletest-param-test2-test.cc)
-  endif()
-
   ############################################################
   # Python tests.
 
   cxx_executable(googletest-break-on-failure-unittest_ test gtest)
   py_test(googletest-break-on-failure-unittest)
 
+  py_test(gtest_skip_check_output_test)
+  py_test(gtest_skip_environment_check_output_test)
+
   # Visual Studio .NET 2003 does not support STL with exceptions disabled.
   if (NOT MSVC OR MSVC_VERSION GREATER 1310)  # 1310 is Visual Studio .NET 2003
     cxx_executable_with_flags(
@@ -320,6 +309,9 @@ if (gtest_build_tests)
   cxx_executable(googletest-uninitialized-test_ test gtest)
   py_test(googletest-uninitialized-test)
 
+  cxx_executable(gtest_list_output_unittest_ test gtest)
+  py_test(gtest_list_output_unittest)
+
   cxx_executable(gtest_xml_outfile1_test_ test gtest_main)
   cxx_executable(gtest_xml_outfile2_test_ test gtest_main)
   py_test(gtest_xml_outfiles_test)
diff --git a/media/libaom/src/third_party/googletest/src/googletest/README.md b/media/libaom/src/third_party/googletest/src/googletest/README.md
index e30fe80471..1f8b349ae7 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/README.md
+++ b/media/libaom/src/third_party/googletest/src/googletest/README.md
@@ -2,80 +2,51 @@
 
 #### Setup
 
-To build Google Test and your tests that use it, you need to tell your build
+To build GoogleTest and your tests that use it, you need to tell your build
 system where to find its headers and source files. The exact way to do it
 depends on which build system you use, and is usually straightforward.
 
-#### Build
+### Build with CMake
 
-Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a
-library build target (or a project as called by Visual Studio and Xcode) to
-compile
-
-    ${GTEST_DIR}/src/gtest-all.cc
-
-with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}`
-in the normal header search path. Assuming a Linux-like system and gcc,
-something like the following will do:
-
-    g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
-        -pthread -c ${GTEST_DIR}/src/gtest-all.cc
-    ar -rv libgtest.a gtest-all.o
-
-(We need `-pthread` as Google Test uses threads.)
-
-Next, you should compile your test source file with `${GTEST_DIR}/include` in
-the system header search path, and link it with gtest and any other necessary
-libraries:
-
-    g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
-        -o your_test
-
-As an example, the make/ directory contains a Makefile that you can use to build
-Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and
-Cygwin). It doesn't try to build Google Test's own tests. Instead, it just
-builds the Google Test library and a sample test. You can use it as a starting
-point for your own build script.
-
-If the default settings are correct for your environment, the following commands
-should succeed:
-
-    cd ${GTEST_DIR}/make
-    make
-    ./sample1_unittest
-
-If you see errors, try to tweak the contents of `make/Makefile` to make them go
-away. There are instructions in `make/Makefile` on how to do it.
-
-### Using CMake
-
-Google Test comes with a CMake build script (
-[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+GoogleTest comes with a CMake build script
+([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
 that can be used on a wide range of platforms ("C" stands for cross-platform.).
 If you don't have CMake installed already, you can download it for free from
 <http://www.cmake.org/>.
 
 CMake works by generating native makefiles or build projects that can be used in
-the compiler environment of your choice. You can either build Google Test as a
+the compiler environment of your choice. You can either build GoogleTest as a
 standalone project or it can be incorporated into an existing CMake build for
 another project.
 
 #### Standalone CMake Project
 
-When building Google Test as a standalone project, the typical workflow starts
-with:
+When building GoogleTest as a standalone project, the typical workflow starts
+with
 
-    mkdir mybuild       # Create a directory to hold the build output.
-    cd mybuild
-    cmake ${GTEST_DIR}  # Generate native build scripts.
+```
+git clone https://github.com/google/googletest.git -b release-1.10.0
+cd googletest        # Main directory of the cloned repository.
+mkdir build          # Create a directory to hold the build output.
+cd build
+cmake ..             # Generate native build scripts for GoogleTest.
+```
 
-If you want to build Google Test's samples, you should replace the last command
-with
+The above command also includes GoogleMock by default. And so, if you want to
+build only GoogleTest, you should replace the last command with
 
-    cmake -Dgtest_build_samples=ON ${GTEST_DIR}
+```
+cmake .. -DBUILD_GMOCK=OFF
+```
 
 If you are on a \*nix system, you should now see a Makefile in the current
-directory. Just type 'make' to build gtest.
+directory. Just type `make` to build GoogleTest. And then you can simply install
+GoogleTest if you are a system administrator.
+
+```
+make
+sudo make install    # Install in /usr/local/ by default
+```
 
 If you use Windows and have Visual Studio installed, a `gtest.sln` file and
 several `.vcproj` files will be created. You can then build them using Visual
@@ -85,13 +56,19 @@ On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
 
 #### Incorporating Into An Existing CMake Project
 
-If you want to use gtest in a project which already uses CMake, then a more
-robust and flexible approach is to build gtest as part of that project directly.
-This is done by making the GoogleTest source code available to the main build
-and adding it using CMake's `add_subdirectory()` command. This has the
-significant advantage that the same compiler and linker settings are used
-between gtest and the rest of your project, so issues associated with using
-incompatible libraries (eg debug/release), etc. are avoided. This is
+If you want to use GoogleTest in a project which already uses CMake, the easiest
+way is to get installed libraries and headers.
+
+*   Import GoogleTest by using `find_package` (or `pkg_check_modules`). For
+    example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the
+    libraries as `GTest::gtest`, `GTest::gmock`.
+
+And a more robust and flexible approach is to build GoogleTest as part of that
+project directly. This is done by making the GoogleTest source code available to
+the main build and adding it using CMake's `add_subdirectory()` command. This
+has the significant advantage that the same compiler and linker settings are
+used between GoogleTest and the rest of your project, so issues associated with
+using incompatible libraries (eg debug/release), etc. are avoided. This is
 particularly useful on Windows. Making GoogleTest's source code available to the
 main build can be done a few different ways:
 
@@ -105,177 +82,74 @@ main build can be done a few different ways:
     possible or appropriate. Git submodules, for example, have their own set of
     advantages and drawbacks.
 *   Use CMake to download GoogleTest as part of the build's configure step. This
-    is just a little more complex, but doesn't have the limitations of the other
-    methods.
-
-The last of the above methods is implemented with a small piece of CMake code in
-a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and
-then invoked as a sub-build _during the CMake stage_. That directory is then
-pulled into the main build with `add_subdirectory()`. For example:
-
-New file `CMakeLists.txt.in`:
-
-    cmake_minimum_required(VERSION 2.8.2)
-
-    project(googletest-download NONE)
-
-    include(ExternalProject)
-    ExternalProject_Add(googletest
-      GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           master
-      SOURCE_DIR        "${CMAKE_BINARY_DIR}/googletest-src"
-      BINARY_DIR        "${CMAKE_BINARY_DIR}/googletest-build"
-      CONFIGURE_COMMAND ""
-      BUILD_COMMAND     ""
-      INSTALL_COMMAND   ""
-      TEST_COMMAND      ""
-    )
-
-Existing build's `CMakeLists.txt`:
-
-    # Download and unpack googletest at configure time
-    configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
-    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-      RESULT_VARIABLE result
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
-    if(result)
-      message(FATAL_ERROR "CMake step for googletest failed: ${result}")
-    endif()
-    execute_process(COMMAND ${CMAKE_COMMAND} --build .
-      RESULT_VARIABLE result
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
-    if(result)
-      message(FATAL_ERROR "Build step for googletest failed: ${result}")
-    endif()
-
-    # Prevent overriding the parent project's compiler/linker
-    # settings on Windows
-    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-    # Add googletest directly to our build. This defines
-    # the gtest and gtest_main targets.
-    add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src
-                     ${CMAKE_BINARY_DIR}/googletest-build
-                     EXCLUDE_FROM_ALL)
-
-    # The gtest/gtest_main targets carry header search path
-    # dependencies automatically when using CMake 2.8.11 or
-    # later. Otherwise we have to add them here ourselves.
-    if (CMAKE_VERSION VERSION_LESS 2.8.11)
-      include_directories("${gtest_SOURCE_DIR}/include")
-    endif()
-
-    # Now simply link against gtest or gtest_main as needed. Eg
-    add_executable(example example.cpp)
-    target_link_libraries(example gtest_main)
-    add_test(NAME example_test COMMAND example)
-
-Note that this approach requires CMake 2.8.2 or later due to its use of the
-`ExternalProject_Add()` command. The above technique is discussed in more detail
-in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which
-also contains a link to a fully generalized implementation of the technique.
+    approach doesn't have the limitations of the other methods.
+
+The last of the above methods is implemented with a small piece of CMake code
+that downloads and pulls the GoogleTest code into the main build.
+
+Just add to your `CMakeLists.txt`:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  # Specify the commit you depend on and update it regularly.
+  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# Now simply link against gtest or gtest_main as needed. Eg
+add_executable(example example.cpp)
+target_link_libraries(example gtest_main)
+add_test(NAME example_test COMMAND example)
+```
+
+Note that this approach requires CMake 3.14 or later due to its use of the
+`FetchContent_MakeAvailable()` command.
 
 ##### Visual Studio Dynamic vs Static Runtimes
 
 By default, new Visual Studio projects link the C runtimes dynamically but
-Google Test links them statically. This will generate an error that looks
+GoogleTest links them statically. This will generate an error that looks
 something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch
 detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
 'MDd_DynamicDebug' in main.obj
 
-Google Test already has a CMake option for this: `gtest_force_shared_crt`
+GoogleTest already has a CMake option for this: `gtest_force_shared_crt`
 
 Enabling this option will make gtest link the runtimes dynamically too, and
 match the project in which it is included.
 
-### Legacy Build Scripts
-
-Before settling on CMake, we have been providing hand-maintained build
-projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to
-provide them for convenience, they are not actively maintained any more. We
-highly recommend that you follow the instructions in the above sections to
-integrate Google Test with your existing build system.
-
-If you still need to use the legacy build scripts, here's how:
-
-The msvc\ folder contains two solutions with Visual C++ projects. Open the
-`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to
-build Google Test the same way you build any Visual Studio project. Files that
-have names ending with -md use DLL versions of Microsoft runtime libraries (the
-/MD or the /MDd compiler option). Files without that suffix use static versions
-of the runtime libraries (the /MT or the /MTd option). Please note that one must
-use the same option to compile both gtest and the test code. If you use Visual
-Studio 2005 or above, we recommend the -md version as /MD is the default for new
-projects in these versions of Visual Studio.
+#### C++ Standard Version
 
-On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode.
-Build the "gtest" target. The universal binary framework will end up in your
-selected build directory (selected in the Xcode "Preferences..." -> "Building"
-pane and defaults to xcode/build). Alternatively, at the command line, enter:
+An environment that supports C++11 is required in order to successfully build
+GoogleTest. One way to ensure this is to specify the standard in the top-level
+project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
+is not feasible, for example in a C project using GoogleTest for validation,
+then it can be specified by adding it to the options for cmake via the
+`DCMAKE_CXX_FLAGS` option.
 
-    xcodebuild
+### Tweaking GoogleTest
 
-This will build the "Release" configuration of gtest.framework in your default
-build location. See the "xcodebuild" man page for more information about
-building different configurations and building in different locations.
-
-If you wish to use the Google Test Xcode project with Xcode 4.x and above, you
-need to either:
-
-*   update the SDK configuration options in xcode/Config/General.xconfig.
-    Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If
-    you choose this route you lose the ability to target earlier versions of
-    MacOS X.
-*   Install an SDK for an earlier version. This doesn't appear to be supported
-    by Apple, but has been reported to work
-    (http://stackoverflow.com/questions/5378518).
-
-### Tweaking Google Test
-
-Google Test can be used in diverse environments. The default configuration may
+GoogleTest can be used in diverse environments. The default configuration may
 not work (or may not work well) out of the box in some environments. However,
-you can easily tweak Google Test by defining control macros on the compiler
+you can easily tweak GoogleTest by defining control macros on the compiler
 command line. Generally, these macros are named like `GTEST_XYZ` and you define
 them to either 1 or 0 to enable or disable a certain feature.
 
 We list the most frequently used macros below. For a complete list, see file
-[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h).
-
-### Choosing a TR1 Tuple Library
-
-Some Google Test features require the C++ Technical Report 1 (TR1) tuple
-library, which is not yet available with all compilers. The good news is that
-Google Test implements a subset of TR1 tuple that's enough for its own need, and
-will automatically use this when the compiler doesn't provide TR1 tuple.
-
-Usually you don't need to care about which tuple library Google Test uses.
-However, if your project already uses TR1 tuple, you need to tell Google Test to
-use the same TR1 tuple library the rest of your project uses, or the two tuple
-implementations will clash. To do that, add
-
-    -DGTEST_USE_OWN_TR1_TUPLE=0
-
-to the compiler flags while compiling Google Test and your tests. If you want to
-force Google Test to use its own tuple library, just add
-
-    -DGTEST_USE_OWN_TR1_TUPLE=1
-
-to the compiler flags instead.
-
-If you don't want Google Test to use tuple at all, add
-
-    -DGTEST_HAS_TR1_TUPLE=0
-
-and all features using tuple will be disabled.
+[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h).
 
 ### Multi-threaded Tests
 
-Google Test is thread-safe where the pthread library is available. After
-`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see
-whether this is the case (yes if the macro is `#defined` to 1, no if it's
-undefined.).
+GoogleTest is thread-safe where the pthread library is available. After
+`#include "gtest/gtest.h"`, you can check the
+`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
+`#defined` to 1, no if it's undefined.).
 
-If Google Test doesn't correctly detect whether pthread is available in your
+If GoogleTest doesn't correctly detect whether pthread is available in your
 environment, you can force it with
 
     -DGTEST_HAS_PTHREAD=1
@@ -284,16 +158,16 @@ or
 
     -DGTEST_HAS_PTHREAD=0
 
-When Google Test uses pthread, you may need to add flags to your compiler and/or
+When GoogleTest uses pthread, you may need to add flags to your compiler and/or
 linker to select the pthread library, or you'll get link errors. If you use the
-CMake script or the deprecated Autotools script, this is taken care of for you.
-If you use your own build script, you'll need to read your compiler and linker's
-manual to figure out what flags to add.
+CMake script, this is taken care of for you. If you use your own build script,
+you'll need to read your compiler and linker's manual to figure out what flags
+to add.
 
 ### As a Shared Library (DLL)
 
-Google Test is compact, so most users can build and link it as a static library
-for the simplicity. You can choose to use Google Test as a shared library (known
+GoogleTest is compact, so most users can build and link it as a static library
+for the simplicity. You can choose to use GoogleTest as a shared library (known
 as a DLL on Windows) if you prefer.
 
 To compile *gtest* as a shared library, add
@@ -313,22 +187,22 @@ Note: while the above steps aren't technically necessary today when using some
 compilers (e.g. GCC), they may become necessary in the future, if we decide to
 improve the speed of loading the library (see
 <http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
-to always add the above flags when using Google Test as a shared library.
-Otherwise a future release of Google Test may break your build script.
+to always add the above flags when using GoogleTest as a shared library.
+Otherwise a future release of GoogleTest may break your build script.
 
 ### Avoiding Macro Name Clashes
 
 In C++, macros don't obey namespaces. Therefore two libraries that both define a
 macro of the same name will clash if you `#include` both definitions. In case a
-Google Test macro clashes with another library, you can force Google Test to
+GoogleTest macro clashes with another library, you can force GoogleTest to
 rename its macro to avoid the conflict.
 
-Specifically, if both Google Test and some other code define macro FOO, you can
+Specifically, if both GoogleTest and some other code define macro FOO, you can
 add
 
     -DGTEST_DONT_DEFINE_FOO=1
 
-to the compiler flags to tell Google Test to change the macro's name from `FOO`
+to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
 to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
 example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
 
diff --git a/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in b/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
index e7967ad56f..b4148fae42 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
+++ b/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
@@ -6,4 +6,4 @@ Description: GoogleTest (without main() function)
 Version: @PROJECT_VERSION@
 URL: https://github.com/google/googletest
 Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
-Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in b/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
index fe25d9c73c..38c88c54d5 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
+++ b/media/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
@@ -5,6 +5,6 @@ Name: gtest_main
 Description: GoogleTest (with main() function)
 Version: @PROJECT_VERSION@
 URL: https://github.com/google/googletest
-Requires: gtest
+Requires: gtest = @PROJECT_VERSION@
 Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
-Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/media/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/media/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake
index 8c1f9ba99c..8d8d60a86c 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake
+++ b/media/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake
@@ -12,6 +12,10 @@
 #   Test and Google Mock's option() definitions, and thus must be
 #   called *after* the options have been defined.
 
+if (POLICY CMP0054)
+  cmake_policy(SET CMP0054 NEW)
+endif (POLICY CMP0054)
+
 # Tweaks CMake's default compiler/linker settings to suit Google Test's needs.
 #
 # This must be a macro(), as inside a function string() can only
@@ -22,6 +26,8 @@ macro(fix_default_compiler_settings_)
     # This replacement code is taken from sample in the CMake Wiki at
     # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace.
     foreach (flag_var
+             CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+             CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
              CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
              CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
       if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt)
@@ -56,7 +62,6 @@ macro(config_compiler_and_linker)
   unset(GTEST_HAS_PTHREAD)
   if (NOT gtest_disable_pthreads AND NOT MINGW)
     # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT.
-    set(THREADS_PREFER_PTHREAD_FLAG ON)
     find_package(Threads)
     if (CMAKE_USE_PTHREADS_INIT)
       set(GTEST_HAS_PTHREAD ON)
@@ -67,37 +72,23 @@ macro(config_compiler_and_linker)
   if (MSVC)
     # Newlines inside flags variables break CMake's NMake generator.
     # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds.
-    set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J -Zi")
-    if (MSVC_VERSION LESS 1400)  # 1400 is Visual Studio 2005
-      # Suppress spurious warnings MSVC 7.1 sometimes issues.
-      # Forcing value to bool.
-      set(cxx_base_flags "${cxx_base_flags} -wd4800")
-      # Copy constructor and assignment operator could not be generated.
-      set(cxx_base_flags "${cxx_base_flags} -wd4511 -wd4512")
-      # Compatibility warnings not applicable to Google Test.
-      # Resolved overload was found by argument-dependent lookup.
-      set(cxx_base_flags "${cxx_base_flags} -wd4675")
-    endif()
-    if (MSVC_VERSION LESS 1500)  # 1500 is Visual Studio 2008
-      # Conditional expression is constant.
-      # When compiling with /W4, we get several instances of C4127
-      # (Conditional expression is constant). In our code, we disable that
-      # warning on a case-by-case basis. However, on Visual Studio 2005,
-      # the warning fires on std::list. Therefore on that compiler and earlier,
-      # we disable the warning project-wide.
-      set(cxx_base_flags "${cxx_base_flags} -wd4127")
-    endif()
-    if (NOT (MSVC_VERSION LESS 1700))  # 1700 is Visual Studio 2012.
-      # Suppress "unreachable code" warning on VS 2012 and later.
-      # http://stackoverflow.com/questions/3232669 explains the issue.
-      set(cxx_base_flags "${cxx_base_flags} -wd4702")
-    endif()
-
+    set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J")
     set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32")
     set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN")
     set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1")
     set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0")
     set(cxx_no_rtti_flags "-GR-")
+    # Suppress "unreachable code" warning
+    # http://stackoverflow.com/questions/3232669 explains the issue.
+    set(cxx_base_flags "${cxx_base_flags} -wd4702")
+    # Ensure MSVC treats source files as UTF-8 encoded.
+    set(cxx_base_flags "${cxx_base_flags} -utf-8")
+  elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(cxx_base_flags "-Wall -Wshadow -Werror -Wconversion")
+    set(cxx_exception_flags "-fexceptions")
+    set(cxx_no_exception_flags "-fno-exceptions")
+    set(cxx_strict_flags "-W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls")
+    set(cxx_no_rtti_flags "-fno-rtti")
   elseif (CMAKE_COMPILER_IS_GNUCXX)
     set(cxx_base_flags "-Wall -Wshadow -Werror")
     if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)
@@ -148,7 +139,6 @@ macro(config_compiler_and_linker)
     "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}")
   set(cxx_default "${cxx_exception}")
   set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}")
-  set(cxx_use_own_tuple "${cxx_default} -DGTEST_USE_OWN_TR1_TUPLE=1")
 
   # For building the gtest libraries.
   set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
@@ -160,6 +150,7 @@ function(cxx_library_with_type name type cxx_flags)
   # type can be either STATIC or SHARED to denote a static or shared library.
   # ARGN refers to additional arguments after 'cxx_flags'.
   add_library(${name} ${type} ${ARGN})
+  add_library(${cmake_package_name}::${name} ALIAS ${name})
   set_target_properties(${name}
     PROPERTIES
     COMPILE_FLAGS "${cxx_flags}")
@@ -167,6 +158,22 @@ function(cxx_library_with_type name type cxx_flags)
   set_target_properties(${name}
     PROPERTIES
     DEBUG_POSTFIX "d")
+  # Set the output directory for build artifacts
+  set_target_properties(${name}
+    PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+  # make PDBs match library name
+  get_target_property(pdb_debug_postfix ${name} DEBUG_POSTFIX)
+  set_target_properties(${name}
+    PROPERTIES
+    PDB_NAME "${name}"
+    PDB_NAME_DEBUG "${name}${pdb_debug_postfix}"
+    COMPILE_PDB_NAME "${name}"
+    COMPILE_PDB_NAME_DEBUG "${name}${pdb_debug_postfix}")
+
   if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
     set_target_properties(${name}
       PROPERTIES
@@ -184,6 +191,10 @@ function(cxx_library_with_type name type cxx_flags)
     endif()
     target_link_libraries(${name} PUBLIC ${threads_spec})
   endif()
+
+  if (NOT "${CMAKE_VERSION}" VERSION_LESS "3.8")
+    target_compile_features(${name} PUBLIC cxx_std_11)
+  endif()
 endfunction()
 
 ########################################################################
@@ -204,7 +215,7 @@ endfunction()
 # is built from the given source files with the given compiler flags.
 function(cxx_executable_with_flags name cxx_flags libs)
   add_executable(${name} ${ARGN})
-  if (MSVC AND (NOT (MSVC_VERSION LESS 1700)))  # 1700 is Visual Studio 2012.
+  if (MSVC)
     # BigObj required for tests.
     set(cxx_flags "${cxx_flags} -bigobj")
   endif()
@@ -236,7 +247,13 @@ function(cxx_executable name dir libs)
 endfunction()
 
 # Sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
-find_package(PythonInterp)
+if ("${CMAKE_VERSION}" VERSION_LESS "3.12.0")
+  find_package(PythonInterp)
+else()
+  find_package(Python COMPONENTS Interpreter)
+  set(PYTHONINTERP_FOUND ${Python_Interpreter_FOUND})
+  set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
+endif()
 
 # cxx_test_with_flags(name cxx_flags libs srcs...)
 #
@@ -244,7 +261,7 @@ find_package(PythonInterp)
 # from the given source files with the given compiler flags.
 function(cxx_test_with_flags name cxx_flags libs)
   cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
-  add_test(NAME ${name} COMMAND ${name})
+    add_test(NAME ${name} COMMAND "$<TARGET_FILE:${name}>")
 endfunction()
 
 # cxx_test(name libs srcs...)
@@ -263,33 +280,30 @@ endfunction()
 # test/name.py.  It does nothing if Python is not installed.
 function(py_test name)
   if (PYTHONINTERP_FOUND)
-    if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+    if ("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" VERSION_GREATER 3.1)
       if (CMAKE_CONFIGURATION_TYPES)
-	# Multi-configuration build generators as for Visual Studio save
-	# output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
-	# Release etc.), so we have to provide it here.
-        add_test(
-          NAME ${name}
+        # Multi-configuration build generators as for Visual Studio save
+        # output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
+        # Release etc.), so we have to provide it here.
+        add_test(NAME ${name}
           COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
               --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
       else (CMAKE_CONFIGURATION_TYPES)
-	# Single-configuration build generators like Makefile generators
-	# don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
-        add_test(
-          NAME ${name}
+        # Single-configuration build generators like Makefile generators
+        # don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
+        add_test(NAME ${name}
           COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-              --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
+            --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
       endif (CMAKE_CONFIGURATION_TYPES)
-    else (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+    else()
       # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so we can
       # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
       # only at ctest runtime (by calling ctest -c <Configuration>), so
       # we have to escape $ to delay variable substitution here.
-      add_test(
-        ${name}
-        ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+      add_test(NAME ${name}
+        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
           --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
-    endif (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+    endif()
   endif(PYTHONINTERP_FOUND)
 endfunction()
 
@@ -306,6 +320,18 @@ function(install_project)
       RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
       ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
       LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+      # Install PDBs
+      foreach(t ${ARGN})
+        get_target_property(t_pdb_name ${t} COMPILE_PDB_NAME)
+        get_target_property(t_pdb_name_debug ${t} COMPILE_PDB_NAME_DEBUG)
+        get_target_property(t_pdb_output_directory ${t} PDB_OUTPUT_DIRECTORY)
+        install(FILES
+          "${t_pdb_output_directory}/\${CMAKE_INSTALL_CONFIG_NAME}/$<$<CONFIG:Debug>:${t_pdb_name_debug}>$<$<NOT:$<CONFIG:Debug>>:${t_pdb_name}>.pdb"
+          DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          OPTIONAL)
+      endforeach()
+    endif()
     # Configure and install pkgconfig files.
     foreach(t ${ARGN})
       set(configured_pc "${generated_dir}/${t}.pc")
diff --git a/media/libaom/src/third_party/googletest/src/googletest/cmake/libgtest.la.in b/media/libaom/src/third_party/googletest/src/googletest/cmake/libgtest.la.in
new file mode 100644
index 0000000000..840c83885f
--- /dev/null
+++ b/media/libaom/src/third_party/googletest/src/googletest/cmake/libgtest.la.in
@@ -0,0 +1,21 @@
+# libgtest.la - a libtool library file
+# Generated by libtool (GNU libtool) 2.4.6
+
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Names of this library.
+library_names='libgtest.so'
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='@CMAKE_INSTALL_FULL_LIBDIR@'
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
index 39f0ded1b5..9b4d4d1337 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -35,8 +35,8 @@
 // directly.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
 
 #include "gtest/internal/gtest-death-test-internal.h"
 
@@ -97,6 +97,10 @@ GTEST_API_ bool InDeathTestChild();
 //
 //   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
 //
+// The final parameter to each of these macros is a matcher applied to any data
+// the sub-process wrote to stderr.  For compatibility with existing tests, a
+// bare string is interpreted as a regular expression matcher.
+//
 // On the regular expressions used in death tests:
 //
 //   GOOGLETEST_CM0005 DO NOT DELETE
@@ -162,27 +166,27 @@ GTEST_API_ bool InDeathTestChild();
 //   directory in PATH.
 //
 
-// Asserts that a given statement causes the program to exit, with an
-// integer exit status that satisfies predicate, and emitting error output
-// that matches regex.
-#define ASSERT_EXIT(statement, predicate, regex) \
-  GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+// Asserts that a given `statement` causes the program to exit, with an
+// integer exit status that satisfies `predicate`, and emitting error output
+// that matches `matcher`.
+# define ASSERT_EXIT(statement, predicate, matcher) \
+    GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
 
-// Like ASSERT_EXIT, but continues on to successive tests in the
+// Like `ASSERT_EXIT`, but continues on to successive tests in the
 // test suite, if any:
-#define EXPECT_EXIT(statement, predicate, regex) \
-  GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+# define EXPECT_EXIT(statement, predicate, matcher) \
+    GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
 
-// Asserts that a given statement causes the program to exit, either by
+// Asserts that a given `statement` causes the program to exit, either by
 // explicitly exiting with a nonzero exit code or being killed by a
-// signal, and emitting error output that matches regex.
-#define ASSERT_DEATH(statement, regex) \
-  ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+// signal, and emitting error output that matches `matcher`.
+# define ASSERT_DEATH(statement, matcher) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
-// Like ASSERT_DEATH, but continues on to successive tests in the
+// Like `ASSERT_DEATH`, but continues on to successive tests in the
 // test suite, if any:
-#define EXPECT_DEATH(statement, regex) \
-  EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+# define EXPECT_DEATH(statement, matcher) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
 // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
 
@@ -190,16 +194,14 @@ GTEST_API_ bool InDeathTestChild();
 class GTEST_API_ ExitedWithCode {
  public:
   explicit ExitedWithCode(int exit_code);
+  ExitedWithCode(const ExitedWithCode&) = default;
+  void operator=(const ExitedWithCode& other) = delete;
   bool operator()(int exit_status) const;
-
  private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode &other);
-
   const int exit_code_;
 };
 
-#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Tests that an exit code describes an exit due to termination by a
 // given signal.
 // GOOGLETEST_CM0006 DO NOT DELETE
@@ -207,11 +209,10 @@ class GTEST_API_ KilledBySignal {
  public:
   explicit KilledBySignal(int signum);
   bool operator()(int exit_status) const;
-
  private:
   const int signum_;
 };
-#endif  // !GTEST_OS_WINDOWS
+# endif  // !GTEST_OS_WINDOWS
 
 // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
 // The death testing framework causes this to have interesting semantics,
@@ -256,21 +257,23 @@ class GTEST_API_ KilledBySignal {
 //   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
 // }, "death");
 //
-#ifdef NDEBUG
+# ifdef NDEBUG
 
-#define EXPECT_DEBUG_DEATH(statement, regex) \
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-#define ASSERT_DEBUG_DEATH(statement, regex) \
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-#else
+# else
 
-#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
 
-#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
 
-#endif  // NDEBUG for EXPECT_DEBUG_DEATH
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
 #endif  // GTEST_HAS_DEATH_TEST
 
 // This macro is used for implementing macros such as
@@ -308,17 +311,18 @@ class GTEST_API_ KilledBySignal {
 //  statement unconditionally returns or throws. The Message constructor at
 //  the end allows the syntax of streaming additional messages into the
 //  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator)             \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
-  if (::testing::internal::AlwaysTrue()) {                                     \
-    GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
-                        << "Statement '" #statement "' cannot be verified.";   \
-  } else if (::testing::internal::AlwaysFalse()) {                             \
-    ::testing::internal::RE::PartialMatch(".*", (regex));                      \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
-    terminator;                                                                \
-  } else                                                                       \
-    ::testing::Message()
+# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
 
 // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
 // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
@@ -326,17 +330,17 @@ class GTEST_API_ KilledBySignal {
 // useful when you are combining death test assertions with normal test
 // assertions in one test.
 #if GTEST_HAS_DEATH_TEST
-#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-  EXPECT_DEATH(statement, regex)
-#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-  ASSERT_DEATH(statement, regex)
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
 #else
-#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return )
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
 #endif
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
index 20be24f43c..9fa34a05ba 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -32,13 +32,10 @@
 // This file implements just enough of the matcher interface to allow
 // EXPECT_DEATH and friends to accept a matcher argument.
 
-// IWYU pragma: private, include "testing/base/public/gunit.h"
-// IWYU pragma: friend third_party/googletest/googlemock/.*
-// IWYU pragma: friend third_party/googletest/googletest/.*
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
 
+#include <atomic>
 #include <memory>
 #include <ostream>
 #include <string>
@@ -63,38 +60,34 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_(
 namespace testing {
 
 // To implement a matcher Foo for type T, define:
-//   1. a class FooMatcherImpl that implements the
-//      MatcherInterface<T> interface, and
-//   2. a factory function that creates a Matcher<T> object from a
-//      FooMatcherImpl*.
-//
-// The two-level delegation design makes it possible to allow a user
-// to write "v" instead of "Eq(v)" where a Matcher is expected, which
-// is impossible if we pass matchers by pointers.  It also eases
-// ownership management as Matcher objects can now be copied like
-// plain values.
-
-// MatchResultListener is an abstract class.  Its << operator can be
-// used by a matcher to explain why a value matches or doesn't match.
+//   1. a class FooMatcherMatcher that implements the matcher interface:
+//     using is_gtest_matcher = void;
+//     bool MatchAndExplain(const T&, std::ostream*);
+//       (MatchResultListener* can also be used instead of std::ostream*)
+//     void DescribeTo(std::ostream*);
+//     void DescribeNegationTo(std::ostream*);
 //
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherMatcher.
+
 class MatchResultListener {
  public:
   // Creates a listener object with the given underlying ostream.  The
   // listener does not own the ostream, and does not dereference it
   // in the constructor or destructor.
-  explicit MatchResultListener(::std::ostream *os) : stream_(os) {}
+  explicit MatchResultListener(::std::ostream* os) : stream_(os) {}
   virtual ~MatchResultListener() = 0;  // Makes this class abstract.
 
   // Streams x to the underlying ostream; does nothing if the ostream
   // is NULL.
   template <typename T>
-  MatchResultListener &operator<<(const T &x) {
+  MatchResultListener& operator<<(const T& x) {
     if (stream_ != nullptr) *stream_ << x;
     return *this;
   }
 
   // Returns the underlying ostream.
-  ::std::ostream *stream() { return stream_; }
+  ::std::ostream* stream() { return stream_; }
 
   // Returns true if and only if the listener is interested in an explanation
   // of the match result.  A matcher's MatchAndExplain() method can use
@@ -103,16 +96,17 @@ class MatchResultListener {
   bool IsInterested() const { return stream_ != nullptr; }
 
  private:
-  ::std::ostream *const stream_;
+  ::std::ostream* const stream_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
 };
 
-inline MatchResultListener::~MatchResultListener() {}
+inline MatchResultListener::~MatchResultListener() {
+}
 
 // An instance of a subclass of this knows how to describe itself as a
 // matcher.
-class MatcherDescriberInterface {
+class GTEST_API_ MatcherDescriberInterface {
  public:
   virtual ~MatcherDescriberInterface() {}
 
@@ -121,7 +115,7 @@ class MatcherDescriberInterface {
   // matcher should have.  The subject of the verb phrase is the value
   // being matched.  For example, the DescribeTo() method of the Gt(7)
   // matcher prints "is greater than 7".
-  virtual void DescribeTo(::std::ostream *os) const = 0;
+  virtual void DescribeTo(::std::ostream* os) const = 0;
 
   // Describes the negation of this matcher to an ostream.  For
   // example, if the description of this matcher is "is greater than
@@ -129,7 +123,7 @@ class MatcherDescriberInterface {
   // You are not required to override this when implementing
   // MatcherInterface, but it is highly advised so that your matcher
   // can produce good error messages.
-  virtual void DescribeNegationTo(::std::ostream *os) const {
+  virtual void DescribeNegationTo(::std::ostream* os) const {
     *os << "not (";
     DescribeTo(os);
     *os << ")";
@@ -171,7 +165,7 @@ class MatcherInterface : public MatcherDescriberInterface {
   // can talk to 'listener' without checking its validity first.
   // However, in order to implement dummy listeners efficiently,
   // listener->stream() may be NULL.
-  virtual bool MatchAndExplain(T x, MatchResultListener *listener) const = 0;
+  virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
 
   // Inherits these methods from MatcherDescriberInterface:
   //   virtual void DescribeTo(::std::ostream* os) const = 0;
@@ -180,66 +174,29 @@ class MatcherInterface : public MatcherDescriberInterface {
 
 namespace internal {
 
-// Converts a MatcherInterface<T> to a MatcherInterface<const T&>.
-template <typename T>
-class MatcherInterfaceAdapter : public MatcherInterface<const T &> {
- public:
-  explicit MatcherInterfaceAdapter(const MatcherInterface<T> *impl)
-      : impl_(impl) {}
-  ~MatcherInterfaceAdapter() override { delete impl_; }
-
-  void DescribeTo(::std::ostream *os) const override { impl_->DescribeTo(os); }
-
-  void DescribeNegationTo(::std::ostream *os) const override {
-    impl_->DescribeNegationTo(os);
-  }
-
-  bool MatchAndExplain(const T &x,
-                       MatchResultListener *listener) const override {
-    return impl_->MatchAndExplain(x, listener);
-  }
-
- private:
-  const MatcherInterface<T> *const impl_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
-};
-
 struct AnyEq {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a == b;
-  }
+  bool operator()(const A& a, const B& b) const { return a == b; }
 };
 struct AnyNe {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a != b;
-  }
+  bool operator()(const A& a, const B& b) const { return a != b; }
 };
 struct AnyLt {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a < b;
-  }
+  bool operator()(const A& a, const B& b) const { return a < b; }
 };
 struct AnyGt {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a > b;
-  }
+  bool operator()(const A& a, const B& b) const { return a > b; }
 };
 struct AnyLe {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a <= b;
-  }
+  bool operator()(const A& a, const B& b) const { return a <= b; }
 };
 struct AnyGe {
   template <typename A, typename B>
-  bool operator()(const A &a, const B &b) const {
-    return a >= b;
-  }
+  bool operator()(const A& a, const B& b) const { return a >= b; }
 };
 
 // A match result listener that ignores the explanation.
@@ -256,41 +213,64 @@ class DummyMatchResultListener : public MatchResultListener {
 // that the former is concrete.
 class StreamMatchResultListener : public MatchResultListener {
  public:
-  explicit StreamMatchResultListener(::std::ostream *os)
+  explicit StreamMatchResultListener(::std::ostream* os)
       : MatchResultListener(os) {}
 
  private:
   GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
 };
 
+struct SharedPayloadBase {
+  std::atomic<int> ref{1};
+  void Ref() { ref.fetch_add(1, std::memory_order_relaxed); }
+  bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+};
+
+template <typename T>
+struct SharedPayload : SharedPayloadBase {
+  explicit SharedPayload(const T& v) : value(v) {}
+  explicit SharedPayload(T&& v) : value(std::move(v)) {}
+
+  static void Destroy(SharedPayloadBase* shared) {
+    delete static_cast<SharedPayload*>(shared);
+  }
+
+  T value;
+};
+
 // An internal class for implementing Matcher<T>, which will derive
 // from it.  We put functionalities common to all Matcher<T>
 // specializations here to avoid code duplication.
 template <typename T>
-class MatcherBase {
+class MatcherBase : private MatcherDescriberInterface {
  public:
   // Returns true if and only if the matcher matches x; also explains the
   // match result to 'listener'.
-  bool MatchAndExplain(const T &x, MatchResultListener *listener) const {
-    return impl_->MatchAndExplain(x, listener);
+  bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
+    GTEST_CHECK_(vtable_ != nullptr);
+    return vtable_->match_and_explain(*this, x, listener);
   }
 
   // Returns true if and only if this matcher matches x.
-  bool Matches(const T &x) const {
+  bool Matches(const T& x) const {
     DummyMatchResultListener dummy;
     return MatchAndExplain(x, &dummy);
   }
 
   // Describes this matcher to an ostream.
-  void DescribeTo(::std::ostream *os) const { impl_->DescribeTo(os); }
+  void DescribeTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, false);
+  }
 
   // Describes the negation of this matcher to an ostream.
-  void DescribeNegationTo(::std::ostream *os) const {
-    impl_->DescribeNegationTo(os);
+  void DescribeNegationTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, true);
   }
 
   // Explains why x matches, or doesn't match, the matcher.
-  void ExplainMatchResultTo(const T &x, ::std::ostream *os) const {
+  void ExplainMatchResultTo(const T& x, ::std::ostream* os) const {
     StreamMatchResultListener listener(os);
     MatchAndExplain(x, &listener);
   }
@@ -298,30 +278,195 @@ class MatcherBase {
   // Returns the describer for this matcher object; retains ownership
   // of the describer, which is only guaranteed to be alive when
   // this matcher object is alive.
-  const MatcherDescriberInterface *GetDescriber() const { return impl_.get(); }
+  const MatcherDescriberInterface* GetDescriber() const {
+    if (vtable_ == nullptr) return nullptr;
+    return vtable_->get_describer(*this);
+  }
 
  protected:
-  MatcherBase() {}
+  MatcherBase() : vtable_(nullptr) {}
 
   // Constructs a matcher from its implementation.
-  explicit MatcherBase(const MatcherInterface<const T &> *impl) : impl_(impl) {}
-
   template <typename U>
-  explicit MatcherBase(
-      const MatcherInterface<U> *impl,
-      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
-          nullptr)
-      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+  explicit MatcherBase(const MatcherInterface<U>* impl) {
+    Init(impl);
+  }
 
-  MatcherBase(const MatcherBase &) = default;
-  MatcherBase &operator=(const MatcherBase &) = default;
-  MatcherBase(MatcherBase &&) = default;
-  MatcherBase &operator=(MatcherBase &&) = default;
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  MatcherBase(M&& m) {  // NOLINT
+    Init(std::forward<M>(m));
+  }
 
-  virtual ~MatcherBase() {}
+  MatcherBase(const MatcherBase& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    if (IsShared()) buffer_.shared->Ref();
+  }
+
+  MatcherBase& operator=(const MatcherBase& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    if (IsShared()) buffer_.shared->Ref();
+    return *this;
+  }
+
+  MatcherBase(MatcherBase&& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    other.vtable_ = nullptr;
+  }
+
+  MatcherBase& operator=(MatcherBase&& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    other.vtable_ = nullptr;
+    return *this;
+  }
+
+  ~MatcherBase() override { Destroy(); }
 
  private:
-  std::shared_ptr<const MatcherInterface<const T &>> impl_;
+  struct VTable {
+    bool (*match_and_explain)(const MatcherBase&, const T&,
+                              MatchResultListener*);
+    void (*describe)(const MatcherBase&, std::ostream*, bool negation);
+    // Returns the captured object if it implements the interface, otherwise
+    // returns the MatcherBase itself.
+    const MatcherDescriberInterface* (*get_describer)(const MatcherBase&);
+    // Called on shared instances when the reference count reaches 0.
+    void (*shared_destroy)(SharedPayloadBase*);
+  };
+
+  bool IsShared() const {
+    return vtable_ != nullptr && vtable_->shared_destroy != nullptr;
+  }
+
+  // If the implementation uses a listener, call that.
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) {
+    return P::Get(m).MatchAndExplain(value, listener->stream());
+  }
+
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener)) {
+    return P::Get(m).MatchAndExplain(value, listener);
+  }
+
+  template <typename P>
+  static void DescribeImpl(const MatcherBase& m, std::ostream* os,
+                           bool negation) {
+    if (negation) {
+      P::Get(m).DescribeNegationTo(os);
+    } else {
+      P::Get(m).DescribeTo(os);
+    }
+  }
+
+  template <typename P>
+  static const MatcherDescriberInterface* GetDescriberImpl(
+      const MatcherBase& m) {
+    // If the impl is a MatcherDescriberInterface, then return it.
+    // Otherwise use MatcherBase itself.
+    // This allows us to implement the GetDescriber() function without support
+    // from the impl, but some users really want to get their impl back when
+    // they call GetDescriber().
+    // We use std::get on a tuple as a workaround of not having `if constexpr`.
+    return std::get<(
+        std::is_convertible<decltype(&P::Get(m)),
+                            const MatcherDescriberInterface*>::value
+            ? 1
+            : 0)>(std::make_tuple(&m, &P::Get(m)));
+  }
+
+  template <typename P>
+  const VTable* GetVTable() {
+    static constexpr VTable kVTable = {&MatchAndExplainImpl<P>,
+                                       &DescribeImpl<P>, &GetDescriberImpl<P>,
+                                       P::shared_destroy};
+    return &kVTable;
+  }
+
+  union Buffer {
+    // Add some types to give Buffer some common alignment/size use cases.
+    void* ptr;
+    double d;
+    int64_t i;
+    // And add one for the out-of-line cases.
+    SharedPayloadBase* shared;
+  };
+
+  void Destroy() {
+    if (IsShared() && buffer_.shared->Unref()) {
+      vtable_->shared_destroy(buffer_.shared);
+    }
+  }
+
+  template <typename M>
+  static constexpr bool IsInlined() {
+    return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) &&
+           std::is_trivially_copy_constructible<M>::value &&
+           std::is_trivially_destructible<M>::value;
+  }
+
+  template <typename M, bool = MatcherBase::IsInlined<M>()>
+  struct ValuePolicy {
+    static const M& Get(const MatcherBase& m) {
+      // When inlined along with Init, need to be explicit to avoid violating
+      // strict aliasing rules.
+      const M *ptr = static_cast<const M*>(
+          static_cast<const void*>(&m.buffer_));
+      return *ptr;
+    }
+    static void Init(MatcherBase& m, M impl) {
+      ::new (static_cast<void*>(&m.buffer_)) M(impl);
+    }
+    static constexpr auto shared_destroy = nullptr;
+  };
+
+  template <typename M>
+  struct ValuePolicy<M, false> {
+    using Shared = SharedPayload<M>;
+    static const M& Get(const MatcherBase& m) {
+      return static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    template <typename Arg>
+    static void Init(MatcherBase& m, Arg&& arg) {
+      m.buffer_.shared = new Shared(std::forward<Arg>(arg));
+    }
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename U, bool B>
+  struct ValuePolicy<const MatcherInterface<U>*, B> {
+    using M = const MatcherInterface<U>;
+    using Shared = SharedPayload<std::unique_ptr<M>>;
+    static const M& Get(const MatcherBase& m) {
+      return *static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    static void Init(MatcherBase& m, M* impl) {
+      m.buffer_.shared = new Shared(std::unique_ptr<M>(impl));
+    }
+
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename M>
+  void Init(M&& m) {
+    using MM = typename std::decay<M>::type;
+    using Policy = ValuePolicy<MM>;
+    vtable_ = GetVTable<Policy>();
+    Policy::Init(*this, std::forward<M>(m));
+  }
+
+  const VTable* vtable_;
+  Buffer buffer_;
 };
 
 }  // namespace internal
@@ -339,16 +484,20 @@ class Matcher : public internal::MatcherBase<T> {
   explicit Matcher() {}  // NOLINT
 
   // Constructs a matcher from its implementation.
-  explicit Matcher(const MatcherInterface<const T &> *impl)
+  explicit Matcher(const MatcherInterface<const T&>* impl)
       : internal::MatcherBase<T>(impl) {}
 
   template <typename U>
   explicit Matcher(
-      const MatcherInterface<U> *impl,
-      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+      const MatcherInterface<U>* impl,
+      typename std::enable_if<!std::is_same<U, const U&>::value>::type* =
           nullptr)
       : internal::MatcherBase<T>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m) : internal::MatcherBase<T>(std::forward<M>(m)) {}  // NOLINT
+
   // Implicit constructor here allows people to write
   // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
   Matcher(T value);  // NOLINT
@@ -358,20 +507,25 @@ class Matcher : public internal::MatcherBase<T> {
 // instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
 // matcher is expected.
 template <>
-class GTEST_API_ Matcher<const std::string &>
-    : public internal::MatcherBase<const std::string &> {
+class GTEST_API_ Matcher<const std::string&>
+    : public internal::MatcherBase<const std::string&> {
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const std::string &> *impl)
-      : internal::MatcherBase<const std::string &>(impl) {}
+  explicit Matcher(const MatcherInterface<const std::string&>* impl)
+      : internal::MatcherBase<const std::string&>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const std::string&>(std::forward<M>(m)) {}
 
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
-  Matcher(const std::string &s);  // NOLINT
+  Matcher(const std::string& s);  // NOLINT
 
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char *s);  // NOLINT
+  Matcher(const char* s);  // NOLINT
 };
 
 template <>
@@ -380,17 +534,22 @@ class GTEST_API_ Matcher<std::string>
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+  explicit Matcher(const MatcherInterface<const std::string&>* impl)
       : internal::MatcherBase<std::string>(impl) {}
-  explicit Matcher(const MatcherInterface<std::string> *impl)
+  explicit Matcher(const MatcherInterface<std::string>* impl)
       : internal::MatcherBase<std::string>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<std::string>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a string object.
-  Matcher(const std::string &s);  // NOLINT
+  Matcher(const std::string& s);  // NOLINT
 
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char *s);  // NOLINT
+  Matcher(const char* s);  // NOLINT
 };
 
 #if GTEST_INTERNAL_HAS_STRING_VIEW
@@ -398,20 +557,26 @@ class GTEST_API_ Matcher<std::string>
 // instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view
 // matcher is expected.
 template <>
-class GTEST_API_ Matcher<const internal::StringView &>
-    : public internal::MatcherBase<const internal::StringView &> {
+class GTEST_API_ Matcher<const internal::StringView&>
+    : public internal::MatcherBase<const internal::StringView&> {
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
-      : internal::MatcherBase<const internal::StringView &>(impl) {}
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+      : internal::MatcherBase<const internal::StringView&>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const internal::StringView&>(std::forward<M>(m)) {
+  }
 
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
-  Matcher(const std::string &s);  // NOLINT
+  Matcher(const std::string& s);  // NOLINT
 
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char *s);  // NOLINT
+  Matcher(const char* s);  // NOLINT
 
   // Allows the user to pass absl::string_views or std::string_views directly.
   Matcher(internal::StringView s);  // NOLINT
@@ -423,17 +588,22 @@ class GTEST_API_ Matcher<internal::StringView>
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
       : internal::MatcherBase<internal::StringView>(impl) {}
-  explicit Matcher(const MatcherInterface<internal::StringView> *impl)
+  explicit Matcher(const MatcherInterface<internal::StringView>* impl)
       : internal::MatcherBase<internal::StringView>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<internal::StringView>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
-  Matcher(const std::string &s);  // NOLINT
+  Matcher(const std::string& s);  // NOLINT
 
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char *s);  // NOLINT
+  Matcher(const char* s);  // NOLINT
 
   // Allows the user to pass absl::string_views or std::string_views directly.
   Matcher(internal::StringView s);  // NOLINT
@@ -442,7 +612,7 @@ class GTEST_API_ Matcher<internal::StringView>
 
 // Prints a matcher in a human-readable format.
 template <typename T>
-std::ostream &operator<<(std::ostream &os, const Matcher<T> &matcher) {
+std::ostream& operator<<(std::ostream& os, const Matcher<T>& matcher) {
   matcher.DescribeTo(&os);
   return os;
 }
@@ -462,34 +632,34 @@ std::ostream &operator<<(std::ostream &os, const Matcher<T> &matcher) {
 template <class Impl>
 class PolymorphicMatcher {
  public:
-  explicit PolymorphicMatcher(const Impl &an_impl) : impl_(an_impl) {}
+  explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {}
 
   // Returns a mutable reference to the underlying matcher
   // implementation object.
-  Impl &mutable_impl() { return impl_; }
+  Impl& mutable_impl() { return impl_; }
 
   // Returns an immutable reference to the underlying matcher
   // implementation object.
-  const Impl &impl() const { return impl_; }
+  const Impl& impl() const { return impl_; }
 
   template <typename T>
   operator Matcher<T>() const {
-    return Matcher<T>(new MonomorphicImpl<const T &>(impl_));
+    return Matcher<T>(new MonomorphicImpl<const T&>(impl_));
   }
 
  private:
   template <typename T>
   class MonomorphicImpl : public MatcherInterface<T> {
    public:
-    explicit MonomorphicImpl(const Impl &impl) : impl_(impl) {}
+    explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
 
-    void DescribeTo(::std::ostream *os) const override { impl_.DescribeTo(os); }
+    void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); }
 
-    void DescribeNegationTo(::std::ostream *os) const override {
+    void DescribeNegationTo(::std::ostream* os) const override {
       impl_.DescribeNegationTo(os);
     }
 
-    bool MatchAndExplain(T x, MatchResultListener *listener) const override {
+    bool MatchAndExplain(T x, MatchResultListener* listener) const override {
       return impl_.MatchAndExplain(x, listener);
     }
 
@@ -507,7 +677,7 @@ class PolymorphicMatcher {
 // MakeMatcher may create a Matcher that accepts its argument by value, which
 // leads to unnecessary copies & lack of support for non-copyable types.
 template <typename T>
-inline Matcher<T> MakeMatcher(const MatcherInterface<T> *impl) {
+inline Matcher<T> MakeMatcher(const MatcherInterface<T>* impl) {
   return Matcher<T>(impl);
 }
 
@@ -519,7 +689,7 @@ inline Matcher<T> MakeMatcher(const MatcherInterface<T> *impl) {
 // vs
 //   PolymorphicMatcher<TypeOfFoo>(foo);
 template <class Impl>
-inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl &impl) {
+inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl& impl) {
   return PolymorphicMatcher<Impl>(impl);
 }
 
@@ -537,105 +707,100 @@ namespace internal {
 template <typename D, typename Rhs, typename Op>
 class ComparisonBase {
  public:
-  explicit ComparisonBase(const Rhs &rhs) : rhs_(rhs) {}
+  explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
+
+  using is_gtest_matcher = void;
+
   template <typename Lhs>
-  operator Matcher<Lhs>() const {
-    return Matcher<Lhs>(new Impl<const Lhs &>(rhs_));
+  bool MatchAndExplain(const Lhs& lhs, std::ostream*) const {
+    return Op()(lhs, Unwrap(rhs_));
+  }
+  void DescribeTo(std::ostream* os) const {
+    *os << D::Desc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
+  }
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << D::NegatedDesc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
   }
 
  private:
   template <typename T>
-  static const T &Unwrap(const T &v) {
+  static const T& Unwrap(const T& v) {
     return v;
   }
   template <typename T>
-  static const T &Unwrap(std::reference_wrapper<T> v) {
+  static const T& Unwrap(std::reference_wrapper<T> v) {
     return v;
   }
 
-  template <typename Lhs, typename = Rhs>
-  class Impl : public MatcherInterface<Lhs> {
-   public:
-    explicit Impl(const Rhs &rhs) : rhs_(rhs) {}
-    bool MatchAndExplain(Lhs lhs,
-                         MatchResultListener * /* listener */) const override {
-      return Op()(lhs, Unwrap(rhs_));
-    }
-    void DescribeTo(::std::ostream *os) const override {
-      *os << D::Desc() << " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-    void DescribeNegationTo(::std::ostream *os) const override {
-      *os << D::NegatedDesc() << " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-
-   private:
-    Rhs rhs_;
-  };
   Rhs rhs_;
 };
 
 template <typename Rhs>
 class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
  public:
-  explicit EqMatcher(const Rhs &rhs)
-      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
-  static const char *Desc() { return "is equal to"; }
-  static const char *NegatedDesc() { return "isn't equal to"; }
+  explicit EqMatcher(const Rhs& rhs)
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
+  static const char* Desc() { return "is equal to"; }
+  static const char* NegatedDesc() { return "isn't equal to"; }
 };
 template <typename Rhs>
 class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
  public:
-  explicit NeMatcher(const Rhs &rhs)
-      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
-  static const char *Desc() { return "isn't equal to"; }
-  static const char *NegatedDesc() { return "is equal to"; }
+  explicit NeMatcher(const Rhs& rhs)
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
+  static const char* Desc() { return "isn't equal to"; }
+  static const char* NegatedDesc() { return "is equal to"; }
 };
 template <typename Rhs>
 class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
  public:
-  explicit LtMatcher(const Rhs &rhs)
-      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
-  static const char *Desc() { return "is <"; }
-  static const char *NegatedDesc() { return "isn't <"; }
+  explicit LtMatcher(const Rhs& rhs)
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
+  static const char* Desc() { return "is <"; }
+  static const char* NegatedDesc() { return "isn't <"; }
 };
 template <typename Rhs>
 class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
  public:
-  explicit GtMatcher(const Rhs &rhs)
-      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
-  static const char *Desc() { return "is >"; }
-  static const char *NegatedDesc() { return "isn't >"; }
+  explicit GtMatcher(const Rhs& rhs)
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
+  static const char* Desc() { return "is >"; }
+  static const char* NegatedDesc() { return "isn't >"; }
 };
 template <typename Rhs>
 class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
  public:
-  explicit LeMatcher(const Rhs &rhs)
-      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
-  static const char *Desc() { return "is <="; }
-  static const char *NegatedDesc() { return "isn't <="; }
+  explicit LeMatcher(const Rhs& rhs)
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
+  static const char* Desc() { return "is <="; }
+  static const char* NegatedDesc() { return "isn't <="; }
 };
 template <typename Rhs>
 class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
  public:
-  explicit GeMatcher(const Rhs &rhs)
-      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
-  static const char *Desc() { return "is >="; }
-  static const char *NegatedDesc() { return "isn't >="; }
+  explicit GeMatcher(const Rhs& rhs)
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
+  static const char* Desc() { return "is >="; }
+  static const char* NegatedDesc() { return "isn't >="; }
 };
 
+template <typename T, typename = typename std::enable_if<
+                          std::is_constructible<std::string, T>::value>::type>
+using StringLike = T;
+
 // Implements polymorphic matchers MatchesRegex(regex) and
 // ContainsRegex(regex), which can be used as a Matcher<T> as long as
 // T can be converted to a string.
 class MatchesRegexMatcher {
  public:
-  MatchesRegexMatcher(const RE *regex, bool full_match)
+  MatchesRegexMatcher(const RE* regex, bool full_match)
       : regex_(regex), full_match_(full_match) {}
 
 #if GTEST_INTERNAL_HAS_STRING_VIEW
-  bool MatchAndExplain(const internal::StringView &s,
-                       MatchResultListener *listener) const {
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
     return MatchAndExplain(std::string(s), listener);
   }
 #endif  // GTEST_INTERNAL_HAS_STRING_VIEW
@@ -646,7 +811,7 @@ class MatchesRegexMatcher {
   //   const wchar_t*
   //   wchar_t*
   template <typename CharType>
-  bool MatchAndExplain(CharType *s, MatchResultListener *listener) const {
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
     return s != nullptr && MatchAndExplain(std::string(s), listener);
   }
 
@@ -655,19 +820,19 @@ class MatchesRegexMatcher {
   // This is a template, not just a plain function with const std::string&,
   // because absl::string_view has some interfering non-explicit constructors.
   template <class MatcheeStringType>
-  bool MatchAndExplain(const MatcheeStringType &s,
-                       MatchResultListener * /* listener */) const {
-    const std::string &s2(s);
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const std::string& s2(s);
     return full_match_ ? RE::FullMatch(s2, *regex_)
                        : RE::PartialMatch(s2, *regex_);
   }
 
-  void DescribeTo(::std::ostream *os) const {
+  void DescribeTo(::std::ostream* os) const {
     *os << (full_match_ ? "matches" : "contains") << " regular expression ";
     UniversalPrinter<std::string>::Print(regex_->pattern(), os);
   }
 
-  void DescribeNegationTo(::std::ostream *os) const {
+  void DescribeNegationTo(::std::ostream* os) const {
     *os << "doesn't " << (full_match_ ? "match" : "contain")
         << " regular expression ";
     UniversalPrinter<std::string>::Print(regex_->pattern(), os);
@@ -682,39 +847,37 @@ class MatchesRegexMatcher {
 // Matches a string that fully matches regular expression 'regex'.
 // The matcher takes ownership of 'regex'.
 inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const internal::RE *regex) {
+    const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const std::string &regex) {
-  return MatchesRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::StringLike<T>& regex) {
+  return MatchesRegex(new internal::RE(std::string(regex)));
 }
 
 // Matches a string that contains regular expression 'regex'.
 // The matcher takes ownership of 'regex'.
 inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const internal::RE *regex) {
+    const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const std::string &regex) {
-  return ContainsRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::StringLike<T>& regex) {
+  return ContainsRegex(new internal::RE(std::string(regex)));
 }
 
 // Creates a polymorphic matcher that matches anything equal to x.
 // Note: if the parameter of Eq() were declared as const T&, Eq("foo")
 // wouldn't compile.
 template <typename T>
-inline internal::EqMatcher<T> Eq(T x) {
-  return internal::EqMatcher<T>(x);
-}
+inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
 
 // Constructs a Matcher<T> from a 'value' of type T.  The constructed
 // matcher matches any value that's equal to 'value'.
 template <typename T>
-Matcher<T>::Matcher(T value) {
-  *this = Eq(value);
-}
+Matcher<T>::Matcher(T value) { *this = Eq(value); }
 
 // Creates a monomorphic matcher that matches anything with type Lhs
 // and equal to rhs.  A user may need to use this instead of Eq(...)
@@ -729,9 +892,7 @@ Matcher<T>::Matcher(T value) {
 // can always write Matcher<T>(Lt(5)) to be explicit about the type,
 // for example.
 template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs &rhs) {
-  return Eq(rhs);
-}
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
 
 // Creates a polymorphic matcher that matches anything >= x.
 template <typename Rhs>
@@ -766,4 +927,4 @@ inline internal::NeMatcher<Rhs> Ne(Rhs x) {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h
index 713facae84..becfd49fcb 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h
@@ -44,8 +44,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
 
 #include <limits>
 #include <memory>
@@ -58,7 +58,7 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 
 // Ensures that there is at least one operator<< in the global namespace.
 // See Message& operator<<(...) below for why.
-void operator<<(const testing::internal::Secret &, int);
+void operator<<(const testing::internal::Secret&, int);
 
 namespace testing {
 
@@ -92,25 +92,25 @@ class GTEST_API_ Message {
  private:
   // The type of basic IO manipulators (endl, ends, and flush) for
   // narrow streams.
-  typedef std::ostream &(*BasicNarrowIoManip)(std::ostream &);
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
 
  public:
   // Constructs an empty Message.
   Message();
 
   // Copy constructor.
-  Message(const Message &msg) : ss_(new ::std::stringstream) {  // NOLINT
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
     *ss_ << msg.GetString();
   }
 
   // Constructs a Message from a C-string.
-  explicit Message(const char *str) : ss_(new ::std::stringstream) {
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
     *ss_ << str;
   }
 
   // Streams a non-pointer value to this object.
   template <typename T>
-  inline Message &operator<<(const T &val) {
+  inline Message& operator <<(const T& val) {
     // Some libraries overload << for STL containers.  These
     // overloads are defined in the global namespace instead of ::std.
     //
@@ -125,7 +125,7 @@ class GTEST_API_ Message {
     // from the global namespace.  With this using declaration,
     // overloads of << defined in the global namespace and those
     // visible via Koenig lookup are both exposed in this function.
-    using ::operator<<;
+    using ::operator <<;
     *ss_ << val;
     return *this;
   }
@@ -144,7 +144,7 @@ class GTEST_API_ Message {
   // ensure consistent result across compilers, we always treat NULL
   // as "(null)".
   template <typename T>
-  inline Message &operator<<(T *const &pointer) {  // NOLINT
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
     if (pointer == nullptr) {
       *ss_ << "(null)";
     } else {
@@ -159,23 +159,25 @@ class GTEST_API_ Message {
   // templatized version above.  Without this definition, streaming
   // endl or other basic IO manipulators to Message will confuse the
   // compiler.
-  Message &operator<<(BasicNarrowIoManip val) {
+  Message& operator <<(BasicNarrowIoManip val) {
     *ss_ << val;
     return *this;
   }
 
   // Instead of 1/0, we want to see true/false for bool values.
-  Message &operator<<(bool b) { return *this << (b ? "true" : "false"); }
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
 
   // These two overloads allow streaming a wide C string to a Message
   // using the UTF-8 encoding.
-  Message &operator<<(const wchar_t *wide_c_str);
-  Message &operator<<(wchar_t *wide_c_str);
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
 
 #if GTEST_HAS_STD_WSTRING
   // Converts the given wide string to a narrow string using the UTF-8
   // encoding, and streams the result to this Message object.
-  Message &operator<<(const ::std::wstring &wstr);
+  Message& operator <<(const ::std::wstring& wstr);
 #endif  // GTEST_HAS_STD_WSTRING
 
   // Gets the text streamed to this object so far as an std::string.
@@ -190,11 +192,11 @@ class GTEST_API_ Message {
 
   // We declare (but don't implement) this to prevent the compiler
   // from implementing the assignment operator.
-  void operator=(const Message &);
+  void operator=(const Message&);
 };
 
 // Streams a Message to an ostream.
-inline std::ostream &operator<<(std::ostream &os, const Message &sb) {
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
   return os << sb.GetString();
 }
 
@@ -205,7 +207,7 @@ namespace internal {
 // ::std::string, ::wstring, or ::std::wstring object, each NUL
 // character in it is replaced with "\\0".
 template <typename T>
-std::string StreamableToString(const T &streamable) {
+std::string StreamableToString(const T& streamable) {
   return (Message() << streamable).GetString();
 }
 
@@ -214,4 +216,4 @@ std::string StreamableToString(const T &streamable) {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
index 8d01df5250..804e702817 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
@@ -30,11 +30,9 @@
 // Macros and functions for implementing parameterized tests
 // in Google C++ Testing and Mocking Framework (Google Test)
 //
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
 // GOOGLETEST_CM0001 DO NOT DELETE
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 
 // Value-parameterized tests allow you to test your code with different
 // parameters without writing multiple copies of the same test.
@@ -306,7 +304,7 @@ internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
 
 template <class Container>
 internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container &container) {
+    const Container& container) {
   return ValuesIn(container.begin(), container.end());
 }
 
@@ -355,7 +353,9 @@ internal::ValueArray<T...> Values(T... v) {
 // }
 // INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
 //
-inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);
+}
 
 // Combine() allows the user to combine two or more sequences to produce
 // values of a Cartesian product of those sequences' elements.
@@ -368,8 +368,6 @@ inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
 //     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
 //     of elements from sequences produces by gen1, gen2, ..., genN.
 //
-// Combine can have up to 10 arguments.
-//
 // Example:
 //
 // This will instantiate tests in test suite AnimalTest each one with
@@ -404,7 +402,7 @@ inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
 //                          Combine(Bool(), Bool()));
 //
 template <typename... Generator>
-internal::CartesianProductHolder<Generator...> Combine(const Generator &... g) {
+internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
   return internal::CartesianProductHolder<Generator...>(g...);
 }
 
@@ -425,7 +423,8 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator &... g) {
           ->AddTestPattern(                                                    \
               GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
               new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
-                  test_suite_name, test_name)>());                             \
+                  test_suite_name, test_name)>(),                              \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__));          \
       return 0;                                                                \
     }                                                                          \
     static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
@@ -454,42 +453,43 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator &... g) {
 #define GTEST_GET_FIRST_(first, ...) first
 #define GTEST_GET_SECOND_(first, second, ...) second
 
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)               \
-  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>     \
-      gtest_##prefix##test_suite_name##_EvalGenerator_() {                   \
-    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));       \
-  }                                                                          \
-  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(  \
-      const ::testing::TestParamInfo<test_suite_name::ParamType> &info) {    \
-    if (::testing::internal::AlwaysFalse()) {                                \
-      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(     \
-          __VA_ARGS__,                                                       \
-          ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
-          DUMMY_PARAM_)));                                                   \
-      auto t = std::make_tuple(__VA_ARGS__);                                 \
-      static_assert(std::tuple_size<decltype(t)>::value <= 2,                \
-                    "Too Many Args!");                                       \
-    }                                                                        \
-    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                \
-        __VA_ARGS__,                                                         \
-        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,   \
-        DUMMY_PARAM_))))(info);                                              \
-  }                                                                          \
-  static int gtest_##prefix##test_suite_name##_dummy_                        \
-      GTEST_ATTRIBUTE_UNUSED_ =                                              \
-          ::testing::UnitTest::GetInstance()                                 \
-              ->parameterized_test_registry()                                \
-              .GetTestSuitePatternHolder<test_suite_name>(                   \
-                  GTEST_STRINGIFY_(test_suite_name),                         \
-                  ::testing::internal::CodeLocation(__FILE__, __LINE__))     \
-              ->AddTestSuiteInstantiation(                                   \
-                  GTEST_STRINGIFY_(prefix),                                  \
-                  &gtest_##prefix##test_suite_name##_EvalGenerator_,         \
-                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,      \
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)                \
+  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>      \
+      gtest_##prefix##test_suite_name##_EvalGenerator_() {                    \
+    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));        \
+  }                                                                           \
+  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(   \
+      const ::testing::TestParamInfo<test_suite_name::ParamType>& info) {     \
+    if (::testing::internal::AlwaysFalse()) {                                 \
+      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(      \
+          __VA_ARGS__,                                                        \
+          ::testing::internal::DefaultParamName<test_suite_name::ParamType>,  \
+          DUMMY_PARAM_)));                                                    \
+      auto t = std::make_tuple(__VA_ARGS__);                                  \
+      static_assert(std::tuple_size<decltype(t)>::value <= 2,                 \
+                    "Too Many Args!");                                        \
+    }                                                                         \
+    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                 \
+        __VA_ARGS__,                                                          \
+        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,    \
+        DUMMY_PARAM_))))(info);                                               \
+  }                                                                           \
+  static int gtest_##prefix##test_suite_name##_dummy_                         \
+      GTEST_ATTRIBUTE_UNUSED_ =                                               \
+          ::testing::UnitTest::GetInstance()                                  \
+              ->parameterized_test_registry()                                 \
+              .GetTestSuitePatternHolder<test_suite_name>(                    \
+                  GTEST_STRINGIFY_(test_suite_name),                          \
+                  ::testing::internal::CodeLocation(__FILE__, __LINE__))      \
+              ->AddTestSuiteInstantiation(                                    \
+                  GTEST_STRINGIFY_(prefix),                                   \
+                  &gtest_##prefix##test_suite_name##_EvalGenerator_,          \
+                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,       \
                   __FILE__, __LINE__)
 
+
 // Allow Marking a Parameterized test class as not needing to be instantiated.
-#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T)                  \
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T)                   \
   namespace gtest_do_not_use_outside_namespace_scope {}                   \
   static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
       GTEST_STRINGIFY_(T))
@@ -504,4 +504,4 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator &... g) {
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
index 950247cf67..076c9de1f4 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
@@ -27,6 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 // Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
@@ -96,10 +97,11 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
 
 #include <functional>
+#include <memory>
 #include <ostream>  // NOLINT
 #include <sstream>
 #include <string>
@@ -107,64 +109,125 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
 
-#if GTEST_HAS_ABSL
-#include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
-#include "absl/types/variant.h"
-#endif  // GTEST_HAS_ABSL
-
 namespace testing {
 
-// Definitions in the 'internal' and 'internal2' name spaces are
-// subject to change without notice.  DO NOT USE THEM IN USER CODE!
-namespace internal2 {
+// Definitions in the internal* namespaces are subject to change without notice.
+// DO NOT USE THEM IN USER CODE!
+namespace internal {
 
-// Prints the given number of bytes in the given object to the given
-// ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char *obj_bytes,
-                                     size_t count, ::std::ostream *os);
-
-// For selecting which printer to use when a given type has neither <<
-// nor PrintTo().
-enum TypeKind {
-  kProtobuf,              // a protobuf type
-  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
-                          // (e.g. a named or unnamed enum type)
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-  kConvertibleToStringView,  // a type implicitly convertible to
-                             // absl::string_view or std::string_view
-#endif
-  kOtherType  // anything else
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+struct ContainerPrinter {
+  template <typename T,
+            typename = typename std::enable_if<
+                (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+                !IsRecursiveContainer<T>::value>::type>
+  static void PrintValue(const T& container, std::ostream* os) {
+    const size_t kMaxCount = 32;  // The maximum number of elements to print.
+    *os << '{';
+    size_t count = 0;
+    for (auto&& elem : container) {
+      if (count > 0) {
+        *os << ',';
+        if (count == kMaxCount) {  // Enough has been printed.
+          *os << " ...";
+          break;
+        }
+      }
+      *os << ' ';
+      // We cannot call PrintTo(elem, os) here as PrintTo() doesn't
+      // handle `elem` being a native array.
+      internal::UniversalPrint(elem, os);
+      ++count;
+    }
+
+    if (count > 0) {
+      *os << ' ';
+    }
+    *os << '}';
+  }
 };
 
-// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
-// by the universal printer to print a value of type T when neither
-// operator<< nor PrintTo() is defined for T, where kTypeKind is the
-// "kind" of T as defined by enum TypeKind.
-template <typename T, TypeKind kTypeKind>
-class TypeWithoutFormatter {
- public:
-  // This default version is called when kTypeKind is kOtherType.
-  static void PrintValue(const T &value, ::std::ostream *os) {
-    PrintBytesInObjectTo(
-        static_cast<const unsigned char *>(
-            reinterpret_cast<const void *>(std::addressof(value))),
-        sizeof(value), os);
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+struct FunctionPointerPrinter {
+  template <typename T, typename = typename std::enable_if<
+                            std::is_function<T>::value>::type>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.
+      *os << reinterpret_cast<const void*>(p);
+    }
   }
 };
 
-// We print a protobuf using its ShortDebugString() when the string
-// doesn't exceed this many characters; otherwise we print it using
-// DebugString() for better readability.
-const size_t kProtobufOneLinerMaxLength = 50;
+struct PointerPrinter {
+  template <typename T>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    }
+  }
+};
 
-template <typename T>
-class TypeWithoutFormatter<T, kProtobuf> {
- public:
-  static void PrintValue(const T &value, ::std::ostream *os) {
+namespace internal_stream_operator_without_lexical_name_lookup {
+
+// The presence of an operator<< here will terminate lexical scope lookup
+// straight away (even though it cannot be a match because of its argument
+// types). Thus, the two operator<< calls in StreamPrinter will find only ADL
+// candidates.
+struct LookupBlocker {};
+void operator<<(LookupBlocker, LookupBlocker);
+
+struct StreamPrinter {
+  template <typename T,
+            // Don't accept member pointers here. We'd print them via implicit
+            // conversion to bool, which isn't useful.
+            typename = typename std::enable_if<
+                !std::is_member_pointer<T>::value>::type,
+            // Only accept types for which we can find a streaming operator via
+            // ADL (possibly involving implicit conversions).
+            typename = decltype(std::declval<std::ostream&>()
+                                << std::declval<const T&>())>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    // Call streaming operator found by ADL, possibly with implicit conversions
+    // of the arguments.
+    *os << value;
+  }
+};
+
+}  // namespace internal_stream_operator_without_lexical_name_lookup
+
+struct ProtobufPrinter {
+  // We print a protobuf using its ShortDebugString() when the string
+  // doesn't exceed this many characters; otherwise we print it using
+  // DebugString() for better readability.
+  static const size_t kProtobufOneLinerMaxLength = 50;
+
+  template <typename T,
+            typename = typename std::enable_if<
+                internal::HasDebugStringAndShortDebugString<T>::value>::type>
+  static void PrintValue(const T& value, ::std::ostream* os) {
     std::string pretty_str = value.ShortDebugString();
     if (pretty_str.length() > kProtobufOneLinerMaxLength) {
       pretty_str = "\n" + value.DebugString();
@@ -173,9 +236,7 @@ class TypeWithoutFormatter<T, kProtobuf> {
   }
 };
 
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToInteger> {
- public:
+struct ConvertibleToIntegerPrinter {
   // Since T has no << operator or PrintTo() but can be implicitly
   // converted to BiggestInt, we print it as a BiggestInt.
   //
@@ -183,110 +244,73 @@ class TypeWithoutFormatter<T, kConvertibleToInteger> {
   // case printing it as an integer is the desired behavior.  In case
   // T is not an enum, printing it as an integer is the best we can do
   // given that it has no user-defined printer.
-  static void PrintValue(const T &value, ::std::ostream *os) {
-    const internal::BiggestInt kBigInt = value;
-    *os << kBigInt;
+  static void PrintValue(internal::BiggestInt value, ::std::ostream* os) {
+    *os << value;
   }
 };
 
+struct ConvertibleToStringViewPrinter {
 #if GTEST_INTERNAL_HAS_STRING_VIEW
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToStringView> {
- public:
-  // Since T has neither operator<< nor PrintTo() but can be implicitly
-  // converted to absl::string_view, we print it as a absl::string_view
-  // (or std::string_view).
-  //
-  // Note: the implementation is further below, as it depends on
-  // internal::PrintTo symbol which is defined later in the file.
-  static void PrintValue(const T &value, ::std::ostream *os);
-};
+  static void PrintValue(internal::StringView value, ::std::ostream* os) {
+    internal::UniversalPrint(value, os);
+  }
 #endif
+};
 
-// Prints the given value to the given ostream.  If the value is a
-// protocol message, its debug string is printed; if it's an enum or
-// of a type implicitly convertible to BiggestInt, it's printed as an
-// integer; otherwise the bytes in the value are printed.  This is
-// what UniversalPrinter<T>::Print() does when it knows nothing about
-// type T and T has neither << operator nor PrintTo().
-//
-// A user can override this behavior for a class type Foo by defining
-// a << operator in the namespace where Foo is defined.
-//
-// We put this operator in namespace 'internal2' instead of 'internal'
-// to simplify the implementation, as much code in 'internal' needs to
-// use << in STL, which would conflict with our own << were it defined
-// in 'internal'.
-//
-// Note that this operator<< takes a generic std::basic_ostream<Char,
-// CharTraits> type instead of the more restricted std::ostream.  If
-// we define it to take an std::ostream instead, we'll get an
-// "ambiguous overloads" compiler error when trying to print a type
-// Foo that supports streaming to std::basic_ostream<Char,
-// CharTraits>, as the compiler cannot tell whether
-// operator<<(std::ostream&, const T&) or
-// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
-// specific.
-template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits> &operator<<(
-    ::std::basic_ostream<Char, CharTraits> &os, const T &x) {
-  TypeWithoutFormatter<
-      T, (internal::IsAProtocolMessage<T>::value
-              ? kProtobuf
-              : std::is_convertible<const T &, internal::BiggestInt>::value
-                    ? kConvertibleToInteger
-                    :
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-                    std::is_convertible<const T &, internal::StringView>::value
-                        ? kConvertibleToStringView
-                        :
-#endif
-                        kOtherType)>::PrintValue(x, &os);
-  return os;
-}
 
-}  // namespace internal2
-}  // namespace testing
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+struct RawBytesPrinter {
+  // SFINAE on `sizeof` to make sure we have a complete type.
+  template <typename T, size_t = sizeof(T)>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(
+        static_cast<const unsigned char*>(
+            // Load bearing cast to void* to support iOS
+            reinterpret_cast<const void*>(std::addressof(value))),
+        sizeof(value), os);
+  }
+};
 
-// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
-// magic needed for implementing UniversalPrinter won't work.
-namespace testing_internal {
+struct FallbackPrinter {
+  template <typename T>
+  static void PrintValue(const T&, ::std::ostream* os) {
+    *os << "(incomplete type)";
+  }
+};
 
-// Used to print a value that is not an STL-style container when the
-// user doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintNonContainerTo(const T &value, ::std::ostream *os) {
-  // With the following statement, during unqualified name lookup,
-  // testing::internal2::operator<< appears as if it was declared in
-  // the nearest enclosing namespace that contains both
-  // ::testing_internal and ::testing::internal2, i.e. the global
-  // namespace.  For more details, refer to the C++ Standard section
-  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
-  // testing::internal2::operator<< in case T doesn't come with a <<
-  // operator.
-
-  using ::testing::internal2::operator<<;
-
-  // Assuming T is defined in namespace foo, in the next statement,
-  // the compiler will consider all of:
-  //
-  //   1. foo::operator<< (thanks to Koenig look-up),
-  //   2. ::operator<< (as the current namespace is enclosed in ::),
-  //   3. testing::internal2::operator<< (thanks to the using statement above).
-  //
-  // The operator<< whose type matches T best will be picked.
-  //
-  // We deliberately allow #2 to be a candidate, as sometimes it's
-  // impossible to define #1 (e.g. when foo is ::std, defining
-  // anything in it is undefined behavior unless you are a compiler
-  // vendor.).
-  *os << value;
-}
+// Try every printer in order and return the first one that works.
+template <typename T, typename E, typename Printer, typename... Printers>
+struct FindFirstPrinter : FindFirstPrinter<T, E, Printers...> {};
 
-}  // namespace testing_internal
+template <typename T, typename Printer, typename... Printers>
+struct FindFirstPrinter<
+    T, decltype(Printer::PrintValue(std::declval<const T&>(), nullptr)),
+    Printer, Printers...> {
+  using type = Printer;
+};
 
-namespace testing {
-namespace internal {
+// Select the best printer in the following order:
+//  - Print containers (they have begin/end/etc).
+//  - Print function pointers.
+//  - Print object pointers.
+//  - Use the stream operator, if available.
+//  - Print protocol buffers.
+//  - Print types convertible to BiggestInt.
+//  - Print types convertible to StringView, if available.
+//  - Fallback to printing the raw bytes of the object.
+template <typename T>
+void PrintWithFallback(const T& value, ::std::ostream* os) {
+  using Printer = typename FindFirstPrinter<
+      T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter,
+      internal_stream_operator_without_lexical_name_lookup::StreamPrinter,
+      ProtobufPrinter, ConvertibleToIntegerPrinter,
+      ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type;
+  Printer::PrintValue(value, os);
+}
 
 // FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
 // value of type ToPrint that is an operand of a comparison assertion
@@ -306,7 +330,7 @@ namespace internal {
 template <typename ToPrint, typename OtherOperand>
 class FormatForComparison {
  public:
-  static ::std::string Format(const ToPrint &value) {
+  static ::std::string Format(const ToPrint& value) {
     return ::testing::PrintToString(value);
   }
 };
@@ -315,27 +339,35 @@ class FormatForComparison {
 template <typename ToPrint, size_t N, typename OtherOperand>
 class FormatForComparison<ToPrint[N], OtherOperand> {
  public:
-  static ::std::string Format(const ToPrint *value) {
-    return FormatForComparison<const ToPrint *, OtherOperand>::Format(value);
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
   }
 };
 
 // By default, print C string as pointers to be safe, as we don't know
 // whether they actually point to a NUL-terminated string.
 
-#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                 \
-  template <typename OtherOperand>                                       \
-  class FormatForComparison<CharType *, OtherOperand> {                  \
-   public:                                                               \
-    static ::std::string Format(CharType *value) {                       \
-      return ::testing::PrintToString(static_cast<const void *>(value)); \
-    }                                                                    \
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
   }
 
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
 
 #undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
 
@@ -343,16 +375,24 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
 // to point to a NUL-terminated string, and thus can print it as a string.
 
 #define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
-  template <>                                                            \
-  class FormatForComparison<CharType *, OtherStringType> {               \
-   public:                                                               \
-    static ::std::string Format(CharType *value) {                       \
-      return ::testing::PrintToString(value);                            \
-    }                                                                    \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
   }
 
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string);
 
 #if GTEST_HAS_STD_WSTRING
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
@@ -370,8 +410,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(const T1 &value,
-                                              const T2 & /* other_operand */) {
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
   return FormatForComparison<T1, T2>::Format(value);
 }
 
@@ -385,86 +425,6 @@ std::string FormatForComparisonFailureMessage(const T1 &value,
 template <typename T>
 class UniversalPrinter;
 
-template <typename T>
-void UniversalPrint(const T &value, ::std::ostream *os);
-
-enum DefaultPrinterType {
-  kPrintContainer,
-  kPrintPointer,
-  kPrintFunctionPointer,
-  kPrintOther,
-};
-template <DefaultPrinterType type>
-struct WrapPrinterType {};
-
-// Used to print an STL-style container when the user doesn't define
-// a PrintTo() for it.
-template <typename C>
-void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
-                    const C &container, ::std::ostream *os) {
-  const size_t kMaxCount = 32;  // The maximum number of elements to print.
-  *os << '{';
-  size_t count = 0;
-  for (typename C::const_iterator it = container.begin(); it != container.end();
-       ++it, ++count) {
-    if (count > 0) {
-      *os << ',';
-      if (count == kMaxCount) {  // Enough has been printed.
-        *os << " ...";
-        break;
-      }
-    }
-    *os << ' ';
-    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
-    // handle *it being a native array.
-    internal::UniversalPrint(*it, os);
-  }
-
-  if (count > 0) {
-    *os << ' ';
-  }
-  *os << '}';
-}
-
-// Used to print a pointer that is neither a char pointer nor a member
-// pointer, when the user doesn't define PrintTo() for it.  (A member
-// variable pointer or member function pointer doesn't really point to
-// a location in the address space.  Their representation is
-// implementation-defined.  Therefore they will be printed as raw
-// bytes.)
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */, T *p,
-                    ::std::ostream *os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is not a function type.  We just call << to print p,
-    // relying on ADL to pick up user-defined << for their pointer
-    // types, if any.
-    *os << p;
-  }
-}
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */, T *p,
-                    ::std::ostream *os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is a function type, so '*os << p' doesn't do what we want
-    // (it just prints p as bool).  We want to print p as a const
-    // void*.
-    *os << reinterpret_cast<const void *>(p);
-  }
-}
-
-// Used to print a non-container, non-pointer value when the user
-// doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */, const T &value,
-                    ::std::ostream *os) {
-  ::testing_internal::DefaultPrintNonContainerTo(value, os);
-}
-
 // Prints the given value using the << operator if it has one;
 // otherwise prints the bytes in it.  This is what
 // UniversalPrinter<T>::Print() does when PrintTo() is not specialized
@@ -477,37 +437,8 @@ void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */, const T &value,
 // or there is already a << operator but it doesn't do what the user
 // wants).
 template <typename T>
-void PrintTo(const T &value, ::std::ostream *os) {
-  // DefaultPrintTo() is overloaded.  The type of its first argument
-  // determines which version will be picked.
-  //
-  // Note that we check for container types here, prior to we check
-  // for protocol message types in our operator<<.  The rationale is:
-  //
-  // For protocol messages, we want to give people a chance to
-  // override Google Mock's format by defining a PrintTo() or
-  // operator<<.  For STL containers, other formats can be
-  // incompatible with Google Mock's format for the container
-  // elements; therefore we check for container types here to ensure
-  // that our format is used.
-  //
-  // Note that MSVC and clang-cl do allow an implicit conversion from
-  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
-  // So don't use ImplicitlyConvertible if it can be helped since it will
-  // cause this warning, and use a separate overload of DefaultPrintTo for
-  // function pointers so that the `*os << p` in the object pointer overload
-  // doesn't cause that warning either.
-  DefaultPrintTo(
-      WrapPrinterType <
-                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
-              !IsRecursiveContainer<T>::value
-          ? kPrintContainer
-          : !std::is_pointer<T>::value
-                ? kPrintOther
-                : std::is_function<typename std::remove_pointer<T>::type>::value
-                      ? kPrintFunctionPointer
-                      : kPrintPointer > (),
-      value, os);
+void PrintTo(const T& value, ::std::ostream* os) {
+  internal::PrintWithFallback(value, os);
 }
 
 // The following list of PrintTo() overloads tells
@@ -515,9 +446,9 @@ void PrintTo(const T &value, ::std::ostream *os) {
 // types, strings, plain arrays, and pointers).
 
 // Overloads for various char types.
-GTEST_API_ void PrintTo(unsigned char c, ::std::ostream *os);
-GTEST_API_ void PrintTo(signed char c, ::std::ostream *os);
-inline void PrintTo(char c, ::std::ostream *os) {
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
   // When printing a plain char, we always treat it as unsigned.  This
   // way, the output won't be affected by whether the compiler thinks
   // char is signed or not.
@@ -525,7 +456,7 @@ inline void PrintTo(char c, ::std::ostream *os) {
 }
 
 // Overloads for other simple built-in types.
-inline void PrintTo(bool x, ::std::ostream *os) {
+inline void PrintTo(bool x, ::std::ostream* os) {
   *os << (x ? "true" : "false");
 }
 
@@ -536,27 +467,54 @@ inline void PrintTo(bool x, ::std::ostream *os) {
 // as signed integer when wchar_t is implemented by the compiler
 // as a signed type and is printed as an unsigned integer when wchar_t
 // is implemented as an unsigned type.
-GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream *os);
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
+inline void PrintTo(char16_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#ifdef __cpp_char8_t
+inline void PrintTo(char8_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#endif
 
 // Overloads for C strings.
-GTEST_API_ void PrintTo(const char *s, ::std::ostream *os);
-inline void PrintTo(char *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const char *>(s), os);
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
 }
 
 // signed/unsigned char is often used for representing binary data, so
 // we print pointers to it as void* to be safe.
-inline void PrintTo(const signed char *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const void *>(s), os);
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
 }
-inline void PrintTo(signed char *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const void *>(s), os);
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
 }
-inline void PrintTo(const unsigned char *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const void *>(s), os);
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
 }
-inline void PrintTo(unsigned char *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const void *>(s), os);
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+#ifdef __cpp_char8_t
+// Overloads for u8 strings.
+GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
+inline void PrintTo(char8_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char8_t*>(s), os);
+}
+#endif
+// Overloads for u16 strings.
+GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os);
+inline void PrintTo(char16_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char16_t*>(s), os);
+}
+// Overloads for u32 strings.
+GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os);
+inline void PrintTo(char32_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char32_t*>(s), os);
 }
 
 // MSVC can be configured to define wchar_t as a typedef of unsigned
@@ -566,9 +524,9 @@ inline void PrintTo(unsigned char *s, ::std::ostream *os) {
 // possibly causing invalid memory accesses.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Overloads for wide C strings
-GTEST_API_ void PrintTo(const wchar_t *s, ::std::ostream *os);
-inline void PrintTo(wchar_t *s, ::std::ostream *os) {
-  PrintTo(ImplicitCast_<const wchar_t *>(s), os);
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
 }
 #endif
 
@@ -578,7 +536,7 @@ inline void PrintTo(wchar_t *s, ::std::ostream *os) {
 // Prints the given number of elements in an array, without printing
 // the curly braces.
 template <typename T>
-void PrintRawArrayTo(const T a[], size_t count, ::std::ostream *os) {
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
   UniversalPrint(a[0], os);
   for (size_t i = 1; i != count; i++) {
     *os << ", ";
@@ -587,42 +545,99 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream *os) {
 }
 
 // Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string &s, ::std::ostream *os);
-inline void PrintTo(const ::std::string &s, ::std::ostream *os) {
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
   PrintStringTo(s, os);
 }
 
+// Overloads for ::std::u8string
+#ifdef __cpp_char8_t
+GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
+  PrintU8StringTo(s, os);
+}
+#endif
+
+// Overloads for ::std::u16string
+GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) {
+  PrintU16StringTo(s, os);
+}
+
+// Overloads for ::std::u32string
+GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
+  PrintU32StringTo(s, os);
+}
+
 // Overloads for ::std::wstring.
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring &s, ::std::ostream *os);
-inline void PrintTo(const ::std::wstring &s, ::std::ostream *os) {
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
   PrintWideStringTo(s, os);
 }
 #endif  // GTEST_HAS_STD_WSTRING
 
 #if GTEST_INTERNAL_HAS_STRING_VIEW
 // Overload for internal::StringView.
-inline void PrintTo(internal::StringView sp, ::std::ostream *os) {
+inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
   PrintTo(::std::string(sp), os);
 }
 #endif  // GTEST_INTERNAL_HAS_STRING_VIEW
 
-inline void PrintTo(std::nullptr_t, ::std::ostream *os) { *os << "(nullptr)"; }
+inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
 
 template <typename T>
-void PrintTo(std::reference_wrapper<T> ref, ::std::ostream *os) {
-  UniversalPrinter<T &>::Print(ref.get(), os);
+void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
+  UniversalPrinter<T&>::Print(ref.get(), os);
+}
+
+inline const void* VoidifyPointer(const void* p) { return p; }
+inline const void* VoidifyPointer(volatile const void* p) {
+  return const_cast<const void*>(p);
+}
+
+template <typename T, typename Ptr>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    // We can't print the value. Just print the pointer..
+    *os << "(" << (VoidifyPointer)(ptr.get()) << ")";
+  }
+}
+template <typename T, typename Ptr,
+          typename = typename std::enable_if<!std::is_void<T>::value &&
+                                             !std::is_array<T>::value>::type>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = ";
+    UniversalPrinter<T>::Print(*ptr, os);
+    *os << ")";
+  }
+}
+
+template <typename T, typename D>
+void PrintTo(const std::unique_ptr<T, D>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+template <typename T>
+void PrintTo(const std::shared_ptr<T>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
 }
 
 // Helper function for printing a tuple.  T must be instantiated with
 // a tuple type.
 template <typename T>
-void PrintTupleTo(const T &, std::integral_constant<size_t, 0>,
-                  ::std::ostream *) {}
+void PrintTupleTo(const T&, std::integral_constant<size_t, 0>,
+                  ::std::ostream*) {}
 
 template <typename T, size_t I>
-void PrintTupleTo(const T &t, std::integral_constant<size_t, I>,
-                  ::std::ostream *os) {
+void PrintTupleTo(const T& t, std::integral_constant<size_t, I>,
+                  ::std::ostream* os) {
   PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
   GTEST_INTENTIONAL_CONST_COND_PUSH_()
   if (I > 1) {
@@ -634,7 +649,7 @@ void PrintTupleTo(const T &t, std::integral_constant<size_t, I>,
 }
 
 template <typename... Types>
-void PrintTo(const ::std::tuple<Types...> &t, ::std::ostream *os) {
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
   *os << "(";
   PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
   *os << ")";
@@ -642,7 +657,7 @@ void PrintTo(const ::std::tuple<Types...> &t, ::std::ostream *os) {
 
 // Overload for std::pair.
 template <typename T1, typename T2>
-void PrintTo(const ::std::pair<T1, T2> &value, ::std::ostream *os) {
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
   *os << '(';
   // We cannot use UniversalPrint(value.first, os) here, as T1 may be
   // a reference type.  The same for printing value.second.
@@ -664,7 +679,7 @@ class UniversalPrinter {
   // Note: we deliberately don't call this PrintTo(), as that name
   // conflicts with ::testing::internal::PrintTo in the body of the
   // function.
-  static void Print(const T &value, ::std::ostream *os) {
+  static void Print(const T& value, ::std::ostream* os) {
     // By default, ::testing::internal::PrintTo() is used for printing
     // the value.
     //
@@ -679,14 +694,46 @@ class UniversalPrinter {
   GTEST_DISABLE_MSC_WARNINGS_POP_()
 };
 
-#if GTEST_HAS_ABSL
+// Remove any const-qualifiers before passing a type to UniversalPrinter.
+template <typename T>
+class UniversalPrinter<const T> : public UniversalPrinter<T> {};
+
+#if GTEST_INTERNAL_HAS_ANY
 
-// Printer for absl::optional
+// Printer for std::any / absl::any
+
+template <>
+class UniversalPrinter<Any> {
+ public:
+  static void Print(const Any& value, ::std::ostream* os) {
+    if (value.has_value()) {
+      *os << "value of type " << GetTypeName(value);
+    } else {
+      *os << "no value";
+    }
+  }
+
+ private:
+  static std::string GetTypeName(const Any& value) {
+#if GTEST_HAS_RTTI
+    return internal::GetTypeName(value.type());
+#else
+    static_cast<void>(value);  // possibly unused
+    return "<unknown_type>";
+#endif  // GTEST_HAS_RTTI
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_ANY
+
+#if GTEST_INTERNAL_HAS_OPTIONAL
+
+// Printer for std::optional / absl::optional
 
 template <typename T>
-class UniversalPrinter<::absl::optional<T>> {
+class UniversalPrinter<Optional<T>> {
  public:
-  static void Print(const ::absl::optional<T> &value, ::std::ostream *os) {
+  static void Print(const Optional<T>& value, ::std::ostream* os) {
     *os << '(';
     if (!value) {
       *os << "nullopt";
@@ -697,34 +744,44 @@ class UniversalPrinter<::absl::optional<T>> {
   }
 };
 
-// Printer for absl::variant
+#endif  // GTEST_INTERNAL_HAS_OPTIONAL
+
+#if GTEST_INTERNAL_HAS_VARIANT
+
+// Printer for std::variant / absl::variant
 
 template <typename... T>
-class UniversalPrinter<::absl::variant<T...>> {
+class UniversalPrinter<Variant<T...>> {
  public:
-  static void Print(const ::absl::variant<T...> &value, ::std::ostream *os) {
+  static void Print(const Variant<T...>& value, ::std::ostream* os) {
     *os << '(';
-    absl::visit(Visitor{ os }, value);
+#if GTEST_HAS_ABSL
+    absl::visit(Visitor{os, value.index()}, value);
+#else
+    std::visit(Visitor{os, value.index()}, value);
+#endif  // GTEST_HAS_ABSL
     *os << ')';
   }
 
  private:
   struct Visitor {
     template <typename U>
-    void operator()(const U &u) const {
-      *os << "'" << GetTypeName<U>() << "' with value ";
+    void operator()(const U& u) const {
+      *os << "'" << GetTypeName<U>() << "(index = " << index
+          << ")' with value ";
       UniversalPrint(u, os);
     }
-    ::std::ostream *os;
+    ::std::ostream* os;
+    std::size_t index;
   };
 };
 
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_VARIANT
 
 // UniversalPrintArray(begin, len, os) prints an array of 'len'
 // elements, starting at address 'begin'.
 template <typename T>
-void UniversalPrintArray(const T *begin, size_t len, ::std::ostream *os) {
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
   if (len == 0) {
     *os << "{}";
   } else {
@@ -745,12 +802,26 @@ void UniversalPrintArray(const T *begin, size_t len, ::std::ostream *os) {
   }
 }
 // This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(const char *begin, size_t len,
-                                    ::std::ostream *os);
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+#ifdef __cpp_char8_t
+// This overload prints a (const) char8_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
+                                    ::std::ostream* os);
+#endif
+
+// This overload prints a (const) char16_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// This overload prints a (const) char32_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
+                                    ::std::ostream* os);
 
 // This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(const wchar_t *begin, size_t len,
-                                    ::std::ostream *os);
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
 
 // Implements printing an array type T[N].
 template <typename T, size_t N>
@@ -758,23 +829,23 @@ class UniversalPrinter<T[N]> {
  public:
   // Prints the given array, omitting some elements when there are too
   // many.
-  static void Print(const T (&a)[N], ::std::ostream *os) {
+  static void Print(const T (&a)[N], ::std::ostream* os) {
     UniversalPrintArray(a, N, os);
   }
 };
 
 // Implements printing a reference type T&.
 template <typename T>
-class UniversalPrinter<T &> {
+class UniversalPrinter<T&> {
  public:
   // MSVC warns about adding const to a function type, so we want to
   // disable the warning.
   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
 
-  static void Print(const T &value, ::std::ostream *os) {
+  static void Print(const T& value, ::std::ostream* os) {
     // Prints the address of the value.  We use reinterpret_cast here
     // as static_cast doesn't compile when T is a function type.
-    *os << "@" << reinterpret_cast<const void *>(&value) << " ";
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
 
     // Then prints the value itself.
     UniversalPrint(value, os);
@@ -790,28 +861,28 @@ class UniversalPrinter<T &> {
 template <typename T>
 class UniversalTersePrinter {
  public:
-  static void Print(const T &value, ::std::ostream *os) {
+  static void Print(const T& value, ::std::ostream* os) {
     UniversalPrint(value, os);
   }
 };
 template <typename T>
-class UniversalTersePrinter<T &> {
+class UniversalTersePrinter<T&> {
  public:
-  static void Print(const T &value, ::std::ostream *os) {
+  static void Print(const T& value, ::std::ostream* os) {
     UniversalPrint(value, os);
   }
 };
 template <typename T, size_t N>
 class UniversalTersePrinter<T[N]> {
  public:
-  static void Print(const T (&value)[N], ::std::ostream *os) {
+  static void Print(const T (&value)[N], ::std::ostream* os) {
     UniversalPrinter<T[N]>::Print(value, os);
   }
 };
 template <>
-class UniversalTersePrinter<const char *> {
+class UniversalTersePrinter<const char*> {
  public:
-  static void Print(const char *str, ::std::ostream *os) {
+  static void Print(const char* str, ::std::ostream* os) {
     if (str == nullptr) {
       *os << "NULL";
     } else {
@@ -820,18 +891,61 @@ class UniversalTersePrinter<const char *> {
   }
 };
 template <>
-class UniversalTersePrinter<char *> {
+class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
+};
+
+#ifdef __cpp_char8_t
+template <>
+class UniversalTersePrinter<const char8_t*> {
+ public:
+  static void Print(const char8_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u8string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char8_t*>
+    : public UniversalTersePrinter<const char8_t*> {};
+#endif
+
+template <>
+class UniversalTersePrinter<const char16_t*> {
  public:
-  static void Print(char *str, ::std::ostream *os) {
-    UniversalTersePrinter<const char *>::Print(str, os);
+  static void Print(const char16_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u16string(str), os);
+    }
   }
 };
+template <>
+class UniversalTersePrinter<char16_t*>
+    : public UniversalTersePrinter<const char16_t*> {};
+
+template <>
+class UniversalTersePrinter<const char32_t*> {
+ public:
+  static void Print(const char32_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u32string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char32_t*>
+    : public UniversalTersePrinter<const char32_t*> {};
 
 #if GTEST_HAS_STD_WSTRING
 template <>
-class UniversalTersePrinter<const wchar_t *> {
+class UniversalTersePrinter<const wchar_t*> {
  public:
-  static void Print(const wchar_t *str, ::std::ostream *os) {
+  static void Print(const wchar_t* str, ::std::ostream* os) {
     if (str == nullptr) {
       *os << "NULL";
     } else {
@@ -842,15 +956,15 @@ class UniversalTersePrinter<const wchar_t *> {
 #endif
 
 template <>
-class UniversalTersePrinter<wchar_t *> {
+class UniversalTersePrinter<wchar_t*> {
  public:
-  static void Print(wchar_t *str, ::std::ostream *os) {
-    UniversalTersePrinter<const wchar_t *>::Print(str, os);
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
   }
 };
 
 template <typename T>
-void UniversalTersePrint(const T &value, ::std::ostream *os) {
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
   UniversalTersePrinter<T>::Print(value, os);
 }
 
@@ -859,24 +973,24 @@ void UniversalTersePrint(const T &value, ::std::ostream *os) {
 // (const) char pointer, this prints both the pointer and the
 // NUL-terminated string.
 template <typename T>
-void UniversalPrint(const T &value, ::std::ostream *os) {
+void UniversalPrint(const T& value, ::std::ostream* os) {
   // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
   // UniversalPrinter with T directly.
   typedef T T1;
   UniversalPrinter<T1>::Print(value, os);
 }
 
-typedef ::std::vector<::std::string> Strings;
+typedef ::std::vector< ::std::string> Strings;
 
-// Tersely prints the first N fields of a tuple to a string vector,
-// one element for each field.
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
 template <typename Tuple>
-void TersePrintPrefixToStrings(const Tuple &, std::integral_constant<size_t, 0>,
-                               Strings *) {}
+void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
+                               Strings*) {}
 template <typename Tuple, size_t I>
-void TersePrintPrefixToStrings(const Tuple &t,
+void TersePrintPrefixToStrings(const Tuple& t,
                                std::integral_constant<size_t, I>,
-                               Strings *strings) {
+                               Strings* strings) {
   TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
                             strings);
   ::std::stringstream ss;
@@ -888,7 +1002,7 @@ void TersePrintPrefixToStrings(const Tuple &t,
 // element for each field.  See the comment before
 // UniversalTersePrint() for how we define "tersely".
 template <typename Tuple>
-Strings UniversalTersePrintTupleFieldsToStrings(const Tuple &value) {
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
   Strings result;
   TersePrintPrefixToStrings(
       value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
@@ -898,18 +1012,8 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple &value) {
 
 }  // namespace internal
 
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-namespace internal2 {
-template <typename T>
-void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
-    const T &value, ::std::ostream *os) {
-  internal::PrintTo(internal::StringView(value), os);
-}
-}  // namespace internal2
-#endif
-
 template <typename T>
-::std::string PrintToString(const T &value) {
+::std::string PrintToString(const T& value) {
   ::std::stringstream ss;
   internal::UniversalTersePrinter<T>::Print(value, &ss);
   return ss.str();
@@ -922,4 +1026,4 @@ template <typename T>
 // declarations from this file.
 #include "gtest/internal/custom/gtest-printers.h"
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
index e263b1033f..eacef44669 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -33,8 +33,8 @@
 
 // GOOGLETEST_CM0004 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
 
 #include "gtest/gtest.h"
 
@@ -65,11 +65,11 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
   // by Google Test.  The 'result' parameter specifies where to report the
   // results. This reporter will only catch failures generated in the current
   // thread. DEPRECATED
-  explicit ScopedFakeTestPartResultReporter(TestPartResultArray *result);
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
 
   // Same as above, but you can choose the interception scope of this object.
   ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
-                                   TestPartResultArray *result);
+                                   TestPartResultArray* result);
 
   // The d'tor restores the previous test part result reporter.
   ~ScopedFakeTestPartResultReporter() override;
@@ -79,14 +79,14 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
   //
   // This method is from the TestPartResultReporterInterface
   // interface.
-  void ReportTestPartResult(const TestPartResult &result) override;
+  void ReportTestPartResult(const TestPartResult& result) override;
 
  private:
   void Init();
 
   const InterceptMode intercept_mode_;
-  TestPartResultReporterInterface *old_reporter_;
-  TestPartResultArray *const result_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
 };
@@ -101,12 +101,11 @@ namespace internal {
 class GTEST_API_ SingleFailureChecker {
  public:
   // The constructor remembers the arguments.
-  SingleFailureChecker(const TestPartResultArray *results,
-                       TestPartResult::Type type, const std::string &substr);
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type, const std::string& substr);
   ~SingleFailureChecker();
-
  private:
-  const TestPartResultArray *const results_;
+  const TestPartResultArray* const results_;
   const TestPartResult::Type type_;
   const std::string substr_;
 
@@ -142,39 +141,38 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // helper macro, due to some peculiarity in how the preprocessor
 // works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
 // gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr)                               \
-  do {                                                                        \
-    class GTestExpectFatalFailureHelper {                                     \
-     public:                                                                  \
-      static void Execute() { statement; }                                    \
-    };                                                                        \
-    ::testing::TestPartResultArray gtest_failures;                            \
-    ::testing::internal::SingleFailureChecker gtest_checker(                  \
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
-    {                                                                         \
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
-          ::testing::ScopedFakeTestPartResultReporter::                       \
-              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
-          &gtest_failures);                                                   \
-      GTestExpectFatalFailureHelper::Execute();                               \
-    }                                                                         \
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
-  do {                                                                        \
-    class GTestExpectFatalFailureHelper {                                     \
-     public:                                                                  \
-      static void Execute() { statement; }                                    \
-    };                                                                        \
-    ::testing::TestPartResultArray gtest_failures;                            \
-    ::testing::internal::SingleFailureChecker gtest_checker(                  \
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
-    {                                                                         \
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
-          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);                                                   \
-      GTestExpectFatalFailureHelper::Execute();                               \
-    }                                                                         \
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
   } while (::testing::internal::AlwaysFalse())
 
 // A macro for testing Google Test assertions or code that's expected to
@@ -209,37 +207,32 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // instead of
 //   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 // to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
-  do {                                                                \
-    ::testing::TestPartResultArray gtest_failures;                    \
-    ::testing::internal::SingleFailureChecker gtest_checker(          \
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
         &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));                                                    \
-    {                                                                 \
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
-          ::testing::ScopedFakeTestPartResultReporter::               \
-              INTERCEPT_ONLY_CURRENT_THREAD,                          \
-          &gtest_failures);                                           \
-      if (::testing::internal::AlwaysTrue()) {                        \
-        statement;                                                    \
-      }                                                               \
-    }                                                                 \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
-  do {                                                                        \
-    ::testing::TestPartResultArray gtest_failures;                            \
-    ::testing::internal::SingleFailureChecker gtest_checker(                  \
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
-        (substr));                                                            \
-    {                                                                         \
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
           ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);                                                   \
-      if (::testing::internal::AlwaysTrue()) {                                \
-        statement;                                                            \
-      }                                                                       \
-    }                                                                         \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
   } while (::testing::internal::AlwaysFalse())
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
index a28afb309b..203fdf98c6 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -29,8 +29,8 @@
 //
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
 
 #include <iosfwd>
 #include <vector>
@@ -60,10 +60,12 @@ class GTEST_API_ TestPartResult {
   // C'tor.  TestPartResult does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestPartResult object.
-  TestPartResult(Type a_type, const char *a_file_name, int a_line_number,
-                 const char *a_message)
-      : type_(a_type), file_name_(a_file_name == nullptr ? "" : a_file_name),
-        line_number_(a_line_number), summary_(ExtractSummary(a_message)),
+  TestPartResult(Type a_type, const char* a_file_name, int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == nullptr ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message)),
         message_(a_message) {}
 
   // Gets the outcome of the test part.
@@ -71,7 +73,7 @@ class GTEST_API_ TestPartResult {
 
   // Gets the name of the source file where the test part took place, or
   // NULL if it's unknown.
-  const char *file_name() const {
+  const char* file_name() const {
     return file_name_.empty() ? nullptr : file_name_.c_str();
   }
 
@@ -80,10 +82,10 @@ class GTEST_API_ TestPartResult {
   int line_number() const { return line_number_; }
 
   // Gets the summary of the failure message.
-  const char *summary() const { return summary_.c_str(); }
+  const char* summary() const { return summary_.c_str(); }
 
   // Gets the message associated with the test part.
-  const char *message() const { return message_.c_str(); }
+  const char* message() const { return message_.c_str(); }
 
   // Returns true if and only if the test part was skipped.
   bool skipped() const { return type_ == kSkip; }
@@ -105,7 +107,7 @@ class GTEST_API_ TestPartResult {
 
   // Gets the summary of the failure message by omitting the stack
   // trace in it.
-  static std::string ExtractSummary(const char *message);
+  static std::string ExtractSummary(const char* message);
 
   // The name of the source file where the test part took place, or
   // "" if the source file is unknown.
@@ -118,7 +120,7 @@ class GTEST_API_ TestPartResult {
 };
 
 // Prints a TestPartResult object.
-std::ostream &operator<<(std::ostream &os, const TestPartResult &result);
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
 
 // An array of TestPartResult objects.
 //
@@ -129,10 +131,10 @@ class GTEST_API_ TestPartResultArray {
   TestPartResultArray() {}
 
   // Appends the given TestPartResult to the array.
-  void Append(const TestPartResult &result);
+  void Append(const TestPartResult& result);
 
   // Returns the TestPartResult at the given index (0-based).
-  const TestPartResult &GetTestPartResult(int index) const;
+  const TestPartResult& GetTestPartResult(int index) const;
 
   // Returns the number of TestPartResult objects in the array.
   int size() const;
@@ -148,7 +150,7 @@ class GTEST_API_ TestPartResultReporterInterface {
  public:
   virtual ~TestPartResultReporterInterface() {}
 
-  virtual void ReportTestPartResult(const TestPartResult &result) = 0;
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
 };
 
 namespace internal {
@@ -164,12 +166,11 @@ class GTEST_API_ HasNewFatalFailureHelper
  public:
   HasNewFatalFailureHelper();
   ~HasNewFatalFailureHelper() override;
-  void ReportTestPartResult(const TestPartResult &result) override;
+  void ReportTestPartResult(const TestPartResult& result) override;
   bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
-
  private:
   bool has_new_fatal_failure_;
-  TestPartResultReporterInterface *original_reporter_;
+  TestPartResultReporterInterface* original_reporter_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
 };
@@ -180,4 +181,4 @@ class GTEST_API_ HasNewFatalFailureHelper
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
index f5afc4db87..9fdc6be10d 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
@@ -29,8 +29,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
 
 // This header implements typed tests and type-parameterized tests.
 
@@ -175,8 +175,6 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
 
 // Implements typed tests.
 
-#if GTEST_HAS_TYPED_TEST
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the name of the typedef for the type parameters of the
@@ -230,12 +228,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   TYPED_TEST_SUITE
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST
-
 // Implements type-parameterized tests.
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the namespace name that the type-parameterized tests for
@@ -294,7 +288,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
     typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;    \
   }                                                                         \
-  static const char *const GTEST_REGISTERED_TEST_NAMES_(                    \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(                    \
       SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
       GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
           GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
@@ -307,21 +301,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   REGISTER_TYPED_TEST_SUITE_P
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
-  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
-                "test-suit-prefix must not be empty");                    \
-  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
-      ::testing::internal::TypeParameterizedTestSuite<                    \
-          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
-          ::testing::internal::GenerateTypeList<Types>::type>::           \
-          Register(GTEST_STRINGIFY_(Prefix),                              \
-                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
-                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
-                   GTEST_STRINGIFY_(SuiteName),                           \
-                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
-                   ::testing::internal::GenerateNames<                    \
-                       ::testing::internal::NameGeneratorSelector<        \
-                           __VA_ARGS__>::type,                            \
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)       \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                       \
+                "test-suit-prefix must not be empty");                      \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =        \
+      ::testing::internal::TypeParameterizedTestSuite<                      \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,    \
+          ::testing::internal::GenerateTypeList<Types>::type>::             \
+          Register(GTEST_STRINGIFY_(Prefix),                                \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__),   \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),             \
+                   GTEST_STRINGIFY_(SuiteName),                             \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),                 \
+                   ::testing::internal::GenerateNames<                      \
+                       ::testing::internal::NameGeneratorSelector<          \
+                           __VA_ARGS__>::type,                              \
                        ::testing::internal::GenerateTypeList<Types>::type>())
 
 // Legacy API is deprecated but still available
@@ -332,6 +326,4 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   INSTANTIATE_TYPED_TEST_SUITE_P
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
index 8fd7eea1e7..7a5d057c4a 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -49,8 +49,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
 
 #include <cstddef>
 #include <limits>
@@ -78,11 +78,12 @@ namespace testing {
 // Silence C4100 (unreferenced formal parameter) and 4805
 // unsafe mix of type 'const int' and type 'const bool'
 #ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4805)
-#pragma warning(disable : 4100)
+# pragma warning(push)
+# pragma warning(disable:4805)
+# pragma warning(disable:4100)
 #endif
 
+
 // Declares the flags.
 
 // This flag temporary enables the disabled tests.
@@ -100,6 +101,10 @@ GTEST_DECLARE_bool_(catch_exceptions);
 // to let Google Test decide.
 GTEST_DECLARE_string_(color);
 
+// This flag controls whether the test runner should continue execution past
+// first failure.
+GTEST_DECLARE_bool_(fail_fast);
+
 // This flag sets up the filter to select by name using a glob pattern
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
@@ -116,6 +121,9 @@ GTEST_DECLARE_bool_(list_tests);
 // in addition to its normal textual output.
 GTEST_DECLARE_string_(output);
 
+// This flags control whether Google Test prints only test failures.
+GTEST_DECLARE_bool_(brief);
+
 // This flags control whether Google Test prints the elapsed time for each
 // test.
 GTEST_DECLARE_bool_(print_time);
@@ -173,10 +181,10 @@ class TestEventRepeater;
 class UnitTestRecordPropertyTestHelper;
 class WindowsDeathTest;
 class FuchsiaDeathTest;
-class UnitTestImpl *GetUnitTestImpl();
+class UnitTestImpl* GetUnitTestImpl();
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string &message);
-std::set<std::string> *GetIgnoredParameterizedTestSuites();
+                                    const std::string& message);
+std::set<std::string>* GetIgnoredParameterizedTestSuites();
 
 }  // namespace internal
 
@@ -276,7 +284,7 @@ class GTEST_API_ AssertionResult {
  public:
   // Copy constructor.
   // Used in EXPECT_TRUE/FALSE(assertion_result).
-  AssertionResult(const AssertionResult &other);
+  AssertionResult(const AssertionResult& other);
 
 // C4800 is a level 3 warning in Visual Studio 2015 and earlier.
 // This warning is not emitted in Visual Studio 2017.
@@ -295,9 +303,9 @@ class GTEST_API_ AssertionResult {
   // we want AssertionResult's copy constructor to be used.
   template <typename T>
   explicit AssertionResult(
-      const T &success,
+      const T& success,
       typename std::enable_if<
-          !std::is_convertible<T, AssertionResult>::value>::type *
+          !std::is_convertible<T, AssertionResult>::value>::type*
       /*enabler*/
       = nullptr)
       : success_(success) {}
@@ -307,7 +315,7 @@ class GTEST_API_ AssertionResult {
 #endif
 
   // Assignment operator.
-  AssertionResult &operator=(AssertionResult other) {
+  AssertionResult& operator=(AssertionResult other) {
     swap(other);
     return *this;
   }
@@ -322,36 +330,35 @@ class GTEST_API_ AssertionResult {
   // use it when they fail (i.e., the predicate's outcome doesn't match the
   // assertion's expectation). When nothing has been streamed into the
   // object, returns an empty string.
-  const char *message() const {
+  const char* message() const {
     return message_.get() != nullptr ? message_->c_str() : "";
   }
   // Deprecated; please use message() instead.
-  const char *failure_message() const { return message(); }
+  const char* failure_message() const { return message(); }
 
   // Streams a custom failure message into this object.
-  template <typename T>
-  AssertionResult &operator<<(const T &value) {
+  template <typename T> AssertionResult& operator<<(const T& value) {
     AppendMessage(Message() << value);
     return *this;
   }
 
   // Allows streaming basic output manipulators such as endl or flush into
   // this object.
-  AssertionResult &operator<<(
-      ::std::ostream &(*basic_manipulator)(::std::ostream &stream)) {
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
     AppendMessage(Message() << basic_manipulator);
     return *this;
   }
 
  private:
   // Appends the contents of message to message_.
-  void AppendMessage(const Message &a_message) {
+  void AppendMessage(const Message& a_message) {
     if (message_.get() == nullptr) message_.reset(new ::std::string);
     message_->append(a_message.GetString().c_str());
   }
 
   // Swap the contents of this AssertionResult with other.
-  void swap(AssertionResult &other);
+  void swap(AssertionResult& other);
 
   // Stores result of the assertion predicate.
   bool success_;
@@ -370,7 +377,7 @@ GTEST_API_ AssertionResult AssertionFailure();
 
 // Makes a failed assertion result with the given failure message.
 // Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message &msg);
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
 
 }  // namespace testing
 
@@ -411,10 +418,10 @@ class GTEST_API_ Test {
   // The d'tor is virtual as we intend to inherit from Test.
   virtual ~Test();
 
-  // Sets up the stuff shared by all tests in this test case.
+  // Sets up the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::SetUpTestSuite() before running the first
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // SetUpTestSuite() method to shadow the one defined in the super
   // class.
   static void SetUpTestSuite() {}
@@ -422,12 +429,13 @@ class GTEST_API_ Test {
   // Tears down the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::TearDownTestSuite() after running the last
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // TearDownTestSuite() method to shadow the one defined in the super
   // class.
   static void TearDownTestSuite() {}
 
-  // Legacy API is deprecated but still available
+  // Legacy API is deprecated but still available. Use SetUpTestSuite and
+  // TearDownTestSuite instead.
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   static void TearDownTestCase() {}
   static void SetUpTestCase() {}
@@ -459,8 +467,8 @@ class GTEST_API_ Test {
   // global context (before or after invocation of RUN_ALL_TESTS and from
   // SetUp/TearDown method of Environment objects registered with Google
   // Test) will be output as attributes of the <testsuites> element.
-  static void RecordProperty(const std::string &key, const std::string &value);
-  static void RecordProperty(const std::string &key, int value);
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
 
  protected:
   // Creates a Test object.
@@ -511,7 +519,7 @@ class GTEST_API_ Test {
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
+  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
 
   // We disallow copying Tests.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
@@ -528,17 +536,24 @@ class TestProperty {
   // C'tor.  TestProperty does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestProperty object.
-  TestProperty(const std::string &a_key, const std::string &a_value)
-      : key_(a_key), value_(a_value) {}
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
 
   // Gets the user supplied key.
-  const char *key() const { return key_.c_str(); }
+  const char* key() const {
+    return key_.c_str();
+  }
 
   // Gets the user supplied value.
-  const char *value() const { return value_.c_str(); }
+  const char* value() const {
+    return value_.c_str();
+  }
 
   // Sets a new value, overriding the one supplied in the constructor.
-  void SetValue(const std::string &new_value) { value_ = new_value; }
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
 
  private:
   // The key supplied by the user.
@@ -592,12 +607,12 @@ class GTEST_API_ TestResult {
 
   // Returns the i-th test part result among all the results. i can range from 0
   // to total_part_count() - 1. If i is not in that range, aborts the program.
-  const TestPartResult &GetTestPartResult(int i) const;
+  const TestPartResult& GetTestPartResult(int i) const;
 
   // Returns the i-th test property. i can range from 0 to
   // test_property_count() - 1. If i is not in that range, aborts the
   // program.
-  const TestProperty &GetTestProperty(int i) const;
+  const TestProperty& GetTestProperty(int i) const;
 
  private:
   friend class TestInfo;
@@ -611,12 +626,12 @@ class GTEST_API_ TestResult {
   friend class internal::FuchsiaDeathTest;
 
   // Gets the vector of TestPartResults.
-  const std::vector<TestPartResult> &test_part_results() const {
+  const std::vector<TestPartResult>& test_part_results() const {
     return test_part_results_;
   }
 
   // Gets the vector of TestProperties.
-  const std::vector<TestProperty> &test_properties() const {
+  const std::vector<TestProperty>& test_properties() const {
     return test_properties_;
   }
 
@@ -632,17 +647,17 @@ class GTEST_API_ TestResult {
   // value will be updated, rather than storing multiple values for the same
   // key.  xml_element specifies the element for which the property is being
   // recorded and is used for validation.
-  void RecordProperty(const std::string &xml_element,
-                      const TestProperty &test_property);
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
 
   // Adds a failure if the key is a reserved attribute of Google Test
   // testsuite tags.  Returns true if the property is valid.
   // FIXME: Validate attribute names are legal and human readable.
-  static bool ValidateTestProperty(const std::string &xml_element,
-                                   const TestProperty &test_property);
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
 
   // Adds a test part result to the list.
-  void AddTestPartResult(const TestPartResult &test_part_result);
+  void AddTestPartResult(const TestPartResult& test_part_result);
 
   // Returns the death test count.
   int death_test_count() const { return death_test_count_; }
@@ -658,7 +673,7 @@ class GTEST_API_ TestResult {
 
   // Protects mutable state of the property vector and of owned
   // properties, whose values may be updated.
-  internal::Mutex test_properites_mutex_;
+  internal::Mutex test_properties_mutex_;
 
   // The vector of TestPartResults
   std::vector<TestPartResult> test_part_results_;
@@ -693,32 +708,32 @@ class GTEST_API_ TestInfo {
   ~TestInfo();
 
   // Returns the test suite name.
-  const char *test_suite_name() const { return test_suite_name_.c_str(); }
+  const char* test_suite_name() const { return test_suite_name_.c_str(); }
 
 // Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const char *test_case_name() const { return test_suite_name(); }
+  const char* test_case_name() const { return test_suite_name(); }
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the test name.
-  const char *name() const { return name_.c_str(); }
+  const char* name() const { return name_.c_str(); }
 
   // Returns the name of the parameter type, or NULL if this is not a typed
   // or a type-parameterized test.
-  const char *type_param() const {
+  const char* type_param() const {
     if (type_param_.get() != nullptr) return type_param_->c_str();
     return nullptr;
   }
 
   // Returns the text representation of the value parameter, or NULL if this
   // is not a value-parameterized test.
-  const char *value_param() const {
+  const char* value_param() const {
     if (value_param_.get() != nullptr) return value_param_->c_str();
     return nullptr;
   }
 
   // Returns the file name where this test is defined.
-  const char *file() const { return location_.file.c_str(); }
+  const char* file() const { return location_.file.c_str(); }
 
   // Returns the line where this test is defined.
   int line() const { return location_.line; }
@@ -752,7 +767,7 @@ class GTEST_API_ TestInfo {
   }
 
   // Returns the result of the test.
-  const TestResult *result() const { return &result_; }
+  const TestResult* result() const { return &result_; }
 
  private:
 #if GTEST_HAS_DEATH_TEST
@@ -762,21 +777,21 @@ class GTEST_API_ TestInfo {
   friend class TestSuite;
   friend class internal::UnitTestImpl;
   friend class internal::StreamingListenerTest;
-  friend TestInfo *internal::MakeAndRegisterTestInfo(
-      const char *test_suite_name, const char *name, const char *type_param,
-      const char *value_param, internal::CodeLocation code_location,
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_suite_name, const char* name, const char* type_param,
+      const char* value_param, internal::CodeLocation code_location,
       internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
       internal::TearDownTestSuiteFunc tear_down_tc,
-      internal::TestFactoryBase *factory);
+      internal::TestFactoryBase* factory);
 
   // Constructs a TestInfo object. The newly constructed instance assumes
   // ownership of the factory object.
-  TestInfo(const std::string &test_suite_name, const std::string &name,
-           const char *a_type_param,   // NULL if not a type-parameterized test
-           const char *a_value_param,  // NULL if not a value-parameterized test
+  TestInfo(const std::string& test_suite_name, const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
            internal::CodeLocation a_code_location,
            internal::TypeId fixture_class_id,
-           internal::TestFactoryBase *factory);
+           internal::TestFactoryBase* factory);
 
   // Increments the number of death tests encountered in this test so
   // far.
@@ -788,13 +803,16 @@ class GTEST_API_ TestInfo {
   // deletes it.
   void Run();
 
-  static void ClearTestResult(TestInfo *test_info) {
+  // Skip and records the test result for this object.
+  void Skip();
+
+  static void ClearTestResult(TestInfo* test_info) {
     test_info->result_.Clear();
   }
 
   // These fields are immutable properties of the test.
-  const std::string test_suite_name_;  // test suite name
-  const std::string name_;             // Test name
+  const std::string test_suite_name_;    // test suite name
+  const std::string name_;               // Test name
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
   const std::unique_ptr<const ::std::string> type_param_;
@@ -808,7 +826,7 @@ class GTEST_API_ TestInfo {
   bool matches_filter_;       // True if this test matches the
                               // user-specified filter.
   bool is_in_another_shard_;  // Will be run in another shard.
-  internal::TestFactoryBase *const factory_;  // The factory that creates
+  internal::TestFactoryBase* const factory_;  // The factory that creates
                                               // the test object
 
   // This field is mutable and needs to be reset before running the
@@ -835,7 +853,7 @@ class GTEST_API_ TestSuite {
   //                 this is not a type-parameterized test.
   //   set_up_tc:    pointer to the function that sets up the test suite
   //   tear_down_tc: pointer to the function that tears down the test suite
-  TestSuite(const char *name, const char *a_type_param,
+  TestSuite(const char* name, const char* a_type_param,
             internal::SetUpTestSuiteFunc set_up_tc,
             internal::TearDownTestSuiteFunc tear_down_tc);
 
@@ -843,11 +861,11 @@ class GTEST_API_ TestSuite {
   virtual ~TestSuite();
 
   // Gets the name of the TestSuite.
-  const char *name() const { return name_.c_str(); }
+  const char* name() const { return name_.c_str(); }
 
   // Returns the name of the parameter type, or NULL if this is not a
   // type-parameterized test suite.
-  const char *type_param() const {
+  const char* type_param() const {
     if (type_param_.get() != nullptr) return type_param_->c_str();
     return nullptr;
   }
@@ -896,46 +914,49 @@ class GTEST_API_ TestSuite {
 
   // Returns the i-th test among all the tests. i can range from 0 to
   // total_test_count() - 1. If i is not in that range, returns NULL.
-  const TestInfo *GetTestInfo(int i) const;
+  const TestInfo* GetTestInfo(int i) const;
 
   // Returns the TestResult that holds test properties recorded during
   // execution of SetUpTestSuite and TearDownTestSuite.
-  const TestResult &ad_hoc_test_result() const { return ad_hoc_test_result_; }
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
 
  private:
   friend class Test;
   friend class internal::UnitTestImpl;
 
   // Gets the (mutable) vector of TestInfos in this TestSuite.
-  std::vector<TestInfo *> &test_info_list() { return test_info_list_; }
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
 
   // Gets the (immutable) vector of TestInfos in this TestSuite.
-  const std::vector<TestInfo *> &test_info_list() const {
+  const std::vector<TestInfo*>& test_info_list() const {
     return test_info_list_;
   }
 
   // Returns the i-th test among all the tests. i can range from 0 to
   // total_test_count() - 1. If i is not in that range, returns NULL.
-  TestInfo *GetMutableTestInfo(int i);
+  TestInfo* GetMutableTestInfo(int i);
 
   // Sets the should_run member.
   void set_should_run(bool should) { should_run_ = should; }
 
   // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
   // destruction of the TestSuite object.
-  void AddTestInfo(TestInfo *test_info);
+  void AddTestInfo(TestInfo * test_info);
 
   // Clears the results of all tests in this test suite.
   void ClearResult();
 
   // Clears the results of all tests in the given test suite.
-  static void ClearTestSuiteResult(TestSuite *test_suite) {
+  static void ClearTestSuiteResult(TestSuite* test_suite) {
     test_suite->ClearResult();
   }
 
   // Runs every test in this TestSuite.
   void Run();
 
+  // Skips the execution of tests under this TestSuite
+  void Skip();
+
   // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
   // for catching exceptions thrown from SetUpTestSuite().
   void RunSetUpTestSuite() {
@@ -953,43 +974,43 @@ class GTEST_API_ TestSuite {
   }
 
   // Returns true if and only if test passed.
-  static bool TestPassed(const TestInfo *test_info) {
+  static bool TestPassed(const TestInfo* test_info) {
     return test_info->should_run() && test_info->result()->Passed();
   }
 
   // Returns true if and only if test skipped.
-  static bool TestSkipped(const TestInfo *test_info) {
+  static bool TestSkipped(const TestInfo* test_info) {
     return test_info->should_run() && test_info->result()->Skipped();
   }
 
   // Returns true if and only if test failed.
-  static bool TestFailed(const TestInfo *test_info) {
+  static bool TestFailed(const TestInfo* test_info) {
     return test_info->should_run() && test_info->result()->Failed();
   }
 
   // Returns true if and only if the test is disabled and will be reported in
   // the XML report.
-  static bool TestReportableDisabled(const TestInfo *test_info) {
+  static bool TestReportableDisabled(const TestInfo* test_info) {
     return test_info->is_reportable() && test_info->is_disabled_;
   }
 
   // Returns true if and only if test is disabled.
-  static bool TestDisabled(const TestInfo *test_info) {
+  static bool TestDisabled(const TestInfo* test_info) {
     return test_info->is_disabled_;
   }
 
   // Returns true if and only if this test will appear in the XML report.
-  static bool TestReportable(const TestInfo *test_info) {
+  static bool TestReportable(const TestInfo* test_info) {
     return test_info->is_reportable();
   }
 
   // Returns true if the given test should run.
-  static bool ShouldRunTest(const TestInfo *test_info) {
+  static bool ShouldRunTest(const TestInfo* test_info) {
     return test_info->should_run();
   }
 
   // Shuffles the tests in this test suite.
-  void ShuffleTests(internal::Random *random);
+  void ShuffleTests(internal::Random* random);
 
   // Restores the test order to before the first shuffle.
   void UnshuffleTests();
@@ -1001,7 +1022,7 @@ class GTEST_API_ TestSuite {
   const std::unique_ptr<const ::std::string> type_param_;
   // The vector of TestInfos in their original order.  It owns the
   // elements in the vector.
-  std::vector<TestInfo *> test_info_list_;
+  std::vector<TestInfo*> test_info_list_;
   // Provides a level of indirection for the test list to allow easy
   // shuffling and restoring the test order.  The i-th element in this
   // vector is the index of the i-th test in the shuffled test list.
@@ -1048,12 +1069,11 @@ class Environment {
 
   // Override this to define how to tear down the environment.
   virtual void TearDown() {}
-
  private:
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
+  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
 };
 
 #if GTEST_HAS_EXCEPTIONS
@@ -1062,7 +1082,7 @@ class Environment {
 class GTEST_API_ AssertionException
     : public internal::GoogleTestFailureException {
  public:
-  explicit AssertionException(const TestPartResult &result)
+  explicit AssertionException(const TestPartResult& result)
       : GoogleTestFailureException(result) {}
 };
 
@@ -1075,58 +1095,59 @@ class TestEventListener {
   virtual ~TestEventListener() {}
 
   // Fired before any test activity starts.
-  virtual void OnTestProgramStart(const UnitTest &unit_test) = 0;
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
 
   // Fired before each iteration of tests starts.  There may be more than
   // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
   // index, starting from 0.
-  virtual void OnTestIterationStart(const UnitTest &unit_test,
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
                                     int iteration) = 0;
 
   // Fired before environment set-up for each iteration of tests starts.
-  virtual void OnEnvironmentsSetUpStart(const UnitTest &unit_test) = 0;
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
 
   // Fired after environment set-up for each iteration of tests ends.
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) = 0;
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
 
   // Fired before the test suite starts.
-  virtual void OnTestSuiteStart(const TestSuite & /*test_suite*/) {}
+  virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {}
 
   //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  virtual void OnTestCaseStart(const TestCase & /*test_case*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before the test starts.
-  virtual void OnTestStart(const TestInfo &test_info) = 0;
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
 
   // Fired after a failed assertion or a SUCCEED() invocation.
   // If you want to throw an exception from this function to skip to the next
   // TEST, it must be AssertionException defined above, or inherited from it.
-  virtual void OnTestPartResult(const TestPartResult &test_part_result) = 0;
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
 
   // Fired after the test ends.
-  virtual void OnTestEnd(const TestInfo &test_info) = 0;
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
 
   // Fired after the test suite ends.
-  virtual void OnTestSuiteEnd(const TestSuite & /*test_suite*/) {}
+  virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {}
 
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  virtual void OnTestCaseEnd(const TestCase & /*test_case*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before environment tear-down for each iteration of tests starts.
-  virtual void OnEnvironmentsTearDownStart(const UnitTest &unit_test) = 0;
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
 
   // Fired after environment tear-down for each iteration of tests ends.
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) = 0;
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
 
   // Fired after each iteration of tests finishes.
-  virtual void OnTestIterationEnd(const UnitTest &unit_test, int iteration) = 0;
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
 
   // Fired after all test activities have ended.
-  virtual void OnTestProgramEnd(const UnitTest &unit_test) = 0;
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
 };
 
 // The convenience class for users who need to override just one or two
@@ -1136,30 +1157,30 @@ class TestEventListener {
 // above.
 class EmptyTestEventListener : public TestEventListener {
  public:
-  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
-  void OnTestIterationStart(const UnitTest & /*unit_test*/,
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
                             int /*iteration*/) override {}
-  void OnEnvironmentsSetUpStart(const UnitTest & /*unit_test*/) override {}
-  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
-  void OnTestSuiteStart(const TestSuite & /*test_suite*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseStart(const TestCase & /*test_case*/) override {}
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-  void OnTestStart(const TestInfo & /*test_info*/) override {}
-  void OnTestPartResult(const TestPartResult & /*test_part_result*/) override {}
-  void OnTestEnd(const TestInfo & /*test_info*/) override {}
-  void OnTestSuiteEnd(const TestSuite & /*test_suite*/) override {}
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
+  void OnTestEnd(const TestInfo& /*test_info*/) override {}
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseEnd(const TestCase & /*test_case*/) override {}
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-  void OnEnvironmentsTearDownStart(const UnitTest & /*unit_test*/) override {}
-  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
-  void OnTestIterationEnd(const UnitTest & /*unit_test*/,
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& /*unit_test*/,
                           int /*iteration*/) override {}
-  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
 };
 
 // TestEventListeners lets users add listeners to track events in Google Test.
@@ -1171,19 +1192,19 @@ class GTEST_API_ TestEventListeners {
   // Appends an event listener to the end of the list. Google Test assumes
   // the ownership of the listener (i.e. it will delete the listener when
   // the test program finishes).
-  void Append(TestEventListener *listener);
+  void Append(TestEventListener* listener);
 
   // Removes the given event listener from the list and returns it.  It then
   // becomes the caller's responsibility to delete the listener. Returns
   // NULL if the listener is not found in the list.
-  TestEventListener *Release(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
 
   // Returns the standard listener responsible for the default console
   // output.  Can be removed from the listeners list to shut down default
   // console output.  Note that removing this object from the listener list
   // with Release transfers its ownership to the caller and makes this
   // function return NULL the next time.
-  TestEventListener *default_result_printer() const {
+  TestEventListener* default_result_printer() const {
     return default_result_printer_;
   }
 
@@ -1194,7 +1215,7 @@ class GTEST_API_ TestEventListeners {
   // removing this object from the listener list with Release transfers its
   // ownership to the caller and makes this function return NULL the next
   // time.
-  TestEventListener *default_xml_generator() const {
+  TestEventListener* default_xml_generator() const {
     return default_xml_generator_;
   }
 
@@ -1208,21 +1229,21 @@ class GTEST_API_ TestEventListeners {
 
   // Returns repeater that broadcasts the TestEventListener events to all
   // subscribers.
-  TestEventListener *repeater();
+  TestEventListener* repeater();
 
   // Sets the default_result_printer attribute to the provided listener.
   // The listener is also added to the listener list and previous
   // default_result_printer is removed from it and deleted. The listener can
   // also be NULL in which case it will not be added to the list. Does
   // nothing if the previous and the current listener objects are the same.
-  void SetDefaultResultPrinter(TestEventListener *listener);
+  void SetDefaultResultPrinter(TestEventListener* listener);
 
   // Sets the default_xml_generator attribute to the provided listener.  The
   // listener is also added to the listener list and previous
   // default_xml_generator is removed from it and deleted. The listener can
   // also be NULL in which case it will not be added to the list. Does
   // nothing if the previous and the current listener objects are the same.
-  void SetDefaultXmlGenerator(TestEventListener *listener);
+  void SetDefaultXmlGenerator(TestEventListener* listener);
 
   // Controls whether events will be forwarded by the repeater to the
   // listeners in the list.
@@ -1230,11 +1251,11 @@ class GTEST_API_ TestEventListeners {
   void SuppressEventForwarding();
 
   // The actual list of listeners.
-  internal::TestEventRepeater *repeater_;
+  internal::TestEventRepeater* repeater_;
   // Listener responsible for the standard result output.
-  TestEventListener *default_result_printer_;
+  TestEventListener* default_result_printer_;
   // Listener responsible for the creation of the XML output file.
-  TestEventListener *default_xml_generator_;
+  TestEventListener* default_xml_generator_;
 
   // We disallow copying TestEventListeners.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
@@ -1255,7 +1276,7 @@ class GTEST_API_ UnitTest {
   // Gets the singleton UnitTest object.  The first time this method
   // is called, a UnitTest object is constructed and returned.
   // Consecutive calls will return the same object.
-  static UnitTest *GetInstance();
+  static UnitTest* GetInstance();
 
   // Runs all tests in this UnitTest object and prints the result.
   // Returns 0 if successful, or 1 otherwise.
@@ -1267,20 +1288,21 @@ class GTEST_API_ UnitTest {
 
   // Returns the working directory when the first TEST() or TEST_F()
   // was executed.  The UnitTest object owns the string.
-  const char *original_working_dir() const;
+  const char* original_working_dir() const;
 
   // Returns the TestSuite object for the test that's currently running,
   // or NULL if no test is running.
-  const TestSuite *current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
 
 // Legacy API is still available but deprecated
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase *current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
 #endif
 
   // Returns the TestInfo object for the test that's currently running,
   // or NULL if no test is running.
-  const TestInfo *current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Returns the random seed used at the start of the current test run.
   int random_seed() const;
@@ -1289,7 +1311,7 @@ class GTEST_API_ UnitTest {
   // value-parameterized tests and instantiate and register them.
   //
   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry()
+  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry()
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Gets the number of successful test suites.
@@ -1354,20 +1376,20 @@ class GTEST_API_ UnitTest {
 
   // Gets the i-th test suite among all the test suites. i can range from 0 to
   // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  const TestSuite *GetTestSuite(int i) const;
+  const TestSuite* GetTestSuite(int i) const;
 
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase *GetTestCase(int i) const;
+  const TestCase* GetTestCase(int i) const;
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the TestResult containing information on test failures and
   // properties logged outside of individual test suites.
-  const TestResult &ad_hoc_test_result() const;
+  const TestResult& ad_hoc_test_result() const;
 
   // Returns the list of event listeners that can be used to track events
   // inside Google Test.
-  TestEventListeners &listeners();
+  TestEventListeners& listeners();
 
  private:
   // Registers and returns a global test environment.  When a test
@@ -1379,16 +1401,17 @@ class GTEST_API_ UnitTest {
   // The UnitTest object takes ownership of the given environment.
   //
   // This method can only be called from the main thread.
-  Environment *AddEnvironment(Environment *env);
+  Environment* AddEnvironment(Environment* env);
 
   // Adds a TestPartResult to the current TestResult object.  All
   // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
   // eventually call this to report their results.  The user code
   // should use the assertion macros instead of calling this directly.
   void AddTestPartResult(TestPartResult::Type result_type,
-                         const char *file_name, int line_number,
-                         const std::string &message,
-                         const std::string &os_stack_trace)
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Adds a TestProperty to the current TestResult object when invoked from
@@ -1396,15 +1419,15 @@ class GTEST_API_ UnitTest {
   // from SetUpTestSuite or TearDownTestSuite, or to the global property set
   // when invoked elsewhere.  If the result already contains a property with
   // the same key, the value will be updated.
-  void RecordProperty(const std::string &key, const std::string &value);
+  void RecordProperty(const std::string& key, const std::string& value);
 
   // Gets the i-th test suite among all the test suites. i can range from 0 to
   // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  TestSuite *GetMutableTestSuite(int i);
+  TestSuite* GetMutableTestSuite(int i);
 
   // Accessors for the implementation object.
-  internal::UnitTestImpl *impl() { return impl_; }
-  const internal::UnitTestImpl *impl() const { return impl_; }
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
 
   // These classes and functions are friends as they need to access private
   // members of UnitTest.
@@ -1413,11 +1436,12 @@ class GTEST_API_ UnitTest {
   friend class internal::AssertHelper;
   friend class internal::StreamingListenerTest;
   friend class internal::UnitTestRecordPropertyTestHelper;
-  friend Environment *AddGlobalTestEnvironment(Environment *env);
-  friend std::set<std::string> *internal::GetIgnoredParameterizedTestSuites();
-  friend internal::UnitTestImpl *internal::GetUnitTestImpl();
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
   friend void internal::ReportFailureInUnknownLocation(
-      TestPartResult::Type result_type, const std::string &message);
+      TestPartResult::Type result_type,
+      const std::string& message);
 
   // Creates an empty UnitTest.
   UnitTest();
@@ -1427,11 +1451,12 @@ class GTEST_API_ UnitTest {
 
   // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
   // Google Test trace stack.
-  void PushGTestTrace(const internal::TraceInfo &trace)
+  void PushGTestTrace(const internal::TraceInfo& trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Pops a trace from the per-thread Google Test trace stack.
-  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Protects mutable state in *impl_.  This is mutable as some const
   // methods need to lock it too.
@@ -1441,7 +1466,7 @@ class GTEST_API_ UnitTest {
   // the object is constructed.  We don't mark it as const here, as
   // doing so will cause a warning in the constructor of UnitTest.
   // Mutable state in *impl_ is protected by mutex_.
-  internal::UnitTestImpl *impl_;
+  internal::UnitTestImpl* impl_;
 
   // We disallow copying UnitTest.
   GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
@@ -1465,7 +1490,7 @@ class GTEST_API_ UnitTest {
 // translation units and the environments have dependencies among them
 // (remember that the compiler doesn't guarantee the order in which
 // global variables from different translation units are initialized).
-inline Environment *AddGlobalTestEnvironment(Environment *env) {
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
   return UnitTest::GetInstance()->AddEnvironment(env);
 }
 
@@ -1478,11 +1503,11 @@ inline Environment *AddGlobalTestEnvironment(Environment *env) {
 // updated.
 //
 // Calling the function for the second time has no user-visible effect.
-GTEST_API_ void InitGoogleTest(int *argc, char **argv);
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
 
 // This overloaded version can be used in Windows programs compiled in
 // UNICODE mode.
-GTEST_API_ void InitGoogleTest(int *argc, wchar_t **argv);
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
 
 // This overloaded version can be used on Arduino/embedded platforms where
 // there is no argc/argv.
@@ -1494,12 +1519,14 @@ namespace internal {
 // frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
 // when calling EXPECT_* in a tight loop.
 template <typename T1, typename T2>
-AssertionResult CmpHelperEQFailure(const char *lhs_expression,
-                                   const char *rhs_expression, const T1 &lhs,
-                                   const T2 &rhs) {
-  return EqFailure(lhs_expression, rhs_expression,
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+                                   const char* rhs_expression,
+                                   const T1& lhs, const T2& rhs) {
+  return EqFailure(lhs_expression,
+                   rhs_expression,
                    FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs), false);
+                   FormatForComparisonFailureMessage(rhs, lhs),
+                   false);
 }
 
 // This block of code defines operator==/!=
@@ -1511,9 +1538,10 @@ inline bool operator!=(faketype, faketype) { return false; }
 
 // The helper function for {ASSERT|EXPECT}_EQ.
 template <typename T1, typename T2>
-AssertionResult CmpHelperEQ(const char *lhs_expression,
-                            const char *rhs_expression, const T1 &lhs,
-                            const T2 &rhs) {
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression,
+                            const T1& lhs,
+                            const T2& rhs) {
   if (lhs == rhs) {
     return AssertionSuccess();
   }
@@ -1521,13 +1549,6 @@ AssertionResult CmpHelperEQ(const char *lhs_expression,
   return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
 
-// With this overloaded version, we allow anonymous enums to be used
-// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
-// can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char *lhs_expression,
-                                       const char *rhs_expression,
-                                       BiggestInt lhs, BiggestInt rhs);
-
 class EqHelper {
  public:
   // This templatized version is for the general case.
@@ -1536,10 +1557,10 @@ class EqHelper {
       // Disable this overload for cases where one argument is a pointer
       // and the other is the null pointer constant.
       typename std::enable_if<!std::is_integral<T1>::value ||
-                              !std::is_pointer<T2>::value>::type * = nullptr>
-  static AssertionResult Compare(const char *lhs_expression,
-                                 const char *rhs_expression, const T1 &lhs,
-                                 const T2 &rhs) {
+                              !std::is_pointer<T2>::value>::type* = nullptr>
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression, const T1& lhs,
+                                 const T2& rhs) {
     return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
 
@@ -1549,20 +1570,21 @@ class EqHelper {
   //
   // Even though its body looks the same as the above version, we
   // cannot merge the two, as it will make anonymous enums unhappy.
-  static AssertionResult Compare(const char *lhs_expression,
-                                 const char *rhs_expression, BiggestInt lhs,
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression,
+                                 BiggestInt lhs,
                                  BiggestInt rhs) {
     return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
 
   template <typename T>
   static AssertionResult Compare(
-      const char *lhs_expression, const char *rhs_expression,
+      const char* lhs_expression, const char* rhs_expression,
       // Handle cases where '0' is used as a null pointer literal.
-      std::nullptr_t /* lhs */, T *rhs) {
+      std::nullptr_t /* lhs */, T* rhs) {
     // We already know that 'lhs' is a null pointer.
-    return CmpHelperEQ(lhs_expression, rhs_expression,
-                       static_cast<T *>(nullptr), rhs);
+    return CmpHelperEQ(lhs_expression, rhs_expression, static_cast<T*>(nullptr),
+                       rhs);
   }
 };
 
@@ -1570,9 +1592,9 @@ class EqHelper {
 // frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
 // when calling EXPECT_OP in a tight loop.
 template <typename T1, typename T2>
-AssertionResult CmpHelperOpFailure(const char *expr1, const char *expr2,
-                                   const T1 &val1, const T2 &val2,
-                                   const char *op) {
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
   return AssertionFailure()
          << "Expected: (" << expr1 << ") " << op << " (" << expr2
          << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
@@ -1583,82 +1605,82 @@ AssertionResult CmpHelperOpFailure(const char *expr1, const char *expr2,
 // ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
 // of similar code.
 //
-// For each templatized helper function, we also define an overloaded
-// version for BiggestInt in order to reduce code bloat and allow
-// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
-// with gcc 4.
-//
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                \
-  template <typename T1, typename T2>                                      \
-  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2, \
-                                     const T1 &val1, const T2 &val2) {     \
-    if (val1 op val2) {                                                    \
-      return AssertionSuccess();                                           \
-    } else {                                                               \
-      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
-    }                                                                      \
-  }                                                                        \
-  GTEST_API_ AssertionResult CmpHelper##op_name(                           \
-      const char *expr1, const char *expr2, BiggestInt val1, BiggestInt val2)
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
+  }\
+}
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
 // Implements the helper function for {ASSERT|EXPECT}_NE
-GTEST_IMPL_CMP_HELPER_(NE, !=);
+GTEST_IMPL_CMP_HELPER_(NE, !=)
 // Implements the helper function for {ASSERT|EXPECT}_LE
-GTEST_IMPL_CMP_HELPER_(LE, <=);
+GTEST_IMPL_CMP_HELPER_(LE, <=)
 // Implements the helper function for {ASSERT|EXPECT}_LT
-GTEST_IMPL_CMP_HELPER_(LT, <);
+GTEST_IMPL_CMP_HELPER_(LT, <)
 // Implements the helper function for {ASSERT|EXPECT}_GE
-GTEST_IMPL_CMP_HELPER_(GE, >=);
+GTEST_IMPL_CMP_HELPER_(GE, >=)
 // Implements the helper function for {ASSERT|EXPECT}_GT
-GTEST_IMPL_CMP_HELPER_(GT, >);
+GTEST_IMPL_CMP_HELPER_(GT, >)
 
 #undef GTEST_IMPL_CMP_HELPER_
 
 // The helper function for {ASSERT|EXPECT}_STREQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
-                                          const char *s2_expression,
-                                          const char *s1, const char *s2);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char *s1_expression,
-                                              const char *s2_expression,
-                                              const char *s1, const char *s2);
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
-                                          const char *s2_expression,
-                                          const char *s1, const char *s2);
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
-                                              const char *s2_expression,
-                                              const char *s1, const char *s2);
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
 
 // Helper function for *_STREQ on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
-                                          const char *s2_expression,
-                                          const wchar_t *s1, const wchar_t *s2);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
 
 // Helper function for *_STRNE on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
-                                          const char *s2_expression,
-                                          const wchar_t *s1, const wchar_t *s2);
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
 
 }  // namespace internal
 
@@ -1670,40 +1692,32 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
 //
 // The {needle,haystack}_expr arguments are the stringified
 // expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
-                                       const char *haystack_expr,
-                                       const char *needle,
-                                       const char *haystack);
-GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
-                                       const char *haystack_expr,
-                                       const wchar_t *needle,
-                                       const wchar_t *haystack);
-GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
-                                          const char *haystack_expr,
-                                          const char *needle,
-                                          const char *haystack);
-GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
-                                          const char *haystack_expr,
-                                          const wchar_t *needle,
-                                          const wchar_t *haystack);
-GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
-                                       const char *haystack_expr,
-                                       const ::std::string &needle,
-                                       const ::std::string &haystack);
-GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
-                                          const char *haystack_expr,
-                                          const ::std::string &needle,
-                                          const ::std::string &haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
 
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
-                                       const char *haystack_expr,
-                                       const ::std::wstring &needle,
-                                       const ::std::wstring &haystack);
-GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
-                                          const char *haystack_expr,
-                                          const ::std::wstring &needle,
-                                          const ::std::wstring &haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
 #endif  // GTEST_HAS_STD_WSTRING
 
 namespace internal {
@@ -1716,9 +1730,10 @@ namespace internal {
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename RawType>
-AssertionResult CmpHelperFloatingPointEQ(const char *lhs_expression,
-                                         const char *rhs_expression,
-                                         RawType lhs_value, RawType rhs_value) {
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value,
+                                         RawType rhs_value) {
   const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
 
   if (lhs.AlmostEquals(rhs)) {
@@ -1733,18 +1748,21 @@ AssertionResult CmpHelperFloatingPointEQ(const char *lhs_expression,
   rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << rhs_value;
 
-  return EqFailure(lhs_expression, rhs_expression,
-                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   StringStreamToString(&lhs_ss),
+                   StringStreamToString(&rhs_ss),
                    false);
 }
 
 // Helper function for implementing ASSERT_NEAR.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult DoubleNearPredFormat(const char *expr1,
-                                                const char *expr2,
-                                                const char *abs_error_expr,
-                                                double val1, double val2,
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
                                                 double abs_error);
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1752,13 +1770,15 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char *expr1,
 class GTEST_API_ AssertHelper {
  public:
   // Constructor.
-  AssertHelper(TestPartResult::Type type, const char *file, int line,
-               const char *message);
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
   ~AssertHelper();
 
   // Message assignment is a semantic trick to enable assertion
   // streaming; see the GTEST_MESSAGE_ macro below.
-  void operator=(const Message &message) const;
+  void operator=(const Message& message) const;
 
  private:
   // We put our data in a struct so that the size of the AssertHelper class can
@@ -1766,12 +1786,14 @@ class GTEST_API_ AssertHelper {
   // re-using stack space even for temporary variables, so every EXPECT_EQ
   // reserves stack space for another AssertHelper.
   struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t, const char *srcfile, int line_num,
-                     const char *msg)
-        : type(t), file(srcfile), line(line_num), message(msg) {}
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
 
     TestPartResult::Type const type;
-    const char *const file;
+    const char* const file;
     int const line;
     std::string const message;
 
@@ -1779,17 +1801,11 @@ class GTEST_API_ AssertHelper {
     GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
   };
 
-  AssertHelperData *const data_;
+  AssertHelperData* const data_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
 };
 
-enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
-
-GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
-                                                            const char *fmt,
-                                                            ...);
-
 }  // namespace internal
 
 // The pure interface class that all value-parameterized tests inherit from.
@@ -1834,7 +1850,7 @@ class WithParamInterface {
 
   // The current parameter value. Is also available in the test fixture's
   // constructor.
-  static const ParamType &GetParam() {
+  static const ParamType& GetParam() {
     GTEST_CHECK_(parameter_ != nullptr)
         << "GetParam() can only be called inside a value-parameterized test "
         << "-- did you intend to write TEST_P instead of TEST_F?";
@@ -1844,24 +1860,26 @@ class WithParamInterface {
  private:
   // Sets parameter value. The caller is responsible for making sure the value
   // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType *parameter) { parameter_ = parameter; }
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
 
   // Static value used for accessing parameter during a test lifetime.
-  static const ParamType *parameter_;
+  static const ParamType* parameter_;
 
   // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass>
-  friend class internal::ParameterizedTestFactory;
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
 };
 
 template <typename T>
-const T *WithParamInterface<T>::parameter_ = nullptr;
+const T* WithParamInterface<T>::parameter_ = nullptr;
 
 // Most value-parameterized classes can ignore the existence of
 // WithParamInterface, and can just inherit from ::testing::TestWithParam.
 
 template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {};
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
 
 // Macros for indicating success/failure in test code.
 
@@ -1892,7 +1910,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 
 // Generates a nonfatal failure at the given source file location with
 // a generic message.
-#define ADD_FAILURE_AT(file, line)        \
+#define ADD_FAILURE_AT(file, line) \
   GTEST_MESSAGE_AT_(file, line, "Failed", \
                     ::testing::TestPartResult::kNonFatalFailure)
 
@@ -1907,7 +1925,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 // Define this macro to 1 to omit the definition of FAIL(), which is a
 // generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_FAIL
-#define FAIL() GTEST_FAIL()
+# define FAIL() GTEST_FAIL()
 #endif
 
 // Generates a success with a generic message.
@@ -1916,7 +1934,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 // Define this macro to 1 to omit the definition of SUCCEED(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_SUCCEED
-#define SUCCEED() GTEST_SUCCEED()
+# define SUCCEED() GTEST_SUCCEED()
 #endif
 
 // Macros for testing exceptions.
@@ -1944,18 +1962,38 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 // Boolean assertions. Condition can be either a Boolean expression or an
 // AssertionResult. For more information on how to use AssertionResult with
 // these macros see comments on that class.
-#define EXPECT_TRUE(condition)                            \
+#define GTEST_EXPECT_TRUE(condition) \
   GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition)                              \
+#define GTEST_EXPECT_FALSE(condition) \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_NONFATAL_FAILURE_)
-#define ASSERT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition)                              \
+#define GTEST_ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition) \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_FATAL_FAILURE_)
 
+// Define these macros to 1 to omit the definition of the corresponding
+// EXPECT or ASSERT, which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_EXPECT_TRUE
+#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_EXPECT_FALSE
+#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_TRUE
+#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_FALSE
+#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition)
+#endif
+
 // Macros for testing equalities and inequalities.
 //
 //    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
@@ -2032,27 +2070,27 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 // ASSERT_XY(), which clashes with some users' own code.
 
 #if !GTEST_DONT_DEFINE_ASSERT_EQ
-#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_NE
-#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LE
-#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LT
-#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GE
-#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GT
-#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
 #endif
 
 // C-string Comparisons.  All tests treat NULL and any non-NULL string
@@ -2077,7 +2115,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define EXPECT_STRCASEEQ(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2) \
+#define EXPECT_STRCASENE(s1, s2)\
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 #define ASSERT_STREQ(s1, s2) \
@@ -2086,7 +2124,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define ASSERT_STRCASEEQ(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2) \
+#define ASSERT_STRCASENE(s1, s2)\
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 // Macros for comparing floating-point numbers.
@@ -2103,29 +2141,29 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 // FloatingPoint template class in gtest-internal.h if you are
 // interested in the implementation details.
 
-#define EXPECT_FLOAT_EQ(val1, val2)                                         \
+#define EXPECT_FLOAT_EQ(val1, val2)\
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
+#define EXPECT_DOUBLE_EQ(val1, val2)\
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define ASSERT_FLOAT_EQ(val1, val2)                                         \
+#define ASSERT_FLOAT_EQ(val1, val2)\
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
+#define ASSERT_DOUBLE_EQ(val1, val2)\
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define EXPECT_NEAR(val1, val2, abs_error)                                   \
-  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
-                      abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
 
-#define ASSERT_NEAR(val1, val2, abs_error)                                   \
-  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
-                      abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
 
 // These predicate format functions work on floating-point values, and
 // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2134,11 +2172,12 @@ class TestWithParam : public Test, public WithParamInterface<T> {};
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-GTEST_API_ AssertionResult FloatLE(const char *expr1, const char *expr2,
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
                                    float val1, float val2);
-GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
                                     double val1, double val2);
 
+
 #if GTEST_OS_WINDOWS
 
 // Macros that test for HRESULT failure and success, these are only useful
@@ -2150,17 +2189,17 @@ GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
 // expected result and the actual result with both a human-readable
 // string representation of the error, if available, as well as the
 // hex result code.
-#define EXPECT_HRESULT_SUCCEEDED(expr) \
-  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-#define ASSERT_HRESULT_SUCCEEDED(expr) \
-  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-#define EXPECT_HRESULT_FAILED(expr) \
-  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
-#define ASSERT_HRESULT_FAILED(expr) \
-  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
 #endif  // GTEST_OS_WINDOWS
 
@@ -2175,9 +2214,9 @@ GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
 //   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
 //
 #define ASSERT_NO_FATAL_FAILURE(statement) \
-  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
 #define EXPECT_NO_FATAL_FAILURE(statement) \
-  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
 
 // Causes a trace (including the given source file path and line number,
 // and the given message) to be included in every test failure message generated
@@ -2197,16 +2236,16 @@ class GTEST_API_ ScopedTrace {
   // Template version. Uses Message() to convert the values into strings.
   // Slow, but flexible.
   template <typename T>
-  ScopedTrace(const char *file, int line, const T &message) {
+  ScopedTrace(const char* file, int line, const T& message) {
     PushTrace(file, line, (Message() << message).GetString());
   }
 
   // Optimize for some known types.
-  ScopedTrace(const char *file, int line, const char *message) {
+  ScopedTrace(const char* file, int line, const char* message) {
     PushTrace(file, line, message ? message : "(null)");
   }
 
-  ScopedTrace(const char *file, int line, const std::string &message) {
+  ScopedTrace(const char* file, int line, const std::string& message) {
     PushTrace(file, line, message);
   }
 
@@ -2217,7 +2256,7 @@ class GTEST_API_ ScopedTrace {
   ~ScopedTrace();
 
  private:
-  void PushTrace(const char *file, int line, std::string message);
+  void PushTrace(const char* file, int line, std::string message);
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
 } GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
@@ -2239,9 +2278,9 @@ class GTEST_API_ ScopedTrace {
 // Assuming that each thread maintains its own stack of traces.
 // Therefore, a SCOPED_TRACE() would (correctly) only affect the
 // assertions in its own thread.
-#define SCOPED_TRACE(message)                                         \
-  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
-      __FILE__, __LINE__, (message))
+#define SCOPED_TRACE(message) \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, (message))
 
 // Compile-time assertion for type equality.
 // StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
@@ -2342,7 +2381,7 @@ constexpr bool StaticAssertTypeEq() noexcept {
 //
 // GOOGLETEST_CM0011 DO NOT DELETE
 #if !GTEST_DONT_DEFINE_TEST
-#define TEST_F(test_fixture, test_name)              \
+#define TEST_F(test_fixture, test_name)\
   GTEST_TEST_(test_fixture, test_name, test_fixture, \
               ::testing::internal::GetTypeId<test_fixture>())
 #endif  // !GTEST_DONT_DEFINE_TEST
@@ -2352,7 +2391,7 @@ constexpr bool StaticAssertTypeEq() noexcept {
 GTEST_API_ std::string TempDir();
 
 #ifdef _MSC_VER
-#pragma warning(pop)
+#  pragma warning(pop)
 #endif
 
 // Dynamically registers a test with the framework.
@@ -2412,16 +2451,16 @@ GTEST_API_ std::string TempDir();
 //   return RUN_ALL_TESTS();
 // }
 //
-template <int &... ExplicitParameterBarrier, typename Factory>
-TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
-                       const char *type_param, const char *value_param,
-                       const char *file, int line, Factory factory) {
+template <int&... ExplicitParameterBarrier, typename Factory>
+TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
+                       const char* type_param, const char* value_param,
+                       const char* file, int line, Factory factory) {
   using TestT = typename std::remove_pointer<decltype(factory())>::type;
 
   class FactoryImpl : public internal::TestFactoryBase {
    public:
     explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
-    Test *CreateTest() override { return factory_(); }
+    Test* CreateTest() override { return factory_(); }
 
    private:
     Factory factory_;
@@ -2432,7 +2471,7 @@ TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
       internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
       internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
       internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
-      new FactoryImpl{ std::move(factory) });
+      new FactoryImpl{std::move(factory)});
 }
 
 }  // namespace testing
@@ -2447,8 +2486,10 @@ TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
 // namespace and has an all-caps name.
 int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
 
-inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
index 1fc21910bd..5029a9bb02 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
@@ -33,8 +33,8 @@
 // Implements a family of generic predicate assertion macros.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 
 #include "gtest/gtest.h"
 
@@ -72,18 +72,22 @@ namespace testing {
 // GTEST_ASSERT_ is the basic statement to which all of the assertions
 // in this file reduce.  Don't use this in your code.
 
-#define GTEST_ASSERT_(expression, on_failure)                   \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
   if (const ::testing::AssertionResult gtest_ar = (expression)) \
-    ;                                                           \
-  else                                                          \
+    ; \
+  else \
     on_failure(gtest_ar.failure_message())
 
+
 // Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-template <typename Pred, typename T1>
-AssertionResult AssertPred1Helper(const char *pred_text, const char *e1,
-                                  Pred pred, const T1 &v1) {
+template <typename Pred,
+          typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
   if (pred(v1)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -94,28 +98,41 @@ AssertionResult AssertPred1Helper(const char *pred_text, const char *e1,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
-  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure) \
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
 
 // Unary predicate assertion macros.
 #define EXPECT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
 #define ASSERT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
 
 // Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-template <typename Pred, typename T1, typename T2>
-AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
-                                  const char *e2, Pred pred, const T1 &v1,
-                                  const T2 &v2) {
+template <typename Pred,
+          typename T1,
+          typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
   if (pred(v1, v2)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -128,14 +145,19 @@ AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
-                on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
 
 // Binary predicate assertion macros.
 #define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -147,12 +169,22 @@ AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
 #define ASSERT_PRED2(pred, v1, v2) \
   GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
 
+
+
 // Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-template <typename Pred, typename T1, typename T2, typename T3>
-AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
-                                  const char *e2, const char *e3, Pred pred,
-                                  const T1 &v1, const T2 &v2, const T3 &v3) {
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
   if (pred(v1, v2, v3)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -166,15 +198,21 @@ AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
-  GTEST_ASSERT_(                                                            \
-      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
-      on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
 
 // Ternary predicate assertion macros.
 #define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -186,13 +224,25 @@ AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
 #define ASSERT_PRED3(pred, v1, v2, v3) \
   GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
 
+
+
 // Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-template <typename Pred, typename T1, typename T2, typename T3, typename T4>
-AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
-                                  const char *e2, const char *e3,
-                                  const char *e4, Pred pred, const T1 &v1,
-                                  const T2 &v2, const T3 &v3, const T4 &v4) {
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
   if (pred(v1, v2, v3, v4)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -207,15 +257,23 @@ AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
-                                             v1, v2, v3, v4),                 \
-                on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
 
 // 4-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -227,15 +285,28 @@ AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
 #define ASSERT_PRED4(pred, v1, v2, v3, v4) \
   GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
 
+
+
 // Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
           typename T5>
-AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
-                                  const char *e2, const char *e3,
-                                  const char *e4, const char *e5, Pred pred,
-                                  const T1 &v1, const T2 &v2, const T3 &v3,
-                                  const T4 &v4, const T5 &v5) {
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
   if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -251,16 +322,25 @@ AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
   GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
                 on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
-  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
-                                             pred, v1, v2, v3, v4, v5),      \
-                on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
 
 // 5-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -272,6 +352,8 @@ AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
 #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
   GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
 
+
+
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
index 3dc5b23868..38b9d85a51 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -28,11 +28,11 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 //
-// Google C++ Testing and Mocking Framework definitions useful in production
-// code. GOOGLETEST_CM0003 DO NOT DELETE
+// Google C++ Testing and Mocking Framework definitions useful in production code.
+// GOOGLETEST_CM0003 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
 
 // When you need to test the private or protected members of a class,
 // use the FRIEND_TEST macro to declare your tests as friends of the
@@ -55,7 +55,7 @@
 // Note: The test class must be in the same namespace as the class being tested.
 // For example, putting MyClassTest in an anonymous namespace will not work.
 
-#define FRIEND_TEST(test_case_name, test_name) \
-  friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
index cd85d956d2..db02881c0c 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
@@ -31,7 +31,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
index eb4467abca..b9495d8378 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
@@ -36,7 +36,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
index 4c8e07be23..afaaf17ba2 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
@@ -31,7 +31,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
index 3e9497d450..490296dfad 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -33,8 +33,8 @@
 // death tests.  They are subject to change without notice.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 
 #include "gtest/gtest-matchers.h"
 #include "gtest/internal/gtest-internal.h"
@@ -80,20 +80,18 @@ class GTEST_API_ DeathTest {
   // argument is set.  If the death test should be skipped, the pointer
   // is set to NULL; otherwise, it is set to the address of a new concrete
   // DeathTest object that controls the execution of the current test.
-  static bool Create(const char *statement,
-                     Matcher<const std::string &> matcher, const char *file,
-                     int line, DeathTest **test);
+  static bool Create(const char* statement, Matcher<const std::string&> matcher,
+                     const char* file, int line, DeathTest** test);
   DeathTest();
-  virtual ~DeathTest() {}
+  virtual ~DeathTest() { }
 
   // A helper class that aborts a death test when it's deleted.
   class ReturnSentinel {
    public:
-    explicit ReturnSentinel(DeathTest *test) : test_(test) {}
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
     ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
-
    private:
-    DeathTest *const test_;
+    DeathTest* const test_;
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
   } GTEST_ATTRIBUTE_UNUSED_;
 
@@ -131,9 +129,9 @@ class GTEST_API_ DeathTest {
 
   // Returns a human-readable outcome message regarding the outcome of
   // the last death test.
-  static const char *LastMessage();
+  static const char* LastMessage();
 
-  static void set_last_death_test_message(const std::string &message);
+  static void set_last_death_test_message(const std::string& message);
 
  private:
   // A string containing a description of the outcome of the last death test.
@@ -147,17 +145,17 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // Factory interface for death tests.  May be mocked out for testing.
 class DeathTestFactory {
  public:
-  virtual ~DeathTestFactory() {}
-  virtual bool Create(const char *statement,
-                      Matcher<const std::string &> matcher, const char *file,
-                      int line, DeathTest **test) = 0;
+  virtual ~DeathTestFactory() { }
+  virtual bool Create(const char* statement,
+                      Matcher<const std::string&> matcher, const char* file,
+                      int line, DeathTest** test) = 0;
 };
 
 // A concrete DeathTestFactory implementation for normal use.
 class DefaultDeathTestFactory : public DeathTestFactory {
  public:
-  bool Create(const char *statement, Matcher<const std::string &> matcher,
-              const char *file, int line, DeathTest **test) override;
+  bool Create(const char* statement, Matcher<const std::string&> matcher,
+              const char* file, int line, DeathTest** test) override;
 };
 
 // Returns true if exit_status describes a process that was terminated
@@ -167,56 +165,56 @@ GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
 // A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
 // and interpreted as a regex (rather than an Eq matcher) for legacy
 // compatibility.
-inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
     ::testing::internal::RE regex) {
   return ContainsRegex(regex.pattern());
 }
-inline Matcher<const ::std::string &> MakeDeathTestMatcher(const char *regex) {
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(const char* regex) {
   return ContainsRegex(regex);
 }
-inline Matcher<const ::std::string &> MakeDeathTestMatcher(
-    const ::std::string &regex) {
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+    const ::std::string& regex) {
   return ContainsRegex(regex);
 }
 
 // If a Matcher<const ::std::string&> is passed to EXPECT_DEATH (etc.), it's
 // used directly.
-inline Matcher<const ::std::string &> MakeDeathTestMatcher(
-    Matcher<const ::std::string &> matcher) {
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+    Matcher<const ::std::string&> matcher) {
   return matcher;
 }
 
 // Traps C++ exceptions escaping statement and reports them as test
 // failures. Note that trapping SEH exceptions is not implemented here.
-#if GTEST_HAS_EXCEPTIONS
-#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
-  try {                                                                      \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
-  } catch (const ::std::exception &gtest_exception) {                        \
-    fprintf(                                                                 \
-        stderr,                                                              \
-        "\n%s: Caught std::exception-derived exception escaping the "        \
-        "death test statement. Exception message: %s\n",                     \
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
         ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
-        gtest_exception.what());                                             \
-    fflush(stderr);                                                          \
+        gtest_exception.what()); \
+    fflush(stderr); \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  } catch (...) {                                                            \
+  } catch (...) { \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
   }
 
-#else
-#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+# else
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 
-#endif
+# endif
 
 // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
 // ASSERT_EXIT*, and EXPECT_EXIT*.
 #define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
   GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
   if (::testing::internal::AlwaysTrue()) {                                     \
-    ::testing::internal::DeathTest *gtest_dt;                                  \
+    ::testing::internal::DeathTest* gtest_dt;                                  \
     if (!::testing::internal::DeathTest::Create(                               \
             #statement,                                                        \
             ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
@@ -238,7 +236,8 @@ inline Matcher<const ::std::string &> MakeDeathTestMatcher(
           gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
           break;                                                               \
         }                                                                      \
-        default: break;                                                        \
+        default:                                                               \
+          break;                                                               \
       }                                                                        \
     }                                                                          \
   } else                                                                       \
@@ -266,15 +265,19 @@ inline Matcher<const ::std::string &> MakeDeathTestMatcher(
 // RUN_ALL_TESTS was called.
 class InternalRunDeathTestFlag {
  public:
-  InternalRunDeathTestFlag(const std::string &a_file, int a_line, int an_index,
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
                            int a_write_fd)
-      : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
 
   ~InternalRunDeathTestFlag() {
-    if (write_fd_ >= 0) posix::Close(write_fd_);
+    if (write_fd_ >= 0)
+      posix::Close(write_fd_);
   }
 
-  const std::string &file() const { return file_; }
+  const std::string& file() const { return file_; }
   int line() const { return line_; }
   int index() const { return index_; }
   int write_fd() const { return write_fd_; }
@@ -291,11 +294,11 @@ class InternalRunDeathTestFlag {
 // Returns a newly created InternalRunDeathTestFlag object with fields
 // initialized from the GTEST_FLAG(internal_run_death_test) flag if
 // the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag();
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
 
 #endif  // GTEST_HAS_DEATH_TEST
 
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
index b228d47342..0c033abc34 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
@@ -37,8 +37,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 
 #include "gtest/internal/gtest-string.h"
 
@@ -61,22 +61,24 @@ namespace internal {
 
 class GTEST_API_ FilePath {
  public:
-  FilePath() : pathname_("") {}
-  FilePath(const FilePath &rhs) : pathname_(rhs.pathname_) {}
+  FilePath() : pathname_("") { }
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
 
-  explicit FilePath(const std::string &pathname) : pathname_(pathname) {
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
     Normalize();
   }
 
-  FilePath &operator=(const FilePath &rhs) {
+  FilePath& operator=(const FilePath& rhs) {
     Set(rhs);
     return *this;
   }
 
-  void Set(const FilePath &rhs) { pathname_ = rhs.pathname_; }
+  void Set(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+  }
 
-  const std::string &string() const { return pathname_; }
-  const char *c_str() const { return pathname_.c_str(); }
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
 
   // Returns the current working directory, or "" if unsuccessful.
   static FilePath GetCurrentDir();
@@ -85,15 +87,16 @@ class GTEST_API_ FilePath {
   // extension = "xml", returns "dir/test.xml". If number is greater
   // than zero (e.g., 12), returns "dir/test_12.xml".
   // On Windows platform, uses \ as the separator rather than /.
-  static FilePath MakeFileName(const FilePath &directory,
-                               const FilePath &base_name, int number,
-                               const char *extension);
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
 
   // Given directory = "dir", relative_path = "test.xml",
   // returns "dir/test.xml".
   // On Windows, uses \ as the separator rather than /.
-  static FilePath ConcatPaths(const FilePath &directory,
-                              const FilePath &relative_path);
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
 
   // Returns a pathname for a file that does not currently exist. The pathname
   // will be directory/base_name.extension or
@@ -103,9 +106,9 @@ class GTEST_API_ FilePath {
   // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
   // There could be a race condition if two or more processes are calling this
   // function at the same time -- they could both pick the same filename.
-  static FilePath GenerateUniqueFileName(const FilePath &directory,
-                                         const FilePath &base_name,
-                                         const char *extension);
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
 
   // Returns true if and only if the path is "".
   bool IsEmpty() const { return pathname_.empty(); }
@@ -135,7 +138,7 @@ class GTEST_API_ FilePath {
   // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
   // FilePath("dir/file"). If a case-insensitive extension is not
   // found, returns a copy of the original FilePath.
-  FilePath RemoveExtension(const char *extension) const;
+  FilePath RemoveExtension(const char* extension) const;
 
   // Creates directories so that path exists. Returns true if successful or if
   // the directories already exist; returns false if unable to create
@@ -192,10 +195,10 @@ class GTEST_API_ FilePath {
 
   void Normalize();
 
-  // Returns a pointer to the last occurence of a valid path separator in
+  // Returns a pointer to the last occurrence of a valid path separator in
   // the FilePath. On Windows, for example, both '/' and '\' are valid path
   // separators. Returns NULL if no path separator was found.
-  const char *FindLastPathSeparator() const;
+  const char* FindLastPathSeparator() const;
 
   std::string pathname_;
 };  // class FilePath
@@ -205,4 +208,4 @@ class GTEST_API_ FilePath {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
index 9640aba836..f8cbdbd81d 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -34,20 +34,20 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_OS_LINUX
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-#include <stdexcept>
+# include <stdexcept>
 #endif
 
 #include <ctype.h>
@@ -76,7 +76,7 @@
 // the current line number.  For more details, see
 // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
 
 // Stringifies its argument.
 // Work around a bug in visual studio which doesn't accept code like this:
@@ -91,28 +91,28 @@
 #define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
 
 namespace proto2 {
-class Message;
+class MessageLite;
 }
 
 namespace testing {
 
 // Forward declarations.
 
-class AssertionResult;  // Result of an assertion.
-class Message;          // Represents a failure message.
-class Test;             // Represents a test.
-class TestInfo;         // Information about a test.
-class TestPartResult;   // Result of a test part.
-class UnitTest;         // A collection of test suites.
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test suites.
 
 template <typename T>
-::std::string PrintToString(const T &value);
+::std::string PrintToString(const T& value);
 
 namespace internal {
 
-struct TraceInfo;    // Information about a trace point.
-class TestInfoImpl;  // Opaque implementation of TestInfo
-class UnitTestImpl;  // Opaque implementation of UnitTest
+struct TraceInfo;                      // Information about a trace point.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
 
 // The text used in failure messages to indicate the start of the
 // stack trace.
@@ -121,7 +121,6 @@ GTEST_API_ extern const char kStackTraceMarker[];
 // An IgnoredValue object can be implicitly constructed from ANY value.
 class IgnoredValue {
   struct Sink {};
-
  public:
   // This constructor template allows any value to be implicitly
   // converted to IgnoredValue.  The object has no data member and
@@ -133,17 +132,17 @@ class IgnoredValue {
   template <typename T,
             typename std::enable_if<!std::is_convertible<T, Sink>::value,
                                     int>::type = 0>
-  IgnoredValue(const T & /* ignored */) {}  // NOLINT(runtime/explicit)
+  IgnoredValue(const T& /* ignored */) {}  // NOLINT(runtime/explicit)
 };
 
 // Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(const std::string &gtest_msg,
-                                         const Message &user_msg);
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
 
 #if GTEST_HAS_EXCEPTIONS
 
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(
-    4275 /* an exported class was derived from a class that was not exported */)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
+/* an exported class was derived from a class that was not exported */)
 
 // This exception is thrown by (and only by) a failed Google Test
 // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
@@ -153,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_(
 // frameworks know how to extract and print the message inside it.
 class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
  public:
-  explicit GoogleTestFailureException(const TestPartResult &failure);
+  explicit GoogleTestFailureException(const TestPartResult& failure);
 };
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
@@ -168,16 +167,16 @@ namespace edit_distance {
 // See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
 enum EditType { kMatch, kAdd, kRemove, kReplace };
 GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<size_t> &left, const std::vector<size_t> &right);
+    const std::vector<size_t>& left, const std::vector<size_t>& right);
 
 // Same as above, but the input is represented as strings.
 GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string> &left,
-    const std::vector<std::string> &right);
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right);
 
 // Create a diff of the input strings in Unified diff format.
-GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string> &left,
-                                         const std::vector<std::string> &right,
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                                         const std::vector<std::string>& right,
                                          size_t context = 2);
 
 }  // namespace edit_distance
@@ -186,9 +185,9 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string> &left,
 // format.
 // If not null, stores in 'total_line_count' the total number of lines found
 // in left + right.
-GTEST_API_ std::string DiffStrings(const std::string &left,
-                                   const std::string &right,
-                                   size_t *total_line_count);
+GTEST_API_ std::string DiffStrings(const std::string& left,
+                                   const std::string& right,
+                                   size_t* total_line_count);
 
 // Constructs and returns the message for an equality assertion
 // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
@@ -205,16 +204,18 @@ GTEST_API_ std::string DiffStrings(const std::string &left,
 // The ignoring_case parameter is true if and only if the assertion is a
 // *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
 // be inserted into the message.
-GTEST_API_ AssertionResult EqFailure(const char *expected_expression,
-                                     const char *actual_expression,
-                                     const std::string &expected_value,
-                                     const std::string &actual_value,
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
                                      bool ignoring_case);
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 GTEST_API_ std::string GetBoolAssertionFailureMessage(
-    const AssertionResult &assertion_result, const char *expression_text,
-    const char *actual_predicate_value, const char *expected_predicate_value);
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
 
 // This template class represents an IEEE floating-point number
 // (either single-precision or double-precision, depending on the
@@ -255,11 +256,11 @@ class FloatingPoint {
   // Constants.
 
   // # of bits in a number.
-  static const size_t kBitCount = 8 * sizeof(RawType);
+  static const size_t kBitCount = 8*sizeof(RawType);
 
   // # of fraction bits in a number.
   static const size_t kFractionBitCount =
-      std::numeric_limits<RawType>::digits - 1;
+    std::numeric_limits<RawType>::digits - 1;
 
   // # of exponent bits in a number.
   static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -268,8 +269,8 @@ class FloatingPoint {
   static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
 
   // The mask for the fraction bits.
-  static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
-                                       (kExponentBitCount + 1);
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
 
   // The mask for the exponent bits.
   static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -286,7 +287,7 @@ class FloatingPoint {
   //
   // See the following article for more details on ULP:
   // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
-  static const size_t kMaxUlps = 4;
+  static const uint32_t kMaxUlps = 4;
 
   // Constructs a FloatingPoint from a raw floating-point number.
   //
@@ -294,7 +295,7 @@ class FloatingPoint {
   // around may change its bits, although the new value is guaranteed
   // to be also a NAN.  Therefore, don't expect this constructor to
   // preserve the bits in x when x is a NAN.
-  explicit FloatingPoint(const RawType &x) { u_.value_ = x; }
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
 
   // Static methods
 
@@ -308,7 +309,9 @@ class FloatingPoint {
   }
 
   // Returns the floating-point number that represent positive infinity.
-  static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
 
   // Returns the maximum representable finite floating-point number.
   static RawType Max();
@@ -340,13 +343,13 @@ class FloatingPoint {
   //   - returns false if either number is (or both are) NAN.
   //   - treats really large numbers as almost equal to infinity.
   //   - thinks +0.0 and -0.0 are 0 DLP's apart.
-  bool AlmostEquals(const FloatingPoint &rhs) const {
+  bool AlmostEquals(const FloatingPoint& rhs) const {
     // The IEEE standard says that any comparison operation involving
     // a NAN must return false.
     if (is_nan() || rhs.is_nan()) return false;
 
-    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
-           kMaxUlps;
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
   }
 
  private:
@@ -396,13 +399,9 @@ class FloatingPoint {
 // We cannot use std::numeric_limits<T>::max() as it clashes with the max()
 // macro defined by <windows.h>.
 template <>
-inline float FloatingPoint<float>::Max() {
-  return FLT_MAX;
-}
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
 template <>
-inline double FloatingPoint<double>::Max() {
-  return DBL_MAX;
-}
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
 
 // Typedefs the instances of the FloatingPoint template class that we
 // care to use.
@@ -415,7 +414,7 @@ typedef FloatingPoint<double> Double;
 // used to hold such IDs.  The user should treat TypeId as an opaque
 // type: the only operation allowed on TypeId values is to compare
 // them for equality using the == operator.
-typedef const void *TypeId;
+typedef const void* TypeId;
 
 template <typename T>
 class TypeIdHelper {
@@ -456,7 +455,7 @@ class TestFactoryBase {
 
   // Creates a test instance to run. The instance is both created and destroyed
   // within TestInfoImpl::Run()
-  virtual Test *CreateTest() = 0;
+  virtual Test* CreateTest() = 0;
 
  protected:
   TestFactoryBase() {}
@@ -470,7 +469,7 @@ class TestFactoryBase {
 template <class TestClass>
 class TestFactoryImpl : public TestFactoryBase {
  public:
-  Test *CreateTest() override { return new TestClass; }
+  Test* CreateTest() override { return new TestClass; }
 };
 
 #if GTEST_OS_WINDOWS
@@ -479,9 +478,9 @@ class TestFactoryImpl : public TestFactoryBase {
 // {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
 // We pass a long instead of HRESULT to avoid causing an
 // include dependency for the HRESULT type.
-GTEST_API_ AssertionResult IsHRESULTSuccess(const char *expr,
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
                                             long hr);  // NOLINT
-GTEST_API_ AssertionResult IsHRESULTFailure(const char *expr,
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
                                             long hr);  // NOLINT
 
 #endif  // GTEST_OS_WINDOWS
@@ -491,7 +490,7 @@ using SetUpTestSuiteFunc = void (*)();
 using TearDownTestSuiteFunc = void (*)();
 
 struct CodeLocation {
-  CodeLocation(const std::string &a_file, int a_line)
+  CodeLocation(const std::string& a_file, int a_line)
       : file(a_file), line(a_line) {}
 
   std::string file;
@@ -519,8 +518,9 @@ struct SuiteApiResolver : T {
   using Test =
       typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
 
-  static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char *filename,
+  static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
                                                         int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -532,10 +532,16 @@ struct SuiteApiResolver : T {
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::SetUpTestSuite;
+#endif
   }
 
-  static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char *filename,
+  static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
                                                            int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -547,6 +553,11 @@ struct SuiteApiResolver : T {
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::TearDownTestSuite;
+#endif
   }
 };
 
@@ -555,11 +566,11 @@ struct SuiteApiResolver : T {
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
-//   type_param        the name of the test's type parameter, or NULL if
+//   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
-//   value_param       text representation of the test's value parameter,
+//   value_param:      text representation of the test's value parameter,
 //                     or NULL if this is not a type-parameterized test.
 //   code_location:    code location where the test is defined
 //   fixture_class_id: ID of the test fixture class
@@ -568,18 +579,16 @@ struct SuiteApiResolver : T {
 //   factory:          pointer to the factory that creates a test object.
 //                     The newly created TestInfo instance will assume
 //                     ownership of the factory object.
-GTEST_API_ TestInfo *MakeAndRegisterTestInfo(
-    const char *test_suite_name, const char *name, const char *type_param,
-    const char *value_param, CodeLocation code_location,
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_suite_name, const char* name, const char* type_param,
+    const char* value_param, CodeLocation code_location,
     TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
-    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory);
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory);
 
 // If *pstr starts with the given prefix, modifies *pstr to be right
 // past the prefix and returns true; otherwise leaves *pstr unchanged
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
-GTEST_API_ bool SkipPrefix(const char *prefix, const char **pstr);
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
 
 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)
@@ -592,8 +601,8 @@ class GTEST_API_ TypedTestSuitePState {
   // Adds the given test name to defined_test_names_ and return true
   // if the test suite hasn't been registered; otherwise aborts the
   // program.
-  bool AddTestName(const char *file, int line, const char *case_name,
-                   const char *test_name) {
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
     if (registered_) {
       fprintf(stderr,
               "%s Test %s must be defined before "
@@ -607,11 +616,11 @@ class GTEST_API_ TypedTestSuitePState {
     return true;
   }
 
-  bool TestExists(const std::string &test_name) const {
+  bool TestExists(const std::string& test_name) const {
     return registered_tests_.count(test_name) > 0;
   }
 
-  const CodeLocation &GetCodeLocation(const std::string &test_name) const {
+  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
     RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
     GTEST_CHECK_(it != registered_tests_.end());
     return it->second;
@@ -620,9 +629,9 @@ class GTEST_API_ TypedTestSuitePState {
   // Verifies that registered_tests match the test names in
   // defined_test_names_; returns registered_tests if successful, or
   // aborts the program otherwise.
-  const char *VerifyRegisteredTestNames(const char *test_suite_name,
-                                        const char *file, int line,
-                                        const char *registered_tests);
+  const char* VerifyRegisteredTestNames(const char* test_suite_name,
+                                        const char* file, int line,
+                                        const char* registered_tests);
 
  private:
   typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
@@ -640,27 +649,26 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
 // Skips to the first non-space char after the first comma in 'str';
 // returns NULL if no comma is found in 'str'.
-inline const char *SkipComma(const char *str) {
-  const char *comma = strchr(str, ',');
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
   if (comma == nullptr) {
     return nullptr;
   }
-  while (IsSpace(*(++comma))) {
-  }
+  while (IsSpace(*(++comma))) {}
   return comma;
 }
 
 // Returns the prefix of 'str' before the first comma in it; returns
 // the entire string if it contains no comma.
-inline std::string GetPrefixUntilComma(const char *str) {
-  const char *comma = strchr(str, ',');
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
   return comma == nullptr ? str : std::string(str, comma);
 }
 
 // Splits a given string on a given delimiter, populating a given
 // vector with the fields.
-void SplitString(const ::std::string &str, char delimiter,
-                 ::std::vector<::std::string> *dest);
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest);
 
 // The default argument to the template below for the case when the user does
 // not provide a name generator.
@@ -677,11 +685,10 @@ struct NameGeneratorSelector {
 };
 
 template <typename NameGenerator>
-void GenerateNamesRecursively(internal::None, std::vector<std::string> *, int) {
-}
+void GenerateNamesRecursively(internal::None, std::vector<std::string>*, int) {}
 
 template <typename NameGenerator, typename Types>
-void GenerateNamesRecursively(Types, std::vector<std::string> *result, int i) {
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
   result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
   GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
                                           i + 1);
@@ -708,9 +715,9 @@ class TypeParameterizedTest {
   // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite,
   // Types).  Valid values for 'index' are [0, N - 1] where N is the
   // length of Types.
-  static bool Register(const char *prefix, const CodeLocation &code_location,
-                       const char *case_name, const char *test_names, int index,
-                       const std::vector<std::string> &type_names =
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names, int index,
+                       const std::vector<std::string>& type_names =
                            GenerateNames<DefaultNameGenerator, Types>()) {
     typedef typename Types::Head Type;
     typedef Fixture<Type> FixtureClass;
@@ -747,19 +754,19 @@ class TypeParameterizedTest {
 template <GTEST_TEMPLATE_ Fixture, class TestSel>
 class TypeParameterizedTest<Fixture, TestSel, internal::None> {
  public:
-  static bool Register(const char * /*prefix*/, const CodeLocation &,
-                       const char * /*case_name*/, const char * /*test_names*/,
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const char* /*case_name*/, const char* /*test_names*/,
                        int /*index*/,
-                       const std::vector<std::string> & =
+                       const std::vector<std::string>& =
                            std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
 
-GTEST_API_ void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
                                                    CodeLocation code_location);
 GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
-    const char *case_name);
+    const char* case_name);
 
 // TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
 // registers *all combinations* of 'Tests' and 'Types' with Google
@@ -768,23 +775,23 @@ GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
 template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
 class TypeParameterizedTestSuite {
  public:
-  static bool Register(const char *prefix, CodeLocation code_location,
-                       const TypedTestSuitePState *state, const char *case_name,
-                       const char *test_names,
-                       const std::vector<std::string> &type_names =
+  static bool Register(const char* prefix, CodeLocation code_location,
+                       const TypedTestSuitePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
                            GenerateNames<DefaultNameGenerator, Types>()) {
     RegisterTypeParameterizedTestSuiteInstantiation(case_name);
-    std::string test_name =
-        StripTrailingSpaces(GetPrefixUntilComma(test_names));
+    std::string test_name = StripTrailingSpaces(
+        GetPrefixUntilComma(test_names));
     if (!state->TestExists(test_name)) {
       fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
               case_name, test_name.c_str(),
-              FormatFileLocation(code_location.file.c_str(), code_location.line)
-                  .c_str());
+              FormatFileLocation(code_location.file.c_str(),
+                                 code_location.line).c_str());
       fflush(stderr);
       posix::Abort();
     }
-    const CodeLocation &test_location = state->GetCodeLocation(test_name);
+    const CodeLocation& test_location = state->GetCodeLocation(test_name);
 
     typedef typename Tests::Head Head;
 
@@ -805,17 +812,15 @@ class TypeParameterizedTestSuite {
 template <GTEST_TEMPLATE_ Fixture, typename Types>
 class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
  public:
-  static bool Register(const char * /*prefix*/, const CodeLocation &,
-                       const TypedTestSuitePState * /*state*/,
-                       const char * /*case_name*/, const char * /*test_names*/,
-                       const std::vector<std::string> & =
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const TypedTestSuitePState* /*state*/,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
                            std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 // Returns the current OS stack trace as an std::string.
 //
 // The maximum number of stack frames to be included is specified by
@@ -826,8 +831,8 @@ class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest *unit_test,
-                                                       int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
 
 // Helpers for suppressing warnings on unreachable code or constant
 // condition.
@@ -842,17 +847,17 @@ inline bool AlwaysFalse() { return !AlwaysTrue(); }
 // variable declared in a conditional expression always being NULL in
 // the else branch.
 struct GTEST_API_ ConstCharPtr {
-  ConstCharPtr(const char *str) : value(str) {}
+  ConstCharPtr(const char* str) : value(str) {}
   operator bool() const { return true; }
-  const char *value;
+  const char* value;
 };
 
 // Helper for declaring std::string within 'if' statement
 // in pre C++17 build environment.
 struct TrueWithString {
   TrueWithString() = default;
-  explicit TrueWithString(const char *str) : value(str) {}
-  explicit TrueWithString(const std::string &str) : value(str) {}
+  explicit TrueWithString(const char* str) : value(str) {}
+  explicit TrueWithString(const std::string& str) : value(str) {}
   explicit operator bool() const { return true; }
   std::string value;
 };
@@ -883,11 +888,34 @@ class GTEST_API_ Random {
 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
   typename std::remove_const<typename std::remove_reference<T>::type>::type
 
-// IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true if and only if T is type proto2::Message or a subclass of it.
+// HasDebugStringAndShortDebugString<T>::value is a compile-time bool constant
+// that's true if and only if T has methods DebugString() and ShortDebugString()
+// that return std::string.
+template <typename T>
+class HasDebugStringAndShortDebugString {
+ private:
+  template <typename C>
+  static auto CheckDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().DebugString())>::type;
+  template <typename>
+  static std::false_type CheckDebugString(...);
+
+  template <typename C>
+  static auto CheckShortDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().ShortDebugString())>::type;
+  template <typename>
+  static std::false_type CheckShortDebugString(...);
+
+  using HasDebugStringType = decltype(CheckDebugString<T>(nullptr));
+  using HasShortDebugStringType = decltype(CheckShortDebugString<T>(nullptr));
+
+ public:
+  static constexpr bool value =
+      HasDebugStringType::value && HasShortDebugStringType::value;
+};
+
 template <typename T>
-struct IsAProtocolMessage
-    : public std::is_convertible<const T *, const ::proto2::Message *> {};
+constexpr bool HasDebugStringAndShortDebugString<T>::value;
 
 // When the compiler sees expression IsContainerTest<C>(0), if C is an
 // STL-style container class, the first overload of IsContainerTest
@@ -915,9 +943,9 @@ struct IsAProtocolMessage
 // IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
 typedef int IsContainer;
 template <class C,
-          class Iterator = decltype(::std::declval<const C &>().begin()),
-          class = decltype(::std::declval<const C &>().end()),
-          class = decltype(++::std::declval<Iterator &>()),
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
           class = decltype(*::std::declval<Iterator>()),
           class = typename C::const_iterator>
 IsContainer IsContainerTest(int /* dummy */) {
@@ -926,9 +954,7 @@ IsContainer IsContainerTest(int /* dummy */) {
 
 typedef char IsNotContainer;
 template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) {
-  return '\0';
-}
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
 
 // Trait to detect whether a type T is a hash table.
 // The heuristic used is that the type contains an inner type `hasher` and does
@@ -938,9 +964,9 @@ template <typename T>
 struct IsHashTable {
  private:
   template <typename U>
-  static char test(typename U::hasher *, typename U::reverse_iterator *);
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
   template <typename U>
-  static int test(typename U::hasher *, ...);
+  static int test(typename U::hasher*, ...);
   template <typename U>
   static char test(...);
 
@@ -987,17 +1013,15 @@ struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
 // 0, ArrayEq() degenerates into comparing a single pair of values.
 
 template <typename T, typename U>
-bool ArrayEq(const T *lhs, size_t size, const U *rhs);
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline bool ArrayEq(const T &lhs, const U &rhs) {
-  return lhs == rhs;
-}
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
   return internal::ArrayEq(lhs, N, rhs);
 }
 
@@ -1005,9 +1029,10 @@ inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
 // the previous ArrayEq() function, arrays with different sizes would
 // lead to different copies of the template code.
 template <typename T, typename U>
-bool ArrayEq(const T *lhs, size_t size, const U *rhs) {
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
   for (size_t i = 0; i != size; i++) {
-    if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
+    if (!internal::ArrayEq(lhs[i], rhs[i]))
+      return false;
   }
   return true;
 }
@@ -1015,9 +1040,10 @@ bool ArrayEq(const T *lhs, size_t size, const U *rhs) {
 // Finds the first element in the iterator range [begin, end) that
 // equals elem.  Element may be a native array type itself.
 template <typename Iter, typename Element>
-Iter ArrayAwareFind(Iter begin, Iter end, const Element &elem) {
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
   for (Iter it = begin; it != end; ++it) {
-    if (internal::ArrayEq(*it, elem)) return it;
+    if (internal::ArrayEq(*it, elem))
+      return it;
   }
   return end;
 }
@@ -1027,17 +1053,15 @@ Iter ArrayAwareFind(Iter begin, Iter end, const Element &elem) {
 // CopyArray() degenerates into copying a single value.
 
 template <typename T, typename U>
-void CopyArray(const T *from, size_t size, U *to);
+void CopyArray(const T* from, size_t size, U* to);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline void CopyArray(const T &from, U *to) {
-  *to = from;
-}
+inline void CopyArray(const T& from, U* to) { *to = from; }
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline void CopyArray(const T (&from)[N], U (*to)[N]) {
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
   internal::CopyArray(from, N, *to);
 }
 
@@ -1045,7 +1069,7 @@ inline void CopyArray(const T (&from)[N], U (*to)[N]) {
 // the previous CopyArray() function, arrays with different sizes
 // would lead to different copies of the template code.
 template <typename T, typename U>
-void CopyArray(const T *from, size_t size, U *to) {
+void CopyArray(const T* from, size_t size, U* to) {
   for (size_t i = 0; i != size; i++) {
     internal::CopyArray(from[i], to + i);
   }
@@ -1071,34 +1095,36 @@ class NativeArray {
  public:
   // STL-style container typedefs.
   typedef Element value_type;
-  typedef Element *iterator;
-  typedef const Element *const_iterator;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
 
   // Constructs from a native array. References the source.
-  NativeArray(const Element *array, size_t count, RelationToSourceReference) {
+  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
     InitRef(array, count);
   }
 
   // Constructs from a native array. Copies the source.
-  NativeArray(const Element *array, size_t count, RelationToSourceCopy) {
+  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
     InitCopy(array, count);
   }
 
   // Copy constructor.
-  NativeArray(const NativeArray &rhs) {
+  NativeArray(const NativeArray& rhs) {
     (this->*rhs.clone_)(rhs.array_, rhs.size_);
   }
 
   ~NativeArray() {
-    if (clone_ != &NativeArray::InitRef) delete[] array_;
+    if (clone_ != &NativeArray::InitRef)
+      delete[] array_;
   }
 
   // STL-style container methods.
   size_t size() const { return size_; }
   const_iterator begin() const { return array_; }
   const_iterator end() const { return array_ + size_; }
-  bool operator==(const NativeArray &rhs) const {
-    return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
   }
 
  private:
@@ -1107,8 +1133,8 @@ class NativeArray {
                 "Type must not be a reference");
 
   // Initializes this object with a copy of the input.
-  void InitCopy(const Element *array, size_t a_size) {
-    Element *const copy = new Element[a_size];
+  void InitCopy(const Element* array, size_t a_size) {
+    Element* const copy = new Element[a_size];
     CopyArray(array, a_size, copy);
     array_ = copy;
     size_ = a_size;
@@ -1116,17 +1142,15 @@ class NativeArray {
   }
 
   // Initializes this object with a reference of the input.
-  void InitRef(const Element *array, size_t a_size) {
+  void InitRef(const Element* array, size_t a_size) {
     array_ = array;
     size_ = a_size;
     clone_ = &NativeArray::InitRef;
   }
 
-  const Element *array_;
+  const Element* array_;
   size_t size_;
-  void (NativeArray::*clone_)(const Element *, size_t);
-
-  GTEST_DISALLOW_ASSIGN_(NativeArray);
+  void (NativeArray::*clone_)(const Element*, size_t);
 };
 
 // Backport of std::index_sequence.
@@ -1150,12 +1174,18 @@ struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
 // Backport of std::make_index_sequence.
 // It uses O(ln(N)) instantiation depth.
 template <size_t N>
-struct MakeIndexSequence
-    : DoubleSequence<N % 2 == 1, typename MakeIndexSequence<N / 2>::type,
+struct MakeIndexSequenceImpl
+    : DoubleSequence<N % 2 == 1, typename MakeIndexSequenceImpl<N / 2>::type,
                      N / 2>::type {};
 
 template <>
-struct MakeIndexSequence<0> : IndexSequence<> {};
+struct MakeIndexSequenceImpl<0> : IndexSequence<> {};
+
+template <size_t N>
+using MakeIndexSequence = typename MakeIndexSequenceImpl<N>::type;
+
+template <typename... T>
+using IndexSequenceFor = typename MakeIndexSequence<sizeof...(T)>::type;
 
 template <size_t>
 struct Ignore {
@@ -1181,6 +1211,8 @@ struct ElemFromList {
           static_cast<T (*)()>(nullptr)...));
 };
 
+struct FlatTupleConstructTag {};
+
 template <typename... T>
 class FlatTuple;
 
@@ -1191,7 +1223,9 @@ template <typename... T, size_t I>
 struct FlatTupleElemBase<FlatTuple<T...>, I> {
   using value_type = typename ElemFromList<I, T...>::type;
   FlatTupleElemBase() = default;
-  explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {}
+  template <typename Arg>
+  explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t)
+      : value(std::forward<Arg>(t)) {}
   value_type value;
 };
 
@@ -1203,8 +1237,30 @@ struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
     : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
   using Indices = IndexSequence<Idx...>;
   FlatTupleBase() = default;
-  explicit FlatTupleBase(T... t)
-      : FlatTupleElemBase<FlatTuple<T...>, Idx>(std::move(t))... {}
+  template <typename... Args>
+  explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args)
+      : FlatTupleElemBase<FlatTuple<T...>, Idx>(FlatTupleConstructTag{},
+                                                std::forward<Args>(args))... {}
+
+  template <size_t I>
+  const typename ElemFromList<I, T...>::type& Get() const {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <size_t I>
+  typename ElemFromList<I, T...>::type& Get() {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <typename F>
+  auto Apply(F&& f) -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
+
+  template <typename F>
+  auto Apply(F&& f) const -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
 };
 
 // Analog to std::tuple but with different tradeoffs.
@@ -1225,17 +1281,12 @@ class FlatTuple
 
  public:
   FlatTuple() = default;
-  explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {}
+  template <typename... Args>
+  explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args)
+      : FlatTuple::FlatTupleBase(tag, std::forward<Args>(args)...) {}
 
-  template <size_t I>
-  const typename ElemFromList<I, T...>::type &Get() const {
-    return static_cast<const FlatTupleElemBase<FlatTuple, I> *>(this)->value;
-  }
-
-  template <size_t I>
-  typename ElemFromList<I, T...>::type &Get() {
-    return static_cast<FlatTupleElemBase<FlatTuple, I> *>(this)->value;
-  }
+  using FlatTuple::FlatTupleBase::Apply;
+  using FlatTuple::FlatTupleBase::Get;
 };
 
 // Utility functions to be called with static_assert to induce deprecation
@@ -1268,9 +1319,25 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
 }  // namespace internal
 }  // namespace testing
 
-#define GTEST_MESSAGE_AT_(file, line, message, result_type)             \
-  ::testing::internal::AssertHelper(result_type, file, line, message) = \
-      ::testing::Message()
+namespace std {
+// Some standard library implementations use `struct tuple_size` and some use
+// `class tuple_size`. Clang warns about the mismatch.
+// https://reviews.llvm.org/D55466
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <typename... Ts>
+struct tuple_size<testing::internal::FlatTuple<Ts...>>
+    : std::integral_constant<size_t, sizeof...(Ts)> {};
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}  // namespace std
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()
 
 #define GTEST_MESSAGE_(message, result_type) \
   GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1290,20 +1357,74 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
 // Suppress MSVC warning 4072 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
+// NOTE: The "else" is important to keep this expansion to prevent a top-level
+// "else" from attaching to our "if".
 #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
   if (::testing::internal::AlwaysTrue()) {                        \
     statement;                                                    \
+  } else                     /* NOLINT */                         \
+    static_assert(true, "")  // User must have a semicolon after expansion.
+
+#if GTEST_HAS_EXCEPTIONS
+
+namespace testing {
+namespace internal {
+
+class NeverThrown {
+ public:
+  const char* what() const noexcept {
+    return "this exception should never be thrown";
   }
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#if GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e))
+
+#else  // GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) \
+  std::string { "an std::exception-derived error" }
+
+#endif  // GTEST_HAS_RTTI
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)   \
+  catch (typename std::conditional<                                            \
+         std::is_same<typename std::remove_cv<typename std::remove_reference<  \
+                          expected_exception>::type>::type,                    \
+                      std::exception>::value,                                  \
+         const ::testing::internal::NeverThrown&, const std::exception&>::type \
+             e) {                                                              \
+    gtest_msg.value = "Expected: " #statement                                  \
+                      " throws an exception of type " #expected_exception      \
+                      ".\n  Actual: it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                               \
+    gtest_msg.value += " with description \"";                                 \
+    gtest_msg.value += e.what();                                               \
+    gtest_msg.value += "\".";                                                  \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);                \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)
+
+#endif  // GTEST_HAS_EXCEPTIONS
 
 #define GTEST_TEST_THROW_(statement, expected_exception, fail)              \
   GTEST_AMBIGUOUS_ELSE_BLOCKER_                                             \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") {                   \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                    \
     bool gtest_caught_expected = false;                                     \
     try {                                                                   \
       GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);            \
-    } catch (expected_exception const &) {                                  \
+    } catch (expected_exception const&) {                                   \
       gtest_caught_expected = true;                                         \
-    } catch (...) {                                                         \
+    }                                                                       \
+    GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)    \
+    catch (...) {                                                           \
       gtest_msg.value = "Expected: " #statement                             \
                         " throws an exception of type " #expected_exception \
                         ".\n  Actual: it throws a different type.";         \
@@ -1315,19 +1436,20 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
                         ".\n  Actual: it throws nothing.";                  \
       goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
     }                                                                       \
-  } else                                                                    \
+  } else /*NOLINT*/                                                         \
     GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__)                   \
-        : fail(gtest_msg.value)
+        : fail(gtest_msg.value.c_str())
 
 #if GTEST_HAS_EXCEPTIONS
 
-#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
-  catch (std::exception const &e) {                                          \
-    gtest_msg.value =                                                        \
-        ("it throws std::exception-derived exception with description: \""); \
-    gtest_msg.value += e.what();                                             \
-    gtest_msg.value += "\".";                                                \
-    goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);            \
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                \
+  catch (std::exception const& e) {                               \
+    gtest_msg.value = "it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                  \
+    gtest_msg.value += " with description \"";                    \
+    gtest_msg.value += e.what();                                  \
+    gtest_msg.value += "\".";                                     \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
   }
 
 #else  // GTEST_HAS_EXCEPTIONS
@@ -1336,69 +1458,66 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
 
 #endif  // GTEST_HAS_EXCEPTIONS
 
-#define GTEST_TEST_NO_THROW_(statement, fail)                            \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                          \
-  if (::testing::internal::TrueWithString gtest_msg{}) {                 \
-    try {                                                                \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);         \
-    }                                                                    \
-    GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
-    catch (...) {                                                        \
-      gtest_msg.value = "it throws.";                                    \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);      \
-    }                                                                    \
-  } else                                                                 \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__)              \
-        : fail(("Expected: " #statement " doesn't throw an exception.\n" \
-                "  Actual: " +                                           \
-                gtest_msg.value)                                         \
-                   .c_str())
-
-#define GTEST_TEST_ANY_THROW_(statement, fail)                       \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                      \
-  if (::testing::internal::AlwaysTrue()) {                           \
-    bool gtest_caught_any = false;                                   \
-    try {                                                            \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);     \
-    } catch (...) {                                                  \
-      gtest_caught_any = true;                                       \
-    }                                                                \
-    if (!gtest_caught_any) {                                         \
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::TrueWithString gtest_msg{}) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+    catch (...) { \
+      gtest_msg.value = "it throws."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail(("Expected: " #statement " doesn't throw an exception.\n" \
+            "  Actual: " + gtest_msg.value).c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
       goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
-    }                                                                \
-  } else                                                             \
-    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__)         \
-        : fail("Expected: " #statement                               \
-               " throws an exception.\n"                             \
-               "  Actual: it doesn't.")
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")
+
 
 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
+// representation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                       \
-  if (const ::testing::AssertionResult gtest_ar_ =                    \
-          ::testing::AssertionResult(expression))                     \
-    ;                                                                 \
-  else                                                                \
-    fail(::testing::internal::GetBoolAssertionFailureMessage(         \
-             gtest_ar_, text, #actual, #expected)                     \
-             .c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail)                          \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
-  if (::testing::internal::AlwaysTrue()) {                                     \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
     ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
-    if (gtest_fatal_failure_checker.has_new_fatal_failure()) {                 \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__);            \
-    }                                                                          \
-  } else                                                                       \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__)                    \
-        : fail("Expected: " #statement                                         \
-               " doesn't generate new fatal "                                  \
-               "failures in the current thread.\n"                             \
-               "  Actual: it does.")
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")
 
 // Expands to the name of the class that implements the given test.
 #define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
@@ -1413,7 +1532,7 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
   class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                    \
       : public parent_class {                                                 \
    public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                   \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default;           \
     ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
     GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
                                                            test_name));       \
@@ -1422,10 +1541,10 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
                                                                               \
    private:                                                                   \
     void TestBody() override;                                                 \
-    static ::testing::TestInfo *const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
   };                                                                          \
                                                                               \
-  ::testing::TestInfo *const GTEST_TEST_CLASS_NAME_(test_suite_name,          \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name,          \
                                                     test_name)::test_info_ =  \
       ::testing::internal::MakeAndRegisterTestInfo(                           \
           #test_suite_name, #test_name, nullptr, nullptr,                     \
@@ -1438,4 +1557,4 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
               test_suite_name, test_name)>);                                  \
   void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
index 0d8fc71ce2..c2ef6e3124 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
@@ -27,12 +27,13 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 // Type and function utilities for implementing parameterized tests.
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
 
 #include <ctype.h>
 
@@ -55,8 +56,9 @@ namespace testing {
 // Consists of the parameter value and the integer parameter index.
 template <class ParamType>
 struct TestParamInfo {
-  TestParamInfo(const ParamType &a_param, size_t an_index)
-      : param(a_param), index(an_index) {}
+  TestParamInfo(const ParamType& a_param, size_t an_index) :
+    param(a_param),
+    index(an_index) {}
   ParamType param;
   size_t index;
 };
@@ -65,7 +67,7 @@ struct TestParamInfo {
 // testing::PrintToString.
 struct PrintToStringParamName {
   template <class ParamType>
-  std::string operator()(const TestParamInfo<ParamType> &info) const {
+  std::string operator()(const TestParamInfo<ParamType>& info) const {
     return PrintToString(info.param);
   }
 };
@@ -79,13 +81,11 @@ namespace internal {
 // fixture class for the same test suite. This may happen when
 // TEST_P macro is used to define two tests with the same name
 // but in different namespaces.
-GTEST_API_ void ReportInvalidTestSuiteType(const char *test_suite_name,
+GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
                                            CodeLocation code_location);
 
-template <typename>
-class ParamGeneratorInterface;
-template <typename>
-class ParamGenerator;
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
 
 // Interface for iterating over elements provided by an implementation
 // of ParamGeneratorInterface<T>.
@@ -96,7 +96,7 @@ class ParamIteratorInterface {
   // A pointer to the base generator instance.
   // Used only for the purposes of iterator comparison
   // to make sure that two iterators belong to the same generator.
-  virtual const ParamGeneratorInterface<T> *BaseGenerator() const = 0;
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
   // Advances iterator to point to the next element
   // provided by the generator. The caller is responsible
   // for not calling Advance() on an iterator equal to
@@ -104,16 +104,16 @@ class ParamIteratorInterface {
   virtual void Advance() = 0;
   // Clones the iterator object. Used for implementing copy semantics
   // of ParamIterator<T>.
-  virtual ParamIteratorInterface *Clone() const = 0;
+  virtual ParamIteratorInterface* Clone() const = 0;
   // Dereferences the current iterator and provides (read-only) access
   // to the pointed value. It is the caller's responsibility not to call
   // Current() on an iterator equal to BaseGenerator()->End().
   // Used for implementing ParamGenerator<T>::operator*().
-  virtual const T *Current() const = 0;
+  virtual const T* Current() const = 0;
   // Determines whether the given iterator and other point to the same
   // element in the sequence generated by the generator.
   // Used for implementing ParamGenerator<T>::operator==().
-  virtual bool Equals(const ParamIteratorInterface &other) const = 0;
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
 };
 
 // Class iterating over elements provided by an implementation of
@@ -123,40 +123,41 @@ template <typename T>
 class ParamIterator {
  public:
   typedef T value_type;
-  typedef const T &reference;
+  typedef const T& reference;
   typedef ptrdiff_t difference_type;
 
   // ParamIterator assumes ownership of the impl_ pointer.
-  ParamIterator(const ParamIterator &other) : impl_(other.impl_->Clone()) {}
-  ParamIterator &operator=(const ParamIterator &other) {
-    if (this != &other) impl_.reset(other.impl_->Clone());
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
     return *this;
   }
 
-  const T &operator*() const { return *impl_->Current(); }
-  const T *operator->() const { return impl_->Current(); }
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
   // Prefix version of operator++.
-  ParamIterator &operator++() {
+  ParamIterator& operator++() {
     impl_->Advance();
     return *this;
   }
   // Postfix version of operator++.
   ParamIterator operator++(int /*unused*/) {
-    ParamIteratorInterface<T> *clone = impl_->Clone();
+    ParamIteratorInterface<T>* clone = impl_->Clone();
     impl_->Advance();
     return ParamIterator(clone);
   }
-  bool operator==(const ParamIterator &other) const {
+  bool operator==(const ParamIterator& other) const {
     return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
   }
-  bool operator!=(const ParamIterator &other) const {
+  bool operator!=(const ParamIterator& other) const {
     return !(*this == other);
   }
 
  private:
   friend class ParamGenerator<T>;
-  explicit ParamIterator(ParamIteratorInterface<T> *impl) : impl_(impl) {}
-  std::unique_ptr<ParamIteratorInterface<T>> impl_;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  std::unique_ptr<ParamIteratorInterface<T> > impl_;
 };
 
 // ParamGeneratorInterface<T> is the binary interface to access generators
@@ -169,8 +170,8 @@ class ParamGeneratorInterface {
   virtual ~ParamGeneratorInterface() {}
 
   // Generator interface definition
-  virtual ParamIteratorInterface<T> *Begin() const = 0;
-  virtual ParamIteratorInterface<T> *End() const = 0;
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
 };
 
 // Wraps ParamGeneratorInterface<T> and provides general generator syntax
@@ -178,15 +179,15 @@ class ParamGeneratorInterface {
 // This class implements copy initialization semantics and the contained
 // ParamGeneratorInterface<T> instance is shared among all copies
 // of the original object. This is possible because that instance is immutable.
-template <typename T>
+template<typename T>
 class ParamGenerator {
  public:
   typedef ParamIterator<T> iterator;
 
-  explicit ParamGenerator(ParamGeneratorInterface<T> *impl) : impl_(impl) {}
-  ParamGenerator(const ParamGenerator &other) : impl_(other.impl_) {}
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
 
-  ParamGenerator &operator=(const ParamGenerator &other) {
+  ParamGenerator& operator=(const ParamGenerator& other) {
     impl_ = other.impl_;
     return *this;
   }
@@ -195,7 +196,7 @@ class ParamGenerator {
   iterator end() const { return iterator(impl_->End()); }
 
  private:
-  std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
+  std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
 };
 
 // Generates values from a range of two comparable values. Can be used to
@@ -206,37 +207,37 @@ template <typename T, typename IncrementT>
 class RangeGenerator : public ParamGeneratorInterface<T> {
  public:
   RangeGenerator(T begin, T end, IncrementT step)
-      : begin_(begin), end_(end), step_(step),
-        end_index_(CalculateEndIndex(begin, end, step)) {}
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
   ~RangeGenerator() override {}
 
-  ParamIteratorInterface<T> *Begin() const override {
+  ParamIteratorInterface<T>* Begin() const override {
     return new Iterator(this, begin_, 0, step_);
   }
-  ParamIteratorInterface<T> *End() const override {
+  ParamIteratorInterface<T>* End() const override {
     return new Iterator(this, end_, end_index_, step_);
   }
 
  private:
   class Iterator : public ParamIteratorInterface<T> {
    public:
-    Iterator(const ParamGeneratorInterface<T> *base, T value, int index,
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
              IncrementT step)
         : base_(base), value_(value), index_(index), step_(step) {}
     ~Iterator() override {}
 
-    const ParamGeneratorInterface<T> *BaseGenerator() const override {
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
       return base_;
     }
     void Advance() override {
       value_ = static_cast<T>(value_ + step_);
       index_++;
     }
-    ParamIteratorInterface<T> *Clone() const override {
+    ParamIteratorInterface<T>* Clone() const override {
       return new Iterator(*this);
     }
-    const T *Current() const override { return &value_; }
-    bool Equals(const ParamIteratorInterface<T> &other) const override {
+    const T* Current() const override { return &value_; }
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
       GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
@@ -248,28 +249,31 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
     }
 
    private:
-    Iterator(const Iterator &other)
-        : ParamIteratorInterface<T>(), base_(other.base_), value_(other.value_),
-          index_(other.index_), step_(other.step_) {}
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
 
     // No implementation - assignment is unsupported.
-    void operator=(const Iterator &other);
+    void operator=(const Iterator& other);
 
-    const ParamGeneratorInterface<T> *const base_;
+    const ParamGeneratorInterface<T>* const base_;
     T value_;
     int index_;
     const IncrementT step_;
   };  // class RangeGenerator::Iterator
 
-  static int CalculateEndIndex(const T &begin, const T &end,
-                               const IncrementT &step) {
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
     int end_index = 0;
-    for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
+    for (T i = begin; i < end; i = static_cast<T>(i + step))
+      end_index++;
     return end_index;
   }
 
   // No implementation - assignment is unsupported.
-  void operator=(const RangeGenerator &other);
+  void operator=(const RangeGenerator& other);
 
   const T begin_;
   const T end_;
@@ -279,6 +283,7 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
   const int end_index_;
 };  // class RangeGenerator
 
+
 // Generates values from a pair of STL-style iterators. Used in the
 // ValuesIn() function. The elements are copied from the source range
 // since the source can be located on the stack, and the generator
@@ -291,10 +296,10 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
       : container_(begin, end) {}
   ~ValuesInIteratorRangeGenerator() override {}
 
-  ParamIteratorInterface<T> *Begin() const override {
+  ParamIteratorInterface<T>* Begin() const override {
     return new Iterator(this, container_.begin());
   }
-  ParamIteratorInterface<T> *End() const override {
+  ParamIteratorInterface<T>* End() const override {
     return new Iterator(this, container_.end());
   }
 
@@ -303,19 +308,19 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
 
   class Iterator : public ParamIteratorInterface<T> {
    public:
-    Iterator(const ParamGeneratorInterface<T> *base,
+    Iterator(const ParamGeneratorInterface<T>* base,
              typename ContainerType::const_iterator iterator)
         : base_(base), iterator_(iterator) {}
     ~Iterator() override {}
 
-    const ParamGeneratorInterface<T> *BaseGenerator() const override {
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
       return base_;
     }
     void Advance() override {
       ++iterator_;
       value_.reset();
     }
-    ParamIteratorInterface<T> *Clone() const override {
+    ParamIteratorInterface<T>* Clone() const override {
       return new Iterator(*this);
     }
     // We need to use cached value referenced by iterator_ because *iterator_
@@ -325,28 +330,29 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
     // can advance iterator_ beyond the end of the range, and we cannot
     // detect that fact. The client code, on the other hand, is
     // responsible for not calling Current() on an out-of-range iterator.
-    const T *Current() const override {
+    const T* Current() const override {
       if (value_.get() == nullptr) value_.reset(new T(*iterator_));
       return value_.get();
     }
-    bool Equals(const ParamIteratorInterface<T> &other) const override {
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
       GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
           << "The program attempted to compare iterators "
           << "from different generators." << std::endl;
       return iterator_ ==
-             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
     }
 
    private:
-    Iterator(const Iterator &other)
-        // The explicit constructor call suppresses a false warning
-        // emitted by gcc when supplied with the -Wextra option.
-        : ParamIteratorInterface<T>(), base_(other.base_),
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
           iterator_(other.iterator_) {}
 
-    const ParamGeneratorInterface<T> *const base_;
+    const ParamGeneratorInterface<T>* const base_;
     typename ContainerType::const_iterator iterator_;
     // A cached value of *iterator_. We keep it here to allow access by
     // pointer in the wrapping iterator's operator->().
@@ -357,7 +363,7 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
   };  // class ValuesInIteratorRangeGenerator::Iterator
 
   // No implementation - assignment is unsupported.
-  void operator=(const ValuesInIteratorRangeGenerator &other);
+  void operator=(const ValuesInIteratorRangeGenerator& other);
 
   const ContainerType container_;
 };  // class ValuesInIteratorRangeGenerator
@@ -367,7 +373,7 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
 // Default parameterized test name generator, returns a string containing the
 // integer test parameter index.
 template <class ParamType>
-std::string DefaultParamName(const TestParamInfo<ParamType> &info) {
+std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
   Message name_stream;
   name_stream << info.index;
   return name_stream.GetString();
@@ -378,7 +384,7 @@ void TestNotEmpty() {
   static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
 }
 template <typename T = int>
-void TestNotEmpty(const T &) {}
+void TestNotEmpty(const T&) {}
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
@@ -388,9 +394,9 @@ template <class TestClass>
 class ParameterizedTestFactory : public TestFactoryBase {
  public:
   typedef typename TestClass::ParamType ParamType;
-  explicit ParameterizedTestFactory(ParamType parameter)
-      : parameter_(parameter) {}
-  Test *CreateTest() override {
+  explicit ParameterizedTestFactory(ParamType parameter) :
+      parameter_(parameter) {}
+  Test* CreateTest() override {
     TestClass::SetParam(&parameter_);
     return new TestClass();
   }
@@ -410,7 +416,7 @@ class TestMetaFactoryBase {
  public:
   virtual ~TestMetaFactoryBase() {}
 
-  virtual TestFactoryBase *CreateTestFactory(ParamType parameter) = 0;
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
 };
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -429,7 +435,7 @@ class TestMetaFactory
 
   TestMetaFactory() {}
 
-  TestFactoryBase *CreateTestFactory(ParamType parameter) override {
+  TestFactoryBase* CreateTestFactory(ParamType parameter) override {
     return new ParameterizedTestFactory<TestSuite>(parameter);
   }
 
@@ -452,8 +458,8 @@ class ParameterizedTestSuiteInfoBase {
   virtual ~ParameterizedTestSuiteInfoBase() {}
 
   // Base part of test suite name for display purposes.
-  virtual const std::string &GetTestSuiteName() const = 0;
-  // Test case id to verify identity.
+  virtual const std::string& GetTestSuiteName() const = 0;
+  // Test suite id to verify identity.
   virtual TypeId GetTestSuiteTypeId() const = 0;
   // UnitTest class invokes this method to register tests in this
   // test suite right before running them in RUN_ALL_TESTS macro.
@@ -472,11 +478,11 @@ class ParameterizedTestSuiteInfoBase {
 //
 // Report a the name of a test_suit as safe to ignore
 // as the side effect of construction of this type.
-struct MarkAsIgnored {
-  explicit MarkAsIgnored(const char *test_suite);
+struct GTEST_API_ MarkAsIgnored {
+  explicit MarkAsIgnored(const char* test_suite);
 };
 
-GTEST_API_ void InsertSyntheticTestCase(const std::string &name,
+GTEST_API_ void InsertSyntheticTestCase(const std::string& name,
                                         CodeLocation location, bool has_test_p);
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -495,17 +501,17 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
   using ParamType = typename TestSuite::ParamType;
   // A function that returns an instance of appropriate generator type.
   typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
-  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType> &);
+  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType>&);
 
-  explicit ParameterizedTestSuiteInfo(const char *name,
+  explicit ParameterizedTestSuiteInfo(const char* name,
                                       CodeLocation code_location)
       : test_suite_name_(name), code_location_(code_location) {}
 
-  // Test case base name for display purposes.
-  const std::string &GetTestSuiteName() const override {
+  // Test suite base name for display purposes.
+  const std::string& GetTestSuiteName() const override {
     return test_suite_name_;
   }
-  // Test case id to verify identity.
+  // Test suite id to verify identity.
   TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
   // TEST_P macro uses AddTestPattern() to record information
   // about a single test in a LocalTestInfo structure.
@@ -513,17 +519,18 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
   // prefix). test_base_name is the name of an individual test without
   // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
   // test suite base name and DoBar is test base name.
-  void AddTestPattern(const char *test_suite_name, const char *test_base_name,
-                      TestMetaFactoryBase<ParamType> *meta_factory) {
-    tests_.push_back(std::shared_ptr<TestInfo>(
-        new TestInfo(test_suite_name, test_base_name, meta_factory)));
+  void AddTestPattern(const char* test_suite_name, const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory,
+                      CodeLocation code_location) {
+    tests_.push_back(std::shared_ptr<TestInfo>(new TestInfo(
+        test_suite_name, test_base_name, meta_factory, code_location)));
   }
   // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
   // about a generator.
-  int AddTestSuiteInstantiation(const std::string &instantiation_name,
-                                GeneratorCreationFunc *func,
-                                ParamNameGeneratorFunc *name_func,
-                                const char *file, int line) {
+  int AddTestSuiteInstantiation(const std::string& instantiation_name,
+                                GeneratorCreationFunc* func,
+                                ParamNameGeneratorFunc* name_func,
+                                const char* file, int line) {
     instantiations_.push_back(
         InstantiationInfo(instantiation_name, func, name_func, file, line));
     return 0;  // Return value used only to run this method in namespace scope.
@@ -540,16 +547,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
          test_it != tests_.end(); ++test_it) {
       std::shared_ptr<TestInfo> test_info = *test_it;
       for (typename InstantiationContainer::iterator gen_it =
-               instantiations_.begin();
-           gen_it != instantiations_.end(); ++gen_it) {
-        const std::string &instantiation_name = gen_it->name;
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const std::string& instantiation_name = gen_it->name;
         ParamGenerator<ParamType> generator((*gen_it->generator)());
-        ParamNameGeneratorFunc *name_func = gen_it->name_func;
-        const char *file = gen_it->file;
+        ParamNameGeneratorFunc* name_func = gen_it->name_func;
+        const char* file = gen_it->file;
         int line = gen_it->line;
 
         std::string test_suite_name;
-        if (!instantiation_name.empty())
+        if ( !instantiation_name.empty() )
           test_suite_name = instantiation_name + "/";
         test_suite_name += test_info->test_suite_base_name;
 
@@ -562,16 +569,17 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
 
           Message test_name_stream;
 
-          std::string param_name =
-              name_func(TestParamInfo<ParamType>(*param_it, i));
+          std::string param_name = name_func(
+              TestParamInfo<ParamType>(*param_it, i));
 
           GTEST_CHECK_(IsValidParamName(param_name))
               << "Parameterized test name '" << param_name
-              << "' is invalid, in " << file << " line " << line << std::endl;
+              << "' is invalid, in " << file
+              << " line " << line << std::endl;
 
           GTEST_CHECK_(test_param_names.count(param_name) == 0)
-              << "Duplicate parameterized test name '" << param_name << "', in "
-              << file << " line " << line << std::endl;
+              << "Duplicate parameterized test name '" << param_name
+              << "', in " << file << " line " << line << std::endl;
 
           test_param_names.insert(param_name);
 
@@ -582,63 +590,72 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
           MakeAndRegisterTestInfo(
               test_suite_name.c_str(), test_name_stream.GetString().c_str(),
               nullptr,  // No type parameter.
-              PrintToString(*param_it).c_str(), code_location_,
+              PrintToString(*param_it).c_str(), test_info->code_location,
               GetTestSuiteTypeId(),
               SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
               SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
               test_info->test_meta_factory->CreateTestFactory(*param_it));
         }  // for param_it
-      }    // for gen_it
-    }      // for test_it
+      }  // for gen_it
+    }  // for test_it
 
     if (!generated_instantiations) {
       // There are no generaotrs, or they all generate nothing ...
       InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
                               !tests_.empty());
     }
-  }  // RegisterTests
+  }    // RegisterTests
 
  private:
   // LocalTestInfo structure keeps information about a single test registered
   // with TEST_P macro.
   struct TestInfo {
-    TestInfo(const char *a_test_suite_base_name, const char *a_test_base_name,
-             TestMetaFactoryBase<ParamType> *a_test_meta_factory)
+    TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory,
+             CodeLocation a_code_location)
         : test_suite_base_name(a_test_suite_base_name),
           test_base_name(a_test_base_name),
-          test_meta_factory(a_test_meta_factory) {}
+          test_meta_factory(a_test_meta_factory),
+          code_location(a_code_location) {}
 
     const std::string test_suite_base_name;
     const std::string test_base_name;
-    const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
+    const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+    const CodeLocation code_location;
   };
-  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
+  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
   // Records data received from INSTANTIATE_TEST_SUITE_P macros:
   //  <Instantiation name, Sequence generator creation function,
   //     Name generator function, Source file, Source line>
   struct InstantiationInfo {
-    InstantiationInfo(const std::string &name_in,
-                      GeneratorCreationFunc *generator_in,
-                      ParamNameGeneratorFunc *name_func_in, const char *file_in,
-                      int line_in)
-        : name(name_in), generator(generator_in), name_func(name_func_in),
-          file(file_in), line(line_in) {}
-
-    std::string name;
-    GeneratorCreationFunc *generator;
-    ParamNameGeneratorFunc *name_func;
-    const char *file;
-    int line;
+      InstantiationInfo(const std::string &name_in,
+                        GeneratorCreationFunc* generator_in,
+                        ParamNameGeneratorFunc* name_func_in,
+                        const char* file_in,
+                        int line_in)
+          : name(name_in),
+            generator(generator_in),
+            name_func(name_func_in),
+            file(file_in),
+            line(line_in) {}
+
+      std::string name;
+      GeneratorCreationFunc* generator;
+      ParamNameGeneratorFunc* name_func;
+      const char* file;
+      int line;
   };
   typedef ::std::vector<InstantiationInfo> InstantiationContainer;
 
-  static bool IsValidParamName(const std::string &name) {
+  static bool IsValidParamName(const std::string& name) {
     // Check for empty string
-    if (name.empty()) return false;
+    if (name.empty())
+      return false;
 
     // Check for invalid characters
     for (std::string::size_type index = 0; index < name.size(); ++index) {
-      if (!isalnum(name[index]) && name[index] != '_') return false;
+      if (!IsAlNum(name[index]) && name[index] != '_')
+        return false;
     }
 
     return true;
@@ -668,7 +685,7 @@ class ParameterizedTestSuiteRegistry {
  public:
   ParameterizedTestSuiteRegistry() {}
   ~ParameterizedTestSuiteRegistry() {
-    for (auto &test_suite_info : test_suite_infos_) {
+    for (auto& test_suite_info : test_suite_infos_) {
       delete test_suite_info;
     }
   }
@@ -676,10 +693,10 @@ class ParameterizedTestSuiteRegistry {
   // Looks up or creates and returns a structure containing information about
   // tests and instantiations of a particular test suite.
   template <class TestSuite>
-  ParameterizedTestSuiteInfo<TestSuite> *GetTestSuitePatternHolder(
-      const char *test_suite_name, CodeLocation code_location) {
-    ParameterizedTestSuiteInfo<TestSuite> *typed_test_info = nullptr;
-    for (auto &test_suite_info : test_suite_infos_) {
+  ParameterizedTestSuiteInfo<TestSuite>* GetTestSuitePatternHolder(
+      const char* test_suite_name, CodeLocation code_location) {
+    ParameterizedTestSuiteInfo<TestSuite>* typed_test_info = nullptr;
+    for (auto& test_suite_info : test_suite_infos_) {
       if (test_suite_info->GetTestSuiteName() == test_suite_name) {
         if (test_suite_info->GetTestSuiteTypeId() != GetTypeId<TestSuite>()) {
           // Complain about incorrect usage of Google Test facilities
@@ -692,7 +709,7 @@ class ParameterizedTestSuiteRegistry {
           // type we are looking for, so we downcast it to that type
           // without further checks.
           typed_test_info = CheckedDowncastToActualType<
-              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
+              ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
         }
         break;
       }
@@ -705,23 +722,22 @@ class ParameterizedTestSuiteRegistry {
     return typed_test_info;
   }
   void RegisterTests() {
-    for (auto &test_suite_info : test_suite_infos_) {
+    for (auto& test_suite_info : test_suite_infos_) {
       test_suite_info->RegisterTests();
     }
   }
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   template <class TestCase>
-  ParameterizedTestCaseInfo<TestCase> *GetTestCasePatternHolder(
-      const char *test_case_name, CodeLocation code_location) {
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name, CodeLocation code_location) {
     return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
   }
 
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
  private:
-  using TestSuiteInfoContainer =
-      ::std::vector<ParameterizedTestSuiteInfoBase *>;
+  using TestSuiteInfoContainer = ::std::vector<ParameterizedTestSuiteInfoBase*>;
 
   TestSuiteInfoContainer test_suite_infos_;
 
@@ -734,11 +750,11 @@ class ParameterizedTestSuiteRegistry {
 class TypeParameterizedTestSuiteRegistry {
  public:
   // Add a suite definition
-  void RegisterTestSuite(const char *test_suite_name,
+  void RegisterTestSuite(const char* test_suite_name,
                          CodeLocation code_location);
 
   // Add an instantiation of a suit.
-  void RegisterInstantiation(const char *test_suite_name);
+  void RegisterInstantiation(const char* test_suite_name);
 
   // For each suit repored as defined but not reported as instantiation,
   // emit a test that reports that fact (configurably, as an error).
@@ -762,15 +778,20 @@ class TypeParameterizedTestSuiteRegistry {
 // include/gtest/gtest-param-test.h.
 template <class Container>
 internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container &container);
+    const Container& container);
 
 namespace internal {
 // Used in the Values() function to provide polymorphic capabilities.
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
 template <typename... Ts>
 class ValueArray {
  public:
-  ValueArray(Ts... v) : v_{ std::move(v)... } {}
+  explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {}
 
   template <typename T>
   operator ParamGenerator<T>() const {  // NOLINT
@@ -780,26 +801,30 @@ class ValueArray {
  private:
   template <typename T, size_t... I>
   std::vector<T> MakeVector(IndexSequence<I...>) const {
-    return std::vector<T>{ static_cast<T>(v_.template Get<I>())... };
+    return std::vector<T>{static_cast<T>(v_.template Get<I>())...};
   }
 
   FlatTuple<Ts...> v_;
 };
 
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 template <typename... T>
 class CartesianProductGenerator
     : public ParamGeneratorInterface<::std::tuple<T...>> {
  public:
   typedef ::std::tuple<T...> ParamType;
 
-  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...> &g)
+  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...>& g)
       : generators_(g) {}
   ~CartesianProductGenerator() override {}
 
-  ParamIteratorInterface<ParamType> *Begin() const override {
+  ParamIteratorInterface<ParamType>* Begin() const override {
     return new Iterator(this, generators_, false);
   }
-  ParamIteratorInterface<ParamType> *End() const override {
+  ParamIteratorInterface<ParamType>* End() const override {
     return new Iterator(this, generators_, true);
   }
 
@@ -810,17 +835,17 @@ class CartesianProductGenerator
   class IteratorImpl<IndexSequence<I...>>
       : public ParamIteratorInterface<ParamType> {
    public:
-    IteratorImpl(const ParamGeneratorInterface<ParamType> *base,
-                 const std::tuple<ParamGenerator<T>...> &generators,
-                 bool is_end)
-        : base_(base), begin_(std::get<I>(generators).begin()...),
+    IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
+             const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
+        : base_(base),
+          begin_(std::get<I>(generators).begin()...),
           end_(std::get<I>(generators).end()...),
           current_(is_end ? end_ : begin_) {
       ComputeCurrentValue();
     }
     ~IteratorImpl() override {}
 
-    const ParamGeneratorInterface<ParamType> *BaseGenerator() const override {
+    const ParamGeneratorInterface<ParamType>* BaseGenerator() const override {
       return base_;
     }
     // Advance should not be called on beyond-of-range iterators
@@ -833,19 +858,19 @@ class CartesianProductGenerator
       AdvanceIfEnd<sizeof...(T) - 1>();
       ComputeCurrentValue();
     }
-    ParamIteratorInterface<ParamType> *Clone() const override {
+    ParamIteratorInterface<ParamType>* Clone() const override {
       return new IteratorImpl(*this);
     }
 
-    const ParamType *Current() const override { return current_value_.get(); }
+    const ParamType* Current() const override { return current_value_.get(); }
 
-    bool Equals(const ParamIteratorInterface<ParamType> &other) const override {
+    bool Equals(const ParamIteratorInterface<ParamType>& other) const override {
       // Having the same base generator guarantees that the other
       // iterator is of the same type and we can downcast.
       GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
           << "The program attempted to compare iterators "
           << "from different generators." << std::endl;
-      const IteratorImpl *typed_other =
+      const IteratorImpl* typed_other =
           CheckedDowncastToActualType<const IteratorImpl>(&other);
 
       // We must report iterators equal if they both point beyond their
@@ -854,9 +879,9 @@ class CartesianProductGenerator
       if (AtEnd() && typed_other->AtEnd()) return true;
 
       bool same = true;
-      bool dummy[] = { (same = same &&
-                               std::get<I>(current_) ==
-                                   std::get<I>(typed_other->current_))... };
+      bool dummy[] = {
+          (same = same && std::get<I>(current_) ==
+                              std::get<I>(typed_other->current_))...};
       (void)dummy;
       return same;
     }
@@ -884,13 +909,13 @@ class CartesianProductGenerator
     }
     bool AtEnd() const {
       bool at_end = false;
-      bool dummy[] = { (at_end = at_end || std::get<I>(current_) ==
-                                               std::get<I>(end_))... };
+      bool dummy[] = {
+          (at_end = at_end || std::get<I>(current_) == std::get<I>(end_))...};
       (void)dummy;
       return at_end;
     }
 
-    const ParamGeneratorInterface<ParamType> *const base_;
+    const ParamGeneratorInterface<ParamType>* const base_;
     std::tuple<typename ParamGenerator<T>::iterator...> begin_;
     std::tuple<typename ParamGenerator<T>::iterator...> end_;
     std::tuple<typename ParamGenerator<T>::iterator...> current_;
@@ -905,7 +930,7 @@ class CartesianProductGenerator
 template <class... Gen>
 class CartesianProductHolder {
  public:
-  CartesianProductHolder(const Gen &... g) : generators_(g...) {}
+  CartesianProductHolder(const Gen&... g) : generators_(g...) {}
   template <typename... T>
   operator ParamGenerator<::std::tuple<T...>>() const {
     return ParamGenerator<::std::tuple<T...>>(
@@ -919,4 +944,4 @@ class CartesianProductHolder {
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
index f803a19be3..dd845915e3 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
@@ -32,80 +32,83 @@
 // This header file defines the GTEST_OS_* macro.
 // It is separate from gtest-port.h so that custom/gtest-port.h can include it.
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
 
 // Determines the platform on which Google Test is compiled.
 #ifdef __CYGWIN__
-#define GTEST_OS_CYGWIN 1
-#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-#define GTEST_OS_WINDOWS_MINGW 1
-#define GTEST_OS_WINDOWS 1
+# define GTEST_OS_CYGWIN 1
+# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+#  define GTEST_OS_WINDOWS 1
 #elif defined _WIN32
-#define GTEST_OS_WINDOWS 1
-#ifdef _WIN32_WCE
-#define GTEST_OS_WINDOWS_MOBILE 1
-#elif defined(WINAPI_FAMILY)
-#include <winapifamily.h>
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define GTEST_OS_WINDOWS_DESKTOP 1
-#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-#define GTEST_OS_WINDOWS_PHONE 1
-#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-#define GTEST_OS_WINDOWS_RT 1
-#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-#define GTEST_OS_WINDOWS_PHONE 1
-#define GTEST_OS_WINDOWS_TV_TITLE 1
-#else
-// WINAPI_FAMILY defined but no known partition matched.
-// Default to desktop.
-#define GTEST_OS_WINDOWS_DESKTOP 1
-#endif
-#else
-#define GTEST_OS_WINDOWS_DESKTOP 1
-#endif  // _WIN32_WCE
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(WINAPI_FAMILY)
+#  include <winapifamily.h>
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#   define GTEST_OS_WINDOWS_PHONE 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#   define GTEST_OS_WINDOWS_RT 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#   define GTEST_OS_WINDOWS_PHONE 1
+#   define GTEST_OS_WINDOWS_TV_TITLE 1
+#  else
+    // WINAPI_FAMILY defined but no known partition matched.
+    // Default to desktop.
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  endif
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
 #elif defined __OS2__
-#define GTEST_OS_OS2 1
+# define GTEST_OS_OS2 1
 #elif defined __APPLE__
-#define GTEST_OS_MAC 1
-#if TARGET_OS_IPHONE
-#define GTEST_OS_IOS 1
-#endif
+# define GTEST_OS_MAC 1
+# include <TargetConditionals.h>
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+# endif
 #elif defined __DragonFly__
-#define GTEST_OS_DRAGONFLY 1
+# define GTEST_OS_DRAGONFLY 1
 #elif defined __FreeBSD__
-#define GTEST_OS_FREEBSD 1
+# define GTEST_OS_FREEBSD 1
 #elif defined __Fuchsia__
-#define GTEST_OS_FUCHSIA 1
+# define GTEST_OS_FUCHSIA 1
 #elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-#define GTEST_OS_GNU_KFREEBSD 1
+# define GTEST_OS_GNU_KFREEBSD 1
 #elif defined __linux__
-#define GTEST_OS_LINUX 1
-#if defined __ANDROID__
-#define GTEST_OS_LINUX_ANDROID 1
-#endif
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
 #elif defined __MVS__
-#define GTEST_OS_ZOS 1
+# define GTEST_OS_ZOS 1
 #elif defined(__sun) && defined(__SVR4)
-#define GTEST_OS_SOLARIS 1
+# define GTEST_OS_SOLARIS 1
 #elif defined(_AIX)
-#define GTEST_OS_AIX 1
+# define GTEST_OS_AIX 1
 #elif defined(__hpux)
-#define GTEST_OS_HPUX 1
+# define GTEST_OS_HPUX 1
 #elif defined __native_client__
-#define GTEST_OS_NACL 1
+# define GTEST_OS_NACL 1
 #elif defined __NetBSD__
-#define GTEST_OS_NETBSD 1
+# define GTEST_OS_NETBSD 1
 #elif defined __OpenBSD__
-#define GTEST_OS_OPENBSD 1
+# define GTEST_OS_OPENBSD 1
 #elif defined __QNX__
-#define GTEST_OS_QNX 1
+# define GTEST_OS_QNX 1
 #elif defined(__HAIKU__)
 #define GTEST_OS_HAIKU 1
 #elif defined ESP8266
 #define GTEST_OS_ESP8266 1
 #elif defined ESP32
 #define GTEST_OS_ESP32 1
+#elif defined(__XTENSA__)
+#define GTEST_OS_XTENSA 1
 #endif  // __CYGWIN__
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
index 083da569fe..0953a781c0 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
@@ -40,8 +40,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
 
 // Environment-describing macros
 // -----------------------------
@@ -199,9 +199,18 @@
 //                                        suppressed (constant conditional).
 //   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
 //                                        is suppressed.
+//   GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter<std::any> or
+//                            UniversalPrinter<absl::any> specializations.
+//   GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter<std::optional>
+//   or
+//                                 UniversalPrinter<absl::optional>
+//                                 specializations.
 //   GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
 //                                    Matcher<absl::string_view>
 //                                    specializations.
+//   GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter<std::variant> or
+//                                UniversalPrinter<absl::variant>
+//                                specializations.
 //
 // Synchronization:
 //   Mutex, MutexLock, ThreadLocal, GetThreadCount()
@@ -252,21 +261,24 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <cerrno>
 #include <cstdint>
 #include <limits>
 #include <type_traits>
 
 #ifndef _WIN32_WCE
-#include <sys/types.h>
-#include <sys/stat.h>
+# include <sys/types.h>
+# include <sys/stat.h>
 #endif  // !_WIN32_WCE
 
 #if defined __APPLE__
-#include <AvailabilityMacros.h>
-#include <TargetConditionals.h>
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
 #endif
 
 #include <iostream>  // NOLINT
+#include <locale>
 #include <memory>
 #include <string>  // NOLINT
 #include <tuple>
@@ -276,23 +288,23 @@
 #include "gtest/internal/gtest-port-arch.h"
 
 #if !defined(GTEST_DEV_EMAIL_)
-#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-#define GTEST_FLAG_PREFIX_ "gtest_"
-#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-#define GTEST_NAME_ "Google Test"
-#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+# define GTEST_FLAG_PREFIX_ "gtest_"
+# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+# define GTEST_NAME_ "Google Test"
+# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
 #endif  // !defined(GTEST_DEV_EMAIL_)
 
 #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
 #endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
 
 // Determines the version of gcc that is used to compile this.
 #ifdef __GNUC__
 // 40302 means version 4.3.2.
-#define GTEST_GCC_VER_ \
-  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
 #endif  // __GNUC__
 
 // Macros for disabling Microsoft Visual C++ warnings.
@@ -301,37 +313,41 @@
 //   /* code that triggers warnings C4800 and C4385 */
 //   GTEST_DISABLE_MSC_WARNINGS_POP_()
 #if defined(_MSC_VER)
-#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
-  __pragma(warning(push)) __pragma(warning(disable : warnings))
-#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+    __pragma(warning(push))                        \
+    __pragma(warning(disable: warnings))
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
+    __pragma(warning(pop))
 #else
 // Not all compilers are MSVC
-#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-#define GTEST_DISABLE_MSC_WARNINGS_POP_()
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
 // Clang on Windows does not understand MSVC's pragma warning.
 // We need clang-specific way to disable function deprecation warning.
 #ifdef __clang__
-#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                            \
-  _Pragma("clang diagnostic push")                                      \
-      _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
-          _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
+# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                         \
+    _Pragma("clang diagnostic push")                                  \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
+    _Pragma("clang diagnostic pop")
 #else
-#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
+    GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
 // Brings in definitions for functions used in the testing::internal::posix
 // namespace (read, write, close, chdir, isatty, stat). We do not currently
 // use them on Windows Mobile.
 #if GTEST_OS_WINDOWS
-#if !GTEST_OS_WINDOWS_MOBILE
-#include <direct.h>
-#include <io.h>
-#endif
+# if !GTEST_OS_WINDOWS_MOBILE
+#  include <direct.h>
+#  include <io.h>
+# endif
 // In order to avoid having to include <windows.h>, use forward declaration
 #if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
 // MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
@@ -343,28 +359,32 @@ typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
 typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #endif
+#elif GTEST_OS_XTENSA
+#include <unistd.h>
+// Xtensa toolchains define strcasecmp in the string.h header instead of
+// strings.h. string.h is already included.
 #else
 // This assumes that non-Windows OSes provide unistd.h. For OSes where this
 // is not the case, we need to include headers that provide the functions
 // mentioned above.
-#include <unistd.h>
-#include <strings.h>
+# include <unistd.h>
+# include <strings.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_LINUX_ANDROID
 // Used to define __ANDROID_API__ matching the target NDK API level.
-#include <android/api-level.h>  // NOLINT
+#  include <android/api-level.h>  // NOLINT
 #endif
 
 // Defines this to true if and only if Google Test can use POSIX regular
 // expressions.
 #ifndef GTEST_HAS_POSIX_RE
-#if GTEST_OS_LINUX_ANDROID
+# if GTEST_OS_LINUX_ANDROID
 // On Android, <regex.h> is only available starting with Gingerbread.
-#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-#else
-#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
-#endif
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
+# endif
 #endif
 
 #if GTEST_USES_PCRE
@@ -376,39 +396,39 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // won't compile otherwise.  We can #include it here as we already
 // included <stdlib.h>, which is guaranteed to define size_t through
 // <stddef.h>.
-#include <regex.h>  // NOLINT
+# include <regex.h>  // NOLINT
 
-#define GTEST_USES_POSIX_RE 1
+# define GTEST_USES_POSIX_RE 1
 
 #elif GTEST_OS_WINDOWS
 
 // <regex.h> is not available on Windows.  Use our own simple regex
 // implementation instead.
-#define GTEST_USES_SIMPLE_RE 1
+# define GTEST_USES_SIMPLE_RE 1
 
 #else
 
 // <regex.h> may not be available on this platform.  Use our own
 // simple regex implementation instead.
-#define GTEST_USES_SIMPLE_RE 1
+# define GTEST_USES_SIMPLE_RE 1
 
 #endif  // GTEST_USES_PCRE
 
 #ifndef GTEST_HAS_EXCEPTIONS
 // The user didn't tell us whether exceptions are enabled, so we need
 // to figure it out.
-#if defined(_MSC_VER) && defined(_CPPUNWIND)
+# if defined(_MSC_VER) && defined(_CPPUNWIND)
 // MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
-#define GTEST_HAS_EXCEPTIONS 1
-#elif defined(__BORLANDC__)
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__BORLANDC__)
 // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
 // macro to enable exceptions, so we'll do the same.
 // Assumes that exceptions are enabled by default.
-#ifndef _HAS_EXCEPTIONS
-#define _HAS_EXCEPTIONS 1
-#endif  // _HAS_EXCEPTIONS
-#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-#elif defined(__clang__)
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__clang__)
 // clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
 // 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
 // there can be cleanups for ObjC exceptions which also need cleanups, even if
@@ -417,27 +437,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // cleanups prior to that. To reliably check for C++ exception availability with
 // clang, check for
 // __EXCEPTIONS && __has_feature(cxx_exceptions).
-#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-#elif defined(__GNUC__) && __EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+# elif defined(__GNUC__) && __EXCEPTIONS
 // gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-#define GTEST_HAS_EXCEPTIONS 1
-#elif defined(__SUNPRO_CC)
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
 // Sun Pro CC supports exceptions.  However, there is no compile-time way of
 // detecting whether they are enabled or not.  Therefore, we assume that
 // they are enabled unless the user tells us otherwise.
-#define GTEST_HAS_EXCEPTIONS 1
-#elif defined(__IBMCPP__) && __EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
 // xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-#define GTEST_HAS_EXCEPTIONS 1
-#elif defined(__HP_aCC)
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
 // Exception handling is in effect by default in HP aCC compiler. It has to
 // be turned of by +noeh compiler option if desired.
-#define GTEST_HAS_EXCEPTIONS 1
-#else
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
 // For other compilers, we assume exceptions are disabled to be
 // conservative.
-#define GTEST_HAS_EXCEPTIONS 0
-#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
 #endif  // GTEST_HAS_EXCEPTIONS
 
 #ifndef GTEST_HAS_STD_WSTRING
@@ -448,7 +468,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // no support for it at least as recent as Froyo (2.2).
 #define GTEST_HAS_STD_WSTRING                                         \
   (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266))
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA))
 
 #endif  // GTEST_HAS_STD_WSTRING
 
@@ -457,62 +477,63 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // The user didn't tell us whether RTTI is enabled, so we need to
 // figure it out.
 
-#ifdef _MSC_VER
+# ifdef _MSC_VER
 
 #ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
-#define GTEST_HAS_RTTI 1
-#else
-#define GTEST_HAS_RTTI 0
-#endif
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
 
 // Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
 // enabled.
-#elif defined(__GNUC__)
+# elif defined(__GNUC__)
 
-#ifdef __GXX_RTTI
+#  ifdef __GXX_RTTI
 // When building against STLport with the Android NDK and with
 // -frtti -fno-exceptions, the build fails at link time with undefined
 // references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
 // so disable RTTI when detected.
-#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
-#define GTEST_HAS_RTTI 0
-#else
-#define GTEST_HAS_RTTI 1
-#endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-#else
-#define GTEST_HAS_RTTI 0
-#endif  // __GXX_RTTI
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
 
 // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
 // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
 // first version with C++ support.
-#elif defined(__clang__)
+# elif defined(__clang__)
 
-#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
 
 // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
 // both the typeid and dynamic_cast features are present.
-#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
 
-#ifdef __RTTI_ALL__
-#define GTEST_HAS_RTTI 1
-#else
-#define GTEST_HAS_RTTI 0
-#endif
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
 
-#else
+# else
 
 // For all other compilers, we assume RTTI is enabled.
-#define GTEST_HAS_RTTI 1
+#  define GTEST_HAS_RTTI 1
 
-#endif  // _MSC_VER
+# endif  // _MSC_VER
 
 #endif  // GTEST_HAS_RTTI
 
 // It's this header's responsibility to #include <typeinfo> when RTTI
 // is enabled.
 #if GTEST_HAS_RTTI
-#include <typeinfo>
+# include <typeinfo>
 #endif
 
 // Determines whether Google Test can use the pthreads library.
@@ -532,10 +553,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #if GTEST_HAS_PTHREAD
 // gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
 // true.
-#include <pthread.h>  // NOLINT
+# include <pthread.h>  // NOLINT
 
 // For timespec and nanosleep, used below.
-#include <time.h>  // NOLINT
+# include <time.h>  // NOLINT
 #endif
 
 // Determines whether clone(2) is supported.
@@ -545,23 +566,24 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_HAS_CLONE
 // The user didn't tell us, so we need to figure it out.
 
-#if GTEST_OS_LINUX && !defined(__ia64__)
-#if GTEST_OS_LINUX_ANDROID
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
 // On Android, clone() became available at different API levels for each 32-bit
 // architecture.
-#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
-    (defined(__mips__) && __ANDROID_API__ >= 12) ||                    \
-    (defined(__i386__) && __ANDROID_API__ >= 17)
-#define GTEST_HAS_CLONE 1
-#else
-#define GTEST_HAS_CLONE 0
-#endif
-#else
-#define GTEST_HAS_CLONE 1
-#endif
-#else
-#define GTEST_HAS_CLONE 0
-#endif  // GTEST_OS_LINUX && !defined(__ia64__)
+#    if defined(__LP64__) || \
+        (defined(__arm__) && __ANDROID_API__ >= 9) || \
+        (defined(__mips__) && __ANDROID_API__ >= 12) || \
+        (defined(__i386__) && __ANDROID_API__ >= 17)
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
 
 #endif  // GTEST_HAS_CLONE
 
@@ -571,11 +593,11 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // By default, we assume that stream redirection is supported on all
 // platforms except known mobile ones.
 #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
-#define GTEST_HAS_STREAM_REDIRECTION 0
-#else
-#define GTEST_HAS_STREAM_REDIRECTION 1
-#endif  // !GTEST_OS_WINDOWS_MOBILE
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
 // Determines whether to support death tests.
@@ -586,7 +608,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
      GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
      GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA ||           \
      GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-#define GTEST_HAS_DEATH_TEST 1
+# define GTEST_HAS_DEATH_TEST 1
 #endif
 
 // Determines whether to support type-driven tests.
@@ -595,8 +617,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // Sun Pro CC, IBM Visual Age, and HP aCC support.
 #if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
     defined(__IBMCPP__) || defined(__HP_aCC)
-#define GTEST_HAS_TYPED_TEST 1
-#define GTEST_HAS_TYPED_TEST_P 1
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
 #endif
 
 // Determines whether the system compiler uses UTF-16 for encoding wide strings.
@@ -606,7 +628,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // Determines whether test results can be streamed to a socket.
 #if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
     GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-#define GTEST_CAN_STREAM_RESULTS_ 1
+# define GTEST_CAN_STREAM_RESULTS_ 1
 #endif
 
 // Defines some utility macros.
@@ -620,12 +642,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 //
 // The "switch (0) case 0:" idiom is used to suppress this.
 #ifdef __INTEL_COMPILER
-#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
 #else
-#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  switch (0)                          \
-  case 0:                             \
-  default:  // NOLINT
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
 #endif
 
 // Use this annotation at the end of a struct/class definition to
@@ -640,53 +659,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // Also use it after a variable or parameter declaration to tell the
 // compiler the variable/parameter does not have to be used.
 #if defined(__GNUC__) && !defined(COMPILER_ICC)
-#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
 #elif defined(__clang__)
-#if __has_attribute(unused)
-#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
-#endif
+# if __has_attribute(unused)
+#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+# endif
 #endif
 #ifndef GTEST_ATTRIBUTE_UNUSED_
-#define GTEST_ATTRIBUTE_UNUSED_
+# define GTEST_ATTRIBUTE_UNUSED_
 #endif
 
 // Use this annotation before a function that takes a printf format string.
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-#if defined(__MINGW_PRINTF_FORMAT)
+# if defined(__MINGW_PRINTF_FORMAT)
 // MinGW has two different printf implementations. Ensure the format macro
 // matches the selected implementation. See
 // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-  __attribute__(                                              \
-      (__format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
-#else
-#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-  __attribute__((__format__(__printf__, string_index, first_to_check)))
-#endif
+#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+       __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
+                                 first_to_check)))
+# else
+#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+       __attribute__((__format__(__printf__, string_index, first_to_check)))
+# endif
 #else
-#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
 #endif
 
+
 // A macro to disallow copy operator=
 // This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) type &operator=(type const &) = delete
+#define GTEST_DISALLOW_ASSIGN_(type) \
+  type& operator=(type const &) = delete
 
 // A macro to disallow copy constructor and operator=
 // This should be used in the private: declarations for a class.
 #define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
-  type(type const &) = delete;                \
-  GTEST_DISALLOW_ASSIGN_(type)
+  type(type const&) = delete;                 \
+  type& operator=(type const&) = delete
 
 // A macro to disallow move operator=
 // This should be used in the private: declarations for a class.
 #define GTEST_DISALLOW_MOVE_ASSIGN_(type) \
-  type &operator=(type &&) noexcept = delete
+  type& operator=(type &&) noexcept = delete
 
 // A macro to disallow move constructor and operator=
 // This should be used in the private: declarations for a class.
 #define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
-  type(type &&) noexcept = delete;            \
-  GTEST_DISALLOW_MOVE_ASSIGN_(type)
+  type(type&&) noexcept = delete;             \
+  type& operator=(type&&) noexcept = delete
 
 // Tell the compiler to warn about unused return values for functions declared
 // with this macro.  The macro should be used on function declarations
@@ -694,9 +715,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 //
 //   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
 #if defined(__GNUC__) && !defined(COMPILER_ICC)
-#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
 #else
-#define GTEST_MUST_USE_RESULT_
+# define GTEST_MUST_USE_RESULT_
 #endif  // __GNUC__ && !COMPILER_ICC
 
 // MS C++ compiler emits warning when a conditional expression is compile time
@@ -707,9 +728,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // while (true) {
 // GTEST_INTENTIONAL_CONST_COND_POP_()
 // }
-#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+# define GTEST_INTENTIONAL_CONST_COND_POP_() \
+    GTEST_DISABLE_MSC_WARNINGS_POP_()
 
 // Determine whether the compiler supports Microsoft's Structured Exception
 // Handling.  This is supported by several Windows compilers but generally
@@ -717,13 +739,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_HAS_SEH
 // The user didn't tell us, so we need to figure it out.
 
-#if defined(_MSC_VER) || defined(__BORLANDC__)
+# if defined(_MSC_VER) || defined(__BORLANDC__)
 // These two compilers are known to support SEH.
-#define GTEST_HAS_SEH 1
-#else
+#  define GTEST_HAS_SEH 1
+# else
 // Assume no SEH.
-#define GTEST_HAS_SEH 0
-#endif
+#  define GTEST_HAS_SEH 0
+# endif
 
 #endif  // GTEST_HAS_SEH
 
@@ -742,86 +764,88 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_API_
 
 #ifdef _MSC_VER
-#if GTEST_LINKED_AS_SHARED_LIBRARY
-#define GTEST_API_ __declspec(dllimport)
-#elif GTEST_CREATE_SHARED_LIBRARY
-#define GTEST_API_ __declspec(dllexport)
-#endif
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
 #elif __GNUC__ >= 4 || defined(__clang__)
-#define GTEST_API_ __attribute__((visibility("default")))
+# define GTEST_API_ __attribute__((visibility ("default")))
 #endif  // _MSC_VER
 
 #endif  // GTEST_API_
 
 #ifndef GTEST_API_
-#define GTEST_API_
+# define GTEST_API_
 #endif  // GTEST_API_
 
 #ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+# define GTEST_DEFAULT_DEATH_TEST_STYLE  "fast"
 #endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
 
 #ifdef __GNUC__
 // Ask the compiler to never inline a given function.
-#define GTEST_NO_INLINE_ __attribute__((noinline))
+# define GTEST_NO_INLINE_ __attribute__((noinline))
 #else
-#define GTEST_NO_INLINE_
+# define GTEST_NO_INLINE_
 #endif
 
 // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
 #if !defined(GTEST_HAS_CXXABI_H_)
-#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-#define GTEST_HAS_CXXABI_H_ 1
-#else
-#define GTEST_HAS_CXXABI_H_ 0
-#endif
+# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#  define GTEST_HAS_CXXABI_H_ 1
+# else
+#  define GTEST_HAS_CXXABI_H_ 0
+# endif
 #endif
 
 // A function level attribute to disable checking for use of uninitialized
 // memory when built with MemorySanitizer.
 #if defined(__clang__)
-#if __has_feature(memory_sanitizer)
-#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
-#else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-#endif  // __has_feature(memory_sanitizer)
+# if __has_feature(memory_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
+       __attribute__((no_sanitize_memory))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+# endif  // __has_feature(memory_sanitizer)
 #else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
 #endif  // __clang__
 
 // A function level attribute to disable AddressSanitizer instrumentation.
 #if defined(__clang__)
-#if __has_feature(address_sanitizer)
-#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
-  __attribute__((no_sanitize_address))
+# if __has_feature(address_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+       __attribute__((no_sanitize_address))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+# endif  // __has_feature(address_sanitizer)
 #else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-#endif  // __has_feature(address_sanitizer)
-#else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 #endif  // __clang__
 
 // A function level attribute to disable HWAddressSanitizer instrumentation.
 #if defined(__clang__)
-#if __has_feature(hwaddress_sanitizer)
-#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
-  __attribute__((no_sanitize("hwaddress")))
-#else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-#endif  // __has_feature(hwaddress_sanitizer)
+# if __has_feature(hwaddress_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+       __attribute__((no_sanitize("hwaddress")))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+# endif  // __has_feature(hwaddress_sanitizer)
 #else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 #endif  // __clang__
 
 // A function level attribute to disable ThreadSanitizer instrumentation.
 #if defined(__clang__)
-#if __has_feature(thread_sanitizer)
-#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+# if __has_feature(thread_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
+       __attribute__((no_sanitize_thread))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+# endif  // __has_feature(thread_sanitizer)
 #else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-#endif  // __has_feature(thread_sanitizer)
-#else
-#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
 #endif  // __clang__
 
 namespace testing {
@@ -870,60 +894,58 @@ class GTEST_API_ RE {
  public:
   // A copy constructor is required by the Standard to initialize object
   // references from r-values.
-  RE(const RE &other) { Init(other.pattern()); }
+  RE(const RE& other) { Init(other.pattern()); }
 
   // Constructs an RE from a string.
-  RE(const ::std::string &regex) { Init(regex.c_str()); }  // NOLINT
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
 
-  RE(const char *regex) { Init(regex); }  // NOLINT
+  RE(const char* regex) { Init(regex); }  // NOLINT
   ~RE();
 
   // Returns the string representation of the regex.
-  const char *pattern() const { return pattern_; }
+  const char* pattern() const { return pattern_; }
 
   // FullMatch(str, re) returns true if and only if regular expression re
   // matches the entire str.
   // PartialMatch(str, re) returns true if and only if regular expression re
   // matches a substring of str (including str itself).
-  static bool FullMatch(const ::std::string &str, const RE &re) {
+  static bool FullMatch(const ::std::string& str, const RE& re) {
     return FullMatch(str.c_str(), re);
   }
-  static bool PartialMatch(const ::std::string &str, const RE &re) {
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
     return PartialMatch(str.c_str(), re);
   }
 
-  static bool FullMatch(const char *str, const RE &re);
-  static bool PartialMatch(const char *str, const RE &re);
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
 
  private:
-  void Init(const char *regex);
-  const char *pattern_;
+  void Init(const char* regex);
+  const char* pattern_;
   bool is_valid_;
 
-#if GTEST_USES_POSIX_RE
+# if GTEST_USES_POSIX_RE
 
   regex_t full_regex_;     // For FullMatch().
   regex_t partial_regex_;  // For PartialMatch().
 
-#else  // GTEST_USES_SIMPLE_RE
-
-  const char *full_pattern_;  // For FullMatch();
+# else  // GTEST_USES_SIMPLE_RE
 
-#endif
+  const char* full_pattern_;  // For FullMatch();
 
-  GTEST_DISALLOW_ASSIGN_(RE);
+# endif
 };
 
 #endif  // GTEST_USES_PCRE
 
 // Formats a source file path and a line number as they would appear
 // in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char *file, int line);
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
 
 // Formats a file location for compiler-independent XML output.
 // Although this function is not platform dependent, we put it next to
 // FormatFileLocation in order to contrast the two functions.
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
                                                                int line);
 
 // Defines logging utilities:
@@ -932,19 +954,24 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
 //   LogToStderr()  - directs all log messages to stderr.
 //   FlushInfoLog() - flushes informational log messages.
 
-enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
 
 // Formats log entry severity, provides a stream object for streaming the
 // log message, and terminates the message with a newline when going out of
 // scope.
 class GTEST_API_ GTestLog {
  public:
-  GTestLog(GTestLogSeverity severity, const char *file, int line);
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
 
   // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
   ~GTestLog();
 
-  ::std::ostream &GetStream() { return ::std::cerr; }
+  ::std::ostream& GetStream() { return ::std::cerr; }
 
  private:
   const GTestLogSeverity severity_;
@@ -954,10 +981,9 @@ class GTEST_API_ GTestLog {
 
 #if !defined(GTEST_LOG_)
 
-#define GTEST_LOG_(severity)                                           \
-  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
-                                __FILE__, __LINE__)                    \
-      .GetStream()
+# define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
 
 inline void LogToStderr() {}
 inline void FlushInfoLog() { fflush(nullptr); }
@@ -979,12 +1005,12 @@ inline void FlushInfoLog() { fflush(nullptr); }
 //    condition itself, plus additional message streamed into it, if any,
 //    and then it aborts the program. It aborts the program irrespective of
 //    whether it is built in the debug mode or not.
-#define GTEST_CHECK_(condition)               \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
-  if (::testing::internal::IsTrue(condition)) \
-    ;                                         \
-  else                                        \
-    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+# define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
 #endif  // !defined(GTEST_CHECK_)
 
 // An all-mode assert to verify that the given POSIX-style function
@@ -993,8 +1019,9 @@ inline void FlushInfoLog() { fflush(nullptr); }
 // in {} if you need to use it as the only statement in an 'if'
 // branch.
 #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
-  if (const int gtest_error = (posix_call))    \
-  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
 
 // Transforms "T" into "const T&" according to standard reference collapsing
 // rules (this is only needed as a backport for C++98 compilers that do not
@@ -1008,13 +1035,9 @@ inline void FlushInfoLog() { fflush(nullptr); }
 // Note that the non-const reference will not have "const" added. This is
 // standard, and necessary so that "T" can always bind to "const T&".
 template <typename T>
-struct ConstRef {
-  typedef const T &type;
-};
+struct ConstRef { typedef const T& type; };
 template <typename T>
-struct ConstRef<T &> {
-  typedef T &type;
-};
+struct ConstRef<T&> { typedef T& type; };
 
 // The argument T must depend on some template parameters.
 #define GTEST_REFERENCE_TO_CONST_(T) \
@@ -1040,10 +1063,8 @@ struct ConstRef<T &> {
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., implicit_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template <typename To>
-inline To ImplicitCast_(To x) {
-  return x;
-}
+template<typename To>
+inline To ImplicitCast_(To x) { return x; }
 
 // When you upcast (that is, cast a pointer from type Foo to type
 // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1066,17 +1087,17 @@ inline To ImplicitCast_(To x) {
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., down_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template <typename To, typename From>  // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From *f) {         // so we only accept pointers
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
   // Ensures that To is a sub-type of From *.  This test is here only
   // for compile-time type checking, and has no overhead in an
   // optimized build at run-time, as it will be optimized away
   // completely.
   GTEST_INTENTIONAL_CONST_COND_PUSH_()
   if (false) {
-    GTEST_INTENTIONAL_CONST_COND_POP_()
-    const To to = nullptr;
-    ::testing::internal::ImplicitCast_<From *>(to);
+  GTEST_INTENTIONAL_CONST_COND_POP_()
+  const To to = nullptr;
+  ::testing::internal::ImplicitCast_<From*>(to);
   }
 
 #if GTEST_HAS_RTTI
@@ -1092,17 +1113,17 @@ inline To DownCast_(From *f) {         // so we only accept pointers
 // When RTTI is available, the function performs a runtime
 // check to enforce this.
 template <class Derived, class Base>
-Derived *CheckedDowncastToActualType(Base *base) {
+Derived* CheckedDowncastToActualType(Base* base) {
 #if GTEST_HAS_RTTI
   GTEST_CHECK_(typeid(*base) == typeid(Derived));
 #endif
 
 #if GTEST_HAS_DOWNCAST_
-  return ::down_cast<Derived *>(base);
+  return ::down_cast<Derived*>(base);
 #elif GTEST_HAS_RTTI
-  return dynamic_cast<Derived *>(base);  // NOLINT
+  return dynamic_cast<Derived*>(base);  // NOLINT
 #else
-  return static_cast<Derived *>(base);  // Poor man's downcast.
+  return static_cast<Derived*>(base);  // Poor man's downcast.
 #endif
 }
 
@@ -1121,10 +1142,10 @@ GTEST_API_ std::string GetCapturedStderr();
 
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 // Returns the size (in bytes) of a file.
-GTEST_API_ size_t GetFileSize(FILE *file);
+GTEST_API_ size_t GetFileSize(FILE* file);
 
 // Reads the entire content of a file as a string.
-GTEST_API_ std::string ReadEntireFile(FILE *file);
+GTEST_API_ std::string ReadEntireFile(FILE* file);
 
 // All command line arguments.
 GTEST_API_ std::vector<std::string> GetArgvs();
@@ -1133,15 +1154,15 @@ GTEST_API_ std::vector<std::string> GetArgvs();
 
 std::vector<std::string> GetInjectableArgvs();
 // Deprecated: pass the args vector by value instead.
-void SetInjectableArgvs(const std::vector<std::string> *new_argvs);
-void SetInjectableArgvs(const std::vector<std::string> &new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
 void ClearInjectableArgvs();
 
 #endif  // GTEST_HAS_DEATH_TEST
 
 // Defines synchronization primitives.
 #if GTEST_IS_THREADSAFE
-#if GTEST_HAS_PTHREAD
+# if GTEST_HAS_PTHREAD
 // Sleeps for (roughly) n milliseconds.  This function is only for testing
 // Google Test's own constructs.  Don't use it in user tests, either
 // directly or indirectly.
@@ -1152,13 +1173,13 @@ inline void SleepMilliseconds(int n) {
   };
   nanosleep(&time, nullptr);
 }
-#endif  // GTEST_HAS_PTHREAD
+# endif  // GTEST_HAS_PTHREAD
 
-#if GTEST_HAS_NOTIFICATION_
+# if GTEST_HAS_NOTIFICATION_
 // Notification has already been imported into the namespace.
 // Nothing to do here.
 
-#elif GTEST_HAS_PTHREAD
+# elif GTEST_HAS_PTHREAD
 // Allows a controller thread to pause execution of newly created
 // threads until notified.  Instances of this class must be created
 // and destroyed in the controller thread.
@@ -1170,7 +1191,9 @@ class Notification {
   Notification() : notified_(false) {
     GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
   }
-  ~Notification() { pthread_mutex_destroy(&mutex_); }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
 
   // Notifies all threads created with this notification to start. Must
   // be called from the controller thread.
@@ -1187,7 +1210,8 @@ class Notification {
       pthread_mutex_lock(&mutex_);
       const bool notified = notified_;
       pthread_mutex_unlock(&mutex_);
-      if (notified) break;
+      if (notified)
+        break;
       SleepMilliseconds(10);
     }
   }
@@ -1199,7 +1223,7 @@ class Notification {
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
 };
 
-#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
 
 GTEST_API_ void SleepMilliseconds(int n);
 
@@ -1212,7 +1236,7 @@ class GTEST_API_ AutoHandle {
   // undesirable because it defines a lot of symbols and macros that tend to
   // conflict with client code. This assumption is verified by
   // WindowsTypesTest.HANDLEIsVoidStar.
-  typedef void *Handle;
+  typedef void* Handle;
   AutoHandle();
   explicit AutoHandle(Handle handle);
 
@@ -1249,12 +1273,12 @@ class GTEST_API_ Notification {
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
 };
-#endif  // GTEST_HAS_NOTIFICATION_
+# endif  // GTEST_HAS_NOTIFICATION_
 
 // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
 // defined, but we don't want to use MinGW's pthreads implementation, which
 // has conformance problems with some versions of the POSIX standard.
-#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
 
 // As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
 // Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1273,8 +1297,8 @@ class ThreadWithParamBase {
 // example, SunStudio) treat them as different types.  Since class methods
 // cannot be defined with C-linkage we need to define a free C-function to
 // pass into pthread_create().
-extern "C" inline void *ThreadFuncWithCLinkage(void *thread) {
-  static_cast<ThreadWithParamBase *>(thread)->Run();
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();
   return nullptr;
 }
 
@@ -1295,10 +1319,12 @@ class ThreadWithParam : public ThreadWithParamBase {
  public:
   typedef void UserThreadFunc(T);
 
-  ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start)
-      : func_(func), param_(param), thread_can_start_(thread_can_start),
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
         finished_(false) {
-    ThreadWithParamBase *const base = this;
+    ThreadWithParamBase* const base = this;
     // The thread can be created only after all fields except thread_
     // have been initialized.
     GTEST_CHECK_POSIX_SUCCESS_(
@@ -1319,25 +1345,25 @@ class ThreadWithParam : public ThreadWithParamBase {
   }
 
  private:
-  UserThreadFunc *const func_;  // User-supplied thread function.
+  UserThreadFunc* const func_;  // User-supplied thread function.
   const T param_;  // User-supplied parameter to the thread function.
   // When non-NULL, used to block execution until the controller thread
   // notifies.
-  Notification *const thread_can_start_;
+  Notification* const thread_can_start_;
   bool finished_;  // true if and only if we know that the thread function has
                    // finished.
   pthread_t thread_;  // The native thread object.
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
 };
-#endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
-        // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
-#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 // Mutex and ThreadLocal have already been imported into the namespace.
 // Nothing to do here.
 
-#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
 
 // Mutex implements mutex on Windows platforms.  It is used in conjunction
 // with class MutexLock:
@@ -1389,16 +1415,16 @@ class GTEST_API_ Mutex {
   // by the linker.
   MutexType type_;
   long critical_section_init_phase_;  // NOLINT
-  GTEST_CRITICAL_SECTION *critical_section_;
+  GTEST_CRITICAL_SECTION* critical_section_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
 };
 
-#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-  extern ::testing::internal::Mutex mutex
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::Mutex mutex
 
-#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-  ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -1407,12 +1433,13 @@ class GTEST_API_ Mutex {
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(Mutex *mutex) : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(Mutex* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
-  Mutex *const mutex_;
+  Mutex* const mutex_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
 };
@@ -1434,7 +1461,7 @@ class ThreadLocalBase {
   // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
   // responsibility not to call this when the ThreadLocal<T> instance already
   // has a value on the current thread.
-  virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const = 0;
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
 
  protected:
   ThreadLocalBase() {}
@@ -1451,12 +1478,12 @@ class GTEST_API_ ThreadLocalRegistry {
  public:
   // Registers thread_local_instance as having value on the current thread.
   // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
-      const ThreadLocalBase *thread_local_instance);
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance);
 
   // Invoked when a ThreadLocal instance is destroyed.
   static void OnThreadLocalDestroyed(
-      const ThreadLocalBase *thread_local_instance);
+      const ThreadLocalBase* thread_local_instance);
 };
 
 class GTEST_API_ ThreadWithParamBase {
@@ -1470,7 +1497,7 @@ class GTEST_API_ ThreadWithParamBase {
     virtual void Run() = 0;
   };
 
-  ThreadWithParamBase(Runnable *runnable, Notification *thread_can_start);
+  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
   virtual ~ThreadWithParamBase();
 
  private:
@@ -1483,19 +1510,25 @@ class ThreadWithParam : public ThreadWithParamBase {
  public:
   typedef void UserThreadFunc(T);
 
-  ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start)
-      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
+  }
   virtual ~ThreadWithParam() {}
 
  private:
   class RunnableImpl : public Runnable {
    public:
-    RunnableImpl(UserThreadFunc *func, T param) : func_(func), param_(param) {}
+    RunnableImpl(UserThreadFunc* func, T param)
+        : func_(func),
+          param_(param) {
+    }
     virtual ~RunnableImpl() {}
-    virtual void Run() { func_(param_); }
+    virtual void Run() {
+      func_(param_);
+    }
 
    private:
-    UserThreadFunc *const func_;
+    UserThreadFunc* const func_;
     const T param_;
 
     GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
@@ -1535,15 +1568,15 @@ template <typename T>
 class ThreadLocal : public ThreadLocalBase {
  public:
   ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T &value)
+  explicit ThreadLocal(const T& value)
       : default_factory_(new InstanceValueHolderFactory(value)) {}
 
   ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
 
-  T *pointer() { return GetOrCreateValue(); }
-  const T *pointer() const { return GetOrCreateValue(); }
-  const T &get() const { return *pointer(); }
-  void set(const T &value) { *pointer() = value; }
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
 
  private:
   // Holds a value of T.  Can be deleted via its base class without the caller
@@ -1551,22 +1584,22 @@ class ThreadLocal : public ThreadLocalBase {
   class ValueHolder : public ThreadLocalValueHolderBase {
    public:
     ValueHolder() : value_() {}
-    explicit ValueHolder(const T &value) : value_(value) {}
+    explicit ValueHolder(const T& value) : value_(value) {}
 
-    T *pointer() { return &value_; }
+    T* pointer() { return &value_; }
 
    private:
     T value_;
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
   };
 
-  T *GetOrCreateValue() const {
-    return static_cast<ValueHolder *>(
-               ThreadLocalRegistry::GetValueOnCurrentThread(this))
-        ->pointer();
+
+  T* GetOrCreateValue() const {
+    return static_cast<ValueHolder*>(
+        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
   }
 
-  virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const {
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
     return default_factory_->MakeNewHolder();
   }
 
@@ -1574,7 +1607,7 @@ class ThreadLocal : public ThreadLocalBase {
    public:
     ValueHolderFactory() {}
     virtual ~ValueHolderFactory() {}
-    virtual ValueHolder *MakeNewHolder() const = 0;
+    virtual ValueHolder* MakeNewHolder() const = 0;
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
@@ -1583,7 +1616,7 @@ class ThreadLocal : public ThreadLocalBase {
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    ValueHolder *MakeNewHolder() const override { return new ValueHolder(); }
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
@@ -1591,8 +1624,8 @@ class ThreadLocal : public ThreadLocalBase {
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
-    explicit InstanceValueHolderFactory(const T &value) : value_(value) {}
-    ValueHolder *MakeNewHolder() const override {
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    ValueHolder* MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
@@ -1607,7 +1640,7 @@ class ThreadLocal : public ThreadLocalBase {
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
 };
 
-#elif GTEST_HAS_PTHREAD
+# elif GTEST_HAS_PTHREAD
 
 // MutexBase and Mutex implement mutex on pthreads-based platforms.
 class MutexBase {
@@ -1654,8 +1687,8 @@ class MutexBase {
 };
 
 // Forward-declares a static mutex.
-#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-  extern ::testing::internal::MutexBase mutex
+#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+     extern ::testing::internal::MutexBase mutex
 
 // Defines and statically (i.e. at link time) initializes a static mutex.
 // The initialization list here does not explicitly initialize each field,
@@ -1664,7 +1697,7 @@ class MutexBase {
 // This allows initialization to work whether pthread_t is a scalar or struct.
 // The flag -Wmissing-field-initializers must not be specified for this to work.
 #define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-  ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, 0 }
+  ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0}
 
 // The Mutex class can only be used for mutexes created at runtime. It
 // shares its API with MutexBase otherwise.
@@ -1674,7 +1707,9 @@ class Mutex : public MutexBase {
     GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
     has_owner_ = false;
   }
-  ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
 
  private:
   GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
@@ -1687,12 +1722,13 @@ class Mutex : public MutexBase {
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(MutexBase *mutex) : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
-  MutexBase *const mutex_;
+  MutexBase* const mutex_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
 };
@@ -1712,8 +1748,8 @@ class ThreadLocalValueHolderBase {
 
 // Called by pthread to delete thread-local data stored by
 // pthread_setspecific().
-extern "C" inline void DeleteThreadLocalValue(void *value_holder) {
-  delete static_cast<ThreadLocalValueHolderBase *>(value_holder);
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
 }
 
 // Implements thread-local storage on pthreads-based systems.
@@ -1722,7 +1758,7 @@ class GTEST_API_ ThreadLocal {
  public:
   ThreadLocal()
       : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T &value)
+  explicit ThreadLocal(const T& value)
       : key_(CreateKey()),
         default_factory_(new InstanceValueHolderFactory(value)) {}
 
@@ -1735,19 +1771,19 @@ class GTEST_API_ ThreadLocal {
     GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
   }
 
-  T *pointer() { return GetOrCreateValue(); }
-  const T *pointer() const { return GetOrCreateValue(); }
-  const T &get() const { return *pointer(); }
-  void set(const T &value) { *pointer() = value; }
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
 
  private:
   // Holds a value of type T.
   class ValueHolder : public ThreadLocalValueHolderBase {
    public:
     ValueHolder() : value_() {}
-    explicit ValueHolder(const T &value) : value_(value) {}
+    explicit ValueHolder(const T& value) : value_(value) {}
 
-    T *pointer() { return &value_; }
+    T* pointer() { return &value_; }
 
    private:
     T value_;
@@ -1763,15 +1799,15 @@ class GTEST_API_ ThreadLocal {
     return key;
   }
 
-  T *GetOrCreateValue() const {
-    ThreadLocalValueHolderBase *const holder =
-        static_cast<ThreadLocalValueHolderBase *>(pthread_getspecific(key_));
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
     if (holder != nullptr) {
       return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
     }
 
-    ValueHolder *const new_holder = default_factory_->MakeNewHolder();
-    ThreadLocalValueHolderBase *const holder_base = new_holder;
+    ValueHolder* const new_holder = default_factory_->MakeNewHolder();
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
     GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
     return new_holder->pointer();
   }
@@ -1780,7 +1816,7 @@ class GTEST_API_ ThreadLocal {
    public:
     ValueHolderFactory() {}
     virtual ~ValueHolderFactory() {}
-    virtual ValueHolder *MakeNewHolder() const = 0;
+    virtual ValueHolder* MakeNewHolder() const = 0;
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
@@ -1789,7 +1825,7 @@ class GTEST_API_ ThreadLocal {
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    ValueHolder *MakeNewHolder() const override { return new ValueHolder(); }
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
     GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
@@ -1797,8 +1833,8 @@ class GTEST_API_ ThreadLocal {
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
-    explicit InstanceValueHolderFactory(const T &value) : value_(value) {}
-    ValueHolder *MakeNewHolder() const override {
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    ValueHolder* MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
@@ -1815,7 +1851,7 @@ class GTEST_API_ ThreadLocal {
   GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
 };
 
-#endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+# endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
 #else  // GTEST_IS_THREADSAFE
 
@@ -1832,10 +1868,10 @@ class Mutex {
   void AssertHeld() const {}
 };
 
-#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
   extern ::testing::internal::Mutex mutex
 
-#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -1844,7 +1880,7 @@ class Mutex {
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(Mutex *) {}  // NOLINT
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
 };
 
 typedef GTestMutexLock MutexLock;
@@ -1853,12 +1889,11 @@ template <typename T>
 class GTEST_API_ ThreadLocal {
  public:
   ThreadLocal() : value_() {}
-  explicit ThreadLocal(const T &value) : value_(value) {}
-  T *pointer() { return &value_; }
-  const T *pointer() const { return &value_; }
-  const T &get() const { return value_; }
-  void set(const T &value) { value_ = value; }
-
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
  private:
   T value_;
 };
@@ -1870,11 +1905,11 @@ class GTEST_API_ ThreadLocal {
 GTEST_API_ size_t GetThreadCount();
 
 #if GTEST_OS_WINDOWS
-#define GTEST_PATH_SEP_ "\\"
-#define GTEST_HAS_ALT_PATH_SEP_ 1
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
 #else
-#define GTEST_PATH_SEP_ "/"
-#define GTEST_HAS_ALT_PATH_SEP_ 0
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
 #endif  // GTEST_OS_WINDOWS
 
 // Utilities for char.
@@ -1905,6 +1940,19 @@ inline bool IsUpper(char ch) {
 inline bool IsXDigit(char ch) {
   return isxdigit(static_cast<unsigned char>(ch)) != 0;
 }
+#ifdef __cpp_char8_t
+inline bool IsXDigit(char8_t ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#endif
+inline bool IsXDigit(char16_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(char32_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
 inline bool IsXDigit(wchar_t ch) {
   const unsigned char low_byte = static_cast<unsigned char>(ch);
   return ch == low_byte && isxdigit(low_byte) != 0;
@@ -1919,7 +1967,8 @@ inline char ToUpper(char ch) {
 
 inline std::string StripTrailingSpaces(std::string str) {
   std::string::iterator it = str.end();
-  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
+  while (it != str.begin() && IsSpace(*--it))
+    it = str.erase(it);
   return str;
 }
 
@@ -1937,67 +1986,80 @@ namespace posix {
 
 typedef struct _stat StatStruct;
 
-#ifdef __BORLANDC__
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int StrCaseCmp(const char *s1, const char *s2) {
+# ifdef __BORLANDC__
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
   return stricmp(s1, s2);
 }
-inline char *StrDup(const char *src) { return strdup(src); }
-#else  // !__BORLANDC__
-#if GTEST_OS_WINDOWS_MOBILE
-inline int IsATTY(int /* fd */) { return 0; }
-#else
-inline int IsATTY(int fd) { return _isatty(fd); }
-#endif  // GTEST_OS_WINDOWS_MOBILE
-inline int StrCaseCmp(const char *s1, const char *s2) {
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int DoIsATTY(int /* fd */) { return 0; }
+#  else
+inline int DoIsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
   return _stricmp(s1, s2);
 }
-inline char *StrDup(const char *src) { return _strdup(src); }
-#endif  // __BORLANDC__
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
 
-#if GTEST_OS_WINDOWS_MOBILE
-inline int FileNo(FILE *file) { return reinterpret_cast<int>(_fileno(file)); }
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
 // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
 // time and thus not defined there.
-#else
-inline int FileNo(FILE *file) { return _fileno(file); }
-inline int Stat(const char *path, StatStruct *buf) { return _stat(path, buf); }
-inline int RmDir(const char *dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct &st) { return (_S_IFDIR & st.st_mode) != 0; }
-#endif  // GTEST_OS_WINDOWS_MOBILE
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
 
 #elif GTEST_OS_ESP8266
 typedef struct stat StatStruct;
 
-inline int FileNo(FILE *file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int Stat(const char *path, StatStruct *buf) {
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) {
   // stat function not implemented on ESP8266
   return 0;
 }
-inline int StrCaseCmp(const char *s1, const char *s2) {
+inline int StrCaseCmp(const char* s1, const char* s2) {
   return strcasecmp(s1, s2);
 }
-inline char *StrDup(const char *src) { return strdup(src); }
-inline int RmDir(const char *dir) { return rmdir(dir); }
-inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); }
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
 
 #else
 
 typedef struct stat StatStruct;
 
-inline int FileNo(FILE *file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int Stat(const char *path, StatStruct *buf) { return stat(path, buf); }
-inline int StrCaseCmp(const char *s1, const char *s2) {
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
   return strcasecmp(s1, s2);
 }
-inline char *StrDup(const char *src) { return strdup(src); }
-inline int RmDir(const char *dir) { return rmdir(dir); }
-inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); }
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
 
 #endif  // GTEST_OS_WINDOWS
 
+inline int IsATTY(int fd) {
+  // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
+  // to a file on Linux), which is unexpected, so save the previous value, and
+  // restore it after the call.
+  int savedErrno = errno;
+  int isAttyValue = DoIsATTY(fd);
+  errno = savedErrno;
+
+  return isAttyValue;
+}
+
 // Functions deprecated by MSVC 8.0.
 
 GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
@@ -2006,39 +2068,48 @@ GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
 // StrError() aren't needed on Windows CE at this time and thus not
 // defined there.
 
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-inline int ChDir(const char *dir) { return chdir(dir); }
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA
+inline int ChDir(const char* dir) { return chdir(dir); }
 #endif
-inline FILE *FOpen(const char *path, const char *mode) {
+inline FILE* FOpen(const char* path, const char* mode) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  struct wchar_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t> {};
+  std::wstring_convert<wchar_codecvt> converter;
+  std::wstring wide_path = converter.from_bytes(path);
+  std::wstring wide_mode = converter.from_bytes(mode);
+  return _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
   return fopen(path, mode);
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
 }
 #if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char *path, const char *mode, FILE *stream) {
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
   return freopen(path, mode, stream);
 }
-inline FILE *FDOpen(int fd, const char *mode) { return fdopen(fd, mode); }
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
 #endif
-inline int FClose(FILE *fp) { return fclose(fp); }
+inline int FClose(FILE* fp) { return fclose(fp); }
 #if !GTEST_OS_WINDOWS_MOBILE
-inline int Read(int fd, void *buf, unsigned int count) {
+inline int Read(int fd, void* buf, unsigned int count) {
   return static_cast<int>(read(fd, buf, count));
 }
-inline int Write(int fd, const void *buf, unsigned int count) {
+inline int Write(int fd, const void* buf, unsigned int count) {
   return static_cast<int>(write(fd, buf, count));
 }
 inline int Close(int fd) { return close(fd); }
-inline const char *StrError(int errnum) { return strerror(errnum); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
 #endif
-inline const char *GetEnv(const char *name) {
+inline const char* GetEnv(const char* name) {
 #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
   // We are on an embedded platform, which has no environment variables.
   static_cast<void>(name);  // To prevent 'unused argument' warning.
   return nullptr;
 #elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
   // Environment variables which we programmatically clear will be set to the
   // empty string rather than unset (NULL).  Handle that case.
-  const char *const env = getenv(name);
+  const char* const env = getenv(name);
   return (env != nullptr && env[0] != '\0') ? env : nullptr;
 #else
   return getenv(name);
@@ -2053,9 +2124,7 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
 // imitation of standard behaviour.
 [[noreturn]] void Abort();
 #else
-[[noreturn]] inline void Abort() {
-  abort();
-}
+[[noreturn]] inline void Abort() { abort(); }
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
 }  // namespace posix
@@ -2067,13 +2136,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
 // snprintf is a variadic function.
 #if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
 // MSVC 2005 and above support variadic macros.
-#define GTEST_SNPRINTF_(buffer, size, format, ...) \
-  _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
 #elif defined(_MSC_VER)
 // Windows CE does not define _snprintf_s
-#define GTEST_SNPRINTF_ _snprintf
+# define GTEST_SNPRINTF_ _snprintf
 #else
-#define GTEST_SNPRINTF_ snprintf
+# define GTEST_SNPRINTF_ snprintf
 #endif
 
 // The biggest signed integer type the compiler supports.
@@ -2133,50 +2202,51 @@ using TimeInMillis = int64_t;  // Represents time in milliseconds.
 
 // Macro for referencing flags.
 #if !defined(GTEST_FLAG)
-#define GTEST_FLAG(name) FLAGS_gtest_##name
+# define GTEST_FLAG(name) FLAGS_gtest_##name
 #endif  // !defined(GTEST_FLAG)
 
 #if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
 #endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
 
 #if !defined(GTEST_DECLARE_bool_)
-#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
 
 // Macros for declaring flags.
-#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-#define GTEST_DECLARE_int32_(name) \
-  GTEST_API_ extern std::int32_t GTEST_FLAG(name)
-#define GTEST_DECLARE_string_(name) \
-  GTEST_API_ extern ::std::string GTEST_FLAG(name)
+# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+# define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern std::int32_t GTEST_FLAG(name)
+# define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
 
 // Macros for defining flags.
-#define GTEST_DEFINE_bool_(name, default_val, doc) \
-  GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-#define GTEST_DEFINE_int32_(name, default_val, doc) \
-  GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
-#define GTEST_DEFINE_string_(name, default_val, doc) \
-  GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
 
 #endif  // !defined(GTEST_DECLARE_bool_)
 
 // Thread annotations
 #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-#define GTEST_LOCK_EXCLUDED_(locks)
+# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+# define GTEST_LOCK_EXCLUDED_(locks)
 #endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
 
 // Parses 'str' for a 32-bit signed integer.  If successful, writes the result
 // to *value and returns true; otherwise leaves *value unchanged and returns
 // false.
-bool ParseInt32(const Message &src_text, const char *str, int32_t *value);
+GTEST_API_ bool ParseInt32(const Message& src_text, const char* str,
+                           int32_t* value);
 
 // Parses a bool/int32_t/string from the environment variable
 // corresponding to the given Google Test flag.
-bool BoolFromGTestEnv(const char *flag, bool default_val);
-GTEST_API_ int32_t Int32FromGTestEnv(const char *flag, int32_t default_val);
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val);
 std::string OutputFlagAlsoCheckEnvVar();
-const char *StringFromGTestEnv(const char *flag, const char *default_val);
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
 
 }  // namespace internal
 }  // namespace testing
@@ -2202,9 +2272,67 @@ const char *StringFromGTestEnv(const char *flag, const char *default_val);
 #endif  // !defined(GTEST_INTERNAL_DEPRECATED)
 
 #if GTEST_HAS_ABSL
+// Always use absl::any for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include "absl/types/any.h"
+namespace testing {
+namespace internal {
+using Any = ::absl::any;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<any>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::any for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include <any>
+namespace testing {
+namespace internal {
+using Any = ::std::any;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::any is not
+// supported.
+#endif  // __has_include(<any>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::optional for UniversalPrinter<> specializations if
+// googletest is built with absl support.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include "absl/types/optional.h"
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::absl::optional<T>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::optional for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include <optional>
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::std::optional<T>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::optional is not
+// supported.
+#endif  // __has_include(<optional>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
 // Always use absl::string_view for Matcher<> specializations if googletest
 // is built with absl support.
-#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+# define GTEST_INTERNAL_HAS_STRING_VIEW 1
 #include "absl/strings/string_view.h"
 namespace testing {
 namespace internal {
@@ -2212,21 +2340,50 @@ using StringView = ::absl::string_view;
 }  // namespace internal
 }  // namespace testing
 #else
-#ifdef __has_include
-#if __has_include(<string_view>) && __cplusplus >= 201703L
+# ifdef __has_include
+#   if __has_include(<string_view>) && __cplusplus >= 201703L
 // Otherwise for C++17 and higher use std::string_view for Matcher<>
 // specializations.
-#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#   define GTEST_INTERNAL_HAS_STRING_VIEW 1
 #include <string_view>
 namespace testing {
 namespace internal {
 using StringView = ::std::string_view;
 }  // namespace internal
 }  // namespace testing
-   // The case where absl is configured NOT to alias std::string_view is not
-   // supported.
-#endif  // __has_include(<string_view>) && __cplusplus >= 201703L
+// The case where absl is configured NOT to alias std::string_view is not
+// supported.
+#  endif  // __has_include(<string_view>) && __cplusplus >= 201703L
+# endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::variant for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include "absl/types/variant.h"
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::absl::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<variant>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::variant for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include <variant>
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::std::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::variant is not supported.
+#endif  // __has_include(<variant>) && __cplusplus >= 201703L
 #endif  // __has_include
 #endif  // GTEST_HAS_ABSL
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
index f1f933097d..10f774f966 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
@@ -38,12 +38,12 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
 
 #ifdef __BORLANDC__
 // string.h is not guaranteed to provide strcpy on C++ Builder.
-#include <mem.h>
+# include <mem.h>
 #endif
 
 #include <string.h>
@@ -67,7 +67,7 @@ class GTEST_API_ String {
   //
   // This is different from strdup() in string.h, which allocates
   // memory using malloc().
-  static const char *CloneCString(const char *c_str);
+  static const char* CloneCString(const char* c_str);
 
 #if GTEST_OS_WINDOWS_MOBILE
   // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
@@ -82,7 +82,7 @@ class GTEST_API_ String {
   // The wide string is created using the ANSI codepage (CP_ACP) to
   // match the behaviour of the ANSI versions of Win32 calls and the
   // C runtime.
-  static LPCWSTR AnsiToUtf16(const char *c_str);
+  static LPCWSTR AnsiToUtf16(const char* c_str);
 
   // Creates an ANSI string from the given wide string, allocating
   // memory using new. The caller is responsible for deleting the return
@@ -92,7 +92,7 @@ class GTEST_API_ String {
   // The returned string is created using the ANSI codepage (CP_ACP) to
   // match the behaviour of the ANSI versions of Win32 calls and the
   // C runtime.
-  static const char *Utf16ToAnsi(LPCWSTR utf16_str);
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
 #endif
 
   // Compares two C strings.  Returns true if and only if they have the same
@@ -101,13 +101,13 @@ class GTEST_API_ String {
   // Unlike strcmp(), this function can handle NULL argument(s).  A
   // NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool CStringEquals(const char *lhs, const char *rhs);
+  static bool CStringEquals(const char* lhs, const char* rhs);
 
   // Converts a wide C string to a String using the UTF-8 encoding.
   // NULL will be converted to "(null)".  If an error occurred during
   // the conversion, "(failed to convert from wide string)" is
   // returned.
-  static std::string ShowWideCString(const wchar_t *wide_c_str);
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
 
   // Compares two wide C strings.  Returns true if and only if they have the
   // same content.
@@ -115,7 +115,7 @@ class GTEST_API_ String {
   // Unlike wcscmp(), this function can handle NULL argument(s).  A
   // NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs);
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
 
   // Compares two C strings, ignoring case.  Returns true if and only if
   // they have the same content.
@@ -123,7 +123,8 @@ class GTEST_API_ String {
   // Unlike strcasecmp(), this function can handle NULL argument(s).
   // A NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool CaseInsensitiveCStringEquals(const char *lhs, const char *rhs);
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
 
   // Compares two wide C strings, ignoring case.  Returns true if and only if
   // they have the same content.
@@ -137,17 +138,20 @@ class GTEST_API_ String {
   // which compares according to LC_CTYPE category of the current locale.
   // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
   // current locale.
-  static bool CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
-                                               const wchar_t *rhs);
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
 
   // Returns true if and only if the given string ends with the given suffix,
   // ignoring case. Any string is considered to end with an empty suffix.
-  static bool EndsWithCaseInsensitive(const std::string &str,
-                                      const std::string &suffix);
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
 
   // Formats an int value as "%02d".
   static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
 
+  // Formats an int value to given width with leading zeros.
+  static std::string FormatIntWidthN(int value, int width);
+
   // Formats an int value as "%X".
   static std::string FormatHexInt(int value);
 
@@ -159,13 +163,13 @@ class GTEST_API_ String {
 
  private:
   String();  // Not meant to be instantiated.
-};           // class String
+};  // class String
 
 // Gets the content of the stringstream's buffer as an std::string.  Each '\0'
 // character in the buffer is replaced with "\\0".
-GTEST_API_ std::string StringStreamToString(::std::stringstream *stream);
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
 
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
index 3b3a651dc0..b87a2e2cac 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
@@ -32,18 +32,18 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
 
 #include "gtest/internal/gtest-port.h"
 
 // #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
 // libstdc++ (which is where cxxabi.h comes from).
-#if GTEST_HAS_CXXABI_H_
-#include <cxxabi.h>
-#elif defined(__HP_aCC)
-#include <acxx_demangle.h>
-#endif  // GTEST_HASH_CXXABI_H_
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HASH_CXXABI_H_
 
 namespace testing {
 namespace internal {
@@ -64,14 +64,10 @@ inline std::string CanonicalizeForStdLibVersioning(std::string s) {
   return s;
 }
 
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
 #if GTEST_HAS_RTTI
-
-  const char *const name = typeid(T).name();
+// GetTypeName(const std::type_info&) returns a human-readable name of type T.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const name = type.name();
 #if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
   int status = 0;
   // gcc's implementation of typeid(T).name() mangles the type name,
@@ -79,29 +75,33 @@ std::string GetTypeName() {
 #if GTEST_HAS_CXXABI_H_
   using abi::__cxa_demangle;
 #endif  // GTEST_HAS_CXXABI_H_
-  char *const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
+  char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
   const std::string name_str(status == 0 ? readable_name : name);
   free(readable_name);
   return CanonicalizeForStdLibVersioning(name_str);
 #else
   return name;
 #endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+}
+#endif  // GTEST_HAS_RTTI
 
+// GetTypeName<T>() returns a human-readable name of type T if and only if
+// RTTI is enabled, otherwise it returns a dummy type name.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+  return GetTypeName(typeid(T));
 #else
-
   return "<type>";
-
 #endif  // GTEST_HAS_RTTI
 }
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 // A unique type indicating an empty node
 struct None {};
 
-#define GTEST_TEMPLATE_ \
-  template <typename T> \
-  class
+# define GTEST_TEMPLATE_ template <typename T> class
 
 // The template "selector" struct TemplateSel<Tmpl> is used to
 // represent Tmpl, which must be a class template with one type
@@ -119,7 +119,8 @@ struct TemplateSel {
   };
 };
 
-#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type
 
 template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
 struct Templates {
@@ -172,8 +173,6 @@ struct GenerateTypeList {
   using type = typename proxy::type;
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 
 template <typename... Ts>
@@ -181,4 +180,4 @@ using Types = internal::ProxyTypeList<Ts...>;
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc
index c38551cda1..bf4f6331da 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc
@@ -32,6 +32,7 @@
 
 #include "gtest/gtest-death-test.h"
 
+#include <functional>
 #include <utility>
 
 #include "gtest/internal/gtest-port.h"
@@ -39,44 +40,44 @@
 
 #if GTEST_HAS_DEATH_TEST
 
-#if GTEST_OS_MAC
-#include <crt_externs.h>
-#endif  // GTEST_OS_MAC
-
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-
-#if GTEST_OS_LINUX
-#include <signal.h>
-#endif  // GTEST_OS_LINUX
-
-#include <stdarg.h>
-
-#if GTEST_OS_WINDOWS
-#include <windows.h>
-#else
-#include <sys/mman.h>
-#include <sys/wait.h>
-#endif  // GTEST_OS_WINDOWS
-
-#if GTEST_OS_QNX
-#include <spawn.h>
-#endif  // GTEST_OS_QNX
-
-#if GTEST_OS_FUCHSIA
-#include <lib/fdio/fd.h>
-#include <lib/fdio/io.h>
-#include <lib/fdio/spawn.h>
-#include <lib/zx/channel.h>
-#include <lib/zx/port.h>
-#include <lib/zx/process.h>
-#include <lib/zx/socket.h>
-#include <zircon/processargs.h>
-#include <zircon/syscalls.h>
-#include <zircon/syscalls/policy.h>
-#include <zircon/syscalls/port.h>
-#endif  // GTEST_OS_FUCHSIA
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+# if GTEST_OS_FUCHSIA
+#  include <lib/fdio/fd.h>
+#  include <lib/fdio/io.h>
+#  include <lib/fdio/spawn.h>
+#  include <lib/zx/channel.h>
+#  include <lib/zx/port.h>
+#  include <lib/zx/process.h>
+#  include <lib/zx/socket.h>
+#  include <zircon/processargs.h>
+#  include <zircon/syscalls.h>
+#  include <zircon/syscalls/policy.h>
+#  include <zircon/syscalls/port.h>
+# endif  // GTEST_OS_FUCHSIA
 
 #endif  // GTEST_HAS_DEATH_TEST
 
@@ -133,9 +134,9 @@ namespace internal {
 
 // Valid only for fast death tests. Indicates the code is running in the
 // child process of a fast style death test.
-#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 static bool g_in_fast_death_test_child = false;
-#endif
+# endif
 
 // Returns a Boolean value indicating whether the caller is currently
 // executing in the context of the death test child process.  Tools such as
@@ -143,13 +144,13 @@ static bool g_in_fast_death_test_child = false;
 // tests.  IMPORTANT: This is an internal utility.  Using it may break the
 // implementation of death tests.  User code MUST NOT use it.
 bool InDeathTestChild() {
-#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   // On Windows and Fuchsia, death tests are thread-safe regardless of the value
   // of the death_test_style flag.
   return !GTEST_FLAG(internal_run_death_test).empty();
 
-#else
+# else
 
   if (GTEST_FLAG(death_test_style) == "threadsafe")
     return !GTEST_FLAG(internal_run_death_test).empty();
@@ -161,38 +162,40 @@ bool InDeathTestChild() {
 }  // namespace internal
 
 // ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
+}
 
 // ExitedWithCode function-call operator.
 bool ExitedWithCode::operator()(int exit_status) const {
-#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return exit_status == exit_code_;
 
-#else
+# else
 
   return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
 
-#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 }
 
-#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
+}
 
 // KilledBySignal function-call operator.
 bool KilledBySignal::operator()(int exit_status) const {
-#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#  if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   {
     bool result;
     if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
       return result;
     }
   }
-#endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#  endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
 }
-#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 namespace internal {
 
@@ -203,23 +206,23 @@ namespace internal {
 static std::string ExitSummary(int exit_code) {
   Message m;
 
-#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   m << "Exited with exit status " << exit_code;
 
-#else
+# else
 
   if (WIFEXITED(exit_code)) {
     m << "Exited with exit status " << WEXITSTATUS(exit_code);
   } else if (WIFSIGNALED(exit_code)) {
     m << "Terminated by signal " << WTERMSIG(exit_code);
   }
-#ifdef WCOREDUMP
+#  ifdef WCOREDUMP
   if (WCOREDUMP(exit_code)) {
     m << " (core dumped)";
   }
-#endif
-#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#  endif
+# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return m.GetString();
 }
@@ -230,7 +233,7 @@ bool ExitedUnsuccessfully(int exit_status) {
   return !ExitedWithCode(0)(exit_status);
 }
 
-#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Generates a textual failure message when a death test finds more than
 // one thread running, or cannot determine the number of threads, prior
 // to executing the given statement.  It is the responsibility of the
@@ -245,13 +248,13 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
     msg << "detected " << thread_count << " threads.";
   }
   msg << " See "
-         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "https://github.com/google/googletest/blob/master/docs/"
          "advanced.md#death-tests-and-threads"
       << " for more explanation and suggested solutions, especially if"
       << " this is the last message you see before your test times out.";
   return msg.GetString();
 }
-#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 // Flag characters for reporting a death test that did not die.
 static const char kDeathTestLived = 'L';
@@ -280,14 +283,14 @@ enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
 // message is propagated back to the parent process.  Otherwise, the
 // message is simply printed to stderr.  In either case, the program
 // then exits with status 1.
-static void DeathTestAbort(const std::string &message) {
+static void DeathTestAbort(const std::string& message) {
   // On a POSIX system, this function may be called from a threadsafe-style
   // death test child process, which operates on a very small stack.  Use
   // the heap for any additional non-minuscule memory requirements.
-  const InternalRunDeathTestFlag *const flag =
+  const InternalRunDeathTestFlag* const flag =
       GetUnitTestImpl()->internal_run_death_test_flag();
   if (flag != nullptr) {
-    FILE *parent = posix::FDOpen(flag->write_fd(), "w");
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
     fputc(kDeathTestInternalError, parent);
     fprintf(parent, "%s", message.c_str());
     fflush(parent);
@@ -301,14 +304,14 @@ static void DeathTestAbort(const std::string &message) {
 
 // A replacement for CHECK that calls DeathTestAbort if the assertion
 // fails.
-#define GTEST_DEATH_TEST_CHECK_(expression)                              \
-  do {                                                                   \
-    if (!::testing::internal::IsTrue(expression)) {                      \
-      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
-                     ", line " +                                         \
-                     ::testing::internal::StreamableToString(__LINE__) + \
-                     ": " + #expression);                                \
-    }                                                                    \
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression); \
+    } \
   } while (::testing::internal::AlwaysFalse())
 
 // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -318,23 +321,23 @@ static void DeathTestAbort(const std::string &message) {
 // evaluates the expression as long as it evaluates to -1 and sets
 // errno to EINTR.  If the expression evaluates to -1 but errno is
 // something other than EINTR, DeathTestAbort is called.
-#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression)                      \
-  do {                                                                   \
-    int gtest_retval;                                                    \
-    do {                                                                 \
-      gtest_retval = (expression);                                       \
-    } while (gtest_retval == -1 && errno == EINTR);                      \
-    if (gtest_retval == -1) {                                            \
-      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
-                     ", line " +                                         \
-                     ::testing::internal::StreamableToString(__LINE__) + \
-                     ": " + #expression + " != -1");                     \
-    }                                                                    \
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression + " != -1"); \
+    } \
   } while (::testing::internal::AlwaysFalse())
 
 // Returns the message describing the last system error in errno.
 std::string GetLastErrnoDescription() {
-  return errno == 0 ? "" : posix::StrError(errno);
+    return errno == 0 ? "" : posix::StrError(errno);
 }
 
 // This is called from a death test parent process to read a failure
@@ -365,28 +368,27 @@ static void FailFromInternalError(int fd) {
 // Death test constructor.  Increments the running death test count
 // for the current test.
 DeathTest::DeathTest() {
-  TestInfo *const info = GetUnitTestImpl()->current_test_info();
+  TestInfo* const info = GetUnitTestImpl()->current_test_info();
   if (info == nullptr) {
-    DeathTestAbort(
-        "Cannot run a death test outside of a TEST or "
-        "TEST_F construct");
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
   }
 }
 
 // Creates and returns a death test by dispatching to the current
 // death test factory.
-bool DeathTest::Create(const char *statement,
-                       Matcher<const std::string &> matcher, const char *file,
-                       int line, DeathTest **test) {
+bool DeathTest::Create(const char* statement,
+                       Matcher<const std::string&> matcher, const char* file,
+                       int line, DeathTest** test) {
   return GetUnitTestImpl()->death_test_factory()->Create(
       statement, std::move(matcher), file, line, test);
 }
 
-const char *DeathTest::LastMessage() {
+const char* DeathTest::LastMessage() {
   return last_death_test_message_.c_str();
 }
 
-void DeathTest::set_last_death_test_message(const std::string &message) {
+void DeathTest::set_last_death_test_message(const std::string& message) {
   last_death_test_message_ = message;
 }
 
@@ -395,9 +397,14 @@ std::string DeathTest::last_death_test_message_;
 // Provides cross platform implementation for some death functionality.
 class DeathTestImpl : public DeathTest {
  protected:
-  DeathTestImpl(const char *a_statement, Matcher<const std::string &> matcher)
-      : statement_(a_statement), matcher_(std::move(matcher)), spawned_(false),
-        status_(-1), outcome_(IN_PROGRESS), read_fd_(-1), write_fd_(-1) {}
+  DeathTestImpl(const char* a_statement, Matcher<const std::string&> matcher)
+      : statement_(a_statement),
+        matcher_(std::move(matcher)),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
 
   // read_fd_ is expected to be closed and cleared by a derived class.
   ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
@@ -405,7 +412,7 @@ class DeathTestImpl : public DeathTest {
   void Abort(AbortReason reason) override;
   bool Passed(bool status_ok) override;
 
-  const char *statement() const { return statement_; }
+  const char* statement() const { return statement_; }
   bool spawned() const { return spawned_; }
   void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
   int status() const { return status_; }
@@ -429,9 +436,9 @@ class DeathTestImpl : public DeathTest {
  private:
   // The textual content of the code this object is testing.  This class
   // doesn't own this string and should not attempt to delete it.
-  const char *const statement_;
+  const char* const statement_;
   // A matcher that's expected to match the stderr output by the child process.
-  Matcher<const std::string &> matcher_;
+  Matcher<const std::string&> matcher_;
   // True if the death test child process has been successfully spawned.
   bool spawned_;
   // The exit status of the child process.
@@ -468,9 +475,15 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
     set_outcome(DIED);
   } else if (bytes_read == 1) {
     switch (flag) {
-      case kDeathTestReturned: set_outcome(RETURNED); break;
-      case kDeathTestThrew: set_outcome(THREW); break;
-      case kDeathTestLived: set_outcome(LIVED); break;
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
       case kDeathTestInternalError:
         FailFromInternalError(read_fd());  // Does not return.
         break;
@@ -487,7 +500,9 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
   set_read_fd(-1);
 }
 
-std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+std::string DeathTestImpl::GetErrorLogs() {
+  return GetCapturedStderr();
+}
 
 // Signals that the death test code which should have exited, didn't.
 // Should be called only in a death test child process.
@@ -497,11 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
   // The parent process considers the death test to be a failure if
   // it finds any data in our pipe.  So, here we write a single flag byte
   // to the pipe, then exit.
-  const char status_ch = reason == TEST_DID_NOT_DIE
-                             ? kDeathTestLived
-                             : reason == TEST_THREW_EXCEPTION
-                                   ? kDeathTestThrew
-                                   : kDeathTestReturned;
+  const char status_ch =
+      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
 
   GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
   // We are leaking the descriptor here because on some platforms (i.e.,
@@ -518,9 +531,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
 // Returns an indented copy of stderr output for a death test.
 // This makes distinguishing death test output lines from regular log lines
 // much easier.
-static ::std::string FormatDeathTestOutput(const ::std::string &output) {
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
   ::std::string ret;
-  for (size_t at = 0;;) {
+  for (size_t at = 0; ; ) {
     const size_t line_end = output.find('\n', at);
     ret += "[  DEATH   ] ";
     if (line_end == ::std::string::npos) {
@@ -555,7 +568,8 @@ static ::std::string FormatDeathTestOutput(const ::std::string &output) {
 // the first failing condition, in the order given above, is the one that is
 // reported. Also sets the last death test message string.
 bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned()) return false;
+  if (!spawned())
+    return false;
 
   const std::string error_message = GetErrorLogs();
 
@@ -566,18 +580,15 @@ bool DeathTestImpl::Passed(bool status_ok) {
   switch (outcome()) {
     case LIVED:
       buffer << "    Result: failed to die.\n"
-             << " Error msg:\n"
-             << FormatDeathTestOutput(error_message);
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
       break;
     case THREW:
       buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n"
-             << FormatDeathTestOutput(error_message);
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
       break;
     case RETURNED:
       buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n"
-             << FormatDeathTestOutput(error_message);
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
       break;
     case DIED:
       if (status_ok) {
@@ -594,8 +605,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
       } else {
         buffer << "    Result: died but not with expected exit code:\n"
                << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n"
-               << FormatDeathTestOutput(error_message);
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
       }
       break;
     case IN_PROGRESS:
@@ -608,7 +618,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
   return success;
 }
 
-#if GTEST_OS_WINDOWS
+# if GTEST_OS_WINDOWS
 // WindowsDeathTest implements death tests on Windows. Due to the
 // specifics of starting new processes on Windows, death tests there are
 // always threadsafe, and Google Test considers the
@@ -639,10 +649,10 @@ bool DeathTestImpl::Passed(bool status_ok) {
 //
 class WindowsDeathTest : public DeathTestImpl {
  public:
-  WindowsDeathTest(const char *a_statement,
-                   Matcher<const std::string &> matcher, const char *file,
-                   int line)
-      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+  WindowsDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                   const char* file, int line)
+      : DeathTestImpl(a_statement, std::move(matcher)),
+        file_(file),
         line_(line) {}
 
   // All of these virtual functions are inherited from DeathTest.
@@ -651,7 +661,7 @@ class WindowsDeathTest : public DeathTestImpl {
 
  private:
   // The name of the file in which the death test is located.
-  const char *const file_;
+  const char* const file_;
   // The line number on which the death test is located.
   const int line_;
   // Handle to the write end of the pipe to the child process.
@@ -669,17 +679,21 @@ class WindowsDeathTest : public DeathTestImpl {
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int WindowsDeathTest::Wait() {
-  if (!spawned()) return 0;
+  if (!spawned())
+    return 0;
 
   // Wait until the child either signals that it has acquired the write end
   // of the pipe or it dies.
   const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2, wait_handles,
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
                                    FALSE,  // Waits for any of the handles.
                                    INFINITE)) {
     case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1: break;
-    default: GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
   }
 
   // The child has acquired the write end of the pipe or exited.
@@ -693,8 +707,9 @@ int WindowsDeathTest::Wait() {
   // returns immediately if the child has already exited, regardless of
   // whether previous calls to WaitForMultipleObjects synchronized on this
   // handle or not.
-  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
-                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
   DWORD status_code;
   GTEST_DEATH_TEST_CHECK_(
       ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -709,10 +724,10 @@ int WindowsDeathTest::Wait() {
 // --gtest_internal_run_death_test flags such that it knows to run the
 // current death test only.
 DeathTest::TestRole WindowsDeathTest::AssumeRole() {
-  const UnitTestImpl *const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag *const flag =
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
       impl->internal_run_death_test_flag();
-  const TestInfo *const info = impl->current_test_info();
+  const TestInfo* const info = impl->current_test_info();
   const int death_test_index = info->result()->death_test_count();
 
   if (flag != nullptr) {
@@ -724,15 +739,15 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
 
   // WindowsDeathTest uses an anonymous pipe to communicate results of
   // a death test.
-  SECURITY_ATTRIBUTES handles_are_inheritable = { sizeof(SECURITY_ATTRIBUTES),
-                                                  nullptr, TRUE };
+  SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
+                                                 nullptr, TRUE};
   HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
-                                       &handles_are_inheritable,
-                                       0)  // Default buffer size.
-                          != FALSE);
-  set_read_fd(
-      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
   write_handle_.Reset(write_handle);
   event_handle_.Reset(::CreateEvent(
       &handles_are_inheritable,
@@ -744,23 +759,24 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
                                   kFilterFlag + "=" + info->test_suite_name() +
                                   "." + info->name();
   const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" +
-      file_ + "|" + StreamableToString(line_) + "|" +
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
       StreamableToString(death_test_index) + "|" +
       StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
       // size_t has the same width as pointers on both 32-bit and 64-bit
       // Windows platforms.
       // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
-      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
 
   char executable_path[_MAX_PATH + 1];  // NOLINT
   GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
                                                                 executable_path,
                                                                 _MAX_PATH));
 
-  std::string command_line = std::string(::GetCommandLineA()) + " " +
-                             filter_flag + " \"" + internal_flag + "\"";
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
 
   DeathTest::set_last_death_test_message("");
 
@@ -779,7 +795,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
   PROCESS_INFORMATION process_info;
   GTEST_DEATH_TEST_CHECK_(
       ::CreateProcessA(
-          executable_path, const_cast<char *>(command_line.c_str()),
+          executable_path, const_cast<char*>(command_line.c_str()),
           nullptr,  // Retuned process handle is not inheritable.
           nullptr,  // Retuned thread handle is not inheritable.
           TRUE,  // Child inherits all inheritable handles (for write_handle_).
@@ -793,14 +809,14 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
   return OVERSEE_TEST;
 }
 
-#elif GTEST_OS_FUCHSIA
+# elif GTEST_OS_FUCHSIA
 
 class FuchsiaDeathTest : public DeathTestImpl {
  public:
-  FuchsiaDeathTest(const char *a_statement,
-                   Matcher<const std::string &> matcher, const char *file,
-                   int line)
-      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+  FuchsiaDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                   const char* file, int line)
+      : DeathTestImpl(a_statement, std::move(matcher)),
+        file_(file),
         line_(line) {}
 
   // All of these virtual functions are inherited from DeathTest.
@@ -810,7 +826,7 @@ class FuchsiaDeathTest : public DeathTestImpl {
 
  private:
   // The name of the file in which the death test is located.
-  const char *const file_;
+  const char* const file_;
   // The line number on which the death test is located.
   const int line_;
   // The stderr data captured by the child process.
@@ -827,28 +843,33 @@ class Arguments {
   Arguments() { args_.push_back(nullptr); }
 
   ~Arguments() {
-    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
          ++i) {
       free(*i);
     }
   }
-  void AddArgument(const char *argument) {
+  void AddArgument(const char* argument) {
     args_.insert(args_.end() - 1, posix::StrDup(argument));
   }
 
   template <typename Str>
-  void AddArguments(const ::std::vector<Str> &arguments) {
+  void AddArguments(const ::std::vector<Str>& arguments) {
     for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end(); ++i) {
+         i != arguments.end();
+         ++i) {
       args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
     }
   }
-  char *const *Argv() { return &args_[0]; }
+  char* const* Argv() {
+    return &args_[0];
+  }
 
-  int size() { return args_.size() - 1; }
+  int size() {
+    return static_cast<int>(args_.size()) - 1;
+  }
 
  private:
-  std::vector<char *> args_;
+  std::vector<char*> args_;
 };
 
 // Waits for the child in a death test to exit, returning its exit
@@ -859,7 +880,8 @@ int FuchsiaDeathTest::Wait() {
   const int kSocketKey = 1;
   const int kExceptionKey = 2;
 
-  if (!spawned()) return 0;
+  if (!spawned())
+    return 0;
 
   // Create a port to wait for socket/task/exception events.
   zx_status_t status_zx;
@@ -869,18 +891,17 @@ int FuchsiaDeathTest::Wait() {
 
   // Register to wait for the child process to terminate.
   status_zx = child_process_.wait_async(
-      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+      port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for the socket to be readable or closed.
   status_zx = stderr_socket_.wait_async(
-      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-      ZX_WAIT_ASYNC_ONCE);
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for an exception.
   status_zx = exception_channel_.wait_async(
-      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+      port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   bool process_terminated = false;
@@ -910,9 +931,9 @@ int FuchsiaDeathTest::Wait() {
           size_t old_length = captured_stderr_.length();
           size_t bytes_read = 0;
           captured_stderr_.resize(old_length + kBufferSize);
-          status_zx =
-              stderr_socket_.read(0, &captured_stderr_.front() + old_length,
-                                  kBufferSize, &bytes_read);
+          status_zx = stderr_socket_.read(
+              0, &captured_stderr_.front() + old_length, kBufferSize,
+              &bytes_read);
           captured_stderr_.resize(old_length + bytes_read);
         } while (status_zx == ZX_OK);
         if (status_zx == ZX_ERR_PEER_CLOSED) {
@@ -920,8 +941,7 @@ int FuchsiaDeathTest::Wait() {
         } else {
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
           status_zx = stderr_socket_.wait_async(
-              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-              ZX_WAIT_ASYNC_ONCE);
+              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
         }
       } else {
@@ -938,8 +958,8 @@ int FuchsiaDeathTest::Wait() {
                                       nullptr, nullptr);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
-  GTEST_DEATH_TEST_CHECK_(buffer.exited);
-  set_status(buffer.return_code);
+  GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+  set_status(static_cast<int>(buffer.return_code));
   return status();
 }
 
@@ -949,10 +969,10 @@ int FuchsiaDeathTest::Wait() {
 // --gtest_internal_run_death_test flags such that it knows to run the
 // current death test only.
 DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
-  const UnitTestImpl *const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag *const flag =
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
       impl->internal_run_death_test_flag();
-  const TestInfo *const info = impl->current_test_info();
+  const TestInfo* const info = impl->current_test_info();
   const int death_test_index = info->result()->death_test_count();
 
   if (flag != nullptr) {
@@ -969,10 +989,11 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
   const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
                                   kFilterFlag + "=" + info->test_suite_name() +
                                   "." + info->name();
-  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                    kInternalRunDeathTestFlag + "=" + file_ +
-                                    "|" + StreamableToString(line_) + "|" +
-                                    StreamableToString(death_test_index);
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|"
+      + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index);
   Arguments args;
   args.AddArguments(GetInjectableArgvs());
   args.AddArgument(filter_flag.c_str());
@@ -988,14 +1009,15 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
 
   // Set the pipe handle for the child.
   fdio_spawn_action_t spawn_actions[2] = {};
-  fdio_spawn_action_t *add_handle_action = &spawn_actions[0];
+  fdio_spawn_action_t* add_handle_action = &spawn_actions[0];
   add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
   add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
   add_handle_action->h.handle = child_pipe_handle;
 
   // Create a socket pair will be used to receive the child process' stderr.
   zx::socket stderr_producer_socket;
-  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  status =
+      zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
   GTEST_DEATH_TEST_CHECK_(status >= 0);
   int stderr_producer_fd = -1;
   status =
@@ -1005,39 +1027,42 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
   // Make the stderr socket nonblocking.
   GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
 
-  fdio_spawn_action_t *add_stderr_action = &spawn_actions[1];
+  fdio_spawn_action_t* add_stderr_action = &spawn_actions[1];
   add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
   add_stderr_action->fd.local_fd = stderr_producer_fd;
   add_stderr_action->fd.target_fd = STDERR_FILENO;
 
   // Create a child job.
   zx_handle_t child_job = ZX_HANDLE_INVALID;
-  status = zx_job_create(zx_job_default(), 0, &child_job);
+  status = zx_job_create(zx_job_default(), 0, & child_job);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
   zx_policy_basic_t policy;
   policy.condition = ZX_POL_NEW_ANY;
   policy.policy = ZX_POL_ACTION_ALLOW;
-  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
-                             &policy, 1);
+  status = zx_job_set_policy(
+      child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   // Create an exception channel attached to the |child_job|, to allow
   // us to suppress the system default exception handler from firing.
-  status = zx_task_create_exception_channel(
-      child_job, 0, exception_channel_.reset_and_get_address());
+  status =
+      zx_task_create_exception_channel(
+          child_job, 0, exception_channel_.reset_and_get_address());
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   // Spawn the child process.
-  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
-                          args.Argv(), nullptr, 2, spawn_actions,
-                          child_process_.reset_and_get_address(), nullptr);
+  status = fdio_spawn_etc(
+      child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
+      2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   set_spawned(true);
   return OVERSEE_TEST;
 }
 
-std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
+std::string FuchsiaDeathTest::GetErrorLogs() {
+  return captured_stderr_;
+}
 
 #else  // We are neither on Windows, nor on Fuchsia.
 
@@ -1046,7 +1071,7 @@ std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
 // left undefined.
 class ForkingDeathTest : public DeathTestImpl {
  public:
-  ForkingDeathTest(const char *statement, Matcher<const std::string &> matcher);
+  ForkingDeathTest(const char* statement, Matcher<const std::string&> matcher);
 
   // All of these virtual functions are inherited from DeathTest.
   int Wait() override;
@@ -1060,15 +1085,16 @@ class ForkingDeathTest : public DeathTestImpl {
 };
 
 // Constructs a ForkingDeathTest.
-ForkingDeathTest::ForkingDeathTest(const char *a_statement,
-                                   Matcher<const std::string &> matcher)
+ForkingDeathTest::ForkingDeathTest(const char* a_statement,
+                                   Matcher<const std::string&> matcher)
     : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
 
 // Waits for the child in a death test to exit, returning its exit
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int ForkingDeathTest::Wait() {
-  if (!spawned()) return 0;
+  if (!spawned())
+    return 0;
 
   ReadAndInterpretStatusByte();
 
@@ -1082,7 +1108,7 @@ int ForkingDeathTest::Wait() {
 // in the child process.
 class NoExecDeathTest : public ForkingDeathTest {
  public:
-  NoExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher)
+  NoExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher)
       : ForkingDeathTest(a_statement, std::move(matcher)) {}
   TestRole AssumeRole() override;
 };
@@ -1137,24 +1163,25 @@ DeathTest::TestRole NoExecDeathTest::AssumeRole() {
 // only this specific death test to be run.
 class ExecDeathTest : public ForkingDeathTest {
  public:
-  ExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher,
-                const char *file, int line)
-      : ForkingDeathTest(a_statement, std::move(matcher)), file_(file),
+  ExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                const char* file, int line)
+      : ForkingDeathTest(a_statement, std::move(matcher)),
+        file_(file),
         line_(line) {}
   TestRole AssumeRole() override;
 
  private:
   static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
     ::std::vector<std::string> args = GetInjectableArgvs();
-#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#  if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
     ::std::vector<std::string> extra_args =
         GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
     args.insert(args.end(), extra_args.begin(), extra_args.end());
-#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#  endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
     return args;
   }
   // The name of the file in which the death test is located.
-  const char *const file_;
+  const char* const file_;
   // The line number on which the death test is located.
   const int line_;
 };
@@ -1165,82 +1192,74 @@ class Arguments {
   Arguments() { args_.push_back(nullptr); }
 
   ~Arguments() {
-    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
          ++i) {
       free(*i);
     }
   }
-  void AddArgument(const char *argument) {
+  void AddArgument(const char* argument) {
     args_.insert(args_.end() - 1, posix::StrDup(argument));
   }
 
   template <typename Str>
-  void AddArguments(const ::std::vector<Str> &arguments) {
+  void AddArguments(const ::std::vector<Str>& arguments) {
     for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end(); ++i) {
+         i != arguments.end();
+         ++i) {
       args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
     }
   }
-  char *const *Argv() { return &args_[0]; }
+  char* const* Argv() {
+    return &args_[0];
+  }
 
  private:
-  std::vector<char *> args_;
+  std::vector<char*> args_;
 };
 
 // A struct that encompasses the arguments to the child process of a
 // threadsafe-style death test process.
 struct ExecDeathTestArgs {
-  char *const *argv;  // Command-line arguments for the child's call to exec
+  char* const* argv;  // Command-line arguments for the child's call to exec
   int close_fd;       // File descriptor to close; the read end of a pipe
 };
 
-#if GTEST_OS_MAC
-inline char **GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
-extern "C" char **environ;
-inline char **GetEnviron() { return environ; }
-#endif  // GTEST_OS_MAC
-
-#if !GTEST_OS_QNX
+#  if GTEST_OS_QNX
+extern "C" char** environ;
+#  else  // GTEST_OS_QNX
 // The main function for a threadsafe-style death test child process.
 // This function is called in a clone()-ed process and thus must avoid
 // any potentially unsafe operations like malloc or libc functions.
-static int ExecDeathTestChildMain(void *child_arg) {
-  ExecDeathTestArgs *const args = static_cast<ExecDeathTestArgs *>(child_arg);
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
 
   // We need to execute the test program in the same environment where
   // it was originally invoked.  Therefore we change to the original
   // working directory first.
-  const char *const original_dir =
+  const char* const original_dir =
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir +
-                   "\") failed: " + GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
-  // We can safely call execve() as it's a direct system call.  We
+  // We can safely call execv() as it's almost a direct system call. We
   // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // unsafe.  Since execv() doesn't search the PATH, the user must
   // invoke the test program via a valid path that contains at least
   // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
-                 original_dir + " failed: " + GetLastErrnoDescription());
+  execv(args->argv[0], args->argv);
+  DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
   return EXIT_FAILURE;
 }
-#endif  // !GTEST_OS_QNX
+#  endif  // GTEST_OS_QNX
 
-#if GTEST_HAS_CLONE
+#  if GTEST_HAS_CLONE
 // Two utility routines that together determine the direction the stack
 // grows.
 // This could be accomplished more elegantly by a single recursive
@@ -1250,26 +1269,31 @@ static int ExecDeathTestChildMain(void *child_arg) {
 // GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
 // StackLowerThanAddress into StackGrowsDown, which then doesn't give
 // correct answer.
-static void StackLowerThanAddress(const void *ptr,
-                                  bool *result) GTEST_NO_INLINE_;
+static void StackLowerThanAddress(const void* ptr,
+                                  bool* result) GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
 // HWAddressSanitizer add a random tag to the MSB of the local variable address,
 // making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-static void StackLowerThanAddress(const void *ptr, bool *result) {
-  int dummy;
-  *result = (&dummy < ptr);
+static void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy = 0;
+  *result = std::less<const void*>()(&dummy, ptr);
 }
 
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static bool StackGrowsDown() {
-  int dummy;
+  int dummy = 0;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;
 }
-#endif  // GTEST_HAS_CLONE
+#  endif  // GTEST_HAS_CLONE
 
 // Spawns a child process with the same executable as the current process in
 // a thread-safe manner and instructs it to run the death test.  The
@@ -1278,11 +1302,11 @@ static bool StackGrowsDown() {
 // fork supports only single-threaded environments, so this function uses
 // spawn(2) there instead.  The function dies with an error message if
 // anything goes wrong.
-static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
   ExecDeathTestArgs args = { argv, close_fd };
   pid_t child_pid = -1;
 
-#if GTEST_OS_QNX
+#  if GTEST_OS_QNX
   // Obtains the current directory and sets it to be closed in the child
   // process.
   const int cwd_fd = open(".", O_RDONLY);
@@ -1291,30 +1315,29 @@ static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
   // We need to execute the test program in the same environment where
   // it was originally invoked.  Therefore we change to the original
   // working directory first.
-  const char *const original_dir =
+  const char* const original_dir =
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir +
-                   "\") failed: " + GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
   int fd_flags;
   // Set close_fd to be closed after spawn.
   GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(
-      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
-  struct inheritance inherit = { 0 };
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
   // spawn is a system call.
-  child_pid =
-      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
   // Restores the current working directory.
   GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
 
-#else  // GTEST_OS_QNX
-#if GTEST_OS_LINUX
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
   // When a SIGPROF signal is received while fork() or clone() are executing,
   // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
   // it after the call to fork()/clone() is complete.
@@ -1323,18 +1346,18 @@ static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
   memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
   sigemptyset(&ignore_sigprof_action.sa_mask);
   ignore_sigprof_action.sa_handler = SIG_IGN;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(
-      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-#endif  // GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
 
-#if GTEST_HAS_CLONE
+#   if GTEST_HAS_CLONE
   const bool use_fork = GTEST_FLAG(death_test_use_fork);
 
   if (!use_fork) {
     static const bool stack_grows_down = StackGrowsDown();
     const auto stack_size = static_cast<size_t>(getpagesize() * 2);
     // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
-    void *const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
+    void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
                              MAP_ANON | MAP_PRIVATE, -1, 0);
     GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
 
@@ -1345,9 +1368,9 @@ static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
     // than 64.  We assume stack and stack_size already have alignment of
     // kMaxStackAlignment.
     const size_t kMaxStackAlignment = 64;
-    void *const stack_top =
-        static_cast<char *>(stack) +
-        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
     GTEST_DEATH_TEST_CHECK_(
         static_cast<size_t>(stack_size) > kMaxStackAlignment &&
         reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
@@ -1356,19 +1379,19 @@ static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
 
     GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
   }
-#else
+#   else
   const bool use_fork = true;
-#endif  // GTEST_HAS_CLONE
+#   endif  // GTEST_HAS_CLONE
 
   if (use_fork && (child_pid = fork()) == 0) {
-    ExecDeathTestChildMain(&args);
-    _exit(0);
+      ExecDeathTestChildMain(&args);
+      _exit(0);
   }
-#endif  // GTEST_OS_QNX
-#if GTEST_OS_LINUX
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
   GTEST_DEATH_TEST_CHECK_SYSCALL_(
       sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-#endif  // GTEST_OS_LINUX
+#  endif  // GTEST_OS_LINUX
 
   GTEST_DEATH_TEST_CHECK_(child_pid != -1);
   return child_pid;
@@ -1379,10 +1402,10 @@ static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
 // and --gtest_internal_run_death_test flags to cause only the current
 // death test to be re-run.
 DeathTest::TestRole ExecDeathTest::AssumeRole() {
-  const UnitTestImpl *const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag *const flag =
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
       impl->internal_run_death_test_flag();
-  const TestInfo *const info = impl->current_test_info();
+  const TestInfo* const info = impl->current_test_info();
   const int death_test_index = info->result()->death_test_count();
 
   if (flag != nullptr) {
@@ -1399,11 +1422,11 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
   const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
                                   kFilterFlag + "=" + info->test_suite_name() +
                                   "." + info->name();
-  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                    kInternalRunDeathTestFlag + "=" + file_ +
-                                    "|" + StreamableToString(line_) + "|" +
-                                    StreamableToString(death_test_index) + "|" +
-                                    StreamableToString(pipe_fd[1]);
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
   Arguments args;
   args.AddArguments(GetArgvsForDeathTestChildProcess());
   args.AddArgument(filter_flag.c_str());
@@ -1424,29 +1447,29 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
   return OVERSEE_TEST;
 }
 
-#endif  // !GTEST_OS_WINDOWS
+# endif  // !GTEST_OS_WINDOWS
 
 // Creates a concrete DeathTest-derived class that depends on the
 // --gtest_death_test_style flag, and sets the pointer pointed to
 // by the "test" argument to its address.  If the test should be
 // skipped, sets that pointer to NULL.  Returns true, unless the
 // flag is set to an invalid value.
-bool DefaultDeathTestFactory::Create(const char *statement,
-                                     Matcher<const std::string &> matcher,
-                                     const char *file, int line,
-                                     DeathTest **test) {
-  UnitTestImpl *const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag *const flag =
+bool DefaultDeathTestFactory::Create(const char* statement,
+                                     Matcher<const std::string&> matcher,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
       impl->internal_run_death_test_flag();
-  const int death_test_index =
-      impl->current_test_info()->increment_death_test_count();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
 
   if (flag != nullptr) {
     if (death_test_index > flag->index()) {
       DeathTest::set_last_death_test_message(
-          "Death test count (" + StreamableToString(death_test_index) +
-          ") somehow exceeded expected maximum (" +
-          StreamableToString(flag->index()) + ")");
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
       return false;
     }
 
@@ -1457,21 +1480,21 @@ bool DefaultDeathTestFactory::Create(const char *statement,
     }
   }
 
-#if GTEST_OS_WINDOWS
+# if GTEST_OS_WINDOWS
 
   if (GTEST_FLAG(death_test_style) == "threadsafe" ||
       GTEST_FLAG(death_test_style) == "fast") {
     *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
   }
 
-#elif GTEST_OS_FUCHSIA
+# elif GTEST_OS_FUCHSIA
 
   if (GTEST_FLAG(death_test_style) == "threadsafe" ||
       GTEST_FLAG(death_test_style) == "fast") {
     *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
   }
 
-#else
+# else
 
   if (GTEST_FLAG(death_test_style) == "threadsafe") {
     *test = new ExecDeathTest(statement, std::move(matcher), file, line);
@@ -1479,28 +1502,28 @@ bool DefaultDeathTestFactory::Create(const char *statement,
     *test = new NoExecDeathTest(statement, std::move(matcher));
   }
 
-#endif  // GTEST_OS_WINDOWS
+# endif  // GTEST_OS_WINDOWS
 
   else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message("Unknown death test style \"" +
-                                           GTEST_FLAG(death_test_style) +
-                                           "\" encountered");
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
     return false;
   }
 
   return true;
 }
 
-#if GTEST_OS_WINDOWS
+# if GTEST_OS_WINDOWS
 // Recreates the pipe and event handles from the provided parameters,
 // signals the event, and returns a file descriptor wrapped around the pipe
 // handle. This function is called in the child process only.
 static int GetStatusFileDescriptor(unsigned int parent_process_id,
-                                   size_t write_handle_as_size_t,
-                                   size_t event_handle_as_size_t) {
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
   AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                 FALSE,  // Non-inheritable.
-                                                 parent_process_id));
+                                                   FALSE,  // Non-inheritable.
+                                                   parent_process_id));
   if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
     DeathTestAbort("Unable to open parent process " +
                    StreamableToString(parent_process_id));
@@ -1508,7 +1531,8 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
 
   GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
 
-  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
   HANDLE dup_write_handle;
 
   // The newly initialized handle is accessible only in the parent
@@ -1530,7 +1554,9 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
   HANDLE dup_event_handle;
 
   if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,
+                         FALSE,
                          DUPLICATE_SAME_ACCESS)) {
     DeathTestAbort("Unable to duplicate the event handle " +
                    StreamableToString(event_handle_as_size_t) +
@@ -1552,12 +1578,12 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
 
   return write_fd;
 }
-#endif  // GTEST_OS_WINDOWS
+# endif  // GTEST_OS_WINDOWS
 
 // Returns a newly created InternalRunDeathTestFlag object with fields
 // initialized from the GTEST_FLAG(internal_run_death_test) flag if
 // the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag() {
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
   if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
 
   // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
@@ -1568,41 +1594,45 @@ InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag() {
   SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
   int write_fd = -1;
 
-#if GTEST_OS_WINDOWS
+# if GTEST_OS_WINDOWS
 
   unsigned int parent_process_id = 0;
   size_t write_handle_as_size_t = 0;
   size_t event_handle_as_size_t = 0;
 
-  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
-      !ParseNaturalNumber(fields[2], &index) ||
-      !ParseNaturalNumber(fields[3], &parent_process_id) ||
-      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
-      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+  if (fields.size() != 6
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
     DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
                    GTEST_FLAG(internal_run_death_test));
   }
-  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
                                      event_handle_as_size_t);
 
-#elif GTEST_OS_FUCHSIA
+# elif GTEST_OS_FUCHSIA
 
-  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
-      !ParseNaturalNumber(fields[2], &index)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
-                   GTEST_FLAG(internal_run_death_test));
+  if (fields.size() != 3
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
   }
 
-#else
+# else
 
-  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
-      !ParseNaturalNumber(fields[2], &index) ||
-      !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
-                   GTEST_FLAG(internal_run_death_test));
+  if (fields.size() != 4
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
   }
 
-#endif  // GTEST_OS_WINDOWS
+# endif  // GTEST_OS_WINDOWS
 
   return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
 }
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
index f9427e0f18..0b5629401b 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -34,25 +34,25 @@
 #include "gtest/gtest-message.h"
 
 #if GTEST_OS_WINDOWS_MOBILE
-#include <windows.h>
+# include <windows.h>
 #elif GTEST_OS_WINDOWS
-#include <direct.h>
-#include <io.h>
+# include <direct.h>
+# include <io.h>
 #else
-#include <limits.h>
-#include <climits>  // Some Linux distributions define PATH_MAX here.
-#endif              // GTEST_OS_WINDOWS_MOBILE
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
 
 #include "gtest/internal/gtest-string.h"
 
 #if GTEST_OS_WINDOWS
-#define GTEST_PATH_MAX_ _MAX_PATH
+# define GTEST_PATH_MAX_ _MAX_PATH
 #elif defined(PATH_MAX)
-#define GTEST_PATH_MAX_ PATH_MAX
+# define GTEST_PATH_MAX_ PATH_MAX
 #elif defined(_XOPEN_PATH_MAX)
-#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
 #else
-#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
 #endif  // GTEST_OS_WINDOWS
 
 namespace testing {
@@ -66,16 +66,16 @@ namespace internal {
 const char kPathSeparator = '\\';
 const char kAlternatePathSeparator = '/';
 const char kAlternatePathSeparatorString[] = "/";
-#if GTEST_OS_WINDOWS_MOBILE
+# if GTEST_OS_WINDOWS_MOBILE
 // Windows CE doesn't have a current directory. You should not use
 // the current directory in tests on Windows CE, but this at least
 // provides a reasonable fallback.
 const char kCurrentDirectoryString[] = "\\";
 // Windows CE doesn't define INVALID_FILE_ATTRIBUTES
 const DWORD kInvalidFileAttributes = 0xffffffff;
-#else
+# else
 const char kCurrentDirectoryString[] = ".\\";
-#endif  // GTEST_OS_WINDOWS_MOBILE
+# endif  // GTEST_OS_WINDOWS_MOBILE
 #else
 const char kPathSeparator = '/';
 const char kCurrentDirectoryString[] = "./";
@@ -92,8 +92,9 @@ static bool IsPathSeparator(char c) {
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE ||         \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+    GTEST_OS_XTENSA
   // These platforms do not have a current directory, so we just return
   // something reasonable.
   return FilePath(kCurrentDirectoryString);
@@ -102,13 +103,13 @@ FilePath FilePath::GetCurrentDir() {
   return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
 #else
   char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  char *result = getcwd(cwd, sizeof(cwd));
-#if GTEST_OS_NACL
+  char* result = getcwd(cwd, sizeof(cwd));
+# if GTEST_OS_NACL
   // getcwd will likely fail in NaCl due to the sandbox, so return something
   // reasonable. The user may have provided a shim implementation for getcwd,
   // however, so fallback only when failure is detected.
   return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-#endif  // GTEST_OS_NACL
+# endif  // GTEST_OS_NACL
   return FilePath(result == nullptr ? "" : cwd);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -117,11 +118,11 @@ FilePath FilePath::GetCurrentDir() {
 // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
 // FilePath("dir/file"). If a case-insensitive extension is not
 // found, returns a copy of the original FilePath.
-FilePath FilePath::RemoveExtension(const char *extension) const {
+FilePath FilePath::RemoveExtension(const char* extension) const {
   const std::string dot_extension = std::string(".") + extension;
   if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
-    return FilePath(
-        pathname_.substr(0, pathname_.length() - dot_extension.length()));
+    return FilePath(pathname_.substr(
+        0, pathname_.length() - dot_extension.length()));
   }
   return *this;
 }
@@ -129,10 +130,10 @@ FilePath FilePath::RemoveExtension(const char *extension) const {
 // Returns a pointer to the last occurrence of a valid path separator in
 // the FilePath. On Windows, for example, both '/' and '\' are valid path
 // separators. Returns NULL if no path separator was found.
-const char *FilePath::FindLastPathSeparator() const {
-  const char *const last_sep = strrchr(c_str(), kPathSeparator);
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const last_sep = strrchr(c_str(), kPathSeparator);
 #if GTEST_HAS_ALT_PATH_SEP_
-  const char *const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
   // Comparing two pointers of which only one is NULL is undefined.
   if (last_alt_sep != nullptr &&
       (last_sep == nullptr || last_alt_sep > last_sep)) {
@@ -149,7 +150,7 @@ const char *FilePath::FindLastPathSeparator() const {
 // returns an empty FilePath ("").
 // On Windows platform, '\' is the path separator, otherwise it is '/'.
 FilePath FilePath::RemoveDirectoryName() const {
-  const char *const last_sep = FindLastPathSeparator();
+  const char* const last_sep = FindLastPathSeparator();
   return last_sep ? FilePath(last_sep + 1) : *this;
 }
 
@@ -160,7 +161,7 @@ FilePath FilePath::RemoveDirectoryName() const {
 // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
 // On Windows platform, '\' is the path separator, otherwise it is '/'.
 FilePath FilePath::RemoveFileName() const {
-  const char *const last_sep = FindLastPathSeparator();
+  const char* const last_sep = FindLastPathSeparator();
   std::string dir;
   if (last_sep) {
     dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
@@ -176,24 +177,26 @@ FilePath FilePath::RemoveFileName() const {
 // extension = "xml", returns "dir/test.xml". If number is greater
 // than zero (e.g., 12), returns "dir/test_12.xml".
 // On Windows platform, uses \ as the separator rather than /.
-FilePath FilePath::MakeFileName(const FilePath &directory,
-                                const FilePath &base_name, int number,
-                                const char *extension) {
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
   std::string file;
   if (number == 0) {
     file = base_name.string() + "." + extension;
   } else {
-    file =
-        base_name.string() + "_" + StreamableToString(number) + "." + extension;
+    file = base_name.string() + "_" + StreamableToString(number)
+        + "." + extension;
   }
   return ConcatPaths(directory, FilePath(file));
 }
 
 // Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
 // On Windows, uses \ as the separator rather than /.
-FilePath FilePath::ConcatPaths(const FilePath &directory,
-                               const FilePath &relative_path) {
-  if (directory.IsEmpty()) return relative_path;
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty())
+    return relative_path;
   const FilePath dir(directory.RemoveTrailingPathSeparator());
   return FilePath(dir.string() + kPathSeparator + relative_path.string());
 }
@@ -204,10 +207,10 @@ bool FilePath::FileOrDirectoryExists() const {
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete[] unicode;
+  delete [] unicode;
   return attributes != kInvalidFileAttributes;
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   return posix::Stat(pathname_.c_str(), &file_stat) == 0;
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -219,24 +222,24 @@ bool FilePath::DirectoryExists() const {
 #if GTEST_OS_WINDOWS
   // Don't strip off trailing separator if path is a root directory on
   // Windows (like "C:\\").
-  const FilePath &path(IsRootDirectory() ? *this
-                                         : RemoveTrailingPathSeparator());
+  const FilePath& path(IsRootDirectory() ? *this :
+                                           RemoveTrailingPathSeparator());
 #else
-  const FilePath &path(*this);
+  const FilePath& path(*this);
 #endif
 
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete[] unicode;
+  delete [] unicode;
   if ((attributes != kInvalidFileAttributes) &&
       (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
     result = true;
   }
 #else
-  posix::StatStruct file_stat;
-  result =
-      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
+  posix::StatStruct file_stat{};
+  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+      posix::IsDir(file_stat);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
   return result;
@@ -254,12 +257,13 @@ bool FilePath::IsRootDirectory() const {
 
 // Returns true if pathname describes an absolute path.
 bool FilePath::IsAbsolutePath() const {
-  const char *const name = pathname_.c_str();
+  const char* const name = pathname_.c_str();
 #if GTEST_OS_WINDOWS
   return pathname_.length() >= 3 &&
-         ((name[0] >= 'a' && name[0] <= 'z') ||
-          (name[0] >= 'A' && name[0] <= 'Z')) &&
-         name[1] == ':' && IsPathSeparator(name[2]);
+     ((name[0] >= 'a' && name[0] <= 'z') ||
+      (name[0] >= 'A' && name[0] <= 'Z')) &&
+     name[1] == ':' &&
+     IsPathSeparator(name[2]);
 #else
   return IsPathSeparator(name[0]);
 #endif
@@ -273,9 +277,9 @@ bool FilePath::IsAbsolutePath() const {
 // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
 // There could be a race condition if two or more processes are calling this
 // function at the same time -- they could both pick the same filename.
-FilePath FilePath::GenerateUniqueFileName(const FilePath &directory,
-                                          const FilePath &base_name,
-                                          const char *extension) {
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
   FilePath full_pathname;
   int number = 0;
   do {
@@ -317,10 +321,10 @@ bool FilePath::CreateFolder() const {
   FilePath removed_sep(this->RemoveTrailingPathSeparator());
   LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
   int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
-  delete[] unicode;
+  delete [] unicode;
 #elif GTEST_OS_WINDOWS
   int result = _mkdir(pathname_.c_str());
-#elif GTEST_OS_ESP8266
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
   // do nothing
   int result = 0;
 #else
@@ -337,40 +341,28 @@ bool FilePath::CreateFolder() const {
 // name, otherwise return the name string unmodified.
 // On Windows platform, uses \ as the separator, other platforms use /.
 FilePath FilePath::RemoveTrailingPathSeparator() const {
-  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
-                       : *this;
+  return IsDirectory()
+      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+      : *this;
 }
 
 // Removes any redundant separators that might be in the pathname.
 // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
 // redundancies that might be in a pathname involving "." or "..".
 void FilePath::Normalize() {
-  if (pathname_.c_str() == nullptr) {
-    pathname_ = "";
-    return;
-  }
-  const char *src = pathname_.c_str();
-  char *const dest = new char[pathname_.length() + 1];
-  char *dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
-
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
+  auto out = pathname_.begin();
+
+  for (const char character : pathname_) {
+    if (!IsPathSeparator(character)) {
+      *(out++) = character;
+    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
+      *(out++) = kPathSeparator;
     } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src)) src++;
+      continue;
     }
-    dest_ptr++;
   }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
+
+  pathname_.erase(out, pathname_.end());
 }
 
 }  // namespace internal
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
index 16d8cde669..6d8cecbbb3 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -31,11 +31,11 @@
 // This file contains purely Google Test's internal implementation.  Please
 // DO NOT #INCLUDE IT IN A USER PROGRAM.
 
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
 
 #ifndef _WIN32_WCE
-#include <errno.h>
+# include <errno.h>
 #endif  // !_WIN32_WCE
 #include <stddef.h>
 #include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
@@ -50,13 +50,13 @@
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_CAN_STREAM_RESULTS_
-#include <arpa/inet.h>  // NOLINT
-#include <netdb.h>      // NOLINT
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
 #endif
 
 #if GTEST_OS_WINDOWS
-#include <windows.h>  // NOLINT
-#endif                // GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
 
 #include "gtest/gtest.h"
 #include "gtest/gtest-spi.h"
@@ -84,9 +84,11 @@ const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
 const char kBreakOnFailureFlag[] = "break_on_failure";
 const char kCatchExceptionsFlag[] = "catch_exceptions";
 const char kColorFlag[] = "color";
+const char kFailFast[] = "fail_fast";
 const char kFilterFlag[] = "filter";
 const char kListTestsFlag[] = "list_tests";
 const char kOutputFlag[] = "output";
+const char kBriefFlag[] = "brief";
 const char kPrintTimeFlag[] = "print_time";
 const char kPrintUTF8Flag[] = "print_utf8";
 const char kRandomSeedFlag[] = "random_seed";
@@ -123,22 +125,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(const char *str, const char *flag,
-                               int32_t *value);
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, int32_t* value);
 
 // Returns a random seed in range [1, kMaxRandomSeed] based on the
 // given --gtest_random_seed flag value.
 inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
-  const unsigned int raw_seed =
-      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
-                              : static_cast<unsigned int>(random_seed_flag);
+  const unsigned int raw_seed = (random_seed_flag == 0) ?
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
 
   // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
   // it's easy to type.
   const int normalized_seed =
       static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) +
-      1;
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
   return normalized_seed;
 }
 
@@ -165,10 +166,12 @@ class GTestFlagSaver {
     color_ = GTEST_FLAG(color);
     death_test_style_ = GTEST_FLAG(death_test_style);
     death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    fail_fast_ = GTEST_FLAG(fail_fast);
     filter_ = GTEST_FLAG(filter);
     internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
     list_tests_ = GTEST_FLAG(list_tests);
     output_ = GTEST_FLAG(output);
+    brief_ = GTEST_FLAG(brief);
     print_time_ = GTEST_FLAG(print_time);
     print_utf8_ = GTEST_FLAG(print_utf8);
     random_seed_ = GTEST_FLAG(random_seed);
@@ -188,9 +191,11 @@ class GTestFlagSaver {
     GTEST_FLAG(death_test_style) = death_test_style_;
     GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
     GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(fail_fast) = fail_fast_;
     GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
     GTEST_FLAG(list_tests) = list_tests_;
     GTEST_FLAG(output) = output_;
+    GTEST_FLAG(brief) = brief_;
     GTEST_FLAG(print_time) = print_time_;
     GTEST_FLAG(print_utf8) = print_utf8_;
     GTEST_FLAG(random_seed) = random_seed_;
@@ -209,10 +214,12 @@ class GTestFlagSaver {
   std::string color_;
   std::string death_test_style_;
   bool death_test_use_fork_;
+  bool fail_fast_;
   std::string filter_;
   std::string internal_run_death_test_;
   bool list_tests_;
   std::string output_;
+  bool brief_;
   bool print_time_;
   bool print_utf8_;
   int32_t random_seed_;
@@ -244,7 +251,7 @@ GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
 // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
 // and contains invalid UTF-16 surrogate pairs, values in those pairs
 // will be encoded as individual Unicode characters from Basic Normal Plane.
-GTEST_API_ std::string WideStringToUtf8(const wchar_t *str, int num_chars);
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
 
 // Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
 // if the variable is present. If a file already exists at this location, this
@@ -258,47 +265,48 @@ void WriteToShardStatusFileIfNeeded();
 // an error and exits. If in_subprocess_for_death_test, sharding is
 // disabled because it must only be applied to the original test
 // process. Otherwise, we could filter out death tests we intended to execute.
-GTEST_API_ bool ShouldShard(const char *total_shards_str,
-                            const char *shard_index_str,
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
                             bool in_subprocess_for_death_test);
 
 // Parses the environment variable var as a 32-bit integer. If it is unset,
 // returns default_val. If it is not a 32-bit integer, prints an error and
 // and aborts.
-GTEST_API_ int32_t Int32FromEnvOrDie(const char *env_var, int32_t default_val);
+GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
 
 // Given the total number of shards, the shard index, and the test id,
 // returns true if and only if the test should be run on this shard. The test id
 // is some arbitrary but unique non-negative integer assigned to each test
 // method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
-                                     int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
 
 // STL container utilities.
 
 // Returns the number of elements in the given container that satisfy
 // the given predicate.
 template <class Container, typename Predicate>
-inline int CountIf(const Container &c, Predicate predicate) {
+inline int CountIf(const Container& c, Predicate predicate) {
   // Implemented as an explicit loop since std::count_if() in libCstd on
   // Solaris has a non-standard signature.
   int count = 0;
   for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it)) ++count;
+    if (predicate(*it))
+      ++count;
   }
   return count;
 }
 
 // Applies a function/functor to each element in the container.
 template <class Container, typename Functor>
-void ForEach(const Container &c, Functor functor) {
+void ForEach(const Container& c, Functor functor) {
   std::for_each(c.begin(), c.end(), functor);
 }
 
 // Returns the i-th element of the vector, or default_value if i is not
 // in range [0, v.size()).
 template <typename E>
-inline E GetElementOr(const std::vector<E> &v, int i, E default_value) {
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
   return (i < 0 || i >= static_cast<int>(v.size())) ? default_value
                                                     : v[static_cast<size_t>(i)];
 }
@@ -308,8 +316,8 @@ inline E GetElementOr(const std::vector<E> &v, int i, E default_value) {
 // i.e. [begin, end) are shuffled, where 'end' == size() means to
 // shuffle to the end of the vector.
 template <typename E>
-void ShuffleRange(internal::Random *random, int begin, int end,
-                  std::vector<E> *v) {
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
   const int size = static_cast<int>(v->size());
   GTEST_CHECK_(0 <= begin && begin <= size)
       << "Invalid shuffle range start " << begin << ": must be in range [0, "
@@ -332,14 +340,14 @@ void ShuffleRange(internal::Random *random, int begin, int end,
 
 // Performs an in-place shuffle of the vector's elements.
 template <typename E>
-inline void Shuffle(internal::Random *random, std::vector<E> *v) {
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
   ShuffleRange(random, 0, static_cast<int>(v->size()), v);
 }
 
 // A function for deleting an object.  Handy for being used as a
 // functor.
 template <typename T>
-static void Delete(T *x) {
+static void Delete(T* x) {
   delete x;
 }
 
@@ -351,10 +359,10 @@ class TestPropertyKeyIs {
   // Constructor.
   //
   // TestPropertyKeyIs has NO default constructor.
-  explicit TestPropertyKeyIs(const std::string &key) : key_(key) {}
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
 
   // Returns true if and only if the test name of test property matches on key_.
-  bool operator()(const TestProperty &test_property) const {
+  bool operator()(const TestProperty& test_property) const {
     return test_property.key() == key_;
   }
 
@@ -386,17 +394,10 @@ class GTEST_API_ UnitTestOptions {
 
   // Functions for processing the gtest_filter flag.
 
-  // Returns true if and only if the wildcard pattern matches the string.
-  // The first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
   // Returns true if and only if the user-specified filter matches the test
   // suite name and the test name.
-  static bool FilterMatchesTest(const std::string &test_suite_name,
-                                const std::string &test_name);
+  static bool FilterMatchesTest(const std::string& test_suite_name,
+                                const std::string& test_name);
 
 #if GTEST_OS_WINDOWS
   // Function for supporting the gtest_catch_exception flag.
@@ -409,7 +410,7 @@ class GTEST_API_ UnitTestOptions {
 
   // Returns true if "name" matches the ':' separated list of glob-style
   // filters in "filter".
-  static bool MatchesFilter(const std::string &name, const char *filter);
+  static bool MatchesFilter(const std::string& name, const char* filter);
 };
 
 // Returns the current application's name, removing directory path if that
@@ -437,7 +438,7 @@ class OsStackTraceGetterInterface {
 
   // This string is inserted in place of stack frames that are part of
   // Google Test's implementation.
-  static const char *const kElidedFramesMarker;
+  static const char* const kElidedFramesMarker;
 
  private:
   GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
@@ -459,7 +460,7 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
   // We do this because the address of the frame immediately below
   // the user code changes between the call to UponLeavingGTest()
   // and any calls to the stack trace code from within the user code.
-  void *caller_frame_ = nullptr;
+  void* caller_frame_ = nullptr;
 #endif  // GTEST_HAS_ABSL
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
@@ -467,7 +468,7 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
 
 // Information about a Google Test trace point.
 struct TraceInfo {
-  const char *file;
+  const char* file;
   int line;
   std::string message;
 };
@@ -475,15 +476,15 @@ struct TraceInfo {
 // This is the default global test part result reporter used in UnitTestImpl.
 // This class should only be used by UnitTestImpl.
 class DefaultGlobalTestPartResultReporter
-    : public TestPartResultReporterInterface {
+  : public TestPartResultReporterInterface {
  public:
-  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl *unit_test);
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
   // Implements the TestPartResultReporterInterface. Reports the test part
   // result in the current test.
-  void ReportTestPartResult(const TestPartResult &result) override;
+  void ReportTestPartResult(const TestPartResult& result) override;
 
  private:
-  UnitTestImpl *const unit_test_;
+  UnitTestImpl* const unit_test_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
 };
@@ -493,13 +494,13 @@ class DefaultGlobalTestPartResultReporter
 class DefaultPerThreadTestPartResultReporter
     : public TestPartResultReporterInterface {
  public:
-  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl *unit_test);
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
   // Implements the TestPartResultReporterInterface. The implementation just
   // delegates to the current global test part result reporter of *unit_test_.
-  void ReportTestPartResult(const TestPartResult &result) override;
+  void ReportTestPartResult(const TestPartResult& result) override;
 
  private:
-  UnitTestImpl *const unit_test_;
+  UnitTestImpl* const unit_test_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
 };
@@ -510,7 +511,7 @@ class DefaultPerThreadTestPartResultReporter
 // proper locking.
 class GTEST_API_ UnitTestImpl {
  public:
-  explicit UnitTestImpl(UnitTest *parent);
+  explicit UnitTestImpl(UnitTest* parent);
   virtual ~UnitTestImpl();
 
   // There are two different ways to register your own TestPartResultReporter.
@@ -521,18 +522,18 @@ class GTEST_API_ UnitTestImpl {
   // test part result for the currently running test.
 
   // Returns the global test part result reporter.
-  TestPartResultReporterInterface *GetGlobalTestPartResultReporter();
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
 
   // Sets the global test part result reporter.
   void SetGlobalTestPartResultReporter(
-      TestPartResultReporterInterface *reporter);
+      TestPartResultReporterInterface* reporter);
 
   // Returns the test part result reporter for the current thread.
-  TestPartResultReporterInterface *GetTestPartResultReporterForCurrentThread();
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
 
   // Sets the test part result reporter for the current thread.
   void SetTestPartResultReporterForCurrentThread(
-      TestPartResultReporterInterface *reporter);
+      TestPartResultReporterInterface* reporter);
 
   // Gets the number of successful test suites.
   int successful_test_suite_count() const;
@@ -590,44 +591,44 @@ class GTEST_API_ UnitTestImpl {
 
   // Gets the i-th test suite among all the test suites. i can range from 0 to
   // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  const TestSuite *GetTestSuite(int i) const {
+  const TestSuite* GetTestSuite(int i) const {
     const int index = GetElementOr(test_suite_indices_, i, -1);
     return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
   }
 
   //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase *GetTestCase(int i) const { return GetTestSuite(i); }
+  const TestCase* GetTestCase(int i) const { return GetTestSuite(i); }
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Gets the i-th test suite among all the test suites. i can range from 0 to
   // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  TestSuite *GetMutableSuiteCase(int i) {
+  TestSuite* GetMutableSuiteCase(int i) {
     const int index = GetElementOr(test_suite_indices_, i, -1);
     return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
   }
 
   // Provides access to the event listener list.
-  TestEventListeners *listeners() { return &listeners_; }
+  TestEventListeners* listeners() { return &listeners_; }
 
   // Returns the TestResult for the test that's currently running, or
   // the TestResult for the ad hoc test if no test is running.
-  TestResult *current_test_result();
+  TestResult* current_test_result();
 
   // Returns the TestResult for the ad hoc test.
-  const TestResult *ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
 
   // Sets the OS stack trace getter.
   //
   // Does nothing if the input and the current OS stack trace getter
   // are the same; otherwise, deletes the old getter and makes the
   // input the current getter.
-  void set_os_stack_trace_getter(OsStackTraceGetterInterface *getter);
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
 
   // Returns the current OS stack trace getter if it is not NULL;
   // otherwise, creates an OsStackTraceGetter, makes it the current
   // getter, and returns it.
-  OsStackTraceGetterInterface *os_stack_trace_getter();
+  OsStackTraceGetterInterface* os_stack_trace_getter();
 
   // Returns the current OS stack trace as an std::string.
   //
@@ -647,17 +648,17 @@ class GTEST_API_ UnitTestImpl {
   // Arguments:
   //
   //   test_suite_name: name of the test suite
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test suite
-  //   tear_down_tc:   pointer to the function that tears down the test suite
-  TestSuite *GetTestSuite(const char *test_suite_name, const char *type_param,
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
+  TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
                           internal::SetUpTestSuiteFunc set_up_tc,
                           internal::TearDownTestSuiteFunc tear_down_tc);
 
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  TestCase *GetTestCase(const char *test_case_name, const char *type_param,
+  TestCase* GetTestCase(const char* test_case_name, const char* type_param,
                         internal::SetUpTestSuiteFunc set_up_tc,
                         internal::TearDownTestSuiteFunc tear_down_tc) {
     return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
@@ -673,7 +674,8 @@ class GTEST_API_ UnitTestImpl {
   //   test_info:    the TestInfo object
   void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
                    internal::TearDownTestSuiteFunc tear_down_tc,
-                   TestInfo *test_info) {
+                   TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -686,6 +688,7 @@ class GTEST_API_ UnitTestImpl {
       GTEST_CHECK_(!original_working_dir_.IsEmpty())
           << "Failed to get the current working directory.";
     }
+#endif  // GTEST_HAS_DEATH_TEST
 
     GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
                  set_up_tc, tear_down_tc)
@@ -694,30 +697,30 @@ class GTEST_API_ UnitTestImpl {
 
   // Returns ParameterizedTestSuiteRegistry object used to keep track of
   // value-parameterized tests and instantiate and register them.
-  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry() {
+  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() {
     return parameterized_test_registry_;
   }
 
-  std::set<std::string> *ignored_parameterized_test_suites() {
+  std::set<std::string>* ignored_parameterized_test_suites() {
     return &ignored_parameterized_test_suites_;
   }
 
   // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
   // type-parameterized tests and instantiations of them.
-  internal::TypeParameterizedTestSuiteRegistry &
+  internal::TypeParameterizedTestSuiteRegistry&
   type_parameterized_test_registry() {
     return type_parameterized_test_registry_;
   }
 
   // Sets the TestSuite object for the test that's currently running.
-  void set_current_test_suite(TestSuite *a_current_test_suite) {
+  void set_current_test_suite(TestSuite* a_current_test_suite) {
     current_test_suite_ = a_current_test_suite;
   }
 
   // Sets the TestInfo object for the test that's currently running.  If
   // current_test_info is NULL, the assertion results will be stored in
   // ad_hoc_test_result_.
-  void set_current_test_info(TestInfo *a_current_test_info) {
+  void set_current_test_info(TestInfo* a_current_test_info) {
     current_test_info_ = a_current_test_info;
   }
 
@@ -741,15 +744,20 @@ class GTEST_API_ UnitTestImpl {
   }
 
   // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
 
   // Adds a TestProperty to the current TestResult object when invoked in a
   // context of a test or a test suite, or to the global property set. If the
   // result already contains a property with the same key, the value will be
   // updated.
-  void RecordProperty(const TestProperty &test_property);
+  void RecordProperty(const TestProperty& test_property);
 
-  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
 
   // Matches the full name of each test against the user-specified
   // filter to decide whether the test should run, then records the
@@ -762,19 +770,19 @@ class GTEST_API_ UnitTestImpl {
   // Prints the names of the tests matching the user-specified filter flag.
   void ListTestsMatchingFilter();
 
-  const TestSuite *current_test_suite() const { return current_test_suite_; }
-  TestInfo *current_test_info() { return current_test_info_; }
-  const TestInfo *current_test_info() const { return current_test_info_; }
+  const TestSuite* current_test_suite() const { return current_test_suite_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
 
   // Returns the vector of environments that need to be set-up/torn-down
   // before/after the tests are run.
-  std::vector<Environment *> &environments() { return environments_; }
+  std::vector<Environment*>& environments() { return environments_; }
 
   // Getters for the per-thread Google Test trace stack.
-  std::vector<TraceInfo> &gtest_trace_stack() {
+  std::vector<TraceInfo>& gtest_trace_stack() {
     return *(gtest_trace_stack_.pointer());
   }
-  const std::vector<TraceInfo> &gtest_trace_stack() const {
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
     return gtest_trace_stack_.get();
   }
 
@@ -786,12 +794,12 @@ class GTEST_API_ UnitTestImpl {
   // flag, or NULL if that flag was not specified.
   // This information is useful only in a death test child process.
   // Must not be called before a call to InitGoogleTest.
-  const InternalRunDeathTestFlag *internal_run_death_test_flag() const {
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
     return internal_run_death_test_flag_.get();
   }
 
   // Returns a pointer to the current death test factory.
-  internal::DeathTestFactory *death_test_factory() {
+  internal::DeathTestFactory* death_test_factory() {
     return death_test_factory_.get();
   }
 
@@ -821,7 +829,7 @@ class GTEST_API_ UnitTestImpl {
   int random_seed() const { return random_seed_; }
 
   // Gets the random number generator.
-  internal::Random *random() { return &random_; }
+  internal::Random* random() { return &random_; }
 
   // Shuffles all test suites, and the tests within each test suite,
   // making sure that death tests are still run first.
@@ -842,7 +850,7 @@ class GTEST_API_ UnitTestImpl {
   void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
 
   // The UnitTest object that owns this implementation object.
-  UnitTest *const parent_;
+  UnitTest* const parent_;
 
   // The working directory when the first TEST() or TEST_F() was
   // executed.
@@ -854,22 +862,22 @@ class GTEST_API_ UnitTestImpl {
       default_per_thread_test_part_result_reporter_;
 
   // Points to (but doesn't own) the global test part result reporter.
-  TestPartResultReporterInterface *global_test_part_result_repoter_;
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
 
   // Protects read and write access to global_test_part_result_reporter_.
   internal::Mutex global_test_part_result_reporter_mutex_;
 
   // Points to (but doesn't own) the per-thread test part result reporter.
-  internal::ThreadLocal<TestPartResultReporterInterface *>
+  internal::ThreadLocal<TestPartResultReporterInterface*>
       per_thread_test_part_result_reporter_;
 
   // The vector of environments that need to be set-up/torn-down
   // before/after the tests are run.
-  std::vector<Environment *> environments_;
+  std::vector<Environment*> environments_;
 
   // The vector of TestSuites in their original order.  It owns the
   // elements in the vector.
-  std::vector<TestSuite *> test_suites_;
+  std::vector<TestSuite*> test_suites_;
 
   // Provides a level of indirection for the test suite list to allow
   // easy shuffling and restoring the test suite order.  The i-th
@@ -897,13 +905,13 @@ class GTEST_API_ UnitTestImpl {
   // changes as Google Test goes through one test suite after another.
   // When no test is running, this is set to NULL and Google Test
   // stores assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestSuite *current_test_suite_;
+  TestSuite* current_test_suite_;
 
   // This points to the TestInfo for the currently running test.  It
   // changes as Google Test goes through one test after another.  When
   // no test is running, this is set to NULL and Google Test stores
   // assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestInfo *current_test_info_;
+  TestInfo* current_test_info_;
 
   // Normally, a user only writes assertions inside a TEST or TEST_F,
   // or inside a function called by a TEST or TEST_F.  Since Google
@@ -923,7 +931,7 @@ class GTEST_API_ UnitTestImpl {
   // object is destructed.  By default, an OsStackTraceGetter is used,
   // but the user can set this field to use a custom getter if that is
   // desired.
-  OsStackTraceGetterInterface *os_stack_trace_getter_;
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
 
   // True if and only if PostFlagParsingInit() has been called.
   bool post_flag_parse_init_performed_;
@@ -960,7 +968,7 @@ class GTEST_API_ UnitTestImpl {
 
 // Convenience function for accessing the global UnitTest
 // implementation object.
-inline UnitTestImpl *GetUnitTestImpl() {
+inline UnitTestImpl* GetUnitTestImpl() {
   return UnitTest::GetInstance()->impl();
 }
 
@@ -968,7 +976,7 @@ inline UnitTestImpl *GetUnitTestImpl() {
 
 // Internal helper functions for implementing the simple regular
 // expression matcher.
-GTEST_API_ bool IsInSet(char ch, const char *str);
+GTEST_API_ bool IsInSet(char ch, const char* str);
 GTEST_API_ bool IsAsciiDigit(char ch);
 GTEST_API_ bool IsAsciiPunct(char ch);
 GTEST_API_ bool IsRepeat(char ch);
@@ -976,19 +984,18 @@ GTEST_API_ bool IsAsciiWhiteSpace(char ch);
 GTEST_API_ bool IsAsciiWordChar(char ch);
 GTEST_API_ bool IsValidEscape(char ch);
 GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
-GTEST_API_ bool ValidateRegex(const char *regex);
-GTEST_API_ bool MatchRegexAtHead(const char *regex, const char *str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
-                                              char repeat, const char *regex,
-                                              const char *str);
-GTEST_API_ bool MatchRegexAnywhere(const char *regex, const char *str);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
 
 #endif  // GTEST_USES_SIMPLE_RE
 
 // Parses the command line for Google Test flags, without initializing
 // other parts of Google Test.
-GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, char **argv);
-GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
 
 #if GTEST_HAS_DEATH_TEST
 
@@ -1001,7 +1008,7 @@ GTEST_API_ std::string GetLastErrnoDescription();
 // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
 // it here.
 template <typename Integer>
-bool ParseNaturalNumber(const ::std::string &str, Integer *number) {
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
   // Fail fast if the given string does not begin with a digit;
   // this bypasses strtoXXX's "optional leading whitespace and plus
   // or minus sign" semantics, which are undesirable here.
@@ -1010,7 +1017,7 @@ bool ParseNaturalNumber(const ::std::string &str, Integer *number) {
   }
   errno = 0;
 
-  char *end;
+  char* end;
   // BiggestConvertible is the largest integer type that system-provided
   // string-to-number conversion routines can return.
   using BiggestConvertible = unsigned long long;  // NOLINT
@@ -1037,18 +1044,18 @@ bool ParseNaturalNumber(const ::std::string &str, Integer *number) {
 // constructs. Do not use it in user tests, either directly or indirectly.
 class TestResultAccessor {
  public:
-  static void RecordProperty(TestResult *test_result,
-                             const std::string &xml_element,
-                             const TestProperty &property) {
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
     test_result->RecordProperty(xml_element, property);
   }
 
-  static void ClearTestPartResults(TestResult *test_result) {
+  static void ClearTestPartResults(TestResult* test_result) {
     test_result->ClearTestPartResults();
   }
 
-  static const std::vector<testing::TestPartResult> &test_part_results(
-      const TestResult &test_result) {
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
     return test_result.test_part_results();
   }
 };
@@ -1064,36 +1071,38 @@ class StreamingListener : public EmptyTestEventListener {
     virtual ~AbstractSocketWriter() {}
 
     // Sends a string to the socket.
-    virtual void Send(const std::string &message) = 0;
+    virtual void Send(const std::string& message) = 0;
 
     // Closes the socket.
     virtual void CloseConnection() {}
 
     // Sends a string and a newline to the socket.
-    void SendLn(const std::string &message) { Send(message + "\n"); }
+    void SendLn(const std::string& message) { Send(message + "\n"); }
   };
 
   // Concrete class for actually writing strings to a socket.
   class SocketWriter : public AbstractSocketWriter {
    public:
-    SocketWriter(const std::string &host, const std::string &port)
+    SocketWriter(const std::string& host, const std::string& port)
         : sockfd_(-1), host_name_(host), port_num_(port) {
       MakeConnection();
     }
 
     ~SocketWriter() override {
-      if (sockfd_ != -1) CloseConnection();
+      if (sockfd_ != -1)
+        CloseConnection();
     }
 
     // Sends a string to the socket.
-    void Send(const std::string &message) override {
+    void Send(const std::string& message) override {
       GTEST_CHECK_(sockfd_ != -1)
           << "Send() can be called only when there is a connection.";
 
       const auto len = static_cast<size_t>(message.length());
       if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
-        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
-                            << host_name_ << ":" << port_num_;
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
       }
     }
 
@@ -1118,23 +1127,21 @@ class StreamingListener : public EmptyTestEventListener {
   };  // class SocketWriter
 
   // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
-  static std::string UrlEncode(const char *str);
+  static std::string UrlEncode(const char* str);
 
-  StreamingListener(const std::string &host, const std::string &port)
+  StreamingListener(const std::string& host, const std::string& port)
       : socket_writer_(new SocketWriter(host, port)) {
     Start();
   }
 
-  explicit StreamingListener(AbstractSocketWriter *socket_writer)
-      : socket_writer_(socket_writer) {
-    Start();
-  }
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
 
-  void OnTestProgramStart(const UnitTest & /* unit_test */) override {
+  void OnTestProgramStart(const UnitTest& /* unit_test */) override {
     SendLn("event=TestProgramStart");
   }
 
-  void OnTestProgramEnd(const UnitTest &unit_test) override {
+  void OnTestProgramEnd(const UnitTest& unit_test) override {
     // Note that Google Test current only report elapsed time for each
     // test iteration, not for the entire test program.
     SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
@@ -1143,45 +1150,46 @@ class StreamingListener : public EmptyTestEventListener {
     socket_writer_->CloseConnection();
   }
 
-  void OnTestIterationStart(const UnitTest & /* unit_test */,
+  void OnTestIterationStart(const UnitTest& /* unit_test */,
                             int iteration) override {
     SendLn("event=TestIterationStart&iteration=" +
            StreamableToString(iteration));
   }
 
-  void OnTestIterationEnd(const UnitTest &unit_test,
+  void OnTestIterationEnd(const UnitTest& unit_test,
                           int /* iteration */) override {
-    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
-           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
-           "ms");
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
   }
 
   // Note that "event=TestCaseStart" is a wire format and has to remain
-  // "case" for compatibilty
-  void OnTestCaseStart(const TestCase &test_case) override {
+  // "case" for compatibility
+  void OnTestCaseStart(const TestCase& test_case) override {
     SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
   }
 
   // Note that "event=TestCaseEnd" is a wire format and has to remain
-  // "case" for compatibilty
-  void OnTestCaseEnd(const TestCase &test_case) override {
+  // "case" for compatibility
+  void OnTestCaseEnd(const TestCase& test_case) override {
     SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
            "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
            "ms");
   }
 
-  void OnTestStart(const TestInfo &test_info) override {
+  void OnTestStart(const TestInfo& test_info) override {
     SendLn(std::string("event=TestStart&name=") + test_info.name());
   }
 
-  void OnTestEnd(const TestInfo &test_info) override {
+  void OnTestEnd(const TestInfo& test_info) override {
     SendLn("event=TestEnd&passed=" +
-           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
            StreamableToString((test_info.result())->elapsed_time()) + "ms");
   }
 
-  void OnTestPartResult(const TestPartResult &test_part_result) override {
-    const char *file_name = test_part_result.file_name();
+  void OnTestPartResult(const TestPartResult& test_part_result) override {
+    const char* file_name = test_part_result.file_name();
     if (file_name == nullptr) file_name = "";
     SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
            "&line=" + StreamableToString(test_part_result.line_number()) +
@@ -1190,7 +1198,7 @@ class StreamingListener : public EmptyTestEventListener {
 
  private:
   // Sends the given message and a newline to the socket.
-  void SendLn(const std::string &message) { socket_writer_->SendLn(message); }
+  void SendLn(const std::string& message) { socket_writer_->SendLn(message); }
 
   // Called at the start of streaming to notify the receiver what
   // protocol we are using.
@@ -1210,4 +1218,4 @@ class StreamingListener : public EmptyTestEventListener {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#endif  // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
index 27aaa2b7c5..65104ebab1 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -42,48 +42,48 @@ namespace testing {
 
 // Constructs a matcher that matches a const std::string& whose value is
 // equal to s.
-Matcher<const std::string &>::Matcher(const std::string &s) { *this = Eq(s); }
+Matcher<const std::string&>::Matcher(const std::string& s) { *this = Eq(s); }
 
 // Constructs a matcher that matches a const std::string& whose value is
 // equal to s.
-Matcher<const std::string &>::Matcher(const char *s) {
+Matcher<const std::string&>::Matcher(const char* s) {
   *this = Eq(std::string(s));
 }
 
 // Constructs a matcher that matches a std::string whose value is equal to
 // s.
-Matcher<std::string>::Matcher(const std::string &s) { *this = Eq(s); }
+Matcher<std::string>::Matcher(const std::string& s) { *this = Eq(s); }
 
 // Constructs a matcher that matches a std::string whose value is equal to
 // s.
-Matcher<std::string>::Matcher(const char *s) { *this = Eq(std::string(s)); }
+Matcher<std::string>::Matcher(const char* s) { *this = Eq(std::string(s)); }
 
 #if GTEST_INTERNAL_HAS_STRING_VIEW
 // Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const internal::StringView &>::Matcher(const std::string &s) {
+Matcher<const internal::StringView&>::Matcher(const std::string& s) {
   *this = Eq(s);
 }
 
 // Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const internal::StringView &>::Matcher(const char *s) {
+Matcher<const internal::StringView&>::Matcher(const char* s) {
   *this = Eq(std::string(s));
 }
 
 // Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const internal::StringView &>::Matcher(internal::StringView s) {
+Matcher<const internal::StringView&>::Matcher(internal::StringView s) {
   *this = Eq(std::string(s));
 }
 
 // Constructs a matcher that matches a StringView whose value is equal to
 // s.
-Matcher<internal::StringView>::Matcher(const std::string &s) { *this = Eq(s); }
+Matcher<internal::StringView>::Matcher(const std::string& s) { *this = Eq(s); }
 
 // Constructs a matcher that matches a StringView whose value is equal to
 // s.
-Matcher<internal::StringView>::Matcher(const char *s) {
+Matcher<internal::StringView>::Matcher(const char* s) {
   *this = Eq(std::string(s));
 }
 
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
index adfdbef9c6..53a4d37f97 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -27,6 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 #include "gtest/internal/gtest-port.h"
 
 #include <limits.h>
@@ -38,45 +39,45 @@
 #include <memory>
 
 #if GTEST_OS_WINDOWS
-#include <windows.h>
-#include <io.h>
-#include <sys/stat.h>
-#include <map>  // Used in ThreadLocal.
-#ifdef _MSC_VER
-#include <crtdbg.h>
-#endif  // _MSC_VER
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map>  // Used in ThreadLocal.
+# ifdef _MSC_VER
+#  include <crtdbg.h>
+# endif  // _MSC_VER
 #else
-#include <unistd.h>
+# include <unistd.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_MAC
-#include <mach/mach_init.h>
-#include <mach/task.h>
-#include <mach/vm_map.h>
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
 #endif  // GTEST_OS_MAC
 
 #if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
     GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-#include <sys/sysctl.h>
-#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-#include <sys/user.h>
-#endif
+# include <sys/sysctl.h>
+# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#  include <sys/user.h>
+# endif
 #endif
 
 #if GTEST_OS_QNX
-#include <devctl.h>
-#include <fcntl.h>
-#include <sys/procfs.h>
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
 #endif  // GTEST_OS_QNX
 
 #if GTEST_OS_AIX
-#include <procinfo.h>
-#include <sys/types.h>
+# include <procinfo.h>
+# include <sys/types.h>
 #endif  // GTEST_OS_AIX
 
 #if GTEST_OS_FUCHSIA
-#include <zircon/process.h>
-#include <zircon/syscalls.h>
+# include <zircon/process.h>
+# include <zircon/syscalls.h>
 #endif  // GTEST_OS_FUCHSIA
 
 #include "gtest/gtest-spi.h"
@@ -101,7 +102,7 @@ const int kStdErrFileno = STDERR_FILENO;
 
 namespace {
 template <typename T>
-T ReadProcFileField(const std::string &filename, int field) {
+T ReadProcFileField(const std::string& filename, int field) {
   std::string dummy;
   std::ifstream file(filename.c_str());
   while (field-- > 0) {
@@ -130,7 +131,8 @@ size_t GetThreadCount() {
   if (status == KERN_SUCCESS) {
     // task_threads allocates resources in thread_list and we need to free them
     // to avoid leaks.
-    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
                   sizeof(thread_t) * thread_count);
     return static_cast<size_t>(thread_count);
   } else {
@@ -139,7 +141,7 @@ size_t GetThreadCount() {
 }
 
 #elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
-    GTEST_OS_NETBSD
+      GTEST_OS_NETBSD
 
 #if GTEST_OS_NETBSD
 #undef KERN_PROC
@@ -196,7 +198,8 @@ size_t GetThreadCount() {
   if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
     return 0;
   }
-  mib[5] = size / mib[4];
+
+  mib[5] = static_cast<int>(size / static_cast<size_t>(mib[4]));
 
   // populate array of structs
   struct kinfo_proc info[mib[5]];
@@ -205,9 +208,10 @@ size_t GetThreadCount() {
   }
 
   // exclude empty members
-  int nthreads = 0;
-  for (int i = 0; i < size / mib[4]; i++) {
-    if (info[i].p_tid != -1) nthreads++;
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
+    if (info[i].p_tid != -1)
+      nthreads++;
   }
   return nthreads;
 }
@@ -250,9 +254,13 @@ size_t GetThreadCount() {
 size_t GetThreadCount() {
   int dummy_buffer;
   size_t avail;
-  zx_status_t status =
-      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
-                         &dummy_buffer, 0, nullptr, &avail);
+  zx_status_t status = zx_object_get_info(
+      zx_process_self(),
+      ZX_INFO_PROCESS_THREADS,
+      &dummy_buffer,
+      0,
+      nullptr,
+      &avail);
   if (status == ZX_OK) {
     return avail;
   } else {
@@ -272,17 +280,27 @@ size_t GetThreadCount() {
 
 #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
 
-void SleepMilliseconds(int n) { ::Sleep(static_cast<DWORD>(n)); }
+void SleepMilliseconds(int n) {
+  ::Sleep(static_cast<DWORD>(n));
+}
 
-AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle()
+    : handle_(INVALID_HANDLE_VALUE) {}
 
-AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
+AutoHandle::AutoHandle(Handle handle)
+    : handle_(handle) {}
 
-AutoHandle::~AutoHandle() { Reset(); }
+AutoHandle::~AutoHandle() {
+  Reset();
+}
 
-AutoHandle::Handle AutoHandle::Get() const { return handle_; }
+AutoHandle::Handle AutoHandle::Get() const {
+  return handle_;
+}
 
-void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
+void AutoHandle::Reset() {
+  Reset(INVALID_HANDLE_VALUE);
+}
 
 void AutoHandle::Reset(HANDLE handle) {
   // Resetting with the same handle we already own is invalid.
@@ -294,7 +312,7 @@ void AutoHandle::Reset(HANDLE handle) {
   } else {
     GTEST_CHECK_(!IsCloseable())
         << "Resetting a valid handle to itself is likely a programmer error "
-           "and thus not allowed.";
+            "and thus not allowed.";
   }
 }
 
@@ -312,14 +330,19 @@ Notification::Notification()
   GTEST_CHECK_(event_.Get() != nullptr);
 }
 
-void Notification::Notify() { GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); }
+void Notification::Notify() {
+  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
+}
 
 void Notification::WaitForNotification() {
-  GTEST_CHECK_(::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+  GTEST_CHECK_(
+      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
 }
 
 Mutex::Mutex()
-    : owner_thread_id_(0), type_(kDynamic), critical_section_init_phase_(0),
+    : owner_thread_id_(0),
+      type_(kDynamic),
+      critical_section_init_phase_(0),
       critical_section_(new CRITICAL_SECTION) {
   ::InitializeCriticalSection(critical_section_);
 }
@@ -368,7 +391,8 @@ namespace {
 //    MemoryIsNotDeallocated memory_is_not_deallocated;
 //    critical_section_ = new CRITICAL_SECTION;
 //
-class MemoryIsNotDeallocated {
+class MemoryIsNotDeallocated
+{
  public:
   MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
     old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
@@ -411,13 +435,15 @@ void Mutex::ThreadSafeLazyInit() {
         ::InitializeCriticalSection(critical_section_);
         // Updates the critical_section_init_phase_ to 2 to signal
         // initialization complete.
-        GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
-                                                  2L, 1L) == 1L);
+        GTEST_CHECK_(::InterlockedCompareExchange(
+                          &critical_section_init_phase_, 2L, 1L) ==
+                      1L);
         break;
       case 1:
         // Somebody else is already initializing the mutex; spin until they
         // are done.
-        while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
+        while (::InterlockedCompareExchange(&critical_section_init_phase_,
+                                            2L,
                                             2L) != 2L) {
           // Possibly yields the rest of the thread's time slice to other
           // threads.
@@ -425,7 +451,8 @@ void Mutex::ThreadSafeLazyInit() {
         }
         break;
 
-      case 2: break;  // The mutex is already initialized and ready for use.
+      case 2:
+        break;  // The mutex is already initialized and ready for use.
 
       default:
         GTEST_CHECK_(false)
@@ -439,9 +466,9 @@ namespace {
 
 class ThreadWithParamSupport : public ThreadWithParamBase {
  public:
-  static HANDLE CreateThread(Runnable *runnable,
-                             Notification *thread_can_start) {
-    ThreadMainParam *param = new ThreadMainParam(runnable, thread_can_start);
+  static HANDLE CreateThread(Runnable* runnable,
+                             Notification* thread_can_start) {
+    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
     DWORD thread_id;
     HANDLE thread_handle = ::CreateThread(
         nullptr,  // Default security.
@@ -460,16 +487,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
 
  private:
   struct ThreadMainParam {
-    ThreadMainParam(Runnable *runnable, Notification *thread_can_start)
-        : runnable_(runnable), thread_can_start_(thread_can_start) {}
+    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+        : runnable_(runnable),
+          thread_can_start_(thread_can_start) {
+    }
     std::unique_ptr<Runnable> runnable_;
     // Does not own.
-    Notification *thread_can_start_;
+    Notification* thread_can_start_;
   };
 
-  static DWORD WINAPI ThreadMain(void *ptr) {
+  static DWORD WINAPI ThreadMain(void* ptr) {
     // Transfers ownership.
-    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam *>(ptr));
+    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
     if (param->thread_can_start_ != nullptr)
       param->thread_can_start_->WaitForNotification();
     param->runnable_->Run();
@@ -485,11 +514,14 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
 }  // namespace
 
 ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
-                                         Notification *thread_can_start)
-    : thread_(
-          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
+                                         Notification* thread_can_start)
+      : thread_(ThreadWithParamSupport::CreateThread(runnable,
+                                                     thread_can_start)) {
+}
 
-ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
+ThreadWithParamBase::~ThreadWithParamBase() {
+  Join();
+}
 
 void ThreadWithParamBase::Join() {
   GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -504,25 +536,23 @@ class ThreadLocalRegistryImpl {
  public:
   // Registers thread_local_instance as having value on the current thread.
   // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
-      const ThreadLocalBase *thread_local_instance) {
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
 #ifdef _MSC_VER
     MemoryIsNotDeallocated memory_is_not_deallocated;
 #endif  // _MSC_VER
     DWORD current_thread = ::GetCurrentThreadId();
     MutexLock lock(&mutex_);
-    ThreadIdToThreadLocals *const thread_to_thread_locals =
+    ThreadIdToThreadLocals* const thread_to_thread_locals =
         GetThreadLocalsMapLocked();
     ThreadIdToThreadLocals::iterator thread_local_pos =
         thread_to_thread_locals->find(current_thread);
     if (thread_local_pos == thread_to_thread_locals->end()) {
-      thread_local_pos =
-          thread_to_thread_locals
-              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
-              .first;
+      thread_local_pos = thread_to_thread_locals->insert(
+          std::make_pair(current_thread, ThreadLocalValues())).first;
       StartWatcherThreadFor(current_thread);
     }
-    ThreadLocalValues &thread_local_values = thread_local_pos->second;
+    ThreadLocalValues& thread_local_values = thread_local_pos->second;
     ThreadLocalValues::iterator value_pos =
         thread_local_values.find(thread_local_instance);
     if (value_pos == thread_local_values.end()) {
@@ -538,18 +568,19 @@ class ThreadLocalRegistryImpl {
   }
 
   static void OnThreadLocalDestroyed(
-      const ThreadLocalBase *thread_local_instance) {
+      const ThreadLocalBase* thread_local_instance) {
     std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
     // Clean up the ThreadLocalValues data structure while holding the lock, but
     // defer the destruction of the ThreadLocalValueHolderBases.
     {
       MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals *const thread_to_thread_locals =
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
           GetThreadLocalsMapLocked();
       for (ThreadIdToThreadLocals::iterator it =
-               thread_to_thread_locals->begin();
-           it != thread_to_thread_locals->end(); ++it) {
-        ThreadLocalValues &thread_local_values = it->second;
+          thread_to_thread_locals->begin();
+          it != thread_to_thread_locals->end();
+          ++it) {
+        ThreadLocalValues& thread_local_values = it->second;
         ThreadLocalValues::iterator value_pos =
             thread_local_values.find(thread_local_instance);
         if (value_pos != thread_local_values.end()) {
@@ -571,15 +602,16 @@ class ThreadLocalRegistryImpl {
     // lock, but defer the destruction of the ThreadLocalValueHolderBases.
     {
       MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals *const thread_to_thread_locals =
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
           GetThreadLocalsMapLocked();
       ThreadIdToThreadLocals::iterator thread_local_pos =
           thread_to_thread_locals->find(thread_id);
       if (thread_local_pos != thread_to_thread_locals->end()) {
-        ThreadLocalValues &thread_local_values = thread_local_pos->second;
+        ThreadLocalValues& thread_local_values = thread_local_pos->second;
         for (ThreadLocalValues::iterator value_pos =
-                 thread_local_values.begin();
-             value_pos != thread_local_values.end(); ++value_pos) {
+            thread_local_values.begin();
+            value_pos != thread_local_values.end();
+            ++value_pos) {
           value_holders.push_back(value_pos->second);
         }
         thread_to_thread_locals->erase(thread_local_pos);
@@ -591,7 +623,7 @@ class ThreadLocalRegistryImpl {
 
  private:
   // In a particular thread, maps a ThreadLocal object to its value.
-  typedef std::map<const ThreadLocalBase *,
+  typedef std::map<const ThreadLocalBase*,
                    std::shared_ptr<ThreadLocalValueHolderBase> >
       ThreadLocalValues;
   // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
@@ -605,8 +637,9 @@ class ThreadLocalRegistryImpl {
   static void StartWatcherThreadFor(DWORD thread_id) {
     // The returned handle will be kept in thread_map and closed by
     // watcher_thread in WatcherThreadFunc.
-    HANDLE thread =
-        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
+                                 FALSE,
+                                 thread_id);
     GTEST_CHECK_(thread != nullptr);
     // We need to pass a valid thread ID pointer into CreateThread for it
     // to work correctly under Win98.
@@ -629,9 +662,10 @@ class ThreadLocalRegistryImpl {
   // Monitors exit from a given thread and notifies those
   // ThreadIdToThreadLocals about thread termination.
   static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
-    const ThreadIdAndHandle *tah =
-        reinterpret_cast<const ThreadIdAndHandle *>(param);
-    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    const ThreadIdAndHandle* tah =
+        reinterpret_cast<const ThreadIdAndHandle*>(param);
+    GTEST_CHECK_(
+        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
     OnThreadExit(tah->first);
     ::CloseHandle(tah->second);
     delete tah;
@@ -639,12 +673,12 @@ class ThreadLocalRegistryImpl {
   }
 
   // Returns map of thread local instances.
-  static ThreadIdToThreadLocals *GetThreadLocalsMapLocked() {
+  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
     mutex_.AssertHeld();
 #ifdef _MSC_VER
     MemoryIsNotDeallocated memory_is_not_deallocated;
 #endif  // _MSC_VER
-    static ThreadIdToThreadLocals *map = new ThreadIdToThreadLocals();
+    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals();
     return map;
   }
 
@@ -654,17 +688,17 @@ class ThreadLocalRegistryImpl {
   static Mutex thread_map_mutex_;
 };
 
-Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);  // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);  // NOLINT
 
-ThreadLocalValueHolderBase *ThreadLocalRegistry::GetValueOnCurrentThread(
-    const ThreadLocalBase *thread_local_instance) {
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
   return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
       thread_local_instance);
 }
 
 void ThreadLocalRegistry::OnThreadLocalDestroyed(
-    const ThreadLocalBase *thread_local_instance) {
+      const ThreadLocalBase* thread_local_instance) {
   ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
 }
 
@@ -683,11 +717,11 @@ RE::~RE() {
     regfree(&partial_regex_);
     regfree(&full_regex_);
   }
-  free(const_cast<char *>(pattern_));
+  free(const_cast<char*>(pattern_));
 }
 
 // Returns true if and only if regular expression re matches the entire str.
-bool RE::FullMatch(const char *str, const RE &re) {
+bool RE::FullMatch(const char* str, const RE& re) {
   if (!re.is_valid_) return false;
 
   regmatch_t match;
@@ -696,7 +730,7 @@ bool RE::FullMatch(const char *str, const RE &re) {
 
 // Returns true if and only if regular expression re matches a substring of
 // str (including str itself).
-bool RE::PartialMatch(const char *str, const RE &re) {
+bool RE::PartialMatch(const char* str, const RE& re) {
   if (!re.is_valid_) return false;
 
   regmatch_t match;
@@ -704,13 +738,13 @@ bool RE::PartialMatch(const char *str, const RE &re) {
 }
 
 // Initializes an RE from its string representation.
-void RE::Init(const char *regex) {
+void RE::Init(const char* regex) {
   pattern_ = posix::StrDup(regex);
 
   // Reserves enough bytes to hold the regular expression used for a
   // full match.
   const size_t full_regex_len = strlen(regex) + 10;
-  char *const full_pattern = new char[full_regex_len];
+  char* const full_pattern = new char[full_regex_len];
 
   snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
   is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
@@ -723,7 +757,7 @@ void RE::Init(const char *regex) {
   // versions of Cygwin) doesn't accept the empty string as a valid
   // regex.  We change it to an equivalent form "()" to be safe.
   if (is_valid_) {
-    const char *const partial_regex = (*regex == '\0') ? "()" : regex;
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
     is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
   }
   EXPECT_TRUE(is_valid_)
@@ -737,7 +771,7 @@ void RE::Init(const char *regex) {
 
 // Returns true if and only if ch appears anywhere in str (excluding the
 // terminating '\0' character).
-bool IsInSet(char ch, const char *str) {
+bool IsInSet(char ch, const char* str) {
   return ch != '\0' && strchr(str, ch) != nullptr;
 }
 
@@ -752,7 +786,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
 bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
 bool IsAsciiWordChar(char ch) {
   return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-         ('0' <= ch && ch <= '9') || ch == '_';
+      ('0' <= ch && ch <= '9') || ch == '_';
 }
 
 // Returns true if and only if "\\c" is a supported escape sequence.
@@ -784,15 +818,14 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
 }
 
 // Helper function used by ValidateRegex() to format error messages.
-static std::string FormatRegexSyntaxError(const char *regex, int index) {
+static std::string FormatRegexSyntaxError(const char* regex, int index) {
   return (Message() << "Syntax error at index " << index
-                    << " in simple regular expression \"" << regex << "\": ")
-      .GetString();
+          << " in simple regular expression \"" << regex << "\": ").GetString();
 }
 
 // Generates non-fatal failures and returns false if regex is invalid;
 // otherwise returns true.
-bool ValidateRegex(const char *regex) {
+bool ValidateRegex(const char* regex) {
   if (regex == nullptr) {
     ADD_FAILURE() << "NULL is not a valid simple regular expression.";
     return false;
@@ -829,12 +862,12 @@ bool ValidateRegex(const char *regex) {
                       << "'$' can only appear at the end.";
         is_valid = false;
       } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
-                      << "' is unsupported.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
         is_valid = false;
       } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
-                      << "' can only follow a repeatable token.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
         is_valid = false;
       }
 
@@ -852,10 +885,12 @@ bool ValidateRegex(const char *regex) {
 // characters to be indexable by size_t, in which case the test will
 // probably time out anyway.  We are fine with this limitation as
 // std::string has it too.
-bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
-                                   const char *regex, const char *str) {
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
   const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
   // We cannot call numeric_limits::max() as it conflicts with the
   // max() macro on Windows.
 
@@ -868,7 +903,8 @@ bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
       // greedy match.
       return true;
     }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
   }
   return false;
 }
@@ -876,29 +912,31 @@ bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
 // Returns true if and only if regex matches a prefix of str. regex must
 // be a valid simple regular expression and not start with "^", or the
 // result is undefined.
-bool MatchRegexAtHead(const char *regex, const char *str) {
+bool MatchRegexAtHead(const char* regex, const char* str) {
   if (*regex == '\0')  // An empty regex matches a prefix of anything.
     return true;
 
   // "$" only matches the end of a string.  Note that regex being
   // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$') return *str == '\0';
+  if (*regex == '$')
+    return *str == '\0';
 
   // Is the first thing in regex an escape sequence?
   const bool escaped = *regex == '\\';
-  if (escaped) ++regex;
+  if (escaped)
+    ++regex;
   if (IsRepeat(regex[1])) {
     // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
     // here's an indirect recursion.  It terminates as the regex gets
     // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
-                                         str);
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
   } else {
     // regex isn't empty, isn't "$", and doesn't start with a
     // repetition.  We match the first atom of regex with the first
     // character of str and recurse.
     return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-           MatchRegexAtHead(regex + 1, str + 1);
+        MatchRegexAtHead(regex + 1, str + 1);
   }
 }
 
@@ -910,14 +948,16 @@ bool MatchRegexAtHead(const char *regex, const char *str) {
 // stack space normally.  In rare cases the time complexity can be
 // exponential with respect to the regex length + the string length,
 // but usually it's must faster (often close to linear).
-bool MatchRegexAnywhere(const char *regex, const char *str) {
+bool MatchRegexAnywhere(const char* regex, const char* str) {
   if (regex == nullptr || str == nullptr) return false;
 
-  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
 
   // A successful match can be anywhere in str.
   do {
-    if (MatchRegexAtHead(regex, str)) return true;
+    if (MatchRegexAtHead(regex, str))
+      return true;
   } while (*str++ != '\0');
   return false;
 }
@@ -925,23 +965,23 @@ bool MatchRegexAnywhere(const char *regex, const char *str) {
 // Implements the RE class.
 
 RE::~RE() {
-  free(const_cast<char *>(pattern_));
-  free(const_cast<char *>(full_pattern_));
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
 }
 
 // Returns true if and only if regular expression re matches the entire str.
-bool RE::FullMatch(const char *str, const RE &re) {
+bool RE::FullMatch(const char* str, const RE& re) {
   return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
 }
 
 // Returns true if and only if regular expression re matches a substring of
 // str (including str itself).
-bool RE::PartialMatch(const char *str, const RE &re) {
+bool RE::PartialMatch(const char* str, const RE& re) {
   return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
 }
 
 // Initializes an RE from its string representation.
-void RE::Init(const char *regex) {
+void RE::Init(const char* regex) {
   pattern_ = full_pattern_ = nullptr;
   if (regex != nullptr) {
     pattern_ = posix::StrDup(regex);
@@ -957,7 +997,7 @@ void RE::Init(const char *regex) {
   // Reserves enough bytes to hold the regular expression used for a
   // full match: we need space to prepend a '^', append a '$', and
   // terminate the string with '\0'.
-  char *buffer = static_cast<char *>(malloc(len + 3));
+  char* buffer = static_cast<char*>(malloc(len + 3));
   full_pattern_ = buffer;
 
   if (*regex != '^')
@@ -980,7 +1020,7 @@ const char kUnknownFile[] = "unknown file";
 
 // Formats a source file path and a line number as they would appear
 // in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char *file, int line) {
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
   const std::string file_name(file == nullptr ? kUnknownFile : file);
 
   if (line < 0) {
@@ -998,8 +1038,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char *file, int line) {
 // FormatFileLocation in order to contrast the two functions.
 // Note that FormatCompilerIndependentFileLocation() does NOT append colon
 // to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
-                                                               int line) {
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
   const std::string file_name(file == nullptr ? kUnknownFile : file);
 
   if (line < 0)
@@ -1008,17 +1048,14 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
     return file_name + ":" + StreamableToString(line);
 }
 
-GTestLog::GTestLog(GTestLogSeverity severity, const char *file, int line)
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
     : severity_(severity) {
-  const char *const marker =
-      severity == GTEST_INFO
-          ? "[  INFO ]"
-          : severity == GTEST_WARNING
-                ? "[WARNING]"
-                : severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl
-              << marker << " " << FormatFileLocation(file, line).c_str()
-              << ": ";
+  const char* const marker =
+      severity == GTEST_INFO ?    "[  INFO ]" :
+      severity == GTEST_WARNING ? "[WARNING]" :
+      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
 }
 
 // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -1041,26 +1078,27 @@ class CapturedStream {
  public:
   // The ctor redirects the stream to a temporary file.
   explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-#if GTEST_OS_WINDOWS
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };   // NOLINT
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
     char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
 
     ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
                                             0,  // Generate unique file name.
                                             temp_file_path);
     GTEST_CHECK_(success != 0)
         << "Unable to create a temporary file in " << temp_dir_path;
     const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1)
-        << "Unable to open temporary file " << temp_file_path;
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
     filename_ = temp_file_path;
-#else
+# else
     // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
-#if GTEST_OS_LINUX_ANDROID
+    // directory, so we create the temporary file in a temporary directory.
+    std::string name_template;
+
+#  if GTEST_OS_LINUX_ANDROID
     // Note: Android applications are expected to call the framework's
     // Context.getExternalStorageDirectory() method through JNI to get
     // the location of the world-writable SD Card directory. However,
@@ -1072,24 +1110,55 @@ class CapturedStream {
     // The location /data/local/tmp is directly accessible from native code.
     // '/sdcard' and other variants cannot be relied on, as they are not
     // guaranteed to be mounted, or may have a delay in mounting.
-    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
-#else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
+    name_template = "/data/local/tmp/";
+#  elif GTEST_OS_IOS
+    char user_temp_dir[PATH_MAX + 1];
+
+    // Documented alternative to NSTemporaryDirectory() (for obtaining creating
+    // a temporary directory) at
+    // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+    //
+    // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+    // documented in the confstr() man page at
+    // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+    // but are still available, according to the WebKit patches at
+    // https://trac.webkit.org/changeset/262004/webkit
+    // https://trac.webkit.org/changeset/263705/webkit
+    //
+    // The confstr() implementation falls back to getenv("TMPDIR"). See
+    // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+    ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+    name_template = user_temp_dir;
+    if (name_template.back() != GTEST_PATH_SEP_[0])
+      name_template.push_back(GTEST_PATH_SEP_[0]);
+#  else
+    name_template = "/tmp/";
+#  endif
+    name_template.append("gtest_captured_stream.XXXXXX");
+
+    // mkstemp() modifies the string bytes in place, and does not go beyond the
+    // string's length. This results in well-defined behavior in C++17.
+    //
+    // The const_cast is needed below C++17. The constraints on std::string
+    // implementations in C++11 and above make assumption behind the const_cast
+    // fairly safe.
+    const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
     if (captured_fd == -1) {
       GTEST_LOG_(WARNING)
           << "Failed to create tmp file " << name_template
           << " for test; does the test have access to the /tmp directory?";
     }
-    filename_ = name_template;
-#endif  // GTEST_OS_WINDOWS
+    filename_ = std::move(name_template);
+# endif  // GTEST_OS_WINDOWS
     fflush(nullptr);
     dup2(captured_fd, fd_);
     close(captured_fd);
   }
 
-  ~CapturedStream() { remove(filename_.c_str()); }
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
 
   std::string GetCapturedString() {
     if (uncaptured_fd_ != -1) {
@@ -1100,7 +1169,7 @@ class CapturedStream {
       uncaptured_fd_ = -1;
     }
 
-    FILE *const file = posix::FOpen(filename_.c_str(), "r");
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
     if (file == nullptr) {
       GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
                         << " for capturing stream.";
@@ -1121,12 +1190,12 @@ class CapturedStream {
 
 GTEST_DISABLE_MSC_DEPRECATED_POP_()
 
-static CapturedStream *g_captured_stderr = nullptr;
-static CapturedStream *g_captured_stdout = nullptr;
+static CapturedStream* g_captured_stderr = nullptr;
+static CapturedStream* g_captured_stdout = nullptr;
 
 // Starts capturing an output stream (stdout/stderr).
-static void CaptureStream(int fd, const char *stream_name,
-                          CapturedStream **stream) {
+static void CaptureStream(int fd, const char* stream_name,
+                          CapturedStream** stream) {
   if (*stream != nullptr) {
     GTEST_LOG_(FATAL) << "Only one " << stream_name
                       << " capturer can exist at a time.";
@@ -1135,7 +1204,7 @@ static void CaptureStream(int fd, const char *stream_name,
 }
 
 // Stops capturing the output stream and returns the captured string.
-static std::string GetCapturedStream(CapturedStream **captured_stream) {
+static std::string GetCapturedStream(CapturedStream** captured_stream) {
   const std::string content = (*captured_stream)->GetCapturedString();
 
   delete *captured_stream;
@@ -1166,14 +1235,18 @@ std::string GetCapturedStderr() {
 
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
-size_t GetFileSize(FILE *file) {
+
+
+
+
+size_t GetFileSize(FILE* file) {
   fseek(file, 0, SEEK_END);
   return static_cast<size_t>(ftell(file));
 }
 
-std::string ReadEntireFile(FILE *file) {
+std::string ReadEntireFile(FILE* file) {
   const size_t file_size = GetFileSize(file);
-  char *const buffer = new char[file_size];
+  char* const buffer = new char[file_size];
 
   size_t bytes_last_read = 0;  // # of bytes read in the last fread()
   size_t bytes_read = 0;       // # of bytes read so far
@@ -1183,8 +1256,7 @@ std::string ReadEntireFile(FILE *file) {
   // Keeps reading the file until we cannot read further or the
   // pre-determined file size is reached.
   do {
-    bytes_last_read =
-        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
+    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
     bytes_read += bytes_last_read;
   } while (bytes_last_read > 0 && bytes_read < file_size);
 
@@ -1195,7 +1267,7 @@ std::string ReadEntireFile(FILE *file) {
 }
 
 #if GTEST_HAS_DEATH_TEST
-static const std::vector<std::string> *g_injected_test_argvs =
+static const std::vector<std::string>* g_injected_test_argvs =
     nullptr;  // Owned.
 
 std::vector<std::string> GetInjectableArgvs() {
@@ -1205,12 +1277,12 @@ std::vector<std::string> GetInjectableArgvs() {
   return GetArgvs();
 }
 
-void SetInjectableArgvs(const std::vector<std::string> *new_argvs) {
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs) {
   if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
   g_injected_test_argvs = new_argvs;
 }
 
-void SetInjectableArgvs(const std::vector<std::string> &new_argvs) {
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs) {
   SetInjectableArgvs(
       new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
 }
@@ -1233,7 +1305,7 @@ void Abort() {
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "GTEST_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char *flag) {
+static std::string FlagToEnvVar(const char* flag) {
   const std::string full_flag =
       (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
 
@@ -1248,9 +1320,9 @@ static std::string FlagToEnvVar(const char *flag) {
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
-bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
+bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
   // Parses the environment variable as a decimal integer.
-  char *end = nullptr;
+  char* end = nullptr;
   const long long_value = strtol(str, &end, 10);  // NOLINT
 
   // Has strtol() consumed all characters in the string?
@@ -1272,7 +1344,7 @@ bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
       // The parsed value overflows as an int32_t.
-  ) {
+      ) {
     Message msg;
     msg << "WARNING: " << src_text
         << " is expected to be a 32-bit integer, but actually"
@@ -1290,12 +1362,12 @@ bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
 // the given flag; if it's not set, returns default_value.
 //
 // The value is considered true if and only if it's not "0".
-bool BoolFromGTestEnv(const char *flag, bool default_value) {
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
 #if defined(GTEST_GET_BOOL_FROM_ENV_)
   return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
 #else
   const std::string env_var = FlagToEnvVar(flag);
-  const char *const string_value = posix::GetEnv(env_var.c_str());
+  const char* const string_value = posix::GetEnv(env_var.c_str());
   return string_value == nullptr ? default_value
                                  : strcmp(string_value, "0") != 0;
 #endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
@@ -1304,20 +1376,20 @@ bool BoolFromGTestEnv(const char *flag, bool default_value) {
 // Reads and returns a 32-bit integer stored in the environment
 // variable corresponding to the given flag; if it isn't set or
 // doesn't represent a valid 32-bit integer, returns default_value.
-int32_t Int32FromGTestEnv(const char *flag, int32_t default_value) {
+int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
 #if defined(GTEST_GET_INT32_FROM_ENV_)
   return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
 #else
   const std::string env_var = FlagToEnvVar(flag);
-  const char *const string_value = posix::GetEnv(env_var.c_str());
+  const char* const string_value = posix::GetEnv(env_var.c_str());
   if (string_value == nullptr) {
     // The environment variable is not set.
     return default_value;
   }
 
   int32_t result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
-                  &result)) {
+  if (!ParseInt32(Message() << "Environment variable " << env_var,
+                  string_value, &result)) {
     printf("The default value %s is used.\n",
            (Message() << default_value).GetString().c_str());
     fflush(stdout);
@@ -1336,9 +1408,9 @@ int32_t Int32FromGTestEnv(const char *flag, int32_t default_value) {
 // not check that the flag is 'output'
 // In essence this checks an env variable called XML_OUTPUT_FILE
 // and if it is set we prepend "xml:" to its value, if it not set we return ""
-std::string OutputFlagAlsoCheckEnvVar() {
+std::string OutputFlagAlsoCheckEnvVar(){
   std::string default_value_for_output_flag = "";
-  const char *xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
   if (nullptr != xml_output_file_env) {
     default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
   }
@@ -1347,12 +1419,12 @@ std::string OutputFlagAlsoCheckEnvVar() {
 
 // Reads and returns the string environment variable corresponding to
 // the given flag; if it's not set, returns default_value.
-const char *StringFromGTestEnv(const char *flag, const char *default_value) {
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
 #if defined(GTEST_GET_STRING_FROM_ENV_)
   return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
 #else
   const std::string env_var = FlagToEnvVar(flag);
-  const char *const value = posix::GetEnv(env_var.c_str());
+  const char* const value = posix::GetEnv(env_var.c_str());
   return value == nullptr ? default_value : value;
 #endif  // defined(GTEST_GET_STRING_FROM_ENV_)
 }
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
index 8399386a99..1b68fcb500 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -27,6 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 // Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
@@ -41,11 +42,16 @@
 // defines Foo.
 
 #include "gtest/gtest-printers.h"
+
 #include <stdio.h>
+
 #include <cctype>
+#include <cstdint>
 #include <cwchar>
 #include <ostream>  // NOLINT
 #include <string>
+#include <type_traits>
+
 #include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
 
@@ -60,8 +66,8 @@ GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-void PrintByteSegmentInObjectTo(const unsigned char *obj_bytes, size_t start,
-                                size_t count, ostream *os) {
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
   char text[5] = "";
   for (size_t i = 0; i != count; i++) {
     const size_t j = start + i;
@@ -79,8 +85,8 @@ void PrintByteSegmentInObjectTo(const unsigned char *obj_bytes, size_t start,
 }
 
 // Prints the bytes in the given value to the given ostream.
-void PrintBytesInObjectToImpl(const unsigned char *obj_bytes, size_t count,
-                              ostream *os) {
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
   // Tells the user how big the object is.
   *os << count << "-byte object <";
 
@@ -95,68 +101,96 @@ void PrintBytesInObjectToImpl(const unsigned char *obj_bytes, size_t count,
     PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
     *os << " ... ";
     // Rounds up to 2-byte boundary.
-    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
+    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
     PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
   }
   *os << ">";
 }
 
+// Helpers for widening a character to char32_t. Since the standard does not
+// specify if char / wchar_t is signed or unsigned, it is important to first
+// convert it to the unsigned type of the same width before widening it to
+// char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  return static_cast<char32_t>(
+      static_cast<typename std::make_unsigned<CharType>::type>(in));
+}
+
 }  // namespace
 
-namespace internal2 {
+namespace internal {
 
 // Delegates to PrintBytesInObjectToImpl() to print the bytes in the
 // given object.  The delegation simplifies the implementation, which
 // uses the << operator and thus is easier done outside of the
 // ::testing::internal namespace, which contains a << operator that
 // sometimes conflicts with the one in STL.
-void PrintBytesInObjectTo(const unsigned char *obj_bytes, size_t count,
-                          ostream *os) {
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
   PrintBytesInObjectToImpl(obj_bytes, count, os);
 }
 
-}  // namespace internal2
-
-namespace internal {
-
 // Depending on the value of a char (or wchar_t), we print it in one
 // of three formats:
 //   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
 //   - as a hexadecimal escape sequence (e.g. '\x7F'), or
 //   - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
 
 // Returns true if c is a printable ASCII character.  We test the
 // value of c directly instead of calling isprint(), which is buggy on
 // Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) { return 0x20 <= c && c <= 0x7E; }
-
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
-static CharFormat PrintAsCharLiteralTo(Char c, ostream *os) {
-  wchar_t w_c = static_cast<wchar_t>(c);
-  switch (w_c) {
-    case L'\0': *os << "\\0"; break;
-    case L'\'': *os << "\\'"; break;
-    case L'\\': *os << "\\\\"; break;
-    case L'\a': *os << "\\a"; break;
-    case L'\b': *os << "\\b"; break;
-    case L'\f': *os << "\\f"; break;
-    case L'\n': *os << "\\n"; break;
-    case L'\r': *os << "\\r"; break;
-    case L'\t': *os << "\\t"; break;
-    case L'\v': *os << "\\v"; break;
+inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; }
+
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
+template <typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  const char32_t u_c = ToChar32(c);
+  switch (u_c) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
     default:
-      if (IsPrintableAscii(w_c)) {
+      if (IsPrintableAscii(u_c)) {
         *os << static_cast<char>(c);
         return kAsIs;
       } else {
         ostream::fmtflags flags = os->flags();
-        *os << "\\x" << std::hex << std::uppercase
-            << static_cast<int>(static_cast<UnsignedChar>(c));
+        *os << "\\x" << std::hex << std::uppercase << static_cast<int>(u_c);
         os->flags(flags);
         return kHexEscape;
       }
@@ -164,38 +198,86 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream *os) {
   return kSpecialEscape;
 }
 
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// Prints a char32_t c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream *os) {
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
   switch (c) {
-    case L'\'': *os << "'"; return kAsIs;
-    case L'"': *os << "\\\""; return kSpecialEscape;
-    default: return PrintAsCharLiteralTo<wchar_t>(c, os);
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo(c, os);
   }
 }
 
+static const char* GetCharWidthPrefix(char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(signed char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(unsigned char) {
+  return "";
+}
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) {
+  return "u8";
+}
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) {
+  return "u";
+}
+
+static const char* GetCharWidthPrefix(char32_t) {
+  return "U";
+}
+
+static const char* GetCharWidthPrefix(wchar_t) {
+  return "L";
+}
+
 // Prints a char c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(char c, ostream *os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
 }
 
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
-void PrintCharAndCodeTo(Char c, ostream *os) {
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code. '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
   // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
   *os << "'";
 
   // To aid user debugging, we also print c's code in decimal, unless
   // it's 0 (in which case c was printed as '\\0', making the code
   // obvious).
-  if (c == 0) return;
+  if (c == 0)
+    return;
   *os << " (" << static_cast<int>(c);
 
   // For more convenience, we print c's code again in hexadecimal,
@@ -209,28 +291,32 @@ void PrintCharAndCodeTo(Char c, ostream *os) {
   *os << ")";
 }
 
-void PrintTo(unsigned char c, ::std::ostream *os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream *os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
 
 // Prints a wchar_t as a symbol if it is printable or as its internal
 // code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream *os) { PrintCharAndCodeTo<wchar_t>(wc, os); }
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {
+  *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
+      << static_cast<uint32_t>(c);
+}
 
 // Prints the given array of characters to the ostream.  CharType must be either
-// char or wchar_t.
+// char, char8_t, char16_t, char32_t, or wchar_t.
 // The array starts at begin, the length is len, it may include '\0' characters
 // and may not be NUL-terminated.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
-        PrintCharsAsStringTo(const CharType *begin, size_t len, ostream *os) {
-  const char *const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static CharFormat PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const quote_prefix = GetCharWidthPrefix(*begin);
+  *os << quote_prefix << "\"";
   bool is_previous_hex = false;
   CharFormat print_format = kAsIs;
   for (size_t index = 0; index < len; ++index) {
@@ -239,7 +325,7 @@ GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
       // Previous character is of '\x..' form and this character can be
       // interpreted as another hexadecimal digit in its number. Break string to
       // disambiguate.
-      *os << "\" " << kQuoteBegin;
+      *os << "\" " << quote_prefix << "\"";
     }
     is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
     // Remember if any characters required hex escaping.
@@ -254,11 +340,12 @@ GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 // Prints a (const) char/wchar_t array of 'len' elements, starting at address
 // 'begin'.  CharType must be either char or wchar_t.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
-        UniversalPrintCharArray(const CharType *begin, size_t len,
-                                ostream *os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
   // The code
   //   const char kFoo[] = "foo";
   // generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -280,26 +367,61 @@ GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 }
 
 // Prints a (const) char array of 'len' elements, starting at address 'begin'.
-void UniversalPrintArray(const char *begin, size_t len, ostream *os) {
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
 // Prints a (const) wchar_t array of 'len' elements, starting at address
 // 'begin'.
-void UniversalPrintArray(const wchar_t *begin, size_t len, ostream *os) {
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
-// Prints the given C string to the ostream.
-void PrintTo(const char *s, ostream *os) {
+namespace {
+
+// Prints a null-terminated C-style string to the ostream.
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
   if (s == nullptr) {
     *os << "NULL";
   } else {
-    *os << ImplicitCast_<const void *>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);
   }
 }
 
+}  // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }
+
 // MSVC compiler can be configured to define whar_t as a typedef
 // of unsigned short. Defining an overload for const wchar_t* in that case
 // would cause pointers to unsigned shorts be printed as wide strings,
@@ -308,38 +430,33 @@ void PrintTo(const char *s, ostream *os) {
 // wchar_t is implemented as a native type.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t *s, ostream *os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void *>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
 #endif  // wchar_t is native
 
 namespace {
 
-bool ContainsUnprintableControlCodes(const char *str, size_t length) {
+bool ContainsUnprintableControlCodes(const char* str, size_t length) {
   const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
 
   for (size_t i = 0; i < length; i++) {
     unsigned char ch = *s++;
     if (std::iscntrl(ch)) {
-      switch (ch) {
+        switch (ch) {
         case '\t':
         case '\n':
-        case '\r': break;
-        default: return true;
+        case '\r':
+          break;
+        default:
+          return true;
+        }
       }
-    }
   }
   return false;
 }
 
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
 
-bool IsValidUTF8(const char *str, size_t length) {
+bool IsValidUTF8(const char* str, size_t length) {
   const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
 
   for (size_t i = 0; i < length;) {
@@ -353,13 +470,15 @@ bool IsValidUTF8(const char *str, size_t length) {
     } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
       ++i;  // 2-byte character
     } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
-               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i]) &&
+               IsUTF8TrailByte(s[i + 1]) &&
                // check for non-shortest form and surrogate
                (lead != 0xe0 || s[i] >= 0xa0) &&
                (lead != 0xed || s[i] < 0xa0)) {
       i += 2;  // 3-byte character
     } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
-               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i]) &&
+               IsUTF8TrailByte(s[i + 1]) &&
                IsUTF8TrailByte(s[i + 2]) &&
                // check for non-shortest form
                (lead != 0xf0 || s[i] >= 0x90) &&
@@ -372,7 +491,7 @@ bool IsValidUTF8(const char *str, size_t length) {
   return true;
 }
 
-void ConditionalPrintAsText(const char *str, size_t length, ostream *os) {
+void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
   if (!ContainsUnprintableControlCodes(str, length) &&
       IsValidUTF8(str, length)) {
     *os << "\n    As Text: \"" << str << "\"";
@@ -381,7 +500,7 @@ void ConditionalPrintAsText(const char *str, size_t length, ostream *os) {
 
 }  // anonymous namespace
 
-void PrintStringTo(const ::std::string &s, ostream *os) {
+void PrintStringTo(const ::std::string& s, ostream* os) {
   if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
     if (GTEST_FLAG(print_utf8)) {
       ConditionalPrintAsText(s.data(), s.size(), os);
@@ -389,8 +508,22 @@ void PrintStringTo(const ::std::string &s, ostream *os) {
   }
 }
 
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
 #if GTEST_HAS_STD_WSTRING
-void PrintWideStringTo(const ::std::wstring &s, ostream *os) {
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
 }
 #endif  // GTEST_HAS_STD_WSTRING
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc
index 44b0e2b3f0..a938683ced 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc
@@ -41,13 +41,13 @@ using internal::GetUnitTestImpl;
 
 // Gets the summary of the failure message by omitting the stack trace
 // in it.
-std::string TestPartResult::ExtractSummary(const char *message) {
-  const char *const stack_trace = strstr(message, internal::kStackTraceMarker);
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
   return stack_trace == nullptr ? message : std::string(message, stack_trace);
 }
 
 // Prints a TestPartResult object.
-std::ostream &operator<<(std::ostream &os, const TestPartResult &result) {
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
   return os << internal::FormatFileLocation(result.file_name(),
                                             result.line_number())
             << " "
@@ -63,12 +63,12 @@ std::ostream &operator<<(std::ostream &os, const TestPartResult &result) {
 }
 
 // Appends a TestPartResult to the array.
-void TestPartResultArray::Append(const TestPartResult &result) {
+void TestPartResultArray::Append(const TestPartResult& result) {
   array_.push_back(result);
 }
 
 // Returns the TestPartResult at the given index (0-based).
-const TestPartResult &TestPartResultArray::GetTestPartResult(int index) const {
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
   if (index < 0 || index >= size()) {
     printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
     internal::posix::Abort();
@@ -86,8 +86,8 @@ namespace internal {
 
 HasNewFatalFailureHelper::HasNewFatalFailureHelper()
     : has_new_fatal_failure_(false),
-      original_reporter_(
-          GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
   GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
 }
 
@@ -97,8 +97,9 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
 }
 
 void HasNewFatalFailureHelper::ReportTestPartResult(
-    const TestPartResult &result) {
-  if (result.fatally_failed()) has_new_fatal_failure_ = true;
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
   original_reporter_->ReportTestPartResult(result);
 }
 
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc
index 04effad17a..c02c3df659 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc
@@ -27,6 +27,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 #include "gtest/gtest-typed-test.h"
 
 #include "gtest/gtest.h"
@@ -34,16 +35,15 @@
 namespace testing {
 namespace internal {
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // Skips to the first non-space char in str. Returns an empty string if str
 // contains only whitespace characters.
-static const char *SkipSpaces(const char *str) {
-  while (IsSpace(*str)) str++;
+static const char* SkipSpaces(const char* str) {
+  while (IsSpace(*str))
+    str++;
   return str;
 }
 
-static std::vector<std::string> SplitIntoTestNames(const char *src) {
+static std::vector<std::string> SplitIntoTestNames(const char* src) {
   std::vector<std::string> name_vec;
   src = SkipSpaces(src);
   for (; src != nullptr; src = SkipComma(src)) {
@@ -55,9 +55,9 @@ static std::vector<std::string> SplitIntoTestNames(const char *src) {
 // Verifies that registered_tests match the test names in
 // registered_tests_; returns registered_tests if successful, or
 // aborts the program otherwise.
-const char *TypedTestSuitePState::VerifyRegisteredTestNames(
-    const char *test_suite_name, const char *file, int line,
-    const char *registered_tests) {
+const char* TypedTestSuitePState::VerifyRegisteredTestNames(
+    const char* test_suite_name, const char* file, int line,
+    const char* registered_tests) {
   RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line));
 
   typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
@@ -70,22 +70,13 @@ const char *TypedTestSuitePState::VerifyRegisteredTestNames(
   std::set<std::string> tests;
   for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
        name_it != name_vec.end(); ++name_it) {
-    const std::string &name = *name_it;
+    const std::string& name = *name_it;
     if (tests.count(name) != 0) {
       errors << "Test " << name << " is listed more than once.\n";
       continue;
     }
 
-    bool found = false;
-    for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end(); ++it) {
-      if (name == it->first) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
+    if (registered_tests_.count(name) != 0) {
       tests.insert(name);
     } else {
       errors << "No test named " << name
@@ -94,13 +85,14 @@ const char *TypedTestSuitePState::VerifyRegisteredTestNames(
   }
 
   for (RegisteredTestIter it = registered_tests_.begin();
-       it != registered_tests_.end(); ++it) {
+       it != registered_tests_.end();
+       ++it) {
     if (tests.count(it->first) == 0) {
       errors << "You forgot to list test " << it->first << ".\n";
     }
   }
 
-  const std::string &errors_str = errors.GetString();
+  const std::string& errors_str = errors.GetString();
   if (errors_str != "") {
     fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
             errors_str.c_str());
@@ -111,7 +103,5 @@ const char *TypedTestSuitePState::VerifyRegisteredTestNames(
   return registered_tests;
 }
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 }  // namespace testing
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest.cc
index 5b4037fecb..21c611aff1 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest.cc
@@ -35,7 +35,6 @@
 #include "gtest/gtest-spi.h"
 
 #include <ctype.h>
-#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,6 +43,8 @@
 #include <wctype.h>
 
 #include <algorithm>
+#include <chrono>  // NOLINT
+#include <cmath>
 #include <cstdint>
 #include <iomanip>
 #include <limits>
@@ -55,78 +56,69 @@
 
 #if GTEST_OS_LINUX
 
-#define GTEST_HAS_GETTIMEOFDAY_ 1
-
-#include <fcntl.h>   // NOLINT
-#include <limits.h>  // NOLINT
-#include <sched.h>   // NOLINT
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
 // Declares vsnprintf().  This header is not available on Windows.
-#include <strings.h>   // NOLINT
-#include <sys/mman.h>  // NOLINT
-#include <sys/time.h>  // NOLINT
-#include <unistd.h>    // NOLINT
-#include <string>
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
 
 #elif GTEST_OS_ZOS
-#define GTEST_HAS_GETTIMEOFDAY_ 1
-#include <sys/time.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
 
 // On z/OS we additionally need strings.h for strcasecmp.
-#include <strings.h>   // NOLINT
+# include <strings.h>  // NOLINT
 
 #elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
 
-#include <windows.h>  // NOLINT
-#undef min
+# include <windows.h>  // NOLINT
+# undef min
 
 #elif GTEST_OS_WINDOWS  // We are on Windows proper.
 
-#include <windows.h>  // NOLINT
-#undef min
+# include <windows.h>  // NOLINT
+# undef min
 
 #ifdef _MSC_VER
-#include <crtdbg.h>    // NOLINT
-#include <debugapi.h>  // NOLINT
+# include <crtdbg.h>  // NOLINT
 #endif
 
-#include <io.h>         // NOLINT
-#include <sys/timeb.h>  // NOLINT
-#include <sys/types.h>  // NOLINT
-#include <sys/stat.h>   // NOLINT
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
 
-#if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-#define GTEST_HAS_GETTIMEOFDAY_ 1
-#include <sys/time.h>  // NOLINT
-#endif                 // GTEST_OS_WINDOWS_MINGW
+# if GTEST_OS_WINDOWS_MINGW
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
 
 #else
 
-// Assume other platforms have gettimeofday().
-#define GTEST_HAS_GETTIMEOFDAY_ 1
-
 // cpplint thinks that the header is already included, so we want to
 // silence it.
-#include <sys/time.h>  // NOLINT
-#include <unistd.h>    // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
 
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-#include <stdexcept>
+# include <stdexcept>
 #endif
 
 #if GTEST_CAN_STREAM_RESULTS_
-#include <arpa/inet.h>   // NOLINT
-#include <netdb.h>       // NOLINT
-#include <sys/socket.h>  // NOLINT
-#include <sys/types.h>   // NOLINT
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+# include <sys/socket.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
 #endif
 
 #include "src/gtest-internal-inl.h"
 
 #if GTEST_OS_WINDOWS
-#define vsnprintf _vsnprintf
+# define vsnprintf _vsnprintf
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_MAC
@@ -186,8 +178,8 @@ const char kStackTraceMarker[] = "\nStack trace:\n";
 bool g_help_flag = false;
 
 // Utilty function to Open File for Writing
-static FILE *OpenFileForWriting(const std::string &output_file) {
-  FILE *fileout = nullptr;
+static FILE* OpenFileForWriting(const std::string& output_file) {
+  FILE* fileout = nullptr;
   FilePath output_file_path(output_file);
   FilePath output_dir(output_file_path.RemoveFileName());
 
@@ -204,8 +196,8 @@ static FILE *OpenFileForWriting(const std::string &output_file) {
 
 // Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
 // environment variable.
-static const char *GetDefaultFilter() {
-  const char *const testbridge_test_only =
+static const char* GetDefaultFilter() {
+  const char* const testbridge_test_only =
       internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
   if (testbridge_test_only != nullptr) {
     return testbridge_test_only;
@@ -213,6 +205,21 @@ static const char *GetDefaultFilter() {
   return kUniversalFilter;
 }
 
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+  const char* const testbridge_test_runner_fail_fast =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+  if (testbridge_test_runner_fail_fast != nullptr) {
+    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
+  }
+  return false;
+}
+
+GTEST_DEFINE_bool_(
+    fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+    "True if and only if a test failure should stop further test execution.");
+
 GTEST_DEFINE_bool_(
     also_run_disabled_tests,
     internal::BoolFromGTestEnv("also_run_disabled_tests", false),
@@ -229,14 +236,16 @@ GTEST_DEFINE_bool_(catch_exceptions,
                    " should catch exceptions and treat them as test failures.");
 
 GTEST_DEFINE_string_(
-    color, internal::StringFromGTestEnv("color", "auto"),
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
     "Whether to use colors in the output.  Valid values: yes, no, "
     "and auto.  'auto' means to use colors if the output is "
     "being sent to a terminal and the TERM environment variable "
     "is set to a terminal type that supports colors.");
 
 GTEST_DEFINE_string_(
-    filter, internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
     "A colon-separated list of glob (not regex) patterns "
     "for filtering the tests to run, optionally followed by a "
     "'-' and a : separated list of negative patterns (tests to "
@@ -246,12 +255,12 @@ GTEST_DEFINE_string_(
 GTEST_DEFINE_bool_(
     install_failure_signal_handler,
     internal::BoolFromGTestEnv("install_failure_signal_handler", false),
-    "If true and supported on the current platform, " GTEST_NAME_
-    " should "
+    "If true and supported on the current platform, " GTEST_NAME_ " should "
     "install a signal handler that dumps debugging information when fatal "
     "signals are raised.");
 
-GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
 
 // The net priority order after flag processing is thus:
 //   --gtest_output command line flag
@@ -261,7 +270,7 @@ GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
 GTEST_DEFINE_string_(
     output,
     internal::StringFromGTestEnv("output",
-                                 internal::OutputFlagAlsoCheckEnvVar().c_str()),
+      internal::OutputFlagAlsoCheckEnvVar().c_str()),
     "A format (defaults to \"xml\" but can be specified to be \"json\"), "
     "optionally followed by a colon and an output file name or directory. "
     "A directory is indicated by a trailing pathname separator. "
@@ -271,6 +280,10 @@ GTEST_DEFINE_string_(
     "executable's name and, if necessary, made unique by adding "
     "digits.");
 
+GTEST_DEFINE_bool_(
+    brief, internal::BoolFromGTestEnv("brief", false),
+    "True if only test failures should be displayed in text output.");
+
 GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
                    "True if and only if " GTEST_NAME_
                    " should display elapsed time in text output.");
@@ -280,12 +293,14 @@ GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
                    " prints UTF8 characters as text.");
 
 GTEST_DEFINE_int32_(
-    random_seed, internal::Int32FromGTestEnv("random_seed", 0),
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
     "Random number seed to use when shuffling test orders.  Must be in range "
     "[1, 99999], or 0 to use a seed based on the current time.");
 
 GTEST_DEFINE_int32_(
-    repeat, internal::Int32FromGTestEnv("repeat", 1),
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
     "How many times to repeat each test.  Specify a negative number "
     "for repeating forever.  Useful for shaking out flaky tests.");
 
@@ -305,20 +320,23 @@ GTEST_DEFINE_int32_(
     "assertion fails.  The valid range is 0 through 100, inclusive.");
 
 GTEST_DEFINE_string_(
-    stream_result_to, internal::StringFromGTestEnv("stream_result_to", ""),
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
     "This flag specifies the host name and the port number on which to stream "
     "test results. Example: \"localhost:555\". The flag is effective only on "
     "Linux.");
 
 GTEST_DEFINE_bool_(
-    throw_on_failure, internal::BoolFromGTestEnv("throw_on_failure", false),
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
     "When this flag is specified, a failed assertion will throw an exception "
     "if exceptions are enabled or exit the program with a non-zero code "
     "otherwise. For use with an external test framework.");
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
 GTEST_DEFINE_string_(
-    flagfile, internal::StringFromGTestEnv("flagfile", ""),
+    flagfile,
+    internal::StringFromGTestEnv("flagfile", ""),
     "This flag specifies the flagfile to read command-line flags from.");
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
 
@@ -330,9 +348,10 @@ namespace internal {
 uint32_t Random::Generate(uint32_t range) {
   // These constants are the same as are used in glibc's rand(3).
   // Use wider types than necessary to prevent unsigned overflow diagnostics.
-  state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
+  state_ = static_cast<uint32_t>(1103515245ULL*state_ + 12345U) % kMaxRange;
 
-  GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
   GTEST_CHECK_(range <= kMaxRange)
       << "Generation of a number in [0, " << range << ") was requested, "
       << "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -351,7 +370,7 @@ static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
 // Iterates over a vector of TestSuites, keeping a running sum of the
 // results of calling a given int-returning method on each.
 // Returns the sum.
-static int SumOverTestSuiteList(const std::vector<TestSuite *> &case_list,
+static int SumOverTestSuiteList(const std::vector<TestSuite*>& case_list,
                                 int (TestSuite::*method)() const) {
   int sum = 0;
   for (size_t i = 0; i < case_list.size(); i++) {
@@ -361,36 +380,42 @@ static int SumOverTestSuiteList(const std::vector<TestSuite *> &case_list,
 }
 
 // Returns true if and only if the test suite passed.
-static bool TestSuitePassed(const TestSuite *test_suite) {
+static bool TestSuitePassed(const TestSuite* test_suite) {
   return test_suite->should_run() && test_suite->Passed();
 }
 
 // Returns true if and only if the test suite failed.
-static bool TestSuiteFailed(const TestSuite *test_suite) {
+static bool TestSuiteFailed(const TestSuite* test_suite) {
   return test_suite->should_run() && test_suite->Failed();
 }
 
 // Returns true if and only if test_suite contains at least one test that
 // should run.
-static bool ShouldRunTestSuite(const TestSuite *test_suite) {
+static bool ShouldRunTestSuite(const TestSuite* test_suite) {
   return test_suite->should_run();
 }
 
 // AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type, const char *file,
-                           int line, const char *message)
-    : data_(new AssertHelperData(type, file, line, message)) {}
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
 
-AssertHelper::~AssertHelper() { delete data_; }
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
 
 // Message assignment, for assertion streaming support.
-void AssertHelper::operator=(const Message &message) const {
-  UnitTest::GetInstance()->AddTestPartResult(
-      data_->type, data_->file, data_->line,
-      AppendUserMessage(data_->message, message),
-      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
-      // Skips the stack frame for this function itself.
-  );  // NOLINT
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
 }
 
 namespace {
@@ -400,15 +425,16 @@ namespace {
 // inserted to report ether an error or a log message.
 //
 // This configuration bit will likely be removed at some point.
-constexpr bool kErrorOnUninstantiatedParameterizedTest = false;
-constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false;
+constexpr bool kErrorOnUninstantiatedParameterizedTest = true;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true;
 
 // A test that fails at a given file/line location with a given message.
 class FailureTest : public Test {
  public:
-  explicit FailureTest(const CodeLocation &loc, std::string error_message,
+  explicit FailureTest(const CodeLocation& loc, std::string error_message,
                        bool as_error)
-      : loc_(loc), error_message_(std::move(error_message)),
+      : loc_(loc),
+        error_message_(std::move(error_message)),
         as_error_(as_error) {}
 
   void TestBody() override {
@@ -426,22 +452,23 @@ class FailureTest : public Test {
   const bool as_error_;
 };
 
+
 }  // namespace
 
-std::set<std::string> *GetIgnoredParameterizedTestSuites() {
+std::set<std::string>* GetIgnoredParameterizedTestSuites() {
   return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
 }
 
 // Add a given test_suit to the list of them allow to go un-instantiated.
-MarkAsIgnored::MarkAsIgnored(const char *test_suite) {
+MarkAsIgnored::MarkAsIgnored(const char* test_suite) {
   GetIgnoredParameterizedTestSuites()->insert(test_suite);
 }
 
 // If this parameterized test suite has no instantiations (and that
 // has not been marked as okay), emit a test case reporting that.
-void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
+void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
                              bool has_test_p) {
-  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
   if (ignored.find(name) != ignored.end()) return;
 
   const char kMissingInstantiation[] =  //
@@ -463,16 +490,15 @@ void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
       "removed but the rest got left behind.";
 
   std::string message =
-      "Paramaterized test suite " + name +
+      "Parameterized test suite " + name +
       (has_test_p ? kMissingInstantiation : kMissingTestCase) +
       "\n\n"
       "To suppress this error for this test suite, insert the following line "
       "(in a non-header) in the namespace it is defined in:"
       "\n\n"
-      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
-      name + ");";
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
 
-  std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">";
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
   RegisterTest(  //
       "GoogleTestVerification", full_name.c_str(),
       nullptr,  // No type parameter.
@@ -483,25 +509,26 @@ void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
       });
 }
 
-void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
                                         CodeLocation code_location) {
   GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
       test_suite_name, code_location);
 }
 
-void RegisterTypeParameterizedTestSuiteInstantiation(const char *case_name) {
-  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
-      case_name);
+void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
+  GetUnitTestImpl()
+      ->type_parameterized_test_registry()
+      .RegisterInstantiation(case_name);
 }
 
 void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
-    const char *test_suite_name, CodeLocation code_location) {
+    const char* test_suite_name, CodeLocation code_location) {
   suites_.emplace(std::string(test_suite_name),
-                  TypeParameterizedTestSuiteInfo(code_location));
+                 TypeParameterizedTestSuiteInfo(code_location));
 }
 
 void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
-    const char *test_suite_name) {
+        const char* test_suite_name) {
   auto it = suites_.find(std::string(test_suite_name));
   if (it != suites_.end()) {
     it->second.instantiated = true;
@@ -512,13 +539,13 @@ void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
 }
 
 void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
-  const auto &ignored = *GetIgnoredParameterizedTestSuites();
-  for (const auto &testcase : suites_) {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto& testcase : suites_) {
     if (testcase.second.instantiated) continue;
     if (ignored.find(testcase.first) != ignored.end()) continue;
 
     std::string message =
-        "Type paramaterized test suite " + testcase.first +
+        "Type parameterized test suite " + testcase.first +
         " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
         "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
         "\n\n"
@@ -528,13 +555,13 @@ void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
         "utilities.)"
         "\n\n"
         "To suppress this error for this test suite, insert the following line "
-        "(in a non-header) in the namespace it is definedin in:"
+        "(in a non-header) in the namespace it is defined in:"
         "\n\n"
         "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
         testcase.first + ");";
 
     std::string full_name =
-        "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">";
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
     RegisterTest(  //
         "GoogleTestVerification", full_name.c_str(),
         nullptr,  // No type parameter.
@@ -554,7 +581,7 @@ static ::std::vector<std::string> g_argvs;
 #if defined(GTEST_CUSTOM_GET_ARGVS_)
   // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
   // ::string. This code converts it to the appropriate type.
-  const auto &custom = GTEST_CUSTOM_GET_ARGVS_();
+  const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
   return ::std::vector<std::string>(custom.begin(), custom.end());
 #else   // defined(GTEST_CUSTOM_GET_ARGVS_)
   return g_argvs;
@@ -579,8 +606,8 @@ FilePath GetCurrentExecutableName() {
 
 // Returns the output format, or "" for normal printed output.
 std::string UnitTestOptions::GetOutputFormat() {
-  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
-  const char *const colon = strchr(gtest_output_flag, ':');
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char* const colon = strchr(gtest_output_flag, ':');
   return (colon == nullptr)
              ? std::string(gtest_output_flag)
              : std::string(gtest_output_flag,
@@ -590,18 +617,19 @@ std::string UnitTestOptions::GetOutputFormat() {
 // Returns the name of the requested output file, or the default if none
 // was explicitly specified.
 std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
 
   std::string format = GetOutputFormat();
-  if (format.empty()) format = std::string(kDefaultOutputFormat);
+  if (format.empty())
+    format = std::string(kDefaultOutputFormat);
 
-  const char *const colon = strchr(gtest_output_flag, ':');
+  const char* const colon = strchr(gtest_output_flag, ':');
   if (colon == nullptr)
     return internal::FilePath::MakeFileName(
-               internal::FilePath(
-                   UnitTest::GetInstance()->original_working_dir()),
-               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
-        .string();
+        internal::FilePath(
+            UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile), 0,
+        format.c_str()).string();
 
   internal::FilePath output_name(colon + 1);
   if (!output_name.IsAbsolutePath())
@@ -609,7 +637,8 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
         internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
         internal::FilePath(colon + 1));
 
-  if (!output_name.IsDirectory()) return output_name.string();
+  if (!output_name.IsDirectory())
+    return output_name.string();
 
   internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
       output_name, internal::GetCurrentExecutableName(),
@@ -617,58 +646,94 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
   return result.string();
 }
 
-// Returns true if and only if the wildcard pattern matches the string.
-// The first ':' or '\0' character in pattern marks the end of it.
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
 //
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-             PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str && PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(const std::string &name,
-                                    const char *filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
+static bool PatternMatchesString(const std::string& name_str,
+                                 const char* pattern, const char* pattern_end) {
+  const char* name = name_str.c_str();
+  const char* const name_begin = name;
+  const char* const name_end = name + name_str.size();
+
+  const char* pattern_next = pattern;
+  const char* name_next = name;
+
+  while (pattern < pattern_end || name < name_end) {
+    if (pattern < pattern_end) {
+      switch (*pattern) {
+        default:  // Match an ordinary character.
+          if (name < name_end && *name == *pattern) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '?':  // Match any single character.
+          if (name < name_end) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '*':
+          // Match zero or more characters. Start by skipping over the wildcard
+          // and matching zero characters from name. If that fails, restart and
+          // match one more character than the last attempt.
+          pattern_next = pattern;
+          name_next = name + 1;
+          ++pattern;
+          continue;
+      }
     }
+    // Failed to match a character. Restart if possible.
+    if (name_begin < name_next && name_next <= name_end) {
+      pattern = pattern_next;
+      name = name_next;
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
 
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  // The filter is a list of patterns separated by colons (:).
+  const char* pattern = filter;
+  while (true) {
+    // Find the bounds of this pattern.
+    const char* const next_sep = strchr(pattern, ':');
+    const char* const pattern_end =
+        next_sep != nullptr ? next_sep : pattern + strlen(pattern);
 
-    // Returns if no more pattern can be found.
-    if (cur_pattern == nullptr) {
-      return false;
+    // Check if this pattern matches name_str.
+    if (PatternMatchesString(name_str, pattern, pattern_end)) {
+      return true;
     }
 
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
+    // Give up on this pattern. However, if we found a pattern separator (:),
+    // advance to the next pattern (skipping over the separator) and restart.
+    if (next_sep == nullptr) {
+      return false;
+    }
+    pattern = next_sep + 1;
   }
+  return true;
 }
 
 // Returns true if and only if the user-specified filter matches the test
 // suite name and the test name.
-bool UnitTestOptions::FilterMatchesTest(const std::string &test_suite_name,
-                                        const std::string &test_name) {
-  const std::string &full_name = test_suite_name + "." + test_name.c_str();
+bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
+                                        const std::string& test_name) {
+  const std::string& full_name = test_suite_name + "." + test_name.c_str();
 
   // Split --gtest_filter at '-', if there is one, to separate into
   // positive filter and negative filter portions
-  const char *const p = GTEST_FLAG(filter).c_str();
-  const char *const dash = strchr(p, '-');
+  const char* const p = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(p, '-');
   std::string positive;
   std::string negative;
   if (dash == nullptr) {
@@ -723,8 +788,9 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
 // Google Test.  The 'result' parameter specifies where to report the
 // results. Intercepts only failures from the current thread.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    TestPartResultArray *result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+      result_(result) {
   Init();
 }
 
@@ -732,13 +798,14 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
 // Google Test.  The 'result' parameter specifies where to report the
 // results.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    InterceptMode intercept_mode, TestPartResultArray *result)
-    : intercept_mode_(intercept_mode), result_(result) {
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode),
+      result_(result) {
   Init();
 }
 
 void ScopedFakeTestPartResultReporter::Init() {
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
     old_reporter_ = impl->GetGlobalTestPartResultReporter();
     impl->SetGlobalTestPartResultReporter(this);
@@ -751,7 +818,7 @@ void ScopedFakeTestPartResultReporter::Init() {
 // The d'tor restores the test part result reporter used by Google Test
 // before.
 ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
     impl->SetGlobalTestPartResultReporter(old_reporter_);
   } else {
@@ -762,7 +829,7 @@ ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
 // Increments the test part result count and remembers the result.
 // This method is from the TestPartResultReporterInterface interface.
 void ScopedFakeTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult &result) {
+    const TestPartResult& result) {
   result_->Append(result);
 }
 
@@ -777,7 +844,9 @@ namespace internal {
 // from user test code.  GetTestTypeId() is guaranteed to always
 // return the same value, as it always calls GetTypeId<>() from the
 // gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() { return GetTypeId<Test>(); }
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
 
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
@@ -786,15 +855,15 @@ extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
 // This predicate-formatter checks that 'results' contains a test part
 // failure of the given type and that the failure message contains the
 // given substring.
-static AssertionResult HasOneFailure(const char * /* results_expr */,
-                                     const char * /* type_expr */,
-                                     const char * /* substr_expr */,
-                                     const TestPartResultArray &results,
+static AssertionResult HasOneFailure(const char* /* results_expr */,
+                                     const char* /* type_expr */,
+                                     const char* /* substr_expr */,
+                                     const TestPartResultArray& results,
                                      TestPartResult::Type type,
-                                     const std::string &substr) {
-  const std::string expected(type == TestPartResult::kFatalFailure
-                                 ? "1 fatal failure"
-                                 : "1 non-fatal failure");
+                                     const std::string& substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure ?
+                        "1 fatal failure" :
+                        "1 non-fatal failure");
   Message msg;
   if (results.size() != 1) {
     msg << "Expected: " << expected << "\n"
@@ -805,7 +874,7 @@ static AssertionResult HasOneFailure(const char * /* results_expr */,
     return AssertionFailure() << msg;
   }
 
-  const TestPartResult &r = results.GetTestPartResult(0);
+  const TestPartResult& r = results.GetTestPartResult(0);
   if (r.type() != type) {
     return AssertionFailure() << "Expected: " << expected << "\n"
                               << "  Actual:\n"
@@ -813,10 +882,10 @@ static AssertionResult HasOneFailure(const char * /* results_expr */,
   }
 
   if (strstr(r.message(), substr.c_str()) == nullptr) {
-    return AssertionFailure()
-           << "Expected: " << expected << " containing \"" << substr << "\"\n"
-           << "  Actual:\n"
-           << r;
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << r;
   }
 
   return AssertionSuccess();
@@ -825,9 +894,9 @@ static AssertionResult HasOneFailure(const char * /* results_expr */,
 // The constructor of SingleFailureChecker remembers where to look up
 // test part results, what type of failure we expect, and what
 // substring the failure message should contain.
-SingleFailureChecker::SingleFailureChecker(const TestPartResultArray *results,
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
                                            TestPartResult::Type type,
-                                           const std::string &substr)
+                                           const std::string& substr)
     : results_(results), type_(type), substr_(substr) {}
 
 // The destructor of SingleFailureChecker verifies that the given
@@ -839,26 +908,24 @@ SingleFailureChecker::~SingleFailureChecker() {
 }
 
 DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl *unit_test)
-    : unit_test_(unit_test) {}
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
 
 void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult &result) {
+    const TestPartResult& result) {
   unit_test_->current_test_result()->AddTestPartResult(result);
   unit_test_->listeners()->repeater()->OnTestPartResult(result);
 }
 
 DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl *unit_test)
-    : unit_test_(unit_test) {}
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
 
 void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult &result) {
+    const TestPartResult& result) {
   unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
 }
 
 // Returns the global test part result reporter.
-TestPartResultReporterInterface *
+TestPartResultReporterInterface*
 UnitTestImpl::GetGlobalTestPartResultReporter() {
   internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
   return global_test_part_result_repoter_;
@@ -866,20 +933,20 @@ UnitTestImpl::GetGlobalTestPartResultReporter() {
 
 // Sets the global test part result reporter.
 void UnitTestImpl::SetGlobalTestPartResultReporter(
-    TestPartResultReporterInterface *reporter) {
+    TestPartResultReporterInterface* reporter) {
   internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
   global_test_part_result_repoter_ = reporter;
 }
 
 // Returns the test part result reporter for the current thread.
-TestPartResultReporterInterface *
+TestPartResultReporterInterface*
 UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
   return per_thread_test_part_result_reporter_.get();
 }
 
 // Sets the test part result reporter for the current thread.
 void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
-    TestPartResultReporterInterface *reporter) {
+    TestPartResultReporterInterface* reporter) {
   per_thread_test_part_result_reporter_.set(reporter);
 }
 
@@ -957,50 +1024,37 @@ int UnitTestImpl::test_to_run_count() const {
 // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
 std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
   return os_stack_trace_getter()->CurrentStackTrace(
-      static_cast<int>(GTEST_FLAG(stack_trace_depth)), skip_count + 1
+      static_cast<int>(GTEST_FLAG(stack_trace_depth)),
+      skip_count + 1
       // Skips the user-specified number of frames plus this function
       // itself.
-  );  // NOLINT
+      );  // NOLINT
 }
 
-// Returns the current time in milliseconds.
+// A helper class for measuring elapsed times.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  // Return time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::steady_clock::now() - start_)
+        .count();
+  }
+
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
+
+// Returns a timestamp as milliseconds since the epoch. Note this time may jump
+// around subject to adjustments by the system, to measure elapsed time use
+// Timer instead.
 TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-      static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-                         kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-  _ftime64(&now);
-  GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-#error "Don't know how to get the current time on your system."
-#endif
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::system_clock::now() -
+             std::chrono::system_clock::from_time_t(0))
+      .count();
 }
 
 // Utilities
@@ -1012,13 +1066,14 @@ TimeInMillis GetTimeInMillis() {
 // memory using new. The caller is responsible for deleting the return
 // value using delete[]. Returns the wide string, or NULL if the
 // input is NULL.
-LPCWSTR String::AnsiToUtf16(const char *ansi) {
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
   if (!ansi) return nullptr;
   const int length = strlen(ansi);
   const int unicode_length =
       MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
-  WCHAR *unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                      unicode, unicode_length);
   unicode[unicode_length] = 0;
   return unicode;
 }
@@ -1027,11 +1082,11 @@ LPCWSTR String::AnsiToUtf16(const char *ansi) {
 // memory using new. The caller is responsible for deleting the return
 // value using delete[]. Returns the ANSI string, or NULL if the
 // input is NULL.
-const char *String::Utf16ToAnsi(LPCWSTR utf16_str) {
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
   if (!utf16_str) return nullptr;
   const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
                                               0, nullptr, nullptr);
-  char *ansi = new char[ansi_length + 1];
+  char* ansi = new char[ansi_length + 1];
   WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr,
                       nullptr);
   ansi[ansi_length] = 0;
@@ -1046,7 +1101,7 @@ const char *String::Utf16ToAnsi(LPCWSTR utf16_str) {
 // Unlike strcmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CStringEquals(const char *lhs, const char *rhs) {
+bool String::CStringEquals(const char * lhs, const char * rhs) {
   if (lhs == nullptr) return rhs == nullptr;
 
   if (rhs == nullptr) return false;
@@ -1058,12 +1113,13 @@ bool String::CStringEquals(const char *lhs, const char *rhs) {
 
 // Converts an array of wide chars to a narrow string using the UTF-8
 // encoding, and streams the result to the given Message object.
-static void StreamWideCharsToMessage(const wchar_t *wstr, size_t length,
-                                     Message *msg) {
-  for (size_t i = 0; i != length;) {  // NOLINT
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  for (size_t i = 0; i != length; ) {  // NOLINT
     if (wstr[i] != L'\0') {
       *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0') i++;
+      while (i != length && wstr[i] != L'\0')
+        i++;
     } else {
       *msg << '\0';
       i++;
@@ -1073,8 +1129,8 @@ static void StreamWideCharsToMessage(const wchar_t *wstr, size_t length,
 
 #endif  // GTEST_HAS_STD_WSTRING
 
-void SplitString(const ::std::string &str, char delimiter,
-                 ::std::vector< ::std::string> *dest) {
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest) {
   ::std::vector< ::std::string> parsed;
   ::std::string::size_type pos = 0;
   while (::testing::internal::AlwaysTrue()) {
@@ -1105,17 +1161,17 @@ Message::Message() : ss_(new ::std::stringstream) {
 
 // These two overloads allow streaming a wide C string to a Message
 // using the UTF-8 encoding.
-Message &Message::operator<<(const wchar_t *wide_c_str) {
+Message& Message::operator <<(const wchar_t* wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
-Message &Message::operator<<(wchar_t *wide_c_str) {
+Message& Message::operator <<(wchar_t* wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
 
 #if GTEST_HAS_STD_WSTRING
 // Converts the given wide string to a narrow string using the UTF-8
 // encoding, and streams the result to this Message object.
-Message &Message::operator<<(const ::std::wstring &wstr) {
+Message& Message::operator <<(const ::std::wstring& wstr) {
   internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
   return *this;
 }
@@ -1129,14 +1185,14 @@ std::string Message::GetString() const {
 
 // AssertionResult constructors.
 // Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult &other)
+AssertionResult::AssertionResult(const AssertionResult& other)
     : success_(other.success_),
       message_(other.message_.get() != nullptr
                    ? new ::std::string(*other.message_)
-                   : static_cast< ::std::string *>(nullptr)) {}
+                   : static_cast< ::std::string*>(nullptr)) {}
 
 // Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult &other) {
+void AssertionResult::swap(AssertionResult& other) {
   using std::swap;
   swap(success_, other.success_);
   swap(message_, other.message_);
@@ -1150,22 +1206,26 @@ AssertionResult AssertionResult::operator!() const {
 }
 
 // Makes a successful assertion result.
-AssertionResult AssertionSuccess() { return AssertionResult(true); }
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
 
 // Makes a failed assertion result.
-AssertionResult AssertionFailure() { return AssertionResult(false); }
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
 
 // Makes a failed assertion result with the given failure message.
 // Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message &message) {
+AssertionResult AssertionFailure(const Message& message) {
   return AssertionFailure() << message;
 }
 
 namespace internal {
 
 namespace edit_distance {
-std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t> &left,
-                                            const std::vector<size_t> &right) {
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+                                            const std::vector<size_t>& right) {
   std::vector<std::vector<double> > costs(
       left.size() + 1, std::vector<double>(right.size() + 1));
   std::vector<std::vector<EditType> > best_move(
@@ -1226,7 +1286,7 @@ namespace {
 // Helper class to convert string into ids with deduplication.
 class InternalStrings {
  public:
-  size_t GetId(const std::string &str) {
+  size_t GetId(const std::string& str) {
     IdMap::iterator it = ids_.find(str);
     if (it != ids_.end()) return it->second;
     size_t id = ids_.size();
@@ -1241,8 +1301,8 @@ class InternalStrings {
 }  // namespace
 
 std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string> &left,
-    const std::vector<std::string> &right) {
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
   std::vector<size_t> left_ids, right_ids;
   {
     InternalStrings intern_table;
@@ -1265,10 +1325,13 @@ namespace {
 class Hunk {
  public:
   Hunk(size_t left_start, size_t right_start)
-      : left_start_(left_start), right_start_(right_start), adds_(), removes_(),
+      : left_start_(left_start),
+        right_start_(right_start),
+        adds_(),
+        removes_(),
         common_() {}
 
-  void PushLine(char edit, const char *line) {
+  void PushLine(char edit, const char* line) {
     switch (edit) {
       case ' ':
         ++common_;
@@ -1286,10 +1349,10 @@ class Hunk {
     }
   }
 
-  void PrintTo(std::ostream *os) {
+  void PrintTo(std::ostream* os) {
     PrintHeader(os);
     FlushEdits();
-    for (std::list<std::pair<char, const char *> >::const_iterator it =
+    for (std::list<std::pair<char, const char*> >::const_iterator it =
              hunk_.begin();
          it != hunk_.end(); ++it) {
       *os << it->first << it->second << "\n";
@@ -1308,7 +1371,7 @@ class Hunk {
   // The format is
   //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
   // where the left/right parts are omitted if unnecessary.
-  void PrintHeader(std::ostream *ss) const {
+  void PrintHeader(std::ostream* ss) const {
     *ss << "@@ ";
     if (removes_) {
       *ss << "-" << left_start_ << "," << (removes_ + common_);
@@ -1324,7 +1387,7 @@ class Hunk {
 
   size_t left_start_, right_start_;
   size_t adds_, removes_, common_;
-  std::list<std::pair<char, const char *> > hunk_, hunk_adds_, hunk_removes_;
+  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
 };
 
 }  // namespace
@@ -1336,8 +1399,8 @@ class Hunk {
 // 'context' represents the desired unchanged prefix/suffix around the diff.
 // If two hunks are close enough that their contexts overlap, then they are
 // joined into one hunk.
-std::string CreateUnifiedDiff(const std::vector<std::string> &left,
-                              const std::vector<std::string> &right,
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                              const std::vector<std::string>& right,
                               size_t context) {
   const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
 
@@ -1406,7 +1469,7 @@ namespace {
 // The string representation of the values received in EqFailure() are already
 // escaped. Split them on escaped '\n' boundaries. Leave all other escaped
 // characters the same.
-std::vector<std::string> SplitEscapedString(const std::string &str) {
+std::vector<std::string> SplitEscapedString(const std::string& str) {
   std::vector<std::string> lines;
   size_t start = 0, end = str.size();
   if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
@@ -1446,10 +1509,11 @@ std::vector<std::string> SplitEscapedString(const std::string &str) {
 // The ignoring_case parameter is true if and only if the assertion is a
 // *_STRCASEEQ*.  When it's true, the string "Ignoring case" will
 // be inserted into the message.
-AssertionResult EqFailure(const char *lhs_expression,
-                          const char *rhs_expression,
-                          const std::string &lhs_value,
-                          const std::string &rhs_value, bool ignoring_case) {
+AssertionResult EqFailure(const char* lhs_expression,
+                          const char* rhs_expression,
+                          const std::string& lhs_value,
+                          const std::string& rhs_value,
+                          bool ignoring_case) {
   Message msg;
   msg << "Expected equality of these values:";
   msg << "\n  " << lhs_expression;
@@ -1466,8 +1530,10 @@ AssertionResult EqFailure(const char *lhs_expression,
   }
 
   if (!lhs_value.empty() && !rhs_value.empty()) {
-    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
-    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
+    const std::vector<std::string> lhs_lines =
+        SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines =
+        SplitEscapedString(rhs_value);
     if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
       msg << "\nWith diff:\n"
           << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1479,36 +1545,70 @@ AssertionResult EqFailure(const char *lhs_expression,
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 std::string GetBoolAssertionFailureMessage(
-    const AssertionResult &assertion_result, const char *expression_text,
-    const char *actual_predicate_value, const char *expected_predicate_value) {
-  const char *actual_message = assertion_result.message();
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
   Message msg;
   msg << "Value of: " << expression_text
       << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
+  if (actual_message[0] != '\0')
+    msg << " (" << actual_message << ")";
   msg << "\nExpected: " << expected_predicate_value;
   return msg.GetString();
 }
 
 // Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char *expr1, const char *expr2,
-                                     const char *abs_error_expr, double val1,
-                                     double val2, double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
   const double diff = fabs(val1 - val2);
   if (diff <= abs_error) return AssertionSuccess();
 
+  // Find the value which is closest to zero.
+  const double min_abs = std::min(fabs(val1), fabs(val2));
+  // Find the distance to the next double from that value.
+  const double epsilon =
+      nextafter(min_abs, std::numeric_limits<double>::infinity()) - min_abs;
+  // Detect the case where abs_error is so small that EXPECT_NEAR is
+  // effectively the same as EXPECT_EQUAL, and give an informative error
+  // message so that the situation can be more easily understood without
+  // requiring exotic floating-point knowledge.
+  // Don't do an epsilon check if abs_error is zero because that implies
+  // that an equality check was actually intended.
+  if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 &&
+      abs_error < epsilon) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+           << abs_error_expr << " evaluates to " << abs_error
+           << " which is smaller than the minimum distance between doubles for "
+              "numbers of this magnitude which is "
+           << epsilon
+           << ", thus making this EXPECT_NEAR check equivalent to "
+              "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+  }
   return AssertionFailure()
-         << "The difference between " << expr1 << " and " << expr2 << " is "
-         << diff << ", which exceeds " << abs_error_expr << ", where\n"
-         << expr1 << " evaluates to " << val1 << ",\n"
-         << expr2 << " evaluates to " << val2 << ", and\n"
-         << abs_error_expr << " evaluates to " << abs_error << ".";
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
 }
 
+
 // Helper template for implementing FloatLE() and DoubleLE().
 template <typename RawType>
-AssertionResult FloatingPointLE(const char *expr1, const char *expr2,
-                                RawType val1, RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
   // Returns success if val1 is less than val2,
   if (val1 < val2) {
     return AssertionSuccess();
@@ -1533,124 +1633,87 @@ AssertionResult FloatingPointLE(const char *expr1, const char *expr2,
           << val2;
 
   return AssertionFailure()
-         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-         << StringStreamToString(&val2_ss);
+      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+      << StringStreamToString(&val2_ss);
 }
 
 }  // namespace internal
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char *expr1, const char *expr2, float val1,
-                        float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
   return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
 }
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char *expr1, const char *expr2, double val1,
-                         double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
   return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
 }
 
 namespace internal {
 
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char *lhs_expression,
-                            const char *rhs_expression, BiggestInt lhs,
-                            BiggestInt rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression, rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs), false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                    \
-  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2,     \
-                                     BiggestInt val1, BiggestInt val2) {       \
-    if (val1 op val2) {                                                        \
-      return AssertionSuccess();                                               \
-    } else {                                                                   \
-      return AssertionFailure()                                                \
-             << "Expected: (" << expr1 << ") " #op " (" << expr2               \
-             << "), actual: " << FormatForComparisonFailureMessage(val1, val2) \
-             << " vs " << FormatForComparisonFailureMessage(val2, val1);       \
-    }                                                                          \
-  }
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, <)
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, >)
-
-#undef GTEST_IMPL_CMP_HELPER_
-
 // The helper function for {ASSERT|EXPECT}_STREQ.
-AssertionResult CmpHelperSTREQ(const char *lhs_expression,
-                               const char *rhs_expression, const char *lhs,
-                               const char *rhs) {
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression,
+                               const char* lhs,
+                               const char* rhs) {
   if (String::CStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
-                   PrintToString(rhs), false);
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   false);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-AssertionResult CmpHelperSTRCASEEQ(const char *lhs_expression,
-                                   const char *rhs_expression, const char *lhs,
-                                   const char *rhs) {
+AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
+                                   const char* rhs_expression,
+                                   const char* lhs,
+                                   const char* rhs) {
   if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
-                   PrintToString(rhs), true);
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   true);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
-AssertionResult CmpHelperSTRNE(const char *s1_expression,
-                               const char *s2_expression, const char *s1,
-                               const char *s2) {
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
   if (!String::CStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
-    return AssertionFailure()
-           << "Expected: (" << s1_expression << ") != (" << s2_expression
-           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
-AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
-                                   const char *s2_expression, const char *s1,
-                                   const char *s2) {
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
   if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
     return AssertionFailure()
-           << "Expected: (" << s1_expression << ") != (" << s2_expression
-           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
@@ -1664,13 +1727,13 @@ namespace {
 // is a substring of haystack.  NULL is considered a substring of
 // itself only.
 
-bool IsSubstringPred(const char *needle, const char *haystack) {
+bool IsSubstringPred(const char* needle, const char* haystack) {
   if (needle == nullptr || haystack == nullptr) return needle == haystack;
 
   return strstr(haystack, needle) != nullptr;
 }
 
-bool IsSubstringPred(const wchar_t *needle, const wchar_t *haystack) {
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
   if (needle == nullptr || haystack == nullptr) return needle == haystack;
 
   return wcsstr(haystack, needle) != nullptr;
@@ -1678,7 +1741,8 @@ bool IsSubstringPred(const wchar_t *needle, const wchar_t *haystack) {
 
 // StringType here can be either ::std::string or ::std::wstring.
 template <typename StringType>
-bool IsSubstringPred(const StringType &needle, const StringType &haystack) {
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
   return haystack.find(needle) != StringType::npos;
 }
 
@@ -1687,22 +1751,21 @@ bool IsSubstringPred(const StringType &needle, const StringType &haystack) {
 // StringType here can be const char*, const wchar_t*, ::std::string,
 // or ::std::wstring.
 template <typename StringType>
-AssertionResult IsSubstringImpl(bool expected_to_be_substring,
-                                const char *needle_expr,
-                                const char *haystack_expr,
-                                const StringType &needle,
-                                const StringType &haystack) {
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
   if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
     return AssertionSuccess();
 
   const bool is_wide_string = sizeof(needle[0]) > 1;
-  const char *const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
   return AssertionFailure()
-         << "Value of: " << needle_expr << "\n"
-         << "  Actual: " << begin_string_quote << needle << "\"\n"
-         << "Expected: " << (expected_to_be_substring ? "" : "not ")
-         << "a substring of " << haystack_expr << "\n"
-         << "Which is: " << begin_string_quote << haystack << "\"";
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << begin_string_quote << needle << "\"\n"
+      << "Expected: " << (expected_to_be_substring ? "" : "not ")
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << begin_string_quote << haystack << "\"";
 }
 
 }  // namespace
@@ -1711,52 +1774,52 @@ AssertionResult IsSubstringImpl(bool expected_to_be_substring,
 // substring of haystack (NULL is considered a substring of itself
 // only), and return an appropriate error message when they fail.
 
-AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
-                            const char *needle, const char *haystack) {
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
-                            const wchar_t *needle, const wchar_t *haystack) {
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(const char *needle_expr,
-                               const char *haystack_expr, const char *needle,
-                               const char *haystack) {
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(const char *needle_expr,
-                               const char *haystack_expr, const wchar_t *needle,
-                               const wchar_t *haystack) {
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
-                            const ::std::string &needle,
-                            const ::std::string &haystack) {
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(const char *needle_expr,
-                               const char *haystack_expr,
-                               const ::std::string &needle,
-                               const ::std::string &haystack) {
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
 #if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
-                            const ::std::wstring &needle,
-                            const ::std::wstring &haystack) {
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(const char *needle_expr,
-                               const char *haystack_expr,
-                               const ::std::wstring &needle,
-                               const ::std::wstring &haystack) {
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 #endif  // GTEST_HAS_STD_WSTRING
@@ -1768,54 +1831,55 @@ namespace internal {
 namespace {
 
 // Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char *expr, const char *expected,
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
                                      long hr) {  // NOLINT
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
 
   // Windows CE doesn't support FormatMessage.
   const char error_text[] = "";
 
-#else
+# else
 
   // Looks up the human-readable system message for the HRESULT code
   // and since we're not passing any params to FormatMessage, we don't
   // want inserts expanded.
-  const DWORD kFlags =
-      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
   const DWORD kBufSize = 4096;
   // Gets the system's human readable message string for this HRESULT.
   char error_text[kBufSize] = { '\0' };
   DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,  // no source, we're asking system
+                                          0,   // no source, we're asking system
                                           static_cast<DWORD>(hr),  // the error
-                                          0,  // no line width restrictions
+                                          0,   // no line width restrictions
                                           error_text,  // output buffer
                                           kBufSize,    // buf size
                                           nullptr);  // no arguments for inserts
   // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
   for (; message_length && IsSpace(error_text[message_length - 1]);
-       --message_length) {
+          --message_length) {
     error_text[message_length - 1] = '\0';
   }
 
-#endif  // GTEST_OS_WINDOWS_MOBILE
+# endif  // GTEST_OS_WINDOWS_MOBILE
 
   const std::string error_hex("0x" + String::FormatHexInt(hr));
   return ::testing::AssertionFailure()
-         << "Expected: " << expr << " " << expected << ".\n"
-         << "  Actual: " << error_hex << " " << error_text << "\n";
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
 }
 
 }  // namespace
 
-AssertionResult IsHRESULTSuccess(const char *expr, long hr) {  // NOLINT
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
   if (SUCCEEDED(hr)) {
     return AssertionSuccess();
   }
   return HRESULTFailureHelper(expr, "succeeds", hr);
 }
 
-AssertionResult IsHRESULTFailure(const char *expr, long hr) {  // NOLINT
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
   if (FAILED(hr)) {
     return AssertionSuccess();
   }
@@ -1837,23 +1901,21 @@ AssertionResult IsHRESULTFailure(const char *expr, long hr) {  // NOLINT
 //  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 
 // The maximum code-point a one-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) <<  7) - 1;
 
 // The maximum code-point a two-byte UTF-8 sequence can represent.
 constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
 
 // The maximum code-point a three-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint3 =
-    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
+constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1;
 
 // The maximum code-point a four-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint4 =
-    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
+constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1;
 
 // Chops off the n lowest bits from a bit pattern.  Returns the n
 // lowest bits.  As a side effect, the original bit pattern will be
 // shifted to the right by n bits.
-inline uint32_t ChopLowBits(uint32_t *bits, int n) {
+inline uint32_t ChopLowBits(uint32_t* bits, int n) {
   const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
   *bits >>= n;
   return low_bits;
@@ -1873,7 +1935,7 @@ std::string CodePointToUtf8(uint32_t code_point) {
   char str[5];  // Big enough for the largest valid code point.
   if (code_point <= kMaxCodePoint1) {
     str[1] = '\0';
-    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
   } else if (code_point <= kMaxCodePoint2) {
     str[2] = '\0';
     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
@@ -1901,8 +1963,8 @@ std::string CodePointToUtf8(uint32_t code_point) {
 // and thus should be combined into a single Unicode code point
 // using CreateCodePointFromUtf16SurrogatePair.
 inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
-         (second & 0xFC00) == 0xDC00;
+  return sizeof(wchar_t) == 2 &&
+      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
 }
 
 // Creates a Unicode code point from UTF16 surrogate pair.
@@ -1932,8 +1994,9 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
 // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
 // and contains invalid UTF-16 surrogate pairs, values in those pairs
 // will be encoded as individual Unicode characters from Basic Normal Plane.
-std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
-  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
 
   ::std::stringstream stream;
   for (int i = 0; i < num_chars; ++i) {
@@ -1942,8 +2005,8 @@ std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
     if (str[i] == L'\0') {
       break;
     } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point =
-          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
       i++;
     } else {
       unicode_code_point = static_cast<uint32_t>(str[i]);
@@ -1956,7 +2019,7 @@ std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
 
 // Converts a wide C string to an std::string using the UTF-8 encoding.
 // NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t *wide_c_str) {
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
   if (wide_c_str == nullptr) return "(null)";
 
   return internal::WideStringToUtf8(wide_c_str, -1);
@@ -1968,7 +2031,7 @@ std::string String::ShowWideCString(const wchar_t *wide_c_str) {
 // Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs) {
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
   if (lhs == nullptr) return rhs == nullptr;
 
   if (rhs == nullptr) return false;
@@ -1977,28 +2040,34 @@ bool String::WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs) {
 }
 
 // Helper function for *_STREQ on wide strings.
-AssertionResult CmpHelperSTREQ(const char *lhs_expression,
-                               const char *rhs_expression, const wchar_t *lhs,
-                               const wchar_t *rhs) {
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression,
+                               const wchar_t* lhs,
+                               const wchar_t* rhs) {
   if (String::WideCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
-                   PrintToString(rhs), false);
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   false);
 }
 
 // Helper function for *_STRNE on wide strings.
-AssertionResult CmpHelperSTRNE(const char *s1_expression,
-                               const char *s2_expression, const wchar_t *s1,
-                               const wchar_t *s2) {
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
   if (!String::WideCStringEquals(s1, s2)) {
     return AssertionSuccess();
   }
 
-  return AssertionFailure()
-         << "Expected: (" << s1_expression << ") != (" << s2_expression
-         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
+  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                            << s2_expression << "), actual: "
+                            << PrintToString(s1)
+                            << " vs " << PrintToString(s2);
 }
 
 // Compares two C strings, ignoring case.  Returns true if and only if they have
@@ -2007,7 +2076,7 @@ AssertionResult CmpHelperSTRNE(const char *s1_expression,
 // Unlike strcasecmp(), this function can handle NULL argument(s).  A
 // NULL C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char *lhs, const char *rhs) {
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
   if (lhs == nullptr) return rhs == nullptr;
   if (rhs == nullptr) return false;
   return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -2025,8 +2094,8 @@ bool String::CaseInsensitiveCStringEquals(const char *lhs, const char *rhs) {
 // which compares according to LC_CTYPE category of the current locale.
 // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
 // current locale.
-bool String::CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
-                                              const wchar_t *rhs) {
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
   if (lhs == nullptr) return rhs == nullptr;
 
   if (rhs == nullptr) return false;
@@ -2049,8 +2118,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
 
 // Returns true if and only if str ends with the given suffix, ignoring case.
 // Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(const std::string &str,
-                                     const std::string &suffix) {
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
   const size_t str_len = str.length();
   const size_t suffix_len = suffix.length();
   return (str_len >= suffix_len) &&
@@ -2060,8 +2129,13 @@ bool String::EndsWithCaseInsensitive(const std::string &str,
 
 // Formats an int value as "%02d".
 std::string String::FormatIntWidth2(int value) {
+  return FormatIntWidthN(value, 2);
+}
+
+// Formats an int value to given width with leading zeros.
+std::string String::FormatIntWidthN(int value, int width) {
   std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
+  ss << std::setfill('0') << std::setw(width) << value;
   return ss.str();
 }
 
@@ -2087,14 +2161,14 @@ std::string String::FormatByte(unsigned char value) {
 
 // Converts the buffer in a stringstream to an std::string, converting NUL
 // bytes to "\\0" along the way.
-std::string StringStreamToString(::std::stringstream *ss) {
-  const ::std::string &str = ss->str();
-  const char *const start = str.c_str();
-  const char *const end = start + str.length();
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
 
   std::string result;
   result.reserve(static_cast<size_t>(2 * (end - start)));
-  for (const char *ch = start; ch != end; ++ch) {
+  for (const char* ch = start; ch != end; ++ch) {
     if (*ch == '\0') {
       result += "\\0";  // Replaces NUL with "\\0";
     } else {
@@ -2106,14 +2180,16 @@ std::string StringStreamToString(::std::stringstream *ss) {
 }
 
 // Appends the user-supplied message to the Google-Test-generated message.
-std::string AppendUserMessage(const std::string &gtest_msg,
-                              const Message &user_msg) {
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
   // Appends the user message if it's non-empty.
   const std::string user_msg_string = user_msg.GetString();
   if (user_msg_string.empty()) {
     return gtest_msg;
   }
-
+  if (gtest_msg.empty()) {
+    return user_msg_string;
+  }
   return gtest_msg + "\n" + user_msg_string;
 }
 
@@ -2126,41 +2202,46 @@ TestResult::TestResult()
     : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
 
 // D'tor.
-TestResult::~TestResult() {}
+TestResult::~TestResult() {
+}
 
 // Returns the i-th test part result among all the results. i can
 // range from 0 to total_part_count() - 1. If i is not in that range,
 // aborts the program.
-const TestPartResult &TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count())
+    internal::posix::Abort();
   return test_part_results_.at(static_cast<size_t>(i));
 }
 
 // Returns the i-th test property. i can range from 0 to
 // test_property_count() - 1. If i is not in that range, aborts the
 // program.
-const TestProperty &TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count())
+    internal::posix::Abort();
   return test_properties_.at(static_cast<size_t>(i));
 }
 
 // Clears the test part results.
-void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
 
 // Adds a test part result to the list.
-void TestResult::AddTestPartResult(const TestPartResult &test_part_result) {
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
   test_part_results_.push_back(test_part_result);
 }
 
 // Adds a test property to the list. If a property with the same key as the
 // supplied property is already represented, the value of this test_property
 // replaces the old value for that key.
-void TestResult::RecordProperty(const std::string &xml_element,
-                                const TestProperty &test_property) {
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
   if (!ValidateTestProperty(xml_element, test_property)) {
     return;
   }
-  internal::MutexLock lock(&test_properites_mutex_);
+  internal::MutexLock lock(&test_properties_mutex_);
   const std::vector<TestProperty>::iterator property_with_matching_key =
       std::find_if(test_properties_.begin(), test_properties_.end(),
                    internal::TestPropertyKeyIs(test_property.key()));
@@ -2173,37 +2254,41 @@ void TestResult::RecordProperty(const std::string &xml_element,
 
 // The list of reserved attributes used in the <testsuites> element of XML
 // output.
-static const char *const kReservedTestSuitesAttributes[] = {
-  "disabled",    "errors", "failures", "name",
-  "random_seed", "tests",  "time",     "timestamp"
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
 };
 
 // The list of reserved attributes used in the <testsuite> element of XML
 // output.
-static const char *const kReservedTestSuiteAttributes[] = {
-  "disabled", "errors", "failures", "name", "tests", "time", "timestamp"
-};
+static const char* const kReservedTestSuiteAttributes[] = {
+    "disabled", "errors", "failures",  "name",
+    "tests",    "time",   "timestamp", "skipped"};
 
 // The list of reserved attributes used in the <testcase> element of XML output.
-static const char *const kReservedTestCaseAttributes[] = {
-  "classname",  "name",        "status", "time",
-  "type_param", "value_param", "file",   "line"
-};
+static const char* const kReservedTestCaseAttributes[] = {
+    "classname",   "name", "status", "time",  "type_param",
+    "value_param", "file", "line"};
 
 // Use a slightly different set for allowed output to ensure existing tests can
 // still RecordProperty("result") or "RecordProperty(timestamp")
-static const char *const kReservedOutputTestCaseAttributes[] = {
-  "classname",   "name", "status", "time",   "type_param",
-  "value_param", "file", "line",   "result", "timestamp"
-};
+static const char* const kReservedOutputTestCaseAttributes[] = {
+    "classname",   "name", "status", "time",   "type_param",
+    "value_param", "file", "line",   "result", "timestamp"};
 
-template <int kSize>
-std::vector<std::string> ArrayAsVector(const char *const (&array)[kSize]) {
+template <size_t kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
   return std::vector<std::string>(array, array + kSize);
 }
 
 static std::vector<std::string> GetReservedAttributesForElement(
-    const std::string &xml_element) {
+    const std::string& xml_element) {
   if (xml_element == "testsuites") {
     return ArrayAsVector(kReservedTestSuitesAttributes);
   } else if (xml_element == "testsuite") {
@@ -2219,7 +2304,7 @@ static std::vector<std::string> GetReservedAttributesForElement(
 
 // TODO(jdesprez): Merge the two getReserved attributes once skip is improved
 static std::vector<std::string> GetReservedOutputAttributesForElement(
-    const std::string &xml_element) {
+    const std::string& xml_element) {
   if (xml_element == "testsuites") {
     return ArrayAsVector(kReservedTestSuitesAttributes);
   } else if (xml_element == "testsuite") {
@@ -2233,7 +2318,7 @@ static std::vector<std::string> GetReservedOutputAttributesForElement(
   return std::vector<std::string>();
 }
 
-static std::string FormatWordList(const std::vector<std::string> &words) {
+static std::string FormatWordList(const std::vector<std::string>& words) {
   Message word_list;
   for (size_t i = 0; i < words.size(); ++i) {
     if (i > 0 && words.size() > 2) {
@@ -2248,10 +2333,10 @@ static std::string FormatWordList(const std::vector<std::string> &words) {
 }
 
 static bool ValidateTestPropertyName(
-    const std::string &property_name,
-    const std::vector<std::string> &reserved_names) {
+    const std::string& property_name,
+    const std::vector<std::string>& reserved_names) {
   if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-      reserved_names.end()) {
+          reserved_names.end()) {
     ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
                   << " (" << FormatWordList(reserved_names)
                   << " are reserved by " << GTEST_NAME_ << ")";
@@ -2262,8 +2347,8 @@ static bool ValidateTestPropertyName(
 
 // Adds a failure if the key is a reserved attribute of the element named
 // xml_element.  Returns true if the property is valid.
-bool TestResult::ValidateTestProperty(const std::string &xml_element,
-                                      const TestProperty &test_property) {
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
   return ValidateTestPropertyName(test_property.key(),
                                   GetReservedAttributesForElement(xml_element));
 }
@@ -2277,7 +2362,7 @@ void TestResult::Clear() {
 }
 
 // Returns true off the test part was skipped.
-static bool TestPartSkipped(const TestPartResult &result) {
+static bool TestPartSkipped(const TestPartResult& result) {
   return result.skipped();
 }
 
@@ -2289,13 +2374,14 @@ bool TestResult::Skipped() const {
 // Returns true if and only if the test failed.
 bool TestResult::Failed() const {
   for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed()) return true;
+    if (GetTestPartResult(i).failed())
+      return true;
   }
   return false;
 }
 
 // Returns true if and only if the test part fatally failed.
-static bool TestPartFatallyFailed(const TestPartResult &result) {
+static bool TestPartFatallyFailed(const TestPartResult& result) {
   return result.fatally_failed();
 }
 
@@ -2305,7 +2391,7 @@ bool TestResult::HasFatalFailure() const {
 }
 
 // Returns true if and only if the test part non-fatally failed.
-static bool TestPartNonfatallyFailed(const TestPartResult &result) {
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
   return result.nonfatally_failed();
 }
 
@@ -2330,30 +2416,35 @@ int TestResult::test_property_count() const {
 // Creates a Test object.
 
 // The c'tor saves the states of all flags.
-Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
+Test::Test()
+    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
+}
 
 // The d'tor restores the states of all flags.  The actual work is
 // done by the d'tor of the gtest_flag_saver_ field, and thus not
 // visible here.
-Test::~Test() {}
+Test::~Test() {
+}
 
 // Sets up the test fixture.
 //
 // A sub-class may override this.
-void Test::SetUp() {}
+void Test::SetUp() {
+}
 
 // Tears down the test fixture.
 //
 // A sub-class may override this.
-void Test::TearDown() {}
+void Test::TearDown() {
+}
 
 // Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string &key, const std::string &value) {
+void Test::RecordProperty(const std::string& key, const std::string& value) {
   UnitTest::GetInstance()->RecordProperty(key, value);
 }
 
 // Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string &key, int value) {
+void Test::RecordProperty(const std::string& key, int value) {
   Message value_message;
   value_message << value;
   RecordProperty(key, value_message.GetString().c_str());
@@ -2362,7 +2453,7 @@ void Test::RecordProperty(const std::string &key, int value) {
 namespace internal {
 
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string &message) {
+                                    const std::string& message) {
   // This function is a friend of UnitTest and as such has access to
   // AddTestPartResult.
   UnitTest::GetInstance()->AddTestPartResult(
@@ -2381,18 +2472,18 @@ void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
 // yes, it returns true; otherwise it generates a Google Test failure and
 // returns false.
 bool Test::HasSameFixtureClass() {
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
-  const TestSuite *const test_suite = impl->current_test_suite();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestSuite* const test_suite = impl->current_test_suite();
 
   // Info about the first test in the current test suite.
-  const TestInfo *const first_test_info = test_suite->test_info_list()[0];
+  const TestInfo* const first_test_info = test_suite->test_info_list()[0];
   const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
-  const char *const first_test_name = first_test_info->name();
+  const char* const first_test_name = first_test_info->name();
 
   // Info about the current test.
-  const TestInfo *const this_test_info = impl->current_test_info();
+  const TestInfo* const this_test_info = impl->current_test_info();
   const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
-  const char *const this_test_name = this_test_info->name();
+  const char* const this_test_name = this_test_info->name();
 
   if (this_fixture_id != first_fixture_id) {
     // Is the first test defined using TEST?
@@ -2407,9 +2498,9 @@ bool Test::HasSameFixtureClass() {
       // Gets the name of the TEST and the name of the TEST_F.  Note
       // that first_is_TEST and this_is_TEST cannot both be true, as
       // the fixture IDs are different for the two tests.
-      const char *const TEST_name =
+      const char* const TEST_name =
           first_is_TEST ? first_test_name : this_test_name;
-      const char *const TEST_F_name =
+      const char* const TEST_F_name =
           first_is_TEST ? this_test_name : first_test_name;
 
       ADD_FAILURE()
@@ -2447,11 +2538,11 @@ bool Test::HasSameFixtureClass() {
 // function returns its result via an output parameter pointer because VC++
 // prohibits creation of objects with destructors on stack in functions
 // using __try (see error C2712).
-static std::string *FormatSehExceptionMessage(DWORD exception_code,
-                                              const char *location) {
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
   Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
-          << std::setbase(10) << " thrown in " << location << ".";
+  message << "SEH exception with code 0x" << std::setbase(16) <<
+    exception_code << std::setbase(10) << " thrown in " << location << ".";
 
   return new std::string(message.GetString());
 }
@@ -2463,8 +2554,8 @@ namespace internal {
 #if GTEST_HAS_EXCEPTIONS
 
 // Adds an "exception thrown" fatal failure to the current test.
-static std::string FormatCxxExceptionMessage(const char *description,
-                                             const char *location) {
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
   Message message;
   if (description != nullptr) {
     message << "C++ exception with description \"" << description << "\"";
@@ -2477,10 +2568,10 @@ static std::string FormatCxxExceptionMessage(const char *description,
 }
 
 static std::string PrintTestPartResultToString(
-    const TestPartResult &test_part_result);
+    const TestPartResult& test_part_result);
 
 GoogleTestFailureException::GoogleTestFailureException(
-    const TestPartResult &failure)
+    const TestPartResult& failure)
     : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
 
 #endif  // GTEST_HAS_EXCEPTIONS
@@ -2494,8 +2585,8 @@ GoogleTestFailureException::GoogleTestFailureException(
 // exceptions in the same function.  Therefore, we provide a separate
 // wrapper function for handling SEH exceptions.)
 template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
-                                              const char *location) {
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
 #if GTEST_HAS_SEH
   __try {
     return (object->*method)();
@@ -2504,8 +2595,8 @@ Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
     // We create the exception message on the heap because VC++ prohibits
     // creation of objects with destructors on stack in functions using __try
     // (see error C2712).
-    std::string *exception_message =
-        FormatSehExceptionMessage(GetExceptionCode(), location);
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
     internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
                                              *exception_message);
     delete exception_message;
@@ -2521,8 +2612,8 @@ Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
 // exceptions, if they are supported; returns the 0-value for type
 // Result in case of an SEH exception.
 template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
-                                           const char *location) {
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
   // NOTE: The user code can affect the way in which Google Test handles
   // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
   // RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2550,14 +2641,14 @@ Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
 #if GTEST_HAS_EXCEPTIONS
     try {
       return HandleSehExceptionsInMethodIfSupported(object, method, location);
-    } catch (const AssertionException &) {  // NOLINT
+    } catch (const AssertionException&) {  // NOLINT
       // This failure was reported already.
-    } catch (const internal::GoogleTestFailureException &) {  // NOLINT
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
       // This exception type can only be thrown by a failed Google
       // Test assertion with the intention of letting another testing
       // framework catch it.  Therefore we just re-throw it.
       throw;
-    } catch (const std::exception &e) {  // NOLINT
+    } catch (const std::exception& e) {  // NOLINT
       internal::ReportFailureInUnknownLocation(
           TestPartResult::kFatalFailure,
           FormatCxxExceptionMessage(e.what(), location));
@@ -2581,23 +2672,23 @@ Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
 void Test::Run() {
   if (!HasSameFixtureClass()) return;
 
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
   // We will run the test only if SetUp() was successful and didn't call
   // GTEST_SKIP().
   if (!HasFatalFailure() && !IsSkipped()) {
     impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
-                                                  "the test body");
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
   }
 
   // However, we want to clean up as much as possible.  Hence we will
   // always call TearDown(), even if SetUp() or the test body has
   // failed.
   impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
-                                                "TearDown()");
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
 }
 
 // Returns true if and only if the current test has a fatal failure.
@@ -2607,9 +2698,8 @@ bool Test::HasFatalFailure() {
 
 // Returns true if and only if the current test has a non-fatal failure.
 bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()
-      ->current_test_result()
-      ->HasNonfatalFailure();
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
 }
 
 // Returns true if and only if the current test was skipped.
@@ -2621,18 +2711,24 @@ bool Test::IsSkipped() {
 
 // Constructs a TestInfo object. It assumes ownership of the test factory
 // object.
-TestInfo::TestInfo(const std::string &a_test_suite_name,
-                   const std::string &a_name, const char *a_type_param,
-                   const char *a_value_param,
+TestInfo::TestInfo(const std::string& a_test_suite_name,
+                   const std::string& a_name, const char* a_type_param,
+                   const char* a_value_param,
                    internal::CodeLocation a_code_location,
                    internal::TypeId fixture_class_id,
-                   internal::TestFactoryBase *factory)
-    : test_suite_name_(a_test_suite_name), name_(a_name),
+                   internal::TestFactoryBase* factory)
+    : test_suite_name_(a_test_suite_name),
+      name_(a_name),
       type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
       value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
-      location_(a_code_location), fixture_class_id_(fixture_class_id),
-      should_run_(false), is_disabled_(false), matches_filter_(false),
-      factory_(factory), result_() {}
+      location_(a_code_location),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      is_in_another_shard_(false),
+      factory_(factory),
+      result_() {}
 
 // Destructs a TestInfo object.
 TestInfo::~TestInfo() { delete factory_; }
@@ -2644,7 +2740,7 @@ namespace internal {
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
 //   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -2657,19 +2753,19 @@ namespace internal {
 //   factory:          pointer to the factory that creates a test object.
 //                     The newly created TestInfo instance will assume
 //                     ownership of the factory object.
-TestInfo *MakeAndRegisterTestInfo(
-    const char *test_suite_name, const char *name, const char *type_param,
-    const char *value_param, CodeLocation code_location,
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_suite_name, const char* name, const char* type_param,
+    const char* value_param, CodeLocation code_location,
     TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
-    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory) {
-  TestInfo *const test_info =
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) {
+  TestInfo* const test_info =
       new TestInfo(test_suite_name, name, type_param, value_param,
                    code_location, fixture_class_id, factory);
   GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
   return test_info;
 }
 
-void ReportInvalidTestSuiteType(const char *test_suite_name,
+void ReportInvalidTestSuiteType(const char* test_suite_name,
                                 CodeLocation code_location) {
   Message errors;
   errors
@@ -2703,10 +2799,11 @@ class TestNameIs {
   // Constructor.
   //
   // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char *name) : name_(name) {}
+  explicit TestNameIs(const char* name)
+      : name_(name) {}
 
   // Returns true if and only if the test name of test_info matches name_.
-  bool operator()(const TestInfo *test_info) const {
+  bool operator()(const TestInfo * test_info) const {
     return test_info && test_info->name() == name_;
   }
 
@@ -2737,20 +2834,21 @@ void TestInfo::Run() {
   if (!should_run_) return;
 
   // Tells UnitTest where to store test result.
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   impl->set_current_test_info(this);
 
-  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
 
   // Notifies the unit test event listeners that a test is about to start.
   repeater->OnTestStart(*this);
 
-  const TimeInMillis start = internal::GetTimeInMillis();
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
 
   // Creates the test object.
-  Test *const test = internal::HandleExceptionsInMethodIfSupported(
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
       factory_, &internal::TestFactoryBase::CreateTest,
       "the test fixture's constructor");
 
@@ -2770,8 +2868,7 @@ void TestInfo::Run() {
         test, &Test::DeleteSelf_, "the test fixture's destructor");
   }
 
-  result_.set_start_timestamp(start);
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+  result_.set_elapsed_time(timer.Elapsed());
 
   // Notifies the unit test event listener that a test has just finished.
   repeater->OnTestEnd(*this);
@@ -2781,6 +2878,28 @@ void TestInfo::Run() {
   impl->set_current_test_info(nullptr);
 }
 
+// Skip and records a skipped test result for this object.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TestPartResult test_part_result =
+      TestPartResult(TestPartResult::kSkip, this->file(), this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      test_part_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
 // class TestSuite
 
 // Gets the number of successful tests in this test suite.
@@ -2827,18 +2946,21 @@ int TestSuite::total_test_count() const {
 //
 // Arguments:
 //
-//   name:         name of the test suite
+//   a_name:       name of the test suite
 //   a_type_param: the name of the test suite's type parameter, or NULL if
 //                 this is not a typed or a type-parameterized test suite.
 //   set_up_tc:    pointer to the function that sets up the test suite
 //   tear_down_tc: pointer to the function that tears down the test suite
-TestSuite::TestSuite(const char *a_name, const char *a_type_param,
+TestSuite::TestSuite(const char* a_name, const char* a_type_param,
                      internal::SetUpTestSuiteFunc set_up_tc,
                      internal::TearDownTestSuiteFunc tear_down_tc)
     : name_(a_name),
       type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
-      set_up_tc_(set_up_tc), tear_down_tc_(tear_down_tc), should_run_(false),
-      start_timestamp_(0), elapsed_time_(0) {}
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      start_timestamp_(0),
+      elapsed_time_(0) {}
 
 // Destructor of TestSuite.
 TestSuite::~TestSuite() {
@@ -2848,21 +2970,21 @@ TestSuite::~TestSuite() {
 
 // Returns the i-th test among all the tests. i can range from 0 to
 // total_test_count() - 1. If i is not in that range, returns NULL.
-const TestInfo *TestSuite::GetTestInfo(int i) const {
+const TestInfo* TestSuite::GetTestInfo(int i) const {
   const int index = GetElementOr(test_indices_, i, -1);
   return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
 }
 
 // Returns the i-th test among all the tests. i can range from 0 to
 // total_test_count() - 1. If i is not in that range, returns NULL.
-TestInfo *TestSuite::GetMutableTestInfo(int i) {
+TestInfo* TestSuite::GetMutableTestInfo(int i) {
   const int index = GetElementOr(test_indices_, i, -1);
   return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
 }
 
 // Adds a test to this test suite.  Will delete the test upon
 // destruction of the TestSuite object.
-void TestSuite::AddTestInfo(TestInfo *test_info) {
+void TestSuite::AddTestInfo(TestInfo* test_info) {
   test_info_list_.push_back(test_info);
   test_indices_.push_back(static_cast<int>(test_indices_.size()));
 }
@@ -2871,27 +2993,34 @@ void TestSuite::AddTestInfo(TestInfo *test_info) {
 void TestSuite::Run() {
   if (!should_run_) return;
 
-  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   impl->set_current_test_suite(this);
 
-  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
 
   // Call both legacy and the new API
   repeater->OnTestSuiteStart(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseStart(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
       this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
 
   start_timestamp_ = internal::GetTimeInMillis();
+  internal::Timer timer;
   for (int i = 0; i < total_test_count(); i++) {
     GetMutableTestInfo(i)->Run();
+    if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) {
+      for (int j = i + 1; j < total_test_count(); j++) {
+        GetMutableTestInfo(j)->Skip();
+      }
+      break;
+    }
   }
-  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
+  elapsed_time_ = timer.Elapsed();
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
@@ -2900,9 +3029,39 @@ void TestSuite::Run() {
   // Call both legacy and the new API
   repeater->OnTestSuiteEnd(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseEnd(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Skips all tests under this TestSuite.
+void TestSuite::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Skip();
+  }
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->set_current_test_suite(nullptr);
 }
@@ -2914,7 +3073,7 @@ void TestSuite::ClearResult() {
 }
 
 // Shuffles the tests in this test suite.
-void TestSuite::ShuffleTests(internal::Random *random) {
+void TestSuite::ShuffleTests(internal::Random* random) {
   Shuffle(random, &test_indices_);
 }
 
@@ -2930,10 +3089,11 @@ void TestSuite::UnshuffleTests() {
 //
 // FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
 // FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count, const char *singular_form,
-                                       const char *plural_form) {
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
   return internal::StreamableToString(count) + " " +
-         (count == 1 ? singular_form : plural_form);
+      (count == 1 ? singular_form : plural_form);
 }
 
 // Formats the count of tests.
@@ -2950,10 +3110,12 @@ static std::string FormatTestSuiteCount(int test_suite_count) {
 // representation.  Both kNonFatalFailure and kFatalFailure are translated
 // to "Failure", as the user usually doesn't care about the difference
 // between the two when viewing the test result.
-static const char *TestPartResultTypeToString(TestPartResult::Type type) {
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
   switch (type) {
-    case TestPartResult::kSkip: return "Skipped";
-    case TestPartResult::kSuccess: return "Success";
+    case TestPartResult::kSkip:
+      return "Skipped\n";
+    case TestPartResult::kSuccess:
+      return "Success";
 
     case TestPartResult::kNonFatalFailure:
     case TestPartResult::kFatalFailure:
@@ -2962,27 +3124,30 @@ static const char *TestPartResultTypeToString(TestPartResult::Type type) {
 #else
       return "Failure\n";
 #endif
-    default: return "Unknown result type";
+    default:
+      return "Unknown result type";
   }
 }
 
 namespace internal {
+namespace {
+enum class GTestColor { kDefault, kRed, kGreen, kYellow };
+}  // namespace
 
 // Prints a TestPartResult to an std::string.
 static std::string PrintTestPartResultToString(
-    const TestPartResult &test_part_result) {
-  return (Message() << internal::FormatFileLocation(
-                           test_part_result.file_name(),
-                           test_part_result.line_number())
-                    << " "
-                    << TestPartResultTypeToString(test_part_result.type())
-                    << test_part_result.message())
-      .GetString();
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
 }
 
 // Prints a TestPartResult.
-static void PrintTestPartResult(const TestPartResult &test_part_result) {
-  const std::string &result = PrintTestPartResultToString(test_part_result);
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result =
+      PrintTestPartResultToString(test_part_result);
   printf("%s\n", result.c_str());
   fflush(stdout);
   // If the test program runs in Visual Studio or a debugger, the
@@ -2999,16 +3164,19 @@ static void PrintTestPartResult(const TestPartResult &test_part_result) {
 }
 
 // class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
-    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
 
 // Returns the character attribute for the given color.
 static WORD GetColorAttribute(GTestColor color) {
   switch (color) {
-    case COLOR_RED: return FOREGROUND_RED;
-    case COLOR_GREEN: return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default: return 0;
+    case GTestColor::kRed:
+      return FOREGROUND_RED;
+    case GTestColor::kGreen:
+      return FOREGROUND_GREEN;
+    case GTestColor::kYellow:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
   }
 }
 
@@ -3045,14 +3213,18 @@ static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
 
 #else
 
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// Returns the ANSI color code for the given color. GTestColor::kDefault is
 // an invalid input.
-static const char *GetAnsiColorCode(GTestColor color) {
+static const char* GetAnsiColorCode(GTestColor color) {
   switch (color) {
-    case COLOR_RED: return "1";
-    case COLOR_GREEN: return "2";
-    case COLOR_YELLOW: return "3";
-    default: return nullptr;
+    case GTestColor::kRed:
+      return "1";
+    case GTestColor::kGreen:
+      return "2";
+    case GTestColor::kYellow:
+      return "3";
+    default:
+      return nullptr;
   }
 }
 
@@ -3060,7 +3232,7 @@ static const char *GetAnsiColorCode(GTestColor color) {
 
 // Returns true if and only if Google Test should use colors in the output.
 bool ShouldUseColor(bool stdout_is_tty) {
-  const char *const gtest_color = GTEST_FLAG(color).c_str();
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
 
   if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
 #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
@@ -3069,7 +3241,7 @@ bool ShouldUseColor(bool stdout_is_tty) {
     return stdout_is_tty;
 #else
     // On non-Windows platforms, we rely on the TERM variable.
-    const char *const term = posix::GetEnv("TERM");
+    const char* const term = posix::GetEnv("TERM");
     const bool term_supports_color =
         String::CStringEquals(term, "xterm") ||
         String::CStringEquals(term, "xterm-color") ||
@@ -3087,9 +3259,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
   }
 
   return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-         String::CStringEquals(gtest_color, "1");
+      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+      String::CStringEquals(gtest_color, "1");
   // We take "yes", "true", "t", and "1" as meaning "yes".  If the
   // value is neither one of these nor "auto", we treat it as "no" to
   // be conservative.
@@ -3099,7 +3271,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
 // cannot simply emit special characters and have the terminal change colors.
 // This routine must actually emit the characters rather than return a string
 // that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+
+GTEST_ATTRIBUTE_PRINTF_(2, 3)
+static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
   va_list args;
   va_start(args, fmt);
 
@@ -3109,7 +3283,7 @@ void ColoredPrintf(GTestColor color, const char *fmt, ...) {
 #else
   static const bool in_color_mode =
       ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
 #endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
 
   if (!use_color) {
@@ -3118,8 +3292,8 @@ void ColoredPrintf(GTestColor color, const char *fmt, ...) {
     return;
   }
 
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
-    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
   const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 
   // Gets the current text color.
@@ -3152,9 +3326,9 @@ void ColoredPrintf(GTestColor color, const char *fmt, ...) {
 static const char kTypeParamLabel[] = "TypeParam";
 static const char kValueParamLabel[] = "GetParam()";
 
-static void PrintFullTestCommentIfPresent(const TestInfo &test_info) {
-  const char *const type_param = test_info.type_param();
-  const char *const value_param = test_info.value_param();
+static void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
 
   if (type_param != nullptr || value_param != nullptr) {
     printf(", where ");
@@ -3174,70 +3348,71 @@ static void PrintFullTestCommentIfPresent(const TestInfo &test_info) {
 class PrettyUnitTestResultPrinter : public TestEventListener {
  public:
   PrettyUnitTestResultPrinter() {}
-  static void PrintTestName(const char *test_suite, const char *test) {
+  static void PrintTestName(const char* test_suite, const char* test) {
     printf("%s.%s", test_suite, test);
   }
 
   // The following methods override what's in the TestEventListener class.
-  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
-  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
-  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
-  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseStart(const TestCase &test_case) override;
+  void OnTestCaseStart(const TestCase& test_case) override;
 #else
-  void OnTestSuiteStart(const TestSuite &test_suite) override;
+  void OnTestSuiteStart(const TestSuite& test_suite) override;
 #endif  // OnTestCaseStart
 
-  void OnTestStart(const TestInfo &test_info) override;
+  void OnTestStart(const TestInfo& test_info) override;
 
-  void OnTestPartResult(const TestPartResult &result) override;
-  void OnTestEnd(const TestInfo &test_info) override;
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseEnd(const TestCase &test_case) override;
+  void OnTestCaseEnd(const TestCase& test_case) override;
 #else
-  void OnTestSuiteEnd(const TestSuite &test_suite) override;
+  void OnTestSuiteEnd(const TestSuite& test_suite) override;
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
-  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
-  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
-  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
 
  private:
-  static void PrintFailedTests(const UnitTest &unit_test);
-  static void PrintFailedTestSuites(const UnitTest &unit_test);
-  static void PrintSkippedTests(const UnitTest &unit_test);
+  static void PrintFailedTests(const UnitTest& unit_test);
+  static void PrintFailedTestSuites(const UnitTest& unit_test);
+  static void PrintSkippedTests(const UnitTest& unit_test);
 };
 
-// Fired before each iteration of tests starts.
+  // Fired before each iteration of tests starts.
 void PrettyUnitTestResultPrinter::OnTestIterationStart(
-    const UnitTest &unit_test, int iteration) {
+    const UnitTest& unit_test, int iteration) {
   if (GTEST_FLAG(repeat) != 1)
     printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
 
-  const char *const filter = GTEST_FLAG(filter).c_str();
+  const char* const filter = GTEST_FLAG(filter).c_str();
 
   // Prints the filter if it's not *.  This reminds the user that some
   // tests may be skipped.
   if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW, "Note: %s filter = %s\n", GTEST_NAME_, filter);
+    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
+                  filter);
   }
 
   if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
     const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n",
+    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
                   static_cast<int>(shard_index) + 1,
                   internal::posix::GetEnv(kTestTotalShards));
   }
 
   if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
+    ColoredPrintf(GTestColor::kYellow,
                   "Note: Randomizing tests' orders with a seed of %d .\n",
                   unit_test.random_seed());
   }
 
-  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("Running %s from %s.\n",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
@@ -3245,17 +3420,17 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
 }
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
-    const UnitTest & /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment set-up.\n");
   fflush(stdout);
 }
 
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase &test_case) {
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_case.name());
   if (test_case.type_param() == nullptr) {
     printf("\n");
@@ -3266,10 +3441,10 @@ void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase &test_case) {
 }
 #else
 void PrettyUnitTestResultPrinter::OnTestSuiteStart(
-    const TestSuite &test_suite) {
+    const TestSuite& test_suite) {
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_suite.name());
   if (test_suite.type_param() == nullptr) {
     printf("\n");
@@ -3280,8 +3455,8 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart(
 }
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo &test_info) {
-  ColoredPrintf(COLOR_GREEN, "[ RUN      ] ");
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kGreen, "[ RUN      ] ");
   PrintTestName(test_info.test_suite_name(), test_info.name());
   printf("\n");
   fflush(stdout);
@@ -3289,10 +3464,11 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo &test_info) {
 
 // Called after an assertion failure.
 void PrettyUnitTestResultPrinter::OnTestPartResult(
-    const TestPartResult &result) {
+    const TestPartResult& result) {
   switch (result.type()) {
     // If the test part succeeded, we don't need to do anything.
-    case TestPartResult::kSuccess: return;
+    case TestPartResult::kSuccess:
+      return;
     default:
       // Print failure message from the assertion
       // (e.g. expected this and got that).
@@ -3301,21 +3477,21 @@ void PrettyUnitTestResultPrinter::OnTestPartResult(
   }
 }
 
-void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo &test_info) {
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
   if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+    ColoredPrintf(GTestColor::kGreen, "[       OK ] ");
   } else if (test_info.result()->Skipped()) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
   } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
   }
   PrintTestName(test_info.test_suite_name(), test_info.name());
-  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
+  if (test_info.result()->Failed())
+    PrintFullTestCommentIfPresent(test_info);
 
   if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n",
-           internal::StreamableToString(test_info.result()->elapsed_time())
-               .c_str());
+    printf(" (%s ms)\n", internal::StreamableToString(
+           test_info.result()->elapsed_time()).c_str());
   } else {
     printf("\n");
   }
@@ -3323,23 +3499,23 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo &test_info) {
 }
 
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase &test_case) {
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
   if (!GTEST_FLAG(print_time)) return;
 
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
          internal::StreamableToString(test_case.elapsed_time()).c_str());
   fflush(stdout);
 }
 #else
-void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite &test_suite) {
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
   if (!GTEST_FLAG(print_time)) return;
 
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
          internal::StreamableToString(test_suite.elapsed_time()).c_str());
   fflush(stdout);
@@ -3347,29 +3523,29 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite &test_suite) {
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
-    const UnitTest & /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment tear-down\n");
   fflush(stdout);
 }
 
 // Internal helper for printing the list of failed tests.
-void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest &unit_test) {
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
   const int failed_test_count = unit_test.failed_test_count();
-  ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
   printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
 
   for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
     if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
       continue;
     }
     for (int j = 0; j < test_suite.total_test_count(); ++j) {
-      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      const TestInfo& test_info = *test_suite.GetTestInfo(j);
       if (!test_info.should_run() || !test_info.result()->Failed()) {
         continue;
       }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       PrintFullTestCommentIfPresent(test_info);
       printf("\n");
@@ -3382,15 +3558,15 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest &unit_test) {
 // Internal helper for printing the list of test suite failures not covered by
 // PrintFailedTests.
 void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
-    const UnitTest &unit_test) {
+    const UnitTest& unit_test) {
   int suite_failure_count = 0;
   for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
     if (!test_suite.should_run()) {
       continue;
     }
     if (test_suite.ad_hoc_test_result().Failed()) {
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
       printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
       ++suite_failure_count;
     }
@@ -3402,32 +3578,32 @@ void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
 }
 
 // Internal helper for printing the list of skipped tests.
-void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest &unit_test) {
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
   const int skipped_test_count = unit_test.skipped_test_count();
   if (skipped_test_count == 0) {
     return;
   }
 
   for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
     if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
       continue;
     }
     for (int j = 0; j < test_suite.total_test_count(); ++j) {
-      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      const TestInfo& test_info = *test_suite.GetTestInfo(j);
       if (!test_info.should_run() || !test_info.result()->Skipped()) {
         continue;
       }
-      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       printf("\n");
     }
   }
 }
 
-void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
                                                      int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("%s from %s ran.",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
@@ -3436,12 +3612,12 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
            internal::StreamableToString(unit_test.elapsed_time()).c_str());
   }
   printf("\n");
-  ColoredPrintf(COLOR_GREEN, "[  PASSED  ] ");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
   printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
 
   const int skipped_test_count = unit_test.skipped_test_count();
   if (skipped_test_count > 0) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
     printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
     PrintSkippedTests(unit_test);
   }
@@ -3456,8 +3632,8 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
     if (unit_test.Passed()) {
       printf("\n");  // Add a spacer if no FAILURE banner is displayed.
     }
-    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n", num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
   }
   // Ensure that Google Test output is printed before, e.g., heapchecker output.
   fflush(stdout);
@@ -3465,6 +3641,110 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
 
 // End PrettyUnitTestResultPrinter
 
+// This class implements the TestEventListener interface.
+//
+// Class BriefUnitTestResultPrinter is copyable.
+class BriefUnitTestResultPrinter : public TestEventListener {
+ public:
+  BriefUnitTestResultPrinter() {}
+  static void PrintTestName(const char* test_suite, const char* test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// Called after an assertion failure.
+void BriefUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess:
+      return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Failed()) {
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+    PrintTestName(test_info.test_suite_name(), test_info.name());
+    PrintFullTestCommentIfPresent(test_info);
+
+    if (GTEST_FLAG(print_time)) {
+      printf(" (%s ms)\n",
+             internal::StreamableToString(test_info.result()->elapsed_time())
+                 .c_str());
+    } else {
+      printf("\n");
+    }
+    fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                    int /*iteration*/) {
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+    printf("%s.\n", FormatTestCount(skipped_test_count).c_str());
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End BriefUnitTestResultPrinter
+
 // class TestEventRepeater
 //
 // This class forwards events to other event listeners.
@@ -3473,41 +3753,41 @@ class TestEventRepeater : public TestEventListener {
   TestEventRepeater() : forwarding_enabled_(true) {}
   ~TestEventRepeater() override;
   void Append(TestEventListener *listener);
-  TestEventListener *Release(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
 
   // Controls whether events will be forwarded to listeners_. Set to false
   // in death test child processes.
   bool forwarding_enabled() const { return forwarding_enabled_; }
   void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
 
-  void OnTestProgramStart(const UnitTest &unit_test) override;
-  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
-  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
-  void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) override;
+  void OnTestProgramStart(const UnitTest& unit_test) override;
+  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override;
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseStart(const TestSuite &parameter) override;
+  void OnTestCaseStart(const TestSuite& parameter) override;
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestSuiteStart(const TestSuite &parameter) override;
-  void OnTestStart(const TestInfo &test_info) override;
-  void OnTestPartResult(const TestPartResult &result) override;
-  void OnTestEnd(const TestInfo &test_info) override;
+  void OnTestSuiteStart(const TestSuite& parameter) override;
+  void OnTestStart(const TestInfo& test_info) override;
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseEnd(const TestCase &parameter) override;
+  void OnTestCaseEnd(const TestCase& parameter) override;
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestSuiteEnd(const TestSuite &parameter) override;
-  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
-  void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) override;
-  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
-  void OnTestProgramEnd(const UnitTest &unit_test) override;
+  void OnTestSuiteEnd(const TestSuite& parameter) override;
+  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override;
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& unit_test) override;
 
  private:
   // Controls whether events will be forwarded to listeners_. Set to false
   // in death test child processes.
   bool forwarding_enabled_;
   // The list of listeners that receive events.
-  std::vector<TestEventListener *> listeners_;
+  std::vector<TestEventListener*> listeners_;
 
   GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
 };
@@ -3520,7 +3800,7 @@ void TestEventRepeater::Append(TestEventListener *listener) {
   listeners_.push_back(listener);
 }
 
-TestEventListener *TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
   for (size_t i = 0; i < listeners_.size(); ++i) {
     if (listeners_[i] == listener) {
       listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3533,18 +3813,18 @@ TestEventListener *TestEventRepeater::Release(TestEventListener *listener) {
 
 // Since most methods are very similar, use macros to reduce boilerplate.
 // This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type)              \
-  void TestEventRepeater::Name(const Type &parameter) { \
-    if (forwarding_enabled_) {                          \
-      for (size_t i = 0; i < listeners_.size(); i++) {  \
-        listeners_[i]->Name(parameter);                 \
-      }                                                 \
-    }                                                   \
-  }
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
 // This defines a member that forwards the call to all listeners in reverse
 // order.
 #define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
-  void TestEventRepeater::Name(const Type &parameter) { \
+  void TestEventRepeater::Name(const Type& parameter) { \
     if (forwarding_enabled_) {                          \
       for (size_t i = listeners_.size(); i != 0; i--) { \
         listeners_[i - 1]->Name(parameter);             \
@@ -3575,7 +3855,7 @@ GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
 #undef GTEST_REPEATER_METHOD_
 #undef GTEST_REVERSE_REPEATER_METHOD_
 
-void TestEventRepeater::OnTestIterationStart(const UnitTest &unit_test,
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
                                              int iteration) {
   if (forwarding_enabled_) {
     for (size_t i = 0; i < listeners_.size(); i++) {
@@ -3584,7 +3864,7 @@ void TestEventRepeater::OnTestIterationStart(const UnitTest &unit_test,
   }
 }
 
-void TestEventRepeater::OnTestIterationEnd(const UnitTest &unit_test,
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
                                            int iteration) {
   if (forwarding_enabled_) {
     for (size_t i = listeners_.size(); i > 0; i--) {
@@ -3598,14 +3878,14 @@ void TestEventRepeater::OnTestIterationEnd(const UnitTest &unit_test,
 // This class generates an XML output file.
 class XmlUnitTestResultPrinter : public EmptyTestEventListener {
  public:
-  explicit XmlUnitTestResultPrinter(const char *output_file);
+  explicit XmlUnitTestResultPrinter(const char* output_file);
 
-  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
-  void ListTestsMatchingFilter(const std::vector<TestSuite *> &test_suites);
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void ListTestsMatchingFilter(const std::vector<TestSuite*>& test_suites);
 
   // Prints an XML summary of all unit tests.
-  static void PrintXmlTestsList(std::ostream *stream,
-                                const std::vector<TestSuite *> &test_suites);
+  static void PrintXmlTestsList(std::ostream* stream,
+                                const std::vector<TestSuite*>& test_suites);
 
  private:
   // Is c a whitespace character that is normalized to a space character
@@ -3623,54 +3903,64 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
   // is_attribute is true, the text is meant to appear as an attribute
   // value, and normalizable whitespace is preserved by replacing it
   // with character references.
-  static std::string EscapeXml(const std::string &str, bool is_attribute);
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
 
   // Returns the given string with all characters invalid in XML removed.
-  static std::string RemoveInvalidXmlCharacters(const std::string &str);
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
 
   // Convenience wrapper around EscapeXml when str is an attribute value.
-  static std::string EscapeXmlAttribute(const std::string &str) {
+  static std::string EscapeXmlAttribute(const std::string& str) {
     return EscapeXml(str, true);
   }
 
   // Convenience wrapper around EscapeXml when str is not an attribute value.
-  static std::string EscapeXmlText(const char *str) {
+  static std::string EscapeXmlText(const char* str) {
     return EscapeXml(str, false);
   }
 
   // Verifies that the given attribute belongs to the given element and
   // streams the attribute as XML.
-  static void OutputXmlAttribute(std::ostream *stream,
-                                 const std::string &element_name,
-                                 const std::string &name,
-                                 const std::string &value);
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
 
   // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-  static void OutputXmlCDataSection(::std::ostream *stream, const char *data);
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams a test suite XML stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputXmlTestSuiteForTestResult(::std::ostream* stream,
+                                              const TestResult& result);
+
+  // Streams an XML representation of a TestResult object.
+  static void OutputXmlTestResult(::std::ostream* stream,
+                                  const TestResult& result);
 
   // Streams an XML representation of a TestInfo object.
-  static void OutputXmlTestInfo(::std::ostream *stream,
-                                const char *test_suite_name,
-                                const TestInfo &test_info);
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_suite_name,
+                                const TestInfo& test_info);
 
   // Prints an XML representation of a TestSuite object
-  static void PrintXmlTestSuite(::std::ostream *stream,
-                                const TestSuite &test_suite);
+  static void PrintXmlTestSuite(::std::ostream* stream,
+                                const TestSuite& test_suite);
 
   // Prints an XML summary of unit_test to output stream out.
-  static void PrintXmlUnitTest(::std::ostream *stream,
-                               const UnitTest &unit_test);
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
 
   // Produces a string representing the test properties in a result as space
   // delimited XML attributes based on the property key="value" pairs.
   // When the std::string is not empty, it includes a space at the beginning,
   // to delimit this attribute from prior attributes.
-  static std::string TestPropertiesAsXmlAttributes(const TestResult &result);
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
 
   // Streams an XML representation of the test properties of a TestResult
   // object.
-  static void OutputXmlTestProperties(std::ostream *stream,
-                                      const TestResult &result);
+  static void OutputXmlTestProperties(std::ostream* stream,
+                                      const TestResult& result);
 
   // The output file.
   const std::string output_file_;
@@ -3679,7 +3969,7 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
 };
 
 // Creates a new XmlUnitTestResultPrinter.
-XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char *output_file)
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
     : output_file_(output_file) {
   if (output_file_.empty()) {
     GTEST_LOG_(FATAL) << "XML output file may not be null";
@@ -3687,9 +3977,9 @@ XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char *output_file)
 }
 
 // Called after the unit test ends.
-void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
                                                   int /*iteration*/) {
-  FILE *xmlout = OpenFileForWriting(output_file_);
+  FILE* xmlout = OpenFileForWriting(output_file_);
   std::stringstream stream;
   PrintXmlUnitTest(&stream, unit_test);
   fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
@@ -3697,8 +3987,8 @@ void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
 }
 
 void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
-    const std::vector<TestSuite *> &test_suites) {
-  FILE *xmlout = OpenFileForWriting(output_file_);
+    const std::vector<TestSuite*>& test_suites) {
+  FILE* xmlout = OpenFileForWriting(output_file_);
   std::stringstream stream;
   PrintXmlTestsList(&stream, test_suites);
   fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
@@ -3715,16 +4005,22 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
 // module will consist of ordinary English text.
 // If this module is ever modified to produce version 1.1 XML output,
 // most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(const std::string &str,
-                                                bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
   Message m;
 
   for (size_t i = 0; i < str.size(); ++i) {
     const char ch = str[i];
     switch (ch) {
-      case '<': m << "&lt;"; break;
-      case '>': m << "&gt;"; break;
-      case '&': m << "&amp;"; break;
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
       case '\'':
         if (is_attribute)
           m << "&apos;";
@@ -3756,11 +4052,12 @@ std::string XmlUnitTestResultPrinter::EscapeXml(const std::string &str,
 // Currently invalid characters are dropped from the string. An
 // alternative is to replace them with certain characters such as . or ?.
 std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string &str) {
+    const std::string& str) {
   std::string output;
   output.reserve(str.size());
   for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it)) output.push_back(*it);
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
 
   return output;
 }
@@ -3789,16 +4086,20 @@ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
   return ss.str();
 }
 
-static bool PortableLocaltime(time_t seconds, struct tm *out) {
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
 #if defined(_MSC_VER)
   return localtime_s(out, &seconds) == 0;
 #elif defined(__MINGW32__) || defined(__MINGW64__)
   // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
   // Windows' localtime(), which has a thread-local tm buffer.
-  struct tm *tm_ptr = localtime(&seconds);  // NOLINT
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
   if (tm_ptr == nullptr) return false;
   *out = *tm_ptr;
   return true;
+#elif defined(__STDC_LIB_EXT1__)
+  // Uses localtime_s when available as localtime_r is only available from
+  // C23 standard.
+  return localtime_s(&seconds, out) != nullptr;
 #else
   return localtime_r(&seconds, out) != nullptr;
 #endif
@@ -3810,25 +4111,26 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
   struct tm time_struct;
   if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
     return "";
-  // YYYY-MM-DDThh:mm:ss
+  // YYYY-MM-DDThh:mm:ss.sss
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
-         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-         String::FormatIntWidth2(time_struct.tm_min) + ":" +
-         String::FormatIntWidth2(time_struct.tm_sec);
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec) + "." +
+      String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
 }
 
 // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream *stream,
-                                                     const char *data) {
-  const char *segment = data;
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
   *stream << "<![CDATA[";
   for (;;) {
-    const char *const next_segment = strstr(segment, "]]>");
+    const char* const next_segment = strstr(segment, "]]>");
     if (next_segment != nullptr) {
-      stream->write(segment,
-                    static_cast<std::streamsize>(next_segment - segment));
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
       *stream << "]]>]]&gt;<![CDATA[";
       segment = next_segment + strlen("]]>");
     } else {
@@ -3840,24 +4142,63 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream *stream,
 }
 
 void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream *stream, const std::string &element_name,
-    const std::string &name, const std::string &value) {
-  const std::vector<std::string> &allowed_names =
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-               allowed_names.end())
+                   allowed_names.end())
       << "Attribute " << name << " is not allowed for element <" << element_name
       << ">.";
 
   *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
 }
 
+// Streams a test suite XML stanza containing the given test result.
+void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a minimal test suite with one test.
+  *stream << "  <testsuite";
+  OutputXmlAttribute(stream, "testsuite", "name", "NonTestSuiteFailure");
+  OutputXmlAttribute(stream, "testsuite", "tests", "1");
+  OutputXmlAttribute(stream, "testsuite", "failures", "1");
+  OutputXmlAttribute(stream, "testsuite", "disabled", "0");
+  OutputXmlAttribute(stream, "testsuite", "skipped", "0");
+  OutputXmlAttribute(stream, "testsuite", "errors", "0");
+  OutputXmlAttribute(stream, "testsuite", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testsuite", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  *stream << ">";
+
+  // Output the boilerplate for a minimal test case with a single test.
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, "testcase", "name", "");
+  OutputXmlAttribute(stream, "testcase", "status", "run");
+  OutputXmlAttribute(stream, "testcase", "result", "completed");
+  OutputXmlAttribute(stream, "testcase", "classname", "");
+  OutputXmlAttribute(stream, "testcase", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testcase", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+
+  // Output the actual test result.
+  OutputXmlTestResult(stream, result);
+
+  // Complete the test suite.
+  *stream << "  </testsuite>\n";
+}
+
 // Prints an XML representation of a TestInfo object.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
-                                                 const char *test_suite_name,
-                                                 const TestInfo &test_info) {
-  const TestResult &result = *test_info.result();
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_suite_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
   const std::string kTestsuite = "testcase";
 
   if (test_info.is_in_another_shard()) {
@@ -3896,11 +4237,17 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
       FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
   OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
 
+  OutputXmlTestResult(stream, result);
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
+                                                   const TestResult& result) {
   int failures = 0;
+  int skips = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult &part = result.GetTestPartResult(i);
+    const TestPartResult& part = result.GetTestPartResult(i);
     if (part.failed()) {
-      if (++failures == 1) {
+      if (++failures == 1 && skips == 0) {
         *stream << ">\n";
       }
       const std::string location =
@@ -3908,17 +4255,31 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
                                                           part.line_number());
       const std::string summary = location + "\n" + part.summary();
       *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str()) << "\" type=\"\">";
+              << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
       const std::string detail = location + "\n" + part.message();
       OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
       *stream << "</failure>\n";
+    } else if (part.skipped()) {
+      if (++skips == 1 && failures == 0) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <skipped message=\""
+              << EscapeXmlAttribute(summary.c_str()) << "\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</skipped>\n";
     }
   }
 
-  if (failures == 0 && result.test_property_count() == 0) {
+  if (failures == 0 && skips == 0 && result.test_property_count() == 0) {
     *stream << " />\n";
   } else {
-    if (failures == 0) {
+    if (failures == 0 && skips == 0) {
       *stream << ">\n";
     }
     OutputXmlTestProperties(stream, result);
@@ -3927,8 +4288,8 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
 }
 
 // Prints an XML representation of a TestSuite object
-void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
-                                                 const TestSuite &test_suite) {
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
+                                                 const TestSuite& test_suite) {
   const std::string kTestsuite = "testsuite";
   *stream << "  <" << kTestsuite;
   OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
@@ -3940,7 +4301,11 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
     OutputXmlAttribute(
         stream, kTestsuite, "disabled",
         StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "skipped",
+                       StreamableToString(test_suite.skipped_test_count()));
+
     OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+
     OutputXmlAttribute(stream, kTestsuite, "time",
                        FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
     OutputXmlAttribute(
@@ -3957,8 +4322,8 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
 }
 
 // Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream *stream,
-                                                const UnitTest &unit_test) {
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
   const std::string kTestsuites = "testsuites";
 
   *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
@@ -3991,11 +4356,18 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream *stream,
     if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
       PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
   }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
   *stream << "</" << kTestsuites << ">\n";
 }
 
 void XmlUnitTestResultPrinter::PrintXmlTestsList(
-    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
   const std::string kTestsuites = "testsuites";
 
   *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
@@ -4019,18 +4391,18 @@ void XmlUnitTestResultPrinter::PrintXmlTestsList(
 // Produces a string representing the test properties in a result as space
 // delimited XML attributes based on the property key="value" pairs.
 std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
-    const TestResult &result) {
+    const TestResult& result) {
   Message attributes;
   for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty &property = result.GetTestProperty(i);
+    const TestProperty& property = result.GetTestProperty(i);
     attributes << " " << property.key() << "="
-               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
   }
   return attributes.GetString();
 }
 
 void XmlUnitTestResultPrinter::OutputXmlTestProperties(
-    std::ostream *stream, const TestResult &result) {
+    std::ostream* stream, const TestResult& result) {
   const std::string kProperties = "properties";
   const std::string kProperty = "property";
 
@@ -4040,7 +4412,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
 
   *stream << "<" << kProperties << ">\n";
   for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty &property = result.GetTestProperty(i);
+    const TestProperty& property = result.GetTestProperty(i);
     *stream << "<" << kProperty;
     *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
     *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
@@ -4054,46 +4426,60 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
 // This class generates an JSON output file.
 class JsonUnitTestResultPrinter : public EmptyTestEventListener {
  public:
-  explicit JsonUnitTestResultPrinter(const char *output_file);
+  explicit JsonUnitTestResultPrinter(const char* output_file);
 
-  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
 
   // Prints an JSON summary of all unit tests.
-  static void PrintJsonTestList(::std::ostream *stream,
-                                const std::vector<TestSuite *> &test_suites);
+  static void PrintJsonTestList(::std::ostream* stream,
+                                const std::vector<TestSuite*>& test_suites);
 
  private:
   // Returns an JSON-escaped copy of the input string str.
-  static std::string EscapeJson(const std::string &str);
+  static std::string EscapeJson(const std::string& str);
 
   //// Verifies that the given attribute belongs to the given element and
   //// streams the attribute as JSON.
-  static void OutputJsonKey(std::ostream *stream,
-                            const std::string &element_name,
-                            const std::string &name, const std::string &value,
-                            const std::string &indent, bool comma = true);
-  static void OutputJsonKey(std::ostream *stream,
-                            const std::string &element_name,
-                            const std::string &name, int value,
-                            const std::string &indent, bool comma = true);
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name,
+                            const std::string& value,
+                            const std::string& indent,
+                            bool comma = true);
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name,
+                            int value,
+                            const std::string& indent,
+                            bool comma = true);
+
+  // Streams a test suite JSON stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputJsonTestSuiteForTestResult(::std::ostream* stream,
+                                               const TestResult& result);
+
+  // Streams a JSON representation of a TestResult object.
+  static void OutputJsonTestResult(::std::ostream* stream,
+                                   const TestResult& result);
 
   // Streams a JSON representation of a TestInfo object.
-  static void OutputJsonTestInfo(::std::ostream *stream,
-                                 const char *test_suite_name,
-                                 const TestInfo &test_info);
+  static void OutputJsonTestInfo(::std::ostream* stream,
+                                 const char* test_suite_name,
+                                 const TestInfo& test_info);
 
   // Prints a JSON representation of a TestSuite object
-  static void PrintJsonTestSuite(::std::ostream *stream,
-                                 const TestSuite &test_suite);
+  static void PrintJsonTestSuite(::std::ostream* stream,
+                                 const TestSuite& test_suite);
 
   // Prints a JSON summary of unit_test to output stream out.
-  static void PrintJsonUnitTest(::std::ostream *stream,
-                                const UnitTest &unit_test);
+  static void PrintJsonUnitTest(::std::ostream* stream,
+                                const UnitTest& unit_test);
 
   // Produces a string representing the test properties in a result as
   // a JSON dictionary.
-  static std::string TestPropertiesAsJson(const TestResult &result,
-                                          const std::string &indent);
+  static std::string TestPropertiesAsJson(const TestResult& result,
+                                          const std::string& indent);
 
   // The output file.
   const std::string output_file_;
@@ -4102,16 +4488,16 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
 };
 
 // Creates a new JsonUnitTestResultPrinter.
-JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char *output_file)
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
     : output_file_(output_file) {
   if (output_file_.empty()) {
     GTEST_LOG_(FATAL) << "JSON output file may not be null";
   }
 }
 
-void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
-                                                   int /*iteration*/) {
-  FILE *jsonout = OpenFileForWriting(output_file_);
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* jsonout = OpenFileForWriting(output_file_);
   std::stringstream stream;
   PrintJsonUnitTest(&stream, unit_test);
   fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
@@ -4119,7 +4505,7 @@ void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
 }
 
 // Returns an JSON-escaped copy of the input string str.
-std::string JsonUnitTestResultPrinter::EscapeJson(const std::string &str) {
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) {
   Message m;
 
   for (size_t i = 0; i < str.size(); ++i) {
@@ -4127,12 +4513,24 @@ std::string JsonUnitTestResultPrinter::EscapeJson(const std::string &str) {
     switch (ch) {
       case '\\':
       case '"':
-      case '/': m << '\\' << ch; break;
-      case '\b': m << "\\b"; break;
-      case '\t': m << "\\t"; break;
-      case '\n': m << "\\n"; break;
-      case '\f': m << "\\f"; break;
-      case '\r': m << "\\r"; break;
+      case '/':
+        m << '\\' << ch;
+        break;
+      case '\b':
+        m << "\\b";
+        break;
+      case '\t':
+        m << "\\t";
+        break;
+      case '\n':
+        m << "\\n";
+        break;
+      case '\f':
+        m << "\\f";
+        break;
+      case '\r':
+        m << "\\r";
+        break;
       default:
         if (ch < ' ') {
           m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
@@ -4164,55 +4562,104 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
     return "";
   // YYYY-MM-DDThh:mm:ss
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
-         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-         String::FormatIntWidth2(time_struct.tm_min) + ":" +
-         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec) + "Z";
 }
 
 static inline std::string Indent(size_t width) {
   return std::string(width, ' ');
 }
 
-void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream *stream,
-                                              const std::string &element_name,
-                                              const std::string &name,
-                                              const std::string &value,
-                                              const std::string &indent,
-                                              bool comma) {
-  const std::vector<std::string> &allowed_names =
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value,
+    const std::string& indent,
+    bool comma) {
+  const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-               allowed_names.end())
+                   allowed_names.end())
       << "Key \"" << name << "\" is not allowed for value \"" << element_name
       << "\".";
 
   *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
-  if (comma) *stream << ",\n";
+  if (comma)
+    *stream << ",\n";
 }
 
 void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream *stream, const std::string &element_name,
-    const std::string &name, int value, const std::string &indent, bool comma) {
-  const std::vector<std::string> &allowed_names =
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    int value,
+    const std::string& indent,
+    bool comma) {
+  const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-               allowed_names.end())
+                   allowed_names.end())
       << "Key \"" << name << "\" is not allowed for value \"" << element_name
       << "\".";
 
   *stream << indent << "\"" << name << "\": " << StreamableToString(value);
-  if (comma) *stream << ",\n";
+  if (comma)
+    *stream << ",\n";
+}
+
+// Streams a test suite JSON stanza containing the given test result.
+void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a new test suite.
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
+  OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
+    OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "time",
+                  FormatTimeInMillisAsDuration(result.elapsed_time()),
+                  Indent(6));
+    OutputJsonKey(stream, "testsuite", "timestamp",
+                  FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                  Indent(6));
+  }
+  *stream << Indent(6) << "\"testsuite\": [\n";
+
+  // Output the boilerplate for a new test case.
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, "testcase", "name", "", Indent(10));
+  OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10));
+  OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10));
+  OutputJsonKey(stream, "testcase", "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false);
+  *stream << TestPropertiesAsJson(result, Indent(10));
+
+  // Output the actual test result.
+  OutputJsonTestResult(stream, result);
+
+  // Finish the test suite.
+  *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}";
 }
 
 // Prints a JSON representation of a TestInfo object.
-void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
-                                                   const char *test_suite_name,
-                                                   const TestInfo &test_info) {
-  const TestResult &result = *test_info.result();
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
+                                                   const char* test_suite_name,
+                                                   const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
   const std::string kTestsuite = "testcase";
   const std::string kIndent = Indent(10);
 
@@ -4250,15 +4697,20 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
                 false);
   *stream << TestPropertiesAsJson(result, kIndent);
 
+  OutputJsonTestResult(stream, result);
+}
+
+void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
+                                                     const TestResult& result) {
+  const std::string kIndent = Indent(10);
+
   int failures = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult &part = result.GetTestPartResult(i);
+    const TestPartResult& part = result.GetTestPartResult(i);
     if (part.failed()) {
       *stream << ",\n";
       if (++failures == 1) {
-        *stream << kIndent << "\""
-                << "failures"
-                << "\": [\n";
+        *stream << kIndent << "\"" << "failures" << "\": [\n";
       }
       const std::string location =
           internal::FormatCompilerIndependentFileLocation(part.file_name(),
@@ -4271,13 +4723,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
     }
   }
 
-  if (failures > 0) *stream << "\n" << kIndent << "]";
+  if (failures > 0)
+    *stream << "\n" << kIndent << "]";
   *stream << "\n" << Indent(8) << "}";
 }
 
 // Prints an JSON representation of a TestSuite object
 void JsonUnitTestResultPrinter::PrintJsonTestSuite(
-    std::ostream *stream, const TestSuite &test_suite) {
+    std::ostream* stream, const TestSuite& test_suite) {
   const std::string kTestsuite = "testsuite";
   const std::string kIndent = Indent(6);
 
@@ -4319,8 +4772,8 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite(
 }
 
 // Prints a JSON summary of unit_test to output stream out.
-void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream *stream,
-                                                  const UnitTest &unit_test) {
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
+                                                  const UnitTest& unit_test) {
   const std::string kTestsuites = "testsuites";
   const std::string kIndent = Indent(2);
   *stream << "{\n";
@@ -4361,13 +4814,17 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream *stream,
     }
   }
 
-  *stream << "\n"
-          << kIndent << "]\n"
-          << "}\n";
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
+  *stream << "\n" << kIndent << "]\n" << "}\n";
 }
 
 void JsonUnitTestResultPrinter::PrintJsonTestList(
-    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
   const std::string kTestsuites = "testsuites";
   const std::string kIndent = Indent(2);
   *stream << "{\n";
@@ -4394,12 +4851,11 @@ void JsonUnitTestResultPrinter::PrintJsonTestList(
 // Produces a string representing the test properties in a result as
 // a JSON dictionary.
 std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
-    const TestResult &result, const std::string &indent) {
+    const TestResult& result, const std::string& indent) {
   Message attributes;
   for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty &property = result.GetTestProperty(i);
-    attributes << ",\n"
-               << indent << "\"" << property.key() << "\": "
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << ",\n" << indent << "\"" << property.key() << "\": "
                << "\"" << EscapeJson(property.value()) << "\"";
   }
   return attributes.GetString();
@@ -4414,7 +4870,7 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
 // example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
 // in both time and space -- important as the input str may contain an
 // arbitrarily long test failure message and stack trace.
-std::string StreamingListener::UrlEncode(const char *str) {
+std::string StreamingListener::UrlEncode(const char* str) {
   std::string result;
   result.reserve(strlen(str) + 1);
   for (char ch = *str; ch != '\0'; ch = *++str) {
@@ -4425,7 +4881,9 @@ std::string StreamingListener::UrlEncode(const char *str) {
       case '\n':
         result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
         break;
-      default: result.push_back(ch); break;
+      default:
+        result.push_back(ch);
+        break;
     }
   }
   return result;
@@ -4437,24 +4895,24 @@ void StreamingListener::SocketWriter::MakeConnection() {
 
   addrinfo hints;
   memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
   hints.ai_socktype = SOCK_STREAM;
-  addrinfo *servinfo = nullptr;
+  addrinfo* servinfo = nullptr;
 
   // Use the getaddrinfo() to get a linked list of IP addresses for
   // the given host name.
-  const int error_num =
-      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
   if (error_num != 0) {
     GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
                         << gai_strerror(error_num);
   }
 
   // Loop through all the results and connect to the first we can.
-  for (addrinfo *cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
        cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
-                     cur_addr->ai_protocol);
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
     if (sockfd_ != -1) {
       // Connect the client socket to the server socket.
       if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -4477,7 +4935,7 @@ void StreamingListener::SocketWriter::MakeConnection() {
 
 // class OsStackTraceGetter
 
-const char *const OsStackTraceGetterInterface::kElidedFramesMarker =
+const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
     "... " GTEST_NAME_ " internal frames ...";
 
 std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
@@ -4491,12 +4949,12 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
 
   max_depth = std::min(max_depth, kMaxStackTraceDepth);
 
-  std::vector<void *> raw_stack(max_depth);
+  std::vector<void*> raw_stack(max_depth);
   // Skips the frames requested by the caller, plus this function.
   const int raw_stack_size =
       absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
 
-  void *caller_frame = nullptr;
+  void* caller_frame = nullptr;
   {
     MutexLock lock(&mutex_);
     caller_frame = caller_frame_;
@@ -4511,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
     }
 
     char tmp[1024];
-    const char *symbol = "(unknown)";
+    const char* symbol = "(unknown)";
     if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
       symbol = tmp;
     }
@@ -4523,7 +4981,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
 
   return result;
 
-#else   // !GTEST_HAS_ABSL
+#else  // !GTEST_HAS_ABSL
   static_cast<void>(max_depth);
   static_cast<void>(skip_count);
   return "";
@@ -4532,7 +4990,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
 
 void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
 #if GTEST_HAS_ABSL
-  void *caller_frame = nullptr;
+  void* caller_frame = nullptr;
   if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
     caller_frame = nullptr;
   }
@@ -4546,15 +5004,15 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
 // constructor and deletes the file in its destructor.
 class ScopedPrematureExitFile {
  public:
-  explicit ScopedPrematureExitFile(const char *premature_exit_filepath)
-      : premature_exit_filepath_(
-            premature_exit_filepath ? premature_exit_filepath : "") {
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath ?
+                                 premature_exit_filepath : "") {
     // If a path to the premature-exit file is specified...
     if (!premature_exit_filepath_.empty()) {
       // create the file with a single "0" character in it.  I/O
       // errors are ignored as there's nothing better we can do and we
       // don't want to fail the test because of this.
-      FILE *pfile = posix::FOpen(premature_exit_filepath, "w");
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
       fwrite("0", 1, 1, pfile);
       fclose(pfile);
     }
@@ -4585,7 +5043,8 @@ class ScopedPrematureExitFile {
 
 TestEventListeners::TestEventListeners()
     : repeater_(new internal::TestEventRepeater()),
-      default_result_printer_(nullptr), default_xml_generator_(nullptr) {}
+      default_result_printer_(nullptr),
+      default_xml_generator_(nullptr) {}
 
 TestEventListeners::~TestEventListeners() { delete repeater_; }
 
@@ -4593,14 +5052,14 @@ TestEventListeners::~TestEventListeners() { delete repeater_; }
 // output.  Can be removed from the listeners list to shut down default
 // console output.  Note that removing this object from the listener list
 // with Release transfers its ownership to the user.
-void TestEventListeners::Append(TestEventListener *listener) {
+void TestEventListeners::Append(TestEventListener* listener) {
   repeater_->Append(listener);
 }
 
 // Removes the given event listener from the list and returns it.  It then
 // becomes the caller's responsibility to delete the listener. Returns
 // NULL if the listener is not found in the list.
-TestEventListener *TestEventListeners::Release(TestEventListener *listener) {
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
   if (listener == default_result_printer_)
     default_result_printer_ = nullptr;
   else if (listener == default_xml_generator_)
@@ -4610,14 +5069,14 @@ TestEventListener *TestEventListeners::Release(TestEventListener *listener) {
 
 // Returns repeater that broadcasts the TestEventListener events to all
 // subscribers.
-TestEventListener *TestEventListeners::repeater() { return repeater_; }
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
 
 // Sets the default_result_printer attribute to the provided listener.
 // The listener is also added to the listener list and previous
 // default_result_printer is removed from it and deleted. The listener can
 // also be NULL in which case it will not be added to the list. Does
 // nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultResultPrinter(TestEventListener *listener) {
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
   if (default_result_printer_ != listener) {
     // It is an error to pass this method a listener that is already in the
     // list.
@@ -4632,7 +5091,7 @@ void TestEventListeners::SetDefaultResultPrinter(TestEventListener *listener) {
 // default_xml_generator is removed from it and deleted. The listener can
 // also be NULL in which case it will not be added to the list. Does
 // nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultXmlGenerator(TestEventListener *listener) {
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
   if (default_xml_generator_ != listener) {
     // It is an error to pass this method a listener that is already in the
     // list.
@@ -4661,13 +5120,13 @@ void TestEventListeners::SuppressEventForwarding() {
 // We don't protect this under mutex_ as a user is not supposed to
 // call this before main() starts, from which point on the return
 // value will never change.
-UnitTest *UnitTest::GetInstance() {
+UnitTest* UnitTest::GetInstance() {
   // CodeGear C++Builder insists on a public destructor for the
   // default implementation.  Use this implementation to keep good OO
   // design with private destructor.
 
 #if defined(__BORLANDC__)
-  static UnitTest *const instance = new UnitTest;
+  static UnitTest* const instance = new UnitTest;
   return instance;
 #else
   static UnitTest instance;
@@ -4749,7 +5208,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
 // Gets the time of the test program start, in ms from the start of the
 // UNIX epoch.
 internal::TimeInMillis UnitTest::start_timestamp() const {
-  return impl()->start_timestamp();
+    return impl()->start_timestamp();
 }
 
 // Gets the elapsed time, in milliseconds.
@@ -4767,32 +5226,34 @@ bool UnitTest::Failed() const { return impl()->Failed(); }
 
 // Gets the i-th test suite among all the test suites. i can range from 0 to
 // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-const TestSuite *UnitTest::GetTestSuite(int i) const {
+const TestSuite* UnitTest::GetTestSuite(int i) const {
   return impl()->GetTestSuite(i);
 }
 
 //  Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-const TestCase *UnitTest::GetTestCase(int i) const {
+const TestCase* UnitTest::GetTestCase(int i) const {
   return impl()->GetTestCase(i);
 }
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 // Returns the TestResult containing information on test failures and
 // properties logged outside of individual test suites.
-const TestResult &UnitTest::ad_hoc_test_result() const {
+const TestResult& UnitTest::ad_hoc_test_result() const {
   return *impl()->ad_hoc_test_result();
 }
 
 // Gets the i-th test suite among all the test suites. i can range from 0 to
 // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-TestSuite *UnitTest::GetMutableTestSuite(int i) {
+TestSuite* UnitTest::GetMutableTestSuite(int i) {
   return impl()->GetMutableSuiteCase(i);
 }
 
 // Returns the list of event listeners that can be used to track events
 // inside Google Test.
-TestEventListeners &UnitTest::listeners() { return *impl()->listeners(); }
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
 
 // Registers and returns a global test environment.  When a test
 // program is run, all global test environments will be set-up in the
@@ -4804,7 +5265,7 @@ TestEventListeners &UnitTest::listeners() { return *impl()->listeners(); }
 //
 // We don't protect this under mutex_, as we only support calling it
 // from the main thread.
-Environment *UnitTest::AddEnvironment(Environment *env) {
+Environment* UnitTest::AddEnvironment(Environment* env) {
   if (env == nullptr) {
     return nullptr;
   }
@@ -4817,11 +5278,12 @@ Environment *UnitTest::AddEnvironment(Environment *env) {
 // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
 // this to report their results.  The user code should use the
 // assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
-                                 const char *file_name, int line_number,
-                                 const std::string &message,
-                                 const std::string &os_stack_trace)
-    GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
   Message msg;
   msg << message;
 
@@ -4830,10 +5292,9 @@ void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
     msg << "\n" << GTEST_NAME_ << " trace:";
 
     for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
-      const internal::TraceInfo &trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n"
-          << internal::FormatFileLocation(trace.file, trace.line) << " "
-          << trace.message;
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
     }
   }
 
@@ -4843,8 +5304,8 @@ void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
 
   const TestPartResult result = TestPartResult(
       result_type, file_name, line_number, msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
-      result);
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
 
   if (result_type != TestPartResult::kSuccess &&
       result_type != TestPartResult::kSkip) {
@@ -4868,7 +5329,7 @@ void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
       // Dereference nullptr through a volatile pointer to prevent the compiler
       // from removing. We use this rather than abort() or __builtin_trap() for
       // portability: some debuggers don't correctly trap abort().
-      *static_cast<volatile int *>(nullptr) = 1;
+      *static_cast<volatile int*>(nullptr) = 1;
 #endif  // GTEST_OS_WINDOWS
     } else if (GTEST_FLAG(throw_on_failure)) {
 #if GTEST_HAS_EXCEPTIONS
@@ -4887,8 +5348,8 @@ void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
 // from SetUpTestSuite or TearDownTestSuite, or to the global property set
 // when invoked elsewhere.  If the result already contains a property with
 // the same key, the value will be updated.
-void UnitTest::RecordProperty(const std::string &key,
-                              const std::string &value) {
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
   impl_->RecordProperty(TestProperty(key, value));
 }
 
@@ -4937,20 +5398,20 @@ int UnitTest::Run() {
   // process. In either case the user does not want to see pop-up dialogs
   // about crashes - they are expected.
   if (impl()->catch_exceptions() || in_death_test_child_process) {
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
     // SetErrorMode doesn't exist on CE.
     SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
                  SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-#endif  // !GTEST_OS_WINDOWS_MOBILE
+# endif  // !GTEST_OS_WINDOWS_MOBILE
 
-#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
     // Death test children can be terminated with _abort().  On Windows,
     // _abort() can show a dialog with a warning message.  This forces the
     // abort message to go to stderr instead.
     _set_error_mode(_OUT_TO_STDERR);
-#endif
+# endif
 
-#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
     // In the debug version, Visual Studio pops up a separate dialog
     // offering a choice to debug the aborted program. We need to suppress
     // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
@@ -4970,26 +5431,25 @@ int UnitTest::Run() {
                               _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
       (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
     }
-#endif
+# endif
   }
 #endif  // GTEST_OS_WINDOWS
 
   return internal::HandleExceptionsInMethodIfSupported(
-             impl(), &internal::UnitTestImpl::RunAllTests,
-             "auxiliary test code (environments or event listeners)")
-             ? 0
-             : 1;
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
 }
 
 // Returns the working directory when the first TEST() or TEST_F() was
 // executed.
-const char *UnitTest::original_working_dir() const {
+const char* UnitTest::original_working_dir() const {
   return impl_->original_working_dir_.c_str();
 }
 
 // Returns the TestSuite object for the test that's currently running,
 // or NULL if no test is running.
-const TestSuite *UnitTest::current_test_suite() const
+const TestSuite* UnitTest::current_test_suite() const
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   return impl_->current_test_suite();
@@ -4997,7 +5457,7 @@ const TestSuite *UnitTest::current_test_suite() const
 
 // Legacy API is still available but deprecated
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-const TestCase *UnitTest::current_test_case() const
+const TestCase* UnitTest::current_test_case() const
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   return impl_->current_test_suite();
@@ -5006,7 +5466,7 @@ const TestCase *UnitTest::current_test_case() const
 
 // Returns the TestInfo object for the test that's currently running,
 // or NULL if no test is running.
-const TestInfo *UnitTest::current_test_info() const
+const TestInfo* UnitTest::current_test_info() const
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   return impl_->current_test_info();
@@ -5017,34 +5477,39 @@ int UnitTest::random_seed() const { return impl_->random_seed(); }
 
 // Returns ParameterizedTestSuiteRegistry object used to keep track of
 // value-parameterized tests and instantiate and register them.
-internal::ParameterizedTestSuiteRegistry &
+internal::ParameterizedTestSuiteRegistry&
 UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
   return impl_->parameterized_test_registry();
 }
 
 // Creates an empty UnitTest.
-UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
 
 // Destructor of UnitTest.
-UnitTest::~UnitTest() { delete impl_; }
+UnitTest::~UnitTest() {
+  delete impl_;
+}
 
 // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
 // Google Test trace stack.
-void UnitTest::PushGTestTrace(const internal::TraceInfo &trace)
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
     GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   impl_->gtest_trace_stack().push_back(trace);
 }
 
 // Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   impl_->gtest_trace_stack().pop_back();
 }
 
 namespace internal {
 
-UnitTestImpl::UnitTestImpl(UnitTest *parent)
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
     : parent_(parent),
       GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
           default_global_test_part_result_reporter_(this),
@@ -5053,13 +5518,18 @@ UnitTestImpl::UnitTestImpl(UnitTest *parent)
           &default_global_test_part_result_reporter_),
       per_thread_test_part_result_reporter_(
           &default_per_thread_test_part_result_reporter_),
-      parameterized_test_registry_(), parameterized_tests_registered_(false),
-      last_death_test_suite_(-1), current_test_suite_(nullptr),
-      current_test_info_(nullptr), ad_hoc_test_result_(),
-      os_stack_trace_getter_(nullptr), post_flag_parse_init_performed_(false),
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+      last_death_test_suite_(-1),
+      current_test_suite_(nullptr),
+      current_test_info_(nullptr),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(nullptr),
+      post_flag_parse_init_performed_(false),
       random_seed_(0),  // Will be overridden by the flag before first use.
       random_(0),       // Will be reseeded before first use.
-      start_timestamp_(0), elapsed_time_(0),
+      start_timestamp_(0),
+      elapsed_time_(0),
 #if GTEST_HAS_DEATH_TEST
       death_test_factory_(new DefaultDeathTestFactory),
 #endif
@@ -5083,9 +5553,9 @@ UnitTestImpl::~UnitTestImpl() {
 // from SetUpTestSuite/TearDownTestSuite, or to the global property set
 // otherwise.  If the result already contains a property with the same key,
 // the value will be updated.
-void UnitTestImpl::RecordProperty(const TestProperty &test_property) {
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
   std::string xml_element;
-  TestResult *test_result;  // TestResult appropriate for property recording.
+  TestResult* test_result;  // TestResult appropriate for property recording.
 
   if (current_test_info_ != nullptr) {
     xml_element = "testcase";
@@ -5112,7 +5582,7 @@ void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
 // Initializes event listeners performing XML output as specified by
 // UnitTestOptions. Must not be called before InitGoogleTest.
 void UnitTestImpl::ConfigureXmlOutput() {
-  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
   if (output_format == "xml") {
     listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
         UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
@@ -5129,12 +5599,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
 // Initializes event listeners for streaming test results in string form.
 // Must not be called before InitGoogleTest.
 void UnitTestImpl::ConfigureStreamingOutput() {
-  const std::string &target = GTEST_FLAG(stream_result_to);
+  const std::string& target = GTEST_FLAG(stream_result_to);
   if (!target.empty()) {
     const size_t pos = target.find(':');
     if (pos != std::string::npos) {
-      listeners()->Append(
-          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
+      listeners()->Append(new StreamingListener(target.substr(0, pos),
+                                                target.substr(pos+1)));
     } else {
       GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
                           << "\" ignored.";
@@ -5172,6 +5642,10 @@ void UnitTestImpl::PostFlagParsingInit() {
     // to shut down the default XML output before invoking RUN_ALL_TESTS.
     ConfigureXmlOutput();
 
+    if (GTEST_FLAG(brief)) {
+      listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
+    }
+
 #if GTEST_CAN_STREAM_RESULTS_
     // Configures listeners for streaming test results to the specified server.
     ConfigureStreamingOutput();
@@ -5197,10 +5671,10 @@ void UnitTestImpl::PostFlagParsingInit() {
 class TestSuiteNameIs {
  public:
   // Constructor.
-  explicit TestSuiteNameIs(const std::string &name) : name_(name) {}
+  explicit TestSuiteNameIs(const std::string& name) : name_(name) {}
 
   // Returns true if and only if the name of test_suite matches name_.
-  bool operator()(const TestSuite *test_suite) const {
+  bool operator()(const TestSuite* test_suite) const {
     return test_suite != nullptr &&
            strcmp(test_suite->name(), name_.c_str()) == 0;
   }
@@ -5217,12 +5691,12 @@ class TestSuiteNameIs {
 // Arguments:
 //
 //   test_suite_name: name of the test suite
-//   type_param:     the name of the test suite's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test suite.
-//   set_up_tc:      pointer to the function that sets up the test suite
-//   tear_down_tc:   pointer to the function that tears down the test suite
-TestSuite *UnitTestImpl::GetTestSuite(
-    const char *test_suite_name, const char *type_param,
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
+TestSuite* UnitTestImpl::GetTestSuite(
+    const char* test_suite_name, const char* type_param,
     internal::SetUpTestSuiteFunc set_up_tc,
     internal::TearDownTestSuiteFunc tear_down_tc) {
   // Can we find a TestSuite with the given name?
@@ -5233,7 +5707,7 @@ TestSuite *UnitTestImpl::GetTestSuite(
   if (test_suite != test_suites_.rend()) return *test_suite;
 
   // No.  Let's create one.
-  auto *const new_test_suite =
+  auto* const new_test_suite =
       new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
 
   // Is this a death test suite?
@@ -5257,8 +5731,8 @@ TestSuite *UnitTestImpl::GetTestSuite(
 
 // Helpers for setting up / tearing down the given environment.  They
 // are for use in the ForEach() function.
-static void SetUpEnvironment(Environment *env) { env->SetUp(); }
-static void TearDownEnvironment(Environment *env) { env->TearDown(); }
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
 
 // Runs all tests in this UnitTest object, prints the result, and
 // returns true if all tests are successful.  If any exception is
@@ -5275,7 +5749,8 @@ bool UnitTestImpl::RunAllTests() {
   const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
 
   // Do not run any test if the --help flag was specified.
-  if (g_help_flag) return true;
+  if (g_help_flag)
+    return true;
 
   // Repeats the call to the post-flag parsing initialization in case the
   // user didn't call InitGoogleTest.
@@ -5293,11 +5768,11 @@ bool UnitTestImpl::RunAllTests() {
 #if GTEST_HAS_DEATH_TEST
   in_subprocess_for_death_test =
       (internal_run_death_test_flag_.get() != nullptr);
-#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
   if (in_subprocess_for_death_test) {
     GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
   }
-#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+# endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
 #endif  // GTEST_HAS_DEATH_TEST
 
   const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -5305,9 +5780,9 @@ bool UnitTestImpl::RunAllTests() {
 
   // Compares the full test names with the filter to decide which
   // tests to run.
-  const bool has_tests_to_run =
-      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
-                               : IGNORE_SHARDING_PROTOCOL) > 0;
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
 
   // Lists the tests and exits if the --gtest_list_tests flag was specified.
   if (GTEST_FLAG(list_tests)) {
@@ -5316,13 +5791,13 @@ bool UnitTestImpl::RunAllTests() {
     return true;
   }
 
-  random_seed_ =
-      GTEST_FLAG(shuffle) ? GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
 
   // True if and only if at least one test has failed.
   bool failed = false;
 
-  TestEventListener *repeater = listeners()->repeater();
+  TestEventListener* repeater = listeners()->repeater();
 
   start_timestamp_ = GetTimeInMillis();
   repeater->OnTestProgramStart(*parent_);
@@ -5337,7 +5812,7 @@ bool UnitTestImpl::RunAllTests() {
     // assertions executed before RUN_ALL_TESTS().
     ClearNonAdHocTestResult();
 
-    const TimeInMillis start = GetTimeInMillis();
+    Timer timer;
 
     // Shuffles test suites and tests if requested.
     if (has_tests_to_run && GTEST_FLAG(shuffle)) {
@@ -5363,13 +5838,13 @@ bool UnitTestImpl::RunAllTests() {
       if (Test::IsSkipped()) {
         // Emit diagnostics when global set-up calls skip, as it will not be
         // emitted by default.
-        TestResult &test_result =
+        TestResult& test_result =
             *internal::GetUnitTestImpl()->current_test_result();
         for (int j = 0; j < test_result.total_part_count(); ++j) {
-          const TestPartResult &test_part_result =
+          const TestPartResult& test_part_result =
               test_result.GetTestPartResult(j);
           if (test_part_result.type() == TestPartResult::kSkip) {
-            const std::string &result = test_part_result.message();
+            const std::string& result = test_part_result.message();
             printf("%s\n", result.c_str());
           }
         }
@@ -5378,6 +5853,21 @@ bool UnitTestImpl::RunAllTests() {
         for (int test_index = 0; test_index < total_test_suite_count();
              test_index++) {
           GetMutableSuiteCase(test_index)->Run();
+          if (GTEST_FLAG(fail_fast) &&
+              GetMutableSuiteCase(test_index)->Failed()) {
+            for (int j = test_index + 1; j < total_test_suite_count(); j++) {
+              GetMutableSuiteCase(j)->Skip();
+            }
+            break;
+          }
+        }
+      } else if (Test::HasFatalFailure()) {
+        // If there was a fatal failure during the global setup then we know we
+        // aren't going to run any tests. Explicitly mark all of the tests as
+        // skipped to make this obvious in the output.
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Skip();
         }
       }
 
@@ -5388,7 +5878,7 @@ bool UnitTestImpl::RunAllTests() {
       repeater->OnEnvironmentsTearDownEnd(*parent_);
     }
 
-    elapsed_time_ = GetTimeInMillis() - start;
+    elapsed_time_ = timer.Elapsed();
 
     // Tells the unit test event listener that the tests have just finished.
     repeater->OnTestIterationEnd(*parent_, i);
@@ -5416,14 +5906,14 @@ bool UnitTestImpl::RunAllTests() {
 
   if (!gtest_is_initialized_before_run_all_tests) {
     ColoredPrintf(
-        COLOR_RED,
+        GTestColor::kRed,
         "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
         "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
         "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
         " will start to enforce the valid usage. "
         "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
 #if GTEST_FOR_GOOGLE_
-    ColoredPrintf(COLOR_RED,
+    ColoredPrintf(GTestColor::kRed,
                   "For more details, see http://wiki/Main/ValidGUnitMain.\n");
 #endif  // GTEST_FOR_GOOGLE_
   }
@@ -5436,11 +5926,11 @@ bool UnitTestImpl::RunAllTests() {
 // function will write over it. If the variable is present, but the file cannot
 // be created, prints an error and exits.
 void WriteToShardStatusFileIfNeeded() {
-  const char *const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
   if (test_shard_file != nullptr) {
-    FILE *const file = posix::FOpen(test_shard_file, "w");
+    FILE* const file = posix::FOpen(test_shard_file, "w");
     if (file == nullptr) {
-      ColoredPrintf(COLOR_RED,
+      ColoredPrintf(GTestColor::kRed,
                     "Could not write to the test shard status file \"%s\" "
                     "specified by the %s environment variable.\n",
                     test_shard_file, kTestShardStatusFile);
@@ -5457,7 +5947,8 @@ void WriteToShardStatusFileIfNeeded() {
 // an error and exits. If in_subprocess_for_death_test, sharding is
 // disabled because it must only be applied to the original test
 // process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
                  bool in_subprocess_for_death_test) {
   if (in_subprocess_for_death_test) {
     return false;
@@ -5469,28 +5960,28 @@ bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
   if (total_shards == -1 && shard_index == -1) {
     return false;
   } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message() << "Invalid environment variables: you have "
-                                  << kTestShardIndex << " = " << shard_index
-                                  << ", but have left " << kTestTotalShards
-                                  << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (total_shards != -1 && shard_index == -1) {
     const Message msg = Message()
-                        << "Invalid environment variables: you have "
-                        << kTestTotalShards << " = " << total_shards
-                        << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg =
-        Message() << "Invalid environment variables: we require 0 <= "
-                  << kTestShardIndex << " < " << kTestTotalShards
-                  << ", but you have " << kTestShardIndex << "=" << shard_index
-                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   }
@@ -5501,8 +5992,8 @@ bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
 // Parses the environment variable var as an Int32. If it is unset,
 // returns default_val. If it is not an Int32, prints an error
 // and aborts.
-int32_t Int32FromEnvOrDie(const char *var, int32_t default_val) {
-  const char *str_val = posix::GetEnv(var);
+int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) {
+  const char* str_val = posix::GetEnv(var);
   if (str_val == nullptr) {
     return default_val;
   }
@@ -5531,12 +6022,10 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
 // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
 // . Returns the number of tests that should run.
 int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
-                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
-                                   : -1;
-  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
-                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
-                                  : -1;
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
 
   // num_runnable_tests are the number of tests that will
   // run across all shards (i.e., match filter and are not disabled).
@@ -5544,12 +6033,12 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
   // this shard.
   int num_runnable_tests = 0;
   int num_selected_tests = 0;
-  for (auto *test_suite : test_suites_) {
-    const std::string &test_suite_name = test_suite->name();
+  for (auto* test_suite : test_suites_) {
+    const std::string& test_suite_name = test_suite->name();
     test_suite->set_should_run(false);
 
     for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
-      TestInfo *const test_info = test_suite->test_info_list()[j];
+      TestInfo* const test_info = test_suite->test_info_list()[j];
       const std::string test_name(test_info->name());
       // A test is disabled if test suite name or test name matches
       // kDisableTestFilter.
@@ -5587,7 +6076,7 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
 // characters with string "\\n".  If the output takes more than
 // max_length characters, only prints the first max_length characters
 // and "...".
-static void PrintOnOneLine(const char *str, int max_length) {
+static void PrintOnOneLine(const char* str, int max_length) {
   if (str != nullptr) {
     for (int i = 0; *str != '\0'; ++str) {
       if (i >= max_length) {
@@ -5610,11 +6099,11 @@ void UnitTestImpl::ListTestsMatchingFilter() {
   // Print at most this many characters for each type/value parameter.
   const int kMaxParamLength = 250;
 
-  for (auto *test_suite : test_suites_) {
+  for (auto* test_suite : test_suites_) {
     bool printed_test_suite_name = false;
 
     for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
-      const TestInfo *const test_info = test_suite->test_info_list()[j];
+      const TestInfo* const test_info = test_suite->test_info_list()[j];
       if (test_info->matches_filter_) {
         if (!printed_test_suite_name) {
           printed_test_suite_name = true;
@@ -5639,9 +6128,9 @@ void UnitTestImpl::ListTestsMatchingFilter() {
     }
   }
   fflush(stdout);
-  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
   if (output_format == "xml" || output_format == "json") {
-    FILE *fileout = OpenFileForWriting(
+    FILE* fileout = OpenFileForWriting(
         UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
     std::stringstream stream;
     if (output_format == "xml") {
@@ -5664,7 +6153,7 @@ void UnitTestImpl::ListTestsMatchingFilter() {
 // the same; otherwise, deletes the old getter and makes the input the
 // current getter.
 void UnitTestImpl::set_os_stack_trace_getter(
-    OsStackTraceGetterInterface *getter) {
+    OsStackTraceGetterInterface* getter) {
   if (os_stack_trace_getter_ != getter) {
     delete os_stack_trace_getter_;
     os_stack_trace_getter_ = getter;
@@ -5674,7 +6163,7 @@ void UnitTestImpl::set_os_stack_trace_getter(
 // Returns the current OS stack trace getter if it is not NULL;
 // otherwise, creates an OsStackTraceGetter, makes it the current
 // getter, and returns it.
-OsStackTraceGetterInterface *UnitTestImpl::os_stack_trace_getter() {
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
   if (os_stack_trace_getter_ == nullptr) {
 #ifdef GTEST_OS_STACK_TRACE_GETTER_
     os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
@@ -5687,7 +6176,7 @@ OsStackTraceGetterInterface *UnitTestImpl::os_stack_trace_getter() {
 }
 
 // Returns the most specific TestResult currently running.
-TestResult *UnitTestImpl::current_test_result() {
+TestResult* UnitTestImpl::current_test_result() {
   if (current_test_info_ != nullptr) {
     return &current_test_info_->result_;
   }
@@ -5708,7 +6197,7 @@ void UnitTestImpl::ShuffleTests() {
                static_cast<int>(test_suites_.size()), &test_suite_indices_);
 
   // Shuffles the tests inside each test suite.
-  for (auto &test_suite : test_suites_) {
+  for (auto& test_suite : test_suites_) {
     test_suite->ShuffleTests(random());
   }
 }
@@ -5733,7 +6222,7 @@ void UnitTestImpl::UnshuffleTests() {
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest * /*unit_test*/,
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
                                             int skip_count) {
   // We pass skip_count + 1 to skip this wrapper function in addition
   // to what the user really wants to skip.
@@ -5744,7 +6233,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest * /*unit_test*/,
 // suppress unreachable code warnings.
 namespace {
 class ClassUniqueToAlwaysTrue {};
-}  // namespace
+}
 
 bool IsTrue(bool condition) { return condition; }
 
@@ -5752,7 +6241,8 @@ bool AlwaysTrue() {
 #if GTEST_HAS_EXCEPTIONS
   // This condition is always false so AlwaysTrue() never actually throws,
   // but it makes the compiler think that it may throw.
-  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
 #endif  // GTEST_HAS_EXCEPTIONS
   return true;
 }
@@ -5760,7 +6250,7 @@ bool AlwaysTrue() {
 // If *pstr starts with the given prefix, modifies *pstr to be right
 // past the prefix and returns true; otherwise leaves *pstr unchanged
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
-bool SkipPrefix(const char *prefix, const char **pstr) {
+bool SkipPrefix(const char* prefix, const char** pstr) {
   const size_t prefix_len = strlen(prefix);
   if (strncmp(*pstr, prefix, prefix_len) == 0) {
     *pstr += prefix_len;
@@ -5774,7 +6264,7 @@ bool SkipPrefix(const char *prefix, const char **pstr) {
 // part can be omitted.
 //
 // Returns the value of the flag, or NULL if the parsing failed.
-static const char *ParseFlagValue(const char *str, const char *flag,
+static const char* ParseFlagValue(const char* str, const char* flag,
                                   bool def_optional) {
   // str and flag must not be NULL.
   if (str == nullptr || flag == nullptr) return nullptr;
@@ -5785,7 +6275,7 @@ static const char *ParseFlagValue(const char *str, const char *flag,
   if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
 
   // Skips the flag name.
-  const char *flag_end = str + flag_len;
+  const char* flag_end = str + flag_len;
 
   // When def_optional is true, it's OK to not have a "=value" part.
   if (def_optional && (flag_end[0] == '\0')) {
@@ -5811,9 +6301,9 @@ static const char *ParseFlagValue(const char *str, const char *flag,
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char *str, const char *flag, bool *value) {
+static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   // Gets the value of the flag as a string.
-  const char *const value_str = ParseFlagValue(str, flag, true);
+  const char* const value_str = ParseFlagValue(str, flag, true);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
@@ -5827,16 +6317,16 @@ static bool ParseBoolFlag(const char *str, const char *flag, bool *value) {
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char *str, const char *flag, int32_t *value) {
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
   // Gets the value of the flag as a string.
-  const char *const value_str = ParseFlagValue(str, flag, false);
+  const char* const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
 
   // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag, value_str,
-                    value);
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
 }
 
 // Parses a string for a string flag, in the form of "--flag=value".
@@ -5844,9 +6334,9 @@ bool ParseInt32Flag(const char *str, const char *flag, int32_t *value) {
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 template <typename String>
-static bool ParseStringFlag(const char *str, const char *flag, String *value) {
+static bool ParseStringFlag(const char* str, const char* flag, String* value) {
   // Gets the value of the flag as a string.
-  const char *const value_str = ParseFlagValue(str, flag, false);
+  const char* const value_str = ParseFlagValue(str, flag, false);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
@@ -5862,8 +6352,9 @@ static bool ParseStringFlag(const char *str, const char *flag, String *value) {
 // recognized, it will print its help message. Flags starting with
 // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
 // internal flags and do not trigger the help message.
-static bool HasGoogleTestFlagPrefix(const char *str) {
-  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) ||
+          SkipPrefix("-", &str) ||
           SkipPrefix("/", &str)) &&
          !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
          (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -5879,15 +6370,15 @@ static bool HasGoogleTestFlagPrefix(const char *str) {
 //   @Y    changes the color to yellow.
 //   @D    changes to the default terminal text color.
 //
-static void PrintColorEncoded(const char *str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = GTestColor::kDefault;  // The current color.
 
   // Conceptually, we split the string into segments divided by escape
   // sequences.  Then we print one segment at a time.  At the end of
   // each iteration, the str pointer advances to the beginning of the
   // next segment.
   for (;;) {
-    const char *p = strchr(str, '@');
+    const char* p = strchr(str, '@');
     if (p == nullptr) {
       ColoredPrintf(color, "%s", str);
       return;
@@ -5900,13 +6391,13 @@ static void PrintColorEncoded(const char *str) {
     if (ch == '@') {
       ColoredPrintf(color, "@");
     } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
+      color = GTestColor::kDefault;
     } else if (ch == 'R') {
-      color = COLOR_RED;
+      color = GTestColor::kRed;
     } else if (ch == 'G') {
-      color = COLOR_GREEN;
+      color = GTestColor::kGreen;
     } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
+      color = GTestColor::kYellow;
     } else {
       --str;
     }
@@ -5924,7 +6415,7 @@ static const char kColorEncodedHelpMessage[] =
     "      List the names of all tests instead of running them. The name of\n"
     "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
     "  @G--" GTEST_FLAG_PREFIX_
-    "filter=@YPOSTIVE_PATTERNS"
+    "filter=@YPOSITIVE_PATTERNS"
     "[@G-@YNEGATIVE_PATTERNS]@D\n"
     "      Run only the tests whose name matches one of the positive patterns "
     "but\n"
@@ -5951,7 +6442,10 @@ static const char kColorEncodedHelpMessage[] =
     "  @G--" GTEST_FLAG_PREFIX_
     "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
     "      Enable/disable colored output. The default is @Gauto@D.\n"
-    "  -@G-" GTEST_FLAG_PREFIX_
+    "  @G--" GTEST_FLAG_PREFIX_
+    "brief=1@D\n"
+    "      Only print test failures.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
     "print_time=0@D\n"
     "      Don't print the elapsed time of each test.\n"
     "  @G--" GTEST_FLAG_PREFIX_
@@ -5960,18 +6454,18 @@ static const char kColorEncodedHelpMessage[] =
     "      Generate a JSON or XML report in the given directory or with the "
     "given\n"
     "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-#if GTEST_CAN_STREAM_RESULTS_
+# if GTEST_CAN_STREAM_RESULTS_
     "  @G--" GTEST_FLAG_PREFIX_
     "stream_result_to=@YHOST@G:@YPORT@D\n"
     "      Stream test results to the given server.\n"
-#endif  // GTEST_CAN_STREAM_RESULTS_
+# endif  // GTEST_CAN_STREAM_RESULTS_
     "\n"
     "Assertion Behavior:\n"
-#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
     "  @G--" GTEST_FLAG_PREFIX_
     "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
     "      Set the default death test style.\n"
-#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+# endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
     "  @G--" GTEST_FLAG_PREFIX_
     "break_on_failure@D\n"
     "      Turn assertion failures into debugger break-points.\n"
@@ -6002,7 +6496,7 @@ static const char kColorEncodedHelpMessage[] =
     "(not one in your own code or tests), please report it to\n"
     "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
 
-static bool ParseGoogleTestFlag(const char *const arg) {
+static bool ParseGoogleTestFlag(const char* const arg) {
   return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
                        &GTEST_FLAG(also_run_disabled_tests)) ||
          ParseBoolFlag(arg, kBreakOnFailureFlag,
@@ -6014,11 +6508,13 @@ static bool ParseGoogleTestFlag(const char *const arg) {
                          &GTEST_FLAG(death_test_style)) ||
          ParseBoolFlag(arg, kDeathTestUseFork,
                        &GTEST_FLAG(death_test_use_fork)) ||
+         ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) ||
          ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
          ParseStringFlag(arg, kInternalRunDeathTestFlag,
                          &GTEST_FLAG(internal_run_death_test)) ||
          ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
          ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+         ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) ||
          ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
          ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
          ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
@@ -6032,8 +6528,8 @@ static bool ParseGoogleTestFlag(const char *const arg) {
 }
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
-static void LoadFlagsFromFile(const std::string &path) {
-  FILE *flagfile = posix::FOpen(path.c_str(), "r");
+static void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
   if (!flagfile) {
     GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
                       << "\"";
@@ -6043,8 +6539,10 @@ static void LoadFlagsFromFile(const std::string &path) {
   std::vector<std::string> lines;
   SplitString(contents, '\n', &lines);
   for (size_t i = 0; i < lines.size(); ++i) {
-    if (lines[i].empty()) continue;
-    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
+    if (lines[i].empty())
+      continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str()))
+      g_help_flag = true;
   }
 }
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -6053,10 +6551,10 @@ static void LoadFlagsFromFile(const std::string &path) {
 // other parts of Google Test.  The type parameter CharType can be
 // instantiated to either char or wchar_t.
 template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int *argc, CharType **argv) {
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
   for (int i = 1; i < *argc; i++) {
     const std::string arg_string = StreamableToString(argv[i]);
-    const char *const arg = arg_string.c_str();
+    const char* const arg = arg_string.c_str();
 
     using internal::ParseBoolFlag;
     using internal::ParseInt32Flag;
@@ -6106,7 +6604,7 @@ void ParseGoogleTestFlagsOnlyImpl(int *argc, CharType **argv) {
 
 // Parses the command line for Google Test flags, without initializing
 // other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int *argc, char **argv) {
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
 
   // Fix the value of *_NSGetArgc() on macOS, but if and only if
@@ -6120,7 +6618,7 @@ void ParseGoogleTestFlagsOnly(int *argc, char **argv) {
 #endif
 #endif
 }
-void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv) {
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
 }
 
@@ -6129,7 +6627,7 @@ void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv) {
 // The type parameter CharType can be instantiated to either char or
 // wchar_t.
 template <typename CharType>
-void InitGoogleTestImpl(int *argc, CharType **argv) {
+void InitGoogleTestImpl(int* argc, CharType** argv) {
   // We don't want to run the initialization code twice.
   if (GTestIsInitialized()) return;
 
@@ -6159,20 +6657,20 @@ void InitGoogleTestImpl(int *argc, CharType **argv) {
 // updated.
 //
 // Calling the function for the second time has no user-visible effect.
-void InitGoogleTest(int *argc, char **argv) {
+void InitGoogleTest(int* argc, char** argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
 
 // This overloaded version can be used in Windows programs compiled in
 // UNICODE mode.
-void InitGoogleTest(int *argc, wchar_t **argv) {
+void InitGoogleTest(int* argc, wchar_t** argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
@@ -6183,12 +6681,12 @@ void InitGoogleTest() {
   // Since Arduino doesn't have a command line, fake out the argc/argv arguments
   int argc = 1;
   const auto arg0 = "dummy";
-  char *argv0 = const_cast<char *>(arg0);
-  char **argv = &argv0;
+  char* argv0 = const_cast<char*>(arg0);
+  char** argv = &argv0;
 
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(&argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
@@ -6196,24 +6694,31 @@ void InitGoogleTest() {
 std::string TempDir() {
 #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
   return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
+#elif GTEST_OS_WINDOWS_MOBILE
   return "\\temp\\";
 #elif GTEST_OS_WINDOWS
-  const char *temp_dir = internal::posix::GetEnv("TEMP");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
+  const char* temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
     return "\\temp\\";
-  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+  } else if (temp_dir[strlen(temp_dir) - 1] == '\\') {
     return temp_dir;
-  else
+  } else {
     return std::string(temp_dir) + "\\";
+  }
 #elif GTEST_OS_LINUX_ANDROID
-  const char *temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
+  const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
     return "/data/local/tmp/";
-  else
+  } else {
     return temp_dir;
+  }
+#elif GTEST_OS_LINUX
+  const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
+    return "/tmp/";
+  } else {
+    return temp_dir;
+  }
 #else
   return "/tmp/";
 #endif  // GTEST_OS_WINDOWS_MOBILE
@@ -6223,7 +6728,7 @@ std::string TempDir() {
 
 // Pushes the given source file location and message onto a per-thread
 // trace stack maintained by Google Test.
-void ScopedTrace::PushTrace(const char *file, int line, std::string message) {
+void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
   internal::TraceInfo trace;
   trace.file = file;
   trace.line = line;
@@ -6233,7 +6738,8 @@ void ScopedTrace::PushTrace(const char *file, int line, std::string message) {
 }
 
 // Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
   UnitTest::GetInstance()->PopGTestTrace();
 }
 
diff --git a/media/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc b/media/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
index 77c90ce61a..46b27c3d7d 100644
--- a/media/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
+++ b/media/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -34,7 +34,9 @@
 #if GTEST_OS_ESP8266
 extern "C" {
 #endif
-void setup() { testing::InitGoogleTest(); }
+void setup() {
+  testing::InitGoogleTest();
+}
 
 void loop() { RUN_ALL_TESTS(); }
 
diff --git a/media/libaom/src/third_party/libwebm/Android.mk b/media/libaom/src/third_party/libwebm/Android.mk
index b46ba101d4..1185198a84 100644
--- a/media/libaom/src/third_party/libwebm/Android.mk
+++ b/media/libaom/src/third_party/libwebm/Android.mk
@@ -14,4 +14,7 @@ LOCAL_SRC_FILES:= common/file_util.cc \
                   mkvmuxer/mkvmuxer.cc \
                   mkvmuxer/mkvmuxerutil.cc \
                   mkvmuxer/mkvwriter.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT
 include $(BUILD_STATIC_LIBRARY)
diff --git a/media/libaom/src/third_party/libwebm/README.libaom b/media/libaom/src/third_party/libwebm/README.libaom
index 1e87afd3d1..325604cc66 100644
--- a/media/libaom/src/third_party/libwebm/README.libaom
+++ b/media/libaom/src/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
+Version: ee0bab576c338c9807249b99588e352b7268cb62
 License: BSD
 License File: LICENSE.txt
 
diff --git a/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 5120312119..ae36531439 100644
--- a/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -774,7 +774,7 @@ bool Track::Write(IMkvWriter* writer) const {
     return false;
 
   // AV1 tracks require a CodecPrivate. See
-  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md
   // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
   // point to a stable version once it is finalized, or our own WebM mappings
   // page on webmproject.org should we decide to release them.
@@ -3084,6 +3084,7 @@ Segment::Segment()
       accurate_cluster_duration_(false),
       fixed_size_cluster_timecode_(false),
       estimate_file_duration_(false),
+      ebml_header_size_(0),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -4105,12 +4106,16 @@ int Segment::WriteFramesAll() {
     // places where |doc_type_version_| needs to be updated.
     if (frame->discard_padding() != 0)
       doc_type_version_ = 4;
-    if (!cluster->AddFrame(frame))
-      return -1;
+    if (!cluster->AddFrame(frame)) {
+      delete frame;
+      continue;
+    }
 
     if (new_cuepoint_ && cues_track_ == frame->track_number()) {
-      if (!AddCuePoint(frame->timestamp(), cues_track_))
-        return -1;
+      if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+        delete frame;
+        continue;
+      }
     }
 
     if (frame->timestamp() > last_timestamp_) {
@@ -4153,12 +4158,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
       const Frame* const frame_prev = frames_[i - 1];
       if (frame_prev->discard_padding() != 0)
         doc_type_version_ = 4;
-      if (!cluster->AddFrame(frame_prev))
-        return false;
+      if (!cluster->AddFrame(frame_prev)) {
+        delete frame_prev;
+        continue;
+      }
 
       if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
-        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
-          return false;
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) {
+          delete frame_prev;
+          continue;
+        }
       }
 
       ++shift_left;
diff --git a/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 6436817c9b..bd2f769138 100644
--- a/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/media/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -606,8 +606,8 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
 
 void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
   *major = 0;
-  *minor = 2;
-  *build = 1;
+  *minor = 3;
+  *build = 0;
   *revision = 0;
 }
 
diff --git a/media/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc b/media/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
index ace65bd595..de8884b381 100644
--- a/media/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/media/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
@@ -54,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements,
 
 void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
-  minor = 0;
+  minor = 1;
   build = 0;
-  revision = 30;
+  revision = 0;
 }
 
 long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
@@ -1502,8 +1502,8 @@ long SeekHead::Parse() {
 
   // first count the seek head entries
 
-  int entry_count = 0;
-  int void_element_count = 0;
+  long long entry_count = 0;
+  long long void_element_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -1513,10 +1513,15 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek)
+    if (id == libwebm::kMkvSeek) {
       ++entry_count;
-    else if (id == libwebm::kMkvVoid)
+      if (entry_count > INT_MAX)
+        return E_PARSE_FAILED;
+    } else if (id == libwebm::kMkvVoid) {
       ++void_element_count;
+      if (void_element_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
 
@@ -1528,14 +1533,15 @@ long SeekHead::Parse() {
     return E_FILE_FORMAT_INVALID;
 
   if (entry_count > 0) {
-    m_entries = new (std::nothrow) Entry[entry_count];
+    m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)];
 
     if (m_entries == NULL)
       return -1;
   }
 
   if (void_element_count > 0) {
-    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+    m_void_elements =
+        new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)];
 
     if (m_void_elements == NULL)
       return -1;
@@ -1582,13 +1588,13 @@ long SeekHead::Parse() {
 
   ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
   assert(count_ >= 0);
-  assert(count_ <= entry_count);
+  assert(static_cast<long long>(count_) <= entry_count);
 
   m_entry_count = static_cast<int>(count_);
 
   count_ = ptrdiff_t(pVoidElement - m_void_elements);
   assert(count_ >= 0);
-  assert(count_ <= void_element_count);
+  assert(static_cast<long long>(count_) <= void_element_count);
 
   m_void_element_count = static_cast<int>(count_);
 
@@ -2299,7 +2305,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
   long long pos = pos_;
 
   // First count number of track positions
-
+  unsigned long long track_positions_count = 0;
   while (pos < stop) {
     long len;
 
@@ -2323,12 +2329,17 @@ bool CuePoint::Load(IMkvReader* pReader) {
     if (id == libwebm::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == libwebm::kMkvCueTrackPositions)
-      ++m_track_positions_count;
+    else if (id == libwebm::kMkvCueTrackPositions) {
+      ++track_positions_count;
+      if (track_positions_count > UINT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
   }
 
+  m_track_positions_count = static_cast<size_t>(track_positions_count);
+
   if (m_timecode < 0 || m_track_positions_count <= 0) {
     return false;
   }
@@ -4194,8 +4205,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
   const long long stop = start + size;
 
   // Count ContentCompression and ContentEncryption elements.
-  int compression_count = 0;
-  int encryption_count = 0;
+  long long compression_count = 0;
+  long long encryption_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -4203,11 +4214,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvContentCompression)
+    if (id == libwebm::kMkvContentCompression) {
       ++compression_count;
+      if (compression_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
-    if (id == libwebm::kMkvContentEncryption)
+    if (id == libwebm::kMkvContentEncryption) {
       ++encryption_count;
+      if (encryption_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4218,16 +4235,16 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     return -1;
 
   if (compression_count > 0) {
-    compression_entries_ =
-        new (std::nothrow) ContentCompression*[compression_count];
+    compression_entries_ = new (std::nothrow)
+        ContentCompression*[static_cast<size_t>(compression_count)];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
   }
 
   if (encryption_count > 0) {
-    encryption_entries_ =
-        new (std::nothrow) ContentEncryption*[encryption_count];
+    encryption_entries_ = new (std::nothrow)
+        ContentEncryption*[static_cast<size_t>(encryption_count)];
     if (!encryption_entries_) {
       delete[] compression_entries_;
       compression_entries_ = NULL;
@@ -4918,7 +4935,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   const long long stop = start + size;
 
   // Count ContentEncoding elements.
-  int count = 0;
+  long long count = 0;
   while (pos < stop) {
     long long id, size;
     const long status = ParseElementHeader(pReader, pos, stop, id, size);
@@ -4926,8 +4943,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == libwebm::kMkvContentEncoding)
+    if (id == libwebm::kMkvContentEncoding) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4937,7 +4957,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  content_encoding_entries_ =
+      new (std::nothrow) ContentEncoding*[static_cast<size_t>(count)];
   if (!content_encoding_entries_)
     return -1;
 
@@ -5229,6 +5250,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 
       projection_ptr->type = static_cast<ProjectionType>(projection_type);
     } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      if (projection_ptr->private_data != NULL)
+        return false;
       unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
 
       if (data == NULL)
@@ -5286,6 +5309,7 @@ VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
       m_projection(NULL) {}
 
 VideoTrack::~VideoTrack() {
+  delete[] m_colour_space;
   delete m_colour;
   delete m_projection;
 }
@@ -5307,7 +5331,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   long long stereo_mode = 0;
 
   double rate = 0.0;
-  char* colour_space = NULL;
+  std::unique_ptr<char[]> colour_space_ptr;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5384,9 +5408,11 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
         projection_ptr.reset(projection);
       }
     } else if (id == libwebm::kMkvColourSpace) {
+      char* colour_space = NULL;
       const long status = UnserializeString(pReader, pos, size, colour_space);
       if (status < 0)
         return status;
+      colour_space_ptr.reset(colour_space);
     }
 
     pos += size;  // consume payload
@@ -5418,7 +5444,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
   pTrack->m_colour = colour_ptr.release();
-  pTrack->m_colour_space = colour_space;
+  pTrack->m_colour_space = colour_space_ptr.release();
   pTrack->m_projection = projection_ptr.release();
 
   pResult = pTrack;
@@ -5648,7 +5674,7 @@ long Tracks::Parse() {
   const long long stop = m_start + m_size;
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
-  int count = 0;
+  long long count = 0;
   long long pos = m_start;
 
   while (pos < stop) {
@@ -5662,8 +5688,11 @@ long Tracks::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == libwebm::kMkvTrackEntry)
+    if (id == libwebm::kMkvTrackEntry) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -5676,7 +5705,7 @@ long Tracks::Parse() {
   if (count <= 0)
     return 0;  // success
 
-  m_trackEntries = new (std::nothrow) Track*[count];
+  m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(count)];
 
   if (m_trackEntries == NULL)
     return -1;
diff --git a/media/libaom/src/third_party/libyuv/LICENSE b/media/libaom/src/third_party/libyuv/LICENSE
new file mode 100644
index 0000000000..c911747a6b
--- /dev/null
+++ b/media/libaom/src/third_party/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/media/libaom/src/third_party/libyuv/README.libaom b/media/libaom/src/third_party/libyuv/README.libaom
index 09693c1f2c..6e66f858e2 100644
--- a/media/libaom/src/third_party/libyuv/README.libaom
+++ b/media/libaom/src/third_party/libyuv/README.libaom
@@ -1,6 +1,6 @@
 Name: libyuv
-URL: http://code.google.com/p/libyuv/
-Version: 1456
+URL: https://chromium.googlesource.com/libyuv/libyuv/
+Version: dfaf7534e0e536f7e5ef8ddd7326797bd09b8622
 License: BSD
 License File: LICENSE
 
@@ -13,3 +13,25 @@ which down-samples the original input video (f.g. 1280x720) a number of times
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
+
+diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
+index fe89452b7..72a7fb82f 100644
+--- a/third_party/libyuv/source/cpu_id.cc
++++ b/third_party/libyuv/source/cpu_id.cc
+@@ -108,7 +108,7 @@ void CpuId(int eax, int ecx, int* cpu_info) {
+ //  }
+ // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+ // https://code.google.com/p/libyuv/issues/detail?id=529
+-#if defined(_M_IX86) && (_MSC_VER < 1900)
++#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+ #pragma optimize("g", off)
+ #endif
+ #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+@@ -129,7 +129,7 @@ int GetXCR0() {
+ #define GetXCR0() 0
+ #endif  // defined(_M_IX86) || defined(_M_X64) ..
+ // Return optimization to previous setting.
+-#if defined(_M_IX86) && (_MSC_VER < 1900)
++#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+ #pragma optimize("g", on)
+ #endif
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/basic_types.h b/media/libaom/src/third_party/libyuv/include/libyuv/basic_types.h
index 66e68536cb..1bea67f2f2 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/basic_types.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/basic_types.h
@@ -1,90 +1,46 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
 #define INCLUDE_LIBYUV_BASIC_TYPES_H_
 
-#include <stddef.h>  // for NULL, size_t
+#include <stddef.h>  // For size_t and NULL
 
-#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
-#include <sys/types.h>  // for uintptr_t on x86
-#else
-#include <stdint.h>  // for uintptr_t
-#endif
-
-#ifndef GG_LONGLONG
-#ifndef INT_TYPES_DEFINED
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
 #define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef unsigned __int64 uint64;
-typedef __int64 int64;
-#ifndef INT64_C
-#define INT64_C(x) x ## I64
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UI64
-#endif
-#define INT64_F "I64"
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64;  // NOLINT
-typedef long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## L
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UL
-#endif
-#define INT64_F "l"
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64;  // NOLINT
-typedef long long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## LL
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## ULL
-#endif
-#define INT64_F "ll"
-#endif  // __LP64__
-#endif  // COMPILER_MSVC
-typedef unsigned int uint32;
-typedef int int32;
-typedef unsigned short uint16;  // NOLINT
-typedef short int16;  // NOLINT
-typedef unsigned char uint8;
-typedef signed char int8;
-#endif  // INT_TYPES_DEFINED
-#endif  // GG_LONGLONG
 
-// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || \
-    defined(__i386__) || defined(_M_IX86)
-#define CPU_X86 1
-#endif
-// Detect compiler is for ARM.
-#if defined(__arm__) || defined(_M_ARM)
-#define CPU_ARM 1
-#endif
-
-#ifndef ALIGNP
-#ifdef __cplusplus
-#define ALIGNP(p, t) \
-    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
-    ((t) - 1)) & ~((t) - 1))))
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#include <sys/types.h>  // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
 #else
-#define ALIGNP(p, t) \
-    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
-#endif
-#endif
+#include <stdint.h>  // for uintptr_t and C99 types
+#endif               // defined(_MSC_VER) && (_MSC_VER < 1600)
+// Types are deprecated.  Enable this macro for legacy types.
+#ifdef LIBYUV_LEGACY_TYPES
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
+#endif  // LIBYUV_LEGACY_TYPES
+#endif  // INT_TYPES_DEFINED
 
 #if !defined(LIBYUV_API)
 #if defined(_WIN32) || defined(__CYGWIN__)
@@ -96,24 +52,17 @@ typedef signed char int8;
 #define LIBYUV_API
 #endif  // LIBYUV_BUILDING_SHARED_LIBRARY
 #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
-    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
-    defined(LIBYUV_USING_SHARED_LIBRARY))
-#define LIBYUV_API __attribute__ ((visibility ("default")))
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) ||                      \
+     defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
 #else
 #define LIBYUV_API
 #endif  // __GNUC__
 #endif  // LIBYUV_API
 
+// TODO(fbarchard): Remove bool macros.
 #define LIBYUV_BOOL int
 #define LIBYUV_FALSE 0
 #define LIBYUV_TRUE 1
 
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/compare.h b/media/libaom/src/third_party/libyuv/include/libyuv/compare.h
index 2a9f1560ce..3353ad71c6 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/compare.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/compare.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
 #define INCLUDE_LIBYUV_COMPARE_H_
 
 #include "libyuv/basic_types.h"
@@ -21,59 +20,92 @@ extern "C" {
 
 // Compute a hash for specified memory. Seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
+
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count);
 
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height);
 
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
-                             const uint8* src_b, int count);
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height);
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height);
 
 static const int kMaxPsnr = 128;
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_COMPARE_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/convert.h b/media/libaom/src/third_party/libyuv/include/libyuv/convert.h
index d6f206c10f..026b153cef 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/convert.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/convert.h
@@ -1,22 +1,24 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
 #define INCLUDE_LIBYUV_CONVERT_H_
 
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes.
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
+
+#include "libyuv/rotate.h"  // For enum RotationMode.
+
+// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
+#include "libyuv/convert_argb.h"      // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h"      // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h"  // For WebRTC I420Rect, CopyPlane. b/618
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -25,185 +27,456 @@ extern "C" {
 
 // Convert I444 to I420.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert I422 to I420.
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
-// Convert I411 to I420.
+// Convert I422 to NV21.
 LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Copy I420 to I420.
 #define I420ToI420 I420Copy
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert I400 (grey) to I420.
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 #define J400ToJ420 I400ToI420
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert NV21 to I420.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
-// Convert M420 to I420.
+// Convert AYUV to NV21.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height);
 
 // ARGB little endian (bgra in memory) to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // BGRA little endian (argb in memory) to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGB little endian (bgr in memory) to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height);
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
+
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height);
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
+
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_yj,
+                int dst_stride_yj,
+                int width,
+                int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_yj,
+              int dst_stride_yj,
+              int width,
+              int height);
 
-#ifdef HAVE_JPEG
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8* sample, size_t sample_size,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
+int MJPGToI420(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height);
-#endif
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height);
 
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
@@ -225,22 +498,29 @@ int MJPGSize(const uint8* sample, size_t sample_size,
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToI420(const uint8* src_frame, size_t src_size,
-                  uint8* dst_y, int dst_stride_y,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h b/media/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h
index ea75c0b26a..715a3dad97 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h
@@ -1,200 +1,1574 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_
 
 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
 
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
+#include "libyuv/rotate.h"  // For enum RotationMode.
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants;  // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants;  // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants;  // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants;  // BT.2020
+
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants;  // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants;  // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants;  // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants;  // BT.2020
+
+// Macros for end swapped destination Matrix conversions.
+// Swap UV and pass mirrored kYvuJPEGConstants matrix.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+  NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+  NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+  NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+  NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+  I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+
 // Alias.
 #define ARGBToARGB ARGBCopy
 
 // Copy ARGB to ARGB.
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I444 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
-// Convert I411 to ARGB.
+// Convert H010 to ABGR.
 LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
 
 // Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Alias.
 #define YToARGB I400ToARGB
 
 // Convert NV12 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert NV21 to ARGB.
 LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert NV12 to ABGR.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
-// Convert M420 to ARGB.
+// Convert NV21 to RGB24.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_vu,
+              int src_stride_vu,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
-// Convert J420 to ARGB.
+// Convert I010 to AR30.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
 
-// Convert J422 to ARGB.
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert U210 to AR30.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int U210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
 
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Deprecated function name.
 #define BG24ToARGB RGB24ToARGB
 
 // RGB little endian (bgr in memory) to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_rgba,
+              int dst_stride_rgba,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
+
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
 
-#ifdef HAVE_JPEG
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
-#endif
+int MJPGToARGB(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_vu,
+                     int src_stride_vu,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_uv,
+                       int src_stride_uv,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_vu,
+                      int src_stride_vu,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
+
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height);
 
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
+// "sample_size" is needed to parse MJPG.
 // "dst_stride_argb" number of bytes in a row of the dst_argb plane.
 //   Normally this would be the same as dst_width, with recommended alignment
 //   to 16 bytes for better efficiency.
@@ -213,20 +1587,25 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToARGB(const uint8* src_frame, size_t src_size,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/convert_from.h b/media/libaom/src/third_party/libyuv/include/libyuv/convert_from.h
index 3591b4fd6a..5140ed4f3e 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/convert_from.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/convert_from.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_H_
 
 #include "libyuv/basic_types.h"
@@ -22,161 +21,165 @@ extern "C" {
 
 // See Also convert.h for conversions from formats to I420.
 
-// I420Copy in convert to I420ToI420.
-
-LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height);
-
-// TODO(fbarchard): I420ToM420
-
-LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
-
-LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_frame, int dst_stride_frame,
-              int width, int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_frame, int dst_stride_frame,
-                 int width, int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_frame, int dst_stride_frame,
-                       const uint8* dither4x4, int width, int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height);
+
+LIBYUV_API
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
+
+// The following are from convert_argb.h
+// DEPRECATED: The prototypes will be removed in future.  Use convert_argb.h
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 format);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h b/media/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h
index 4a62268138..d992363ceb 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -22,170 +21,291 @@ extern "C" {
 // Copy ARGB to ARGB.
 #define ARGBToARGB ARGBCopy
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert ARGB To BGRA.
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
+int ARGBToBGRA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 // Convert ARGB To ABGR.
 LIBYUV_API
-int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int ARGBToABGR(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert ARGB To RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height);
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_rgb, int dst_stride_rgb,
-              int width, int height);
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 // Convert ARGB To RGB565.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 // Values in dither matrix from 0 to 7 recommended.
 // The order of the dither matrix is first byte is upper left.
 // TODO(fbarchard): Consider pointer to 2d array for dither4x4.
-// const uint8(*dither)[4][4];
+// const uint8_t(*dither)[4][4];
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height);
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height);
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height);
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
 
 // Convert ARGB To I444.
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I422.
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I420. (also in convert.h)
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J422.
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
-// Convert ARGB To I411.
+// Convert ARGB to J400. (JPeg full range).
 LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
 
-// Convert ARGB to J400. (JPeg full range).
+// Convert RGBA to J400. (JPeg full range).
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height);
+int RGBAToJ400(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
 LIBYUV_API
-int ARGBToG(const uint8* src_argb, int src_stride_argb,
-            uint8* dst_g, int dst_stride_g,
-            int width, int height);
+int ARGBToG(const uint8_t* src_argb,
+            int src_stride_argb,
+            uint8_t* dst_g,
+            int dst_stride_g,
+            int width,
+            int height);
 
 // Convert ARGB To NV12.
 LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 // Convert ARGB To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
-// Convert ARGB To NV21.
+// Convert ABGR To NV12.
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+// Convert ABGR To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert ARGB To YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height);
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 // Convert ARGB To UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height);
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h b/media/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h
index 870e94e8cd..3e27cc107d 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
 #define INCLUDE_LIBYUV_CPU_ID_H_
 
 #include "libyuv/basic_types.h"
@@ -19,9 +18,8 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// TODO(fbarchard): Consider overlapping bits for different architectures.
 // Internal flag to indicate cpuid requires initialization.
-#define kCpuInit 0x1
+static const int kCpuInitialized = 0x1;
 
 // These flags are only valid on ARM processors.
 static const int kCpuHasARM = 0x2;
@@ -33,50 +31,92 @@ static const int kCpuHasX86 = 0x10;
 static const int kCpuHasSSE2 = 0x20;
 static const int kCpuHasSSSE3 = 0x40;
 static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasSSE42 = 0x100;  // unused at this time.
 static const int kCpuHasAVX = 0x200;
 static const int kCpuHasAVX2 = 0x400;
 static const int kCpuHasERMS = 0x800;
 static const int kCpuHasFMA3 = 0x1000;
-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VBMI = 0x20000;
+static const int kCpuHasAVX512VBMI2 = 0x40000;
+static const int kCpuHasAVX512VBITALG = 0x80000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
 
 // These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasMIPS_DSP = 0x20000;
-static const int kCpuHasMIPS_DSPR2 = 0x40000;
+static const int kCpuHasMIPS = 0x200000;
+static const int kCpuHasMSA = 0x400000;
+static const int kCpuHasMMI = 0x800000;
 
-// Internal function used to auto-init.
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
 LIBYUV_API
 int InitCpuFlags(void);
 
-// Internal function for parsing /proc/cpuinfo.
-LIBYUV_API
-int ArmCpuCaps(const char* cpuinfo_name);
-
 // Detect CPU has SSE2 etc.
 // Test_flag parameter should be one of kCpuHas constants above.
-// returns non-zero if instruction set is detected
+// Returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
+#ifdef __ATOMIC_RELAXED
+  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+  int cpu_info = cpu_info_;
+#endif
+  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
 }
 
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
+
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
 // MaskCpuFlags(-1) to enable all cpu specific optimizations.
-// MaskCpuFlags(0) to disable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
 LIBYUV_API
-void MaskCpuFlags(int enable_flags);
+int MaskCpuFlags(int enable_flags);
+
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+//   again.
+// - enabling CPU features that are not supported by the CPU will result in
+//   undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+  cpu_info_ = cpu_flags;
+#endif
+}
 
 // Low level cpuid for X86. Returns zeros on other CPUs.
 // eax is the info type that you want.
 // ecx is typically the cpu number, and should normally be zero.
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/media/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index fa1e51f9ac..275f8d4c18 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
 #define INCLUDE_LIBYUV_MJPEG_DECODER_H_
 
 #include "libyuv/basic_types.h"
@@ -27,25 +26,24 @@ namespace libyuv {
 extern "C" {
 #endif
 
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
 
 enum JpegSubsamplingType {
   kJpegYuv420,
   kJpegYuv422,
-  kJpegYuv411,
   kJpegYuv444,
   kJpegYuv400,
   kJpegUnknown
 };
 
 struct Buffer {
-  const uint8* data;
+  const uint8_t* data;
   int len;
 };
 
@@ -67,7 +65,7 @@ struct SetJmpErrorMgr;
 class LIBYUV_API MJpegDecoder {
  public:
   typedef void (*CallbackFunction)(void* opaque,
-                                   const uint8* const* data,
+                                   const uint8_t* const* data,
                                    const int* strides,
                                    int rows);
 
@@ -87,7 +85,7 @@ class LIBYUV_API MJpegDecoder {
   // If return value is LIBYUV_TRUE, then the values for all the following
   // getters are populated.
   // src_len is the size of the compressed mjpeg frame in bytes.
-  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+  LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
 
   // Returns width of the last loaded frame in pixels.
   int GetWidth();
@@ -140,18 +138,22 @@ class LIBYUV_API MJpegDecoder {
   // at least GetComponentSize(i). The pointers in planes are incremented
   // to point to after the end of the written data.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
 
   // Decodes the entire image and passes the data via repeated calls to a
   // callback function. Each call will get the data for a whole number of
   // image scanlines.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
-                        int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+                               void* opaque,
+                               int dst_width,
+                               int dst_height);
 
   // The helper function which recognizes the jpeg sub-sampling type.
   static JpegSubsamplingType JpegSubsamplingTypeHelper(
-     int* subsample_x, int* subsample_y, int number_of_components);
+      int* subsample_x,
+      int* subsample_y,
+      int number_of_components);
 
  private:
   void AllocOutputBuffers(int num_outbufs);
@@ -160,7 +162,7 @@ class LIBYUV_API MJpegDecoder {
   LIBYUV_BOOL StartDecode();
   LIBYUV_BOOL FinishDecode();
 
-  void SetScanlinePointers(uint8** data);
+  void SetScanlinePointers(uint8_t** data);
   LIBYUV_BOOL DecodeImcuRow();
 
   int GetComponentScanlinePadding(int component);
@@ -179,15 +181,15 @@ class LIBYUV_API MJpegDecoder {
 
   // Temporaries used to point to scanline outputs.
   int num_outbufs_;  // Outermost size of all arrays below.
-  uint8*** scanlines_;
+  uint8_t*** scanlines_;
   int* scanlines_sizes_;
   // Temporary buffer used for decoding when we can't decode directly to the
   // output buffers. Large enough for just one iMCU row.
-  uint8** databuf_;
+  uint8_t** databuf_;
   int* databuf_strides_;
 };
 
 }  // namespace libyuv
 
 #endif  //  __cplusplus
-#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h b/media/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h
index 7fe4d8eedd..8d868b9542 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 
 #include "libyuv/basic_types.h"
@@ -23,88 +22,273 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
 // Copy a plane of data.
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height);
 
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height);
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height);
+
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 1024 for 10 bits
+                       int width,
+                       int height);
 
 // Set a plane of data to a 32 bit value.
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value);
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value);
+
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height);
+
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height);
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_vu,
+                 int dst_stride_vu,
+                 int width,
+                 int height);
+
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height);
+
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height);
 
 // Copy I400.  Supports inverting.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 #define J400ToJ400 I400ToI400
 
 // Copy I422 to I422.
 #define I422ToI422 I422Copy
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
 
 // Copy I444 to I444.
 #define I444ToI444 I444Copy
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+             int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+             int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_vu, int dst_stride_vu, int width, int height);
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height);
 
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Alias
 #define J420ToJ400 I420ToI400
@@ -112,13 +296,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
 
 // I420 mirror.
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Alias
 #define I400ToI400Mirror I400Mirror
@@ -126,86 +317,134 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
 // I400 mirror.  A single plane is mirrored horizontally.
 // Pass negative height to achieve 180 degree rotation.
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 // Alias
 #define ARGBToARGBMirror ARGBMirror
 
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
 
-// Convert NV21 to RGB565.
+// RGB24 mirror.
 LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// Mirror a plane of data.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height);
 
-// Convert I422 to ABGR.
+// Mirror a plane of UV data.
 LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
 
-// Convert I422 to RGBA.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height);
 
 // Draw a rectangle into I420.
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y, int width, int height,
-             int value_y, int value_u, int value_v);
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v);
 
 // Draw a rectangle into ARGB.
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height, uint32 value);
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value);
 
 // Convert ARGB to gray scale ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height);
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height);
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int x, int y, int width, int height);
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height);
 
 // Apply a matrix rotation to each ARGB pixel.
 // matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
@@ -214,10 +453,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
 // The next 4 coefficients apply to B, G, R, A and produce R of the output.
 // The last 4 coefficients apply to B, G, R, A and produce A of the output.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height);
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height);
 
 // Deprecated. Use ARGBColorMatrix instead.
 // Apply a matrix rotation to each ARGB pixel.
@@ -226,32 +468,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
 // The next 4 coefficients apply to B, G, R, A and produce G of the output.
 // The last 4 coefficients apply to B, G, R, A and produce R of the output.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int x, int y, int width, int height);
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
 
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int x, int y, int width, int height);
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
 
 // Apply a color table each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int x, int y, int width, int height);
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height);
 
 // Apply a luma/color table each ARGB pixel but preserve destination alpha.
 // Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
 // RGB (YJ style) and C is an 8 bit color component (R, G or B).
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma_rgb_table,
-                       int width, int height);
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height);
 
 // Apply a 3 term polynomial to ARGB values.
 // poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
@@ -262,115 +519,230 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
 // A polynomial approximation can be dirived using software such as 'R'.
 
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height);
+                   int width,
+                   int height);
 
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int x, int y, int width, int height);
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height);
 
-// Copy ARGB to ARGB.
+// Convert a buffer of bytes to floats, scale the values and store as floats.
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
 
-// Copy ARGB to ARGB.
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height);
 
 // Copy ARGB to ARGB.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height);
-
-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
-                             uint8* dst_argb, int width);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+                             const uint8_t* src_argb1,
+                             uint8_t* dst_argb,
+                             int width);
 
 // Get function to Alpha Blend ARGB pixels and store to destination.
 LIBYUV_API
 ARGBBlendRow GetARGBBlend();
 
 // Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
 
 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // Add ARGB image with ARGB image. Saturates to 255.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height);
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height);
 
 // Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // Convert I422 to YUY2.
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 // Convert I422 to UYVY.
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 // Convert unattentuated ARGB to preattenuated ARGB.
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
 
 // Convert preattentuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Convert MJPG to ARGB.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h, int dw, int dh);
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height);
 
 // Internal function - do not call directly.
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height);
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height);
 
 // Blur ARGB image.
 // dst_cumsum table of width * (height + 1) * 16 bytes aligned to
@@ -379,76 +751,150 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
 // radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
 // Blur is optimized for radius of 5 (11x11) or less.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius);
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius);
+
+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
 
 // Multiply ARGB image by ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value);
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value);
 
-// Interpolate between two ARGB images using specified amount of interpolation
+// Interpolate between two images using specified amount of interpolation
 // (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
-// and 255 means 1% src_argb0 and 99% src_argb1.
-// Internally uses ARGBScale bilinear filtering.
-// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation);
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation);
 
 // Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* uv_dudv,
+                        int width);
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height);
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height);
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height);
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height);
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
 
 // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/rotate.h b/media/libaom/src/third_party/libyuv/include/libyuv/rotate.h
index 8a9673f280..308882242c 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/rotate.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/rotate.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
 
 #include "libyuv/basic_types.h"
@@ -21,8 +20,8 @@ extern "C" {
 
 // Supported rotation.
 typedef enum RotationMode {
-  kRotate0 = 0,  // No rotation.
-  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate0 = 0,      // No rotation.
+  kRotate90 = 90,    // Rotate 90 degrees clockwise.
   kRotate180 = 180,  // Rotate 180 degrees.
   kRotate270 = 270,  // Rotate 270 degrees clockwise.
 
@@ -34,85 +33,150 @@ typedef enum RotationMode {
 
 // Rotate I420 frame.
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height, enum RotationMode mode);
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
+
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
 
 // Rotate NV12 input and store in I420.
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height, enum RotationMode mode);
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
+                     enum RotationMode mode);
 
 // Rotate a plane by 0, 90, 180, or 270.
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int src_width, int src_height, enum RotationMode mode);
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
+                enum RotationMode mode);
 
 // Rotate planes by 90, 180, 270. Deprecated.
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height);
-
-LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height);
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 // Rotations for when U and V are interleaved.
 // These functions take one input pointer and
 // split the data into two buffers while
 // rotating them. Deprecated.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height);
+
+LIBYUV_API
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 // The 90 and 270 functions are based on transposes.
 // Doing a transpose with reversing the read/write
 // order will result in a rotation by +- 90 degrees.
 // Deprecated.
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h b/media/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h
index 2bdc8ec6b4..20432949ab 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
 #define INCLUDE_LIBYUV_ROTATE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -22,13 +21,17 @@ extern "C" {
 
 // Rotate ARGB frame
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height, enum RotationMode mode);
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               enum RotationMode mode);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h b/media/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h
index d0bfbdd2b0..022293eef2 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
 #define INCLUDE_LIBYUV_ROTATE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -19,121 +18,206 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
-
-// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#if defined(__APPLE__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".private_extern _" #name "                \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#else
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-#name ":                                       \n"
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
 #endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
 #endif
-
-// The following are available for Visual C:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    defined(_MSC_VER) && !defined(__clang__)
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_TRANSPOSEWX8_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-// The following are available for GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
 #endif
 
-// The following are available for 32 bit GCC:
-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)
-#define HAS_TRANSPOSEUVWX8_SSE2
-#endif
-
-// The following are available for 64 bit GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
 #define HAS_TRANSPOSEWX8_FAST_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_TRANSPOSEWX8_NEON
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_MIPS_DSPR2
-#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
-#endif  // defined(__mips__)
-
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height);
-
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width);
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-
-void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
-                                 uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
-                                 uint8* dst, int dst_stride, int width);
-
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height);
-
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b, int width);
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_TRANSPOSEWX8_MMI
+#define HAS_TRANSPOSEUVWX8_MMI
+#endif
+
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
+
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width);
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width);
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width);
+void TransposeWx8_MMI(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width);
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+
+void TransposeWx8_Any_NEON(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride,
+                            int width);
+void TransposeWx8_Any_MMI(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
+                                 int src_stride,
+                                 uint8_t* dst,
+                                 int dst_stride,
+                                 int width);
+void TransposeWx16_Any_MSA(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
+
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width);
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width);
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_MMI(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst_a,
+                        int dst_stride_a,
+                        uint8_t* dst_b,
+                        int dst_stride_b,
+                        int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx8_Any_MMI(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst_a,
+                            int dst_stride_a,
+                            uint8_t* dst_b,
+                            int dst_stride_b,
+                            int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/row.h b/media/libaom/src/third_party/libyuv/include/libyuv/row.h
index 5c3187ef79..a27788c1f6 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/row.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/row.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROW_H_
 #define INCLUDE_LIBYUV_ROW_H_
 
 #include <stdlib.h>  // For malloc.
@@ -21,35 +20,20 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#ifdef __cplusplus
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
-  uint8* var = reinterpret_cast<uint8*>                                        \
-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
-#endif
-
-#define free_aligned_buffer_64(var) \
-  free(var##_mem);  \
-  var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
 #if defined(__native_client__)
 #define LIBYUV_DISABLE_NEON
 #endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -57,6 +41,35 @@ extern "C" {
 #endif  // clang >= 3.5
 #endif  // __clang__
 
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// clang >= 6.0.0 required for AVX512.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+// clang in xcode follows a different versioning scheme.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if (__clang_major__ >= 7) && !defined(__APPLE__)
+#define CLANG_HAS_AVX512 1
+#endif  // clang >= 7
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
@@ -65,15 +78,15 @@ extern "C" {
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_ARGBSETROW_X86
-#define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
 #define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
@@ -83,14 +96,11 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
-#define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422TOABGRROW_SSSE3
+#define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
-#define HAS_I422TOBGRAROW_SSSE3
-#define HAS_I422TORAWROW_SSSE3
 #define HAS_I422TORGB24ROW_SSSE3
 #define HAS_I422TORGB565ROW_SSSE3
 #define HAS_I422TORGBAROW_SSSE3
@@ -100,18 +110,20 @@ extern "C" {
 #define HAS_J400TOARGBROW_SSE2
 #define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
-#define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORROW_UV_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB24ROW_SSSE3
 #define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
-#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_NV21TORGB24ROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTORGB24ROW_SSSE3
 #define HAS_RAWTOYROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -146,9 +158,9 @@ extern "C" {
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_BLENDPLANEROW_SSSE3
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSE2
 #define HAS_INTERPOLATEROW_SSSE3
 #define HAS_RGBCOLORTABLEROW_X86
 #define HAS_SOBELROW_SSE2
@@ -156,81 +168,55 @@ extern "C" {
 #define HAS_SOBELXROW_SSE2
 #define HAS_SOBELXYROW_SSE2
 #define HAS_SOBELYROW_SSE2
-#endif
 
-// The following are available on x64 Visual C and clangcl.
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
-    (!defined(__clang__) || defined(__SSSE3__))
-#define HAS_I422TOARGBROW_SSSE3
+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
+// caveat: clangcl uses row_win.cc which works.
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
 #endif
-
-// GCC >= 4.7.0 required for AVX2.
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define GCC_HAS_AVX2 1
-#endif  // GNUC >= 4.7
-#endif  // __GNUC__
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif  // clang >= 3.4
-#endif  // __clang__
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif  // VisualStudio >= 2012
-
-// The following are available require VS2012.  Port to GCC.
-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-#define HAS_ARGB1555TOARGBROW_AVX2
-#define HAS_ARGB4444TOARGBROW_AVX2
-#define HAS_ARGBTOARGB1555ROW_AVX2
-#define HAS_ARGBTOARGB4444ROW_AVX2
-#define HAS_ARGBTORGB565DITHERROW_AVX2
-#define HAS_ARGBTORGB565DITHERROW_SSE2
-#define HAS_ARGBTORGB565ROW_AVX2
-#define HAS_I411TOARGBROW_AVX2
-#define HAS_I422TOARGB1555ROW_AVX2
-#define HAS_I422TOARGB4444ROW_AVX2
-#define HAS_I422TORGB565ROW_AVX2
-#define HAS_I444TOARGBROW_AVX2
-#define HAS_J400TOARGBROW_AVX2
-#define HAS_NV12TOARGBROW_AVX2
-#define HAS_NV12TORGB565ROW_AVX2
-#define HAS_NV21TOARGBROW_AVX2
-#define HAS_NV21TORGB565ROW_AVX2
-#define HAS_RGB565TOARGBROW_AVX2
 #endif
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
 #define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBPOLYNOMIALROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
 #define HAS_COPYROW_AVX
-#define HAS_I400TOARGBROW_AVX2
-#define HAS_I422TOABGRROW_AVX2
+#define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+//  #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
-#define HAS_I422TOBGRAROW_AVX2
-#define HAS_I422TORAWROW_AVX2
 #define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
 #define HAS_I422TORGBAROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #define HAS_J422TOARGBROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB24ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB24ROW_AVX2
 #define HAS_SPLITUVROW_AVX2
 #define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
@@ -247,15 +233,94 @@ extern "C" {
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
+#endif
+
+// The following are available for AVX2 Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are also available on x64 Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are disabled when SSSE3 is available:
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATEROW_SSE2
-#define HAS_ARGBBLENDROW_SSE2
-#define HAS_MIRRORROW_SSE2
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
+#define HAS_I210TOAR30ROW_SSSE3
+#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
+#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
+#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTORAWROW_AVX2
+#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
+#define HAS_I210TOAR30ROW_AVX2
+#define HAS_I210TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOAR30ROW_AVX2
+#define HAS_I422TOUYVYROW_AVX2
+#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
+#endif
+
+// The following are available for AVX512 clang x86 platforms:
+// TODO(fbarchard): Port to GCC and Visual C
+// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX512))
+#define HAS_ARGBTORGB24ROW_AVX512VBMI
 #endif
 
 // The following are available on Neon platforms:
@@ -269,77 +334,92 @@ extern "C" {
 #define HAS_ARGB4444TOARGBROW_NEON
 #define HAS_ARGB4444TOUVROW_NEON
 #define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
+#define HAS_ARGBSETROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTOUV411ROW_NEON
-#define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_J400TOARGBROW_NEON
-#define HAS_I411TOARGBROW_NEON
-#define HAS_I422TOABGRROW_NEON
+#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
 #define HAS_I422TOARGB4444ROW_NEON
 #define HAS_I422TOARGBROW_NEON
-#define HAS_I422TOBGRAROW_NEON
-#define HAS_I422TORAWROW_NEON
 #define HAS_I422TORGB24ROW_NEON
 #define HAS_I422TORGB565ROW_NEON
 #define HAS_I422TORGBAROW_NEON
 #define HAS_I422TOUYVYROW_NEON
 #define HAS_I422TOYUY2ROW_NEON
 #define HAS_I444TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
 #define HAS_MERGEUVROW_NEON
 #define HAS_MIRRORROW_NEON
 #define HAS_MIRRORUVROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB24ROW_NEON
 #define HAS_NV12TORGB565ROW_NEON
 #define HAS_NV21TOARGBROW_NEON
-#define HAS_NV21TORGB565ROW_NEON
+#define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
 #define HAS_RAWTOYROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
 #define HAS_RGB24TOYROW_NEON
 #define HAS_RGB565TOARGBROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_RGB565TOYROW_NEON
 #define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
-#define HAS_ARGBSETROW_NEON
+#define HAS_SPLITRGBROW_NEON
 #define HAS_SPLITUVROW_NEON
+#define HAS_SWAPUVROW_NEON
 #define HAS_UYVYTOARGBROW_NEON
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
-#define HAS_I400TOARGBROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
 #define HAS_YUY2TOUV422ROW_NEON
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
-#define HAS_ARGBTORGB565DITHERROW_NEON
 
 // Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
 #define HAS_ARGBGRAYROW_NEON
 #define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
 #define HAS_ARGBMULTIPLYROW_NEON
 #define HAS_ARGBQUANTIZEROW_NEON
 #define HAS_ARGBSEPIAROW_NEON
 #define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
 #define HAS_ARGBSUBTRACTROW_NEON
 #define HAS_INTERPOLATEROW_NEON
 #define HAS_SOBELROW_NEON
@@ -347,73 +427,298 @@ extern "C" {
 #define HAS_SOBELXROW_NEON
 #define HAS_SOBELXYROW_NEON
 #define HAS_SOBELYROW_NEON
-#define HAS_ARGBCOLORMATRIXROW_NEON
-#define HAS_ARGBSHUFFLEROW_NEON
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-#define HAS_COPYROW_MIPS
-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOABGRROW_MIPS_DSPR2
-#define HAS_I422TOARGBROW_MIPS_DSPR2
-#define HAS_I422TOBGRAROW_MIPS_DSPR2
-#define HAS_INTERPOLATEROW_MIPS_DSPR2
-#define HAS_MIRRORROW_MIPS_DSPR2
-#define HAS_MIRRORUVROW_MIPS_DSPR2
-#define HAS_SPLITUVROW_MIPS_DSPR2
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
+#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBBLENDROW_MSA
+#define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBQUANTIZEROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_HALFFLOATROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_MERGEUVROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORUVROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_SETROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_SOBELYROW_MSA
+#define HAS_SPLITUVROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
 #endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_ABGRTOUVROW_MMI
+#define HAS_ABGRTOYROW_MMI
+#define HAS_ARGB1555TOARGBROW_MMI
+#define HAS_ARGB1555TOUVROW_MMI
+#define HAS_ARGB1555TOYROW_MMI
+#define HAS_ARGB4444TOARGBROW_MMI
+#define HAS_ARGB4444TOUVROW_MMI
+#define HAS_ARGB4444TOYROW_MMI
+#define HAS_ARGBADDROW_MMI
+#define HAS_ARGBATTENUATEROW_MMI
+#define HAS_ARGBBLENDROW_MMI
+#define HAS_ARGBCOLORMATRIXROW_MMI
+#define HAS_ARGBCOPYALPHAROW_MMI
+#define HAS_ARGBCOPYYTOALPHAROW_MMI
+#define HAS_ARGBEXTRACTALPHAROW_MMI
+#define HAS_ARGBGRAYROW_MMI
+#define HAS_ARGBMIRRORROW_MMI
+#define HAS_ARGBMULTIPLYROW_MMI
+#define HAS_ARGBSEPIAROW_MMI
+#define HAS_ARGBSETROW_MMI
+#define HAS_ARGBSHADEROW_MMI
+#define HAS_ARGBSHUFFLEROW_MMI
+#define HAS_ARGBSUBTRACTROW_MMI
+#define HAS_ARGBTOARGB1555ROW_MMI
+#define HAS_ARGBTOARGB4444ROW_MMI
+#define HAS_ARGBTORAWROW_MMI
+#define HAS_ARGBTORGB24ROW_MMI
+#define HAS_ARGBTORGB565DITHERROW_MMI
+#define HAS_ARGBTORGB565ROW_MMI
+#define HAS_ARGBTOUV444ROW_MMI
+#define HAS_ARGBTOUVJROW_MMI
+#define HAS_ARGBTOUVROW_MMI
+#define HAS_ARGBTOYJROW_MMI
+#define HAS_ARGBTOYROW_MMI
+#define HAS_BGRATOUVROW_MMI
+#define HAS_BGRATOYROW_MMI
+#define HAS_BLENDPLANEROW_MMI
+#define HAS_COMPUTECUMULATIVESUMROW_MMI
+#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
+#define HAS_HALFFLOATROW_MMI
+#define HAS_I400TOARGBROW_MMI
+#define HAS_I422TOUYVYROW_MMI
+#define HAS_I422TOYUY2ROW_MMI
+#define HAS_I422TOARGBROW_MMI
+#define HAS_I444TOARGBROW_MMI
+#define HAS_INTERPOLATEROW_MMI
+#define HAS_J400TOARGBROW_MMI
+#define HAS_MERGERGBROW_MMI
+#define HAS_MERGEUVROW_MMI
+#define HAS_MIRRORROW_MMI
+#define HAS_MIRRORSPLITUVROW_MMI
+#define HAS_RAWTOARGBROW_MMI
+#define HAS_RAWTORGB24ROW_MMI
+#define HAS_RAWTOUVROW_MMI
+#define HAS_RAWTOYROW_MMI
+#define HAS_RGB24TOARGBROW_MMI
+#define HAS_RGB24TOUVROW_MMI
+#define HAS_RGB24TOYROW_MMI
+#define HAS_RGB565TOARGBROW_MMI
+#define HAS_RGB565TOUVROW_MMI
+#define HAS_RGB565TOYROW_MMI
+#define HAS_RGBATOUVROW_MMI
+#define HAS_RGBATOYROW_MMI
+#define HAS_SOBELROW_MMI
+#define HAS_SOBELTOPLANEROW_MMI
+#define HAS_SOBELXROW_MMI
+#define HAS_SOBELXYROW_MMI
+#define HAS_SOBELYROW_MMI
+#define HAS_SPLITRGBROW_MMI
+#define HAS_SPLITUVROW_MMI
+#define HAS_UYVYTOUVROW_MMI
+#define HAS_UYVYTOYROW_MMI
+#define HAS_YUY2TOUV422ROW_MMI
+#define HAS_YUY2TOUVROW_MMI
+#define HAS_YUY2TOYROW_MMI
+#define HAS_I210TOARGBROW_MMI
+#define HAS_I422TOARGB4444ROW_MMI
+#define HAS_I422TOARGB1555ROW_MMI
+#define HAS_I422TORGB565ROW_MMI
+#define HAS_NV21TORGB24ROW_MMI
+#define HAS_NV12TORGB24ROW_MMI
+#define HAS_I422ALPHATOARGBROW_MMI
+#define HAS_I422TORGB24ROW_MMI
+#define HAS_NV12TOARGBROW_MMI
+#define HAS_NV21TOARGBROW_MMI
+#define HAS_NV12TORGB565ROW_MMI
+#define HAS_YUY2TOARGBROW_MMI
+#define HAS_UYVYTOARGBROW_MMI
+#define HAS_I422TORGBAROW_MMI
 #endif
 
-#if defined(_MSC_VER) && !defined(__CLR_VER)
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define SIMD_ALIGNED32(var) __declspec(align(64)) var
-typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint16 uvec16[8];
-typedef __declspec(align(16)) uint32 uvec32[4];
-typedef __declspec(align(16)) uint8 uvec8[16];
-typedef __declspec(align(32)) int16 lvec16[16];
-typedef __declspec(align(32)) int32 lvec32[8];
-typedef __declspec(align(32)) int8 lvec8[32];
-typedef __declspec(align(32)) uint16 ulvec16[16];
-typedef __declspec(align(32)) uint32 ulvec32[8];
-typedef __declspec(align(32)) uint8 ulvec8[32];
-#elif defined(__GNUC__)
+#endif
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
-typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
-typedef uint32 __attribute__((vector_size(16))) uvec32;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
-typedef int16 __attribute__((vector_size(32))) lvec16;
-typedef int32 __attribute__((vector_size(32))) lvec32;
-typedef int8 __attribute__((vector_size(32))) lvec8;
-typedef uint16 __attribute__((vector_size(32))) ulvec16;
-typedef uint32 __attribute__((vector_size(32))) ulvec32;
-typedef uint8 __attribute__((vector_size(32))) ulvec8;
+#endif
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
 #else
 #define SIMD_ALIGNED(var) var
-#define SIMD_ALIGNED32(var) var
-typedef int16 vec16[8];
-typedef int32 vec32[4];
-typedef int8 vec8[16];
-typedef uint16 uvec16[8];
-typedef uint32 uvec32[4];
-typedef uint8 uvec8[16];
-typedef int16 lvec16[16];
-typedef int32 lvec32[8];
-typedef int8 lvec8[32];
-typedef uint16 ulvec16[16];
-typedef uint32 ulvec32[8];
-typedef uint8 ulvec8[32];
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef float vecf32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
 #endif
 
+#if defined(__aarch64__)
+// This struct is for Arm64 color conversion.
+struct YuvConstants {
+  uvec16 kUVToRB;
+  uvec16 kUVToRB2;
+  uvec16 kUVToG;
+  uvec16 kUVToG2;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#elif defined(__arm__)
+// This struct is for ArmV7 color conversion.
+struct YuvConstants {
+  uvec8 kUVToRB;
+  uvec8 kUVToG;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#else
+// This struct is for Intel color conversion.
+struct YuvConstants {
+  int8_t kUVToB[32];
+  int8_t kUVToG[32];
+  int8_t kUVToR[32];
+  int16_t kUVBiasB[16];
+  int16_t kUVBiasG[16];
+  int16_t kUVBiasR[16];
+  int16_t kYToRgb[16];
+  int16_t kYBiasToRgb[16];
+};
+
+// Offsets into YuvConstants structure
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
+#define KUVBIASB 96
+#define KUVBIASG 128
+#define KUVBIASR 160
+#define KYTORGB 192
+#define KYBIASTORGB 224
+
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+#define align_buffer_64(var, size)                                           \
+  uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63));         /* NOLINT */ \
+  uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);                  \
+  var = 0
+
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
@@ -426,1432 +731,3654 @@ typedef uint8 ulvec8[32];
 #else
 #define LABELALIGN
 #endif
-#if defined(__native_client__) && defined(__x86_64__)
-// r14 is used for MEMOP macros.
-#define NACL_R14 "r14",
-#define BUNDLELOCK ".bundle_lock\n"
-#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
-#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%q" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%q" #base ",%q" #index "," #scale ")"
-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg "\n" \
-    BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%" #arg "\n" \
-    BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
-    BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#else  // defined(__native_client__) && defined(__x86_64__)
-#define NACL_R14
-#define BUNDLEALIGN
-#define MEMACCESS(base) "(%" #base ")"
-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
-#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%" #base ",%" #index "," #scale ")"
-#define MEMMOVESTRING(s, d)
-#define MEMSTORESTRING(reg, d)
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
-    #reg2 "\n"
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#endif  // defined(__native_client__) && defined(__x86_64__)
-
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
-#endif
+
+// Intel Code Analizer markers.  Insert IACA_START IACA_END around code to be
+// measured and then run with iaca -64 libyuv_unittest.
+// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// example of iaca:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define IACA_ASM_START  \
+  ".byte 0x0F, 0x0B\n"  \
+  " movl $111, %%ebx\n" \
+  ".byte 0x64, 0x67, 0x90\n"
+
+#define IACA_ASM_END         \
+  " movl $222, %%ebx\n"      \
+  ".byte 0x64, 0x67, 0x90\n" \
+  ".byte 0x0F, 0x0B\n"
+
+#define IACA_SSC_MARK(MARK_ID)                        \
+  __asm__ __volatile__("\n\t  movl $" #MARK_ID        \
+                       ", %%ebx"                      \
+                       "\n\t  .byte 0x64, 0x67, 0x90" \
+                       :                              \
+                       :                              \
+                       : "memory");
+
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+#else /* Visual C */
+#define IACA_UD_BYTES \
+  { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+  { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
 #endif
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+#define IACA_START     \
+  {                    \
+    IACA_UD_BYTES      \
+    IACA_SSC_MARK(111) \
+  }
+#define IACA_END       \
+  {                    \
+    IACA_SSC_MARK(222) \
+    IACA_UD_BYTES      \
+  }
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToBGRARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_bgra,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToABGRRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_abgr,
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         int width);
-void I422ToRAWRow_NEON(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_raw,
+void ARGBToUVRow_MMI(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
                        int width);
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
-                          int width);
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
-                            int width);
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
-                            int width);
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void BGRAToUVRow_MMI(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ABGRToUVRow_MMI(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGBAToUVRow_MMI(const uint8_t* src_rgb,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RAWToUVRow_MMI(const uint8_t* src_rgb,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+                         int src_stride_argb4444,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width);
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+
+void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width);
-void NV21ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_vu,
-                          uint8* dst_rgb565,
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width);
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RAWToUVRow_Any_MSA(const uint8_t* src_ptr,
+                        int src_stride_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                             int src_stride_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
+                        int src_stride_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
                         int width);
-
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int pix);
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int pix);
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int pix);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int pix);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
-
-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                            uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int pix);
-void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int pix);
-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int pix);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int pix);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                           uint8* dst_u, uint8* dst_v, int pix);
-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                            uint8* dst_u, uint8* dst_v, int pix);
-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
-                              int src_stride_argb1555,
-                              uint8* dst_u, uint8* dst_v, int pix);
-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
-                              int src_stride_argb4444,
-                              uint8* dst_u, uint8* dst_v, int pix);
-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
-                    uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
-                  uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV422Row_SSSE3(const uint8* src_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJ422Row_C(const uint8* src_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
-void MirrorRow_C(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
+                             int src_stride_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+                             int src_stride_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void BGRAToUVRow_C(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ABGRToUVRow_C(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGBAToUVRow_C(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RAWToUVRow_C(const uint8_t* src_rgb,
+                  int src_stride_rgb,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width);
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
                        int width);
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width);
-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
                             int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                   int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
 
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                           int pix);
-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int pix);
-void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
-
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
                   int width);
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_MSA(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void SplitUVRow_MMI(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_Any_AVX2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_Any_NEON(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
+void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
-void CopyRow_C(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
-
-void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_MSA(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
+void MergeUVRow_MMI(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
+void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
+void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
 
-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width);
 
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
 
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width);
 
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
 
-// ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int pix);
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int pix);
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix);
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int pix);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const uint8* shuffler, int pix);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int pix);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int pix);
-
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix);
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
-
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                              int pix);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                                int pix);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                                int pix);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                              int pix);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                                int pix);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                                int pix);
-
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
-                              int pix);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                                int pix);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                                int pix);
-
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int pix);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int pix);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int pix);
-
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
-
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width);
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width);
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width);
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+                     uint8_t* dst_r,
+                     uint8_t* dst_g,
+                     uint8_t* dst_b,
                      int width);
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                           uint8_t* dst_r,
+                           uint8_t* dst_g,
+                           uint8_t* dst_b,
+                           int width);
+void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
+                          uint8_t* dst_r,
+                          uint8_t* dst_g,
+                          uint8_t* dst_b,
+                          int width);
+void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
+                         uint8_t* dst_r,
+                         uint8_t* dst_g,
+                         uint8_t* dst_b,
+                         int width);
+
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width);
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width);
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width);
+void MergeRGBRow_MMI(const uint8_t* src_r,
+                     const uint8_t* src_g,
+                     const uint8_t* src_b,
+                     uint8_t* dst_rgb,
                      int width);
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
+void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void MergeRGBRow_Any_NEON(const uint8_t* src_r,
+                          const uint8_t* src_g,
+                          const uint8_t* src_b,
+                          uint8_t* dst_rgb,
+                          int width);
+void MergeRGBRow_Any_MMI(const uint8_t* src_r,
+                         const uint8_t* src_g,
+                         const uint8_t* src_b,
+                         uint8_t* dst_rgb,
+                         int width);
+
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale, /* 64 for 10 bit */
                      int width);
-void NV21ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_vu,
-                       uint8* dst_argb,
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width);
+
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width);
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width);
+
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
                        int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_argb,
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
                        int width);
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_vu,
-                     uint8* dst_argb,
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width);
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width);
+void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int scale,
+                              int width);
+
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
+void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
+
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
+
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width);
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width);
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width);
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              const uint8_t* param,
+                              int width);
+void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
+                             int width);
+void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
+                             int width);
+void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const uint8_t* param,
+                            int width);
+void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const uint8_t* param,
+                            int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width);
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width);
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width);
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width);
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width);
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
+                         int width);
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width);
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width);
+
+void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+
+void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+
+void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
+
+void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width);
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width);
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
+                            int width);
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
+                            int width);
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               const uint32_t dither4,
+                               int width);
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               const uint32_t dither4,
+                               int width);
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* dst_argb,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* dst_argb,
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void J422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToBGRARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_bgra,
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToABGRRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_abgr,
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_rgba,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* dst_rgb24,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
                       int width);
-void I422ToRAWRow_C(const uint8* src_y,
-                    const uint8* src_u,
-                    const uint8* src_v,
-                    uint8* dst_raw,
-                    int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
-                         int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
-                         int width);
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
                        int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width);
-void I422ToBGRARow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToABGRRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_vu,
-                         uint8* dst_argb,
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_argb,
-                           int width);
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_vu,
-                           uint8* dst_argb,
-                           int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_argb,
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             const uint8_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
                           int width);
-void NV21ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_vu,
-                          uint8* dst_argb,
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
                           int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* vu_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* vu_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void J422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         int width);
-void J422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToBGRARow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_bgra,
-                         int width);
-void I422ToABGRRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_abgr,
-                         int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgba,
-                         int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_rgba,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
                            int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_argb,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb24,
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                          const uint8_t* u_buf,
+                          const uint8_t* v_buf,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
-                         int width);
-void I422ToRAWRow_SSSE3(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_raw,
-                        int width);
-void I422ToRAWRow_AVX2(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_raw,
-                       int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToBGRARow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToABGRRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_uv,
-                             uint8* dst_argb,
-                             int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_vu,
-                             uint8* dst_argb,
-                             int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
-                            int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
-                            int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_uv,
-                               uint8* dst_argb,
-                               int width);
-void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_vu,
-                               uint8* dst_argb,
-                               int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                                  const uint8_t* u_buf,
+                                  const uint8_t* v_buf,
+                                  const uint8_t* a_buf,
+                                  uint8_t* dst_ptr,
+                                  const struct YuvConstants* yuvconstants,
+                                  int width);
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
-void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_vu,
-                              uint8* dst_argb,
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
-                             uint8* dst_argb,
-                             int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
-                             uint8* dst_argb,
-                             int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
-                            uint8* dst_argb,
-                            int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
-                            uint8* dst_argb,
-                            int width);
-void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void J422ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_bgra,
-                             int width);
-void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_abgr,
-                             int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_rgba,
-                             int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_rgba,
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
                                int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_rgba,
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToRAWRow_Any_AVX2(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
-                           int width);
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
+
+// Unattenuated planar alpha blend.
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void BlendPlaneRow_MMI(const uint8_t* src0,
+                       const uint8_t* src1,
+                       const uint8_t* alpha,
+                       uint8_t* dst,
+                       int width);
+void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width);
 
 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 
 // ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
-                  uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width);
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
+void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
+void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
 
 // ARGB subtract images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int pix);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int pix);
-
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToBGRARow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToABGRRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
-                             int width);
-void I422ToRAWRow_Any_NEON(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
-                           int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
-                              int width);
-void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
-                            uint8* dst_argb,
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   const uint32_t param,
+                                   int width);
+
+void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
-                            uint8* dst_argb,
+void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   const uint32_t param,
+                                   int width);
+
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              int width);
-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              int width);
-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              int width);
-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
-                              int width);
-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
                               int width);
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I444ToARGBRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                const uint8_t* a_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+                   int stride_ayuv,
+                   uint8_t* dst_uv,
+                   int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+                   int stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+                      int stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
+                          int stride_ayuv,
+                          uint8_t* dst_uv,
+                          int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
+                          int stride_ayuv,
+                          uint8_t* dst_vu,
+                          int width);
 
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix);
-
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int pix);
-
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_yuy2, int width);
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_uyvy, int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width);
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width);
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width);
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
 
 // Effects related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 
 // Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
 
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-
-void ARGBSepiaRow_C(uint8* dst_argb, int width);
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
-
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width);
-
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
-
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
-
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width);
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width);
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width);
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width);
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width);
+
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width);
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
+                           int width);
+
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width);
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width);
+
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width);
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+                         int scale,
+                         int interval_size,
+                         int interval_offset,
+                         int width);
+
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value);
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value);
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value);
 
 // Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
+                                    int count);
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width);
 
-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width);
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+                                 int32_t* cumsum,
+                                 const int32_t* previous_cumsum,
+                                 int width);
+
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count);
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width);
 
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width);
 
 // Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                      ptrdiff_t src_stride_ptr,
-                      int width, int source_y_fraction);
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
-                         int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width,
+                      int source_y_fraction);
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                               ptrdiff_t src_stride_ptr, int width,
-                               int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
-                             int source_y_fraction);
-void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              ptrdiff_t src_stride_ptr,
+                              int width,
                               int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride_ptr, int width,
-                                   int source_y_fraction);
+void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            ptrdiff_t src_stride_ptr,
+                            int width,
+                            int source_y_fraction);
+void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            ptrdiff_t src_stride_ptr,
+                            int width,
+                            int source_y_fraction);
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         ptrdiff_t src_stride_ptr,
-                         int width, int source_y_fraction);
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction);
 
 // Sobel images.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width);
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width);
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width);
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb, const float* poly,
-                         int width);
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width);
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width);
-
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width);
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int width);
+void SobelXRow_MMI(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int width);
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width);
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int width);
+void SobelYRow_MMI(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int width);
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width);
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_MSA(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelRow_MMI(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width);
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width);
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width);
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width);
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width);
+void SobelRow_Any_SSE2(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_NEON(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_MSA(const uint8_t* y_buf,
+                      const uint8_t* uv_buf,
+                      uint8_t* dst_ptr,
+                      int width);
+void SobelRow_Any_MMI(const uint8_t* y_buf,
+                      const uint8_t* uv_buf,
+                      uint8_t* dst_ptr,
+                      int width);
+void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
+void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
+
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
+                            int width);
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
+                            int width);
+
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_F16C(const uint16_t* src,
+                           uint16_t* dst,
+                           float scale,
+                           int width);
+void HalfFloat1Row_F16C(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_F16C(const uint16_t* src,
+                            uint16_t* dst,
+                            float scale,
+                            int width);
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
+                            uint16_t* dst_ptr,
+                            float param,
+                            int width);
+void HalfFloatRow_MSA(const uint16_t* src,
+                      uint16_t* dst,
+                      float scale,
+                      int width);
+void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          float param,
+                          int width);
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
+                             float* dst_ptr,
+                             float param,
+                             int width);
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff);
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff);
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+                       const uint16_t* src_u,
+                       const uint16_t* src_v,
+                       uint8_t* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I210ToARGBRow_Any_MMI(const uint16_t* y_buf,
+                           const uint16_t* u_buf,
+                           const uint16_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGBARow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                const uint8_t* a_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width);
+
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROW_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/scale.h b/media/libaom/src/third_party/libyuv/include/libyuv/scale.h
index 3974aba34e..add5a9eb62 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/scale.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/scale.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_H_
 #define INCLUDE_LIBYUV_SCALE_H_
 
 #include "libyuv/basic_types.h"
@@ -21,25 +20,33 @@ extern "C" {
 
 // Supported filtering.
 typedef enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest.
-  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
   kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 3  // Highest quality.
+  kFilterBox = 3        // Highest quality.
 } FilterModeEnum;
 
 // Scale a YUV plane.
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering);
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
                    enum FilterMode filtering);
 
 // Scales a YUV 4:2:0 image from the src width and height to the
@@ -53,43 +60,136 @@ void ScalePlane_16(const uint16* src, int src_stride,
 // Returns 0 if successful.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering);
 
-#ifdef __cplusplus
-// Legacy API.  Deprecated.
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce ever better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
-          LIBYUV_BOOL interpolate);
+int I444Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering);
+
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
 
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering);
+
+#ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate);
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
+          LIBYUV_BOOL interpolate);
 
 // For testing, allow disabling of specialized scalers.
 LIBYUV_API
@@ -101,4 +201,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use);
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h b/media/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
index 22563837dd..7641f18e34 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
 #define INCLUDE_LIBYUV_SCALE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -21,33 +20,52 @@ extern "C" {
 #endif
 
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 // Clipped scale takes destination rectangle coordinates for clip values.
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering);
 
-// TODO(fbarchard): Implement this.
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering);
 
 #ifdef __cplusplus
@@ -55,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/scale_row.h b/media/libaom/src/third_party/libyuv/include/libyuv/scale_row.h
index a46b5ce692..a386d49989 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/scale_row.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/scale_row.h
@@ -1,15 +1,14 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -20,14 +19,37 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
 
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -36,6 +58,7 @@ extern "C" {
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_FIXEDDIV1_X86
 #define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
 #define HAS_SCALEARGBCOLS_SSE2
 #define HAS_SCALEARGBCOLSUP2_SSE2
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -43,28 +66,45 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 #define HAS_SCALECOLSUP2_SSE2
 #define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN2_SSSE3
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEROWDOWN4_SSE2
+#define HAS_SCALEROWDOWN4_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&                                     \
+    (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
 #endif
 
-// The following are available on VS2012:
-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif
 
-// The following are available on Visual C:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
-#define HAS_SCALEADDROW_SSE2
-#endif
-
 // The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
 #define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEFILTERCOLS_NEON
@@ -72,408 +112,1256 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN34_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEROWDOWN4_MSA
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_MIPS_DSPR2
-#define HAS_SCALEROWDOWN4_MIPS_DSPR2
-#define HAS_SCALEROWDOWN34_MIPS_DSPR2
-#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_FIXEDDIV1_MIPS
+#define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
+#define HAS_SCALEARGBCOLS_MMI
+#define HAS_SCALEARGBCOLSUP2_MMI
+#define HAS_SCALEARGBROWDOWN2_MMI
+#define HAS_SCALEARGBROWDOWNEVEN_MMI
+#define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
+#define HAS_SCALEROWDOWN34_MMI
 #endif
 
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering);
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering);
 
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering);
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering);
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering);
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div);
 int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
 int FixedDiv1_C(int num, int div);
 int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
 #ifdef HAS_FIXEDDIV_X86
 #define FixedDiv FixedDiv_X86
 #define FixedDiv1 FixedDiv1_X86
+#elif defined HAS_FIXEDDIV_MIPS
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS
 #else
 #define FixedDiv FixedDiv_C
 #define FixedDiv1 FixedDiv1_C
 #endif
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy);
-
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx);
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx);
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int, int);
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int, int);
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                          int dst_width, int x, int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x, int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                            int dst_width, int x, int dx);
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                int* x,
+                int* y,
+                int* dx,
+                int* dy);
+
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx);
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx);
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int,
+                    int);
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int,
+                       int);
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx);
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                         uint8_t* dst_argb,
+                         int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx);
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x, int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int, int);
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x, int dx);
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int,
+                        int);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_uv,
+                       int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_uv,
+                          int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          int src_stepx,
+                          uint8_t* dst_uv,
+                          int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+                   const uint8_t* src_uv,
+                   int dst_width,
+                   int x,
+                   int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+                     const uint8_t* src_uv,
+                     int dst_width,
+                     int x32,
+                     int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+                      const uint8_t* src_uv,
+                      int dst_width,
+                      int,
+                      int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+                         const uint8_t* src_uv,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+                           const uint8_t* src_uv,
+                           int dst_width,
+                           int x32,
+                           int dx);
 
 // Specialized scalers for x86.
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
 
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
 
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
 
 // ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                                  int dst_width, int x, int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                            int dst_width, int x, int dx);
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx);
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx);
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  int dst_width,
+                                  int x,
+                                  int dx);
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            int dst_width,
+                            int x,
+                            int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
 
 // ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst, int dst_width);
-
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_uv,
+                           int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_uv,
+                                 int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_uv,
+                              int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_uv,
+                                int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_uv,
+                         int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_uv,
+                         int dst_width);
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              int src_stepx,
+                              uint8_t* dst_uv,
+                              int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_uv,
+                                 int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                int src_stepx,
+                                uint8_t* dst_uv,
+                                int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            int32_t src_stepx,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            int32_t src_stepx,
+                            uint8_t* dst_uv,
+                            int dst_width);
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_uv,
+                               int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    int src_stepx,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                int32_t src_stepx,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                int32_t src_stepx,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
 
 // Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 //  to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
+                         uint8_t* dst_ptr,
+                         int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst, int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-
-void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx);
-
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int x, int dx);
-
-
-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width);
-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width);
-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width);
-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              int dst_width,
+                              int x,
+                              int dx);
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint16_t* dst,
+                          int dst_width);
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint16_t* dst,
+                                int dst_width);
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint16_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint16_t* dst,
+                          int dst_width);
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint16_t* dst,
+                             int dst_width);
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+                        uint32_t* dst_ptr,
+                        int src_width);
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
+                      int dst_width,
+                      int x,
+                      int dx);
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+                          const uint8_t* src_argb,
+                          int dst_width,
+                          int x,
+                          int dx);
 
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/scale_uv.h b/media/libaom/src/third_party/libyuv/include/libyuv/scale_uv.h
new file mode 100644
index 0000000000..1b6327aaed
--- /dev/null
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/scale_uv.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_UV_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/version.h b/media/libaom/src/third_party/libyuv/include/libyuv/version.h
index 287b98ebf2..efaac73e3a 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/version.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/version.h
@@ -1,17 +1,16 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1456
+#define LIBYUV_VERSION 1768
 
-#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/media/libaom/src/third_party/libyuv/include/libyuv/video_common.h b/media/libaom/src/third_party/libyuv/include/libyuv/video_common.h
index 7b0a19cc90..b9823d71d0 100644
--- a/media/libaom/src/third_party/libyuv/include/libyuv/video_common.h
+++ b/media/libaom/src/third_party/libyuv/include/libyuv/video_common.h
@@ -1,17 +1,16 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 // Common definitions for video, including fourcc and VideoFormat.
 
-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
 #define INCLUDE_LIBYUV_VIDEO_COMMON_H_
 
 #include "libyuv/basic_types.h"
@@ -29,13 +28,14 @@ extern "C" {
 // Needs to be a macro otherwise the OS X compiler complains when the kFormat*
 // constants are used in a switch.
 #ifdef __cplusplus
-#define FOURCC(a, b, c, d) ( \
-    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
-    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#define FOURCC(a, b, c, d)                                        \
+  ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+   (static_cast<uint32_t>(c) << 16) | /* NOLINT */                \
+   (static_cast<uint32_t>(d) << 24))  /* NOLINT */
 #else
-#define FOURCC(a, b, c, d) ( \
-    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
-    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#define FOURCC(a, b, c, d)                                     \
+  (((uint32_t)(a)) | ((uint32_t)(b) << 8) |       /* NOLINT */ \
+   ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
 #endif
 
 // Some pages discussing FourCC codes:
@@ -50,48 +50,60 @@ extern "C" {
 // Secondary formats are converted in 2 steps.
 // Auxilliary formats call primary converters.
 enum FourCC {
-  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
   FOURCC_I420 = FOURCC('I', '4', '2', '0'),
   FOURCC_I422 = FOURCC('I', '4', '2', '2'),
   FOURCC_I444 = FOURCC('I', '4', '4', '4'),
-  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
   FOURCC_I400 = FOURCC('I', '4', '0', '0'),
   FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
   FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
   FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
   FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+  FOURCC_I010 = FOURCC('I', '0', '1', '0'),  // bt.601 10 bit 420
+  FOURCC_I210 = FOURCC('I', '0', '1', '0'),  // bt.601 10 bit 422
 
-  // 2 Secondary YUV formats: row biplanar.
+  // 1 Secondary YUV format: row biplanar.  deprecated.
   FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
 
-  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
   FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
   FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_AR30 = FOURCC('A', 'R', '3', '0'),  // 10 bit per channel. 2101010.
+  FOURCC_AB30 = FOURCC('A', 'B', '3', '0'),  // ABGR version of 10 bit
   FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
-  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
   FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
   FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
   FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
   FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
 
-  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
-  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
-  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
-  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
-  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
   // 1 Primary Compressed YUV format.
   FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
 
-  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
   FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
   FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
   FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
   FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
-  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
-  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+  FOURCC_J420 =
+      FOURCC('J', '4', '2', '0'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J422 =
+      FOURCC('J', '4', '2', '2'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J444 =
+      FOURCC('J', '4', '4', '4'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_J400 =
+      FOURCC('J', '4', '0', '0'),  // jpeg (bt.601 full), unofficial fourcc
+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // bt.709, unofficial fourcc
+  FOURCC_H422 = FOURCC('H', '4', '2', '2'),  // bt.709, unofficial fourcc
+  FOURCC_H444 = FOURCC('H', '4', '4', '4'),  // bt.709, unofficial fourcc
+  FOURCC_U420 = FOURCC('U', '4', '2', '0'),  // bt.2020, unofficial fourcc
+  FOURCC_U422 = FOURCC('U', '4', '2', '2'),  // bt.2020, unofficial fourcc
+  FOURCC_U444 = FOURCC('U', '4', '4', '4'),  // bt.2020, unofficial fourcc
+  FOURCC_H010 = FOURCC('H', '0', '1', '0'),  // bt.709 10 bit 420
+  FOURCC_U010 = FOURCC('U', '0', '1', '0'),  // bt.2020 10 bit 420
+  FOURCC_H210 = FOURCC('H', '0', '1', '0'),  // bt.709 10 bit 422
+  FOURCC_U210 = FOURCC('U', '0', '1', '0'),  // bt.2020 10 bit 422
 
   // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
   FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
@@ -112,7 +124,13 @@ enum FourCC {
   FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
   FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
 
-  // 1 Auxiliary compressed YUV format set aside for capturer.
+  // deprecated formats.  Not supported, but defined for backward compatibility.
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
   FOURCC_H264 = FOURCC('H', '2', '6', '4'),
 
   // Match any fourcc.
@@ -130,14 +148,16 @@ enum FourCCBpp {
   FOURCC_BPP_NV12 = 12,
   FOURCC_BPP_YUY2 = 16,
   FOURCC_BPP_UYVY = 16,
-  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_M420 = 12,  // deprecated
   FOURCC_BPP_Q420 = 12,
   FOURCC_BPP_ARGB = 32,
   FOURCC_BPP_BGRA = 32,
   FOURCC_BPP_ABGR = 32,
   FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_AR30 = 32,
+  FOURCC_BPP_AB30 = 32,
   FOURCC_BPP_24BG = 24,
-  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RAW = 24,
   FOURCC_BPP_RGBP = 16,
   FOURCC_BPP_RGBO = 16,
   FOURCC_BPP_R444 = 16,
@@ -151,6 +171,9 @@ enum FourCCBpp {
   FOURCC_BPP_YU12 = 12,
   FOURCC_BPP_J420 = 12,
   FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_H422 = 16,
+  FOURCC_BPP_H010 = 24,
   FOURCC_BPP_MJPG = 0,  // 0 means unknown.
   FOURCC_BPP_H264 = 0,
   FOURCC_BPP_IYUV = 12,
@@ -169,15 +192,15 @@ enum FourCCBpp {
   FOURCC_BPP_CM24 = 24,
 
   // Match any fourcc.
-  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
 };
 
 // Converts fourcc aliases into canonical ones.
-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_
diff --git a/media/libaom/src/third_party/libyuv/source/compare.cc b/media/libaom/src/third_party/libyuv/source/compare.cc
index 46aa8473d2..e93aba1b53 100644
--- a/media/libaom/src/third_party/libyuv/source/compare.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare.cc
@@ -17,6 +17,7 @@
 #endif
 
 #include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
 #include "libyuv/video_common.h"
@@ -27,29 +28,12 @@ extern "C" {
 #endif
 
 // hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
-
-// This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
-#define HAS_HASHDJB2_SSE41
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
-
-#ifdef VISUALC_HAS_AVX2
-#define HAS_HASHDJB2_AVX2
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
-#endif
-
-#endif  // HAS_HASHDJB2_SSE41
-
-// hash seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
   const int kBlockSize = 1 << 15;  // 32768;
   int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
+      HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
   if (TestCpuFlag(kCpuHasSSE41)) {
     HashDjb2_SSE = HashDjb2_SSE41;
@@ -61,37 +45,37 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
   }
 #endif
 
-  while (count >= (uint64)(kBlockSize)) {
+  while (count >= (uint64_t)(kBlockSize)) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
     src += kBlockSize;
     count -= kBlockSize;
   }
-  remainder = (int)(count) & ~15;
+  remainder = (int)count & ~15;
   if (remainder) {
     seed = HashDjb2_SSE(src, remainder, seed);
     src += remainder;
     count -= remainder;
   }
-  remainder = (int)(count) & 15;
+  remainder = (int)count & 15;
   if (remainder) {
     seed = HashDjb2_C(src, remainder, seed);
   }
   return seed;
 }
 
-static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
       return FOURCC_BGRA;
     }
-    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+    if (argb[3] != 255) {  // Fourth byte is not Alpha of 255, so not BGRA.
       return FOURCC_ARGB;
     }
     if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
       return FOURCC_BGRA;
     }
-    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
+    if (argb[7] != 255) {  // Second pixel fourth byte is not Alpha of 255.
       return FOURCC_ARGB;
     }
     argb += 8;
@@ -110,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
-  uint32 fourcc = 0;
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height) {
+  uint32_t fourcc = 0;
   int h;
 
   // Coalesce rows.
@@ -127,36 +114,86 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
   return fourcc;
 }
 
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SUMSQUAREERROR_NEON
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
+// So actual maximum is 1 less loop, which is 64436 - 32 bytes.
+
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  const int kSimdSize = 64;
+  // SIMD for multiple of 64, and C for remainder
+  int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
+  uint64_t diff = 0;
+  int i;
+  uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
+                              int count) = HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HammingDistance = HammingDistance_NEON;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    HammingDistance = HammingDistance_SSSE3;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+  if (TestCpuFlag(kCpuHasSSE42)) {
+    HammingDistance = HammingDistance_SSE42;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HammingDistance = HammingDistance_AVX2;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    HammingDistance = HammingDistance_MMI;
+  }
 #endif
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SUMSQUAREERROR_SSE2
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HammingDistance = HammingDistance_MSA;
+  }
 #endif
 
-#ifdef VISUALC_HAS_AVX2
-#define HAS_SUMSQUAREERROR_AVX2
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
 #endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    diff += HammingDistance(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & (kSimdSize - 1);
+  if (remainder) {
+    diff += HammingDistance_C(src_a, src_b, remainder);
+  }
+  return diff;
+}
 
 // TODO(fbarchard): Refactor into row function.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
-                             int count) {
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
   // SumSquareError returns values 0 to 65535 for each squared difference.
-  // Up to 65536 of those can be summed and remain within a uint32.
-  // After each block of 65536 pixels, accumulate into a uint64.
+  // Up to 65536 of those can be summed and remain within a uint32_t.
+  // After each block of 65536 pixels, accumulate into a uint64_t.
   const int kBlockSize = 65536;
   int remainder = count & (kBlockSize - 1) & ~31;
-  uint64 sse = 0;
+  uint64_t sse = 0;
   int i;
-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
-      SumSquareError_C;
+  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+                             int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
@@ -174,8 +211,18 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
+#if defined(HAS_SUMSQUAREERROR_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SumSquareError = SumSquareError_MMI;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SumSquareError = SumSquareError_MSA;
+  }
+#endif
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
 #endif
   for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@@ -195,14 +242,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
 }
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height) {
-  uint64 sse = 0;
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height) {
+  uint64_t sse = 0;
   int h;
   // Coalesce rows.
-  if (stride_a == width &&
-      stride_b == width) {
+  if (stride_a == width && stride_b == width) {
     width *= height;
     height = 1;
     stride_a = stride_b = 0;
@@ -216,66 +265,76 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
 }
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
   double psnr;
   if (sse > 0) {
-    double mse = (double)(count) / (double)(sse);
+    double mse = (double)count / (double)sse;
     psnr = 10.0 * log10(255.0 * 255.0 * mse);
   } else {
-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
   }
 
-  if (psnr > kMaxPsnr)
+  if (psnr > kMaxPsnr) {
     psnr = kMaxPsnr;
+  }
 
   return psnr;
 }
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
-  const uint64 samples = width * height;
-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
-                                                src_b, stride_b,
-                                                width, height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
+  const uint64_t samples = (uint64_t)width * (uint64_t)height;
+  const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+                                                  stride_b, width, height);
   return SumSquareErrorToPsnr(sse, samples);
 }
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
-                                                  src_y_b, stride_y_b,
-                                                  width, height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const uint64_t sse_y = ComputeSumSquareErrorPlane(
+      src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
-                                                  src_u_b, stride_u_b,
-                                                  width_uv, height_uv);
-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
-                                                  src_v_b, stride_v_b,
-                                                  width_uv, height_uv);
-  const uint64 samples = width * height + 2 * (width_uv * height_uv);
-  const uint64 sse = sse_y + sse_u + sse_v;
+  const uint64_t sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64_t sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+  const uint64_t samples = (uint64_t)width * (uint64_t)height +
+                           2 * ((uint64_t)width_uv * (uint64_t)height_uv);
+  const uint64_t sse = sse_y + sse_u + sse_v;
   return SumSquareErrorToPsnr(sse, samples);
 }
 
-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
-static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
 
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
-                        const uint8* src_b, int stride_b) {
-  int64 sum_a = 0;
-  int64 sum_b = 0;
-  int64 sum_sq_a = 0;
-  int64 sum_sq_b = 0;
-  int64 sum_axb = 0;
+static double Ssim8x8_C(const uint8_t* src_a,
+                        int stride_a,
+                        const uint8_t* src_b,
+                        int stride_b) {
+  int64_t sum_a = 0;
+  int64_t sum_b = 0;
+  int64_t sum_sq_a = 0;
+  int64_t sum_sq_b = 0;
+  int64_t sum_axb = 0;
 
   int i;
   for (i = 0; i < 8; ++i) {
@@ -293,22 +352,22 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
   }
 
   {
-    const int64 count = 64;
+    const int64_t count = 64;
     // scale the constants by number of pixels
-    const int64 c1 = (cc1 * count * count) >> 12;
-    const int64 c2 = (cc2 * count * count) >> 12;
+    const int64_t c1 = (cc1 * count * count) >> 12;
+    const int64_t c2 = (cc2 * count * count) >> 12;
 
-    const int64 sum_a_x_sum_b = sum_a * sum_b;
+    const int64_t sum_a_x_sum_b = sum_a * sum_b;
 
-    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
-                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+    const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
+                           (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
 
-    const int64 sum_a_sq = sum_a*sum_a;
-    const int64 sum_b_sq = sum_b*sum_b;
+    const int64_t sum_a_sq = sum_a * sum_a;
+    const int64_t sum_b_sq = sum_b * sum_b;
 
-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                         (count * sum_sq_a - sum_a_sq +
-                          count * sum_sq_b - sum_b_sq + c2);
+    const int64_t ssim_d =
+        (sum_a_sq + sum_b_sq + c1) *
+        (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
 
     if (ssim_d == 0.0) {
       return DBL_MAX;
@@ -321,13 +380,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
   int samples = 0;
   double ssim_total = 0;
-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+  double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b,
+                    int stride_b) = Ssim8x8_C;
 
   // sample point start with each 4x4 location
   int i;
@@ -347,22 +409,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
 }
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
-                                      src_y_b, stride_y_b, width, height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const double ssim_y =
+      CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
-                                      src_u_b, stride_u_b,
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
                                       width_uv, height_uv);
-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
-                                      src_v_b, stride_v_b,
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
                                       width_uv, height_uv);
   return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
 }
diff --git a/media/libaom/src/third_party/libyuv/source/compare_common.cc b/media/libaom/src/third_party/libyuv/source/compare_common.cc
index c546b51829..d4b170ad98 100644
--- a/media/libaom/src/third_party/libyuv/source/compare_common.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare_common.cc
@@ -10,25 +10,87 @@
 
 #include "libyuv/basic_types.h"
 
+#include "libyuv/compare_row.h"
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
+#if ORIGINAL_OPT
+uint32_t HammingDistance_C1(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count; ++i) {
+    int x = src_a[i] ^ src_b[i];
+    if (x & 1)
+      ++diff;
+    if (x & 2)
+      ++diff;
+    if (x & 4)
+      ++diff;
+    if (x & 8)
+      ++diff;
+    if (x & 16)
+      ++diff;
+    if (x & 32)
+      ++diff;
+    if (x & 64)
+      ++diff;
+    if (x & 128)
+      ++diff;
+  }
+  return diff;
+}
+#endif
+
+// Hakmem method for hamming distance.
+uint32_t HammingDistance_C(const uint8_t* src_a,
+                           const uint8_t* src_b,
+                           int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
+    uint32_t u = x - ((x >> 1) & 0x55555555);
+    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+    src_a += 4;
+    src_b += 4;
+  }
+
+  for (; i < count; ++i) {
+    uint32_t x = *src_a ^ *src_b;
+    uint32_t u = x - ((x >> 1) & 0x55);
+    u = ((u >> 2) & 0x33) + (u & 0x33);
+    diff += (u + (u >> 4)) & 0x0f;
+    src_a += 1;
+    src_b += 1;
+  }
+
+  return diff;
+}
+
+uint32_t SumSquareError_C(const uint8_t* src_a,
+                          const uint8_t* src_b,
+                          int count) {
+  uint32_t sse = 0u;
   int i;
   for (i = 0; i < count; ++i) {
     int diff = src_a[i] - src_b[i];
-    sse += (uint32)(diff * diff);
+    sse += (uint32_t)(diff * diff);
   }
   return sse;
 }
 
 // hash seed of 5381 recommended.
 // Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
   int i;
   for (i = 0; i < count; ++i) {
     hash += (hash << 5) + src[i];
diff --git a/media/libaom/src/third_party/libyuv/source/compare_gcc.cc b/media/libaom/src/third_party/libyuv/source/compare_gcc.cc
index 247cb33bba..6700f9697e 100644
--- a/media/libaom/src/third_party/libyuv/source/compare_gcc.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare_gcc.cc
@@ -9,6 +9,8 @@
  */
 
 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -16,131 +18,338 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse;
-  asm volatile (  // NOLINT
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psubusb   %%xmm2,%%xmm1                   \n"
-    "psubusb   %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpckhbw %%xmm5,%%xmm2                   \n"
-    "pmaddwd   %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm2,%%xmm2                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-
-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0,%3                       \n"
-
-  : "+r"(src_a),      // %0
-    "+r"(src_b),      // %1
-    "+r"(count),      // %2
-    "=g"(sse)         // %3
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );  // NOLINT
-  return sse;
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint64_t diff = 0u;
+
+  asm volatile(
+      "xor         %3,%3                         \n"
+      "xor         %%r8,%%r8                     \n"
+      "xor         %%r9,%%r9                     \n"
+      "xor         %%r10,%%r10                   \n"
+
+      // Process 32 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov         (%0),%%rcx                    \n"
+      "mov         0x8(%0),%%rdx                 \n"
+      "xor         (%1),%%rcx                    \n"
+      "xor         0x8(%1),%%rdx                 \n"
+      "popcnt      %%rcx,%%rcx                   \n"
+      "popcnt      %%rdx,%%rdx                   \n"
+      "mov         0x10(%0),%%rsi                \n"
+      "mov         0x18(%0),%%rdi                \n"
+      "xor         0x10(%1),%%rsi                \n"
+      "xor         0x18(%1),%%rdi                \n"
+      "popcnt      %%rsi,%%rsi                   \n"
+      "popcnt      %%rdi,%%rdi                   \n"
+      "add         $0x20,%0                      \n"
+      "add         $0x20,%1                      \n"
+      "add         %%rcx,%3                      \n"
+      "add         %%rdx,%%r8                    \n"
+      "add         %%rsi,%%r9                    \n"
+      "add         %%rdi,%%r10                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+
+      "add         %%r8, %3                      \n"
+      "add         %%r9, %3                      \n"
+      "add         %%r10, %3                     \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=r"(diff)    // %3
+      :
+      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+  return static_cast<uint32_t>(diff);
 }
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
 
-#endif  // defined(__x86_64__) || defined(__i386__)
+  asm volatile(
+      // Process 16 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov         (%0),%%ecx                    \n"
+      "mov         0x4(%0),%%edx                 \n"
+      "xor         (%1),%%ecx                    \n"
+      "xor         0x4(%1),%%edx                 \n"
+      "popcnt      %%ecx,%%ecx                   \n"
+      "add         %%ecx,%3                      \n"
+      "popcnt      %%edx,%%edx                   \n"
+      "add         %%edx,%3                      \n"
+      "mov         0x8(%0),%%ecx                 \n"
+      "mov         0xc(%0),%%edx                 \n"
+      "xor         0x8(%1),%%ecx                 \n"
+      "xor         0xc(%1),%%edx                 \n"
+      "popcnt      %%ecx,%%ecx                   \n"
+      "add         %%ecx,%3                      \n"
+      "popcnt      %%edx,%%edx                   \n"
+      "add         %%edx,%3                      \n"
+      "add         $0x10,%0                      \n"
+      "add         $0x10,%1                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "+r"(diff)    // %3
+      :
+      : "memory", "cc", "ecx", "edx");
 
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-#define HAS_HASHDJB2_SSE41
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+  return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+                                 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "movdqa      %4,%%xmm2                     \n"
+      "movdqa      %5,%%xmm3                     \n"
+      "pxor        %%xmm0,%%xmm0                 \n"
+      "pxor        %%xmm1,%%xmm1                 \n"
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa      (%0),%%xmm4                   \n"
+      "movdqa      0x10(%0), %%xmm5              \n"
+      "pxor        (%0,%1), %%xmm4               \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "pand        %%xmm2,%%xmm6                 \n"
+      "psrlw       $0x4,%%xmm4                   \n"
+      "movdqa      %%xmm3,%%xmm7                 \n"
+      "pshufb      %%xmm6,%%xmm7                 \n"
+      "pand        %%xmm2,%%xmm4                 \n"
+      "movdqa      %%xmm3,%%xmm6                 \n"
+      "pshufb      %%xmm4,%%xmm6                 \n"
+      "paddb       %%xmm7,%%xmm6                 \n"
+      "pxor        0x10(%0,%1),%%xmm5            \n"
+      "add         $0x20,%0                      \n"
+      "movdqa      %%xmm5,%%xmm4                 \n"
+      "pand        %%xmm2,%%xmm5                 \n"
+      "psrlw       $0x4,%%xmm4                   \n"
+      "movdqa      %%xmm3,%%xmm7                 \n"
+      "pshufb      %%xmm5,%%xmm7                 \n"
+      "pand        %%xmm2,%%xmm4                 \n"
+      "movdqa      %%xmm3,%%xmm5                 \n"
+      "pshufb      %%xmm4,%%xmm5                 \n"
+      "paddb       %%xmm7,%%xmm5                 \n"
+      "paddb       %%xmm5,%%xmm6                 \n"
+      "psadbw      %%xmm1,%%xmm6                 \n"
+      "paddd       %%xmm6,%%xmm0                 \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+
+      "pshufd      $0xaa,%%xmm0,%%xmm1           \n"
+      "paddd       %%xmm1,%%xmm0                 \n"
+      "movd        %%xmm0, %3                    \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+
+  return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "vbroadcastf128 %4,%%ymm2                  \n"
+      "vbroadcastf128 %5,%%ymm3                  \n"
+      "vpxor       %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpxor       %%ymm1,%%ymm1,%%ymm1          \n"
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqa     (%0),%%ymm4                   \n"
+      "vmovdqa     0x20(%0), %%ymm5              \n"
+      "vpxor       (%0,%1), %%ymm4, %%ymm4       \n"
+      "vpand       %%ymm2,%%ymm4,%%ymm6          \n"
+      "vpsrlw      $0x4,%%ymm4,%%ymm4            \n"
+      "vpshufb     %%ymm6,%%ymm3,%%ymm6          \n"
+      "vpand       %%ymm2,%%ymm4,%%ymm4          \n"
+      "vpshufb     %%ymm4,%%ymm3,%%ymm4          \n"
+      "vpaddb      %%ymm4,%%ymm6,%%ymm6          \n"
+      "vpxor       0x20(%0,%1),%%ymm5,%%ymm4     \n"
+      "add         $0x40,%0                      \n"
+      "vpand       %%ymm2,%%ymm4,%%ymm5          \n"
+      "vpsrlw      $0x4,%%ymm4,%%ymm4            \n"
+      "vpshufb     %%ymm5,%%ymm3,%%ymm5          \n"
+      "vpand       %%ymm2,%%ymm4,%%ymm4          \n"
+      "vpshufb     %%ymm4,%%ymm3,%%ymm4          \n"
+      "vpaddb      %%ymm5,%%ymm4,%%ymm4          \n"
+      "vpaddb      %%ymm6,%%ymm4,%%ymm4          \n"
+      "vpsadbw     %%ymm1,%%ymm4,%%ymm4          \n"
+      "vpaddd      %%ymm0,%%ymm4,%%ymm0          \n"
+      "sub         $0x40,%2                      \n"
+      "jg          1b                            \n"
+
+      "vpermq      $0xb1,%%ymm0,%%ymm1           \n"
+      "vpaddd      %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xaa,%%ymm0,%%ymm1           \n"
+      "vpaddd      %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovd       %%xmm0, %3                    \n"
+      "vzeroupper                                \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+  return diff;
+}
+#endif  // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "pxor        %%xmm0,%%xmm0                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqu      (%1),%%xmm2                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "psubusb     %%xmm2,%%xmm1                 \n"
+      "psubusb     %%xmm3,%%xmm2                 \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "punpckhbw   %%xmm5,%%xmm2                 \n"
+      "pmaddwd     %%xmm1,%%xmm1                 \n"
+      "pmaddwd     %%xmm2,%%xmm2                 \n"
+      "paddd       %%xmm1,%%xmm0                 \n"
+      "paddd       %%xmm2,%%xmm0                 \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+
+      "pshufd      $0xee,%%xmm0,%%xmm1           \n"
+      "paddd       %%xmm1,%%xmm0                 \n"
+      "pshufd      $0x1,%%xmm0,%%xmm1            \n"
+      "paddd       %%xmm1,%%xmm0                 \n"
+      "movd        %%xmm0,%3                     \n"
+
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=g"(sse)     // %3
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+  return sse;
+}
+
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
+static const uvec32 kHashMul0 = {
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
-static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+static const uvec32 kHashMul1 = {
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
-static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+static const uvec32 kHashMul2 = {
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
-static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+static const uvec32 kHashMul3 = {
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  uint32 hash;
-  asm volatile (  // NOLINT
-    "movd      %2,%%xmm0                       \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    "movdqa    %4,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "pmulld    %%xmm6,%%xmm0                   \n"
-    "movdqa    %5,%%xmm5                       \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm7,%%xmm3                   \n"
-    "pmulld    %%xmm5,%%xmm3                   \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpckhwd %%xmm7,%%xmm4                   \n"
-    "pmulld    %%xmm5,%%xmm4                   \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "punpckhbw %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm7,%%xmm2                   \n"
-    "pmulld    %%xmm5,%%xmm2                   \n"
-    "movdqa    %8,%%xmm5                       \n"
-    "punpckhwd %%xmm7,%%xmm1                   \n"
-    "pmulld    %%xmm5,%%xmm1                   \n"
-    "paddd     %%xmm4,%%xmm3                   \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm1                   \n"
-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%1                        \n"
-    "jg        1b                              \n"
-    "movd      %%xmm0,%3                       \n"
-  : "+r"(src),        // %0
-    "+r"(count),      // %1
-    "+rm"(seed),      // %2
-    "=g"(hash)        // %3
-  : "m"(kHash16x33),  // %4
-    "m"(kHashMul0),   // %5
-    "m"(kHashMul1),   // %6
-    "m"(kHashMul2),   // %7
-    "m"(kHashMul3)    // %8
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );  // NOLINT
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash;
+  asm volatile(
+      "movd        %2,%%xmm0                     \n"
+      "pxor        %%xmm7,%%xmm7                 \n"
+      "movdqa      %4,%%xmm6                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "pmulld      %%xmm6,%%xmm0                 \n"
+      "movdqa      %5,%%xmm5                     \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm7,%%xmm3                 \n"
+      "pmulld      %%xmm5,%%xmm3                 \n"
+      "movdqa      %6,%%xmm5                     \n"
+      "movdqa      %%xmm2,%%xmm4                 \n"
+      "punpckhwd   %%xmm7,%%xmm4                 \n"
+      "pmulld      %%xmm5,%%xmm4                 \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "punpckhbw   %%xmm7,%%xmm1                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklwd   %%xmm7,%%xmm2                 \n"
+      "pmulld      %%xmm5,%%xmm2                 \n"
+      "movdqa      %8,%%xmm5                     \n"
+      "punpckhwd   %%xmm7,%%xmm1                 \n"
+      "pmulld      %%xmm5,%%xmm1                 \n"
+      "paddd       %%xmm4,%%xmm3                 \n"
+      "paddd       %%xmm2,%%xmm1                 \n"
+      "paddd       %%xmm3,%%xmm1                 \n"
+      "pshufd      $0xe,%%xmm1,%%xmm2            \n"
+      "paddd       %%xmm2,%%xmm1                 \n"
+      "pshufd      $0x1,%%xmm1,%%xmm2            \n"
+      "paddd       %%xmm2,%%xmm1                 \n"
+      "paddd       %%xmm1,%%xmm0                 \n"
+      "sub         $0x10,%1                      \n"
+      "jg          1b                            \n"
+      "movd        %%xmm0,%3                     \n"
+      : "+r"(src),        // %0
+        "+r"(count),      // %1
+        "+rm"(seed),      // %2
+        "=g"(hash)        // %3
+      : "m"(kHash16x33),  // %4
+        "m"(kHashMul0),   // %5
+        "m"(kHashMul1),   // %6
+        "m"(kHashMul2),   // %7
+        "m"(kHashMul3)    // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
   return hash;
 }
 #endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
@@ -149,4 +358,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libaom/src/third_party/libyuv/source/compare_neon.cc b/media/libaom/src/third_party/libyuv/source/compare_neon.cc
index ef006ec41c..afdd601216 100644
--- a/media/libaom/src/third_party/libyuv/source/compare_neon.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare_neon.cc
@@ -9,6 +9,8 @@
  */
 
 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -19,41 +21,70 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q11, #0                        \n"
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+
+  asm volatile(
+      "vmov.u16    q4, #0                        \n"  // accumulator
+
+      "1:                                        \n"
+      "vld1.8      {q0, q1}, [%0]!               \n"
+      "vld1.8      {q2, q3}, [%1]!               \n"
+      "veor.32     q0, q0, q2                    \n"
+      "veor.32     q1, q1, q3                    \n"
+      "vcnt.i8     q0, q0                        \n"
+      "vcnt.i8     q1, q1                        \n"
+      "subs        %2, %2, #32                   \n"
+      "vadd.u8     q0, q0, q1                    \n"  // 16 byte counts
+      "vpadal.u8   q4, q0                        \n"  // 8 shorts
+      "bgt         1b                            \n"
+
+      "vpaddl.u16  q0, q4                        \n"  // 4 ints
+      "vpadd.u32   d0, d0, d1                    \n"
+      "vpadd.u32   d0, d0, d0                    \n"
+      "vmov.32     %3, d0[0]                     \n"
+
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "q0", "q1", "q2", "q3", "q4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "vmov.u8     q8, #0                        \n"
+      "vmov.u8     q10, #0                       \n"
+      "vmov.u8     q9, #0                        \n"
+      "vmov.u8     q11, #0                       \n"
 
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
-    "subs       %2, %2, #16                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q8, d4, d4                     \n"
-    "vmlal.s16  q9, d6, d6                     \n"
-    "vmlal.s16  q10, d5, d5                    \n"
-    "vmlal.s16  q11, d7, d7                    \n"
-    "bgt        1b                             \n"
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"
+      "vld1.8      {q1}, [%1]!                   \n"
+      "subs        %2, %2, #16                   \n"
+      "vsubl.u8    q2, d0, d2                    \n"
+      "vsubl.u8    q3, d1, d3                    \n"
+      "vmlal.s16   q8, d4, d4                    \n"
+      "vmlal.s16   q9, d6, d6                    \n"
+      "vmlal.s16   q10, d5, d5                   \n"
+      "vmlal.s16   q11, d7, d7                   \n"
+      "bgt         1b                            \n"
 
-    "vadd.u32   q8, q8, q9                     \n"
-    "vadd.u32   q10, q10, q11                  \n"
-    "vadd.u32   q11, q8, q10                   \n"
-    "vpaddl.u32 q1, q11                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+      "vadd.u32    q8, q8, q9                    \n"
+      "vadd.u32    q10, q10, q11                 \n"
+      "vadd.u32    q11, q8, q10                  \n"
+      "vpaddl.u32  q1, q11                       \n"
+      "vadd.u64    d0, d2, d3                    \n"
+      "vmov.32     %3, d0[0]                     \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
   return sse;
 }
 
diff --git a/media/libaom/src/third_party/libyuv/source/compare_neon64.cc b/media/libaom/src/third_party/libyuv/source/compare_neon64.cc
index 6d1e5e1bc9..70fb9b9143 100644
--- a/media/libaom/src/third_party/libyuv/source/compare_neon64.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare_neon64.cc
@@ -9,6 +9,8 @@
  */
 
 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -18,40 +20,69 @@ extern "C" {
 
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "eor        v16.16b, v16.16b, v16.16b      \n"
-    "eor        v18.16b, v18.16b, v18.16b      \n"
-    "eor        v17.16b, v17.16b, v17.16b      \n"
-    "eor        v19.16b, v19.16b, v19.16b      \n"
-
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"
-    "subs       %w2, %w2, #16                  \n"
-    "usubl      v2.8h, v0.8b, v1.8b            \n"
-    "usubl2     v3.8h, v0.16b, v1.16b          \n"
-    "smlal      v16.4s, v2.4h, v2.4h           \n"
-    "smlal      v17.4s, v3.4h, v3.4h           \n"
-    "smlal2     v18.4s, v2.8h, v2.8h           \n"
-    "smlal2     v19.4s, v3.8h, v3.8h           \n"
-    "b.gt       1b                             \n"
-
-    "add        v16.4s, v16.4s, v17.4s         \n"
-    "add        v18.4s, v18.4s, v19.4s         \n"
-    "add        v19.4s, v16.4s, v18.4s         \n"
-    "addv       s0, v19.4s                     \n"
-    "fmov       %w3, s0                        \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+  asm volatile(
+      "movi        v4.8h, #0                     \n"
+
+      "1:                                        \n"
+      "ld1         {v0.16b, v1.16b}, [%0], #32   \n"
+      "ld1         {v2.16b, v3.16b}, [%1], #32   \n"
+      "eor         v0.16b, v0.16b, v2.16b        \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "eor         v1.16b, v1.16b, v3.16b        \n"
+      "cnt         v0.16b, v0.16b                \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "cnt         v1.16b, v1.16b                \n"
+      "subs        %w2, %w2, #32                 \n"
+      "add         v0.16b, v0.16b, v1.16b        \n"
+      "uadalp      v4.8h, v0.16b                 \n"
+      "b.gt        1b                            \n"
+
+      "uaddlv      s4, v4.8h                     \n"
+      "fmov        %w3, s4                       \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "eor         v16.16b, v16.16b, v16.16b     \n"
+      "eor         v18.16b, v18.16b, v18.16b     \n"
+      "eor         v17.16b, v17.16b, v17.16b     \n"
+      "eor         v19.16b, v19.16b, v19.16b     \n"
+
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"
+      "ld1         {v1.16b}, [%1], #16           \n"
+      "subs        %w2, %w2, #16                 \n"
+      "usubl       v2.8h, v0.8b, v1.8b           \n"
+      "usubl2      v3.8h, v0.16b, v1.16b         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "smlal       v16.4s, v2.4h, v2.4h          \n"
+      "smlal       v17.4s, v3.4h, v3.4h          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "smlal2      v18.4s, v2.8h, v2.8h          \n"
+      "smlal2      v19.4s, v3.8h, v3.8h          \n"
+      "b.gt        1b                            \n"
+
+      "add         v16.4s, v16.4s, v17.4s        \n"
+      "add         v18.4s, v18.4s, v19.4s        \n"
+      "add         v19.4s, v16.4s, v18.4s        \n"
+      "addv        s0, v19.4s                    \n"
+      "fmov        %w3, s0                       \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
   return sse;
 }
 
diff --git a/media/libaom/src/third_party/libyuv/source/compare_win.cc b/media/libaom/src/third_party/libyuv/source/compare_win.cc
index 19806f2750..d57d3d9d1c 100644
--- a/media/libaom/src/third_party/libyuv/source/compare_win.cc
+++ b/media/libaom/src/third_party/libyuv/source/compare_win.cc
@@ -9,23 +9,43 @@
  */
 
 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
+#if defined(_MSC_VER)
+#include <intrin.h>  // For __popcnt
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    defined(_MSC_VER) && !defined(__clang__)
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
+    src_a += 4;
+    src_b += 4;
+    diff += __popcnt(x);
+  }
+  return diff;
+}
 
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+__declspec(naked) uint32_t
+    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -60,13 +80,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+#pragma warning(disable : 4752)
+__declspec(naked) uint32_t
+    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -100,74 +120,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
 }
 #endif  // _MSC_VER >= 1700
 
-#define HAS_HASHDJB2_SSE41
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
+uvec32 kHashMul0 = {
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
-static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+uvec32 kHashMul1 = {
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
-static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+uvec32 kHashMul2 = {
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
-static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+uvec32 kHashMul3 = {
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
-// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
-// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
-// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
-// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
-    _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     movd       xmm0, [esp + 12]  // seed
 
-    pxor       xmm7, xmm7        // constant 0 for unpck
-    movdqa     xmm6, kHash16x33
+    pxor       xmm7, xmm7  // constant 0 for unpck
+    movdqa     xmm6, xmmword ptr kHash16x33
 
   wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
+    movdqu     xmm1, [eax]  // src[0-15]
     lea        eax, [eax + 16]
-    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
-    movdqa     xmm5, kHashMul0
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
+    movdqa     xmm5, xmmword ptr kHashMul0
     movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
+    punpcklbw  xmm2, xmm7  // src[0-7]
     movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
-    pmulld(0xdd)                 // pmulld     xmm3, xmm5
-    movdqa     xmm5, kHashMul1
+    punpcklwd  xmm3, xmm7  // src[0-3]
+    pmulld     xmm3, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul1
     movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
-    pmulld(0xe5)                 // pmulld     xmm4, xmm5
-    movdqa     xmm5, kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
+    punpckhwd  xmm4, xmm7  // src[4-7]
+    pmulld     xmm4, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul2
+    punpckhbw  xmm1, xmm7  // src[8-15]
     movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
-    pmulld(0xd5)                 // pmulld     xmm2, xmm5
-    movdqa     xmm5, kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
-    pmulld(0xcd)                 // pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
+    punpcklwd  xmm2, xmm7  // src[8-11]
+    pmulld     xmm2, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul3
+    punpckhwd  xmm1, xmm7  // src[12-15]
+    pmulld     xmm1, xmm5
+    paddd      xmm3, xmm4  // add 16 results
     paddd      xmm1, xmm2
     paddd      xmm1, xmm3
 
@@ -179,48 +190,49 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     sub        ecx, 16
     jg         wloop
 
-    movd       eax, xmm0         // return hash
+    movd       eax, xmm0  // return hash
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-    movdqa     xmm6, kHash16x33
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
+    vmovd      xmm0, [esp + 12]  // seed
 
   wloop:
-    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
-    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
-    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
-    pmulld     xmm3, kHashMul0
-    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
-    pmulld     xmm4, kHashMul1
-    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
-    pmulld     xmm2, kHashMul2
+    vpmovzxbd  xmm3, [eax]  // src[0-3]
+    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
+    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
+    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
     lea        eax, [eax + 16]
-    pmulld     xmm1, kHashMul3
-    paddd      xmm3, xmm4        // add 16 results
-    paddd      xmm1, xmm2
-    paddd      xmm1, xmm3
-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 0x01
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
+    vpaddd     xmm3, xmm3, xmm4  // add 16 results
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm1, xmm1, xmm3
+    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
+    vpaddd     xmm1, xmm1,xmm2
+    vpshufd    xmm2, xmm1, 0x01
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm0, xmm0, xmm1
     sub        ecx, 16
     jg         wloop
 
-    movd       eax, xmm0         // return hash
+    vmovd      eax, xmm0  // return hash
+    vzeroupper
     ret
   }
 }
 #endif  // _MSC_VER >= 1700
+
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/convert.cc b/media/libaom/src/third_party/libyuv/source/convert.cc
index 3ad6bd7a4b..98258b9bc9 100644
--- a/media/libaom/src/third_party/libyuv/source/convert.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert.cc
@@ -14,8 +14,8 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
-#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/row.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -28,50 +28,61 @@ static __inline int Abs(int v) {
 }
 
 // Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int src_uv_width, int src_uv_height) {
+static int I4xxToI420(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int src_uv_width,
+                      int src_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
   const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      src_uv_width == 0 || src_uv_height == 0) {
+  if (src_uv_width == 0 || src_uv_height == 0) {
     return -1;
   }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
 
-// Copy I420 with optional flipping
+// Copy I420 with optional flipping.
 // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
 // is does row coalescing.
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -95,304 +106,531 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
+// Copy I010 with optional flipping.
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 1, 1);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Copy UV planes.
+  CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
 }
 
-// 444 chroma is 1x width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
+// Convert 10 bit YUV to 8 bit.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    width, height);
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
+                    height);
+  // Convert UV planes.
+  Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
+                    halfheight);
+  Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
+                    halfheight);
+  return 0;
 }
 
-// 411 chroma is 1/4 width, 1x height
+// 422 chroma is 1/2 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  const int src_uv_width = SUBSAMPLE(width, 1, 1);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, src_uv_width, height);
 }
 
-// I400 is greyscale typically used in MJPG
+// TODO(fbarchard): Implement row conversion.
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
-  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
-  return 0;
-}
 
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride,
-                       int width, int height) {
-  int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
+  // Allocate u and v buffers
+  align_buffer_64(plane_u, halfwidth * halfheight * 2);
+  uint8_t* plane_v = plane_u + halfwidth * halfheight;
 
-  // Copy plane
-  for (y = 0; y < height - 1; y += 2) {
-    CopyRow(src, dst, width);
-    CopyRow(src + src_stride_0, dst + dst_stride, width);
-    src += src_stride_0 + src_stride_1;
-    dst += dst_stride * 2;
-  }
-  if (height & 1) {
-    CopyRow(src, dst, width);
-  }
+  I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+             height);
+  MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+               halfwidth, halfheight);
+  free_aligned_buffer_64(plane_u);
+  return 0;
 }
 
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
-//   this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
-                      int src_stride_y0, int src_stride_y1,
-                      const uint8* src_uv, int src_stride_uv,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int width, int height) {
+#ifdef I422TONV21_ROW_VERSION
+// Unittest fails for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int y;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUVRow_C;
-  if (!src_y || !src_uv ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  // Coalesce rows.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
-  }
-  // Coalesce rows.
-  if (src_stride_uv == halfwidth * 2 &&
-      dst_stride_u == halfwidth &&
-      dst_stride_v == halfwidth) {
-    halfwidth *= halfheight;
-    halfheight = 1;
-    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
-#if defined(HAS_SPLITUVROW_SSE2)
+#if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    SplitUVRow = SplitUVRow_Any_SSE2;
+    MergeUVRow = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_SSE2;
+      MergeUVRow = MergeUVRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_AVX2)
+#if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_Any_AVX2;
+    MergeUVRow = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
-      SplitUVRow = SplitUVRow_AVX2;
+      MergeUVRow = MergeUVRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_NEON)
+#if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_Any_NEON;
+    MergeUVRow = MergeUVRow_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_NEON;
+      MergeUVRow = MergeUVRow_NEON;
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
-      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
     if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_MIPS_DSPR2;
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
 
   if (dst_y) {
-    if (src_stride_y0 == src_stride_y1) {
-      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
-    } else {
-      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
-                 width, height);
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+  }
+  {
+    // Allocate 2 rows of vu.
+    int awidth = halfwidth * 2;
+    align_buffer_64(row_vu_0, awidth * 2);
+    uint8_t* row_vu_1 = row_vu_0 + awidth;
+
+    for (y = 0; y < height - 1; y += 2) {
+      MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+      MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+                 halfwidth);
+      InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+      src_u += src_stride_u * 2;
+      src_v += src_stride_v * 2;
+      dst_vu += dst_stride_vu;
     }
+    if (height & 1) {
+      MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+    }
+    free_aligned_buffer_64(row_vu_0);
   }
+  return 0;
+}
+#endif  // I422TONV21_ROW_VERSION
 
-  for (y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
-    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-    src_uv += src_stride_uv;
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, width, height);
+}
+
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
+  HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+                   dst_stride_uv, width, height);
   return 0;
 }
 
-// Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_uv, src_stride_uv,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
+int I444ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
                     width, height);
 }
 
-// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
+// I400 is greyscale typically used in MJPG
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_vu, src_stride_vu,
-                    dst_y, dst_stride_y,
-                    dst_v, dst_stride_v,
-                    dst_u, dst_stride_u,
-                    width, height);
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+  return 0;
+}
+
+// I400 is greyscale typically used in MJPG
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+  return 0;
+}
+
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Split UV plane - NV12 / NV21
+  SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+               halfwidth, halfheight);
+
+  return 0;
 }
 
-// Convert M420 to I420.
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                    dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
                     width, height);
 }
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-      uint8* dst_y, int pix) = YUY2ToYRow_C;
+  void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -429,6 +667,28 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MMI;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -448,16 +708,22 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-      uint8* dst_y, int pix) = UYVYToYRow_C;
+  void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -494,6 +760,26 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    UYVYToYRow = UYVYToYRow_Any_MMI;
+    UYVYToUVRow = UYVYToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_MMI;
+      UYVYToUVRow = UYVYToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -511,21 +797,163 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
 
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// place holders for future intel code
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToUVRow = AYUVToUVRow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToUVRow = AYUVToUVRow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToUVRow = AYUVToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToUVRow = AYUVToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_vu, int width) = AYUVToVURow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// place holders for future intel code
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToVURow = AYUVToVURow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToVURow = AYUVToVURow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToVURow = AYUVToVURow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToVURow = AYUVToVURow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToVURow = AYUVToVURow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_vu += dst_stride_vu;
+  }
+  if (height & 1) {
+    AYUVToVURow(src_ayuv, 0, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
+
 // Convert ARGB to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -570,6 +998,30 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -589,19 +1041,23 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
 
 // Convert BGRA to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+  void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
       BGRAToYRow_C;
-  if (!src_bgra ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -629,12 +1085,34 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
   }
 #endif
 #if defined(HAS_BGRATOUVROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        BGRAToUVRow = BGRAToUVRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    BGRAToYRow = BGRAToYRow_Any_MMI;
+    BGRAToUVRow = BGRAToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToYRow = BGRAToYRow_Any_MSA;
+    BGRAToUVRow = BGRAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_MSA;
+      BGRAToUVRow = BGRAToUVRow_MSA;
+    }
+  }
 #endif
 
   for (y = 0; y < height - 1; y += 2) {
@@ -655,19 +1133,23 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
 
 // Convert ABGR to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
       ABGRToYRow_C;
-  if (!src_abgr ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -686,6 +1168,16 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ABGRToYRow = ABGRToYRow_Any_NEON;
@@ -702,6 +1194,28 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ABGRToYRow = ABGRToYRow_Any_MMI;
+    ABGRToUVRow = ABGRToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -721,19 +1235,23 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
 
 // Convert RGBA to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+  void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
       RGBAToYRow_C;
-  if (!src_rgba ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -768,6 +1286,28 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
     }
   }
 #endif
+#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGBAToYRow = RGBAToYRow_Any_MMI;
+    RGBAToUVRow = RGBAToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYRow = RGBAToYRow_Any_MSA;
+    RGBAToUVRow = RGBAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_MSA;
+      RGBAToUVRow = RGBAToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -787,27 +1327,34 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
 
 // Convert RGB24 to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
   int y;
-#if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
+  void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
       RGB24ToYRow_C;
 #else
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -829,6 +1376,30 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       }
     }
   }
+// MMI and MSA version does direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
+#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
+    RGB24ToYRow = RGB24ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+    RGB24ToYRow = RGB24ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYRow = RGB24ToYRow_MSA;
+      RGB24ToUVRow = RGB24ToUVRow_MSA;
+    }
+  }
+#endif
 // Other platforms do intermediate conversion from RGB24 to ARGB.
 #else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -859,14 +1430,19 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+      defined(HAS_RGB24TOYROW_MMI))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
       RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
       RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -883,7 +1459,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
       RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
 #else
@@ -892,36 +1469,195 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB24TOYROW_NEON)
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+      defined(HAS_RGB24TOYROW_MMI))
     free_aligned_buffer_64(row);
+#endif
   }
+  return 0;
+}
+
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RGB24 to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
+  int y;
+#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+    defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
+  void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                        uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVJRow_C;
+  void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+      RGB24ToYJRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYJRow_C;
 #endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+    RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYJRow = RGB24ToYJRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+      }
+    }
+  }
+// MMI and MSA version does direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
+#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
+    RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYJRow = RGB24ToYJRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVJRow = RGB24ToUVJRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+    RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYJRow = RGB24ToYJRow_MSA;
+      RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+    }
+  }
+#endif
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_AVX2;
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#endif
+
+  {
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+      defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+     defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+      RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYJRow(src_rgb24, dst_y, width);
+      RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYJRow(row, dst_y, width);
+      ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+     defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+      RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+      ARGBToYJRow(row, dst_y, width);
+#endif
+    }
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+      defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+    defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
+  void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+                     uint8_t* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
       RAWToYRow_C;
 #else
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -932,7 +1668,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
   }
 
 // Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     RAWToUVRow = RAWToUVRow_Any_NEON;
     RAWToYRow = RAWToYRow_Any_NEON;
@@ -943,6 +1679,30 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       }
     }
   }
+// MMI and MSA version does direct RAW to YUV.
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
+#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RAWToUVRow = RAWToUVRow_Any_MMI;
+    RAWToYRow = RAWToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToUVRow = RAWToUVRow_Any_MSA;
+    RAWToYRow = RAWToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYRow = RAWToYRow_MSA;
+      RAWToUVRow = RAWToUVRow_MSA;
+    }
+  }
+#endif
 // Other platforms do intermediate conversion from RAW to ARGB.
 #else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -973,14 +1733,19 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+      defined(HAS_RAWTOYROW_MMI))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+     defined(HAS_RAWTOYROW_MMI))
       RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
       RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -997,7 +1762,8 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+     defined(HAS_RAWTOYROW_MMI))
       RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
 #else
@@ -1006,36 +1772,44 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RAWTOYROW_NEON)
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+      defined(HAS_RAWTOYROW_MMI))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert RGB565 to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height) {
   int y;
-#if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+     defined(HAS_RGB565TOYROW_MMI))
+  void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+                        uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
       RGB565ToYRow_C;
 #else
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      RGB565ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1057,6 +1831,30 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       }
     }
   }
+// MMI and MSA version does direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA))
+#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
+    RGB565ToYRow = RGB565ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+    RGB565ToYRow = RGB565ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToYRow = RGB565ToYRow_MSA;
+      RGB565ToUVRow = RGB565ToUVRow_MSA;
+    }
+  }
+#endif
 // Other platforms do intermediate conversion from RGB565 to ARGB.
 #else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1095,14 +1893,17 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
     }
   }
 #endif
+#endif
   {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+      defined(HAS_RGB565TOYROW_MMI))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
-
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+     defined(HAS_RGB565TOYROW_MMI))
       RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
       RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1119,7 +1920,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+     defined(HAS_RGB565TOYROW_MMI))
       RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
 #else
@@ -1128,36 +1930,45 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB565TOYROW_NEON)
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+      defined(HAS_RGB565TOYROW_MMI))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB1555 to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
-      ARGB1555ToYRow_C;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+     defined(HAS_ARGB1555TOYROW_MMI))
+  void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+                         int width) = ARGB1555ToYRow_C;
 #else
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      ARGB1555ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1179,6 +1990,30 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       }
     }
   }
+// MMI and MSA version does direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA))
+#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+      ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+    }
+  }
+#endif
 // Other platforms do intermediate conversion from ARGB1555 to ARGB.
 #else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1217,14 +2052,18 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
     }
   }
 #endif
+#endif
   {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+      defined(HAS_ARGB1555TOYROW_MMI))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+     defined(HAS_ARGB1555TOYROW_MMI))
       ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
       ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1243,7 +2082,8 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+     defined(HAS_ARGB1555TOYROW_MMI))
       ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
 #else
@@ -1252,36 +2092,44 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+      defined(HAS_ARGB1555TOYROW_MMI))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB4444 to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
-      ARGB4444ToYRow_C;
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+  void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+                         int width) = ARGB4444ToYRow_C;
 #else
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
-      ARGB4444ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1303,6 +2151,17 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
       }
     }
   }
+#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
+    ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToYRow = ARGB4444ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
+      }
+    }
+  }
 // Other platforms do intermediate conversion from ARGB4444 to ARGB.
 #else
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
@@ -1321,6 +2180,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1341,14 +2208,41 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUVRow = ARGBToUVRow_MSA;
+      }
+    }
+  }
+#endif
+#endif
+
   {
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
       ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
       ARGB4444ToYRow(src_argb4444, dst_y, width);
       ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
@@ -1367,7 +2261,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
       ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
       ARGB4444ToYRow(src_argb4444, dst_y, width);
 #else
@@ -1376,10 +2270,241 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
     free_aligned_buffer_64(row);
+#endif
+  }
+  return 0;
+}
+
+// Convert RGB24 to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_yj,
+                int dst_stride_yj,
+                int width,
+                int height) {
+  int y;
+  void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+      RGB24ToYJRow_C;
+  if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_yj = 0;
+  }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RGB24ToYJRow = RGB24ToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYJRow = RGB24ToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYJRow = RGB24ToYJRow_MMI;
+    }
   }
 #endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYJRow = RGB24ToYJRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB24ToYJRow(src_rgb24, dst_yj, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_yj,
+              int dst_stride_yj,
+              int width,
+              int height) {
+  int y;
+  void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+      RAWToYJRow_C;
+  if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_yj = 0;
+  }
+#if defined(HAS_RAWTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToYJRow = RAWToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RAWToYJRow = RAWToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToYJRow = RAWToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToYJRow = RAWToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYJRow = RAWToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RAWToYJRow = RAWToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYJRow = RAWToYJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToYJRow = RAWToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYJRow = RAWToYJRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RAWToYJRow(src_raw, dst_yj, width);
+    src_raw += src_stride_raw;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+static void SplitPixels(const uint8_t* src_u,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_u,
+                        int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst_u = *src_u;
+    ++dst_u;
+    src_u += src_pixel_stride_uv;
+  }
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
+  int y;
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Copy UV planes as is - I420
+  if (src_pixel_stride_uv == 1) {
+    CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+    CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+    // Split UV planes - NV21
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+                 halfwidth, halfheight);
+    return 0;
+    // Split UV planes - NV12
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+    return 0;
+  }
+
+  for (y = 0; y < halfheight; ++y) {
+    SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+    SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
   return 0;
 }
 
diff --git a/media/libaom/src/third_party/libyuv/source/convert_argb.cc b/media/libaom/src/third_party/libyuv/source/convert_argb.cc
index 44756bc41c..5e7225faf2 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_argb.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_argb.cc
@@ -14,6 +14,7 @@
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
+#include "libyuv/planar_functions.h"  // For CopyPlane and ARGBShuffle.
 #include "libyuv/rotate_argb.h"
 #include "libyuv/row.h"
 #include "libyuv/video_common.h"
@@ -25,11 +26,13 @@ extern "C" {
 
 // Copy ARGB with optional flipping
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height) {
-  if (!src_argb || !dst_argb ||
-      width <= 0 || height == 0) {
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -39,27 +42,30 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
 
-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-            width * 4, height);
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+            height);
   return 0;
 }
 
-// Convert I444 to ARGB.
+// Convert I420 to ARGB with matrix.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I420ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
   int y;
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I444ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -68,66 +74,222 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_SSSE3;
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_I444TOARGBROW_AVX2)
+#if defined(HAS_I422TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I444ToARGBRow = I444ToARGBRow_AVX2;
+      I422ToARGBRow = I422ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_I444TOARGBROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_NEON;
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGBRow = I422ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
   }
   return 0;
 }
 
-// Convert I422 to ARGB.
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J420 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -137,10 +299,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
@@ -169,18 +329,25 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGBRow = I422ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -189,22 +356,169 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert I411 to ARGB.
+// Convert I422 to ARGB.
 LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
   int y;
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I411ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -214,41 +528,55 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 4 == width &&
-      src_stride_v * 4 == width &&
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
-#if defined(HAS_I411TOARGBROW_SSSE3)
+#if defined(HAS_I444TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_SSSE3;
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_I411TOARGBROW_AVX2)
+#if defined(HAS_I444TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I411ToARGBRow = I411ToARGBRow_AVX2;
+      I444ToARGBRow = I444ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_I411TOARGBROW_NEON)
+#if defined(HAS_I444TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I444ToARGBRow = I444ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MSA;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_NEON;
+      I444ToARGBRow = I444ToARGBRow_MSA;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -257,17 +585,994 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert I400 to ARGB.
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToAR30Row = I210ToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuH709Constants, width, height);
+}
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
   int y;
-  void (*I400ToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = I400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToAR30Row = I210ToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToARGBRow = I210ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I210ToARGBRow = I210ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I210ToARGBRow = I210ToARGBRow_MMI;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+                     int src_stride_y,
+                     const uint16_t* src_u,
+                     int src_stride_u,
+                     const uint16_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToARGBRow = I210ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I210ToARGBRow = I210ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I210ToARGBRow = I210ToARGBRow_MMI;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuv2020Constants, width, height);
+}
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate) {
+  int y;
+  void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                             const uint8_t* v_buf, const uint8_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_a += src_stride_a;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                               src_stride_v, src_a, src_stride_a, dst_argb,
+                               dst_stride_argb, &kYuvI601Constants, width,
+                               height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(
+      src_y, src_stride_y, src_v, src_stride_v,  // Swap U and V
+      src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+      &kYvuI601Constants,  // Use Yvu matrix
+      width, height, attenuate);
+}
+
+// Convert I400 to ARGB with matrix.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I400ToARGBRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -277,8 +1582,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -307,25 +1611,55 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I400ToARGBRow = I400ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I400ToARGBRow = I400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    I400ToARGBRow(src_y, dst_argb, width);
+    I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
   }
   return 0;
 }
 
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
 // Convert J400 to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+  void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
       J400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -335,8 +1669,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -365,6 +1698,22 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_J400TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    J400ToARGBRow = J400ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      J400ToARGBRow = J400ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    J400ToARGBRow = J400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     J400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
@@ -374,85 +1723,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
 }
 
 // Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
+static const uvec8 kShuffleMaskBGRAToARGB = {
+    3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
 
 // Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
+static const uvec8 kShuffleMaskABGRToARGB = {
+    2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
 
 // Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
+static const uvec8 kShuffleMaskRGBAToARGB = {
+    1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
 
 // Convert BGRA to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ARGB to BGRA (same as BGRAToARGB).
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int ARGBToBGRA(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ABGR to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert ARGB to ABGR to (same as ABGRToARGB).
 LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ARGBToABGR(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert RGBA to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_rgba, src_stride_rgba,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
-                     width, height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
 }
 
 // Convert RGB24 to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
   int y;
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -462,8 +1815,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
     src_stride_rgb24 = -src_stride_rgb24;
   }
   // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
@@ -484,6 +1836,22 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
     }
   }
 #endif
+#if defined(HAS_RGB24TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -495,14 +1863,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
 
 // Convert RAW to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  if (!src_raw || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -512,8 +1882,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
@@ -534,6 +1903,22 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      RAWToARGBRow = RAWToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToARGBRow(src_raw, dst_argb, width);
@@ -543,16 +1928,69 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
   return 0;
 }
 
+// Convert RAW to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_rgba,
+              int dst_stride_rgba,
+              int width,
+              int height) {
+  int y;
+  void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+      RAWToRGBARow_C;
+  if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+  if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_rgba = 0;
+  }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToRGBARow = RAWToRGBARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToRGBARow = RAWToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToRGBARow = RAWToRGBARow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RAWToRGBARow(src_raw, dst_rgba, width);
+    src_raw += src_stride_raw;
+    dst_rgba += dst_stride_rgba;
+  }
+  return 0;
+}
+
 // Convert RGB565 to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
-      RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -562,8 +2000,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     src_stride_rgb565 = -src_stride_rgb565;
   }
   // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
@@ -592,6 +2029,22 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     }
   }
 #endif
+#if defined(HAS_RGB565TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -603,14 +2056,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
 
 // Convert ARGB1555 to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-      int pix) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -620,8 +2075,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     src_stride_argb1555 = -src_stride_argb1555;
   }
   // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
@@ -650,6 +2104,22 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     }
   }
 #endif
+#if defined(HAS_ARGB1555TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -661,14 +2131,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
 
 // Convert ARGB4444 to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-      int pix) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -678,8 +2150,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     src_stride_argb4444 = -src_stride_argb4444;
   }
   // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
@@ -708,6 +2179,22 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -717,19 +2204,118 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
   return 0;
 }
 
-// Convert NV12 to ARGB.
+// Convert AR30 to ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_argb = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToARGBRow_C(src_ar30, dst_argb, width);
+    src_ar30 += src_stride_ar30;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_abgr = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+    src_ar30 += src_stride_ar30;
+    dst_abgr += dst_stride_abgr;
+  }
+  return 0;
+}
+
+// Convert AR30 to AB30.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_ab30 = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+    src_ar30 += src_stride_ar30;
+    dst_ab30 += dst_stride_ab30;
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -762,9 +2348,25 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      NV12ToARGBRow = NV12ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+    NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -774,19 +2376,22 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert NV21 to ARGB.
+// Convert NV21 to ARGB with matrix.
 LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int NV21ToARGBMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_vu,
+                     int src_stride_vu,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
   int y;
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV21ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV21ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -819,86 +2424,348 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      NV21ToARGBRow = NV21ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+    NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
-      src_uv += src_stride_uv;
+      src_vu += src_stride_vu;
     }
   }
   return 0;
 }
 
-// Convert M420 to ARGB.
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
+// To swap the UV use NV12 instead of NV21.LIBYUV_API
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_m420 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV12ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
   }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+#if defined(HAS_NV12TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+      NV12ToRGB24Row = NV12ToRGB24Row_NEON;
     }
   }
 #endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+      NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
     }
   }
 #endif
-#if defined(HAS_NV12TOARGBROW_NEON)
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_MMI;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_vu,
+                      int src_stride_vu,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
+  int y;
+  void (*NV21ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV21TORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_NEON;
+      NV21ToRGB24Row = NV21ToRGB24Row_NEON;
     }
   }
 #endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_MMI;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
 
-  for (y = 0; y < height - 1; y += 2) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
-    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
-                  dst_argb + dst_stride_argb, width);
-    dst_argb += dst_stride_argb * 2;
-    src_m420 += src_stride_m420 * 3;
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+                           dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_vu,
+              int src_stride_vu,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+                           dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to YUV24
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height) {
+  int y;
+  void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+                         uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+    dst_stride_yuv24 = -dst_stride_yuv24;
+  }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+    }
   }
-  if (height & 1) {
-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+#endif
+  for (y = 0; y < height; ++y) {
+    NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+    dst_yuv24 += dst_stride_yuv24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
   }
   return 0;
 }
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+  void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -908,8 +2775,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
@@ -938,8 +2804,24 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
-    YUY2ToARGBRow(src_yuy2, dst_argb, width);
+    YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
     src_yuy2 += src_stride_yuy2;
     dst_argb += dst_stride_argb;
   }
@@ -948,14 +2830,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+  void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -965,8 +2850,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
@@ -995,74 +2879,421 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      UYVYToARGBRow = UYVYToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
-    UYVYToARGBRow(src_uyvy, dst_argb, width);
+    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
     src_uyvy += src_stride_uyvy;
     dst_argb += dst_stride_argb;
   }
   return 0;
 }
+static void WeavePixels(const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_uv,
+                        int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_uv[0] = *src_u;
+    dst_uv[1] = *src_v;
+    dst_uv += 2;
+    src_u += src_pixel_stride_uv;
+    src_v += src_pixel_stride_uv;
+  }
+}
 
-// Convert J420 to ARGB.
+// Convert Android420 to ARGB with matrix.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height) {
   int y;
-  void (*J422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = J422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  uint8_t* dst_uv;
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    halfheight = (height + 1) >> 1;
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-#if defined(HAS_J422TOARGBROW_SSSE3)
+
+  // I420
+  if (src_pixel_stride_uv == 1) {
+    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+    // NV21
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+    // NV12
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+
+  // General case fallback creates NV12
+  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+  dst_uv = plane_uv;
+  for (y = 0; y < halfheight; ++y) {
+    WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += halfwidth * 2;
+  }
+  NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
+                   dst_stride_argb, yuvconstants, width, height);
+  free_aligned_buffer_64(plane_uv);
+  return 0;
+}
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                                src_stride_v, src_pixel_stride_uv, dst_argb,
+                                dst_stride_argb, &kYuvI601Constants, width,
+                                height);
+}
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                                src_stride_u, src_pixel_stride_uv, dst_abgr,
+                                dst_stride_abgr, &kYvuI601Constants, width,
+                                height);
+}
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      J422ToARGBRow = J422ToARGBRow_SSSE3;
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_AVX2)
+#if defined(HAS_I422TORGBAROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      J422ToARGBRow = J422ToARGBRow_AVX2;
+      I422ToRGBARow = I422ToRGBARow_AVX2;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_NEON)
+#if defined(HAS_I422TORGBAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      J422ToARGBRow = J422ToARGBRow_NEON;
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToRGBARow = I422ToRGBARow_MMI;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_uv,
+                       int src_stride_uv,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int y;
+  void (*NV12ToRGB565Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+                            width, height);
+}
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToRGBARow = I422ToRGBARow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -1072,76 +3303,585 @@ int J420ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert J422 to ARGB.
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I420 to RGB24 with matrix.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I420ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
   int y;
-  void (*J422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = J422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
   }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+    }
   }
-#if defined(HAS_J422TOARGBROW_SSSE3)
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToRGB24Row = I422ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToRGB24Row = I422ToRGB24Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
+                           &kYvuI601Constants,  // Use Yvu matrix
+                           width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
+                           &kYvuJPEGConstants,  // Use Yvu matrix
+                           width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
+                           &kYvuH709Constants,  // Use Yvu matrix
+                           width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
+  }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      J422ToARGBRow = J422ToARGBRow_SSSE3;
+      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_AVX2)
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      J422ToARGBRow = J422ToARGBRow_AVX2;
+      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_NEON)
+#if defined(HAS_I422TOARGB1555ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      J422ToARGBRow = J422ToARGBRow_NEON;
+      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
     }
   }
 #endif
-#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+#if defined(HAS_I422TOARGB1555ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+    }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+                      width);
+    dst_argb1555 += dst_stride_argb1555;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+                      width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToRGB565Row = I422ToRGB565Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_rgb565, dst_stride_rgb565,
+                            &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_rgb565, dst_stride_rgb565,
+                            &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_rgb565, dst_stride_rgb565,
+                            &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
     src_y += src_stride_y;
     src_u += src_stride_u;
     src_v += src_stride_v;
@@ -1149,6 +3889,236 @@ int J422ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGBRow = I422ToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate a row of argb.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                            width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
+  return 0;
+}
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_ar30,
+                     int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToAR30Row_C;
+
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToAR30Row = I422ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYvuH709Constants, width, height);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/convert_from.cc b/media/libaom/src/third_party/libyuv/source/convert_from.cc
index 31f1ac992a..f2cfc1d8f5 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_from.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_from.cc
@@ -15,9 +15,9 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
-#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -30,107 +30,144 @@ static __inline int Abs(int v) {
 }
 
 // I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int dst_uv_width, int dst_uv_height) {
+static int I420ToI4xx(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int dst_uv_width,
+                      int dst_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
   const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
+  if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+      dst_uv_height <= 0) {
     return -1;
   }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+  return 0;
+}
+
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+                    height);
+  // Convert UV planes.
+  Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+                    halfheight);
+  Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+                    halfheight);
   return 0;
 }
 
 // 420 chroma is 1/2 width, 1/2 height
 // 422 chroma is 1/2 width, 1x height
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = (Abs(width) + 1) >> 1;
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // 420 chroma is 1/2 width, 1/2 height
 // 444 chroma is 1x width, 1x height
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = Abs(width);
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 3) >> 2;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // Copy to I400. Source can be I420,422,444,400,NV12,NV21
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -144,17 +181,21 @@ int I400Copy(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -164,10 +205,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -180,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -200,17 +247,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -227,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -235,6 +294,22 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToYUY2Row = I422ToYUY2Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -252,17 +327,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -272,10 +351,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -288,6 +365,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -296,6 +381,22 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToUYVYRow = I422ToUYVYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -308,17 +409,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -335,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -343,6 +456,22 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToUYVYRow = I422ToUYVYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -360,981 +489,217 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-      int width) = MergeUVRow_C;
-  // Coalesce rows.
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) / 2;
+  int halfheight = (height + 1) / 2;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_uv = -dst_stride_uv;
-  }
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_y = 0;
-  }
-  // Coalesce rows.
-  if (src_stride_u == halfwidth &&
-      src_stride_v == halfwidth &&
-      dst_stride_uv == halfwidth * 2) {
-    halfwidth *= halfheight;
-    halfheight = 1;
-    src_stride_u = src_stride_v = dst_stride_uv = 0;
-  }
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow_ = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow_ = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_NEON;
-    }
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
-#endif
-
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  for (y = 0; y < halfheight; ++y) {
-    // Merge a row of U and V into a row of UV.
-    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_uv += dst_stride_uv;
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
+  MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+               halfwidth, halfheight);
   return 0;
 }
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height) {
-  return I420ToNV12(src_y, src_stride_y,
-                    src_v, src_stride_v,
-                    src_u, src_stride_u,
-                    dst_y, src_stride_y,
-                    dst_vu, dst_stride_vu,
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
                     width, height);
 }
 
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  int y;
-  void (*I422ToBGRARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToBGRARow_C;
-  if (!src_y || !src_u || !src_v || !dst_bgra ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
-    dst_stride_bgra = -dst_stride_bgra;
-  }
-#if defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOBGRAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToBGRARow = I422ToBGRARow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOBGRAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
-    dst_bgra += dst_stride_bgra;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  int y;
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToABGRRow_C;
-  if (!src_y || !src_u || !src_v || !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-#if defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOABGRROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToABGRRow = I422ToABGRRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
-    dst_abgr += dst_stride_abgr;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
-  }
-#if defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
-    dst_rgba += dst_stride_rgba;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  int y;
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width) = I422ToRGB24Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
-    dst_stride_rgb24 = -dst_stride_rgb24;
-  }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB24Row = I422ToRGB24Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
-    dst_rgb24 += dst_stride_rgb24;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_raw, int dst_stride_raw,
-                int width, int height) {
-  int y;
-  void (*I422ToRAWRow)(const uint8* y_buf,
-                       const uint8* u_buf,
-                       const uint8* v_buf,
-                       uint8* rgb_buf,
-                       int width) = I422ToRAWRow_C;
-  if (!src_y || !src_u || !src_v || !dst_raw ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
-    dst_stride_raw = -dst_stride_raw;
-  }
-#if defined(HAS_I422TORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRAWRow = I422ToRAWRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORAWROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRAWRow = I422ToRAWRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRAWRow = I422ToRAWRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRAWRow = I422ToRAWRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRAWRow = I422ToRAWRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
-    dst_raw += dst_stride_raw;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
-  int y;
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            int width) = I422ToARGB1555Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
-    dst_stride_argb1555 = -dst_stride_argb1555;
-  }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
-    dst_argb1555 += dst_stride_argb1555;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
-  int y;
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            int width) = I422ToARGB4444Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
-    dst_stride_argb4444 = -dst_stride_argb4444;
-  }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
-    dst_argb4444 += dst_stride_argb4444;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          int width) = I422ToRGB565Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGB565Row = I422ToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGB565Row = I422ToRGB565Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
-  int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-  if (!dither4x4) {
-    dither4x4 = kDither565_4x4;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
-    }
-  }
-#endif
-  {
-    // Allocate a row of argb.
-    align_buffer_64(row_argb, width * 4);
-    for (y = 0; y < height; ++y) {
-      I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
-      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
-                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
-      dst_rgb565 += dst_stride_rgb565;
-      src_y += src_stride_y;
-      if (y & 1) {
-        src_u += src_stride_u;
-        src_v += src_stride_v;
-      }
-    }
-    free_aligned_buffer_64(row_argb);
-  }
-  return 0;
-}
-
 // Convert I420 to specified format
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int r = 0;
-  if (!y || !u|| !v || !dst_sample ||
-      width <= 0 || height == 0) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
     return -1;
   }
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      r = I420ToYUY2(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_UYVY:
-      r = I420ToUYVY(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_RGBP:
-      r = I420ToRGB565(y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_sample,
-                       dst_sample_stride ? dst_sample_stride : width * 2,
-                       width, height);
+      r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2, width,
+                       height);
       break;
     case FOURCC_RGBO:
-      r = I420ToARGB1555(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_R444:
-      r = I420ToARGB4444(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_24BG:
-      r = I420ToRGB24(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width * 3,
-                      width, height);
+      r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3, width,
+                      height);
       break;
     case FOURCC_RAW:
-      r = I420ToRAW(y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_sample,
-                    dst_sample_stride ? dst_sample_stride : width * 3,
-                    width, height);
+      r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3, width,
+                    height);
       break;
     case FOURCC_ARGB:
-      r = I420ToARGB(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_BGRA:
-      r = I420ToBGRA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_ABGR:
-      r = I420ToABGR(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_RGBA:
-      r = I420ToRGBA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
+      break;
+    case FOURCC_AR30:
+      r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_I400:
-      r = I400Copy(y, y_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width,
-                   width, height);
+      r = I400Copy(y, y_stride, dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width, width,
+                   height);
       break;
     case FOURCC_NV12: {
-      uint8* dst_uv = dst_sample + width * height;
-      r = I420ToNV12(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_uv,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_uv = dst_sample + width * height;
+      r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     case FOURCC_NV21: {
-      uint8* dst_vu = dst_sample + width * height;
-      r = I420ToNV21(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_vu,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_vu = dst_sample + width * height;
+      r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
-    // TODO(fbarchard): Add M420.
     // Triplanar formats
-    // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
-    case FOURCC_YU12:
     case FOURCC_YV12: {
-      int halfwidth = (width + 1) / 2;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
       int halfheight = (height + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV12) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
       }
-      r = I420Copy(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample, width,
-                   dst_u, halfwidth,
-                   dst_v, halfwidth,
+      r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                   dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                    width, height);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      int halfwidth = (width + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV16) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
       }
-      r = I420ToI422(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, halfwidth,
-                     dst_v, halfwidth,
+      r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                      width, height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV24) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + width * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + width * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
       }
-      r = I420ToI444(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, width,
-                     dst_v, width,
-                     width, height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (width + 3) / 4;
-      uint8* dst_u = dst_sample + width * height;
-      uint8* dst_v = dst_u + quarterwidth * height;
-      r = I420ToI411(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, quarterwidth,
-                     dst_v, quarterwidth,
-                     width, height);
+      r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+                     dst_sample_stride, width, height);
       break;
     }
-
     // Formats not supported - MJPG, biplanar, some rgb formats.
     default:
       return -1;  // unknown fourcc - return failure code.
diff --git a/media/libaom/src/third_party/libyuv/source/convert_from_argb.cc b/media/libaom/src/third_party/libyuv/source/convert_from_argb.cc
index 8d1e97aec2..4ba4bb5e0f 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_from_argb.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_from_argb.cc
@@ -22,16 +22,21 @@ extern "C" {
 
 // ARGB little endian (bgra in memory) to I444
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUV444Row_C;
+  void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = ARGBToUV444Row_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u == width &&
-      dst_stride_v == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3)) {
-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -65,6 +68,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -89,6 +108,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -103,61 +138,54 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
 
 // ARGB little endian (bgra in memory) to I422
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUV422Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV422ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
       ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
@@ -170,9 +198,43 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
     dst_y += dst_stride_y;
@@ -182,47 +244,49 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// ARGB little endian (bgra in memory) to I411
 LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 4 == width &&
-      dst_stride_v * 4 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
       ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
@@ -235,42 +299,122 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
+#if defined(HAS_ARGBTOUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
     if (IS_ALIGNED(width, 32)) {
-      ARGBToUV411Row = ARGBToUV411Row_NEON;
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
     }
   }
 #endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
-  for (y = 0; y < height; ++y) {
-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
   return 0;
 }
 
+// Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -315,6 +459,30 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -339,23 +507,39 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
       ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
       ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
       src_argb += src_stride_argb * 2;
       dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
+      dst_vu += dst_stride_vu;
     }
     if (height & 1) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
     }
     free_aligned_buffer_64(row_u);
@@ -363,64 +547,90 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
   }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+      ABGRToYRow = ABGRToYRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_NEON)
+#if defined(HAS_ABGRTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
+    ABGRToYRow = ABGRToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
+      ABGRToYRow = ABGRToYRow_NEON;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_NEON)
+#if defined(HAS_ABGRTOUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ABGRToYRow = ABGRToYRow_Any_MMI;
+    ABGRToUVRow = ABGRToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
     }
   }
 #endif
@@ -448,88 +658,247 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
-      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
-      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-      src_argb += src_stride_argb * 2;
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
       dst_y += dst_stride_y * 2;
       dst_uv += dst_stride_uv;
     }
     if (height & 1) {
-      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-      ARGBToYRow(src_argb, dst_y, width);
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
     }
     free_aligned_buffer_64(row_u);
   }
   return 0;
 }
 
-// Convert ARGB to YUY2.
+// Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUV422Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
-
-  if (!src_argb || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  int halfwidth = (width + 1) >> 1;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
-    dst_stride_yuy2 = -dst_stride_yuy2;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yuy2 == width * 2) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_yuy2 = 0;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
   }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUV422ROW_NEON)
+#if defined(HAS_ABGRTOUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_NEON;
+      ABGRToUVRow = ABGRToUVRow_NEON;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ABGRToYRow = ABGRToYRow_Any_MMI;
+    ABGRToUVRow = ABGRToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
       ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
@@ -542,7 +911,38 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -551,6 +951,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -559,15 +967,31 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToYUY2Row = I422ToYUY2Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
-      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
       ARGBToYRow(src_argb, row_y, width);
       I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
       src_argb += src_stride_argb;
@@ -581,19 +1005,23 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUV422Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
 
-  if (!src_argb || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -603,40 +1031,27 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_uyvy = 0;
   }
-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV422ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
       ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
@@ -649,7 +1064,38 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -658,6 +1104,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -666,15 +1120,31 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToUYVYRow = I422ToUYVYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
-      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
       ARGBToYRow(src_argb, row_y, width);
       I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
       src_argb += src_stride_argb;
@@ -688,11 +1158,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
   if (!src_argb || !dst_y || width <= 0 || height == 0) {
     return -1;
@@ -703,8 +1176,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = 0;
@@ -733,6 +1205,22 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYRow(src_argb, dst_y, width);
@@ -743,28 +1231,31 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
 }
 
 // Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
+static const uvec8 kShuffleMaskARGBToRGBA = {
+    3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
 
 // Convert ARGB to RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return ARGBShuffle(src_argb, src_stride_argb,
-                     dst_rgba, dst_stride_rgba,
-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
-                     width, height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
 }
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
   int y;
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+  void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRGB24Row_C;
   if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
@@ -775,8 +1266,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
@@ -789,6 +1279,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
@@ -797,6 +1303,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -808,11 +1330,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
   int y;
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+  void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRAWRow_C;
   if (!src_argb || !dst_raw || width <= 0 || height == 0) {
     return -1;
@@ -823,8 +1348,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_raw == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
@@ -837,6 +1361,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRAWRow = ARGBToRAWRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORAWROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
@@ -845,6 +1377,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRAWRow = ARGBToRAWRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRAWRow(src_argb, dst_raw, width);
@@ -855,21 +1403,23 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
 }
 
 // Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -905,9 +1455,27 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBToRGB565DitherRow(src_argb, dst_rgb565,
-                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                          *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                          width);
     src_argb += src_stride_argb;
     dst_rgb565 += dst_stride_rgb565;
   }
@@ -917,12 +1485,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
 // Convert ARGB To RGB565.
 // TODO(fbarchard): Consider using dither function low level with zeros.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB565Row_C;
+  void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                          int width) = ARGBToRGB565Row_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -932,8 +1503,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb565 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
@@ -962,6 +1532,22 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -973,12 +1559,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB1555Row_C;
+  void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB1555Row_C;
   if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
     return -1;
   }
@@ -988,8 +1577,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb1555 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
@@ -1018,6 +1606,22 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB1555ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1029,12 +1633,15 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToARGB4444Row_C;
+  void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB4444Row_C;
   if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
     return -1;
   }
@@ -1044,8 +1651,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb4444 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
@@ -1074,6 +1680,22 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB4444ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1083,21 +1705,123 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
+      ABGRToAR30Row_C;
+  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_abgr = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ABGRToAR30Row = ABGRToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToAR30Row = ABGRToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ABGRToAR30Row(src_abgr, dst_ar30, width);
+    src_abgr += src_stride_abgr;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+      ARGBToAR30Row_C;
+  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToAR30Row(src_argb, dst_ar30, width);
+    src_argb += src_stride_argb;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1140,6 +1864,30 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MMI;
+    ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1157,56 +1905,46 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// ARGB little endian (bgra in memory) to J422
+// Convert ARGB to J422. (JPeg full range I422).
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int pix) = ARGBToUVJ422Row_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ422ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
-    }
+    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
   }
-#endif
-
-#if defined(HAS_ARGBTOYJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
       ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
@@ -1227,12 +1965,44 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MMI;
+    ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
-    ARGBToYJRow(src_argb, dst_y, width);
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
     src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
+    dst_yj += dst_stride_yj;
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
@@ -1241,11 +2011,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to J400.
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height) {
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
   if (!src_argb || !dst_yj || width <= 0 || height == 0) {
     return -1;
@@ -1256,8 +2029,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = 0;
@@ -1286,6 +2058,22 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYJRow(src_argb, dst_yj, width);
@@ -1295,6 +2083,80 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert RGBA to J400.
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
+  int y;
+  void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+      RGBAToYJRow_C;
+  if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+  // Coalesce rows.
+  if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgba = dst_stride_yj = 0;
+  }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYJRow = RGBAToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RGBAToYJRow = RGBAToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYJRow = RGBAToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYJRow = RGBAToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGBAToYJRow = RGBAToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYJRow = RGBAToYJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYJRow = RGBAToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYJRow = RGBAToYJRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGBAToYJRow(src_rgba, dst_yj, width);
+    src_rgba += src_stride_rgba;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/convert_jpeg.cc b/media/libaom/src/third_party/libyuv/source/convert_jpeg.cc
index bcb980f7f1..d7556ee91b 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_jpeg.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_jpeg.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
 
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
@@ -21,28 +22,24 @@ extern "C" {
 
 #ifdef HAVE_JPEG
 struct I420Buffers {
-  uint8* y;
+  uint8_t* y;
   int y_stride;
-  uint8* u;
+  uint8_t* u;
   int u_stride;
-  uint8* v;
+  uint8_t* v;
   int v_stride;
   int w;
   int h;
 };
 
 static void JpegCopyI420(void* opaque,
-                         const uint8* const* data,
+                         const uint8_t* const* data,
                          const int* strides,
                          int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
+  I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+           dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+           dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -50,17 +47,13 @@ static void JpegCopyI420(void* opaque,
 }
 
 static void JpegI422ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -68,35 +61,13 @@ static void JpegI422ToI420(void* opaque,
 }
 
 static void JpegI444ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -104,15 +75,12 @@ static void JpegI411ToI420(void* opaque,
 }
 
 static void JpegI400ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+             dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -121,10 +89,12 @@ static void JpegI400ToI420(void* opaque,
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height) {
+int MJPGSize(const uint8_t* src_mjpg,
+             size_t src_size_mjpg,
+             int* width,
+             int* height) {
   MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
   if (ret) {
     *width = mjpeg_decoder.GetWidth();
     *height = mjpeg_decoder.GetHeight();
@@ -134,34 +104,40 @@ int MJPGSize(const uint8* sample, size_t sample_size,
 }
 
 // MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToI420(const uint8* sample,
-               size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
+int MJPGToI420(const uint8_t* src_mjpg,
+               size_t src_size_mjpg,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (src_size_mjpg == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
   }
 
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    I420Buffers bufs = {dst_y, dst_stride_y, dst_u,     dst_stride_u,
+                        dst_v, dst_stride_v, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -169,8 +145,9 @@ int MJPGToI420(const uint8* sample,
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -180,8 +157,9 @@ int MJPGToI420(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -191,29 +169,292 @@ int MJPGToI420(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
+                                           dst_height);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other
+      // colorspace/subsample factors that occur in practice. ERROR: Unable to
+      // convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+
+struct NV21Buffers {
+  uint8_t* y;
+  int y_stride;
+  uint8_t* vu;
+  int vu_stride;
+  int w;
+  int h;
+};
+
+static void JpegI420ToNV21(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToNV21(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToNV21(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToNV21(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+             dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* src_mjpg,
+               size_t src_size_mjpg,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (src_size_mjpg == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    NV21Buffers bufs = {dst_y,         dst_stride_y, dst_vu,
+                        dst_stride_vu, dst_width,    dst_height};
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
                mjpeg_decoder.GetVertSampFactor(1) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
+                                           dst_height);
     } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
+      // Unknown colorspace.
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+
+static void JpegI420ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use NV21 with VU swapped.
+  I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use NV21 with VU swapped.
+  I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use NV21 with VU swapped.
+  I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* dest = (NV21Buffers*)(opaque);
+  // Use NV21 since there is no UV plane.
+  I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+             dest->vu_stride, dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    // Use NV21Buffers but with UV instead of VU.
+    NV21Buffers bufs = {dst_y,         dst_stride_y, dst_uv,
+                        dst_stride_uv, dst_width,    dst_height};
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+                                           dst_height);
+    } else {
+      // Unknown colorspace.
       mjpeg_decoder.UnloadFrame();
       return 1;
     }
@@ -221,109 +462,86 @@ int MJPGToI420(const uint8* sample,
   return ret ? 0 : 1;
 }
 
-#ifdef HAVE_JPEG
 struct ARGBBuffers {
-  uint8* argb;
+  uint8_t* argb;
   int argb_stride;
   int w;
   int h;
 };
 
 static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
-static void JpegI444ToARGB(void* opaque,
-                           const uint8* const* data,
+static void JpegI422ToARGB(void* opaque,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
+static void JpegI444ToARGB(void* opaque,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI400ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 // MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample,
-               size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
-  if (sample_size == kUnknownDataSize) {
+int MJPGToARGB(const uint8_t* src_mjpg,
+               size_t src_size_mjpg,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (src_size_mjpg == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
   }
 
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -331,8 +549,9 @@ int MJPGToARGB(const uint8* sample,
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -342,8 +561,9 @@ int MJPGToARGB(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -353,38 +573,28 @@ int MJPGToARGB(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
+                                           dst_height);
     } else {
-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
-      // ERROR: Unable to convert MJPEG frame because format is not supported
+      // TODO(fbarchard): Implement conversion for any other
+      // colorspace/subsample factors that occur in practice. ERROR: Unable to
+      // convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
     }
   }
   return ret ? 0 : 1;
 }
-#endif
 
-#endif
+#endif  // HAVE_JPEG
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/third_party/libyuv/source/convert_to_argb.cc b/media/libaom/src/third_party/libyuv/source/convert_to_argb.cc
index af829fbd32..84df16c8c2 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_to_argb.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_to_argb.cc
@@ -23,41 +23,50 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // src_width is used for source stride computation
 // src_height is used to compute location of planes, and indicate inversion
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
+
+// TODO(fbarchard): Add the following:
+// H010ToARGB
+// I010ToARGB
+
 LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* crop_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
+  const uint8_t* src;
+  const uint8_t* src_uv;
   int abs_src_height = (src_height < 0) ? -src_height : src_height;
   int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
 
   // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination crop_argb is same as source sample,
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if destination dst_argb is same as source sample,
   // also enable temporary buffer.
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
-      crop_argb == sample;
-  uint8* tmp_argb = crop_argb;
-  int tmp_argb_stride = argb_stride;
-  uint8* rotate_buffer = NULL;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+  uint8_t* dest_argb = dst_argb;
+  int dest_dst_stride_argb = dst_stride_argb;
+  uint8_t* rotate_buffer = NULL;
   int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
 
-  if (crop_argb == NULL || sample == NULL ||
-      src_width <= 0 || crop_width <= 0 ||
+  if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 ||
       src_height == 0 || crop_height == 0) {
     return -1;
   }
@@ -66,189 +75,237 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
   }
 
   if (need_buf) {
-    int argb_size = crop_width * abs_crop_height * 4;
-    rotate_buffer = (uint8*)malloc(argb_size);
+    int argb_size = crop_width * 4 * abs_crop_height;
+    rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    crop_argb = rotate_buffer;
-    argb_stride = crop_width;
+    dst_argb = rotate_buffer;
+    dst_stride_argb = crop_width * 4;
   }
 
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      crop_argb, argb_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    crop_argb, argb_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      if (!need_buf && !rotation) {
+        src = sample + (src_width * crop_y + crop_x) * 4;
+        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
+                       crop_width, inv_crop_height);
+      }
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AR30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AB30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       crop_argb, argb_stride,
+      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                        crop_width, inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_J400:
+      src = sample + src_width * crop_y + crop_x;
+      r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
 
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
       // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     // Triplanar formats
     case FOURCC_I420:
-    case FOURCC_YU12:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_J420: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
-      src_u = sample + src_width * abs_src_height +
-          (halfwidth * crop_y + crop_x) / 2;
-      src_v = sample + src_width * abs_src_height +
-          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      r = J420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_H420: {
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U420: {
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
       int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
+
+    case FOURCC_J422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_H422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -256,32 +313,48 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+
+    case FOURCC_J444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
+
+    case FOURCC_H444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     crop_argb, argb_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -290,11 +363,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
 
   if (need_buf) {
     if (!r) {
-      r = ARGBRotate(crop_argb, argb_stride,
-                     tmp_argb, tmp_argb_stride,
+      r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
                      crop_width, abs_crop_height, rotation);
     }
     free(rotate_buffer);
+  } else if (rotation) {
+    src = sample + (src_width * crop_y + crop_x) * 4;
+    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                   inv_crop_height, rotation);
   }
 
   return r;
diff --git a/media/libaom/src/third_party/libyuv/source/convert_to_i420.cc b/media/libaom/src/third_party/libyuv/source/convert_to_i420.cc
index 5e75369b55..ac6eeab24e 100644
--- a/media/libaom/src/third_party/libyuv/source/convert_to_i420.cc
+++ b/media/libaom/src/third_party/libyuv/source/convert_to_i420.cc
@@ -25,253 +25,211 @@ extern "C" {
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
 LIBYUV_API
-int ConvertToI420(const uint8* sample,
+int ConvertToI420(const uint8_t* sample,
                   size_t sample_size,
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  const uint8_t* src;
+  const uint8_t* src_uv;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  // TODO(nisse): Why allow crop_height < 0?
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
-  uint8* tmp_y = y;
-  uint8* tmp_u = u;
-  uint8* tmp_v = v;
-  int tmp_y_stride = y_stride;
-  int tmp_u_stride = u_stride;
-  int tmp_v_stride = v_stride;
-  uint8* rotate_buffer = NULL;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      dst_y == sample;
+  uint8_t* tmp_y = dst_y;
+  uint8_t* tmp_u = dst_u;
+  uint8_t* tmp_v = dst_v;
+  int tmp_y_stride = dst_stride_y;
+  int tmp_u_stride = dst_stride_u;
+  int tmp_v_stride = dst_stride_v;
+  uint8_t* rotate_buffer = NULL;
+  const int inv_crop_height =
+      (src_height < 0) ? -abs_crop_height : abs_crop_height;
 
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || crop_width <= 0  ||
-      src_height == 0 || crop_height == 0) {
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
     return -1;
   }
-  if (src_height < 0) {
-    inv_crop_height = -inv_crop_height;
-  }
 
   // One pass rotation is available for some formats. For the rest, convert
   // to I420 (with optional vertical flipping) into a temporary I420 buffer,
   // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination y is same as source sample,
+  // For in-place conversion, if destination dst_y is same as source sample,
   // also enable temporary buffer.
   if (need_buf) {
     int y_size = crop_width * abs_crop_height;
     int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    y = rotate_buffer;
-    u = y + y_size;
-    v = u + uv_size;
-    y_stride = crop_width;
-    u_stride = v_stride = ((crop_width + 1) / 2);
+    dst_y = rotate_buffer;
+    dst_u = dst_y + y_size;
+    dst_v = dst_u + uv_size;
+    dst_stride_y = crop_width;
+    dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
   }
 
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       crop_width, inv_crop_height);
+      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                       dst_stride_u, dst_v, dst_stride_v, crop_width,
+                       inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                      dst_stride_u, dst_v, dst_stride_v, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                    dst_stride_u, dst_v, dst_stride_v, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
+    // TODO(fbarchard): Add AR30 and AB30
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           crop_width, inv_crop_height, rotation);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_u, dst_stride_u, dst_v,
+                           dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           crop_width, inv_crop_height, rotation);
-      break;
-    case FOURCC_M420:
-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with dst_u and dst_v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_v, dst_stride_v, dst_u,
+                           dst_stride_u, crop_width, inv_crop_height, rotation);
       break;
     // Triplanar formats
     case FOURCC_I420:
-    case FOURCC_YU12:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
-        src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                (crop_x / 2);
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
       } else {
-        src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                (crop_x / 2);
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
       }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height, rotation);
+      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
       }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -279,38 +237,16 @@ int ConvertToI420(const uint8* sample,
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -319,13 +255,10 @@ int ConvertToI420(const uint8* sample,
 
   if (need_buf) {
     if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     crop_width, abs_crop_height, rotation);
+      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+                     rotation);
     }
     free(rotate_buffer);
   }
diff --git a/media/libaom/src/third_party/libyuv/source/cpu_id.cc b/media/libaom/src/third_party/libyuv/source/cpu_id.cc
index 72f686e3b3..72a7fb82f6 100644
--- a/media/libaom/src/third_party/libyuv/source/cpu_id.cc
+++ b/media/libaom/src/third_party/libyuv/source/cpu_id.cc
@@ -10,25 +10,19 @@
 
 #include "libyuv/cpu_id.h"
 
-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#if defined(_MSC_VER)
 #include <intrin.h>  // For __cpuidex()
 #endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                           \
     !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
-    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+    defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
-#if !defined(__native_client__)
-#include <stdlib.h>  // For getenv()
-#endif
-
 // For ArmCpuCaps() but unittested on all platforms
 #include <stdio.h>
 #include <string.h>
 
-#include "libyuv/basic_types.h"  // For CPU_X86
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -36,22 +30,27 @@ extern "C" {
 
 // For functions that use the stack and have runtime checks for overflow,
 // use SAFEBUFFERS to avoid additional check.
-#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+    !defined(__clang__)
 #define SAFEBUFFERS __declspec(safebuffers)
 #else
 #define SAFEBUFFERS
 #endif
 
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
 // Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
+#if defined(_MSC_VER)
 // Visual C version uses intrinsic or inline x86 assembly.
-#if (_MSC_FULL_VER >= 160040219)
-  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+  __cpuidex(cpu_info, info_eax, info_ecx);
 #elif defined(_M_IX86)
   __asm {
     mov        eax, info_eax
@@ -63,68 +62,80 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
     mov        [edi + 8], ecx
     mov        [edi + 12], edx
   }
-#else
+#else  // Visual C but not x86
   if (info_ecx == 0) {
-    __cpuid((int*)(cpu_info), info_eax);
+    __cpuid(cpu_info, info_eax);
   } else {
-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
   }
 #endif
 // GCC version uses inline x86 assembly.
-#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
-  uint32 info_ebx, info_edx;
-  asm volatile (  // NOLINT
-#if defined( __i386__) && defined(__PIC__)
-    // Preserve ebx for fpic 32 bit.
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=D" (info_ebx),
+#else  // defined(_MSC_VER)
+  int info_ebx, info_edx;
+  asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+      // Preserve ebx for fpic 32 bit.
+      "mov         %%ebx, %%edi                  \n"
+      "cpuid                                     \n"
+      "xchg        %%edi, %%ebx                  \n"
+      : "=D"(info_ebx),
 #else
-    "cpuid                                     \n"
-    : "=b" (info_ebx),
+      "cpuid                                     \n"
+      : "=b"(info_ebx),
 #endif  //  defined( __i386__) && defined(__PIC__)
-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
   cpu_info[0] = info_eax;
   cpu_info[1] = info_ebx;
   cpu_info[2] = info_ecx;
   cpu_info[3] = info_edx;
-#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#endif  // defined(_MSC_VER)
 }
 #else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+void CpuId(int eax, int ecx, int* cpu_info) {
+  (void)eax;
+  (void)ecx;
   cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif
 
-// TODO(fbarchard): Enable xgetbv when validator supports it.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+// For VS2010 and earlier emit can be used:
+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+//  __asm {
+//    xor        ecx, ecx    // xcr 0
+//    xgetbv
+//    mov        xcr0, eax
+//  }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
 // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int TestOsSaveYmm() {
-  uint32 xcr0 = 0u;
-#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
-  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
-  __asm {
-    xor        ecx, ecx    // xcr 0
-    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
-    mov        xcr0, eax
-  }
+int GetXCR0() {
+  int xcr0 = 0;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (int)_xgetbv(0);  // VS2010 SP1 required.  NOLINT
 #elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
 #endif  // defined(__i386__) || defined(__x86_64__)
-  return((xcr0 & 6) == 6);  // Is ymm saved?
+  return xcr0;
 }
+#else
+// xgetbv unavailable to query for OSSave support.  Return 0.
+#define GetXCR0() 0
 #endif  // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
 
-// based on libaom arm_cpudetect.c
+// based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   FILE* f = fopen(cpuinfo_name, "r");
   if (!f) {
@@ -141,7 +152,7 @@ int ArmCpuCaps(const char* cpuinfo_name) {
       }
       // aarch64 uses asimd for Neon.
       p = strstr(cpuinfo_line, " asimd");
-      if (p && (p[6] == ' ' || p[6] == '\n')) {
+      if (p) {
         fclose(f);
         return kCpuHasNEON;
       }
@@ -151,154 +162,116 @@ int ArmCpuCaps(const char* cpuinfo_name) {
   return 0;
 }
 
-#if defined(__mips__) && defined(__linux__)
-static int MipsCpuCaps(const char* search_string) {
+// TODO(fbarchard): Consider read_msa_ir().
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
-  const char* file_name = "/proc/cpuinfo";
-  FILE* f = fopen(file_name, "r");
+  int flag = 0x0;
+  FILE* f = fopen(cpuinfo_name, "r");
   if (!f) {
-    // Assume DSP if /proc/cpuinfo is unavailable.
+    // Assume nothing if /proc/cpuinfo is unavailable.
     // This will occur for Chrome sandbox for Pepper or Render process.
-    return kCpuHasMIPS_DSP;
+    return 0;
   }
-  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
-    if (strstr(cpuinfo_line, search_string) != NULL) {
-      fclose(f);
-      return kCpuHasMIPS_DSP;
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Workaround early kernel without mmi in ASEs line.
+      if (strstr(cpuinfo_line, "Loongson-3")) {
+        flag |= kCpuHasMMI;
+      } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+        flag |= kCpuHasMMI | kCpuHasMSA;
+      }
+    }
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      if (strstr(cpuinfo_line, "loongson-mmi") &&
+          strstr(cpuinfo_line, "loongson-ext")) {
+        flag |= kCpuHasMMI;
+      }
+      if (strstr(cpuinfo_line, "msa")) {
+        flag |= kCpuHasMSA;
+      }
+      // ASEs is the last line, so we can break here.
+      break;
     }
   }
   fclose(f);
-  return 0;
+  return flag;
 }
-#endif
-
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
 
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
-  const char* var = getenv(name);
-  if (var) {
-    if (var[0] != '0') {
-      return LIBYUV_TRUE;
-    }
-  }
-  return LIBYUV_FALSE;
-}
-#else  // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
-  return LIBYUV_FALSE;
-}
-#endif
-
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-
-  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+static SAFEBUFFERS int GetCpuFlags(void) {
+  int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                   \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+     defined(_M_IX86))
+  int cpu_info0[4] = {0, 0, 0, 0};
+  int cpu_info1[4] = {0, 0, 0, 0};
+  int cpu_info7[4] = {0, 0, 0, 0};
   CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
   if (cpu_info0[0] >= 7) {
     CpuId(7, 0, cpu_info7);
   }
-  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
-              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
-              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
-              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-              kCpuHasX86;
+  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
 
-#ifdef HAS_XGETBV
-  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
-      TestOsSaveYmm()) {  // Saves YMM.
-    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
-                 kCpuHasAVX;
-  }
-#endif
-  // Environment variable overrides for testing.
-  if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info_ &= ~kCpuHasX86;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info_ &= ~kCpuHasSSE2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info_ &= ~kCpuHasSSSE3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info_ &= ~kCpuHasSSE41;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info_ &= ~kCpuHasSSE42;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info_ &= ~kCpuHasAVX;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info_ &= ~kCpuHasAVX2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
-    cpu_info_ &= ~kCpuHasERMS;
-  }
-  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
-    cpu_info_ &= ~kCpuHasFMA3;
+  // AVX requires OS saves YMM registers.
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
+    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+
+    // Detect AVX512bw
+    if ((GetXCR0() & 0xe0) == 0xe0) {
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+      cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+      cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+      cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+    }
   }
 #endif
 #if defined(__mips__) && defined(__linux__)
-  // Linux mips parse text file for dsp detect.
-  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
-#if defined(__mips_dspr2)
-  cpu_info_ |= kCpuHasMIPS_DSPR2;
-#endif
-  cpu_info_ |= kCpuHasMIPS;
-
-  if (getenv("LIBYUV_DISABLE_MIPS")) {
-    cpu_info_ &= ~kCpuHasMIPS;
-  }
-  if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
-    cpu_info_ &= ~kCpuHasMIPS_DSP;
-  }
-  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
-    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
-  }
+  cpu_info = MipsCpuCaps("/proc/cpuinfo");
+  cpu_info |= kCpuHasMIPS;
 #endif
 #if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
-  cpu_info_ = kCpuHasNEON;
+  cpu_info = kCpuHasNEON;
 // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
 // flag in it.
 // So for aarch64, neon enabling is hard coded here.
 #endif
 #if defined(__aarch64__)
-  cpu_info_ = kCpuHasNEON;
+  cpu_info = kCpuHasNEON;
 #else
   // Linux arm parse text file for neon detect.
-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+  cpu_info = ArmCpuCaps("/proc/cpuinfo");
 #endif
-  cpu_info_ |= kCpuHasARM;
-  if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info_ &= ~kCpuHasNEON;
-  }
+  cpu_info |= kCpuHasARM;
 #endif  // __arm__
-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info_ = 0;
-  }
-  return cpu_info_;
+  cpu_info |= kCpuInitialized;
+  return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags) {
+  int cpu_info = GetCpuFlags() & enable_flags;
+  SetCpuFlags(cpu_info);
+  return cpu_info;
 }
 
 LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
-  cpu_info_ = InitCpuFlags() & enable_flags;
+int InitCpuFlags(void) {
+  return MaskCpuFlags(-1);
 }
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc b/media/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc
index 75f8a610e3..adba832f53 100644
--- a/media/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/media/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc
@@ -21,11 +21,12 @@
 
 #if defined(_MSC_VER)
 // disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
+#pragma warning(disable : 4324)
 #endif
 
 #endif
-struct FILE;  // For jpeglib.h.
+
+#include <stdio.h>  // For jpeglib.h.
 
 // C++ build requires extern C for jpeg internals.
 #ifdef __cplusplus
@@ -59,10 +60,10 @@ const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
 // Methods that are passed to jpeglib.
 boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
 void init_source(jpeg_decompress_struct* cinfo);
-void skip_input_data(jpeg_decompress_struct* cinfo,
-                     long num_bytes);  // NOLINT
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
 void term_source(jpeg_decompress_struct* cinfo);
 void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
 
 MJpegDecoder::MJpegDecoder()
     : has_scanline_padding_(LIBYUV_FALSE),
@@ -78,6 +79,7 @@ MJpegDecoder::MJpegDecoder()
   decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
   // Override standard exit()-based error handler.
   error_mgr_->base.error_exit = &ErrorHandler;
+  error_mgr_->base.output_message = &OutputHandler;
 #endif
   decompress_struct_->client_data = NULL;
   source_mgr_->init_source = &init_source;
@@ -101,7 +103,7 @@ MJpegDecoder::~MJpegDecoder() {
   DestroyOutputBuffers();
 }
 
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
   if (!ValidateJpeg(src, src_len)) {
     return LIBYUV_FALSE;
   }
@@ -128,7 +130,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
       if (scanlines_[i]) {
         delete scanlines_[i];
       }
-      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_[i] = new uint8_t*[scanlines_size];
       scanlines_sizes_[i] = scanlines_size;
     }
 
@@ -144,7 +146,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
       if (databuf_[i]) {
         delete databuf_[i];
       }
-      databuf_[i] = new uint8[databuf_size];
+      databuf_[i] = new uint8_t[databuf_size];
       databuf_strides_[i] = databuf_stride;
     }
 
@@ -194,13 +196,11 @@ int MJpegDecoder::GetVertSampFactor(int component) {
 }
 
 int MJpegDecoder::GetHorizSubSampFactor(int component) {
-  return decompress_struct_->max_h_samp_factor /
-      GetHorizSampFactor(component);
+  return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
 }
 
 int MJpegDecoder::GetVertSubSampFactor(int component) {
-  return decompress_struct_->max_v_samp_factor /
-      GetVertSampFactor(component);
+  return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
 }
 
 int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@@ -244,10 +244,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
 }
 
 // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
-    uint8** planes, int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
+                                          int dst_width,
+                                          int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -288,14 +288,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
         assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip =
-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
-                                rows_to_skip;
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy =
+            GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
         int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
-                  planes[i], GetComponentWidth(i),
-                  GetComponentWidth(i), scanlines_to_copy);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+                  GetComponentWidth(i), GetComponentWidth(i),
+                  scanlines_to_copy);
         planes[i] += scanlines_to_copy * GetComponentWidth(i);
       }
       lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -304,16 +303,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
 
   // Read full MCUs but cropped horizontally
   for (; lines_left > GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -327,19 +325,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy =
           DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
   return FinishDecode();
 }
 
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
-    int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+                                           void* opaque,
+                                           int dst_width,
+                                           int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -394,7 +392,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
   }
   // Read full MCUs until we get to the crop point.
   for (; lines_left >= GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
@@ -419,7 +417,10 @@ void init_source(j_decompress_ptr cinfo) {
 boolean fill_input_buffer(j_decompress_ptr cinfo) {
   BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
   if (buf_vec->pos >= buf_vec->len) {
+    // Don't assert-fail when fuzzing.
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
     assert(0 && "No more data");
+#endif
     // ERROR: No more data
     return FALSE;
   }
@@ -429,28 +430,35 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
   return TRUE;
 }
 
-void skip_input_data(j_decompress_ptr cinfo,
-                     long num_bytes) {  // NOLINT
-  cinfo->src->next_input_byte += num_bytes;
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
+  jpeg_source_mgr* src = cinfo->src;
+  size_t bytes = static_cast<size_t>(num_bytes);
+  if (bytes > src->bytes_in_buffer) {
+    src->next_input_byte = nullptr;
+    src->bytes_in_buffer = 0;
+  } else {
+    src->next_input_byte += bytes;
+    src->bytes_in_buffer -= bytes;
+  }
 }
 
 void term_source(j_decompress_ptr cinfo) {
-  // Nothing to do.
+  (void)cinfo;  // Nothing to do.
 }
 
 #ifdef HAVE_SETJMP
 void ErrorHandler(j_common_ptr cinfo) {
-  // This is called when a jpeglib command experiences an error. Unfortunately
-  // jpeglib's error handling model is not very flexible, because it expects the
-  // error handler to not return--i.e., it wants the program to terminate. To
-  // recover from errors we use setjmp() as shown in their example. setjmp() is
-  // C's implementation for the "call with current continuation" functionality
-  // seen in some functional programming languages.
-  // A formatted message can be output, but is unsafe for release.
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
 #ifdef DEBUG
   char buf[JMSG_LENGTH_MAX];
   (*cinfo->err->format_message)(cinfo, buf);
-  // ERROR: Error in jpeglib: buf
+// ERROR: Error in jpeglib: buf
 #endif
 
   SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@@ -458,7 +466,13 @@ void ErrorHandler(j_common_ptr cinfo) {
   // and causes it to return (for a second time) with value 1.
   longjmp(mgr->setjmp_buffer, 1);
 }
-#endif
+
+// Suppress fprintf warnings.
+void OutputHandler(j_common_ptr cinfo) {
+  (void)cinfo;
+}
+
+#endif  // HAVE_SETJMP
 
 void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
   if (num_outbufs != num_outbufs_) {
@@ -467,9 +481,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
     // it.
     DestroyOutputBuffers();
 
-    scanlines_ = new uint8** [num_outbufs];
+    scanlines_ = new uint8_t**[num_outbufs];
     scanlines_sizes_ = new int[num_outbufs];
-    databuf_ = new uint8* [num_outbufs];
+    databuf_ = new uint8_t*[num_outbufs];
     databuf_strides_ = new int[num_outbufs];
 
     for (int i = 0; i < num_outbufs; ++i) {
@@ -485,13 +499,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
 
 void MJpegDecoder::DestroyOutputBuffers() {
   for (int i = 0; i < num_outbufs_; ++i) {
-    delete [] scanlines_[i];
-    delete [] databuf_[i];
+    delete[] scanlines_[i];
+    delete[] databuf_[i];
   }
-  delete [] scanlines_;
-  delete [] databuf_;
-  delete [] scanlines_sizes_;
-  delete [] databuf_strides_;
+  delete[] scanlines_;
+  delete[] databuf_;
+  delete[] scanlines_sizes_;
+  delete[] databuf_strides_;
   scanlines_ = NULL;
   databuf_ = NULL;
   scanlines_sizes_ = NULL;
@@ -525,9 +539,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() {
   return LIBYUV_TRUE;
 }
 
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
+void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
   for (int i = 0; i < num_outbufs_; ++i) {
-    uint8* data_i = data[i];
+    uint8_t* data_i = data[i];
     for (int j = 0; j < scanlines_sizes_[i]; ++j) {
       scanlines_[i][j] = data_i;
       data_i += GetComponentStride(i);
@@ -537,26 +551,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
 
 inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
   return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
-      jpeg_read_raw_data(decompress_struct_,
-                         scanlines_,
-                         GetImageScanlinesPerImcuRow());
+         jpeg_read_raw_data(decompress_struct_, scanlines_,
+                            GetImageScanlinesPerImcuRow());
 }
 
 // The helper function which recognizes the jpeg sub-sampling type.
 JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
-    int* subsample_x, int* subsample_y, int number_of_components) {
+    int* subsample_x,
+    int* subsample_y,
+    int number_of_components) {
   if (number_of_components == 3) {  // Color images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
-        subsample_x[2] == 2 && subsample_y[2] == 2) {
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
       return kJpegYuv420;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
-        subsample_x[2] == 2 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) {
       return kJpegYuv422;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
-        subsample_x[2] == 1 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 &&
+        subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) {
       return kJpegYuv444;
     }
   } else if (number_of_components == 1) {  // Grey-scale images.
@@ -569,4 +583,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
 
 }  // namespace libyuv
 #endif  // HAVE_JPEG
-
diff --git a/media/libaom/src/third_party/libyuv/source/mjpeg_validate.cc b/media/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
index 8edfbe1e74..ba0a03ab9e 100644
--- a/media/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
+++ b/media/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
@@ -17,85 +17,55 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Enable this to try scasb implementation.
-// #define ENABLE_SCASB 1
-
-#ifdef ENABLE_SCASB
-
-// Multiple of 1.
-__declspec(naked)
-const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
-  __asm {
-    mov        edx, edi
-    mov        edi, [esp + 4]   // src
-    mov        eax, [esp + 8]   // val
-    mov        ecx, [esp + 12]  // count
-    repne scasb
-    jne        sr99
-    mov        eax, edi
-    sub        eax, 1
-    mov        edi, edx
-    ret
-
-  sr99:
-    mov        eax, 0
-    mov        edi, edx
-    ret
-  }
-}
-#endif
-
-// Helper function to scan for EOI marker.
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
-  const uint8* end = sample + sample_size - 1;
-  const uint8* it = sample;
-  for (;;) {
-#ifdef ENABLE_SCASB
-    it = ScanRow_ERMS(it, 0xff, end - it);
-#else
-    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
-#endif
-    if (it == NULL) {
-      break;
-    }
-    if (it[1] == 0xd9) {
-      return LIBYUV_TRUE;  // Success: Valid jpeg.
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  if (src_size_mjpg >= 2) {
+    const uint8_t* end = src_mjpg + src_size_mjpg - 1;
+    const uint8_t* it = src_mjpg;
+    while (it < end) {
+      // TODO(fbarchard): scan for 0xd9 instead.
+      it = (const uint8_t*)(memchr(it, 0xff, end - it));
+      if (it == NULL) {
+        break;
+      }
+      if (it[1] == 0xd9) {
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      ++it;  // Skip over current 0xff.
     }
-    ++it;  // Skip over current 0xff.
   }
-  // ERROR: Invalid jpeg end code not found. Size sample_size
+  // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
   return LIBYUV_FALSE;
 }
 
 // Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
   const size_t kBackSearchSize = 1024;
-  if (sample_size < 64) {
-    // ERROR: Invalid jpeg size: sample_size
+  if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) {
+    // ERROR: Invalid jpeg size: src_size_mjpg
     return LIBYUV_FALSE;
   }
-  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+  // SOI marker
+  if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
     // ERROR: Invalid jpeg initial start code
     return LIBYUV_FALSE;
   }
-  // Step over SOI marker.
-  sample += 2;
-  sample_size -= 2;
 
-  // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
-  if (sample_size > kBackSearchSize) {
-    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  if (src_size_mjpg > kBackSearchSize) {
+    if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) {
       return LIBYUV_TRUE;  // Success: Valid jpeg.
     }
     // Reduce search size for forward search.
-    sample_size = sample_size - kBackSearchSize + 1;
+    src_size_mjpg = src_size_mjpg - kBackSearchSize + 1;
   }
-  return ScanEOI(sample, sample_size);
-
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(src_mjpg + 2, src_size_mjpg - 2);
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libaom/src/third_party/libyuv/source/planar_functions.cc b/media/libaom/src/third_party/libyuv/source/planar_functions.cc
index b96bd50206..4e8908c2eb 100644
--- a/media/libaom/src/third_party/libyuv/source/planar_functions.cc
+++ b/media/libaom/src/third_party/libyuv/source/planar_functions.cc
@@ -17,6 +17,7 @@
 #include "libyuv/mjpeg_decoder.h"
 #endif
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"  // for ScaleRowDown2
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -25,14 +26,22 @@ extern "C" {
 
 // Copy a plane of data
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -41,6 +50,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
   if (src_y == dst_y && src_stride_y == dst_stride_y) {
     return;
   }
+
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -61,11 +71,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -75,15 +80,19 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
   }
 }
 
+// TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height) {
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
   int y;
-  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -103,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
     CopyRow = CopyRow_16_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_16_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_16_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -117,19 +121,124 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
   }
 }
 
+// Convert a plane of 16 bit data to 8 bit
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+                          int width) = Convert16To8Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT16TO8ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert16To8Row = Convert16To8Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert16To8Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert a plane of 8 bit data to 16 bit
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
+                          int width) = Convert8To16Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT8TO16ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    Convert8To16Row = Convert8To16Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      Convert8To16Row = Convert8To16Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT8TO16ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert8To16Row = Convert8To16Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert8To16Row = Convert8To16Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert8To16Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
 // Copy I422.
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -142,7 +251,10 @@ int I422Copy(const uint8* src_y, int src_stride_y,
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
   }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
   return 0;
@@ -150,16 +262,21 @@ int I422Copy(const uint8* src_y, int src_stride_y,
 
 // Copy I444.
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -173,7 +290,9 @@ int I444Copy(const uint8* src_y, int src_stride_y,
     src_stride_v = -src_stride_v;
   }
 
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
   return 0;
@@ -181,9 +300,12 @@ int I444Copy(const uint8* src_y, int src_stride_y,
 
 // Copy I400.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -199,11 +321,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to I400.
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  (void)src_u;
+  (void)src_stride_u;
+  (void)src_v;
+  (void)src_stride_v;
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -213,84 +344,440 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
     src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
+
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   return 0;
 }
 
-// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
-                 uint8* dst_y, int dst_stride_y,
-                 int width, int height) {
-  int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+// Copy NV12. Supports inverting.
+int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv,
+             int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_uv, int dst_stride_uv, int width, int height) {
+  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
   }
-#if defined(HAS_MIRRORROW_NEON)
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+            halfheight);
+  return 0;
+}
+
+// Copy NV21. Supports inverting.
+int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu,
+             int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+             uint8_t* dst_vu, int dst_stride_vu, int width, int height) {
+  return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                  dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
+// Support function for NV12 etc UV channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height) {
+  int y;
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+                     int width) = SplitUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == width * 2 && dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRow = MirrorRow_Any_NEON;
+    SplitUVRow = SplitUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_NEON;
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitUVRow = SplitUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SplitUVRow = SplitUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSE2)
+
+  for (y = 0; y < height; ++y) {
+    // Copy a row of UV.
+    SplitUVRow(src_uv, dst_u, dst_v, width);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+}
+
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
+  int y;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_u == width && src_stride_v == width &&
+      dst_stride_uv == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    MirrorRow = MirrorRow_Any_SSE2;
+    MergeUVRow = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSE2;
+      MergeUVRow = MergeUVRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSSE3)
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MergeUVRow = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow(src_u, src_v, dst_uv, width);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+}
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_vu,
+                 int dst_stride_vu,
+                 int width,
+                 int height) {
+  int y;
+  void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+      SwapUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_uv = dst_stride_vu = 0;
+  }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    MirrorRow = MirrorRow_Any_SSSE3;
+    SwapUVRow = SwapUVRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSSE3;
+      SwapUVRow = SwapUVRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_AVX2)
+#if defined(HAS_SWAPUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    MirrorRow = MirrorRow_Any_AVX2;
+    SwapUVRow = SwapUVRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      MirrorRow = MirrorRow_AVX2;
+      SwapUVRow = SwapUVRow_AVX2;
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
-    MirrorRow = MirrorRow_MIPS_DSPR2;
+#if defined(HAS_SWAPUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SwapUVRow = SwapUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SwapUVRow = SwapUVRow_NEON;
+    }
   }
 #endif
 
-  // Mirror plane
   for (y = 0; y < height; ++y) {
-    MirrorRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
+    SwapUVRow(src_uv, dst_vu, width);
+    src_uv += src_stride_uv;
+    dst_vu += dst_stride_vu;
+  }
+}
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_vu || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+    src_stride_vu = -src_stride_vu;
+  }
+
+  SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+              halfheight);
+  return 0;
+}
+
+// Support function for NV12 etc RGB channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height) {
+  int y;
+  void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                      uint8_t* dst_b, int width) = SplitRGBRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
+    dst_stride_r = -dst_stride_r;
+    dst_stride_g = -dst_stride_g;
+    dst_stride_b = -dst_stride_b;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb == width * 3 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+  }
+#if defined(HAS_SPLITRGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SplitRGBRow = SplitRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitRGBRow = SplitRGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      SplitRGBRow = SplitRGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitRGBRow = SplitRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Copy a row of RGB.
+    SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
+    dst_r += dst_stride_r;
+    dst_g += dst_stride_g;
+    dst_b += dst_stride_b;
+    src_rgb += src_stride_rgb;
+  }
+}
+
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height) {
+  int y;
+  void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+                      const uint8_t* src_b, uint8_t* dst_rgb, int width) =
+      MergeRGBRow_C;
+  // Coalesce rows.
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  // Coalesce rows.
+  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+      dst_stride_rgb == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+  }
+#if defined(HAS_MERGERGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MergeRGBRow = MergeRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeRGBRow = MergeRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeRGBRow = MergeRGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MergeRGBRow = MergeRGBRow_MMI;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of U and V into a row of RGB.
+    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+    src_r += src_stride_r;
+    src_g += src_stride_g;
+    src_b += src_stride_b;
+    dst_rgb += dst_stride_rgb;
   }
 }
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) =
-      YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+  void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -298,10 +785,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -329,15 +815,33 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
 #if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUV422Row = YUY2ToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MMI;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_MMI;
+      YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -352,17 +856,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) =
-      UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int pix) = UYVYToYRow_C;
+  void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -370,10 +881,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -401,15 +911,33 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
 #if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-    }
+    UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUV422Row = UYVYToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    UYVYToYRow = UYVYToYRow_Any_MMI;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_MMI;
+      UYVYToUV422Row = UYVYToUV422Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUV422Row = UYVYToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -422,13 +950,214 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
 
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height) {
+  int y;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Mirror a plane of data.
+// See Also I400Mirror
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
+  int y;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MirrorRow = MirrorRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorRow = MirrorRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  int y;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorUVRow = MirrorUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorUVRow = MirrorUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorUVRow = MirrorUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorUVRow = MirrorUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorUVRow = MirrorUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_MSA;
+    }
+  }
+#endif
+
+  // MirrorUV plane
+  for (y = 0; y < height; ++y) {
+    MirrorUVRow(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -444,17 +1173,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y,
 
 // Mirror I420 with optional flipping
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -477,13 +1213,51 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+                halfheight);
+  return 0;
+}
+
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
       ARGBMirrorRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -497,7 +1271,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
@@ -518,6 +1292,22 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBMirrorRow = ARGBMirrorRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 
   // Mirror plane
   for (y = 0; y < height; ++y) {
@@ -528,41 +1318,96 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  int y;
+  void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+      RGB24MirrorRow_C;
+  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24MirrorRow = RGB24MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
 // Get a blender that optimized for the CPU and pixel count.
 // As there are 6 blenders to choose from, the caller should try to use
 // the same blend function for all pixels if possible.
 LIBYUV_API
 ARGBBlendRow GetARGBBlend() {
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = ARGBBlendRow_C;
 #if defined(HAS_ARGBBLENDROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBBlendRow = ARGBBlendRow_SSSE3;
     return ARGBBlendRow;
   }
 #endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
-  }
-#endif
 #if defined(HAS_ARGBBLENDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBBlendRow = ARGBBlendRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBBLENDROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBBlendRow = ARGBBlendRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBBlendRow = ARGBBlendRow_MSA;
+  }
+#endif
   return ARGBBlendRow;
 }
 
 // Alpha Blend 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = GetARGBBlend();
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = GetARGBBlend();
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -573,8 +1418,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -590,15 +1434,232 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
+// Alpha Blend plane and store to destination.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int y;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // Coalesce rows for Y plane.
+  if (src_stride_y0 == width && src_stride_y1 == width &&
+      alpha_stride == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
+  }
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    BlendPlaneRow = BlendPlaneRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      BlendPlaneRow = BlendPlaneRow_MMI;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
+    src_y0 += src_stride_y0;
+    src_y1 += src_stride_y1;
+    alpha += alpha_stride;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+#define MAXTWIDTH 2048
+// Alpha Blend YUV images and store to destination.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
+  int y;
+  // Half width/height for UV.
+  int halfwidth = (width + 1) >> 1;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // Blend Y plane.
+  BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+             dst_y, dst_stride_y, width, height);
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    BlendPlaneRow = BlendPlaneRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_MMI;
+    }
+  }
+#endif
+  if (!IS_ALIGNED(width, 2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
+  }
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+      if (IS_ALIGNED(halfwidth, 32)) {
+        ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
+      if (IS_ALIGNED(halfwidth, 8)) {
+        ScaleRowDown2 = ScaleRowDown2Box_MMI;
+      }
+    }
+  }
+#endif
+
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, halfwidth);
+  for (y = 0; y < height; y += 2) {
+    // last row of odd height image use 1 row of alpha instead of 2.
+    if (y == (height - 1)) {
+      alpha_stride = 0;
+    }
+    // Subsample 2 rows of UV to half width and half height.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
+  }
+  free_aligned_buffer_64(halfalpha);
+  return 0;
+}
+
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBMultiplyRow_C;
+  void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBMultiplyRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -609,8 +1670,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -640,6 +1700,22 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+    }
+  }
+#endif
 
   // Multiply plane
   for (y = 0; y < height; ++y) {
@@ -653,12 +1729,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
 
 // Add 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height) {
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height) {
   int y;
-  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+  void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
                      int width) = ARGBAddRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -670,8 +1750,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -706,6 +1785,22 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBADDROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBAddRow = ARGBAddRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBAddRow = ARGBAddRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
 
   // Add plane
   for (y = 0; y < height; ++y) {
@@ -719,13 +1814,17 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
 
 // Subtract 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBSubtractRow_C;
+  void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBSubtractRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -736,8 +1835,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -767,350 +1865,108 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
-
-  // Subtract plane
-  for (y = 0; y < height; ++y) {
-    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  int y;
-  void (*I422ToBGRARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToBGRARow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_bgra ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
-    dst_stride_bgra = -dst_stride_bgra;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_bgra == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
-  }
-#if defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOBGRAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToBGRARow = I422ToBGRARow_AVX2;
+#if defined(HAS_ARGBSUBTRACTROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBSubtractRow = ARGBSubtractRow_MMI;
     }
   }
 #endif
-#if defined(HAS_I422TOBGRAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
     if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
-    dst_bgra += dst_stride_bgra;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  int y;
-  void (*I422ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToABGRRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_abgr == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
-  }
-#if defined(HAS_I422TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOABGRROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToABGRRow = I422ToABGRRow_AVX2;
+      ARGBSubtractRow = ARGBSubtractRow_MSA;
     }
   }
 #endif
 
+  // Subtract plane
   for (y = 0; y < height; ++y) {
-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
-    dst_abgr += dst_stride_abgr;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
+    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
-// Convert I422 to RGBA.
+// Convert RAW to RGB24.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
+      RAWToRGB24Row_C;
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
-    dst_stride_rgba = -dst_stride_rgba;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_rgba == width * 4) {
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
-  }
-#if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToRGBARow = I422ToRGBARow_AVX2;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
-    dst_rgba += dst_stride_rgba;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* uv_buf,
-                          uint8* rgb_buf,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_uv || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
+    src_stride_raw = dst_stride_rgb24 = 0;
   }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+      RAWToRGB24Row = RAWToRGB24Row_SSSE3;
     }
   }
 #endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
+#if defined(HAS_RAWTORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+      RAWToRGB24Row = RAWToRGB24Row_NEON;
     }
   }
 #endif
-
-  for (y = 0; y < height; ++y) {
-    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_uv += src_stride_uv;
-    }
-  }
-  return 0;
-}
-
-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_vu, int src_stride_vu,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
-  int y;
-  void (*NV21ToRGB565Row)(const uint8* y_buf,
-                          const uint8* src_vu,
-                          uint8* rgb_buf,
-                          int width) = NV21ToRGB565Row_C;
-  if (!src_y || !src_vu || !dst_rgb565 ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
-    dst_stride_rgb565 = -dst_stride_rgb565;
-  }
-#if defined(HAS_NV21TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
+#if defined(HAS_RAWTORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      RAWToRGB24Row = RAWToRGB24Row_MMI;
     }
   }
 #endif
-#if defined(HAS_NV21TORGB565ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+#if defined(HAS_RAWTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
     if (IS_ALIGNED(width, 16)) {
-      NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_NV21TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      NV21ToRGB565Row = NV21ToRGB565Row_NEON;
+      RAWToRGB24Row = RAWToRGB24Row_MSA;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
-    dst_rgb565 += dst_stride_rgb565;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_vu += src_stride_vu;
-    }
+    RAWToRGB24Row(src_raw, dst_rgb24, width);
+    src_raw += src_stride_raw;
+    dst_rgb24 += dst_stride_rgb24;
   }
   return 0;
 }
 
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value) {
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+  void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
   if (height < 0) {
     height = -height;
     dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1143,6 +1999,11 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
     SetRow = SetRow_ERMS;
   }
 #endif
+#if defined(HAS_SETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+    SetRow = SetRow_MSA;
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1153,22 +2014,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
 
 // Draw a rectangle into I420
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y,
-             int width, int height,
-             int value_y, int value_u, int value_v) {
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  uint8* start_y = dst_y + y * dst_stride_y + x;
-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-  if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0 ||
-      x < 0 || y < 0 ||
-      value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 ||
+  uint8_t* start_y = dst_y + y * dst_stride_y + x;
+  uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
       value_v < 0 || value_v > 255) {
     return -1;
   }
@@ -1181,15 +2046,17 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
 
 // Draw a rectangle into ARGB
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height,
-             uint32 value) {
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value) {
   int y;
-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
-  if (!dst_argb ||
-      width <= 0 || height == 0 ||
-      dst_x < 0 || dst_y < 0) {
+  void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+      ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
   if (height < 0) {
@@ -1218,6 +2085,22 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
     ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
+#if defined(HAS_ARGBSETROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBSetRow = ARGBSetRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1241,11 +2124,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
 //   f is foreground pixel premultiplied by alpha
 
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1256,20 +2142,11 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -1294,6 +2171,22 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -1305,11 +2198,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
 
 // Convert preattentuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1320,8 +2216,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1342,7 +2237,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-// TODO(fbarchard): Neon version.
+  // TODO(fbarchard): Neon version.
 
   for (y = 0; y < height; ++y) {
     ARGBUnattenuateRow(src_argb, dst_argb, width);
@@ -1354,12 +2249,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to Grayed ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1369,8 +2267,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1385,6 +2282,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ARGBGrayRow = ARGBGrayRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(src_argb, dst_argb, width);
@@ -1396,13 +2303,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height) {
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1422,6 +2332,17 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ARGBGrayRow = ARGBGrayRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(dst, dst, width);
     dst += dst_stride_argb;
@@ -1431,11 +2352,15 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int dst_x, int dst_y, int width, int height) {
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height) {
   int y;
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1455,6 +2380,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
     ARGBSepiaRow = ARGBSepiaRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSEPIAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ARGBSepiaRow = ARGBSepiaRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBSepiaRow(dst, width);
     dst += dst_stride_argb;
@@ -1465,13 +2401,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
 // Apply a 4x4 matrix to each ARGB pixel.
 // Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height) {
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                             const int8_t* matrix_argb, int width) =
+      ARGBColorMatrixRow_C;
   if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1481,8 +2421,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1497,6 +2436,16 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
     src_argb += src_stride_argb;
@@ -1508,13 +2457,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
 // Apply a 4x3 matrix to each ARGB pixel.
 // Deprecated.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int dst_x, int dst_y, int width, int height) {
-  SIMD_ALIGNED(int8 matrix_argb[16]);
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  SIMD_ALIGNED(int8_t matrix_argb[16]);
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
 
@@ -1534,23 +2487,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
   matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
   matrix_argb[15] = 64;  // 1.0
 
-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
-                         dst, dst_stride_argb,
-                         &matrix_argb[0], width, height);
+  return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst,
+                         dst_stride_argb, &matrix_argb[0], width, height);
 }
 
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int dst_x, int dst_y, int width, int height) {
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                             int width) = ARGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1574,15 +2530,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
 // Apply a color table each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int dst_x, int dst_y, int width, int height) {
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
   int y;
-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                            int width) = RGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1613,13 +2573,19 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
 // Caveat - although SSE2 saturates, the C function does not and should be used
 // with care if doing anything but quantization.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int dst_x, int dst_y, int width, int height) {
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+  void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
                           int interval_offset, int width) = ARGBQuantizeRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
       interval_size < 1 || interval_size > 255) {
     return -1;
@@ -1640,6 +2606,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
     ARGBQuantizeRow = ARGBQuantizeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBQUANTIZEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_MSA;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
     dst += dst_stride_argb;
@@ -1650,13 +2621,17 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height) {
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  int32* previous_cumsum = dst_cumsum;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  int32_t* previous_cumsum = dst_cumsum;
   if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
     return -1;
   }
@@ -1665,6 +2640,12 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
     ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
   }
 #endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+  }
+#endif
+
   memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
   for (y = 0; y < height; ++y) {
     ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
@@ -1680,18 +2661,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
 // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
 // as the buffer is treated as circular.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius) {
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
-  int32* cumsum_bot_row;
-  int32* max_cumsum_bot_row;
-  int32* cumsum_top_row;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(
+      const int32_t* topleft, const int32_t* botleft, int width, int area,
+      uint8_t* dst, int count) = CumulativeSumToAverageRow_C;
+  int32_t* cumsum_bot_row;
+  int32_t* max_cumsum_bot_row;
+  int32_t* cumsum_top_row;
 
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1716,11 +2704,15 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
     CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
   }
 #endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+  }
+#endif
   // Compute enough CumulativeSum for first row to be blurred. After this
   // one row of CumulativeSum is updated at a time.
-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
-                           dst_cumsum, dst_stride32_cumsum,
-                           width, radius);
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+                           dst_stride32_cumsum, width, radius);
 
   src_argb = src_argb + radius * src_stride_argb;
   cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
@@ -1746,7 +2738,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
     // Increment cumsum_bot_row pointer with circular buffer wrap around and
     // then fill in a row of CumulativeSum.
     if ((y + radius) < height) {
-      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      const int32_t* prev_cumsum_bot_row = cumsum_bot_row;
       cumsum_bot_row += dst_stride32_cumsum;
       if (cumsum_bot_row >= max_cumsum_bot_row) {
         cumsum_bot_row = dst_cumsum;
@@ -1758,24 +2750,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
 
     // Left clipped.
     for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                                &dst_argb[x * 4], 1);
       area += (bot_y - top_y);
       boxwidth += 4;
     }
 
     // Middle unclipped.
     n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], n);
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                              &dst_argb[x * 4], n);
 
     // Right clipped.
     for (x += n; x <= width - 1; ++x) {
       area -= (bot_y - top_y);
       boxwidth -= 4;
       CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
-                                cumsum_bot_row + (x - radius - 1) * 4,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+                                cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+                                area, &dst_argb[x * 4], 1);
     }
     dst_argb += dst_stride_argb;
   }
@@ -1784,12 +2776,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
 
 // Multiply ARGB image by a specified ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value) {
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
+  void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+                       uint32_t value) = ARGBShadeRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
@@ -1799,8 +2795,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1815,6 +2810,16 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
     ARGBShadeRow = ARGBShadeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSHADEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ARGBShadeRow = ARGBShadeRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+    ARGBShadeRow = ARGBShadeRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -1824,45 +2829,40 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Interpolate 2 ARGB images by specified amount (0 to 255).
+// Interpolate 2 planes by specified amount (0 to 255).
 LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation) {
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation) {
   int y;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst = dst + (height - 1) * dst_stride;
+    dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
-  }
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
+    src_stride0 = src_stride1 = dst_stride = 0;
   }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 16)) {
       InterpolateRow = InterpolateRow_SSSE3;
     }
   }
@@ -1870,7 +2870,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 32)) {
       InterpolateRow = InterpolateRow_AVX2;
     }
   }
@@ -1878,40 +2878,104 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
 #if defined(HAS_INTERPOLATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
-      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
-                   width * 4, interpolation);
-    src_argb0 += src_stride_argb0;
-    src_argb1 += src_stride_argb1;
-    dst_argb += dst_stride_argb;
+    InterpolateRow(dst, src0, src1 - src0, width, interpolation);
+    src0 += src_stride0;
+    src1 += src_stride1;
+    dst += dst_stride;
+  }
+  return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation) {
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
+                          width * 4, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
   }
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+                   dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+                   dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+                   dst_stride_v, halfwidth, halfheight, interpolation);
   return 0;
 }
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height) {
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height) {
   int y;
-  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
-                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
-  if (!src_bgra || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb,
+                         const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1921,20 +2985,11 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     src_stride_bgra = -src_stride_bgra;
   }
   // Coalesce rows.
-  if (src_stride_bgra == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_bgra = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBShuffleRow = ARGBShuffleRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBSHUFFLEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
@@ -1959,6 +3014,22 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     }
   }
 #endif
+#if defined(HAS_ARGBSHUFFLEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBShuffleRow = ARGBShuffleRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -1968,29 +3039,107 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
   return 0;
 }
 
+// Gauss blur a float plane using Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and interior is multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
+  int y;
+  void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+                       const float* src3, const float* src4, float* dst,
+                       int width) = GaussCol_F32_C;
+  void (*GaussRow_F32)(const float* src, float* dst, int width) =
+      GaussRow_F32_C;
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussCol_F32 = GaussCol_F32_NEON;
+  }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussRow_F32 = GaussRow_F32_NEON;
+  }
+#endif
+  {
+    // 2 pixels on each side, but aligned out to 16 bytes.
+    align_buffer_64(rowbuf, (4 + width + 4) * 4);
+    memset(rowbuf, 0, 16);
+    memset(rowbuf + (4 + width) * 4, 0, 16);
+    float* row = (float*)(rowbuf + 16);
+    const float* src0 = src;
+    const float* src1 = src;
+    const float* src2 = src;
+    const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+    const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+    for (y = 0; y < height; ++y) {
+      GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+      // Extrude edge by 2 floats
+      row[-2] = row[-1] = row[0];
+      row[width + 1] = row[width] = row[width - 1];
+
+      GaussRow_F32(row - 2, dst, width);
+
+      src0 = src1;
+      src1 = src2;
+      src2 = src3;
+      src3 = src4;
+      if ((y + 2) < (height - 1)) {
+        src4 += src_stride;
+      }
+      dst += dst_stride;
+    }
+    free_aligned_buffer_64(rowbuf);
+  }
+  return 0;
+}
+
 // Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_argb, int dst_stride_argb,
-                        int width, int height,
-                        void (*SobelRow)(const uint8* src_sobelx,
-                                         const uint8* src_sobely,
-                                         uint8* dst, int width)) {
+static int ARGBSobelize(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height,
+                        void (*SobelRow)(const uint8_t* src_sobelx,
+                                         const uint8_t* src_sobely,
+                                         uint8_t* dst,
+                                         int width)) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
       ARGBToYJRow_C;
-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) = SobelYRow_C;
-  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobely, int width) =
+  void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    uint8_t* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
       SobelXRow_C;
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -2018,6 +3167,22 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
 #if defined(HAS_SOBELYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2029,6 +3194,16 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
     SobelYRow = SobelYRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SobelYRow = SobelYRow_MMI;
+  }
+#endif
+#if defined(HAS_SOBELYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelYRow = SobelYRow_MSA;
+  }
+#endif
 #if defined(HAS_SOBELXROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXRow = SobelXRow_SSE2;
@@ -2039,18 +3214,28 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
     SobelXRow = SobelXRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELXROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SobelXRow = SobelXRow_MMI;
+  }
+#endif
+#if defined(HAS_SOBELXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXRow = SobelXRow_MSA;
+  }
+#endif
   {
     // 3 rows with edges before/after.
     const int kRowSize = (width + kEdge + 31) & ~31;
     align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
-    uint8* row_sobelx = rows;
-    uint8* row_sobely = rows + kRowSize;
-    uint8* row_y = rows + kRowSize * 2;
+    uint8_t* row_sobelx = rows;
+    uint8_t* row_sobely = rows + kRowSize;
+    uint8_t* row_y = rows + kRowSize * 2;
 
     // Convert first row.
-    uint8* row_y0 = row_y + kEdge;
-    uint8* row_y1 = row_y0 + kRowSize;
-    uint8* row_y2 = row_y1 + kRowSize;
+    uint8_t* row_y0 = row_y + kEdge;
+    uint8_t* row_y1 = row_y0 + kRowSize;
+    uint8_t* row_y2 = row_y1 + kRowSize;
     ARGBToYJRow(src_argb, row_y0, width);
     row_y0[-1] = row_y0[0];
     memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
@@ -2074,7 +3259,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
 
       // Cycle thru circular queue of 3 row_y buffers.
       {
-        uint8* row_yt = row_y0;
+        uint8_t* row_yt = row_y0;
         row_y0 = row_y1;
         row_y1 = row_y2;
         row_y2 = row_yt;
@@ -2089,11 +3274,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) = SobelRow_C;
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
+  void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                   uint8_t* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelRow = SobelRow_Any_SSE2;
@@ -2110,17 +3298,36 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_SOBELROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SobelRow = SobelRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SobelRow = SobelRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelRow = SobelRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelRow);
 }
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height) {
-  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_, int width) = SobelToPlaneRow_C;
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height) {
+  void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                          uint8_t* dst_, int width) = SobelToPlaneRow_C;
 #if defined(HAS_SOBELTOPLANEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
@@ -2137,18 +3344,37 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                      width, height, SobelToPlaneRow);
+#if defined(HAS_SOBELTOPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SobelToPlaneRow = SobelToPlaneRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SobelToPlaneRow = SobelToPlaneRow_MSA;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+                      height, SobelToPlaneRow);
 }
 
 // SobelXY ARGB effect.
 // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) = SobelXYRow_C;
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
+  void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXYRow = SobelXYRow_Any_SSE2;
@@ -2165,32 +3391,49 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_SOBELXYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SobelXYRow = SobelXYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SobelXYRow = SobelXYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXYRow = SobelXYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelXYRow);
 }
 
 // Apply a 4x4 polynomial to each ARGB pixel.
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height) {
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBPolynomialRow)(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) = ARGBPolynomialRow_C;
+  void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                            const float* poly, int width) = ARGBPolynomialRow_C;
   if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2215,28 +3458,132 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale,
+                       int width) = HalfFloatRow_C;
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  src_stride_y >>= 1;
+  dst_stride_y >>= 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    HalfFloatRow = HalfFloatRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HalfFloatRow = HalfFloatRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      HalfFloatRow = HalfFloatRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+                         int width) = ByteToFloatRow_C;
+  if (!src_y || !dst_y || width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ByteToFloatRow = ByteToFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ByteToFloatRow = ByteToFloatRow_NEON;
+    }
+  }
+#endif
+
+  ByteToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma,
-                       int width, int height) {
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
-      int width, const uint8* luma, const uint32 lumacoeff) =
-      ARGBLumaColorTableRow_C;
+  void (*ARGBLumaColorTableRow)(
+      const uint8_t* src_argb, uint8_t* dst_argb, int width,
+      const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C;
   if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2257,12 +3604,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
 
 // Copy Alpha from one ARGB image to another.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBCopyAlphaRow_C;
+  void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBCopyAlphaRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2273,20 +3623,33 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBCOPYALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBCOPYALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
+    }
   }
 #endif
 
@@ -2298,14 +3661,81 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Extract just the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height) {
+  if (!src_argb || !dst_a || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_a == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_a = 0;
+  }
+  void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+                              int width) = ARGBExtractAlphaRow_C;
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+                                               : ARGBExtractAlphaRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+                                                : ARGBExtractAlphaRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+                                                : ARGBExtractAlphaRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
+                                               : ARGBExtractAlphaRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+                                                : ARGBExtractAlphaRow_Any_MSA;
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBExtractAlphaRow(src_argb, dst_a, width);
+    src_argb += src_stride_argb;
+    dst_a += dst_stride_a;
+  }
+  return 0;
+}
+
 // Copy a planar Y channel to the alpha channel of a destination ARGB image.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height) {
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
   int y;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
+  void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
+                              int width) = ARGBCopyYToAlphaRow_C;
   if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2316,20 +3746,33 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
+    }
   }
 #endif
 
@@ -2341,21 +3784,26 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// TODO(fbarchard): Consider if width is even Y channel can be split
+// directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
 LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_yuy2 ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2388,11 +3836,19 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
+#if defined(HAS_SPLITUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitUVRow = SplitUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SplitUVRow = SplitUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
     }
   }
 #endif
@@ -2420,25 +3876,43 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
-    // 2 rows of uv
-    align_buffer_64(rows, awidth * 2);
+    // row of y and 2 rows of uv
+    align_buffer_64(rows, awidth * 3);
 
     for (y = 0; y < height - 1; y += 2) {
       // Split Y from UV.
-      SplitUVRow(src_yuy2, dst_y, rows, awidth);
-      SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
-                 rows + awidth, awidth);
-      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
       src_yuy2 += src_stride_yuy2 * 2;
       dst_y += dst_stride_y * 2;
       dst_uv += dst_stride_uv;
     }
     if (height & 1) {
       // Split Y from UV.
-      SplitUVRow(src_yuy2, dst_y, dst_uv, width);
+      SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+      memcpy(dst_y, rows, width);
     }
     free_aligned_buffer_64(rows);
   }
@@ -2446,20 +3920,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
 }
 
 LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
-      SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_uyvy ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2492,11 +3968,19 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
+#if defined(HAS_SPLITUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitUVRow = SplitUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SplitUVRow = SplitUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
     }
   }
 #endif
@@ -2524,31 +4008,99 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
-    // 2 rows of uv
-    align_buffer_64(rows, awidth * 2);
+    // row of y and 2 rows of uv
+    align_buffer_64(rows, awidth * 3);
 
     for (y = 0; y < height - 1; y += 2) {
       // Split Y from UV.
-      SplitUVRow(src_uyvy, rows, dst_y, awidth);
-      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
-                 dst_y + dst_stride_y, awidth);
-      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
       src_uyvy += src_stride_uyvy * 2;
       dst_y += dst_stride_y * 2;
       dst_uv += dst_stride_uv;
     }
     if (height & 1) {
       // Split Y from UV.
-      SplitUVRow(src_uyvy, dst_y, dst_uv, width);
+      SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+      memcpy(dst_y, rows, width);
     }
     free_aligned_buffer_64(rows);
   }
   return 0;
 }
 
+// width and height are src size allowing odd size handling.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height) {
+  int y;
+  void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+                         const uint8_t* src_v, int src_stride_v,
+                         uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_NEON;
+  }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+  }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    HalfMergeUVRow = HalfMergeUVRow_AVX2;
+  }
+#endif
+  for (y = 0; y < height - 1; y += 2) {
+    // Merge a row of U and V into a row of UV.
+    HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+    src_u += src_stride_u * 2;
+    src_v += src_stride_v * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/rotate.cc b/media/libaom/src/third_party/libyuv/source/rotate.cc
index be3d589207..32904e4731 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate.cc
@@ -10,8 +10,8 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate_row.h"
 #include "libyuv/row.h"
@@ -22,12 +22,29 @@ extern "C" {
 #endif
 
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i = height;
-  void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+                        int dst_stride, int width) = TransposeWx16_C;
+#else
+  void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+                       int dst_stride, int width) = TransposeWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeWx16 = TransposeWx16_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx16 = TransposeWx16_MSA;
+    }
+  }
+#else
 #if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
@@ -41,6 +58,11 @@ void TransposePlane(const uint8* src, int src_stride,
     }
   }
 #endif
+#if defined(HAS_TRANSPOSEWX8_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    TransposeWx8 = TransposeWx8_MMI;
+  }
+#endif
 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
@@ -49,24 +71,25 @@ void TransposePlane(const uint8* src, int src_stride,
     }
   }
 #endif
-#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-      TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
-    } else {
-      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
-    }
-  }
-#endif
+#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
 
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  // Work across the source in 16x16 tiles
+  while (i >= 16) {
+    TransposeWx16(src, src_stride, dst, dst_stride, width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst += 16;               // Move over 16 columns.
+    i -= 16;
+  }
+#else
   // Work across the source in 8x8 tiles
   while (i >= 8) {
     TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst += 8;                 // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst += 8;               // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
     TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@@ -74,9 +97,12 @@ void TransposePlane(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -86,9 +112,12 @@ void RotatePlane90(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -98,33 +127,28 @@ void RotatePlane270(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MirrorRow = MirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 32)) {
       MirrorRow = MirrorRow_NEON;
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MirrorRow = MirrorRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      MirrorRow = MirrorRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_MIRRORROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     MirrorRow = MirrorRow_Any_SSSE3;
@@ -141,12 +165,20 @@ void RotatePlane180(const uint8* src, int src_stride,
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
-    MirrorRow = MirrorRow_MIPS_DSPR2;
+#if defined(HAS_MIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MirrorRow = MirrorRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorRow = MirrorRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
@@ -169,19 +201,19 @@ void RotatePlane180(const uint8* src, int src_stride,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
+#if defined(HAS_COPYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
   }
 #endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    MirrorRow(src, row, width);  // Mirror first row into a buffer
-    src += src_stride;
+    CopyRow(src, row, width);        // Copy first row into buffer
     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    MirrorRow(row, dst_bot, width);  // Mirror buffer into last row
+    src += src_stride;
     dst += dst_stride;
-    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
@@ -189,105 +221,149 @@ void RotatePlane180(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i = height;
-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
+  }
+#else
 #if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
   }
 #endif
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
-    TransposeUVWx8 = TransposeUVWx8_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx8 = TransposeUVWx8_SSE2;
+    }
   }
 #endif
-#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+#if defined(HAS_TRANSPOSEUVWX8_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      TransposeUVWx8 = TransposeUVWx8_MMI;
+    }
   }
 #endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
 
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 8x8 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 8 columns.
+    dst_b += 16;             // Move over 8 columns.
+    i -= 16;
+  }
+#else
   // Work through the source in 8x8 tiles.
   while (i >= 8) {
-    TransposeUVWx8(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                    width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst_a += 8;               // Move over 8 columns.
-    dst_b += 8;               // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst_a += 8;             // Move over 8 columns.
+    dst_b += 8;             // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
-    TransposeUVWxH_C(src, src_stride,
-                     dst_a, dst_stride_a,
-                     dst_b, dst_stride_b,
+    TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                      width, i);
   }
 }
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height) {
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height) {
   src += src_stride * (height - 1);
   src_stride = -src_stride;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   dst_a += dst_stride_a * (width - 1);
   dst_b += dst_stride_b * (width - 1);
   dst_stride_a = -dst_stride_a;
   dst_stride_b = -dst_stride_b;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 // Rotate 180 is a horizontal and vertical flip.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i;
-  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    MirrorRowUV = MirrorUVRow_NEON;
+  void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+                           int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_NEON;
   }
 #endif
-#if defined(HAS_MIRRORROW_UV_SSSE3)
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
-    MirrorRowUV = MirrorUVRow_SSSE3;
+    MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_MMI;
   }
 #endif
-#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_MSA;
   }
 #endif
 
@@ -295,7 +371,7 @@ void RotateUV180(const uint8* src, int src_stride,
   dst_b += dst_stride_b * (height - 1);
 
   for (i = 0; i < height; ++i) {
-    MirrorRowUV(src, dst_a, dst_b, width);
+    MirrorSplitUVRow(src, dst_a, dst_b, width);
     src += src_stride;
     dst_a -= dst_stride_a;
     dst_b -= dst_stride_b;
@@ -303,9 +379,12 @@ void RotateUV180(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int width, int height,
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
                 enum RotationMode mode) {
   if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
@@ -321,24 +400,16 @@ int RotatePlane(const uint8* src, int src_stride,
   switch (mode) {
     case kRotate0:
       // copy frame
-      CopyPlane(src, src_stride,
-                dst, dst_stride,
-                width, height);
+      CopyPlane(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate90:
-      RotatePlane90(src, src_stride,
-                    dst, dst_stride,
-                    width, height);
+      RotatePlane90(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate270:
-      RotatePlane270(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane270(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate180:
-      RotatePlane180(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane180(src, src_stride, dst, dst_stride, width, height);
       return 0;
     default:
       break;
@@ -347,18 +418,25 @@ int RotatePlane(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height,
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
                enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
 
@@ -377,45 +455,89 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return I420Copy(src_y, src_stride_y,
-                      src_u, src_stride_u,
-                      src_v, src_stride_v,
-                      dst_y, dst_stride_y,
-                      dst_u, dst_stride_u,
-                      dst_v, dst_stride_v,
-                      width, height);
+      return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                      src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotatePlane90(src_u, src_stride_u,
-                    dst_u, dst_stride_u,
-                    halfwidth, halfheight);
-      RotatePlane90(src_v, src_stride_v,
-                    dst_v, dst_stride_v,
-                    halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                    halfheight);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                    halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane270(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane270(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane180(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane180(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum libyuv::RotationMode mode) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case libyuv::kRotate0:
+      // copy frame
+      CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+      CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+      return 0;
+    case libyuv::kRotate90:
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+      return 0;
+    case libyuv::kRotate270:
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+      return 0;
+    case libyuv::kRotate180:
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
       return 0;
     default:
       break;
@@ -424,17 +546,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height,
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
                      enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
     return -1;
   }
 
@@ -451,38 +579,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return NV12ToI420(src_y, src_stride_y,
-                        src_uv, src_stride_uv,
-                        dst_y, dst_stride_y,
-                        dst_u, dst_stride_u,
-                        dst_v, dst_stride_v,
+      return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+                        dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                         width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotateUV90(src_uv, src_stride_uv,
-                 dst_u, dst_stride_u,
-                 dst_v, dst_stride_v,
-                 halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                 dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV270(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV180(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     default:
       break;
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_any.cc b/media/libaom/src/third_party/libyuv/source/rotate_any.cc
index 4d6eb34e18..b3baf084d0 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_any.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_any.cc
@@ -18,38 +18,62 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                                 \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                 uint8* dst, int dst_stride, int width) {                      \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
-      }                                                                        \
-      TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);        \
-    }
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst,              \
+               int dst_stride, int width) {                                   \
+    int r = width & MASK;                                                     \
+    int n = width - r;                                                        \
+    if (n > 0) {                                                              \
+      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                         \
+    }                                                                         \
+    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+  }
 
 #ifdef HAS_TRANSPOSEWX8_NEON
-TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
 #endif
 #ifdef HAS_TRANSPOSEWX8_SSSE3
-TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_MMI
+TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
 #endif
 #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
-TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
 #endif
-#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
-TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
 #endif
-
 #undef TANY
 
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a,             \
+               int dst_stride_a, uint8_t* dst_b, int dst_stride_b,             \
+               int width) {                                                    \
+    int r = width & MASK;                                                      \
+    int n = width - r;                                                         \
+    if (n > 0) {                                                               \
+      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+    }                                                                          \
+    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
+                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+  }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_MMI
+TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
+#endif
+#undef TUVANY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_argb.cc b/media/libaom/src/third_party/libyuv/source/rotate_argb.cc
index 787c0ad1be..ae65388601 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_argb.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_argb.cc
@@ -10,94 +10,123 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx, uint8* dst_ptr, int dst_width);
-
-static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride, int width, int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
   int i;
-  int src_pixel_step = src_stride >> 2;
-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+  int src_pixel_step = src_stride_argb >> 2;
+  void (*ScaleARGBRowDownEven)(
+      const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+      uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+  // Check stride is a multiple of 4.
+  if (src_stride_argb & 3) {
+    return -1;
+  }
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+    }
   }
 #endif
 
   for (i = 0; i < width; ++i) {  // column of source to row of dest.
-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
-    dst += dst_stride;
-    src += 4;
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
   }
+  return 0;
 }
 
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height) {
   // Rotate by 90 is a ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                       width, height);
 }
 
-void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
   // Rotate by 270 is a ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                       width, height);
 }
 
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+                         int src_stride_argb,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+  uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
       ARGBMirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      CopyRow_C;
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
@@ -118,6 +147,22 @@ void ARGBRotate180(const uint8* src, int src_stride,
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBMirrorRow = ARGBMirrorRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -138,28 +183,28 @@ void ARGBRotate180(const uint8* src, int src_stride,
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
-    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    ARGBMirrorRow(src_argb, row, width);      // Mirror first row into a buffer
+    ARGBMirrorRow(src_bot, dst_argb, width);  // Mirror last row into first row
     CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
-    src += src_stride;
-    dst += dst_stride;
-    src_bot -= src_stride;
-    dst_bot -= dst_stride;
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+    src_bot -= src_stride_argb;
+    dst_bot -= dst_stride_argb;
   }
   free_aligned_buffer_64(row);
+  return 0;
 }
 
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb, int width, int height,
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height,
                enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
@@ -175,24 +220,17 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return ARGBCopy(src_argb, src_stride_argb,
-                      dst_argb, dst_stride_argb,
+      return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height);
     case kRotate90:
-      ARGBRotate90(src_argb, src_stride_argb,
-                   dst_argb, dst_stride_argb,
-                   width, height);
-      return 0;
+      return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                          width, height);
     case kRotate270:
-      ARGBRotate270(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
-      return 0;
+      return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                           width, height);
     case kRotate180:
-      ARGBRotate180(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
-      return 0;
+      return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                           width, height);
     default:
       break;
   }
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_common.cc b/media/libaom/src/third_party/libyuv/source/rotate_common.cc
index b33a9a0c6e..ff212adebc 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_common.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_common.cc
@@ -8,16 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width) {
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst[0] = src[0 * src_stride];
@@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width) {
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst_a[0] = src[0 * src_stride + 0];
@@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i;
   for (i = 0; i < width; ++i) {
     int j;
@@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height) {
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height) {
   int i;
   for (i = 0; i < width * 2; i += 2) {
     int j;
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_gcc.cc b/media/libaom/src/third_party/libyuv/source/rotate_gcc.cc
index fd385bcd30..fd359d4ae6 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_gcc.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_gcc.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -17,474 +17,355 @@ extern "C" {
 #endif
 
 // This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    ".p2align  2                                 \n"
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-  asm (
-    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
-    "push   %ebx                               \n"
-    "push   %esi                               \n"
-    "push   %edi                               \n"
-    "push   %ebp                               \n"
-    "mov    0x14(%esp),%eax                    \n"
-    "mov    0x18(%esp),%edi                    \n"
-    "mov    0x1c(%esp),%edx                    \n"
-    "mov    0x20(%esp),%esi                    \n"
-    "mov    0x24(%esp),%ebx                    \n"
-    "mov    0x28(%esp),%ebp                    \n"
-    "mov    %esp,%ecx                          \n"
-    "sub    $0x14,%esp                         \n"
-    "and    $0xfffffff0,%esp                   \n"
-    "mov    %ecx,0x10(%esp)                    \n"
-    "mov    0x2c(%ecx),%ecx                    \n"
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
-"1:                                            \n"
-    "movdqu (%eax),%xmm0                       \n"
-    "movdqu (%eax,%edi,1),%xmm1                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm0,%xmm7                        \n"
-    "punpcklbw %xmm1,%xmm0                     \n"
-    "punpckhbw %xmm1,%xmm7                     \n"
-    "movdqa %xmm7,%xmm1                        \n"
-    "movdqu (%eax),%xmm2                       \n"
-    "movdqu (%eax,%edi,1),%xmm3                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm2,%xmm7                        \n"
-    "punpcklbw %xmm3,%xmm2                     \n"
-    "punpckhbw %xmm3,%xmm7                     \n"
-    "movdqa %xmm7,%xmm3                        \n"
-    "movdqu (%eax),%xmm4                       \n"
-    "movdqu (%eax,%edi,1),%xmm5                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm4,%xmm7                        \n"
-    "punpcklbw %xmm5,%xmm4                     \n"
-    "punpckhbw %xmm5,%xmm7                     \n"
-    "movdqa %xmm7,%xmm5                        \n"
-    "movdqu (%eax),%xmm6                       \n"
-    "movdqu (%eax,%edi,1),%xmm7                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqu %xmm5,(%esp)                       \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm6,%xmm5                        \n"
-    "punpcklbw %xmm7,%xmm6                     \n"
-    "punpckhbw %xmm7,%xmm5                     \n"
-    "movdqa %xmm5,%xmm7                        \n"
-    "lea    0x10(%eax,%edi,8),%eax             \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm0,%xmm5                        \n"
-    "punpcklwd %xmm2,%xmm0                     \n"
-    "punpckhwd %xmm2,%xmm5                     \n"
-    "movdqa %xmm5,%xmm2                        \n"
-    "movdqa %xmm1,%xmm5                        \n"
-    "punpcklwd %xmm3,%xmm1                     \n"
-    "punpckhwd %xmm3,%xmm5                     \n"
-    "movdqa %xmm5,%xmm3                        \n"
-    "movdqa %xmm4,%xmm5                        \n"
-    "punpcklwd %xmm6,%xmm4                     \n"
-    "punpckhwd %xmm6,%xmm5                     \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "movdqu (%esp),%xmm5                       \n"
-    "movdqu %xmm6,(%esp)                       \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "punpcklwd %xmm7,%xmm5                     \n"
-    "punpckhwd %xmm7,%xmm6                     \n"
-    "movdqa %xmm6,%xmm7                        \n"
-    "movdqa %xmm0,%xmm6                        \n"
-    "punpckldq %xmm4,%xmm0                     \n"
-    "punpckhdq %xmm4,%xmm6                     \n"
-    "movdqa %xmm6,%xmm4                        \n"
-    "movdqu (%esp),%xmm6                       \n"
-    "movlpd %xmm0,(%edx)                       \n"
-    "movhpd %xmm0,(%ebx)                       \n"
-    "movlpd %xmm4,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm2,%xmm0                        \n"
-    "punpckldq %xmm6,%xmm2                     \n"
-    "movlpd %xmm2,(%edx)                       \n"
-    "movhpd %xmm2,(%ebx)                       \n"
-    "punpckhdq %xmm6,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm1,%xmm0                        \n"
-    "punpckldq %xmm5,%xmm1                     \n"
-    "movlpd %xmm1,(%edx)                       \n"
-    "movhpd %xmm1,(%ebx)                       \n"
-    "punpckhdq %xmm5,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm3,%xmm0                        \n"
-    "punpckldq %xmm7,%xmm3                     \n"
-    "movlpd %xmm3,(%edx)                       \n"
-    "movhpd %xmm3,(%ebx)                       \n"
-    "punpckhdq %xmm7,%xmm0                     \n"
-    "sub    $0x8,%ecx                          \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "jg     1b                                 \n"
-    "mov    0x10(%esp),%esp                    \n"
-    "pop    %ebp                               \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "pop    %ebx                               \n"
-#if defined(__native_client__)
-    "pop    %ecx                               \n"
-    "and    $0xffffffe0,%ecx                   \n"
-    "jmp    *%ecx                              \n"
-#else
-    "ret                                       \n"
-#endif
-);
-#endif
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  2                                 \n"
-"1:                                            \n"
-  "movdqu     (%0),%%xmm0                      \n"
-  "movdqu     (%0,%3),%%xmm1                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqu     (%0),%%xmm2                      \n"
-  "movdqa     %%xmm0,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm9                    \n"
-  "palignr    $0x8,%%xmm1,%%xmm1               \n"
-  "palignr    $0x8,%%xmm9,%%xmm9               \n"
-  "movdqu     (%0,%3),%%xmm3                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm10                   \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm10                   \n"
-  "movdqa     %%xmm2,%%xmm3                    \n"
-  "movdqa     %%xmm10,%%xmm11                  \n"
-  "movdqu     (%0),%%xmm4                      \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "movdqu     (%0,%3),%%xmm5                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm12                   \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm12                   \n"
-  "movdqa     %%xmm4,%%xmm5                    \n"
-  "movdqa     %%xmm12,%%xmm13                  \n"
-  "movdqu     (%0),%%xmm6                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movdqu     (%0,%3),%%xmm7                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm14                   \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "punpckhbw  %%xmm7,%%xmm14                   \n"
-  "neg        %3                               \n"
-  "movdqa     %%xmm6,%%xmm7                    \n"
-  "movdqa     %%xmm14,%%xmm15                  \n"
-  "lea        0x10(%0,%3,8),%0                 \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "neg        %3                               \n"
-   // Second round of bit swap.
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "palignr    $0x8,%%xmm2,%%xmm2               \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm4,%%xmm6                    \n"
-  "movdqa     %%xmm5,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "punpcklwd  %%xmm10,%%xmm8                   \n"
-  "punpcklwd  %%xmm11,%%xmm9                   \n"
-  "movdqa     %%xmm8,%%xmm10                   \n"
-  "movdqa     %%xmm9,%%xmm11                   \n"
-  "palignr    $0x8,%%xmm10,%%xmm10             \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "punpcklwd  %%xmm14,%%xmm12                  \n"
-  "punpcklwd  %%xmm15,%%xmm13                  \n"
-  "movdqa     %%xmm12,%%xmm14                  \n"
-  "movdqa     %%xmm13,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movdqa     %%xmm0,%%xmm4                    \n"
-  "palignr    $0x8,%%xmm4,%%xmm4               \n"
-  "movq       %%xmm4,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movdqa     %%xmm2,%%xmm6                    \n"
-  "movq       %%xmm2,(%1)                      \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movq       %%xmm6,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm1,%%xmm5                    \n"
-  "movq       %%xmm1,(%1)                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "movq       %%xmm5,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movq       %%xmm3,(%1)                      \n"
-  "movdqa     %%xmm3,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "movq       %%xmm7,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm12,%%xmm8                   \n"
-  "movq       %%xmm8,(%1)                      \n"
-  "movdqa     %%xmm8,%%xmm12                   \n"
-  "palignr    $0x8,%%xmm12,%%xmm12             \n"
-  "movq       %%xmm12,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm14,%%xmm10                  \n"
-  "movdqa     %%xmm10,%%xmm14                  \n"
-  "movq       %%xmm10,(%1)                     \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "punpckldq  %%xmm13,%%xmm9                   \n"
-  "movq       %%xmm14,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm9,%%xmm13                   \n"
-  "movq       %%xmm9,(%1)                      \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movq       %%xmm13,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm15,%%xmm11                  \n"
-  "movq       %%xmm11,(%1)                     \n"
-  "movdqa     %%xmm11,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "sub        $0x10,%2                         \n"
-  "movq       %%xmm15,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"((intptr_t)(dst_stride))   // %4
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-);
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "movq        (%0,%3),%%xmm1                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "movq        (%0),%%xmm2                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "palignr     $0x8,%%xmm1,%%xmm1            \n"
+      "movq        (%0,%3),%%xmm3                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "punpcklbw   %%xmm3,%%xmm2                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "movq        (%0),%%xmm4                   \n"
+      "palignr     $0x8,%%xmm3,%%xmm3            \n"
+      "movq        (%0,%3),%%xmm5                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "punpcklbw   %%xmm5,%%xmm4                 \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "movq        (%0),%%xmm6                   \n"
+      "palignr     $0x8,%%xmm5,%%xmm5            \n"
+      "movq        (%0,%3),%%xmm7                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "punpcklbw   %%xmm7,%%xmm6                 \n"
+      "neg         %3                            \n"
+      "movdqa      %%xmm6,%%xmm7                 \n"
+      "lea         0x8(%0,%3,8),%0               \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      "neg         %3                            \n"
+      // Second round of bit swap.
+      "punpcklwd   %%xmm2,%%xmm0                 \n"
+      "punpcklwd   %%xmm3,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "palignr     $0x8,%%xmm2,%%xmm2            \n"
+      "palignr     $0x8,%%xmm3,%%xmm3            \n"
+      "punpcklwd   %%xmm6,%%xmm4                 \n"
+      "punpcklwd   %%xmm7,%%xmm5                 \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "movdqa      %%xmm5,%%xmm7                 \n"
+      "palignr     $0x8,%%xmm6,%%xmm6            \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq   %%xmm4,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "palignr     $0x8,%%xmm4,%%xmm4            \n"
+      "movq        %%xmm4,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm6,%%xmm2                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "movq        %%xmm2,(%1)                   \n"
+      "palignr     $0x8,%%xmm6,%%xmm6            \n"
+      "punpckldq   %%xmm5,%%xmm1                 \n"
+      "movq        %%xmm6,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "movdqa      %%xmm1,%%xmm5                 \n"
+      "movq        %%xmm1,(%1)                   \n"
+      "palignr     $0x8,%%xmm5,%%xmm5            \n"
+      "movq        %%xmm5,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm7,%%xmm3                 \n"
+      "movq        %%xmm3,(%1)                   \n"
+      "movdqa      %%xmm3,%%xmm7                 \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      "sub         $0x8,%2                       \n"
+      "movq        %%xmm7,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "jg          1b                            \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
 
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  2                                 \n"
-"1:                                            \n"
-  "movdqu     (%0),%%xmm0                      \n"
-  "movdqu     (%0,%4),%%xmm1                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm1                    \n"
-  "movdqu     (%0),%%xmm2                      \n"
-  "movdqu     (%0,%4),%%xmm3                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm3                    \n"
-  "movdqu     (%0),%%xmm4                      \n"
-  "movdqu     (%0,%4),%%xmm5                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm5                    \n"
-  "movdqu     (%0),%%xmm6                      \n"
-  "movdqu     (%0,%4),%%xmm7                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm8                    \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "neg        %4                               \n"
-  "lea        0x10(%0,%4,8),%0                 \n"
-  "punpckhbw  %%xmm7,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm7                    \n"
-  "neg        %4                               \n"
-   // Second round of bit swap.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "movdqa     %%xmm1,%%xmm9                    \n"
-  "punpckhwd  %%xmm2,%%xmm8                    \n"
-  "punpckhwd  %%xmm3,%%xmm9                    \n"
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm2                    \n"
-  "movdqa     %%xmm9,%%xmm3                    \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "movdqa     %%xmm5,%%xmm9                    \n"
-  "punpckhwd  %%xmm6,%%xmm8                    \n"
-  "punpckhwd  %%xmm7,%%xmm9                    \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm8,%%xmm6                    \n"
-  "movdqa     %%xmm9,%%xmm7                    \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-  "punpckhdq  %%xmm4,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movlpd     %%xmm2,(%1)                      \n"
-  "movhpd     %%xmm2,(%2)                      \n"
-  "punpckhdq  %%xmm6,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm1,%%xmm8                    \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movlpd     %%xmm1,(%1)                      \n"
-  "movhpd     %%xmm1,(%2)                      \n"
-  "punpckhdq  %%xmm5,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm3,%%xmm8                    \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movlpd     %%xmm3,(%1)                      \n"
-  "movhpd     %%xmm3,(%2)                      \n"
-  "punpckhdq  %%xmm7,%%xmm8                    \n"
-  "sub        $0x8,%3                          \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst_a),  // %1
-    "+r"(dst_b),  // %2
-    "+r"(width)   // %3
-  : "r"((intptr_t)(src_stride)),    // %4
-    "r"((intptr_t)(dst_stride_a)),  // %5
-    "r"((intptr_t)(dst_stride_b))   // %6
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9"
-);
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%0,%3),%%xmm1                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "movdqa      %%xmm0,%%xmm8                 \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm8                 \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm8,%%xmm9                 \n"
+      "palignr     $0x8,%%xmm1,%%xmm1            \n"
+      "palignr     $0x8,%%xmm9,%%xmm9            \n"
+      "movdqu      (%0,%3),%%xmm3                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "movdqa      %%xmm2,%%xmm10                \n"
+      "punpcklbw   %%xmm3,%%xmm2                 \n"
+      "punpckhbw   %%xmm3,%%xmm10                \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "movdqa      %%xmm10,%%xmm11               \n"
+      "movdqu      (%0),%%xmm4                   \n"
+      "palignr     $0x8,%%xmm3,%%xmm3            \n"
+      "palignr     $0x8,%%xmm11,%%xmm11          \n"
+      "movdqu      (%0,%3),%%xmm5                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "movdqa      %%xmm4,%%xmm12                \n"
+      "punpcklbw   %%xmm5,%%xmm4                 \n"
+      "punpckhbw   %%xmm5,%%xmm12                \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "movdqa      %%xmm12,%%xmm13               \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "palignr     $0x8,%%xmm5,%%xmm5            \n"
+      "palignr     $0x8,%%xmm13,%%xmm13          \n"
+      "movdqu      (%0,%3),%%xmm7                \n"
+      "lea         (%0,%3,2),%0                  \n"
+      "movdqa      %%xmm6,%%xmm14                \n"
+      "punpcklbw   %%xmm7,%%xmm6                 \n"
+      "punpckhbw   %%xmm7,%%xmm14                \n"
+      "neg         %3                            \n"
+      "movdqa      %%xmm6,%%xmm7                 \n"
+      "movdqa      %%xmm14,%%xmm15               \n"
+      "lea         0x10(%0,%3,8),%0              \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      "palignr     $0x8,%%xmm15,%%xmm15          \n"
+      "neg         %3                            \n"
+      // Second round of bit swap.
+      "punpcklwd   %%xmm2,%%xmm0                 \n"
+      "punpcklwd   %%xmm3,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "palignr     $0x8,%%xmm2,%%xmm2            \n"
+      "palignr     $0x8,%%xmm3,%%xmm3            \n"
+      "punpcklwd   %%xmm6,%%xmm4                 \n"
+      "punpcklwd   %%xmm7,%%xmm5                 \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "movdqa      %%xmm5,%%xmm7                 \n"
+      "palignr     $0x8,%%xmm6,%%xmm6            \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      "punpcklwd   %%xmm10,%%xmm8                \n"
+      "punpcklwd   %%xmm11,%%xmm9                \n"
+      "movdqa      %%xmm8,%%xmm10                \n"
+      "movdqa      %%xmm9,%%xmm11                \n"
+      "palignr     $0x8,%%xmm10,%%xmm10          \n"
+      "palignr     $0x8,%%xmm11,%%xmm11          \n"
+      "punpcklwd   %%xmm14,%%xmm12               \n"
+      "punpcklwd   %%xmm15,%%xmm13               \n"
+      "movdqa      %%xmm12,%%xmm14               \n"
+      "movdqa      %%xmm13,%%xmm15               \n"
+      "palignr     $0x8,%%xmm14,%%xmm14          \n"
+      "palignr     $0x8,%%xmm15,%%xmm15          \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq   %%xmm4,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "palignr     $0x8,%%xmm4,%%xmm4            \n"
+      "movq        %%xmm4,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm6,%%xmm2                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "movq        %%xmm2,(%1)                   \n"
+      "palignr     $0x8,%%xmm6,%%xmm6            \n"
+      "punpckldq   %%xmm5,%%xmm1                 \n"
+      "movq        %%xmm6,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "movdqa      %%xmm1,%%xmm5                 \n"
+      "movq        %%xmm1,(%1)                   \n"
+      "palignr     $0x8,%%xmm5,%%xmm5            \n"
+      "movq        %%xmm5,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm7,%%xmm3                 \n"
+      "movq        %%xmm3,(%1)                   \n"
+      "movdqa      %%xmm3,%%xmm7                 \n"
+      "palignr     $0x8,%%xmm7,%%xmm7            \n"
+      "movq        %%xmm7,(%1,%4)                \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm12,%%xmm8                \n"
+      "movq        %%xmm8,(%1)                   \n"
+      "movdqa      %%xmm8,%%xmm12                \n"
+      "palignr     $0x8,%%xmm12,%%xmm12          \n"
+      "movq        %%xmm12,(%1,%4)               \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm14,%%xmm10               \n"
+      "movdqa      %%xmm10,%%xmm14               \n"
+      "movq        %%xmm10,(%1)                  \n"
+      "palignr     $0x8,%%xmm14,%%xmm14          \n"
+      "punpckldq   %%xmm13,%%xmm9                \n"
+      "movq        %%xmm14,(%1,%4)               \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "movdqa      %%xmm9,%%xmm13                \n"
+      "movq        %%xmm9,(%1)                   \n"
+      "palignr     $0x8,%%xmm13,%%xmm13          \n"
+      "movq        %%xmm13,(%1,%4)               \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "punpckldq   %%xmm15,%%xmm11               \n"
+      "movq        %%xmm11,(%1)                  \n"
+      "movdqa      %%xmm11,%%xmm15               \n"
+      "palignr     $0x8,%%xmm15,%%xmm15          \n"
+      "sub         $0x10,%2                      \n"
+      "movq        %%xmm15,(%1,%4)               \n"
+      "lea         (%1,%4,2),%1                  \n"
+      "jg          1b                            \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+        "xmm15");
 }
-#endif
-#endif
+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
 
+// Transpose UV 8x8.  64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%0,%4),%%xmm1                \n"
+      "lea         (%0,%4,2),%0                  \n"
+      "movdqa      %%xmm0,%%xmm8                 \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm8                 \n"
+      "movdqa      %%xmm8,%%xmm1                 \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "movdqu      (%0,%4),%%xmm3                \n"
+      "lea         (%0,%4,2),%0                  \n"
+      "movdqa      %%xmm2,%%xmm8                 \n"
+      "punpcklbw   %%xmm3,%%xmm2                 \n"
+      "punpckhbw   %%xmm3,%%xmm8                 \n"
+      "movdqa      %%xmm8,%%xmm3                 \n"
+      "movdqu      (%0),%%xmm4                   \n"
+      "movdqu      (%0,%4),%%xmm5                \n"
+      "lea         (%0,%4,2),%0                  \n"
+      "movdqa      %%xmm4,%%xmm8                 \n"
+      "punpcklbw   %%xmm5,%%xmm4                 \n"
+      "punpckhbw   %%xmm5,%%xmm8                 \n"
+      "movdqa      %%xmm8,%%xmm5                 \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      (%0,%4),%%xmm7                \n"
+      "lea         (%0,%4,2),%0                  \n"
+      "movdqa      %%xmm6,%%xmm8                 \n"
+      "punpcklbw   %%xmm7,%%xmm6                 \n"
+      "neg         %4                            \n"
+      "lea         0x10(%0,%4,8),%0              \n"
+      "punpckhbw   %%xmm7,%%xmm8                 \n"
+      "movdqa      %%xmm8,%%xmm7                 \n"
+      "neg         %4                            \n"
+      // Second round of bit swap.
+      "movdqa      %%xmm0,%%xmm8                 \n"
+      "movdqa      %%xmm1,%%xmm9                 \n"
+      "punpckhwd   %%xmm2,%%xmm8                 \n"
+      "punpckhwd   %%xmm3,%%xmm9                 \n"
+      "punpcklwd   %%xmm2,%%xmm0                 \n"
+      "punpcklwd   %%xmm3,%%xmm1                 \n"
+      "movdqa      %%xmm8,%%xmm2                 \n"
+      "movdqa      %%xmm9,%%xmm3                 \n"
+      "movdqa      %%xmm4,%%xmm8                 \n"
+      "movdqa      %%xmm5,%%xmm9                 \n"
+      "punpckhwd   %%xmm6,%%xmm8                 \n"
+      "punpckhwd   %%xmm7,%%xmm9                 \n"
+      "punpcklwd   %%xmm6,%%xmm4                 \n"
+      "punpcklwd   %%xmm7,%%xmm5                 \n"
+      "movdqa      %%xmm8,%%xmm6                 \n"
+      "movdqa      %%xmm9,%%xmm7                 \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "movdqa      %%xmm0,%%xmm8                 \n"
+      "punpckldq   %%xmm4,%%xmm0                 \n"
+      "movlpd      %%xmm0,(%1)                   \n"  // Write back U channel
+      "movhpd      %%xmm0,(%2)                   \n"  // Write back V channel
+      "punpckhdq   %%xmm4,%%xmm8                 \n"
+      "movlpd      %%xmm8,(%1,%5)                \n"
+      "lea         (%1,%5,2),%1                  \n"
+      "movhpd      %%xmm8,(%2,%6)                \n"
+      "lea         (%2,%6,2),%2                  \n"
+      "movdqa      %%xmm2,%%xmm8                 \n"
+      "punpckldq   %%xmm6,%%xmm2                 \n"
+      "movlpd      %%xmm2,(%1)                   \n"
+      "movhpd      %%xmm2,(%2)                   \n"
+      "punpckhdq   %%xmm6,%%xmm8                 \n"
+      "movlpd      %%xmm8,(%1,%5)                \n"
+      "lea         (%1,%5,2),%1                  \n"
+      "movhpd      %%xmm8,(%2,%6)                \n"
+      "lea         (%2,%6,2),%2                  \n"
+      "movdqa      %%xmm1,%%xmm8                 \n"
+      "punpckldq   %%xmm5,%%xmm1                 \n"
+      "movlpd      %%xmm1,(%1)                   \n"
+      "movhpd      %%xmm1,(%2)                   \n"
+      "punpckhdq   %%xmm5,%%xmm8                 \n"
+      "movlpd      %%xmm8,(%1,%5)                \n"
+      "lea         (%1,%5,2),%1                  \n"
+      "movhpd      %%xmm8,(%2,%6)                \n"
+      "lea         (%2,%6,2),%2                  \n"
+      "movdqa      %%xmm3,%%xmm8                 \n"
+      "punpckldq   %%xmm7,%%xmm3                 \n"
+      "movlpd      %%xmm3,(%1)                   \n"
+      "movhpd      %%xmm3,(%2)                   \n"
+      "punpckhdq   %%xmm7,%%xmm8                 \n"
+      "sub         $0x8,%3                       \n"
+      "movlpd      %%xmm8,(%1,%5)                \n"
+      "lea         (%1,%5,2),%1                  \n"
+      "movhpd      %%xmm8,(%2,%6)                \n"
+      "lea         (%2,%6,2),%2                  \n"
+      "jg          1b                            \n"
+      : "+r"(src),                      // %0
+        "+r"(dst_a),                    // %1
+        "+r"(dst_b),                    // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride)),    // %4
+        "r"((intptr_t)(dst_stride_a)),  // %5
+        "r"((intptr_t)(dst_stride_b))   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9");
+}
+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_neon.cc b/media/libaom/src/third_party/libyuv/source/rotate_neon.cc
index 76043b3b3c..844df2bf30 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_neon.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_neon.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,511 +21,394 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
                        int width) {
-  const uint8* src_temp = NULL;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %5, #8                        \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
-    "1:                                        \n"
-      "mov         %0, %1                      \n"
-
-      MEMACCESS(0)
-      "vld1.8      {d0}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d1}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d2}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d3}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d4}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d5}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d6}, [%0], %2              \n"
-      MEMACCESS(0)
-      "vld1.8      {d7}, [%0]                  \n"
-
-      "vtrn.8      d1, d0                      \n"
-      "vtrn.8      d3, d2                      \n"
-      "vtrn.8      d5, d4                      \n"
-      "vtrn.8      d7, d6                      \n"
-
-      "vtrn.16     d1, d3                      \n"
-      "vtrn.16     d0, d2                      \n"
-      "vtrn.16     d5, d7                      \n"
-      "vtrn.16     d4, d6                      \n"
-
-      "vtrn.32     d1, d5                      \n"
-      "vtrn.32     d0, d4                      \n"
-      "vtrn.32     d3, d7                      \n"
-      "vtrn.32     d2, d6                      \n"
-
-      "vrev16.8    q0, q0                      \n"
-      "vrev16.8    q1, q1                      \n"
-      "vrev16.8    q2, q2                      \n"
-      "vrev16.8    q3, q3                      \n"
-
-      "mov         %0, %3                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d1}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d0}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d3}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d2}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d5}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d4}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d7}, [%0], %4              \n"
-    MEMACCESS(0)
-      "vst1.8      {d6}, [%0]                  \n"
-
-      "add         %1, #8                      \n"  // src += 8
-      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
-      "subs        %5,  #8                     \n"  // w   -= 8
-      "bge         1b                          \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %5, #8                        \n"
-    "beq         4f                            \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
-
-    "cmp         %5, #4                        \n"
-    "blt         2f                            \n"
-
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[1]}, [%0]                 \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(6)
-    "vld1.8      {q3}, [%6]                    \n"
-
-    "vtbl.8      d4, {d0, d1}, d6              \n"
-    "vtbl.8      d5, {d0, d1}, d7              \n"
-    "vtbl.8      d0, {d2, d3}, d6              \n"
-    "vtbl.8      d1, {d2, d3}, d7              \n"
-
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "vst1.32     {d4[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d4[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[1]}, [%0]                 \n"
-
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[1]}, [%0]                 \n"
-
-    "add         %1, #4                        \n"  // src += 4
-    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %5,  #4                       \n"  // w   -= 4
-    "beq         4f                            \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[3]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[3]}, [%0]                 \n"
-
-    "vtrn.8      d0, d1                        \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0]                    \n"
-
-    "add         %1, #2                        \n"  // src += 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %5,  #2                       \n"  // w   -= 2
-    "beq         4f                            \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[0]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[1]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[2]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[3]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[4]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[5]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[6]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[7]}, [%1]                 \n"
-
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),          // %0
-      "+r"(src),               // %1
-      "+r"(src_stride),        // %2
-      "+r"(dst),               // %3
-      "+r"(dst_stride),        // %4
-      "+r"(width)              // %5
-    : "r"(&kVTbl4x4Transpose)  // %6
-    : "memory", "cc", "q0", "q1", "q2", "q3"
-  );
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %5, #8                        \n"
+
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
+      "mov         %0, %1                        \n"
+
+      "vld1.8      {d0}, [%0], %2                \n"
+      "vld1.8      {d1}, [%0], %2                \n"
+      "vld1.8      {d2}, [%0], %2                \n"
+      "vld1.8      {d3}, [%0], %2                \n"
+      "vld1.8      {d4}, [%0], %2                \n"
+      "vld1.8      {d5}, [%0], %2                \n"
+      "vld1.8      {d6}, [%0], %2                \n"
+      "vld1.8      {d7}, [%0]                    \n"
+
+      "vtrn.8      d1, d0                        \n"
+      "vtrn.8      d3, d2                        \n"
+      "vtrn.8      d5, d4                        \n"
+      "vtrn.8      d7, d6                        \n"
+
+      "vtrn.16     d1, d3                        \n"
+      "vtrn.16     d0, d2                        \n"
+      "vtrn.16     d5, d7                        \n"
+      "vtrn.16     d4, d6                        \n"
+
+      "vtrn.32     d1, d5                        \n"
+      "vtrn.32     d0, d4                        \n"
+      "vtrn.32     d3, d7                        \n"
+      "vtrn.32     d2, d6                        \n"
+
+      "vrev16.8    q0, q0                        \n"
+      "vrev16.8    q1, q1                        \n"
+      "vrev16.8    q2, q2                        \n"
+      "vrev16.8    q3, q3                        \n"
+
+      "mov         %0, %3                        \n"
+
+      "vst1.8      {d1}, [%0], %4                \n"
+      "vst1.8      {d0}, [%0], %4                \n"
+      "vst1.8      {d3}, [%0], %4                \n"
+      "vst1.8      {d2}, [%0], %4                \n"
+      "vst1.8      {d5}, [%0], %4                \n"
+      "vst1.8      {d4}, [%0], %4                \n"
+      "vst1.8      {d7}, [%0], %4                \n"
+      "vst1.8      {d6}, [%0]                    \n"
+
+      "add         %1, #8                        \n"  // src += 8
+      "add         %3, %3, %4, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                       \n"  // w   -= 8
+      "bge         1b                            \n"
+
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %5, #8                        \n"
+      "beq         4f                            \n"
+
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
+
+      "cmp         %5, #4                        \n"
+      "blt         2f                            \n"
+
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.32     {d0[0]}, [%0], %2             \n"
+      "vld1.32     {d0[1]}, [%0], %2             \n"
+      "vld1.32     {d1[0]}, [%0], %2             \n"
+      "vld1.32     {d1[1]}, [%0], %2             \n"
+      "vld1.32     {d2[0]}, [%0], %2             \n"
+      "vld1.32     {d2[1]}, [%0], %2             \n"
+      "vld1.32     {d3[0]}, [%0], %2             \n"
+      "vld1.32     {d3[1]}, [%0]                 \n"
+
+      "mov         %0, %3                        \n"
+
+      "vld1.8      {q3}, [%6]                    \n"
+
+      "vtbl.8      d4, {d0, d1}, d6              \n"
+      "vtbl.8      d5, {d0, d1}, d7              \n"
+      "vtbl.8      d0, {d2, d3}, d6              \n"
+      "vtbl.8      d1, {d2, d3}, d7              \n"
+
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "vst1.32     {d4[0]}, [%0], %4             \n"
+      "vst1.32     {d4[1]}, [%0], %4             \n"
+      "vst1.32     {d5[0]}, [%0], %4             \n"
+      "vst1.32     {d5[1]}, [%0]                 \n"
+
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d0[0]}, [%0], %4             \n"
+      "vst1.32     {d0[1]}, [%0], %4             \n"
+      "vst1.32     {d1[0]}, [%0], %4             \n"
+      "vst1.32     {d1[1]}, [%0]                 \n"
+
+      "add         %1, #4                        \n"  // src += 4
+      "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+      "subs        %5,  #4                       \n"  // w   -= 4
+      "beq         4f                            \n"
+
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
+
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld1.16     {d0[0]}, [%0], %2             \n"
+      "vld1.16     {d1[0]}, [%0], %2             \n"
+      "vld1.16     {d0[1]}, [%0], %2             \n"
+      "vld1.16     {d1[1]}, [%0], %2             \n"
+      "vld1.16     {d0[2]}, [%0], %2             \n"
+      "vld1.16     {d1[2]}, [%0], %2             \n"
+      "vld1.16     {d0[3]}, [%0], %2             \n"
+      "vld1.16     {d1[3]}, [%0]                 \n"
+
+      "vtrn.8      d0, d1                        \n"
+
+      "mov         %0, %3                        \n"
+
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d1}, [%0]                    \n"
+
+      "add         %1, #2                        \n"  // src += 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+      "subs        %5,  #2                       \n"  // w   -= 2
+      "beq         4f                            \n"
+
+      // 1x8 block
+      "3:                                        \n"
+      "vld1.8      {d0[0]}, [%1], %2             \n"
+      "vld1.8      {d0[1]}, [%1], %2             \n"
+      "vld1.8      {d0[2]}, [%1], %2             \n"
+      "vld1.8      {d0[3]}, [%1], %2             \n"
+      "vld1.8      {d0[4]}, [%1], %2             \n"
+      "vld1.8      {d0[5]}, [%1], %2             \n"
+      "vld1.8      {d0[6]}, [%1], %2             \n"
+      "vld1.8      {d0[7]}, [%1]                 \n"
+
+      "vst1.64     {d0}, [%3]                    \n"
+
+      "4:                                        \n"
+
+      : "=&r"(src_temp),         // %0
+        "+r"(src),               // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst),               // %3
+        "+r"(dst_stride),        // %4
+        "+r"(width)              // %5
+      : "r"(&kVTbl4x4Transpose)  // %6
+      : "memory", "cc", "q0", "q1", "q2", "q3");
 }
 
-static uvec8 kVTbl4x4TransposeDi =
-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+static const uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
+                                          4, 12, 5, 13, 6, 14, 7, 15};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp = NULL;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %7, #8                        \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
-    "1:                                        \n"
-      "mov         %0, %1                      \n"
-
-      MEMACCESS(0)
-      "vld2.8      {d0,  d1},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d2,  d3},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d4,  d5},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d6,  d7},  [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d16, d17}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d18, d19}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d20, d21}, [%0], %2        \n"
-      MEMACCESS(0)
-      "vld2.8      {d22, d23}, [%0]            \n"
-
-      "vtrn.8      q1, q0                      \n"
-      "vtrn.8      q3, q2                      \n"
-      "vtrn.8      q9, q8                      \n"
-      "vtrn.8      q11, q10                    \n"
-
-      "vtrn.16     q1, q3                      \n"
-      "vtrn.16     q0, q2                      \n"
-      "vtrn.16     q9, q11                     \n"
-      "vtrn.16     q8, q10                     \n"
-
-      "vtrn.32     q1, q9                      \n"
-      "vtrn.32     q0, q8                      \n"
-      "vtrn.32     q3, q11                     \n"
-      "vtrn.32     q2, q10                     \n"
-
-      "vrev16.8    q0, q0                      \n"
-      "vrev16.8    q1, q1                      \n"
-      "vrev16.8    q2, q2                      \n"
-      "vrev16.8    q3, q3                      \n"
-      "vrev16.8    q8, q8                      \n"
-      "vrev16.8    q9, q9                      \n"
-      "vrev16.8    q10, q10                    \n"
-      "vrev16.8    q11, q11                    \n"
-
-      "mov         %0, %3                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d2},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d0},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d6},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d4},  [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d18}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d16}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d22}, [%0], %4             \n"
-    MEMACCESS(0)
-      "vst1.8      {d20}, [%0]                 \n"
-
-      "mov         %0, %5                      \n"
-
-    MEMACCESS(0)
-      "vst1.8      {d3},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d1},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d7},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d5},  [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d19}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d17}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d23}, [%0], %6             \n"
-    MEMACCESS(0)
-      "vst1.8      {d21}, [%0]                 \n"
-
-      "add         %1, #8*2                    \n"  // src   += 8*2
-      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
-      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
-      "subs        %7,  #8                     \n"  // w     -= 8
-      "bge         1b                          \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %7, #8                        \n"
-    "beq         4f                            \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
-
-    "cmp         %7, #4                        \n"
-    "blt         2f                            \n"
-
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.64     {d0}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d1}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d2}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d3}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d4}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d5}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d6}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d7}, [%0]                    \n"
-
-    MEMACCESS(8)
-    "vld1.8      {q15}, [%8]                   \n"
-
-    "vtrn.8      q0, q1                        \n"
-    "vtrn.8      q2, q3                        \n"
-
-    "vtbl.8      d16, {d0, d1}, d30            \n"
-    "vtbl.8      d17, {d0, d1}, d31            \n"
-    "vtbl.8      d18, {d2, d3}, d30            \n"
-    "vtbl.8      d19, {d2, d3}, d31            \n"
-    "vtbl.8      d20, {d4, d5}, d30            \n"
-    "vtbl.8      d21, {d4, d5}, d31            \n"
-    "vtbl.8      d22, {d6, d7}, d30            \n"
-    "vtbl.8      d23, {d6, d7}, d31            \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.32     {d16[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d16[1]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[1]},  [%0], %4           \n"
-
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[1]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[1]}, [%0]                \n"
-
-    "mov         %0, %5                        \n"
-
-    MEMACCESS(0)
-    "vst1.32     {d18[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d18[1]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[1]}, [%0], %6            \n"
-
-    "add         %0, %5, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[1]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[1]},  [%0]               \n"
-
-    "add         %1, #4*2                      \n"  // src   += 4 * 2
-    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %7,  #4                       \n"  // w     -= 4
-    "beq         4f                            \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
-
-    "vtrn.8      d0, d1                        \n"
-    "vtrn.8      d2, d3                        \n"
-
-    "mov         %0, %3                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d2}, [%0]                    \n"
-
-    "mov         %0, %5                        \n"
-
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0], %6                \n"
-    MEMACCESS(0)
-    "vst1.64     {d3}, [%0]                    \n"
-
-    "add         %1, #2*2                      \n"  // src   += 2 * 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %7,  #2                       \n"  // w     -= 2
-    "beq         4f                            \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
-
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-    MEMACCESS(5)
-    "vst1.64     {d1}, [%5]                    \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),            // %0
-      "+r"(src),                 // %1
-      "+r"(src_stride),          // %2
-      "+r"(dst_a),               // %3
-      "+r"(dst_stride_a),        // %4
-      "+r"(dst_b),               // %5
-      "+r"(dst_stride_b),        // %6
-      "+r"(width)                // %7
-    : "r"(&kVTbl4x4TransposeDi)  // %8
-    : "memory", "cc",
-      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %7, #8                        \n"
+
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
+      "mov         %0, %1                        \n"
+
+      "vld2.8      {d0,  d1},  [%0], %2          \n"
+      "vld2.8      {d2,  d3},  [%0], %2          \n"
+      "vld2.8      {d4,  d5},  [%0], %2          \n"
+      "vld2.8      {d6,  d7},  [%0], %2          \n"
+      "vld2.8      {d16, d17}, [%0], %2          \n"
+      "vld2.8      {d18, d19}, [%0], %2          \n"
+      "vld2.8      {d20, d21}, [%0], %2          \n"
+      "vld2.8      {d22, d23}, [%0]              \n"
+
+      "vtrn.8      q1, q0                        \n"
+      "vtrn.8      q3, q2                        \n"
+      "vtrn.8      q9, q8                        \n"
+      "vtrn.8      q11, q10                      \n"
+
+      "vtrn.16     q1, q3                        \n"
+      "vtrn.16     q0, q2                        \n"
+      "vtrn.16     q9, q11                       \n"
+      "vtrn.16     q8, q10                       \n"
+
+      "vtrn.32     q1, q9                        \n"
+      "vtrn.32     q0, q8                        \n"
+      "vtrn.32     q3, q11                       \n"
+      "vtrn.32     q2, q10                       \n"
+
+      "vrev16.8    q0, q0                        \n"
+      "vrev16.8    q1, q1                        \n"
+      "vrev16.8    q2, q2                        \n"
+      "vrev16.8    q3, q3                        \n"
+      "vrev16.8    q8, q8                        \n"
+      "vrev16.8    q9, q9                        \n"
+      "vrev16.8    q10, q10                      \n"
+      "vrev16.8    q11, q11                      \n"
+
+      "mov         %0, %3                        \n"
+
+      "vst1.8      {d2},  [%0], %4               \n"
+      "vst1.8      {d0},  [%0], %4               \n"
+      "vst1.8      {d6},  [%0], %4               \n"
+      "vst1.8      {d4},  [%0], %4               \n"
+      "vst1.8      {d18}, [%0], %4               \n"
+      "vst1.8      {d16}, [%0], %4               \n"
+      "vst1.8      {d22}, [%0], %4               \n"
+      "vst1.8      {d20}, [%0]                   \n"
+
+      "mov         %0, %5                        \n"
+
+      "vst1.8      {d3},  [%0], %6               \n"
+      "vst1.8      {d1},  [%0], %6               \n"
+      "vst1.8      {d7},  [%0], %6               \n"
+      "vst1.8      {d5},  [%0], %6               \n"
+      "vst1.8      {d19}, [%0], %6               \n"
+      "vst1.8      {d17}, [%0], %6               \n"
+      "vst1.8      {d23}, [%0], %6               \n"
+      "vst1.8      {d21}, [%0]                   \n"
+
+      "add         %1, #8*2                      \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3            \n"  // dst_a += 8 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #3            \n"  // dst_b += 8 *
+                                                      // dst_stride_b
+      "subs        %7,  #8                       \n"  // w     -= 8
+      "bge         1b                            \n"
+
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %7, #8                        \n"
+      "beq         4f                            \n"
+
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
+
+      "cmp         %7, #4                        \n"
+      "blt         2f                            \n"
+
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.64     {d0}, [%0], %2                \n"
+      "vld1.64     {d1}, [%0], %2                \n"
+      "vld1.64     {d2}, [%0], %2                \n"
+      "vld1.64     {d3}, [%0], %2                \n"
+      "vld1.64     {d4}, [%0], %2                \n"
+      "vld1.64     {d5}, [%0], %2                \n"
+      "vld1.64     {d6}, [%0], %2                \n"
+      "vld1.64     {d7}, [%0]                    \n"
+
+      "vld1.8      {q15}, [%8]                   \n"
+
+      "vtrn.8      q0, q1                        \n"
+      "vtrn.8      q2, q3                        \n"
+
+      "vtbl.8      d16, {d0, d1}, d30            \n"
+      "vtbl.8      d17, {d0, d1}, d31            \n"
+      "vtbl.8      d18, {d2, d3}, d30            \n"
+      "vtbl.8      d19, {d2, d3}, d31            \n"
+      "vtbl.8      d20, {d4, d5}, d30            \n"
+      "vtbl.8      d21, {d4, d5}, d31            \n"
+      "vtbl.8      d22, {d6, d7}, d30            \n"
+      "vtbl.8      d23, {d6, d7}, d31            \n"
+
+      "mov         %0, %3                        \n"
+
+      "vst1.32     {d16[0]},  [%0], %4           \n"
+      "vst1.32     {d16[1]},  [%0], %4           \n"
+      "vst1.32     {d17[0]},  [%0], %4           \n"
+      "vst1.32     {d17[1]},  [%0], %4           \n"
+
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d20[0]}, [%0], %4            \n"
+      "vst1.32     {d20[1]}, [%0], %4            \n"
+      "vst1.32     {d21[0]}, [%0], %4            \n"
+      "vst1.32     {d21[1]}, [%0]                \n"
+
+      "mov         %0, %5                        \n"
+
+      "vst1.32     {d18[0]}, [%0], %6            \n"
+      "vst1.32     {d18[1]}, [%0], %6            \n"
+      "vst1.32     {d19[0]}, [%0], %6            \n"
+      "vst1.32     {d19[1]}, [%0], %6            \n"
+
+      "add         %0, %5, #4                    \n"
+      "vst1.32     {d22[0]},  [%0], %6           \n"
+      "vst1.32     {d22[1]},  [%0], %6           \n"
+      "vst1.32     {d23[0]},  [%0], %6           \n"
+      "vst1.32     {d23[1]},  [%0]               \n"
+
+      "add         %1, #4*2                      \n"  // src   += 4 * 2
+      "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs        %7,  #4                       \n"  // w     -= 4
+      "beq         4f                            \n"
+
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
+
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+      "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+      "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+      "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+      "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+      "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+      "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+      "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+
+      "vtrn.8      d0, d1                        \n"
+      "vtrn.8      d2, d3                        \n"
+
+      "mov         %0, %3                        \n"
+
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d2}, [%0]                    \n"
+
+      "mov         %0, %5                        \n"
+
+      "vst1.64     {d1}, [%0], %6                \n"
+      "vst1.64     {d3}, [%0]                    \n"
+
+      "add         %1, #2*2                      \n"  // src   += 2 * 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs        %7,  #2                       \n"  // w     -= 2
+      "beq         4f                            \n"
+
+      // 1x8 block
+      "3:                                        \n"
+      "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+      "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+      "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+      "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+      "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+      "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+      "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+      "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+
+      "vst1.64     {d0}, [%3]                    \n"
+      "vst1.64     {d1}, [%5]                    \n"
+
+      "4:                                        \n"
+
+      : "=&r"(src_temp),           // %0
+        "+r"(src),                 // %1
+        "+r"(src_stride),          // %2
+        "+r"(dst_a),               // %3
+        "+r"(dst_stride_a),        // %4
+        "+r"(dst_b),               // %5
+        "+r"(dst_stride_b),        // %6
+        "+r"(width)                // %7
+      : "r"(&kVTbl4x4TransposeDi)  // %8
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_neon64.cc b/media/libaom/src/third_party/libyuv/source/rotate_neon64.cc
index f52c082b3f..43c1581731 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_neon64.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,519 +21,419 @@ extern "C" {
 // This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) {
-  const uint8* src_temp = NULL;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %3, %3, #8                      \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                          \n"
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
+
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %w3, %w3, #8                  \n"
+
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
       "mov         %0, %1                        \n"
 
-      MEMACCESS(0)
-      "ld1        {v0.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v1.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v2.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v3.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v4.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v5.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v6.8b}, [%0], %5              \n"
-      MEMACCESS(0)
-      "ld1        {v7.8b}, [%0]                  \n"
-
-      "trn2     v16.8b, v0.8b, v1.8b             \n"
-      "trn1     v17.8b, v0.8b, v1.8b             \n"
-      "trn2     v18.8b, v2.8b, v3.8b             \n"
-      "trn1     v19.8b, v2.8b, v3.8b             \n"
-      "trn2     v20.8b, v4.8b, v5.8b             \n"
-      "trn1     v21.8b, v4.8b, v5.8b             \n"
-      "trn2     v22.8b, v6.8b, v7.8b             \n"
-      "trn1     v23.8b, v6.8b, v7.8b             \n"
-
-      "trn2     v3.4h, v17.4h, v19.4h            \n"
-      "trn1     v1.4h, v17.4h, v19.4h            \n"
-      "trn2     v2.4h, v16.4h, v18.4h            \n"
-      "trn1     v0.4h, v16.4h, v18.4h            \n"
-      "trn2     v7.4h, v21.4h, v23.4h            \n"
-      "trn1     v5.4h, v21.4h, v23.4h            \n"
-      "trn2     v6.4h, v20.4h, v22.4h            \n"
-      "trn1     v4.4h, v20.4h, v22.4h            \n"
-
-      "trn2     v21.2s, v1.2s, v5.2s             \n"
-      "trn1     v17.2s, v1.2s, v5.2s             \n"
-      "trn2     v20.2s, v0.2s, v4.2s             \n"
-      "trn1     v16.2s, v0.2s, v4.2s             \n"
-      "trn2     v23.2s, v3.2s, v7.2s             \n"
-      "trn1     v19.2s, v3.2s, v7.2s             \n"
-      "trn2     v22.2s, v2.2s, v6.2s             \n"
-      "trn1     v18.2s, v2.2s, v6.2s             \n"
+      "ld1         {v0.8b}, [%0], %5             \n"
+      "ld1         {v1.8b}, [%0], %5             \n"
+      "ld1         {v2.8b}, [%0], %5             \n"
+      "ld1         {v3.8b}, [%0], %5             \n"
+      "ld1         {v4.8b}, [%0], %5             \n"
+      "ld1         {v5.8b}, [%0], %5             \n"
+      "ld1         {v6.8b}, [%0], %5             \n"
+      "ld1         {v7.8b}, [%0]                 \n"
+      "mov         %0, %1                        \n"
+
+      "trn2        v16.8b, v0.8b, v1.8b          \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "trn1        v17.8b, v0.8b, v1.8b          \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v18.8b, v2.8b, v3.8b          \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 1
+      "trn1        v19.8b, v2.8b, v3.8b          \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v20.8b, v4.8b, v5.8b          \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 2
+      "trn1        v21.8b, v4.8b, v5.8b          \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v22.8b, v6.8b, v7.8b          \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 3
+      "trn1        v23.8b, v6.8b, v7.8b          \n"
+      "add         %0, %0, %5                    \n"
+
+      "trn2        v3.4h, v17.4h, v19.4h         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 4
+      "trn1        v1.4h, v17.4h, v19.4h         \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v2.4h, v16.4h, v18.4h         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 5
+      "trn1        v0.4h, v16.4h, v18.4h         \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v7.4h, v21.4h, v23.4h         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 6
+      "trn1        v5.4h, v21.4h, v23.4h         \n"
+      "add         %0, %0, %5                    \n"
+      "trn2        v6.4h, v20.4h, v22.4h         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // row 7
+      "trn1        v4.4h, v20.4h, v22.4h         \n"
+
+      "trn2        v21.2s, v1.2s, v5.2s          \n"
+      "trn1        v17.2s, v1.2s, v5.2s          \n"
+      "trn2        v20.2s, v0.2s, v4.2s          \n"
+      "trn1        v16.2s, v0.2s, v4.2s          \n"
+      "trn2        v23.2s, v3.2s, v7.2s          \n"
+      "trn1        v19.2s, v3.2s, v7.2s          \n"
+      "trn2        v22.2s, v2.2s, v6.2s          \n"
+      "trn1        v18.2s, v2.2s, v6.2s          \n"
 
       "mov         %0, %2                        \n"
 
-    MEMACCESS(0)
-      "st1      {v17.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v16.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v19.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v18.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v21.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v20.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v23.8b}, [%0], %6               \n"
-    MEMACCESS(0)
-      "st1      {v22.8b}, [%0]                   \n"
+      "st1         {v17.8b}, [%0], %6            \n"
+      "st1         {v16.8b}, [%0], %6            \n"
+      "st1         {v19.8b}, [%0], %6            \n"
+      "st1         {v18.8b}, [%0], %6            \n"
+      "st1         {v21.8b}, [%0], %6            \n"
+      "st1         {v20.8b}, [%0], %6            \n"
+      "st1         {v23.8b}, [%0], %6            \n"
+      "st1         {v22.8b}, [%0]                \n"
 
       "add         %1, %1, #8                    \n"  // src += 8
       "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
-      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "subs        %w3, %w3, #8                  \n"  // w   -= 8
       "b.ge        1b                            \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %3, %3, #8                      \n"
-    "b.eq        4f                              \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
-
-    "cmp         %3, #4                          \n"
-    "b.lt        2f                              \n"
-
-    // 4x8 block
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[3], [%0]                     \n"
-
-    "mov         %0, %2                          \n"
-
-    MEMACCESS(4)
-    "ld1      {v2.16b}, [%4]                     \n"
-
-    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
-    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
-
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "st1 {v3.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[3], [%0]                         \n"
-
-    "add         %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[3], [%0]                         \n"
-
-    "add         %1, %1, #4                      \n"  // src += 4
-    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
-    "subs        %3, %3, #4                      \n"  // w   -= 4
-    "b.eq        4f                              \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
-
-    // 2x8 block
-    "2:                                          \n"
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[3], [%0]                     \n"
-
-    "trn2    v2.8b, v0.8b, v1.8b                 \n"
-    "trn1    v3.8b, v0.8b, v1.8b                 \n"
-
-    "mov         %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1     {v3.8b}, [%0], %6                   \n"
-    MEMACCESS(0)
-    "st1     {v2.8b}, [%0]                       \n"
-
-    "add         %1, %1, #2                      \n"  // src += 2
-    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
-    "subs        %3, %3,  #2                     \n"  // w   -= 2
-    "b.eq        4f                              \n"
-
-    // 1x8 block
-    "3:                                          \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[0], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[1], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[2], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[3], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[4], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[5], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[6], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[7], [%1]                 \n"
-
-    MEMACCESS(2)
-    "st1         {v0.8b}, [%2]                   \n"
-
-    "4:                                          \n"
-
-    : "+r"(src_temp),                             // %0
-      "+r"(src),                                  // %1
-      "+r"(dst),                                  // %2
-      "+r"(width64)                               // %3
-    : "r"(&kVTbl4x4Transpose),                    // %4
-      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
-    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-  );
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %w3, %w3, #8                  \n"
+      "b.eq        4f                            \n"
+
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %w3, #2                       \n"
+      "b.lt        3f                            \n"
+
+      "cmp         %w3, #4                       \n"
+      "b.lt        2f                            \n"
+
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "ld1         {v0.s}[0], [%0], %5           \n"
+      "ld1         {v0.s}[1], [%0], %5           \n"
+      "ld1         {v0.s}[2], [%0], %5           \n"
+      "ld1         {v0.s}[3], [%0], %5           \n"
+      "ld1         {v1.s}[0], [%0], %5           \n"
+      "ld1         {v1.s}[1], [%0], %5           \n"
+      "ld1         {v1.s}[2], [%0], %5           \n"
+      "ld1         {v1.s}[3], [%0]               \n"
+
+      "mov         %0, %2                        \n"
+
+      "ld1         {v2.16b}, [%4]                \n"
+
+      "tbl         v3.16b, {v0.16b}, v2.16b      \n"
+      "tbl         v0.16b, {v1.16b}, v2.16b      \n"
+
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "st1 {v3.s}[0], [%0], %6                     \n"
+      "st1 {v3.s}[1], [%0], %6                     \n"
+      "st1 {v3.s}[2], [%0], %6                     \n"
+      "st1 {v3.s}[3], [%0]                         \n"
+
+      "add         %0, %2, #4                      \n"
+      "st1 {v0.s}[0], [%0], %6                     \n"
+      "st1 {v0.s}[1], [%0], %6                     \n"
+      "st1 {v0.s}[2], [%0], %6                     \n"
+      "st1 {v0.s}[3], [%0]                         \n"
+
+      "add         %1, %1, #4                      \n"  // src += 4
+      "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+      "subs        %w3, %w3, #4                    \n"  // w   -= 4
+      "b.eq        4f                              \n"
+
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %w3, #2                         \n"
+      "b.lt        3f                              \n"
+
+      // 2x8 block
+      "2:                                          \n"
+      "mov         %0, %1                          \n"
+      "ld1     {v0.h}[0], [%0], %5                 \n"
+      "ld1     {v1.h}[0], [%0], %5                 \n"
+      "ld1     {v0.h}[1], [%0], %5                 \n"
+      "ld1     {v1.h}[1], [%0], %5                 \n"
+      "ld1     {v0.h}[2], [%0], %5                 \n"
+      "ld1     {v1.h}[2], [%0], %5                 \n"
+      "ld1     {v0.h}[3], [%0], %5                 \n"
+      "ld1     {v1.h}[3], [%0]                     \n"
+
+      "trn2    v2.8b, v0.8b, v1.8b                 \n"
+      "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+      "mov         %0, %2                          \n"
+
+      "st1     {v3.8b}, [%0], %6                   \n"
+      "st1     {v2.8b}, [%0]                       \n"
+
+      "add         %1, %1, #2                      \n"  // src += 2
+      "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+      "subs        %w3, %w3,  #2                   \n"  // w   -= 2
+      "b.eq        4f                              \n"
+
+      // 1x8 block
+      "3:                                          \n"
+      "ld1         {v0.b}[0], [%1], %5             \n"
+      "ld1         {v0.b}[1], [%1], %5             \n"
+      "ld1         {v0.b}[2], [%1], %5             \n"
+      "ld1         {v0.b}[3], [%1], %5             \n"
+      "ld1         {v0.b}[4], [%1], %5             \n"
+      "ld1         {v0.b}[5], [%1], %5             \n"
+      "ld1         {v0.b}[6], [%1], %5             \n"
+      "ld1         {v0.b}[7], [%1]                 \n"
+
+      "st1         {v0.8b}, [%2]                   \n"
+
+      "4:                                          \n"
+
+      : "=&r"(src_temp),                          // %0
+        "+r"(src),                                // %1
+        "+r"(dst),                                // %2
+        "+r"(width)                               // %3
+      : "r"(&kVTbl4x4Transpose),                  // %4
+        "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride))   // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
 
-static uint8 kVTbl4x4TransposeDi[32] =
-  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
-    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+static const uint8_t kVTbl4x4TransposeDi[32] = {
+    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp = NULL;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub       %4, %4, #8                      \n"
-
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-    "mov       %0, %1                          \n"
-
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v1.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v2.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v3.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v4.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v5.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v6.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v7.16b}, [%0]                  \n"
-
-    "trn1      v16.16b, v0.16b, v1.16b         \n"
-    "trn2      v17.16b, v0.16b, v1.16b         \n"
-    "trn1      v18.16b, v2.16b, v3.16b         \n"
-    "trn2      v19.16b, v2.16b, v3.16b         \n"
-    "trn1      v20.16b, v4.16b, v5.16b         \n"
-    "trn2      v21.16b, v4.16b, v5.16b         \n"
-    "trn1      v22.16b, v6.16b, v7.16b         \n"
-    "trn2      v23.16b, v6.16b, v7.16b         \n"
-
-    "trn1      v0.8h, v16.8h, v18.8h           \n"
-    "trn2      v1.8h, v16.8h, v18.8h           \n"
-    "trn1      v2.8h, v20.8h, v22.8h           \n"
-    "trn2      v3.8h, v20.8h, v22.8h           \n"
-    "trn1      v4.8h, v17.8h, v19.8h           \n"
-    "trn2      v5.8h, v17.8h, v19.8h           \n"
-    "trn1      v6.8h, v21.8h, v23.8h           \n"
-    "trn2      v7.8h, v21.8h, v23.8h           \n"
-
-    "trn1      v16.4s, v0.4s, v2.4s            \n"
-    "trn2      v17.4s, v0.4s, v2.4s            \n"
-    "trn1      v18.4s, v1.4s, v3.4s            \n"
-    "trn2      v19.4s, v1.4s, v3.4s            \n"
-    "trn1      v20.4s, v4.4s, v6.4s            \n"
-    "trn2      v21.4s, v4.4s, v6.4s            \n"
-    "trn1      v22.4s, v5.4s, v7.4s            \n"
-    "trn2      v23.4s, v5.4s, v7.4s            \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v16.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v16.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[1], [%0]                \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v20.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v20.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[1], [%0]                \n"
-
-    "add       %1, %1, #16                     \n"  // src   += 8*2
-    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
-    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
-    "subs      %4, %4,  #8                     \n"  // w     -= 8
-    "b.ge      1b                              \n"
-
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds      %4, %4, #8                      \n"
-    "b.eq      4f                              \n"
-
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
-
-    "cmp       %4, #4                          \n"
-    "b.lt      2f                              \n"
-
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1       {v0.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v1.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v2.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v3.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v4.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v5.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v6.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v7.8b}, [%0]                   \n"
-
-    MEMACCESS(8)
-    "ld1       {v30.16b}, [%8], #16            \n"
-    "ld1       {v31.16b}, [%8]                 \n"
-
-    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
-    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
-    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
-    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v16.s}[0],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[1],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[2],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[3],  [%0], %6           \n"
-
-    "add       %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[2], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[3], [%0]                \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v17.s}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[2], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[3], [%0], %7            \n"
-
-    "add       %0, %3, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[0],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[1],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[2],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[3],  [%0]               \n"
-
-    "add       %1, %1, #8                      \n"  // src   += 4 * 2
-    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
-    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
-    "subs      %4,  %4,  #4                    \n"  // w     -= 4
-    "b.eq      4f                              \n"
-
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
-
-    // 2x8 block
-    "2:                                        \n"
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[3], [%0]           \n"
-
-    "trn1      v4.8b, v0.8b, v2.8b             \n"
-    "trn2      v5.8b, v0.8b, v2.8b             \n"
-    "trn1      v6.8b, v1.8b, v3.8b             \n"
-    "trn2      v7.8b, v1.8b, v3.8b             \n"
-
-    "mov       %0, %2                          \n"
-
-    MEMACCESS(0)
-    "st1       {v4.d}[0], [%0], %6             \n"
-    MEMACCESS(0)
-    "st1       {v6.d}[0], [%0]                 \n"
-
-    "mov       %0, %3                          \n"
-
-    MEMACCESS(0)
-    "st1       {v5.d}[0], [%0], %7             \n"
-    MEMACCESS(0)
-    "st1       {v7.d}[0], [%0]                 \n"
-
-    "add       %1, %1, #4                      \n"  // src   += 2 * 2
-    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
-    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
-    "subs      %4,  %4,  #2                    \n"  // w     -= 2
-    "b.eq      4f                              \n"
-
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[7], [%1]           \n"
-
-    MEMACCESS(2)
-    "st1       {v0.d}[0], [%2]                 \n"
-    MEMACCESS(3)
-    "st1       {v1.d}[0], [%3]                 \n"
-
-    "4:                                        \n"
-
-    : "+r"(src_temp),                             // %0
-      "+r"(src),                                  // %1
-      "+r"(dst_a),                                // %2
-      "+r"(dst_b),                                // %3
-      "+r"(width64)                               // %4
-    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
-      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
-      "r"(&kVTbl4x4TransposeDi)                   // %8
-    : "memory", "cc",
-      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-      "v30", "v31"
-  );
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %w4, %w4, #8                  \n"
+
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
+      "mov         %0, %1                        \n"
+
+      "ld1         {v0.16b}, [%0], %5            \n"
+      "ld1         {v1.16b}, [%0], %5            \n"
+      "ld1         {v2.16b}, [%0], %5            \n"
+      "ld1         {v3.16b}, [%0], %5            \n"
+      "ld1         {v4.16b}, [%0], %5            \n"
+      "ld1         {v5.16b}, [%0], %5            \n"
+      "ld1         {v6.16b}, [%0], %5            \n"
+      "ld1         {v7.16b}, [%0]                \n"
+      "mov         %0, %1                        \n"
+
+      "trn1        v16.16b, v0.16b, v1.16b       \n"
+      "trn2        v17.16b, v0.16b, v1.16b       \n"
+      "trn1        v18.16b, v2.16b, v3.16b       \n"
+      "trn2        v19.16b, v2.16b, v3.16b       \n"
+      "trn1        v20.16b, v4.16b, v5.16b       \n"
+      "trn2        v21.16b, v4.16b, v5.16b       \n"
+      "trn1        v22.16b, v6.16b, v7.16b       \n"
+      "trn2        v23.16b, v6.16b, v7.16b       \n"
+
+      "trn1        v0.8h, v16.8h, v18.8h         \n"
+      "trn2        v1.8h, v16.8h, v18.8h         \n"
+      "trn1        v2.8h, v20.8h, v22.8h         \n"
+      "trn2        v3.8h, v20.8h, v22.8h         \n"
+      "trn1        v4.8h, v17.8h, v19.8h         \n"
+      "trn2        v5.8h, v17.8h, v19.8h         \n"
+      "trn1        v6.8h, v21.8h, v23.8h         \n"
+      "trn2        v7.8h, v21.8h, v23.8h         \n"
+
+      "trn1        v16.4s, v0.4s, v2.4s          \n"
+      "trn2        v17.4s, v0.4s, v2.4s          \n"
+      "trn1        v18.4s, v1.4s, v3.4s          \n"
+      "trn2        v19.4s, v1.4s, v3.4s          \n"
+      "trn1        v20.4s, v4.4s, v6.4s          \n"
+      "trn2        v21.4s, v4.4s, v6.4s          \n"
+      "trn1        v22.4s, v5.4s, v7.4s          \n"
+      "trn2        v23.4s, v5.4s, v7.4s          \n"
+
+      "mov         %0, %2                        \n"
+
+      "st1         {v16.d}[0], [%0], %6          \n"
+      "st1         {v18.d}[0], [%0], %6          \n"
+      "st1         {v17.d}[0], [%0], %6          \n"
+      "st1         {v19.d}[0], [%0], %6          \n"
+      "st1         {v16.d}[1], [%0], %6          \n"
+      "st1         {v18.d}[1], [%0], %6          \n"
+      "st1         {v17.d}[1], [%0], %6          \n"
+      "st1         {v19.d}[1], [%0]              \n"
+
+      "mov         %0, %3                        \n"
+
+      "st1         {v20.d}[0], [%0], %7          \n"
+      "st1         {v22.d}[0], [%0], %7          \n"
+      "st1         {v21.d}[0], [%0], %7          \n"
+      "st1         {v23.d}[0], [%0], %7          \n"
+      "st1         {v20.d}[1], [%0], %7          \n"
+      "st1         {v22.d}[1], [%0], %7          \n"
+      "st1         {v21.d}[1], [%0], %7          \n"
+      "st1         {v23.d}[1], [%0]              \n"
+
+      "add         %1, %1, #16                   \n"  // src   += 8*2
+      "add         %2, %2, %6, lsl #3            \n"  // dst_a += 8 *
+                                                      // dst_stride_a
+      "add         %3, %3, %7, lsl #3            \n"  // dst_b += 8 *
+                                                      // dst_stride_b
+      "subs        %w4, %w4,  #8                 \n"  // w     -= 8
+      "b.ge        1b                            \n"
+
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %w4, %w4, #8                  \n"
+      "b.eq        4f                            \n"
+
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %w4, #2                       \n"
+      "b.lt        3f                            \n"
+
+      "cmp         %w4, #4                       \n"
+      "b.lt        2f                            \n"
+
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov       %0, %1                          \n"
+      "ld1       {v0.8b}, [%0], %5               \n"
+      "ld1       {v1.8b}, [%0], %5               \n"
+      "ld1       {v2.8b}, [%0], %5               \n"
+      "ld1       {v3.8b}, [%0], %5               \n"
+      "ld1       {v4.8b}, [%0], %5               \n"
+      "ld1       {v5.8b}, [%0], %5               \n"
+      "ld1       {v6.8b}, [%0], %5               \n"
+      "ld1       {v7.8b}, [%0]                   \n"
+
+      "ld1       {v30.16b}, [%8], #16            \n"
+      "ld1       {v31.16b}, [%8]                 \n"
+
+      "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+      "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+      "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+      "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+      "mov       %0, %2                          \n"
+
+      "st1       {v16.s}[0],  [%0], %6           \n"
+      "st1       {v16.s}[1],  [%0], %6           \n"
+      "st1       {v16.s}[2],  [%0], %6           \n"
+      "st1       {v16.s}[3],  [%0], %6           \n"
+
+      "add       %0, %2, #4                      \n"
+      "st1       {v18.s}[0], [%0], %6            \n"
+      "st1       {v18.s}[1], [%0], %6            \n"
+      "st1       {v18.s}[2], [%0], %6            \n"
+      "st1       {v18.s}[3], [%0]                \n"
+
+      "mov       %0, %3                          \n"
+
+      "st1       {v17.s}[0], [%0], %7            \n"
+      "st1       {v17.s}[1], [%0], %7            \n"
+      "st1       {v17.s}[2], [%0], %7            \n"
+      "st1       {v17.s}[3], [%0], %7            \n"
+
+      "add       %0, %3, #4                      \n"
+      "st1       {v19.s}[0],  [%0], %7           \n"
+      "st1       {v19.s}[1],  [%0], %7           \n"
+      "st1       {v19.s}[2],  [%0], %7           \n"
+      "st1       {v19.s}[3],  [%0]               \n"
+
+      "add       %1, %1, #8                      \n"  // src   += 4 * 2
+      "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #4                  \n"  // w     -= 4
+      "b.eq      4f                              \n"
+
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp       %w4, #2                         \n"
+      "b.lt      3f                              \n"
+
+      // 2x8 block
+      "2:                                        \n"
+      "mov       %0, %1                          \n"
+      "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+      "trn1      v4.8b, v0.8b, v2.8b             \n"
+      "trn2      v5.8b, v0.8b, v2.8b             \n"
+      "trn1      v6.8b, v1.8b, v3.8b             \n"
+      "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+      "mov       %0, %2                          \n"
+
+      "st1       {v4.d}[0], [%0], %6             \n"
+      "st1       {v6.d}[0], [%0]                 \n"
+
+      "mov       %0, %3                          \n"
+
+      "st1       {v5.d}[0], [%0], %7             \n"
+      "st1       {v7.d}[0], [%0]                 \n"
+
+      "add       %1, %1, #4                      \n"  // src   += 2 * 2
+      "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #2                  \n"  // w     -= 2
+      "b.eq      4f                              \n"
+
+      // 1x8 block
+      "3:                                        \n"
+      "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+      "st1       {v0.d}[0], [%2]                 \n"
+      "st1       {v1.d}[0], [%3]                 \n"
+
+      "4:                                        \n"
+
+      : "=&r"(src_temp),                            // %0
+        "+r"(src),                                  // %1
+        "+r"(dst_a),                                // %2
+        "+r"(dst_b),                                // %3
+        "+r"(width)                                 // %4
+      : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+        "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+        "r"(&kVTbl4x4TransposeDi)                   // %8
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
diff --git a/media/libaom/src/third_party/libyuv/source/rotate_win.cc b/media/libaom/src/third_party/libyuv/source/rotate_win.cc
index 2760066dfd..e887dd525c 100644
--- a/media/libaom/src/third_party/libyuv/source/rotate_win.cc
+++ b/media/libaom/src/third_party/libyuv/source/rotate_win.cc
@@ -8,27 +8,28 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    defined(_MSC_VER) && !defined(__clang__)
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
+__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
+                                          int src_stride,
+                                          uint8_t* dst,
+                                          int dst_stride,
+                                          int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       eax, [esp + 12 + 4]  // src
+    mov       edi, [esp + 12 + 8]  // src_stride
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
@@ -111,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   }
 }
 
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w) {
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
+                                           int src_stride,
+                                           uint8_t* dst_a,
+                                           int dst_stride_a,
+                                           uint8_t* dst_b,
+                                           int dst_stride_b,
+                                           int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
     push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       eax, [esp + 16 + 4]  // src
+    mov       edi, [esp + 16 + 8]  // src_stride
     mov       edx, [esp + 16 + 12]  // dst_a
     mov       esi, [esp + 16 + 16]  // dst_stride_a
     mov       ebx, [esp + 16 + 20]  // dst_b
@@ -134,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     mov       ecx, [ecx + 16 + 28]  // w
 
     align      4
- convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
+  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + edi]
     lea       eax, [eax + 2 * edi]
@@ -163,13 +166,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       eax, [eax + 2 * edi]
     movdqu    [esp], xmm5  // backup xmm5
     neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    movdqa    xmm5, xmm6  // use xmm5 as temp register.
     punpcklbw xmm6, xmm7
     punpckhbw xmm5, xmm7
     movdqa    xmm7, xmm5
     lea       eax, [eax + 8 * edi + 16]
     neg       edi
-    // Second round of bit swap.
+        // Second round of bit swap.
     movdqa    xmm5, xmm0
     punpcklwd xmm0, xmm2
     punpckhwd xmm5, xmm2
@@ -184,12 +187,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     movdqa    xmm6, xmm5
     movdqu    xmm5, [esp]  // restore xmm5
     movdqu    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    movdqa    xmm6, xmm5  // use xmm6 as temp register.
     punpcklwd xmm5, xmm7
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
+
+        // Third round of bit swap.
+        // Write to the destination pointer.
     movdqa    xmm6, xmm0
     punpckldq xmm0, xmm4
     punpckhdq xmm6, xmm4
@@ -201,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm4
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
     punpckldq xmm2, xmm6
     movlpd    qword ptr [edx], xmm2
     movhpd    qword ptr [ebx], xmm2
@@ -210,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
     punpckldq xmm1, xmm5
     movlpd    qword ptr [edx], xmm1
     movhpd    qword ptr [ebx], xmm1
@@ -219,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
     punpckldq xmm3, xmm7
     movlpd    qword ptr [edx], xmm3
     movhpd    qword ptr [ebx], xmm3
diff --git a/media/libaom/src/third_party/libyuv/source/row_any.cc b/media/libaom/src/third_party/libyuv/source/row_any.cc
index 1cb1f6b930..7216373bcd 100644
--- a/media/libaom/src/third_party/libyuv/source/row_any.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_any.cc
@@ -19,148 +19,271 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// memset for temp is meant to clear the source buffer (not dest) so that
+// SIMD that reads full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by examining
+// the source code for how much the source pointers are advanced.
+
 // Subsampled source needs to be increase by 1 of not even.
 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
 
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)              \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,                   \
+               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {         \
+    SIMD_ALIGNED(uint8_t temp[64 * 5]);                                      \
+    memset(temp, 0, 64 * 4); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+    }                                                                        \
+    memcpy(temp, y_buf + n, r);                                              \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+    memcpy(temp + 192, a_buf + n, r);                                        \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+             yuvconstants, MASK + 1);                                        \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+           SS(r, DUVSHIFT) * BPP);                                           \
+  }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MMI
+ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
 // Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,          \
+               const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 4]);                             \
+    memset(temp, 0, 64 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                           \
+    int n = width & ~MASK;                                          \
+    if (n > 0) {                                                    \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                    \
+    }                                                               \
+    memcpy(temp, y_buf + n, r);                                     \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));     \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,             \
+           SS(r, DUVSHIFT) * BPP);                                  \
+  }
 
-#ifdef HAS_I422TOARGBROW_SSSE3
-ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
 #endif
-#ifdef HAS_I444TOARGBROW_SSSE3
-ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
-ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
-ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
-ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
-ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_MMI
+ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#endif
+#ifdef HAS_I422TOYUY2ROW_SSE2
 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
 ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
-#endif  // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
 #endif
-#ifdef HAS_I422TORAWROW_AVX2
-ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
 #endif
-#ifdef HAS_J422TOARGBROW_SSSE3
-ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#ifdef HAS_I422TOYUY2ROW_MMI
+ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
 #endif
-#ifdef HAS_J422TOARGBROW_AVX2
-ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
-#ifdef HAS_I422TOARGBROW_AVX2
-ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
 #endif
-#ifdef HAS_I422TOBGRAROW_AVX2
-ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
+#ifdef HAS_I422TOUYVYROW_MMI
+ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
 #endif
-#ifdef HAS_I422TORGBAROW_AVX2
-ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif
-#ifdef HAS_I422TOABGRROW_AVX2
-ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
 #endif
-#ifdef HAS_I444TOARGBROW_AVX2
-ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#ifdef HAS_BLENDPLANEROW_MMI
+ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,           \
+               const uint8_t* v_buf, uint8_t* dst_ptr,               \
+               const struct YuvConstants* yuvconstants, int width) { \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                             \
+    memset(temp, 0, 128 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                            \
+    int n = width & ~MASK;                                           \
+    if (n > 0) {                                                     \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);       \
+    }                                                                \
+    memcpy(temp, y_buf + n, r);                                      \
+    memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    if (width & 1) {                                                 \
+      temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];   \
+      temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1];   \
+    }                                                                \
+    ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
+             MASK + 1);                                              \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384,              \
+           SS(r, DUVSHIFT) * BPP);                                   \
+  }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
+#endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
 #endif
 #ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TORGB565ROW_AVX2
-ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGBROW_NEON
-ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
-ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
-ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
-ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
-ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
-ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
-ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
-ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
-ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
-ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TOYUY2ROW_NEON
-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOUYVYROW_NEON
-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
-#endif
-#undef ANY31
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MMI
+ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
+#endif
+#undef ANY31C
 
-// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf,            \
+               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+               int width) {                                               \
+    SIMD_ALIGNED(T temp[16 * 3]);                                         \
+    SIMD_ALIGNED(uint8_t out[64]);                                        \
+    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);            \
+    }                                                                     \
+    memcpy(temp, y_buf + n, r * SBPP);                                    \
+    memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);  \
+  }
 
-// Biplanar to RGB.
-#ifdef HAS_NV12TOARGBROW_SSSE3
-ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
 #endif
-#ifdef HAS_NV12TOARGBROW_AVX2
-ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
 #endif
-#ifdef HAS_NV12TOARGBROW_NEON
-ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
-ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
 #endif
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
-ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
 #endif
-#ifdef HAS_NV12TORGB565ROW_AVX2
-ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-#endif
-#ifdef HAS_NV12TORGB565ROW_NEON
-ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
-ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#ifdef HAS_I210TOARGBROW_MMI
+ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
 #endif
+#undef ANY31CT
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               int width) {                                                   \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                       \
+    memset(temp, 0, 64 * 2); /* for msan */                                   \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                    \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                        \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                          \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                           \
+  }
 
 // Merge functions.
 #ifdef HAS_MERGEUVROW_SSE2
@@ -172,7 +295,18 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
-
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MMI
+ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -201,40 +335,166 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
 #ifdef HAS_ARGBSUBTRACTROW_NEON
 ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MMI
+ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_MMI
+ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MMI
+ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#endif
 #ifdef HAS_SOBELROW_SSE2
 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELROW_NEON
 ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_MMI
+ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#endif
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
 #endif
 #ifdef HAS_SOBELTOPLANEROW_NEON
 ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
 #endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MMI
+ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#endif
 #ifdef HAS_SOBELXYROW_SSE2
 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELXYROW_NEON
 ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_MMI
+ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#endif
 #undef ANY21
 
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {          \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                      \
+    memset(temp, 0, 128 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                      \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1);           \
+    memcpy(dst_ptr + n * BPP, temp + 256, r * BPP);                           \
+  }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MMI
+ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MMI
+ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_MMI
+ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_MMI
+ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MMI
+ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
 // Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 
 #ifdef HAS_COPYROW_AVX
 ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@@ -252,32 +512,53 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
 ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
 ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
 #endif
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
 ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
 #if defined(HAS_J400TOARGBROW_SSE2)
 ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
 #if defined(HAS_J400TOARGBROW_AVX2)
 ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
 #endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
-ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
 ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
 ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
 ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
 ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
 ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
 #endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
 #if defined(HAS_RGB565TOARGBROW_AVX2)
 ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
 #endif
@@ -287,10 +568,6 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
 #if defined(HAS_ARGB4444TOARGBROW_AVX2)
 ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
 #endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
-ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
-#endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
 ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
 ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
@@ -298,16 +575,44 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
 ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
 ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
-ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
+ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
 #endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
 #endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
 #ifdef HAS_ARGBTOYJROW_AVX2
 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
 #endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
 #ifdef HAS_UYVYTOYROW_AVX2
 ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
 #endif
@@ -327,63 +632,198 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
 #ifdef HAS_ARGBTOYJROW_SSSE3
 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
 #endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ARGBTOYROW_NEON
 ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_MMI
+ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#endif
 #ifdef HAS_ARGBTOYJROW_NEON
 ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_MMI
+ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#endif
 #ifdef HAS_BGRATOYROW_NEON
 ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_MMI
+ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#endif
 #ifdef HAS_ABGRTOYROW_NEON
 ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_MMI
+ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#endif
 #ifdef HAS_RGBATOYROW_NEON
 ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_MMI
+ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#endif
 #ifdef HAS_RGB24TOYROW_NEON
 ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_MMI
+ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#endif
 #ifdef HAS_RAWTOYROW_NEON
 ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_MMI
+ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#endif
 #ifdef HAS_RGB565TOYROW_NEON
 ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_MMI
+ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#endif
 #ifdef HAS_ARGB1555TOYROW_NEON
 ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MMI
+ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#endif
 #ifdef HAS_ARGB4444TOYROW_NEON
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_ARGB4444TOYROW_MMI
+ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
+#endif
 #ifdef HAS_YUY2TOYROW_NEON
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
 #ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_MMI
+ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MMI
+ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
 #endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MMI
+ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#endif
 #ifdef HAS_RAWTOARGBROW_NEON
 ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_MMI
+ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#endif
 #ifdef HAS_RGB565TOARGBROW_NEON
 ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MMI
+ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
 #ifdef HAS_ARGB1555TOARGBROW_NEON
 ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MMI
+ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
 #ifdef HAS_ARGB4444TOARGBROW_NEON
 ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MMI
+ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
 #endif
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
-#endif
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
 #endif
@@ -396,67 +836,336 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
 #ifdef HAS_ARGBATTENUATEROW_NEON
 ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MMI
+ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
+ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#endif
 #undef ANY11
 
+// Any 1 to 1 blended.  Destination is read, modify, write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64 * 2); /* for msan */                               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    memcpy(temp + 64, dst_ptr + n * BPP, r * BPP);                        \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                        \
+  }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_MMI
+ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
+ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
 // Any 1 to 1 with parameter.
 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 T shuffler, int width) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
-      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
-    }
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                        \
+    memset(temp, 0, 64); /* for msan */                                        \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                                    \
+    }                                                                          \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                                \
+    ANY_SIMD(temp, temp + 64, param, MASK + 1);                                \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                             \
+  }
+
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+       I400ToARGBRow_SSE2,
+       const struct YuvConstants*,
+       1,
+       4,
+       7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+       I400ToARGBRow_AVX2,
+       const struct YuvConstants*,
+       1,
+       4,
+       15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+       I400ToARGBRow_NEON,
+       const struct YuvConstants*,
+       1,
+       4,
+       7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+       I400ToARGBRow_MSA,
+       const struct YuvConstants*,
+       1,
+       4,
+       15)
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ANY11P(I400ToARGBRow_Any_MMI,
+       I400ToARGBRow_MMI,
+       const struct YuvConstants*,
+       1,
+       4,
+       7)
+#endif
 
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
-       const uint32, 4, 2, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+       ARGBToRGB565DitherRow_AVX2,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
-       const uint32, 4, 2, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+       ARGBToRGB565DitherRow_NEON,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+       ARGBToRGB565DitherRow_MSA,
+       const uint32_t,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ANY11P(ARGBToRGB565DitherRow_Any_MMI,
+       ARGBToRGB565DitherRow_MMI,
+       const uint32_t,
+       4,
+       2,
+       3)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBSHUFFLEROW_MMI
+ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#endif
+#undef ANY11P
 #undef ANY11P
 
+// Any 1 to 1 with parameter and shorts.  BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
+  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(STYPE temp[32]);                                            \
+    SIMD_ALIGNED(DTYPE out[32]);                                             \
+    memset(temp, 0, 32 * SBPP); /* for msan */                               \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                  \
+    }                                                                        \
+    memcpy(temp, src_ptr + n, r * SBPP);                                     \
+    ANY_SIMD(temp, out, scale, MASK + 1);                                    \
+    memcpy(dst_ptr + n, out, r * BPP);                                       \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+       Convert16To8Row_SSSE3,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+       Convert16To8Row_AVX2,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+       Convert8To16Row_SSE2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+       Convert8To16Row_AVX2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with parameter and shorts to byte.  BPP measures in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK)             \
+  void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+    SIMD_ALIGNED(ST temp[32]);                                          \
+    SIMD_ALIGNED(T out[32]);                                            \
+    memset(temp, 0, SBPP * 32); /* for msan */                          \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                             \
+    }                                                                   \
+    memcpy(temp, src_ptr + n, r * SBPP);                                \
+    ANY_SIMD(temp, out, param, MASK + 1);                               \
+    memcpy(dst_ptr + n, out, r * BPP);                                  \
+  }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+         HalfFloat1Row_F16C,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
+#endif
+#undef ANY11P16
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr,                  \
+               const struct YuvConstants* yuvconstants, int width) {      \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                        \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 ptrdiff_t src_stride_ptr, int width,                          \
-                 int source_y_fraction) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
-      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                           \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr,                     \
+               ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                      \
+    memset(temp, 0, 64 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+    }                                                                        \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+    memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
+    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+  }
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
@@ -464,31 +1173,31 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
 #ifdef HAS_INTERPOLATEROW_SSSE3
 ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
 #endif
-#ifdef HAS_INTERPOLATEROW_SSE2
-ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
-#endif
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
 #endif
-#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_MMI
+ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
 #endif
 #undef ANY11T
 
 // Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr, r * BPP);                                          \
-      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
-      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
-    }
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                              \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64); /* for msan */                                   \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                            \
+    }                                                                     \
+    memcpy(temp, src_ptr, r* BPP);                                        \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+  }
 
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
@@ -496,11 +1205,26 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #ifdef HAS_MIRRORROW_SSSE3
 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
 #endif
-#ifdef HAS_MIRRORROW_SSE2
-ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
-#endif
 #ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
+#ifdef HAS_MIRRORROW_MMI
+ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
 #endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
@@ -509,53 +1233,69 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #endif
 #ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MMI
+ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
 #endif
 #undef ANY11M
 
 // Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
-    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
-      SIMD_ALIGNED(uint8 temp[64]);                                            \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, v32, n);                                             \
-      }                                                                        \
-      ANY_SIMD(temp, v32, MASK + 1);                                           \
-      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
-    }
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)        \
+  void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64]);                  \
+    memset(temp, 0, 64); /* for msan */              \
+    int r = width & MASK;                            \
+    int n = width & ~MASK;                           \
+    if (n > 0) {                                     \
+      ANY_SIMD(dst_ptr, v32, n);                     \
+    }                                                \
+    ANY_SIMD(temp, v32, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp, r * BPP);        \
+  }
 
 #ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
 #endif
 #ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
 #endif
 #ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MMI
+ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3)
 #endif
 #undef ANY1
 
 // Any 1 to 2.  Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
-      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
-      memset(temp, 0, 128);  /* for msan */                                    \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
-      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
-      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
-    }
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)          \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,  \
+               int width) {                                             \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                \
+    memset(temp, 0, 128); /* for msan */                                \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n);                               \
+    }                                                                   \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                   \
+    memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));       \
+    memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));       \
+  }
 
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
@@ -566,8 +1306,11 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #ifdef HAS_SPLITUVROW_NEON
 ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
 #endif
-#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_MMI
+ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
 #endif
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -576,51 +1319,90 @@ ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
 ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
 ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
 #endif
-#ifdef HAS_ARGBTOUV422ROW_SSSE3
-ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
-#endif
 #ifdef HAS_YUY2TOUV422ROW_SSE2
 ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
 #endif
 #ifdef HAS_YUY2TOUV422ROW_NEON
 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MMI
+ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
+ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
+ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#endif
 #undef ANY12
 
+// Any 1 to 3.  Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g,     \
+               uint8_t* dst_b, int width) {                                \
+    SIMD_ALIGNED(uint8_t temp[16 * 6]);                                    \
+    memset(temp, 0, 16 * 3); /* for msan */                                \
+    int r = width & MASK;                                                  \
+    int n = width & ~MASK;                                                 \
+    if (n > 0) {                                                           \
+      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n);                           \
+    }                                                                      \
+    memcpy(temp, src_ptr + n * BPP, r * BPP);                              \
+    ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
+    memcpy(dst_r + n, temp + 16 * 3, r);                                   \
+    memcpy(dst_g + n, temp + 16 * 4, r);                                   \
+    memcpy(dst_b + n, temp + 16 * 5, r);                                   \
+  }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_MMI
+ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#endif
+
 // Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
 // 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
-      memset(temp, 0, 128 * 2);  /* for msan */                                \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
-             SS(r, UVSHIFT) * BPP);                                            \
-      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \
-        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4);                    \
-      }                                                                        \
-      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
-      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
-      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
-    }
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u,   \
+               uint8_t* dst_v, int width) {                                  \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                                     \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+    memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+    memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+  }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
 #endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
 ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
@@ -639,41 +1421,141 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_MMI
+ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#endif
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MMI
+ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#endif
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MMI
+ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
+#endif
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MMI
+ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
+#endif
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MMI
+ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
+#endif
 #ifdef HAS_RGB24TOUVROW_NEON
 ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MMI
+ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#endif
 #ifdef HAS_RAWTOUVROW_NEON
 ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MMI
+ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#endif
 #ifdef HAS_RGB565TOUVROW_NEON
 ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MMI
+ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB1555TOUVROW_NEON
 ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MMI
+ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB4444TOUVROW_NEON
 ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_ARGB4444TOUVROW_MMI
+ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
+#endif
 #ifdef HAS_YUY2TOUVROW_NEON
 ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
 #endif
 #ifdef HAS_UYVYTOUVROW_NEON
 ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
 #endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_MMI
+ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MMI
+ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#endif
 #undef ANY12S
 
+// Any 1 to 1 with source stride (2 rows of source).  Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu,  \
+               int width) {                                                  \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                     \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n);                          \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, MASK + 1);                               \
+    memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2);                 \
+  }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/row_common.cc b/media/libaom/src/third_party/libyuv/source/row_common.cc
index 49875894fe..79aed5c787 100644
--- a/media/libaom/src/third_party/libyuv/source/row_common.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_common.cc
@@ -10,72 +10,97 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h"  // For kYuvI601Constants
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
+// The following ifdef from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+#define LIBYUV_RGB7 1
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+    defined(_M_IX86)
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+
 // llvm x86 is poor at ternary operator, so use branchless min/max.
 
 #define USE_BRANCHLESS 1
 #if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
-  return ((-(v) >> 31) & (v));
+static __inline int32_t clamp0(int32_t v) {
+  return -(v >= 0) & v;
 }
-
-static __inline int32 clamp255(int32 v) {
-  return (((255 - (v)) >> 31) | (v)) & 255;
+// TODO(fbarchard): make clamp255 preserve negative values.
+static __inline int32_t clamp255(int32_t v) {
+  return (-(v >= 255) | v) & 255;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (-(v >= 1023) | v) & 1023;
 }
 
-static __inline uint32 Abs(int32 v) {
-  int m = v >> 31;
+static __inline uint32_t Abs(int32_t v) {
+  int m = -(v < 0);
   return (v + m) ^ m;
 }
-#else  // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+#else   // USE_BRANCHLESS
+static __inline int32_t clamp0(int32_t v) {
   return (v < 0) ? 0 : v;
 }
 
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
   return (v > 255) ? 255 : v;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (v > 1023) ? 1023 : v;
 }
 
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
   return (v < 0) ? -v : v;
 }
 #endif  // USE_BRANCHLESS
+static __inline uint32_t Clamp(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp255(v));
+}
+
+static __inline uint32_t Clamp10(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp1023(v));
+}
 
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
+// Little Endian
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define WRITEWORD(p, v) *(uint32_t*)(p) = v
 #else
-static inline void WRITEWORD(uint8* p, uint32 v) {
-  p[0] = (uint8)(v & 255);
-  p[1] = (uint8)((v >> 8) & 255);
-  p[2] = (uint8)((v >> 16) & 255);
-  p[3] = (uint8)((v >> 24) & 255);
+static inline void WRITEWORD(uint8_t* p, uint32_t v) {
+  p[0] = (uint8_t)(v & 255);
+  p[1] = (uint8_t)((v >> 8) & 255);
+  p[2] = (uint8_t)((v >> 16) & 255);
+  p[3] = (uint8_t)((v >> 24) & 255);
 }
 #endif
 
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb24[0];
-    uint8 g = src_rgb24[1];
-    uint8 r = src_rgb24[2];
+    uint8_t b = src_rgb24[0];
+    uint8_t g = src_rgb24[1];
+    uint8_t r = src_rgb24[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -85,12 +110,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
   }
 }
 
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -100,12 +125,43 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
+    dst_rgba[0] = 255u;
+    dst_rgba[1] = b;
+    dst_rgba[2] = g;
+    dst_rgba[3] = r;
+    dst_rgba += 4;
+    src_raw += 3;
+  }
+}
+
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
+    dst_rgb24[0] = b;
+    dst_rgb24[1] = g;
+    dst_rgb24[2] = r;
+    dst_rgb24 += 3;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
+                       uint8_t* dst_argb,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 2) | (g >> 4);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -115,14 +171,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 a = src_argb1555[1] >> 7;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t a = src_argb1555[1] >> 7;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 3) | (g >> 2);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -132,14 +189,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
-    uint8 a = src_argb4444[1] >> 4;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
+    uint8_t a = src_argb4444[1] >> 4;
     dst_argb[0] = (b << 4) | b;
     dst_argb[1] = (g << 4) | g;
     dst_argb[2] = (r << 4) | r;
@@ -149,12 +207,56 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
   }
 }
 
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
+    dst_argb += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
+    dst_abgr += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30;
+    memcpy(&ar30, src_ar30, sizeof ar30);
+    uint32_t b = ar30 & 0x3ff;
+    uint32_t ga = ar30 & 0xc00ffc00;
+    uint32_t r = (ar30 >> 20) & 0x3ff;
+    *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
+    dst_ab30 += 4;
+    src_ar30 += 4;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = b;
     dst_rgb[1] = g;
     dst_rgb[2] = r;
@@ -163,12 +265,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = r;
     dst_rgb[1] = g;
     dst_rgb[2] = b;
@@ -177,25 +279,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 2;
-    uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 2;
+    uint8_t r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
@@ -207,132 +309,235 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 // endian will not affect order of the original matrix.  But the dither4
 // will containing the first pixel in the lower byte for little endian
 // or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     int dither0 = ((const unsigned char*)(&dither4))[x & 3];
     int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
-    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
-    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
     int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 3;
-    uint8 r1 = src_argb[6] >> 3;
-    uint8 a1 = src_argb[7] >> 7;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 3;
+    uint8_t r1 = src_argb[6] >> 3;
+    uint8_t a1 = src_argb[7] >> 7;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+                            (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
   }
 }
 
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    uint8 b1 = src_argb[4] >> 4;
-    uint8 g1 = src_argb[5] >> 4;
-    uint8 r1 = src_argb[6] >> 4;
-    uint8 a1 = src_argb[7] >> 4;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    uint8_t b1 = src_argb[4] >> 4;
+    uint8_t g1 = src_argb[5] >> 4;
+    uint8_t r1 = src_argb[6] >> 4;
+    uint8_t a1 = src_argb[7] >> 4;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+                            (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+  }
+}
+
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+    uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
+    uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+    uint32_t a0 = (src_abgr[3] >> 6);
+    *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_abgr += 4;
+  }
+}
+
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
+    uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
+    uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
+    uint32_t a0 = (src_argb[3] >> 6);
+    *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_argb += 4;
   }
 }
 
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
+//  return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+//  0x7e80) >> 8;
+
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
 }
+#endif
 
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+  return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+  return (112 * r - 94 * g - 18 * b + 0x8000) >> 8;
+}
+#else
+// TODO(fbarchard): Add rounding to SIMD and use this
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 }
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 }
+#endif
 
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
-                       uint8* dst_u, uint8* dst_v, int width) {                \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-  }                                                                            \
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+  return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+  return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8;
 }
+#endif
+
+// ARGBToY_C and ARGBToUV_C
+// Intel version mimic SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+
+#define MAKEROWY(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                   \
+    for (x = 0; x < width; ++x) {                                            \
+      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                      \
+      dst_y += 1;                                                            \
+    }                                                                        \
+  }                                                                          \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                     \
+    int x;                                                                   \
+    for (x = 0; x < width - 1; x += 2) {                                     \
+      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                      \
+                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));         \
+      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                      \
+                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));         \
+      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                      \
+                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));         \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                   \
+      src_rgb1 += BPP * 2;                                                   \
+      dst_u += 1;                                                            \
+      dst_v += 1;                                                            \
+    }                                                                        \
+    if (width & 1) {                                                         \
+      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]);                           \
+      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]);                           \
+      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]);                           \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+    }                                                                        \
+  }
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
+#define MAKEROWY(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                   \
+    for (x = 0; x < width; ++x) {                                            \
+      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                      \
+      dst_y += 1;                                                            \
+    }                                                                        \
+  }                                                                          \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                     \
+    int x;                                                                   \
+    for (x = 0; x < width - 1; x += 2) {                                     \
+      uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +         \
+                     src_rgb1[B + BPP] + 1) >>                               \
+                    1;                                                       \
+      uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +         \
+                     src_rgb1[G + BPP] + 1) >>                               \
+                    1;                                                       \
+      uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +         \
+                     src_rgb1[R + BPP] + 1) >>                               \
+                    1;                                                       \
+      dst_u[0] = RGB2xToU(ar, ag, ab);                                       \
+      dst_v[0] = RGB2xToV(ar, ag, ab);                                       \
+      src_rgb0 += BPP * 2;                                                   \
+      src_rgb1 += BPP * 2;                                                   \
+      dst_u += 1;                                                            \
+      dst_v += 1;                                                            \
+    }                                                                        \
+    if (width & 1) {                                                         \
+      uint16_t ab = src_rgb0[B] + src_rgb1[B];                               \
+      uint16_t ag = src_rgb0[G] + src_rgb1[G];                               \
+      uint16_t ar = src_rgb0[R] + src_rgb1[R];                               \
+      dst_u[0] = RGB2xToU(ar, ag, ab);                                       \
+      dst_v[0] = RGB2xToV(ar, ag, ab);                                       \
+    }                                                                        \
+  }
+#endif
 
 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -350,14 +555,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
 // b 0.1016 * 255 = 25.908 = 25
 // g 0.5078 * 255 = 129.489 = 129
 // r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated)
 // b 0.11400 * 128 = 14.592 = 15
 // g 0.58700 * 128 = 75.136 = 75
 // r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
 // JPeg 8 bit U:
 // b  0.50000 * 255 = 127.5 = 127
 // g -0.33126 * 255 = -84.4713 = -84
@@ -367,86 +572,127 @@ MAKEROWY(RAW, 0, 1, 2, 3)
 // g -0.41869 * 255 = -106.76595 = -107
 // r  0.50000 * 255 = 127.5 = 127
 
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  return (38 * r + 75 * g + 15 * b + 64) >> 7;
+}
+#else
+// 8 bit
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  return (77 * r + 150 * g + 29 * b + 128) >> 8;
 }
+#endif
 
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 }
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 }
+#else
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+  return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+  return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
 
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+// ARGBToYJ_C and ARGBToUVJ_C
+// Intel version mimic SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWYJ(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                    \
+    for (x = 0; x < width; ++x) {                                             \
+      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                       \
+      dst_y += 1;                                                             \
+    }                                                                         \
+  }                                                                           \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                      \
+    int x;                                                                    \
+    for (x = 0; x < width - 1; x += 2) {                                      \
+      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                       \
+                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));          \
+      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                       \
+                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));          \
+      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                       \
+                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));          \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                    \
+      src_rgb1 += BPP * 2;                                                    \
+      dst_u += 1;                                                             \
+      dst_v += 1;                                                             \
+    }                                                                         \
+    if (width & 1) {                                                          \
+      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]);                            \
+      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]);                            \
+      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]);                            \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+    }                                                                         \
+  }
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
+#define MAKEROWYJ(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                    \
+    for (x = 0; x < width; ++x) {                                             \
+      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                       \
+      dst_y += 1;                                                             \
+    }                                                                         \
+  }                                                                           \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                      \
+    int x;                                                                    \
+    for (x = 0; x < width - 1; x += 2) {                                      \
+      uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +          \
+                     src_rgb1[B + BPP] + 1) >>                                \
+                    1;                                                        \
+      uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +          \
+                     src_rgb1[G + BPP] + 1) >>                                \
+                    1;                                                        \
+      uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +          \
+                     src_rgb1[R + BPP] + 1) >>                                \
+                    1;                                                        \
+      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                       \
+      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                       \
+      src_rgb0 += BPP * 2;                                                    \
+      src_rgb1 += BPP * 2;                                                    \
+      dst_u += 1;                                                             \
+      dst_v += 1;                                                             \
+    }                                                                         \
+    if (width & 1) {                                                          \
+      uint16_t ab = (src_rgb0[B] + src_rgb1[B]);                              \
+      uint16_t ag = (src_rgb0[G] + src_rgb1[G]);                              \
+      uint16_t ar = (src_rgb0[R] + src_rgb1[R]);                              \
+      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                       \
+      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                       \
+    }                                                                         \
+  }
 
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
-                        uint8* dst_u, uint8* dst_v, int width) {               \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-  }                                                                            \
-}
+#endif
 
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
 #undef MAKEROWYJ
 
-void ARGBToUVJ422Row_C(const uint8* src_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToUJ(ar, ag, ab);
-    dst_v[0] = RGBToVJ(ar, ag, ab);
-    src_argb += 8;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  if (width & 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToUJ(ar, ag, ab);
-    dst_v[0] = RGBToVJ(ar, ag, ab);
-  }
-}
-
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     b = (b << 3) | (b >> 2);
     g = (g << 2) | (g >> 4);
     r = (r << 3) | (r >> 2);
@@ -456,12 +702,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
   }
 }
 
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
     b = (b << 3) | (b >> 2);
     g = (g << 3) | (g >> 2);
     r = (r << 3) | (r >> 2);
@@ -471,12 +717,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
   }
 }
 
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
     b = (b << 4) | b;
     g = (g << 4) | g;
     r = (r << 4) | r;
@@ -486,224 +732,279 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
   }
 }
 
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b1 = src_rgb565[2] & 0x1f;
-    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
-    uint8 r1 = src_rgb565[3] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b3 = next_rgb565[2] & 0x1f;
-    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
-    uint8 r3 = next_rgb565[3] >> 3;
-    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 1) | (b >> 6);  // 787 -> 888.
-    r = (r << 1) | (r >> 6);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b1 = src_rgb565[2] & 0x1f;
+    uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8_t r1 = src_rgb565[3] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+    uint8_t b3 = next_rgb565[2] & 0x1f;
+    uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8_t r3 = next_rgb565[3] >> 3;
+
+    b0 = (b0 << 3) | (b0 >> 2);
+    g0 = (g0 << 2) | (g0 >> 4);
+    r0 = (r0 << 3) | (r0 >> 2);
+    b1 = (b1 << 3) | (b1 >> 2);
+    g1 = (g1 << 2) | (g1 >> 4);
+    r1 = (r1 << 3) | (r1 >> 2);
+    b2 = (b2 << 3) | (b2 >> 2);
+    g2 = (g2 << 2) | (g2 >> 4);
+    r2 = (r2 << 3) | (r2 >> 2);
+    b3 = (b3 << 3) | (b3 >> 2);
+    g3 = (g3 << 2) | (g3 >> 4);
+    r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
     src_rgb565 += 4;
     next_rgb565 += 4;
     dst_u += 1;
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b = (b0 + b2);  // 565 * 2 = 676.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 2) | (b >> 4);  // 676 -> 888
-    g = (g << 1) | (g >> 6);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-  }
-}
-
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b1 = src_argb1555[2] & 0x1f;
-    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
-    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
-    uint8 b3 = next_argb1555[2] & 0x1f;
-    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
-    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
-    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 1) | (b >> 6);  // 777 -> 888.
-    g = (g << 1) | (g >> 6);
-    r = (r << 1) | (r >> 6);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-    src_argb1555 += 4;
-    next_argb1555 += 4;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  if (width & 1) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = next_argb1555[1] >> 3;
-    uint8 b = (b0 + b2);  // 555 * 2 = 666.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 2) | (b >> 4);  // 666 -> 888.
-    g = (g << 2) | (g >> 4);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+
+    b0 = (b0 << 3) | (b0 >> 2);
+    g0 = (g0 << 2) | (g0 >> 4);
+    r0 = (r0 << 3) | (r0 >> 2);
+    b2 = (b2 << 3) | (b2 >> 2);
+    g2 = (g2 << 2) | (g2 >> 4);
+    r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(b0, b2);
+    uint8_t ag = AVGB(g0, g2);
+    uint8_t ar = AVGB(r0, r2);
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = b0 + b2;
+    uint16_t g = g0 + g2;
+    uint16_t r = r0 + r2;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
   }
 }
 
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b1 = src_argb4444[2] & 0x0f;
-    uint8 g1 = src_argb4444[2] >> 4;
-    uint8 r1 = src_argb4444[3] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b3 = next_argb4444[2] & 0x0f;
-    uint8 g3 = next_argb4444[2] >> 4;
-    uint8 r3 = next_argb4444[3] & 0x0f;
-    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
-    b = (b << 2) | (b >> 4);  // 666 -> 888.
-    g = (g << 2) | (g >> 4);
-    r = (r << 2) | (r >> 4);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-    src_argb4444 += 4;
-    next_argb4444 += 4;
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b1 = src_argb1555[2] & 0x1f;
+    uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8_t b3 = next_argb1555[2] & 0x1f;
+    uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
+
+    b0 = (b0 << 3) | (b0 >> 2);
+    g0 = (g0 << 3) | (g0 >> 2);
+    r0 = (r0 << 3) | (r0 >> 2);
+    b1 = (b1 << 3) | (b1 >> 2);
+    g1 = (g1 << 3) | (g1 >> 2);
+    r1 = (r1 << 3) | (r1 >> 2);
+    b2 = (b2 << 3) | (b2 >> 2);
+    g2 = (g2 << 3) | (g2 >> 2);
+    r2 = (r2 << 3) | (r2 >> 2);
+    b3 = (b3 << 3) | (b3 >> 2);
+    g3 = (g3 << 3) | (g3 >> 2);
+    r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
+    src_argb1555 += 4;
+    next_argb1555 += 4;
     dst_u += 1;
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b = (b0 + b2);  // 444 * 2 = 555.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
-    b = (b << 3) | (b >> 2);  // 555 -> 888.
-    g = (g << 3) | (g >> 2);
-    r = (r << 3) | (r >> 2);
-    dst_u[0] = RGBToU(r, g, b);
-    dst_v[0] = RGBToV(r, g, b);
-  }
-}
-
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = next_argb1555[1] >> 3;
+
+    b0 = (b0 << 3) | (b0 >> 2);
+    g0 = (g0 << 3) | (g0 >> 2);
+    r0 = (r0 << 3) | (r0 >> 2);
+    b2 = (b2 << 3) | (b2 >> 2);
+    g2 = (g2 << 3) | (g2 >> 2);
+    r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(b0, b2);
+    uint8_t ag = AVGB(g0, g2);
+    uint8_t ar = AVGB(r0, r2);
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 4;
-    dst_u += 1;
-    dst_v += 1;
+#else
+    uint16_t b = b0 + b2;
+    uint16_t g = g0 + g2;
+    uint16_t r = r0 + r2;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
   }
 }
 
-void ARGBToUV422Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b1 = src_argb4444[2] & 0x0f;
+    uint8_t g1 = src_argb4444[2] >> 4;
+    uint8_t r1 = src_argb4444[3] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+    uint8_t b3 = next_argb4444[2] & 0x0f;
+    uint8_t g3 = next_argb4444[2] >> 4;
+    uint8_t r3 = next_argb4444[3] & 0x0f;
+
+    b0 = (b0 << 4) | b0;
+    g0 = (g0 << 4) | g0;
+    r0 = (r0 << 4) | r0;
+    b1 = (b1 << 4) | b1;
+    g1 = (g1 << 4) | g1;
+    r1 = (r1 << 4) | r1;
+    b2 = (b2 << 4) | b2;
+    g2 = (g2 << 4) | g2;
+    r2 = (r2 << 4) | r2;
+    b3 = (b3 << 4) | b3;
+    g3 = (g3 << 4) | g3;
+    r3 = (r3 << 4) | r3;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 8;
+#else
+    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
+    src_argb4444 += 4;
+    next_argb4444 += 4;
     dst_u += 1;
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+
+    b0 = (b0 << 4) | b0;
+    g0 = (g0 << 4) | g0;
+    r0 = (r0 << 4) | r0;
+    b2 = (b2 << 4) | b2;
+    g2 = (g2 << 4) | g2;
+    r2 = (r2 << 4) | r2;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    uint8_t ab = AVGB(b0, b2);
+    uint8_t ag = AVGB(g0, g2);
+    uint8_t ar = AVGB(r0, r2);
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
+#else
+    uint16_t b = b0 + b2;
+    uint16_t g = g0 + g2;
+    uint16_t r = r0 + r2;
+    dst_u[0] = RGB2xToU(r, g, b);
+    dst_v[0] = RGB2xToV(r, g, b);
+#endif
   }
 }
 
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   int x;
-  for (x = 0; x < width - 3; x += 4) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+  for (x = 0; x < width; ++x) {
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 16;
+    src_argb += 4;
     dst_u += 1;
     dst_v += 1;
   }
-  if ((width & 3) == 3) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  }
 }
 
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = src_argb[3];
     dst_argb += 4;
@@ -712,7 +1013,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 }
 
 // Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -731,22 +1032,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 
 // Apply color matrix to a row of image. Matrix is signed.
 // TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = src_argb[0];
     int g = src_argb[1];
     int r = src_argb[2];
     int a = src_argb[3];
-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+              a * matrix_argb[3]) >>
+             6;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+              a * matrix_argb[7]) >>
+             6;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+              a * matrix_argb[11]) >>
+             6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+              a * matrix_argb[15]) >>
+             6;
     dst_argb[0] = Clamp(sb);
     dst_argb[1] = Clamp(sg);
     dst_argb[2] = Clamp(sr);
@@ -757,7 +1064,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
 }
 
 // Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -773,7 +1082,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 }
 
 // Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -786,8 +1097,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
   }
 }
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width) {
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -801,21 +1115,23 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
 }
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
+#define SHADE(f, v) v* f >> 24
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value) {
+  const uint32_t b_scale = REPEAT8(value & 0xff);
+  const uint32_t g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32_t r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32_t a_scale = REPEAT8(value >> 24);
 
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
+    const uint32_t b = REPEAT8(src_argb[0]);
+    const uint32_t g = REPEAT8(src_argb[1]);
+    const uint32_t r = REPEAT8(src_argb[2]);
+    const uint32_t a = REPEAT8(src_argb[3]);
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -828,20 +1144,22 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 #undef SHADE
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
+#define SHADE(f, v) v* f >> 16
 
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb0[0]);
-    const uint32 g = REPEAT8(src_argb0[1]);
-    const uint32 r = REPEAT8(src_argb0[2]);
-    const uint32 a = REPEAT8(src_argb0[3]);
-    const uint32 b_scale = src_argb1[0];
-    const uint32 g_scale = src_argb1[1];
-    const uint32 r_scale = src_argb1[2];
-    const uint32 a_scale = src_argb1[3];
+    const uint32_t b = REPEAT8(src_argb0[0]);
+    const uint32_t g = REPEAT8(src_argb0[1]);
+    const uint32_t r = REPEAT8(src_argb0[2]);
+    const uint32_t a = REPEAT8(src_argb0[3]);
+    const uint32_t b_scale = src_argb1[0];
+    const uint32_t g_scale = src_argb1[1];
+    const uint32_t r_scale = src_argb1[2];
+    const uint32_t a_scale = src_argb1[3];
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -856,8 +1174,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 
 #define SHADE(f, v) clamp255(v + f)
 
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                  uint8* dst_argb, int width) {
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -881,8 +1201,10 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 
 #define SHADE(f, v) clamp0(f - v)
 
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -905,8 +1227,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
 #undef SHADE
 
 // Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width) {
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i];
@@ -919,12 +1244,14 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobelx[i] = (uint8)(clamp255(sobel));
+    dst_sobelx[i] = (uint8_t)(clamp255(sobel));
   }
 }
 
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width) {
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i + 0];
@@ -937,56 +1264,62 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobely[i] = (uint8)(clamp255(sobel));
+    dst_sobely[i] = (uint8_t)(clamp255(sobel));
   }
 }
 
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width) {
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_argb[0] = (uint8)(s);
-    dst_argb[1] = (uint8)(s);
-    dst_argb[2] = (uint8)(s);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(s);
+    dst_argb[1] = (uint8_t)(s);
+    dst_argb[2] = (uint8_t)(s);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width) {
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_y[i] = (uint8)(s);
+    dst_y[i] = (uint8_t)(s);
   }
 }
 
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width) {
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int g = clamp255(r + b);
-    dst_argb[0] = (uint8)(b);
-    dst_argb[1] = (uint8)(g);
-    dst_argb[2] = (uint8)(r);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(b);
+    dst_argb[1] = (uint8_t)(g);
+    dst_argb[2] = (uint8_t)(r);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   // Copy a Y to RGB.
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = src_y[0];
+    uint8_t y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
     dst_argb += 4;
@@ -994,20 +1327,22 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   }
 }
 
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
 // BT.601 YUV to RGB reference
 //  R = (Y - 16) * 1.164              - V * -1.596
 //  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
 //  B = (Y - 16) * 1.164 - U * -2.018
 
 // Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
+#define UG 25   /* round(0.391 * 64) */
+#define VG 52   /* round(0.813 * 64) */
 #define VR -102 /* round(-1.596 * 64) */
 
 // Bias values to subtract 16 from Y and 128 from U and V.
@@ -1015,32 +1350,70 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 #define BG (UG * 128 + VG * 128 + YGB)
 #define BR (VR * 128 + YGB)
 
-// C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
-                              uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
-  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
-  *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
-}
-
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(y1 + YGB) >> 6);
-  *g = Clamp((int32)(y1 + YGB) >> 6);
-  *r = Clamp((int32)(y1 + YGB) >> 6);
-}
+#if defined(__aarch64__)  // 64 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)  // 32 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+#endif
 
-#undef YG
+#undef BB
+#undef BG
+#undef BR
 #undef YGB
 #undef UB
 #undef UG
 #undef VG
 #undef VR
-#undef BB
-#undef BG
-#undef BR
+#undef YG
 
 // JPEG YUV to RGB reference
 // *  R = Y                - V * -1.40200
@@ -1048,56 +1421,448 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
 // *  B = Y - U * -1.77200
 
 // Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGBJ 32  /* 64 / 2 */
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32   /* 64 / 2 */
 
 // U and V contributions to R,G,B.
-#define UBJ -113 /* round(-1.77200 * 64) */
-#define UGJ 22 /* round(0.34414 * 64) */
-#define VGJ 46 /* round(0.71414  * 64) */
-#define VRJ -90 /* round(-1.40200 * 64) */
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22   /* round(0.34414 * 64) */
+#define VG 46   /* round(0.71414  * 64) */
+#define VR -90  /* round(-1.40200 * 64) */
 
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BBJ (UBJ * 128 + YGBJ)
-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
-#define BRJ (VRJ * 128 + YGBJ)
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.793
+//  G = (Y - 16) * 1.164 - U *  0.213 - V *  0.533
+//  B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14   /* round(0.213 * 64) */
+#define VG 34   /* round(0.533  * 64) */
+#define VR -115 /* round(-1.793 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.2020 YUV to RGB reference
+//  R = (Y - 16) * 1.164384                - V * -1.67867
+//  G = (Y - 16) * 1.164384 - U * 0.187326 - V *  0.65042
+//  B = (Y - 16) * 1.164384 - U * -2.14177
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 19003  /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.142 * 64)) */
+#define UG 12   /* round(0.187326 * 64) */
+#define VG 42   /* round(0.65042 * 64) */
+#define VR -107 /* round(-1.67867 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, YGB, 0, 0, 0, 0},
+    {0x0101 * YG, YG, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+    {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+     YGB}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel(uint8_t y,
+                              uint8_t u,
+                              uint8_t v,
+                              uint8_t* b,
+                              uint8_t* g,
+                              uint8_t* r,
+                              const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
+  *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
+  *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+}
+
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel8_16(uint8_t y,
+                                  uint8_t u,
+                                  uint8_t v,
+                                  int* b,
+                                  int* g,
+                                  int* r,
+                                  const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 10 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16(int16_t y,
+                                int16_t u,
+                                int16_t v,
+                                int* b,
+                                int* g,
+                                int* r,
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[1];
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 10 bit assembly.
+// Reads 10 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel10(uint16_t y,
+                                uint16_t u,
+                                uint16_t v,
+                                uint8_t* b,
+                                uint8_t* g,
+                                uint8_t* r,
+                                const struct YuvConstants* yuvconstants) {
+  int b16;
+  int g16;
+  int r16;
+  YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  *b = Clamp(b16 >> 6);
+  *g = Clamp(g16 >> 6);
+  *r = Clamp(r16 >> 6);
+}
 
 // C reference code that mimics the YUV assembly.
-static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
-                               uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
-  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
-  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
-  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
-}
-
-#undef YGJ
-#undef YGBJ
-#undef UBJ
-#undef UGJ
-#undef VGJ
-#undef VRJ
-#undef BBJ
-#undef BGJ
-#undef BRJ
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YPixel(uint8_t y,
+                            uint8_t* b,
+                            uint8_t* g,
+                            uint8_t* r,
+                            const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+  int ygb = yuvconstants->kUVBiasBGR[3];
+  int yg = yuvconstants->kYToRgb[1];
+#else
+  int ygb = yuvconstants->kYBiasToRgb[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = Clamp(((int32_t)(y1) + ygb) >> 6);
+  *g = Clamp(((int32_t)(y1) + ygb) >> 6);
+  *r = Clamp(((int32_t)(y1) + ygb) >> 6);
+}
 
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
 // C mimic assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+             yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+             yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 2;
@@ -1105,20 +1870,22 @@ void I444ToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
   }
 }
 #else
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
     src_y += 1;
     src_u += 1;
@@ -1129,18 +1896,19 @@ void I444ToARGBRow_C(const uint8* src_y,
 #endif
 
 // Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 1;
@@ -1148,24 +1916,26 @@ void I422ToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void J422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+// 10 bit YUV to ARGB
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvJPixel(src_y[0], src_u[0], src_v[0],
-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvJPixel(src_y[1], src_u[0], src_v[0],
-              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 1;
@@ -1173,382 +1943,427 @@ void J422ToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvJPixel(src_y[0], src_u[0], src_v[0],
-              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* rgb_buf,
-                      int width) {
+static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
+  uint32_t ar30;
+  b = b >> 4;  // convert 10.6 to 10 bit.
+  g = g >> 4;
+  r = r >> 4;
+  b = Clamp10(b);
+  g = Clamp10(g);
+  r = Clamp10(r);
+  ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
+  (*(uint32_t*)rgb_buf) = ar30;
+}
+
+// 10 bit YUV to 10 bit AR30
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
   int x;
+  int b;
+  int g;
+  int r;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
     src_y += 2;
     src_u += 1;
     src_v += 1;
-    rgb_buf += 6;  // Advance 2 pixels.
+    rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
   }
 }
 
-void I422ToRAWRow_C(const uint8* src_y,
-                    const uint8* src_u,
-                    const uint8* src_v,
-                    uint8* rgb_buf,
-                    int width) {
+// 8 bit YUV to 10 bit AR30
+// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = src_a[0];
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = src_a[1];
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    src_a += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = src_a[0];
+  }
+}
+
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
     src_u += 1;
     src_v += 1;
     rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
     b0 = b0 >> 4;
     g0 = g0 >> 4;
     r0 = r0 >> 4;
     b1 = b1 >> 4;
     g1 = g1 >> 4;
     r1 = r1 >> 4;
-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+                                 (g1 << 20) | (r1 << 24) | 0xf000f000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
     dst_argb4444 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
     b0 = b0 >> 4;
     g0 = g0 >> 4;
     r0 = r0 >> 4;
-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        0xf000;
+    *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
   }
 }
 
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb1555,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 3;
     r0 = r0 >> 3;
     b1 = b1 >> 3;
     g1 = g1 >> 3;
     r1 = r1 >> 3;
-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+                                 (g1 << 21) | (r1 << 26) | 0x80008000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
     dst_argb1555 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 3;
     r0 = r0 >> 3;
-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        0x8000;
+    *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
   }
 }
 
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_u += 1;
     src_v += 1;
     dst_rgb565 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
-  for (x = 0; x < width - 3; x += 4) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
-    rgb_buf[7] = 255;
-    YuvPixel(src_y[2], src_u[0], src_v[0],
-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
-    rgb_buf[11] = 255;
-    YuvPixel(src_y[3], src_u[0], src_v[0],
-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
-    rgb_buf[15] = 255;
-    src_y += 4;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
+    src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* rgb_buf,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_uv[0], src_uv[1],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
-    src_uv += 2;
+    src_vu += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_vu,
-                     uint8* rgb_buf,
-                     int width) {
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    rgb_buf[3] = 255;
-
-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
-    rgb_buf[7] = 255;
-
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
-    src_vu += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
+    src_uv += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    rgb_buf[3] = 255;
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_rgb565,
-                       int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
-    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    b1 = b1 >> 3;
-    g1 = g1 >> 2;
-    r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
-    src_uv += 2;
-    dst_rgb565 += 4;  // Advance 2 pixels.
+    src_vu += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
-    b0 = b0 >> 3;
-    g0 = g0 >> 2;
-    r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void NV21ToRGB565Row_C(const uint8* src_y,
-                       const uint8* vsrc_u,
-                       uint8* dst_rgb565,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
-    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
-    vsrc_u += 2;
+    src_uv += 2;
     dst_rgb565 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* rgb_buf,
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_yuy2 += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* rgb_buf,
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_uyvy += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422ToBGRARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+             rgb_buf + 7, yuvconstants);
     rgb_buf[4] = 255;
     src_y += 2;
     src_u += 1;
@@ -1556,79 +2371,32 @@ void I422ToBGRARow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
   }
 }
 
-void I422ToABGRRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
-    rgb_buf[3] = 255;
-  }
-}
-
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I400ToARGBRow_C(const uint8_t* src_y,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
-    rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
-    rgb_buf[4] = 255;
-    src_y += 2;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
-    rgb_buf[0] = 255;
-  }
-}
-
-void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
   src += width - 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1641,7 +2409,21 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  int x;
+  src_uv += (width - 1) << 1;
+  for (x = 0; x < width; ++x) {
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    src_uv -= 2;
+    dst_uv += 2;
+  }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
   int x;
   src_uv += (width - 1) << 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1657,10 +2439,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
-  const uint32* src32 = (const uint32*)(src);
-  uint32* dst32 = (uint32*)(dst);
+  const uint32_t* src32 = (const uint32_t*)(src);
+  uint32_t* dst32 = (uint32_t*)(dst);
   src32 += width - 1;
   for (x = 0; x < width - 1; x += 2) {
     dst32[x] = src32[0];
@@ -1672,7 +2454,25 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+  int x;
+  src_rgb24 += width * 3 - 3;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_rgb24[0];
+    uint8_t g = src_rgb24[1];
+    uint8_t r = src_rgb24[2];
+    dst_rgb24[0] = b;
+    dst_rgb24[1] = g;
+    dst_rgb24[2] = r;
+    src_rgb24 -= 3;
+    dst_rgb24 += 3;
+  }
+}
+
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_u[x] = src_uv[0];
@@ -1687,7 +2487,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
                   int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1703,29 +2505,121 @@ void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   }
 }
 
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_r[x] = src_rgb[0];
+    dst_g[x] = src_rgb[1];
+    dst_b[x] = src_rgb[2];
+    src_rgb += 3;
+  }
+}
+
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_rgb[0] = src_r[x];
+    dst_rgb[1] = src_g[x];
+    dst_rgb[2] = src_b[x];
+    dst_rgb += 3;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
+  }
+}
+
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_y[x] * scale;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  scale *= 0x0101;  // replicates the byte.
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = (src_y[x] * scale) >> 16;
+  }
+}
+
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
   memcpy(dst, src, count);
 }
 
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
   memcpy(dst, src, count * 2);
 }
 
-void SetRow_C(uint8* dst, uint8 v8, int width) {
+void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
   memset(dst, v8, width);
 }
 
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
-  uint32* d = (uint32*)(dst_argb);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    d[x] = v32;
+    memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
   }
 }
 
 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values, filtering 2 rows of YUY2.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1738,8 +2632,10 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
 }
 
 // Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1752,7 +2648,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2,
 }
 
 // Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1766,8 +2662,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
 }
 
 // Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1780,8 +2679,10 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
 }
 
 // Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1794,7 +2695,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy,
 }
 
 // Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1807,22 +2708,24 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
   }
 }
 
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
 
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
 // This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1845,13 +2748,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
   }
 
   if (width & 1) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1859,17 +2762,43 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
   }
 }
 #undef BLEND
+
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+    src0 += 2;
+    src1 += 2;
+    alpha += 2;
+    dst += 2;
+  }
+  if (width & 1) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+  }
+}
+#undef UBLEND
+
+#if defined(__aarch64__) || defined(__arm__)
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#else
+// This code mimics the SSSE3 version for better testability.
 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#endif
 
 // Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    uint32 a = src_argb[3];
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -1887,10 +2816,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 
   if (width & 1) {
-    const uint32 b = src_argb[0];
-    const uint32 g = src_argb[1];
-    const uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
+    const uint32_t b = src_argb[0];
+    const uint32_t g = src_argb[1];
+    const uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -1906,49 +2835,56 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // Reciprocal method is off by 1 on some values. ie 125
 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
 #define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+const uint32_t fixed_invtbl8[256] = {
+    0x01000000, 0x0100ffff, T(0x02), T(0x03),   T(0x04), T(0x05), T(0x06),
+    T(0x07),    T(0x08),    T(0x09), T(0x0a),   T(0x0b), T(0x0c), T(0x0d),
+    T(0x0e),    T(0x0f),    T(0x10), T(0x11),   T(0x12), T(0x13), T(0x14),
+    T(0x15),    T(0x16),    T(0x17), T(0x18),   T(0x19), T(0x1a), T(0x1b),
+    T(0x1c),    T(0x1d),    T(0x1e), T(0x1f),   T(0x20), T(0x21), T(0x22),
+    T(0x23),    T(0x24),    T(0x25), T(0x26),   T(0x27), T(0x28), T(0x29),
+    T(0x2a),    T(0x2b),    T(0x2c), T(0x2d),   T(0x2e), T(0x2f), T(0x30),
+    T(0x31),    T(0x32),    T(0x33), T(0x34),   T(0x35), T(0x36), T(0x37),
+    T(0x38),    T(0x39),    T(0x3a), T(0x3b),   T(0x3c), T(0x3d), T(0x3e),
+    T(0x3f),    T(0x40),    T(0x41), T(0x42),   T(0x43), T(0x44), T(0x45),
+    T(0x46),    T(0x47),    T(0x48), T(0x49),   T(0x4a), T(0x4b), T(0x4c),
+    T(0x4d),    T(0x4e),    T(0x4f), T(0x50),   T(0x51), T(0x52), T(0x53),
+    T(0x54),    T(0x55),    T(0x56), T(0x57),   T(0x58), T(0x59), T(0x5a),
+    T(0x5b),    T(0x5c),    T(0x5d), T(0x5e),   T(0x5f), T(0x60), T(0x61),
+    T(0x62),    T(0x63),    T(0x64), T(0x65),   T(0x66), T(0x67), T(0x68),
+    T(0x69),    T(0x6a),    T(0x6b), T(0x6c),   T(0x6d), T(0x6e), T(0x6f),
+    T(0x70),    T(0x71),    T(0x72), T(0x73),   T(0x74), T(0x75), T(0x76),
+    T(0x77),    T(0x78),    T(0x79), T(0x7a),   T(0x7b), T(0x7c), T(0x7d),
+    T(0x7e),    T(0x7f),    T(0x80), T(0x81),   T(0x82), T(0x83), T(0x84),
+    T(0x85),    T(0x86),    T(0x87), T(0x88),   T(0x89), T(0x8a), T(0x8b),
+    T(0x8c),    T(0x8d),    T(0x8e), T(0x8f),   T(0x90), T(0x91), T(0x92),
+    T(0x93),    T(0x94),    T(0x95), T(0x96),   T(0x97), T(0x98), T(0x99),
+    T(0x9a),    T(0x9b),    T(0x9c), T(0x9d),   T(0x9e), T(0x9f), T(0xa0),
+    T(0xa1),    T(0xa2),    T(0xa3), T(0xa4),   T(0xa5), T(0xa6), T(0xa7),
+    T(0xa8),    T(0xa9),    T(0xaa), T(0xab),   T(0xac), T(0xad), T(0xae),
+    T(0xaf),    T(0xb0),    T(0xb1), T(0xb2),   T(0xb3), T(0xb4), T(0xb5),
+    T(0xb6),    T(0xb7),    T(0xb8), T(0xb9),   T(0xba), T(0xbb), T(0xbc),
+    T(0xbd),    T(0xbe),    T(0xbf), T(0xc0),   T(0xc1), T(0xc2), T(0xc3),
+    T(0xc4),    T(0xc5),    T(0xc6), T(0xc7),   T(0xc8), T(0xc9), T(0xca),
+    T(0xcb),    T(0xcc),    T(0xcd), T(0xce),   T(0xcf), T(0xd0), T(0xd1),
+    T(0xd2),    T(0xd3),    T(0xd4), T(0xd5),   T(0xd6), T(0xd7), T(0xd8),
+    T(0xd9),    T(0xda),    T(0xdb), T(0xdc),   T(0xdd), T(0xde), T(0xdf),
+    T(0xe0),    T(0xe1),    T(0xe2), T(0xe3),   T(0xe4), T(0xe5), T(0xe6),
+    T(0xe7),    T(0xe8),    T(0xe9), T(0xea),   T(0xeb), T(0xec), T(0xed),
+    T(0xee),    T(0xef),    T(0xf0), T(0xf1),   T(0xf2), T(0xf3), T(0xf4),
+    T(0xf5),    T(0xf6),    T(0xf7), T(0xf8),   T(0xf9), T(0xfa), T(0xfb),
+    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T
 
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
+    const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
     b = (b * ia) >> 8;
     g = (g * ia) >> 8;
     r = (r * ia) >> 8;
@@ -1962,31 +2898,37 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 }
 
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width) {
-  int32 row_sum[4] = {0, 0, 0, 0};
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width) {
+  int32_t row_sum[4] = {0, 0, 0, 0};
   int x;
   for (x = 0; x < width; ++x) {
     row_sum[0] += row[x * 4 + 0];
     row_sum[1] += row[x * 4 + 1];
     row_sum[2] += row[x * 4 + 2];
     row_sum[3] += row[x * 4 + 3];
-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
   }
 }
 
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
-                                int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count) {
   float ooa = 1.0f / area;
   int i;
   for (i = 0; i < count; ++i) {
-    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
-    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
-    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
-    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+    dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
     dst += 4;
     tl += 4;
     bl += 4;
@@ -1995,8 +2937,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
 
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width) {
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width) {
   int i;
   // Render a row of pixels from source into a buffer.
   float uv[2];
@@ -2005,9 +2950,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
   for (i = 0; i < width; ++i) {
     int x = (int)(uv[0]);
     int y = (int)(uv[1]);
-    *(uint32*)(dst_argb) =
-        *(const uint32*)(src_argb + y * src_argb_stride +
-                                         x * 4);
+    *(uint32_t*)(dst_argb) =
+        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
     uv[1] += uv_dudv[3];
@@ -2015,63 +2959,74 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
 }
 
 // Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-                      uint8* dst_uv, int pix) {
+static void HalfRow_C(const uint8_t* src_uv,
+                      ptrdiff_t src_uv_stride,
+                      uint8_t* dst_uv,
+                      int width) {
   int x;
-  for (x = 0; x < pix; ++x) {
+  for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
   }
 }
 
-static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
-                         uint16* dst_uv, int pix) {
+static void HalfRow_16_C(const uint16_t* src_uv,
+                         ptrdiff_t src_uv_stride,
+                         uint16_t* dst_uv,
+                         int width) {
   int x;
-  for (x = 0; x < pix; ++x) {
+  for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
   }
 }
 
 // C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
-                      int width, int source_y_fraction) {
+                      int width,
+                      int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
   int x;
-  if (source_y_fraction == 0) {
+  if (y1_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width);
     return;
   }
-  if (source_y_fraction == 128) {
-    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+  if (y1_fraction == 128) {
+    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
     return;
   }
   for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+    dst_ptr[1] =
+        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
     src_ptr += 2;
     src_ptr1 += 2;
     dst_ptr += 2;
   }
   if (width & 1) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
   }
 }
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
-                         int width, int source_y_fraction) {
+                         int width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint16* src_ptr1 = src_ptr + src_stride;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
   int x;
   if (source_y_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width * 2);
     return;
   }
   if (source_y_fraction == 128) {
-    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
     return;
   }
   for (x = 0; x < width - 1; x += 2) {
@@ -2087,20 +3042,22 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int pix) {
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width) {
   int index0 = shuffler[0];
   int index1 = shuffler[1];
   int index2 = shuffler[2];
   int index3 = shuffler[3];
   // Shuffle a row of ARGB.
   int x;
-  for (x = 0; x < pix; ++x) {
+  for (x = 0; x < width; ++x) {
     // To support in-place conversion.
-    uint8 b = src_argb[index0];
-    uint8 g = src_argb[index1];
-    uint8 r = src_argb[index2];
-    uint8 a = src_argb[index3];
+    uint8_t b = src_argb[index0];
+    uint8_t g = src_argb[index1];
+    uint8_t r = src_argb[index2];
+    uint8_t a = src_argb[index3];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -2110,10 +3067,11 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
   }
 }
 
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_y[0];
@@ -2133,10 +3091,11 @@ void I422ToYUY2Row_C(const uint8* src_y,
   }
 }
 
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_u[0];
@@ -2156,21 +3115,180 @@ void I422ToUYVYRow_C(const uint8* src_y,
   }
 }
 
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const float* poly,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float b = (float)(src_argb[0]);
+    float g = (float)(src_argb[1]);
+    float r = (float)(src_argb[2]);
+    float a = (float)(src_argb[3]);
+    float b2 = b * b;
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    float b3 = b2 * b;
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[8] * b2;
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    db += poly[12] * b3;
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+
+    dst_argb[0] = Clamp((int32_t)(db));
+    dst_argb[1] = Clamp((int32_t)(dg));
+    dst_argb[2] = Clamp((int32_t)(dr));
+    dst_argb[3] = Clamp((int32_t)(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
+// adjust the source integer range to the half float range desired.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+// Work around GCC 7 punning warning -Wstrict-aliasing
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+void HalfFloatRow_C(const uint16_t* src,
+                    uint16_t* dst,
+                    float scale,
+                    int width) {
+  int i;
+  float mult = 1.9259299444e-34f * scale;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * mult;
+    dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13);
+  }
+}
+
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * scale;
+    dst[i] = value;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff) {
+  uint32_t bc = lumacoeff & 0xff;
+  uint32_t gc = (lumacoeff >> 8) & 0xff;
+  uint32_t rc = (lumacoeff >> 16) & 0xff;
+
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    // Luminance in rows, color values in columns.
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
+    const uint8_t* luma1;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    luma1 =
+        ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+        luma;
+    dst_argb[4] = luma1[src_argb[4]];
+    dst_argb[5] = luma1[src_argb[5]];
+    dst_argb[6] = luma1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    dst += 8;
+    src += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst_a[0] = src_argb[3];
+    dst_a[1] = src_argb[7];
+    dst_a += 2;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    dst_a[0] = src_argb[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    dst += 8;
+    src += 2;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
 // Maximum temporary width for wrappers to process at a time, in pixels.
 #define MAXTWIDTH 2048
 
-#if !(defined(_MSC_VER) && !defined(__clang__)) && \
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
     defined(HAS_I422TORGB565ROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_rgb565,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
                            int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
     src_y += twidth;
     src_u += twidth / 2;
@@ -2182,16 +3300,17 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb1555,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
     ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
     src_y += twidth;
     src_u += twidth / 2;
@@ -2203,16 +3322,17 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb4444,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
     ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
     src_y += twidth;
     src_u += twidth / 2;
@@ -2224,13 +3344,16 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
-                           uint8* dst_rgb565, int width) {
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
     src_y += twidth;
     src_uv += twidth;
@@ -2240,70 +3363,110 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
 }
 #endif
 
-#if defined(HAS_NV21TORGB565ROW_SSSE3)
-void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
-                           uint8* dst_rgb565, int width) {
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
-    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
     src_y += twidth;
     src_vu += twidth;
-    dst_rgb565 += twidth * 2;
+    dst_rgb24 += twidth * 3;
     width -= twidth;
   }
 }
 #endif
 
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
-  // Row buffers for intermediate YUV pixels.
-  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
-  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
-  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
-    YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
-    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
-    src_yuy2 += twidth * 2;
-    dst_argb += twidth * 4;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
     width -= twidth;
   }
 }
 #endif
 
-#if defined(HAS_UYVYTOARGBROW_SSSE3)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
-  // Row buffers for intermediate YUV pixels.
-  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
-  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
-  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
-    UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
-    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
-    src_uyvy += twidth * 2;
-    dst_argb += twidth * 4;
+    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb24 += twidth * 3;
     width -= twidth;
   }
 }
-#endif  // !defined(LIBYUV_DISABLE_X86)
+#endif
 
 #if defined(HAS_I422TORGB565ROW_AVX2)
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
                           int width) {
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2314,17 +3477,22 @@ void I422ToRGB565Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_AVX2)
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
     ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2335,17 +3503,22 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_AVX2)
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
     ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2356,18 +3529,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_rgb24,
-                            int width) {
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
-    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
     ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2377,196 +3554,292 @@ void I422ToRGB24Row_AVX2(const uint8* src_y,
 }
 #endif
 
-#if defined(HAS_I422TORAWROW_AVX2)
-void I422ToRAWRow_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_raw,
-                            int width) {
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
-    // TODO(fbarchard): ARGBToRAWRow_AVX2
-    ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
     src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_raw += twidth * 3;
+    src_uv += twidth;
+    dst_rgb565 += twidth * 2;
     width -= twidth;
   }
 }
 #endif
 
-#if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
-                          uint8* dst_rgb565, int width) {
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
-    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-    src_y += twidth;
-    src_uv += twidth;
-    dst_rgb565 += twidth * 2;
+    RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+    ARGBToYJRow_AVX2(row, dst_yj, twidth);
+    src_rgb24 += twidth * 3;
+    dst_yj += twidth;
     width -= twidth;
   }
 }
-#endif
+#endif  // HAS_RGB24TOYJROW_AVX2
 
-#if defined(HAS_NV21TORGB565ROW_AVX2)
-void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
-                          uint8* dst_rgb565, int width) {
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
-    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-    src_y += twidth;
-    src_vu += twidth;
-    dst_rgb565 += twidth * 2;
+    RAWToARGBRow_SSSE3(src_raw, row, twidth);
+    ARGBToYJRow_AVX2(row, dst_yj, twidth);
+    src_raw += twidth * 3;
+    dst_yj += twidth;
     width -= twidth;
   }
 }
-#endif
+#endif  // HAS_RAWTOYJROW_AVX2
 
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
-  // Row buffers for intermediate YUV pixels.
-  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
-  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
-  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
-    YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
-    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
-    src_yuy2 += twidth * 2;
-    dst_argb += twidth * 4;
+    RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+    ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+    src_rgb24 += twidth * 3;
+    dst_yj += twidth;
     width -= twidth;
   }
 }
-#endif
+#endif  // HAS_RGB24TOYJROW_SSSE3
 
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
-  // Row buffers for intermediate YUV pixels.
-  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
-  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
-  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
-    UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
-    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
-    src_uyvy += twidth * 2;
-    dst_argb += twidth * 4;
+    RAWToARGBRow_SSSE3(src_raw, row, twidth);
+    ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+    src_raw += twidth * 3;
+    dst_yj += twidth;
     width -= twidth;
   }
 }
-#endif  // !defined(LIBYUV_DISABLE_X86)
+#endif  // HAS_RAWTOYJROW_SSSE3
 
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb, const float* poly,
-                         int width) {
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+  float fsum = 0.f;
   int i;
   for (i = 0; i < width; ++i) {
-    float b = (float)(src_argb[0]);
-    float g = (float)(src_argb[1]);
-    float r = (float)(src_argb[2]);
-    float a = (float)(src_argb[3]);
-    float b2 = b * b;
-    float g2 = g * g;
-    float r2 = r * r;
-    float a2 = a * a;
-    float db = poly[0] + poly[4] * b;
-    float dg = poly[1] + poly[5] * g;
-    float dr = poly[2] + poly[6] * r;
-    float da = poly[3] + poly[7] * a;
-    float b3 = b2 * b;
-    float g3 = g2 * g;
-    float r3 = r2 * r;
-    float a3 = a2 * a;
-    db += poly[8] * b2;
-    dg += poly[9] * g2;
-    dr += poly[10] * r2;
-    da += poly[11] * a2;
-    db += poly[12] * b3;
-    dg += poly[13] * g3;
-    dr += poly[14] * r3;
-    da += poly[15] * a3;
+    float v = *src++;
+    fsum += v * v;
+    *dst++ = v * scale;
+  }
+  return fsum;
+}
 
-    dst_argb[0] = Clamp((int32)(db));
-    dst_argb[1] = Clamp((int32)(dg));
-    dst_argb[2] = Clamp((int32)(dr));
-    dst_argb[3] = Clamp((int32)(da));
-    src_argb += 4;
-    dst_argb += 4;
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+  float fmax = 0.f;
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    float vs = v * scale;
+    fmax = (v > fmax) ? v : fmax;
+    *dst++ = vs;
   }
+  return fmax;
 }
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff) {
-  uint32 bc = lumacoeff & 0xff;
-  uint32 gc = (lumacoeff >> 8) & 0xff;
-  uint32 rc = (lumacoeff >> 16) & 0xff;
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src++ * scale;
+  }
+}
 
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
   int i;
-  for (i = 0; i < width - 1; i += 2) {
-    // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    const uint8* luma1;
-    dst_argb[0] = luma0[src_argb[0]];
-    dst_argb[1] = luma0[src_argb[1]];
-    dst_argb[2] = luma0[src_argb[2]];
-    dst_argb[3] = src_argb[3];
-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
-              src_argb[6] * rc) & 0x7F00u) + luma;
-    dst_argb[4] = luma1[src_argb[4]];
-    dst_argb[5] = luma1[src_argb[5]];
-    dst_argb[6] = luma1[src_argb[6]];
-    dst_argb[7] = src_argb[7];
-    src_argb += 8;
-    dst_argb += 8;
+  for (i = 0; i < width; ++i) {
+    *dst++ =
+        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+    ++src;
   }
-  if (width & 1) {
-    // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    dst_argb[0] = luma0[src_argb[0]];
-    dst_argb[1] = luma0[src_argb[1]];
-    dst_argb[2] = luma0[src_argb[2]];
-    dst_argb[3] = src_argb[3];
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_C(const uint16_t* src0,
+                const uint16_t* src1,
+                const uint16_t* src2,
+                const uint16_t* src3,
+                const uint16_t* src4,
+                uint32_t* dst,
+                int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
   }
 }
 
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void GaussRow_F32_C(const float* src, float* dst, int width) {
   int i;
-  for (i = 0; i < width - 1; i += 2) {
-    dst[3] = src[3];
-    dst[7] = src[7];
-    dst += 8;
-    src += 8;
+  for (i = 0; i < width; ++i) {
+    *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+             (1.0f / 256.0f);
+    ++src;
+  }
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+  }
+}
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+    dst_yuv24[3] = src_vu[0];  // V
+    dst_yuv24[4] = src_vu[1];  // U
+    dst_yuv24[5] = src_y[1];   // Y1
+    src_y += 2;
+    src_vu += 2;
+    dst_yuv24 += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    dst[3] = src[3];
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
   }
 }
 
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
-  int i;
-  for (i = 0; i < width - 1; i += 2) {
-    dst[3] = src[0];
-    dst[7] = src[1];
-    dst += 8;
-    src += 2;
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_uv,
+                   int width) {
+  // Output a row of UV values, filtering 2x2 rows of AYUV.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 5] + 2) >>
+                2;
+    dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 4] + 2) >>
+                2;
+    src_ayuv += 8;
+    dst_uv += 2;
   }
   if (width & 1) {
-    dst[3] = src[0];
+    dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 0] + 2) >>
+                2;
+    dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 1] + 2) >>
+                2;
+  }
+}
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width) {
+  // Output a row of VU values, filtering 2x2 rows of AYUV.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 4] + 2) >>
+                2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 5] + 2) >>
+                2;
+    src_ayuv += 8;
+    dst_vu += 2;
+  }
+  if (width & 1) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 0] + 2) >>
+                2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 1] + 2) >>
+                2;
+  }
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  // Output a row of Y values.
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_ayuv[2];  // v,u,y,a
+    src_ayuv += 4;
+  }
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t u = src_uv[0];
+    uint8_t v = src_uv[1];
+    dst_vu[0] = v;
+    dst_vu[1] = u;
+    src_uv += 2;
+    dst_vu += 2;
+  }
+}
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+                 src_u[src_stride_u + 1] + 2) >>
+                2;
+    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+                 src_v[src_stride_v + 1] + 2) >>
+                2;
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+  if (width & 1) {
+    dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+    dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
   }
 }
 
diff --git a/media/libaom/src/third_party/libyuv/source/row_gcc.cc b/media/libaom/src/third_party/libyuv/source/row_gcc.cc
index 820de0a1c6..a107c30e76 100644
--- a/media/libaom/src/third_party/libyuv/source/row_gcc.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -17,2162 +16,2917 @@ extern "C" {
 #endif
 
 // This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
 
 // JPeg full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
-static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                              -18, -94, 112, 0, -18, -94, 112, 0};
 
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
 
-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
 
-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
 
-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
 
-// 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 
 // Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
+                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
+// Shuffle table for converting RAW to RGB24.  First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24.  Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRAW = {
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW_0 = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
-#if defined(TESTING) && defined(__x86_64__)
-void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm volatile (
-    ".p2align  5                               \n"
-    "mov       %%eax,%%eax                     \n"
-    "mov       %%ebx,%%ebx                     \n"
-    "mov       %%ecx,%%ecx                     \n"
-    "mov       %%edx,%%edx                     \n"
-    "mov       %%esi,%%esi                     \n"
-    "mov       %%edi,%%edi                     \n"
-    "mov       %%ebp,%%ebp                     \n"
-    "mov       %%esp,%%esp                     \n"
-    ".p2align  5                               \n"
-    "mov       %%r8d,%%r8d                     \n"
-    "mov       %%r9d,%%r9d                     \n"
-    "mov       %%r10d,%%r10d                   \n"
-    "mov       %%r11d,%%r11d                   \n"
-    "mov       %%r12d,%%r12d                   \n"
-    "mov       %%r13d,%%r13d                   \n"
-    "mov       %%r14d,%%r14d                   \n"
-    "mov       %%r15d,%%r15d                   \n"
-    ".p2align  5                               \n"
-    "lea       (%%rax),%%eax                   \n"
-    "lea       (%%rbx),%%ebx                   \n"
-    "lea       (%%rcx),%%ecx                   \n"
-    "lea       (%%rdx),%%edx                   \n"
-    "lea       (%%rsi),%%esi                   \n"
-    "lea       (%%rdi),%%edi                   \n"
-    "lea       (%%rbp),%%ebp                   \n"
-    "lea       (%%rsp),%%esp                   \n"
-    ".p2align  5                               \n"
-    "lea       (%%r8),%%r8d                    \n"
-    "lea       (%%r9),%%r9d                    \n"
-    "lea       (%%r10),%%r10d                  \n"
-    "lea       (%%r11),%%r11d                  \n"
-    "lea       (%%r12),%%r12d                  \n"
-    "lea       (%%r13),%%r13d                  \n"
-    "lea       (%%r14),%%r14d                  \n"
-    "lea       (%%r15),%%r15d                  \n"
-
-    ".p2align  5                               \n"
-    "lea       0x10(%%rax),%%eax               \n"
-    "lea       0x10(%%rbx),%%ebx               \n"
-    "lea       0x10(%%rcx),%%ecx               \n"
-    "lea       0x10(%%rdx),%%edx               \n"
-    "lea       0x10(%%rsi),%%esi               \n"
-    "lea       0x10(%%rdi),%%edi               \n"
-    "lea       0x10(%%rbp),%%ebp               \n"
-    "lea       0x10(%%rsp),%%esp               \n"
-    ".p2align  5                               \n"
-    "lea       0x10(%%r8),%%r8d                \n"
-    "lea       0x10(%%r9),%%r9d                \n"
-    "lea       0x10(%%r10),%%r10d              \n"
-    "lea       0x10(%%r11),%%r11d              \n"
-    "lea       0x10(%%r12),%%r12d              \n"
-    "lea       0x10(%%r13),%%r13d              \n"
-    "lea       0x10(%%r14),%%r14d              \n"
-    "lea       0x10(%%r15),%%r15d              \n"
-
-    ".p2align  5                               \n"
-    "add       0x10,%%eax                      \n"
-    "add       0x10,%%ebx                      \n"
-    "add       0x10,%%ecx                      \n"
-    "add       0x10,%%edx                      \n"
-    "add       0x10,%%esi                      \n"
-    "add       0x10,%%edi                      \n"
-    "add       0x10,%%ebp                      \n"
-    "add       0x10,%%esp                      \n"
-    ".p2align  5                               \n"
-    "add       0x10,%%r8d                      \n"
-    "add       0x10,%%r9d                      \n"
-    "add       0x10,%%r10d                     \n"
-    "add       0x10,%%r11d                     \n"
-    "add       0x10,%%r12d                     \n"
-    "add       0x10,%%r13d                     \n"
-    "add       0x10,%%r14d                     \n"
-    "add       0x10,%%r15d                     \n"
-
-    ".p2align  2                               \n"
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // TESTING
-
 #ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0x18,%%xmm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklwd   %%xmm0,%%xmm0                 \n"
+      "punpckhwd   %%xmm1,%%xmm1                 \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_J400TOARGBROW_SSE2
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
+      "pslld       $0x18,%%xmm5                  \n"
+      "movdqa      %3,%%xmm4                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm3               \n"
+      "lea         0x30(%0),%0                   \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "palignr     $0x8,%%xmm1,%%xmm2            \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm2                 \n"
+      "palignr     $0xc,%%xmm0,%%xmm1            \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "palignr     $0x4,%%xmm3,%%xmm3            \n"
+      "pshufb      %%xmm4,%%xmm3                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "movdqu      %%xmm3,0x30(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_rgb24),              // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRGB24ToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
+      "pslld       $0x18,%%xmm5                  \n"
+      "movdqa      %3,%%xmm4                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm3               \n"
+      "lea         0x30(%0),%0                   \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "palignr     $0x8,%%xmm1,%%xmm2            \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm2                 \n"
+      "palignr     $0xc,%%xmm0,%%xmm1            \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "palignr     $0x4,%%xmm3,%%xmm3            \n"
+      "pshufb      %%xmm4,%%xmm3                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "movdqu      %%xmm3,0x30(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_raw),              // %0
+        "+r"(dst_argb),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskRAWToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x20802080,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xa,%%xmm4                     \n"
-    "psrlw     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+// Same code as RAWToARGB with different shuffler and A in low bits
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
+      "psrld       $0x18,%%xmm5                  \n"
+      "movdqa      %3,%%xmm4                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm3               \n"
+      "lea         0x30(%0),%0                   \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "palignr     $0x8,%%xmm1,%%xmm2            \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm2                 \n"
+      "palignr     $0xc,%%xmm0,%%xmm1            \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "palignr     $0x4,%%xmm3,%%xmm3            \n"
+      "pshufb      %%xmm4,%%xmm3                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "movdqu      %%xmm3,0x30(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_raw),              // %0
+        "+r"(dst_rgba),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskRAWToRGBA)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x42004200,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "movdqa    %%xmm3,%%xmm4                   \n"
-    "psrlw     $0x6,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psllw     $0x1,%%xmm1                     \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm3                     \n"
+      "movdqa      %4,%%xmm4                     \n"
+      "movdqa      %5,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x4(%0),%%xmm1                \n"
+      "movdqu      0x8(%0),%%xmm2                \n"
+      "lea         0x18(%0),%0                   \n"
+      "pshufb      %%xmm3,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "pshufb      %%xmm5,%%xmm2                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x8(%1)                \n"
+      "movq        %%xmm2,0x10(%1)               \n"
+      "lea         0x18(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_raw),                  // %0
+        "+r"(dst_rgb24),                // %1
+        "+r"(width)                     // %2
+      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
+        "m"(kShuffleMaskRAWToRGB24_1),  // %4
+        "m"(kShuffleMaskRAWToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "mov       $0xf0f0f0f,%%eax                \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x4,%%xmm5                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "psllw     $0x4,%%xmm1                     \n"
-    "psrlw     $0x4,%%xmm3                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov         $0x1080108,%%eax              \n"
+      "movd        %%eax,%%xmm5                  \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "mov         $0x20802080,%%eax             \n"
+      "movd        %%eax,%%xmm6                  \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psllw       $0xb,%%xmm3                   \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psllw       $0xa,%%xmm4                   \n"
+      "psrlw       $0x5,%%xmm4                   \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psllw       $0x8,%%xmm7                   \n"
+      "sub         %0,%1                         \n"
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "psllw       $0xb,%%xmm2                   \n"
+      "pmulhuw     %%xmm5,%%xmm1                 \n"
+      "pmulhuw     %%xmm5,%%xmm2                 \n"
+      "psllw       $0x8,%%xmm1                   \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "pmulhuw     %%xmm6,%%xmm0                 \n"
+      "por         %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
 
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov         $0x1080108,%%eax              \n"
+      "movd        %%eax,%%xmm5                  \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "mov         $0x42004200,%%eax             \n"
+      "movd        %%eax,%%xmm6                  \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psllw       $0xb,%%xmm3                   \n"
+      "movdqa      %%xmm3,%%xmm4                 \n"
+      "psrlw       $0x6,%%xmm4                   \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psllw       $0x8,%%xmm7                   \n"
+      "sub         %0,%1                         \n"
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "psllw       $0x1,%%xmm1                   \n"
+      "psllw       $0xb,%%xmm2                   \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pmulhuw     %%xmm5,%%xmm2                 \n"
+      "pmulhuw     %%xmm5,%%xmm1                 \n"
+      "psllw       $0x8,%%xmm1                   \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "psraw       $0x8,%%xmm2                   \n"
+      "pmulhuw     %%xmm6,%%xmm0                 \n"
+      "pand        %%xmm7,%%xmm2                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
 
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov         $0xf0f0f0f,%%eax              \n"
+      "movd        %%eax,%%xmm4                  \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "pslld       $0x4,%%xmm5                   \n"
+      "sub         %0,%1                         \n"
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "psllw       $0x4,%%xmm1                   \n"
+      "psrlw       $0x4,%%xmm3                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psrld     $0x1b,%%xmm3                    \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1a,%%xmm4                    \n"
-    "pslld     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0xb,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pslld     $0x8,%%xmm0                     \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x5,%%xmm2                     \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa      %3,%%xmm6                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "lea         0x40(%0),%0                   \n"
+      "pshufb      %%xmm6,%%xmm0                 \n"
+      "pshufb      %%xmm6,%%xmm1                 \n"
+      "pshufb      %%xmm6,%%xmm2                 \n"
+      "pshufb      %%xmm6,%%xmm3                 \n"
+      "movdqa      %%xmm1,%%xmm4                 \n"
+      "psrldq      $0x4,%%xmm1                   \n"
+      "pslldq      $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pslldq      $0x8,%%xmm5                   \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "psrldq      $0x8,%%xmm2                   \n"
+      "pslldq      $0x4,%%xmm3                   \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "lea         0x30(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskARGBToRGB24)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1b,%%xmm4                    \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x5,%%xmm5                     \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "pslld     $0xa,%%xmm6                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "pslld     $0xf,%%xmm7                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x6,%%xmm2                     \n"
-    "psrld     $0x9,%%xmm3                     \n"
-    "pand      %%xmm7,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm6,%%xmm3                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :: "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa      %3,%%xmm6                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "lea         0x40(%0),%0                   \n"
+      "pshufb      %%xmm6,%%xmm0                 \n"
+      "pshufb      %%xmm6,%%xmm1                 \n"
+      "pshufb      %%xmm6,%%xmm2                 \n"
+      "pshufb      %%xmm6,%%xmm3                 \n"
+      "movdqa      %%xmm1,%%xmm4                 \n"
+      "psrldq      $0x4,%%xmm1                   \n"
+      "pslldq      $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pslldq      $0x8,%%xmm5                   \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "psrldq      $0x8,%%xmm2                   \n"
+      "pslldq      $0x4,%%xmm3                   \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "lea         0x30(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskARGBToRAW)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm4,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm3,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "psrlq     $0x4,%%xmm0                     \n"
-    "psrlq     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(pix)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd for 12+12 to 24
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa     %4,%%ymm7                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "lea         0x80(%0),%0                   \n"
+      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
+      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
+      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
+      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
+      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
+      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
+      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
+      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
+      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
+      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
+      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
+      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
+      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
+      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
+      "vmovdqu     %%ymm2,0x40(%1)               \n"
+      "lea         0x60(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+r"(width)                    // %2
+      : "m"(kShuffleMaskARGBToRGB24),  // %3
+        "m"(kPermdRGB24_AVX)           // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
+    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vmovdqa     %3,%%ymm5                     \n"
+      "vmovdqa     %4,%%ymm6                     \n"
+      "vmovdqa     %5,%%ymm7                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "lea         0x80(%0),%0                   \n"
+      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"
+      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
+      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "vmovdqu     %%ymm2,0x40(%1)               \n"
+      "lea         0x60(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                // %0
+        "+r"(dst),                // %1
+        "+r"(width)               // %2
+      : "m"(kPermARGBToRGB24_0),  // %3
+        "m"(kPermARGBToRGB24_1),  // %4
+        "m"(kPermARGBToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa     %4,%%ymm7                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "lea         0x80(%0),%0                   \n"
+      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
+      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
+      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
+      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
+      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
+      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
+      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
+      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
+      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
+      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
+      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
+      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
+      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
+      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
+      "vmovdqu     %%ymm2,0x40(%1)               \n"
+      "lea         0x60(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                   // %0
+        "+r"(dst),                   // %1
+        "+r"(width)                  // %2
+      : "m"(kShuffleMaskARGBToRAW),  // %3
+        "m"(kPermdRGB24_AVX)         // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psrld       $0x1b,%%xmm3                  \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1a,%%xmm4                  \n"
+      "pslld       $0x5,%%xmm4                   \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0xb,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pslld       $0x8,%%xmm0                   \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x5,%%xmm2                   \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm2                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "movd        %3,%%xmm6                     \n"
+      "punpcklbw   %%xmm6,%%xmm6                 \n"
+      "movdqa      %%xmm6,%%xmm7                 \n"
+      "punpcklwd   %%xmm6,%%xmm6                 \n"
+      "punpckhwd   %%xmm7,%%xmm7                 \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psrld       $0x1b,%%xmm3                  \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1a,%%xmm4                  \n"
+      "pslld       $0x5,%%xmm4                   \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0xb,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "paddusb     %%xmm6,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pslld       $0x8,%%xmm0                   \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x5,%%xmm2                   \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm2                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vbroadcastss %3,%%xmm6                    \n"
+      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"
+      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
+      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
+      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
+      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
+      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
+      "vpslld      $0xb,%%ymm3,%%ymm5            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"
+      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"
+      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
+      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
+      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "lea         0x20(%0),%0                   \n"
+      "vmovdqu     %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1b,%%xmm4                  \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "pslld       $0x5,%%xmm5                   \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "pslld       $0xa,%%xmm6                   \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "pslld       $0xf,%%xmm7                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm3                 \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x6,%%xmm2                   \n"
+      "psrld       $0x9,%%xmm3                   \n"
+      "pand        %%xmm7,%%xmm0                 \n"
+      "pand        %%xmm4,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm2                 \n"
+      "pand        %%xmm6,%%xmm3                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psllw       $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm4,%%xmm3                 \n"
+      "psrlw       $0x8,%%xmm3                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm3,%%xmm0                 \n"
+      "pand        %%xmm4,%%xmm1                 \n"
+      "psrlq       $0x4,%%xmm0                   \n"
+      "psrlq       $0x8,%%xmm1                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats
+wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
+(1024+4)*16 for red.
+
+Alpha Green
+Alpha and Green are already in the high bits so vpand can zero out the other
+bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
+could be used for Green - (1024+4) putting the 10 bit green in the lsb.  Alpha
+would be a simple multiplier to shift it into position.  It wants a gap of 10
+above the green.  Green is 10 bits, so there are 6 bits in the low short.  4
+more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
+and then a shift of 4 is a multiply of 16, so (4*16) = 64.  Then shift the
+result left 10 to position the A and G channels.
+*/
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
+                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
+                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
+      "movd        %4,%%xmm3                     \n"  // multipler for RB
+      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd        %6,%%xmm5                     \n"  // mask for AG
+      "movd        %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "sub         %0,%1                         \n"
+
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add         $0x10,%0                      \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
+      "movd        %4,%%xmm3                     \n"  // multipler for RB
+      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd        %6,%%xmm5                     \n"  // mask for AG
+      "movd        %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "sub         %0,%1                         \n"
+
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add         $0x10,%0                      \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss %4,%%ymm3                    \n"  // multipler for RB
+      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
+      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
+      "vbroadcastss %7,%%ymm6                    \n"  // multipler for AG
+      "sub         %0,%1                         \n"
+
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
+      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
+      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
+      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
+      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
+      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
+      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
+      "add         $0x20,%0                      \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss %4,%%ymm3                    \n"  // multipler for RB
+      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
+      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
+      "vbroadcastss %7,%%ymm6                    \n"  // multipler for AG
+      "sub         %0,%1                         \n"
+
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
+      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
+      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
+      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
+      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
+      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
+      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
+      "add         $0x20,%0                      \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// round parameter is register containing value to add before shift.
+#define RGBTOY(round)                            \
+  "1:                                        \n" \
+  "movdqu    (%0),%%xmm0                     \n" \
+  "movdqu    0x10(%0),%%xmm1                 \n" \
+  "movdqu    0x20(%0),%%xmm2                 \n" \
+  "movdqu    0x30(%0),%%xmm3                 \n" \
+  "psubb     %%xmm5,%%xmm0                   \n" \
+  "psubb     %%xmm5,%%xmm1                   \n" \
+  "psubb     %%xmm5,%%xmm2                   \n" \
+  "psubb     %%xmm5,%%xmm3                   \n" \
+  "movdqu    %%xmm4,%%xmm6                   \n" \
+  "pmaddubsw %%xmm0,%%xmm6                   \n" \
+  "movdqu    %%xmm4,%%xmm0                   \n" \
+  "pmaddubsw %%xmm1,%%xmm0                   \n" \
+  "movdqu    %%xmm4,%%xmm1                   \n" \
+  "pmaddubsw %%xmm2,%%xmm1                   \n" \
+  "movdqu    %%xmm4,%%xmm2                   \n" \
+  "pmaddubsw %%xmm3,%%xmm2                   \n" \
+  "lea       0x40(%0),%0                     \n" \
+  "phaddw    %%xmm0,%%xmm6                   \n" \
+  "phaddw    %%xmm2,%%xmm1                   \n" \
+  "prefetcht0 1280(%0)                       \n" \
+  "paddw     %%" #round ",%%xmm6             \n" \
+  "paddw     %%" #round ",%%xmm1             \n" \
+  "psrlw     $0x8,%%xmm6                     \n" \
+  "psrlw     $0x8,%%xmm1                     \n" \
+  "packuswb  %%xmm1,%%xmm6                   \n" \
+  "movdqu    %%xmm6,(%1)                     \n" \
+  "lea       0x10(%1),%1                     \n" \
+  "sub       $0x10,%2                        \n" \
+  "jg        1b                              \n"
+
+#define RGBTOY_AVX2(round)                                       \
+  "1:                                        \n"                 \
+  "vmovdqu    (%0),%%ymm0                    \n"                 \
+  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
+  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
+  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
+  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
+  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
+  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
+  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
+  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
+  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
+  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
+  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
+  "lea       0x80(%0),%0                     \n"                 \
+  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
+  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
+  "prefetcht0 1280(%0)                       \n"                 \
+  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
+  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n" \
+  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
+  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
+  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
+  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
+  "vmovdqu    %%ymm0,(%1)                    \n"                 \
+  "lea       0x20(%1),%1                     \n"                 \
+  "sub       $0x20,%2                        \n"                 \
+  "jg        1b                              \n"                 \
+  "vzeroupper                                \n"
+
+// clang-format on
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+      "movdqa      %5,%%xmm7                     \n"
+
+      LABELALIGN RGBTOY(xmm7)
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToY),   // %3
+        "m"(kSub128),    // %4
+        "m"(kAddY16)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 #ifdef HAS_ARGBTOYJROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToYJ),  // %3
-    "m"(kAddYJ64)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+// Same as ARGBToYRow but different coefficients, no add 16.
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      LABELALIGN RGBTOY(xmm5)
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kSub128)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBTOYJROW_SSSE3
 
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      LABELALIGN RGBTOY(xmm5)
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kRGBAToYJ),  // %3
+        "m"(kSub128)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif  // HAS_RGBATOYJROW_SSSE3
+
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vbroadcastf128 %5,%%ymm7                  \n"
+      "vmovdqu     %6,%%ymm6                     \n"
+
+      LABELALIGN RGBTOY_AVX2(ymm7)
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToY),          // %3
+        "m"(kSub128),           // %4
+        "m"(kAddY16),           // %5
+        "m"(kPermdARGBToY_AVX)  // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOYROW_AVX2
 
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vbroadcastf128 %5,%%ymm7                  \n"
+      "vmovdqu     %6,%%ymm6                     \n"
+
+      LABELALIGN RGBTOY_AVX2(ymm7)
+      : "+r"(src_abgr),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kABGRToY),          // %3
+        "m"(kSub128),           // %4
+        "m"(kAddY16),           // %5
+        "m"(kPermdARGBToY_AVX)  // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // HAS_ABGRTOYROW_AVX2
+
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
-    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu     %5,%%ymm6                     \n"
+
+      LABELALIGN RGBTOY_AVX2(ymm5)
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToYJ),         // %3
+        "m"(kSub128),           // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOYJROW_AVX2
 
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu     %5,%%ymm6                     \n"
+
+      LABELALIGN RGBTOY_AVX2(
+          ymm5) "vzeroupper                                \n"
+      : "+r"(src_rgba),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kRGBAToYJ),         // %3
+        "m"(kSub128),           // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif  // HAS_RGBATOYJROW_AVX2
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToV),  // %5
-    "m"(kARGBToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa      %5,%%xmm3                     \n"
+      "movdqa      %6,%%xmm4                     \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm1                 \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+
+      "lea         0x40(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm6,%%xmm2           \n"
+      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm2,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm1                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm1                   \n"
+      "packsswb    %%xmm1,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "movlps      %%xmm0,(%1)                   \n"
+      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToV),                     // %5
+        "m"(kARGBToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_SSSE3
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
-
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
-
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUV128),  // %5
-    "m"(kARGBToV),   // %6
-    "m"(kARGBToU),   // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
+      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
+      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
+      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
+      "lea         0x80(%0),%0                   \n"
+      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
+      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
+      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
+      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"
+
+      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
+      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
+      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
+      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpshufb     %8,%%ymm0,%%ymm0              \n"
+      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kAddUV128),                    // %5
+        "m"(kARGBToV),                     // %6
+        "m"(kARGBToU),                     // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
+      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
+      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
+      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
+      "lea         0x80(%0),%0                   \n"
+      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
+      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
+      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
+      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"
+
+      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
+      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
+      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
+      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpshufb     %8,%%ymm0,%%ymm0              \n"
+      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_abgr0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_abgr)),  // %4
+        "m"(kAddUV128),                    // %5
+        "m"(kABGRToV),                     // %6
+        "m"(kABGRToU),                     // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // HAS_ABGRTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x40(%0),%%ymm2               \n"
+      "vmovdqu     0x60(%0),%%ymm3               \n"
+      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
+      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
+      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
+      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
+      "lea         0x80(%0),%0                   \n"
+      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
+      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
+      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
+      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
+      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"
+
+      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
+      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
+      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
+      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpshufb     %8,%%ymm0,%%ymm0              \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kSub128),                      // %5
+        "m"(kARGBToVJ),                    // %6
+        "m"(kARGBToUJ),                    // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // HAS_ARGBTOUVJROW_AVX2
+
 #ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToVJ),  // %5
-    "m"(kARGBToUJ),  // %6
-    "m"(kAddUVJ128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "movdqa      %5,%%xmm3                     \n"
+      "movdqa      %6,%%xmm4                     \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm1                 \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+
+      "lea         0x40(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm6,%%xmm2           \n"
+      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm2,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm1                 \n"
+      "paddw       %%xmm5,%%xmm0                 \n"
+      "paddw       %%xmm5,%%xmm1                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm1                   \n"
+      "packsswb    %%xmm1,%%xmm0                 \n"
+      "movlps      %%xmm0,(%1)                   \n"
+      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToVJ),                    // %5
+        "m"(kARGBToUJ),                    // %6
+        "m"(kSub128)                       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_SSSE3
 
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm3                       \n"
-    "movdqa    %5,%%xmm4                       \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "m"(kARGBToV),  // %4
-    "m"(kARGBToU),  // %5
-    "m"(kAddUV128)  // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6"
-  );
+  asm volatile(
+      "movdqa      %4,%%xmm3                     \n"
+      "movdqa      %5,%%xmm4                     \n"
+      "movdqa      %6,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm6                 \n"
+      "phaddw      %%xmm1,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm2                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm2                   \n"
+      "packsswb    %%xmm2,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "pmaddubsw   %%xmm3,%%xmm0                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm1,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm2                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm2                   \n"
+      "packsswb    %%xmm2,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "lea         0x40(%0),%0                   \n"
+      "movdqu      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+rm"(width)     // %3
+      : "m"(kARGBToV),   // %4
+        "m"(kARGBToU),   // %5
+        "m"(kAddUV128)   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
 }
 #endif  // HAS_ARGBTOUV444ROW_SSSE3
 
-#ifdef HAS_ARGBTOUV422ROW_SSSE3
-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm3                       \n"
-    "movdqa    %5,%%xmm4                       \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "m"(kARGBToV),  // %4
-    "m"(kARGBToU),  // %5
-    "m"(kAddUV128)  // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+      "movdqa      %5,%%xmm7                     \n"
+
+      LABELALIGN RGBTOY(xmm7)
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kBGRAToY),   // %3
+        "m"(kSub128),    // %4
+        "m"(kAddY16)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
-#endif  // HAS_ARGBTOUV422ROW_SSSE3
 
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa      %5,%%xmm3                     \n"
+      "movdqa      %6,%%xmm4                     \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm1                 \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+
+      "lea         0x40(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm6,%%xmm2           \n"
+      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm2,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm1                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm1                   \n"
+      "packsswb    %%xmm1,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "movlps      %%xmm0,(%1)                   \n"
+      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_bgra0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_bgra)),  // %4
+        "m"(kBGRAToV),                     // %5
+        "m"(kBGRAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)), // %4
-    "m"(kBGRAToV),  // %5
-    "m"(kBGRAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+      "movdqa      %5,%%xmm7                     \n"
+
+      LABELALIGN RGBTOY(xmm7)
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kABGRToY),   // %3
+        "m"(kSub128),    // %4
+        "m"(kAddY16)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kRGBAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+      "movdqa      %5,%%xmm7                     \n"
+
+      LABELALIGN RGBTOY(xmm7)
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kRGBAToY),   // %3
+        "m"(kSub128),    // %4
+        "m"(kAddY16)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)), // %4
-    "m"(kABGRToV),  // %5
-    "m"(kABGRToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa      %5,%%xmm3                     \n"
+      "movdqa      %6,%%xmm4                     \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm1                 \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+
+      "lea         0x40(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm6,%%xmm2           \n"
+      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm2,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm1                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm1                   \n"
+      "packsswb    %%xmm1,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "movlps      %%xmm0,(%1)                   \n"
+      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_abgr0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_abgr)),  // %4
+        "m"(kABGRToV),                     // %5
+        "m"(kABGRToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba)), // %4
-    "m"(kRGBAToV),  // %5
-    "m"(kRGBAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa      %5,%%xmm3                     \n"
+      "movdqa      %6,%%xmm4                     \n"
+      "movdqa      %7,%%xmm5                     \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm1                 \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqu      0x30(%0),%%xmm6               \n"
+      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+
+      "lea         0x40(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "shufps      $0x88,%%xmm6,%%xmm2           \n"
+      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "phaddw      %%xmm2,%%xmm0                 \n"
+      "phaddw      %%xmm6,%%xmm1                 \n"
+      "psraw       $0x8,%%xmm0                   \n"
+      "psraw       $0x8,%%xmm1                   \n"
+      "packsswb    %%xmm1,%%xmm0                 \n"
+      "paddb       %%xmm5,%%xmm0                 \n"
+      "movlps      %%xmm0,(%1)                   \n"
+      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_rgba0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_rgba)),  // %4
+        "m"(kRGBAToV),                     // %5
+        "m"(kRGBAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
 
-struct YuvConstants {
-  lvec8 kUVToB;     // 0
-  lvec8 kUVToG;     // 32
-  lvec8 kUVToR;     // 64
-  lvec16 kUVBiasB;  // 96
-  lvec16 kUVBiasG;  // 128
-  lvec16 kUVBiasR;  // 160
-  lvec16 kYToRgb;   // 192
-};
-
-// BT.601 YUV to RGB reference
-//  R = (Y - 16) * 1.164              - V * -1.596
-//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
-//  B = (Y - 16) * 1.164 - U * -2.018
-
-// Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
-
-// BT601 constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-
-// BT601 constants for NV21 where chroma plane is VU instead of UV.
-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-
-#undef YG
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef BB
-#undef BG
-#undef BR
-
-// JPEG YUV to RGB reference
-// *  R = Y                - V * -1.40200
-// *  G = Y - U *  0.34414 - V *  0.71414
-// *  B = Y - U * -1.77200
-
-// Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGBJ 32  /* 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UBJ -113 /* round(-1.77200 * 64) */
-#define UGJ 22 /* round(0.34414 * 64) */
-#define VGJ 46 /* round(0.71414  * 64) */
-#define VRJ -90 /* round(-1.40200 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BBJ (UBJ * 128             + YGBJ)
-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
-#define BRJ             (VRJ * 128 + YGBJ)
-
-// JPEG constants for YUV to RGB.
-YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
-  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
-    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
-  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
-  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
-    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
-  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
-    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
-  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
-    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
-  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
-    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
-  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
-    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
-};
-
-#undef YGJ
-#undef YGBJ
-#undef UBJ
-#undef UGJ
-#undef VGJ
-#undef VRJ
-#undef BBJ
-#undef BGJ
-#undef BRJ
-
-// Read 8 UV from 411
-#define READYUV444                                                             \
-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"
+// Read 8 UV from 444
+#define READYUV444                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
-
-// Read 2 UV from 411, upsample to 8 UV
-#define READYUV411                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"
+#define READYUV422                                                \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV
+// TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
+#define READYUV210                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
+  "psraw      $0x2,%%xmm0                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movdqu     (%[y_buf]),%%xmm4                               \n" \
+  "psllw      $0x6,%%xmm4                                     \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422                                               \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
+  "movq       (%[a_buf]),%%xmm5                               \n" \
+  "lea        0x8(%[a_buf]),%[a_buf]                          \n"
 
 // Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+#define READNV12                                                  \
+  "movq       (%[uv_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21                                                  \
+  "movq       (%[vu_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
+  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2                                                  \
+  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
+  "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
+  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY                                                  \
+  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
+  "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
+  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants)                              \
+  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
+  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
+  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
+  "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     %%xmm11,%%xmm0                                  \n" \
+  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm12,%%xmm1                                  \n" \
+  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     %%xmm13,%%xmm2                                  \n" \
+  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
+#define YUVTORGB_REGS \
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
 
+#else
+#define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(YuvConstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
-    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
-    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
-    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
-    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
-    "paddsw     %%xmm3,%%xmm0                                   \n"            \
-    "paddsw     %%xmm3,%%xmm1                                   \n"            \
-    "paddsw     %%xmm3,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
-
-// Store 8 ARGB values. Assumes XMM5 is zero.
-#define STOREARGB                                                              \
-    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
-    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
-    "movdqa     %%xmm0,%%xmm1                                    \n"           \
-    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
-    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
-    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
-    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
-    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
-
-// Store 8 BGRA values. Assumes XMM5 is zero.
-#define STOREBGRA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
-    "punpcklbw %%xmm0,%%xmm1                                     \n"           \
-    "punpcklbw %%xmm2,%%xmm5                                     \n"           \
-    "movdqa    %%xmm5,%%xmm0                                     \n"           \
-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
-    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"
-
-// Store 8 ABGR values. Assumes XMM5 is zero.
-#define STOREABGR                                                              \
-    "punpcklbw %%xmm1,%%xmm2                                     \n"           \
-    "punpcklbw %%xmm5,%%xmm0                                     \n"           \
-    "movdqa    %%xmm2,%%xmm1                                     \n"           \
-    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
-    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
-    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
-    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"
-
-// Store 8 RGBA values. Assumes XMM5 is zero.
-#define STORERGBA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
-    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
-    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
-    "movdqa    %%xmm5,%%xmm0                                     \n"           \
-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
+  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
+#define YUVTORGB_REGS
+#endif
+
+#define YUVTORGB(yuvconstants)                                    \
+  YUVTORGB16(yuvconstants)                                        \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
+
+// Store 8 ARGB values.
+#define STOREARGB                                                  \
+  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
+  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
+  "movdqa     %%xmm0,%%xmm1                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
+  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
+  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
+  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
+
+// Store 8 RGBA values.
+#define STORERGBA                                                  \
+  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
+  "punpcklbw %%xmm2,%%xmm1                                     \n" \
+  "punpcklbw %%xmm0,%%xmm5                                     \n" \
+  "movdqa    %%xmm5,%%xmm0                                     \n" \
+  "punpcklwd %%xmm1,%%xmm5                                     \n" \
+  "punpckhwd %%xmm1,%%xmm0                                     \n" \
+  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
+  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
+  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
+
+// Store 8 AR30 values.
+#define STOREAR30                                                  \
+  "psraw      $0x4,%%xmm0                                      \n" \
+  "psraw      $0x4,%%xmm1                                      \n" \
+  "psraw      $0x4,%%xmm2                                      \n" \
+  "pminsw     %%xmm7,%%xmm0                                    \n" \
+  "pminsw     %%xmm7,%%xmm1                                    \n" \
+  "pminsw     %%xmm7,%%xmm2                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "psllw      $0x4,%%xmm2                                      \n" \
+  "movdqa     %%xmm0,%%xmm3                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
+  "movdqa     %%xmm1,%%xmm2                                    \n" \
+  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
+  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
+  "pslld      $0xa,%%xmm1                                      \n" \
+  "pslld      $0xa,%%xmm2                                      \n" \
+  "por        %%xmm1,%%xmm0                                    \n" \
+  "por        %%xmm2,%%xmm3                                    \n" \
+  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
+  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
+  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV444
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(yuvconstants)
     STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-// TODO(fbarchard): Consider putting masks into constants.
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* dst_rgb24,
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_rgb24,
+                                 const struct YuvConstants* yuvconstants,
                                  int width) {
   asm volatile (
-    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
-    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
-    "sub       %[u_buf],%[v_buf]               \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+      "sub         %[u_buf],%[v_buf]             \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422
-    YUVTORGB(kYuvConstants)
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
-    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+    YUVTORGB(yuvconstants)
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "punpcklbw   %%xmm2,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklwd   %%xmm2,%%xmm0                 \n"
+      "punpckhwd   %%xmm2,%%xmm1                 \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm6,%%xmm1                 \n"
+      "palignr     $0xc,%%xmm0,%%xmm1            \n"
+      "movq        %%xmm0,(%[dst_rgb24])         \n"
+      "movdqu      %%xmm1,0x8(%[dst_rgb24])      \n"
+      "lea         0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
-// TODO(fbarchard): Make width a register for 32 bit.
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
 #endif
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
-void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_raw,
-                               int width) {
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
   asm volatile (
-    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
-    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
-    "sub       %[u_buf],%[v_buf]               \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422
-    YUVTORGB(kYuvConstants)
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
-    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+    YUVTORGB(yuvconstants)
+    STOREARGB
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
-// TODO(fbarchard): Make width a register for 32 bit.
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)    // %[width]
-#else
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-#endif
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
-    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
-    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
+      "psrlw       $14,%%xmm5                    \n"
+      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
+      "pxor        %%xmm6,%%xmm6                 \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
+      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
-void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(kYuvConstants)
+      "1:                                        \n"
+    READYUV210
+    YUVTORGB(yuvconstants)
     STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $14,%%xmm5                    \n"
+      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
+      "pxor        %%xmm6,%%xmm6                 \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // 0 for min
+      "psrlw       $6,%%xmm7                     \n"  // 1023 for max
+
+    LABELALIGN
+      "1:                                        \n"
+    READYUV210
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                                     const uint8_t* u_buf,
+                                     const uint8_t* v_buf,
+                                     const uint8_t* a_buf,
+                                     uint8_t* dst_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB(kYuvConstants)
+      "1:                                        \n"
+    READYUVA422
+    YUVTORGB(yuvconstants)
     STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width)     // %[width]
+#else
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
+#endif  // HAS_I422ALPHATOARGBROW_SSSE3
 
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* dst_argb,
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* uv_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READNV12
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(yuvconstants)
     STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  // Does not use r14.
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* dst_argb,
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* vu_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READNV12
-    YUVTORGB(kYuvConstants)
+      "1:                                        \n"
+    READNV21
+    YUVTORGB(yuvconstants)
     STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
-  // Does not use r14.
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_bgra,
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREBGRA
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+      "1:                                        \n"
+    READYUY2
+    YUVTORGB(yuvconstants)
+    STOREARGB
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_abgr,
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREABGR
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
+      "1:                                        \n"
+    READUYVY
+    YUVTORGB(yuvconstants)
+    STOREARGB
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_rgba,
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_rgba,
+                                const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    YUVTORGB_SETUP(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(yuvconstants)
     STORERGBA
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
 #endif  // HAS_I422TOARGBROW_SSSE3
 
+// Read 16 UV from 444
+#define READYUV444_AVX2                                               \
+  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
+
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2                                                        \
-    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
-
-// Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants)                                            \
-    "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
-    "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
-    "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
-    "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
-    "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
-    "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \
-    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
-    "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
-    "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
-    "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
-    "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
-    "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
-    "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
-    "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
-    "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
-    "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
-    "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
-    "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
-
-#if defined(HAS_I422TOBGRAROW_AVX2)
+#define READYUV422_AVX2                                               \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
+
+// Read 8 UV from 210 10 bit, upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2                                            \
+  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
+  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
+  "vpsraw     $0x2,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
+  "vpsllw     $0x6,%%ymm4,%%ymm4                               \n" \
+  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
+
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
+#define READYUVA422_AVX2                                              \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
+  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
+  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
+  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2                                                 \
+  "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2                                                 \
+  "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2                                                 \
+  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2                                                 \
+  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
+  "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
+  "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
+  "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
+  "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
+  "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
+  "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
+  "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"
+
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
+  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
+  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
+  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
+  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
+  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
+  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
+
+#define YUVTORGB_REGS_AVX2 \
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else  // Convert 16 pixels: 16 UV and 16 Y.
+
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
+  "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
+  "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
+  "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
+  "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
+  "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
+  "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
+  "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_REGS_AVX2
+#endif
+
+#define YUVTORGB_AVX2(yuvconstants)                                   \
+  YUVTORGB16_AVX2(yuvconstants)                                       \
+  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
+  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
+  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
+  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2                                                \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
+  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
+  "lea       0x40(%[dst_argb]), %[dst_argb]                       \n"
+
+// Store 16 AR30 values.
+#define STOREAR30_AVX2                                                \
+  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
+  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
+  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
+  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
+  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
+  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
+  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
+  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
+  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
+  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
+  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
+  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
+  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
-void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_bgra,
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
-
-    // Step 3: Weave into BGRA
-    "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
-    "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
-
-    "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
-    "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
-    "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READYUV444_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
-#endif  // HAS_I422TOBGRAROW_AVX2
+#endif  // HAS_I444TOARGBROW_AVX2
 
 #if defined(HAS_I422TOARGBROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
 
-    // Step 3: Weave into ARGB
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
-
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
+      "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_I422TOARGBROW_AVX2
 
-#if defined(HAS_J422TOARGBROW_AVX2)
+#if defined(HAS_I422TOAR30ROW_AVX2)
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
+      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
+      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
+      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
+      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
 
-    // Step 3: Weave into ARGB
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
-
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
+      "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
-#endif  // HAS_J422TOARGBROW_AVX2
+#endif  // HAS_I422TOAR30ROW_AVX2
 
-#if defined(HAS_I422TOABGRROW_AVX2)
+#if defined(HAS_I210TOARGBROW_AVX2)
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
-void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+      "1:                                        \n"
+    READYUV210_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
 
-    // Step 3: Weave into ABGR
-    "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
-    "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
+      "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
-#endif  // HAS_I422TOABGRROW_AVX2
+#endif  // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
+      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
+      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
+      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
+      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
+      "vpsrlw      $6,%%ymm7,%%ymm7              \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READYUV210_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+
+      "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                                    const uint8_t* u_buf,
+                                    const uint8_t* v_buf,
+                                    const uint8_t* a_buf,
+                                    uint8_t* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "subl        $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
 
 #if defined(HAS_I422TORGBAROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "sub         %[u_buf],%[v_buf]             \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
     LABELALIGN
-  "1:                                          \n"
+      "1:                                        \n"
     READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(yuvconstants)
 
     // Step 3: Weave into RGBA
     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
@@ -2181,1522 +2935,2209 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
+    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
+    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
+    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
+    "sub        $0x10,%[width]                 \n"
+    "jg         1b                             \n"
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_I422TORGBAROW_AVX2
 
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READNV12_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* vu_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
   asm volatile (
-    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
-    "movd      %%eax,%%xmm2                    \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
     LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "psubusw   %%xmm3,%%xmm0                   \n"
-    "psrlw     $6, %%xmm0                      \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-
-    // Step 2: Weave into ARGB
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "por       %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+      "1:                                        \n"
+    READNV21_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS_AVX2
+      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READYUY2_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2
+      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+
+    LABELALIGN
+      "1:                                        \n"
+    READUYVY_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
+}
+#endif  // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile(
+      "movdqa      192(%3),%%xmm2                \n"  // yg = 18997 = 1.164
+      "movdqa      224(%3),%%xmm3                \n"  // ygb = 1160 = 1.164 * 16
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
+      "pslld       $0x18,%%xmm4                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "paddsw    %%xmm3,%%xmm0                   \n"
+      "psraw     $6, %%xmm0                      \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+
+      // Step 2: Weave into ARGB
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm0,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm1                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "por       %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(y_buf),       // %0
+        "+r"(dst_argb),    // %1
+        "+rm"(width)       // %2
+      : "r"(yuvconstants)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_SSE2
 
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
-    "vmovd      %%eax,%%xmm2                   \n"
-    "vbroadcastss %%xmm2,%%ymm2                \n"
-    "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
-    "vmovd      %%eax,%%xmm3                   \n"
-    "vbroadcastss %%xmm3,%%ymm3                \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpslld     $0x18,%%ymm4,%%ymm4            \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
-    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
-    "lea        " MEMLEA(0x10,0) ",%0          \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub        $0x10,%2                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile(
+      "vmovdqa     192(%3),%%ymm2                \n"  // yg = 18997 = 1.164
+      "vmovdqa     224(%3),%%ymm3                \n"  // ygb = -1160 = 1.164*16
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
+      "vpslld      $0x18,%%ymm4,%%ymm4           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+      "vmovdqu    (%0),%%xmm0                    \n"
+      "lea        0x10(%0),%0                    \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
+      "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
+      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
+      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea        0x40(%1),%1                     \n"
+      "sub        $0x10,%2                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(y_buf),       // %0
+        "+r"(dst_argb),    // %1
+        "+rm"(width)       // %2
+      : "r"(yuvconstants)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "movdqa      %3,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm5                  \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
-    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_AVX2
 
-#ifdef HAS_MIRRORROW_SSE2
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1)",%1            \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+  asm volatile(
+
+      "movdqa      %3,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
-#endif  // HAS_MIRRORROW_SSE2
+#endif  // HAS_MIRRORUVROW_SSSE3
 
-#ifdef HAS_MIRRORROW_UV_SSSE3
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %4,%%xmm1                       \n"
-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "pshufb    %%xmm1,%%xmm0                   \n"
-    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $8,%3                           \n"
-    "jg        1b                              \n"
-  : "+r"(src),      // %0
-    "+r"(dst_u),    // %1
-    "+r"(dst_v),    // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+  asm volatile(
+      "movdqa      %4,%%xmm1                     \n"
+      "lea         -0x10(%0,%3,2),%0             \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         -0x10(%0),%0                  \n"
+      "pshufb      %%xmm1,%%xmm0                 \n"
+      "movlpd      %%xmm0,(%1)                   \n"
+      "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $8,%3                         \n"
+      "jg          1b                            \n"
+      : "+r"(src),                  // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(temp_width)            // %3
+      : "m"(kShuffleMirrorSplitUV)  // %4
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif  // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored.  first byte zero
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+                                         7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};
+
+// Shuffle last 5 pixels to first 5 mirrored.  last byte zero
+static const uvec8 kShuffleMirrorRGB1 = {
+    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Shuffle 5 pixels at a time (15 bytes)
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_rgb24,
+                          int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  src_rgb24 += width * 3 - 48;
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // first 5
+      "movdqu      15(%0),%%xmm1                 \n"  // next 5
+      "movdqu      30(%0),%%xmm2                 \n"  // next 5
+      "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"
+      "pshufb      %%xmm5,%%xmm3                 \n"
+      "lea         -0x30(%0),%0                  \n"
+      "movdqu      %%xmm0,32(%1)                 \n"  // last 5
+      "movdqu      %%xmm1,17(%1)                 \n"  // next 5
+      "movdqu      %%xmm2,2(%1)                  \n"  // next 5
+      "movlpd      %%xmm3,0(%1)                  \n"  // first 1
+      "lea         0x30(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_rgb24),          // %0
+        "+r"(dst_rgb24),          // %1
+        "+r"(temp_width)          // %2
+      : "m"(kShuffleMirrorRGB0),  // %3
+        "m"(kShuffleMirrorRGB1)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
-#endif  // HAS_MIRRORROW_UV_SSSE3
+#endif  // HAS_RGB24MIRRORROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
 
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+  asm volatile(
+
+      "lea         -0x10(%0,%2,4),%0             \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
+      "lea         -0x10(%0),%0                  \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),        // %0
+        "+r"(dst),        // %1
+        "+r"(temp_width)  // %2
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBMIRRORROW_SSE2
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vmovdqu    %3,%%ymm5                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror_AVX2) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vmovdqu     %3,%%ymm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(temp_width)              // %2
+      : "m"(kARGBShuffleMirror_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
-    "lea        " MEMLEA(0x40,0) ",%0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
-    "lea        " MEMLEA(0x20,1) ",%1            \n"
-    "sub        $0x20,%3                         \n"
-    "jg         1b                               \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(pix)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(pix)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm2                   \n"
+      "psrlw       $0x8,%%xmm3                   \n"
+      "packuswb    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
-    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
-    "lea       " MEMLEA(0x20,0) ",%0             \n"
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
-    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
-    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
-    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
-    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
-    "lea       " MEMLEA(0x40,2) ",%2             \n"
-    "sub       $0x20,%3                          \n"
-    "jg        1b                                \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x00(%0,%1,1),%%ymm1          \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm2          \n"
+      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vextractf128 $0x0,%%ymm2,(%2)             \n"
+      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+      "lea         0x40(%2),%2                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
-    "jg        1b                                \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub         %0,%1                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "movdqu      %%xmm2,0x10(%2)               \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width) {
+  // clang-format off
   asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+      "vmovd       %4,%%xmm3                     \n"
+      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
+      "vbroadcastss %%xmm3,%%ymm3                \n"
+      "sub         %0,%1                         \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     (%0,%1,1),%%ymm1              \n"
+      "add         $0x20,%0                      \n"
+
+      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
+      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm2          \n"  // mutates
+      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vextractf128 $0x0,%%ymm2,(%2)             \n"
+      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+      "add         $0x40,%2                      \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  : "r"(scale)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MERGEUVROW_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width) {
+  // clang-format off
+  asm volatile (
+      "vmovd       %3,%%xmm3                     \n"
+      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
+      "vbroadcastss %%xmm3,%%ymm3                \n"
+      "sub         %0,%1                         \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
+      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%0,%1)                \n"
+      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
+      "add         $0x40,%0                      \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MULTIPLYROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+      "movd        %3,%%xmm2                     \n"
+      "punpcklwd   %%xmm2,%%xmm2                 \n"
+      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "add         $0x20,%0                      \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "add         $0x10,%1                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+      "vmovd       %3,%%xmm2                     \n"
+      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
+      "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "add         $0x40,%0                      \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "add         $0x20,%1                      \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX2
+
+// Use scale to convert to lsb formats depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+      "movd        %3,%%xmm2                     \n"
+      "punpcklwd   %%xmm2,%%xmm2                 \n"
+      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "add         $0x10,%0                      \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "add         $0x20,%1                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+      "vmovd       %3,%%xmm2                     \n"
+      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
+      "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "add         $0x20,%0                      \n"
+      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
+      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "add         $0x40,%1                      \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+#endif  // HAS_CONVERT8TO16ROW_AVX2
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u};
+
+static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
+                                          3u,   6u,   9u,   12u,  15u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 2u,
+                                          5u,   8u,   11u,  14u};
+
+static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u,  128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 0u,   3u,
+                                          6u,   9u,   12u,  15u};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "pshufb      %5, %%xmm0                    \n"
+      "pshufb      %6, %%xmm1                    \n"
+      "pshufb      %7, %%xmm2                    \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "pshufb      %8, %%xmm0                    \n"
+      "pshufb      %9, %%xmm1                    \n"
+      "pshufb      %10, %%xmm2                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "pshufb      %11, %%xmm0                   \n"
+      "pshufb      %12, %%xmm1                   \n"
+      "pshufb      %13, %%xmm2                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%3)                   \n"
+      "lea         0x10(%3),%3                   \n"
+      "lea         0x30(%0),%0                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_rgb),             // %0
+        "+r"(dst_r),               // %1
+        "+r"(dst_g),               // %2
+        "+r"(dst_b),               // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRGBToR0),  // %5
+        "m"(kShuffleMaskRGBToR1),  // %6
+        "m"(kShuffleMaskRGBToR2),  // %7
+        "m"(kShuffleMaskRGBToG0),  // %8
+        "m"(kShuffleMaskRGBToG1),  // %9
+        "m"(kShuffleMaskRGBToG2),  // %10
+        "m"(kShuffleMaskRGBToB0),  // %11
+        "m"(kShuffleMaskRGBToB1),  // %12
+        "m"(kShuffleMaskRGBToB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_SPLITRGBROW_SSSE3
+
+#ifdef HAS_MERGERGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
+                                          2u, 128u, 128u, 3u, 128u, 128u,
+                                          4u, 128u, 128u, 5u};
+static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
+                                          128u, 2u, 128u, 128u, 3u, 128u,
+                                          128u, 4u, 128u, 128u};
+static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
+                                          128u, 128u, 2u, 128u, 128u, 3u,
+                                          128u, 128u, 4u, 128u};
+
+static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
+                                          7u, 128u, 128u, 8u, 128u, 128u,
+                                          9u, 128u, 128u, 10u};
+static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
+                                          128u, 7u, 128u, 128u, 8u, 128u,
+                                          128u, 9u, 128u, 128u};
+static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
+                                          128u, 128u, 8u,  128u, 128u, 9u,
+                                          128u, 128u, 10u, 128u};
+
+static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
+                                          12u, 128u, 128u, 13u, 128u, 128u,
+                                          14u, 128u, 128u, 15u};
+static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
+                                          128u, 13u, 128u, 128u, 14u, 128u,
+                                          128u, 15u, 128u, 128u};
+static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
+                                          128u, 128u, 13u, 128u, 128u, 14u,
+                                          128u, 128u, 15u, 128u};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "movdqu      (%2),%%xmm2                   \n"
+      "pshufb      %5, %%xmm0                    \n"
+      "pshufb      %6, %%xmm1                    \n"
+      "pshufb      %7, %%xmm2                    \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%3)                   \n"
+
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "movdqu      (%2),%%xmm2                   \n"
+      "pshufb      %8, %%xmm0                    \n"
+      "pshufb      %9, %%xmm1                    \n"
+      "pshufb      %10, %%xmm2                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,16(%3)                 \n"
+
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "movdqu      (%2),%%xmm2                   \n"
+      "pshufb      %11, %%xmm0                   \n"
+      "pshufb      %12, %%xmm1                   \n"
+      "pshufb      %13, %%xmm2                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,32(%3)                 \n"
+
+      "lea         0x10(%0),%0                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "lea         0x30(%3),%3                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_r),               // %0
+        "+r"(src_g),               // %1
+        "+r"(src_b),               // %2
+        "+r"(dst_rgb),             // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRToRGB0),  // %5
+        "m"(kShuffleMaskGToRGB0),  // %6
+        "m"(kShuffleMaskBToRGB0),  // %7
+        "m"(kShuffleMaskRToRGB1),  // %8
+        "m"(kShuffleMaskGToRGB1),  // %9
+        "m"(kShuffleMaskBToRGB1),  // %10
+        "m"(kShuffleMaskRToRGB2),  // %11
+        "m"(kShuffleMaskGToRGB2),  // %12
+        "m"(kShuffleMaskBToRGB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "test        $0xf,%0                       \n"
+      "jne         2f                            \n"
+      "test        $0xf,%1                       \n"
+      "jne         2f                            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa      (%0),%%xmm0                   \n"
+      "movdqa      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqa      %%xmm0,(%1)                   \n"
+      "movdqa      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "jmp         9f                            \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          2b                            \n"
+
+      LABELALIGN "9:                                        \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x40,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x40,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_COPYROW_AVX
 
 #ifdef HAS_COPYROW_ERMS
 // Multiple of 1.
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
+  asm volatile(
+
+      "rep         movsb                         \n"
+      : "+S"(src),       // %0
+        "+D"(dst),       // %1
+        "+c"(width_tmp)  // %2
+      :
+      : "memory", "cc");
 }
 #endif  // HAS_COPYROW_ERMS
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm0,%%xmm0                 \n"
+      "pslld       $0x18,%%xmm0                  \n"
+      "pcmpeqb     %%xmm1,%%xmm1                 \n"
+      "psrld       $0x8,%%xmm1                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "movdqu      0x10(%0),%%xmm3               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqu      (%1),%%xmm4                   \n"
+      "movdqu      0x10(%1),%%xmm5               \n"
+      "pand        %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm0,%%xmm3                 \n"
+      "pand        %%xmm1,%%xmm4                 \n"
+      "pand        %%xmm1,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "movdqu      %%xmm2,(%1)                   \n"
+      "movdqu      %%xmm3,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm1                   \n"
+      "vmovdqu     0x20(%0),%%ymm2               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
+      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+      "vmovdqu     %%ymm1,(%1)                   \n"
+      "vmovdqu     %%ymm2,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0), %%xmm0                  \n"
+      "movdqu      0x10(%0), %%xmm1              \n"
+      "lea         0x20(%0), %0                  \n"
+      "psrld       $0x18, %%xmm0                 \n"
+      "psrld       $0x18, %%xmm1                 \n"
+      "packssdw    %%xmm1, %%xmm0                \n"
+      "packuswb    %%xmm0, %%xmm0                \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1), %1                   \n"
+      "sub         $0x8, %2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
+    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "vmovdqa     %3,%%ymm4                     \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0), %%ymm0                  \n"
+      "vmovdqu     0x20(%0), %%ymm1              \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
+      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+      "vmovdqu     0x40(%0), %%ymm2              \n"
+      "vmovdqu     0x60(%0), %%ymm3              \n"
+      "lea         0x80(%0), %0                  \n"
+      "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
+      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
+      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
+      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates.
+      "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutate.
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20, %2                     \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),               // %0
+        "+r"(dst_a),                  // %1
+        "+rm"(width)                  // %2
+      : "m"(kPermdARGBToY_AVX),       // %3
+        "m"(kShuffleAlphaShort_AVX2)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpckhwd %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm2,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm0,%%xmm0                 \n"
+      "pslld       $0x18,%%xmm0                  \n"
+      "pcmpeqb     %%xmm1,%%xmm1                 \n"
+      "psrld       $0x8,%%xmm1                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm2                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "punpcklbw   %%xmm2,%%xmm2                 \n"
+      "punpckhwd   %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm2,%%xmm2                 \n"
+      "movdqu      (%1),%%xmm4                   \n"
+      "movdqu      0x10(%1),%%xmm5               \n"
+      "pand        %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm0,%%xmm3                 \n"
+      "pand        %%xmm1,%%xmm4                 \n"
+      "pand        %%xmm1,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "movdqu      %%xmm2,(%1)                   \n"
+      "movdqu      %%xmm3,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
-    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
-    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd   (%0),%%ymm1                   \n"
+      "vpmovzxbd   0x8(%0),%%ymm2                \n"
+      "lea         0x10(%0),%0                   \n"
+      "vpslld      $0x18,%%ymm1,%%ymm1           \n"
+      "vpslld      $0x18,%%ymm2,%%ymm2           \n"
+      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
+      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+      "vmovdqu     %%ymm1,(%1)                   \n"
+      "vmovdqu     %%ymm2,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint8 v8, int width) {
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
-  const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
+  asm volatile(
+
+      "rep         stosl                         \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 
-void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosb " MEMSTORESTRING(al,0) "        \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v8)          // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep         stosb                         \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v8)          // %2
+      : "memory", "cc");
 }
 
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst_argb),  // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep         stosl                         \n"
+      : "+D"(dst_argb),  // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "pavgb       %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "pavgb       %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrlw       $0x8,%%xmm5                   \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
 #ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
+      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
+      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_AVX2
 
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "41:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       41b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
-}
-#endif  // HAS_ARGBBLENDROW_SSE2
-
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time
-// Shuffle table for reversing the bytes.
-
-// Same as SSE2, but replaces
-//    psrlw      xmm3, 8          // alpha
-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-//    pshuflw    xmm3, xmm3,0F5h
-// with..
-//    pshufb     xmm3, kShuffleAlpha // alpha
-
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  : "m"(kShuffleAlpha)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psrlw       $0xf,%%xmm7                   \n"
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $0x8,%%xmm6                   \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psllw       $0x8,%%xmm5                   \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "pslld       $0x18,%%xmm4                  \n"
+      "sub         $0x4,%3                       \n"
+      "jl          49f                           \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu      (%0),%%xmm3                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm3,%%xmm0                 \n"
+      "pxor        %%xmm4,%%xmm3                 \n"
+      "movdqu      (%1),%%xmm2                   \n"
+      "pshufb      %4,%%xmm3                     \n"
+      "pand        %%xmm6,%%xmm2                 \n"
+      "paddw       %%xmm7,%%xmm3                 \n"
+      "pmullw      %%xmm3,%%xmm2                 \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pmullw      %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x8,%%xmm2                   \n"
+      "paddusb     %%xmm2,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jge         40b                           \n"
+
+      "49:                                       \n"
+      "add         $0x3,%3                       \n"
+      "jl          99f                           \n"
+
+      // 1 pixel loop.
+      "91:                                       \n"
+      "movd        (%0),%%xmm3                   \n"
+      "lea         0x4(%0),%0                    \n"
+      "movdqa      %%xmm3,%%xmm0                 \n"
+      "pxor        %%xmm4,%%xmm3                 \n"
+      "movd        (%1),%%xmm2                   \n"
+      "pshufb      %4,%%xmm3                     \n"
+      "pand        %%xmm6,%%xmm2                 \n"
+      "paddw       %%xmm7,%%xmm3                 \n"
+      "pmullw      %%xmm3,%%xmm2                 \n"
+      "movd        (%1),%%xmm1                   \n"
+      "lea         0x4(%1),%1                    \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pmullw      %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x8,%%xmm2                   \n"
+      "paddusb     %%xmm2,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movd        %%xmm0,(%2)                   \n"
+      "lea         0x4(%2),%2                    \n"
+      "sub         $0x1,%3                       \n"
+      "jge         91b                           \n"
+      "99:                                       \n"
+      : "+r"(src_argb0),    // %0
+        "+r"(src_argb1),    // %1
+        "+r"(dst_argb),     // %2
+        "+r"(width)         // %3
+      : "m"(kShuffleAlpha)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-// Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x8,%%xmm5                     \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psllw       $0x8,%%xmm5                   \n"
+      "mov         $0x80808080,%%eax             \n"
+      "movd        %%eax,%%xmm6                  \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "mov         $0x807f807f,%%eax             \n"
+      "movd        %%eax,%%xmm7                  \n"
+      "pshufd      $0x0,%%xmm7,%%xmm7            \n"
+      "sub         %2,%0                         \n"
+      "sub         %2,%1                         \n"
+      "sub         %2,%3                         \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%2),%%xmm0                   \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "pxor        %%xmm5,%%xmm0                 \n"
+      "movq        (%0,%2,1),%%xmm1              \n"
+      "movq        (%1,%2,1),%%xmm2              \n"
+      "punpcklbw   %%xmm2,%%xmm1                 \n"
+      "psubb       %%xmm6,%%xmm1                 \n"
+      "pmaddubsw   %%xmm1,%%xmm0                 \n"
+      "paddw       %%xmm7,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%3,%2,1)              \n"
+      "lea         0x8(%2),%2                    \n"
+      "sub         $0x8,%4                       \n"
+      "jg          1b                            \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+}
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
+      "mov         $0x80808080,%%eax             \n"
+      "vmovd       %%eax,%%xmm6                  \n"
+      "vbroadcastss %%xmm6,%%ymm6                \n"
+      "mov         $0x807f807f,%%eax             \n"
+      "vmovd       %%eax,%%xmm7                  \n"
+      "vbroadcastss %%xmm7,%%ymm7                \n"
+      "sub         %2,%0                         \n"
+      "sub         %2,%1                         \n"
+      "sub         %2,%3                         \n"
+
+      // 32 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%2),%%ymm0                   \n"
+      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"
+      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vmovdqu     (%0,%2,1),%%ymm1              \n"
+      "vmovdqu     (%1,%2,1),%%ymm2              \n"
+      "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"
+      "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
+      "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"
+      "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"
+      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
+      "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%3,%2,1)              \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x20,%4                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
-#endif  // HAS_ARGBATTENUATEROW_SSE2
+#endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
 // Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "pslld     $0x18,%%xmm3                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpcklbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "punpckhbw %%xmm2,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pand      %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha0),  // %3
-    "m"(kShuffleAlpha1)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "pslld       $0x18,%%xmm3                  \n"
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "punpcklbw   %%xmm1,%%xmm1                 \n"
+      "pmulhuw     %%xmm1,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "punpckhbw   %%xmm2,%%xmm2                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "pand        %%xmm3,%%xmm2                 \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),       // %0
+        "+r"(dst_argb),       // %1
+        "+r"(width)           // %2
+      : "m"(kShuffleAlpha0),  // %3
+        "m"(kShuffleAlpha1)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
-    "sub        %0,%1                          \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
-    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha_AVX2)  // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
+      "sub         %0,%1                         \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm6                   \n"
+      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
+      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
+      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
+      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
+      "lea         0x20(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),          // %0
+        "+r"(dst_argb),          // %1
+        "+r"(width)              // %2
+      : "m"(kShuffleAlpha_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
-  uintptr_t alpha = 0;
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
-  : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+  uintptr_t alpha;
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movzb       0x03(%0),%3                   \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "movd        0x00(%4,%3,4),%%xmm2          \n"
+      "movzb       0x07(%0),%3                   \n"
+      "movd        0x00(%4,%3,4),%%xmm3          \n"
+      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
+      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
+      "movlhps     %%xmm3,%%xmm2                 \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "movzb       0x0b(%0),%3                   \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "movd        0x00(%4,%3,4),%%xmm2          \n"
+      "movzb       0x0f(%0),%3                   \n"
+      "movd        0x00(%4,%3,4),%%xmm3          \n"
+      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
+      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
+      "movlhps     %%xmm3,%%xmm2                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),     // %0
+        "+r"(dst_argb),     // %1
+        "+r"(width),        // %2
+        "=&r"(alpha)        // %3
+      : "r"(fixed_invtbl8)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
 
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
-  uintptr_t alpha = 0;
-  asm volatile (
-    "sub        %0,%1                          \n"
-    "vbroadcastf128 %5,%%ymm5                  \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    // replace VPGATHER
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
-    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
-    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
-    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
-    // end of VPGATHER
-
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
-    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width),       // %2
-    "+r"(alpha)        // %3
-  : "r"(fixed_invtbl8),  // %4
-    "m"(kUnattenShuffleAlpha_AVX2)  // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  uintptr_t alpha;
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "vbroadcastf128 %5,%%ymm5                  \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      // replace VPGATHER
+      "movzb       0x03(%0),%3                   \n"
+      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
+      "movzb       0x07(%0),%3                   \n"
+      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
+      "movzb       0x0b(%0),%3                   \n"
+      "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
+      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
+      "movzb       0x0f(%0),%3                   \n"
+      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
+      "movzb       0x13(%0),%3                   \n"
+      "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
+      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
+      "movzb       0x17(%0),%3                   \n"
+      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
+      "movzb       0x1b(%0),%3                   \n"
+      "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
+      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
+      "movzb       0x1f(%0),%3                   \n"
+      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
+      "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
+      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+      // end of VPGATHER
+
+      "vmovdqu     (%0),%%ymm6                   \n"
+      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
+      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
+      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"
+      "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
+      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
+      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
+      "lea         0x20(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),                 // %0
+        "+r"(dst_argb),                 // %1
+        "+r"(width),                    // %2
+        "=&r"(alpha)                    // %3
+      : "r"(fixed_invtbl8),             // %4
+        "m"(kUnattenShuffleAlpha_AVX2)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrld     $0x18,%%xmm2                    \n"
-    "psrld     $0x18,%%xmm3                    \n"
-    "packuswb  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm3                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64)     // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "psubb       %%xmm5,%%xmm0                 \n"
+      "psubb       %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm4,%%xmm6                 \n"
+      "pmaddubsw   %%xmm0,%%xmm6                 \n"
+      "movdqu      %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm1,%%xmm0                 \n"
+      "phaddw      %%xmm0,%%xmm6                 \n"
+      "paddw       %%xmm5,%%xmm6                 \n"
+      "psrlw       $0x8,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "movdqu      0x10(%0),%%xmm3               \n"
+      "lea         0x20(%0),%0                   \n"
+      "psrld       $0x18,%%xmm2                  \n"
+      "psrld       $0x18,%%xmm3                  \n"
+      "packuswb    %%xmm3,%%xmm2                 \n"
+      "packuswb    %%xmm2,%%xmm2                 \n"
+      "movdqa      %%xmm6,%%xmm3                 \n"
+      "punpcklbw   %%xmm6,%%xmm6                 \n"
+      "punpcklbw   %%xmm2,%%xmm3                 \n"
+      "movdqa      %%xmm6,%%xmm1                 \n"
+      "punpcklwd   %%xmm3,%%xmm6                 \n"
+      "punpckhwd   %%xmm3,%%xmm1                 \n"
+      "movdqu      %%xmm6,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kSub128)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBGRAYROW_SSSE3
 
@@ -3705,298 +5146,301 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %2,%%xmm2                       \n"
-    "movdqa    %3,%%xmm3                       \n"
-    "movdqa    %4,%%xmm4                       \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "phaddw    %%xmm6,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "m"(kARGBToSepiaB),  // %2
-    "m"(kARGBToSepiaG),  // %3
-    "m"(kARGBToSepiaR)   // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa      %2,%%xmm2                     \n"
+      "movdqa      %3,%%xmm3                     \n"
+      "movdqa      %4,%%xmm4                     \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm6               \n"
+      "pmaddubsw   %%xmm2,%%xmm0                 \n"
+      "pmaddubsw   %%xmm2,%%xmm6                 \n"
+      "phaddw      %%xmm6,%%xmm0                 \n"
+      "psrlw       $0x7,%%xmm0                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm5                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "pmaddubsw   %%xmm3,%%xmm5                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "phaddw      %%xmm1,%%xmm5                 \n"
+      "psrlw       $0x7,%%xmm5                   \n"
+      "packuswb    %%xmm5,%%xmm5                 \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm5                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "pmaddubsw   %%xmm4,%%xmm5                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "phaddw      %%xmm1,%%xmm5                 \n"
+      "psrlw       $0x7,%%xmm5                   \n"
+      "packuswb    %%xmm5,%%xmm5                 \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "psrld       $0x18,%%xmm6                  \n"
+      "psrld       $0x18,%%xmm1                  \n"
+      "packuswb    %%xmm1,%%xmm6                 \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "punpcklbw   %%xmm6,%%xmm5                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklwd   %%xmm5,%%xmm0                 \n"
+      "punpckhwd   %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%0)                   \n"
+      "movdqu      %%xmm1,0x10(%0)               \n"
+      "lea         0x20(%0),%0                   \n"
+      "sub         $0x8,%1                       \n"
+      "jg          1b                            \n"
+      : "+r"(dst_argb),      // %0
+        "+r"(width)          // %1
+      : "m"(kARGBToSepiaB),  // %2
+        "m"(kARGBToSepiaG),  // %3
+        "m"(kARGBToSepiaR)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
 
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
-    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
-    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
-    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm7                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm7,%%xmm0                   \n"
-    "phaddsw   %%xmm1,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm0                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm1                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "punpcklwd %%xmm1,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm6                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width)          // %2
-  : "r"(matrix_argb)     // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width) {
+  asm volatile(
+      "movdqu      (%3),%%xmm5                   \n"
+      "pshufd      $0x00,%%xmm5,%%xmm2           \n"
+      "pshufd      $0x55,%%xmm5,%%xmm3           \n"
+      "pshufd      $0xaa,%%xmm5,%%xmm4           \n"
+      "pshufd      $0xff,%%xmm5,%%xmm5           \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm7               \n"
+      "pmaddubsw   %%xmm2,%%xmm0                 \n"
+      "pmaddubsw   %%xmm2,%%xmm7                 \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "pmaddubsw   %%xmm3,%%xmm6                 \n"
+      "pmaddubsw   %%xmm3,%%xmm1                 \n"
+      "phaddsw     %%xmm7,%%xmm0                 \n"
+      "phaddsw     %%xmm1,%%xmm6                 \n"
+      "psraw       $0x6,%%xmm0                   \n"
+      "psraw       $0x6,%%xmm6                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "punpcklbw   %%xmm6,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "movdqu      0x10(%0),%%xmm7               \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm7                 \n"
+      "phaddsw     %%xmm7,%%xmm1                 \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      0x10(%0),%%xmm7               \n"
+      "pmaddubsw   %%xmm5,%%xmm6                 \n"
+      "pmaddubsw   %%xmm5,%%xmm7                 \n"
+      "phaddsw     %%xmm7,%%xmm6                 \n"
+      "psraw       $0x6,%%xmm1                   \n"
+      "psraw       $0x6,%%xmm6                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "punpcklbw   %%xmm6,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm6                 \n"
+      "punpcklwd   %%xmm1,%%xmm0                 \n"
+      "punpckhwd   %%xmm1,%%xmm6                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm6,0x10(%1)               \n"
+      "lea         0x20(%0),%0                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "movd      %2,%%xmm2                       \n"
-    "movd      %3,%%xmm3                       \n"
-    "movd      %4,%%xmm4                       \n"
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "pslld     $0x18,%%xmm6                    \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "pmullw    %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm6,%%xmm7                   \n"
-    "paddw     %%xmm4,%%xmm0                   \n"
-    "paddw     %%xmm4,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x4,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "movd        %2,%%xmm2                     \n"
+      "movd        %3,%%xmm3                     \n"
+      "movd        %4,%%xmm4                     \n"
+      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
+      "pshufd      $0x44,%%xmm2,%%xmm2           \n"
+      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
+      "pshufd      $0x44,%%xmm3,%%xmm3           \n"
+      "pshuflw     $0x40,%%xmm4,%%xmm4           \n"
+      "pshufd      $0x44,%%xmm4,%%xmm4           \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "pslld       $0x18,%%xmm6                  \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm1                   \n"
+      "punpckhbw   %%xmm5,%%xmm1                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "pmullw      %%xmm3,%%xmm0                 \n"
+      "movdqu      (%0),%%xmm7                   \n"
+      "pmullw      %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm6,%%xmm7                 \n"
+      "paddw       %%xmm4,%%xmm0                 \n"
+      "paddw       %%xmm4,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "por         %%xmm7,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%0)                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x4,%1                       \n"
+      "jg          1b                            \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "movd      %3,%%xmm2                       \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm2                  \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(value)       // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "movd        %3,%%xmm2                     \n"
+      "punpcklbw   %%xmm2,%%xmm2                 \n"
+      "punpcklqdq  %%xmm2,%%xmm2                 \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "pmulhuw     %%xmm2,%%xmm1                 \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                  \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqu    %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqu      (%1),%%xmm2                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "movdqu      %%xmm0,%%xmm1                 \n"
+      "movdqu      %%xmm2,%%xmm3                 \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpckhbw   %%xmm5,%%xmm3                 \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "pmulhuw     %%xmm3,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea       " MEMLEA(0x20,2) ",%2           \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm1                   \n"
+      "lea         0x20(%0),%0                   \n"
+      "vmovdqu     (%1),%%ymm3                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
+      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x8,%3                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc"
 #if defined(__AVX2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+        ,
+        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
 #endif
   );
 }
@@ -4004,113 +5448,113 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpaddusb    (%1),%%ymm0,%%ymm0            \n"
+      "lea         0x20(%1),%1                   \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x8,%3                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBADDROW_AVX2
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psubusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqu      (%1),%%xmm1                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "psubusb     %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpsubusb    (%1),%%ymm0,%%ymm0            \n"
+      "lea         0x20(%1),%1                   \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x8,%3                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBSUBTRACTROW_AVX2
 
@@ -4119,52 +5563,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "sub       %0,%3                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
-    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%4                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "sub         %0,%2                         \n"
+      "sub         %0,%3                         \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "movq        0x2(%0),%%xmm1                \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "psubw       %%xmm1,%%xmm0                 \n"
+      "movq        0x00(%0,%1,1),%%xmm1          \n"
+      "movq        0x02(%0,%1,1),%%xmm2          \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "psubw       %%xmm2,%%xmm1                 \n"
+      "movq        0x00(%0,%2,1),%%xmm2          \n"
+      "movq        0x02(%0,%2,1),%%xmm3          \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm3                 \n"
+      "psubw       %%xmm3,%%xmm2                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm1,%%xmm0                 \n"
+      "paddw       %%xmm1,%%xmm0                 \n"
+      "pxor        %%xmm1,%%xmm1                 \n"
+      "psubw       %%xmm0,%%xmm1                 \n"
+      "pmaxsw      %%xmm1,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,0x00(%0,%3,1)          \n"
+      "lea         0x8(%0),%0                    \n"
+      "sub         $0x8,%4                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(src_y2),      // %2
+        "+r"(dst_sobelx),  // %3
+        "+r"(width)        // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELXROW_SSE2
 
@@ -4173,50 +5618,50 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
-    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
-    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "sub         %0,%2                         \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "movq        0x00(%0,%1,1),%%xmm1          \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "psubw       %%xmm1,%%xmm0                 \n"
+      "movq        0x1(%0),%%xmm1                \n"
+      "movq        0x01(%0,%1,1),%%xmm2          \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "psubw       %%xmm2,%%xmm1                 \n"
+      "movq        0x2(%0),%%xmm2                \n"
+      "movq        0x02(%0,%1,1),%%xmm3          \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm3                 \n"
+      "psubw       %%xmm3,%%xmm2                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm1,%%xmm0                 \n"
+      "paddw       %%xmm1,%%xmm0                 \n"
+      "pxor        %%xmm1,%%xmm1                 \n"
+      "psubw       %%xmm0,%%xmm1                 \n"
+      "pmaxsw      %%xmm1,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,0x00(%0,%2,1)          \n"
+      "lea         0x8(%0),%0                    \n"
+      "sub         $0x8,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(dst_sobely),  // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELYROW_SSE2
 
@@ -4226,79 +5671,79 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm2                   \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm1                   \n"
-    "punpckhwd %%xmm2,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklwd %%xmm0,%%xmm3                   \n"
-    "punpckhwd %%xmm0,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0x18,%%xmm5                  \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "punpcklbw   %%xmm0,%%xmm2                 \n"
+      "punpckhbw   %%xmm0,%%xmm0                 \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "punpcklwd   %%xmm2,%%xmm1                 \n"
+      "punpckhwd   %%xmm2,%%xmm2                 \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "por         %%xmm5,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm3                 \n"
+      "punpcklwd   %%xmm0,%%xmm3                 \n"
+      "punpckhwd   %%xmm0,%%xmm0                 \n"
+      "por         %%xmm5,%%xmm3                 \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm1,(%2)                   \n"
+      "movdqu      %%xmm2,0x10(%2)               \n"
+      "movdqu      %%xmm3,0x20(%2)               \n"
+      "movdqu      %%xmm0,0x30(%2)               \n"
+      "lea         0x40(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELROW_SSE2
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0x18,%%xmm5                  \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_SOBELTOPLANEROW_SSE2
 
@@ -4308,1165 +5753,1420 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "paddusb   %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "punpckhbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "punpcklbw %%xmm2,%%xmm4                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "punpcklwd %%xmm3,%%xmm6                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "punpcklwd %%xmm0,%%xmm7                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "paddusb     %%xmm1,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm3                 \n"
+      "punpcklbw   %%xmm5,%%xmm3                 \n"
+      "punpckhbw   %%xmm5,%%xmm0                 \n"
+      "movdqa      %%xmm1,%%xmm4                 \n"
+      "punpcklbw   %%xmm2,%%xmm4                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "punpcklwd   %%xmm3,%%xmm6                 \n"
+      "punpckhwd   %%xmm3,%%xmm4                 \n"
+      "movdqa      %%xmm1,%%xmm7                 \n"
+      "punpcklwd   %%xmm0,%%xmm7                 \n"
+      "punpckhwd   %%xmm0,%%xmm1                 \n"
+      "movdqu      %%xmm6,(%2)                   \n"
+      "movdqu      %%xmm4,0x10(%2)               \n"
+      "movdqu      %%xmm7,0x20(%2)               \n"
+      "movdqu      %%xmm1,0x30(%2)               \n"
+      "lea         0x40(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_SOBELXYROW_SSE2
 
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       49f                             \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "punpckhwd %%xmm1,%%xmm3                   \n"
-    "punpckhbw %%xmm1,%%xmm4                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "punpcklwd %%xmm1,%%xmm4                   \n"
-    "punpckhwd %%xmm1,%%xmm5                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
-    "paddd     %%xmm0,%%xmm3                   \n"
-    "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
-    "paddd     %%xmm0,%%xmm4                   \n"
-    "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
-    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-  : "+r"(row),  // %0
-    "+r"(cumsum),  // %1
-    "+r"(previous_cumsum),  // %2
-    "+r"(width)  // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
+  asm volatile(
+      "pxor        %%xmm0,%%xmm0                 \n"
+      "pxor        %%xmm1,%%xmm1                 \n"
+      "sub         $0x4,%3                       \n"
+      "jl          49f                           \n"
+      "test        $0xf,%1                       \n"
+      "jne         49f                           \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu      (%0),%%xmm2                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm2,%%xmm4                 \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm1,%%xmm2                 \n"
+      "punpckhwd   %%xmm1,%%xmm3                 \n"
+      "punpckhbw   %%xmm1,%%xmm4                 \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "punpcklwd   %%xmm1,%%xmm4                 \n"
+      "punpckhwd   %%xmm1,%%xmm5                 \n"
+      "paddd       %%xmm2,%%xmm0                 \n"
+      "movdqu      (%2),%%xmm2                   \n"
+      "paddd       %%xmm0,%%xmm2                 \n"
+      "paddd       %%xmm3,%%xmm0                 \n"
+      "movdqu      0x10(%2),%%xmm3               \n"
+      "paddd       %%xmm0,%%xmm3                 \n"
+      "paddd       %%xmm4,%%xmm0                 \n"
+      "movdqu      0x20(%2),%%xmm4               \n"
+      "paddd       %%xmm0,%%xmm4                 \n"
+      "paddd       %%xmm5,%%xmm0                 \n"
+      "movdqu      0x30(%2),%%xmm5               \n"
+      "lea         0x40(%2),%2                   \n"
+      "paddd       %%xmm0,%%xmm5                 \n"
+      "movdqu      %%xmm2,(%1)                   \n"
+      "movdqu      %%xmm3,0x10(%1)               \n"
+      "movdqu      %%xmm4,0x20(%1)               \n"
+      "movdqu      %%xmm5,0x30(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x4,%3                       \n"
+      "jge         40b                           \n"
+
+      "49:                                       \n"
+      "add         $0x3,%3                       \n"
+      "jl          19f                           \n"
+
+      // 1 pixel loop.
+      LABELALIGN
+      "10:                                       \n"
+      "movd        (%0),%%xmm2                   \n"
+      "lea         0x4(%0),%0                    \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "punpcklwd   %%xmm1,%%xmm2                 \n"
+      "paddd       %%xmm2,%%xmm0                 \n"
+      "movdqu      (%2),%%xmm2                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "paddd       %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm2,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x1,%3                       \n"
+      "jge         10b                           \n"
+
+      "19:                                       \n"
+      : "+r"(row),              // %0
+        "+r"(cumsum),           // %1
+        "+r"(previous_cumsum),  // %2
+        "+r"(width)             // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
-  asm volatile (
-    "movd      %5,%%xmm5                       \n"
-    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
-    "rcpss     %%xmm5,%%xmm4                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "cmpl      $0x80,%5                        \n"
-    "ja        40f                             \n"
-
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrld     $0x10,%%xmm6                    \n"
-    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
-    "addps     %%xmm6,%%xmm5                   \n"
-    "mulps     %%xmm4,%%xmm5                   \n"
-    "cvtps2dq  %%xmm5,%%xmm5                   \n"
-    "packssdw  %%xmm5,%%xmm5                   \n"
-
-  // 4 pixel small loop                        \n"
-    LABELALIGN
-  "4:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       4b                              \n"
-    "jmp       49f                             \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm1                   \n"
-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
-    "mulps     %%xmm4,%%xmm2                   \n"
-    "mulps     %%xmm4,%%xmm3                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(topleft),  // %0
-    "+r"(botleft),  // %1
-    "+r"(dst),      // %2
-    "+rm"(count)    // %3
-  : "r"((intptr_t)(width)),  // %4
-    "rm"(area)     // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+  asm volatile(
+      "movd        %5,%%xmm5                     \n"
+      "cvtdq2ps    %%xmm5,%%xmm5                 \n"
+      "rcpss       %%xmm5,%%xmm4                 \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "sub         $0x4,%3                       \n"
+      "jl          49f                           \n"
+      "cmpl        $0x80,%5                      \n"
+      "ja          40f                           \n"
+
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrld       $0x10,%%xmm6                  \n"
+      "cvtdq2ps    %%xmm6,%%xmm6                 \n"
+      "addps       %%xmm6,%%xmm5                 \n"
+      "mulps       %%xmm4,%%xmm5                 \n"
+      "cvtps2dq    %%xmm5,%%xmm5                 \n"
+      "packssdw    %%xmm5,%%xmm5                 \n"
+
+      // 4 pixel small loop.
+      LABELALIGN
+      "4:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "psubd       0x00(%0,%4,4),%%xmm0          \n"
+      "psubd       0x10(%0,%4,4),%%xmm1          \n"
+      "psubd       0x20(%0,%4,4),%%xmm2          \n"
+      "psubd       0x30(%0,%4,4),%%xmm3          \n"
+      "lea         0x40(%0),%0                   \n"
+      "psubd       (%1),%%xmm0                   \n"
+      "psubd       0x10(%1),%%xmm1               \n"
+      "psubd       0x20(%1),%%xmm2               \n"
+      "psubd       0x30(%1),%%xmm3               \n"
+      "paddd       0x00(%1,%4,4),%%xmm0          \n"
+      "paddd       0x10(%1,%4,4),%%xmm1          \n"
+      "paddd       0x20(%1,%4,4),%%xmm2          \n"
+      "paddd       0x30(%1,%4,4),%%xmm3          \n"
+      "lea         0x40(%1),%1                   \n"
+      "packssdw    %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm3,%%xmm2                 \n"
+      "pmulhuw     %%xmm5,%%xmm0                 \n"
+      "pmulhuw     %%xmm5,%%xmm2                 \n"
+      "packuswb    %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jge         4b                            \n"
+      "jmp         49f                           \n"
+
+      // 4 pixel loop
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "psubd       0x00(%0,%4,4),%%xmm0          \n"
+      "psubd       0x10(%0,%4,4),%%xmm1          \n"
+      "psubd       0x20(%0,%4,4),%%xmm2          \n"
+      "psubd       0x30(%0,%4,4),%%xmm3          \n"
+      "lea         0x40(%0),%0                   \n"
+      "psubd       (%1),%%xmm0                   \n"
+      "psubd       0x10(%1),%%xmm1               \n"
+      "psubd       0x20(%1),%%xmm2               \n"
+      "psubd       0x30(%1),%%xmm3               \n"
+      "paddd       0x00(%1,%4,4),%%xmm0          \n"
+      "paddd       0x10(%1,%4,4),%%xmm1          \n"
+      "paddd       0x20(%1,%4,4),%%xmm2          \n"
+      "paddd       0x30(%1,%4,4),%%xmm3          \n"
+      "lea         0x40(%1),%1                   \n"
+      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
+      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
+      "mulps       %%xmm4,%%xmm0                 \n"
+      "mulps       %%xmm4,%%xmm1                 \n"
+      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
+      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
+      "mulps       %%xmm4,%%xmm2                 \n"
+      "mulps       %%xmm4,%%xmm3                 \n"
+      "cvtps2dq    %%xmm0,%%xmm0                 \n"
+      "cvtps2dq    %%xmm1,%%xmm1                 \n"
+      "cvtps2dq    %%xmm2,%%xmm2                 \n"
+      "cvtps2dq    %%xmm3,%%xmm3                 \n"
+      "packssdw    %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm3,%%xmm2                 \n"
+      "packuswb    %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jge         40b                           \n"
+
+      "49:                                       \n"
+      "add         $0x3,%3                       \n"
+      "jl          19f                           \n"
+
+      // 1 pixel loop
+      LABELALIGN
+      "10:                                       \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "psubd       0x00(%0,%4,4),%%xmm0          \n"
+      "lea         0x10(%0),%0                   \n"
+      "psubd       (%1),%%xmm0                   \n"
+      "paddd       0x00(%1,%4,4),%%xmm0          \n"
+      "lea         0x10(%1),%1                   \n"
+      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
+      "mulps       %%xmm4,%%xmm0                 \n"
+      "cvtps2dq    %%xmm0,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movd        %%xmm0,(%2)                   \n"
+      "lea         0x4(%2),%2                    \n"
+      "sub         $0x1,%3                       \n"
+      "jge         10b                           \n"
+      "19:                                       \n"
+      : "+r"(topleft),           // %0
+        "+r"(botleft),           // %1
+        "+r"(dst),               // %2
+        "+rm"(count)             // %3
+      : "r"((intptr_t)(width)),  // %4
+        "rm"(area)               // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* src_dudv, int width) {
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
-  intptr_t temp = 0;
-  asm volatile (
-    "movq      " MEMACCESS(3) ",%%xmm2         \n"
-    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
-    "shl       $0x10,%1                        \n"
-    "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm5                       \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
-
-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm0                   \n"
-    "movlhps   %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm7,%%xmm4                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
-    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
-    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
-    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm1                   \n"
-    "addps     %%xmm4,%%xmm2                   \n"
-    "movq      %%xmm1," MEMACCESS(2) "         \n"
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm0                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "add       $0x3,%4                         \n"
-    "jl        19f                             \n"
-
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm2                   \n"
-    "movd      %%xmm0,%k1                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x04,2) ",%2           \n"
-    "sub       $0x1,%4                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_stride_temp),  // %1
-    "+r"(dst_argb),  // %2
-    "+r"(src_dudv),  // %3
-    "+rm"(width),    // %4
-    "+r"(temp)   // %5
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  intptr_t temp;
+  asm volatile(
+      "movq        (%3),%%xmm2                   \n"
+      "movq        0x08(%3),%%xmm7               \n"
+      "shl         $0x10,%1                      \n"
+      "add         $0x4,%1                       \n"
+      "movd        %1,%%xmm5                     \n"
+      "sub         $0x4,%4                       \n"
+      "jl          49f                           \n"
+
+      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "movdqa      %%xmm2,%%xmm0                 \n"
+      "addps       %%xmm7,%%xmm0                 \n"
+      "movlhps     %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm7,%%xmm4                 \n"
+      "addps       %%xmm4,%%xmm4                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "addps       %%xmm4,%%xmm3                 \n"
+      "addps       %%xmm4,%%xmm4                 \n"
+
+      // 4 pixel loop
+      LABELALIGN
+      "40:                                       \n"
+      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
+      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
+      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
+      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
+      "movd        %%xmm0,%k1                    \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+      "movd        %%xmm0,%k5                    \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+      "movd        0x00(%0,%1,1),%%xmm1          \n"
+      "movd        0x00(%0,%5,1),%%xmm6          \n"
+      "punpckldq   %%xmm6,%%xmm1                 \n"
+      "addps       %%xmm4,%%xmm2                 \n"
+      "movq        %%xmm1,(%2)                   \n"
+      "movd        %%xmm0,%k1                    \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+      "movd        %%xmm0,%k5                    \n"
+      "movd        0x00(%0,%1,1),%%xmm0          \n"
+      "movd        0x00(%0,%5,1),%%xmm6          \n"
+      "punpckldq   %%xmm6,%%xmm0                 \n"
+      "addps       %%xmm4,%%xmm3                 \n"
+      "movq        %%xmm0,0x08(%2)               \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%4                       \n"
+      "jge         40b                           \n"
+
+      "49:                                       \n"
+      "add         $0x3,%4                       \n"
+      "jl          19f                           \n"
+
+      // 1 pixel loop
+      LABELALIGN
+      "10:                                       \n"
+      "cvttps2dq   %%xmm2,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "pmaddwd     %%xmm5,%%xmm0                 \n"
+      "addps       %%xmm7,%%xmm2                 \n"
+      "movd        %%xmm0,%k1                    \n"
+      "movd        0x00(%0,%1,1),%%xmm0          \n"
+      "movd        %%xmm0,(%2)                   \n"
+      "lea         0x04(%2),%2                   \n"
+      "sub         $0x1,%4                       \n"
+      "jge         10b                           \n"
+      "19:                                       \n"
+      : "+r"(src_argb),              // %0
+        "+r"(src_argb_stride_temp),  // %1
+        "+r"(dst_argb),              // %2
+        "+r"(src_dudv),              // %3
+        "+rm"(width),                // %4
+        "=&r"(temp)                  // %5
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x20,%3                        \n"
-    "je        75f                             \n"
-    "cmp       $0x40,%3                        \n"
-    "je        50f                             \n"
-    "cmp       $0x60,%3                        \n"
-    "je        25f                             \n"
-
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "pmaddubsw %%xmm5,%%xmm0                   \n"
-    "pmaddubsw %%xmm5,%%xmm1                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 25 / 75.
-    LABELALIGN
-  "25:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        25b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 75 / 25.
-    LABELALIGN
-  "75:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        75b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        100b                            \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm5"
-  );
+  asm volatile(
+      "sub         %1,%0                         \n"
+      "cmp         $0x0,%3                       \n"
+      "je          100f                          \n"
+      "cmp         $0x80,%3                      \n"
+      "je          50f                           \n"
+
+      "movd        %3,%%xmm0                     \n"
+      "neg         %3                            \n"
+      "add         $0x100,%3                     \n"
+      "movd        %3,%%xmm5                     \n"
+      "punpcklbw   %%xmm0,%%xmm5                 \n"
+      "punpcklwd   %%xmm5,%%xmm5                 \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "mov         $0x80808080,%%eax             \n"
+      "movd        %%eax,%%xmm4                  \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "psubb       %%xmm4,%%xmm0                 \n"
+      "psubb       %%xmm4,%%xmm1                 \n"
+      "movdqa      %%xmm5,%%xmm2                 \n"
+      "movdqa      %%xmm5,%%xmm3                 \n"
+      "pmaddubsw   %%xmm0,%%xmm2                 \n"
+      "pmaddubsw   %%xmm1,%%xmm3                 \n"
+      "paddw       %%xmm4,%%xmm2                 \n"
+      "paddw       %%xmm4,%%xmm3                 \n"
+      "psrlw       $0x8,%%xmm2                   \n"
+      "psrlw       $0x8,%%xmm3                   \n"
+      "packuswb    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "jmp         99f                           \n"
+
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
+      "pavgb       %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          50b                           \n"
+      "jmp         99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          100b                          \n"
+
+      "99:                                       \n"
+      : "+r"(dst_ptr),               // %0
+        "+r"(src_ptr),               // %1
+        "+rm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction) {
-  asm volatile (
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "sub       %1,%0                           \n"
-    "cmp       $0x20,%3                        \n"
-    "je        75f                             \n"
-    "cmp       $0x40,%3                        \n"
-    "je        50f                             \n"
-    "cmp       $0x60,%3                        \n"
-    "je        25f                             \n"
-
-    "vmovd      %3,%%xmm0                      \n"
-    "neg        %3                             \n"
-    "add        $0x80,%3                       \n"
-    "vmovd      %3,%%xmm5                      \n"
-    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
-    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
-    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
-    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
-    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 25 / 75.
-    LABELALIGN
-  "25:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        25b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 75 / 25.
-    LABELALIGN
-  "75:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        75b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "rep movsb " MEMMOVESTRING(1,0) "          \n"
-    "jmp       999f                            \n"
-
-  "99:                                         \n"
-    "vzeroupper                                \n"
-  "999:                                        \n"
-  : "+D"(dst_ptr),    // %0
-    "+S"(src_ptr),    // %1
-    "+c"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm5"
-  );
+  asm volatile(
+      "cmp         $0x0,%3                       \n"
+      "je          100f                          \n"
+      "sub         %1,%0                         \n"
+      "cmp         $0x80,%3                      \n"
+      "je          50f                           \n"
+
+      "vmovd       %3,%%xmm0                     \n"
+      "neg         %3                            \n"
+      "add         $0x100,%3                     \n"
+      "vmovd       %3,%%xmm5                     \n"
+      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
+      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
+      "vbroadcastss %%xmm5,%%ymm5                \n"
+      "mov         $0x80808080,%%eax             \n"
+      "vmovd       %%eax,%%xmm4                  \n"
+      "vbroadcastss %%xmm4,%%ymm4                \n"
+
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%1),%%ymm0                   \n"
+      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
+      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
+      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
+      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
+      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "jmp         99f                           \n"
+
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "vmovdqu     (%1),%%ymm0                   \n"
+      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
+      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          50b                           \n"
+      "jmp         99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "rep         movsb                         \n"
+      "jmp         999f                          \n"
+
+      "99:                                       \n"
+      "vzeroupper                                \n"
+      "999:                                      \n"
+      : "+D"(dst_ptr),               // %0
+        "+S"(src_ptr),               // %1
+        "+cm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x20,%3                        \n"
-    "je        75f                             \n"
-    "cmp       $0x40,%3                        \n"
-    "je        50f                             \n"
-    "cmp       $0x60,%3                        \n"
-    "je        25f                             \n"
-
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm2                   \n"
-    "psubw     %%xmm1,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm2                   \n"
-    "paddw     %%xmm3,%%xmm3                   \n"
-    "pmulhw    %%xmm5,%%xmm2                   \n"
-    "pmulhw    %%xmm5,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 25 / 75.
-    LABELALIGN
-  "25:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        25b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 75 / 25.
-    LABELALIGN
-  "75:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        75b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        100b                            \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_INTERPOLATEROW_SSE2
-
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int pix) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width) {
+  asm volatile(
+
+      "movdqu      (%3),%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+
+      "vbroadcastf128 (%3),%%ymm5                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "mov       " MEMACCESS(4) ",%k2            \n"
-    "cmp       $0x3000102,%k2                  \n"
-    "je        3012f                           \n"
-    "cmp       $0x10203,%k2                    \n"
-    "je        123f                            \n"
-    "cmp       $0x30201,%k2                    \n"
-    "je        321f                            \n"
-    "cmp       $0x2010003,%k2                  \n"
-    "je        2103f                           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(4) ",%2             \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS(1) "            \n"
-    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
-    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
-    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "sub       $0x1,%3                         \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "123:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        123b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "321:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        321b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "2103:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        2103b                           \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "3012:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        3012b                           \n"
-
-  "99:                                         \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+d"(pixel_temp),  // %2
-    "+r"(pix)         // %3
-  : "r"(shuffler)      // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_SSE2
-
 #ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub       %1,%2                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm1                     \n"
-    "punpcklbw %%xmm2,%%xmm0                     \n"
-    "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%1),%%xmm2                   \n"
+      "movq        0x00(%1,%2,1),%%xmm1          \n"
+      "add         $0x8,%1                       \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "add         $0x10,%0                      \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%3)                   \n"
+      "movdqu      %%xmm1,0x10(%3)               \n"
+      "lea         0x20(%3),%3                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_I422TOYUY2ROW_SSE2
 
 #ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "movdqa    %%xmm2,%%xmm1                     \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "punpcklbw %%xmm0,%%xmm1                     \n"
-    "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%1),%%xmm2                   \n"
+      "movq        0x00(%1,%2,1),%%xmm1          \n"
+      "add         $0x8,%1                       \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "add         $0x10,%0                      \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,(%3)                   \n"
+      "movdqu      %%xmm2,0x10(%3)               \n"
+      "lea         0x20(%3),%3                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_I422TOUYVYROW_SSE2
 
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbw   (%1),%%ymm1                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "add         $0x20,%0                      \n"
+      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
+      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
+      "vextractf128 $0x0,%%ymm1,(%3)             \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+      "lea         0x40(%3),%3                   \n"
+      "sub         $0x20,%4                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbw   (%1),%%ymm1                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "add         $0x20,%0                      \n"
+      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
+      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
+      "vextractf128 $0x0,%%ymm1,(%3)             \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+      "lea         0x40(%3),%3                   \n"
+      "sub         $0x20,%4                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_I422TOUYVYROW_AVX2
+
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width) {
-  asm volatile (
-    "pxor      %%xmm3,%%xmm3                   \n"
-
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm3,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm4                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
-    "addps     " MEMACCESS(3) ",%%xmm0         \n"
-    "addps     " MEMACCESS(3) ",%%xmm4         \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm1,%%xmm2                   \n"
-    "mulps     %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm2,%%xmm1                   \n"
-    "mulps     %%xmm6,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
-    "addps     %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm6,%%xmm4                   \n"
-    "addps     %%xmm1,%%xmm0                   \n"
-    "addps     %%xmm5,%%xmm4                   \n"
-    "cvttps2dq %%xmm0,%%xmm0                   \n"
-    "cvttps2dq %%xmm4,%%xmm4                   \n"
-    "packuswb  %%xmm4,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x2,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+  asm volatile(
+
+      "pxor        %%xmm3,%%xmm3                 \n"
+
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "punpcklbw   %%xmm3,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "punpcklwd   %%xmm3,%%xmm0                 \n"
+      "punpckhwd   %%xmm3,%%xmm4                 \n"
+      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
+      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "mulps       0x10(%3),%%xmm0               \n"
+      "mulps       0x10(%3),%%xmm4               \n"
+      "addps       (%3),%%xmm0                   \n"
+      "addps       (%3),%%xmm4                   \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "movdqa      %%xmm5,%%xmm6                 \n"
+      "mulps       %%xmm1,%%xmm2                 \n"
+      "mulps       %%xmm5,%%xmm6                 \n"
+      "mulps       %%xmm2,%%xmm1                 \n"
+      "mulps       %%xmm6,%%xmm5                 \n"
+      "mulps       0x20(%3),%%xmm2               \n"
+      "mulps       0x20(%3),%%xmm6               \n"
+      "mulps       0x30(%3),%%xmm1               \n"
+      "mulps       0x30(%3),%%xmm5               \n"
+      "addps       %%xmm2,%%xmm0                 \n"
+      "addps       %%xmm6,%%xmm4                 \n"
+      "addps       %%xmm1,%%xmm0                 \n"
+      "addps       %%xmm5,%%xmm4                 \n"
+      "cvttps2dq   %%xmm0,%%xmm0                 \n"
+      "cvttps2dq   %%xmm4,%%xmm4                 \n"
+      "packuswb    %%xmm4,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x2,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
-    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
-    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
-    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
-
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
-    "lea         " MEMLEA(0x8,0) ",%0          \n"
-    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
-    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
-    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
-    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
-    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
-    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
-    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
-    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
-    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
-    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
-    "lea         " MEMLEA(0x8,1) ",%1          \n"
-    "sub         $0x2,%2                       \n"
-    "jg          1b                            \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  asm volatile(
+      "vbroadcastf128 (%3),%%ymm4                \n"
+      "vbroadcastf128 0x10(%3),%%ymm5            \n"
+      "vbroadcastf128 0x20(%3),%%ymm6            \n"
+      "vbroadcastf128 0x30(%3),%%ymm7            \n"
+
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
+      "lea         0x8(%0),%0                    \n"
+      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
+                                                      // X
+      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
+      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
+      "vmovq       %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x2,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "movd        %3,%%xmm4                     \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+      "sub         %0,%1                         \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
+      "add         $0x10,%0                      \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
+      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
+      "punpckhwd   %%xmm5,%%xmm3                 \n"
+      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
+      "mulps       %%xmm4,%%xmm2                 \n"
+      "mulps       %%xmm4,%%xmm3                 \n"
+      "psrld       $0xd,%%xmm2                   \n"
+      "psrld       $0xd,%%xmm3                   \n"
+      "packssdw    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(scale)   // %3
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "vbroadcastss %3, %%ymm4                   \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+      "sub         %0,%1                         \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
+      "add         $0x20,%0                      \n"
+      "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
+      "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
+      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
+      "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
+      "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
+      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
+      "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "vbroadcastss %3, %%ymm4                   \n"
+      "sub         %0,%1                         \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
+      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm2", "xmm3");
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
                            int width) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
-    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),   // %0
-    "+d"(pixel_temp), // %1
-    "+r"(width)       // %2
-  : "r"(table_argb)   // %3
-  : "memory", "cc");
+  uintptr_t pixel_temp;
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb       (%0),%1                       \n"
+      "lea         0x4(%0),%0                    \n"
+      "movzb       0x00(%3,%1,4),%1              \n"
+      "mov         %b1,-0x4(%0)                  \n"
+      "movzb       -0x3(%0),%1                   \n"
+      "movzb       0x01(%3,%1,4),%1              \n"
+      "mov         %b1,-0x3(%0)                  \n"
+      "movzb       -0x2(%0),%1                   \n"
+      "movzb       0x02(%3,%1,4),%1              \n"
+      "mov         %b1,-0x2(%0)                  \n"
+      "movzb       -0x1(%0),%1                   \n"
+      "movzb       0x03(%3,%1,4),%1              \n"
+      "mov         %b1,-0x1(%0)                  \n"
+      "dec         %2                            \n"
+      "jg          1b                            \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-  uintptr_t pixel_temp = 0u;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),   // %0
-    "+d"(pixel_temp), // %1
-    "+r"(width)       // %2
-  : "r"(table_argb)   // %3
-  : "memory", "cc");
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width) {
+  uintptr_t pixel_temp;
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb       (%0),%1                       \n"
+      "lea         0x4(%0),%0                    \n"
+      "movzb       0x00(%3,%1,4),%1              \n"
+      "mov         %b1,-0x4(%0)                  \n"
+      "movzb       -0x3(%0),%1                   \n"
+      "movzb       0x01(%3,%1,4),%1              \n"
+      "mov         %b1,-0x3(%0)                  \n"
+      "movzb       -0x2(%0),%1                   \n"
+      "movzb       0x02(%3,%1,4),%1              \n"
+      "mov         %b1,-0x2(%0)                  \n"
+      "dec         %2                            \n"
+      "jg          1b                            \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff) {
-  uintptr_t pixel_temp = 0u;
-  uintptr_t table_temp = 0u;
-  asm volatile (
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0x8,%%xmm4                     \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "phaddw    %%xmm0,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS(2) ",%0             \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS(3) "            \n"
-    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
-    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
-    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
-    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
-    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
-    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-
-    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
-    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
-    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
-    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
-
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-
-    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
-    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
-    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
-    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "lea       " MEMLEA(0x10,3) ",%3           \n"
-    "sub       $0x4,%4                         \n"
-    "jg        1b                              \n"
-  : "+d"(pixel_temp),  // %0
-    "+a"(table_temp),  // %1
-    "+r"(src_argb),    // %2
-    "+r"(dst_argb),    // %3
-    "+rm"(width)       // %4
-  : "r"(luma),         // %5
-    "rm"(lumacoeff)    // %6
-  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
-  );
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff) {
+  uintptr_t pixel_temp;
+  uintptr_t table_temp;
+  asm volatile(
+      "movd        %6,%%xmm3                     \n"
+      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psllw       $0x8,%%xmm4                   \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%2),%%xmm0                   \n"
+      "pmaddubsw   %%xmm3,%%xmm0                 \n"
+      "phaddw      %%xmm0,%%xmm0                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "punpcklwd   %%xmm5,%%xmm0                 \n"
+      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
+      "add         %5,%1                         \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+
+      "movzb       (%2),%0                       \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,(%3)                      \n"
+      "movzb       0x1(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x1(%3)                   \n"
+      "movzb       0x2(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x2(%3)                   \n"
+      "movzb       0x3(%2),%0                    \n"
+      "mov         %b0,0x3(%3)                   \n"
+
+      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
+      "add         %5,%1                         \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+
+      "movzb       0x4(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x4(%3)                   \n"
+      "movzb       0x5(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x5(%3)                   \n"
+      "movzb       0x6(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x6(%3)                   \n"
+      "movzb       0x7(%2),%0                    \n"
+      "mov         %b0,0x7(%3)                   \n"
+
+      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
+      "add         %5,%1                         \n"
+      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
+
+      "movzb       0x8(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x8(%3)                   \n"
+      "movzb       0x9(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0x9(%3)                   \n"
+      "movzb       0xa(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0xa(%3)                   \n"
+      "movzb       0xb(%2),%0                    \n"
+      "mov         %b0,0xb(%3)                   \n"
+
+      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
+      "add         %5,%1                         \n"
+
+      "movzb       0xc(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0xc(%3)                   \n"
+      "movzb       0xd(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0xd(%3)                   \n"
+      "movzb       0xe(%2),%0                    \n"
+      "movzb       0x00(%1,%0,1),%0              \n"
+      "mov         %b0,0xe(%3)                   \n"
+      "movzb       0xf(%2),%0                    \n"
+      "mov         %b0,0xf(%3)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "lea         0x10(%3),%3                   \n"
+      "sub         $0x4,%4                       \n"
+      "jg          1b                            \n"
+      : "=&d"(pixel_temp),  // %0
+        "=&a"(table_temp),  // %1
+        "+r"(src_argb),     // %2
+        "+r"(dst_argb),     // %3
+        "+rm"(width)        // %4
+      : "r"(luma),          // %5
+        "rm"(lumacoeff)     // %6
+      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// begin NV21ToYUV24Row_C avx2 constants
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  uint8_t* src_y_ptr;
+  uint64_t src_offset = 0;
+  uint64_t width64;
+
+  width64 = width;
+  src_y_ptr = (uint8_t*)src_y;
+
+  asm volatile(
+      "vmovdqu     %5, %%ymm0                    \n"  // init blend value
+      "vmovdqu     %6, %%ymm1                    \n"  // init blend value
+      "vmovdqu     %7, %%ymm2                    \n"  // init blend value
+      //      "sub         $0x20, %3                     \n"  //sub 32 from
+      //      width for final loop
+
+      LABELALIGN
+      "1:                                        \n"      // label 1
+      "vmovdqu     (%0,%4), %%ymm3               \n"      // src_y
+      "vmovdqu     1(%1,%4), %%ymm4              \n"      // src_uv+1
+      "vmovdqu     (%1), %%ymm5                  \n"      // src_uv
+      "vpshufb     %8, %%ymm3, %%ymm13           \n"      // y, kSHUF0 for shuf
+      "vpshufb     %9, %%ymm4, %%ymm14           \n"      // uv+1, kSHUF1 for
+                                                          // shuf
+      "vpshufb     %10, %%ymm5, %%ymm15          \n"      // uv, kSHUF2 for
+                                                          // shuf
+      "vpshufb     %11, %%ymm3, %%ymm3           \n"      // y kSHUF3 for shuf
+      "vpshufb     %12, %%ymm4, %%ymm4           \n"      // uv+1 kSHUF4 for
+                                                          // shuf
+      "vpblendvb   %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
+      "vpblendvb   %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
+      "vpblendvb   %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
+      "vpblendvb   %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
+      "vpshufb     %13, %%ymm5, %%ymm15          \n"      // shuffle const
+      "vpor        %%ymm4, %%ymm3, %%ymm5        \n"      // get results
+      "vmovdqu     %%ymm12, 0x20(%2)             \n"      // store dst_yuv+20h
+      "vpor        %%ymm15, %%ymm5, %%ymm3       \n"      // get results
+      "add         $0x20, %4                     \n"      // add to src buffer
+                                                          // ptr
+      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"      // insert
+      "vperm2i128  $0x31, %%ymm13, %%ymm3, %%ymm5 \n"     // insert
+      "vmovdqu     %%ymm4, (%2)                  \n"      // store dst_yuv
+      "vmovdqu     %%ymm5, 0x40(%2)              \n"      // store dst_yuv+40h
+      "add         $0x60,%2                      \n"      // add to dst buffer
+                                                          // ptr
+      //      "cmp         %3, %4                        \n" //(width64 -
+      //      32 bytes) and src_offset
+      "sub         $0x20,%3                      \n"  // 32 pixels per loop
+      "jg          1b                            \n"
+      "vzeroupper                                \n"  // sse-avx2
+                                                      // transistions
+
+      : "+r"(src_y),      //%0
+        "+r"(src_vu),     //%1
+        "+r"(dst_yuv24),  //%2
+        "+r"(width64),    //%3
+        "+r"(src_offset)  //%4
+      : "m"(kBLEND0),     //%5
+        "m"(kBLEND1),     //%6
+        "m"(kBLEND2),     //%7
+        "m"(kSHUF0),      //%8
+        "m"(kSHUF1),      //%9
+        "m"(kSHUF2),      //%10
+        "m"(kSHUF3),      //%11
+        "m"(kSHUF4),      //%12
+        "m"(kSHUF5)       //%13
+      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
+        "xmm13", "xmm14", "xmm15");
+}
+#endif  // HAS_NV21TOYUV24ROW_AVX2
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
+                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+
+      "movdqu      %3,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_uv),        // %0
+        "+r"(dst_vu),        // %1
+        "+r"(width)          // %2
+      : "m"(kShuffleUVToVU)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif  // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),        // %0
+        "+r"(dst_vu),        // %1
+        "+r"(width)          // %2
+      : "m"(kShuffleUVToVU)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif  // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
+      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
+      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
+      "movdqu      0(%1,%5,1),%%xmm3             \n"
+      "lea         0x10(%0),%0                   \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "lea         0x10(%1),%1                   \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x1,%%xmm0                   \n"
+      "psrlw       $0x1,%%xmm1                   \n"
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pavgw       %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
+      "jg          1b                            \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
+      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
+      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
+      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "lea         0x20(%1),%1                   \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+  asm volatile(
+      "pxor        %%xmm1,%%xmm1                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd        (%0),%%xmm0                   \n"  // load float
+      "maxss       %%xmm1, %%xmm0                \n"  // clamp to zero
+      "add         4, %0                         \n"
+      "movd        %%xmm0, (%1)                  \n"  // store float
+      "add         4, %1                         \n"
+      "sub         $0x4,%2                       \n"  // 1 float per loop
+      "jg          1b                            \n"
+      : "+r"(src_x),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/row_neon.cc b/media/libaom/src/third_party/libyuv/source/row_neon.cc
index 1a72eb9039..a5aeaabfbd 100644
--- a/media/libaom/src/third_party/libyuv/source/row_neon.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_neon.cc
@@ -10,6 +10,8 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -20,1663 +22,1426 @@ extern "C" {
     !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.32    {d2[1]}, [%2]!                 \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
+#define READYUV422                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.32    {d2[0]}, [%1]!                 \n" \
+  "vld1.32    {d2[1]}, [%2]!                 \n"
 
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.8     {d3}, [%2]!                    \n"                             \
-    "vpaddl.u8  q1, q1                         \n"                             \
-    "vrshrn.u16 d2, q1, #1                     \n"
+#define READYUV444                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.8     {d2}, [%1]!                    \n" \
+  "vld1.8     {d3}, [%2]!                    \n" \
+  "vpaddl.u8  q1, q1                         \n" \
+  "vrshrn.u16 d2, q1, #1                     \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
+#define READYUV400                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d2, d3                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d3, d2                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READYUY2                                 \
+  "vld2.8     {d0, d2}, [%0]!                \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
-
-#define YUV422TORGB_SETUP_REG                                                  \
-    MEMACCESS([kUVToRB])                                                       \
-    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
-    MEMACCESS([kUVToG])                                                        \
-    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
-    MEMACCESS([kYToRgb])                                                       \
-    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
-
-#define YUV422TORGB                                                            \
-    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
-    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
-    "vmovl.u8   q0, d0                         \n" /* Y                      */\
-    "vmovl.s16  q10, d1                        \n"                             \
-    "vmovl.s16  q0, d0                         \n"                             \
-    "vmul.s32   q10, q10, q15                  \n"                             \
-    "vmul.s32   q0, q0, q15                    \n"                             \
-    "vqshrun.s32 d0, q0, #16                   \n"                             \
-    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
-    "vadd.s16   d18, d19                       \n"                             \
-    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
-    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
-    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
-    "vaddw.u16  q1, q1, d16                    \n"                             \
-    "vaddw.u16  q10, q10, d17                  \n"                             \
-    "vaddw.u16  q3, q3, d18                    \n"                             \
-    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
-    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
-    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
-    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
-    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
-    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
-    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
-    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
-    "vqshrun.s16 d21, q0, #6                   \n" /* G */
-
-// YUV to RGB conversion constants.
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR            (VR * 128 - YGB)
-
-static uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,
-                          0, 0, 0, 0, 0, 0, 0, 0 };
-static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
-                        0, 0, 0, 0, 0, 0, 0, 0 };
-static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
-static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
-
-#undef YG
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef BB
-#undef BG
-#undef BR
-
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV444
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV411
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToBGRARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_bgra,
+#define READUYVY                                 \
+  "vld2.8     {d2, d3}, [%0]!                \n" \
+  "vmov.u8    d0, d3                         \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
+
+#define YUVTORGB_SETUP                             \
+  "vld1.8     {d24}, [%[kUVToRB]]            \n"   \
+  "vld1.8     {d25}, [%[kUVToG]]             \n"   \
+  "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+  "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" \
+  "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" \
+  "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
+
+#define YUVTORGB                                                              \
+  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
+  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
+  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
+  "vmovl.s16  q10, d1                        \n"                              \
+  "vmovl.s16  q0, d0                         \n"                              \
+  "vmul.s32   q10, q10, q15                  \n"                              \
+  "vmul.s32   q0, q0, q15                    \n"                              \
+  "vqshrun.s32 d0, q0, #16                   \n"                              \
+  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
+  "vadd.s16   d18, d19                       \n"                              \
+  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
+  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
+  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
+  "vaddw.u16  q1, q1, d16                    \n"                              \
+  "vaddw.u16  q10, q10, d17                  \n"                              \
+  "vaddw.u16  q3, q3, d18                    \n"                              \
+  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
+  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
+  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
+  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
+  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
+  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
+  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
+  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
+  "vqshrun.s16 d21, q0, #6                   \n" /* G */
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vmov.u8    d19, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_bgra),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToABGRRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_abgr,
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8     d23, #255                     \n"
+      "1:                                        \n" READYUV444 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vst4.8      {d20, d21, d22, d23}, [%3]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_abgr),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8     d23, #255                     \n"
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vst4.8      {d20, d21, d22, d23}, [%3]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %5, %5, #8                    \n"
+      "vld1.8      {d23}, [%3]!                  \n"
+      "vst4.8      {d20, d21, d22, d23}, [%4]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(src_a),     // %3
+        "+r"(dst_argb),  // %4
+        "+r"(width)      // %5
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"
-    MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgba),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vmov.u8     d19, #255                     \n"  // YUVTORGB modified d19
+      "vst4.8      {d19, d20, d21, d22}, [%3]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_rgba),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_u),      // %1
-      "+r"(src_v),      // %2
-      "+r"(dst_rgb24),  // %3
-      "+r"(width)       // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRAWRow_NEON(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_raw,
-                       int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_raw),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTORGB565                                                           \
-    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
-    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
-    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
-    "vorr       q0, q0, q10                    \n"  /* BGR                  */
-
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vst3.8      {d20, d21, d22}, [%3]!        \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_u),      // %1
+        "+r"(src_v),      // %2
+        "+r"(dst_rgb24),  // %3
+        "+r"(width)       // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTORGB565                                                        \
+  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
+  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
+  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
+
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTOARGB1555                                                         \
-    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
-    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
-    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
-    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
-    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
-    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
-    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
-
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n" ARGBTORGB565
+      "vst1.8      {q0}, [%3]!                   \n"  // store 8 pixels RGB565.
+      "bgt         1b                            \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTOARGB1555                                                      \
+  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
+  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
+  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
+  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
+  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-#define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
-
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vmov.u8     d23, #255                     \n" ARGBTOARGB1555
+      "vst1.8      {q0}, [%3]!                   \n"  // store 8 pixels
+      "bgt         1b                            \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTOARGB4444                                                      \
+  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
+  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
+  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
+  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
+  "vorr       d0, d20, d21                   \n" /* BG                   */ \
+  "vorr       d1, d22, d23                   \n" /* RA                   */ \
+  "vzip.u8    d0, d1                         \n" /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB4444
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb4444),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&kUVToRB),   // %5
-      [kUVToG]"r"(&kUVToG),     // %6
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV400
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kUVToRB),   // %3
-      [kUVToG]"r"(&kUVToG),     // %4
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8     d4, #0x0f                     \n"  // vbic bits to clear
+      "1:                                        \n"
+
+      READYUV422 YUVTORGB
+      "subs        %4, %4, #8                    \n"
+      "vmov.u8     d23, #255                     \n" ARGBTOARGB4444
+      "vst1.8      {q0}, [%3]!                   \n"  // store 8 pixels
+      "bgt         1b                            \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb4444),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    "vmov.u8    d23, #255                      \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d20}, [%0]!                   \n"
-    "vmov       d21, d20                       \n"
-    "vmov       d22, d20                       \n"
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "d20", "d21", "d22", "d23"
-  );
-}
-
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8     d23, #255                     \n"
+      "1:                                        \n" READYUV400 YUVTORGB
+      "subs        %2, %2, #8                    \n"
+      "vst4.8      {d20, d21, d22, d23}, [%1]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d23, #255                     \n"
+      "1:                                        \n"
+      "vld1.8      {d20}, [%0]!                  \n"
+      "vmov        d21, d20                      \n"
+      "vmov        d22, d20                      \n"
+      "subs        %2, %2, #8                    \n"
+      "vst4.8      {d20, d21, d22, d23}, [%1]!   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d20", "d21", "d22", "d23");
+}
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READNV12
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&kUVToRB),   // %4
-      [kUVToG]"r"(&kUVToG),     // %5
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8     d23, #255                     \n"
+               "1:                                        \n" READNV12 YUVTORGB
+               "subs        %3, %3, #8                    \n"
+               "vst4.8      {d20, d21, d22, d23}, [%2]!   \n"
+               "bgt         1b                            \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_uv),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READNV21
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&kUVToRB),   // %4
-      [kUVToG]"r"(&kUVToG),     // %5
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READNV12
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&kUVToRB),   // %4
-      [kUVToG]"r"(&kUVToG),     // %5
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void NV21ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8     d23, #255                     \n"
+               "1:                                        \n" READNV21 YUVTORGB
+               "subs        %3, %3, #8                    \n"
+               "vst4.8      {d20, d21, d22, d23}, [%2]!   \n"
+               "bgt         1b                            \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_vu),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV12 YUVTORGB
+      "subs        %3, %3, #8                    \n"
+      "vst3.8      {d20, d21, d22}, [%2]!        \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_uv),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV21 YUVTORGB
+      "subs        %3, %3, #8                    \n"
+      "vst3.8      {d20, d21, d22}, [%2]!        \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READNV21
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&kUVToRB),   // %4
-      [kUVToG]"r"(&kUVToG),     // %5
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READNV12 YUVTORGB
+      "subs        %3, %3, #8                    \n" ARGBTORGB565
+      "vst1.8      {q0}, [%2]!                   \n"  // store 8 pixels RGB565.
+      "bgt         1b                            \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUY2
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_yuy2),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kUVToRB),   // %3
-      [kUVToG]"r"(&kUVToG),     // %4
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8     d23, #255                     \n"
+               "1:                                        \n" READYUY2 YUVTORGB
+               "subs        %2, %2, #8                    \n"
+               "vst4.8      {d20, d21, d22, d23}, [%1]!   \n"
+               "bgt         1b                            \n"
+               : "+r"(src_yuy2),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READUYVY
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_uyvy),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kUVToRB),   // %3
-      [kUVToG]"r"(&kUVToG),     // %4
-      [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8     d23, #255                     \n"
+               "1:                                        \n" READUYVY YUVTORGB
+               "subs        %2, %2, #8                    \n"
+               "vst4.8      {d20, d21, d22, d23}, [%1]!   \n"
+               "bgt         1b                            \n"
+               : "+r"(src_uyvy),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
-    MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
-    "bgt        1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pairs of UV
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop
+      "vst1.8      {q0}, [%1]!                   \n"  // store U
+      "vst1.8      {q1}, [%2]!                   \n"  // store V
+      "bgt         1b                            \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "q0"
-  );
-}
-
-// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "q0"
-  );
-}
-
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
-
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
-}
-
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load U
+      "vld1.8      {q1}, [%1]!                   \n"  // load V
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop
+      "vst2.8      {q0, q1}, [%2]!               \n"  // store 16 pairs of UV
+      "bgt         1b                            \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
                       int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r12, #-16                      \n"
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
-
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
-  );
-}
-
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
-
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
-}
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-#define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
-
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RGB
+      "vld3.8      {d1, d3, d5}, [%0]!           \n"  // next 8 RGB
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop
+      "vst1.8      {q0}, [%1]!                   \n"  // store R
+      "vst1.8      {q1}, [%2]!                   \n"  // store G
+      "vst1.8      {q2}, [%3]!                   \n"  // store B
+      "bgt         1b                            \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "d0", "d1", "d2"  // Clobber List
+  );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load R
+      "vld1.8      {q1}, [%1]!                   \n"  // load G
+      "vld1.8      {q2}, [%2]!                   \n"  // load B
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop
+      "vst3.8      {d0, d2, d4}, [%3]!           \n"  // store 8 RGB
+      "vst3.8      {d1, d3, d5}, [%3]!           \n"  // next 8 RGB
+      "bgt         1b                            \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
   );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 32
+      "subs        %2, %2, #32                   \n"  // 32 processed per loop
+      "vst1.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 32
+      "bgt         1b                            \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "vdup.8      q0, %2                        \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs        %1, %1, #16                   \n"  // 16 bytes per loop
+      "vst1.8      {q0}, [%0]!                   \n"  // store
+      "bgt         1b                            \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "q0");
+}
+
+// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "vdup.u32    q0, %2                        \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs        %1, %1, #4                    \n"  // 4 pixels per loop
+      "vst1.8      {q0}, [%0]!                   \n"  // store
+      "bgt         1b                            \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "q0");
+}
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add         %0, %0, %2                    \n"
+      "sub         %0, %0, #32                   \n"  // 32 bytes per loop
+
+      "1:                                        \n"
+      "vld1.8      {q1, q2}, [%0], %3            \n"  // src -= 32
+      "subs        %2, #32                       \n"  // 32 pixels per loop.
+      "vrev64.8    q0, q2                        \n"
+      "vrev64.8    q1, q1                        \n"
+      "vswp        d0, d1                        \n"
+      "vswp        d2, d3                        \n"
+      "vst1.8      {q0, q1}, [%1]!               \n"  // dst += 32
+      "bgt         1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "r"(-32)     // %3
+      : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov         r12, #-16                     \n"
+      "add         %0, %0, %2, lsl #1            \n"
+      "sub         %0, #16                       \n"
+
+      "1:                                        \n"
+      "vld2.8      {d0, d1}, [%0], r12           \n"  // src -= 16
+      "subs        %2, #8                        \n"  // 8 pixels per loop.
+      "vrev64.8    q0, q0                        \n"
+      "vst2.8      {d0, d1}, [%1]!               \n"  // dst += 16
+      "bgt         1b                            \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_uv),  // %1
+        "+r"(width)    // %2
+      :
+      : "cc", "memory", "r12", "q0");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov         r12, #-16                     \n"
+      "add         %0, %0, %3, lsl #1            \n"
+      "sub         %0, #16                       \n"
+
+      "1:                                        \n"
+      "vld2.8      {d0, d1}, [%0], r12           \n"  // src -= 16
+      "subs        %3, #8                        \n"  // 8 pixels per loop.
+      "vrev64.8    q0, q0                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // dst += 8
+      "vst1.8      {d1}, [%2]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "cc", "memory", "r12", "q0");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "add         %0, %0, %2, lsl #2            \n"
+      "sub         %0, #32                       \n"
+
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0], %3    \n"  // src -= 32
+      "subs        %2, #8                        \n"  // 8 pixels per loop.
+      "vrev64.8    d0, d0                        \n"
+      "vrev64.8    d1, d1                        \n"
+      "vrev64.8    d2, d2                        \n"
+      "vrev64.8    d3, d3                        \n"
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // dst += 32
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(-32)         // %3
+      : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  src_rgb24 += width * 3 - 24;
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8      {d0, d1, d2}, [%0], %3        \n"  // src -= 24
+      "subs        %2, #8                        \n"  // 8 pixels per loop.
+      "vrev64.8    d0, d0                        \n"
+      "vrev64.8    d1, d1                        \n"
+      "vrev64.8    d2, d2                        \n"
+      "vst3.8      {d0, d1, d2}, [%1]!           \n"  // dst += 24
+      "bgt         1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      : "r"(-24)          // %3
+      : "cc", "memory", "d0", "d1", "d2");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "vmov.u8     d4, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RGB24.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vst4.8      {d1, d2, d3, d4}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d4, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vswp.u8     d1, d3                        \n"  // swap R, B
+      "vst4.8      {d1, d2, d3, d4}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  asm volatile(
+      "vmov.u8     d0, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vswp.u8     d1, d3                        \n"  // swap R, B
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of RGBA.
+      "bgt         1b                            \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_rgba),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vswp.u8     d1, d3                        \n"  // swap R, B
+      "vst3.8      {d1, d2, d3}, [%1]!           \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt         1b                            \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
+  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
+  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
+
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "vmov.u8     d3, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 RGB565 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB                                                      \
+  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
+  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
+  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
+  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
+  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
+  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
+  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
+  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
+  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
-
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-#define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
-
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
-}
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
-}
-
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
-}
-
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+#define RGB555TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
+  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
+  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "vmov.u8     d3, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB1555 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB4444TOARGB                                                      \
+  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
+  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
+  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
+  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
+  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
+  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
+  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
+
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "vmov.u8     d3, #255                      \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB4444 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d1, d2, d3, d4}, [%0]!       \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vst3.8      {d1, d2, d3}, [%1]!           \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d1, d2, d3, d4}, [%0]!       \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vswp.u8     d1, d3                        \n"  // swap R, B
+      "vst3.8      {d1, d2, d3}, [%1]!           \n"  // store 8 pixels of RAW.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of YUY2.
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
+      "vst1.8      {q0}, [%1]!                   \n"  // store 16 pixels of Y.
+      "bgt         1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of UYVY.
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
+      "vst1.8      {q1}, [%1]!                   \n"  // store 16 pixels of Y.
+      "bgt         1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of YUY2.
+      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
+      "vst1.8      {d1}, [%1]!                   \n"  // store 8 U.
+      "vst1.8      {d3}, [%2]!                   \n"  // store 8 V.
+      "bgt         1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of UYVY.
+      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 U.
+      "vst1.8      {d2}, [%2]!                   \n"  // store 8 V.
+      "bgt         1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // stride + src_yuy2
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of YUY2.
+      "subs        %4, %4, #16                   \n"  // 16 pixels = 8 UVs.
+      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load next row YUY2.
+      "vrhadd.u8   d1, d1, d5                    \n"  // average rows of U
+      "vrhadd.u8   d3, d3, d7                    \n"  // average rows of V
+      "vst1.8      {d1}, [%2]!                   \n"  // store 8 U.
+      "vst1.8      {d3}, [%3]!                   \n"  // store 8 V.
+      "bgt         1b                            \n"
+      : "+r"(src_yuy2),     // %0
+        "+r"(stride_yuy2),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
+  );
+}
+
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // stride + src_uyvy
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of UYVY.
+      "subs        %4, %4, #16                   \n"  // 16 pixels = 8 UVs.
+      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load next row UYVY.
+      "vrhadd.u8   d0, d0, d4                    \n"  // average rows of U
+      "vrhadd.u8   d2, d2, d6                    \n"  // average rows of V
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 U.
+      "vst1.8      {d2}, [%3]!                   \n"  // store 8 V.
+      "bgt         1b                            \n"
+      : "+r"(src_uyvy),     // %0
+        "+r"(stride_uyvy),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
   );
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
-}
-
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
-}
-
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
-}
-
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-    "vdup.32    d2, %2                         \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d20, d20, d2                   \n"
-    "vqadd.u8   d21, d21, d2                   \n"
-    "vqadd.u8   d22, d22, d2                   \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
-                            int pix) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
-                            int pix) {
-  asm volatile (
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
-}
-
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "vld1.8      {q2}, [%3]                    \n"  // shuffler
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 4 pixels.
+      "subs        %2, %2, #4                    \n"  // 4 processed per loop
+      "vtbl.8      d2, {d0, d1}, d4              \n"  // look up 2 first pixels
+      "vtbl.8      d3, {d0, d1}, d5              \n"  // look up 2 next pixels
+      "vst1.8      {q1}, [%1]!                   \n"  // store 4.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {d0, d2}, [%0]!               \n"  // load 16 Ys
+      "vld1.8      {d1}, [%1]!                   \n"  // load 8 Us
+      "vld1.8      {d3}, [%2]!                   \n"  // load 8 Vs
+      "subs        %4, %4, #16                   \n"  // 16 pixels
+      "vst4.8      {d0, d1, d2, d3}, [%3]!       \n"  // Store 8 YUY2/16 pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {d1, d3}, [%0]!               \n"  // load 16 Ys
+      "vld1.8      {d0}, [%1]!                   \n"  // load 8 Us
+      "vld1.8      {d2}, [%2]!                   \n"  // load 8 Vs
+      "subs        %4, %4, #16                   \n"  // 16 pixels
+      "vst4.8      {d0, d1, d2, d3}, [%3]!       \n"  // Store 8 UYVY/16 pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d20, d21, d22, d23}, [%0]!   \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "vst1.8      {q0}, [%1]!                   \n"  // store 8 pixels RGB565.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vdup.32     d2, %2                        \n"  // dither4
+      "1:                                        \n"
+      "vld4.8      {d20, d21, d22, d23}, [%1]!   \n"  // load 8 pixels of ARGB.
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vqadd.u8    d20, d20, d2                  \n"
+      "vqadd.u8    d21, d21, d2                  \n"
+      "vqadd.u8    d22, d22, d2                  \n"  // add for dither
+      ARGBTORGB565
+      "vst1.8      {q0}, [%0]!                   \n"  // store 8 RGB565.
+      "bgt         1b                            \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
+                            int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d20, d21, d22, d23}, [%0]!   \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "vst1.8      {q0}, [%1]!                   \n"  // store 8 ARGB1555.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
+                            int width) {
+  asm volatile(
+      "vmov.u8     d4, #0x0f                     \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "vld4.8      {d20, d21, d22, d23}, [%0]!   \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "vst1.8      {q0}, [%1]!                   \n"  // store 8 ARGB4444.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d24, #25                      \n"  // B * 0.1016 coefficient
+      "vmov.u8     d25, #129                     \n"  // G * 0.5078 coefficient
+      "vmov.u8     d26, #66                      \n"  // R * 0.2578 coefficient
+      "vmov.u8     d27, #16                      \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d27                       \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop
+      "vst1.8      {q3}, [%1]!                   \n"  // store 16 A's.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d24, #29                      \n"  // B * 0.1140 coefficient
+      "vmov.u8     d25, #150                     \n"  // G * 0.5870 coefficient
+      "vmov.u8     d26, #77                      \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d24, #29                      \n"  // B * 0.1140 coefficient
+      "vmov.u8     d25, #150                     \n"  // G * 0.5870 coefficient
+      "vmov.u8     d26, #77                      \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 RGBA pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d1, d24                   \n"  // B
+      "vmlal.u8    q2, d2, d25                   \n"  // G
+      "vmlal.u8    q2, d3, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlsl.u8   q2, d1, d25                    \n"  // G
-    "vmlsl.u8   q2, d2, d26                    \n"  // R
-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
-
-    "vmull.u8   q3, d2, d24                    \n"  // R
-    "vmlsl.u8   q3, d1, d28                    \n"  // G
-    "vmlsl.u8   q3, d0, d27                    \n"  // B
-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
-
-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vmov.u8     d24, #112                     \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.u8     d25, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.u8     d26, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.u8     d27, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.u8     d28, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlsl.u8    q2, d1, d25                   \n"  // G
+      "vmlsl.u8    q2, d2, d26                   \n"  // R
+      "vadd.u16    q2, q2, q15                   \n"  // +128 -> unsigned
+
+      "vmull.u8    q3, d2, d24                   \n"  // R
+      "vmlsl.u8    q3, d1, d28                   \n"  // G
+      "vmlsl.u8    q3, d0, d27                   \n"  // B
+      "vadd.u16    q3, q3, q15                   \n"  // +128 -> unsigned
+
+      "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+        "q15");
+}
+
+// clang-format off
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "vmul.s16   q8, " #QB ", q10               \n" /* B                    */ \
+  "vmls.s16   q8, " #QG ", q11               \n" /* G                    */ \
+  "vmls.s16   q8, " #QR ", q12               \n" /* R                    */ \
+  "vadd.u16   q8, q8, q15                    \n" /* +128 -> unsigned     */ \
+  "vmul.s16   q9, " #QR ", q10               \n" /* R                    */ \
+  "vmls.s16   q9, " #QG ", q14               \n" /* G                    */ \
+  "vmls.s16   q9, " #QB ", q13               \n" /* B                    */ \
+  "vadd.u16   q9, q9, q15                    \n" /* +128 -> unsigned     */ \
+  "vqshrn.u16  d0, q8, #8                    \n" /* 16 bit to 8 bit U    */ \
+  "vqshrn.u16  d1, q9, #8                    \n" /* 16 bit to 8 bit V    */
+// clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_argb
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
+      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_argb),  // %0
     "+r"(src_stride_argb),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1684,1223 +1449,1119 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int pix) {
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
-    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
-    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
-    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
-    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_argb
+      "vmov.s16    q10, #127 / 2                 \n"  // UB / VR 0.500 coefficient
+      "vmov.s16    q11, #84 / 2                  \n"  // UG -0.33126 coefficient
+      "vmov.s16    q12, #43 / 2                  \n"  // UR -0.16874 coefficient
+      "vmov.s16    q13, #20 / 2                  \n"  // VB -0.08131 coefficient
+      "vmov.s16    q14, #107 / 2                 \n"  // VG -0.41869 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
+      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_argb),  // %0
     "+r"(src_stride_argb),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
-    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
-    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
-    "vrshr.u16  q2, q2, #1                     \n"
-    "vrshr.u16  q3, q3, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_bgra
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 BGRA pixels.
+      "vpaddl.u8   q3, q3                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more BGRA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 BGRA pixels.
+      "vpadal.u8   q3, q7                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q1, q1, #1                    \n"  // 2x average
+      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q3, q3, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q3, q2, q1)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_bgra),  // %0
     "+r"(src_stride_bgra),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_abgr
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ABGR pixels.
+      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ABGR pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ABGR pixels.
+      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_abgr),  // %0
     "+r"(src_stride_abgr),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
-    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
-    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_rgba
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 RGBA pixels.
+      "vpaddl.u8   q0, q1                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q3                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more RGBA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 RGBA pixels.
+      "vpadal.u8   q0, q5                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_rgba),  // %0
     "+r"(src_stride_rgba),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int pix) {
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_rgb24
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RGB24 pixels.
+      "vld3.8      {d1, d3, d5}, [%0]!           \n"  // load next 8 RGB24 pixels.
+      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
+      "vld3.8      {d8, d10, d12}, [%1]!         \n"  // load 8 more RGB24 pixels.
+      "vld3.8      {d9, d11, d13}, [%1]!         \n"  // load last 8 RGB24 pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_rgb24),  // %0
     "+r"(src_stride_rgb24),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int pix) {
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_raw
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+      "add         %1, %0, %1                    \n"  // src_stride + src_raw
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RAW pixels.
+      "vld3.8      {d1, d3, d5}, [%0]!           \n"  // load next 8 RAW pixels.
+      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
+      "vld3.8      {d8, d10, d12}, [%1]!         \n"  // load 8 more RAW pixels.
+      "vld3.8      {d9, d11, d13}, [%1]!         \n"  // load last 8 RAW pixels.
+      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q1, #1                    \n"
+      "vrshr.u16   q2, q2, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
   : "+r"(src_raw),  // %0
     "+r"(src_stride_raw),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_stride_rgb565),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_stride_argb1555),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
-
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
-
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_stride_argb4444),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
-}
-
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // R
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // R
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // B
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
-}
-
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // src_stride + src_argb
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%0]!                   \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vld1.8      {q0}, [%1]!                   \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%1]!                   \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vrshr.u16   q4, q4, #1                    \n"  // 2x average
+      "vrshr.u16   q5, q5, #1                    \n"
+      "vrshr.u16   q6, q6, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vmul.s16    q8, q4, q10                   \n"  // B
+      "vmls.s16    q8, q5, q11                   \n"  // G
+      "vmls.s16    q8, q6, q12                   \n"  // R
+      "vadd.u16    q8, q8, q15                   \n"  // +128 -> unsigned
+      "vmul.s16    q9, q6, q10                   \n"  // R
+      "vmls.s16    q9, q5, q14                   \n"  // G
+      "vmls.s16    q9, q4, q13                   \n"  // B
+      "vadd.u16    q9, q9, q15                   \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb565),         // %0
+        "+r"(src_stride_rgb565),  // %1
+        "+r"(dst_u),              // %2
+        "+r"(dst_v),              // %3
+        "+r"(width)               // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // src_stride + src_argb
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%0]!                   \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vld1.8      {q0}, [%1]!                   \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%1]!                   \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vrshr.u16   q4, q4, #1                    \n"  // 2x average
+      "vrshr.u16   q5, q5, #1                    \n"
+      "vrshr.u16   q6, q6, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vmul.s16    q8, q4, q10                   \n"  // B
+      "vmls.s16    q8, q5, q11                   \n"  // G
+      "vmls.s16    q8, q6, q12                   \n"  // R
+      "vadd.u16    q8, q8, q15                   \n"  // +128 -> unsigned
+      "vmul.s16    q9, q6, q10                   \n"  // R
+      "vmls.s16    q9, q5, q14                   \n"  // G
+      "vmls.s16    q9, q4, q13                   \n"  // B
+      "vadd.u16    q9, q9, q15                   \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+      : "+r"(src_argb1555),         // %0
+        "+r"(src_stride_argb1555),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // src_stride + src_argb
+      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8080                  \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%0]!                   \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vld1.8      {q0}, [%1]!                   \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8   d8, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d10, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d12, d2                       \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8      {q0}, [%1]!                   \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8   d9, d0                        \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
+
+      "vrshr.u16   q0, q4, #1                    \n"  // 2x average
+      "vrshr.u16   q1, q5, #1                    \n"
+      "vrshr.u16   q2, q6, #1                    \n"
+
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      RGBTOUV(q0, q1, q2)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+      : "+r"(src_argb4444),         // %0
+        "+r"(src_stride_argb4444),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d24, #25                      \n"  // B * 0.1016 coefficient
+      "vmov.u8     d25, #129                     \n"  // G * 0.5078 coefficient
+      "vmov.u8     d26, #66                      \n"  // R * 0.2578 coefficient
+      "vmov.u8     d27, #16                      \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 RGB565 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d27                       \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8     d24, #25                      \n"  // B * 0.1016 coefficient
+      "vmov.u8     d25, #129                     \n"  // G * 0.5078 coefficient
+      "vmov.u8     d26, #66                      \n"  // R * 0.2578 coefficient
+      "vmov.u8     d27, #16                      \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB1555 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d27                       \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8     d24, #25                      \n"  // B * 0.1016 coefficient
+      "vmov.u8     d25, #129                     \n"  // G * 0.5078 coefficient
+      "vmov.u8     d26, #66                      \n"  // R * 0.2578 coefficient
+      "vmov.u8     d27, #16                      \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB4444 pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d27                       \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d6, #25                       \n"  // B * 0.1016 coefficient
+      "vmov.u8     d5, #129                      \n"  // G * 0.5078 coefficient
+      "vmov.u8     d4, #66                       \n"  // R * 0.2578 coefficient
+      "vmov.u8     d7, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of BGRA.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q8, d1, d4                    \n"  // R
+      "vmlal.u8    q8, d2, d5                    \n"  // G
+      "vmlal.u8    q8, d3, d6                    \n"  // B
+      "vqrshrn.u16 d0, q8, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d7                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d6, #25                       \n"  // B * 0.1016 coefficient
+      "vmov.u8     d5, #129                      \n"  // G * 0.5078 coefficient
+      "vmov.u8     d4, #66                       \n"  // R * 0.2578 coefficient
+      "vmov.u8     d7, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ABGR.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q8, d0, d4                    \n"  // R
+      "vmlal.u8    q8, d1, d5                    \n"  // G
+      "vmlal.u8    q8, d2, d6                    \n"  // B
+      "vqrshrn.u16 d0, q8, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d7                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d4, #25                       \n"  // B * 0.1016 coefficient
+      "vmov.u8     d5, #129                      \n"  // G * 0.5078 coefficient
+      "vmov.u8     d6, #66                       \n"  // R * 0.2578 coefficient
+      "vmov.u8     d7, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of RGBA.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q8, d1, d4                    \n"  // B
+      "vmlal.u8    q8, d2, d5                    \n"  // G
+      "vmlal.u8    q8, d3, d6                    \n"  // R
+      "vqrshrn.u16 d0, q8, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d7                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d4, #25                       \n"  // B * 0.1016 coefficient
+      "vmov.u8     d5, #129                      \n"  // G * 0.5078 coefficient
+      "vmov.u8     d6, #66                       \n"  // R * 0.2578 coefficient
+      "vmov.u8     d7, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8      {d0, d1, d2}, [%0]!           \n"  // load 8 pixels of RGB24.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q8, d0, d4                    \n"  // B
+      "vmlal.u8    q8, d1, d5                    \n"  // G
+      "vmlal.u8    q8, d2, d6                    \n"  // R
+      "vqrshrn.u16 d0, q8, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d7                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8     d6, #25                       \n"  // B * 0.1016 coefficient
+      "vmov.u8     d5, #129                      \n"  // G * 0.5078 coefficient
+      "vmov.u8     d4, #66                       \n"  // R * 0.2578 coefficient
+      "vmov.u8     d7, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8      {d0, d1, d2}, [%0]!           \n"  // load 8 pixels of RAW.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q8, d0, d4                    \n"  // B
+      "vmlal.u8    q8, d1, d5                    \n"  // G
+      "vmlal.u8    q8, d2, d6                    \n"  // R
+      "vqrshrn.u16 d0, q8, #8                    \n"  // 16 bit to 8 bit Y
+      "vqadd.u8    d0, d7                        \n"
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+  asm volatile(
+      "vmov.u8     d4, #29                       \n"  // B * 0.1140 coefficient
+      "vmov.u8     d5, #150                      \n"  // G * 0.5870 coefficient
+      "vmov.u8     d6, #77                       \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld3.8      {d0, d1, d2}, [%0]!           \n"  // load 8 pixels of RGB24.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q4, d0, d4                    \n"  // B
+      "vmlal.u8    q4, d1, d5                    \n"  // G
+      "vmlal.u8    q4, d2, d6                    \n"  // R
+      "vqrshrn.u16 d0, q4, #8                    \n"  // 16 bit to 8 bit Y
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_yj),     // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+  asm volatile(
+      "vmov.u8     d6, #29                       \n"  // B * 0.1140 coefficient
+      "vmov.u8     d5, #150                      \n"  // G * 0.5870 coefficient
+      "vmov.u8     d4, #77                       \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld3.8      {d0, d1, d2}, [%0]!           \n"  // load 8 pixels of RAW.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q4, d0, d4                    \n"  // B
+      "vmlal.u8    q4, d1, d5                    \n"  // G
+      "vmlal.u8    q4, d2, d6                    \n"  // R
+      "vqrshrn.u16 d0, q4, #8                    \n"  // 16 bit to 8 bit Y
+      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_yj),   // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
 }
 
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
-  asm volatile (
-    "cmp        %4, #0                         \n"
-    "beq        100f                           \n"
-    "add        %2, %1                         \n"
-    "cmp        %4, #64                        \n"
-    "beq        75f                            \n"
-    "cmp        %4, #128                       \n"
-    "beq        50f                            \n"
-    "cmp        %4, #192                       \n"
-    "beq        25f                            \n"
-
-    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
-    "vdup.8     d4, %4                         \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vmull.u8   q13, d0, d4                    \n"
-    "vmull.u8   q14, d1, d4                    \n"
-    "vmlal.u8   q13, d2, d5                    \n"
-    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        1b                             \n"
-    "b          99f                            \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        25b                            \n"
-    "b          99f                            \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        50b                            \n"
-    "b          99f                            \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q0}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        75b                            \n"
-    "b          99f                            \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        100b                           \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
-  :
-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
-  );
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  asm volatile(
+      "cmp         %4, #0                        \n"
+      "beq         100f                          \n"
+      "add         %2, %1                        \n"
+      "cmp         %4, #128                      \n"
+      "beq         50f                           \n"
+
+      "vdup.8      d5, %4                        \n"
+      "rsb         %4, #256                      \n"
+      "vdup.8      d4, %4                        \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "vld1.8      {q1}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vmull.u8    q13, d0, d4                   \n"
+      "vmull.u8    q14, d1, d4                   \n"
+      "vmlal.u8    q13, d2, d5                   \n"
+      "vmlal.u8    q14, d3, d5                   \n"
+      "vrshrn.u16  d0, q13, #8                   \n"
+      "vrshrn.u16  d1, q14, #8                   \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         1b                            \n"
+      "b           99f                           \n"
+
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "vld1.8      {q1}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         50b                           \n"
+      "b           99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         100b                          \n"
+
+      "99:                                       \n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(src_stride),  // %2
+        "+r"(dst_width),   // %3
+        "+r"(y1_fraction)  // %4
+      :
+      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
 }
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %3, #8                         \n"
-    "blt        89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
-    "bge        8b                             \n"
-
-  "89:                                         \n"
-    "adds       %3, #8-1                       \n"
-    "blt        99f                            \n"
-
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
-    "bge        1b                             \n"
-
-  "99:                                         \n"
-
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
-  );
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs        %3, #8                        \n"
+      "blt         89f                           \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB0.
+      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 pixels of ARGB1.
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q10, d4, d3                   \n"  // db * a
+      "vmull.u8    q11, d5, d3                   \n"  // dg * a
+      "vmull.u8    q12, d6, d3                   \n"  // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+      "vqsub.u8    q2, q2, q10                   \n"  // dbg - dbg * a / 256
+      "vqsub.u8    d6, d6, d22                   \n"  // dr - dr * a / 256
+      "vqadd.u8    q0, q0, q2                    \n"  // + sbg
+      "vqadd.u8    d2, d2, d6                    \n"  // + sr
+      "vmov.u8     d3, #255                      \n"  // a = 255
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 pixels of ARGB.
+      "bge         8b                            \n"
+
+      "89:                                       \n"
+      "adds        %3, #8-1                      \n"
+      "blt         99f                           \n"
+
+      // Blend 1 pixels.
+      "1:                                        \n"
+      "vld4.8      {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+      "vld4.8      {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+      "subs        %3, %3, #1                    \n"  // 1 processed per loop.
+      "vmull.u8    q10, d4, d3                   \n"  // db * a
+      "vmull.u8    q11, d5, d3                   \n"  // dg * a
+      "vmull.u8    q12, d6, d3                   \n"  // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+      "vqsub.u8    q2, q2, q10                   \n"  // dbg - dbg * a / 256
+      "vqsub.u8    d6, d6, d22                   \n"  // dr - dr * a / 256
+      "vqadd.u8    q0, q0, q2                    \n"  // + sbg
+      "vqadd.u8    d2, d2, d6                    \n"  // + sr
+      "vmov.u8     d3, #255                      \n"  // a = 255
+      "vst4.8      {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+      "bge         1b                            \n"
+
+      "99:                                       \n"
+
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
 }
 
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d0, d3                    \n"  // b * a
-    "vmull.u8   q11, d1, d3                    \n"  // g * a
-    "vmull.u8   q12, d2, d3                    \n"  // r * a
-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q10, d0, d3                   \n"  // b * a
+      "vmull.u8    q11, d1, d3                   \n"  // g * a
+      "vmull.u8    q12, d2, d3                   \n"  // r * a
+      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "vdup.u16   q8, %2                         \n"
-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
-    "vdup.u16   q9, %3                         \n"  // interval multiply.
-    "vdup.u16   q10, %4                        \n"  // interval add
-
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
-    "vmovl.u8   q1, d2                         \n"
-    "vmovl.u8   q2, d4                         \n"
-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
-    "vmul.u16   q1, q1, q9                     \n"  // g
-    "vmul.u16   q2, q2, q9                     \n"  // r
-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
-    "vadd.u16   q1, q1, q10                    \n"  // g
-    "vadd.u16   q2, q2, q10                    \n"  // r
-    "vqmovn.u16 d0, q0                         \n"
-    "vqmovn.u16 d2, q1                         \n"
-    "vqmovn.u16 d4, q2                         \n"
-    MEMACCESS(0)
-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
-  );
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "vdup.u16    q8, %2                        \n"
+      "vshr.u16    q8, q8, #1                    \n"  // scale >>= 1
+      "vdup.u16    q9, %3                        \n"  // interval multiply.
+      "vdup.u16    q10, %4                       \n"  // interval add
+
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]        \n"  // load 8 pixels of ARGB.
+      "subs        %1, %1, #8                    \n"  // 8 processed per loop.
+      "vmovl.u8    q0, d0                        \n"  // b (0 .. 255)
+      "vmovl.u8    q1, d2                        \n"
+      "vmovl.u8    q2, d4                        \n"
+      "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+      "vqdmulh.s16 q1, q1, q8                    \n"  // g
+      "vqdmulh.s16 q2, q2, q8                    \n"  // r
+      "vmul.u16    q0, q0, q9                    \n"  // b * interval_size
+      "vmul.u16    q1, q1, q9                    \n"  // g
+      "vmul.u16    q2, q2, q9                    \n"  // r
+      "vadd.u16    q0, q0, q10                   \n"  // b + interval_offset
+      "vadd.u16    q1, q1, q10                   \n"  // g
+      "vadd.u16    q2, q2, q10                   \n"  // r
+      "vqmovn.u16  d0, q0                        \n"
+      "vqmovn.u16  d2, q1                        \n"
+      "vqmovn.u16  d4, q2                        \n"
+      "vst4.8      {d0, d2, d4, d6}, [%0]!       \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
 }
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
-
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
-    "vmovl.u8   q11, d22                       \n"
-    "vmovl.u8   q12, d24                       \n"
-    "vmovl.u8   q13, d26                       \n"
-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
-    "vqmovn.u16 d20, q10                       \n"
-    "vqmovn.u16 d22, q11                       \n"
-    "vqmovn.u16 d24, q12                       \n"
-    "vqmovn.u16 d26, q13                       \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
-  );
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "vdup.u32    q0, %3                        \n"  // duplicate scale value.
+      "vzip.u8     d0, d1                        \n"  // d0 aarrggbb.
+      "vshr.u16    q0, q0, #1                    \n"  // scale / 2.
+
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8      {d20, d22, d24, d26}, [%0]!   \n"  // load 8 pixels of ARGB.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmovl.u8    q10, d20                      \n"  // b (0 .. 255)
+      "vmovl.u8    q11, d22                      \n"
+      "vmovl.u8    q12, d24                      \n"
+      "vmovl.u8    q13, d26                      \n"
+      "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+      "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+      "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+      "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+      "vqmovn.u16  d20, q10                      \n"
+      "vqmovn.u16  d22, q11                      \n"
+      "vqmovn.u16  d24, q12                      \n"
+      "vqmovn.u16  d26, q13                      \n"
+      "vst4.8      {d20, d22, d24, d26}, [%1]!   \n"  // store 8 pixels of ARGB.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
 }
 
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
-    "vmov       d1, d0                         \n"  // G
-    "vmov       d2, d0                         \n"  // R
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d24, #29                      \n"  // B * 0.1140 coefficient
+      "vmov.u8     d25, #150                     \n"  // G * 0.5870 coefficient
+      "vmov.u8     d26, #77                      \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d24                   \n"  // B
+      "vmlal.u8    q2, d1, d25                   \n"  // G
+      "vmlal.u8    q2, d2, d26                   \n"  // R
+      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit B
+      "vmov        d1, d0                        \n"  // G
+      "vmov        d2, d0                        \n"  // R
+      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 //    b = (r * 35 + g * 68 + b * 17) >> 7
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d20, #17                       \n"  // BB coefficient
-    "vmov.u8    d21, #68                       \n"  // BG coefficient
-    "vmov.u8    d22, #35                       \n"  // BR coefficient
-    "vmov.u8    d24, #22                       \n"  // GB coefficient
-    "vmov.u8    d25, #88                       \n"  // GG coefficient
-    "vmov.u8    d26, #45                       \n"  // GR coefficient
-    "vmov.u8    d28, #24                       \n"  // BB coefficient
-    "vmov.u8    d29, #98                       \n"  // BG coefficient
-    "vmov.u8    d30, #50                       \n"  // BR coefficient
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
-    "vmlal.u8   q2, d1, d21                    \n"  // G
-    "vmlal.u8   q2, d2, d22                    \n"  // R
-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
-    "vmlal.u8   q3, d1, d25                    \n"  // G
-    "vmlal.u8   q3, d2, d26                    \n"  // R
-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
-    "vmlal.u8   q8, d1, d29                    \n"  // G
-    "vmlal.u8   q8, d2, d30                    \n"  // R
-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8     d20, #17                      \n"  // BB coefficient
+      "vmov.u8     d21, #68                      \n"  // BG coefficient
+      "vmov.u8     d22, #35                      \n"  // BR coefficient
+      "vmov.u8     d24, #22                      \n"  // GB coefficient
+      "vmov.u8     d25, #88                      \n"  // GG coefficient
+      "vmov.u8     d26, #45                      \n"  // GR coefficient
+      "vmov.u8     d28, #24                      \n"  // BB coefficient
+      "vmov.u8     d29, #98                      \n"  // BG coefficient
+      "vmov.u8     d30, #50                      \n"  // BR coefficient
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]        \n"  // load 8 ARGB pixels.
+      "subs        %1, %1, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d20                   \n"  // B to Sepia B
+      "vmlal.u8    q2, d1, d21                   \n"  // G
+      "vmlal.u8    q2, d2, d22                   \n"  // R
+      "vmull.u8    q3, d0, d24                   \n"  // B to Sepia G
+      "vmlal.u8    q3, d1, d25                   \n"  // G
+      "vmlal.u8    q3, d2, d26                   \n"  // R
+      "vmull.u8    q8, d0, d28                   \n"  // B to Sepia R
+      "vmlal.u8    q8, d1, d29                   \n"  // G
+      "vmlal.u8    q8, d2, d30                   \n"  // R
+      "vqshrn.u16  d0, q2, #7                    \n"  // 16 bit to 8 bit B
+      "vqshrn.u16  d1, q3, #7                    \n"  // 16 bit to 8 bit G
+      "vqshrn.u16  d2, q8, #7                    \n"  // 16 bit to 8 bit R
+      "vst4.8      {d0, d1, d2, d3}, [%0]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+        "q14", "q15");
 }
 
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
-
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
-    "vmovl.u8   q9, d18                        \n"  // g
-    "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q11, d22                       \n"  // a
-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "vld1.8      {q2}, [%3]                    \n"  // load 3 ARGB vectors.
+      "vmovl.s8    q0, d4                        \n"  // B,G coefficients s16.
+      "vmovl.s8    q1, d5                        \n"  // R,A coefficients s16.
+
+      "1:                                        \n"
+      "vld4.8      {d16, d18, d20, d22}, [%0]!   \n"  // load 8 ARGB pixels.
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
+      "vmovl.u8    q8, d16                       \n"  // b (0 .. 255) 16 bit
+      "vmovl.u8    q9, d18                       \n"  // g
+      "vmovl.u8    q10, d20                      \n"  // r
+      "vmovl.u8    q11, d22                      \n"  // a
+      "vmul.s16    q12, q8, d0[0]                \n"  // B = B * Matrix B
+      "vmul.s16    q13, q8, d1[0]                \n"  // G = B * Matrix G
+      "vmul.s16    q14, q8, d2[0]                \n"  // R = B * Matrix R
+      "vmul.s16    q15, q8, d3[0]                \n"  // A = B * Matrix A
+      "vmul.s16    q4, q9, d0[1]                 \n"  // B += G * Matrix B
+      "vmul.s16    q5, q9, d1[1]                 \n"  // G += G * Matrix G
+      "vmul.s16    q6, q9, d2[1]                 \n"  // R += G * Matrix R
+      "vmul.s16    q7, q9, d3[1]                 \n"  // A += G * Matrix A
+      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
+      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
+      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
+      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
+      "vmul.s16    q4, q10, d0[2]                \n"  // B += R * Matrix B
+      "vmul.s16    q5, q10, d1[2]                \n"  // G += R * Matrix G
+      "vmul.s16    q6, q10, d2[2]                \n"  // R += R * Matrix R
+      "vmul.s16    q7, q10, d3[2]                \n"  // A += R * Matrix A
+      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
+      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
+      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
+      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
+      "vmul.s16    q4, q11, d0[3]                \n"  // B += A * Matrix B
+      "vmul.s16    q5, q11, d1[3]                \n"  // G += A * Matrix G
+      "vmul.s16    q6, q11, d2[3]                \n"  // R += A * Matrix R
+      "vmul.s16    q7, q11, d3[3]                \n"  // A += A * Matrix A
+      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
+      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
+      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
+      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
+      "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+      "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+      "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+      "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+      "vst4.8      {d16, d18, d20, d22}, [%1]!   \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+        "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // load 8 more ARGB
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q0, d0, d1                    \n"  // multiply B
+      "vmull.u8    q1, d2, d3                    \n"  // multiply G
+      "vmull.u8    q2, d4, d5                    \n"  // multiply R
+      "vmull.u8    q3, d6, d7                    \n"  // multiply A
+      "vrshrn.u16  d0, q0, #8                    \n"  // 16 bit to 8 bit B
+      "vrshrn.u16  d1, q1, #8                    \n"  // 16 bit to 8 bit G
+      "vrshrn.u16  d2, q2, #8                    \n"  // 16 bit to 8 bit R
+      "vrshrn.u16  d3, q3, #8                    \n"  // 16 bit to 8 bit A
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
-#endif  // HAS_ARGBMULTIPLYROW_NEON
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 more ARGB
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vqadd.u8    q0, q0, q2                    \n"  // add B, G
+      "vqadd.u8    q1, q1, q3                    \n"  // add R, A
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 more ARGB
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vqsub.u8    q0, q0, q2                    \n"  // subtract B, G
+      "vqsub.u8    q1, q1, q3                    \n"  // subtract R, A
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2908,56 +2569,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "vmov.u8     d3, #255                      \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8      {d0}, [%0]!                   \n"  // load 8 sobelx.
+      "vld1.8      {d1}, [%1]!                   \n"  // load 8 sobely.
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vqadd.u8    d0, d0, d1                    \n"  // add
+      "vmov.u8     d1, d0                        \n"
+      "vmov.u8     d2, d0                        \n"
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 16 sobelx.
+      "vld1.8      {q1}, [%1]!                   \n"  // load 16 sobely.
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop.
+      "vqadd.u8    q0, q0, q1                    \n"  // add
+      "vst1.8      {q0}, [%2]!                   \n"  // store 16 pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2965,72 +2620,64 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "vmov.u8     d3, #255                      \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8      {d2}, [%0]!                   \n"  // load 8 sobelx.
+      "vld1.8      {d0}, [%1]!                   \n"  // load 8 sobely.
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vqadd.u8    d1, d0, d2                    \n"  // add
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
+      "bgt         1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
-    MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
-    "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2),            // %5
-    "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {d0}, [%0],%5                 \n"  // top
+      "vld1.8      {d1}, [%0],%6                 \n"
+      "vsubl.u8    q0, d0, d1                    \n"
+      "vld1.8      {d2}, [%1],%5                 \n"  // center * 2
+      "vld1.8      {d3}, [%1],%6                 \n"
+      "vsubl.u8    q1, d2, d3                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vld1.8      {d2}, [%2],%5                 \n"  // bottom
+      "vld1.8      {d3}, [%2],%6                 \n"
+      "subs        %4, %4, #8                    \n"  // 8 pixels
+      "vsubl.u8    q1, d2, d3                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vabs.s16    q0, q0                        \n"
+      "vqmovn.u16  d0, q0                        \n"
+      "vst1.8      {d0}, [%3]!                   \n"  // store 8 sobelx
+      "bgt         1b                            \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(src_y2),               // %2
+        "+r"(dst_sobelx),           // %3
+        "+r"(width)                 // %4
+      : "r"(2),                     // %5
+        "r"(6)                      // %6
+      : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
 
@@ -3038,45 +2685,353 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
-    "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1),            // %4
-    "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {d0}, [%0],%4                 \n"  // left
+      "vld1.8      {d1}, [%1],%4                 \n"
+      "vsubl.u8    q0, d0, d1                    \n"
+      "vld1.8      {d2}, [%0],%4                 \n"  // center * 2
+      "vld1.8      {d3}, [%1],%4                 \n"
+      "vsubl.u8    q1, d2, d3                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vld1.8      {d2}, [%0],%5                 \n"  // right
+      "vld1.8      {d3}, [%1],%5                 \n"
+      "subs        %3, %3, #8                    \n"  // 8 pixels
+      "vsubl.u8    q1, d2, d3                    \n"
+      "vadd.s16    q0, q0, q1                    \n"
+      "vabs.s16    q0, q0                        \n"
+      "vqmovn.u16  d0, q0                        \n"
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 sobely
+      "bgt         1b                            \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(dst_sobely),           // %2
+        "+r"(width)                 // %3
+      : "r"(1),                     // %4
+        "r"(6)                      // %5
+      : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// the regoster must be d0 to d15 and indexed with [0] or [1] to access
+// the float in the first or second float of the d-reg
+
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8      {q1}, [%0]!                   \n"  // load 8 shorts
+      "subs        %2, %2, #8                    \n"  // 8 pixels per loop
+      "vmovl.u16   q2, d2                        \n"  // 8 int's
+      "vmovl.u16   q3, d3                        \n"
+      "vcvt.f32.u32 q2, q2                       \n"  // 8 floats
+      "vcvt.f32.u32 q3, q3                       \n"
+      "vmul.f32    q2, q2, %y3                   \n"  // adjust exponent
+      "vmul.f32    q3, q3, %y3                   \n"
+      "vqshrn.u32  d2, q2, #13                   \n"  // isolate halffloat
+      "vqshrn.u32  d3, q3, #13                   \n"
+      "vst1.8      {q1}, [%1]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src),              // %0
+        "+r"(dst),              // %1
+        "+r"(width)             // %2
+      : "w"(1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8      {q1}, [%0]!                   \n"  // load 8 shorts
+      "subs        %2, %2, #8                    \n"  // 8 pixels per loop
+      "vmovl.u16   q2, d2                        \n"  // 8 int's
+      "vmovl.u16   q3, d3                        \n"
+      "vcvt.f32.u32 q2, q2                       \n"  // 8 floats
+      "vcvt.f32.u32 q3, q3                       \n"
+      "vmul.f32    q2, q2, %y3                   \n"  // adjust exponent
+      "vmul.f32    q3, q3, %y3                   \n"
+      "vqshrn.u32  d2, q2, #13                   \n"  // isolate halffloat
+      "vqshrn.u32  d3, q3, #13                   \n"
+      "vst1.8      {q1}, [%1]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8      {d2}, [%0]!                   \n"  // load 8 bytes
+      "subs        %2, %2, #8                    \n"  // 8 pixels per loop
+      "vmovl.u8    q1, d2                        \n"  // 8 shorts
+      "vmovl.u16   q2, d2                        \n"  // 8 ints
+      "vmovl.u16   q3, d3                        \n"
+      "vcvt.f32.u32 q2, q2                       \n"  // 8 floats
+      "vcvt.f32.u32 q3, q3                       \n"
+      "vmul.f32    q2, q2, %y3                   \n"  // scale
+      "vmul.f32    q3, q3, %y3                   \n"
+      "vst1.8      {q2, q3}, [%1]!               \n"  // store 8 floats
+      "bgt         1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+                   const uint16_t* src1,
+                   const uint16_t* src2,
+                   const uint16_t* src3,
+                   const uint16_t* src4,
+                   uint32_t* dst,
+                   int width) {
+  asm volatile(
+      "vmov.u16    d6, #4                        \n"  // constant 4
+      "vmov.u16    d7, #6                        \n"  // constant 6
+
+      "1:                                        \n"
+      "vld1.16     {q1}, [%0]!                   \n"  // load 8 samples, 5 rows
+      "vld1.16     {q2}, [%4]!                   \n"
+      "vaddl.u16   q0, d2, d4                    \n"  // * 1
+      "vaddl.u16   q1, d3, d5                    \n"  // * 1
+      "vld1.16     {q2}, [%1]!                   \n"
+      "vmlal.u16   q0, d4, d6                    \n"  // * 4
+      "vmlal.u16   q1, d5, d6                    \n"  // * 4
+      "vld1.16     {q2}, [%2]!                   \n"
+      "vmlal.u16   q0, d4, d7                    \n"  // * 6
+      "vmlal.u16   q1, d5, d7                    \n"  // * 6
+      "vld1.16     {q2}, [%3]!                   \n"
+      "vmlal.u16   q0, d4, d6                    \n"  // * 4
+      "vmlal.u16   q1, d5, d6                    \n"  // * 4
+      "subs        %6, %6, #8                    \n"  // 8 processed per loop
+      "vst1.32     {q0, q1}, [%5]!               \n"  // store 8 samples
+      "bgt         1b                            \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+  const uint32_t* src1 = src + 1;
+  const uint32_t* src2 = src + 2;
+  const uint32_t* src3 = src + 3;
+  asm volatile(
+      "vmov.u32    q10, #4                       \n"  // constant 4
+      "vmov.u32    q11, #6                       \n"  // constant 6
+
+      "1:                                        \n"
+      "vld1.32     {q0, q1}, [%0]!               \n"  // load 12 source samples
+      "vld1.32     {q2}, [%0]                    \n"
+      "vadd.u32    q0, q0, q1                    \n"  // * 1
+      "vadd.u32    q1, q1, q2                    \n"  // * 1
+      "vld1.32     {q2, q3}, [%2]!               \n"
+      "vmla.u32    q0, q2, q11                   \n"  // * 6
+      "vmla.u32    q1, q3, q11                   \n"  // * 6
+      "vld1.32     {q2, q3}, [%1]!               \n"
+      "vld1.32     {q8, q9}, [%3]!               \n"
+      "vadd.u32    q2, q2, q8                    \n"  // add rows for * 4
+      "vadd.u32    q3, q3, q9                    \n"
+      "vmla.u32    q0, q2, q10                   \n"  // * 4
+      "vmla.u32    q1, q3, q10                   \n"  // * 4
+      "subs        %5, %5, #8                    \n"  // 8 processed per loop
+      "vqshrn.u32  d0, q0, #8                    \n"  // round and pack
+      "vqshrn.u32  d1, q1, #8                    \n"
+      "vst1.u16    {q0}, [%4]!                   \n"  // store 8 samples
+      "bgt         1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(width)  // %5
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {q2}, [%0]!                   \n"  // load 16 Y values
+      "vld2.8      {d0, d2}, [%1]!               \n"  // load 8 VU values
+      "vmov        d1, d0                        \n"
+      "vzip.u8     d0, d1                        \n"  // VV
+      "vmov        d3, d2                        \n"
+      "vzip.u8     d2, d3                        \n"  // UU
+      "subs        %3, %3, #16                   \n"  // 16 pixels per loop
+      "vst3.8      {d0, d2, d4}, [%2]!           \n"  // store 16 YUV pixels
+      "vst3.8      {d1, d3, d5}, [%2]!           \n"
+      "bgt         1b                            \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_yuv24),  // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // src_stride + src_AYUV
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 AYUV pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 AYUV
+                                                      // pixels.
+      "vpaddl.u8   q0, q0                        \n"  // V 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // U 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more AYUV
+                                                      // pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 AYUV
+                                                      // pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vqrshrun.s16 d1, q0, #2                   \n"  // 2x2 average
+      "vqrshrun.s16 d0, q1, #2                   \n"
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop.
+      "vst2.8      {d0, d1}, [%2]!               \n"  // store 8 pixels UV.
+      "bgt         1b                            \n"
+      : "+r"(src_ayuv),         // %0
+        "+r"(src_stride_ayuv),  // %1
+        "+r"(dst_uv),           // %2
+        "+r"(width)             // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // src_stride + src_AYUV
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 AYUV pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 AYUV
+                                                      // pixels.
+      "vpaddl.u8   q0, q0                        \n"  // V 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // U 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more AYUV
+                                                      // pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 AYUV
+                                                      // pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vqrshrun.s16 d0, q0, #2                   \n"  // 2x2 average
+      "vqrshrun.s16 d1, q1, #2                   \n"
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop.
+      "vst2.8      {d0, d1}, [%2]!               \n"  // store 8 pixels VU.
+      "bgt         1b                            \n"
+      : "+r"(src_ayuv),         // %0
+        "+r"(src_stride_ayuv),  // %1
+        "+r"(dst_vu),           // %2
+        "+r"(width)             // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 AYUV pixels
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 AYUV pixels
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop
+      "vst1.8      {q2}, [%1]!                   \n"  // store 16 Y's.
+      "bgt         1b                            \n"
+      : "+r"(src_ayuv),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {d0, d2}, [%0]!               \n"  // load 16 UV values
+      "vld2.8      {d1, d3}, [%0]!               \n"
+      "vorr.u8     q2, q0, q0                    \n"  // move U after V
+      "subs        %2, %2, #16                   \n"  // 16 pixels per loop
+      "vst2.8      {q1, q2}, [%1]!               \n"  // store 16 VU pixels
+      "bgt         1b                            \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_vu),  // %1
+        "+r"(width)    // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load 16 U values
+      "vld1.8      {q1}, [%2]!                   \n"  // load 16 V values
+      "vld1.8      {q2}, [%1]!                   \n"
+      "vld1.8      {q3}, [%3]!                   \n"
+      "vpaddl.u8   q0, q0                        \n"  // half size
+      "vpaddl.u8   q1, q1                        \n"
+      "vpadal.u8   q0, q2                        \n"
+      "vpadal.u8   q1, q3                        \n"
+      "vqrshrn.u16 d0, q0, #2                    \n"
+      "vqrshrn.u16 d1, q1, #2                    \n"
+      "subs        %5, %5, #16                   \n"  // 16 src pixels per loop
+      "vst2.8      {d0, d1}, [%4]!               \n"  // store 8 UV pixels
+      "bgt         1b                            \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libaom/src/third_party/libyuv/source/row_neon64.cc b/media/libaom/src/third_party/libyuv/source/row_neon64.cc
index 5d015454b0..d5258a3aef 100644
--- a/media/libaom/src/third_party/libyuv/source/row_neon64.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_neon64.cc
@@ -19,3066 +19,3366 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.s}[0], [%1], #4            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.s}[1], [%2], #4            \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.h}[0], [%1], #2            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v2.h}[1], [%2], #2            \n"                             \
-    "zip1       v1.8b, v2.8b, v2.8b            \n"
+#define READYUV422                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.s}[0], [%1], #4            \n" \
+  "ld1        {v1.s}[1], [%2], #4            \n"
 
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.d}[0], [%1], #8            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.d}[1], [%2], #8            \n"                             \
-    "uaddlp     v1.8h, v1.16b                  \n"                             \
-    "rshrn      v1.8b, v1.8h, #1               \n"
+#define READYUV444                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.d}[0], [%1], #8            \n" \
+  "ld1        {v1.d}[1], [%2], #8            \n" \
+  "uaddlp     v1.8h, v1.16b                  \n" \
+  "rshrn      v1.8b, v1.8h, #1               \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    "movi       v1.8b , #128                   \n"
+#define READYUV400                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "movi       v1.8b , #128                   \n"
 
 // Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV12                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV21                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v3.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v1.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
-    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
-    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READYUY2                                 \
+  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
+  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
+  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
-    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
-
-#define YUV422TORGB_SETUP_REG                                                  \
-    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
-    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
-    "movi       v27.8h, #128                   \n"                             \
-    "movi       v28.8h, #102                   \n"                             \
-    "movi       v29.8h, #25                    \n"                             \
-    "movi       v30.8h, #52                    \n"
-
-#define YUV422TORGB(vR, vG, vB)                                                \
-    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
-    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
-    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
-    "ushll      v0.4s, v0.4h, #0               \n"                             \
-    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
-    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
-    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
-    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
-    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
-    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
-    "uxtl       v2.8h, v2.8b                   \n"                             \
-    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
-    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
-    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
-    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
-    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
-    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
-    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
-    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
-    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
-    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
-    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
-    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
-    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
-    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
-    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
-
-// YUV to RGB conversion constants.
-// Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR            (VR * 128 - YGB)
-
-static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
-static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
-
-#undef YG
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef BB
-#undef BG
-#undef BR
-
-#define RGBTOUV_SETUP_REG                                                      \
-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
-
-
-#ifdef HAS_I444TOARGBROW_NEON
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+#define READUYVY                                 \
+  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
+  "orr        v0.8b, v3.8b, v3.8b            \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
+
+#define YUVTORGB_SETUP                                      \
+  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
+  "ld1r       {v31.4s}, [%[kYToRgb]]                    \n" \
+  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]]            \n" \
+  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]             \n"
+
+// clang-format off
+
+#define YUVTORGB(vR, vG, vB)                                        \
+  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */ \
+  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */ \
+  "ushll2     v3.4s, v0.8h, #0               \n" /* Y */            \
+  "ushll      v0.4s, v0.4h, #0               \n"                    \
+  "mul        v3.4s, v3.4s, v31.4s           \n"                    \
+  "mul        v0.4s, v0.4s, v31.4s           \n"                    \
+  "sqshrun    v0.4h, v0.4s, #16              \n"                    \
+  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */            \
+  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */ \
+  "mov        v2.d[0], v1.d[1]               \n" /* Extract V */    \
+  "uxtl       v2.8h, v2.8b                   \n"                    \
+  "uxtl       v1.8h, v1.8b                   \n" /* Extract U */    \
+  "mul        v3.8h, v27.8h, v1.8h           \n"                    \
+  "mul        v5.8h, v29.8h, v1.8h           \n"                    \
+  "mul        v6.8h, v30.8h, v2.8h           \n"                    \
+  "mul        v7.8h, v28.8h, v2.8h           \n"                    \
+  "sqadd      v6.8h, v6.8h, v5.8h            \n"                    \
+  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */            \
+  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */            \
+  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */            \
+  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */            \
+  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */            \
+  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */            \
+  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */            \
+  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */            \
+  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
+
+// clang-format on
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n" /* A */
+      "1:                                        \n"
     READYUV444
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                 \n"
-    "movi       v23.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "subs        %w4, %w4, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I444TOARGBROW_NEON
 
-#ifdef HAS_I422TOARGBROW_NEON
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOARGBROW_NEON
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n" /* A */
 
-#ifdef HAS_I411TOARGBROW_NEON
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
-    READYUV411
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
+      "1:                                        \n"
+    READYUV422
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "subs        %w4, %w4, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I411TOARGBROW_NEON
 
-#ifdef HAS_I422TOBGRAROW_NEON
-void I422ToBGRARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_bgra,
-                        int width) {
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v21, v22, v23)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v20.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "ld1         {v23.8b}, [%3], #8            \n"
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+      "subs        %w5, %w5, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
-      "+r"(dst_bgra),  // %3
-      "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+      "+r"(src_a),     // %3
+      "+r"(dst_argb),  // %4
+      "+r"(width)      // %5
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I422TOBGRAROW_NEON
 
-#ifdef HAS_I422TOABGRROW_NEON
-void I422ToABGRRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_abgr,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v20.8b, #255                  \n" /* A */
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v20, v21, v22)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_abgr),  // %3
-      "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOABGRROW_NEON
-
-#ifdef HAS_I422TORGBAROW_NEON
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB(v23, v22, v21)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v20.8b, #255                   \n" /* A */
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v23, v22, v21)
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "subs        %w4, %w4, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_rgba),  // %3
       "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I422TORGBAROW_NEON
 
-#ifdef HAS_I422TORGB24ROW_NEON
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "subs        %w4, %w4, #8                  \n"
+      "st3         {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_rgb24), // %3
       "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I422TORGB24ROW_NEON
 
-#ifdef HAS_I422TORAWROW_NEON
-void I422ToRAWRow_NEON(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_raw,
-                       int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
-    READYUV422
-    YUV422TORGB(v20, v21, v22)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_raw),   // %3
-      "+r"(width)      // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TORAWROW_NEON
+#define ARGBTORGB565                                                        \
+  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
 
-#define ARGBTORGB565                                                           \
-    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
+// clang-format off
 
-#ifdef HAS_I422TORGB565ROW_NEON
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+  asm volatile(
+    YUVTORGB_SETUP
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #8                  \n"
     ARGBTORGB565
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TORGB565ROW_NEON
-
-#define ARGBTOARGB1555                                                         \
-    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
-    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
-    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
-
-#ifdef HAS_I422TOARGB1555ROW_NEON
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels RGB565.
+      "b.gt        1b                            \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+#define ARGBTOARGB1555                                                      \
+  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
+  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
+  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+  asm volatile(
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #8                  \n"
     ARGBTOARGB1555
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-#endif  // HAS_I422TOARGB1555ROW_NEON
-
-#define ARGBTOARGB4444                                                         \
-    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
-    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
-    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
-    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
-    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
-    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
-    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
-    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
-
-#ifdef HAS_I422TOARGB4444ROW_NEON
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels RGB565.
+      "b.gt        1b                            \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+// clang-format on
+
+#define ARGBTOARGB4444                                                       \
+  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
+  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
+  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
+  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
+  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
+  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
+  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
                             int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v4.16b, #0x0f                 \n"  // bits to clear with vbic.
+      "1:                                        \n"
     READYUV422
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    "movi       v23.8b, #255                   \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #8                  \n"
+      "movi        v23.8b, #255                  \n"
     ARGBTOARGB4444
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%1, 128]          \n"
+      "prfm        pldl1keep, [%2, 128]          \n"
+      "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels ARGB4444.
+      "b.gt        1b                            \n"
     : "+r"(src_y),    // %0
       "+r"(src_u),    // %1
       "+r"(src_v),    // %2
       "+r"(dst_argb4444),  // %3
       "+r"(width)     // %4
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I422TOARGB4444ROW_NEON
 
-#ifdef HAS_I400TOARGBROW_NEON
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  int64 width64 = (int64)(width);
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READYUV400
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_I400TOARGBROW_NEON
-
-#ifdef HAS_J400TOARGBROW_NEON
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v20.8b}, [%0], #8             \n"
-    "orr        v21.8b, v20.8b, v20.8b         \n"
-    "orr        v22.8b, v20.8b, v20.8b         \n"
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_J400TOARGBROW_NEON
 
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
+      "ld1         {v20.8b}, [%0], #8            \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "orr         v21.8b, v20.8b, v20.8b        \n"
+      "orr         v22.8b, v20.8b, v20.8b        \n"
+      "subs        %w2, %w2, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v20", "v21", "v22", "v23");
+}
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READNV12
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    MEMACCESS(2)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 256]          \n"
+      "subs        %w3, %w3, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_NV12TOARGBROW_NEON
 
-#ifdef HAS_NV21TOARGBROW_NEON
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READNV21
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    MEMACCESS(2)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 256]          \n"
+      "subs        %w3, %w3, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
+      "+r"(src_vu),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_NV21TOARGBROW_NEON
 
-#ifdef HAS_NV12TORGB565ROW_NEON
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          int width) {
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "1:                                        \n"
     READNV12
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 256]          \n"
+      "subs        %w3, %w3, #8                  \n"
+      "st3         {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
+      "+r"(dst_rgb24),  // %2
       "+r"(width)      // %3
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_NV12TORGB565ROW_NEON
 
-#ifdef HAS_NV21TORGB565ROW_NEON
-void NV21ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          int width) {
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "1:                                        \n"
     READNV21
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w3, %w3, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%1, 256]          \n"
+      "subs        %w3, %w3, #8                  \n"
+      "st3         {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
+      "+r"(src_vu),    // %1
+      "+r"(dst_rgb24),  // %2
       "+r"(width)      // %3
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_NV21TORGB565ROW_NEON
 
-#ifdef HAS_YUY2TOARGBROW_NEON
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile(
+      YUVTORGB_SETUP "1:                                        \n" READNV12
+                     "prfm        pldl1keep, [%0, 448]          \n" YUVTORGB(
+                         v22, v21, v20) ARGBTORGB565
+      "prfm        pldl1keep, [%1, 256]          \n"
+      "subs        %w3, %w3, #8                  \n"
+      "st1         {v0.8h}, [%2], 16             \n"  // store 8 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  int64 width64 = (int64)(width);
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READYUY2
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
-    "b.gt       1b                             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+    YUVTORGB(v22, v21, v20)
+      "subs        %w2, %w2, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_yuy2),  // %0
       "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_YUY2TOARGBROW_NEON
 
-#ifdef HAS_UYVYTOARGBROW_NEON
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
                         int width) {
-  int64 width64 = (int64)(width);
   asm volatile (
-    YUV422TORGB_SETUP_REG
-  "1:                                          \n"
+    YUVTORGB_SETUP
+      "movi        v23.8b, #255                  \n"
+      "1:                                        \n"
     READUYVY
-    YUV422TORGB(v22, v21, v20)
-    "subs       %w2, %w2, #8                   \n"
-    "movi       v23.8b, #255                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
-    "b.gt       1b                             \n"
+    YUVTORGB(v22, v21, v20)
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"
+      "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+      "b.gt        1b                            \n"
     : "+r"(src_uyvy),  // %0
       "+r"(dst_argb),  // %1
-      "+r"(width64)    // %2
-    : [kUVBiasBGR]"r"(&kUVBiasBGR),
-      [kYToRgb]"r"(&kYToRgb)
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
-#endif  // HAS_UYVYTOARGBROW_NEON
 
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-#ifdef HAS_SPLITUVROW_NEON
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store U
-    MEMACCESS(2)
-    "st1        {v1.16b}, [%2], #16            \n"  // store V
-    "b.gt       1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
+      "st1         {v0.16b}, [%1], #16           \n"  // store U
+      "st1         {v1.16b}, [%2], #16           \n"  // store V
+      "b.gt        1b                            \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
-#endif  // HAS_SPLITUVROW_NEON
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-#ifdef HAS_MERGEUVROW_NEON
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load U
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load V
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
-    "b.gt       1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_MERGEUVROW_NEON
-
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-#ifdef HAS_COPYROW_NEON
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
-    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_COPYROW_NEON
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt      1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt      1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-#ifdef HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %2                     \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
-    "rev64      v0.16b, v0.16b                 \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width64)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
-}
-#endif  // HAS_MIRRORROW_NEON
-
-#ifdef HAS_MIRRORUVROW_NEON
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load U
+      "ld1         {v1.16b}, [%1], #16           \n"  // load V
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
+      "st2         {v0.16b,v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
+      "b.gt        1b                            \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
                       int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
-    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
-    "rev64      v0.8b, v0.8b                   \n"
-    "rev64      v1.8b, v1.8b                   \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width64)    // %3
-  : "r"((ptrdiff_t)-16)      // %4
-  : "cc", "memory", "v0", "v1"
-  );
-}
-#endif  // HAS_MIRRORUVROW_NEON
-
-#ifdef HAS_ARGBMIRRORROW_NEON
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  int64 width64 = (int64) width;
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, %0, #16                    \n"
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    "rev64      v0.4s, v0.4s                   \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width64)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
-}
-#endif  // HAS_ARGBMIRRORROW_NEON
-
-#ifdef HAS_RGB24TOARGBROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movi       v4.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_RGB24TOARGBROW_NEON
-
-#ifdef HAS_RAWTOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movi       v5.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
-}
-#endif  // HAS_RAWTOARGBROW_NEON
-
-#define RGB565TOARGB                                                           \
-    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
-    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
-    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
-    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
-    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
-    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
-    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
-    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
-
-#ifdef HAS_RGB565TOARGBROW_NEON
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
-  );
-}
-#endif  // HAS_RGB565TOARGBROW_NEON
-
-#define ARGB1555TOARGB                                                         \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
-                                                                               \
-    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
-    "xtn2       v3.16b, v2.8h                  \n"                             \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
-    "dup        v1.2D, v0.D[1]                 \n"                             \
-    "dup        v3.2D, v2.D[1]                 \n"
+  asm volatile(
+      "1:                                        \n"
+      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
+      "st1         {v0.16b}, [%1], #16           \n"  // store R
+      "st1         {v1.16b}, [%2], #16           \n"  // store G
+      "st1         {v2.16b}, [%3], #16           \n"  // store B
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load R
+      "ld1         {v1.16b}, [%1], #16           \n"  // load G
+      "ld1         {v2.16b}, [%2], #16           \n"  // load B
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
+      "st3         {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+
+// Copy multiple of 32.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ldp         q0, q1, [%0], #32             \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #32                 \n"  // 32 processed per loop
+      "stp         q0, q1, [%1], #32             \n"
+      "b.gt        1b                            \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "dup         v0.16b, %w2                   \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs        %w1, %w1, #16                 \n"  // 16 bytes per loop
+      "st1         {v0.16b}, [%0], #16           \n"  // store
+      "b.gt        1b                            \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "v0");
+}
+
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "dup         v0.4s, %w2                    \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs        %w1, %w1, #4                  \n"  // 4 ints per loop
+      "st1         {v0.16b}, [%0], #16           \n"  // store
+      "b.gt        1b                            \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "v0");
+}
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1         {v3.16b}, [%3]                \n"  // shuffler
+      "add         %0, %0, %w2, sxtw             \n"
+      "sub         %0, %0, #32                   \n"
+      "1:                                        \n"
+      "ldr         q2, [%0, 16]                  \n"
+      "ldr         q1, [%0], -32                 \n"  // src -= 32
+      "subs        %w2, %w2, #32                 \n"  // 32 pixels per loop.
+      "tbl         v0.16b, {v2.16b}, v3.16b      \n"
+      "tbl         v1.16b, {v1.16b}, v3.16b      \n"
+      "st1         {v0.16b, v1.16b}, [%1], #32   \n"  // store 32 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src),            // %0
+        "+r"(dst),            // %1
+        "+r"(width)           // %2
+      : "r"(&kShuffleMirror)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1         {v4.16b}, [%3]                \n"  // shuffler
+      "add         %0, %0, %w2, sxtw #1          \n"
+      "sub         %0, %0, #32                   \n"
+      "1:                                        \n"
+      "ldr         q1, [%0, 16]                  \n"
+      "ldr         q0, [%0], -32                 \n"  // src -= 32
+      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
+      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
+      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
+      "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
+      "b.gt        1b                            \n"
+      : "+r"(src_uv),           // %0
+        "+r"(dst_uv),           // %1
+        "+r"(width)             // %2
+      : "r"(&kShuffleMirrorUV)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1         {v4.16b}, [%4]                \n"  // shuffler
+      "add         %0, %0, %w3, sxtw #1          \n"
+      "sub         %0, %0, #32                   \n"
+      "1:                                        \n"
+      "ldr         q1, [%0, 16]                  \n"
+      "ldr         q0, [%0], -32                 \n"  // src -= 32
+      "subs        %w3, %w3, #16                 \n"  // 16 pixels per loop.
+      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
+      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
+      "uzp1        v0.16b, v2.16b, v3.16b        \n"  // U
+      "uzp2        v1.16b, v2.16b, v3.16b        \n"  // V
+      "st1         {v0.16b}, [%1], #16           \n"  // dst += 16
+      "st1         {v1.16b}, [%2], #16           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_uv),           // %0
+        "+r"(dst_u),            // %1
+        "+r"(dst_v),            // %2
+        "+r"(width)             // %3
+      : "r"(&kShuffleMirrorUV)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+                                         4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1         {v4.16b}, [%3]                \n"  // shuffler
+      "add         %0, %0, %w2, sxtw #2          \n"
+      "sub         %0, %0, #32                   \n"
+      "1:                                        \n"
+      "ldr         q1, [%0, 16]                  \n"
+      "ldr         q0, [%0], -32                 \n"  // src -= 32
+      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop.
+      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
+      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
+      "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),           // %0
+        "+r"(dst_argb),           // %1
+        "+r"(width)               // %2
+      : "r"(&kShuffleMirrorARGB)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "ld1         {v3.16b}, [%4]                \n"  // shuffler
+      "add         %0, %0, %w2, sxtw #1          \n"  // Start at end of row.
+      "add         %0, %0, %w2, sxtw             \n"
+      "sub         %0, %0, #48                   \n"
+
+      "1:                                        \n"
+      "ld3         {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
+      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
+      "tbl         v0.16b, {v0.16b}, v3.16b      \n"
+      "tbl         v1.16b, {v1.16b}, v3.16b      \n"
+      "tbl         v2.16b, {v2.16b}, v3.16b      \n"
+      "st3         {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb24),      // %0
+        "+r"(dst_rgb24),      // %1
+        "+r"(width)           // %2
+      : "r"((ptrdiff_t)-48),  // %3
+        "r"(&kShuffleMirror)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "movi        v4.8b, #255                   \n"  // Alpha
+      "1:                                        \n"
+      "ld3         {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
+                                                       // RGB24.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "st4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi        v5.8b, #255                   \n"  // Alpha
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "orr         v3.8b, v1.8b, v1.8b           \n"  // move g
+      "orr         v4.8b, v0.8b, v0.8b           \n"  // move r
+      "st4         {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+      "b.gt        1b                            \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  asm volatile(
+      "movi        v0.8b, #255                   \n"  // Alpha
+      "1:                                        \n"
+      "ld3         {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "orr         v2.8b, v4.8b, v4.8b           \n"  // move g
+      "orr         v1.8b, v5.8b, v5.8b           \n"  // move r
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
+      "b.gt        1b                            \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_rgba),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
+      "orr         v3.8b, v1.8b, v1.8b           \n"   // move g
+      "orr         v4.8b, v0.8b, v0.8b           \n"   // move r
+      "st3         {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
+      "b.gt        1b                            \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB                                                        \
+  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
+  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
+  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
+  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
+  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
+  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
+  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
+  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
+
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "movi        v3.8b, #255                   \n"  // Alpha
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB                                                      \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
+                                                                            \
+  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
+  "xtn2       v3.16b, v2.8h                  \n"                            \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
+  "dup        v1.2D, v0.D[1]                 \n"                            \
+  "dup        v3.2D, v2.D[1]                 \n"
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
-    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
-
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_ARGB1555TOARGBROW_NEON
-
-#define ARGB4444TOARGB                                                         \
-    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
-    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
-    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
-    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
-    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
-    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
-    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
-    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
-    "dup        v0.2D, v2.D[1]                 \n"                             \
-    "dup        v1.2D, v3.D[1]                 \n"
-
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_ARGB4444TOARGBROW_NEON
-
-#ifdef HAS_ARGBTORGB24ROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTORGB24ROW_NEON
-
-#ifdef HAS_ARGBTORAWROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
-    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
-    MEMACCESS(1)
-    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTORAWROW_NEON
-
-#ifdef HAS_YUY2TOYROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOYROW_NEON
-
-#ifdef HAS_UYVYTOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-#endif  // HAS_UYVYTOYROW_NEON
-
-#ifdef HAS_YUY2TOUV422ROW_NEON
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOUV422ROW_NEON
-
-#ifdef HAS_UYVYTOUV422ROW_NEON
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-#endif  // HAS_UYVYTOUV422ROW_NEON
-
-#ifdef HAS_YUY2TOUVROW_NEON
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
-    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(src_yuy2b),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
-}
-#endif  // HAS_YUY2TOUVROW_NEON
-
-#ifdef HAS_UYVYTOUVROW_NEON
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
-    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(src_uyvyb),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(pix)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
+#define RGB555TOARGB                                                        \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
+  "dup        v1.2D, v0.D[1]                 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "movi        v3.8b, #255                   \n"  // Alpha
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
+#define ARGB4444TOARGB                                                      \
+  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
+  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
+  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
+  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
+  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
+  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
+  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
+  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
+  "dup        v0.2D, v2.D[1]                 \n"                            \
+  "dup        v1.2D, v3.D[1]                 \n"
+
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
+      "st3         {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
+                                                       // RGB24
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
+      "orr         v4.8b, v2.8b, v2.8b           \n"   // mov g
+      "orr         v5.8b, v1.8b, v1.8b           \n"   // mov b
+      "st3         {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
+      "st1         {v0.16b}, [%1], #16           \n"  // store 16 pixels of Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
+      "st1         {v1.16b}, [%1], #16           \n"  // store 16 pixels of Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
+      "st1         {v1.8b}, [%1], #8             \n"  // store 8 U.
+      "st1         {v3.8b}, [%2], #8             \n"  // store 8 V.
+      "b.gt        1b                            \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 U.
+      "st1         {v2.8b}, [%2], #8             \n"  // store 8 V.
+      "b.gt        1b                            \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd      v1.8b, v1.8b, v5.8b           \n"  // average rows of U
+      "urhadd      v3.8b, v3.8b, v7.8b           \n"  // average rows of V
+      "st1         {v1.8b}, [%2], #8             \n"  // store 8 U.
+      "st1         {v3.8b}, [%3], #8             \n"  // store 8 V.
+      "b.gt        1b                            \n"
+      : "+r"(src_yuy2),   // %0
+        "+r"(src_yuy2b),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+  );
+}
+
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd      v0.8b, v0.8b, v4.8b           \n"  // average rows of U
+      "urhadd      v2.8b, v2.8b, v6.8b           \n"  // average rows of V
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 U.
+      "st1         {v2.8b}, [%3], #8             \n"  // store 8 V.
+      "b.gt        1b                            \n"
+      : "+r"(src_uyvy),   // %0
+        "+r"(src_uyvyb),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
   );
 }
-#endif  // HAS_UYVYTOUVROW_NEON
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
-    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
-    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_NEON
-
-#ifdef HAS_I422TOYUY2ROW_NEON
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
-    "orr        v2.8b, v1.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_I422TOYUY2ROW_NEON
-
-#ifdef HAS_I422TOUYVYROW_NEON
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
-    "orr        v3.8b, v2.8b, v2.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
-}
-#endif  // HAS_I422TOUYVYROW_NEON
-
-#ifdef HAS_ARGBTORGB565ROW_NEON
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTORGB565ROW_NEON
-
-#ifdef HAS_ARGBTORGB565DITHERROW_NEON
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "dup        v1.4s, %w2                     \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v20.8b, v20.8b, v1.8b          \n"
-    "uqadd      v21.8b, v21.8b, v1.8b          \n"
-    "uqadd      v22.8b, v22.8b, v1.8b          \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTORGB565ROW_NEON
-
-#ifdef HAS_ARGBTOARGB1555ROW_NEON
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
-                            int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTOARGB1555ROW_NEON
-
-#ifdef HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
-                            int pix) {
-  asm volatile (
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
-  );
-}
-#endif  // HAS_ARGBTOARGB4444ROW_NEON
-
-#ifdef HAS_ARGBTOYROW_NEON
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGBTOYROW_NEON
-
-#ifdef HAS_ARGBTOYJROW_NEON
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
-    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
-    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "ld1         {v2.16b}, [%3]                \n"  // shuffler
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 4 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #4                  \n"  // 4 processed per loop
+      "tbl         v1.16b, {v0.16b}, v2.16b      \n"  // look up 4 pixels
+      "st1         {v1.16b}, [%1], #16           \n"  // store 4.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v0.8b, v1.8b}, [%0], #16     \n"  // load 16 Ys
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "orr         v2.8b, v1.8b, v1.8b           \n"
+      "ld1         {v1.8b}, [%1], #8             \n"         // load 8 Us
+      "ld1         {v3.8b}, [%2], #8             \n"         // load 8 Vs
+      "subs        %w4, %w4, #16                 \n"         // 16 pixels
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt        1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v1.8b,v2.8b}, [%0], #16      \n"  // load 16 Ys
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "orr         v3.8b, v2.8b, v2.8b           \n"
+      "ld1         {v0.8b}, [%1], #8             \n"         // load 8 Us
+      "ld1         {v2.8b}, [%2], #8             \n"         // load 8 Vs
+      "subs        %w4, %w4, #16                 \n"         // 16 pixels
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt        1b                            \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
+                                                                 // pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels RGB565.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "dup         v1.4s, %w2                    \n"  // dither4
+      "1:                                        \n"
+      "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8
+                                                                 // pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uqadd       v20.8b, v20.8b, v1.8b         \n"
+      "uqadd       v21.8b, v21.8b, v1.8b         \n"
+      "uqadd       v22.8b, v22.8b, v1.8b         \n" ARGBTORGB565
+      "st1         {v0.16b}, [%0], #16           \n"  // store 8 pixels RGB565.
+      "b.gt        1b                            \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
+                            int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
+                                                                 // pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
+                            int width) {
+  asm volatile(
+      "movi        v4.16b, #0x0f                 \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
+                                                                 // pixels
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v3.8h, v0.8b, v4.8b           \n"  // B
+      "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
+      "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
+      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
+      "st1         {v3.16b}, [%1], #16           \n"  // store 16 A's.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
+      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
+      "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v3.8h, v0.8b, v4.8b           \n"  // B
+      "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
+      "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
+      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
+      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
+      "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 RGBA
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v0.8h, v1.8b, v4.8b           \n"  // B
+      "umlal       v0.8h, v2.8b, v5.8b           \n"  // G
+      "umlal       v0.8h, v3.8b, v6.8b           \n"  // R
+      "uqrshrn     v3.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
+      "st1         {v3.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
-#endif  // HAS_ARGBTOYJROW_NEON
 
 // 8x1 pixels.
-#ifdef HAS_ARGBTOUV444ROW_NEON
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
-    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
-    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
-    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
-    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
-    "movi       v29.16b,#0x80                  \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
-    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
-
-    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
-    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
-    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
-    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
-
-    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v24", "v25", "v26", "v27", "v28", "v29"
-  );
-}
-#endif  // HAS_ARGBTOUV444ROW_NEON
-
-// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGBTOUV422ROW_NEON
-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBTOUV422ROW_NEON
-
-// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
-#ifdef HAS_ARGBTOUV411ROW_NEON
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int pix) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
-    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
-    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
-    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(pix)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-#endif  // HAS_ARGBTOUV411ROW_NEON
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
-    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
-    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
-    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
-    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
-    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
-    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
-    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "movi        v24.8b, #112                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "movi        v25.8b, #74                   \n"  // UG -0.5781 coefficient
+      "movi        v26.8b, #38                   \n"  // UR -0.2969 coefficient
+      "movi        v27.8b, #18                   \n"  // VB -0.1406 coefficient
+      "movi        v28.8b, #94                   \n"  // VG -0.7344 coefficient
+      "movi        v29.16b,#0x80                 \n"  // 128.5
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
+      "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
+      "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
+      "add         v4.8h, v4.8h, v29.8h          \n"  // +128 -> unsigned
+
+      "umull       v3.8h, v2.8b, v24.8b          \n"  // R
+      "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
+      "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
+      "add         v3.8h, v3.8h, v29.8h          \n"  // +128 -> unsigned
+
+      "uqshrn      v0.8b, v4.8h, #8              \n"  // 16 bit to 8 bit U
+      "uqshrn      v1.8b, v3.8h, #8              \n"  // 16 bit to 8 bit V
+
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+        "v27", "v28", "v29");
+}
+
+#define RGBTOUV_SETUP_REG                                                  \
+  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
+  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
+  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
+  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
+  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
+  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+// clang-format off
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
+  "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
+  "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
+  "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
+  "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
+  "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
+  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
+  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
+// clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-#ifdef HAS_ARGBTOUVROW_NEON
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
-    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_argb),  // %0
     "+r"(src_argb_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_ARGBTOUVROW_NEON
 
-// TODO(fbarchard): Subsample match C code.
-#ifdef HAS_ARGBTOUVJROW_NEON
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
-    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
-    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
-    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
-    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
-    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
-    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
-    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+  asm volatile (
+      "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
+      "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
+      "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
+      "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
+      "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
+      "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_argb),  // %0
     "+r"(src_argb_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_ARGBTOUVJROW_NEON
 
-#ifdef HAS_BGRATOUVROW_NEON
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
-    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v3.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v3.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_bgra),  // %0
     "+r"(src_bgra_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_BGRATOUVROW_NEON
 
-#ifdef HAS_ABGRTOUVROW_NEON
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
-    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
-    "urshr      v2.8h, v2.8h, #1               \n"
-    "urshr      v1.8h, v1.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v3.8h, #1              \n"  // 2x average
+      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v1.8h, v1.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v2.8h, v1.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_abgr),  // %0
     "+r"(src_abgr_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_ABGRTOUVROW_NEON
 
-#ifdef HAS_RGBATOUVROW_NEON
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
-    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_rgba),  // %0
     "+r"(src_rgba_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_RGBATOUVROW_NEON
 
-#ifdef HAS_RGB24TOUVROW_NEON
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
-    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_rgb24),  // %0
     "+r"(src_rgb24_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_RGB24TOUVROW_NEON
 
-#ifdef HAS_RAWTOUVROW_NEON
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_raw_1 = src_raw + src_stride_raw;
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
     RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
-    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
-    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v0.8h, v0.8h, #1               \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+      "1:                                        \n"
+      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v2.8h, v2.8h, #1              \n"  // 2x average
+      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #1              \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
     RGBTOUV(v2.8h, v1.8h, v0.8h)
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
   : "+r"(src_raw),  // %0
     "+r"(src_raw_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
+    "+r"(width)        // %4
   :
   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
     "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
-#endif  // HAS_RAWTOUVROW_NEON
 
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#ifdef HAS_RGB565TOUVROW_NEON
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
-  asm volatile (
-    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
-    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
-    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
-    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
-    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
-    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    "ins        v16.D[1], v17.D[0]             \n"
-    "ins        v18.D[1], v19.D[0]             \n"
-    "ins        v20.D[1], v21.D[0]             \n"
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v18.8h, #1              \n"
-    "urshr      v6.8h, v20.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
-    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
-    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
-    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
-    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
-    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
-    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
-    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_rgb565_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-    "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_RGB565TOUVROW_NEON
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGB1555TOUVROW_NEON
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_argb1555_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-  );
-}
-#endif  // HAS_ARGB1555TOUVROW_NEON
-
-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
-#ifdef HAS_ARGB4444TOUVROW_NEON
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int pix) {
-  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
-
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
-
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_argb4444_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(pix)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-
-  );
-}
-#endif  // HAS_ARGB4444TOUVROW_NEON
-
-#ifdef HAS_RGB565TOYROW_NEON
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(pix)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
-    "v24", "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_RGB565TOYROW_NEON
-
-#ifdef HAS_ARGB1555TOYROW_NEON
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
-}
-#endif  // HAS_ARGB1555TOYROW_NEON
-
-#ifdef HAS_ARGB4444TOYROW_NEON
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(pix)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
-  );
-}
-#endif  // HAS_ARGB4444TOYROW_NEON
-
-#ifdef HAS_BGRATOYROW_NEON
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_BGRATOYROW_NEON
-
-#ifdef HAS_ABGRTOYROW_NEON
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_ABGRTOYROW_NEON
-
-#ifdef HAS_RGBATOYROW_NEON
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_RGBATOYROW_NEON
-
-#ifdef HAS_RGB24TOYROW_NEON
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),      // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
-}
-#endif  // HAS_RGB24TOYROW_NEON
-
-#ifdef HAS_RAWTOYROW_NEON
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),    // %1
-    "+r"(pix)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+// 16x2 pixels -> 8x1.  width is number of rgb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      RGB565TOARGB
+      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 RGB565 pixels.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      RGB565TOARGB
+      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ins         v16.D[1], v26.D[0]            \n"
+      "ins         v17.D[1], v27.D[0]            \n"
+      "ins         v18.D[1], v28.D[0]            \n"
+
+      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
+      "urshr       v1.8h, v17.8h, #1             \n"
+      "urshr       v2.8h, v18.8h, #1             \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb565),    // %0
+        "+r"(src_rgb565_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28");
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      RGB555TOARGB
+      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB1555 pixels.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      RGB555TOARGB
+      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ins         v16.D[1], v26.D[0]            \n"
+      "ins         v17.D[1], v27.D[0]            \n"
+      "ins         v18.D[1], v28.D[0]            \n"
+
+      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
+      "urshr       v1.8h, v17.8h, #1             \n"
+      "urshr       v2.8h, v18.8h, #1             \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb1555),    // %0
+        "+r"(src_argb1555_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28");
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+  asm volatile(
+      RGBTOUV_SETUP_REG  // sets v20-v25
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      ARGB4444TOARGB
+      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB4444 pixels.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      ARGB4444TOARGB
+      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+
+      "ins         v16.D[1], v26.D[0]            \n"
+      "ins         v17.D[1], v27.D[0]            \n"
+      "ins         v18.D[1], v28.D[0]            \n"
+
+      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
+      "urshr       v1.8h, v17.8h, #1             \n"
+      "urshr       v2.8h, v18.8h, #1             \n"
+
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb4444),    // %0
+        "+r"(src_argb4444_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28"
+
+  );
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
+      "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
+      "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
+      "movi        v27.8b, #16                   \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
+      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
+      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
+      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v27.8b          \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+        "v27");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "umull       v3.8h, v0.8b, v4.8b           \n"  // B
+      "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
+      "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
+      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
+      "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
+      "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
+      "movi        v27.8b, #16                   \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
+      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
+      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
+      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v27.8b          \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v6.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v1.8b, v4.8b          \n"  // R
+      "umlal       v16.8h, v2.8b, v5.8b          \n"  // G
+      "umlal       v16.8h, v3.8b, v6.8b          \n"  // B
+      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v6.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v4.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v0.8b, v4.8b          \n"  // R
+      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
+      "umlal       v16.8h, v2.8b, v6.8b          \n"  // B
+      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v1.8b, v4.8b          \n"  // B
+      "umlal       v16.8h, v2.8b, v5.8b          \n"  // G
+      "umlal       v16.8h, v3.8b, v6.8b          \n"  // R
+      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v0.8b, v4.8b          \n"  // B
+      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
+      "umlal       v16.8h, v2.8b, v6.8b          \n"  // R
+      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi        v6.8b, #25                    \n"  // B * 0.1016 coefficient
+      "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
+      "movi        v4.8b, #66                    \n"  // R * 0.2578 coefficient
+      "movi        v7.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v0.8b, v4.8b          \n"  // B
+      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
+      "umlal       v16.8h, v2.8b, v6.8b          \n"  // R
+      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.8b, v0.8b, v7.8b           \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+  asm volatile(
+      "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
+      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
+      "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v0.8h, v0.8b, v4.8b           \n"  // B
+      "umlal       v0.8h, v1.8b, v5.8b           \n"  // G
+      "umlal       v0.8h, v2.8b, v6.8b           \n"  // R
+      "uqrshrn     v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_yj),     // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+  asm volatile(
+      "movi        v6.8b, #29                    \n"  // B * 0.1140 coefficient
+      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
+      "movi        v4.8b, #77                    \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v0.8h, v0.8b, v4.8b           \n"  // B
+      "umlal       v0.8h, v1.8b, v5.8b           \n"  // G
+      "umlal       v0.8h, v2.8b, v6.8b           \n"  // R
+      "uqrshrn     v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_yj),   // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
-#endif  // HAS_RAWTOYROW_NEON
 
 // Bilinear filter 16x2 -> 16x1
-#ifdef HAS_INTERPOLATEROW_NEON
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  asm volatile (
-    "cmp        %w4, #0                        \n"
-    "b.eq       100f                           \n"
-    "cmp        %w4, #64                       \n"
-    "b.eq       75f                            \n"
-    "cmp        %w4, #128                      \n"
-    "b.eq       50f                            \n"
-    "cmp        %w4, #192                      \n"
-    "b.eq       25f                            \n"
-
-    "dup        v5.16b, %w4                    \n"
-    "dup        v4.16b, %w5                    \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "umull      v2.8h, v0.8b,  v4.8b           \n"
-    "umull2     v3.8h, v0.16b, v4.16b          \n"
-    "umlal      v2.8h, v1.8b,  v5.8b           \n"
-    "umlal2     v3.8h, v1.16b, v5.16b          \n"
-    "rshrn      v0.8b,  v2.8h, #8              \n"
-    "rshrn2     v0.16b, v3.8h, #8              \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       1b                             \n"
-    "b          99f                            \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       25b                            \n"
-    "b          99f                            \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       50b                            \n"
-    "b          99f                            \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v0.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       75b                            \n"
-    "b          99f                            \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       100b                           \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_ptr1),         // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction),      // %4
-    "+r"(y0_fraction)       // %5
-  :
-  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
-  );
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  asm volatile(
+      "cmp         %w4, #0                       \n"
+      "b.eq        100f                          \n"
+      "cmp         %w4, #128                     \n"
+      "b.eq        50f                           \n"
+
+      "dup         v5.16b, %w4                   \n"
+      "dup         v4.16b, %w5                   \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"
+      "umull       v2.8h, v0.8b,  v4.8b          \n"
+      "umull2      v3.8h, v0.16b, v4.16b         \n"
+      "umlal       v2.8h, v1.8b,  v5.8b          \n"
+      "umlal2      v3.8h, v1.16b, v5.16b         \n"
+      "rshrn       v0.8b,  v2.8h, #8             \n"
+      "rshrn2      v0.16b, v3.8h, #8             \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        1b                            \n"
+      "b           99f                           \n"
+
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        50b                           \n"
+      "b           99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        100b                          \n"
+
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "+r"(src_ptr1),     // %2
+        "+r"(dst_width),    // %3
+        "+r"(y1_fraction),  // %4
+        "+r"(y0_fraction)   // %5
+      :
+      : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
 }
-#endif  // HAS_INTERPOLATEROW_NEON
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-#ifdef HAS_ARGBBLENDROW_NEON
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %w3, %w3, #8                   \n"
-    "b.lt       89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.ge       8b                             \n"
-
-  "89:                                         \n"
-    "adds       %w3, %w3, #8-1                 \n"
-    "b.lt       99f                            \n"
-
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
-    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
-    "b.ge       1b                             \n"
-
-  "99:                                         \n"
-
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18"
-  );
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs        %w3, %w3, #8                  \n"
+      "b.lt        89f                           \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
+      "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
+      "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
+      "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8
+      "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8
+      "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8
+      "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
+      "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
+      "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
+      "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
+      "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
+      "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
+      "movi        v3.8b, #255                   \n"  // a = 255
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+                                                             // pixels
+      "b.ge        8b                            \n"
+
+      "89:                                       \n"
+      "adds        %w3, %w3, #8-1                \n"
+      "b.lt        99f                           \n"
+
+      // Blend 1 pixels.
+      "1:                                        \n"
+      "ld4         {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel
+                                                           // ARGB0.
+      "ld4         {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel
+                                                           // ARGB1.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #1                  \n"  // 1 processed per loop.
+      "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
+      "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
+      "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
+      "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8
+      "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8
+      "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8
+      "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
+      "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
+      "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
+      "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
+      "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
+      "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
+      "movi        v3.8b, #255                   \n"  // a = 255
+      "st4         {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+      "b.ge        1b                            \n"
+
+      "99:                                       \n"
+
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18");
 }
-#endif  // HAS_ARGBBLENDROW_NEON
 
 // Attenuate 8 pixels at a time.
-#ifdef HAS_ARGBATTENUATEROW_NEON
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
-    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
-    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
-    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
-    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
-    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v4.8h, v0.8b, v3.8b           \n"  // b * a
+      "umull       v5.8h, v1.8b, v3.8b           \n"  // g * a
+      "umull       v6.8h, v2.8b, v3.8b           \n"  // r * a
+      "uqrshrn     v0.8b, v4.8h, #8              \n"  // b >>= 8
+      "uqrshrn     v1.8b, v5.8h, #8              \n"  // g >>= 8
+      "uqrshrn     v2.8b, v6.8h, #8              \n"  // r >>= 8
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
-#endif  // HAS_ARGBATTENUATEROW_NEON
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-#ifdef HAS_ARGBQUANTIZEROW_NEON
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "dup        v4.8h, %w2                     \n"
-    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
-    "dup        v5.8h, %w3                     \n"  // interval multiply.
-    "dup        v6.8h, %w4                     \n"  // interval add
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
-    "uxtl       v1.8h, v1.8b                   \n"
-    "uxtl       v2.8h, v2.8b                   \n"
-    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
-    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
-    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
-    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
-    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
-    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
-    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
-    "add        v1.8h, v1.8h, v6.8h            \n"  // g
-    "add        v2.8h, v2.8h, v6.8h            \n"  // r
-    "uqxtn      v0.8b, v0.8h                   \n"
-    "uqxtn      v1.8b, v1.8h                   \n"
-    "uqxtn      v2.8b, v2.8h                   \n"
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "dup         v4.8h, %w2                    \n"
+      "ushr        v4.8h, v4.8h, #1              \n"  // scale >>= 1
+      "dup         v5.8h, %w3                    \n"  // interval multiply.
+      "dup         v6.8h, %w4                    \n"  // interval add
+
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8  ARGB.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w1, %w1, #8                  \n"  // 8 processed per loop.
+      "uxtl        v0.8h, v0.8b                  \n"  // b (0 .. 255)
+      "uxtl        v1.8h, v1.8b                  \n"
+      "uxtl        v2.8h, v2.8b                  \n"
+      "sqdmulh     v0.8h, v0.8h, v4.8h           \n"  // b * scale
+      "sqdmulh     v1.8h, v1.8h, v4.8h           \n"  // g
+      "sqdmulh     v2.8h, v2.8h, v4.8h           \n"  // r
+      "mul         v0.8h, v0.8h, v5.8h           \n"  // b * interval_size
+      "mul         v1.8h, v1.8h, v5.8h           \n"  // g
+      "mul         v2.8h, v2.8h, v5.8h           \n"  // r
+      "add         v0.8h, v0.8h, v6.8h           \n"  // b + interval_offset
+      "add         v1.8h, v1.8h, v6.8h           \n"  // g
+      "add         v2.8h, v2.8h, v6.8h           \n"  // r
+      "uqxtn       v0.8b, v0.8h                  \n"
+      "uqxtn       v1.8b, v1.8h                  \n"
+      "uqxtn       v2.8b, v2.8h                  \n"
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
-#endif  // HAS_ARGBQUANTIZEROW_NEON
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-#ifdef HAS_ARGBSHADEROW_NEON
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
-    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
-    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
-
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
-    "uxtl       v5.8h, v5.8b                   \n"
-    "uxtl       v6.8h, v6.8b                   \n"
-    "uxtl       v7.8h, v7.8b                   \n"
-    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
-    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
-    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
-    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
-    "uqxtn      v4.8b, v4.8h                   \n"
-    "uqxtn      v5.8b, v5.8h                   \n"
-    "uqxtn      v6.8b, v6.8h                   \n"
-    "uqxtn      v7.8b, v7.8h                   \n"
-    MEMACCESS(1)
-    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
-  );
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "dup         v0.4s, %w3                    \n"  // duplicate scale value.
+      "zip1        v0.8b, v0.8b, v0.8b           \n"  // v0.8b aarrggbb.
+      "ushr        v0.8h, v0.8h, #1              \n"  // scale / 2.
+
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "uxtl        v4.8h, v4.8b                  \n"  // b (0 .. 255)
+      "uxtl        v5.8h, v5.8b                  \n"
+      "uxtl        v6.8h, v6.8b                  \n"
+      "uxtl        v7.8h, v7.8b                  \n"
+      "sqrdmulh    v4.8h, v4.8h, v0.h[0]         \n"  // b * scale * 2
+      "sqrdmulh    v5.8h, v5.8h, v0.h[1]         \n"  // g
+      "sqrdmulh    v6.8h, v6.8h, v0.h[2]         \n"  // r
+      "sqrdmulh    v7.8h, v7.8h, v0.h[3]         \n"  // a
+      "uqxtn       v4.8b, v4.8h                  \n"
+      "uqxtn       v5.8b, v5.8h                  \n"
+      "uqxtn       v6.8b, v6.8h                  \n"
+      "uqxtn       v7.8b, v7.8h                  \n"
+      "st4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
 }
-#endif  // HAS_ARGBSHADEROW_NEON
 
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-#ifdef HAS_ARGBGRAYROW_NEON
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
-    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
-    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
-    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
-    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
-  );
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi        v24.8b, #29                   \n"  // B * 0.1140 coefficient
+      "movi        v25.8b, #150                  \n"  // G * 0.5870 coefficient
+      "movi        v26.8b, #77                   \n"  // R * 0.2990 coefficient
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
+      "umlal       v4.8h, v1.8b, v25.8b          \n"  // G
+      "umlal       v4.8h, v2.8b, v26.8b          \n"  // R
+      "uqrshrn     v0.8b, v4.8h, #8              \n"  // 16 bit to 8 bit B
+      "orr         v1.8b, v0.8b, v0.8b           \n"  // G
+      "orr         v2.8b, v0.8b, v0.8b           \n"  // R
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
 }
-#endif  // HAS_ARGBGRAYROW_NEON
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 //    b = (r * 35 + g * 68 + b * 17) >> 7
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 
-#ifdef HAS_ARGBSEPIAROW_NEON
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v20.8b, #17                    \n"  // BB coefficient
-    "movi       v21.8b, #68                    \n"  // BG coefficient
-    "movi       v22.8b, #35                    \n"  // BR coefficient
-    "movi       v24.8b, #22                    \n"  // GB coefficient
-    "movi       v25.8b, #88                    \n"  // GG coefficient
-    "movi       v26.8b, #45                    \n"  // GR coefficient
-    "movi       v28.8b, #24                    \n"  // BB coefficient
-    "movi       v29.8b, #98                    \n"  // BG coefficient
-    "movi       v30.8b, #50                    \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
-    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
-    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
-    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
-    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
-    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
-    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
-    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
-    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
-    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi        v20.8b, #17                   \n"  // BB coefficient
+      "movi        v21.8b, #68                   \n"  // BG coefficient
+      "movi        v22.8b, #35                   \n"  // BR coefficient
+      "movi        v24.8b, #22                   \n"  // GB coefficient
+      "movi        v25.8b, #88                   \n"  // GG coefficient
+      "movi        v26.8b, #45                   \n"  // GR coefficient
+      "movi        v28.8b, #24                   \n"  // BB coefficient
+      "movi        v29.8b, #98                   \n"  // BG coefficient
+      "movi        v30.8b, #50                   \n"  // BR coefficient
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w1, %w1, #8                  \n"  // 8 processed per loop.
+      "umull       v4.8h, v0.8b, v20.8b          \n"  // B to Sepia B
+      "umlal       v4.8h, v1.8b, v21.8b          \n"  // G
+      "umlal       v4.8h, v2.8b, v22.8b          \n"  // R
+      "umull       v5.8h, v0.8b, v24.8b          \n"  // B to Sepia G
+      "umlal       v5.8h, v1.8b, v25.8b          \n"  // G
+      "umlal       v5.8h, v2.8b, v26.8b          \n"  // R
+      "umull       v6.8h, v0.8b, v28.8b          \n"  // B to Sepia R
+      "umlal       v6.8h, v1.8b, v29.8b          \n"  // G
+      "umlal       v6.8h, v2.8b, v30.8b          \n"  // R
+      "uqshrn      v0.8b, v4.8h, #7              \n"  // 16 bit to 8 bit B
+      "uqshrn      v1.8b, v5.8h, #7              \n"  // 16 bit to 8 bit G
+      "uqshrn      v2.8b, v6.8h, #7              \n"  // 16 bit to 8 bit R
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+      "b.gt        1b                            \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
 }
-#endif  // HAS_ARGBSEPIAROW_NEON
 
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-#ifdef HAS_ARGBCOLORMATRIXROW_NEON
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
-    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
-    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
-
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
-    "uxtl       v17.8h, v17.8b                 \n"  // g
-    "uxtl       v18.8h, v18.8b                 \n"  // r
-    "uxtl       v19.8h, v19.8b                 \n"  // a
-    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
-    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
-    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
-    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
-    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
-    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
-    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
-    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
-    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
-    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
-    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
-    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
-    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
-    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
-    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
-    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
-    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v22", "v23", "v24", "v25"
-  );
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "ld1         {v2.16b}, [%3]                \n"  // load 3 ARGB vectors.
+      "sxtl        v0.8h, v2.8b                  \n"  // B,G coefficients s16.
+      "sxtl2       v1.8h, v2.16b                 \n"  // R,A coefficients s16.
+
+      "1:                                        \n"
+      "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "uxtl        v16.8h, v16.8b                \n"  // b (0 .. 255) 16 bit
+      "uxtl        v17.8h, v17.8b                \n"  // g
+      "uxtl        v18.8h, v18.8b                \n"  // r
+      "uxtl        v19.8h, v19.8b                \n"  // a
+      "mul         v22.8h, v16.8h, v0.h[0]       \n"  // B = B * Matrix B
+      "mul         v23.8h, v16.8h, v0.h[4]       \n"  // G = B * Matrix G
+      "mul         v24.8h, v16.8h, v1.h[0]       \n"  // R = B * Matrix R
+      "mul         v25.8h, v16.8h, v1.h[4]       \n"  // A = B * Matrix A
+      "mul         v4.8h, v17.8h, v0.h[1]        \n"  // B += G * Matrix B
+      "mul         v5.8h, v17.8h, v0.h[5]        \n"  // G += G * Matrix G
+      "mul         v6.8h, v17.8h, v1.h[1]        \n"  // R += G * Matrix R
+      "mul         v7.8h, v17.8h, v1.h[5]        \n"  // A += G * Matrix A
+      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
+      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
+      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
+      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
+      "mul         v4.8h, v18.8h, v0.h[2]        \n"  // B += R * Matrix B
+      "mul         v5.8h, v18.8h, v0.h[6]        \n"  // G += R * Matrix G
+      "mul         v6.8h, v18.8h, v1.h[2]        \n"  // R += R * Matrix R
+      "mul         v7.8h, v18.8h, v1.h[6]        \n"  // A += R * Matrix A
+      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
+      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
+      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
+      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
+      "mul         v4.8h, v19.8h, v0.h[3]        \n"  // B += A * Matrix B
+      "mul         v5.8h, v19.8h, v0.h[7]        \n"  // G += A * Matrix G
+      "mul         v6.8h, v19.8h, v1.h[3]        \n"  // R += A * Matrix R
+      "mul         v7.8h, v19.8h, v1.h[7]        \n"  // A += A * Matrix A
+      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
+      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
+      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
+      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
+      "sqshrun     v16.8b, v22.8h, #6            \n"  // 16 bit to 8 bit B
+      "sqshrun     v17.8b, v23.8h, #6            \n"  // 16 bit to 8 bit G
+      "sqshrun     v18.8b, v24.8h, #6            \n"  // 16 bit to 8 bit R
+      "sqshrun     v19.8b, v25.8h, #6            \n"  // 16 bit to 8 bit A
+      "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
 }
-#endif  // HAS_ARGBCOLORMATRIXROW_NEON
 
 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
-    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
-    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
-    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
-    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
-    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
-    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
-    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "umull       v0.8h, v0.8b, v4.8b           \n"  // multiply B
+      "umull       v1.8h, v1.8b, v5.8b           \n"  // multiply G
+      "umull       v2.8h, v2.8b, v6.8b           \n"  // multiply R
+      "umull       v3.8h, v3.8b, v7.8b           \n"  // multiply A
+      "rshrn       v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit B
+      "rshrn       v1.8b, v1.8h, #8              \n"  // 16 bit to 8 bit G
+      "rshrn       v2.8b, v2.8h, #8              \n"  // 16 bit to 8 bit R
+      "rshrn       v3.8b, v3.8h, #8              \n"  // 16 bit to 8 bit A
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
-#endif  // HAS_ARGBMULTIPLYROW_NEON
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-#ifdef HAS_ARGBADDROW_NEON
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"
-    "uqadd      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uqadd       v0.8b, v0.8b, v4.8b           \n"
+      "uqadd       v1.8b, v1.8b, v5.8b           \n"
+      "uqadd       v2.8b, v2.8b, v6.8b           \n"
+      "uqadd       v3.8b, v3.8b, v7.8b           \n"
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
-#endif  // HAS_ARGBADDROW_NEON
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqsub      v0.8b, v0.8b, v4.8b            \n"
-    "uqsub      v1.8b, v1.8b, v5.8b            \n"
-    "uqsub      v2.8b, v2.8b, v6.8b            \n"
-    "uqsub      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uqsub       v0.8b, v0.8b, v4.8b           \n"
+      "uqsub       v1.8b, v1.8b, v5.8b           \n"
+      "uqsub       v2.8b, v2.8b, v6.8b           \n"
+      "uqsub       v3.8b, v3.8b, v7.8b           \n"
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
-#endif  // HAS_ARGBSUBTRACTROW_NEON
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
 // A = 255
 // R = Sobel
 // G = Sobel
 // B = Sobel
-#ifdef HAS_SOBELROW_NEON
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
-    "orr        v1.8b, v0.8b, v0.8b            \n"
-    "orr        v2.8b, v0.8b, v0.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "movi        v3.8b, #255                   \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1         {v0.8b}, [%0], #8             \n"  // load 8 sobelx.
+      "ld1         {v1.8b}, [%1], #8             \n"  // load 8 sobely.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uqadd       v0.8b, v0.8b, v1.8b           \n"  // add
+      "orr         v1.8b, v0.8b, v0.8b           \n"
+      "orr         v2.8b, v0.8b, v0.8b           \n"
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
-#endif  // HAS_SOBELROW_NEON
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-#ifdef HAS_SOBELTOPLANEROW_NEON
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 sobelx.
+      "ld1         {v1.16b}, [%1], #16           \n"  // load 16 sobely.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
+      "uqadd       v0.16b, v0.16b, v1.16b        \n"  // add
+      "st1         {v0.16b}, [%2], #16           \n"  // store 16 pixels.
+      "b.gt        1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1");
 }
-#endif  // HAS_SOBELTOPLANEROW_NEON
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
 // A = 255
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-#ifdef HAS_SOBELXYROW_NEON
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "movi        v3.8b, #255                   \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1         {v2.8b}, [%0], #8             \n"  // load 8 sobelx.
+      "ld1         {v0.8b}, [%1], #8             \n"  // load 8 sobely.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uqadd       v1.8b, v0.8b, v2.8b           \n"  // add
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt        1b                            \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
-#endif  // HAS_SOBELXYROW_NEON
 
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
-#ifdef HAS_SOBELXROW_NEON
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%5               \n"  // top
-    MEMACCESS(0)
-    "ld1        {v1.8b}, [%0],%6               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%6               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2],%6               \n"
-    "subs       %w4, %w4, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(3)
-    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2LL),          // %5
-    "r"(6LL)           // %6
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.8b}, [%0],%5              \n"  // top
+      "ld1         {v1.8b}, [%0],%6              \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "usubl       v0.8h, v0.8b, v1.8b           \n"
+      "ld1         {v2.8b}, [%1],%5              \n"  // center * 2
+      "ld1         {v3.8b}, [%1],%6              \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "usubl       v1.8h, v2.8b, v3.8b           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "ld1         {v2.8b}, [%2],%5              \n"  // bottom
+      "ld1         {v3.8b}, [%2],%6              \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "subs        %w4, %w4, #8                  \n"  // 8 pixels
+      "usubl       v1.8h, v2.8b, v3.8b           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "abs         v0.8h, v0.8h                  \n"
+      "uqxtn       v0.8b, v0.8h                  \n"
+      "st1         {v0.8b}, [%3], #8             \n"  // store 8 sobelx
+      "b.gt        1b                            \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(src_y2),                           // %2
+        "+r"(dst_sobelx),                       // %3
+        "+r"(width)                             // %4
+      : "r"(2LL),                               // %5
+        "r"(6LL)                                // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
-#endif  // HAS_SOBELXROW_NEON
 
 // SobelY as a matrix is
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-#ifdef HAS_SOBELYROW_NEON
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%4               \n"  // left
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1],%4               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%4               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%5               \n"  // right
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%5               \n"
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1LL),          // %4
-    "r"(6LL)           // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.8b}, [%0],%4              \n"  // left
+      "ld1         {v1.8b}, [%1],%4              \n"
+      "usubl       v0.8h, v0.8b, v1.8b           \n"
+      "ld1         {v2.8b}, [%0],%4              \n"  // center * 2
+      "ld1         {v3.8b}, [%1],%4              \n"
+      "usubl       v1.8h, v2.8b, v3.8b           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "ld1         {v2.8b}, [%0],%5              \n"  // right
+      "ld1         {v3.8b}, [%1],%5              \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 pixels
+      "usubl       v1.8h, v2.8b, v3.8b           \n"
+      "add         v0.8h, v0.8h, v1.8h           \n"
+      "abs         v0.8h, v0.8h                  \n"
+      "uqxtn       v0.8b, v0.8h                  \n"
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 sobely
+      "b.gt        1b                            \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(dst_sobely),                       // %2
+        "+r"(width)                             // %3
+      : "r"(1LL),                               // %4
+        "r"(6LL)                                // %5
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Caveat - rounds float to half float whereas scaling version truncates.
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
+      "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
+      "uxtl2       v3.4s, v1.8h                  \n"
+      "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
+      "scvtf       v3.4s, v3.4s                  \n"
+      "fcvtn       v1.4h, v2.4s                  \n"  // 8 half floats
+      "fcvtn2      v1.8h, v3.4s                  \n"
+      "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
+      "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
+      "uxtl2       v3.4s, v1.8h                  \n"
+      "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
+      "scvtf       v3.4s, v3.4s                  \n"
+      "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // adjust exponent
+      "fmul        v3.4s, v3.4s, %3.s[0]         \n"
+      "uqshrn      v1.4h, v2.4s, #13             \n"  // isolate halffloat
+      "uqshrn2     v1.8h, v3.4s, #13             \n"
+      "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
+      "b.gt        1b                            \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.8b}, [%0], #8             \n"  // load 8 bytes
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
+      "uxtl        v1.8h, v1.8b                  \n"  // 8 shorts
+      "uxtl        v2.4s, v1.4h                  \n"  // 8 ints
+      "uxtl2       v3.4s, v1.8h                  \n"
+      "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
+      "scvtf       v3.4s, v3.4s                  \n"
+      "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
+      "fmul        v3.4s, v3.4s, %3.s[0]         \n"
+      "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // store 8 floats
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fmax;
+  asm volatile(
+      "movi        v5.4s, #0                     \n"  // max
+      "movi        v6.4s, #0                     \n"
+
+      "1:                                        \n"
+      "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
+      "fmul        v4.4s, v2.4s, %4.s[0]         \n"  // scale
+      "fmax        v5.4s, v5.4s, v1.4s           \n"  // max
+      "fmax        v6.4s, v6.4s, v2.4s           \n"
+      "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      "fmax        v5.4s, v5.4s, v6.4s           \n"  // max
+      "fmaxv       %s3, v5.4s                    \n"  // signed max acculator
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fmax)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fsum;
+  asm volatile(
+      "movi        v5.4s, #0                     \n"  // max
+      "movi        v6.4s, #0                     \n"  // max
+
+      "1:                                        \n"
+      "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
+      "fmul        v4.4s, v2.4s, %4.s[0]         \n"
+      "fmla        v5.4s, v1.4s, v1.4s           \n"  // sum of squares
+      "fmla        v6.4s, v2.4s, v2.4s           \n"
+      "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      "faddp       v5.4s, v5.4s, v6.4s           \n"
+      "faddp       v5.4s, v5.4s, v5.4s           \n"
+      "faddp       %3.4s, v5.4s, v5.4s           \n"  // sum
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fsum)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "fmul        v1.4s, v1.4s, %3.s[0]         \n"  // scale
+      "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
+      "st1         {v1.4s, v2.4s}, [%1], #32     \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+                   const uint16_t* src1,
+                   const uint16_t* src2,
+                   const uint16_t* src3,
+                   const uint16_t* src4,
+                   uint32_t* dst,
+                   int width) {
+  asm volatile(
+      "movi        v6.8h, #4                     \n"  // constant 4
+      "movi        v7.8h, #6                     \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1         {v1.8h}, [%0], #16            \n"  // load 8 samples, 5 rows
+      "ld1         {v2.8h}, [%4], #16            \n"
+      "uaddl       v0.4s, v1.4h, v2.4h           \n"  // * 1
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddl2      v1.4s, v1.8h, v2.8h           \n"  // * 1
+      "ld1         {v2.8h}, [%1], #16            \n"
+      "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
+      "ld1         {v2.8h}, [%2], #16            \n"
+      "umlal       v0.4s, v2.4h, v7.4h           \n"  // * 6
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "umlal2      v1.4s, v2.8h, v7.8h           \n"  // * 6
+      "ld1         {v2.8h}, [%3], #16            \n"
+      "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
+      "prfm        pldl1keep, [%3, 448]          \n"
+      "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
+      "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
+      "st1         {v0.4s,v1.4s}, [%5], #32      \n"  // store 8 samples
+      "prfm        pldl1keep, [%4, 448]          \n"
+      "b.gt        1b                            \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+  const uint32_t* src1 = src + 1;
+  const uint32_t* src2 = src + 2;
+  const uint32_t* src3 = src + 3;
+  asm volatile(
+      "movi        v6.4s, #4                     \n"  // constant 4
+      "movi        v7.4s, #6                     \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1         {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // load 12 source samples
+      "add         v0.4s, v0.4s, v1.4s           \n"  // * 1
+      "add         v1.4s, v1.4s, v2.4s           \n"  // * 1
+      "ld1         {v2.4s,v3.4s}, [%2], #32      \n"
+      "mla         v0.4s, v2.4s, v7.4s           \n"  // * 6
+      "mla         v1.4s, v3.4s, v7.4s           \n"  // * 6
+      "ld1         {v2.4s,v3.4s}, [%1], #32      \n"
+      "ld1         {v4.4s,v5.4s}, [%3], #32      \n"
+      "add         v2.4s, v2.4s, v4.4s           \n"  // add rows for * 4
+      "add         v3.4s, v3.4s, v5.4s           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "mla         v0.4s, v2.4s, v6.4s           \n"  // * 4
+      "mla         v1.4s, v3.4s, v6.4s           \n"  // * 4
+      "subs        %w5, %w5, #8                  \n"  // 8 processed per loop
+      "uqrshrn     v0.4h, v0.4s, #8              \n"  // round and pack
+      "uqrshrn2    v0.8h, v1.4s, #8              \n"
+      "st1         {v0.8h}, [%4], #16            \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(width)  // %5
+      : "r"(32LL)    // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width) {
+  asm volatile(
+      "ld2r        {v6.4s, v7.4s}, [%7]          \n"  // constants 4 and 6
+
+      "1:                                        \n"
+      "ld1         {v0.4s, v1.4s}, [%0], #32     \n"  // load 8 samples, 5 rows
+      "ld1         {v2.4s, v3.4s}, [%1], #32     \n"
+      "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
+      "ld1         {v4.4s, v5.4s}, [%2], #32     \n"
+      "fmla        v1.4s, v3.4s, v6.4s           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
+      "ld1         {v2.4s, v3.4s}, [%3], #32     \n"
+      "fmla        v1.4s, v5.4s, v7.4s           \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
+      "ld1         {v4.4s, v5.4s}, [%4], #32     \n"
+      "fmla        v1.4s, v3.4s, v6.4s           \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "fadd        v0.4s, v0.4s, v4.4s           \n"  // * 1
+      "prfm        pldl1keep, [%3, 448]          \n"
+      "fadd        v1.4s, v1.4s, v5.4s           \n"
+      "prfm        pldl1keep, [%4, 448]          \n"
+      "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
+      "st1         {v0.4s, v1.4s}, [%5], #32     \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      : "+r"(src0),               // %0
+        "+r"(src1),               // %1
+        "+r"(src2),               // %2
+        "+r"(src3),               // %3
+        "+r"(src4),               // %4
+        "+r"(dst),                // %5
+        "+r"(width)               // %6
+      : "r"(&kGaussCoefficients)  // %7
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+  asm volatile(
+      "ld3r        {v6.4s, v7.4s, v8.4s}, [%3]   \n"  // constants 4, 6, 1/256
+
+      "1:                                        \n"
+      "ld1         {v0.4s, v1.4s, v2.4s}, [%0], %4 \n"  // load 12 samples, 5
+                                                        // rows
+      "fadd        v0.4s, v0.4s, v1.4s           \n"    // * 1
+      "ld1         {v4.4s, v5.4s}, [%0], %5      \n"
+      "fadd        v1.4s, v1.4s, v2.4s           \n"
+      "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
+      "ld1         {v2.4s, v3.4s}, [%0], %4      \n"
+      "fmla        v1.4s, v5.4s, v7.4s           \n"
+      "ld1         {v4.4s, v5.4s}, [%0], %6      \n"
+      "fadd        v2.4s, v2.4s, v4.4s           \n"
+      "fadd        v3.4s, v3.4s, v5.4s           \n"
+      "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
+      "fmla        v1.4s, v3.4s, v6.4s           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fmul        v0.4s, v0.4s, v8.4s           \n"  // / 256
+      "fmul        v1.4s, v1.4s, v8.4s           \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "st1         {v0.4s, v1.4s}, [%1], #32     \n"  // store 8 samples
+      "b.gt        1b                            \n"
+      : "+r"(src),                 // %0
+        "+r"(dst),                 // %1
+        "+r"(width)                // %2
+      : "r"(&kGaussCoefficients),  // %3
+        "r"(8LL),                  // %4
+        "r"(-4LL),                 // %5
+        "r"(20LL)                  // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v2.16b}, [%0], #16           \n"  // load 16 Y values
+      "ld2         {v0.8b, v1.8b}, [%1], #16     \n"  // load 8 VU values
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "zip1        v0.16b, v0.16b, v0.16b        \n"      // replicate V values
+      "zip1        v1.16b, v1.16b, v1.16b        \n"      // replicate U values
+      "subs        %w3, %w3, #16                 \n"      // 16 pixels per loop
+      "st3         {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_yuv24),  // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width) {
+  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+  asm volatile(
+
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "uqrshrn     v3.8b, v0.8h, #2              \n"  // 2x2 average
+      "uqrshrn     v2.8b, v1.8h, #2              \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
+      "st2         {v2.8b,v3.8b}, [%2], #16      \n"  // store 8 pixels UV.
+      "b.gt        1b                            \n"
+      : "+r"(src_ayuv),    // %0
+        "+r"(src_ayuv_1),  // %1
+        "+r"(dst_uv),      // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+  asm volatile(
+
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "uqrshrn     v0.8b, v0.8h, #2              \n"  // 2x2 average
+      "uqrshrn     v1.8b, v1.8h, #2              \n"
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
+      "st2         {v0.8b,v1.8b}, [%2], #16      \n"  // store 8 pixels VU.
+      "b.gt        1b                            \n"
+      : "+r"(src_ayuv),    // %0
+        "+r"(src_ayuv_1),  // %1
+        "+r"(dst_vu),      // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
+      "st1         {v2.16b}, [%1], #16           \n"  // store 16 Y pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_ayuv),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
+                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+      "ld1         {v2.16b}, [%3]                \n"  // shuffler
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], 16            \n"  // load 16 UV values
+      "ld1         {v1.16b}, [%0], 16            \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
+      "tbl         v0.16b, {v0.16b}, v2.16b      \n"
+      "tbl         v1.16b, {v1.16b}, v2.16b      \n"
+      "stp         q0, q1, [%1], 32              \n"  // store 16 VU pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_uv),         // %0
+        "+r"(dst_vu),         // %1
+        "+r"(width)           // %2
+      : "r"(&kShuffleSwapUV)  // %3
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 U values
+      "ld1         {v1.16b}, [%2], #16           \n"  // load 16 V values
+      "ld1         {v2.16b}, [%1], #16           \n"
+      "ld1         {v3.16b}, [%3], #16           \n"
+      "uaddlp      v0.8h, v0.16b                 \n"  // half size
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v1.8h, v1.16b                 \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "uadalp      v0.8h, v2.16b                 \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v1.8h, v3.16b                 \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+      "uqrshrn     v0.8b, v0.8h, #2              \n"
+      "uqrshrn     v1.8b, v1.8h, #2              \n"
+      "subs        %w5, %w5, #16                 \n"  // 16 src pixels per loop
+      "st2         {v0.8b, v1.8b}, [%4], #16     \n"  // store 8 UV pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
-#endif  // HAS_SOBELYROW_NEON
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/row_win.cc b/media/libaom/src/third_party/libyuv/source/row_win.cc
index 71be268b47..9afcf060a4 100644
--- a/media/libaom/src/third_party/libyuv/source/row_win.cc
+++ b/media/libaom/src/third_party/libyuv/source/row_win.cc
@@ -10,8 +10,11 @@
 
 #include "libyuv/row.h"
 
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
-    defined(_MSC_VER) && !defined(__clang__)
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+#if defined(_M_X64)
 #include <emmintrin.h>
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
 #endif
@@ -21,314 +24,245 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
-    defined(_MSC_VER) && !defined(__clang__)
-
-struct YuvConstants {
-  lvec8 kUVToB;     // 0
-  lvec8 kUVToG;     // 32
-  lvec8 kUVToR;     // 64
-  lvec16 kUVBiasB;  // 96
-  lvec16 kUVBiasG;  // 128
-  lvec16 kUVBiasR;  // 160
-  lvec16 kYToRgb;   // 192
-};
-
-// BT.601 YUV to RGB reference
-//  R = (Y - 16) * 1.164              - V * -1.596
-//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
-//  B = (Y - 16) * 1.164 - U * -2.018
-
-// Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
-
-// BT601 constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+// 64 bit
+#if defined(_M_X64)
 
-// BT601 constants for NV21 where chroma plane is VU instead of UV.
-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422                                        \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422                                       \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;                                             \
+  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
+  a_buf += 8;
 
-#undef YG
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef BB
-#undef BG
-#undef BR
-
-// JPEG YUV to RGB reference
-// *  R = Y                - V * -1.40200
-// *  G = Y - U *  0.34414 - V *  0.71414
-// *  B = Y - U * -1.77200
-
-// Y contribution to R,G,B.  Scale and bias.
-// TODO(fbarchard): Consider moving constants into a common header.
-#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGBJ 32  /* 64 / 2 */
-
-// U and V contributions to R,G,B.
-#define UBJ -113 /* round(-1.77200 * 64) */
-#define UGJ 22 /* round(0.34414 * 64) */
-#define VGJ 46 /* round(0.71414  * 64) */
-#define VRJ -90 /* round(-1.40200 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BBJ (UBJ * 128             + YGBJ)
-#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
-#define BRJ             (VRJ * 128 + YGBJ)
-
-// JPEG constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
-  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
-    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
-  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
-    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
-  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
-    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
-  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
-    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
-  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
-    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
-  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
-    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
-  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
-    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
-};
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(yuvconstants)                                     \
+  xmm1 = _mm_loadu_si128(&xmm0);                                   \
+  xmm2 = _mm_loadu_si128(&xmm0);                                   \
+  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
+  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
+  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
+  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
+  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
+  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
+  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
+  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
+  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
+  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
+  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
+  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
+  xmm2 = _mm_packus_epi16(xmm2, xmm2);
 
-#undef YGJ
-#undef YGBJ
-#undef UBJ
-#undef UGJ
-#undef VGJ
-#undef VRJ
-#undef BBJ
-#undef BGJ
-#undef BRJ
+// Store 8 ARGB values.
+#define STOREARGB                                    \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
+  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
+  xmm1 = _mm_loadu_si128(&xmm0);                     \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
+  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
+  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
+  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+  dst_argb += 32;
 
-// 64 bit
-#if defined(_M_X64)
 #if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
                          int width) {
-  __m128i xmm0, xmm1, xmm2, xmm3;
+  __m128i xmm0, xmm1, xmm2, xmm4;
   const __m128i xmm5 = _mm_set1_epi8(-1);
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
+  while (width > 0) {
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    width -= 8;
+  }
+}
+#endif
 
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {
+  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
   while (width > 0) {
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
-    xmm1 = _mm_loadu_si128(&xmm0);
-    xmm2 = _mm_loadu_si128(&xmm0);
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
-    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
-    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
-    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
-    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
-    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
-    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
-    xmm0 = _mm_adds_epi16(xmm0, xmm3);
-    xmm1 = _mm_adds_epi16(xmm1, xmm3);
-    xmm2 = _mm_adds_epi16(xmm2, xmm3);
-    xmm0 = _mm_srai_epi16(xmm0, 6);
-    xmm1 = _mm_srai_epi16(xmm1, 6);
-    xmm2 = _mm_srai_epi16(xmm2, 6);
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
-    xmm1 = _mm_loadu_si128(&xmm0);
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
-
-    y_buf += 8;
-    u_buf += 4;
-    dst_argb += 32;
+    READYUVA422
+    YUVTORGB(yuvconstants)
+    STOREARGB
     width -= 8;
   }
 }
 #endif
+
 // 32 bit
 #else  // defined(_M_X64)
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPeg full range.
-static const vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 
-static const vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static const vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
 static const vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
-static const vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
 
 // Constants for BGRA.
-static const vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static const vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static const vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR.
-static const vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static const vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static const vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static const vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static const vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static const vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static const uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static const uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGB24.  First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24.  Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
 static const uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW_0 = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src_y
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
     pslld      xmm5, 24
 
   convertloop:
@@ -351,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
 
 #ifdef HAS_J400TOARGBROW_AVX2
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
-  __asm {
-    mov         eax, [esp + 4]        // src_y
-    mov         edx, [esp + 8]        // dst_argb
-    mov         ecx, [esp + 12]       // pix
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
+  __asm {
+    mov         eax, [esp + 4]  // src_y
+    mov         edx, [esp + 8]  // dst_argb
+    mov         ecx, [esp + 12]  // width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld      ymm5, ymm5, 24
 
   convertloop:
@@ -381,15 +316,16 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_rgb24
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    mov       eax, [esp + 4]  // src_rgb24
+    mov       edx, [esp + 8]  // dst_argb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
-    movdqa    xmm4, kShuffleMaskRGB24ToARGB
+    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
 
  convertloop:
     movdqu    xmm0, [eax]
@@ -397,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -419,16 +355,16 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   }
 }
 
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int pix) {
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_argb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
-    movdqa    xmm4, kShuffleMaskRAWToARGB
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
 
  convertloop:
     movdqu    xmm0, [eax]
@@ -436,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -458,6 +394,35 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
   }
 }
 
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                                           uint8_t* dst_rgb24,
+                                           int width) {
+  __asm {
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_rgb24
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + 4]
+    movdqu    xmm2, [eax + 8]
+    lea       eax, [eax + 24]
+    pshufb    xmm0, xmm3
+    pshufb    xmm1, xmm4
+    pshufb    xmm2, xmm5
+    movq      qword ptr [edx], xmm0
+    movq      qword ptr [edx + 8], xmm1
+    movq      qword ptr [edx + 16], xmm2
+    lea       edx, [edx + 24]
+    sub       ecx, 8
+    jg        convertloop
+    ret
+  }
+}
+
 // pmul method to replicate bits.
 // Math to replicate bits:
 // (v << 8) | (v << 3)
@@ -465,9 +430,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int pix) {
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -475,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
     psllw     xmm4, 10
     psrlw     xmm4, 5
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       eax, [esp + 4]  // src_rgb565
+    mov       edx, [esp + 8]  // dst_argb
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    pand      xmm1, xmm3    // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pand      xmm1, xmm3  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    pand      xmm0, xmm4    // G in middle 6 bits
-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
-    por       xmm0, xmm7    // AG
+    por       xmm1, xmm2  // RB
+    pand      xmm0, xmm4  // G in middle 6 bits
+    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
+    por       xmm0, xmm7  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -521,42 +486,42 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                          int pix) {
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
     vbroadcastss ymm5, xmm5
     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
-    movd       xmm6, eax
+    vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
     vpsllw     ymm4, ymm4, 10
     vpsrlw     ymm4, ymm4, 5
-    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax, [esp + 4]   // src_rgb565
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // pix
+    mov        eax, [esp + 4]  // src_rgb565
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
     sub        edx, eax
     sub        edx, eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
-    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
-    vpor       ymm0, ymm0, ymm7    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -572,43 +537,43 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
 #endif  // HAS_RGB565TOARGBROW_AVX2
 
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) {
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
     vbroadcastss ymm5, xmm5
     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
-    movd       xmm6, eax
+    vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
-    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax,  [esp + 4]   // src_argb1555
-    mov        edx,  [esp + 8]   // dst_argb
-    mov        ecx,  [esp + 12]  // pix
+    mov        eax,  [esp + 4]  // src_argb1555
+    mov        edx,  [esp + 8]  // dst_argb
+    mov        ecx,  [esp + 12]  // width
     sub        edx,  eax
     sub        edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
-    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
     vpand      ymm1, ymm1, ymm3
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpsraw     ymm2, ymm0, 8       // A
-    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpsraw     ymm2, ymm0, 8  // A
+    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
     vpand      ymm2, ymm2, ymm7
-    vpor       ymm0, ymm0, ymm2    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm0, ymm0, ymm2  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -624,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
 #endif  // HAS_ARGB1555TOARGBROW_AVX2
 
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) {
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
     vmovd     xmm4, eax
     vbroadcastss ymm4, xmm4
-    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
-    mov       eax,  [esp + 4]   // src_argb4444
-    mov       edx,  [esp + 8]   // dst_argb
-    mov       ecx,  [esp + 12]  // pix
+    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]  // src_argb4444
+    mov       edx,  [esp + 8]  // dst_argb
+    mov       ecx,  [esp + 12]  // width
     sub       edx,  eax
     sub       edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
-    vpand      ymm2, ymm0, ymm5    // mask high nibbles
-    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5  // mask high nibbles
+    vpand      ymm0, ymm0, ymm4  // mask low nibbles
     vpsrlw     ymm3, ymm2, 4
     vpsllw     ymm1, ymm0, 4
     vpor       ymm2, ymm2, ymm3
     vpor       ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm2, ymm2, 0xd8
     vpunpckhbw ymm1, ymm0, ymm2
     vpunpcklbw ymm0, ymm0, ymm2
@@ -662,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
 #endif  // HAS_ARGB4444TOARGBROW_AVX2
 
 // 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int pix) {
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -672,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
     psrlw     xmm4, 6
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       eax, [esp + 4]  // src_argb1555
+    mov       edx, [esp + 8]  // dst_argb
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    psllw     xmm1, 1       // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
+    psllw     xmm1, 1  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
     pand      xmm1, xmm3
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
+    por       xmm1, xmm2  // RB
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // G in middle 5 bits
-    psraw     xmm2, 8       // A
-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm0, xmm4  // G in middle 5 bits
+    psraw     xmm2, 8  // A
+    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
     pand      xmm2, xmm7
-    por       xmm0, xmm2    // AG
+    por       xmm0, xmm2  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -715,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
 }
 
 // 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int pix) {
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
-    mov       eax, [esp + 4]   // src_argb4444
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
+    mov       eax, [esp + 4]  // src_argb4444
+    mov       edx, [esp + 8]  // dst_argb
+    mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // mask low nibbles
-    pand      xmm2, xmm5    // mask high nibbles
+    pand      xmm0, xmm4  // mask low nibbles
+    pand      xmm2, xmm5  // mask high nibbles
     movdqa    xmm1, xmm0
     movdqa    xmm3, xmm2
     psllw     xmm1, 4
@@ -753,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
   }
 }
 
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm6, kShuffleMaskARGBToRGB24
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -791,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+                                          uint8_t* dst_rgb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    movdqa    xmm6, kShuffleMaskARGBToRAW
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -829,34 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-// 4 pixels
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -867,42 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-// 8 pixels
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int pix) {
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
 
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    movd      xmm6, [esp + 12] // dither4
-    mov       ecx, [esp + 16]  // pix
-    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    movd      xmm6, [esp + 12]  // dither4
+    mov       ecx, [esp + 16]  // width
+    punpcklbw xmm6, xmm6  // make dither 16 bytes
     movdqa    xmm7, xmm6
     punpcklwd xmm6, xmm6
     punpckhwd xmm7, xmm7
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    paddusb   xmm0, xmm6    // add dither
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6  // add dither
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -914,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
 }
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int pix) {
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     vbroadcastss xmm6, [esp + 12]  // dither4
-    mov        ecx, [esp + 16]     // pix
-    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    mov        ecx, [esp + 16]  // width
+    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
     vpermq     ymm6, ymm6, 0xd8
     vpunpcklwd ymm6, ymm6, ymm6
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpaddusb   ymm0, ymm0, ymm6    // add dither
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6  // add dither
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -957,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
     psrld     xmm4, 27
-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    movdqa    xmm5, xmm4  // generate mask 0x000003e0
     pslld     xmm5, 5
-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    movdqa    xmm6, xmm4  // generate mask 0x00007c00
     pslld     xmm6, 10
-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
     pslld     xmm7, 15
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    movdqa    xmm3, xmm0    // R
-    psrad     xmm0, 16      // A
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 6       // G
-    psrld     xmm3, 9       // R
-    pand      xmm0, xmm7    // A
-    pand      xmm1, xmm4    // B
-    pand      xmm2, xmm5    // G
-    pand      xmm3, xmm6    // R
-    por       xmm0, xmm1    // BA
-    por       xmm2, xmm3    // GR
-    por       xmm0, xmm2    // BGRA
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    movdqa    xmm3, xmm0  // R
+    psrad     xmm0, 16  // A
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 6  // G
+    psrld     xmm3, 9  // R
+    pand      xmm0, xmm7  // A
+    pand      xmm1, xmm4  // B
+    pand      xmm2, xmm5  // G
+    pand      xmm3, xmm6  // R
+    por       xmm0, xmm1  // BA
+    por       xmm2, xmm3  // GR
+    por       xmm0, xmm2  // BGRA
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
@@ -998,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
     psllw     xmm4, 12
-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
-    pand      xmm0, xmm3    // low nibble
-    pand      xmm1, xmm4    // high nibble
+    pand      xmm0, xmm3  // low nibble
+    pand      xmm1, xmm4  // high nibble
     psrld     xmm0, 4
     psrld     xmm1, 8
     por       xmm0, xmm1
@@ -1028,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 
 #ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // pix
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1065,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTORGB565ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // pix
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
-    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
-    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
-    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
     vpslld     ymm7, ymm7, 15
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm3, ymm0, 9       // R
-    vpsrld     ymm2, ymm0, 6       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrad     ymm0, ymm0, 16      // A
-    vpand      ymm3, ymm3, ymm6    // R
-    vpand      ymm2, ymm2, ymm5    // G
-    vpand      ymm1, ymm1, ymm4    // B
-    vpand      ymm0, ymm0, ymm7    // A
-    vpor       ymm0, ymm0, ymm1    // BA
-    vpor       ymm2, ymm2, ymm3    // GR
-    vpor       ymm0, ymm0, ymm2    // BGRA
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9  // R
+    vpsrld     ymm2, ymm0, 6  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrad     ymm0, ymm0, 16  // A
+    vpand      ymm3, ymm3, ymm6  // R
+    vpand      ymm2, ymm2, ymm5  // G
+    vpand      ymm1, ymm1, ymm4  // B
+    vpand      ymm0, ymm0, ymm7  // A
+    vpor       ymm0, ymm0, ymm1  // BA
+    vpor       ymm2, ymm2, ymm3  // GR
+    vpor       ymm0, ymm0, ymm2  // BGRA
     vpackssdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1105,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_rgb
-    mov        ecx, [esp + 12]  // pix
-    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
     vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpand      ymm1, ymm0, ymm4    // high nibble
-    vpand      ymm0, ymm0, ymm3    // low nibble
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4  // high nibble
+    vpand      ymm0, ymm0, ymm3  // low nibble
     vpsrld     ymm1, ymm1, 8
     vpsrld     ymm0, ymm0, 4
     vpor       ymm0, ymm0, ymm1
     vpackuswb  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1136,14 +1109,15 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kARGBToY
-    movdqa     xmm5, kAddY16
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kARGBToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
  convertloop:
     movdqu     xmm0, [eax]
@@ -1171,14 +1145,15 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_y,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kARGBToYJ
-    movdqa     xmm5, kAddYJ64
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kARGBToYJ
+    movdqa     xmm5, xmmword ptr kAddYJ64
 
  convertloop:
     movdqu     xmm0, [eax]
@@ -1207,20 +1182,19 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    vbroadcastf128 ymm4, kARGBToY
-    vbroadcastf128 ymm5, kAddY16
-    vmovdqu    ymm6, kPermdARGBToY_AVX
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToY
+    vbroadcastf128 ymm5, xmmword ptr kAddY16
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
 
  convertloop:
     vmovdqu    ymm0, [eax]
@@ -1251,15 +1225,16 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    vbroadcastf128 ymm4, kARGBToYJ
-    vbroadcastf128 ymm5, kAddYJ64
-    vmovdqu    ymm6, kPermdARGBToY_AVX
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
+    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
 
  convertloop:
     vmovdqu    ymm0, [eax]
@@ -1290,14 +1265,15 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kBGRAToY
-    movdqa     xmm5, kAddY16
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kBGRAToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
  convertloop:
     movdqu     xmm0, [eax]
@@ -1323,14 +1299,15 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kABGRToY
-    movdqa     xmm5, kAddY16
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kABGRToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
  convertloop:
     movdqu     xmm0, [eax]
@@ -1356,14 +1333,15 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kRGBAToY
-    movdqa     xmm5, kAddY16
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kRGBAToY
+    movdqa     xmm5, xmmword ptr kAddY16
 
  convertloop:
     movdqu     xmm0, [eax]
@@ -1389,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm7, kARGBToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1430,9 +1410,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1444,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1459,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                                          int src_stride_argb,
+                                          uint8_t* dst_u,
+                                          uint8_t* dst_v,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm5, kAddUVJ128
-    movdqa     xmm6, kARGBToVJ
-    movdqa     xmm7, kARGBToUJ
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUVJ128
+    movdqa     xmm6, xmmword ptr kARGBToVJ
+    movdqa     xmm7, xmmword ptr kARGBToUJ
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1500,9 +1482,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1511,15 +1493,15 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     pmaddubsw  xmm3, xmm6
     phaddw     xmm0, xmm2
     phaddw     xmm1, xmm3
-    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
     paddw      xmm1, xmm5
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1531,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                                        int src_stride_argb,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    vbroadcastf128 ymm5, kAddUV128
-    vbroadcastf128 ymm6, kARGBToV
-    vbroadcastf128 ymm7, kARGBToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm6, xmmword ptr kARGBToV
+    vbroadcastf128 ymm7, xmmword ptr kARGBToU
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1565,9 +1549,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vshufps    ymm2, ymm2, ymm3, 0xdd
     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, its 16 pixels of U and 16 of V
     vpmaddubsw ymm1, ymm0, ymm7  // U
     vpmaddubsw ymm3, ymm2, ymm7
     vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1578,12 +1562,12 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vpsraw     ymm0, ymm0, 8
     vpacksswb  ymm0, ymm1, ymm0  // mutates
     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
-    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1596,23 +1580,93 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
+    push       esi
     push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
+    sub        edi, edx   // stride from u to v
+
+ convertloop:
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88
+    vshufps    ymm0, ymm0, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
+    vpaddw     ymm0, ymm0, ymm5
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
+
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVJROW_AVX2
+
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+                                            uint8_t* dst_u,
+                                            uint8_t* dst_v,
+                                            int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        edx, [esp + 4 + 8]  // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm7, kARGBToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 4 + 16]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
+    sub        edi, edx    // stride from u to v
 
  convertloop:
-    /* convert to U and V */
-    movdqu     xmm0, [eax]          // U
+        /* convert to U and V */
+    movdqu     xmm0, [eax]  // U
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1628,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
     paddb      xmm0, xmm5
     movdqu     [edx], xmm0
 
-    movdqu     xmm0, [eax]          // V
+    movdqu     xmm0, [eax]  // V
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1653,82 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
   }
 }
 
-__declspec(naked)
-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm7, kARGBToU
-    sub        edi, edx             // stride from u to v
-
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kBGRAToV
-    movdqa     xmm7, kBGRAToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kBGRAToV
+    movdqa     xmm7, xmmword ptr kBGRAToU
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1752,9 +1750,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1766,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1781,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kABGRToV
-    movdqa     xmm7, kABGRToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kABGRToV
+    movdqa     xmm7, xmmword ptr kABGRToU
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1822,9 +1822,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1836,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1851,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm5, kAddUV128
-    movdqa     xmm6, kRGBAToV
-    movdqa     xmm7, kRGBAToU
-    sub        edi, edx             // stride from u to v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kRGBAToV
+    movdqa     xmm7, xmmword ptr kRGBAToU
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1892,9 +1894,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1906,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1923,115 +1925,174 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2 __asm {                                                \
-    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
-    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
+#define READYUV444_AVX2 \
+  __asm {                                                \
+    __asm vmovdqu    xmm0, [esi] /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-  }
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm {                                                \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
+#define READYUV422_AVX2 \
+  __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-  }
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm {                                                \
-    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]}
+
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
+#define READYUVA422_AVX2 \
+  __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
-  }
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]                                           \
+    __asm vmovdqu    xmm5, [ebp] /* A */                      \
+    __asm vpermq     ymm5, ymm5, 0xd8                                          \
+    __asm lea        ebp, [ebp + 16]}
 
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV12_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-  }
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]}
+
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm lea        eax, [eax + 16]}
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
+    __asm lea        eax, [eax + 32]}
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
+    __asm lea        eax, [eax + 32]}
 
 // Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
-    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
-    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
-    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
-    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
+#define YUVTORGB_AVX2(YuvConstants) \
+  __asm {                                    \
+    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
     __asm vpsubw     ymm2, ymm3, ymm2                                          \
-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
     __asm vpsubw     ymm1, ymm3, ymm1                                          \
-    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
-    __asm vpsubw     ymm0, ymm3, ymm0                                          \
-    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
-    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
-    __asm lea        eax, [eax + 16]                                           \
-    __asm vpermq     ymm3, ymm3, 0xd8                                          \
-    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
-    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
-    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
+    __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
+    __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
     __asm vpsraw     ymm0, ymm0, 6                                             \
     __asm vpsraw     ymm1, ymm1, 6                                             \
     __asm vpsraw     ymm2, ymm2, 6                                             \
-    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
-    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
-    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+    __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
   }
 
 // Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm {                                                 \
-    /* Step 3: Weave into ARGB */                                              \
-    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+#define STOREARGB_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
-    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
     __asm vmovdqu    0[edx], ymm1                                              \
     __asm vmovdqu    32[edx], ymm0                                             \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
+
+// Store 16 RGBA values.
+#define STORERGBA_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
+    __asm vpermq     ymm1, ymm1, 0xd8                                          \
+    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
+    __asm vmovdqu    [edx], ymm0                                               \
+    __asm vmovdqu    [edx + 32], ymm1                                          \
+    __asm lea        edx,  [edx + 64]}
 
 #ifdef HAS_I422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+__declspec(naked) void I422ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(ebx)
     STOREARGB_AVX2
 
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     vzeroupper
@@ -2040,70 +2101,80 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422TOARGBROW_AVX2
 
-#ifdef HAS_J422TOARGBROW_AVX2
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void J422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
  convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvJConstants)
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(ebx)
     STOREARGB_AVX2
 
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebp
+    pop        ebx
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
-#endif  // HAS_J422TOARGBROW_AVX2
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
 
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+__declspec(naked) void I444ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
  convertloop:
     READYUV444_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(ebx)
     STOREARGB_AVX2
 
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     vzeroupper
@@ -2112,66 +2183,34 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV411_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #ifdef HAS_NV12TOARGBROW_AVX2
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        int width) {
+__declspec(naked) void NV12ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV12_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(ebx)
     STOREARGB_AVX2
 
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        esi
     vzeroupper
     ret
@@ -2181,28 +2220,32 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
 
 #ifdef HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
-// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        int width) {
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void NV21ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
-    READNV12_AVX2
-    YUVTORGB_AVX2(kYvuConstants)
+    READNV21_AVX2
+    YUVTORGB_AVX2(ebx)
     STOREARGB_AVX2
 
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        esi
     vzeroupper
     ret
@@ -2210,365 +2253,332 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_NV21TOARGBROW_AVX2
 
-#ifdef HAS_I422TOBGRAROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked)
-void I422ToBGRARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    READYUY2_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into BGRA
-    vpunpcklbw ymm1, ymm1, ymm0           // GB
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpcklbw ymm2, ymm5, ymm2           // AR
-    vpermq     ymm2, ymm2, 0xd8
-    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
-    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm2
-    lea        edx,  [edx + 64]
     sub        ecx, 16
     jg         convertloop
 
-    pop        edi
-    pop        esi
+    pop        ebx
     vzeroupper
     ret
   }
 }
-#endif  // HAS_I422TOBGRAROW_AVX2
+#endif  // HAS_YUY2TOARGBROW_AVX2
 
-#ifdef HAS_I422TORGBAROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked) void UYVYToARGBRow_AVX2(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    READUYVY_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
 
-    // Step 3: Weave into RGBA
-    vpunpcklbw ymm1, ymm1, ymm2           // GR
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpcklbw ymm2, ymm5, ymm0           // AB
-    vpermq     ymm2, ymm2, 0xd8
-    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
-    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm1
-    lea        edx,  [edx + 64]
     sub        ecx, 16
     jg         convertloop
 
-    pop        edi
-    pop        esi
+    pop        ebx
     vzeroupper
     ret
   }
 }
-#endif  // HAS_I422TORGBAROW_AVX2
+#endif  // HAS_UYVYTOARGBROW_AVX2
 
-#ifdef HAS_I422TOABGRROW_AVX2
+#ifdef HAS_I422TORGBAROW_AVX2
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
-// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
-__declspec(naked)
-void I422ToABGRRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+__declspec(naked) void I422ToRGBARow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // abgr
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
+    YUVTORGB_AVX2(ebx)
+    STORERGBA_AVX2
 
-    // Step 3: Weave into ABGR
-    vpunpcklbw ymm1, ymm2, ymm1           // RG
-    vpermq     ymm1, ymm1, 0xd8
-    vpunpcklbw ymm2, ymm0, ymm5           // BA
-    vpermq     ymm2, ymm2, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
-    vmovdqu    [edx], ymm0
-    vmovdqu    [edx + 32], ymm1
-    lea        edx,  [edx + 64]
     sub        ecx, 16
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     vzeroupper
     ret
   }
 }
-#endif  // HAS_I422TOABGRROW_AVX2
+#endif  // HAS_I422TORGBAROW_AVX2
 
 #if defined(HAS_I422TOARGBROW_SSSE3)
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
 
 // Read 8 UV from 444.
-#define READYUV444 __asm {                                                     \
-    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
-    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
+#define READYUV444 \
+  __asm {                                                     \
+    __asm movq       xmm0, qword ptr [esi] /* U */                             \
+    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-  }
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUV422 \
+  __asm {                                                     \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-  }
-
-// Read 2 UV from 411, upsample to 8 UV.
-#define READYUV411 __asm {                                                     \
-    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
-    __asm movd       xmm0, ebx                                                 \
-    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
-    __asm movd       xmm1, ebx                                                 \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
-  }
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]}
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
+#define READYUVA422 \
+  __asm {                                                    \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax] /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm movq       xmm5, qword ptr [ebp] /* A */                           \
+    __asm lea        ebp, [ebp + 8]}
 
 // Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
-    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
+#define READNV12 \
+  __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-  }
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]}
+
+// Read 4 VU from NV21, upsample to 8 UV.
+#define READNV21 \
+  __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm lea        eax, [eax + 8]}
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+#define READYUY2 \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* YUY2 */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
+    __asm lea        eax, [eax + 16]}
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+#define READUYVY \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* UYVY */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
+    __asm lea        eax, [eax + 16]}
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm {                                         \
-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
+#define YUVTORGB(YuvConstants) \
+  __asm {                                         \
     __asm movdqa     xmm1, xmm0                                                \
     __asm movdqa     xmm2, xmm0                                                \
     __asm movdqa     xmm3, xmm0                                                \
-    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
-    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
+    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
+    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
     __asm psubw      xmm0, xmm1                                                \
-    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
-    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
+    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
+    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
     __asm psubw      xmm1, xmm2                                                \
-    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
-    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
+    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
+    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
     __asm psubw      xmm2, xmm3                                                \
-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm3                                                \
-    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
+    __asm paddsw     xmm0, xmm4 /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4 /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4 /* R += Y */                         \
     __asm psraw      xmm0, 6                                                   \
     __asm psraw      xmm1, 6                                                   \
     __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+    __asm packuswb   xmm0, xmm0 /* B */                              \
+    __asm packuswb   xmm1, xmm1 /* G */                              \
+    __asm packuswb   xmm2, xmm2 /* R */             \
   }
 
 // Store 8 ARGB values.
-#define STOREARGB __asm {                                                      \
-    /* Step 3: Weave into ARGB */                                              \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+#define STOREARGB \
+  __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5 /* RA */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm0                                              \
     __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 BGRA values.
-#define STOREBGRA __asm {                                                      \
-    /* Step 3: Weave into BGRA */                                              \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
-    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+#define STOREBGRA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0 /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2 /* AR */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
-
-// Store 8 ABGR values.
-#define STOREABGR __asm {                                                      \
-    /* Step 3: Weave into ABGR */                                              \
-    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
-    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
-    __asm movdqa     xmm1, xmm2                                                \
-    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
-    __asm movdqu     0[edx], xmm2                                              \
-    __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGBA values.
-#define STORERGBA __asm {                                                      \
-    /* Step 3: Weave into RGBA */                                              \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
-    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+#define STORERGBA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2 /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0 /* AB */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGB24 values.
-#define STORERGB24 __asm {                                                     \
-    /* Step 3: Weave into RRGB */                                              \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB24 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* Step 4: RRGB -> RGB24 */                                                \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
-
-// Store 8 RAW values.
-#define STORERAW __asm {                                                       \
-    /* Step 3: Weave into RRGB */                                              \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
-    __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* Step 4: RRGB -> RAW */                                                  \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]}
 
 // Store 8 RGB565 values.
-#define STORERGB565 __asm {                                                    \
-    /* Step 3: Weave into RRGB */                                              \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB565 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* Step 4: RRGB -> RGB565 */                                               \
-    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
-    __asm movdqa     xmm2, xmm0    /* G */                                     \
-    __asm pslld      xmm0, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm0, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm0, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm0, xmm3    /* BGR */                                   \
-    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
-    __asm movdqa     xmm2, xmm1    /* G */                                     \
-    __asm pslld      xmm1, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm1, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm1, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0 /* G */                                     \
+    __asm pslld      xmm0, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm0, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm0, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm0, xmm3 /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1 /* G */                                     \
+    __asm pslld      xmm1, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm1, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm1, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm1, xmm3 /* BGR */                                   \
     __asm packssdw   xmm0, xmm1                                                \
-    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
-    __asm lea        edx, [edx + 16]                                           \
-  }
+    __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]}
 
 // 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+__declspec(naked) void I444ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV444
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
@@ -2577,66 +2587,36 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* dst_rgb24,
-                          int width) {
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb24
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
-    movdqa     xmm6, kShuffleMaskARGBToRGB24
+    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
  convertloop:
     READYUV422
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(ebx)
     STORERGB24
 
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
-__declspec(naked)
-void I422ToRAWRow_SSSE3(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_raw,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // raw
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    movdqa     xmm5, kShuffleMaskARGBToRAW_0
-    movdqa     xmm6, kShuffleMaskARGBToRAW
-
- convertloop:
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STORERAW
-
-    sub        ecx, 8
-    jg         convertloop
-
+    pop        ebx
     pop        edi
     pop        esi
     ret
@@ -2645,37 +2625,41 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb565_buf,
-                           int width) {
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* rgb565_buf,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgb565
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
     psrld      xmm5, 27
-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
     psrld      xmm6, 26
     pslld      xmm6, 5
-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
     pslld      xmm7, 11
 
  convertloop:
     READYUV422
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(ebx)
     STORERGB565
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
@@ -2684,31 +2668,35 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+__declspec(naked) void I422ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV422
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
@@ -2716,33 +2704,39 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 }
 
 // 8 pixels.
-// JPeg color space version of I422ToARGB
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void J422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    push       ebp
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
 
  convertloop:
-    READYUV422
-    YUVTORGB(kYuvJConstants)
+    READYUVA422
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebp
+    pop        ebx
     pop        edi
     pop        esi
     ret
@@ -2750,225 +2744,196 @@ void J422ToARGBRow_SSSE3(const uint8* y_buf,
 }
 
 // 8 pixels.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
-    push       ebx
     push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // argb
-    mov        ecx, [esp + 12 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
-    READYUV411  // modifies EBX
-    YUVTORGB(kYuvConstants)
+    READNV12
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
-    pop        esi
     pop        ebx
+    pop        esi
     ret
   }
 }
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         int width) {
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
-    READNV12
-    YUVTORGB(kYuvConstants)
+    READNV21
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        esi
     ret
   }
 }
 
 // 8 pixels.
-// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         int width) {
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
+    push       ebx
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
-    READNV12
-    YUVTORGB(kYvuConstants)
+    READYUY2
+    YUVTORGB(ebx)
     STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked)
-void I422ToBGRARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_bgra,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // bgra
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-
- convertloop:
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREBGRA
-
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
+    pop        ebx
     ret
   }
 }
 
-__declspec(naked)
-void I422ToABGRRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_abgr,
-                         int width) {
+// 8 pixels.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // abgr
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    push       ebx
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREABGR
+    READUYVY
+    YUVTORGB(ebx)
+    STOREARGB
 
     sub        ecx, 8
     jg         convertloop
 
-    pop        edi
-    pop        esi
+    pop        ebx
     ret
   }
 }
 
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_rgba,
-                         int width) {
+__declspec(naked) void I422ToRGBARow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgba,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgba
-    mov        ecx, [esp + 8 + 20]  // width
+    push       ebx
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
 
  convertloop:
     READYUV422
-    YUVTORGB(kYuvConstants)
+    YUVTORGB(ebx)
     STORERGBA
 
     sub        ecx, 8
     jg         convertloop
 
+    pop        ebx
     pop        edi
     pop        esi
     ret
   }
 }
-
 #endif  // HAS_I422TOARGBROW_SSSE3
 
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
 #ifdef HAS_I400TOARGBROW_SSE2
 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          const struct YuvConstants*,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     movd       xmm2, eax
     pshufd     xmm2, xmm2,0
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     movd       xmm3, eax
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm0           // Y.Y
+    punpcklbw  xmm0, xmm0  // Y.Y
     pmulhuw    xmm0, xmm2
     psubusw    xmm0, xmm3
     psrlw      xmm0, 6
-    packuswb   xmm0, xmm0           // G
+    packuswb   xmm0, xmm0        // G
 
-    // Step 2: Weave into ARGB
-    punpcklbw  xmm0, xmm0           // GG
+        // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0  // GG
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
     movdqu     [edx], xmm0
@@ -2984,41 +2949,41 @@ void I400ToARGBRow_SSE2(const uint8* y_buf,
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          const struct YuvConstants*,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     vmovd      xmm2, eax
     vbroadcastss ymm2, xmm2
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     vmovd      xmm3, eax
     vbroadcastss ymm3, xmm3
-    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
     vpslld     ymm4, ymm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
     vmovdqu    xmm0, [eax]
     lea        eax, [eax + 16]
-    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
-    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
     vpmulhuw   ymm0, ymm0, ymm2
     vpsubusw   ymm0, ymm0, ymm3
     vpsrlw     ymm0, ymm0, 6
-    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+    vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
 
-    // TODO(fbarchard): Weave alpha with unpack.
-    // Step 2: Weave into ARGB
-    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+        // TODO(fbarchard): Weave alpha with unpack.
+        // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
     vpermq     ymm1, ymm1, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
     vpor       ymm0, ymm0, ymm4
     vpor       ymm1, ymm1, ymm4
     vmovdqu    [edx], ymm0
@@ -3034,18 +2999,18 @@ void I400ToARGBRow_AVX2(const uint8* y_buf,
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
 // TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+                                       uint8_t* dst,
+                                       int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, kShuffleMirror
+    movdqa    xmm5, xmmword ptr kShuffleMirror
 
  convertloop:
     movdqu    xmm0, [eax - 16 + ecx]
@@ -3060,13 +3025,14 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+                                      uint8_t* dst,
+                                      int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
-    vbroadcastf128 ymm5, kShuffleMirror
+    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
 
  convertloop:
     vmovdqu   ymm0, [eax - 32 + ecx]
@@ -3082,48 +3048,22 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
 
-#ifdef HAS_MIRRORROW_SSE2
-__declspec(naked)
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
-    mov       ecx, [esp + 12]  // width
-
- convertloop:
-    movdqu    xmm0, [eax - 16 + ecx]
-    movdqa    xmm1, xmm0        // swap bytes
-    psllw     xmm0, 8
-    psrlw     xmm1, 8
-    por       xmm0, xmm1
-    pshuflw   xmm0, xmm0, 0x1b  // swap words
-    pshufhw   xmm0, xmm0, 0x1b
-    pshufd    xmm0, xmm0, 0x4e  // swap qwords
-    movdqu    [edx], xmm0
-    lea       edx, [edx + 16]
-    sub       ecx, 16
-    jg        convertloop
-    ret
-  }
-}
-#endif  // HAS_MIRRORROW_SSE2
-
-#ifdef HAS_MIRRORROW_UV_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+                                              uint8_t* dst_u,
+                                              uint8_t* dst_v,
+                                              int width) {
   __asm {
     push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       eax, [esp + 4 + 4]  // src
+    mov       edx, [esp + 4 + 8]  // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm1, kShuffleMirrorUV
+    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
     lea       eax, [eax + ecx * 2 - 16]
     sub       edi, edx
 
@@ -3141,14 +3081,15 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
     ret
   }
 }
-#endif  // HAS_MIRRORROW_UV_SSSE3
+#endif  // HAS_MIRRORSPLITUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 
@@ -3167,17 +3108,16 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
-    vmovdqu   ymm5, kARGBShuffleMirror_AVX2
+    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
 
  convertloop:
     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
@@ -3192,15 +3132,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3210,10 +3152,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
+    pand       xmm0, xmm5  // even bytes
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm2, 8  // odd bytes
     psrlw      xmm3, 8
     packuswb   xmm2, xmm3
     movdqu     [edx], xmm0
@@ -3230,15 +3172,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3246,9 +3190,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm2, ymm0, 8  // odd bytes
     vpsrlw     ymm3, ymm1, 8
-    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm0, ymm0, ymm5  // even bytes
     vpand      ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpackuswb  ymm2, ymm2, ymm3
@@ -3268,24 +3212,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm0, [eax]  // read 16 U's
     movdqu     xmm1, [eax + edx]  // and 16 V's
     lea        eax,  [eax + 16]
     movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    punpcklbw  xmm0, xmm1  // first 8 UV pairs
+    punpckhbw  xmm2, xmm1  // next 8 UV pairs
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm2
     lea        edi, [edi + 32]
@@ -3299,24 +3244,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    vmovdqu    ymm0, [eax]           // read 32 U's
-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    vmovdqu    ymm0, [eax]  // read 32 U's
+    vmovdqu    ymm1, [eax + edx]  // and 32 V's
     lea        eax,  [eax + 32]
-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0  // bytes 0..15
     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
     vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
@@ -3332,15 +3278,31 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    test       eax, 15
+    jne        convertloopu
+    test       edx, 15
+    jne        convertloopu
 
-  convertloop:
+  convertloopa:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloopa
+    ret
+
+  convertloopu:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax, [eax + 32]
@@ -3348,20 +3310,21 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 32
-    jg         convertloop
+    jg         convertloopu
     ret
   }
 }
 #endif  // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_AVX
-// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+                                   uint8_t* dst,
+                                   int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
@@ -3380,14 +3343,15 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_AVX
 
 // Multiple of 1.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
     mov        eax, esi
     mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        esi, [esp + 4]  // src
+    mov        edi, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     rep movsb
     mov        edi, edx
     mov        esi, eax
@@ -3397,15 +3361,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3433,14 +3398,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vmovdqu    ymm1, [eax]
@@ -3460,17 +3426,82 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
+    mov        ecx, [esp + 12]  // width
+
+  extractloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm0, 24
+    psrld      xmm1, 24
+    packssdw   xmm0, xmm1
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8
+    jg         extractloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
+    mov        ecx, [esp + 12]  // width
+    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
+
+  extractloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpsrld     ymm0, ymm0, 24
+    vpsrld     ymm1, ymm1, 24
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    lea        eax, [eax + 128]
+    vpackssdw  ymm0, ymm0, ymm1  // mutates
+    vpsrld     ymm2, ymm2, 24
+    vpsrld     ymm3, ymm3, 24
+    vpackssdw  ymm2, ymm2, ymm3  // mutates
+    vpackuswb  ymm0, ymm0, ymm2  // mutates
+    vpermd     ymm0, ymm4, ymm0  // unmutate
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         extractloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3500,14 +3531,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vpmovzxbd  ymm1, qword ptr [eax]
@@ -3530,17 +3562,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated.
-// Count should be multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+// width should be multiple of 4.
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   __asm {
-    movzx      eax, byte ptr [esp + 8]    // v8
+    movzx      eax, byte ptr [esp + 8]  // v8
     mov        edx, 0x01010101  // Duplicate byte to all bytes.
-    mul        edx              // overwrites edx with upper part of result.
+    mul        edx  // overwrites edx with upper part of result.
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        ecx, [esp + 12]  // width
     shr        ecx, 2
     rep stosd
     mov        edi, edx
@@ -3548,28 +3579,28 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
   }
 }
 
-// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v8
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v8
+    mov        ecx, [esp + 12]  // width
     rep stosb
     mov        edi, edx
     ret
   }
 }
 
-// Write 'count' 32 bit values.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+// Write 'width' 32 bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+                                      uint32_t v32,
+                                      int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v32
+    mov        ecx, [esp + 12]  // width
     rep stosd
     mov        edi, edx
     ret
@@ -3578,13 +3609,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2,
-                     uint8* dst_y, int pix) {
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
 
@@ -3592,9 +3623,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm0, ymm0, ymm5  // even bytes are Y
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3605,18 +3636,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3626,18 +3659,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3649,16 +3682,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3666,18 +3700,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3688,21 +3722,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
-                     uint8* dst_y, int pix) {
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3713,18 +3747,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3734,18 +3770,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3757,16 +3793,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3774,18 +3811,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3798,21 +3835,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm0, xmm5  // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -3823,18 +3860,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3846,13 +3885,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3866,16 +3905,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3883,13 +3923,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3902,19 +3942,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int pix) {
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm0, 8  // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -3925,18 +3965,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3948,13 +3990,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3968,16 +4010,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3985,13 +4028,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4005,147 +4048,174 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
-#ifdef HAS_ARGBBLENDROW_SSE2
+#ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                                           const uint8_t* src1,
+                                           const uint8_t* alpha,
+                                           uint8_t* dst,
+                                           int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
-    mov        edx, [esp + 4 + 12]  // dst_argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
-    psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
-    psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    push       edi
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
-    sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
-
-    // 4 pixel loop.
-  convertloop4:
-    movdqu     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
-    pshuflw    xmm3, xmm3, 0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jge        convertloop4
-
-  convertloop4b:
-    add        ecx, 4 - 1
-    jl         convertloop1b
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0x00
+
+    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0x00
+    mov        eax, [esp + 8 + 4]  // src0
+    mov        edx, [esp + 8 + 8]  // src1
+    mov        esi, [esp + 8 + 12]  // alpha
+    mov        edi, [esp + 8 + 16]  // dst
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        eax, esi
+    sub        edx, esi
+    sub        edi, esi
 
-    // 1 pixel loop.
-  convertloop1:
-    movd       xmm3, [eax]      // src argb
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
-    pshuflw    xmm3, xmm3, 0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jge        convertloop1
+        // 8 pixel loop.
+  convertloop8:
+    movq       xmm0, qword ptr [esi]  // alpha
+    punpcklbw  xmm0, xmm0
+    pxor       xmm0, xmm5  // a, 255-a
+    movq       xmm1, qword ptr [eax + esi]  // src0
+    movq       xmm2, qword ptr [edx + esi]  // src1
+    punpcklbw  xmm1, xmm2
+    psubb      xmm1, xmm6  // bias src0/1 - 128
+    pmaddubsw  xmm0, xmm1
+    paddw      xmm0, xmm7  // unbias result - 32768 and round.
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi + esi], xmm0
+    lea        esi, [esi + 8]
+    sub        ecx, 8
+    jg         convertloop8
 
-  convertloop1b:
+    pop        edi
     pop        esi
     ret
   }
 }
-#endif  // HAS_ARGBBLENDROW_SSE2
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+                                          const uint8_t* src1,
+                                          const uint8_t* alpha,
+                                          uint8_t* dst,
+                                          int width) {
+  __asm {
+    push        esi
+    push        edi
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
+    vpsllw      ymm5, ymm5, 8
+    mov         eax, 0x80808080  // 128 for biasing image to signed.
+    vmovd       xmm6, eax
+    vbroadcastss ymm6, xmm6
+    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    vmovd       xmm7, eax
+    vbroadcastss ymm7, xmm7
+    mov         eax, [esp + 8 + 4]  // src0
+    mov         edx, [esp + 8 + 8]  // src1
+    mov         esi, [esp + 8 + 12]  // alpha
+    mov         edi, [esp + 8 + 16]  // dst
+    mov         ecx, [esp + 8 + 20]  // width
+    sub         eax, esi
+    sub         edx, esi
+    sub         edi, esi
+
+        // 32 pixel loop.
+  convertloop32:
+    vmovdqu     ymm0, [esi]  // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5  // a, 255-a
+    vpxor       ymm0, ymm0, ymm5  // a, 255-a
+    vmovdqu     ymm1, [eax + esi]  // src0
+    vmovdqu     ymm2, [edx + esi]  // src1
+    vpunpckhbw  ymm4, ymm1, ymm2
+    vpunpcklbw  ymm1, ymm1, ymm2
+    vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpmaddubsw  ymm0, ymm0, ymm1
+    vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
+    vpsrlw      ymm3, ymm3, 8
+    vpsrlw      ymm0, ymm0, 8
+    vpackuswb   ymm0, ymm0, ymm3
+    vmovdqu     [edi + esi], ymm0
+    lea         esi, [esi + 32]
+    sub         ecx, 32
+    jg          convertloop32
+
+    pop         edi
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-// Same as SSE2, but replaces:
-//    psrlw      xmm3, 8          // alpha
-//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
-//    pshuflw    xmm3, xmm3, 0F5h
-// with..
-//    pshufb     xmm3, kShuffleAlpha // alpha
-// Blend 8 pixels at a time.
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+// Blend 8 pixels at a time.
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                                          const uint8_t* src_argb1,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    pcmpeqb    xmm7, xmm7  // generate constant 0x0001
     psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
     psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
     sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
+    jl         convertloop4b  // less than 4 pixels?
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop4:
-    movdqu     xmm3, [eax]      // src argb
+    movdqu     xmm3, [eax]  // src argb
     lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movdqu     xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movdqu     xmm1, [esi]  // _a_g
     lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4155,26 +4225,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     add        ecx, 4 - 1
     jl         convertloop1b
 
-    // 1 pixel loop.
+            // 1 pixel loop.
   convertloop1:
-    movd       xmm3, [eax]      // src argb
+    movd       xmm3, [eax]  // src argb
     lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movd       xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movd       xmm1, [esi]  // _a_g
     lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4187,86 +4257,45 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-// Attenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
-    pslld      xmm4, 24
-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
-    psrld      xmm5, 8
-
- convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    punpcklbw  xmm0, xmm0       // first 2
-    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
-    pshuflw    xmm2, xmm2, 0FFh
-    pmulhuw    xmm0, xmm2       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpckhbw  xmm1, xmm1       // next 2 pixels
-    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
-    pshuflw    xmm2, xmm2, 0FFh
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // alphas
-    lea        eax, [eax + 16]
-    psrlw      xmm0, 8
-    pand       xmm2, xmm4
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    pand       xmm0, xmm5       // keep original alphas
-    por        xmm0, xmm2
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBATTENUATEROW_SSE2
-
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
     pslld      xmm3, 24
-    movdqa     xmm4, kShuffleAlpha0
-    movdqa     xmm5, kShuffleAlpha1
+    movdqa     xmm4, xmmword ptr kShuffleAlpha0
+    movdqa     xmm5, xmmword ptr kShuffleAlpha1
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
-    pmulhuw    xmm0, xmm1       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqu     xmm2, [eax]      // read 4 pixels
-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // mask original alpha
+    movdqu     xmm0, [eax]  // read 4 pixels
+    pshufb     xmm0, xmm4  // isolate first 2 alphas
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1  // rgb * a
+    movdqu     xmm1, [eax]  // read 4 pixels
+    pshufb     xmm1, xmm5  // isolate next 2 alphas
+    movdqu     xmm2, [eax]  // read 4 pixels
+    punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2  // rgb * a
+    movdqu     xmm2, [eax]  // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    por        xmm0, xmm2       // copy original alpha
+    por        xmm0, xmm2  // copy original alpha
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4279,22 +4308,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                                             uint8_t* dst_argb,
+                                             int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
-    vbroadcastf128 ymm4,kShuffleAlpha_AVX2
-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld     ymm5, ymm5, 24
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
@@ -4319,47 +4349,50 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
+    push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb0
-    mov        edx, [esp + 8 + 8]   // dst_argb
-    mov        ecx, [esp + 8 + 12]  // width
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    lea        ebx, fixed_invtbl8
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
-    punpcklbw  xmm0, xmm0       // first 2
-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    punpcklbw  xmm0, xmm0  // first 2
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm0, xmm2       // rgb * a
+    pmulhuw    xmm0, xmm2  // rgb * a
 
-    movdqu     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // forth alpha
-    punpckhbw  xmm1, xmm1       // next 2
-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    punpckhbw  xmm1, xmm1  // next 2
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm1, xmm2       // rgb * a
+    pmulhuw    xmm1, xmm2  // rgb * a
     lea        eax, [eax + 16]
-
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
     jg         convertloop
+
     pop        edi
     pop        esi
+    pop        ebx
     ret
   }
 }
@@ -4368,25 +4401,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
-    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
+    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
@@ -4406,49 +4438,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     ret
   }
 }
-#else  // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+#else   // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
 
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
-
+    push       ebx
     push       esi
     push       edi
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    sub        edx, eax
+    lea        ebx, fixed_invtbl8
+    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]                 // alpha0
-    movzx      edi, byte ptr [eax + 7]                 // alpha1
-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11]                // alpha2
-    movzx      edi, byte ptr [eax + 15]                // alpha3
-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19]                // alpha4
-    movzx      edi, byte ptr [eax + 23]                // alpha5
-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27]                // alpha6
-    movzx      edi, byte ptr [eax + 31]                // alpha7
-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+        // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]  // alpha0
+    movzx      edi, byte ptr [eax + 7]  // alpha1
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
+    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
+    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
+    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
+    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
     // end of VPGATHER
 
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
@@ -4457,7 +4490,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpackuswb  ymm0, ymm0, ymm1             // unmutated.
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
     sub        ecx, 8
@@ -4465,6 +4498,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 
     pop        edi
     pop        esi
+    pop        ebx
     vzeroupper
     ret
   }
@@ -4474,14 +4508,15 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, kARGBToYJ
-    movdqa     xmm5, kAddYJ64
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* width */
+    movdqa     xmm4, xmmword ptr kARGBToYJ
+    movdqa     xmm5, xmmword ptr kAddYJ64
 
  convertloop:
     movdqu     xmm0, [eax]  // G
@@ -4491,20 +4526,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     phaddw     xmm0, xmm1
     paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 G bytes
+    packuswb   xmm0, xmm0  // 8 G bytes
     movdqu     xmm2, [eax]  // A
     movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
     psrld      xmm2, 24
     psrld      xmm3, 24
     packuswb   xmm2, xmm3
-    packuswb   xmm2, xmm2   // 8 A bytes
-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
-    punpcklbw  xmm0, xmm0   // 8 GG words
-    punpcklbw  xmm3, xmm2   // 8 GA words
+    packuswb   xmm2, xmm2  // 8 A bytes
+    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0  // 8 GG words
+    punpcklbw  xmm3, xmm2  // 8 GA words
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm3   // GGGA first 4
-    punpckhwd  xmm1, xmm3   // GGGA next 4
+    punpcklwd  xmm0, xmm3  // GGGA first 4
+    punpckhwd  xmm1, xmm3  // GGGA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
@@ -4520,27 +4555,23 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static const vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static const vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        ecx, [esp + 8]   /* width */
-    movdqa     xmm2, kARGBToSepiaB
-    movdqa     xmm3, kARGBToSepiaG
-    movdqa     xmm4, kARGBToSepiaR
+    mov        eax, [esp + 4] /* dst_argb */
+    mov        ecx, [esp + 8] /* width */
+    movdqa     xmm2, xmmword ptr kARGBToSepiaB
+    movdqa     xmm3, xmmword ptr kARGBToSepiaG
+    movdqa     xmm4, xmmword ptr kARGBToSepiaR
 
  convertloop:
     movdqu     xmm0, [eax]  // B
@@ -4549,32 +4580,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm0, xmm0  // 8 B values
     movdqu     xmm5, [eax]  // G
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
+    packuswb   xmm5, xmm5  // 8 G values
+    punpcklbw  xmm0, xmm5  // 8 BG values
     movdqu     xmm5, [eax]  // R
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    packuswb   xmm5, xmm5  // 8 R values
     movdqu     xmm6, [eax]  // A
     movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm5, xmm6  // 8 RA values
+    movdqa     xmm1, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm5  // BGRA first 4
+    punpckhwd  xmm1, xmm5  // BGRA next 4
     movdqu     [eax], xmm0
     movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
@@ -4590,19 +4621,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* matrix_argb */
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                                                uint8_t* dst_argb,
+                                                const int8_t* matrix_argb,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
     pshufd     xmm3, xmm5, 0x55
     pshufd     xmm4, xmm5, 0xaa
     pshufd     xmm5, xmm5, 0xff
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
  convertloop:
     movdqu     xmm0, [eax]  // B
@@ -4613,31 +4645,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
+    phaddsw    xmm0, xmm7  // B
+    phaddsw    xmm6, xmm1  // G
+    psraw      xmm0, 6  // B
+    psraw      xmm6, 6  // G
+    packuswb   xmm0, xmm0  // 8 B values
+    packuswb   xmm6, xmm6  // 8 G values
+    punpcklbw  xmm0, xmm6  // 8 BG values
     movdqu     xmm1, [eax]  // R
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
+    phaddsw    xmm1, xmm7  // R
     movdqu     xmm6, [eax]  // A
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
-    phaddsw    xmm6, xmm7   // A
-    psraw      xmm1, 6      // R
-    psraw      xmm6, 6      // A
-    packuswb   xmm1, xmm1   // 8 R values
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm1, xmm6   // 8 RA values
-    movdqa     xmm6, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm1   // BGRA first 4
-    punpckhwd  xmm6, xmm1   // BGRA next 4
+    phaddsw    xmm6, xmm7  // A
+    psraw      xmm1, 6  // R
+    psraw      xmm6, 6  // A
+    packuswb   xmm1, xmm1  // 8 R values
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm1, xmm6  // 8 RA values
+    movdqa     xmm6, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm1  // BGRA first 4
+    punpckhwd  xmm6, xmm1  // BGRA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
@@ -4651,15 +4683,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  __asm {
-    mov        eax, [esp + 4]    /* dst_argb */
-    movd       xmm2, [esp + 8]   /* scale */
-    movd       xmm3, [esp + 12]  /* interval_size */
-    movd       xmm4, [esp + 16]  /* interval_offset */
-    mov        ecx, [esp + 20]   /* width */
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                                            int scale,
+                                            int interval_size,
+                                            int interval_offset,
+                                            int width) {
+  __asm {
+    mov        eax, [esp + 4] /* dst_argb */
+    movd       xmm2, [esp + 8] /* scale */
+    movd       xmm3, [esp + 12] /* interval_size */
+    movd       xmm4, [esp + 16] /* interval_offset */
+    mov        ecx, [esp + 20] /* width */
     pshuflw    xmm2, xmm2, 040h
     pshufd     xmm2, xmm2, 044h
     pshuflw    xmm3, xmm3, 040h
@@ -4672,16 +4706,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    punpcklbw  xmm0, xmm5  // first 2 pixels
+    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
     movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
+    punpckhbw  xmm1, xmm5  // next 2 pixels
     pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
+    pmullw     xmm0, xmm3  // * interval_size
     movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
+    pand       xmm7, xmm6  // mask alpha
+    paddw      xmm0, xmm4  // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
@@ -4696,25 +4730,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width,
+                                         uint32_t value) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    pmulhuw    xmm0, xmm2  // argb * value
+    pmulhuw    xmm1, xmm2  // argb * value
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
@@ -4730,28 +4765,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0         // first 2
-    punpckhbw  xmm1, xmm1         // next 2
-    punpcklbw  xmm2, xmm5         // first 2
-    punpckhbw  xmm3, xmm5         // next 2
-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    punpcklbw  xmm2, xmm5  // first 2
+    punpckhbw  xmm3, xmm5  // next 2
+    pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
@@ -4769,13 +4805,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -4783,11 +4820,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     jl         convertloop49
 
  convertloop4:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4798,11 +4835,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     jl         convertloop19
 
  convertloop1:
-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    movd       xmm0, [eax]  // read 1 pixels from src_argb0
     lea        eax, [eax + 4]
-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    movd       xmm1, [esi]  // read 1 pixels from src_argb1
     lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4817,22 +4854,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    psubusb    xmm0, xmm1  // src_argb0 - src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4846,28 +4884,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
+    vpxor      ymm5, ymm5, ymm5  // constant 0
 
  convertloop:
-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
     lea        esi, [esi + 32]
-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpunpcklbw ymm0, ymm1, ymm1  // low 4
+    vpunpckhbw ymm1, ymm1, ymm1  // high 4
+    vpunpcklbw ymm2, ymm3, ymm5  // low 4
+    vpunpckhbw ymm3, ymm3, ymm5  // high 4
+    vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4883,20 +4922,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4912,20 +4952,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4944,14 +4985,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      const uint8_t* src_y2,
+                                      uint8_t* dst_sobelx,
+                                      int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        eax, [esp + 8 + 4]  // src_y0
+    mov        esi, [esp + 8 + 8]  // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
     mov        edx, [esp + 8 + 16]  // dst_sobelx
     mov        ecx, [esp + 8 + 20]  // width
@@ -4961,17 +5004,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -4979,7 +5022,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
@@ -5000,13 +5043,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      uint8_t* dst_sobely,
+                                      int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        eax, [esp + 4 + 4]  // src_y0
+    mov        esi, [esp + 4 + 8]  // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5014,17 +5058,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5032,7 +5076,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
@@ -5053,36 +5097,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+                                     const uint8_t* src_sobely,
+                                     uint8_t* dst_argb,
+                                     int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-    pslld      xmm5, 24             // 0xff000000
+    pcmpeqb    xmm5, xmm5  // alpha 255
+    pslld      xmm5, 24  // 0xff000000
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm2, xmm0             // GG
-    punpcklbw  xmm2, xmm0             // First 8
-    punpckhbw  xmm0, xmm0             // Next 8
-    movdqa     xmm1, xmm2             // GGGG
-    punpcklwd  xmm1, xmm2             // First 4
-    punpckhwd  xmm2, xmm2             // Next 4
-    por        xmm1, xmm5             // GGGA
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0  // GG
+    punpcklbw  xmm2, xmm0  // First 8
+    punpckhbw  xmm0, xmm0  // Next 8
+    movdqa     xmm1, xmm2  // GGGG
+    punpcklwd  xmm1, xmm2  // First 4
+    punpckhwd  xmm2, xmm2  // Next 4
+    por        xmm1, xmm5  // GGGA
     por        xmm2, xmm5
-    movdqa     xmm3, xmm0             // GGGG
-    punpcklwd  xmm3, xmm0             // Next 4
-    punpckhwd  xmm0, xmm0             // Last 4
-    por        xmm3, xmm5             // GGGA
+    movdqa     xmm3, xmm0  // GGGG
+    punpcklwd  xmm3, xmm0  // Next 4
+    punpckhwd  xmm0, xmm0  // Last 4
+    por        xmm3, xmm5  // GGGA
     por        xmm0, xmm5
     movdqu     [edx], xmm1
     movdqu     [edx + 16], xmm2
@@ -5100,22 +5145,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                                            const uint8_t* src_sobely,
+                                            uint8_t* dst_y,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -5133,36 +5179,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                                       const uint8_t* src_sobely,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
+    pcmpeqb    xmm5, xmm5  // alpha 255
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm3, xmm0             // XA
+    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0  // XA
     punpcklbw  xmm3, xmm5
     punpckhbw  xmm0, xmm5
-    movdqa     xmm4, xmm1             // YS
+    movdqa     xmm4, xmm1  // YS
     punpcklbw  xmm4, xmm2
     punpckhbw  xmm1, xmm2
-    movdqa     xmm6, xmm4             // YSXA
-    punpcklwd  xmm6, xmm3             // First 4
-    punpckhwd  xmm4, xmm3             // Next 4
-    movdqa     xmm7, xmm1             // YSXA
-    punpcklwd  xmm7, xmm0             // Next 4
-    punpckhwd  xmm1, xmm0             // Last 4
+    movdqa     xmm6, xmm4  // YSXA
+    punpcklwd  xmm6, xmm3  // First 4
+    punpckhwd  xmm4, xmm3  // Next 4
+    movdqa     xmm7, xmm1  // YSXA
+    punpcklwd  xmm7, xmm0  // Next 4
+    punpckhwd  xmm1, xmm0  // Last 4
     movdqu     [edx], xmm6
     movdqu     [edx + 16], xmm4
     movdqu     [edx + 32], xmm7
@@ -5190,8 +5237,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // dst points to pixel to store result to.
 // count is number of averaged pixels to produce.
 // Does 4 pixels at a time.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+// This function requires alignment on accumulation buffer pointers.
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
   __asm {
     mov        eax, topleft  // eax topleft
@@ -5209,18 +5260,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     cmp        area, 128  // 128 pixels will not overflow 15 bits.
     ja         l4
 
-    pshufd     xmm5, xmm5, 0        // area
-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    pshufd     xmm5, xmm5, 0  // area
+    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
     psrld      xmm6, 16
     cvtdq2ps   xmm6, xmm6
-    addps      xmm5, xmm6           // (65536.0 + area - 1)
-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
-    packssdw   xmm5, xmm5           // 16 bit shorts
+    addps      xmm5, xmm6  // (65536.0 + area - 1)
+    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
+    packssdw   xmm5, xmm5  // 16 bit shorts
 
-    // 4 pixel loop small blocks.
+        // 4 pixel loop small blocks.
   s4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5260,9 +5311,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 
     jmp        l4b
 
-    // 4 pixel loop
+            // 4 pixel loop
   l4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5288,7 +5339,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     paddd      xmm3, [esi + edx * 4 + 48]
     lea        esi, [esi + 64]
 
-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
     cvtdq2ps   xmm1, xmm1
     mulps      xmm0, xmm4
     mulps      xmm1, xmm4
@@ -5312,7 +5363,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+            // 1 pixel loop
   l1:
     movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
@@ -5337,8 +5388,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
   __asm {
     mov        eax, row
     mov        edx, cumsum
@@ -5352,7 +5405,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     test       edx, 15
     jne        l4b
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -5398,9 +5451,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+            // 1 pixel loop
   l1:
-    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
+    movd       xmm2, dword ptr [eax]  // 1 argb pixel
     lea        eax, [eax + 4]
     punpcklbw  xmm2, xmm1
     punpcklwd  xmm2, xmm1
@@ -5420,10 +5473,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                                                     int src_argb_stride,
+                                                     uint8_t* dst_argb,
+                                                     const float* uv_dudv,
+                                                     int width) {
   __asm {
     push       esi
     push       edi
@@ -5434,46 +5488,46 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     movq       xmm2, qword ptr [ecx]  // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
     mov        ecx, [esp + 28]  // width
-    shl        esi, 16          // 4, stride
+    shl        esi, 16  // 4, stride
     add        esi, 4
     movd       xmm5, esi
     sub        ecx, 4
     jl         l4b
 
-    // setup for 4 pixel loop
+        // setup for 4 pixel loop
     pshufd     xmm7, xmm7, 0x44  // dup dudv
     pshufd     xmm5, xmm5, 0  // dup 4, stride
-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    movdqa     xmm0, xmm2  // x0, y0, x1, y1
     addps      xmm0, xmm7
     movlhps    xmm2, xmm0
     movdqa     xmm4, xmm7
-    addps      xmm4, xmm4    // dudv *= 2
-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm4, xmm4  // dudv *= 2
+    movdqa     xmm3, xmm2  // x2, y2, x3, y3
     addps      xmm3, xmm4
-    addps      xmm4, xmm4    // dudv *= 4
+    addps      xmm4, xmm4  // dudv *= 4
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
-    packssdw   xmm0, xmm1    // x, y as 8 shorts
-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
+    packssdw   xmm0, xmm1  // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       xmm1, [eax + esi]  // read pixel 0
     movd       xmm6, [eax + edi]  // read pixel 1
-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
-    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
+    addps      xmm2, xmm4  // x, y += dx, dy first 2
     movq       qword ptr [edx], xmm1
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     movd       xmm6, [eax + esi]  // read pixel 2
     movd       xmm0, [eax + edi]  // read pixel 3
-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
-    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
+    addps      xmm3, xmm4  // x, y += dx, dy next 2
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5483,12 +5537,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+            // 1 pixel loop
   l1:
-    cvttps2dq  xmm0, xmm2    // x, y float to int
-    packssdw   xmm0, xmm0    // x, y as shorts
-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
-    addps      xmm2, xmm7    // x, y += dx, dy
+    cvttps2dq  xmm0, xmm2  // x, y float to int
+    packssdw   xmm0, xmm0  // x, y as shorts
+    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
+    addps      xmm2, xmm7  // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
     movd       [edx], xmm0
@@ -5505,68 +5559,59 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                                           const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           int dst_width,
+                                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    shr        eax, 1
     // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
-    cmp        eax, 32
-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
-    cmp        eax, 64
-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
-    cmp        eax, 96
-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
-
-    vmovd      xmm0, eax  // high fraction 0..127
+    cmp        eax, 128
+    je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
+
+    vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
-    add        eax, 128
-    vmovd      xmm5, eax  // low fraction 128..1
+    add        eax, 256
+    vmovd      xmm5, eax  // low fraction 256..1
     vpunpcklbw xmm5, xmm5, xmm0
     vpunpcklwd xmm5, xmm5, xmm5
-    vpxor      ymm0, ymm0, ymm0
-    vpermd     ymm5, ymm0, ymm5
+    vbroadcastss ymm5, xmm5
+
+    mov        eax, 0x80808080  // 128b for bias and rounding.
+    vmovd      xmm4, eax
+    vbroadcastss ymm4, xmm4
 
   xloop:
     vmovdqu    ymm0, [esi]
     vmovdqu    ymm2, [esi + edx]
     vpunpckhbw ymm1, ymm0, ymm2  // mutates
-    vpunpcklbw ymm0, ymm0, ymm2  // mutates
-    vpmaddubsw ymm0, ymm0, ymm5
-    vpmaddubsw ymm1, ymm1, ymm5
-    vpsrlw     ymm0, ymm0, 7
-    vpsrlw     ymm1, ymm1, 7
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vpunpcklbw ymm0, ymm0, ymm2
+    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
+    vpsubb     ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm5, ymm1
+    vpmaddubsw ymm0, ymm5, ymm0
+    vpaddw     ymm1, ymm1, ymm4  // unbias and round
+    vpaddw     ymm0, ymm0, ymm4
+    vpsrlw     ymm1, ymm1, 8
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm1            // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
     sub        ecx, 32
     jg         xloop
     jmp        xloop99
 
-   // Blend 25 / 75.
- xloop25:
-   vmovdqu    ymm0, [esi]
-   vmovdqu    ymm1, [esi + edx]
-   vpavgb     ymm0, ymm0, ymm1
-   vpavgb     ymm0, ymm0, ymm1
-   vmovdqu    [esi + edi], ymm0
-   lea        esi, [esi + 32]
-   sub        ecx, 32
-   jg         xloop25
-   jmp        xloop99
-
-   // Blend 50 / 50.
+        // Blend 50 / 50.
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
@@ -5576,19 +5621,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop50
    jmp        xloop99
 
-   // Blend 75 / 25.
- xloop75:
-   vmovdqu    ymm1, [esi]
-   vmovdqu    ymm0, [esi + edx]
-   vpavgb     ymm0, ymm0, ymm1
-   vpavgb     ymm0, ymm0, ymm1
-   vmovdqu    [esi + edi], ymm0
-   lea        esi, [esi + 32]
-   sub        ecx, 32
-   jg         xloop75
-   jmp        xloop99
-
-   // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb
 
@@ -5602,37 +5635,38 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 #endif  // HAS_INTERPOLATEROW_AVX2
 
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                                            const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            int dst_width,
+                                            int source_y_fraction) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
-    shr        eax, 1
-    // Dispatch to specialized filters if applicable.
+        // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop100  // 0 / 128.  Blend 100 / 0.
-    cmp        eax, 32
-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
-    cmp        eax, 64
-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
-    cmp        eax, 96
-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
-
-    movd       xmm0, eax  // high fraction 0..127
+    je         xloop100  // 0 /256.  Blend 100 / 0.
+    cmp        eax, 128
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
+
+    movd       xmm0, eax  // high fraction 0..255
     neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    add        eax, 256
+    movd       xmm5, eax  // low fraction 255..1
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm4, eax
+    pshufd     xmm4, xmm4, 0x00
 
   xloop:
     movdqu     xmm0, [esi]
@@ -5640,137 +5674,24 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 25 / 75.
-  xloop25:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
-  xloop50:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 75 / 25.
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-  xloop100:
-    movdqu     xmm0, [esi]
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked)
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 256.  Blend 100 / 0.
-    cmp        eax, 64
-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
-    cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
-    cmp        eax, 192
-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
-
-    movd       xmm5, eax            // xmm5 = y fraction
-    punpcklbw  xmm5, xmm5
-    psrlw      xmm5, 1
-    punpcklwd  xmm5, xmm5
-    punpckldq  xmm5, xmm5
-    punpcklqdq xmm5, xmm5
-    pxor       xmm4, xmm4
-
-  xloop:
-    movdqu     xmm0, [esi]  // row0
-    movdqu     xmm2, [esi + edx]  // row1
-    movdqu     xmm1, xmm0
-    movdqu     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    psubw      xmm2, xmm0  // row1 - row0
-    psubw      xmm3, xmm1
-    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
-    paddw      xmm3, xmm3
-    pmulhw     xmm2, xmm5  // scale diff
-    pmulhw     xmm3, xmm5
-    paddw      xmm0, xmm2  // sum rows
-    paddw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    movdqu     [esi + edi], xmm0
+    psubb      xmm0, xmm4            // bias image by -128
+    psubb      xmm1, xmm4
+    movdqa     xmm2, xmm5
+    movdqa     xmm3, xmm5
+    pmaddubsw  xmm2, xmm0
+    pmaddubsw  xmm3, xmm1
+    paddw      xmm2, xmm4
+    paddw      xmm3, xmm4
+    psrlw      xmm2, 8
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqu     [esi + edi], xmm2
     lea        esi, [esi + 16]
     sub        ecx, 16
     jg         xloop
     jmp        xloop99
 
-    // Blend 25 / 75.
-  xloop25:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
+        // Blend 50 / 50.
   xloop50:
     movdqu     xmm0, [esi]
     movdqu     xmm1, [esi + edx]
@@ -5781,19 +5702,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     jg         xloop50
     jmp        xloop99
 
-    // Blend 75 / 25.
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
   xloop100:
     movdqu     xmm0, [esi]
     movdqu     [esi + edi], xmm0
@@ -5807,18 +5716,18 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     ret
   }
 }
-#endif  // HAS_INTERPOLATEROW_SSE2
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_argb,
+                                            const uint8_t* shuffler,
+                                            int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
     movdqu     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // pix
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -5836,15 +5745,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                                           uint8_t* dst_argb,
+                                           const uint8_t* shuffler,
+                                           int width) {
   __asm {
-    mov        eax, [esp + 4]     // src_argb
-    mov        edx, [esp + 8]     // dst_argb
-    mov        ecx, [esp + 12]    // shuffler
-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
-    mov        ecx, [esp + 16]    // pix
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
+    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     vmovdqu    ymm0, [eax]
@@ -5864,152 +5774,36 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int pix) {
-  __asm {
-    push       ebx
-    push       esi
-    mov        eax, [esp + 8 + 4]    // src_argb
-    mov        edx, [esp + 8 + 8]    // dst_argb
-    mov        esi, [esp + 8 + 12]   // shuffler
-    mov        ecx, [esp + 8 + 16]   // pix
-    pxor       xmm5, xmm5
-
-    mov        ebx, [esi]   // shuffler
-    cmp        ebx, 0x03000102
-    je         shuf_3012
-    cmp        ebx, 0x00010203
-    je         shuf_0123
-    cmp        ebx, 0x00030201
-    je         shuf_0321
-    cmp        ebx, 0x02010003
-    je         shuf_2103
-
-  // TODO(fbarchard): Use one source pointer and 3 offsets.
-  shuf_any1:
-    movzx      ebx, byte ptr [esi]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx], bl
-    movzx      ebx, byte ptr [esi + 1]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 1], bl
-    movzx      ebx, byte ptr [esi + 2]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 2], bl
-    movzx      ebx, byte ptr [esi + 3]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 3], bl
-    lea        eax, [eax + 4]
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jg         shuf_any1
-    jmp        shuf99
-
-  shuf_0123:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
-    pshuflw    xmm0, xmm0, 01Bh
-    pshufhw    xmm1, xmm1, 01Bh
-    pshuflw    xmm1, xmm1, 01Bh
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0123
-    jmp        shuf99
-
-  shuf_0321:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
-    pshuflw    xmm0, xmm0, 039h
-    pshufhw    xmm1, xmm1, 039h
-    pshuflw    xmm1, xmm1, 039h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0321
-    jmp        shuf99
-
-  shuf_2103:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
-    pshuflw    xmm0, xmm0, 093h
-    pshufhw    xmm1, xmm1, 093h
-    pshuflw    xmm1, xmm1, 093h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_2103
-    jmp        shuf99
-
-  shuf_3012:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
-    pshuflw    xmm0, xmm0, 0C6h
-    pshufhw    xmm1, xmm1, 0C6h
-    pshuflw    xmm1, xmm1, 0C6h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_3012
-
-  shuf99:
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
 
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
+    punpcklbw  xmm0, xmm2  // YUYV
     punpckhbw  xmm1, xmm2
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm1
@@ -6023,30 +5817,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
   }
 }
 
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
+    punpcklbw  xmm1, xmm0  // UYVY
     punpckhbw  xmm2, xmm0
     movdqu     [edi], xmm1
     movdqu     [edi + 16], xmm2
@@ -6061,22 +5855,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* src_argb */
-    mov        edx, [esp + 4 + 8]   /* dst_argb */
-    mov        esi, [esp + 4 + 12]  /* poly */
-    mov        ecx, [esp + 4 + 16]  /* width */
+    mov        eax, [esp + 4 + 4] /* src_argb */
+    mov        edx, [esp + 4 + 8] /* dst_argb */
+    mov        esi, [esp + 4 + 12] /* poly */
+    mov        ecx, [esp + 4 + 16] /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
-    // 2 pixel loop.
+        // 2 pixel loop.
  convertloop:
-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
@@ -6120,25 +5914,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]   /* poly */
-    vbroadcastf128 ymm4, [ecx]       // C0
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* poly */
+    vbroadcastf128 ymm4, [ecx]  // C0
     vbroadcastf128 ymm5, [ecx + 16]  // C1
     vbroadcastf128 ymm6, [ecx + 32]  // C2
     vbroadcastf128 ymm7, [ecx + 48]  // C3
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
     // 2 pixel loop.
  convertloop:
     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea         eax, [eax + 8]
-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vcvtdq2ps   ymm0, ymm0  // X 8 floats
     vmulps      ymm2, ymm0, ymm0  // X * X
     vmulps      ymm3, ymm0, ymm7  // C3 * X
     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
@@ -6158,16 +5952,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+        // 8 pixel loop.
+ convertloop:
+    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
+    add         eax, 16
+    movdqa      xmm3, xmm2
+    punpcklwd   xmm2, xmm5
+    cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd   xmm3, xmm5
+    cvtdq2ps    xmm3, xmm3
+    mulps       xmm2, xmm4
+    mulps       xmm3, xmm4
+    psrld       xmm2, 13
+    psrld       xmm3, 13
+    packssdw    xmm2, xmm3
+    movdqu      [eax + edx - 16], xmm2
+    sub         ecx, 8
+    jg          convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+
+    vmulss     xmm4, xmm4, kExpBias
+    vbroadcastss ymm4, xmm4
+    vpxor      ymm5, ymm5, ymm5
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vmovdqu     ymm2, [eax]  // 16 shorts
+    add         eax, 32
+    vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
+    vpunpcklwd  ymm2, ymm2, ymm5
+    vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
+    vcvtdq2ps   ymm2, ymm2
+    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
+    vmulps      ymm2, ymm2, ymm4
+    vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
+    vpsrld      ymm2, ymm2, 13
+    vpackssdw   ymm2, ymm2, ymm3
+    vmovdqu     [eax + edx - 32], ymm2
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    vbroadcastss ymm4, [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
+    add         eax, 32
+    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps   ymm3, ymm3
+    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps      ymm3, ymm3, ymm4
+    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph   xmm3, ymm3, 3
+    vmovdqu     [eax + edx + 32], xmm2
+    vmovdqu     [eax + edx + 32 + 16], xmm3
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
+__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                                             const uint8_t* table_argb,
+                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6194,13 +6097,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+                                            const uint8_t* table_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6225,27 +6129,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                                   uint8_t* dst_argb,
+                                                   int width,
+                                                   const uint8_t* luma,
+                                                   uint32_t lumacoeff) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   /* src_argb */
-    mov        edi, [esp + 8 + 8]   /* dst_argb */
-    mov        ecx, [esp + 8 + 12]  /* width */
+    mov        eax, [esp + 8 + 4] /* src_argb */
+    mov        edi, [esp + 8 + 8] /* dst_argb */
+    mov        ecx, [esp + 8 + 12] /* width */
     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
     pshufd     xmm2, xmm2, 0
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
     psllw      xmm4, 8
     pxor       xmm5, xmm5
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop:
-    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
+    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
     pmaddubsw  xmm0, xmm3
     phaddw     xmm0, xmm0
     pand       xmm0, xmm4  // mask out low bits
@@ -6323,9 +6228,10 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
 #endif  // defined(_M_X64)
-#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/media/libaom/src/third_party/libyuv/source/scale.cc b/media/libaom/src/third_party/libyuv/source/scale.cc
index 0a01304c41..cf3c033257 100644
--- a/media/libaom/src/third_party/libyuv/source/scale.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale.cc
@@ -17,6 +17,7 @@
 #include "libyuv/planar_functions.h"  // For CopyPlane
 #include "libyuv/row.h"
 #include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"  // For UVScale
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -33,17 +34,25 @@ static __inline int Abs(int v) {
 // This is an optimized version for scaling down a plane to 1/2 of
 // its original size.
 
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown2(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C :
-      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -51,46 +60,78 @@ static void ScalePlaneDown2(int src_width, int src_height,
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
-        ScaleRowDown2Box_Any_NEON);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+                                          : ScaleRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
-          ScaleRowDown2Box_NEON);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_NEON
+                                                      : ScaleRowDown2Box_NEON);
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
-        ScaleRowDown2Box_Any_SSE2);
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+                                          : ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
-          ScaleRowDown2Box_SSE2);
+      ScaleRowDown2 =
+          filtering == kFilterNone
+              ? ScaleRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+                                            : ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
-        ScaleRowDown2Box_Any_AVX2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_AVX2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+                                          : ScaleRowDown2Box_Any_AVX2);
     if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
-          ScaleRowDown2Box_AVX2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_AVX2
+                                                      : ScaleRowDown2Box_AVX2);
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+#if defined(HAS_SCALEROWDOWN2_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MMI
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
+                                          : ScaleRowDown2Box_Any_MMI);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MMI
+                                                      : ScaleRowDown2Box_MMI);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+                                          : ScaleRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MSA
+                                                      : ScaleRowDown2Box_MSA);
+    }
   }
 #endif
 
@@ -105,18 +146,25 @@ static void ScalePlaneDown2(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown2_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown2_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_16_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
-        ScaleRowDown2Box_16_C);
+  void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_16_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+                                        : ScaleRowDown2Box_16_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -124,23 +172,25 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
 
 #if defined(HAS_SCALEROWDOWN2_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
-        ScaleRowDown2_16_NEON;
+    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-        ScaleRowDown2Box_16_SSE2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_16_SSE2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+                                          : ScaleRowDown2Box_16_SSE2);
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
+#if defined(HAS_SCALEROWDOWN2_16_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
+                                             : (filtering == kFilterLinear
+                                                    ? ScaleRowDown2Linear_16_MMI
+                                                    : ScaleRowDown2Box_16_MMI);
   }
 #endif
 
@@ -159,53 +209,69 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
 // This is an optimized version for scaling down a plane to 1/4 of
 // its original size.
 
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown4(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
     if (IS_ALIGNED(dst_width, 16)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+#if defined(HAS_SCALEROWDOWN4_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+    }
   }
 #endif
 
@@ -219,38 +285,41 @@ static void ScalePlaneDown4(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown4_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown4_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
-        ScaleRowDown4_16_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
-        ScaleRowDown4_16_SSE2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
+#if defined(HAS_SCALEROWDOWN4_16_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
   }
 #endif
 
@@ -265,18 +334,23 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
 }
 
 // Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown34(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_C;
@@ -305,6 +379,38 @@ static void ScalePlaneDown34(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_MMI;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_MMI;
+      if (dst_width % 24 == 0) {
+        ScaleRowDown34_0 = ScaleRowDown34_MMI;
+        ScaleRowDown34_1 = ScaleRowDown34_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
+    }
+    if (dst_width % 48 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_MSA;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
+      }
+    }
+  }
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
@@ -325,19 +431,6 @@ static void ScalePlaneDown34(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -346,8 +439,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -363,17 +455,23 @@ static void ScalePlaneDown34(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown34_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown34_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_16_C;
@@ -404,19 +502,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -425,8 +510,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -442,7 +526,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
   }
 }
 
-
 // Scale plane, 3/8
 // This is an optimized version for scaling down a plane to 3/8
 // of its original size.
@@ -458,18 +541,24 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
 // ggghhhii
 // Boxes are 3x3, 2x3, 3x2 and 2x2
 
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown38(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_C;
     ScaleRowDown38_2 = ScaleRowDown38_C;
@@ -517,16 +606,23 @@ static void ScalePlaneDown38(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+#if defined(HAS_SCALEROWDOWN38_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
     if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_MSA;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+      }
     }
   }
 #endif
@@ -554,17 +650,23 @@ static void ScalePlaneDown38(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown38_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown38_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_16_C;
@@ -595,19 +697,6 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -634,8 +723,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
 
 #define MIN1(x) ((x) < 1 ? 1 : (x))
 
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -644,8 +733,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   return sum;
 }
 
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -654,12 +743,15 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
   return sum;
 }
 
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols2_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
-  int* scaleptr = scaletbl - minboxwidth;
   int boxwidth;
   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -667,16 +759,21 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
     int ix = x >> 16;
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+    *dst_ptr++ =
+        SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+        16;
   }
 }
 
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols2_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
-  int* scaleptr = scaletbl - minboxwidth;
   int boxwidth;
   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -684,23 +781,33 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
     int ix = x >> 16;
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ =
-        SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+                     scaletbl[boxwidth - minboxwidth] >>
+                 16;
   }
 }
 
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols0_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int scaleval = 65536 / boxheight;
   int i;
+  (void)dx;
   src_ptr += (x >> 16);
   for (i = 0; i < dst_width; ++i) {
     *dst_ptr++ = src_ptr[i] * scaleval >> 16;
   }
 }
 
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols1_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -711,8 +818,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
   }
 }
 
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols1_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -729,10 +840,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
 // one pixel of destination using fixed point (16.16) to step
 // through source, sampling a box of pixel with simple
 // averaging.
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneBox(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -740,18 +855,18 @@ static void ScalePlaneBox(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint16.
+    // Allocate a row buffer of uint16_t.
     align_buffer_64(row16, src_width * 2);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C:
-        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
-        ScaleAddRow_C;
+                         const uint16_t* src_ptr, uint8_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C
+                      : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
+                        int src_width) = ScaleAddRow_C;
 #if defined(HAS_SCALEADDROW_SSE2)
     if (TestCpuFlag(kCpuHasSSE2)) {
       ScaleAddRow = ScaleAddRow_Any_SSE2;
@@ -776,11 +891,27 @@ static void ScalePlaneBox(int src_width, int src_height,
       }
     }
 #endif
+#if defined(HAS_SCALEADDROW_MMI)
+    if (TestCpuFlag(kCpuHasMMI)) {
+      ScaleAddRow = ScaleAddRow_Any_MMI;
+      if (IS_ALIGNED(src_width, 8)) {
+        ScaleAddRow = ScaleAddRow_MMI;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_MSA)
+    if (TestCpuFlag(kCpuHasMSA)) {
+      ScaleAddRow = ScaleAddRow_Any_MSA;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_MSA;
+      }
+    }
+#endif
 
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
+      const uint8_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -788,20 +919,24 @@ static void ScalePlaneBox(int src_width, int src_height,
       boxheight = MIN1((y >> 16) - iy);
       memset(row16, 0, src_width * 2);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        ScaleAddRow(src, (uint16_t*)(row16), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row16);
   }
 }
 
-static void ScalePlaneBox_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneBox_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -809,17 +944,17 @@ static void ScalePlaneBox_16(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint32.
+    // Allocate a row buffer of uint32_t.
     align_buffer_64(row32, src_width * 4);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
-        ScaleAddRow_16_C;
+                         const uint32_t* src_ptr, uint16_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
+                        int src_width) = ScaleAddRow_16_C;
 
 #if defined(HAS_SCALEADDROW_16_SSE2)
     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
@@ -827,10 +962,15 @@ static void ScalePlaneBox_16(int src_width, int src_height,
     }
 #endif
 
+#if defined(HAS_SCALEADDROW_16_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
+      ScaleAddRow = ScaleAddRow_16_MMI;
+    }
+#endif
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint16* src = src_ptr + iy * src_stride;
+      const uint16_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -838,10 +978,10 @@ static void ScalePlaneBox_16(int src_width, int src_height,
       boxheight = MIN1((y >> 16) - iy);
       memset(row32, 0, src_width * 4);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        ScaleAddRow(src, (uint32_t*)(row32), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row32);
@@ -849,10 +989,14 @@ static void ScalePlaneBox_16(int src_width, int src_height,
 }
 
 // Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearDown(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -865,24 +1009,16 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -907,15 +1043,22 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
-
 
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -930,13 +1073,21 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
+#endif
   if (y > max_y) {
     y = max_y;
   }
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
@@ -953,10 +1104,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
   free_aligned_buffer_64(row);
 }
 
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearDown_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -969,14 +1124,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1011,15 +1166,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
-    }
-  }
-#endif
-
 
 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1032,13 +1178,13 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
       int yf = (y >> 8) & 255;
-      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
-      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
     }
     dst_ptr += dst_stride;
     y += dy;
@@ -1050,10 +1196,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 }
 
 // Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearUp(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr,
                           enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1062,24 +1212,16 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_C : ScaleCols_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -1104,14 +1246,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_C;
@@ -1129,6 +1263,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
 #if defined(HAS_SCALECOLS_SSE2)
@@ -1136,6 +1278,11 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
       ScaleFilterCols = ScaleColsUp2_SSE2;
     }
 #endif
+#if defined(HAS_SCALECOLS_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_MMI;
+    }
+#endif
   }
 
   if (y > max_y) {
@@ -1143,13 +1290,13 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   }
   {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1189,10 +1336,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   }
 }
 
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearUp_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr,
                              enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1201,14 +1352,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1243,14 +1394,6 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_16_C;
@@ -1267,6 +1410,11 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
       ScaleFilterCols = ScaleColsUp2_16_SSE2;
     }
 #endif
+#if defined(HAS_SCALECOLS_16_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_16_MMI;
+    }
+#endif
   }
 
   if (y > max_y) {
@@ -1274,13 +1422,13 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
   }
   {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 4);
 
-    uint16* rowptr = (uint16*)row;
+    uint16_t* rowptr = (uint16_t*)row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1325,20 +1473,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_C;
+  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1348,6 +1500,11 @@ static void ScalePlaneSimple(int src_width, int src_height,
       ScaleCols = ScaleColsUp2_SSE2;
     }
 #endif
+#if defined(HAS_SCALECOLS_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_MMI;
+    }
+#endif
   }
 
   for (i = 0; i < dst_height; ++i) {
@@ -1357,20 +1514,24 @@ static void ScalePlaneSimple(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneSimple_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneSimple_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_16_C;
+  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_16_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1380,11 +1541,15 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
       ScaleCols = ScaleColsUp2_16_SSE2;
     }
 #endif
+#if defined(HAS_SCALECOLS_16_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_16_MMI;
+    }
+#endif
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1394,14 +1559,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
 // This function dispatches to a specialized scaler based on scale factor.
 
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1420,46 +1589,42 @@ void ScalePlane(const uint8* src, int src_stride,
   if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       dst_width, dst_height,
-                       src_stride, dst_stride, src, dst,
-                       0, 0, dy, 1, filtering);
+    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
         (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+                  dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1472,19 +1637,23 @@ void ScalePlane(const uint8* src, int src_stride,
                            src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+                   dst_stride, src, dst);
 }
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                  int src_width, int src_height,
-                  uint16* dst, int dst_stride,
-                  int dst_width, int dst_height,
-                  enum FilterMode filtering) {
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1500,19 +1669,16 @@ void ScalePlane_16(const uint16* src, int src_stride,
     CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
     return;
   }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
-    // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1525,15 +1691,14 @@ void ScalePlane_16(const uint16* src, int src_stride,
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
       ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+        (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
       ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
@@ -1541,8 +1706,8 @@ void ScalePlane_16(const uint16* src, int src_stride,
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst);
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+                     dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1555,132 +1720,213 @@ void ScalePlane_16(const uint16* src, int src_stride,
                               src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst);
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst);
 }
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+             dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+             dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
-                dst_y, dst_stride_y, dst_width, dst_height,
-                filtering);
-  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
-                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-                filtering);
-  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
-                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-                filtering);
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+                dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+                dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
-// Deprecated api
+// Scale an I444 image.
+// This function in turn calls a scaling function for each plane.
+
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
-          LIBYUV_BOOL interpolate) {
-  return I420Scale(src_y, src_stride_y,
-                   src_u, src_stride_u,
-                   src_v, src_stride_v,
-                   src_width, src_height,
-                   dst_y, dst_stride_y,
-                   dst_u, dst_stride_u,
-                   dst_v, dst_stride_v,
-                   dst_width, dst_height,
-                   interpolate ? kFilterBox : kFilterNone);
+int I444Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+             dst_width, dst_height, filtering);
+  return 0;
 }
 
-// Deprecated api
 LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate) {
-  // Chroma requires offset to multiple of 2.
-  int dst_yoffset_even = dst_yoffset & ~1;
+int I444Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering) {
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+                dst_width, dst_height, filtering);
+  return 0;
+}
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
-  const uint8* src_y = src;
-  const uint8* src_u = src + src_width * src_height;
-  const uint8* src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset_even * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
-      dst_yoffset_even >= dst_height) {
+  if (!src_y || !src_uv || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  return I420Scale(src_y, src_width,
-                   src_u, src_halfwidth,
-                   src_v, src_halfwidth,
-                   src_width, src_height,
-                   dst_y, dst_width,
-                   dst_u, dst_halfwidth,
-                   dst_v, dst_halfwidth,
-                   dst_width, aheight,
-                   interpolate ? kFilterBox : kFilterNone);
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+          dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+  return 0;
+}
+
+// Deprecated api
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, interpolate ? kFilterBox : kFilterNone);
 }
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/scale_any.cc b/media/libaom/src/third_party/libyuv/source/scale_any.cc
index 2f6a2c8baf..c93d70c5fc 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_any.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_any.cc
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <string.h>  // For memset/memcpy
+
 #include "libyuv/scale.h"
 #include "libyuv/scale_row.h"
 
@@ -18,165 +20,532 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 int dst_width, int x, int dx) {                               \
-      int n = dst_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
-      }                                                                        \
-      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
-             dst_width & MASK, x + n * dx, dx);                                \
-    }
+// Fixed scale down.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */          \
+    int n = dst_width - r;                                                     \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r);                                      \
+  }
 
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+// Fixed scale down for odd source width.  Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */    \
+    int n = (dst_width - 1) - r;                                               \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r + 1);                                  \
+  }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+      ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
 #endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C, 4, 3)
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+      ScaleUVRowDown2Box_SSSE3,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      4)
 #endif
-#undef CANY
-
-// Fixed scale down.
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
-
-#ifdef HAS_SCALEROWDOWN2_SSE2
-SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
-      2, 1, 15)
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+      ScaleUVRowDown2Box_AVX2,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      8)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
-      ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
-      2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+      ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN2_NEON
 SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+      ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+      ScaleUVRowDown2Box_NEON,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      8)
 #endif
-#ifdef HAS_SCALEROWDOWN4_SSE2
-SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
-      4, 1, 7)
+
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+      ScaleRowDown2Linear_MSA,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+      ScaleRowDown2Box_MSA,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MMI
+SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
+SDANY(ScaleRowDown2Linear_Any_MMI,
+      ScaleRowDown2Linear_MMI,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      7)
+SDANY(ScaleRowDown2Box_Any_MMI,
+      ScaleRowDown2Box_MMI,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      7)
+SDODD(ScaleRowDown2Box_Odd_MMI,
+      ScaleRowDown2Box_MMI,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+      ScaleRowDown4Box_SSSE3,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
 #ifdef HAS_SCALEROWDOWN4_AVX2
 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
-      4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+      ScaleRowDown4Box_AVX2,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN4_NEON
 SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+      ScaleRowDown4Box_NEON,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+      ScaleRowDown4Box_MSA,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MMI
+SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_MMI,
+      ScaleRowDown4Box_MMI,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
 #ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_SSSE3,
+      ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+      ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+      ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_NEON,
+      ScaleRowDown34_NEON,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+      ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+      ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+      ScaleRowDown34_MSA,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+      ScaleRowDown34_0_Box_MSA,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+      ScaleRowDown34_1_Box_MSA,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      47)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MMI
+SDANY(ScaleRowDown34_Any_MMI,
+      ScaleRowDown34_MMI,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_Any_SSSE3,
+      ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+      ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+      ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      5)
 #endif
 #ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_Any_NEON,
+      ScaleRowDown38_NEON,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+      ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+      ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+      ScaleRowDown38_MSA,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+      ScaleRowDown38_3_Box_MSA,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+      ScaleRowDown38_2_Box_MSA,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
 #endif
 
 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
-      ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
-      ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
-      ScaleARGBRowDown2Box_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+      ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+      ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+      ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
-      ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
-      ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
-      ScaleARGBRowDown2Box_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2_Any_NEON,
+      ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+      ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+      ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+      ScaleARGBRowDown2_MSA,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+      ScaleARGBRowDown2Linear_MSA,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+      ScaleARGBRowDown2Box_MSA,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MMI
+SDANY(ScaleARGBRowDown2_Any_MMI,
+      ScaleARGBRowDown2_MMI,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      1)
+SDANY(ScaleARGBRowDown2Linear_Any_MMI,
+      ScaleARGBRowDown2Linear_MMI,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      1)
+SDANY(ScaleARGBRowDown2Box_Any_MMI,
+      ScaleARGBRowDown2Box_MMI,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      1)
 #endif
 #undef SDANY
 
 // Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
-                     src_stepx, dst_ptr + n * BPP, r);                         \
-    }
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)       \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+               uint8_t* dst_ptr, int dst_width) {                           \
+    int r = dst_width & MASK;                                               \
+    int n = dst_width & ~MASK;                                              \
+    if (n > 0) {                                                            \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);        \
+    }                                                                       \
+    SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx,  \
+                   dst_ptr + n * BPP, r);                                   \
+  }
 
 #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+       ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+       ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+       ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+       ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+       ScaleARGBRowDownEven_MSA,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+       ScaleARGBRowDownEvenBox_MSA,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
+SDAANY(ScaleARGBRowDownEven_Any_MMI,
+       ScaleARGBRowDownEven_MMI,
+       ScaleARGBRowDownEven_C,
+       4,
+       1)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
+       ScaleARGBRowDownEvenBox_MMI,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       1)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+       ScaleUVRowDownEven_NEON,
+       ScaleUVRowDownEven_C,
+       2,
+       3)
+#endif
+
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down.  Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                      \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint16_t dst_temp[32]);                               \
+    SIMD_ALIGNED(uint8_t src_temp[32]);                                \
+    memset(dst_temp, 0, 32 * 2); /* for msan */                        \
+    int r = width & MASK;                                              \
+    int n = width & ~MASK;                                             \
+    if (n > 0) {                                                       \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                   \
+    }                                                                  \
+    memcpy(src_temp, src_ptr + n * SBPP, r * SBPP);                    \
+    memcpy(dst_temp, dst_ptr + n * BPP, r * BPP);                      \
+    ANY_SIMD(src_temp, dst_temp, MASK + 1);                            \
+    memcpy(dst_ptr + n * BPP, dst_temp, r * BPP);                      \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+#undef SAANY
+
+#else
 
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
-  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
-      int n = src_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
-      }                                                                        \
-      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
-    }
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)              \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+    int n = src_width & ~MASK;                                             \
+    if (n > 0) {                                                           \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                               \
+    }                                                                      \
+    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);             \
+  }
 
 #ifdef HAS_SCALEADDROW_SSE2
 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@@ -187,14 +556,60 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
 #ifdef HAS_SCALEADDROW_NEON
 SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
+#endif
 #undef SAANY
 
+#endif  // SASIMDONLY
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+               int dx) {                                                       \
+    int r = dst_width & MASK;                                                  \
+    int n = dst_width & ~MASK;                                                 \
+    if (n > 0) {                                                               \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                   \
+    }                                                                          \
+    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                     \
+  }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+     ScaleARGBFilterCols_MSA,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
+#undef CANY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
diff --git a/media/libaom/src/third_party/libyuv/source/scale_argb.cc b/media/libaom/src/third_party/libyuv/source/scale_argb.cc
index 40a2d1ab20..451d4ec4d1 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_argb.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_argb.cc
@@ -30,20 +30,31 @@ static __inline int Abs(int v) {
 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down a ARGB to 1/2 of
 // its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_argb, uint8* dst_argb,
-                           int x, int dx, int y, int dy,
+static void ScaleARGBDown2(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int x,
+                           int dx,
+                           int y,
+                           int dy,
                            enum FilterMode filtering) {
   int j;
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) =
-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
-        ScaleARGBRowDown2Box_C);
-  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleARGBRowDown2_C
+          : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+                                        : ScaleARGBRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
@@ -54,25 +65,65 @@ static void ScaleARGBDown2(int src_width, int src_height,
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
-        ScaleARGBRowDown2Box_Any_SSE2);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_SSE2
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+                                          : ScaleARGBRowDown2Box_Any_SSE2);
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-          ScaleARGBRowDown2Box_SSE2);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_SSE2
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+                                            : ScaleARGBRowDown2Box_SSE2);
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
-        ScaleARGBRowDown2Box_Any_NEON);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+                                          : ScaleARGBRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
-          ScaleARGBRowDown2Box_NEON);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+                                            : ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MMI
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
+                                          : ScaleARGBRowDown2Box_Any_MMI);
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MMI
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
+                                            : ScaleARGBRowDown2Box_MMI);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+                                          : ScaleARGBRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+                                            : ScaleARGBRowDown2Box_MSA);
     }
   }
 #endif
@@ -90,21 +141,32 @@ static void ScaleARGBDown2(int src_width, int src_height,
 // ScaleARGB ARGB, 1/4
 // This is an optimized version for scaling down a ARGB to 1/4 of
 // its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy) {
+static void ScaleARGBDown4Box(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy) {
   int j;
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      ScaleARGBRowDown2Box_C;
   // Advance to odd row, even column.
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -125,8 +187,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
 
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
-                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+                      dst_width * 2);
     ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
@@ -137,38 +199,67 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down a ARGB to even
 // multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy,
+static void ScaleARGBDownEven(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
                               enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
   int row_stride = (dy >> 16) * src_stride;
-  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_step, uint8* dst_argb, int dst_width) =
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
-        ScaleARGBRowDownEven_Any_SSE2;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+                                     : ScaleARGBRowDownEven_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-          ScaleARGBRowDownEven_SSE2;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
-        ScaleARGBRowDownEven_Any_NEON;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+                                     : ScaleARGBRowDownEven_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-          ScaleARGBRowDownEven_NEON;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
+                                     : ScaleARGBRowDownEven_Any_MMI;
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+                                     : ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
     }
   }
 #endif
@@ -184,25 +275,32 @@ static void ScaleARGBDownEven(int src_width, int src_height,
 }
 
 // Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  int src_stride, int dst_stride,
-                                  const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy,
+static void ScaleARGBBilinearDown(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  int x,
+                                  int dx,
+                                  int y,
+                                  int dy,
                                   enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
-  int64 xlast = x + (int64)(dst_width - 1) * dx;
-  int64 xl = (dx >= 0) ? x : xlast;
-  int64 xr = (dx >= 0) ? xlast : x;
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
   int clip_src_width;
-  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xl = (xl >> 16) & ~3;    // Left edge aligned.
+  xr = (xr >> 16) + 1;     // Right most pixel used.  Bilinear uses 2 pixels.
   xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
   if (xr > src_width) {
     xr = src_width;
@@ -210,14 +308,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
   clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
   src_argb += xl * 4;
   x -= (int)(xl << 16);
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -242,12 +332,11 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -264,6 +353,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   // Allocate a row of ARGB.
   {
@@ -275,7 +372,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
     }
     for (j = 0; j < dst_height; ++j) {
       int yi = y >> 16;
-      const uint8* src = src_argb + yi * src_stride;
+      const uint8_t* src = src_argb + yi * src_stride;
       if (filtering == kFilterLinear) {
         ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
       } else {
@@ -294,28 +391,27 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
 }
 
 // Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy,
+static void ScaleARGBBilinearUp(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8_t* src_argb,
+                                uint8_t* dst_argb,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
                                 enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -340,15 +436,25 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(dst_width, 2)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -363,6 +469,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -376,6 +490,22 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+  if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -383,6 +513,11 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+    }
+#endif
   }
 
   if (y > max_y) {
@@ -391,13 +526,13 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
 
   {
     int yi = y >> 16;
-    const uint8* src = src_argb + yi * src_stride;
+    const uint8_t* src = src_argb + yi * src_stride;
 
     // Allocate 2 rows of ARGB.
     const int kRowSize = (dst_width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -439,24 +574,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
 
 #ifdef YUVSCALEUP
 // Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
-                                     int dst_width, int dst_height,
+static void ScaleYUVToARGBBilinearUp(int src_width,
+                                     int src_height,
+                                     int dst_width,
+                                     int dst_height,
                                      int src_stride_y,
                                      int src_stride_u,
                                      int src_stride_v,
                                      int dst_stride_argb,
-                                     const uint8* src_y,
-                                     const uint8* src_u,
-                                     const uint8* src_v,
-                                     uint8* dst_argb,
-                                     int x, int dx, int y, int dy,
+                                     const uint8_t* src_y,
+                                     const uint8_t* src_u,
+                                     const uint8_t* src_v,
+                                     uint8_t* dst_argb,
+                                     int x,
+                                     int dx,
+                                     int y,
+                                     int dy,
                                      enum FilterMode filtering) {
   int j;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+      I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -481,27 +619,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MMI;
+    if (IS_ALIGNED(src_width, 4)) {
+      I422ToARGBRow = I422ToARGBRow_MMI;
+    }
   }
 #endif
-
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_SSE2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
     }
   }
 #endif
+
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -526,19 +663,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
 
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -553,6 +692,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -566,6 +713,22 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+  if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -573,6 +736,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+    }
+#endif
   }
 
   const int max_y = (src_height - 1) << 16;
@@ -582,9 +750,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
   int yi = y >> 16;
   int uv_yi = yi >> kYShift;
-  const uint8* src_row_y = src_y + yi * src_stride_y;
-  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
-  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+  const uint8_t* src_row_y = src_y + yi * src_stride_y;
+  const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
 
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -593,7 +761,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   // Allocate 1 row of ARGB for source conversion.
   align_buffer_64(argb_row, src_width * 4);
 
-  uint8* rowptr = row;
+  uint8_t* rowptr = row;
   int rowstride = kRowSize;
   int lasty = yi;
 
@@ -659,15 +827,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScaleARGBSimple(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_argb, uint8* dst_argb,
-                            int x, int dx, int y, int dy) {
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
   int j;
-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                        int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
@@ -681,6 +857,22 @@ static void ScaleARGBSimple(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleARGBCols = ScaleARGBCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -688,11 +880,16 @@ static void ScaleARGBSimple(int src_width, int src_height,
       ScaleARGBCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBColsUp2_MMI;
+    }
+#endif
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
-                  dst_width, x, dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -701,11 +898,18 @@ static void ScaleARGBSimple(int src_width, int src_height,
 // ScaleARGB a ARGB.
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
-                      int src_width, int src_height,
-                      uint8* dst, int dst_stride,
-                      int dst_width, int dst_height,
-                      int clip_x, int clip_y, int clip_width, int clip_height,
+static void ScaleARGB(const uint8_t* src,
+                      int src_stride,
+                      int src_width,
+                      int src_height,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int dst_width,
+                      int dst_height,
+                      int clip_x,
+                      int clip_y,
+                      int clip_width,
+                      int clip_height,
                       enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -714,8 +918,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
   int dy = 0;
   // ARGB does not support box filter yet, but allow the user to pass it.
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
 
   // Negative src_height means invert the image.
@@ -724,17 +927,17 @@ static void ScaleARGB(const uint8* src, int src_stride,
     src = src + (src_height - 1) * src_stride;
     src_stride = -src_stride;
   }
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   if (clip_x) {
-    int64 clipf = (int64)(clip_x) * dx;
+    int64_t clipf = (int64_t)(clip_x)*dx;
     x += (clipf & 0xffff);
     src += (clipf >> 16) * 4;
     dst += clip_x * 4;
   }
   if (clip_y) {
-    int64 clipf = (int64)(clip_y) * dy;
+    int64_t clipf = (int64_t)(clip_y)*dy;
     y += (clipf & 0xffff);
     src += (clipf >> 16) * src_stride;
     dst += clip_y * dst_stride;
@@ -749,24 +952,20 @@ static void ScaleARGB(const uint8* src, int src_stride,
       if (!(dx & 0x10000) && !(dy & 0x10000)) {
         if (dx == 0x20000) {
           // Optimized 1/2 downsample.
-          ScaleARGBDown2(src_width, src_height,
-                         clip_width, clip_height,
-                         src_stride, dst_stride, src, dst,
-                         x, dx, y, dy, filtering);
+          ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+                         src_stride, dst_stride, src, dst, x, dx, y, dy,
+                         filtering);
           return;
         }
         if (dx == 0x40000 && filtering == kFilterBox) {
           // Optimized 1/4 box downsample.
-          ScaleARGBDown4Box(src_width, src_height,
-                            clip_width, clip_height,
-                            src_stride, dst_stride, src, dst,
-                            x, dx, y, dy);
+          ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                            src_stride, dst_stride, src, dst, x, dx, y, dy);
           return;
         }
-        ScaleARGBDownEven(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+        ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
         return;
       }
       // Optimized odd scale down. ie 3, 5, 7, 9x.
@@ -782,71 +981,110 @@ static void ScaleARGB(const uint8* src, int src_stride,
     }
   }
   if (dx == 0x10000 && (x & 0xffff) == 0) {
-    // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
   if (filtering && dy < 65536) {
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
+    ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
     return;
   }
   if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+    ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
     return;
   }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                  dst_stride, src, dst, x, dx, y, dy);
 }
 
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
       clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+            clip_height, filtering);
   return 0;
 }
 
 // Scale an ARGB image.
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            0, 0, dst_width, dst_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+            filtering);
   return 0;
 }
 
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
+                       enum FilterMode filtering) {
+  uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
+  int r;
+  (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
+  (void)dst_fourcc;
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
+
+  r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+                    dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+                    clip_width, clip_height, filtering);
+  free(argb_buffer);
+  return r;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/scale_common.cc b/media/libaom/src/third_party/libyuv/source/scale_common.cc
index 1711f3d54c..fd4cbd0386 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_common.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_common.cc
@@ -28,9 +28,12 @@ static __inline int Abs(int v) {
 }
 
 // CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  const uint8_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  const uint16_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -86,10 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -103,10 +117,36 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  dst_width -= 1;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst += 1;
+    s += 2;
+    t += 2;
+  }
+  dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -120,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -134,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -148,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -233,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -247,19 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -269,19 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -292,19 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -314,19 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -337,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx) {
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -352,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -368,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -381,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -395,11 +475,20 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+#if defined(__arm__) || defined(__aarch64__)
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif
 
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -422,12 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -440,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -448,11 +540,17 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
 }
 #undef BLENDER
 
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+// Same as 8 bit arm blender but return is cast to uint16_t
+#define BLENDER(a, b, f) \
+  (uint16_t)(            \
+      (int)(a) +         \
+      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -475,12 +573,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
   }
 }
 
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -493,7 +594,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -501,9 +602,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 #undef BLENDER
 
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -514,9 +618,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -528,100 +635,118 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // 8x3 -> 3x1
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
 // 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -635,7 +760,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   }
 }
 
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -649,13 +776,16 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
   }
 }
 
-void ScaleARGBRowDown2_C(const uint8* src_argb,
-                         ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+// ARGB scale row functions
 
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_argb,
+                         int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[1];
     dst[1] = src[3];
@@ -667,10 +797,12 @@ void ScaleARGBRowDown2_C(const uint8* src_argb,
   }
 }
 
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
     dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@@ -681,29 +813,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
   }
 }
 
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += 8;
     dst_argb += 4;
   }
 }
 
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  (void)src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
@@ -716,30 +856,38 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
   }
 }
 
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += src_stepx * 4;
     dst_argb += 4;
   }
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -753,11 +901,14 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
   }
 }
 
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -772,11 +923,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[1] = dst[0] = src[0];
     src += 1;
@@ -787,24 +943,257 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
   }
 }
 
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+  (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f)                                                 \
+  BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+      BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64_t xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int64_t xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// UV scale row functions
+// same as ARGB but 2 channels
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_uv,
+                       int dst_width) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[1];
+    dst[1] = src[3];
+    src += 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[1];
+  }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+    dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+    src_uv += 4;
+    dst_uv += 2;
+  }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+                 src_uv[src_stride + 2] + 2) >>
+                2;
+    dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+                 src_uv[src_stride + 3] + 2) >>
+                2;
+    src_uv += 4;
+    dst_uv += 2;
+  }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          int src_stepx,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  (void)src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[0];
+    dst[1] = src[src_stepx];
+    src += src_stepx * 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+                 src_uv[src_stride + 2] + 2) >>
+                2;
+    dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+                 src_uv[src_stride + 3] + 2) >>
+                2;
+    src_uv += src_stepx * 2;
+    dst_uv += 2;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+                   const uint8_t* src_uv,
+                   int dst_width,
+                   int x,
+                   int dx) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+                     const uint8_t* src_uv,
+                     int dst_width,
+                     int x32,
+                     int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+                      const uint8_t* src_uv,
+                      int dst_width,
+                      int x,
+                      int dx) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  int j;
+  (void)x;
+  (void)dx;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[1] = dst[0] = src[0];
+    src += 1;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+  (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
 
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+                         const uint8_t* src_uv,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint16_t a = src[xi];
+    uint16_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -818,23 +1207,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
   if (dst_width & 1) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint16_t a = src[xi];
+    uint16_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
 
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+                           const uint8_t* src_uv,
+                           int dst_width,
+                           int x32,
+                           int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint16_t a = src[xi];
+    uint16_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -846,10 +1238,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
     dst += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint16_t a = src[xi];
+    uint16_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
@@ -859,16 +1251,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
 
 // Scale plane vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering) {
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher bpp.
   int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(bpp >= 1 && bpp <= 4);
@@ -876,14 +1274,6 @@ void ScalePlaneVertical(int src_height,
   assert(dst_width > 0);
   assert(dst_height > 0);
   src_argb += (x >> 16) * bpp;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -908,13 +1298,19 @@ void ScalePlaneVertical(int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(dst_width_bytes, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -926,23 +1322,29 @@ void ScalePlaneVertical(int src_height,
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_bytes, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_bytes, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering) {
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher wpp.
   int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
+  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(wpp >= 1 && wpp <= 2);
@@ -982,16 +1384,6 @@ void ScalePlaneVertical_16(int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
-    }
-  }
-#endif
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
@@ -1000,16 +1392,18 @@ void ScalePlaneVertical_16(int src_height,
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_words, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering) {
   if (src_width < 0) {
     src_width = -src_width;
@@ -1051,22 +1445,26 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div) {
-  return (int)(((int64)(num) << 16) / div);
+  return (int)(((int64_t)(num) << 16) / div);
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv1_C(int num, int div) {
-  return (int)((((int64)(num) << 16) - 0x00010001) /
-                          (div - 1));
+  return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
 }
 
 #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy) {
+                int* x,
+                int* y,
+                int* dx,
+                int* dy) {
   assert(x != NULL);
   assert(y != NULL);
   assert(dx != NULL);
@@ -1098,7 +1496,7 @@ void ScaleSlope(int src_width, int src_height,
       *x = 0;
     }
     if (dst_height <= src_height) {
-      *dy = FixedDiv(src_height,  dst_height);
+      *dy = FixedDiv(src_height, dst_height);
       *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
     } else if (dst_height > 1) {
       *dy = FixedDiv1(src_height, dst_height);
@@ -1131,6 +1529,35 @@ void ScaleSlope(int src_width, int src_height,
 }
 #undef CENTERSTART
 
+// Read 8x2 upsample with filtering and write 16x1.
+// actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16_t* dst,
+                      int dst_width) {
+  const uint16_t* src2 = src_ptr + src_stride;
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libaom/src/third_party/libyuv/source/scale_gcc.cc b/media/libaom/src/third_party/libyuv/source/scale_gcc.cc
index 8a6ac54592..e575ee18bc 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_gcc.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_gcc.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -16,1071 +17,1445 @@ extern "C" {
 #endif
 
 // This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // GCC versions of row functions are verbatim conversions from Visual C.
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "psrlw       $0x8,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pavgw       %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x1,%%xmm0                   \n"
+      "psrlw       $0x1,%%xmm1                   \n"
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pavgw       %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    "pslld     $0x10,%%xmm5                    \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  intptr_t stridex3 = 0;
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0x8,%%xmm7                     \n"
-    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
-    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4
-    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm4,%%xmm2                   \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm5,%%xmm3                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "pand      %%xmm7,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(stridex3)     // %3
-  : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-  );
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "palignr   $0x8,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
+      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "psrld       $0x18,%%xmm5                  \n"
+      "pslld       $0x10,%%xmm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "psrlw       $0x8,%%xmm0                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  intptr_t stridex3;
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrlw       $0xf,%%xmm4                   \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "psllw       $0x3,%%xmm5                   \n"
+      "lea         0x00(%4,%4,2),%3              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
+      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "phaddw      %%xmm1,%%xmm0                 \n"
+      "paddw       %%xmm5,%%xmm0                 \n"
+      "psrlw       $0x4,%%xmm0                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width),             // %2
+        "=&r"(stridex3)              // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
+      "vpslld      $0x10,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
+      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
+      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
+      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),                   // %0
+        "+r"(dst_ptr),                   // %1
+        "+r"(dst_width)                  // %2
+      : "r"((intptr_t)(src_stride)),     // %3
+        "r"((intptr_t)(src_stride * 3))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa      %0,%%xmm3                     \n"
+      "movdqa      %1,%%xmm4                     \n"
+      "movdqa      %2,%%xmm5                     \n"
+      :
+      : "m"(kShuf0),  // %0
+        "m"(kShuf1),  // %1
+        "m"(kShuf2)   // %2
   );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm2               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "palignr     $0x8,%%xmm0,%%xmm1            \n"
+      "pshufb      %%xmm3,%%xmm0                 \n"
+      "pshufb      %%xmm4,%%xmm1                 \n"
+      "pshufb      %%xmm5,%%xmm2                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movq        %%xmm1,0x8(%1)                \n"
+      "movq        %%xmm2,0x10(%1)               \n"
+      "lea         0x18(%1),%1                   \n"
+      "sub         $0x18,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa      %0,%%xmm2                     \n"  // kShuf01
+      "movdqa      %1,%%xmm3                     \n"  // kShuf11
+      "movdqa      %2,%%xmm4                     \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
   );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "m"(kMadd21)     // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  asm volatile(
+      "movdqa      %0,%%xmm5                     \n"  // kMadd01
+      "movdqa      %1,%%xmm0                     \n"  // kMadd11
+      "movdqa      %2,%%xmm1                     \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
   );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm5,%%xmm6                 \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,(%1)                   \n"
+      "movdqu      0x8(%0),%%xmm6                \n"
+      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm3,%%xmm6                 \n"
+      "pmaddubsw   %%xmm0,%%xmm6                 \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,0x8(%1)                \n"
+      "movdqu      0x10(%0),%%xmm6               \n"
+      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm4,%%xmm6                 \n"
+      "pmaddubsw   %4,%%xmm6                     \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,0x10(%1)               \n"
+      "lea         0x18(%1),%1                   \n"
+      "sub         $0x18,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa      %0,%%xmm2                     \n"  // kShuf01
+      "movdqa      %1,%%xmm3                     \n"  // kShuf11
+      "movdqa      %2,%%xmm4                     \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
   );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
+  asm volatile(
+      "movdqa      %0,%%xmm5                     \n"  // kMadd01
+      "movdqa      %1,%%xmm0                     \n"  // kMadd11
+      "movdqa      %2,%%xmm1                     \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
   );
 
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-    : "+r"(src_ptr),   // %0
-      "+r"(dst_ptr),   // %1
-      "+r"(dst_width)  // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "m"(kMadd21)     // %4
-    : "memory", "cc", NACL_R14
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
+      "pavgb       %%xmm6,%%xmm7                 \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm2,%%xmm6                 \n"
+      "pmaddubsw   %%xmm5,%%xmm6                 \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,(%1)                   \n"
+      "movdqu      0x8(%0),%%xmm6                \n"
+      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
+      "pavgb       %%xmm6,%%xmm7                 \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm3,%%xmm6                 \n"
+      "pmaddubsw   %%xmm0,%%xmm6                 \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,0x8(%1)                \n"
+      "movdqu      0x10(%0),%%xmm6               \n"
+      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pavgb       %%xmm6,%%xmm7                 \n"
+      "pavgb       %%xmm7,%%xmm6                 \n"
+      "pshufb      %%xmm4,%%xmm6                 \n"
+      "pmaddubsw   %4,%%xmm6                     \n"
+      "paddsw      %%xmm1,%%xmm6                 \n"
+      "psrlw       $0x2,%%xmm6                   \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movq        %%xmm6,0x10(%1)               \n"
+      "lea         0x18(%1),%1                   \n"
+      "sub         $0x18,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "lea       " MEMLEA(0xc,1) ",%1            \n"
-    "sub       $0xc,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "m"(kShuf38a),   // %3
-    "m"(kShuf38b)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa      %3,%%xmm4                     \n"
+      "movdqa      %4,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "paddusb     %%xmm1,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "movhlps     %%xmm0,%%xmm1                 \n"
+      "movd        %%xmm1,0x8(%1)                \n"
+      "lea         0xc(%1),%1                    \n"
+      "sub         $0xc,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "m"(kShuf38a),   // %3
+        "m"(kShuf38b)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pshufb    %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "paddusw   %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "paddusw   %%xmm0,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm1                    \n"
-    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride))  // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa      %0,%%xmm2                     \n"
+      "movdqa      %1,%%xmm3                     \n"
+      "movdqa      %2,%%xmm4                     \n"
+      "movdqa      %3,%%xmm5                     \n"
+      :
+      : "m"(kShufAb0),  // %0
+        "m"(kShufAb1),  // %1
+        "m"(kShufAb2),  // %2
+        "m"(kScaleAb2)  // %3
   );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "pavgb       %%xmm1,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pshufb      %%xmm2,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm6                 \n"
+      "pshufb      %%xmm3,%%xmm6                 \n"
+      "paddusw     %%xmm6,%%xmm1                 \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "paddusw     %%xmm0,%%xmm1                 \n"
+      "pmulhuw     %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movd        %%xmm1,(%1)                   \n"
+      "psrlq       $0x10,%%xmm1                  \n"
+      "movd        %%xmm1,0x2(%1)                \n"
+      "lea         0x6(%1),%1                    \n"
+      "sub         $0x6,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "pshufb    %%xmm3,%%xmm7                   \n"
-    "paddusw   %%xmm7,%%xmm6                   \n"
-    "pmulhuw   %%xmm4,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movd      %%xmm6," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm6                    \n"
-    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa      %0,%%xmm2                     \n"
+      "movdqa      %1,%%xmm3                     \n"
+      "movdqa      %2,%%xmm4                     \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+      :
+      : "m"(kShufAc),    // %0
+        "m"(kShufAc3),   // %1
+        "m"(kScaleAc33)  // %2
   );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
+      "movhlps     %%xmm0,%%xmm1                 \n"
+      "movhlps     %%xmm6,%%xmm7                 \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "punpcklbw   %%xmm5,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm6                 \n"
+      "punpcklbw   %%xmm5,%%xmm7                 \n"
+      "paddusw     %%xmm6,%%xmm0                 \n"
+      "paddusw     %%xmm7,%%xmm1                 \n"
+      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
+      "lea         0x10(%0),%0                   \n"
+      "movhlps     %%xmm6,%%xmm7                 \n"
+      "punpcklbw   %%xmm5,%%xmm6                 \n"
+      "punpcklbw   %%xmm5,%%xmm7                 \n"
+      "paddusw     %%xmm6,%%xmm0                 \n"
+      "paddusw     %%xmm7,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm6                 \n"
+      "psrldq      $0x2,%%xmm0                   \n"
+      "paddusw     %%xmm0,%%xmm6                 \n"
+      "psrldq      $0x2,%%xmm0                   \n"
+      "paddusw     %%xmm0,%%xmm6                 \n"
+      "pshufb      %%xmm2,%%xmm6                 \n"
+      "movdqa      %%xmm1,%%xmm7                 \n"
+      "psrldq      $0x2,%%xmm1                   \n"
+      "paddusw     %%xmm1,%%xmm7                 \n"
+      "psrldq      $0x2,%%xmm1                   \n"
+      "paddusw     %%xmm1,%%xmm7                 \n"
+      "pshufb      %%xmm3,%%xmm7                 \n"
+      "paddusw     %%xmm7,%%xmm6                 \n"
+      "pmulhuw     %%xmm4,%%xmm6                 \n"
+      "packuswb    %%xmm6,%%xmm6                 \n"
+      "movd        %%xmm6,(%1)                   \n"
+      "psrlq       $0x10,%%xmm6                  \n"
+      "movd        %%xmm6,0x2(%1)                \n"
+      "lea         0x6(%1),%1                    \n"
+      "sub         $0x6,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 // Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width, int src_height) {
-  int tmp_height = 0;
-  intptr_t tmp_src = 0;
-  asm volatile (
-    "mov       %0,%3                           \n"  // row pointer
-    "mov       %5,%2                           \n"  // height
-    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(3) ",%%xmm2         \n"
-    "add       %6,%3                           \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "paddusw   %%xmm2,%%xmm0                   \n"
-    "paddusw   %%xmm3,%%xmm1                   \n"
-    "sub       $0x1,%2                         \n"
-    "jg        1b                              \n"
-
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
-    "mov       %0,%3                           \n"  // row pointer
-    "mov       %5,%2                           \n"  // height
-    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x10,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(tmp_height),  // %2
-    "+r"(tmp_src),     // %3
-    "+r"(src_width),   // %4
-    "+rm"(src_height)  // %5
-  : "rm"((intptr_t)(src_stride))  // %6
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm3                   \n"
+      "lea         0x10(%0),%0                   \n"  // src_ptr += 16
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x10(%1),%%xmm1               \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpckhbw   %%xmm5,%%xmm3                 \n"
+      "paddusw     %%xmm2,%%xmm0                 \n"
+      "paddusw     %%xmm3,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm3                   \n"
+      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
+      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
+      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
+      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
+
 // Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
-  asm volatile (
-    "movd      %6,%%xmm2                       \n"
-    "movd      %7,%%xmm3                       \n"
-    "movl      $0x04040000,%k2                 \n"
-    "movd      %k2,%%xmm5                      \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "subl      $0x2,%5                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
-    "movd      %k2,%%xmm4                      \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "punpcklwd %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,%k2                      \n"
-    "mov       %w2," MEMACCESS(0) "            \n"
-    "lea       " MEMLEA(0x2,0) ",%0            \n"
-    "sub       $0x2,%5                         \n"
-    "jge       2b                              \n"
-
-    LABELALIGN
-  "29:                                         \n"
-    "addl      $0x1,%5                         \n"
-    "jl        99f                             \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,%k2                      \n"
-    "mov       %b2," MEMACCESS(0) "            \n"
-  "99:                                         \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+a"(temp_pixel),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1),          // %4
-    "+rm"(dst_width)   // %5
-  : "rm"(x),           // %6
-    "rm"(dx)           // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  intptr_t x0, x1, temp_pixel;
+  asm volatile(
+      "movd        %6,%%xmm2                     \n"
+      "movd        %7,%%xmm3                     \n"
+      "movl        $0x04040000,%k2               \n"
+      "movd        %k2,%%xmm5                    \n"
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psrlw       $15,%%xmm7                    \n"  // 0x00010001
+
+      "pextrw      $0x1,%%xmm2,%k3               \n"
+      "subl        $0x2,%5                       \n"
+      "jl          29f                           \n"
+      "movdqa      %%xmm2,%%xmm0                 \n"
+      "paddd       %%xmm3,%%xmm0                 \n"
+      "punpckldq   %%xmm0,%%xmm2                 \n"
+      "punpckldq   %%xmm3,%%xmm3                 \n"
+      "paddd       %%xmm3,%%xmm3                 \n"
+      "pextrw      $0x3,%%xmm2,%k4               \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "paddd       %%xmm3,%%xmm2                 \n"
+      "movzwl      0x00(%1,%3,1),%k2             \n"
+      "movd        %k2,%%xmm0                    \n"
+      "psrlw       $0x9,%%xmm1                   \n"
+      "movzwl      0x00(%1,%4,1),%k2             \n"
+      "movd        %k2,%%xmm4                    \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "punpcklwd   %%xmm4,%%xmm0                 \n"
+      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
+      "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
+                                                      // 1
+      "paddusb     %%xmm7,%%xmm1                 \n"
+      "pmaddubsw   %%xmm0,%%xmm1                 \n"
+      "pextrw      $0x1,%%xmm2,%k3               \n"
+      "pextrw      $0x3,%%xmm2,%k4               \n"
+      "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
+      "psrlw       $0x7,%%xmm1                   \n"
+      "packuswb    %%xmm1,%%xmm1                 \n"
+      "movd        %%xmm1,%k2                    \n"
+      "mov         %w2,(%0)                      \n"
+      "lea         0x2(%0),%0                    \n"
+      "subl        $0x2,%5                       \n"
+      "jge         2b                            \n"
+
+      LABELALIGN
+      "29:                                       \n"
+      "addl        $0x1,%5                       \n"
+      "jl          99f                           \n"
+      "movzwl      0x00(%1,%3,1),%k2             \n"
+      "movd        %k2,%%xmm0                    \n"
+      "psrlw       $0x9,%%xmm2                   \n"
+      "pshufb      %%xmm5,%%xmm2                 \n"
+      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
+      "pxor        %%xmm6,%%xmm2                 \n"
+      "paddusb     %%xmm7,%%xmm2                 \n"
+      "pmaddubsw   %%xmm0,%%xmm2                 \n"
+      "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
+      "psrlw       $0x7,%%xmm2                   \n"
+      "packuswb    %%xmm2,%%xmm2                 \n"
+      "movd        %%xmm2,%k2                    \n"
+      "mov         %b2,(%0)                      \n"
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "=&a"(temp_pixel),  // %2
+        "=&r"(x0),          // %3
+        "=&r"(x1),          // %4
+#if defined(__x86_64__)
+        "+rm"(dst_width)  // %5
+#else
+        "+m"(dst_width)  // %5
+#endif
+      : "rm"(x),   // %6
+        "rm"(dx),  // %7
+#if defined(__x86_64__)
+        "x"(kFsub80),  // %8
+        "x"(kFadd40)   // %9
+#else
+        "m"(kFsub80),    // %8
+        "m"(kFadd40)     // %9
+#endif
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x20,%2                         \n"
-    "jg        1b                              \n"
-
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%0)                   \n"
+      "movdqu      %%xmm1,0x10(%0)               \n"
+      "lea         0x20(%0),%0                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+
+      : "+r"(dst_ptr),   // %0
+        "+r"(src_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                             ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "shufps      $0xdd,%%xmm1,%%xmm0           \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                   ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "pavgb       %%xmm3,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),              // %0
+        "+r"(dst_argb),              // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movd      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
-    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "punpckldq %%xmm3,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(src_stepx_x4),  // %1
-    "+r"(dst_argb),      // %2
-    "+r"(dst_width),     // %3
-    "+r"(src_stepx_x12)  // %4
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+  intptr_t src_stepx_x12;
+  (void)src_stride;
+  asm volatile(
+      "lea         0x00(,%1,4),%1                \n"
+      "lea         0x00(%1,%1,2),%4              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd        (%0),%%xmm0                   \n"
+      "movd        0x00(%0,%1,1),%%xmm1          \n"
+      "punpckldq   %%xmm1,%%xmm0                 \n"
+      "movd        0x00(%0,%1,2),%%xmm2          \n"
+      "movd        0x00(%0,%4,1),%%xmm3          \n"
+      "lea         0x00(%0,%1,4),%0              \n"
+      "punpckldq   %%xmm3,%%xmm2                 \n"
+      "punpcklqdq  %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),       // %0
+        "+r"(src_stepx_x4),   // %1
+        "+r"(dst_argb),       // %2
+        "+r"(dst_width),      // %3
+        "=&r"(src_stepx_x12)  // %4
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride, int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
-  intptr_t src_stepx_x12 = 0;
+  intptr_t src_stepx_x12;
   intptr_t row1 = (intptr_t)(src_stride);
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
-    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
-    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "movq      " MEMACCESS(5) ",%%xmm2         \n"
-    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
-    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
-    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_argb),       // %2
-    "+rm"(dst_width),     // %3
-    "+r"(src_stepx_x12),  // %4
-    "+r"(row1)            // %5
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+  asm volatile(
+      "lea         0x00(,%1,4),%1                \n"
+      "lea         0x00(%1,%1,2),%4              \n"
+      "lea         0x00(%0,%5,1),%5              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"
+      "movhps      0x00(%0,%1,1),%%xmm0          \n"
+      "movq        0x00(%0,%1,2),%%xmm1          \n"
+      "movhps      0x00(%0,%4,1),%%xmm1          \n"
+      "lea         0x00(%0,%1,4),%0              \n"
+      "movq        (%5),%%xmm2                   \n"
+      "movhps      0x00(%5,%1,1),%%xmm2          \n"
+      "movq        0x00(%5,%1,2),%%xmm3          \n"
+      "movhps      0x00(%5,%4,1),%%xmm3          \n"
+      "lea         0x00(%5,%1,4),%5              \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "pavgb       %%xmm3,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "shufps      $0x88,%%xmm1,%%xmm0           \n"
+      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
+      "pavgb       %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_argb),        // %0
+        "+r"(src_stepx_x4),    // %1
+        "+r"(dst_argb),        // %2
+        "+rm"(dst_width),      // %3
+        "=&r"(src_stepx_x12),  // %4
+        "+r"(row1)             // %5
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "cmp       $0x0,%4                         \n"
-    "jl        99f                             \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
-
-    LABELALIGN
-  "40:                                         \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "pextrw    $0x7,%%xmm2,%k1                 \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
-    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "punpckldq %%xmm4,%%xmm1                   \n"
-    "punpcklqdq %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
-
-  "49:                                         \n"
-    "test      $0x2,%4                         \n"
-    "je        29f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x8,2) ",%2            \n"
-  "29:                                         \n"
-    "test      $0x1,%4                         \n"
-    "je        99f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-  "99:                                         \n"
-  : "+a"(x0),          // %0
-    "+d"(x1),          // %1
-    "+r"(dst_argb),    // %2
-    "+r"(src_argb),    // %3
-    "+r"(dst_width)    // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  intptr_t x0, x1;
+  asm volatile(
+      "movd        %5,%%xmm2                     \n"
+      "movd        %6,%%xmm3                     \n"
+      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
+      "pshufd      $0x11,%%xmm3,%%xmm0           \n"
+      "paddd       %%xmm0,%%xmm2                 \n"
+      "paddd       %%xmm3,%%xmm3                 \n"
+      "pshufd      $0x5,%%xmm3,%%xmm0            \n"
+      "paddd       %%xmm0,%%xmm2                 \n"
+      "paddd       %%xmm3,%%xmm3                 \n"
+      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
+      "pextrw      $0x1,%%xmm2,%k0               \n"
+      "pextrw      $0x3,%%xmm2,%k1               \n"
+      "cmp         $0x0,%4                       \n"
+      "jl          99f                           \n"
+      "sub         $0x4,%4                       \n"
+      "jl          49f                           \n"
+
+      LABELALIGN
+      "40:                                       \n"
+      "movd        0x00(%3,%0,4),%%xmm0          \n"
+      "movd        0x00(%3,%1,4),%%xmm1          \n"
+      "pextrw      $0x5,%%xmm2,%k0               \n"
+      "pextrw      $0x7,%%xmm2,%k1               \n"
+      "paddd       %%xmm3,%%xmm2                 \n"
+      "punpckldq   %%xmm1,%%xmm0                 \n"
+      "movd        0x00(%3,%0,4),%%xmm1          \n"
+      "movd        0x00(%3,%1,4),%%xmm4          \n"
+      "pextrw      $0x1,%%xmm2,%k0               \n"
+      "pextrw      $0x3,%%xmm2,%k1               \n"
+      "punpckldq   %%xmm4,%%xmm1                 \n"
+      "punpcklqdq  %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%4                       \n"
+      "jge         40b                           \n"
+
+      "49:                                       \n"
+      "test        $0x2,%4                       \n"
+      "je          29f                           \n"
+      "movd        0x00(%3,%0,4),%%xmm0          \n"
+      "movd        0x00(%3,%1,4),%%xmm1          \n"
+      "pextrw      $0x5,%%xmm2,%k0               \n"
+      "punpckldq   %%xmm1,%%xmm0                 \n"
+      "movq        %%xmm0,(%2)                   \n"
+      "lea         0x8(%2),%2                    \n"
+      "29:                                       \n"
+      "test        $0x1,%4                       \n"
+      "je          99f                           \n"
+      "movd        0x00(%3,%0,4),%%xmm0          \n"
+      "movd        %%xmm0,(%2)                   \n"
+      "99:                                       \n"
+      : "=&a"(x0),       // %0
+        "=&d"(x1),       // %1
+        "+r"(dst_argb),  // %2
+        "+r"(src_argb),  // %3
+        "+r"(dst_width)  // %4
+      : "rm"(x),         // %5
+        "rm"(dx)         // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpckldq %%xmm0,%%xmm0                   \n"
-    "punpckhdq %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpckldq   %%xmm0,%%xmm0                 \n"
+      "punpckhdq   %%xmm1,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%0)                   \n"
+      "movdqu      %%xmm1,0x10(%0)               \n"
+      "lea         0x20(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+
+      : "+r"(dst_argb),  // %0
+        "+r"(src_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
-  intptr_t x0 = 0, x1 = 0;
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm5                       \n"
-  :
-  : "m"(kShuffleColARGB),  // %0
-    "m"(kShuffleFractions)  // %1
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx) {
+  intptr_t x0, x1;
+  asm volatile(
+      "movdqa      %0,%%xmm4                     \n"
+      "movdqa      %1,%%xmm5                     \n"
+      :
+      : "m"(kShuffleColARGB),   // %0
+        "m"(kShuffleFractions)  // %1
   );
 
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "sub       $0x2,%2                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x2,%2                         \n"
-    "jge       2b                              \n"
-
-    LABELALIGN
-  "29:                                         \n"
-    "add       $0x1,%2                         \n"
-    "jl        99f                             \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(0) "         \n"
-
-    LABELALIGN
-  "99:                                         \n"
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+rm"(dst_width),  // %2
-    "+r"(x0),          // %3
-    "+r"(x1)           // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+  asm volatile(
+      "movd        %5,%%xmm2                     \n"
+      "movd        %6,%%xmm3                     \n"
+      "pcmpeqb     %%xmm6,%%xmm6                 \n"
+      "psrlw       $0x9,%%xmm6                   \n"
+      "pextrw      $0x1,%%xmm2,%k3               \n"
+      "sub         $0x2,%2                       \n"
+      "jl          29f                           \n"
+      "movdqa      %%xmm2,%%xmm0                 \n"
+      "paddd       %%xmm3,%%xmm0                 \n"
+      "punpckldq   %%xmm0,%%xmm2                 \n"
+      "punpckldq   %%xmm3,%%xmm3                 \n"
+      "paddd       %%xmm3,%%xmm3                 \n"
+      "pextrw      $0x3,%%xmm2,%k4               \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "paddd       %%xmm3,%%xmm2                 \n"
+      "movq        0x00(%1,%3,4),%%xmm0          \n"
+      "psrlw       $0x9,%%xmm1                   \n"
+      "movhps      0x00(%1,%4,4),%%xmm0          \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "pxor        %%xmm6,%%xmm1                 \n"
+      "pmaddubsw   %%xmm1,%%xmm0                 \n"
+      "psrlw       $0x7,%%xmm0                   \n"
+      "pextrw      $0x1,%%xmm2,%k3               \n"
+      "pextrw      $0x3,%%xmm2,%k4               \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%0)                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "sub         $0x2,%2                       \n"
+      "jge         2b                            \n"
+
+      LABELALIGN
+      "29:                                       \n"
+      "add         $0x1,%2                       \n"
+      "jl          99f                           \n"
+      "psrlw       $0x9,%%xmm2                   \n"
+      "movq        0x00(%1,%3,4),%%xmm0          \n"
+      "pshufb      %%xmm5,%%xmm2                 \n"
+      "pshufb      %%xmm4,%%xmm0                 \n"
+      "pxor        %%xmm6,%%xmm2                 \n"
+      "pmaddubsw   %%xmm2,%%xmm0                 \n"
+      "psrlw       $0x7,%%xmm0                   \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movd        %%xmm0,(%0)                   \n"
+
+      LABELALIGN
+      "99:                                       \n"  // clang-format error.
+
+      : "+r"(dst_argb),    // %0
+        "+r"(src_argb),    // %1
+        "+rm"(dst_width),  // %2
+        "=&r"(x0),         // %3
+        "=&r"(x1)          // %4
+      : "rm"(x),           // %5
+        "rm"(dx)           // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld        $0x10,%%eax,%%edx             \n"
+      "shl         $0x10,%%eax                   \n"
+      "idiv        %1                            \n"
+      "mov         %0, %%eax                     \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
 
 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
 int FixedDiv1_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "sub       $0x10001,%%eax                  \n"
-    "sbb       $0x0,%%edx                      \n"
-    "sub       $0x1,%1                         \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld        $0x10,%%eax,%%edx             \n"
+      "shl         $0x10,%%eax                   \n"
+      "sub         $0x10001,%%eax                \n"
+      "sbb         $0x0,%%edx                    \n"
+      "sub         $0x1,%1                       \n"
+      "idiv        %1                            \n"
+      "mov         %0, %%eax                     \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
 
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
+                                      6u,   14u,  0x80, 0x80, 0x80, 0x80,
+                                      0x80, 0x80, 0x80, 0x80};
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5, %%xmm5                \n"  // zero
+      "movdqa      %4,%%xmm1                     \n"  // split shuffler
+      "movdqa      %5,%%xmm3                     \n"  // merge shuffler
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
+      "lea         0x10(%0),%0                   \n"
+      "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
+      "pshufb      %%xmm1,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
+      "psrlw       $0x1,%%xmm0                   \n"  // round
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"  // 4 UV
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kShuffleSplitUV),         // %4
+        "m"(kShuffleMergeUV)          // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
+      "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
+      "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
+      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
+      "lea         0x20(%0),%0                   \n"
+      "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
+      "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
+      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
+      "vmovdqu     %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"  // 8 UV
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kShuffleSplitUV),         // %4
+        "m"(kShuffleMergeUV)          // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/scale_neon.cc b/media/libaom/src/third_party/libyuv/source/scale_neon.cc
index 7825878e98..572b4bfa9b 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_neon.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_neon.cc
@@ -23,585 +23,541 @@ extern "C" {
 // Provided by Fritz Koenig
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"              // Clobber List
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into q0, odd into q1
+      "vld2.8      {q0, q1}, [%0]!               \n"
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop
+      "vst1.8      {q1}, [%1]!                   \n"  // store odd pixels
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
   );
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"     // Clobber List
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8      {q0, q1}, [%0]!               \n"  // load 32 pixels
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop
+      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
+      "vst1.8      {q0}, [%1]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
   );
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
-    "vpadal.u8  q1, q3                         \n"
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "q0", "q1", "q2", "q3"     // Clobber List
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %0                        \n"
+      "1:                                        \n"
+      "vld1.8      {q0, q1}, [%0]!               \n"  // load row 1 and post inc
+      "vld1.8      {q2, q3}, [%1]!               \n"  // load row 2 and post inc
+      "subs        %3, %3, #16                   \n"  // 16 processed per loop
+      "vpaddl.u8   q0, q0                        \n"  // row 1 add adjacent
+      "vpaddl.u8   q1, q1                        \n"
+      "vpadal.u8   q0, q2                        \n"  // row 2 add adjacent +
+                                                      // row1
+      "vpadal.u8   q1, q3                        \n"
+      "vrshrn.u16  d0, q0, #2                    \n"  // downshift, round and
+                                                      // pack
+      "vrshrn.u16  d1, q1, #2                    \n"
+      "vst1.8      {q0}, [%2]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "q0", "q1", "q2", "q3"  // Clobber List
   );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
-    "subs       %2, %2, #8                     \n" // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d2}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop
+      "vst1.8      {d2}, [%1]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1", "memory", "cc");
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    MEMACCESS(3)
-    "vld1.8     {q1}, [%3]!                    \n"
-    MEMACCESS(4)
-    "vld1.8     {q2}, [%4]!                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q3}, [%5]!                    \n"
-    "subs       %2, %2, #4                     \n"
-    "vpaddl.u8  q0, q0                         \n"
-    "vpadal.u8  q0, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"
-    "vpadal.u8  q0, q3                         \n"
-    "vpaddl.u16 q0, q0                         \n"
-    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
-    "vmovn.u16  d0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.32    {d0[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width), // %2
-    "+r"(src_ptr1),  // %3
-    "+r"(src_ptr2),  // %4
-    "+r"(src_ptr3)   // %5
-  :
-  : "q0", "q1", "q2", "q3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8      {q0}, [%0]!                   \n"  // load up 16x4
+      "vld1.8      {q1}, [%3]!                   \n"
+      "vld1.8      {q2}, [%4]!                   \n"
+      "vld1.8      {q3}, [%5]!                   \n"
+      "subs        %2, %2, #4                    \n"
+      "vpaddl.u8   q0, q0                        \n"
+      "vpadal.u8   q0, q1                        \n"
+      "vpadal.u8   q0, q2                        \n"
+      "vpadal.u8   q0, q3                        \n"
+      "vpaddl.u16  q0, q0                        \n"
+      "vrshrn.u32  d0, q0, #4                    \n"  // divide by 16 w/rounding
+      "vmovn.u16   d0, q0                        \n"
+      "vst1.32     {d0[0]}, [%1]!                \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_ptr1),   // %3
+        "+r"(src_ptr2),   // %4
+        "+r"(src_ptr3)    // %5
+      :
+      : "q0", "q1", "q2", "q3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "subs       %2, %2, #24                  \n"
-    "vmov       d2, d3                       \n" // order d0, d1, d2
-    MEMACCESS(1)
-    "vst3.8     {d0, d1, d2}, [%1]!          \n"
-    "bgt        1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "d0", "d1", "d2", "d3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
+      "subs        %2, %2, #24                   \n"
+      "vmov        d2, d3                        \n"  // order d0, d1, d2
+      "vst3.8      {d0, d1, d2}, [%1]!           \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "d0", "d1", "d2", "d3", "memory", "cc");
 }
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "vmovl.u8     q8, d4                       \n"
-    "vmovl.u8     q9, d5                       \n"
-    "vmovl.u8     q10, d6                      \n"
-    "vmovl.u8     q11, d7                      \n"
-
-    // 3 * line_0 + line_1
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vmlal.u8     q9, d1, d24                  \n"
-    "vmlal.u8     q10, d2, d24                 \n"
-    "vmlal.u8     q11, d3, d24                 \n"
-
-    // (3 * line_0 + line_1) >> 2
-    "vqrshrn.u16  d0, q8, #2                   \n"
-    "vqrshrn.u16  d1, q9, #2                   \n"
-    "vqrshrn.u16  d2, q10, #2                  \n"
-    "vqrshrn.u16  d3, q11, #2                  \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q8, d1                       \n"
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vqrshrn.u16  d0, q8, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q8, d2                       \n"
-    "vmlal.u8     q8, d3, d24                  \n"
-    "vqrshrn.u16  d2, q8, #2                   \n"
-
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-  );
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8     d24, #3                       \n"
+      "add         %3, %0                        \n"
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
+      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
+      "subs        %2, %2, #24                   \n"
+
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "vmovl.u8    q8, d4                        \n"
+      "vmovl.u8    q9, d5                        \n"
+      "vmovl.u8    q10, d6                       \n"
+      "vmovl.u8    q11, d7                       \n"
+
+      // 3 * line_0 + line_1
+      "vmlal.u8    q8, d0, d24                   \n"
+      "vmlal.u8    q9, d1, d24                   \n"
+      "vmlal.u8    q10, d2, d24                  \n"
+      "vmlal.u8    q11, d3, d24                  \n"
+
+      // (3 * line_0 + line_1) >> 2
+      "vqrshrn.u16 d0, q8, #2                    \n"
+      "vqrshrn.u16 d1, q9, #2                    \n"
+      "vqrshrn.u16 d2, q10, #2                   \n"
+      "vqrshrn.u16 d3, q11, #2                   \n"
+
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8    q8, d1                        \n"
+      "vmlal.u8    q8, d0, d24                   \n"
+      "vqrshrn.u16 d0, q8, #2                    \n"
+
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8   d1, d1, d2                    \n"
+
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8    q8, d2                        \n"
+      "vmlal.u8    q8, d3, d24                   \n"
+      "vqrshrn.u16 d2, q8, #2                    \n"
+
+      "vst3.8      {d0, d1, d2}, [%1]!           \n"
+
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+        "cc");
 }
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-    // average src line 0 with src line 1
-    "vrhadd.u8    q0, q0, q2                   \n"
-    "vrhadd.u8    q1, q1, q3                   \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q3, d1                       \n"
-    "vmlal.u8     q3, d0, d24                  \n"
-    "vqrshrn.u16  d0, q3, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q3, d2                       \n"
-    "vmlal.u8     q3, d3, d24                  \n"
-    "vqrshrn.u16  d2, q3, #2                   \n"
-
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-  );
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8     d24, #3                       \n"
+      "add         %3, %0                        \n"
+      "1:                                        \n"
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
+      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
+      "subs        %2, %2, #24                   \n"
+      // average src line 0 with src line 1
+      "vrhadd.u8   q0, q0, q2                    \n"
+      "vrhadd.u8   q1, q1, q3                    \n"
+
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8    q3, d1                        \n"
+      "vmlal.u8    q3, d0, d24                   \n"
+      "vqrshrn.u16 d0, q3, #2                    \n"
+
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8   d1, d1, d2                    \n"
+
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8    q3, d2                        \n"
+      "vmlal.u8    q3, d3, d24                   \n"
+      "vqrshrn.u16 d2, q3, #2                    \n"
+
+      "vst3.8      {d0, d1, d2}, [%1]!           \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
 }
 
 #define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
+                                18, 6, 14, 19, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q3}, [%3]                     \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
-    "subs       %2, %2, #12                    \n"
-    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
-    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"
-    MEMACCESS(1)
-    "vst1.32    {d5[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vld1.8      {q3}, [%3]                    \n"
+      "1:                                        \n"
+      "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"
+      "subs        %2, %2, #12                   \n"
+      "vtbl.u8     d4, {d0, d1, d2, d3}, d6      \n"
+      "vtbl.u8     d5, {d0, d1, d2, d3}, d7      \n"
+      "vst1.8      {d4}, [%1]!                   \n"
+      "vst1.32     {d5[0]}, [%1]!                \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
-
-  asm volatile (
-    MEMACCESS(5)
-    "vld1.16    {q13}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {q14}, [%6]                    \n"
-    MEMACCESS(7)
-    "vld1.8     {q15}, [%7]                    \n"
-    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    MEMACCESS(4)
-    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
-    "subs         %2, %2, #12                  \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-    "vtrn.u8      d16, d17                     \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-    "vtrn.u8      d18, d19                     \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-    "vpaddl.u8    q8, q8                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-    "vpaddl.u8    d19, d19                     \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     q0, q8                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-    "vadd.u16     d4, d19                      \n"
-
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "vqrdmulh.s16 q2, q2, q13                  \n"
-    "vmovn.u16    d4, q2                       \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-    "vmovl.u8     q9, d18                      \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-    "vadd.u16     q1, q9                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q15                  \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_ptr1)          // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
-  );
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+
+  asm volatile(
+      "vld1.16     {q13}, [%5]                   \n"
+      "vld1.8      {q14}, [%6]                   \n"
+      "vld1.8      {q15}, [%7]                   \n"
+      "add         %3, %0                        \n"
+      "1:                                        \n"
+
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
+      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
+      "vld4.8      {d16, d17, d18, d19}, [%4]!   \n"
+      "subs        %2, %2, #12                   \n"
+
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8     d0, d1                        \n"
+      "vtrn.u8     d4, d5                        \n"
+      "vtrn.u8     d16, d17                      \n"
+
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8     d2, d3                        \n"
+      "vtrn.u8     d6, d7                        \n"
+      "vtrn.u8     d18, d19                      \n"
+
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8   q0, q0                        \n"
+      "vpaddl.u8   q2, q2                        \n"
+      "vpaddl.u8   q8, q8                        \n"
+
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8   d3, d3                        \n"
+      "vpaddl.u8   d7, d7                        \n"
+      "vpaddl.u8   d19, d19                      \n"
+
+      // combine source lines
+      "vadd.u16    q0, q2                        \n"
+      "vadd.u16    q0, q8                        \n"
+      "vadd.u16    d4, d3, d7                    \n"
+      "vadd.u16    d4, d19                       \n"
+
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "vqrdmulh.s16 q2, q2, q13                  \n"
+      "vmovn.u16   d4, q2                        \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8    q1, d2                        \n"
+      "vmovl.u8    q3, d6                        \n"
+      "vmovl.u8    q9, d18                       \n"
+
+      // combine source lines
+      "vadd.u16    q1, q3                        \n"
+      "vadd.u16    q1, q9                        \n"
+
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32    d2, d3                        \n"
+
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16    d2, d3                        \n"
+
+      // 0+1+2, 3+4+5
+      "vadd.u16    q0, q1                        \n"
+
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q15                  \n"
+
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8     d2, d4                        \n"
+
+      "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
+      "vtbl.u8     d4, {d0, d1, d2}, d29         \n"
+
+      "vst1.8      {d3}, [%1]!                   \n"
+      "vst1.32     {d4[0]}, [%1]!                \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride),    // %3
+        "+r"(src_ptr1)       // %4
+      : "r"(&kMult38_Div6),  // %5
+        "r"(&kShuf38_2),     // %6
+        "r"(&kMult38_Div9)   // %7
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+        "cc");
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(4)
-    "vld1.16    {q13}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q14}, [%5]                    \n"
-    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "subs         %2, %2, #12                  \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "vqrshrn.u16  d4, q2, #2                   \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q13                  \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(dst_ptr),       // %1
-    "+r"(dst_width),     // %2
-    "+r"(src_stride)     // %3
-  : "r"(&kMult38_Div6),  // %4
-    "r"(&kShuf38_2)      // %5
-  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-  );
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vld1.16     {q13}, [%4]                   \n"
+      "vld1.8      {q14}, [%5]                   \n"
+      "add         %3, %0                        \n"
+      "1:                                        \n"
+
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
+      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
+      "subs        %2, %2, #12                   \n"
+
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8     d0, d1                        \n"
+      "vtrn.u8     d4, d5                        \n"
+
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8     d2, d3                        \n"
+      "vtrn.u8     d6, d7                        \n"
+
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8   q0, q0                        \n"
+      "vpaddl.u8   q2, q2                        \n"
+
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8   d3, d3                        \n"
+      "vpaddl.u8   d7, d7                        \n"
+
+      // combine source lines
+      "vadd.u16    q0, q2                        \n"
+      "vadd.u16    d4, d3, d7                    \n"
+
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "vqrshrn.u16 d4, q2, #2                    \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8    q1, d2                        \n"
+      "vmovl.u8    q3, d6                        \n"
+
+      // combine source lines
+      "vadd.u16    q1, q3                        \n"
+
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32    d2, d3                        \n"
+
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16    d2, d3                        \n"
+
+      // 0+1+2, 3+4+5
+      "vadd.u16    q0, q1                        \n"
+
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q13                  \n"
+
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8     d2, d4                        \n"
+
+      "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
+      "vtbl.u8     d4, {d0, d1, d2}, d29         \n"
+
+      "vst1.8      {d3}, [%1]!                   \n"
+      "vst1.32     {d4[0]}, [%1]!                \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride)     // %3
+      : "r"(&kMult38_Div6),  // %4
+        "r"(&kShuf38_2)      // %5
+      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       r12, %5                         \n"
-    "veor      q2, q2, q2                      \n"
-    "veor      q3, q3, q3                      \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], %3                 \n"
-    "vaddw.u8   q3, q3, d1                     \n"
-    "vaddw.u8   q2, q2, d0                     \n"
-    "subs       r12, r12, #1                   \n"
-    "bgt        2b                             \n"
-    MEMACCESS(2)
-    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
-    "add        %1, %1, #16                    \n"
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(src_tmp),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_ptr),          // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_width),        // %4
-    "+r"(src_height)        // %5
-  :
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+// Add a row of bytes to a row of shorts.  Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.16     {q1, q2}, [%1]                \n"  // load accumulator
+      "vld1.8      {q0}, [%0]!                   \n"  // load 16 bytes
+      "vaddw.u8    q2, q2, d1                    \n"  // add
+      "vaddw.u8    q1, q1, d0                    \n"
+      "vst1.16     {q1, q2}, [%1]!               \n"  // store accumulator
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2"  // Clobber List
   );
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                     \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld2.8     {d6[" #n "], d7[" #n "]}, [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
+  const uint8_t* src_tmp = src_ptr;
   asm volatile (
-    ".p2align   2                              \n"
-    "vdup.32    q0, %3                         \n"  // x
-    "vdup.32    q1, %4                         \n"  // dx
-    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
-    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
-    "vmul.s32   q1, q1, q2                     \n"
+      "vdup.32     q0, %3                        \n"  // x
+      "vdup.32     q1, %4                        \n"  // dx
+      "vld1.32     {q2}, [%5]                    \n"  // 0 1 2 3
+      "vshl.i32    q3, q1, #2                    \n"  // 4 * dx
+      "vmul.s32    q1, q1, q2                    \n"
     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "vadd.s32   q1, q1, q0                     \n"
+      "vadd.s32    q1, q1, q0                    \n"
     // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
-    "vadd.s32   q2, q1, q3                     \n"
-    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
-  "1:                                          \n"
+      "vadd.s32    q2, q1, q3                    \n"
+      "vshl.i32    q0, q3, #1                    \n"  // 8 * dx
+      "1:                                        \n"
     LOAD2_DATA8_LANE(0)
     LOAD2_DATA8_LANE(1)
     LOAD2_DATA8_LANE(2)
@@ -610,28 +566,27 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     LOAD2_DATA8_LANE(5)
     LOAD2_DATA8_LANE(6)
     LOAD2_DATA8_LANE(7)
-    "vmov       q10, q1                        \n"
-    "vmov       q11, q2                        \n"
-    "vuzp.16    q10, q11                       \n"
-    "vmovl.u8   q8, d6                         \n"
-    "vmovl.u8   q9, d7                         \n"
-    "vsubl.s16  q11, d18, d16                  \n"
-    "vsubl.s16  q12, d19, d17                  \n"
-    "vmovl.u16  q13, d20                       \n"
-    "vmovl.u16  q10, d21                       \n"
-    "vmul.s32   q11, q11, q13                  \n"
-    "vmul.s32   q12, q12, q10                  \n"
-    "vshrn.s32  d18, q11, #16                  \n"
-    "vshrn.s32  d19, q12, #16                  \n"
-    "vadd.s16   q8, q8, q9                     \n"
-    "vmovn.s16  d6, q8                         \n"
-
-    MEMACCESS(0)
-    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
-    "vadd.s32   q1, q1, q0                     \n"
-    "vadd.s32   q2, q2, q0                     \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
+      "vmov        q10, q1                       \n"
+      "vmov        q11, q2                       \n"
+      "vuzp.16     q10, q11                      \n"
+      "vmovl.u8    q8, d6                        \n"
+      "vmovl.u8    q9, d7                        \n"
+      "vsubl.s16   q11, d18, d16                 \n"
+      "vsubl.s16   q12, d19, d17                 \n"
+      "vmovl.u16   q13, d20                      \n"
+      "vmovl.u16   q10, d21                      \n"
+      "vmul.s32    q11, q11, q13                 \n"
+      "vmul.s32    q12, q12, q10                 \n"
+      "vrshrn.s32  d18, q11, #16                 \n"
+      "vrshrn.s32  d19, q12, #16                 \n"
+      "vadd.s16    q8, q8, q9                    \n"
+      "vmovn.s16   d6, q8                        \n"
+
+      "vst1.8      {d6}, [%0]!                   \n"  // store pixels
+      "vadd.s32    q1, q1, q0                    \n"
+      "vadd.s32    q2, q2, q0                    \n"
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop
+      "bgt         1b                            \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
     "+r"(dst_width),        // %2
@@ -648,343 +603,310 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-  asm volatile (
-    "cmp          %4, #0                       \n"
-    "beq          100f                         \n"
-    "add          %2, %1                       \n"
-    "cmp          %4, #64                      \n"
-    "beq          75f                          \n"
-    "cmp          %4, #128                     \n"
-    "beq          50f                          \n"
-    "cmp          %4, #192                     \n"
-    "beq          25f                          \n"
-
-    "vdup.8       d5, %4                       \n"
-    "rsb          %4, #256                     \n"
-    "vdup.8       d4, %4                       \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vmull.u8     q13, d0, d4                  \n"
-    "vmull.u8     q14, d1, d4                  \n"
-    "vmlal.u8     q13, d2, d5                  \n"
-    "vmlal.u8     q14, d3, d5                  \n"
-    "vrshrn.u16   d0, q13, #8                  \n"
-    "vrshrn.u16   d1, q14, #8                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          1b                           \n"
-    "b            99f                          \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          25b                          \n"
-    "b            99f                          \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          50b                          \n"
-    "b            99f                          \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q1}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q0}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          75b                          \n"
-    "b            99f                          \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          100b                         \n"
-
-  "99:                                         \n"
-    MEMACCESS(0)
-    "vst1.8       {d1[7]}, [%0]                \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
-  :
-  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
-  );
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  asm volatile(
+      "cmp         %4, #0                        \n"
+      "beq         100f                          \n"
+      "add         %2, %1                        \n"
+      "cmp         %4, #64                       \n"
+      "beq         75f                           \n"
+      "cmp         %4, #128                      \n"
+      "beq         50f                           \n"
+      "cmp         %4, #192                      \n"
+      "beq         25f                           \n"
+
+      "vdup.8      d5, %4                        \n"
+      "rsb         %4, #256                      \n"
+      "vdup.8      d4, %4                        \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "vld1.8      {q1}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vmull.u8    q13, d0, d4                   \n"
+      "vmull.u8    q14, d1, d4                   \n"
+      "vmlal.u8    q13, d2, d5                   \n"
+      "vmlal.u8    q14, d3, d5                   \n"
+      "vrshrn.u16  d0, q13, #8                   \n"
+      "vrshrn.u16  d1, q14, #8                   \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         1b                            \n"
+      "b           99f                           \n"
+
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "vld1.8      {q1}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         25b                           \n"
+      "b           99f                           \n"
+
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "vld1.8      {q1}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         50b                           \n"
+      "b           99f                           \n"
+
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "vld1.8      {q1}, [%1]!                   \n"
+      "vld1.8      {q0}, [%2]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vrhadd.u8   q0, q1                        \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         75b                           \n"
+      "b           99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8      {q0}, [%1]!                   \n"
+      "subs        %3, %3, #16                   \n"
+      "vst1.8      {q0}, [%0]!                   \n"
+      "bgt         100b                          \n"
+
+      "99:                                       \n"
+      "vst1.8      {d1[7]}, [%0]                 \n"
+      : "+r"(dst_ptr),           // %0
+        "+r"(src_ptr),           // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst_width),         // %3
+        "+r"(source_y_fraction)  // %4
+      :
+      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.32    {q0, q1}, [%0]!                \n"
-    MEMACCESS(0)
-    "vld2.32    {q2, q3}, [%0]!                \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop
+      "vmov        q2, q1                        \n"  // load next 8 ARGB
+      "vst2.32     {q2, q3}, [%1]!               \n"  // store odd pixels
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
   );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    "vrshrn.u16 d2, q2, #1                     \n"
-    "vrshrn.u16 d3, q3, #1                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
-    "bgt       1b                              \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+//  46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
+//  4a:  3e04        subs  r6, #4
+//  4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
+//  50:  ef64 21f4   vorr  q9, q10, q10
+//  54:  f942 038d   vst2.32  {d16-d19}, [r2]!
+//  58:  d1f5        bne.n  46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop
+      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
+      "vrhadd.u8   q1, q2, q3                    \n"  // rounding half add
+      "vst2.32     {q0, q1}, [%1]!               \n"
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
   );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
-    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    "vrshrn.u16 d2, q2, #2                     \n"
-    "vrshrn.u16 d3, q3, #2                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
+      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
+      "vld4.8      {d16, d18, d20, d22}, [%1]!   \n"  // load 8 more ARGB
+      "vld4.8      {d17, d19, d21, d23}, [%1]!   \n"  // load last 8 ARGB
+      "vpadal.u8   q0, q8                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q9                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q10                       \n"  // R 16 bytes -> 8 shorts.
+      "vpadal.u8   q3, q11                       \n"  // A 16 bytes -> 8 shorts.
+      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
+      "vrshrn.u16  d1, q1, #2                    \n"
+      "vrshrn.u16  d2, q2, #2                    \n"
+      "vrshrn.u16  d3, q3, #2                    \n"
+      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %3, lsl #2                \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[1]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[1]}, [%0], r12             \n"
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"(src_stepx)     // %3
-  : "memory", "cc", "r12", "q0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "mov         r12, %3, lsl #2               \n"
+      "1:                                        \n"
+      "vld1.32     {d0[0]}, [%0], r12            \n"
+      "vld1.32     {d0[1]}, [%0], r12            \n"
+      "vld1.32     {d1[0]}, [%0], r12            \n"
+      "vld1.32     {d1[1]}, [%0], r12            \n"
+      "subs        %2, %2, #4                    \n"  // 4 pixels per loop.
+      "vst1.8      {q0}, [%1]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      : "r"(src_stepx)   // %3
+      : "memory", "cc", "r12", "q0");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %4, lsl #2                \n"
-    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d4}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d5}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d6}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d7}, [%1], r12                \n"
-    "vaddl.u8   q0, d0, d1                     \n"
-    "vaddl.u8   q1, d2, d3                     \n"
-    "vaddl.u8   q2, d4, d5                     \n"
-    "vaddl.u8   q3, d6, d7                     \n"
-    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
-    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
-    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
-    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
-    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
-    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
-    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"(src_stepx)     // %4
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "mov         r12, %4, lsl #2               \n"
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "vld1.8      {d0}, [%0], r12               \n"  // 4 2x2 blocks -> 2x1
+      "vld1.8      {d1}, [%1], r12               \n"
+      "vld1.8      {d2}, [%0], r12               \n"
+      "vld1.8      {d3}, [%1], r12               \n"
+      "vld1.8      {d4}, [%0], r12               \n"
+      "vld1.8      {d5}, [%1], r12               \n"
+      "vld1.8      {d6}, [%0], r12               \n"
+      "vld1.8      {d7}, [%1], r12               \n"
+      "vaddl.u8    q0, d0, d1                    \n"
+      "vaddl.u8    q1, d2, d3                    \n"
+      "vaddl.u8    q2, d4, d5                    \n"
+      "vaddl.u8    q3, d6, d7                    \n"
+      "vswp.8      d1, d2                        \n"  // ab_cd -> ac_bd
+      "vswp.8      d5, d6                        \n"  // ef_gh -> eg_fh
+      "vadd.u16    q0, q0, q1                    \n"  // (a+b)_(c+d)
+      "vadd.u16    q2, q2, q3                    \n"  // (e+f)_(g+h)
+      "vrshrn.u16  d0, q0, #2                    \n"  // first 2 pixels.
+      "vrshrn.u16  d1, q2, #2                    \n"  // next 2 pixels.
+      "subs        %3, %3, #4                    \n"  // 4 pixels per loop.
+      "vst1.8      {q0}, [%2]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src_argb),    // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(dst_width)    // %3
+      : "r"(src_stepx)     // %4
+      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  int tmp = 0;
-  const uint8* src_tmp = src_argb;
-  asm volatile (
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(d0, 0)
-    LOAD1_DATA32_LANE(d0, 1)
-    LOAD1_DATA32_LANE(d1, 0)
-    LOAD1_DATA32_LANE(d1, 1)
-    LOAD1_DATA32_LANE(d2, 0)
-    LOAD1_DATA32_LANE(d2, 1)
-    LOAD1_DATA32_LANE(d3, 0)
-    LOAD1_DATA32_LANE(d3, 1)
-
-    MEMACCESS(0)
-    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width),        // %2
-    "+r"(x),                // %3
-    "+r"(dx),               // %4
-    "+r"(tmp),              // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "q0", "q1"
-  );
+#define LOAD1_DATA32_LANE(dn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  int tmp;
+  const uint8_t* src_tmp = src_argb;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(d0, 0)
+      LOAD1_DATA32_LANE(d0, 1)
+      LOAD1_DATA32_LANE(d1, 0)
+      LOAD1_DATA32_LANE(d1, 1)
+      LOAD1_DATA32_LANE(d2, 0)
+      LOAD1_DATA32_LANE(d2, 1)
+      LOAD1_DATA32_LANE(d3, 0)
+      LOAD1_DATA32_LANE(d3, 1)
+      // clang-format on
+      "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+      "subs        %2, %2, #8                    \n"  // 8 processed per loop
+      "bgt         1b                            \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x),          // %3
+        "+r"(dx),         // %4
+        "=&r"(tmp),       // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "q0", "q1");
 }
 
 #undef LOAD1_DATA32_LANE
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
+  "lsr        %5, %3, #16                                \n" \
+  "add        %6, %1, %5, lsl #2                         \n" \
+  "add        %3, %3, %4                                 \n" \
+  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
+  const uint8_t* src_tmp = src_argb;
   asm volatile (
-    ".p2align   2                              \n"
-    "vdup.32    q0, %3                         \n"  // x
-    "vdup.32    q1, %4                         \n"  // dx
-    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
-    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
-    "vmul.s32   q1, q1, q2                     \n"
-    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
-    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
+      "vdup.32     q0, %3                        \n"  // x
+      "vdup.32     q1, %4                        \n"  // dx
+      "vld1.32     {q2}, [%5]                    \n"  // 0 1 2 3
+      "vshl.i32    q9, q1, #2                    \n"  // 4 * dx
+      "vmul.s32    q1, q1, q2                    \n"
+      "vmov.i8     q3, #0x7f                     \n"  // 0x7F
+      "vmov.i16    q15, #0x7f                    \n"  // 0x7F
     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "vadd.s32   q8, q1, q0                     \n"
-  "1:                                          \n"
+      "vadd.s32    q8, q1, q0                    \n"
+      "1:                                        \n"
     // d0, d1: a
     // d2, d3: b
     LOAD2_DATA32_LANE(d0, d2, 0)
@@ -1009,7 +931,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "vshrn.i16   d0, q11, #7                   \n"
     "vshrn.i16   d1, q12, #7                   \n"
 
-    MEMACCESS(0)
     "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
     "vadd.s32    q8, q8, q9                    \n"
     "subs        %2, %2, #4                    \n"  // 4 processed per loop
@@ -1029,6 +950,64 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
 
 #undef LOAD2_DATA32_LANE
 
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "vld2.8      {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
+      "vld2.8      {d1, d3}, [%0]!               \n"  // load next 8 UV
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vpaddl.u8   q0, q0                        \n"  // U 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // V 16 bytes -> 8 shorts.
+      "vld2.8      {d16, d18}, [%1]!             \n"  // load 8 more UV
+      "vld2.8      {d17, d19}, [%1]!             \n"  // load last 8 UV
+      "vpadal.u8   q0, q8                        \n"  // U 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q9                        \n"  // V 16 bytes -> 8 shorts.
+      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
+      "vrshrn.u16  d1, q1, #2                    \n"
+      "vst2.8      {d0, d1}, [%2]!               \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,  // pixel step
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.16    {d0[0]}, [%0], %6              \n"
+      "vld1.16    {d0[1]}, [%1], %6              \n"
+      "vld1.16    {d0[2]}, [%2], %6              \n"
+      "vld1.16    {d0[3]}, [%3], %6              \n"
+      "subs       %5, %5, #4                     \n"  // 4 pixels per loop.
+      "vst1.8     {d0}, [%4]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),                 // %0
+        "+r"(src1_ptr),                // %1
+        "+r"(src2_ptr),                // %2
+        "+r"(src3_ptr),                // %3
+        "+r"(dst_ptr),                 // %4
+        "+r"(dst_width)                // %5
+      : "r"(src_stepx * 8)             // %6
+      : "memory", "cc", "d0");
+}
+
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/scale_neon64.cc b/media/libaom/src/third_party/libyuv/source/scale_neon64.cc
index 1d55193579..185591cb55 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_neon64.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/scale.h"
 #include "libyuv/row.h"
+#include "libyuv/scale.h"
 #include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
@@ -21,592 +21,577 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into v0, odd into v1
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"              // Clobber List
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v1.16b}, [%1], #16           \n"  // store odd pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
   );
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #1              \n"
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"     // Clobber List
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v0.16b}, [%1], #16           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
   );
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
-    "uadalp     v1.8h, v3.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #2              \n"
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "v0", "v1", "v2", "v3"     // Clobber List
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "ld1         {v0.16b, v1.16b}, [%0], #32   \n"  // load row 1 and post inc
+      "ld1         {v2.16b, v3.16b}, [%1], #32   \n"  // load row 2 and post inc
+      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
+      "uaddlp      v0.8h, v0.16b                 \n"  // row 1 add adjacent
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uaddlp      v1.8h, v1.16b                 \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v0.8h, v2.16b                 \n"  // += row 2 add adjacent
+      "uadalp      v1.8h, v3.16b                 \n"
+      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
+      "rshrn2      v0.16b, v1.8h, #2             \n"
+      "st1         {v0.16b}, [%2], #16           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "st1     {v2.8b}, [%1], #8                 \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v2.8b}, [%1], #8             \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
-    MEMACCESS(3)
-    "ld1     {v1.16b}, [%2], #16               \n"
-    MEMACCESS(4)
-    "ld1     {v2.16b}, [%3], #16               \n"
-    MEMACCESS(5)
-    "ld1     {v3.16b}, [%4], #16               \n"
-    "subs    %w5, %w5, #4                      \n"
-    "uaddlp  v0.8h, v0.16b                     \n"
-    "uadalp  v0.8h, v1.16b                     \n"
-    "uadalp  v0.8h, v2.16b                     \n"
-    "uadalp  v0.8h, v3.16b                     \n"
-    "addp    v0.8h, v0.8h, v0.8h               \n"
-    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
-    MEMACCESS(1)
-    "st1    {v0.s}[0], [%1], #4                \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(src_ptr1),  // %2
-    "+r"(src_ptr2),  // %3
-    "+r"(src_ptr3),  // %4
-    "+r"(dst_width)  // %5
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load up 16x4
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "ld1         {v2.16b}, [%3], #16           \n"
+      "ld1         {v3.16b}, [%4], #16           \n"
+      "subs        %w5, %w5, #4                  \n"
+      "uaddlp      v0.8h, v0.16b                 \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uadalp      v0.8h, v1.16b                 \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "uadalp      v0.8h, v2.16b                 \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+      "uadalp      v0.8h, v3.16b                 \n"
+      "prfm        pldl1keep, [%4, 448]          \n"
+      "addp        v0.8h, v0.8h, v0.8h           \n"
+      "rshrn       v0.8b, v0.8h, #4              \n"  // divide by 16 w/rounding
+      "st1         {v0.s}[0], [%1], #4           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_ptr1),  // %2
+        "+r"(src_ptr2),  // %3
+        "+r"(src_ptr3),  // %4
+        "+r"(dst_width)  // %5
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    "subs      %w2, %w2, #24                           \n"
-    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
+      "subs        %w2, %w2, #24                 \n"
+      "orr         v2.16b, v3.16b, v3.16b        \n"  // order v0,v1,v2
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "ushll     v16.8h, v4.8b, #0                       \n"
-    "ushll     v17.8h, v5.8b, #0                       \n"
-    "ushll     v18.8h, v6.8b, #0                       \n"
-    "ushll     v19.8h, v7.8b, #0                       \n"
-
-    // 3 * line_0 + line_1
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "umlal     v17.8h, v1.8b, v20.8b                   \n"
-    "umlal     v18.8h, v2.8b, v20.8b                   \n"
-    "umlal     v19.8h, v3.8b, v20.8b                   \n"
-
-    // (3 * line_0 + line_1) >> 2
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-    "uqrshrn   v1.8b, v17.8h, #2                       \n"
-    "uqrshrn   v2.8b, v18.8h, #2                       \n"
-    "uqrshrn   v3.8b, v19.8h, #2                       \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v16.8h, v1.8b, #0                       \n"
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v16.8h, v2.8b, #0                       \n"
-    "umlal     v16.8h, v3.8b, v20.8b                   \n"
-    "uqrshrn   v2.8b, v16.8h, #2                       \n"
-
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
-    "v20", "memory", "cc"
-  );
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi        v20.8b, #3                    \n"
+      "add         %3, %3, %0                    \n"
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
+      "subs        %w2, %w2, #24                 \n"
+
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "ushll       v16.8h, v4.8b, #0             \n"
+      "ushll       v17.8h, v5.8b, #0             \n"
+      "ushll       v18.8h, v6.8b, #0             \n"
+      "ushll       v19.8h, v7.8b, #0             \n"
+
+      // 3 * line_0 + line_1
+      "umlal       v16.8h, v0.8b, v20.8b         \n"
+      "umlal       v17.8h, v1.8b, v20.8b         \n"
+      "umlal       v18.8h, v2.8b, v20.8b         \n"
+      "umlal       v19.8h, v3.8b, v20.8b         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      // (3 * line_0 + line_1) >> 2
+      "uqrshrn     v0.8b, v16.8h, #2             \n"
+      "uqrshrn     v1.8b, v17.8h, #2             \n"
+      "uqrshrn     v2.8b, v18.8h, #2             \n"
+      "uqrshrn     v3.8b, v19.8h, #2             \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "ushll       v16.8h, v1.8b, #0             \n"
+      "umlal       v16.8h, v0.8b, v20.8b         \n"
+      "uqrshrn     v0.8b, v16.8h, #2             \n"
+
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "urhadd      v1.8b, v1.8b, v2.8b           \n"
+
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "ushll       v16.8h, v2.8b, #0             \n"
+      "umlal       v16.8h, v3.8b, v20.8b         \n"
+      "uqrshrn     v2.8b, v16.8h, #2             \n"
+
+      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "memory", "cc");
 }
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-    // average src line 0 with src line 1
-    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
-    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
-    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
-    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v4.8h, v1.8b, #0                        \n"
-    "umlal     v4.8h, v0.8b, v20.8b                    \n"
-    "uqrshrn   v0.8b, v4.8h, #2                        \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v4.8h, v2.8b, #0                        \n"
-    "umlal     v4.8h, v3.8b, v20.8b                    \n"
-    "uqrshrn   v2.8b, v4.8h, #2                        \n"
-
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
-  );
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi        v20.8b, #3                    \n"
+      "add         %3, %3, %0                    \n"
+      "1:                                        \n"
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
+      "subs        %w2, %w2, #24                 \n"
+      // average src line 0 with src line 1
+      "urhadd      v0.8b, v0.8b, v4.8b           \n"
+      "urhadd      v1.8b, v1.8b, v5.8b           \n"
+      "urhadd      v2.8b, v2.8b, v6.8b           \n"
+      "urhadd      v3.8b, v3.8b, v7.8b           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "ushll       v4.8h, v1.8b, #0              \n"
+      "umlal       v4.8h, v0.8b, v20.8b          \n"
+      "uqrshrn     v0.8b, v4.8h, #2              \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "urhadd      v1.8b, v1.8b, v2.8b           \n"
+
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "ushll       v4.8h, v2.8b, #0              \n"
+      "umlal       v4.8h, v3.8b, v20.8b          \n"
+      "uqrshrn     v2.8b, v4.8h, #2              \n"
+
+      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
 }
 
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
+                                34, 6,  22, 35, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1       {v3.16b}, [%3]                          \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
-    "subs      %w2, %w2, #12                           \n"
-    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
-    MEMACCESS(1)
-    "st1       {v2.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v2.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "ld1         {v3.16b}, [%3]                \n"
+      "1:                                        \n"
+      "ld1         {v0.16b,v1.16b}, [%0], #32    \n"
+      "subs        %w2, %w2, #12                 \n"
+      "tbl         v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v2.8b}, [%1], #8             \n"
+      "st1         {v2.s}[2], [%1], #4           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
-  asm volatile (
-    MEMACCESS(5)
-    "ld1       {v29.8h}, [%5]                          \n"
-    MEMACCESS(6)
-    "ld1       {v30.16b}, [%6]                         \n"
-    MEMACCESS(7)
-    "ld1       {v31.8h}, [%7]                          \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
-
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
-    "subs      %w4, %w4, #12                           \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v20.8b, v0.8b, v1.8b                    \n"
-    "trn2      v21.8b, v0.8b, v1.8b                    \n"
-    "trn1      v22.8b, v4.8b, v5.8b                    \n"
-    "trn2      v23.8b, v4.8b, v5.8b                    \n"
-    "trn1      v24.8b, v16.8b, v17.8b                  \n"
-    "trn2      v25.8b, v16.8b, v17.8b                  \n"
-
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-    "trn1      v16.8b, v18.8b, v19.8b                  \n"
-    "trn2      v17.8b, v18.8b, v19.8b                  \n"
-
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v20.4h, v20.8b                          \n"
-    "uaddlp    v21.4h, v21.8b                          \n"
-    "uaddlp    v22.4h, v22.8b                          \n"
-    "uaddlp    v23.4h, v23.8b                          \n"
-    "uaddlp    v24.4h, v24.8b                          \n"
-    "uaddlp    v25.4h, v25.8b                          \n"
-
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-
-    // combine source lines
-    "add       v20.4h, v20.4h, v22.4h                  \n"
-    "add       v21.4h, v21.4h, v23.4h                  \n"
-    "add       v20.4h, v20.4h, v24.4h                  \n"
-    "add       v21.4h, v21.4h, v25.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-    "add       v2.4h, v2.4h, v17.4h                    \n"
-
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
-    "xtn       v2.8b,  v2.8h                           \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "ushll     v16.8h, v16.8b, #0                      \n"
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
-
-    // combine source lines
-    "add       v0.8h, v0.8h, v16.8h                    \n"
-
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
-
-    // 0+1+2, 3+4+5
-    "add       v20.8h, v20.8h, v0.8h                   \n"
-    "add       v21.8h, v21.8h, v4.8h                   \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
-    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
-
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(tmp_src_stride),   // %2
-    "+r"(src_ptr1),         // %3
-    "+r"(dst_width)         // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
-    "v30", "v31", "memory", "cc"
-  );
+  asm volatile(
+      "ld1         {v29.8h}, [%5]                \n"
+      "ld1         {v30.16b}, [%6]               \n"
+      "ld1         {v31.8h}, [%7]                \n"
+      "add         %2, %2, %0                    \n"
+      "1:                                        \n"
+
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+      "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+      "subs        %w4, %w4, #12                 \n"
+
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1        v20.8b, v0.8b, v1.8b          \n"
+      "trn2        v21.8b, v0.8b, v1.8b          \n"
+      "trn1        v22.8b, v4.8b, v5.8b          \n"
+      "trn2        v23.8b, v4.8b, v5.8b          \n"
+      "trn1        v24.8b, v16.8b, v17.8b        \n"
+      "trn2        v25.8b, v16.8b, v17.8b        \n"
+
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1        v0.8b, v2.8b, v3.8b           \n"
+      "trn2        v1.8b, v2.8b, v3.8b           \n"
+      "trn1        v4.8b, v6.8b, v7.8b           \n"
+      "trn2        v5.8b, v6.8b, v7.8b           \n"
+      "trn1        v16.8b, v18.8b, v19.8b        \n"
+      "trn2        v17.8b, v18.8b, v19.8b        \n"
+
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp      v20.4h, v20.8b                \n"
+      "uaddlp      v21.4h, v21.8b                \n"
+      "uaddlp      v22.4h, v22.8b                \n"
+      "uaddlp      v23.4h, v23.8b                \n"
+      "uaddlp      v24.4h, v24.8b                \n"
+      "uaddlp      v25.4h, v25.8b                \n"
+
+      // 60+70 61+71 62+72 63+73
+      "uaddlp      v1.4h, v1.8b                  \n"
+      "uaddlp      v5.4h, v5.8b                  \n"
+      "uaddlp      v17.4h, v17.8b                \n"
+
+      // combine source lines
+      "add         v20.4h, v20.4h, v22.4h        \n"
+      "add         v21.4h, v21.4h, v23.4h        \n"
+      "add         v20.4h, v20.4h, v24.4h        \n"
+      "add         v21.4h, v21.4h, v25.4h        \n"
+      "add         v2.4h, v1.4h, v5.4h           \n"
+      "add         v2.4h, v2.4h, v17.4h          \n"
+
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "sqrdmulh    v2.8h, v2.8h, v29.8h          \n"
+      "xtn         v2.8b,  v2.8h                 \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "ushll       v16.8h, v16.8b, #0            \n"
+      "uaddl       v0.8h, v0.8b, v4.8b           \n"
+
+      // combine source lines
+      "add         v0.8h, v0.8h, v16.8h          \n"
+
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1        v1.8h, v0.8h, v0.8h           \n"
+      "trn2        v4.8h, v0.8h, v0.8h           \n"
+      "xtn         v0.4h, v1.4s                  \n"
+      "xtn         v4.4h, v4.4s                  \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      // 0+1+2, 3+4+5
+      "add         v20.8h, v20.8h, v0.8h         \n"
+      "add         v21.8h, v21.8h, v4.8h         \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh    v0.8h, v20.8h, v31.8h         \n"
+      "sqrdmulh    v1.8h, v21.8h, v31.8h         \n"
+      "prfm        pldl1keep, [%3, 448]          \n"
+
+      // Align for table lookup, vtbl requires registers to be adjacent
+      "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+      "st1         {v3.8b}, [%1], #8             \n"
+      "st1         {v3.s}[2], [%1], #4           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(src_ptr1),        // %3
+        "+r"(dst_width)        // %4
+      : "r"(&kMult38_Div6),    // %5
+        "r"(&kShuf38_2),       // %6
+        "r"(&kMult38_Div9)     // %7
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+        "memory", "cc");
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8_t* dst_ptr,
+                               int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile (
-    MEMACCESS(4)
-    "ld1       {v30.8h}, [%4]                          \n"
-    MEMACCESS(5)
-    "ld1       {v31.16b}, [%5]                         \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
-
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    "subs      %w3, %w3, #12                           \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v16.8b, v0.8b, v1.8b                    \n"
-    "trn2      v17.8b, v0.8b, v1.8b                    \n"
-    "trn1      v18.8b, v4.8b, v5.8b                    \n"
-    "trn2      v19.8b, v4.8b, v5.8b                    \n"
-
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v16.4h, v16.8b                          \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-    "uaddlp    v18.4h, v18.8b                          \n"
-    "uaddlp    v19.4h, v19.8b                          \n"
-
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-
-    // combine source lines
-    "add       v16.4h, v16.4h, v18.4h                  \n"
-    "add       v17.4h, v17.4h, v19.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "uqrshrn   v2.8b, v2.8h, #2                        \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-
-    // combine source lines
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
-
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
-
-    // 0+1+2, 3+4+5
-    "add       v16.8h, v16.8h, v0.8h                   \n"
-    "add       v17.8h, v17.8h, v4.8h                   \n"
-
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
-    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
-
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),         // %0
-    "+r"(dst_ptr),         // %1
-    "+r"(tmp_src_stride),  // %2
-    "+r"(dst_width)        // %3
-  : "r"(&kMult38_Div6),    // %4
-    "r"(&kShuf38_2)        // %5
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v30", "v31", "memory", "cc"
-  );
+  asm volatile(
+      "ld1         {v30.8h}, [%4]                \n"
+      "ld1         {v31.16b}, [%5]               \n"
+      "add         %2, %2, %0                    \n"
+      "1:                                        \n"
+
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+      "subs        %w3, %w3, #12                 \n"
+
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1        v16.8b, v0.8b, v1.8b          \n"
+      "trn2        v17.8b, v0.8b, v1.8b          \n"
+      "trn1        v18.8b, v4.8b, v5.8b          \n"
+      "trn2        v19.8b, v4.8b, v5.8b          \n"
+
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1        v0.8b, v2.8b, v3.8b           \n"
+      "trn2        v1.8b, v2.8b, v3.8b           \n"
+      "trn1        v4.8b, v6.8b, v7.8b           \n"
+      "trn2        v5.8b, v6.8b, v7.8b           \n"
+
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp      v16.4h, v16.8b                \n"
+      "uaddlp      v17.4h, v17.8b                \n"
+      "uaddlp      v18.4h, v18.8b                \n"
+      "uaddlp      v19.4h, v19.8b                \n"
+
+      // 60+70 61+71 62+72 63+73
+      "uaddlp      v1.4h, v1.8b                  \n"
+      "uaddlp      v5.4h, v5.8b                  \n"
+
+      // combine source lines
+      "add         v16.4h, v16.4h, v18.4h        \n"
+      "add         v17.4h, v17.4h, v19.4h        \n"
+      "add         v2.4h, v1.4h, v5.4h           \n"
+
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "uqrshrn     v2.8b, v2.8h, #2              \n"
+
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+      // combine source lines
+      "uaddl       v0.8h, v0.8b, v4.8b           \n"
+
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1        v1.8h, v0.8h, v0.8h           \n"
+      "trn2        v4.8h, v0.8h, v0.8h           \n"
+      "xtn         v0.4h, v1.4s                  \n"
+      "xtn         v4.4h, v4.4s                  \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      // 0+1+2, 3+4+5
+      "add         v16.8h, v16.8h, v0.8h         \n"
+      "add         v17.8h, v17.8h, v4.8h         \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh    v0.8h, v16.8h, v30.8h         \n"
+      "sqrdmulh    v1.8h, v17.8h, v30.8h         \n"
+
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+
+      "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+      "st1         {v3.8b}, [%1], #8             \n"
+      "st1         {v3.s}[2], [%1], #4           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(dst_width)        // %3
+      : "r"(&kMult38_Div6),    // %4
+        "r"(&kShuf38_2)        // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v30", "v31", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp = NULL;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       w12, %w5                        \n"
-    "eor       v2.16b, v2.16b, v2.16b          \n"
-    "eor       v3.16b, v3.16b, v3.16b          \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %3              \n"
-    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
-    "uaddw     v2.8h, v2.8h, v0.8b             \n"
-    "subs      w12, w12, #1                    \n"
-    "b.gt      2b                              \n"
-    MEMACCESS(2)
-    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
-    "add      %1, %1, #16                      \n"
-    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
-    "b.gt     1b                               \n"
-  : "+r"(src_tmp),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(dst_ptr),          // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_width),        // %4
-    "+r"(src_height)        // %5
-  :
-  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+// Add a row of bytes to a row of shorts.  Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.8h, v2.8h}, [%1]          \n"  // load accumulator
+      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 bytes
+      "uaddw2      v2.8h, v2.8h, v0.16b          \n"  // add
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uaddw       v1.8h, v1.8h, v0.8b           \n"
+      "st1         {v1.8h, v2.8h}, [%1], #32     \n"  // store accumulator
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
   );
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                    \n"              \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_ptr;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
-    "dup        v0.4s, %w3                     \n"  // x
-    "dup        v1.4s, %w4                     \n"  // dx
-    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
-    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
-    "mul        v1.4s, v1.4s, v2.4s            \n"
+      "dup         v0.4s, %w3                    \n"  // x
+      "dup         v1.4s, %w4                    \n"  // dx
+      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
+      "shl         v3.4s, v1.4s, #2              \n"  // 4 * dx
+      "mul         v1.4s, v1.4s, v2.4s           \n"
     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "add        v1.4s, v1.4s, v0.4s            \n"
+      "add         v1.4s, v1.4s, v0.4s           \n"
     // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
-    "add        v2.4s, v1.4s, v3.4s            \n"
-    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
-  "1:                                          \n"
+      "add         v2.4s, v1.4s, v3.4s           \n"
+      "shl         v0.4s, v3.4s, #1              \n"  // 8 * dx
+      "1:                                        \n"
     LOAD2_DATA8_LANE(0)
     LOAD2_DATA8_LANE(1)
     LOAD2_DATA8_LANE(2)
@@ -615,31 +600,30 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     LOAD2_DATA8_LANE(5)
     LOAD2_DATA8_LANE(6)
     LOAD2_DATA8_LANE(7)
-    "mov       v6.16b, v1.16b                  \n"
-    "mov       v7.16b, v2.16b                  \n"
-    "uzp1      v6.8h, v6.8h, v7.8h             \n"
-    "ushll     v4.8h, v4.8b, #0                \n"
-    "ushll     v5.8h, v5.8b, #0                \n"
-    "ssubl     v16.4s, v5.4h, v4.4h            \n"
-    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
-    "ushll     v7.4s, v6.4h, #0                \n"
-    "ushll2    v6.4s, v6.8h, #0                \n"
-    "mul       v16.4s, v16.4s, v7.4s           \n"
-    "mul       v17.4s, v17.4s, v6.4s           \n"
-    "shrn      v6.4h, v16.4s, #16              \n"
-    "shrn2     v6.8h, v17.4s, #16              \n"
-    "add       v4.8h, v4.8h, v6.8h             \n"
-    "xtn       v4.8b, v4.8h                    \n"
-
-    MEMACCESS(0)
-    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
-    "add       v1.4s, v1.4s, v0.4s             \n"
-    "add       v2.4s, v2.4s, v0.4s             \n"
-    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
-    "b.gt      1b                              \n"
+      "mov         v6.16b, v1.16b                \n"
+      "mov         v7.16b, v2.16b                \n"
+      "uzp1        v6.8h, v6.8h, v7.8h           \n"
+      "ushll       v4.8h, v4.8b, #0              \n"
+      "ushll       v5.8h, v5.8b, #0              \n"
+      "ssubl       v16.4s, v5.4h, v4.4h          \n"
+      "ssubl2      v17.4s, v5.8h, v4.8h          \n"
+      "ushll       v7.4s, v6.4h, #0              \n"
+      "ushll2      v6.4s, v6.8h, #0              \n"
+      "mul         v16.4s, v16.4s, v7.4s         \n"
+      "mul         v17.4s, v17.4s, v6.4s         \n"
+      "rshrn       v6.4h, v16.4s, #16            \n"
+      "rshrn2      v6.8h, v17.4s, #16            \n"
+      "add         v4.8h, v4.8h, v6.8h           \n"
+      "xtn         v4.8b, v4.8h                  \n"
+
+      "st1         {v4.8b}, [%0], #8             \n"  // store pixels
+      "add         v1.4s, v1.4s, v0.4s           \n"
+      "add         v2.4s, v2.4s, v0.4s           \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "b.gt        1b                            \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -653,342 +637,328 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-    int y_fraction = 256 - source_y_fraction;
-  asm volatile (
-    "cmp          %w4, #0                      \n"
-    "b.eq         100f                         \n"
-    "add          %2, %2, %1                   \n"
-    "cmp          %w4, #64                     \n"
-    "b.eq         75f                          \n"
-    "cmp          %w4, #128                    \n"
-    "b.eq         50f                          \n"
-    "cmp          %w4, #192                    \n"
-    "b.eq         25f                          \n"
-
-    "dup          v5.8b, %w4                   \n"
-    "dup          v4.8b, %w5                   \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "umull        v6.8h, v0.8b, v4.8b          \n"
-    "umull2       v7.8h, v0.16b, v4.16b        \n"
-    "umlal        v6.8h, v1.8b, v5.8b          \n"
-    "umlal2       v7.8h, v1.16b, v5.16b        \n"
-    "rshrn        v0.8b, v6.8h, #8             \n"
-    "rshrn2       v0.16b, v7.8h, #8            \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         1b                           \n"
-    "b            99f                          \n"
-
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         25b                          \n"
-    "b            99f                          \n"
-
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         50b                          \n"
-    "b            99f                          \n"
-
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v1.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v0.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         75b                          \n"
-    "b            99f                          \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         100b                         \n"
-
-  "99:                                         \n"
-    MEMACCESS(0)
-    "st1          {v0.b}[15], [%0]             \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction),// %4
-    "+r"(y_fraction)        // %5
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
-  );
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;
+  asm volatile(
+      "cmp         %w4, #0                       \n"
+      "b.eq        100f                          \n"
+      "add         %2, %2, %1                    \n"
+      "cmp         %w4, #64                      \n"
+      "b.eq        75f                           \n"
+      "cmp         %w4, #128                     \n"
+      "b.eq        50f                           \n"
+      "cmp         %w4, #192                     \n"
+      "b.eq        25f                           \n"
+
+      "dup         v5.8b, %w4                    \n"
+      "dup         v4.8b, %w5                    \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "subs        %w3, %w3, #16                 \n"
+      "umull       v6.8h, v0.8b, v4.8b           \n"
+      "umull2      v7.8h, v0.16b, v4.16b         \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      "umlal       v6.8h, v1.8b, v5.8b           \n"
+      "umlal2      v7.8h, v1.16b, v5.16b         \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "rshrn       v0.8b, v6.8h, #8              \n"
+      "rshrn2      v0.16b, v7.8h, #8             \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        1b                            \n"
+      "b           99f                           \n"
+
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "subs        %w3, %w3, #16                 \n"
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        25b                           \n"
+      "b           99f                           \n"
+
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "ld1         {v1.16b}, [%2], #16           \n"
+      "subs        %w3, %w3, #16                 \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        50b                           \n"
+      "b           99f                           \n"
+
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "ld1         {v1.16b}, [%1], #16           \n"
+      "ld1         {v0.16b}, [%2], #16           \n"
+      "subs        %w3, %w3, #16                 \n"
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        75b                           \n"
+      "b           99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1         {v0.16b}, [%1], #16           \n"
+      "subs        %w3, %w3, #16                 \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v0.16b}, [%0], #16           \n"
+      "b.gt        100b                          \n"
+
+      "99:                                       \n"
+      "st1         {v0.b}[15], [%0]              \n"
+      : "+r"(dst_ptr),            // %0
+        "+r"(src_ptr),            // %1
+        "+r"(src_stride),         // %2
+        "+r"(dst_width),          // %3
+        "+r"(source_y_fraction),  // %4
+        "+r"(y_fraction)          // %5
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS (0)
-    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
-    MEMACCESS (0)
-    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS (1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    MEMACCESS (1)
-    "st1        {v3.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (dst),              // %1
-    "+r" (dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "mov         v2.16b, v3.16b                \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st2         {v1.4s,v2.4s}, [%1], #32      \n"  // store 8 odd pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS (0)
-    // load 8 ARGB pixels.
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #1               \n"
-    "rshrn      v2.8b, v2.8h, #1               \n"
-    "rshrn      v3.8b, v3.8h, #1               \n"
-    MEMACCESS (1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+
+      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "urhadd      v1.16b, v2.16b, v3.16b        \n"
+      "st2         {v0.4s,v1.4s}, [%1], #32      \n"  // store 8 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS (0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS (1)
-    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
-    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
-    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #2               \n"
-    "rshrn      v2.8b, v2.8h, #2               \n"
-    "rshrn      v3.8b, v3.8h, #2               \n"
-    MEMACCESS (2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (src_stride),       // %1
-    "+r" (dst),              // %2
-    "+r" (dst_width)         // %3
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "uaddlp      v3.8h, v3.16b                 \n"  // A 16 bytes -> 8 shorts.
+      "ld4         {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
+      "uadalp      v0.8h, v16.16b                \n"  // B 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v17.16b                \n"  // G 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uadalp      v2.8h, v18.16b                \n"  // R 16 bytes -> 8 shorts.
+      "uadalp      v3.8h, v19.16b                \n"  // A 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
+      "rshrn       v1.8b, v1.8h, #2              \n"
+      "rshrn       v2.8b, v2.8h, #2              \n"
+      "rshrn       v3.8b, v3.8h, #2              \n"
+      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[0], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[1], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[2], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[3], [%0], %3            \n"
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"((int64)(src_stepx * 4)) // %3
-  : "memory", "cc", "v0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.s}[0], [%0], %3           \n"
+      "ld1         {v0.s}[1], [%0], %3           \n"
+      "ld1         {v0.s}[2], [%0], %3           \n"
+      "ld1         {v0.s}[3], [%0], %3           \n"
+      "subs        %w2, %w2, #4                  \n"  // 4 pixels per loop.
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "st1         {v0.16b}, [%1], #16           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),                // %0
+        "+r"(dst_argb),                // %1
+        "+r"(dst_width)                // %2
+      : "r"((int64_t)(src_stepx * 4))  // %3
+      : "memory", "cc", "v0");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
 // TODO(Yang Zhang): Might be worth another optimization pass in future.
 // It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v4.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v5.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v6.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v7.8b}, [%1], %4              \n"
-    "uaddl      v0.8h, v0.8b, v1.8b            \n"
-    "uaddl      v2.8h, v2.8b, v3.8b            \n"
-    "uaddl      v4.8h, v4.8b, v5.8b            \n"
-    "uaddl      v6.8h, v6.8b, v7.8b            \n"
-    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
-    "mov        v0.d[1], v2.d[0]               \n"
-    "mov        v2.d[0], v16.d[1]              \n"
-    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
-    "mov        v4.d[1], v6.d[0]               \n"
-    "mov        v6.d[0], v16.d[1]              \n"
-    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
-    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
-    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
-    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
-    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "st1     {v0.16b}, [%2], #16               \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"((int64)(src_stepx * 4)) // %4
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "ld1         {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
+      "ld1         {v1.8b}, [%1], %4             \n"
+      "ld1         {v2.8b}, [%0], %4             \n"
+      "ld1         {v3.8b}, [%1], %4             \n"
+      "ld1         {v4.8b}, [%0], %4             \n"
+      "ld1         {v5.8b}, [%1], %4             \n"
+      "ld1         {v6.8b}, [%0], %4             \n"
+      "ld1         {v7.8b}, [%1], %4             \n"
+      "uaddl       v0.8h, v0.8b, v1.8b           \n"
+      "uaddl       v2.8h, v2.8b, v3.8b           \n"
+      "uaddl       v4.8h, v4.8b, v5.8b           \n"
+      "uaddl       v6.8h, v6.8b, v7.8b           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "mov         v16.d[1], v0.d[1]             \n"  // ab_cd -> ac_bd
+      "mov         v0.d[1], v2.d[0]              \n"
+      "mov         v2.d[0], v16.d[1]             \n"
+      "mov         v16.d[1], v4.d[1]             \n"  // ef_gh -> eg_fh
+      "mov         v4.d[1], v6.d[0]              \n"
+      "mov         v6.d[0], v16.d[1]             \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "add         v0.8h, v0.8h, v2.8h           \n"  // (a+b)_(c+d)
+      "add         v4.8h, v4.8h, v6.8h           \n"  // (e+f)_(g+h)
+      "rshrn       v0.8b, v0.8h, #2              \n"  // first 2 pixels.
+      "rshrn2      v0.16b, v4.8h, #2             \n"  // next 2 pixels.
+      "subs        %w3, %w3, #4                  \n"  // 4 pixels per loop.
+      "st1         {v0.16b}, [%2], #16           \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_argb),                // %0
+        "+r"(src_stride),              // %1
+        "+r"(dst_argb),                // %2
+        "+r"(dst_width)                // %3
+      : "r"((int64_t)(src_stepx * 4))  // %4
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld1        {"#vn".s}["#n"], [%6]          \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  int64 tmp64 = 0;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(v0, 0)
-    LOAD1_DATA32_LANE(v0, 1)
-    LOAD1_DATA32_LANE(v0, 2)
-    LOAD1_DATA32_LANE(v0, 3)
-    LOAD1_DATA32_LANE(v1, 0)
-    LOAD1_DATA32_LANE(v1, 1)
-    LOAD1_DATA32_LANE(v1, 2)
-    LOAD1_DATA32_LANE(v1, 3)
-
-    MEMACCESS(0)
-    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    "b.gt        1b                            \n"
-  : "+r"(dst_argb),         // %0
-    "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
-    "+r"(x64),              // %3
-    "+r"(dx64),             // %4
-    "+r"(tmp64),            // %5
-    "+r"(src_tmp)           // %6
-  :
-  : "memory", "cc", "v0", "v1"
-  );
+#define LOAD1_DATA32_LANE(vn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
+  int64_t tmp64;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(v0, 0)
+      LOAD1_DATA32_LANE(v0, 1)
+      LOAD1_DATA32_LANE(v0, 2)
+      LOAD1_DATA32_LANE(v0, 3)
+      LOAD1_DATA32_LANE(v1, 0)
+      LOAD1_DATA32_LANE(v1, 1)
+      LOAD1_DATA32_LANE(v1, 2)
+      LOAD1_DATA32_LANE(v1, 3)
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+      // clang-format on
+      "st1         {v0.4s, v1.4s}, [%0], #32     \n"  // store pixels
+      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
+      "b.gt        1b                            \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x64),        // %3
+        "+r"(dx64),       // %4
+        "=&r"(tmp64),     // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "v0", "v1");
 }
 
 #undef LOAD1_DATA32_LANE
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
+  "lsr        %5, %3, #16                           \n" \
+  "add        %6, %1, %5, lsl #2                    \n" \
+  "add        %3, %3, %4                            \n" \
+  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
-    "dup        v0.4s, %w3                     \n"  // x
-    "dup        v1.4s, %w4                     \n"  // dx
-    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
-    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
-    "mul        v1.4s, v1.4s, v2.4s            \n"
-    "movi       v3.16b, #0x7f                  \n"  // 0x7F
-    "movi       v4.8h, #0x7f                   \n"  // 0x7F
+      "dup         v0.4s, %w3                    \n"  // x
+      "dup         v1.4s, %w4                    \n"  // dx
+      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
+      "shl         v6.4s, v1.4s, #2              \n"  // 4 * dx
+      "mul         v1.4s, v1.4s, v2.4s           \n"
+      "movi        v3.16b, #0x7f                 \n"  // 0x7F
+      "movi        v4.8h, #0x7f                  \n"  // 0x7F
     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
-    "add        v5.4s, v1.4s, v0.4s            \n"
-  "1:                                          \n"
+      "add         v5.4s, v1.4s, v0.4s           \n"
+      "1:                                        \n"
     // d0, d1: a
     // d2, d3: b
     LOAD2_DATA32_LANE(v0, v1, 0)
@@ -1009,19 +979,18 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "umull2     v17.8h, v0.16b, v7.16b         \n"
     "umull      v18.8h, v1.8b, v2.8b           \n"
     "umull2     v19.8h, v1.16b, v2.16b         \n"
+    "prfm       pldl1keep, [%1, 448]           \n"  // prefetch 7 lines ahead
     "add        v16.8h, v16.8h, v18.8h         \n"
     "add        v17.8h, v17.8h, v19.8h         \n"
     "shrn       v0.8b, v16.8h, #7              \n"
     "shrn2      v0.16b, v17.8h, #7             \n"
-
-    MEMACCESS(0)
     "st1     {v0.4s}, [%0], #16                \n"  // store pixels
     "add     v5.4s, v5.4s, v6.4s               \n"
     "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
-    "b.gt    1b                                \n"
+    "b.gt       1b                             \n"
   : "+r"(dst_argb),         // %0
     "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -1034,6 +1003,147 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
 
 #undef LOAD2_DATA32_LANE
 
+// Read 16x2 average down and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %0, %1, lsl #1            \n"  // ptr + stide * 2
+      "1:                                        \n"
+      "ld1         {v0.8h, v1.8h}, [%0], #32     \n"  // load row 1 and post inc
+      "ld1         {v2.8h, v3.8h}, [%1], #32     \n"  // load row 2 and post inc
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop
+      "uaddlp      v0.4s, v0.8h                  \n"  // row 1 add adjacent
+      "uaddlp      v1.4s, v1.8h                  \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uadalp      v0.4s, v2.8h                  \n"  // +row 2 add adjacent
+      "uadalp      v1.4s, v3.8h                  \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "rshrn       v0.4h, v0.4s, #2              \n"  // round and pack
+      "rshrn2      v0.8h, v1.4s, #2              \n"
+      "st1         {v0.8h}, [%2], #16            \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
+  asm volatile(
+      "add         %1, %0, %1, lsl #1            \n"  // ptr + stide * 2
+      "movi        v0.8h, #9                     \n"  // constants
+      "movi        v1.4s, #3                     \n"
+
+      "1:                                        \n"
+      "ld1         {v3.8h}, [%0], %4             \n"  // TL read first 8
+      "ld1         {v4.8h}, [%0], %5             \n"  // TR read 8 offset by 1
+      "ld1         {v5.8h}, [%1], %4             \n"  // BL read 8 from next row
+      "ld1         {v6.8h}, [%1], %5             \n"  // BR offset by 1
+      "subs        %w3, %w3, #16                 \n"  // 16 dst pixels per loop
+      "umull       v16.4s, v3.4h, v0.4h          \n"
+      "umull2      v7.4s, v3.8h, v0.8h           \n"
+      "umull       v18.4s, v4.4h, v0.4h          \n"
+      "umull2      v17.4s, v4.8h, v0.8h          \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "uaddw       v16.4s, v16.4s, v6.4h         \n"
+      "uaddl2      v19.4s, v6.8h, v3.8h          \n"
+      "uaddl       v3.4s, v6.4h, v3.4h           \n"
+      "uaddw2      v6.4s, v7.4s, v6.8h           \n"
+      "uaddl2      v7.4s, v5.8h, v4.8h           \n"
+      "uaddl       v4.4s, v5.4h, v4.4h           \n"
+      "uaddw       v18.4s, v18.4s, v5.4h         \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "mla         v16.4s, v4.4s, v1.4s          \n"
+      "mla         v18.4s, v3.4s, v1.4s          \n"
+      "mla         v6.4s, v7.4s, v1.4s           \n"
+      "uaddw2      v4.4s, v17.4s, v5.8h          \n"
+      "uqrshrn     v16.4h,  v16.4s, #4           \n"
+      "mla         v4.4s, v19.4s, v1.4s          \n"
+      "uqrshrn2    v16.8h, v6.4s, #4             \n"
+      "uqrshrn     v17.4h, v18.4s, #4            \n"
+      "uqrshrn2    v17.8h, v4.4s, #4             \n"
+      "st2         {v16.8h-v17.8h}, [%2], #32    \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      : "r"(2LL),          // %4
+        "r"(14LL)          // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19"  // Clobber List
+  );
+}
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 UV
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uaddlp      v0.8h, v0.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "ld2         {v16.16b,v17.16b}, [%1], #32  \n"  // load 16
+      "uadalp      v0.8h, v16.16b                \n"  // U 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v17.16b                \n"  // V 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "rshrn       v1.8b, v1.8h, #2              \n"
+      "st2         {v0.8b,v1.8b}, [%2], #16      \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             int src_stepx,  // pixel step
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.h}[0], [%0], %6            \n"
+      "ld1        {v1.h}[0], [%1], %6            \n"
+      "ld1        {v2.h}[0], [%2], %6            \n"
+      "ld1        {v3.h}[0], [%3], %6            \n"
+      "subs       %w5, %w5, #4                   \n"  // 4 pixels per loop.
+      "st4        {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),                 // %0
+        "+r"(src1_ptr),                // %1
+        "+r"(src2_ptr),                // %2
+        "+r"(src3_ptr),                // %3
+        "+r"(dst_ptr),                 // %4
+        "+r"(dst_width)                // %5
+      : "r"((int64_t)(src_stepx * 8))  // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/media/libaom/src/third_party/libyuv/source/scale_uv.cc b/media/libaom/src/third_party/libyuv/source/scale_uv.cc
new file mode 100644
index 0000000000..b0469f09b8
--- /dev/null
+++ b/media/libaom/src/third_party/libyuv/source/scale_uv.cc
@@ -0,0 +1,891 @@
+/*
+ *  Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+                         int src_height,
+                         int dst_width,
+                         int dst_height,
+                         int src_stride,
+                         int dst_stride,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_uv,
+                         int x,
+                         int dx,
+                         int y,
+                         int dy,
+                         enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+                          uint8_t* dst_uv, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleUVRowDown2_C
+          : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+                                        : ScaleUVRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  } else {
+    src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+  }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && filtering) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+    }
+  }
+#endif
+
+// This code is not enabled.  Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+                                          : ScaleUVRowDown2Box_Any_SSSE3);
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+                                            : ScaleUVRowDown2Box_SSSE3);
+    }
+  }
+#endif
+// This code is not enabled.  Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+                                          : ScaleUVRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+                                            : ScaleUVRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_MMI
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
+                                          : ScaleUVRowDown2Box_Any_MMI);
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_MMI
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
+                                            : ScaleUVRowDown2Box_MMI);
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+                                          : ScaleUVRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+                                            : ScaleUVRowDown2Box_MSA);
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+    src_uv += row_stride;
+    dst_uv += dst_stride;
+  }
+}
+#endif  // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_uv,
+                            uint8_t* dst_uv,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
+  int j;
+  // Allocate 2 rows of UV.
+  const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+                          uint8_t* dst_uv, int dst_width) =
+      ScaleUVRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+    }
+  }
+#endif
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+    ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+                    dst_width * 2);
+    ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);
+    src_uv += row_stride;
+    dst_uv += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+#endif  // HAS_SCALEUVDOWN4BOX
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV to even
+// multiple of its original size.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_uv,
+                            uint8_t* dst_uv,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy,
+                            enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+                             int src_step, uint8_t* dst_uv, int dst_width) =
+      filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+                                   : ScaleUVRowDownEven_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDownEven =
+          filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+    ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+    }
+  }
+#endif// TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+                                   : ScaleUVRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDownEven =
+          filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleUVRowDownEven =
+        filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI;
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleUVRowDownEven =
+          filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVRowDownEven =
+        filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDownEven =
+          filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+    src_uv += row_stride;
+    dst_uv += dst_stride;
+  }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8_t* src_uv,
+                                uint8_t* dst_uv,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+                            int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;    // Left edge aligned.
+  xr = (xr >> 16) + 1;     // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 2;  // Width aligned to 2.
+  src_uv += xl * 2;
+  x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+    }
+  }
+#endif
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of UV.
+  {
+    align_buffer_64(row, clip_src_width * 2);
+
+    const int max_y = (src_height - 1) << 16;
+    if (y > max_y) {
+      y = max_y;
+    }
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;
+      const uint8_t* src = src_uv + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+      }
+      dst_uv += dst_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+#endif
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_uv,
+                              uint8_t* dst_uv,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
+                              enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+                            int dst_width, int x, int dx) =
+      filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+  const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(dst_width, 2)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+  if (src_width >= 32768) {
+    ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+  }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+  if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleUVFilterCols = ScaleUVCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVFilterCols = ScaleUVCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+  if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+    ScaleUVFilterCols = ScaleUVCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleUVFilterCols = ScaleUVCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVFilterCols = ScaleUVCols_MSA;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+      ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+    }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleUVFilterCols = ScaleUVColsUp2_MMI;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  {
+    int yi = y >> 16;
+    const uint8_t* src = src_uv + yi * src_stride;
+
+    // Allocate 2 rows of UV.
+    const int kRowSize = (dst_width * 2 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8_t* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_uv + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+      }
+      dst_uv += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+#endif  // HAS_SCALEUVBILINEARUP
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleUVSimple(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_uv,
+                          int x,
+                          int dx,
+                          int y,
+                          int dy) {
+  int j;
+  void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+                      int x, int dx) =
+      (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+  (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleUVCols = ScaleUVCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVCols = ScaleUVCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVCols = ScaleUVCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleUVCols = ScaleUVCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleUVCols = ScaleUVCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleUVCols = ScaleUVCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVCols = ScaleUVCols_MSA;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+      ScaleUVCols = ScaleUVColsUp2_SSSE3;
+    }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleUVCols = ScaleUVColsUp2_MMI;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+    dst_uv += dst_stride;
+    y += dy;
+  }
+}
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_UV,
+                  int src_stride_UV,
+                  uint8_t* dst_UV,
+                  int dst_stride_UV,
+                  int width,
+                  int height) {
+  if (!src_UV || !dst_UV || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_UV = src_UV + (height - 1) * src_stride_UV;
+    src_stride_UV = -src_stride_UV;
+  }
+
+  CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
+  return 0;
+}
+#endif  // HAS_UVCOPY
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleUV(const uint8_t* src,
+                    int src_stride,
+                    int src_width,
+                    int src_height,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int dst_width,
+                    int dst_height,
+                    int clip_x,
+                    int clip_y,
+                    int clip_width,
+                    int clip_height,
+                    enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // UV does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    int64_t clipf = (int64_t)(clip_x)*dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 2;
+    dst += clip_x * 2;
+  }
+  if (clip_y) {
+    int64_t clipf = (int64_t)(clip_y)*dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst, x, dx, y, dy,
+                       filtering);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+        ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+        return;
+#endif
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst,
+                 dst_stride, clip_width, clip_height);
+          return;
+        }
+#endif
+      }
+    }
+  }
+  // HAS_SCALEPLANEVERTICAL
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
+    return;
+  }
+
+#if HAS_SCALEUVBILINEARUP
+  if (filtering && dy < 65536) {
+    ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+                      src_stride, dst_stride, src, dst, x, dx, y, dy,
+                      filtering);
+    return;
+  }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+  if (filtering) {
+    ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+    return;
+  }
+#endif
+  ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                dst_stride, src, dst, x, dx, y, dy);
+}
+
+// Scale an UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering) {
+  if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+          dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/media/libaom/src/third_party/libyuv/source/scale_win.cc b/media/libaom/src/third_party/libyuv/source/scale_win.cc
index c3896ebad2..c5fc86f3e9 100644
--- a/media/libaom/src/third_party/libyuv/source/scale_win.cc
+++ b/media/libaom/src/third_party/libyuv/source/scale_win.cc
@@ -16,99 +16,94 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
-    defined(_MSC_VER) && !defined(__clang__)
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -121,32 +116,30 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_ptr,
+                                                 int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4  // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5       // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
-
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -157,17 +150,21 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
+
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
@@ -175,19 +172,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4  // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2  // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5  // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
-
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -200,23 +195,24 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
 // Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -228,32 +224,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                uint8_t* dst_ptr,
+                                                int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-
-    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -264,36 +259,43 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
-    mov         eax, [esp + 4 + 4]    // src_ptr
-    mov         esi, [esp + 4 + 8]    // src_stride
-    mov         edx, [esp + 4 + 12]   // dst_ptr
-    mov         ecx, [esp + 4 + 16]   // dst_width
+    mov         eax, [esp + 4 + 4]  // src_ptr
+    mov         esi, [esp + 4 + 8]  // src_stride
+    mov         edx, [esp + 4 + 12]  // dst_ptr
+    mov         ecx, [esp + 4 + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
-    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
-    vpavgb      ymm0, ymm0, [eax + esi]
-    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    vmovdqu     ymm2, [eax + esi]
+    vmovdqu     ymm3, [eax + esi + 32]
     lea         eax,  [eax + 64]
-
-    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2  // vertical add
+    vpaddw      ymm1, ymm1, ymm3
+    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm1, ymm1, 1
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -307,15 +309,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
 // Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -338,53 +341,52 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
+    mov        eax, [esp + 8 + 4]  // src_ptr
+    mov        esi, [esp + 8 + 8]  // src_stride
+    mov        edx, [esp + 8 + 12]  // dst_ptr
+    mov        ecx, [esp + 8 + 16]  // dst_width
     lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
-    psrlw      xmm7, 8
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
+    psrlw      xmm4, 15
+    movdqa     xmm5, xmm4
+    packuswb   xmm4, xmm4
+    psllw      xmm5, 3  // constant 0x0008
 
   wloop:
-    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm0, [eax]  // average rows
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4  // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2  // vertical add rows 0, 1
+    paddw      xmm1, xmm3
     movdqu     xmm2, [eax + esi * 2]
     movdqu     xmm3, [eax + esi * 2 + 16]
-    movdqu     xmm4, [eax + edi]
-    movdqu     xmm5, [eax + edi + 16]
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2  // add row 2
+    paddw      xmm1, xmm3
+    movdqu     xmm2, [eax + edi]
+    movdqu     xmm3, [eax + edi + 16]
     lea        eax, [eax + 32]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm7
-    pand       xmm3, xmm7
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
-    psrlw      xmm0, 8
-    pand       xmm2, xmm7
-    pavgw      xmm0, xmm2
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2  // add row 3
+    paddw      xmm1, xmm3
+    phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5  // + 8 for round
+    psrlw      xmm0, 4  // /16 for average of 4 * 4
     packuswb   xmm0, xmm0
-
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     sub        ecx, 8
@@ -398,15 +400,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
 #ifdef HAS_SCALEROWDOWN4_AVX2
 // Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride ignored
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
     vpsrld      ymm5, ymm5, 24
     vpslld      ymm5, ymm5, 16
 
@@ -417,10 +420,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpand       ymm0, ymm0, ymm5
     vpand       ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -432,48 +435,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
     push        edi
-    mov         eax, [esp + 8 + 4]    // src_ptr
-    mov         esi, [esp + 8 + 8]    // src_stride
-    mov         edx, [esp + 8 + 12]   // dst_ptr
-    mov         ecx, [esp + 8 + 16]   // dst_width
+    mov         eax, [esp + 8 + 4]  // src_ptr
+    mov         esi, [esp + 8 + 8]  // src_stride
+    mov         edx, [esp + 8 + 12]  // dst_ptr
+    mov         ecx, [esp + 8 + 16]  // dst_width
     lea         edi, [esi + esi * 2]  // src_stride * 3
-    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
-    vpsrlw      ymm7, ymm7, 8
+    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
+    vpsrlw      ymm4, ymm4, 15
+    vpsllw      ymm5, ymm4, 3  // constant 0x0008
+    vpackuswb   ymm4, ymm4, ymm4
 
   wloop:
-    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm0, [eax]  // average rows
     vmovdqu     ymm1, [eax + 32]
-    vpavgb      ymm0, ymm0, [eax + esi]
-    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    vmovdqu     ymm2, [eax + esi]
+    vmovdqu     ymm3, [eax + esi + 32]
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
+    vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + esi * 2]
     vmovdqu     ymm3, [eax + esi * 2 + 32]
-    vpavgb      ymm2, ymm2, [eax + edi]
-    vpavgb      ymm3, ymm3, [eax + edi + 32]
-    lea         eax, [eax + 64]
-    vpavgb      ymm0, ymm0, ymm2
-    vpavgb      ymm1, ymm1, ymm3
-
-    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
-    vpand       ymm3, ymm1, ymm7
-    vpsrlw      ymm0, ymm0, 8
-    vpsrlw      ymm1, ymm1, 8
-    vpavgw      ymm0, ymm0, ymm2
-    vpavgw      ymm1, ymm1, ymm3
-    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
-    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
-    vpsrlw      ymm0, ymm0, 8
-    vpavgw      ymm0, ymm0, ymm2
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2  // add row 2
+    vpaddw      ymm1, ymm1, ymm3
+    vmovdqu     ymm2, [eax + edi]
+    vmovdqu     ymm3, [eax + edi + 32]
+    lea         eax,  [eax + 64]
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2  // add row 3
+    vpaddw      ymm1, ymm1, ymm3
+    vphaddw     ymm0, ymm0, ymm1  // mutates
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
+    vpsrlw      ymm0, ymm0, 4  // /32 for average of 4 * 4
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -491,17 +499,18 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm3, kShuf0
-    movdqa     xmm4, kShuf1
-    movdqa     xmm5, kShuf2
+    mov        eax, [esp + 4]   // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    movdqa     xmm3, xmmword ptr kShuf0
+    movdqa     xmm4, xmmword ptr kShuf1
+    movdqa     xmm5, xmmword ptr kShuf2
 
   wloop:
     movdqu     xmm0, [eax]
@@ -538,25 +547,25 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShuf01
-    movdqa     xmm3, kShuf11
-    movdqa     xmm4, kShuf21
-    movdqa     xmm5, kMadd01
-    movdqa     xmm6, kMadd11
-    movdqa     xmm7, kRound34
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
@@ -565,7 +574,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm3
@@ -574,12 +583,12 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm4
-    movdqa     xmm1, kMadd21
+    movdqa     xmm1, xmmword ptr kMadd21
     pmaddubsw  xmm0, xmm1
     paddsw     xmm0, xmm7
     psrlw      xmm0, 2
@@ -595,25 +604,25 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShuf01
-    movdqa     xmm3, kShuf11
-    movdqa     xmm4, kShuf21
-    movdqa     xmm5, kMadd01
-    movdqa     xmm6, kMadd11
-    movdqa     xmm7, kRound34
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -623,7 +632,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -633,13 +642,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm4
-    movdqa     xmm1, kMadd21
+    movdqa     xmm1, xmmword ptr kMadd21
     pmaddubsw  xmm0, xmm1
     paddsw     xmm0, xmm7
     psrlw      xmm0, 2
@@ -657,26 +666,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    movdqa     xmm4, kShuf38a
-    movdqa     xmm5, kShuf38b
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    movdqa     xmm4, xmmword ptr kShuf38a
+    movdqa     xmm5, xmmword ptr kShuf38b
 
   xloop:
-    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
@@ -688,23 +698,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShufAc
-    movdqa     xmm3, kShufAc3
-    movdqa     xmm4, kScaleAc33
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
+    movdqa     xmm2, xmmword ptr kShufAc
+    movdqa     xmm3, xmmword ptr kShufAc3
+    movdqa     xmm4, xmmword ptr kScaleAc33
     pxor       xmm5, xmm5
 
   xloop:
-    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
     movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
@@ -722,14 +732,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
 
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     pshufb     xmm6, xmm2
 
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
     psrldq     xmm1, 2
     paddusw    xmm7, xmm1
     psrldq     xmm1, 2
@@ -737,10 +747,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     pshufb     xmm7, xmm3
     paddusw    xmm6, xmm7
 
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
     packuswb   xmm6, xmm6
 
-    movd       [edx], xmm6           // write 6 pixels
+    movd       [edx], xmm6  // write 6 pixels
     psrlq      xmm6, 16
     movd       [edx + 2], xmm6
     lea        edx, [edx + 6]
@@ -753,28 +763,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    movdqa     xmm2, kShufAb0
-    movdqa     xmm3, kShufAb1
-    movdqa     xmm4, kShufAb2
-    movdqa     xmm5, kScaleAb2
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
+    movdqa     xmm2, xmmword ptr kShufAb0
+    movdqa     xmm3, xmmword ptr kShufAb1
+    movdqa     xmm4, xmmword ptr kShufAb2
+    movdqa     xmm5, xmmword ptr kScaleAb2
 
   xloop:
-    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm0, [eax]  // average 2 rows into xmm0
     movdqu     xmm1, [eax + esi]
     lea        eax, [eax + 16]
     pavgb      xmm0, xmm1
 
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
     movdqa     xmm6, xmm0
     pshufb     xmm6, xmm3
@@ -782,10 +792,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     pshufb     xmm0, xmm4
     paddusw    xmm1, xmm0
 
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
     packuswb   xmm1, xmm1
 
-    movd       [edx], xmm1           // write 6 pixels
+    movd       [edx], xmm1  // write 6 pixels
     psrlq      xmm1, 16
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
@@ -798,26 +808,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov        eax, [esp + 4]   // src_ptr
-    mov        edx, [esp + 8]   // dst_ptr
+    mov        eax, [esp + 4]  // src_ptr
+    mov        edx, [esp + 8]  // dst_ptr
     mov        ecx, [esp + 12]  // src_width
     pxor       xmm5, xmm5
 
-  // sum rows
+        // sum rows
   xloop:
-    movdqu     xmm3, [eax]       // read 16 bytes
+    movdqu     xmm3, [eax]  // read 16 bytes
     lea        eax, [eax + 16]
-    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm0, [edx]  // read 16 words from destination
     movdqu     xmm1, [edx + 16]
     movdqa     xmm2, xmm3
     punpcklbw  xmm2, xmm5
     punpckhbw  xmm3, xmm5
-    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx], xmm0  // write 16 words to destination
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 16
@@ -828,24 +839,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov         eax, [esp + 4]   // src_ptr
-    mov         edx, [esp + 8]   // dst_ptr
+    mov         eax, [esp + 4]  // src_ptr
+    mov         edx, [esp + 8]  // dst_ptr
     mov         ecx, [esp + 12]  // src_width
     vpxor       ymm5, ymm5, ymm5
 
-  // sum rows
+        // sum rows
   xloop:
-    vmovdqu     ymm3, [eax]       // read 32 bytes
+    vmovdqu     ymm3, [eax]  // read 32 bytes
     lea         eax, [eax + 32]
     vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
     vpunpcklbw  ymm2, ymm3, ymm5
     vpunpckhbw  ymm3, ymm3, ymm5
-    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
     vpaddusw    ymm1, ymm3, [edx + 32]
-    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx], ymm0  // write 32 words to destination
     vmovdqu     [edx + 32], ymm1
     lea         edx, [edx + 64]
     sub         ecx, 32
@@ -857,72 +869,90 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 }
 #endif  // HAS_SCALEADDROW_AVX2
 
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
+
 // Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                                             const uint8_t* src_ptr,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        edi, [esp + 12 + 4]    // dst_ptr
-    mov        esi, [esp + 12 + 8]    // src_ptr
-    mov        ecx, [esp + 12 + 12]   // dst_width
+    mov        edi, [esp + 12 + 4]  // dst_ptr
+    mov        esi, [esp + 12 + 8]  // src_ptr
+    mov        ecx, [esp + 12 + 12]  // dst_width
     movd       xmm2, [esp + 12 + 16]  // x
     movd       xmm3, [esp + 12 + 20]  // dx
-    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
     movd       xmm5, eax
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pcmpeqb    xmm7, xmm7  // generate 0x0001
+    psrlw      xmm7, 15
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
     movd       xmm4, ebx
-    pshufb     xmm1, xmm5           // 0011
+    pshufb     xmm1, xmm5  // 0011
     punpcklwd  xmm0, xmm4
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
-    movd       ebx, xmm0
+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
+    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
+    movd       ebx, xmm1
     mov        [edi], bx
     lea        edi, [edi + 2]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
-
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
+            // 1 pixel remainder
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm2, 9              // 7 bit fractions.
-    pshufb     xmm2, xmm5           // 0011
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // 16 bit
-    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // 8 bits
-    movd       ebx, xmm0
+    psrlw      xmm2, 9  // 7 bit fractions.
+    pshufb     xmm2, xmm5  // 0011
+    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm2, xmm0  // 16 bit
+    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
+    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm2, xmm2  // 8 bits
+    movd       ebx, xmm2
     mov        [edi], bl
 
  xloop99:
@@ -935,13 +965,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 }
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                                         const uint8_t* src_ptr,
+                                         int dst_width,
+                                         int x,
+                                         int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_ptr
+    mov        eax, [esp + 8]  // src_ptr
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -960,15 +992,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_argb,
+                                              int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -985,23 +1017,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
 }
 
 // Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1013,16 +1045,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
 }
 
 // Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_argb
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_argb
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1030,11 +1062,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1047,18 +1079,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 }
 
 // Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 int src_stepx,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       ebx
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_argb
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_argb
-    mov        ecx, [esp + 8 + 20]   // dst_width
+    mov        eax, [esp + 8 + 4]   // src_argb
+    // src_stride ignored
+    mov        ebx, [esp + 8 + 12]  // src_stepx
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // dst_width
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1083,21 +1116,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
 }
 
 // Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    int src_stepx,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]    // src_argb
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_argb
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]       // row1 pointer
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        esi, [esp + 12 + 8]  // src_stride
+    mov        ebx, [esp + 12 + 12]  // src_stepx
+    mov        edx, [esp + 12 + 16]  // dst_argb
+    mov        ecx, [esp + 12 + 20]  // dst_width
+    lea        esi, [eax + esi]  // row1 pointer
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1112,11 +1145,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     movq       xmm3, qword ptr [esi + ebx * 2]
     movhps     xmm3, qword ptr [esi + edi]
     lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1131,64 +1164,66 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                                          const uint8_t* src_argb,
+                                          int dst_width,
+                                          int x,
+                                          int dx) {
   __asm {
     push       edi
     push       esi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
 
-    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
-    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
     paddd      xmm2, xmm0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
-    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
-    paddd      xmm2, xmm0            // x3 x2 x1 x0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
-    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0  // x3 x2 x1 x0
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
 
-    pextrw     eax, xmm2, 1          // get x0 integer.
-    pextrw     edx, xmm2, 3          // get x1 integer.
+    pextrw     eax, xmm2, 1  // get x0 integer.
+    pextrw     edx, xmm2, 3  // get x1 integer.
 
     cmp        ecx, 0
     jle        xloop99
     sub        ecx, 4
     jl         xloop49
 
-    // 4 Pixel loop.
+        // 4 Pixel loop.
  xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    pextrw     edx, xmm2, 7           // get x3 integer.
-    paddd      xmm2, xmm3             // x += dx
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    pextrw     edx, xmm2, 7  // get x3 integer.
+    paddd      xmm2, xmm3  // x += dx
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
     movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
-    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
-    punpckldq  xmm1, xmm4             // x2 x3
-    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4  // x2 x3
+    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
     movdqu     [edi], xmm0
     lea        edi, [edi + 16]
-    sub        ecx, 4                 // 4 pixels
+    sub        ecx, 4  // 4 pixels
     jge        xloop4
 
  xloop49:
     test       ecx, 2
     je         xloop29
 
-    // 2 Pixels.
+        // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
@@ -1197,7 +1232,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     test       ecx, 1
     je         xloop99
 
-    // 1 Pixels.
+        // 1 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
  xloop99:
@@ -1212,60 +1247,62 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
 // TODO(fbarchard): Port to Neon
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                                                 const uint8_t* src_argb,
+                                                 int dst_width,
+                                                 int x,
+                                                 int dx) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
-    movdqa     xmm4, kShuffleColARGB
-    movdqa     xmm5, kShuffleFractions
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    movdqa     xmm4, xmmword ptr kShuffleColARGB
+    movdqa     xmm5, xmmword ptr kShuffleFractions
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5           // 0000000011111111
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    pshufb     xmm1, xmm5  // 0000000011111111
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
@@ -1273,15 +1310,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
-    psrlw      xmm2, 9              // 7 bit fractions.
+            // 1 pixel remainder
+    psrlw      xmm2, 9  // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5           // 00000000
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    pshufb     xmm2, xmm5  // 00000000
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
 
  xloop99:
@@ -1293,13 +1330,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                                             const uint8_t* src_argb,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_argb
-    mov        eax, [esp + 8]    // src_argb
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_argb
+    mov        eax, [esp + 8]  // src_argb
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1318,12 +1357,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
+__declspec(naked) int FixedDiv_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
@@ -1331,13 +1369,12 @@ int FixedDiv_X86(int num, int div) {
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    mov        ecx, [esp + 8]    // denom
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    mov        ecx, [esp + 8]  // denom
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     sub        eax, 0x00010001
     sbb        edx, 0
diff --git a/media/libaom/src/third_party/libyuv/source/video_common.cc b/media/libaom/src/third_party/libyuv/source/video_common.cc
index 379a0669ae..92384c050c 100644
--- a/media/libaom/src/third_party/libyuv/source/video_common.cc
+++ b/media/libaom/src/third_party/libyuv/source/video_common.cc
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "libyuv/video_common.h"
 
 #ifdef __cplusplus
@@ -16,39 +15,39 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
 struct FourCCAliasEntry {
-  uint32 alias;
-  uint32 canonical;
+  uint32_t alias;
+  uint32_t canonical;
 };
 
-static const struct FourCCAliasEntry kFourCCAliases[] = {
-  {FOURCC_IYUV, FOURCC_I420},
-  {FOURCC_YU16, FOURCC_I422},
-  {FOURCC_YU24, FOURCC_I444},
-  {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
-  {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
-  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
-  {FOURCC_RGB3, FOURCC_RAW },
-  {FOURCC_BGR3, FOURCC_24BG},
-  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
-  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
-  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
-  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
-  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+#define NUM_ALIASES 18
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
+    {FOURCC_IYUV, FOURCC_I420},
+    {FOURCC_YU12, FOURCC_I420},
+    {FOURCC_YU16, FOURCC_I422},
+    {FOURCC_YU24, FOURCC_I444},
+    {FOURCC_YUYV, FOURCC_YUY2},
+    {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+    {FOURCC_HDYC, FOURCC_UYVY},
+    {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+    {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+    {FOURCC_DMB1, FOURCC_MJPG},
+    {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+    {FOURCC_RGB3, FOURCC_RAW},
+    {FOURCC_BGR3, FOURCC_24BG},
+    {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+    {FOURCC_CM24, FOURCC_RAW},   // kCMPixelFormat_24RGB
+    {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+    {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+    {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
 };
 // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
 //  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
 
 LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
+uint32_t CanonicalFourCC(uint32_t fourcc) {
   int i;
-  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+  for (i = 0; i < NUM_ALIASES; ++i) {
     if (kFourCCAliases[i].alias == fourcc) {
       return kFourCCAliases[i].canonical;
     }
@@ -61,4 +60,3 @@ uint32 CanonicalFourCC(uint32 fourcc) {
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libaom/src/third_party/vector/vector.c b/media/libaom/src/third_party/vector/vector.c
index 4b8b9c6fd9..2295b8f080 100644
--- a/media/libaom/src/third_party/vector/vector.c
+++ b/media/libaom/src/third_party/vector/vector.c
@@ -3,7 +3,7 @@ The MIT License(MIT)
 Copyright(c) 2016 Peter Goldsborough
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files(the "Software"), to deal in
+this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 the Software, and to permit persons to whom the Software is furnished to do so,
diff --git a/media/libaom/src/third_party/vector/vector.h b/media/libaom/src/third_party/vector/vector.h
index d09eb64c93..acc70fe099 100644
--- a/media/libaom/src/third_party/vector/vector.h
+++ b/media/libaom/src/third_party/vector/vector.h
@@ -3,7 +3,7 @@ The MIT License(MIT)
 Copyright(c) 2016 Peter Goldsborough
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files(the "Software"), to deal in
+this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 the Software, and to permit persons to whom the Software is furnished to do so,
diff --git a/media/libaom/src/third_party/x86inc/README.libaom b/media/libaom/src/third_party/x86inc/README.libaom
index 07c4dad20a..2f3e5c2620 100644
--- a/media/libaom/src/third_party/x86inc/README.libaom
+++ b/media/libaom/src/third_party/x86inc/README.libaom
@@ -1,5 +1,5 @@
 URL: https://git.videolan.org/git/x264.git
-Version: d23d18655249944c1ca894b451e2c82c7a584c62
+Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720
 License: ISC
 License File: LICENSE
 
@@ -8,13 +8,11 @@ x264/libav's framework for x86 assembly. Contains a variety of macros and
 defines that help automatically allow assembly to work cross-platform.
 
 Local Modifications:
-Get configuration from aom_config.asm.
+Get configuration from config/aom_config.asm.
 Prefix functions with aom by default.
 Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
   exist in libaom.
-Expand PIC default to macho64 and respect CONFIG_PIC from libaom
-Set 'private_extern' visibility for macho targets.
 Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
-Use .text with no alignment for aout
-Only use 'hidden' visibility with Chromium
+Use .text with no alignment for aout.
+Only use 'hidden' visibility with Chromium.
diff --git a/media/libaom/src/third_party/x86inc/x86inc.asm b/media/libaom/src/third_party/x86inc/x86inc.asm
index adaf2d99e9..e48d64441e 100644
--- a/media/libaom/src/third_party/x86inc/x86inc.asm
+++ b/media/libaom/src/third_party/x86inc/x86inc.asm
@@ -1,12 +1,12 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2019 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
 ;*          Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -67,19 +67,19 @@
 %endif
 
 %define FORMAT_ELF 0
+%define FORMAT_MACHO 0
 %ifidn __OUTPUT_FORMAT__,elf
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf32
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
     %define FORMAT_ELF 1
-%endif
-
-%define FORMAT_MACHO 0
-%ifidn __OUTPUT_FORMAT__,macho32
-     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+    %define FORMAT_MACHO 1
 %elifidn __OUTPUT_FORMAT__,macho64
-     %define FORMAT_MACHO 1
+    %define FORMAT_MACHO 1
 %endif
 
 ; Set PREFIX for libaom builds.
@@ -103,7 +103,11 @@
 ; works around the issue. It appears to be specific to the way libaom
 ; handles the tables.
 %macro SECTION_RODATA 0-1 16
-    %ifidn __OUTPUT_FORMAT__,macho32
+    %ifidn __OUTPUT_FORMAT__,win32
+        SECTION .rdata align=%1
+    %elif WIN64
+        SECTION .rdata align=%1
+    %elifidn __OUTPUT_FORMAT__,macho32
         SECTION .text align=%1
         fakegot:
     %elifidn __OUTPUT_FORMAT__,aout
@@ -113,8 +117,7 @@
     %endif
 %endmacro
 
-; PIC macros are copied from aom_ports/x86_abi_support.asm. The "define PIC"
-; from original code is added in for 64bit.
+; PIC macros from aom_ports/x86_abi_support.asm.
 %ifidn __OUTPUT_FORMAT__,elf32
 %define ABI_IS_32BIT 1
 %elifidn __OUTPUT_FORMAT__,macho32
@@ -203,10 +206,24 @@
 %ifndef GET_GOT_DEFINED
     %define GET_GOT_DEFINED 0
 %endif
-; Done with PIC macros
+; End PIC macros from aom_ports/x86_abi_support.asm.
+
+; libaom explicitly sets visibilty in shared object builds. Avoid setting
+; visibility to hidden as it may break builds that split sources on e.g.,
+; directory boundaries.
+%ifdef CHROMIUM
+    %define VISIBILITY hidden
+    %define HAVE_PRIVATE_EXTERN 1
+%else
+    %define VISIBILITY
+    %define HAVE_PRIVATE_EXTERN 0
+%endif
 
 %ifdef __NASM_VER__
     %use smartalign
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+        %define HAVE_PRIVATE_EXTERN 0
+    %endif
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -324,6 +341,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %define gprsize 4
 %endif
 
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPU:s
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
@@ -385,6 +414,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %endif
 %endmacro
 
+%if ARCH_X86_64 == 0
+    %define movsxd movifnidn
+%endif
+
 %macro movsxdifnidn 2
     %ifnidn %1, %2
         movsxd %1, %2
@@ -433,6 +466,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endmacro
 
 %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
 
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
@@ -483,10 +518,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %ifnum %1
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
+                ; Reserve an additional register for storing the original stack pointer, but avoid using
+                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                 %assign regs_used (regs_used + 1)
+                %if ARCH_X86_64 && regs_used == 7
+                    %assign regs_used 8
+                %elif ARCH_X86_64 == 0 && regs_used == 1
+                    %assign regs_used 2
+                %endif
             %endif
             %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
-                ; Ensure that we don't clobber any registers containing arguments
+                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                 %assign regs_used 5 + UNIX64 * 3
             %endif
         %endif
@@ -516,10 +559,10 @@ DECLARE_REG 7,  rdi, 64
 DECLARE_REG 8,  rsi, 72
 DECLARE_REG 9,  rbx, 80
 DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
 
 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
@@ -538,15 +581,16 @@ DECLARE_REG 14, R15, 120
 
 %macro WIN64_PUSH_XMM 0
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps [rstk + stack_offset +  8], xmm6
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps [rstk + stack_offset + 24], xmm7
     %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
             %assign %%i %%i+1
         %endrep
@@ -555,53 +599,56 @@ DECLARE_REG 14, R15, 120
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
         SUB rsp, stack_size_padded
     %endif
     WIN64_PUSH_XMM
 %endmacro
 
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
             mov rsp, rstkm
         %else
-            add %1, stack_size_padded
+            add rsp, stack_size_padded
             %assign %%pad_size stack_size_padded
         %endif
     %endif
-    %if xmm_regs_used > 7
-        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %if xmm_regs_used > 7 + high_mm_regs
+        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
-    %if xmm_regs_used > 6
-        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %if xmm_regs_used > 6 + high_mm_regs
+        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
 
-%macro WIN64_RESTORE_XMM 1
-    WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+    WIN64_RESTORE_XMM_INTERNAL
     %assign stack_offset (stack_offset-stack_size_padded)
+    %assign stack_size_padded 0
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
 
 %macro RET 0
-    WIN64_RESTORE_XMM_INTERNAL rsp
+    WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -620,14 +667,15 @@ DECLARE_REG 7,  R10, 16
 DECLARE_REG 8,  R11, 24
 DECLARE_REG 9,  rbx, 32
 DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    %assign xmm_regs_used %3
     ASSERT regs_used >= num_args
     SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
@@ -637,7 +685,7 @@ DECLARE_REG 14, R15, 72
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -648,7 +696,7 @@ DECLARE_REG 14, R15, 72
         %endif
     %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -693,7 +741,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -704,7 +752,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
         %endif
     %endif
     POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -715,7 +763,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if WIN64 == 0
     %macro WIN64_SPILL_XMM 1
     %endmacro
-    %macro WIN64_RESTORE_XMM 1
+    %macro WIN64_RESTORE_XMM 0
     %endmacro
     %macro WIN64_PUSH_XMM 0
     %endmacro
@@ -726,7 +774,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; We can automatically detect "follows a branch", but not a branch target.
 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
-    %if has_epilogue
+    %if has_epilogue || cpuflag(ssse3)
         RET
     %else
         rep ret
@@ -758,7 +806,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
@@ -788,35 +836,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %endmacro
 %macro cglobal_internal 2-3+
     annotate_function_size
-    %if %1
-        %xdefine %%FUNCTION_PREFIX private_prefix
-        ; libaom explicitly sets visibility in shared object builds. Avoid
-        ; setting visibility to hidden as it may break builds that split
-        ; sources on e.g., directory boundaries.
-        %ifdef CHROMIUM
-            %xdefine %%VISIBILITY hidden
+    %ifndef cglobaled_%2
+        %if %1
+            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
         %else
-            %xdefine %%VISIBILITY
+            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
         %endif
-    %else
-        %xdefine %%FUNCTION_PREFIX public_prefix
-        %xdefine %%VISIBILITY
-    %endif
-    %ifndef cglobaled_%2
-        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
         %xdefine %2.skip_prologue %2 %+ .skip_prologue
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
     %xdefine current_function_section __SECT__
     %if FORMAT_ELF
-        global %2:function %%VISIBILITY
-    %elif FORMAT_MACHO
-        %ifdef __NASM_VER__
-            global %2
+        %if %1
+            global %2:function VISIBILITY
         %else
-            global %2:private_extern
+            global %2:function
         %endif
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+        global %2:private_extern
     %else
         global %2
     %endif
@@ -827,12 +865,24 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %assign stack_offset 0      ; stack pointer offset relative to the return address
     %assign stack_size 0        ; amount of stack space that can be freely used inside a function
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
 %endmacro
 
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+    %if FORMAT_ELF
+        global current_function %+ %1:function VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global current_function %+ %1:private_extern
+    %else
+        global current_function %+ %1
+    %endif
+    %1:
+%endmacro
+
 %macro cextern 1
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     CAT_XDEFINE cglobaled_, %1, 1
@@ -851,7 +901,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %if FORMAT_ELF
-        global %1:data hidden
+        global %1:data VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global %1:private_extern
     %else
         global %1
     %endif
@@ -890,24 +942,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
-%assign cpuflags_avx      (1<<11)| cpuflags_sse42
-%assign cpuflags_xop      (1<<12)| cpuflags_avx
-%assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_fma3     (1<<14)| cpuflags_avx
-%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<21)
-%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
+%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
+%assign cpuflags_avx      (1<<14)| cpuflags_sse42
+%assign cpuflags_xop      (1<<15)| cpuflags_avx
+%assign cpuflags_fma4     (1<<16)| cpuflags_avx
+%assign cpuflags_fma3     (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -950,7 +1004,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 
     %if ARCH_X86_64 || cpuflag(sse2)
         %ifdef __NASM_VER__
-            ALIGNMODE k8
+            ALIGNMODE p6
         %else
             CPU amdnop
         %endif
@@ -963,11 +1017,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
 %endmacro
 
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
 ; m# is a simd register of the currently selected size
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -977,69 +1032,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %undef %1%2
 %endmacro
 
+%macro DEFINE_MMREGS 1 ; mmtype
+    %assign %%prev_mmregs 0
+    %ifdef num_mmregs
+        %assign %%prev_mmregs num_mmregs
+    %endif
+
+    %assign num_mmregs 8
+    %if ARCH_X86_64 && mmsize >= 16
+        %assign num_mmregs 16
+        %if cpuflag(avx512) || mmsize == 64
+            %assign num_mmregs 32
+        %endif
+    %endif
+
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, %1 %+ %%i
+        CAT_XDEFINE nn%1, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %if %%prev_mmregs > num_mmregs
+        %rep %%prev_mmregs - num_mmregs
+            CAT_UNDEF m, %%i
+            CAT_UNDEF nn %+ mmtype, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+    %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
 %macro INIT_MMX 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX %1
     %define mmsize 8
-    %define num_mmregs 8
     %define mova movq
     %define movu movq
     %define movh movd
     %define movnta movntq
-    %assign %%i 0
-    %rep 8
-        CAT_XDEFINE m, %%i, mm %+ %%i
-        CAT_XDEFINE nnmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    %rep 8
-        CAT_UNDEF m, %%i
-        CAT_UNDEF nnmm, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS mm
 %endmacro
 
 %macro INIT_XMM 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_XMM %1
     %define mmsize 16
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %define movh movq
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, xmm %+ %%i
-        CAT_XDEFINE nnxmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS xmm
+    %if WIN64
+        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+    %endif
 %endmacro
 
 %macro INIT_YMM 0-1+
     %assign avx_enabled 1
     %define RESET_MM_PERMUTATION INIT_YMM %1
     %define mmsize 32
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %undef movh
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, ymm %+ %%i
-        CAT_XDEFINE nnymm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS ymm
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS zmm
+    AVX512_MM_PERMUTATION
 %endmacro
 
 INIT_XMM
@@ -1048,18 +1133,26 @@ INIT_XMM
     %define  mmmm%1   mm%1
     %define  mmxmm%1  mm%1
     %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
     %define xmmmm%1   mm%1
     %define xmmxmm%1 xmm%1
     %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
     %define ymmmm%1   mm%1
     %define ymmxmm%1 xmm%1
     %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
     %define xm%1 xmm %+ m%1
     %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
 %endmacro
 
 %assign i 0
-%rep 16
+%rep 32
     DECLARE_MMCAST i
     %assign i i+1
 %endrep
@@ -1129,25 +1222,42 @@ INIT_XMM
     %endif
     %assign %%i 0
     %rep num_mmregs
-        CAT_XDEFINE %%f, %%i, m %+ %%i
+        %xdefine %%tmp m %+ %%i
+        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
         %assign %%i %%i+1
     %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-    %ifdef %1_m0
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %xdefine %%tmp %%f %+ 0
+    %ifnum %%tmp
+        RESET_MM_PERMUTATION
         %assign %%i 0
         %rep num_mmregs
-            CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE nn, m %+ %%i, %%i
+            %xdefine %%tmp %%f %+ %%i
+            CAT_XDEFINE %%m, %%i, m %+ %%tmp
             %assign %%i %%i+1
         %endrep
+        %rep num_mmregs
+            %assign %%i %%i-1
+            CAT_XDEFINE m, %%i, %%m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+        %endrep
     %endif
 %endmacro
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1 %+ SUFFIX, %1
+    %ifid %1
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        call %1
+    %endif
 %endmacro
 %macro call_internal 2
     %xdefine %%i %2
@@ -1190,12 +1300,17 @@ INIT_XMM
 ;=============================================================================
 
 %assign i 0
-%rep 16
+%rep 32
     %if i < 8
         CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
     %assign i i+1
 %endrep
 %undef i
@@ -1214,7 +1329,7 @@ INIT_XMM
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%6+: operands
 %macro RUN_AVX_INSTR 6-9+
@@ -1238,8 +1353,22 @@ INIT_XMM
         %ifdef cpuname
             %if notcpuflag(%2)
                 %error use of ``%1'' %2 instruction in cpuname function: current_function
-            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                 %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+                %error use of ``%1'' avx2 instruction in cpuname function: current_function
+            %elif __sizeofreg == 16 && notcpuflag(sse)
+                %error use of ``%1'' sse instruction in cpuname function: current_function
+            %elif __sizeofreg == 32 && notcpuflag(avx)
+                %error use of ``%1'' avx instruction in cpuname function: current_function
+            %elif __sizeofreg == 64 && notcpuflag(avx512)
+                %error use of ``%1'' avx512 instruction in cpuname function: current_function
+            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+                %ifnid %6       ; but sse4 is required for memory operands
+                    %if notcpuflag(sse4)
+                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
+                    %endif
+                %endif
             %endif
         %endif
     %endif
@@ -1247,14 +1376,12 @@ INIT_XMM
     %if __emulate_avx
         %xdefine __src1 %7
         %xdefine __src2 %8
-        %ifnidn %6, %7
-            %if %0 >= 9
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
-            %else
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
-            %endif
-            %if %5 && %4 == 0
-                %ifnid %8
+        %if %5 && %4 == 0
+            %ifnidn %6, %7
+                %ifidn %6, %8
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %elifnnum sizeof%8
                     ; 3-operand AVX instructions with a memory arg can only have it in src2,
                     ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                     ; So, if the instruction is commutative with a memory arg, swap them.
@@ -1262,6 +1389,13 @@ INIT_XMM
                     %xdefine __src2 %7
                 %endif
             %endif
+        %endif
+        %ifnidn %6, __src1
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+            %endif
             %if __sizeofreg == 8
                 MOVQ %6, __src1
             %elif %3
@@ -1278,9 +1412,40 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        __instr %6, %7, %8
+        %if avx_enabled && %5
+            %xdefine __src1 %7
+            %xdefine __src2 %8
+            %ifnum regnumof%7
+                %ifnum regnumof%8
+                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                        ; Most VEX-encoded instructions require an additional byte to encode when
+                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                        ; we can swap src1 and src2 when doing so reduces the instruction length.
+                        %xdefine __src1 %8
+                        %xdefine __src2 %7
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7, %8
+        %endif
     %elif %0 == 7
-        __instr %6, %7
+        %if avx_enabled && %5
+            %xdefine __src1 %6
+            %xdefine __src2 %7
+            %ifnum regnumof%6
+                %ifnum regnumof%7
+                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+                        %xdefine __src1 %7
+                        %xdefine __src2 %6
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7
+        %endif
     %else
         __instr %6
     %endif
@@ -1289,9 +1454,9 @@ INIT_XMM
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
     %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
         %ifidn %2, fnord
             RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
@@ -1307,77 +1472,112 @@ INIT_XMM
     %endmacro
 %endmacro
 
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
 ; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, sse2, 1, 0, 1
 AVX_INSTR addps, sse, 1, 0, 1
-AVX_INSTR addsd, sse2, 1, 0, 1
-AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
 AVX_INSTR addsubpd, sse3, 1, 0, 0
 AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
 AVX_INSTR andnpd, sse2, 1, 0, 0
 AVX_INSTR andnps, sse, 1, 0, 0
 AVX_INSTR andpd, sse2, 1, 0, 1
 AVX_INSTR andps, sse, 1, 0, 1
-AVX_INSTR blendpd, sse4, 1, 0, 0
-AVX_INSTR blendps, sse4, 1, 0, 0
-AVX_INSTR blendvpd, sse4, 1, 0, 0
-AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2 1, 0, 1
+AVX_INSTR cmpordps, sse 1, 0, 1
+AVX_INSTR cmpordsd, sse2 1, 0, 0
+AVX_INSTR cmpordss, sse 1, 0, 0
 AVX_INSTR cmppd, sse2, 1, 1, 0
 AVX_INSTR cmpps, sse, 1, 1, 0
 AVX_INSTR cmpsd, sse2, 1, 1, 0
 AVX_INSTR cmpss, sse, 1, 1, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
-AVX_INSTR cvtsd2ss, sse2
-AVX_INSTR cvtsi2sd, sse2
-AVX_INSTR cvtsi2ss, sse
-AVX_INSTR cvtss2sd, sse2
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1, 0
 AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
+AVX_INSTR ldmxcsr, sse, 1
 AVX_INSTR maskmovdqu, sse2
 AVX_INSTR maxpd, sse2, 1, 0, 1
 AVX_INSTR maxps, sse, 1, 0, 1
-AVX_INSTR maxsd, sse2, 1, 0, 1
-AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
 AVX_INSTR minpd, sse2, 1, 0, 1
 AVX_INSTR minps, sse, 1, 0, 1
-AVX_INSTR minsd, sse2, 1, 0, 1
-AVX_INSTR minss, sse, 1, 0, 1
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
 AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
+AVX_INSTR movddup, sse3, 1
 AVX_INSTR movdqa, sse2
 AVX_INSTR movdqu, sse2
 AVX_INSTR movhlps, sse, 1, 0, 0
@@ -1386,24 +1586,24 @@ AVX_INSTR movhps, sse, 1, 0, 0
 AVX_INSTR movlhps, sse, 1, 0, 0
 AVX_INSTR movlpd, sse2, 1, 0, 0
 AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
 AVX_INSTR movntdq, sse2
 AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
 AVX_INSTR movq, mmx
 AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
 AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
-AVX_INSTR mpsadbw, sse4
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
 AVX_INSTR mulpd, sse2, 1, 0, 1
 AVX_INSTR mulps, sse, 1, 0, 1
-AVX_INSTR mulsd, sse2, 1, 0, 1
-AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
 AVX_INSTR orpd, sse2, 1, 0, 1
 AVX_INSTR orps, sse, 1, 0, 1
 AVX_INSTR pabsb, ssse3
@@ -1421,14 +1621,18 @@ AVX_INSTR paddsb, mmx, 0, 0, 1
 AVX_INSTR paddsw, mmx, 0, 0, 1
 AVX_INSTR paddusb, mmx, 0, 0, 1
 AVX_INSTR paddusw, mmx, 0, 0, 1
-AVX_INSTR palignr, ssse3
+AVX_INSTR palignr, ssse3, 0, 1, 0
 AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
 AVX_INSTR pavgb, mmx2, 0, 0, 1
 AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4, 0, 0, 0
-AVX_INSTR pblendw, sse4
-AVX_INSTR pclmulqdq
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
 AVX_INSTR pcmpestri, sse42
 AVX_INSTR pcmpestrm, sse42
 AVX_INSTR pcmpistri, sse42
@@ -1452,10 +1656,10 @@ AVX_INSTR phminposuw, sse4
 AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR phsubd, ssse3, 0, 0, 0
 AVX_INSTR phsubsw, ssse3, 0, 0, 0
-AVX_INSTR pinsrb, sse4
-AVX_INSTR pinsrd, sse4
-AVX_INSTR pinsrq, sse4
-AVX_INSTR pinsrw, mmx2
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
 AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
 AVX_INSTR pmaxsb, sse4, 0, 0, 1
@@ -1527,27 +1731,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpps, sse, 1
 AVX_INSTR rcpss, sse, 1, 0, 0
-AVX_INSTR roundpd, sse4
-AVX_INSTR roundps, sse4
-AVX_INSTR roundsd, sse4
-AVX_INSTR roundss, sse4
-AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
 AVX_INSTR rsqrtss, sse, 1, 0, 0
 AVX_INSTR shufpd, sse2, 1, 1, 0
 AVX_INSTR shufps, sse, 1, 1, 0
-AVX_INSTR sqrtpd, sse2, 1, 0, 0
-AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
 AVX_INSTR sqrtsd, sse2, 1, 0, 0
 AVX_INSTR sqrtss, sse, 1, 0, 0
-AVX_INSTR stmxcsr, sse
+AVX_INSTR stmxcsr, sse, 1
 AVX_INSTR subpd, sse2, 1, 0, 0
 AVX_INSTR subps, sse, 1, 0, 0
 AVX_INSTR subsd, sse2, 1, 0, 0
 AVX_INSTR subss, sse, 1, 0, 0
-AVX_INSTR ucomisd, sse2
-AVX_INSTR ucomiss, sse
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
 AVX_INSTR unpckhpd, sse2, 1, 0, 0
 AVX_INSTR unpckhps, sse, 1, 0, 0
 AVX_INSTR unpcklpd, sse2, 1, 0, 0
@@ -1560,6 +1764,38 @@ AVX_INSTR pfadd, 3dnow, 1, 0, 1
 AVX_INSTR pfsub, 3dnow, 1, 0, 0
 AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+    %macro %1 2-5 fnord, %1, %2
+        %ifdef cpuname
+            %if notcpuflag(%5)
+                %error use of ``%4'' %5 instruction in cpuname function: current_function
+            %endif
+        %endif
+        %ifidn %3, fnord
+            %4 %1, %2
+        %else
+            %4 %1, %2, %3
+        %endif
+    %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
 ; base-4 constants for shuffles
 %assign i 0
 %rep 256
@@ -1610,7 +1846,7 @@ FMA_INSTR pmadcswd, pmaddwd, paddd
                 v%5%6 %1, %2, %3, %4
             %elifidn %1, %2
                 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
-                %ifid %3
+                %ifnum sizeof%3
                     v%{5}213%6 %2, %3, %4
                 %else
                     v%{5}132%6 %2, %4, %3
@@ -1635,15 +1871,53 @@ FMA4_INSTR fmsubadd, pd, ps
 FMA4_INSTR fnmadd,   pd, ps, sd, ss
 FMA4_INSTR fnmsub,   pd, ps, sd, ss
 
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
-%ifdef __YASM_VER__
-    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
-        %macro vpbroadcastq 2
-            %if sizeof%1 == 16
-                movddup %1, %2
-            %else
-                vbroadcastsd %1, %2
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
             %endif
-        %endmacro
-    %endif
-%endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%3
+            %if regnumof%3 >= 16 || sizeof%3 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
diff --git a/media/libaom/src/tools/aom_entropy_optimizer.c b/media/libaom/src/tools/aom_entropy_optimizer.c
index 9f529d9ab2..fa7bf7ea9e 100644
--- a/media/libaom/src/tools/aom_entropy_optimizer.c
+++ b/media/libaom/src/tools/aom_entropy_optimizer.c
@@ -572,9 +572,9 @@ int main(int argc, const char **argv) {
   /* Skip flag */
   cts_each_dim[0] = SKIP_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim,
+  optimize_cdf_table(&fc.skip_txfm[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
+                     "default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
 
   /* Skip mode flag */
   cts_each_dim[0] = SKIP_MODE_CONTEXTS;
diff --git a/media/libaom/src/tools/auto_refactor/auto_refactor.py b/media/libaom/src/tools/auto_refactor/auto_refactor.py
new file mode 100644
index 0000000000..dd0d4415f9
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/auto_refactor.py
@@ -0,0 +1,919 @@
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+from __future__ import print_function
+import sys
+import os
+import operator
+from pycparser import c_parser, c_ast, parse_file
+from math import *
+
+from inspect import currentframe, getframeinfo
+from collections import deque
+
+
+def debug_print(frameinfo):
+  print('******** ERROR:', frameinfo.filename, frameinfo.lineno, '********')
+
+
+class StructItem():
+
+  def __init__(self,
+               typedef_name=None,
+               struct_name=None,
+               struct_node=None,
+               is_union=False):
+    self.typedef_name = typedef_name
+    self.struct_name = struct_name
+    self.struct_node = struct_node
+    self.is_union = is_union
+    self.child_decl_map = None
+
+  def __str__(self):
+    return str(self.typedef_name) + ' ' + str(self.struct_name) + ' ' + str(
+        self.is_union)
+
+  def compute_child_decl_map(self, struct_info):
+    self.child_decl_map = {}
+    if self.struct_node != None and self.struct_node.decls != None:
+      for decl_node in self.struct_node.decls:
+        if decl_node.name == None:
+          for sub_decl_node in decl_node.type.decls:
+            sub_decl_status = parse_decl_node(struct_info, sub_decl_node)
+            self.child_decl_map[sub_decl_node.name] = sub_decl_status
+        else:
+          decl_status = parse_decl_node(struct_info, decl_node)
+          self.child_decl_map[decl_status.name] = decl_status
+
+  def get_child_decl_status(self, decl_name):
+    if self.child_decl_map == None:
+      debug_print(getframeinfo(currentframe()))
+      print('child_decl_map is None')
+      return None
+    if decl_name not in self.child_decl_map:
+      debug_print(getframeinfo(currentframe()))
+      print(decl_name, 'does not exist ')
+      return None
+    return self.child_decl_map[decl_name]
+
+
+class StructInfo():
+
+  def __init__(self):
+    self.struct_name_dic = {}
+    self.typedef_name_dic = {}
+    self.enum_value_dic = {}  # enum value -> enum_node
+    self.enum_name_dic = {}  # enum name -> enum_node
+    self.struct_item_list = []
+
+  def get_struct_by_typedef_name(self, typedef_name):
+    if typedef_name in self.typedef_name_dic:
+      return self.typedef_name_dic[typedef_name]
+    else:
+      return None
+
+  def get_struct_by_struct_name(self, struct_name):
+    if struct_name in self.struct_name_dic:
+      return self.struct_name_dic[struct_name]
+    else:
+      debug_print(getframeinfo(currentframe()))
+      print('Cant find', struct_name)
+      return None
+
+  def update_struct_item_list(self):
+    # Collect all struct_items from struct_name_dic and typedef_name_dic
+    # Compute child_decl_map for each struct item.
+    for struct_name in self.struct_name_dic.keys():
+      struct_item = self.struct_name_dic[struct_name]
+      struct_item.compute_child_decl_map(self)
+      self.struct_item_list.append(struct_item)
+
+    for typedef_name in self.typedef_name_dic.keys():
+      struct_item = self.typedef_name_dic[typedef_name]
+      if struct_item.struct_name not in self.struct_name_dic:
+        struct_item.compute_child_decl_map(self)
+        self.struct_item_list.append(struct_item)
+
+  def update_enum(self, enum_node):
+    if enum_node.name != None:
+      self.enum_name_dic[enum_node.name] = enum_node
+
+    if enum_node.values != None:
+      enumerator_list = enum_node.values.enumerators
+      for enumerator in enumerator_list:
+        self.enum_value_dic[enumerator.name] = enum_node
+
+  def update(self,
+             typedef_name=None,
+             struct_name=None,
+             struct_node=None,
+             is_union=False):
+    """T: typedef_name S: struct_name N: struct_node
+
+           T S N
+   case 1: o o o
+   typedef struct P {
+    int u;
+   } K;
+           T S N
+   case 2: o o x
+   typedef struct P K;
+
+           T S N
+   case 3: x o o
+   struct P {
+    int u;
+   };
+
+           T S N
+   case 4: o x o
+   typedef struct {
+    int u;
+   } K;
+    """
+    struct_item = None
+
+    # Check whether struct_name or typedef_name is already in the dictionary
+    if struct_name in self.struct_name_dic:
+      struct_item = self.struct_name_dic[struct_name]
+
+    if typedef_name in self.typedef_name_dic:
+      struct_item = self.typedef_name_dic[typedef_name]
+
+    if struct_item == None:
+      struct_item = StructItem(typedef_name, struct_name, struct_node, is_union)
+
+    if struct_node.decls != None:
+      struct_item.struct_node = struct_node
+
+    if struct_name != None:
+      self.struct_name_dic[struct_name] = struct_item
+
+    if typedef_name != None:
+      self.typedef_name_dic[typedef_name] = struct_item
+
+
+class StructDefVisitor(c_ast.NodeVisitor):
+
+  def __init__(self):
+    self.struct_info = StructInfo()
+
+  def visit_Struct(self, node):
+    if node.decls != None:
+      self.struct_info.update(None, node.name, node)
+    self.generic_visit(node)
+
+  def visit_Union(self, node):
+    if node.decls != None:
+      self.struct_info.update(None, node.name, node, True)
+    self.generic_visit(node)
+
+  def visit_Enum(self, node):
+    self.struct_info.update_enum(node)
+    self.generic_visit(node)
+
+  def visit_Typedef(self, node):
+    if node.type.__class__.__name__ == 'TypeDecl':
+      typedecl = node.type
+      if typedecl.type.__class__.__name__ == 'Struct':
+        struct_node = typedecl.type
+        typedef_name = node.name
+        struct_name = struct_node.name
+        self.struct_info.update(typedef_name, struct_name, struct_node)
+      elif typedecl.type.__class__.__name__ == 'Union':
+        union_node = typedecl.type
+        typedef_name = node.name
+        union_name = union_node.name
+        self.struct_info.update(typedef_name, union_name, union_node, True)
+      # TODO(angiebird): Do we need to deal with enum here?
+    self.generic_visit(node)
+
+
+def build_struct_info(ast):
+  v = StructDefVisitor()
+  v.visit(ast)
+  struct_info = v.struct_info
+  struct_info.update_struct_item_list()
+  return v.struct_info
+
+
+class DeclStatus():
+
+  def __init__(self, name, struct_item=None, is_ptr_decl=False):
+    self.name = name
+    self.struct_item = struct_item
+    self.is_ptr_decl = is_ptr_decl
+
+  def get_child_decl_status(self, decl_name):
+    if self.struct_item != None:
+      return self.struct_item.get_child_decl_status(decl_name)
+    else:
+      #TODO(angiebird): 2. Investigage the situation when a struct's definition can't be found.
+      return None
+
+  def __str__(self):
+    return str(self.struct_item) + ' ' + str(self.name) + ' ' + str(
+        self.is_ptr_decl)
+
+
+def peel_ptr_decl(decl_type_node):
+  """ Remove PtrDecl and ArrayDecl layer """
+  is_ptr_decl = False
+  peeled_decl_type_node = decl_type_node
+  while peeled_decl_type_node.__class__.__name__ == 'PtrDecl' or peeled_decl_type_node.__class__.__name__ == 'ArrayDecl':
+    is_ptr_decl = True
+    peeled_decl_type_node = peeled_decl_type_node.type
+  return is_ptr_decl, peeled_decl_type_node
+
+
+def parse_peeled_decl_type_node(struct_info, node):
+  struct_item = None
+  if node.__class__.__name__ == 'TypeDecl':
+    if node.type.__class__.__name__ == 'IdentifierType':
+      identifier_type_node = node.type
+      typedef_name = identifier_type_node.names[0]
+      struct_item = struct_info.get_struct_by_typedef_name(typedef_name)
+    elif node.type.__class__.__name__ == 'Struct':
+      struct_node = node.type
+      if struct_node.name != None:
+        struct_item = struct_info.get_struct_by_struct_name(struct_node.name)
+      else:
+        struct_item = StructItem(None, None, struct_node, False)
+        struct_item.compute_child_decl_map(struct_info)
+    elif node.type.__class__.__name__ == 'Union':
+      # TODO(angiebird): Special treatment for Union?
+      struct_node = node.type
+      if struct_node.name != None:
+        struct_item = struct_info.get_struct_by_struct_name(struct_node.name)
+      else:
+        struct_item = StructItem(None, None, struct_node, True)
+        struct_item.compute_child_decl_map(struct_info)
+    elif node.type.__class__.__name__ == 'Enum':
+      # TODO(angiebird): Special treatment for Union?
+      struct_node = node.type
+      struct_item = None
+    else:
+      print('Unrecognized peeled_decl_type_node.type',
+            node.type.__class__.__name__)
+  else:
+    # debug_print(getframeinfo(currentframe()))
+    # print(node.__class__.__name__)
+    #TODO(angiebird): Do we need to take care of this part?
+    pass
+
+  return struct_item
+
+
+def parse_decl_node(struct_info, decl_node):
+  # struct_item is None if this decl_node is not a struct_item
+  decl_node_name = decl_node.name
+  decl_type_node = decl_node.type
+  is_ptr_decl, peeled_decl_type_node = peel_ptr_decl(decl_type_node)
+  struct_item = parse_peeled_decl_type_node(struct_info, peeled_decl_type_node)
+  return DeclStatus(decl_node_name, struct_item, is_ptr_decl)
+
+
+def get_lvalue_lead(lvalue_node):
+  """return '&' or '*' of lvalue if available"""
+  if lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '&':
+    return '&'
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '*':
+    return '*'
+  return None
+
+
+def parse_lvalue(lvalue_node):
+  """get id_chain from lvalue"""
+  id_chain = parse_lvalue_recursive(lvalue_node, [])
+  return id_chain
+
+
+def parse_lvalue_recursive(lvalue_node, id_chain):
+  """cpi->rd->u -> (cpi->rd)->u"""
+  if lvalue_node.__class__.__name__ == 'ID':
+    id_chain.append(lvalue_node.name)
+    id_chain.reverse()
+    return id_chain
+  elif lvalue_node.__class__.__name__ == 'StructRef':
+    id_chain.append(lvalue_node.field.name)
+    return parse_lvalue_recursive(lvalue_node.name, id_chain)
+  elif lvalue_node.__class__.__name__ == 'ArrayRef':
+    return parse_lvalue_recursive(lvalue_node.name, id_chain)
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '&':
+    return parse_lvalue_recursive(lvalue_node.expr, id_chain)
+  elif lvalue_node.__class__.__name__ == 'UnaryOp' and lvalue_node.op == '*':
+    return parse_lvalue_recursive(lvalue_node.expr, id_chain)
+  else:
+    return None
+
+
+class FuncDefVisitor(c_ast.NodeVisitor):
+  func_dictionary = {}
+
+  def visit_FuncDef(self, node):
+    func_name = node.decl.name
+    self.func_dictionary[func_name] = node
+
+
+def build_func_dictionary(ast):
+  v = FuncDefVisitor()
+  v.visit(ast)
+  return v.func_dictionary
+
+
+def get_func_start_coord(func_node):
+  return func_node.coord
+
+
+def find_end_node(node):
+  node_list = []
+  for c in node:
+    node_list.append(c)
+  if len(node_list) == 0:
+    return node
+  else:
+    return find_end_node(node_list[-1])
+
+
+def get_func_end_coord(func_node):
+  return find_end_node(func_node).coord
+
+
+def get_func_size(func_node):
+  start_coord = get_func_start_coord(func_node)
+  end_coord = get_func_end_coord(func_node)
+  if start_coord.file == end_coord.file:
+    return end_coord.line - start_coord.line + 1
+  else:
+    return None
+
+
+def save_object(obj, filename):
+  with open(filename, 'wb') as obj_fp:
+    pickle.dump(obj, obj_fp, protocol=-1)
+
+
+def load_object(filename):
+  obj = None
+  with open(filename, 'rb') as obj_fp:
+    obj = pickle.load(obj_fp)
+  return obj
+
+
+def get_av1_ast(gen_ast=False):
+  # TODO(angiebird): Generalize this path
+  c_filename = './av1_pp.c'
+  print('generate ast')
+  ast = parse_file(c_filename)
+  #save_object(ast, ast_file)
+  print('finished generate ast')
+  return ast
+
+
+def get_func_param_id_map(func_def_node):
+  param_id_map = {}
+  func_decl = func_def_node.decl.type
+  param_list = func_decl.args.params
+  for decl in param_list:
+    param_id_map[decl.name] = decl
+  return param_id_map
+
+
+class IDTreeStack():
+
+  def __init__(self, global_id_tree):
+    self.stack = deque()
+    self.global_id_tree = global_id_tree
+
+  def add_link_node(self, node, link_id_chain):
+    link_node = self.add_id_node(link_id_chain)
+    node.link_node = link_node
+    node.link_id_chain = link_id_chain
+
+  def push_id_tree(self, id_tree=None):
+    if id_tree == None:
+      id_tree = IDStatusNode()
+    self.stack.append(id_tree)
+    return id_tree
+
+  def pop_id_tree(self):
+    return self.stack.pop()
+
+  def add_id_seed_node(self, id_seed, decl_status):
+    return self.stack[-1].add_child(id_seed, decl_status)
+
+  def get_id_seed_node(self, id_seed):
+    idx = len(self.stack) - 1
+    while idx >= 0:
+      id_node = self.stack[idx].get_child(id_seed)
+      if id_node != None:
+        return id_node
+      idx -= 1
+
+    id_node = self.global_id_tree.get_child(id_seed)
+    if id_node != None:
+      return id_node
+    return None
+
+  def add_id_node(self, id_chain):
+    id_seed = id_chain[0]
+    id_seed_node = self.get_id_seed_node(id_seed)
+    if id_seed_node == None:
+      return None
+    if len(id_chain) == 1:
+      return id_seed_node
+    return id_seed_node.add_descendant(id_chain[1:])
+
+  def get_id_node(self, id_chain):
+    id_seed = id_chain[0]
+    id_seed_node = self.get_id_seed_node(id_seed)
+    if id_seed_node == None:
+      return None
+    if len(id_chain) == 1:
+      return id_seed_node
+    return id_seed_node.get_descendant(id_chain[1:])
+
+  def top(self):
+    return self.stack[-1]
+
+
+class IDStatusNode():
+
+  def __init__(self, name=None, root=None):
+    if root is None:
+      self.root = self
+    else:
+      self.root = root
+
+    self.name = name
+
+    self.parent = None
+    self.children = {}
+
+    self.assign = False
+    self.last_assign_coord = None
+    self.refer = False
+    self.last_refer_coord = None
+
+    self.decl_status = None
+
+    self.link_id_chain = None
+    self.link_node = None
+
+    self.visit = False
+
+  def set_link_id_chain(self, link_id_chain):
+    self.set_assign(False)
+    self.link_id_chain = link_id_chain
+    self.link_node = self.root.get_descendant(link_id_chain)
+
+  def set_link_node(self, link_node):
+    self.set_assign(False)
+    self.link_id_chain = ['*']
+    self.link_node = link_node
+
+  def get_link_id_chain(self):
+    return self.link_id_chain
+
+  def get_concrete_node(self):
+    if self.visit == True:
+      # return None when there is a loop
+      return None
+    self.visit = True
+    if self.link_node == None:
+      self.visit = False
+      return self
+    else:
+      concrete_node = self.link_node.get_concrete_node()
+      self.visit = False
+      if concrete_node == None:
+        return self
+      return concrete_node
+
+  def set_assign(self, assign, coord=None):
+    concrete_node = self.get_concrete_node()
+    concrete_node.assign = assign
+    concrete_node.last_assign_coord = coord
+
+  def get_assign(self):
+    concrete_node = self.get_concrete_node()
+    return concrete_node.assign
+
+  def set_refer(self, refer, coord=None):
+    concrete_node = self.get_concrete_node()
+    concrete_node.refer = refer
+    concrete_node.last_refer_coord = coord
+
+  def get_refer(self):
+    concrete_node = self.get_concrete_node()
+    return concrete_node.refer
+
+  def set_parent(self, parent):
+    concrete_node = self.get_concrete_node()
+    concrete_node.parent = parent
+
+  def add_child(self, name, decl_status=None):
+    concrete_node = self.get_concrete_node()
+    if name not in concrete_node.children:
+      child_id_node = IDStatusNode(name, concrete_node.root)
+      concrete_node.children[name] = child_id_node
+      if decl_status == None:
+        # Check if the child decl_status can be inferred from its parent's
+        # decl_status
+        if self.decl_status != None:
+          decl_status = self.decl_status.get_child_decl_status(name)
+      child_id_node.set_decl_status(decl_status)
+    return concrete_node.children[name]
+
+  def get_child(self, name):
+    concrete_node = self.get_concrete_node()
+    if name in concrete_node.children:
+      return concrete_node.children[name]
+    else:
+      return None
+
+  def add_descendant(self, id_chain):
+    current_node = self.get_concrete_node()
+    for name in id_chain:
+      current_node.add_child(name)
+      parent_node = current_node
+      current_node = current_node.get_child(name)
+      current_node.set_parent(parent_node)
+    return current_node
+
+  def get_descendant(self, id_chain):
+    current_node = self.get_concrete_node()
+    for name in id_chain:
+      current_node = current_node.get_child(name)
+      if current_node == None:
+        return None
+    return current_node
+
+  def get_children(self):
+    current_node = self.get_concrete_node()
+    return current_node.children
+
+  def set_decl_status(self, decl_status):
+    current_node = self.get_concrete_node()
+    current_node.decl_status = decl_status
+
+  def get_decl_status(self):
+    current_node = self.get_concrete_node()
+    return current_node.decl_status
+
+  def __str__(self):
+    if self.link_id_chain is None:
+      return str(self.name) + ' a: ' + str(int(self.assign)) + ' r: ' + str(
+          int(self.refer))
+    else:
+      return str(self.name) + ' -> ' + ' '.join(self.link_id_chain)
+
+  def collect_assign_refer_status(self,
+                                  id_chain=None,
+                                  assign_ls=None,
+                                  refer_ls=None):
+    if id_chain == None:
+      id_chain = []
+    if assign_ls == None:
+      assign_ls = []
+    if refer_ls == None:
+      refer_ls = []
+    id_chain.append(self.name)
+    if self.assign:
+      info_str = ' '.join([
+          ' '.join(id_chain[1:]), 'a:',
+          str(int(self.assign)), 'r:',
+          str(int(self.refer)),
+          str(self.last_assign_coord)
+      ])
+      assign_ls.append(info_str)
+    if self.refer:
+      info_str = ' '.join([
+          ' '.join(id_chain[1:]), 'a:',
+          str(int(self.assign)), 'r:',
+          str(int(self.refer)),
+          str(self.last_refer_coord)
+      ])
+      refer_ls.append(info_str)
+    for c in self.children:
+      self.children[c].collect_assign_refer_status(id_chain, assign_ls,
+                                                   refer_ls)
+    id_chain.pop()
+    return assign_ls, refer_ls
+
+  def show(self):
+    assign_ls, refer_ls = self.collect_assign_refer_status()
+    print('---- assign ----')
+    for item in assign_ls:
+      print(item)
+    print('---- refer ----')
+    for item in refer_ls:
+      print(item)
+
+
+class FuncInOutVisitor(c_ast.NodeVisitor):
+
+  def __init__(self,
+               func_def_node,
+               struct_info,
+               func_dictionary,
+               keep_body_id_tree=True,
+               call_param_map=None,
+               global_id_tree=None,
+               func_history=None,
+               unknown=None):
+    self.func_dictionary = func_dictionary
+    self.struct_info = struct_info
+    self.param_id_map = get_func_param_id_map(func_def_node)
+    self.parent_node = None
+    self.global_id_tree = global_id_tree
+    self.body_id_tree = None
+    self.keep_body_id_tree = keep_body_id_tree
+    if func_history == None:
+      self.func_history = {}
+    else:
+      self.func_history = func_history
+
+    if unknown == None:
+      self.unknown = []
+    else:
+      self.unknown = unknown
+
+    self.id_tree_stack = IDTreeStack(global_id_tree)
+    self.id_tree_stack.push_id_tree()
+
+    #TODO move this part into a function
+    for param in self.param_id_map:
+      decl_node = self.param_id_map[param]
+      decl_status = parse_decl_node(self.struct_info, decl_node)
+      descendant = self.id_tree_stack.add_id_seed_node(decl_status.name,
+                                                       decl_status)
+      if call_param_map is not None and param in call_param_map:
+        # This is a function call.
+        # Map the input parameter to the caller's nodes
+        # TODO(angiebird): Can we use add_link_node here?
+        descendant.set_link_node(call_param_map[param])
+
+  def get_id_tree_stack(self):
+    return self.id_tree_stack
+
+  def generic_visit(self, node):
+    prev_parent = self.parent_node
+    self.parent_node = node
+    for c in node:
+      self.visit(c)
+    self.parent_node = prev_parent
+
+  # TODO rename
+  def add_new_id_tree(self, node):
+    self.id_tree_stack.push_id_tree()
+    self.generic_visit(node)
+    id_tree = self.id_tree_stack.pop_id_tree()
+    if self.parent_node == None and self.keep_body_id_tree == True:
+      # this is function body
+      self.body_id_tree = id_tree
+
+  def visit_For(self, node):
+    self.add_new_id_tree(node)
+
+  def visit_Compound(self, node):
+    self.add_new_id_tree(node)
+
+  def visit_Decl(self, node):
+    if node.type.__class__.__name__ != 'FuncDecl':
+      decl_status = parse_decl_node(self.struct_info, node)
+      descendant = self.id_tree_stack.add_id_seed_node(decl_status.name,
+                                                       decl_status)
+      if node.init is not None:
+        init_id_chain = self.process_lvalue(node.init)
+        if init_id_chain != None:
+          if decl_status.struct_item is None:
+            init_descendant = self.id_tree_stack.add_id_node(init_id_chain)
+            if init_descendant != None:
+              init_descendant.set_refer(True, node.coord)
+            else:
+              self.unknown.append(node)
+            descendant.set_assign(True, node.coord)
+          else:
+            self.id_tree_stack.add_link_node(descendant, init_id_chain)
+        else:
+          self.unknown.append(node)
+      else:
+        descendant.set_assign(True, node.coord)
+      self.generic_visit(node)
+
+  def is_lvalue(self, node):
+    if self.parent_node is None:
+      # TODO(angiebird): Do every lvalue has parent_node != None?
+      return False
+    if self.parent_node.__class__.__name__ == 'StructRef':
+      return False
+    if self.parent_node.__class__.__name__ == 'ArrayRef' and node == self.parent_node.name:
+      # if node == self.parent_node.subscript, the node could be lvalue
+      return False
+    if self.parent_node.__class__.__name__ == 'UnaryOp' and self.parent_node.op == '&':
+      return False
+    if self.parent_node.__class__.__name__ == 'UnaryOp' and self.parent_node.op == '*':
+      return False
+    return True
+
+  def process_lvalue(self, node):
+    id_chain = parse_lvalue(node)
+    if id_chain == None:
+      return id_chain
+    elif id_chain[0] in self.struct_info.enum_value_dic:
+      return None
+    else:
+      return id_chain
+
+  def process_possible_lvalue(self, node):
+    if self.is_lvalue(node):
+      id_chain = self.process_lvalue(node)
+      lead_char = get_lvalue_lead(node)
+      # make sure the id is not an enum value
+      if id_chain == None:
+        self.unknown.append(node)
+        return
+      descendant = self.id_tree_stack.add_id_node(id_chain)
+      if descendant == None:
+        self.unknown.append(node)
+        return
+      decl_status = descendant.get_decl_status()
+      if decl_status == None:
+        descendant.set_assign(True, node.coord)
+        descendant.set_refer(True, node.coord)
+        self.unknown.append(node)
+        return
+      if self.parent_node.__class__.__name__ == 'Assignment':
+        if node is self.parent_node.lvalue:
+          if decl_status.struct_item != None:
+            if len(id_chain) > 1:
+              descendant.set_assign(True, node.coord)
+            elif len(id_chain) == 1:
+              if lead_char == '*':
+                descendant.set_assign(True, node.coord)
+              else:
+                right_id_chain = self.process_lvalue(self.parent_node.rvalue)
+                if right_id_chain != None:
+                  self.id_tree_stack.add_link_node(descendant, right_id_chain)
+                else:
+                  #TODO(angiebird): 1.Find a better way to deal with this case.
+                  descendant.set_assign(True, node.coord)
+            else:
+              debug_print(getframeinfo(currentframe()))
+          else:
+            descendant.set_assign(True, node.coord)
+        elif node is self.parent_node.rvalue:
+          if decl_status.struct_item is None:
+            descendant.set_refer(True, node.coord)
+            if lead_char == '&':
+              descendant.set_assign(True, node.coord)
+          else:
+            left_id_chain = self.process_lvalue(self.parent_node.lvalue)
+            left_lead_char = get_lvalue_lead(self.parent_node.lvalue)
+            if left_id_chain != None:
+              if len(left_id_chain) > 1:
+                descendant.set_refer(True, node.coord)
+              elif len(left_id_chain) == 1:
+                if left_lead_char == '*':
+                  descendant.set_refer(True, node.coord)
+                else:
+                  #TODO(angiebird): Check whether the other node is linked to this node.
+                  pass
+              else:
+                self.unknown.append(self.parent_node.lvalue)
+                debug_print(getframeinfo(currentframe()))
+            else:
+              self.unknown.append(self.parent_node.lvalue)
+              debug_print(getframeinfo(currentframe()))
+        else:
+          debug_print(getframeinfo(currentframe()))
+      elif self.parent_node.__class__.__name__ == 'UnaryOp':
+        # TODO(angiebird): Consider +=, *=, -=, /= etc
+        if self.parent_node.op == '--' or self.parent_node.op == '++' or\
+        self.parent_node.op == 'p--' or self.parent_node.op == 'p++':
+          descendant.set_assign(True, node.coord)
+          descendant.set_refer(True, node.coord)
+        else:
+          descendant.set_refer(True, node.coord)
+      elif self.parent_node.__class__.__name__ == 'Decl':
+        #The logic is at visit_Decl
+        pass
+      elif self.parent_node.__class__.__name__ == 'ExprList':
+        #The logic is at visit_FuncCall
+        pass
+      else:
+        descendant.set_refer(True, node.coord)
+
+  def visit_ID(self, node):
+    # If the parent is a FuncCall, this ID is a function name.
+    if self.parent_node.__class__.__name__ != 'FuncCall':
+      self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_StructRef(self, node):
+    self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_ArrayRef(self, node):
+    self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_UnaryOp(self, node):
+    if node.op == '&' or node.op == '*':
+      self.process_possible_lvalue(node)
+    self.generic_visit(node)
+
+  def visit_FuncCall(self, node):
+    if node.name.__class__.__name__ == 'ID':
+      if node.name.name in self.func_dictionary:
+        if node.name.name not in self.func_history:
+          self.func_history[node.name.name] = True
+          func_def_node = self.func_dictionary[node.name.name]
+          call_param_map = self.process_func_call(node, func_def_node)
+
+          visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                                     self.func_dictionary, False,
+                                     call_param_map, self.global_id_tree,
+                                     self.func_history, self.unknown)
+          visitor.visit(func_def_node.body)
+    else:
+      self.unknown.append(node)
+    self.generic_visit(node)
+
+  def process_func_call(self, func_call_node, func_def_node):
+    # set up a refer/assign for func parameters
+    # return call_param_map
+    call_param_ls = func_call_node.args.exprs
+    call_param_map = {}
+
+    func_decl = func_def_node.decl.type
+    decl_param_ls = func_decl.args.params
+    for param_node, decl_node in zip(call_param_ls, decl_param_ls):
+      id_chain = self.process_lvalue(param_node)
+      if id_chain != None:
+        descendant = self.id_tree_stack.add_id_node(id_chain)
+        if descendant == None:
+          self.unknown.append(param_node)
+        else:
+          decl_status = descendant.get_decl_status()
+          if decl_status != None:
+            if decl_status.struct_item == None:
+              if decl_status.is_ptr_decl == True:
+                descendant.set_assign(True, param_node.coord)
+                descendant.set_refer(True, param_node.coord)
+              else:
+                descendant.set_refer(True, param_node.coord)
+            else:
+              call_param_map[decl_node.name] = descendant
+          else:
+            self.unknown.append(param_node)
+      else:
+        self.unknown.append(param_node)
+    return call_param_map
+
+
+def build_global_id_tree(ast, struct_info):
+  global_id_tree = IDStatusNode()
+  for node in ast.ext:
+    if node.__class__.__name__ == 'Decl':
+      # id tree is for tracking assign/refer status
+      # we don't care about function id because they can't be changed
+      if node.type.__class__.__name__ != 'FuncDecl':
+        decl_status = parse_decl_node(struct_info, node)
+        descendant = global_id_tree.add_child(decl_status.name, decl_status)
+  return global_id_tree
+
+
+class FuncAnalyzer():
+
+  def __init__(self):
+    self.ast = get_av1_ast()
+    self.struct_info = build_struct_info(self.ast)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.global_id_tree = build_global_id_tree(self.ast, self.struct_info)
+
+  def analyze(self, func_name):
+    if func_name in self.func_dictionary:
+      func_def_node = self.func_dictionary[func_name]
+      visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                                 self.func_dictionary, True, None,
+                                 self.global_id_tree)
+      visitor.visit(func_def_node.body)
+      root = visitor.get_id_tree_stack()
+      root.top().show()
+    else:
+      print(func_name, "doesn't exist")
+
+
+if __name__ == '__main__':
+  fa = FuncAnalyzer()
+  fa.analyze('tpl_get_satd_cost')
+  pass
diff --git a/media/libaom/src/tools/auto_refactor/av1_preprocess.py b/media/libaom/src/tools/auto_refactor/av1_preprocess.py
new file mode 100644
index 0000000000..ea76912cf1
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/av1_preprocess.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+import os
+import sys
+
+
+def is_code_file(filename):
+  return filename.endswith(".c") or filename.endswith(".h")
+
+
+def is_simd_file(filename):
+  simd_keywords = [
+      "avx2", "sse2", "sse3", "ssse3", "sse4", "dspr2", "neon", "msa", "simd",
+      "x86"
+  ]
+  for keyword in simd_keywords:
+    if filename.find(keyword) >= 0:
+      return True
+  return False
+
+
+def get_code_file_list(path, exclude_file_set):
+  code_file_list = []
+  for cur_dir, sub_dir, file_list in os.walk(path):
+    for filename in file_list:
+      if is_code_file(filename) and not is_simd_file(
+          filename) and filename not in exclude_file_set:
+        file_path = os.path.join(cur_dir, filename)
+        code_file_list.append(file_path)
+  return code_file_list
+
+
+def av1_exclude_file_set():
+  exclude_file_set = {
+      "cfl_ppc.c",
+      "ppc_cpudetect.c",
+  }
+  return exclude_file_set
+
+
+def get_av1_pp_command(fake_header_dir, code_file_list):
+  pre_command = "gcc -w -nostdinc -E -I./ -I../ -I" + fake_header_dir + (" "
+                                                                         "-D'ATTRIBUTE_PACKED='"
+                                                                         " "
+                                                                         "-D'__attribute__(x)='"
+                                                                         " "
+                                                                         "-D'__inline__='"
+                                                                         " "
+                                                                         "-D'float_t=float'"
+                                                                         " "
+                                                                         "-D'DECLARE_ALIGNED(n,"
+                                                                         " typ,"
+                                                                         " "
+                                                                         "val)=typ"
+                                                                         " val'"
+                                                                         " "
+                                                                         "-D'volatile='"
+                                                                         " "
+                                                                         "-D'AV1_K_MEANS_DIM=2'"
+                                                                         " "
+                                                                         "-D'INLINE='"
+                                                                         " "
+                                                                         "-D'AOM_INLINE='"
+                                                                         " "
+                                                                         "-D'AOM_FORCE_INLINE='"
+                                                                         " "
+                                                                         "-D'inline='"
+                                                                         )
+  return pre_command + " " + " ".join(code_file_list)
+
+
+def modify_av1_rtcd(build_dir):
+  av1_rtcd = os.path.join(build_dir, "config/av1_rtcd.h")
+  fp = open(av1_rtcd)
+  string = fp.read()
+  fp.close()
+  new_string = string.replace("#ifdef RTCD_C", "#if 0")
+  fp = open(av1_rtcd, "w")
+  fp.write(new_string)
+  fp.close()
+
+
+def preprocess_av1(aom_dir, build_dir, fake_header_dir):
+  cur_dir = os.getcwd()
+  output = os.path.join(cur_dir, "av1_pp.c")
+  path_list = [
+      os.path.join(aom_dir, "av1/encoder"),
+      os.path.join(aom_dir, "av1/common")
+  ]
+  code_file_list = []
+  for path in path_list:
+    path = os.path.realpath(path)
+    code_file_list.extend(get_code_file_list(path, av1_exclude_file_set()))
+  modify_av1_rtcd(build_dir)
+  cmd = get_av1_pp_command(fake_header_dir, code_file_list) + " >" + output
+  os.chdir(build_dir)
+  os.system(cmd)
+  os.chdir(cur_dir)
+
+
+if __name__ == "__main__":
+  aom_dir = sys.argv[1]
+  build_dir = sys.argv[2]
+  fake_header_dir = sys.argv[3]
+  preprocess_av1(aom_dir, build_dir, fake_header_dir)
diff --git a/media/libaom/src/av1/encoder/av1_multi_thread.h b/media/libaom/src/tools/auto_refactor/c_files/decl_status_code.c
index 2a1cc7d6dd..bd445ab1b5 100644
--- a/media/libaom/src/av1/encoder/av1_multi_thread.h
+++ b/media/libaom/src/tools/auto_refactor/c_files/decl_status_code.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,13 +9,23 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_AV1_MULTI_THREAD_H
-#define AV1_ENCODER_AV1_MULTI_THREAD_H
+typedef struct S1 {
+  int x;
+} T1;
 
-#include "av1/encoder/encoder.h"
+int parse_decl_node_2() { int arr[3]; }
 
-void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows);
+int parse_decl_node_3() { int *a; }
 
-void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+int parse_decl_node_4() { T1 t1[3]; }
 
-#endif  // AV1_ENCODER_AV1_MULTI_THREAD_H
+int parse_decl_node_5() { T1 *t2[3]; }
+
+int parse_decl_node_6() { T1 t3[3][3]; }
+
+int main() {
+  int a;
+  T1 t1;
+  struct S1 s1;
+  T1 *t2;
+}
diff --git a/media/libaom/src/tools/auto_refactor/c_files/func_in_out.c b/media/libaom/src/tools/auto_refactor/c_files/func_in_out.c
new file mode 100644
index 0000000000..67ab58d520
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/c_files/func_in_out.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct XD {
+  int u;
+  int v;
+} XD;
+
+typedef struct RD {
+  XD *xd;
+  int u;
+  int v;
+} RD;
+
+typedef struct VP9_COMP {
+  int y;
+  RD *rd;
+  RD rd2;
+  int arr[3];
+  union {
+    int z;
+  };
+  struct {
+    int w;
+  };
+} VP9_COMP;
+
+int sub_func(VP9_COMP *cpi, int b) {
+  int d;
+  cpi->y += 1;
+  cpi->y -= b;
+  d = cpi->y * 2;
+  return d;
+}
+
+int func_id_forrest_show(VP9_COMP *cpi, int b) {
+  int c = 2;
+  int x = cpi->y + c * 2 + 1;
+  int y;
+  RD *rd = cpi->rd;
+  y = cpi->rd->u;
+  return x + y;
+}
+
+int func_link_id_chain_1(VP9_COMP *cpi) {
+  RD *rd = cpi->rd;
+  rd->u = 0;
+}
+
+int func_link_id_chain_2(VP9_COMP *cpi) {
+  RD *rd = cpi->rd;
+  XD *xd = rd->xd;
+  xd->u = 0;
+}
+
+int func_assign_refer_status_1(VP9_COMP *cpi) { RD *rd = cpi->rd; }
+
+int func_assign_refer_status_2(VP9_COMP *cpi) {
+  RD *rd2;
+  rd2 = cpi->rd;
+}
+
+int func_assign_refer_status_3(VP9_COMP *cpi) {
+  int a;
+  a = cpi->y;
+}
+
+int func_assign_refer_status_4(VP9_COMP *cpi) {
+  int *b;
+  b = &cpi->y;
+}
+
+int func_assign_refer_status_5(VP9_COMP *cpi) {
+  RD *rd5;
+  rd5 = &cpi->rd2;
+}
+
+int func_assign_refer_status_6(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  cpi->rd = cpi2->rd;
+}
+
+int func_assign_refer_status_7(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  cpi->arr[3] = 0;
+}
+
+int func_assign_refer_status_8(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  int x = cpi->arr[3];
+}
+
+int func_assign_refer_status_9(VP9_COMP *cpi) {
+  {
+    RD *rd = cpi->rd;
+    { rd->u = 0; }
+  }
+}
+
+int func_assign_refer_status_10(VP9_COMP *cpi) { cpi->arr[cpi->rd->u] = 0; }
+
+int func_assign_refer_status_11(VP9_COMP *cpi) {
+  RD *rd11 = &cpi->rd2;
+  rd11->v = 1;
+}
+
+int func_assign_refer_status_12(VP9_COMP *cpi, VP9_COMP *cpi2) {
+  *cpi->rd = *cpi2->rd;
+}
+
+int func_assign_refer_status_13(VP9_COMP *cpi) {
+  cpi->z = 0;
+  cpi->w = 0;
+}
+
+int func(VP9_COMP *cpi, int x) {
+  int a;
+  cpi->y = 4;
+  a = 3 + cpi->y;
+  a = a * x;
+  cpi->y *= 4;
+  RD *ref_rd = cpi->rd;
+  ref_rd->u = 0;
+  cpi->rd2.v = 1;
+  cpi->rd->v = 1;
+  RD *ref_rd2 = &cpi->rd2;
+  RD **ref_rd3 = &(&cpi->rd2);
+  int b = sub_func(cpi, a);
+  cpi->rd->v++;
+  return b;
+}
+
+int func_sub_call_1(VP9_COMP *cpi2, int x) { cpi2->y = 4; }
+
+int func_call_1(VP9_COMP *cpi, int y) { func_sub_call_1(cpi, y); }
+
+int func_sub_call_2(VP9_COMP *cpi2, RD *rd, int x) { rd->u = 0; }
+
+int func_call_2(VP9_COMP *cpi, int y) { func_sub_call_2(cpi, &cpi->rd, y); }
+
+int func_sub_call_3(VP9_COMP *cpi2, int x) {}
+
+int func_call_3(VP9_COMP *cpi, int y) { func_sub_call_3(cpi, ++cpi->y); }
+
+int func_sub_sub_call_4(VP9_COMP *cpi3, XD *xd) {
+  cpi3->rd.u = 0;
+  xd->u = 0;
+}
+
+int func_sub_call_4(VP9_COMP *cpi2, RD *rd) {
+  func_sub_sub_call_4(cpi2, rd->xd);
+}
+
+int func_call_4(VP9_COMP *cpi, int y) { func_sub_call_4(cpi, &cpi->rd); }
+
+int func_sub_call_5(VP9_COMP *cpi) {
+  cpi->y = 2;
+  func_call_5(cpi);
+}
+
+int func_call_5(VP9_COMP *cpi) { func_sub_call_5(cpi); }
+
+int func_compound_1(VP9_COMP *cpi) {
+  for (int i = 0; i < 10; ++i) {
+    cpi->y++;
+  }
+}
+
+int func_compound_2(VP9_COMP *cpi) {
+  for (int i = 0; i < cpi->y; ++i) {
+    cpi->rd->u = i;
+  }
+}
+
+int func_compound_3(VP9_COMP *cpi) {
+  int i = 3;
+  while (i > 0) {
+    cpi->rd->u = i;
+    i--;
+  }
+}
+
+int func_compound_4(VP9_COMP *cpi) {
+  while (cpi->y-- >= 0) {
+  }
+}
+
+int func_compound_5(VP9_COMP *cpi) {
+  do {
+  } while (cpi->y-- >= 0);
+}
+
+int func_compound_6(VP9_COMP *cpi) {
+  for (int i = 0; i < 10; ++i) cpi->y--;
+}
+
+int main() {
+  int x;
+  VP9_COMP cpi;
+  RD rd;
+  cpi->rd = rd;
+  func(&cpi, x);
+}
diff --git a/media/libaom/src/av1/common/cdef_block_sse4.c b/media/libaom/src/tools/auto_refactor/c_files/global_variable.c
index 349329af64..26d5385e97 100644
--- a/media/libaom/src/av1/common/cdef_block_sse4.c
+++ b/media/libaom/src/tools/auto_refactor/c_files/global_variable.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,6 +9,19 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse4_1
-#include "av1/common/cdef_block_simd.h"
+extern const int global_a[13];
+
+const int global_b = 0;
+
+typedef struct S1 {
+  int x;
+} T1;
+
+struct S3 {
+  int x;
+} s3;
+
+int func_global_1(int *a) {
+  *a = global_a[3];
+  return 0;
+}
diff --git a/media/libaom/src/tools/auto_refactor/c_files/parse_lvalue.c b/media/libaom/src/tools/auto_refactor/c_files/parse_lvalue.c
new file mode 100644
index 0000000000..97113efc15
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/c_files/parse_lvalue.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct RD {
+  int u;
+  int v;
+  int arr[3];
+} RD;
+
+typedef struct VP9_COMP {
+  int y;
+  RD *rd;
+  RD rd2;
+  RD rd3[2];
+} VP9_COMP;
+
+int parse_lvalue_2(VP9_COMP *cpi) { RD *rd2 = &cpi->rd2; }
+
+int func(VP9_COMP *cpi, int x) {
+  cpi->rd->u = 0;
+
+  int y;
+  y = 0;
+
+  cpi->rd2.v = 0;
+
+  cpi->rd->arr[2] = 0;
+
+  cpi->rd3[1]->arr[2] = 0;
+
+  return 0;
+}
+
+int main() {
+  int x = 0;
+  VP9_COMP cpi;
+  func(&cpi, x);
+}
diff --git a/media/libaom/src/tools/auto_refactor/c_files/simple_code.c b/media/libaom/src/tools/auto_refactor/c_files/simple_code.c
new file mode 100644
index 0000000000..dd89a15621
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/c_files/simple_code.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct S {
+  int x;
+  int y;
+  int z;
+} S;
+
+typedef struct T {
+  S s;
+} T;
+
+int d(S *s) {
+  ++s->x;
+  s->x--;
+  s->y = s->y + 1;
+  int *c = &s->x;
+  S ss;
+  ss.x = 1;
+  ss.x += 2;
+  ss.z *= 2;
+  return 0;
+}
+int b(S *s) {
+  d(s);
+  return 0;
+}
+int c(int x) {
+  if (x) {
+    c(x - 1);
+  } else {
+    S s;
+    d(&s);
+  }
+  return 0;
+}
+int a(S *s) {
+  b(s);
+  c(1);
+  return 0;
+}
+int e() {
+  c(0);
+  return 0;
+}
+int main() {
+  int p = 3;
+  S s;
+  s.x = p + 1;
+  s.y = 2;
+  s.z = 3;
+  a(&s);
+  T t;
+  t.s.x = 3;
+}
diff --git a/media/libaom/src/aom_ports/system_state.h b/media/libaom/src/tools/auto_refactor/c_files/struct_code.c
index 6640839d82..e14372c83e 100644
--- a/media/libaom/src/aom_ports/system_state.h
+++ b/media/libaom/src/tools/auto_refactor/c_files/struct_code.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,15 +9,41 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
-#define AOM_AOM_PORTS_SYSTEM_STATE_H_
+typedef struct S1 {
+  int x;
+} T1;
 
-#include "config/aom_config.h"
+struct S3 {
+  int x;
+};
 
-#if ARCH_X86 || ARCH_X86_64
-void aom_reset_mmx_state(void);
-#define aom_clear_system_state() aom_reset_mmx_state()
-#else
-#define aom_clear_system_state()
-#endif  // ARCH_X86 || ARCH_X86_64
-#endif  // AOM_AOM_PORTS_SYSTEM_STATE_H_
+typedef struct {
+  int x;
+  struct S3 s3;
+} T4;
+
+typedef union U5 {
+  int x;
+  double y;
+} T5;
+
+typedef struct S6 {
+  struct {
+    int x;
+  };
+  union {
+    int y;
+    int z;
+  };
+} T6;
+
+typedef struct S7 {
+  struct {
+    int x;
+  } y;
+  union {
+    int w;
+  } z;
+} T7;
+
+int main() {}
diff --git a/media/libaom/src/tools/auto_refactor/test_auto_refactor.py b/media/libaom/src/tools/auto_refactor/test_auto_refactor.py
new file mode 100644
index 0000000000..6b1e269efa
--- /dev/null
+++ b/media/libaom/src/tools/auto_refactor/test_auto_refactor.py
@@ -0,0 +1,675 @@
+#!/usr/bin/env python
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+import pprint
+import re
+import os, sys
+import io
+import unittest as googletest
+
+sys.path[0:0] = ['.', '..']
+
+from pycparser import c_parser, parse_file
+from pycparser.c_ast import *
+from pycparser.c_parser import CParser, Coord, ParseError
+
+from auto_refactor import *
+
+
+def get_c_file_path(filename):
+  return os.path.join('c_files', filename)
+
+
+class TestStructInfo(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('struct_code.c')
+    self.ast = parse_file(filename)
+
+  def test_build_struct_info(self):
+    struct_info = build_struct_info(self.ast)
+    typedef_name_dic = struct_info.typedef_name_dic
+    self.assertEqual('T1' in typedef_name_dic, True)
+    self.assertEqual('T4' in typedef_name_dic, True)
+    self.assertEqual('T5' in typedef_name_dic, True)
+
+    struct_name_dic = struct_info.struct_name_dic
+    struct_name = 'S1'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, False)
+
+    struct_name = 'S3'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, False)
+
+    struct_name = 'U5'
+    self.assertEqual(struct_name in struct_name_dic, True)
+    struct_item = struct_name_dic[struct_name]
+    self.assertEqual(struct_item.is_union, True)
+
+    self.assertEqual(len(struct_info.struct_item_list), 6)
+
+  def test_get_child_decl_status(self):
+    struct_info = build_struct_info(self.ast)
+    struct_item = struct_info.typedef_name_dic['T4']
+
+    decl_status = struct_item.child_decl_map['x']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['s3']
+    self.assertEqual(decl_status.struct_item.struct_name, 'S3')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    struct_item = struct_info.typedef_name_dic['T6']
+    decl_status = struct_item.child_decl_map['x']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['y']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = struct_item.child_decl_map['z']
+    self.assertEqual(decl_status.struct_item, None)
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    struct_item = struct_info.typedef_name_dic['T7']
+    decl_status = struct_item.child_decl_map['y']
+    self.assertEqual('x' in decl_status.struct_item.child_decl_map, True)
+
+    struct_item = struct_info.typedef_name_dic['T7']
+    decl_status = struct_item.child_decl_map['z']
+    self.assertEqual('w' in decl_status.struct_item.child_decl_map, True)
+
+
+class TestParseLvalue(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('parse_lvalue.c')
+    self.ast = parse_file(filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+
+  def test_parse_lvalue(self):
+    func_node = self.func_dictionary['func']
+    func_body_items = func_node.body.block_items
+    id_list = parse_lvalue(func_body_items[0].lvalue)
+    ref_id_list = ['cpi', 'rd', 'u']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[2].lvalue)
+    ref_id_list = ['y']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[3].lvalue)
+    ref_id_list = ['cpi', 'rd2', 'v']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[4].lvalue)
+    ref_id_list = ['cpi', 'rd', 'arr']
+    self.assertEqual(id_list, ref_id_list)
+
+    id_list = parse_lvalue(func_body_items[5].lvalue)
+    ref_id_list = ['cpi', 'rd3', 'arr']
+    self.assertEqual(id_list, ref_id_list)
+
+  def test_parse_lvalue_2(self):
+    func_node = self.func_dictionary['parse_lvalue_2']
+    func_body_items = func_node.body.block_items
+    id_list = parse_lvalue(func_body_items[0].init)
+    ref_id_list = ['cpi', 'rd2']
+    self.assertEqual(id_list, ref_id_list)
+
+
+class TestIDStatusNode(googletest.TestCase):
+
+  def test_add_descendant(self):
+    root = IDStatusNode('root')
+    id_chain1 = ['cpi', 'rd', 'u']
+    id_chain2 = ['cpi', 'rd', 'v']
+    root.add_descendant(id_chain1)
+    root.add_descendant(id_chain2)
+
+    ref_children_list1 = ['cpi']
+    children_list1 = list(root.children.keys())
+    self.assertEqual(children_list1, ref_children_list1)
+
+    ref_children_list2 = ['rd']
+    children_list2 = list(root.children['cpi'].children.keys())
+    self.assertEqual(children_list2, ref_children_list2)
+
+    ref_children_list3 = ['u', 'v']
+    children_list3 = list(root.children['cpi'].children['rd'].children.keys())
+    self.assertEqual(children_list3, ref_children_list3)
+
+  def test_get_descendant(self):
+    root = IDStatusNode('root')
+    id_chain1 = ['cpi', 'rd', 'u']
+    id_chain2 = ['cpi', 'rd', 'v']
+    ref_descendant_1 = root.add_descendant(id_chain1)
+    ref_descendant_2 = root.add_descendant(id_chain2)
+
+    descendant_1 = root.get_descendant(id_chain1)
+    self.assertEqual(descendant_1 is ref_descendant_1, True)
+
+    descendant_2 = root.get_descendant(id_chain2)
+    self.assertEqual(descendant_2 is ref_descendant_2, True)
+
+    id_chain3 = ['cpi', 'rd', 'h']
+    descendant_3 = root.get_descendant(id_chain3)
+    self.assertEqual(descendant_3, None)
+
+
+class TestFuncInOut(googletest.TestCase):
+
+  def setUp(self):
+    c_filename = get_c_file_path('func_in_out.c')
+    self.ast = parse_file(c_filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.struct_info = build_struct_info(self.ast)
+
+  def test_get_func_param_id_map(self):
+    func_def_node = self.func_dictionary['func']
+    param_id_map = get_func_param_id_map(func_def_node)
+    ref_param_id_map_keys = ['cpi', 'x']
+    self.assertEqual(list(param_id_map.keys()), ref_param_id_map_keys)
+
+  def test_assign_refer_status_1(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    ref_link_id_chain = ['cpi', 'rd']
+    self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_2(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd2']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+    ref_link_id_chain = ['cpi', 'rd']
+    self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_3(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['a']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_4(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['b']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_5(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    body_id_tree = visitor.body_id_tree
+
+    id_chain = ['rd5']
+    descendant = body_id_tree.get_descendant(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd2']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_6(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_6']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+    id_chain = ['cpi2', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+    self.assertEqual(None, descendant.get_link_id_chain())
+
+  def test_assign_refer_status_7(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_7']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_8(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_8']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_assign_refer_status_9(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_9']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_10(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_10']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+    id_chain = ['cpi', 'arr']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_11(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_11']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd2', 'v']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_assign_refer_status_12(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_12']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi2', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_assign_refer_status_13(self):
+    func_def_node = self.func_dictionary['func_assign_refer_status_13']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'z']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'w']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_id_status_forrest_1(self):
+    func_def_node = self.func_dictionary['func']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack().top()
+    children_names = set(root.get_children().keys())
+    ref_children_names = set(['cpi', 'x'])
+    self.assertEqual(children_names, ref_children_names)
+
+    root = visitor.body_id_tree
+    children_names = set(root.get_children().keys())
+    ref_children_names = set(['a', 'ref_rd', 'ref_rd2', 'ref_rd3', 'b'])
+    self.assertEqual(children_names, ref_children_names)
+
+  def test_id_status_forrest_show(self):
+    func_def_node = self.func_dictionary['func_id_forrest_show']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    visitor.get_id_tree_stack().top().show()
+
+  def test_id_status_forrest_2(self):
+    func_def_node = self.func_dictionary['func_id_forrest_show']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack().top()
+    self.assertEqual(root, root.root)
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_descendant(id_chain)
+    self.assertEqual(root, descendant.root)
+
+    id_chain = ['b']
+    descendant = root.get_descendant(id_chain)
+    self.assertEqual(root, descendant.root)
+
+  def test_link_id_chain_1(self):
+    func_def_node = self.func_dictionary['func_link_id_chain_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+
+  def test_link_id_chain_2(self):
+    func_def_node = self.func_dictionary['func_link_id_chain_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'xd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+
+  def test_func_call_1(self):
+    func_def_node = self.func_dictionary['func_call_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_call_2(self):
+    func_def_node = self.func_dictionary['func_call_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_call_3(self):
+    func_def_node = self.func_dictionary['func_call_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_call_4(self):
+    func_def_node = self.func_dictionary['func_call_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+    id_chain = ['cpi', 'rd', 'xd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_call_5(self):
+    func_def_node = self.func_dictionary['func_call_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_1(self):
+    func_def_node = self.func_dictionary['func_compound_1']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_2(self):
+    func_def_node = self.func_dictionary['func_compound_2']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), False)
+    self.assertEqual(descendant.get_refer(), True)
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_3(self):
+    func_def_node = self.func_dictionary['func_compound_3']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+
+    id_chain = ['cpi', 'rd', 'u']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), False)
+
+  def test_func_compound_4(self):
+    func_def_node = self.func_dictionary['func_compound_4']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_5(self):
+    func_def_node = self.func_dictionary['func_compound_5']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+  def test_func_compound_6(self):
+    func_def_node = self.func_dictionary['func_compound_6']
+    visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+                               self.func_dictionary)
+    visitor.visit(func_def_node.body)
+    root = visitor.get_id_tree_stack()
+    id_chain = ['cpi', 'y']
+    descendant = root.get_id_node(id_chain)
+    self.assertEqual(descendant.get_assign(), True)
+    self.assertEqual(descendant.get_refer(), True)
+
+
+class TestDeclStatus(googletest.TestCase):
+
+  def setUp(self):
+    filename = get_c_file_path('decl_status_code.c')
+    self.ast = parse_file(filename)
+    self.func_dictionary = build_func_dictionary(self.ast)
+    self.struct_info = build_struct_info(self.ast)
+
+  def test_parse_decl_node(self):
+    func_def_node = self.func_dictionary['main']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'a')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[1])
+    self.assertEqual(decl_status.name, 't1')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[2])
+    self.assertEqual(decl_status.name, 's1')
+    self.assertEqual(decl_status.is_ptr_decl, False)
+
+    decl_status = parse_decl_node(self.struct_info, decl_list[3])
+    self.assertEqual(decl_status.name, 't2')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+
+  def test_parse_decl_node_2(self):
+    func_def_node = self.func_dictionary['parse_decl_node_2']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'arr')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item, None)
+
+  def test_parse_decl_node_3(self):
+    func_def_node = self.func_dictionary['parse_decl_node_3']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 'a')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item, None)
+
+  def test_parse_decl_node_4(self):
+    func_def_node = self.func_dictionary['parse_decl_node_4']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't1')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+  def test_parse_decl_node_5(self):
+    func_def_node = self.func_dictionary['parse_decl_node_5']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't2')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+  def test_parse_decl_node_6(self):
+    func_def_node = self.func_dictionary['parse_decl_node_6']
+    decl_list = func_def_node.body.block_items
+    decl_status = parse_decl_node(self.struct_info, decl_list[0])
+    self.assertEqual(decl_status.name, 't3')
+    self.assertEqual(decl_status.is_ptr_decl, True)
+    self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+    self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/media/libaom/src/tools/cpplint.py b/media/libaom/src/tools/cpplint.py
index 25fbef73d8..e3ebde2f5a 100644..100755
--- a/media/libaom/src/tools/cpplint.py
+++ b/media/libaom/src/tools/cpplint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -51,16 +51,23 @@ import sre_compile
 import string
 import sys
 import unicodedata
+import sysconfig
+
+try:
+  xrange          # Python 2
+except NameError:
+  xrange = range  # Python 3
 
 
 _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
+                   [--linelength=digits] [--headers=x,y,...]
+                   [--quiet]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+    https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
 
   Every problem is given a confidence score from 1-5, with 5 meaning we are
   certain of the problem, and 1 meaning it could be a legitimate construct.
@@ -83,6 +90,9 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
     verbose=#
       Specify a number 0-5 to restrict errors to certain verbosity levels.
 
+    quiet
+      Don't print anything if no errors are found.
+
     filter=-x,+y,...
       Specify a comma-separated list of category-filters to apply: only
       error messages whose category names pass the filters will be printed.
@@ -114,12 +124,13 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
       ignored.
 
       Examples:
-        Assuing that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
+        Assuming that top/src/.git exists (and cwd=top/src), the header guard
+        CPP variables for top/src/chrome/browser/ui/browser.h are:
 
         No flag => CHROME_BROWSER_UI_BROWSER_H_
         --root=chrome => BROWSER_UI_BROWSER_H_
         --root=chrome/browser => UI_BROWSER_H_
+        --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_
 
     linelength=digits
       This is the allowed line length for the project. The default value is
@@ -133,6 +144,57 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
 
       Examples:
         --extensions=hpp,cpp
+
+    headers=x,y,...
+      The header extensions that cpplint will treat as .h in checks. Values are
+      automatically added to --extensions list.
+
+      Examples:
+        --headers=hpp,hxx
+        --headers=hpp
+
+    cpplint.py supports per-directory configurations specified in CPPLINT.cfg
+    files. CPPLINT.cfg file can contain a number of key=value pairs.
+    Currently the following options are supported:
+
+      set noparent
+      filter=+filter1,-filter2,...
+      exclude_files=regex
+      linelength=80
+      root=subdir
+      headers=x,y,...
+
+    "set noparent" option prevents cpplint from traversing directory tree
+    upwards looking for more .cfg files in parent directories. This option
+    is usually placed in the top-level project directory.
+
+    The "filter" option is similar in function to --filter flag. It specifies
+    message filters in addition to the |_DEFAULT_FILTERS| and those specified
+    through --filter command-line flag.
+
+    "exclude_files" allows to specify a regular expression to be matched against
+    a file name. If the expression matches, the file is skipped and not run
+    through liner.
+
+    "linelength" allows to specify the allowed line length for the project.
+
+    The "root" option is similar in function to the --root flag (see example
+    above). Paths are relative to the directory of the CPPLINT.cfg.
+
+    The "headers" option is similar in function to the --headers flag
+    (see example above).
+
+    CPPLINT.cfg has an effect on files in the same directory and all
+    sub-directories, unless overridden by a nested configuration file.
+
+      Example file:
+        filter=-build/include_order,+build/include_alpha
+        exclude_files=.*\.cc
+
+    The above example disables build/include_order warning and enables
+    build/include_alpha as well as excludes all .cc from being
+    processed by linter, in the current directory (where the .cfg
+    file is located) and all sub-directories.
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -140,81 +202,101 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
 # If you add a new error message with a new category, add it to the list
 # here!  cpplint_unittest.py should tell you if you forget to do this.
 _ERROR_CATEGORIES = [
-  'build/class',
-  'build/deprecated',
-  'build/endif_comment',
-  'build/explicit_make_pair',
-  'build/forward_decl',
-  'build/header_guard',
-  'build/include',
-  'build/include_alpha',
-  'build/include_order',
-  'build/include_what_you_use',
-  'build/namespaces',
-  'build/printf_format',
-  'build/storage_class',
-  'legal/copyright',
-  'readability/alt_tokens',
-  'readability/braces',
-  'readability/casting',
-  'readability/check',
-  'readability/constructors',
-  'readability/fn_size',
-  'readability/function',
-  'readability/multiline_comment',
-  'readability/multiline_string',
-  'readability/namespace',
-  'readability/nolint',
-  'readability/nul',
-  'readability/streams',
-  'readability/todo',
-  'readability/utf8',
-  'runtime/arrays',
-  'runtime/casting',
-  'runtime/explicit',
-  'runtime/int',
-  'runtime/init',
-  'runtime/invalid_increment',
-  'runtime/member_string_references',
-  'runtime/memset',
-  'runtime/operator',
-  'runtime/printf',
-  'runtime/printf_format',
-  'runtime/references',
-  'runtime/sizeof',
-  'runtime/string',
-  'runtime/threadsafe_fn',
-  'runtime/vlog',
-  'whitespace/blank_line',
-  'whitespace/braces',
-  'whitespace/comma',
-  'whitespace/comments',
-  'whitespace/empty_conditional_body',
-  'whitespace/empty_loop_body',
-  'whitespace/end_of_line',
-  'whitespace/ending_newline',
-  'whitespace/forcolon',
-  'whitespace/indent',
-  'whitespace/line_length',
-  'whitespace/newline',
-  'whitespace/operators',
-  'whitespace/parens',
-  'whitespace/semicolon',
-  'whitespace/tab',
-  'whitespace/todo'
-  ]
-
-# The default state of the category filter. This is overrided by the --filter=
+    'build/class',
+    'build/c++11',
+    'build/c++14',
+    'build/c++tr1',
+    'build/deprecated',
+    'build/endif_comment',
+    'build/explicit_make_pair',
+    'build/forward_decl',
+    'build/header_guard',
+    'build/include',
+    'build/include_alpha',
+    'build/include_order',
+    'build/include_what_you_use',
+    'build/namespaces',
+    'build/printf_format',
+    'build/storage_class',
+    'legal/copyright',
+    'readability/alt_tokens',
+    'readability/braces',
+    'readability/casting',
+    'readability/check',
+    'readability/constructors',
+    'readability/fn_size',
+    'readability/inheritance',
+    'readability/multiline_comment',
+    'readability/multiline_string',
+    'readability/namespace',
+    'readability/nolint',
+    'readability/nul',
+    'readability/strings',
+    'readability/todo',
+    'readability/utf8',
+    'runtime/arrays',
+    'runtime/casting',
+    'runtime/explicit',
+    'runtime/int',
+    'runtime/init',
+    'runtime/invalid_increment',
+    'runtime/member_string_references',
+    'runtime/memset',
+    'runtime/indentation_namespace',
+    'runtime/operator',
+    'runtime/printf',
+    'runtime/printf_format',
+    'runtime/references',
+    'runtime/string',
+    'runtime/threadsafe_fn',
+    'runtime/vlog',
+    'whitespace/blank_line',
+    'whitespace/braces',
+    'whitespace/comma',
+    'whitespace/comments',
+    'whitespace/empty_conditional_body',
+    'whitespace/empty_if_body',
+    'whitespace/empty_loop_body',
+    'whitespace/end_of_line',
+    'whitespace/ending_newline',
+    'whitespace/forcolon',
+    'whitespace/indent',
+    'whitespace/line_length',
+    'whitespace/newline',
+    'whitespace/operators',
+    'whitespace/parens',
+    'whitespace/semicolon',
+    'whitespace/tab',
+    'whitespace/todo',
+    ]
+
+# These error categories are no longer enforced by cpplint, but for backwards-
+# compatibility they may still appear in NOLINT comments.
+_LEGACY_ERROR_CATEGORIES = [
+    'readability/streams',
+    'readability/function',
+    ]
+
+# The default state of the category filter. This is overridden by the --filter=
 # flag. By default all errors are on, so only add here categories that should be
 # off by default (i.e., categories that must be enabled by the --filter= flags).
 # All entries here should start with a '-' or '+', as in the --filter= flag.
 _DEFAULT_FILTERS = ['-build/include_alpha']
 
+# The default list of categories suppressed for C (not C++) files.
+_DEFAULT_C_SUPPRESSED_CATEGORIES = [
+    'readability/casting',
+    ]
+
+# The default list of categories suppressed for Linux Kernel files.
+_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [
+    'whitespace/tab',
+    ]
+
 # We used to check for high-bit characters, but after much discussion we
 # decided those were OK, as long as they were in UTF-8 and didn't represent
 # hard-coded international strings, which belong in a separate i18n file.
 
-
 # C++ headers
 _CPP_HEADERS = frozenset([
     # Legacy
@@ -304,6 +386,7 @@ _CPP_HEADERS = frozenset([
     'random',
     'ratio',
     'regex',
+    'scoped_allocator',
     'set',
     'sstream',
     'stack',
@@ -351,15 +434,40 @@ _CPP_HEADERS = frozenset([
     'cwctype',
     ])
 
+# Type names
+_TYPES = re.compile(
+    r'^(?:'
+    # [dcl.type.simple]
+    r'(char(16_t|32_t)?)|wchar_t|'
+    r'bool|short|int|long|signed|unsigned|float|double|'
+    # [support.types]
+    r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|'
+    # [cstdint.syn]
+    r'(u?int(_fast|_least)?(8|16|32|64)_t)|'
+    r'(u?int(max|ptr)_t)|'
+    r')$')
+
+
+# These headers are excluded from [build/include] and [build/include_order]
+# checks:
+# - Anything not following google file name conventions (containing an
+#   uppercase character, such as Python.h or nsStringAPI.h, for example).
+# - Lua headers.
+_THIRD_PARTY_HEADERS_PATTERN = re.compile(
+    r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
+
+# Pattern for matching FileInfo.BaseName() against test file name
+_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'
+
+# Pattern that matches only complete whitespace, possibly across multiple lines.
+_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL)
+
 # Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
+# testing/base/public/gunit.h.
 _CHECK_MACROS = [
     'DCHECK', 'CHECK',
-    'EXPECT_TRUE_M', 'EXPECT_TRUE',
-    'ASSERT_TRUE_M', 'ASSERT_TRUE',
-    'EXPECT_FALSE_M', 'EXPECT_FALSE',
-    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    'EXPECT_TRUE', 'ASSERT_TRUE',
+    'EXPECT_FALSE', 'ASSERT_FALSE',
     ]
 
 # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
@@ -372,16 +480,12 @@ for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
   _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
   _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
   _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
 
 for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
                             ('>=', 'LT'), ('>', 'LE'),
                             ('<=', 'GT'), ('<', 'GE')]:
   _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
 # Alternative tokens and their replacements.  For full list, see section 2.5
 # Alternative tokens [lex.digraph] in the C++ standard.
@@ -430,11 +534,14 @@ _MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
                         r'(?:\s+(volatile|__volatile__))?'
                         r'\s*[{(]')
 
+# Match strings that indicate we're working on a C (not C++) file.
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|'
+                            r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))')
 
-_regexp_compile_cache = {}
+# Match string that indicates we're working on a Linux Kernel file.
+_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)')
 
-# Finds occurrences of NOLINT or NOLINT(...).
-_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
+_regexp_compile_cache = {}
 
 # {str, set(int)}: a map from error categories to sets of linenumbers
 # on which those errors are expected and should be suppressed.
@@ -443,6 +550,7 @@ _error_suppressions = {}
 # The root directory used for deriving header guard CPP variable.
 # This is set by --root flag.
 _root = None
+_root_debug = False
 
 # The allowed line length of files.
 # This is set by --linelength flag.
@@ -452,8 +560,28 @@ _line_length = 80
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc.
+# This is set by --headers flag.
+_hpp_headers = set(['h'])
+
+# {str, bool}: a map from error categories to booleans which indicate if the
+# category should be suppressed for every line.
+_global_error_suppressions = {}
+
+def ProcessHppHeadersOption(val):
+  global _hpp_headers
+  try:
+    _hpp_headers = set(val.split(','))
+    # Automatically append to extensions list so it does not have to be set 2 times
+    _valid_extensions.update(_hpp_headers)
+  except ValueError:
+    PrintUsage('Header extensions must be comma separated list.')
+
+def IsHeaderExtension(file_extension):
+  return file_extension in _hpp_headers
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
-  """Updates the global list of error-suppressions.
+  """Updates the global list of line error-suppressions.
 
   Parses any NOLINT comments on the current line, updating the global
   error_suppressions store.  Reports an error if the NOLINT comment
@@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error):
     linenum: int, the number of the current line.
     error: function, an error handler.
   """
-  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
-  matched = _RE_SUPPRESSION.search(raw_line)
+  matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
   if matched:
-    category = matched.group(1)
+    if matched.group(1):
+      suppressed_line = linenum + 1
+    else:
+      suppressed_line = linenum
+    category = matched.group(2)
     if category in (None, '(*)'):  # => "suppress all"
-      _error_suppressions.setdefault(None, set()).add(linenum)
+      _error_suppressions.setdefault(None, set()).add(suppressed_line)
     else:
       if category.startswith('(') and category.endswith(')'):
         category = category[1:-1]
         if category in _ERROR_CATEGORIES:
-          _error_suppressions.setdefault(category, set()).add(linenum)
-        else:
+          _error_suppressions.setdefault(category, set()).add(suppressed_line)
+        elif category not in _LEGACY_ERROR_CATEGORIES:
           error(filename, linenum, 'readability/nolint', 5,
                 'Unknown NOLINT error category: %s' % category)
 
 
+def ProcessGlobalSuppresions(lines):
+  """Updates the list of global error suppressions.
+
+  Parses any lint directives in the file that have global effect.
+
+  Args:
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+  """
+  for line in lines:
+    if _SEARCH_C_FILE.search(line):
+      for category in _DEFAULT_C_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+    if _SEARCH_KERNEL_FILE.search(line):
+      for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+
+
 def ResetNolintSuppressions():
-  "Resets the set of NOLINT suppressions to empty."
+  """Resets the set of NOLINT suppressions to empty."""
   _error_suppressions.clear()
+  _global_error_suppressions.clear()
 
 
 def IsErrorSuppressedByNolint(category, linenum):
   """Returns true if the specified error category is suppressed on this line.
 
   Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
+  ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions.
 
   Args:
     category: str, the category of the error.
     linenum: int, the current line number.
   Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
+    bool, True iff the error should be suppressed due to a NOLINT comment or
+    global suppression.
   """
-  return (linenum in _error_suppressions.get(category, set()) or
+  return (_global_error_suppressions.get(category, False) or
+          linenum in _error_suppressions.get(category, set()) or
           linenum in _error_suppressions.get(None, set()))
 
+
 def Match(pattern, s):
   """Matches the string with the pattern, caching the compiled regexp."""
   # The regexp compilation caching is inlined in both Match and Search for
@@ -536,11 +689,17 @@ def Search(pattern, s):
   return _regexp_compile_cache[pattern].search(s)
 
 
-class _IncludeState(dict):
+def _IsSourceExtension(s):
+  """File extension (excluding dot) matches a source file extension."""
+  return s in ('c', 'cc', 'cpp', 'cxx')
+
+
+class _IncludeState(object):
   """Tracks line numbers for includes, and the order in which includes appear.
 
-  As a dict, an _IncludeState object serves as a mapping between include
-  filename and line number on which that file was included.
+  include_list contains list of lists of (header, line number) pairs.
+  It's a lists of lists rather than just one flat list to make it
+  easier to update across preprocessor boundaries.
 
   Call CheckNextIncludeOrder() once for each header in the file, passing
   in the type constants defined above. Calls in an illegal order will
@@ -571,15 +730,42 @@ class _IncludeState(dict):
       }
 
   def __init__(self):
-    dict.__init__(self)
-    self.ResetSection()
+    self.include_list = [[]]
+    self.ResetSection('')
+
+  def FindHeader(self, header):
+    """Check if a header has already been included.
 
-  def ResetSection(self):
+    Args:
+      header: header to check.
+    Returns:
+      Line number of previous occurrence, or -1 if the header has not
+      been seen before.
+    """
+    for section_list in self.include_list:
+      for f in section_list:
+        if f[0] == header:
+          return f[1]
+    return -1
+
+  def ResetSection(self, directive):
+    """Reset section checking for preprocessor directive.
+
+    Args:
+      directive: preprocessor directive (e.g. "if", "else").
+    """
     # The name of the current section.
     self._section = self._INITIAL_SECTION
     # The path of last found header.
     self._last_header = ''
 
+    # Update list of includes.  Note that we never pop from the
+    # include list.
+    if directive in ('if', 'ifdef', 'ifndef'):
+      self.include_list.append([])
+    elif directive in ('else', 'elif'):
+      self.include_list[-1] = []
+
   def SetLastHeader(self, header_path):
     self._last_header = header_path
 
@@ -615,7 +801,7 @@ class _IncludeState(dict):
     # If previous line was a blank line, assume that the headers are
     # intentionally sorted the way they are.
     if (self._last_header > header_path and
-        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+        Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])):
       return False
     return True
 
@@ -681,8 +867,11 @@ class _CppLintState(object):
     self.error_count = 0    # global count of reported errors
     # filters to apply when emitting error messages
     self.filters = _DEFAULT_FILTERS[:]
+    # backup of filter list. Used to restore the state after each file.
+    self._filters_backup = self.filters[:]
     self.counting = 'total'  # In what way are we counting errors?
     self.errors_by_category = {}  # string to int dict storing error counts
+    self.quiet = False  # Suppress non-error messagess?
 
     # output format:
     # "emacs" - format that emacs can parse (default)
@@ -693,6 +882,12 @@ class _CppLintState(object):
     """Sets the output format for errors."""
     self.output_format = output_format
 
+  def SetQuiet(self, quiet):
+    """Sets the module's quiet settings, and returns the previous setting."""
+    last_quiet = self.quiet
+    self.quiet = quiet
+    return last_quiet
+
   def SetVerboseLevel(self, level):
     """Sets the module's verbosity, and returns the previous setting."""
     last_verbose_level = self.verbose_level
@@ -719,6 +914,10 @@ class _CppLintState(object):
     """
     # Default filters always have less priority than the flag ones.
     self.filters = _DEFAULT_FILTERS[:]
+    self.AddFilters(filters)
+
+  def AddFilters(self, filters):
+    """ Adds more filters to the existing list of error-message filters. """
     for filt in filters.split(','):
       clean_filt = filt.strip()
       if clean_filt:
@@ -728,6 +927,14 @@ class _CppLintState(object):
         raise ValueError('Every filter in --filters must start with + or -'
                          ' (%s does not)' % filt)
 
+  def BackupFilters(self):
+    """ Saves the current filter list to backup storage."""
+    self._filters_backup = self.filters[:]
+
+  def RestoreFilters(self):
+    """ Restores filters previously backed up."""
+    self.filters = self._filters_backup[:]
+
   def ResetErrorCounts(self):
     """Sets the module's error statistic back to zero."""
     self.error_count = 0
@@ -748,7 +955,7 @@ class _CppLintState(object):
     for category, count in self.errors_by_category.iteritems():
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
-    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+    sys.stdout.write('Total errors found: %d\n' % self.error_count)
 
 _cpplint_state = _CppLintState()
 
@@ -762,6 +969,14 @@ def _SetOutputFormat(output_format):
   """Sets the module's output format."""
   _cpplint_state.SetOutputFormat(output_format)
 
+def _Quiet():
+  """Return's the module's quiet setting."""
+  return _cpplint_state.quiet
+
+def _SetQuiet(quiet):
+  """Set the module's quiet status, and return previous setting."""
+  return _cpplint_state.SetQuiet(quiet)
+
 
 def _VerboseLevel():
   """Returns the module's verbosity setting."""
@@ -795,6 +1010,25 @@ def _SetFilters(filters):
   """
   _cpplint_state.SetFilters(filters)
 
+def _AddFilters(filters):
+  """Adds more filter overrides.
+
+  Unlike _SetFilters, this function does not reset the current list of filters
+  available.
+
+  Args:
+    filters: A string of comma-separated filters (eg "whitespace/indent").
+             Each filter should start with + or -; else we die.
+  """
+  _cpplint_state.AddFilters(filters)
+
+def _BackupFilters():
+  """ Saves the current filter list to backup storage."""
+  _cpplint_state.BackupFilters()
+
+def _RestoreFilters():
+  """ Restores filters previously backed up."""
+  _cpplint_state.RestoreFilters()
 
 class _FunctionState(object):
   """Tracks current function name and the number of lines in its body."""
@@ -830,6 +1064,9 @@ class _FunctionState(object):
       filename: The name of the current file.
       linenum: The number of the line to check.
     """
+    if not self.in_a_function:
+      return
+
     if Match(r'T(EST|est)', self.current_function):
       base_trigger = self._TEST_TRIGGER
     else:
@@ -857,7 +1094,7 @@ class _IncludeError(Exception):
   pass
 
 
-class FileInfo:
+class FileInfo(object):
   """Provides utility functions for filenames.
 
   FileInfo provides easy access to the components of a file's path
@@ -900,12 +1137,13 @@ class FileInfo:
 
       # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
       # searching up from the current path.
-      root_dir = os.path.dirname(fullname)
-      while (root_dir != os.path.dirname(root_dir) and
-             not os.path.exists(os.path.join(root_dir, ".git")) and
-             not os.path.exists(os.path.join(root_dir, ".hg")) and
-             not os.path.exists(os.path.join(root_dir, ".svn"))):
-        root_dir = os.path.dirname(root_dir)
+      root_dir = current_dir = os.path.dirname(fullname)
+      while current_dir != os.path.dirname(current_dir):
+        if (os.path.exists(os.path.join(current_dir, ".git")) or
+            os.path.exists(os.path.join(current_dir, ".hg")) or
+            os.path.exists(os.path.join(current_dir, ".svn"))):
+          root_dir = current_dir
+        current_dir = os.path.dirname(current_dir)
 
       if (os.path.exists(os.path.join(root_dir, ".git")) or
           os.path.exists(os.path.join(root_dir, ".hg")) or
@@ -944,7 +1182,7 @@ class FileInfo:
 
   def IsSource(self):
     """File has a source file extension."""
-    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+    return _IsSourceExtension(self.Extension()[1:])
 
 
 def _ShouldPrintError(category, confidence, linenum):
@@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum):
   # the verbosity level isn't high enough, or the filters filter it out.
   if IsErrorSuppressedByNolint(category, linenum):
     return False
+
   if confidence < _cpplint_state.verbose_level:
     return False
 
@@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message):
   if _ShouldPrintError(category, confidence, linenum):
     _cpplint_state.IncrementErrorCount(category)
     if _cpplint_state.output_format == 'vs7':
-      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
+      sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % (
+          filename, linenum, category, message, confidence))
     elif _cpplint_state.output_format == 'eclipse':
       sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
           filename, linenum, message, category, confidence))
@@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message):
 # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
 _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
     r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Matches strings.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
-# Matches characters.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
-# Matches multi-line C++ comments.
+# Match a single C style comment on the same line.
+_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
+# Matches multi-line C style comments.
 # This RE is a little bit more complicated than one might expect, because we
 # have to take care of space removals tools so we can handle comments inside
 # statements better.
@@ -1025,10 +1262,10 @@ _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
 # if this doesn't work we try on left side but only if there's a non-character
 # on the right.
 _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r"""(\s*/\*.*\*/\s*$|
-            /\*.*\*/\s+|
-         \s+/\*.*\*/(?=\W)|
-            /\*.*\*/)""", re.VERBOSE)
+    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+    _RE_PATTERN_C_COMMENTS + r'\s+|' +
+    r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+    _RE_PATTERN_C_COMMENTS + r')')
 
 
 def IsCppString(line):
@@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines):
         delimiter = None
       else:
         # Haven't found the end yet, append a blank line.
-        line = ''
+        line = '""'
 
-    else:
+    # Look for beginning of a raw string, and replace them with
+    # empty strings.  This is done in a loop to handle multiple raw
+    # strings on the same line.
+    while delimiter is None:
       # Look for beginning of a raw string.
       # See 2.14.15 [lex.string] for syntax.
-      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
-      if matched:
+      #
+      # Once we have matched a raw string, we check the prefix of the
+      # line to make sure that the line is not part of a single line
+      # comment.  It's done this way because we remove raw strings
+      # before removing comments as opposed to removing comments
+      # before removing raw strings.  This is because there are some
+      # cpplint checks that requires the comments to be preserved, but
+      # we don't want to check comments that are inside raw strings.
+      matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if (matched and
+          not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//',
+                    matched.group(1))):
         delimiter = ')' + matched.group(2) + '"'
 
         end = matched.group(3).find(delimiter)
@@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines):
         else:
           # Start of a multi-line raw string
           line = matched.group(1) + '""'
+      else:
+        break
 
     lines_without_raw_strings.append(line)
 
@@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix):
 
 def RemoveMultiLineCommentsFromRange(lines, begin, end):
   """Clears a range of lines for multi-line comments."""
-  # Having // dummy comments makes the lines non-empty, so we will not get
+  # Having // <empty> comments makes the lines non-empty, so we will not get
   # unnecessary blank line warnings later in the code.
   for i in range(begin, end):
-    lines[i] = '// dummy'
+    lines[i] = '/**/'
 
 
 def RemoveMultiLineComments(filename, lines, error):
@@ -1170,12 +1422,14 @@ def CleanseComments(line):
 
 
 class CleansedLines(object):
-  """Holds 3 copies of all lines with different preprocessing applied to them.
+  """Holds 4 copies of all lines with different preprocessing applied to them.
 
-  1) elided member contains lines without strings and comments,
-  2) lines member contains lines without comments, and
+  1) elided member contains lines without strings and comments.
+  2) lines member contains lines without comments.
   3) raw_lines member contains all the lines without processing.
-  All these three members are of <type 'list'>, and of the same length.
+  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
+     strings removed.
+  All these members are of <type 'list'>, and of the same length.
   """
 
   def __init__(self, lines):
@@ -1206,38 +1460,138 @@ class CleansedLines(object):
     Returns:
       The line with collapsed strings.
     """
-    if not _RE_PATTERN_INCLUDE.match(elided):
-      # Remove escaped characters first to make quote/single quote collapsing
-      # basic.  Things that look like escaped characters shouldn't occur
-      # outside of strings and chars.
-      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
-    return elided
+    if _RE_PATTERN_INCLUDE.match(elided):
+      return elided
+
+    # Remove escaped characters first to make quote/single quote collapsing
+    # basic.  Things that look like escaped characters shouldn't occur
+    # outside of strings and chars.
+    elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+
+    # Replace quoted strings and digit separators.  Both single quotes
+    # and double quotes are processed in the same loop, otherwise
+    # nested quotes wouldn't work.
+    collapsed = ''
+    while True:
+      # Find the first quote character
+      match = Match(r'^([^\'"]*)([\'"])(.*)$', elided)
+      if not match:
+        collapsed += elided
+        break
+      head, quote, tail = match.groups()
+
+      if quote == '"':
+        # Collapse double quoted strings
+        second_quote = tail.find('"')
+        if second_quote >= 0:
+          collapsed += head + '""'
+          elided = tail[second_quote + 1:]
+        else:
+          # Unmatched double quote, don't bother processing the rest
+          # of the line since this is probably a multiline string.
+          collapsed += elided
+          break
+      else:
+        # Found single quote, check nearby text to eliminate digit separators.
+        #
+        # There is no special handling for floating point here, because
+        # the integer/fractional/exponent parts would all be parsed
+        # correctly as long as there are digits on both sides of the
+        # separator.  So we are fine as long as we don't see something
+        # like "0.'3" (gcc 4.9.0 will not allow this literal).
+        if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head):
+          match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail)
+          collapsed += head + match_literal.group(1).replace("'", '')
+          elided = match_literal.group(2)
+        else:
+          second_quote = tail.find('\'')
+          if second_quote >= 0:
+            collapsed += head + "''"
+            elided = tail[second_quote + 1:]
+          else:
+            # Unmatched single quote
+            collapsed += elided
+            break
+
+    return collapsed
 
 
-def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
-  """Find the position just after the matching endchar.
+def FindEndOfExpressionInLine(line, startpos, stack):
+  """Find the position just after the end of current parenthesized expression.
 
   Args:
     line: a CleansedLines line.
     startpos: start searching at this position.
-    depth: nesting level at startpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at startpos.
 
   Returns:
-    On finding matching endchar: (index just after matching endchar, 0)
-    Otherwise: (-1, new depth at end of this line)
+    On finding matching end: (index just after matching end, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at end of this line)
   """
   for i in xrange(startpos, len(line)):
-    if line[i] == startchar:
-      depth += 1
-    elif line[i] == endchar:
-      depth -= 1
-      if depth == 0:
-        return (i + 1, 0)
-  return (-1, depth)
+    char = line[i]
+    if char in '([{':
+      # Found start of parenthesized expression, push to expression stack
+      stack.append(char)
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        if stack and stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (-1, None)
+      elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
+        # operator<, don't add to stack
+        continue
+      else:
+        # Tentative start of template argument list
+        stack.append('<')
+    elif char in ')]}':
+      # Found end of parenthesized expression.
+      #
+      # If we are currently expecting a matching '>', the pending '<'
+      # must have been an operator.  Remove them from expression stack.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((stack[-1] == '(' and char == ')') or
+          (stack[-1] == '[' and char == ']') or
+          (stack[-1] == '{' and char == '}')):
+        stack.pop()
+        if not stack:
+          return (i + 1, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == '>':
+      # Found potential end of template argument list.
+
+      # Ignore "->" and operator functions
+      if (i > 0 and
+          (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))):
+        continue
+
+      # Pop the stack if there is a matching '<'.  Otherwise, ignore
+      # this '>' since it must be an operator.
+      if stack:
+        if stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (i + 1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '>', the matching '<' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+  # Did not find end of expression or unbalanced parentheses on this line
+  return (-1, stack)
 
 
 def CloseExpression(clean_lines, linenum, pos):
@@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos):
   If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
   linenum/pos that correspond to the closing of the expression.
 
+  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
+  Ideally we would want to index all opening and closing parentheses once
+  and have CloseExpression be just a simple lookup, but due to preprocessor
+  tricks, this is not so easy.
+
   Args:
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
@@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos):
   """
 
   line = clean_lines.elided[linenum]
-  startchar = line[pos]
-  if startchar not in '({[<':
+  if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]):
     return (line, clean_lines.NumLines(), -1)
-  if startchar == '(': endchar = ')'
-  if startchar == '[': endchar = ']'
-  if startchar == '{': endchar = '}'
-  if startchar == '<': endchar = '>'
 
   # Check first line
-  (end_pos, num_open) = FindEndOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (end_pos, stack) = FindEndOfExpressionInLine(line, pos, [])
   if end_pos > -1:
     return (line, linenum, end_pos)
 
   # Continue scanning forward
-  while linenum < clean_lines.NumLines() - 1:
+  while stack and linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    (end_pos, num_open) = FindEndOfExpressionInLine(
-        line, 0, num_open, startchar, endchar)
+    (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack)
     if end_pos > -1:
       return (line, linenum, end_pos)
 
-  # Did not find endchar before end of file, give up
+  # Did not find end of expression before end of file, give up
   return (line, clean_lines.NumLines(), -1)
 
 
-def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
-  """Find position at the matching startchar.
+def FindStartOfExpressionInLine(line, endpos, stack):
+  """Find position at the matching start of current expression.
 
   This is almost the reverse of FindEndOfExpressionInLine, but note
   that the input position and returned position differs by 1.
@@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
   Args:
     line: a CleansedLines line.
     endpos: start searching at this position.
-    depth: nesting level at endpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at endpos.
 
   Returns:
-    On finding matching startchar: (index at matching startchar, 0)
-    Otherwise: (-1, new depth at beginning of this line)
+    On finding matching start: (index at matching start, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at beginning of this line)
   """
-  for i in xrange(endpos, -1, -1):
-    if line[i] == endchar:
-      depth += 1
-    elif line[i] == startchar:
-      depth -= 1
-      if depth == 0:
-        return (i, 0)
-  return (-1, depth)
+  i = endpos
+  while i >= 0:
+    char = line[i]
+    if char in ')]}':
+      # Found end of expression, push to expression stack
+      stack.append(char)
+    elif char == '>':
+      # Found potential end of template argument list.
+      #
+      # Ignore it if it's a "->" or ">=" or "operator>"
+      if (i > 0 and
+          (line[i - 1] == '-' or
+           Match(r'\s>=\s', line[i - 1:]) or
+           Search(r'\boperator\s*$', line[0:i]))):
+        i -= 1
+      else:
+        stack.append('>')
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        i -= 1
+      else:
+        # If there is a matching '>', we can pop the expression stack.
+        # Otherwise, ignore this '<' since it must be an operator.
+        if stack and stack[-1] == '>':
+          stack.pop()
+          if not stack:
+            return (i, None)
+    elif char in '([{':
+      # Found start of expression.
+      #
+      # If there are any unmatched '>' on the stack, they must be
+      # operators.  Remove those.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '<', the matching '>' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
 
 
 def ReverseCloseExpression(clean_lines, linenum, pos):
@@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos):
     return is the 'cleansed' line at linenum.
   """
   line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
+  if line[pos] not in ')}]>':
     return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
 
   # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
   if start_pos > -1:
     return (line, linenum, start_pos)
 
   # Continue scanning backward
-  while linenum > 0:
+  while stack and linenum > 0:
     linenum -= 1
     line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
     if start_pos > -1:
       return (line, linenum, start_pos)
 
-  # Did not find startchar before beginning of file, give up
+  # Did not find start of expression before beginning of file, give up
   return (line, 0, -1)
 
 
@@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
 
   # We'll say it should occur by line 10. Don't forget there's a
-  # dummy line at the front.
+  # placeholder line at the front.
   for line in xrange(1, min(len(lines), 11)):
     if re.search(r'Copyright', lines[line], re.I): break
   else:                       # means no copyright line was found
@@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error):
           'You should have a line: "Copyright [year] <Copyright Owner>"')
 
 
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+def PathSplitToList(path):
+  """Returns the path split into a list by the separator.
+
+  Args:
+    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')
+
+  Returns:
+    A list of path components (e.g. ['a', 'b', 'c]).
+  """
+  lst = []
+  while True:
+    (head, tail) = os.path.split(path)
+    if head == path: # absolute paths end
+      lst.append(head)
+      break
+    if tail == path: # relative paths end
+      lst.append(tail)
+      break
+
+    path = head
+    lst.append(tail)
+
+  lst.reverse()
+  return lst
+
 def GetHeaderGuardCPPVariable(filename):
   """Returns the CPP variable that should be used as a header guard.
 
@@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename):
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
   filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
+  filename = filename.replace('C++', 'cpp').replace('c++', 'cpp')
 
   fileinfo = FileInfo(filename)
   file_path_from_root = fileinfo.RepositoryName()
-  if _root:
-    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
-  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
 
+  def FixupPathFromRoot():
+    if _root_debug:
+      sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n"
+          %(_root, fileinfo.RepositoryName()))
+
+    # Process the file path with the --root flag if it was set.
+    if not _root:
+      if _root_debug:
+        sys.stderr.write("_root unspecified\n")
+      return file_path_from_root
+
+    def StripListPrefix(lst, prefix):
+      # f(['x', 'y'], ['w, z']) -> None  (not a valid prefix)
+      if lst[:len(prefix)] != prefix:
+        return None
+      # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd']
+      return lst[(len(prefix)):]
 
-def CheckForHeaderGuard(filename, lines, error):
+    # root behavior:
+    #   --root=subdir , lstrips subdir from the header guard
+    maybe_path = StripListPrefix(PathSplitToList(file_path_from_root),
+                                 PathSplitToList(_root))
+
+    if _root_debug:
+      sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," +
+          " _root=%s)\n") %(maybe_path, file_path_from_root, _root))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    #   --root=.. , will prepend the outer directory to the header guard
+    full_path = fileinfo.FullName()
+    root_abspath = os.path.abspath(_root)
+
+    maybe_path = StripListPrefix(PathSplitToList(full_path),
+                                 PathSplitToList(root_abspath))
+
+    if _root_debug:
+      sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " +
+          "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    if _root_debug:
+      sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root))
+
+    #   --root=FAKE_DIR is ignored
+    return file_path_from_root
+
+  file_path_from_root = FixupPathFromRoot()
+  return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, clean_lines, error):
   """Checks that the file contains a header guard.
 
   Logs an error if no #ifndef header guard is present.  For other
@@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error):
 
   Args:
     filename: The name of the C++ header file.
-    lines: An array of strings, each representing a line of the file.
+    clean_lines: A CleansedLines instance containing the file.
     error: The function to call with any errors found.
   """
 
+  # Don't check for header guards if there are error suppression
+  # comments somewhere in this file.
+  #
+  # Because this is silencing a warning for a nonexistent line, we
+  # only support the very specific NOLINT(build/header_guard) syntax,
+  # and not the general NOLINT or NOLINT(*) syntax.
+  raw_lines = clean_lines.lines_without_raw_strings
+  for i in raw_lines:
+    if Search(r'//\s*NOLINT\(build/header_guard\)', i):
+      return
+
   cppvar = GetHeaderGuardCPPVariable(filename)
 
-  ifndef = None
+  ifndef = ''
   ifndef_linenum = 0
-  define = None
-  endif = None
+  define = ''
+  endif = ''
   endif_linenum = 0
-  for linenum, line in enumerate(lines):
+  for linenum, line in enumerate(raw_lines):
     linesplit = line.split()
     if len(linesplit) >= 2:
       # find the first occurrence of #ifndef and #define, save arg
@@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error):
       endif = line
       endif_linenum = linenum
 
-  if not ifndef:
+  if not ifndef or not define or ifndef != define:
     error(filename, 0, 'build/header_guard', 5,
           'No #ifndef header guard found, suggested CPP variable is: %s' %
           cppvar)
     return
 
-  if not define:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #define header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
   # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
   # for backward compatibility.
   if ifndef != cppvar:
@@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error):
     if ifndef != cppvar + '_':
       error_level = 5
 
-    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+    ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum,
                             error)
     error(filename, ifndef_linenum, 'build/header_guard', error_level,
           '#ifndef header guard has wrong style, please use: %s' % cppvar)
 
-  if define != ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
-          cppvar)
+  # Check for "//" comments on endif line.
+  ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
+                          error)
+  match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
+  if match:
+    if match.group(1) == '_':
+      # Issue low severity warning for deprecated double trailing underscore
+      error(filename, endif_linenum, 'build/header_guard', 0,
+            '#endif line should be "#endif  // %s"' % cppvar)
     return
 
-  if endif != ('#endif  // %s' % cppvar):
-    error_level = 0
-    if endif != ('#endif  // %s' % (cppvar + '_')):
-      error_level = 5
+  # Didn't find the corresponding "//" comment.  If this file does not
+  # contain any "//" comments at all, it could be that the compiler
+  # only wants "/**/" comments, look for those instead.
+  no_single_line_comments = True
+  for i in xrange(1, len(raw_lines) - 1):
+    line = raw_lines[i]
+    if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line):
+      no_single_line_comments = False
+      break
 
-    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
-                            error)
-    error(filename, endif_linenum, 'build/header_guard', error_level,
-          '#endif line should be "#endif  // %s"' % cppvar)
+  if no_single_line_comments:
+    match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
+    if match:
+      if match.group(1) == '_':
+        # Low severity warning for double trailing underscore
+        error(filename, endif_linenum, 'build/header_guard', 0,
+              '#endif line should be "#endif  /* %s */"' % cppvar)
+      return
+
+  # Didn't find anything
+  error(filename, endif_linenum, 'build/header_guard', 5,
+        '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckHeaderFileIncluded(filename, include_state, error):
+  """Logs an error if a .cc file does not include its header."""
+
+  # Do not check test files
+  fileinfo = FileInfo(filename)
+  if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()):
+    return
+
+  headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h'
+  if not os.path.exists(headerfile):
+    return
+  headername = FileInfo(headerfile).RepositoryName()
+  first_include = 0
+  for section_list in include_state.include_list:
+    for f in section_list:
+      if headername in f[0] or f[0] in headername:
+        return
+      if not first_include:
+        first_include = f[1]
+
+  error(filename, first_include, 'build/include', 5,
+        '%s should include its header file %s' % (fileinfo.RepositoryName(),
+                                                  headername))
 
 
 def CheckForBadCharacters(filename, lines, error):
@@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
           'Use C++11 raw strings or concatenation instead.')
 
 
-threading_list = (
-    ('asctime(', 'asctime_r('),
-    ('ctime(', 'ctime_r('),
-    ('getgrgid(', 'getgrgid_r('),
-    ('getgrnam(', 'getgrnam_r('),
-    ('getlogin(', 'getlogin_r('),
-    ('getpwnam(', 'getpwnam_r('),
-    ('getpwuid(', 'getpwuid_r('),
-    ('gmtime(', 'gmtime_r('),
-    ('localtime(', 'localtime_r('),
-    ('rand(', 'rand_r('),
-    ('strtok(', 'strtok_r('),
-    ('ttyname(', 'ttyname_r('),
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+#  _rand();               // false positive due to substring match.
+#  ->rand();              // some member function rand().
+#  ACMRandom rand(seed);  // some variable named rand.
+#  ISAACRandom rand();    // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name.  This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+    ('strtok(', 'strtok_r(',
+     _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
     )
 
 
@@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error):
     error: The function to call with any errors found.
   """
   line = clean_lines.elided[linenum]
-  for single_thread_function, multithread_safe_function in threading_list:
-    ix = line.find(single_thread_function)
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
-                                line[ix - 1] not in ('_', '.', '>'))):
+  for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
+    # Additional pattern matching check to confirm that this is the
+    # function we are looking for
+    if Search(pattern, line):
       error(filename, linenum, 'runtime/threadsafe_fn', 2,
-            'Consider using ' + multithread_safe_function +
-            '...) instead of ' + single_thread_function +
+            'Consider using ' + multithread_safe_func +
+            '...) instead of ' + single_thread_func +
             '...) for improved thread safety.')
 
 
@@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error):
           'VLOG() should be used with numeric verbosity level.  '
           'Use LOG() if you want symbolic severity levels.')
 
-
 # Matches invalid increment: *count++, which moves pointer instead of
 # incrementing a value.
 _RE_PATTERN_INVALID_INCREMENT = re.compile(
@@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error):
           'Changing pointer instead of value (or unused value of operator*).')
 
 
+def IsMacroDefinition(clean_lines, linenum):
+  if Search(r'^#define', clean_lines[linenum]):
+    return True
+
+  if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
+    return True
+
+  return False
+
+
+def IsForwardClassDeclaration(clean_lines, linenum):
+  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
+
+
 class _BlockInfo(object):
   """Stores information about a generic block of code."""
 
-  def __init__(self, seen_open_brace):
+  def __init__(self, linenum, seen_open_brace):
+    self.starting_linenum = linenum
     self.seen_open_brace = seen_open_brace
     self.open_parentheses = 0
     self.inline_asm = _NO_ASM
+    self.check_namespace_indentation = False
 
   def CheckBegin(self, filename, clean_lines, linenum, error):
     """Run checks that applies to text up to the opening brace.
@@ -1677,15 +2240,33 @@ class _BlockInfo(object):
     """
     pass
 
+  def IsBlockInfo(self):
+    """Returns true if this block is a _BlockInfo.
+
+    This is convenient for verifying that an object is an instance of
+    a _BlockInfo, but not an instance of any of the derived classes.
+
+    Returns:
+      True for this class, False for derived classes.
+    """
+    return self.__class__ == _BlockInfo
+
+
+class _ExternCInfo(_BlockInfo):
+  """Stores information about an 'extern "C"' block."""
+
+  def __init__(self, linenum):
+    _BlockInfo.__init__(self, linenum, True)
+
 
 class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
   def __init__(self, name, class_or_struct, clean_lines, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name
-    self.starting_linenum = linenum
     self.is_derived = False
+    self.check_namespace_indentation = True
     if class_or_struct == 'struct':
       self.access = 'public'
       self.is_struct = True
@@ -1695,11 +2276,7 @@ class _ClassInfo(_BlockInfo):
 
     # Remember initial indentation level for this class.  Using raw_lines here
     # instead of elided to account for leading comments.
-    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
-    if initial_indent:
-      self.class_indent = len(initial_indent.group(1))
-    else:
-      self.class_indent = 0
+    self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1721,6 +2298,23 @@ class _ClassInfo(_BlockInfo):
       self.is_derived = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
+    # If there is a DISALLOW macro, it should appear near the end of
+    # the class.
+    seen_last_thing_in_class = False
+    for i in xrange(linenum - 1, self.starting_linenum, -1):
+      match = Search(
+          r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' +
+          self.name + r'\)',
+          clean_lines.elided[i])
+      if match:
+        if seen_last_thing_in_class:
+          error(filename, i, 'readability/constructors', 3,
+                match.group(1) + ' should be the last thing in the class')
+        break
+
+      if not Match(r'^\s*$', clean_lines.elided[i]):
+        seen_last_thing_in_class = True
+
     # Check that closing brace is aligned with beginning of the class.
     # Only do this if the closing brace is indented by only whitespaces.
     # This means we will not check single-line class definitions.
@@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo):
   """Stores information about a namespace."""
 
   def __init__(self, name, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name or ''
-    self.starting_linenum = linenum
+    self.check_namespace_indentation = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
     """Check end of namespace comments."""
@@ -1758,7 +2352,7 @@ class _NamespaceInfo(_BlockInfo):
     # deciding what these nontrivial things are, so this check is
     # triggered by namespace size only, which works most of the time.
     if (linenum - self.starting_linenum < 10
-        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+        and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)):
       return
 
     # Look for matching comment at end of namespace.
@@ -1775,17 +2369,24 @@ class _NamespaceInfo(_BlockInfo):
     # expected namespace.
     if self.name:
       # Named namespace
-      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
-                    r'[\*/\.\\\s]*$'),
+      if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' +
+                    re.escape(self.name) + r'[\*/\.\\\s]*$'),
                    line):
         error(filename, linenum, 'readability/namespace', 5,
               'Namespace should be terminated with "// namespace %s"' %
               self.name)
     else:
       # Anonymous namespace
-      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace"')
+      if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        # If "// namespace anonymous" or "// anonymous namespace (more text)",
+        # mention "// anonymous namespace" as an acceptable form
+        if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line):
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"'
+                ' or "// anonymous namespace"')
+        else:
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"')
 
 
 class _PreprocessorInfo(object):
@@ -1802,7 +2403,7 @@ class _PreprocessorInfo(object):
     self.seen_else = False
 
 
-class _NestingState(object):
+class NestingState(object):
   """Holds states related to parsing braces."""
 
   def __init__(self):
@@ -1814,6 +2415,17 @@ class _NestingState(object):
     # - _BlockInfo: some other type of block.
     self.stack = []
 
+    # Top of the previous stack before each Update().
+    #
+    # Because the nesting_stack is updated at the end of each line, we
+    # had to do some convoluted checks to find out what is the current
+    # scope at the beginning of the line.  This check is simplified by
+    # saving the previous top of nesting stack.
+    #
+    # We could save the full stack, but we only need the top.  Copying
+    # the full nesting stack would slow down cpplint by ~10%.
+    self.previous_stack_top = []
+
     # Stack of _PreprocessorInfo objects.
     self.pp_stack = []
 
@@ -1834,6 +2446,82 @@ class _NestingState(object):
     """
     return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
 
+  def InExternC(self):
+    """Check if we are currently one level inside an 'extern "C"' block.
+
+    Returns:
+      True if top of the stack is an extern block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ExternCInfo)
+
+  def InClassDeclaration(self):
+    """Check if we are currently one level inside a class or struct declaration.
+
+    Returns:
+      True if top of the stack is a class/struct, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ClassInfo)
+
+  def InAsmBlock(self):
+    """Check if we are currently one level inside an inline ASM block.
+
+    Returns:
+      True if the top of the stack is a block containing inline ASM.
+    """
+    return self.stack and self.stack[-1].inline_asm != _NO_ASM
+
+  def InTemplateArgumentList(self, clean_lines, linenum, pos):
+    """Check if current position is inside template argument list.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      pos: position just after the suspected template argument.
+    Returns:
+      True if (linenum, pos) is inside template arguments.
+    """
+    while linenum < clean_lines.NumLines():
+      # Find the earliest character that might indicate a template argument
+      line = clean_lines.elided[linenum]
+      match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+      if not match:
+        linenum += 1
+        pos = 0
+        continue
+      token = match.group(1)
+      pos += len(match.group(0))
+
+      # These things do not look like template argument list:
+      #   class Suspect {
+      #   class Suspect x; }
+      if token in ('{', '}', ';'): return False
+
+      # These things look like template argument list:
+      #   template <class Suspect>
+      #   template <class Suspect = default_value>
+      #   template <class Suspect[]>
+      #   template <class Suspect...>
+      if token in ('>', '=', '[', ']', '.'): return True
+
+      # Check if token is an unmatched '<'.
+      # If not, move on to the next character.
+      if token != '<':
+        pos += 1
+        if pos >= len(line):
+          linenum += 1
+          pos = 0
+        continue
+
+      # We can't be sure if we just find a single '<', and need to
+      # find the matching '>'.
+      (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+      if end_pos < 0:
+        # Not sure if template argument list or syntax error in file
+        return False
+      linenum = end_line
+      pos = end_pos
+    return False
+
   def UpdatePreprocessor(self, line):
     """Update preprocessor stack.
 
@@ -1890,6 +2578,7 @@ class _NestingState(object):
         # TODO(unknown): unexpected #endif, issue warning?
         pass
 
+  # TODO(unknown): Update() is too long, but we will refactor later.
   def Update(self, filename, clean_lines, linenum, error):
     """Update nesting state with current line.
 
@@ -1901,7 +2590,17 @@ class _NestingState(object):
     """
     line = clean_lines.elided[linenum]
 
-    # Update pp_stack first
+    # Remember top of the previous nesting stack.
+    #
+    # The stack is always pushed/popped and not modified in place, so
+    # we can just do a shallow copy instead of copy.deepcopy.  Using
+    # deepcopy would slow down cpplint by ~28%.
+    if self.stack:
+      self.previous_stack_top = self.stack[-1]
+    else:
+      self.previous_stack_top = None
+
+    # Update pp_stack
     self.UpdatePreprocessor(line)
 
     # Count parentheses.  This is to avoid adding struct arguments to
@@ -1952,32 +2651,27 @@ class _NestingState(object):
     # such as in:
     #   class LOCKABLE API Object {
     #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class T
-    #             class Comparator = less<T>,
-    #             class Vector = vector<T> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename, typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
     class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+        r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
+        r'(.*)$', line)
     if (class_decl_match and
         (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
+      # We do not want to accept classes that are actually template arguments:
+      #   template <class Ignore1,
+      #             class Ignore2 = Default<Args>,
+      #             template <Args> class Ignore3>
+      #   void Function() {};
+      #
+      # To avoid template argument cases, we scan forward and look for
+      # an unmatched '>'.  If we see one, assume we are inside a
+      # template argument list.
+      end_declaration = len(class_decl_match.group(1))
+      if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration):
+        self.stack.append(_ClassInfo(
+            class_decl_match.group(3), class_decl_match.group(2),
+            clean_lines, linenum))
+        line = class_decl_match.group(4)
 
     # If we have not yet seen the opening brace for the innermost block,
     # run checks here.
@@ -2024,10 +2718,13 @@ class _NestingState(object):
         # stack otherwise.
         if not self.SeenOpenBrace():
           self.stack[-1].seen_open_brace = True
+        elif Match(r'^extern\s*"[^"]*"\s*\{', line):
+          self.stack.append(_ExternCInfo(linenum))
         else:
-          self.stack.append(_BlockInfo(True))
+          self.stack.append(_BlockInfo(linenum, True))
           if _MATCH_ASM.match(line):
             self.stack[-1].inline_asm = _BLOCK_ASM
+
       elif token == ';' or token == ')':
         # If we haven't seen an opening brace yet, but we already saw
         # a semicolon, this is probably a forward declaration.  Pop
@@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
             r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
-          'Storage class (static, extern, typedef, etc) should be first.')
+          'Storage-class specifier (static, extern, typedef, etc) should be '
+          'at the beginning of the declaration.')
 
   if Match(r'\s*#\s*endif\s*[^/\s]+', line):
     error(filename, linenum, 'build/endif_comment', 5,
@@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
 
   # Look for single-argument constructors that aren't marked explicit.
   # Technically a valid construct, but against style.
-  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
-               % re.escape(base_classname),
-               line)
-  if (args and
-      args.group(1) != 'void' and
-      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
-                % re.escape(base_classname), args.group(1).strip())):
-    error(filename, linenum, 'runtime/explicit', 5,
-          'Single-argument constructors should be marked explicit.')
-
-
-def CheckSpacingForFunctionCall(filename, line, linenum, error):
+  explicit_constructor_match = Match(
+      r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
+      r'(?:(?:inline|constexpr)\s+)*%s\s*'
+      r'\(((?:[^()]|\([^()]*\))*)\)'
+      % re.escape(base_classname),
+      line)
+
+  if explicit_constructor_match:
+    is_marked_explicit = explicit_constructor_match.group(1)
+
+    if not explicit_constructor_match.group(2):
+      constructor_args = []
+    else:
+      constructor_args = explicit_constructor_match.group(2).split(',')
+
+    # collapse arguments so that commas in template parameter lists and function
+    # argument parameter lists don't split arguments in two
+    i = 0
+    while i < len(constructor_args):
+      constructor_arg = constructor_args[i]
+      while (constructor_arg.count('<') > constructor_arg.count('>') or
+             constructor_arg.count('(') > constructor_arg.count(')')):
+        constructor_arg += ',' + constructor_args[i + 1]
+        del constructor_args[i + 1]
+      constructor_args[i] = constructor_arg
+      i += 1
+
+    defaulted_args = [arg for arg in constructor_args if '=' in arg]
+    noarg_constructor = (not constructor_args or  # empty arg list
+                         # 'void' arg specifier
+                         (len(constructor_args) == 1 and
+                          constructor_args[0].strip() == 'void'))
+    onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
+                           not noarg_constructor) or
+                          # all but at most one arg defaulted
+                          (len(constructor_args) >= 1 and
+                           not noarg_constructor and
+                           len(defaulted_args) >= len(constructor_args) - 1))
+    initializer_list_constructor = bool(
+        onearg_constructor and
+        Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
+    copy_constructor = bool(
+        onearg_constructor and
+        Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
+              % re.escape(base_classname), constructor_args[0].strip()))
+
+    if (not is_marked_explicit and
+        onearg_constructor and
+        not initializer_list_constructor and
+        not copy_constructor):
+      if defaulted_args:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Constructors callable with one argument '
+              'should be marked explicit.')
+      else:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Single-parameter constructors should be marked explicit.')
+    elif is_marked_explicit and not onearg_constructor:
+      if noarg_constructor:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Zero-parameter constructors should not be marked explicit.')
+
+
+def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
   Args:
     filename: The name of the current file.
-    line: The text of the line to check.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     error: The function to call with any errors found.
   """
+  line = clean_lines.elided[linenum]
 
   # Since function calls often occur inside if/for/while/switch
   # expressions - which have their own, more liberal conventions - we
@@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error):
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall) and
-        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space before ( in function call')
+        not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and
+        not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
+        not Search(r'\bcase\s+\(', fncall)):
+      # TODO(unknown): Space after an operator function seem to be a common
+      # error, silence those for now by restricting them to highest verbosity.
+      if Search(r'\boperator_*\b', line):
+        error(filename, linenum, 'whitespace/parens', 0,
+              'Extra space before ( in function call')
+      else:
+        error(filename, linenum, 'whitespace/parens', 4,
+              'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
     # part of a control statement (if/while/etc), and don't complain
     if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
@@ -2270,12 +3029,26 @@ def IsBlankLine(line):
   return not line or line.isspace()
 
 
+def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                                 error):
+  is_namespace_indent_item = (
+      len(nesting_state.stack) > 1 and
+      nesting_state.stack[-1].check_namespace_indentation and
+      isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and
+      nesting_state.previous_stack_top == nesting_state.stack[-2])
+
+  if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                     clean_lines.elided, line):
+    CheckItemIndentationInNamespace(filename, clean_lines.elided,
+                                    line, error)
+
+
 def CheckForFunctionLengths(filename, clean_lines, linenum,
                             function_state, error):
   """Reports for long function bodies.
 
   For an overview why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+  https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
 
   Uses a simplistic algorithm assuming other style guidelines
   (especially spacing) are followed.
@@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
   """
   lines = clean_lines.lines
   line = lines[linenum]
-  raw = clean_lines.raw_lines
-  raw_line = raw[linenum]
   joined_line = ''
 
   starting_func = False
@@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
 _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
 
 
-def CheckComment(comment, filename, linenum, error):
-  """Checks for common mistakes in TODO comments.
-
-  Args:
-    comment: The text of the comment from the line in question.
-    filename: The name of the current file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  match = _RE_PATTERN_TODO.match(comment)
-  if match:
-    # One whitespace is correct; zero whitespace is handled elsewhere.
-    leading_whitespace = match.group(1)
-    if len(leading_whitespace) > 1:
-      error(filename, linenum, 'whitespace/todo', 2,
-            'Too many spaces before TODO')
-
-    username = match.group(2)
-    if not username:
-      error(filename, linenum, 'readability/todo', 2,
-            'Missing username in TODO; it should look like '
-            '"// TODO(my_username): Stuff."')
-
-    middle_whitespace = match.group(3)
-    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-    if middle_whitespace != ' ' and middle_whitespace != '':
-      error(filename, linenum, 'whitespace/todo', 2,
-            'TODO(my_username) should be followed by a space')
-
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for improper use of DISALLOW* macros.
+def CheckComment(line, filename, linenum, next_line_start, error):
+  """Checks for common mistakes in comments.
 
   Args:
+    line: The line in question.
     filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
+    next_line_start: The first non-whitespace column of the next line.
     error: The function to call with any errors found.
   """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator, but then we will get false
-    # positives for default arguments and other template expressions.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
-  # Most likely the input was incomplete, otherwise we should have
-  # seen a semicolon and returned early.
-  return True
-
-
-def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
-  """Find the corresponding < that started a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_prefix: Part of the current line before the initial >.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_prefix
-  nesting_stack = ['>']
-  while True:
-    # Find the previous operator
-    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(2)
-      line = match.group(1)
-
-      if nesting_stack[-1] == '>':
-        # Expecting opening angle bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator == '<':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma before a bracket, this is most likely a
-          # template argument.  The opening angle bracket is probably
-          # there if we look for it, so just return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting opening parenthesis or opening bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator in ('(', '['):
-          nesting_stack.pop()
-
-    else:
-      # Scan the previous line
-      linenum -= 1
-      if linenum < 0:
-        break
-      line = clean_lines.elided[linenum]
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0:
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
 
-  # Exhausted all earlier lines and still no matching angle bracket.
-  return False
+      # Checks for common mistakes in TODO comments.
+      comment = line[commentpos:]
+      match = _RE_PATTERN_TODO.match(comment)
+      if match:
+        # One whitespace is correct; zero whitespace is handled elsewhere.
+        leading_whitespace = match.group(1)
+        if len(leading_whitespace) > 1:
+          error(filename, linenum, 'whitespace/todo', 2,
+                'Too many spaces before TODO')
+
+        username = match.group(2)
+        if not username:
+          error(filename, linenum, 'readability/todo', 2,
+                'Missing username in TODO; it should look like '
+                '"// TODO(my_username): Stuff."')
+
+        middle_whitespace = match.group(3)
+        # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+        if middle_whitespace != ' ' and middle_whitespace != '':
+          error(filename, linenum, 'whitespace/todo', 2,
+                'TODO(my_username) should be followed by a space')
+
+      # If the comment contains an alphanumeric character, there
+      # should be a space somewhere between it and the // unless
+      # it's a /// or //! Doxygen comment.
+      if (Match(r'//[^ ]*\w', comment) and
+          not Match(r'(///|//\!)(\s+|$)', comment)):
+        error(filename, linenum, 'whitespace/comments', 4,
+              'Should have a space between // and comment')
 
 
 def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
@@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #   }
   #
   # A warning about missing end of namespace comments will be issued instead.
-  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+  #
+  # Also skip blank line checks for 'extern "C"' blocks, which are formatted
+  # like namespaces.
+  if (IsBlankLine(line) and
+      not nesting_state.InNamespaceBody() and
+      not nesting_state.InExternC()):
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
       error(filename, linenum, 'whitespace/blank_line', 3,
             'Do not leave a blank line after "%s:"' % matched.group(1))
 
-  # Next, we complain if there's a comment too near the text
-  commentpos = line.find('//')
-  if commentpos != -1:
-    # Check if the // may be in quotes.  If so, ignore it
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if (line.count('"', 0, commentpos) -
-        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
-      # Allow one space for new scopes, two spaces otherwise:
-      if (not Match(r'^\s*{ //', line) and
-          ((commentpos >= 1 and
-            line[commentpos-1] not in string.whitespace) or
-           (commentpos >= 2 and
-            line[commentpos-2] not in string.whitespace))):
-        error(filename, linenum, 'whitespace/comments', 2,
-              'At least two spaces is best between code and comments')
-      # There should always be a space between the // and the comment
-      commentend = commentpos + 2
-      if commentend < len(line) and not line[commentend] == ' ':
-        # but some lines are exceptions -- e.g. if they're big
-        # comment delimiters like:
-        # //----------------------------------------------------------
-        # or are an empty C++ style Doxygen comment, like:
-        # ///
-        # or C++ style Doxygen comments placed after the variable:
-        # ///<  Header comment
-        # //!<  Header comment
-        # or they begin with multiple slashes followed by a space:
-        # //////// Header comment
-        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
-                 Search(r'^/$', line[commentend:]) or
-                 Search(r'^!< ', line[commentend:]) or
-                 Search(r'^/< ', line[commentend:]) or
-                 Search(r'^/+ ', line[commentend:]))
-        if not match:
-          error(filename, linenum, 'whitespace/comments', 4,
-                'Should have a space between // and comment')
-      CheckComment(line[commentpos:], filename, linenum, error)
-
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  # Don't try to do spacing checks for operator methods
-  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+  # Next, check comments
+  next_line_start = 0
+  if linenum + 1 < clean_lines.NumLines():
+    next_line = raw[linenum + 1]
+    next_line_start = len(next_line) - len(next_line.lstrip())
+  CheckComment(line, filename, linenum, next_line_start, error)
+
+  # get rid of comments and strings
+  line = clean_lines.elided[linenum]
+
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'.
+  if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckOperatorSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around operators.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Don't try to do spacing checks for operator methods.  Do this by
+  # replacing the troublesome characters with something else,
+  # preserving column position for all other characters.
+  #
+  # The replacement is done repeatedly to avoid false positives from
+  # operators that call operators.
+  while True:
+    match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
+    if match:
+      line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
+    else:
+      break
 
   # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
   # Otherwise not.  Note we only check for non-spaces on *both* sides;
   # sometimes people put non-spaces on one side when aligning ='s among
   # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+  if ((Search(r'[\w.]=', line) or
+       Search(r'=[\w.]', line))
+      and not Search(r'\b(if|while|for) ', line)
+      # Operators taken from [lex.operators] in C++11 standard.
+      and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line)
+      and not Search(r'operator=', line)):
     error(filename, linenum, 'whitespace/operators', 4,
           'Missing spaces around =')
 
@@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #
   # Check <= and >= first to avoid false positives with < and >, then
   # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  #
+  # If the operator is followed by a comma, assume it's be used in a
+  # macro context and don't do any checks.  This avoids false
+  # positives.
+  #
+  # Note that && is not included here.  This is because there are too
+  # many false positives due to RValue references.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
   elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
     # Look for < that is not surrounded by spaces.  This is only
     # triggered if both sides are missing spaces, even though
     # technically should should flag if at least one side is missing a
     # space.  This is done to avoid some false positives with shifts.
-    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
-    if (match and
-        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around <')
+    match = Match(r'^(.*[^\s<])<[^\s=<,]', line)
+    if match:
+      (_, _, end_pos) = CloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if end_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around <')
 
     # Look for > that is not surrounded by spaces.  Similar to the
     # above, we only trigger if both sides are missing spaces to avoid
     # false positives with shifts.
-    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
-    if (match and
-        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
-                                             match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around >')
+    match = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
+    if match:
+      (_, _, start_pos) = ReverseCloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if start_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around >')
+
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  #
+  # We also allow operators following an opening parenthesis, since
+  # those tend to be macros that deal with operators.
+  match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line)
+  if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
 
   # We allow no-spaces around >> for almost anything.  This is because
   # C++11 allows ">>" to close nested templates, which accounts for
@@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/operators', 4,
           'Extra space for operator %s' % match.group(1))
 
-  # A pet peeve of mine: no spaces after an if, while, switch, or for
+
+def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around parentheses.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # No spaces after an if, while, switch, or for
   match = Search(r' (if\(|for\(|while\(|switch\()', line)
   if match:
     error(filename, linenum, 'whitespace/parens', 5,
@@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
             'Should have zero or one spaces inside ( and ) in %s' %
             match.group(1))
 
+
+def CheckCommaSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing near commas and semicolons.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  raw = clean_lines.lines_without_raw_strings
+  line = clean_lines.elided[linenum]
+
   # You should always have a space after a comma (either as fn arg or operator)
   #
   # This does not apply when the non-space character following the
@@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   # verify that lines contain missing whitespaces, second pass on raw
   # lines to confirm that those missing whitespaces are not due to
   # elided comments.
-  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+  if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and
+      Search(r',[^,\s]', raw[linenum])):
     error(filename, linenum, 'whitespace/comma', 3,
           'Missing space after ,')
 
@@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/semicolon', 3,
           'Missing space after ;')
 
-  # Next we will look for issues with function calls.
-  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+def _IsType(clean_lines, nesting_state, expr):
+  """Check if expression looks like a type name, returns true if so.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    expr: The expression to check.
+  Returns:
+    True, if token looks like a type.
+  """
+  # Keep only the last token in the expression
+  last_word = Match(r'^.*(\b\S+)$', expr)
+  if last_word:
+    token = last_word.group(1)
+  else:
+    token = expr
+
+  # Match native types and stdint types
+  if _TYPES.match(token):
+    return True
+
+  # Try a bit harder to match templated types.  Walk up the nesting
+  # stack until we find something that resembles a typename
+  # declaration for what we are looking for.
+  typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) +
+                      r'\b')
+  block_index = len(nesting_state.stack) - 1
+  while block_index >= 0:
+    if isinstance(nesting_state.stack[block_index], _NamespaceInfo):
+      return False
+
+    # Found where the opening brace is.  We want to scan from this
+    # line up to the beginning of the function, minus a few lines.
+    #   template <typename Type1,  // stop scanning here
+    #             ...>
+    #   class C
+    #     : public ... {  // start scanning here
+    last_line = nesting_state.stack[block_index].starting_linenum
+
+    next_block_start = 0
+    if block_index > 0:
+      next_block_start = nesting_state.stack[block_index - 1].starting_linenum
+    first_line = last_line
+    while first_line >= next_block_start:
+      if clean_lines.elided[first_line].find('template') >= 0:
+        break
+      first_line -= 1
+    if first_line < next_block_start:
+      # Didn't find any "template" keyword before reaching the next block,
+      # there are probably no template things to check for this block
+      block_index -= 1
+      continue
+
+    # Look for typename in the specified range
+    for i in xrange(first_line, last_line + 1, 1):
+      if Search(typename_pattern, clean_lines.elided[i]):
+        return True
+    block_index -= 1
+
+  return False
+
+
+def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for horizontal spacing near commas.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
 
   # Except after an opening paren, or after another opening brace (in case of
   # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
+  # braces when they are delimiting blocks, classes, namespaces etc.
+  # And since you should never have braces at the beginning of a line,
+  # this is an easy test.  Except that braces used for initialization don't
+  # follow the same rule; we often don't want spaces before those.
+  match = Match(r'^(.*[^ ({>]){', line)
+
   if match:
     # Try a bit harder to check for brace initialization.  This
     # happens in one of the following forms:
@@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     #   LastArgument(..., type{});
     #   LOG(INFO) << type{} << " ...";
     #   map_of_type[{...}] = ...;
+    #   ternary = expr ? new type{} : nullptr;
+    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
     #
     # We check for the character following the closing brace, and
     # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
+    # "{.;,)<>]:".
     #
     # To account for nested initializer list, we allow any number of
     # closing braces up to "{;,)<".  We can't simply silence the
@@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     # There is a false negative with this approach if people inserted
     # spurious semicolons, e.g. "if (cond){};", but we will catch the
     # spurious semicolon with a separate check.
+    leading_text = match.group(1)
     (endline, endlinenum, endpos) = CloseExpression(
         clean_lines, linenum, len(match.group(1)))
     trailing_text = ''
@@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     for offset in xrange(endlinenum + 1,
                          min(endlinenum + 3, clean_lines.NumLines() - 1)):
       trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+    # We also suppress warnings for `uint64_t{expression}` etc., as the style
+    # guide recommends brace initialization for integral types to avoid
+    # overflow/truncation.
+    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
+        and not _IsType(clean_lines, nesting_state, leading_text)):
       error(filename, linenum, 'whitespace/braces', 5,
             'Missing space before {')
 
@@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/braces', 5,
           'Missing space before else')
 
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
   # You shouldn't have a space before a semicolon at the end of the line.
   # There's a special case for "for" since the style guide allows space before
   # the semicolon there.
@@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
           'Extra space before last semicolon. If this should be an empty '
           'statement, use {} instead.')
 
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
-  if (Search('for *\(.*[^:]:[^: ]', line) or
-      Search('for *\(.*[^: ]:[^:]', line)):
-    error(filename, linenum, 'whitespace/forcolon', 2,
-          'Missing space around colon in range-based for loop')
+
+def IsDecltype(clean_lines, linenum, column):
+  """Check if the token ending on (linenum, column) is decltype().
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: the number of the line to check.
+    column: end column of the token to check.
+  Returns:
+    True if this token is decltype() expression, False otherwise.
+  """
+  (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column)
+  if start_col < 0:
+    return False
+  if Search(r'\bdecltype\s*$', text[0:start_col]):
+    return True
+  return False
 
 
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
@@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error):
     # used for brace initializers inside function calls.  We don't detect this
     # perfectly: we just don't complain if the last non-whitespace character on
     # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-    # previous line starts a preprocessor block.
+    # previous line starts a preprocessor block. We also allow a brace on the
+    # following line if it is part of an array initialization and would not fit
+    # within the 80 character limit of the preceding line.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if (not Search(r'[,;:}{(]\s*$', prevline) and
-        not Match(r'\s*#', prevline)):
+        not Match(r'\s*#', prevline) and
+        not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
   # An else clause should be on the same line as the preceding closing brace.
-  if Match(r'\s*else\s*', line):
+  if Match(r'\s*else\b\s*(?:if\b|\{|$)', line):
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if Match(r'\s*}\s*$', prevline):
       error(filename, linenum, 'whitespace/newline', 4,
@@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error):
 
   # If braces come on one side of an else, they should be on both.
   # However, we have to worry about "else if" that spans multiple lines!
-  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
-      # find the ( after the if
-      pos = line.find('else if')
-      pos = line.find('(', pos)
-      if pos > 0:
-        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-        if endline[endpos:].find('{') == -1:    # must be brace after if
-          error(filename, linenum, 'readability/braces', 5,
-                'If an else has a brace on one side, it should have it on both')
-    else:            # common case: else not followed by a multi-line if
-      error(filename, linenum, 'readability/braces', 5,
-            'If an else has a brace on one side, it should have it on both')
+  if Search(r'else if\s*\(', line):       # could be multi-line if
+    brace_on_left = bool(Search(r'}\s*else if\s*\(', line))
+    # find the ( after the if
+    pos = line.find('else if')
+    pos = line.find('(', pos)
+    if pos > 0:
+      (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+      brace_on_right = endline[endpos:].find('{') != -1
+      if brace_on_left != brace_on_right:    # must be brace after if
+        error(filename, linenum, 'readability/braces', 5,
+              'If an else has a brace on one side, it should have it on both')
+  elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    error(filename, linenum, 'readability/braces', 5,
+          'If an else has a brace on one side, it should have it on both')
 
   # Likewise, an else should never have the else clause on the same line
   if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
@@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error):
     error(filename, linenum, 'whitespace/newline', 4,
           'do/while clauses should not be on a single line')
 
+  # Check single-line if/else bodies. The style guide says 'curly braces are not
+  # required for single-line statements'. We additionally allow multi-line,
+  # single statements, but we reject anything with more than one semicolon in
+  # it. This means that the first semicolon after the if should be at the end of
+  # its line, and the line after that should have an indent level equal to or
+  # lower than the if. We also check for ambiguous if/else nesting without
+  # braces.
+  if_else_match = Search(r'\b(if\s*\(|else\b)', line)
+  if if_else_match and not Match(r'\s*#', line):
+    if_indent = GetIndentLevel(line)
+    endline, endlinenum, endpos = line, linenum, if_else_match.end()
+    if_match = Search(r'\bif\s*\(', line)
+    if if_match:
+      # This could be a multiline if condition, so find the end first.
+      pos = if_match.end() - 1
+      (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos)
+    # Check for an opening brace, either directly after the if or on the next
+    # line. If found, this isn't a single-statement conditional.
+    if (not Match(r'\s*{', endline[endpos:])
+        and not (Match(r'\s*$', endline[endpos:])
+                 and endlinenum < (len(clean_lines.elided) - 1)
+                 and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))):
+      while (endlinenum < len(clean_lines.elided)
+             and ';' not in clean_lines.elided[endlinenum][endpos:]):
+        endlinenum += 1
+        endpos = 0
+      if endlinenum < len(clean_lines.elided):
+        endline = clean_lines.elided[endlinenum]
+        # We allow a mix of whitespace and closing braces (e.g. for one-liner
+        # methods) and a single \ after the semicolon (for macros)
+        endpos = endline.find(';')
+        if not Match(r';[\s}]*(\\?)$', endline[endpos:]):
+          # Semicolon isn't the last character, there's something trailing.
+          # Output a warning if the semicolon is not contained inside
+          # a lambda expression.
+          if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
+                       endline):
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+        elif endlinenum < len(clean_lines.elided) - 1:
+          # Make sure the next line is dedented
+          next_line = clean_lines.elided[endlinenum + 1]
+          next_indent = GetIndentLevel(next_line)
+          # With ambiguous nested if statements, this will error out on the
+          # if that *doesn't* match the else, regardless of whether it's the
+          # inner one or outer one.
+          if (if_match and Match(r'\s*else\b', next_line)
+              and next_indent != if_indent):
+            error(filename, linenum, 'readability/braces', 4,
+                  'Else clause should be indented at the same level as if. '
+                  'Ambiguous nested if/else chains require braces.')
+          elif next_indent > if_indent:
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+
+
+def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
+  """Looks for redundant trailing semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]
+
   # Block bodies should not be followed by a semicolon.  Due to C++11
   # brace initialization, there are more places where semicolons are
-  # required than not, so we use a whitelist approach to check these
-  # rather than a blacklist.  These are the places where "};" should
-  # be replaced by just "}":
+  # required than not, so we explicitly list the allowed rules rather
+  # than listing the disallowed ones.  These are the places where "};"
+  # should be replaced by just "}":
   # 1. Some flavor of block following closing parenthesis:
   #    for (;;) {};
   #    while (...) {};
@@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error):
     #  - INTERFACE_DEF
     #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
     #
-    # We implement a whitelist of safe macros instead of a blacklist of
+    # We implement a list of safe macros instead of a list of
     # unsafe macros, even though the latter appears less frequently in
     # google code and would have been easier to implement.  This is because
-    # the downside for getting the whitelist wrong means some extra
-    # semicolons, while the downside for getting the blacklist wrong
+    # the downside for getting the allowed checks wrong means some extra
+    # semicolons, while the downside for getting disallowed checks wrong
     # would result in compile errors.
     #
-    # In addition to macros, we also don't want to warn on compound
-    # literals.
+    # In addition to macros, we also don't want to warn on
+    #  - Compound literals
+    #  - Lambdas
+    #  - alignas specifier with anonymous structs
+    #  - decltype
     closing_brace_pos = match.group(1).rfind(')')
     opening_parenthesis = ReverseCloseExpression(
         clean_lines, linenum, closing_brace_pos)
     if opening_parenthesis[2] > -1:
       line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix)
+      func = Match(r'^(.*\])\s*$', line_prefix)
       if ((macro and
            macro.group(1) not in (
                'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
                'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
                'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
+          Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
+          Search(r'\bdecltype$', line_prefix) or
           Search(r'\s+=\s*$', line_prefix)):
         match = None
+    if (match and
+        opening_parenthesis[1] > 1 and
+        Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
+      # Multi-line lambda-expression
+      match = None
 
   else:
     # Try matching cases 2-3.
@@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error):
       # outputting warnings for the matching closing brace, if there are
       # nested blocks with trailing semicolons, we will get the error
       # messages in reversed order.
+
+      # We need to check the line forward for NOLINT
+      raw_lines = clean_lines.raw_lines
+      ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1,
+                              error)
+      ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum,
+                              error)
+
       error(filename, endlinenum, 'readability/braces', 4,
             "You don't need a ; after a }")
 
@@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
   line = clean_lines.elided[linenum]
   matched = Match(r'\s*(for|while|if)\s*\(', line)
   if matched:
-    # Find the end of the conditional expression
+    # Find the end of the conditional expression.
     (end_line, end_linenum, end_pos) = CloseExpression(
         clean_lines, linenum, line.find('('))
 
@@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
         error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
               'Empty loop bodies should use {} or continue')
 
+    # Check for if statements that have completely empty bodies (no comments)
+    # and no else clauses.
+    if end_pos >= 0 and matched.group(1) == 'if':
+      # Find the position of the opening { for the if statement.
+      # Return without logging an error if it has no brackets.
+      opening_linenum = end_linenum
+      opening_line_fragment = end_line[end_pos:]
+      # Loop until EOF or find anything that's not whitespace or opening {.
+      while not Search(r'^\s*\{', opening_line_fragment):
+        if Search(r'^(?!\s*$)', opening_line_fragment):
+          # Conditional has no brackets.
+          return
+        opening_linenum += 1
+        if opening_linenum == len(clean_lines.elided):
+          # Couldn't find conditional's opening { or any code before EOF.
+          return
+        opening_line_fragment = clean_lines.elided[opening_linenum]
+      # Set opening_line (opening_line_fragment may not be entire opening line).
+      opening_line = clean_lines.elided[opening_linenum]
+
+      # Find the position of the closing }.
+      opening_pos = opening_line_fragment.find('{')
+      if opening_linenum == end_linenum:
+        # We need to make opening_pos relative to the start of the entire line.
+        opening_pos += end_pos
+      (closing_line, closing_linenum, closing_pos) = CloseExpression(
+          clean_lines, opening_linenum, opening_pos)
+      if closing_pos < 0:
+        return
+
+      # Now construct the body of the conditional. This consists of the portion
+      # of the opening line after the {, all lines until the closing line,
+      # and the portion of the closing line before the }.
+      if (clean_lines.raw_lines[opening_linenum] !=
+          CleanseComments(clean_lines.raw_lines[opening_linenum])):
+        # Opening line ends with a comment, so conditional isn't empty.
+        return
+      if closing_linenum > opening_linenum:
+        # Opening line after the {. Ignore comments here since we checked above.
+        body = list(opening_line[opening_pos+1:])
+        # All lines until closing line, excluding closing line, with comments.
+        body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum])
+        # Closing line before the }. Won't (and can't) have comments.
+        body.append(clean_lines.elided[closing_linenum][:closing_pos-1])
+        body = '\n'.join(body)
+      else:
+        # If statement has brackets and fits on a single line.
+        body = opening_line[opening_pos+1:closing_pos-1]
+
+      # Check if the body is empty
+      if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body):
+        return
+      # The body is empty. Now make sure there's not an else clause.
+      current_linenum = closing_linenum
+      current_line_fragment = closing_line[closing_pos:]
+      # Loop until EOF or find anything that's not whitespace or else clause.
+      while Search(r'^\s*$|^(?=\s*else)', current_line_fragment):
+        if Search(r'^(?=\s*else)', current_line_fragment):
+          # Found an else clause, so don't log an error.
+          return
+        current_linenum += 1
+        if current_linenum == len(clean_lines.elided):
+          break
+        current_line_fragment = clean_lines.elided[current_linenum]
+
+      # The body is empty and there's no else clause until EOF or other code.
+      error(filename, end_linenum, 'whitespace/empty_if_body', 4,
+            ('If statement had no body and no else clause'))
+
+
+def FindCheckMacro(line):
+  """Find a replaceable CHECK-like macro.
+
+  Args:
+    line: line to search on.
+  Returns:
+    (macro name, start position), or (None, -1) if no replaceable
+    macro is found.
+  """
+  for macro in _CHECK_MACROS:
+    i = line.find(macro)
+    if i >= 0:
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + macro + r'\s*)\(', line)
+      if not matched:
+        continue
+      return (macro, len(matched.group(1)))
+  return (None, -1)
+
 
 def CheckCheck(filename, clean_lines, linenum, error):
   """Checks the use of CHECK and EXPECT macros.
@@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error):
 
   # Decide the set of replacement macros that should be suggested
   lines = clean_lines.elided
-  check_macro = None
-  start_pos = -1
-  for macro in _CHECK_MACROS:
-    i = lines[linenum].find(macro)
-    if i >= 0:
-      check_macro = macro
-
-      # Find opening parenthesis.  Do a regular expression match here
-      # to make sure that we are matching the expected CHECK macro, as
-      # opposed to some other macro that happens to contain the CHECK
-      # substring.
-      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
-      if not matched:
-        continue
-      start_pos = len(matched.group(1))
-      break
-  if not check_macro or start_pos < 0:
-    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+  (check_macro, start_pos) = FindCheckMacro(lines[linenum])
+  if not check_macro:
     return
 
   # Find end of the boolean expression by matching parentheses
@@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error):
       clean_lines, linenum, start_pos)
   if end_pos < 0:
     return
+
+  # If the check macro is followed by something other than a
+  # semicolon, assume users will log their own custom error messages
+  # and don't suggest any replacements.
+  if not Match(r'\s*;', last_line[end_pos:]):
+    return
+
   if linenum == end_line:
     expression = lines[linenum][start_pos + 1:end_pos - 1]
   else:
@@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error):
       if token == '(':
         # Parenthesized operand
         expression = matched.group(2)
-        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        (end, _) = FindEndOfExpressionInLine(expression, 0, ['('])
         if end < 0:
           return  # Unmatched parenthesis
         lhs += '(' + expression[0:end]
@@ -3339,6 +4292,16 @@ def GetLineWidth(line):
       if unicodedata.east_asian_width(uc) in ('W', 'F'):
         width += 2
       elif not unicodedata.combining(uc):
+        # Issue 337
+        # https://mail.python.org/pipermail/python-list/2012-August/628809.html
+        if (sys.version_info.major, sys.version_info.minor) <= (3, 2):
+          # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81
+          is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4
+          # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564
+          is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF
+          if not is_wide_build and is_low_surrogate:
+            width -= 1
+
         width += 1
     return width
   else:
@@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # raw strings,
   raw_lines = clean_lines.lines_without_raw_strings
   line = raw_lines[linenum]
+  prev = raw_lines[linenum - 1] if linenum > 0 else ''
 
   if line.find('\t') != -1:
     error(filename, linenum, 'whitespace/tab', 1,
@@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # if(match($0, " <<")) complain = 0;
   # if(match(prev, " +for \\(")) complain = 0;
   # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
+  classinfo = nesting_state.InnermostClass()
   initial_spaces = 0
   cleansed_line = clean_lines.elided[linenum]
   while initial_spaces < len(line) and line[initial_spaces] == ' ':
     initial_spaces += 1
-  if line and line[-1].isspace():
-    error(filename, linenum, 'whitespace/end_of_line', 4,
-          'Line ends in whitespace.  Consider deleting these extra spaces.')
-  # There are certain situations we allow one space, notably for section labels
-  elif ((initial_spaces == 1 or initial_spaces == 3) and
-        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+  # There are certain situations we allow one space, notably for
+  # section labels, and also lines containing multi-line raw strings.
+  # We also don't check for lines that look like continuation lines
+  # (of lines ending in double quotes, commas, equals, or angle brackets)
+  # because the rules for how to indent those are non-trivial.
+  if (not Search(r'[",=><] *$', prev) and
+      (initial_spaces == 1 or initial_spaces == 3) and
+      not Match(scope_or_label_pattern, cleansed_line) and
+      not (clean_lines.raw_lines[linenum] != line and
+           Match(r'^\s*""', line))):
     error(filename, linenum, 'whitespace/indent', 3,
           'Weird number of spaces at line-start.  '
           'Are you using a 2-space indent?')
 
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+
   # Check if the line is a header guard.
   is_header_guard = False
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     cppvar = GetHeaderGuardCPPVariable(filename)
     if (line.startswith('#ifndef %s' % cppvar) or
         line.startswith('#define %s' % cppvar) or
@@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # developers fault.
   if (not line.startswith('#include') and not is_header_guard and
       not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^\s*//\s*[^\s]*$', line) and
       not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
     line_width = GetLineWidth(line)
-    extended_length = int((_line_length * 1.25))
-    if line_width > extended_length:
-      error(filename, linenum, 'whitespace/line_length', 4,
-            'Lines should very rarely be longer than %i characters' %
-            extended_length)
-    elif line_width > _line_length:
+    if line_width > _line_length:
       error(filename, linenum, 'whitespace/line_length', 2,
             'Lines should be <= %i characters long' % _line_length)
 
@@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
+  CheckTrailingSemicolon(filename, clean_lines, linenum, error)
   CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
   CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckOperatorSpacing(filename, clean_lines, linenum, error)
+  CheckParenthesisSpacing(filename, clean_lines, linenum, error)
+  CheckCommaSpacing(filename, clean_lines, linenum, error)
+  CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
   CheckCheck(filename, clean_lines, linenum, error)
   CheckAltTokens(filename, clean_lines, linenum, error)
   classinfo = nesting_state.InnermostClass()
@@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
-_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
 _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
 # Matches the first component of a filename delimited by -s and _s. That is:
 #  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
@@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename):
   return os.path.splitext(filename)[0]
 
 
-def _IsTestFilename(filename):
-  """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-  if (filename.endswith('_test.cc') or
-      filename.endswith('_unittest.cc') or
-      filename.endswith('_regtest.cc')):
-    return True
-  else:
-    return False
-
-
 def _ClassifyInclude(fileinfo, include, is_system):
   """Figures out what kind of header 'include' is.
 
@@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
     error: The function to call with any errors found.
   """
   fileinfo = FileInfo(filename)
-
   line = clean_lines.lines[linenum]
 
   # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+  # Only do this check if the included header follows google naming
+  # conventions.  If not, assume that it's a 3rd party API that
+  # requires special include conventions.
+  #
+  # We also make an exception for Lua headers, which follow google
+  # naming convention but not the include convention.
+  match = Match(r'#include\s*"([^/]+\.h)"', line)
+  if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)):
     error(filename, linenum, 'build/include', 4,
           'Include the directory when naming .h files')
 
@@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
   if match:
     include = match.group(2)
     is_system = (match.group(1) == '<')
-    if include in include_state:
+    duplicate_line = include_state.FindHeader(include)
+    if duplicate_line >= 0:
       error(filename, linenum, 'build/include', 4,
             '"%s" already included at %s:%s' %
-            (include, filename, include_state[include]))
-    else:
-      include_state[include] = linenum
+            (include, filename, duplicate_line))
+    elif (include.endswith('.cc') and
+          os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)):
+      error(filename, linenum, 'build/include', 4,
+            'Do not include .cc files from other packages')
+    elif not _THIRD_PARTY_HEADERS_PATTERN.match(include):
+      include_state.include_list[-1].append((include, linenum))
 
       # We want to ensure that headers appear in the right order:
       # 1) for foo.cc, foo.h  (preferred location)
@@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
               'Include "%s" not in alphabetical order' % include)
       include_state.SetLastHeader(canonical_include)
 
-  # Look for any of the stream classes that are part of standard C++.
-  match = _RE_PATTERN_INCLUDE.match(line)
-  if match:
-    include = match.group(2)
-    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
-      # Many unit tests use cout, so we exempt them.
-      if not _IsTestFilename(filename):
-        error(filename, linenum, 'readability/streams', 3,
-              'Streams are highly discouraged.')
 
 
 def _GetTextInside(text, start_pattern):
@@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern):
     The extracted text.
     None if either the opening string or ending punctuation could not be found.
   """
-  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # TODO(unknown): Audit cpplint.py to see what places could be profitably
   # rewritten to use _GetTextInside (and use inferior regexp matching today).
 
   # Give opening punctuations to get the matching close-punctuations.
@@ -3718,6 +4677,9 @@ _RE_PATTERN_REF_PARAM = re.compile(
 _RE_PATTERN_CONST_REF_PARAM = (
     r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
     r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+# Stream types.
+_RE_PATTERN_REF_STREAM_PARAM = (
+    r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')')
 
 
 def CheckLanguage(filename, clean_lines, linenum, file_extension,
@@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
     include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
 
   # Reset include state across preprocessor directives.  This is meant
   # to silence warnings for conditional includes.
-  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
-    include_state.ResetSection()
+  match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
+  if match:
+    include_state.ResetSection(match.group(1))
 
   # Make Windows paths like Unix.
   fullname = os.path.abspath(filename).replace('\\', '/')
 
-  # TODO(unknown): figure out if they're using default arguments in fn proto.
+  # Perform other checks now that we are sure that this is not an include line
+  CheckCasts(filename, clean_lines, linenum, error)
+  CheckGlobalStatic(filename, clean_lines, linenum, error)
+  CheckPrintf(filename, clean_lines, linenum, error)
 
-  # Check to see if they're using an conversion function cast.
-  # I just try to capture the most common basic types, though there are more.
-  # Parameterless conversion functions, such as bool(), are allowed as they are
-  # probably a member operator declaration or default constructor.
-  match = Search(
-      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
-      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-      r'(\([^)].*)', line)
-  if match:
-    matched_new = match.group(1)
-    matched_type = match.group(2)
-    matched_funcptr = match.group(3)
-
-    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
-    # where type may be float(), int(string), etc.  Without context they are
-    # virtually indistinguishable from int(x) casts. Likewise, gMock's
-    # MockCallback takes a template parameter of the form return_type(arg_type),
-    # which looks much like the cast we're trying to detect.
-    #
-    # std::function<> wrapper has a similar problem.
-    #
-    # Return types for function pointers also look like casts if they
-    # don't have an extra space.
-    if (matched_new is None and  # If new operator, then this isn't a cast
-        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Search(r'\bMockCallback<.*>', line) or
-             Search(r'\bstd::function<.*>', line)) and
-        not (matched_funcptr and
-             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                   matched_funcptr))):
-      # Try a bit harder to catch gmock lines: the only place where
-      # something looks like an old-style cast is where we declare the
-      # return type of the mocked method, and the only time when we
-      # are missing context is if MOCK_METHOD was split across
-      # multiple lines.  The missing MOCK_METHOD is usually one or two
-      # lines back, so scan back one or two lines.
-      #
-      # It's not possible for gmock macros to appear in the first 2
-      # lines, since the class head + section name takes up 2 lines.
-      if (linenum < 2 or
-          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                     clean_lines.elided[linenum - 1]) or
-               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                     clean_lines.elided[linenum - 2]))):
-        error(filename, linenum, 'readability/casting', 4,
-              'Using deprecated casting style.  '
-              'Use static_cast<%s>(...) instead' %
-              matched_type)
-
-  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                  'static_cast',
-                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
-
-  # This doesn't catch all cases. Consider (const char * const)"hello".
-  #
-  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-  # compile).
-  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
-    pass
-  else:
-    # Check pointer casts for other than string constants
-    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
-
-  # In addition, we look for people taking the address of a cast.  This
-  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-  # point where you think.
-  match = Search(
-      r'(?:&\(([^)]+)\)[\w(])|'
-      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
-  if match and match.group(1) != '*':
-    error(filename, linenum, 'runtime/casting', 4,
-          ('Are you taking an address of a cast?  '
-           'This is dangerous: could be a temp var.  '
-           'Take the address before doing the cast, rather than after'))
-
-  # Create an extended_line, which is the concatenation of the current and
-  # next lines, for more effective checking of code that may span more than one
-  # line.
-  if linenum + 1 < clean_lines.NumLines():
-    extended_line = line + clean_lines.elided[linenum + 1]
-  else:
-    extended_line = line
-
-  # Check for people declaring static/global STL strings at the top level.
-  # This is dangerous because the C++ language does not guarantee that
-  # globals with constructors are initialized before the first access.
-  match = Match(
-      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-      line)
-  # Make sure it's not a function.
-  # Function template specialization looks like: "string foo<Type>(...".
-  # Class template definitions look like: "string Foo<Type>::Method(...".
-  #
-  # Also ignore things that look like operators.  These are matched separately
-  # because operator names cross non-word boundaries.  If we change the pattern
-  # above, we would decrease the accuracy of matching identifiers.
-  if (match and
-      not Search(r'\boperator\W', line) and
-      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
-    error(filename, linenum, 'runtime/string', 4,
-          'For a static/global string constant, use a C style string instead: '
-          '"%schar %s[]".' %
-          (match.group(1), match.group(2)))
-
-  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-    error(filename, linenum, 'runtime/init', 4,
-          'You seem to be initializing a member variable with itself.')
-
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     # TODO(unknown): check that 1-arg constructors are explicit.
     #                How to tell it's a constructor?
     #                (handled in CheckForNonStandardConstructs for now)
-    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    # TODO(unknown): check that classes declare or disable copy/assign
     #                (level 1 error)
     pass
 
@@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
       error(filename, linenum, 'runtime/int', 4,
             'Use int16/int64/etc, rather than the C type %s' % match.group(1))
 
-  # When snprintf is used, the second argument shouldn't be a literal.
-  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-  if match and match.group(2) != '0':
-    # If 2nd arg is zero, snprintf is used to calculate size.
-    error(filename, linenum, 'runtime/printf', 3,
-          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-          'to snprintf.' % (match.group(1), match.group(2)))
-
-  # Check if some verboten C functions are being used.
-  if Search(r'\bsprintf\b', line):
-    error(filename, linenum, 'runtime/printf', 5,
-          'Never use sprintf.  Use snprintf instead.')
-  match = Search(r'\b(strcpy|strcat)\b', line)
-  if match:
-    error(filename, linenum, 'runtime/printf', 4,
-          'Almost always, snprintf is better than %s' % match.group(1))
-
   # Check if some verboten operator overloading is going on
   # TODO(unknown): catch out-of-line unary operator&:
   #   class X {};
@@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
   # Check for potential format string bugs like printf(foo).
   # We constrain the pattern not to pick things like DocidForPrintf(foo).
   # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # TODO(unknown): Catch the following case. Need to change the calling
   # convention of the whole function to process multiple line to handle it.
   #   printf(
   #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
@@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
             'Do not use variable-length arrays.  Use an appropriately named '
             "('k' followed by CamelCase) compile-time constant for the size.")
 
-  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
-  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
-  # in the class declaration.
-  match = Match(
-      (r'\s*'
-       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
-       r'\(.*\);$'),
-      line)
-  if match and linenum + 1 < clean_lines.NumLines():
-    next_line = clean_lines.elided[linenum + 1]
-    # We allow some, but not all, declarations of variables to be present
-    # in the statement that defines the class.  The [\w\*,\s]* fragment of
-    # the regular expression below allows users to declare instances of
-    # the class or pointers to instances, but not less common types such
-    # as function pointers or arrays.  It's a tradeoff between allowing
-    # reasonable code and avoiding trying to parse more C++ using regexps.
-    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
-      error(filename, linenum, 'readability/constructors', 3,
-            match.group(1) + ' should be the last thing in the class')
-
   # Check for use of unnamed namespaces in header files.  Registration
   # macros are typically OK, so we allow use of "namespace {" on lines
   # that end with backslashes.
-  if (file_extension == 'h'
+  if (IsHeaderExtension(file_extension)
       and Search(r'\bnamespace\s*{', line)
       and line[-1] != '\\'):
     error(filename, linenum, 'build/namespaces', 4,
           'Do not use unnamed namespaces in header files.  See '
-          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
           ' for more information.')
 
+
+def CheckGlobalStatic(filename, clean_lines, linenum, error):
+  """Check for unsafe global or static objects.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Match two lines at a time to support multiline declarations
+  if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line):
+    line += clean_lines.elided[linenum + 1].strip()
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access, and
+  # also because globals can be destroyed when some threads are still running.
+  # TODO(unknown): Generalize this to also find static unique_ptr instances.
+  # TODO(unknown): File bugs for clang-tidy to find these.
+  match = Match(
+      r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +'
+      r'([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+
+  # Remove false positives:
+  # - String pointers (as opposed to values).
+  #    string *pointer
+  #    const string *pointer
+  #    string const *pointer
+  #    string *const pointer
+  #
+  # - Functions and template specializations.
+  #    string Function<Type>(...
+  #    string Class<Type>::Method(...
+  #
+  # - Operators.  These are matched separately because operator names
+  #   cross non-word boundaries, and trying to match both operators
+  #   and functions at the same time would decrease accuracy of
+  #   matching identifiers.
+  #    string Class::operator*()
+  if (match and
+      not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))):
+    if Search(r'\bconst\b', line):
+      error(filename, linenum, 'runtime/string', 4,
+            'For a static/global string constant, use a C style string '
+            'instead: "%schar%s %s[]".' %
+            (match.group(1), match.group(2) or '', match.group(3)))
+    else:
+      error(filename, linenum, 'runtime/string', 4,
+            'Static/global string variables are not permitted.')
+
+  if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or
+      Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+
+def CheckPrintf(filename, clean_lines, linenum, error):
+  """Check for printf related issues.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\s*\(', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf. Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\s*\(', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+
+def IsDerivedFunction(clean_lines, linenum):
+  """Check if current line contains an inherited function.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains a function with "override"
+    virt-specifier.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
+    if match:
+      # Look for "override" after the matching closing parenthesis
+      line, _, closing_paren = CloseExpression(
+          clean_lines, i, len(match.group(1)))
+      return (closing_paren >= 0 and
+              Search(r'\boverride\b', line[closing_paren:]))
+  return False
+
+
+def IsOutOfLineMethodDefinition(clean_lines, linenum):
+  """Check if current line contains an out-of-line method definition.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains an out-of-line method definition.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]):
+      return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None
+  return False
+
+
+def IsInitializerList(clean_lines, linenum):
+  """Check if current line is inside constructor initializer list.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line appears to be inside constructor initializer
+    list, False otherwise.
+  """
+  for i in xrange(linenum, 1, -1):
+    line = clean_lines.elided[i]
+    if i == linenum:
+      remove_function_body = Match(r'^(.*)\{\s*$', line)
+      if remove_function_body:
+        line = remove_function_body.group(1)
+
+    if Search(r'\s:\s*\w+[({]', line):
+      # A lone colon tend to indicate the start of a constructor
+      # initializer list.  It could also be a ternary operator, which
+      # also tend to appear in constructor initializer lists as
+      # opposed to parameter lists.
+      return True
+    if Search(r'\}\s*,\s*$', line):
+      # A closing brace followed by a comma is probably the end of a
+      # brace-initialized member in constructor initializer list.
+      return True
+    if Search(r'[{};]\s*$', line):
+      # Found one of the following:
+      # - A closing brace or semicolon, probably the end of the previous
+      #   function.
+      # - An opening brace, probably the start of current class or namespace.
+      #
+      # Current line is probably not inside an initializer list since
+      # we saw one of those things without seeing the starting colon.
+      return False
+
+  # Got to the beginning of the file without seeing the start of
+  # constructor initializer list.
+  return False
+
+
 def CheckForNonConstReference(filename, clean_lines, linenum,
                               nesting_state, error):
   """Check for non-const references.
@@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   if '&' not in line:
     return
 
+  # If a function is inherited, current function doesn't have much of
+  # a choice, so any non-const references should not be blamed on
+  # derived function.
+  if IsDerivedFunction(clean_lines, linenum):
+    return
+
+  # Don't warn on out-of-line method definitions, as we would warn on the
+  # in-line declaration, if it isn't marked with 'override'.
+  if IsOutOfLineMethodDefinition(clean_lines, linenum):
+    return
+
   # Long type names may be broken across multiple lines, usually in one
   # of these forms:
   #   LongType
@@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   #   inside declarators: reference parameter
   # We will exclude the first two cases by checking that we are not inside a
   # function body, including one that was just introduced by a trailing '{'.
-  # TODO(unknwon): Doesn't account for preprocessor directives.
   # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-  check_params = False
-  if not nesting_state.stack:
-    check_params = True  # top level
-  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
-        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-    check_params = True  # within class or namespace
-  elif Match(r'.*{\s*$', line):
-    if (len(nesting_state.stack) == 1 or
-        isinstance(nesting_state.stack[-2], _ClassInfo) or
-        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
-      check_params = True  # just opened global/class/namespace block
+  if (nesting_state.previous_stack_top and
+      not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
+           isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
+    # Not at toplevel, not within a class, and not within a namespace
+    return
+
+  # Avoid initializer lists.  We only need to scan back from the
+  # current line for something that starts with ':'.
+  #
+  # We don't need to check the current line, since the '&' would
+  # appear inside the second set of parentheses on the current line as
+  # opposed to the first set.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 10), -1):
+      previous_line = clean_lines.elided[i]
+      if not Search(r'[),]\s*$', previous_line):
+        break
+      if Match(r'^\s*:\s+\S', previous_line):
+        return
+
+  # Avoid preprocessors
+  if Search(r'\\\s*$', line):
+    return
+
+  # Avoid constructor initializer lists
+  if IsInitializerList(clean_lines, linenum):
+    return
+
   # We allow non-const references in a few standard places, like functions
   # called "swap()" or iostream operators like "<<" or ">>".  Do not check
   # those function parameters.
   #
   # We also accept & in static_assert, which looks like a function but
   # it's actually a declaration expression.
-  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+  allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
                            r'operator\s*[<>][<>]|'
                            r'static_assert|COMPILE_ASSERT'
                            r')\s*\(')
-  if Search(whitelisted_functions, line):
-    check_params = False
+  if Search(allowed_functions, line):
+    return
   elif not Search(r'\S+\([^)]*$', line):
-    # Don't see a whitelisted function on this line.  Actually we
+    # Don't see an allowed function on this line.  Actually we
     # didn't see any function name on this line, so this is likely a
     # multi-line parameter list.  Try a bit harder to catch this case.
     for i in xrange(2):
       if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
+          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
+        return
+
+  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
+        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
+      error(filename, linenum, 'runtime/references', 2,
+            'Is this a non-const reference? '
+            'If so, make const or use a pointer: ' +
+            ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCasts(filename, clean_lines, linenum, error):
+  """Various cast related checks.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Check to see if they're using an conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+  if match and not expecting_function:
+    matched_type = match.group(2)
+
+    # matched_new_or_template is used to silence two false positives:
+    # - New operators
+    # - Template arguments with function types
+    #
+    # For template arguments, we match on types immediately following
+    # an opening bracket without any spaces.  This is a fast way to
+    # silence the common case where the function type is the first
+    # template argument.  False negative with less-than comparison is
+    # avoided because those operators are usually followed by a space.
+    #
+    #   function<double(double)>   // bracket + no space = false positive
+    #   value < double(42)         // bracket + space = true positive
+    matched_new_or_template = match.group(1)
+
+    # Avoid arrays by looking for brackets that come after the closing
+    # parenthesis.
+    if Match(r'\([^()]+\)\s*\[', match.group(3)):
+      return
+
+    # Other things to ignore:
+    # - Function pointers
+    # - Casts to pointer types
+    # - Placement new
+    # - Alias declarations
+    matched_funcptr = match.group(3)
+    if (matched_new_or_template is None and
+        not (matched_funcptr and
+             (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                    matched_funcptr) or
+              matched_funcptr.startswith('(*)'))) and
+        not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+        not Search(r'new\(\S+\)\s*' + matched_type, line)):
+      error(filename, linenum, 'readability/casting', 4,
+            'Using deprecated casting style.  '
+            'Use static_cast<%s>(...) instead' %
+            matched_type)
+
+  if not expecting_function:
+    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
 
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+                     r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+                    r'\((\w+\s?\*+\s?)\)', error)
 
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  #
+  # Some non-identifier character is required before the '&' for the
+  # expression to be recognized as a cast.  These are casts:
+  #   expression = &static_cast<int*>(temporary());
+  #   function(&(int*)(temporary()));
+  #
+  # This is not a cast:
+  #   reference_type&(int* function_param);
+  match = Search(
+      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
+      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match:
+    # Try a better error message when the & is bound to something
+    # dereferenced by the casted pointer, as opposed to the casted
+    # pointer itself.
+    parenthesis_error = False
+    match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
+    if match:
+      _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1)))
+      if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
+        _, y2, x2 = CloseExpression(clean_lines, y1, x1)
+        if x2 >= 0:
+          extended_line = clean_lines.elided[y2][x2:]
+          if y2 < clean_lines.NumLines() - 1:
+            extended_line += clean_lines.elided[y2 + 1]
+          if Match(r'\s*(?:->|\[)', extended_line):
+            parenthesis_error = True
+
+    if parenthesis_error:
+      error(filename, linenum, 'readability/casting', 4,
+            ('Are you taking an address of something dereferenced '
+             'from a cast?  Wrapping the dereferenced expression in '
+             'parentheses will make the binding more obvious'))
+    else:
+      error(filename, linenum, 'runtime/casting', 4,
+            ('Are you taking an address of a cast?  '
+             'This is dangerous: could be a temp var.  '
+             'Take the address before doing the cast, rather than after'))
 
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
+
+def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
   """Checks for a C-style cast by looking for the pattern.
 
   Args:
     filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
     cast_type: The string for the C++ cast to recommend.  This is either
       reinterpret_cast, static_cast, or const_cast, depending.
     pattern: The regular expression used to find C-style casts.
@@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     True if an error was emitted.
     False otherwise.
   """
+  line = clean_lines.elided[linenum]
   match = Search(pattern, line)
   if not match:
     return False
 
-  # e.g., sizeof(int)
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    error(filename, linenum, 'runtime/sizeof', 1,
-          'Using sizeof(type).  Use sizeof(varname) instead if possible')
-    return True
+  # Exclude lines with keywords that tend to look like casts
+  context = line[0:match.start(1) - 1]
+  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
+    return False
+
+  # Try expanding current context to see if we one level of
+  # parentheses inside a macro.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 5), -1):
+      context = clean_lines.elided[i] + context
+  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
+    return False
 
   # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
+  if context.endswith(' operator++') or context.endswith(' operator--'):
     return False
 
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
+  # A single unnamed argument for a function tends to look like old style cast.
+  # If we see those, don't issue warnings for deprecated casts.
   remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #  Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
+    return False
 
   # At this point, all that should be left is actual casts.
   error(filename, linenum, 'readability/casting', 4,
@@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
   return True
 
 
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether where function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
 _HEADERS_CONTAINING_TEMPLATES = (
     ('<deque>', ('deque',)),
     ('<functional>', ('unary_function', 'binary_function',
@@ -4251,11 +5365,15 @@ _HEADERS_CONTAINING_TEMPLATES = (
     ('<limits>', ('numeric_limits',)),
     ('<list>', ('list',)),
     ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
+    ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+                  'unique_ptr', 'weak_ptr')),
     ('<queue>', ('queue', 'priority_queue',)),
     ('<set>', ('set', 'multiset',)),
     ('<stack>', ('stack',)),
     ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+    ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
     ('<utility>', ('pair',)),
     ('<vector>', ('vector',)),
 
@@ -4266,18 +5384,26 @@ _HEADERS_CONTAINING_TEMPLATES = (
     ('<slist>', ('slist',)),
     )
 
-_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+_HEADERS_MAYBE_TEMPLATES = (
+    ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+                     'transform',
+                    )),
+    ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+    )
 
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
 
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+  for _template in _templates:
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_headers_maybe_templates.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+            _template,
+            _header))
+
+# Other scripts may reach in and modify this pattern.
 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
   for _template in _templates:
@@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h):
     string: the additional prefix needed to open the header file.
   """
 
-  if not filename_cc.endswith('.cc'):
+  fileinfo = FileInfo(filename_cc)
+  if not fileinfo.IsSource():
     return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc[:-len(fileinfo.Extension())]
+  matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName())
+  if matched_test_suffix:
+    filename_cc = filename_cc[:-len(matched_test_suffix.group(1))]
   filename_cc = filename_cc.replace('/public/', '/')
   filename_cc = filename_cc.replace('/internal/', '/')
 
@@ -4342,16 +5468,16 @@ def FilesBelongToSameModule(filename_cc, filename_h):
   return files_belong_to_same_module, common_path
 
 
-def UpdateIncludeState(filename, include_state, io=codecs):
-  """Fill up the include_state with new includes found from the file.
+def UpdateIncludeState(filename, include_dict, io=codecs):
+  """Fill up the include_dict with new includes found from the file.
 
   Args:
     filename: the name of the header to read.
-    include_state: an _IncludeState instance in which the headers are inserted.
+    include_dict: a dictionary in which the headers are inserted.
     io: The io factory to use to read the file. Provided for testability.
 
   Returns:
-    True if a header was succesfully added. False otherwise.
+    True if a header was successfully added. False otherwise.
   """
   headerfile = None
   try:
@@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs):
     match = _RE_PATTERN_INCLUDE.search(clean_line)
     if match:
       include = match.group(2)
-      # The value formatting is cute, but not really used right now.
-      # What matters here is that the key is in include_state.
-      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+      include_dict.setdefault(include, linenum)
   return True
 
 
@@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       if prefix.endswith('std::') or not prefix.endswith('::'):
         required['<string>'] = (linenum, 'string')
 
-    for pattern, template, header in _re_pattern_algorithm_header:
+    for pattern, template, header in _re_pattern_headers_maybe_templates:
       if pattern.search(line):
         required[header] = (linenum, template)
 
@@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       continue
 
     for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
+      matched = pattern.search(line)
+      if matched:
+        # Don't warn about IWYU in non-STL namespaces:
+        # (We check only the first match per line; good enough.)
+        prefix = line[:matched.start()]
+        if prefix.endswith('std::') or not prefix.endswith('::'):
+          required[header] = (linenum, template)
 
   # The policy is that if you #include something in foo.h you don't need to
   # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's copy the include_state so it is only messed up within this function.
-  include_state = include_state.copy()
+  # Let's flatten the include_state include_list and copy it into a dictionary.
+  include_dict = dict([item for sublist in include_state.include_list
+                       for item in sublist])
 
-  # Did we find the header for this file (if any) and succesfully load it?
+  # Did we find the header for this file (if any) and successfully load it?
   header_found = False
 
   # Use the absolute path so that matching works properly.
@@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # instead of 'foo_flymake.h'
   abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
 
-  # include_state is modified during iteration, so we iterate over a copy of
+  # include_dict is modified during iteration, so we iterate over a copy of
   # the keys.
-  header_keys = include_state.keys()
+  header_keys = include_dict.keys()
   for header in header_keys:
     (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
     fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_state, io):
+    if same_module and UpdateIncludeState(fullpath, include_dict, io):
       header_found = True
 
   # If we can't find the header file for a .cc, assume it's because we don't
@@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # All the lines have been processed, report the errors found.
   for required_header_unstripped in required:
     template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_state:
+    if required_header_unstripped.strip('<>"') not in include_dict:
       error(filename, required[required_header_unstripped][0],
             'build/include_what_you_use', 4,
             'Add #include ' + required_header_unstripped + ' for ' + template)
@@ -4470,7 +5600,7 @@ _RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
 def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
   """Check that make_pair's template arguments are deduced.
 
-  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
   specified explicitly, and such use isn't intended in any case.
 
   Args:
@@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
           ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
+def CheckRedundantVirtual(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "virtual" function-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for "virtual" on current line.
+  line = clean_lines.elided[linenum]
+  virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
+  if not virtual: return
+
+  # Ignore "virtual" keywords that are near access-specifiers.  These
+  # are only used in class base-specifier and do not apply to member
+  # functions.
+  if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
+      Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
+    return
+
+  # Ignore the "virtual" keyword from virtual base classes.  Usually
+  # there is a column on the same line in these cases (virtual base
+  # classes are rare in google3 because multiple inheritance is rare).
+  if Match(r'^.*[^:]:[^:].*$', line): return
+
+  # Look for the next opening parenthesis.  This is the start of the
+  # parameter list (possibly on the next line shortly after virtual).
+  # TODO(unknown): doesn't work if there are virtual functions with
+  # decltype() or other things that use parentheses, but csearch suggests
+  # that this is rare.
+  end_col = -1
+  end_line = -1
+  start_col = len(virtual.group(2))
+  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[start_line][start_col:]
+    parameter_list = Match(r'^([^(]*)\(', line)
+    if parameter_list:
+      # Match parentheses to find the end of the parameter list
+      (_, end_line, end_col) = CloseExpression(
+          clean_lines, start_line, start_col + len(parameter_list.group(1)))
+      break
+    start_col = 0
+
+  if end_col < 0:
+    return  # Couldn't find end of parameter list, give up
+
+  # Look for "override" or "final" after the parameter list
+  # (possibly on the next few lines).
+  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[i][end_col:]
+    match = Search(r'\b(override|final)\b', line)
+    if match:
+      error(filename, linenum, 'readability/inheritance', 4,
+            ('"virtual" is redundant since function is '
+             'already declared as "%s"' % match.group(1)))
+
+    # Set end_col to check whole lines after we are done with the
+    # first line.
+    end_col = 0
+    if Search(r'[^\w]\s*$', line):
+      break
+
+
+def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "override" or "final" virt-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for closing parenthesis nearby.  We need one to confirm where
+  # the declarator ends and where the virt-specifier starts to avoid
+  # false positives.
+  line = clean_lines.elided[linenum]
+  declarator_end = line.rfind(')')
+  if declarator_end >= 0:
+    fragment = line[declarator_end:]
+  else:
+    if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
+      fragment = line
+    else:
+      return
+
+  # Check that at most one of "override" or "final" is present, not both
+  if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment):
+    error(filename, linenum, 'readability/inheritance', 4,
+          ('"override" is redundant since function is '
+           'already declared as "final"'))
+
+
+
+
+# Returns true if we are at a new block, and it is directly
+# inside of a namespace.
+def IsBlockInNameSpace(nesting_state, is_forward_declaration):
+  """Checks that the new block is directly in a namespace.
+
+  Args:
+    nesting_state: The _NestingState object that contains info about our state.
+    is_forward_declaration: If the class is a forward declared class.
+  Returns:
+    Whether or not the new block is directly in a namespace.
+  """
+  if is_forward_declaration:
+    if len(nesting_state.stack) >= 1 and (
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+      return True
+    else:
+      return False
+
+  return (len(nesting_state.stack) > 1 and
+          nesting_state.stack[-1].check_namespace_indentation and
+          isinstance(nesting_state.stack[-2], _NamespaceInfo))
+
+
+def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                    raw_lines_no_comments, linenum):
+  """This method determines if we should apply our namespace indentation check.
+
+  Args:
+    nesting_state: The current nesting state.
+    is_namespace_indent_item: If we just put a new class on the stack, True.
+      If the top of the stack is not a class, or we did not recently
+      add the class, False.
+    raw_lines_no_comments: The lines without the comments.
+    linenum: The current line number we are processing.
+
+  Returns:
+    True if we should apply our namespace indentation check. Currently, it
+    only works for classes and namespaces inside of a namespace.
+  """
+
+  is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments,
+                                                     linenum)
+
+  if not (is_namespace_indent_item or is_forward_declaration):
+    return False
+
+  # If we are in a macro, we do not want to check the namespace indentation.
+  if IsMacroDefinition(raw_lines_no_comments, linenum):
+    return False
+
+  return IsBlockInNameSpace(nesting_state, is_forward_declaration)
+
+
+# Call this method if the line is directly inside of a namespace.
+# If the line above is blank (excluding comments) or the start of
+# an inner namespace, it cannot be indented.
+def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
+                                    error):
+  line = raw_lines_no_comments[linenum]
+  if Match(r'^\s+', line):
+    error(filename, linenum, 'runtime/indentation_namespace', 4,
+          'Do not indent within a namespace')
+
+
 def ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions=[]):
@@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line,
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
   nesting_state.Update(filename, clean_lines, line, error)
-  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
-    return
+  CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                               error)
+  if nesting_state.InAsmBlock(): return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
   CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
@@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  CheckRedundantVirtual(filename, clean_lines, line, error)
+  CheckRedundantOverrideOrFinal(filename, clean_lines, line, error)
   for check_fn in extra_check_functions:
     check_fn(filename, clean_lines, line, error)
 
+def FlagCxx11Features(filename, clean_lines, linenum, error):
+  """Flag those c++11 features that we only allow in certain places.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++ TR1 headers.
+  if include and include.group(1).startswith('tr1/'):
+    error(filename, linenum, 'build/c++tr1', 5,
+          ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1))
+
+  # Flag unapproved C++11 headers.
+  if include and include.group(1) in ('cfenv',
+                                      'condition_variable',
+                                      'fenv.h',
+                                      'future',
+                                      'mutex',
+                                      'thread',
+                                      'chrono',
+                                      'ratio',
+                                      'regex',
+                                      'system_error',
+                                     ):
+    error(filename, linenum, 'build/c++11', 5,
+          ('<%s> is an unapproved C++11 header.') % include.group(1))
+
+  # The only place where we need to worry about C++11 keywords and library
+  # features in preprocessor directives is in macro definitions.
+  if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return
+
+  # These are classes and free functions.  The classes are always
+  # mentioned as std::*, but we only catch the free functions if
+  # they're not found by ADL.  They're alphabetical by header.
+  for top_name in (
+      # type_traits
+      'alignment_of',
+      'aligned_union',
+      ):
+    if Search(r'\bstd::%s\b' % top_name, line):
+      error(filename, linenum, 'build/c++11', 5,
+            ('std::%s is an unapproved C++11 class or function.  Send c-style '
+             'an example of where it would make your code more readable, and '
+             'they may let you use it.') % top_name)
+
+
+def FlagCxx14Features(filename, clean_lines, linenum, error):
+  """Flag those C++14 features that we restrict.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++14 headers.
+  if include and include.group(1) in ('scoped_allocator', 'shared_mutex'):
+    error(filename, linenum, 'build/c++14', 5,
+          ('<%s> is an unapproved C++14 header.') % include.group(1))
+
+
 def ProcessFileData(filename, file_extension, lines, error,
                     extra_check_functions=[]):
   """Performs lint checks and reports any errors to the given error function.
@@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error,
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  nesting_state = _NestingState()
+  nesting_state = NestingState()
 
   ResetNolintSuppressions()
 
   CheckForCopyright(filename, lines, error)
-
-  if file_extension == 'h':
-    CheckForHeaderGuard(filename, lines, error)
-
+  ProcessGlobalSuppresions(lines)
   RemoveMultiLineComments(filename, lines, error)
   clean_lines = CleansedLines(lines)
+
+  if IsHeaderExtension(file_extension):
+    CheckForHeaderGuard(filename, clean_lines, error)
+
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions)
+    FlagCxx11Features(filename, clean_lines, line, error)
   nesting_state.CheckCompletedBlocks(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
+  # Check that the .cc file has included its header if it exists.
+  if _IsSourceExtension(file_extension):
+    CheckHeaderFileIncluded(filename, include_state, error)
+
   # We check here rather than inside ProcessLine so that we see raw
   # lines rather than "cleaned" lines.
   CheckForBadCharacters(filename, lines, error)
 
   CheckForNewlineAtEOF(filename, lines, error)
 
+def ProcessConfigOverrides(filename):
+  """ Loads the configuration files and processes the config overrides.
+
+  Args:
+    filename: The name of the file being processed by the linter.
+
+  Returns:
+    False if the current |filename| should not be processed further.
+  """
+
+  abs_filename = os.path.abspath(filename)
+  cfg_filters = []
+  keep_looking = True
+  while keep_looking:
+    abs_path, base_name = os.path.split(abs_filename)
+    if not base_name:
+      break  # Reached the root directory.
+
+    cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
+    abs_filename = abs_path
+    if not os.path.isfile(cfg_file):
+      continue
+
+    try:
+      with open(cfg_file) as file_handle:
+        for line in file_handle:
+          line, _, _ = line.partition('#')  # Remove comments.
+          if not line.strip():
+            continue
+
+          name, _, val = line.partition('=')
+          name = name.strip()
+          val = val.strip()
+          if name == 'set noparent':
+            keep_looking = False
+          elif name == 'filter':
+            cfg_filters.append(val)
+          elif name == 'exclude_files':
+            # When matching exclude_files pattern, use the base_name of
+            # the current file name or the directory name we are processing.
+            # For example, if we are checking for lint errors in /foo/bar/baz.cc
+            # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
+            # file's "exclude_files" filter is meant to be checked against "bar"
+            # and not "baz" nor "bar/baz.cc".
+            if base_name:
+              pattern = re.compile(val)
+              if pattern.match(base_name):
+                if _cpplint_state.quiet:
+                  # Suppress "Ignoring file" warning when using --quiet.
+                  return False
+                sys.stderr.write('Ignoring "%s": file excluded by "%s". '
+                                 'File path component "%s" matches '
+                                 'pattern "%s"\n' %
+                                 (filename, cfg_file, base_name, val))
+                return False
+          elif name == 'linelength':
+            global _line_length
+            try:
+                _line_length = int(val)
+            except ValueError:
+                sys.stderr.write('Line length must be numeric.')
+          elif name == 'root':
+            global _root
+            # root directories are specified relative to CPPLINT.cfg dir.
+            _root = os.path.join(os.path.dirname(cfg_file), val)
+          elif name == 'headers':
+            ProcessHppHeadersOption(val)
+          else:
+            sys.stderr.write(
+                'Invalid configuration option (%s) in file %s\n' %
+                (name, cfg_file))
+
+    except IOError:
+      sys.stderr.write(
+          "Skipping config file '%s': Can't open for reading\n" % cfg_file)
+      keep_looking = False
+
+  # Apply all the accumulated filters in reverse order (top-level directory
+  # config options having the least priority).
+  for filter in reversed(cfg_filters):
+     _AddFilters(filter)
+
+  return True
+
+
 def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """Does google-lint on a single file.
 
@@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """
 
   _SetVerboseLevel(vlevel)
+  _BackupFilters()
+  old_errors = _cpplint_state.error_count
 
+  if not ProcessConfigOverrides(filename):
+    _RestoreFilters()
+    return
+
+  lf_lines = []
+  crlf_lines = []
   try:
     # Support the UNIX convention of using "-" for stdin.  Note that
     # we are not opening the file with universal newline support
@@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     # contain trailing '\r' characters if we are reading a file that
     # has CRLF endings.
     # If after the split a trailing '\r' is present, it is removed
-    # below. If it is not expected to be present (i.e. os.linesep !=
-    # '\r\n' as in Windows), a warning is issued below if this file
-    # is processed.
-
+    # below.
     if filename == '-':
       lines = codecs.StreamReaderWriter(sys.stdin,
                                         codecs.getreader('utf8'),
@@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     else:
       lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
 
-    carriage_return_found = False
     # Remove trailing '\r'.
-    for linenum in range(len(lines)):
+    # The -1 accounts for the extra trailing blank line we get from split()
+    for linenum in range(len(lines) - 1):
       if lines[linenum].endswith('\r'):
         lines[linenum] = lines[linenum].rstrip('\r')
-        carriage_return_found = True
+        crlf_lines.append(linenum + 1)
+      else:
+        lf_lines.append(linenum + 1)
 
   except IOError:
     sys.stderr.write(
         "Skipping input '%s': Can't open for reading\n" % filename)
+    _RestoreFilters()
     return
 
   # Note, if no dot is found, this will give the entire filename as the ext.
@@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   else:
     ProcessFileData(filename, file_extension, lines, Error,
                     extra_check_functions)
-    if carriage_return_found and os.linesep != '\r\n':
-      # Use 0 for linenum since outputting only one error for potentially
-      # several lines.
-      Error(filename, 0, 'whitespace/newline', 1,
-            'One or more unexpected \\r (^M) found;'
-            'better to use only a \\n')
 
-  sys.stderr.write('Done processing %s\n' % filename)
+    # If end-of-line sequences are a mix of LF and CR-LF, issue
+    # warnings on the lines with CR.
+    #
+    # Don't issue any warnings if all lines are uniformly LF or CR-LF,
+    # since critique can handle these just fine, and the style guide
+    # doesn't dictate a particular end of line sequence.
+    #
+    # We can't depend on os.linesep to determine what the desired
+    # end-of-line sequence should be, since that will return the
+    # server-side end-of-line sequence.
+    if lf_lines and crlf_lines:
+      # Warn on every line with CR.  An alternative approach might be to
+      # check whether the file is mostly CRLF or just LF, and warn on the
+      # minority, we bias toward LF here since most tools prefer LF.
+      for linenum in crlf_lines:
+        Error(filename, linenum, 'whitespace/newline', 1,
+              'Unexpected \\r (^M) found; better to use only \\n')
+
+  # Suppress printing anything if --quiet was passed unless the error
+  # count has increased after processing this file.
+  if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count:
+    sys.stdout.write('Done processing %s\n' % filename)
+  _RestoreFilters()
 
 
 def PrintUsage(message):
@@ -4681,13 +6159,16 @@ def ParseArguments(args):
                                                  'filter=',
                                                  'root=',
                                                  'linelength=',
-                                                 'extensions='])
+                                                 'extensions=',
+                                                 'headers=',
+                                                 'quiet'])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
   verbosity = _VerboseLevel()
   output_format = _OutputFormat()
   filters = ''
+  quiet = _Quiet()
   counting_style = ''
 
   for (opt, val) in opts:
@@ -4697,6 +6178,8 @@ def ParseArguments(args):
       if val not in ('emacs', 'vs7', 'eclipse'):
         PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
       output_format = val
+    elif opt == '--quiet':
+      quiet = True
     elif opt == '--verbose':
       verbosity = int(val)
     elif opt == '--filter':
@@ -4721,12 +6204,15 @@ def ParseArguments(args):
       try:
           _valid_extensions = set(val.split(','))
       except ValueError:
-          PrintUsage('Extensions must be comma seperated list.')
+          PrintUsage('Extensions must be comma separated list.')
+    elif opt == '--headers':
+      ProcessHppHeadersOption(val)
 
   if not filenames:
     PrintUsage('No files were specified.')
 
   _SetOutputFormat(output_format)
+  _SetQuiet(quiet)
   _SetVerboseLevel(verbosity)
   _SetFilters(filters)
   _SetCountingStyle(counting_style)
@@ -4747,7 +6233,9 @@ def main():
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
     ProcessFile(filename, _cpplint_state.verbose_level)
-  _cpplint_state.PrintErrorCounts()
+  # If --quiet is passed, suppress printing error count unless there are errors.
+  if not _cpplint_state.quiet or _cpplint_state.error_count > 0:
+    _cpplint_state.PrintErrorCounts()
 
   sys.exit(_cpplint_state.error_count > 0)
 
diff --git a/media/libaom/src/tools/gen_authors.sh b/media/libaom/src/tools/gen_authors.sh
index 5def8bc898..5def8bc898 100644..100755
--- a/media/libaom/src/tools/gen_authors.sh
+++ b/media/libaom/src/tools/gen_authors.sh
diff --git a/media/libaom/src/tools/gen_constrained_tokenset.py b/media/libaom/src/tools/gen_constrained_tokenset.py
index 5d12ee1ef5..5d12ee1ef5 100644..100755
--- a/media/libaom/src/tools/gen_constrained_tokenset.py
+++ b/media/libaom/src/tools/gen_constrained_tokenset.py
diff --git a/media/libaom/src/tools/gop_bitrate/analyze_data.py b/media/libaom/src/tools/gop_bitrate/analyze_data.py
new file mode 100644
index 0000000000..4e006b9220
--- /dev/null
+++ b/media/libaom/src/tools/gop_bitrate/analyze_data.py
@@ -0,0 +1,18 @@
+with open('experiment.txt', 'r') as file:
+    lines = file.readlines()
+    curr_filename = ''
+    keyframe = 0
+    actual_value = 0
+    estimate_value = 0
+    print('filename, estimated value (b), actual value (b)')
+    for line in lines:
+        if line.startswith('input:'):
+            curr_filename = line[13:].strip()
+        if line.startswith('estimated'):
+            estimate_value = float(line[19:].strip())
+        if line.startswith('frame:'):
+            actual_value += float(line[line.find('size')+6:line.find('total')-2])
+        if line.startswith('****'):
+            print(f'{curr_filename}, {estimate_value}, {actual_value}')
+            estimate_value = 0
+            actual_value = 0
diff --git a/media/libaom/src/tools/gop_bitrate/encode_all_script.sh b/media/libaom/src/tools/gop_bitrate/encode_all_script.sh
new file mode 100755
index 0000000000..0689b33138
--- /dev/null
+++ b/media/libaom/src/tools/gop_bitrate/encode_all_script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#INPUT=media/cheer_sif.y4m
+OUTPUT=test.webm
+LIMIT=17
+CPU_USED=3
+CQ_LEVEL=36
+
+for input in media/*
+do
+  echo "****" >> experiment.txt
+  echo "input: $input" >> experiment.txt
+  ./aomenc --limit=$LIMIT --codec=av1 --cpu-used=$CPU_USED --end-usage=q --cq-level=$CQ_LEVEL --psnr --threads=0 --profile=0 --lag-in-frames=35 --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=160 --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --minsection-pct=0 --maxsection-pct=2000 --arnr-maxframes=7 --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 --frame-parallel=0 --tile-columns=0 -o $OUTPUT $input >> experiment.txt
+done
diff --git a/media/libaom/src/tools/gop_bitrate/python/bitrate_accuracy.py b/media/libaom/src/tools/gop_bitrate/python/bitrate_accuracy.py
new file mode 100644
index 0000000000..2a5da6a794
--- /dev/null
+++ b/media/libaom/src/tools/gop_bitrate/python/bitrate_accuracy.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+# Model A only.
+# Uses least squares regression to find the solution
+# when there is one unknown variable.
+def lstsq_solution(A, B):
+    A_inv = np.linalg.pinv(A)
+    x = np.matmul(A_inv, B)
+    return x[0][0]
+
+# Model B only.
+# Uses the pseudoinverse matrix to find the solution
+# when there are two unknown variables.
+def pinv_solution(A, mv, B):
+    new_A = np.concatenate((A, mv), axis=1)
+    new_A_inv = np.linalg.pinv(new_A)
+    new_x = np.matmul(new_A_inv, B)
+    print("pinv solution:", new_x[0][0], new_x[1][0])
+    return (new_x[0][0], new_x[1][0])
+
+# Model A only.
+# Finds the coefficient to multiply A by to minimize
+# the percentage error between A and B.
+def minimize_percentage_error_model_a(A, B):
+    R = np.divide(A, B)
+    num = 0
+    den = 0
+    best_x = 0
+    best_error = 100
+    for r_i in R:
+        num += r_i
+        den += r_i**2
+    if den == 0:
+        return 0
+    return (num/den)[0]
+
+# Model B only.
+# Finds the coefficients to multiply to the frame bitrate
+# and the motion vector bitrate to minimize the percent error.
+def minimize_percentage_error_model_b(r_e, r_m, r_f):
+    r_ef = np.divide(r_e, r_f)
+    r_mf = np.divide(r_m, r_f)
+    sum_ef = np.sum(r_ef)
+    sum_ef_sq = np.sum(np.square(r_ef))
+    sum_mf = np.sum(r_mf)
+    sum_mf_sq = np.sum(np.square(r_mf))
+    sum_ef_mf = np.sum(np.multiply(r_ef, r_mf))
+    # Divides x by y. If y is zero, returns 0.
+    divide = lambda x, y : 0 if y == 0 else x / y
+    # Set up and solve the matrix equation
+    A = np.array([[1, divide(sum_ef_mf, sum_ef_sq)],[divide(sum_ef_mf, sum_mf_sq), 1]])
+    B = np.array([divide(sum_ef, sum_ef_sq), divide(sum_mf, sum_mf_sq)])
+    A_inv = np.linalg.pinv(A)
+    x = np.matmul(A_inv, B)
+    return x
+
+# Model A only.
+# Calculates the least squares error between A and B
+# using coefficients in X.
+def average_lstsq_error(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error += (b - x*a)**2
+    if n == 0:
+        return None
+    error /= n
+    return error
+
+# Model A only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (abs(x*a-b)/b)*100
+        error += error_i
+    error /= n
+    return error
+
+# Model B only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_b(A, M, B, x):
+    error = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        mv = M[i]
+        b = B[i][0]
+        if b == 0:
+            continue
+        estimate = x[0]*a
+        estimate += x[1]*mv
+        error += abs(estimate - b) / b
+    error *= 100
+    error /= A.shape[0]
+    return error
+
+def average_squared_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (1 - x*(a/b))**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
+
+def average_squared_error_model_b(A, M, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        mv = M[i]
+        if b == 0:
+            continue
+        n += 1
+        error_i = 1 - ((x[0]*a + x[1]*mv)/b)
+        error_i = error_i**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
+
+# Traverses the data and prints out one value for
+# each update type.
+def print_solutions(file_path):
+    data = np.genfromtxt(file_path, delimiter="\t")
+    prev_update = 0
+    split_list_indices = list()
+    for i, val in enumerate(data):
+        if prev_update != val[3]:
+            split_list_indices.append(i)
+            prev_update = val[3]
+    split = np.split(data, split_list_indices)
+    for array in split:
+        A, mv, B, update = np.hsplit(array, 4)
+        z = np.where(B == 0)[0]
+        r_e = np.delete(A, z, axis=0)
+        r_m = np.delete(mv, z, axis=0)
+        r_f = np.delete(B, z, axis=0)
+        A = r_e
+        mv = r_m
+        B = r_f
+        all_zeros = not A.any()
+        if all_zeros:
+            continue
+        print("update type:", update[0][0])
+        x_ls = lstsq_solution(A, B)
+        x_a = minimize_percentage_error_model_a(A, B)
+        x_b = minimize_percentage_error_model_b(A, mv, B)
+        percent_error_a = average_percent_error_model_a(A, B, x_a)
+        percent_error_b = average_percent_error_model_b(A, mv, B, x_b)[0]
+        baseline_percent_error_a = average_percent_error_model_a(A, B, 1)
+        baseline_percent_error_b = average_percent_error_model_b(A, mv, B, [1, 1])[0]
+
+        squared_error_a = average_squared_error_model_a(A, B, x_a)
+        squared_error_b = average_squared_error_model_b(A, mv, B, x_b)[0]
+        baseline_squared_error_a = average_squared_error_model_a(A, B, 1)
+        baseline_squared_error_b = average_squared_error_model_b(A, mv, B, [1, 1])[0]
+
+        print("model,\tframe_coeff,\tmv_coeff,\terror,\tbaseline_error")
+        print("Model A %_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(percent_error_a) + ",\t" + str(baseline_percent_error_a))
+        print("Model A sq_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(squared_error_a) + ",\t" + str(baseline_squared_error_a))
+        print("Model B %_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(percent_error_b) + ",\t" + str(baseline_percent_error_b))
+        print("Model B sq_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(squared_error_b) + ",\t" + str(baseline_squared_error_b))
+        print()
+
+if __name__ == "__main__":
+    print_solutions("data2/all_lowres_target_lt600_data.txt")
diff --git a/media/libaom/src/tools/intersect-diffs.py b/media/libaom/src/tools/intersect-diffs.py
index df13c4ef70..df13c4ef70 100644..100755
--- a/media/libaom/src/tools/intersect-diffs.py
+++ b/media/libaom/src/tools/intersect-diffs.py
diff --git a/media/libaom/src/tools/lint-hunks.py b/media/libaom/src/tools/lint-hunks.py
index d02bee16ce..d02bee16ce 100644..100755
--- a/media/libaom/src/tools/lint-hunks.py
+++ b/media/libaom/src/tools/lint-hunks.py
diff --git a/media/libaom/src/tools/obu_parser.cc b/media/libaom/src/tools/obu_parser.cc
index 7d71386ce4..5716b46218 100644
--- a/media/libaom/src/tools/obu_parser.cc
+++ b/media/libaom/src/tools/obu_parser.cc
@@ -112,7 +112,7 @@ void PrintObuHeader(const ObuHeader *header) {
     printf(
         "      temporal_id: %d\n"
         "      spatial_id:  %d\n",
-        header->temporal_layer_id, header->temporal_layer_id);
+        header->temporal_layer_id, header->spatial_layer_id);
   }
 }
 
diff --git a/media/libaom/src/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py b/media/libaom/src/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py
new file mode 100644
index 0000000000..9afb78cbf5
--- /dev/null
+++ b/media/libaom/src/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+##
+## Copyright (c) 2022, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+""" Analyze the log generated by experimental flag CONFIG_RATECTRL_LOG."""
+
+import matplotlib.pyplot as plt
+import os
+
+
+def get_file_basename(filename):
+  return filename.split(".")[0]
+
+
+def parse_log(log_file):
+  data_list = []
+  with open(log_file) as fp:
+    for line in fp:
+      dic = {}
+      word_ls = line.split()
+      i = 0
+      while i < len(word_ls):
+        dic[word_ls[i]] = float(word_ls[i + 1])
+        i += 2
+      data_list.append(dic)
+    fp.close()
+  return data_list
+
+
+def extract_data(data_list, name):
+  arr = []
+  for data in data_list:
+    arr.append(data[name])
+  return arr
+
+
+def visualize_q_indices(exp_summary, exp_list, fig_path=None):
+  for exp in exp_list:
+    data = parse_log(exp["log"])
+    q_indices = extract_data(data, "q")
+    plt.title(exp_summary)
+    plt.xlabel("frame_coding_idx")
+    plt.ylabel("q_index")
+    plt.plot(q_indices, marker=".", label=exp["label"])
+  plt.legend()
+  if fig_path:
+    plt.savefig(fig_path)
+  else:
+    plt.show()
+  plt.clf()
+
+
+def get_rc_type_from_exp_type(exp_type):
+  if exp_type == "Q_3P":
+    return "q"
+  return "vbr"
+
+
+def test_video(exe_name, input, exp_type, level, log=None, limit=150):
+  basic_cmd = ("--test-decode=warn --threads=0 --profile=0 --min-q=0 --max-q=63"
+               " --auto-alt-ref=1 --kf-max-dist=160 --kf-min-dist=0 "
+               "--drop-frame=0 --static-thresh=0 --minsection-pct=0 "
+               "--maxsection-pct=2000 --arnr-maxframes=7 --arnr-strength=5 "
+               "--sharpness=0 --undershoot-pct=100 --overshoot-pct=100 "
+               "--frame-parallel=0 --tile-columns=0 --cpu-used=3 "
+               "--lag-in-frames=48 --psnr")
+  rc_type = get_rc_type_from_exp_type(exp_type)
+  rc_cmd = "--end-usage=" + rc_type
+  level_cmd = ""
+  if rc_type == "q":
+    level_cmd += "--cq-level=" + str(level)
+  elif rc_type == "vbr":
+    level_cmd += "--target-bitrate=" + str(level)
+  limit_cmd = "--limit=" + str(limit)
+  passes_cmd = "--passes=3 --second-pass-log=second_pass_log"
+  output_cmd = "-o test.webm"
+  input_cmd = "~/data/" + input
+  log_cmd = ""
+  if log != None:
+    log_cmd = ">" + log
+  cmd_ls = [
+      exe_name, basic_cmd, rc_cmd, level_cmd, limit_cmd, passes_cmd, output_cmd,
+      input_cmd, log_cmd
+  ]
+  cmd = " ".join(cmd_ls)
+  os.system(cmd)
+
+
+def gen_ratectrl_log(test_case):
+  exe = test_case["exe"]
+  video = test_case["video"]
+  exp_type = test_case["exp_type"]
+  level = test_case["level"]
+  log = test_case["log"]
+  test_video(exe, video, exp_type, level, log=log, limit=150)
+  return log
+
+
+def gen_test_case(exp_type, dataset, videoname, level, log_dir=None):
+  test_case = {}
+  exe = "./aomenc_bl"
+  if exp_type == "BA_3P":
+    exe = "./aomenc_ba"
+  test_case["exe"] = exe
+
+  video = os.path.join(dataset, videoname)
+  test_case["video"] = video
+  test_case["exp_type"] = exp_type
+  test_case["level"] = level
+
+  video_basename = get_file_basename(videoname)
+  log = ".".join([dataset, video_basename, exp_type, str(level)])
+  if log_dir != None:
+    log = os.path.join(log_dir, log)
+  test_case["log"] = log
+  return test_case
+
+
+def run_ratectrl_exp(exp_config):
+  fp = open(exp_config)
+  log_dir = "./lowres_rc_log"
+  fig_dir = "./lowres_rc_fig"
+  dataset = "lowres"
+  for line in fp:
+    word_ls = line.split()
+    dataset = word_ls[0]
+    videoname = word_ls[1]
+    exp_type_ls = ["VBR_3P", "BA_3P"]
+    level_ls = [int(v) for v in word_ls[2:4]]
+    exp_ls = []
+    for i in range(len(exp_type_ls)):
+      exp_type = exp_type_ls[i]
+      test_case = gen_test_case(exp_type, dataset, videoname, level_ls[i],
+                                log_dir)
+      log = gen_ratectrl_log(test_case)
+      exp = {}
+      exp["log"] = log
+      exp["label"] = exp_type
+      exp_ls.append(exp)
+    video_basename = get_file_basename(videoname)
+    fig_path = os.path.join(fig_dir, video_basename + ".png")
+    visualize_q_indices(video_basename, exp_ls, fig_path)
+  fp.close()
+
+
+if __name__ == "__main__":
+  run_ratectrl_exp("exp_rc_config")
diff --git a/media/libaom/src/tools/wrap-commit-msg.py b/media/libaom/src/tools/wrap-commit-msg.py
index 1c78824439..1c78824439 100644..100755
--- a/media/libaom/src/tools/wrap-commit-msg.py
+++ b/media/libaom/src/tools/wrap-commit-msg.py
author	Job Bautista <jobbautista9@protonmail.com>	2022-06-25 18:15:40 +0800
committer	Job Bautista <jobbautista9@protonmail.com>	2022-06-25 18:15:40 +0800
commit	b900a6e486a83a5c1690de314e272a4907a54750 (patch)
tree	8db913f0d8651b97f95b0716c88c8733b500cf1f /media/libaom
parent	e3fd2e48e03fecc04ae5462ae4a8b5c61b1458fb (diff)
download	uxp-b900a6e486a83a5c1690de314e272a4907a54750.tar.gz